diff --git a/.appveyor.yml b/.appveyor.yml index 911a9d317..2d1f70b18 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,56 +1,80 @@ # AppVeyor is a CI service to build and run tests under Windows # https://ci.appveyor.com/project/districtdatalabs/yellowbrick +image: + - Previous Visual Studio 2017 + +version: 0.9.{build} +pull_requests: + do_not_increment_build_number: true + +branches: + only: + - master + - develop + environment: matrix: # Tests failing on 32 bit architectures # https://github.com/numpy/numpy/issues/4384 - # - PYTHON: "C:\\Python27" - # PYTHON_VERSION: "2.7.14" - # PYTHON_ARCH: "32" - - - PYTHON: "C:\\Python27-x64" - PYTHON_VERSION: "2.7.14" - PYTHON_ARCH: "64" # - PYTHON: "C:\\Python36" # PYTHON_VERSION: "3.6.4" # PYTHON_ARCH: "32" - - PYTHON: "C:\\Python36-x64" - PYTHON_VERSION: "3.6.4" - PYTHON_ARCH: "64" - # - PYTHON: "C:\\Miniconda3" # PYTHON_VERSION: "3.6.4" # MINICONDA_VERSION: "4.4.10" # PYTHON_ARCH: "32" - - PYTHON: "C:\\Miniconda3-x64" - PYTHON_VERSION: "3.6.4" - MINICONDA_VERSION: "4.4.10" + - PYTHON: "C:\\Python36-x64" + PYTHON_VERSION: "3.6" + PYTHON_ARCH: "64" + + - PYTHON: "C:\\Python37-x64" + PYTHON_VERSION: "3.7" PYTHON_ARCH: "64" + - PYTHON: "C:\\Miniconda36-x64" + PYTHON_VERSION: "3.6" + MINICONDA_VERSION: "4.5.4" + PYTHON_ARCH: "64" + + # Failing Tests Due to TypeError: LoadLibrary() + # - PYTHON: "C:\\Miniconda37-x64" + # PYTHON_VERSION: "3.7" + # MINICONDA_VERSION: "4.5.12" + # PYTHON_ARCH: "64" + # Cancel pending jobs after first job failure matrix: fast_finish: true install: - - "%PYTHON%\\python.exe -m pip install wheel" - - "%PYTHON%\\python.exe -m pip install -r requirements.txt" - - "%PYTHON%\\python.exe -m pip install -r tests/requirements.txt" - - "%PYTHON%\\python.exe -m nltk.downloader popular" + - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" + - cmd: "IF '%MINICONDA_VERSION%'=='' ( + python -m pip install -U pip && + python -m pip install -U wheel && + python -m pip install -U -r requirements.txt && + python -m 
pip install -U -r tests/requirements.txt + ) ELSE ( + conda update -n base conda --yes && + conda config --add channels conda-forge && + conda env create -f tests/requirements.txt -n yellowbrick && + call activate yellowbrick + )" + - "python -m nltk.downloader popular" # No requirement to build any C libraries build: off test_script: - - "%PYTHON%\\python.exe setup.py test" + - "python setup.py test" after_test: - - "%PYTHON%\\python.exe setup.py bdist_wheel" + - "python setup.py bdist_wheel" artifacts: - path: dist\* @@ -60,12 +84,10 @@ cache: - '%APPDATA%\pip\Cache' notifications: - - provider: Email - to: - - bbengfort@districtdatalabs.com - - rbilbro@districtdatalabs.com - - nathan.danielsen@gmail.com - - tojeda@districtdatalabs.com - on_build_success: false - on_build_failure: false + - provider: Slack + auth_token: + secure: 6hd0IQ66qUUiStas9s304izgV7Wh0XyY0gFjuua7H8gtYle41gbFozkfKgzs110oK/iT4XKWrNfWOgz3S5Jcp2SGFXQ9IvF5ZqYmX71ZQRY= + channel: '#yb-ci' + on_build_success: true + on_build_failure: true on_build_status_changed: true diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 000000000..e582a5b82 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,8 @@ +# These are supported funding model platforms + +github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] +patreon: # Replace with a single Patreon username +open_collective: # Replace with a single Open Collective username +ko_fi: # Replace with a single Ko-fi username +tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel +custom: https://numfocus.org/donate diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..b55897af3 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,73 @@ + + +This PR fixes #issue_number _(If you are fixing a bug)_ which reported a bug that caused a problem to occur when users... 
+ +_(or if you are introducing a new feature)_ which requested a feature to allow the user to... + +I have made the following changes: + +1. +2. +3. + +### Sample Code and Plot + +_If you are adding or modifying a visualizer, PLEASE include a sample plot here along with the code you used to generate it._ + +### TODOs and questions + + + +Still to do: + +- [ ] +- [ ] +- [ ] + +Questions for the @DistrictDataLabs/team-oz-maintainers: + +- [ ] +- [ ] + +### CHECKLIST + + + +- [ ] _Is the commit message formatted correctly?_ +- [ ] _Have you noted the new functionality/bugfix in the release notes of the next release?_ + + + +- [ ] _Included a sample plot to visually illustrate your changes?_ +- [ ] _Do all of your functions and methods have docstrings?_ +- [ ] _Have you added/updated unit tests where appropriate?_ +- [ ] _Have you updated the baseline images if necessary?_ +- [ ] _Have you run the unit tests using `pytest`?_ +- [ ] _Is your code style correct (are you using PEP8, pyflakes)?_ +- [ ] _Have you documented your new feature/functionality in the docs?_ + + + +- [ ] _Have you built the docs using `make html`?_ diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md deleted file mode 100644 index 9174d7e06..000000000 --- a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md +++ /dev/null @@ -1,17 +0,0 @@ - -Fixes # - - - - -Changes proposed in this pull request: - -- -- -- - - -@DistrictDataLabs/team-oz-maintainers diff --git a/.gitignore b/.gitignore index a8c557457..aa54c80dc 100644 --- a/.gitignore +++ b/.gitignore @@ -61,6 +61,7 @@ docs/_build/ # IDE/editor droppings *.swp *.swo +.vscode/settings.json # OS droppings .DS_Store @@ -120,6 +121,4 @@ fabric.properties # Data downloaded from Yellowbrick data/ -.vscode/settings.json - -yellowbrick/datasets/fixtures +yellowbrick/datasets/fixtures \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 50cee5af8..be339917a 
100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,28 +1,53 @@ +dist: xenial language: python -python: - - '2.7' - - '3.6' +matrix: + include: + - name: "Python 3.6 on Xenial Linux" + python: '3.6' + + - name: "Python 3.7 on Xenial Linux" + python: '3.7' + + - name: "Miniconda 3.6 on Xenial Linux" + env: ANACONDA="3.6" + + - name: "Miniconda 3.7 on Xenial Linux" + env: ANACONDA="3.7" before_install: - - sudo apt-get update - - sudo apt-get build-dep python-scipy +- sudo apt-get update; +- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then + sudo apt-get build-dep python-scipy; + MINICONDA_OS="Linux"; + fi install: - - pip install -r requirements.txt - - pip install -r tests/requirements.txt - - pip install coveralls - - python -m nltk.downloader popular +- if [[ -z ${ANACONDA} ]]; then + pip install -r requirements.txt; + pip install -r tests/requirements.txt; + pip install coveralls; + else + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-$MINICONDA_OS-x86_64.sh -O miniconda.sh; + bash miniconda.sh -b -p $HOME/miniconda; + export PATH="$HOME/miniconda/bin:$PATH"; + hash -r; + conda config --set always_yes yes --set changeps1 no; + export BASE_PYTHON=`python -c 'import sys; version=sys.version_info[:3]; print("{0}.{1}.{2}".format(*version))'`; + echo "Base Anaconda Python is $BASE_PYTHON"; + echo "Creating Anaconda Python $ANACONDA conda environment"; + conda update -n base conda --yes; + conda config --add channels conda-forge; + conda env create -f tests/requirements.txt -n yellowbrick python=$ANACONDA; + source activate yellowbrick; + conda install coveralls; + fi -script: make test +script: +- python -m nltk.downloader popular +- make test after_success: coveralls notifications: - email: - recipients: - - bbengfort@districtdatalabs.com - - rbilbro@districtdatalabs.com - - nathan.danielsen@gmail.com - - tojeda@districtdatalabs.com - on_success: change - on_failure: always + slack: + secure: 
mWKVHmEc22FJSp6Rrnd1j4QYCgZY4NJSrA8kZ5wj2/lf1iHI/CfWGTf7+Qihqe+rt0FOU0+UA9SzvSHRD1bV76q/zINayQ0EyJAfQzvIWIRGGnnMSO/79WoEYF56wwjpc5pLUTh6QV5qqfy+8nNGQ1/uJ0h6FtsUaSa/g61a5ZJEVBIjIpH8PgMxM64dRgJCmAdQuXkBP5Uf3yHlCtYk+Jr+gyXU2oqwMZ1VWgZkEo1Tqo7W9WY8dkOaAkzXDT61OqtcyyTuVSYbmK4i3c84681NBpb7wT6BfiCCAd3tn5AIKCkJVJ0ga0XeF6MdDpnicpku4FaN+fQjwkPiU47o/aFp8RNp27JQ9AhvH7wMuu5O8HDhszjRkfGOlUbuPOTavc22o4j0ShsrLiTQRJRhQQzJoquPuPj5wHqCCN+ice7IVUHj3ZC2jpJKDEYUNnr1fATtOwocimc6PhJM/IoeHgEEHpi37b+AxnhgOFoBlgsq2f4nsRD9JsLHqIpJCHgMjKxc6p3FtcFcXZDlDXQIcCzSRiPhG207dahspA3aPLj4Z+tOLJwh7/PSEfp02kcgPMM/MLYTWcaBv14aYi69kvQoZTfqVY8tIohg3ygda5siOCTTgqGriJYzkmdY5/Dp51kabhl+cEVIxPyY0miqyl3hZjqkqCnnOtg06qqxLLM= diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 053be200c..f2da93d0a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -8,7 +8,7 @@ For more on the development path, goals, and motivations behind Yellowbrick, che Yellowbrick is an open source project that is supported by a community who will gratefully and humbly accept any contributions you might make to the project. Large or small, any contribution makes a big difference; and if you've never contributed to an open source project before, we hope you will start with Yellowbrick! -Principally, Yellowbrick development is about the addition and creation of *visualizers* --- objects that learn from data and create a visual representation of the data or model. Visualizers integrate with scikit-learn estimators, transformers, and pipelines for specific purposes and as a result, can be simple to build and deploy. The most common contribution is therefore a new visualizer for a specific model or model family. We'll discuss in detail how to build visualizers later. +Principally, Yellowbrick development is about the addition and creation of *visualizers* — objects that learn from data and create a visual representation of the data or model. 
Visualizers integrate with scikit-learn estimators, transformers, and pipelines for specific purposes and as a result, can be simple to build and deploy. The most common contribution is therefore a new visualizer for a specific model or model family. We'll discuss in detail how to build visualizers later. Beyond creating visualizers, there are many ways to contribute: @@ -32,20 +32,19 @@ The typical workflow for a contributor to the codebase is as follows: 1. **Discover** a bug or a feature by using Yellowbrick. 2. **Discuss** with the core contributes by [adding an issue](https://github.com/DistrictDataLabs/yellowbrick/issues). -3. **Assign** yourself the task by pulling a card from our [Waffle Kanban](https://waffle.io/DistrictDataLabs/yellowbrick). -4. **Fork** the repository into your own GitHub account. -5. Create a **Pull Request** first thing to [connect with us](https://github.com/DistrictDataLabs/yellowbrick/pulls) about your task. -6. **Code** the feature, write the documentation, add your contribution. -7. **Review** the code with core contributors who will guide you to a high quality submission. -8. **Merge** your contribution into the Yellowbrick codebase. +3. **Fork** the repository into your own GitHub account. +4. Create a **Pull Request** first thing to [connect with us](https://github.com/DistrictDataLabs/yellowbrick/pulls) about your task. +5. **Code** the feature, write the documentation, add your contribution. +6. **Review** the code with core contributors who will guide you to a high quality submission. +7. **Merge** your contribution into the Yellowbrick codebase. -**Note**: Create a pull request as soon as possible, even before you've started coding. This will allow the core contributors to give you advice about where to add your code or utilities and discuss other style choices and implementation details as you go. Don't wait! 
+We believe that *contribution is collaboration* and therefore emphasize *communication* throughout the open source process. We rely heavily on GitHub's social coding tools to allow us to do this. For instance, we use GitHub's [milestone](https://help.github.com/en/articles/about-milestones) feature to focus our development efforts for each Yellowbrick semester, so be sure to check out the issues associated with our [current milestone](https://github.com/districtdatalabs/yellowbrick/milestones)! -We believe that *contribution is collaboration* and therefore emphasize *communication* throughout the open source process. We rely heavily on GitHub's social coding tools to allow us to do this. +Once you have a good sense of how you are going to implement the new feature (or fix the bug!), you can reach out for feedback from the maintainers by creating a [pull request](https://github.com/DistrictDataLabs/yellowbrick/pulls). Please note that if we feel your solution has not been thought out in earnest, or if the PR is not aligned with our [current milestone](https://github.com/districtdatalabs/yellowbrick/milestones) goals, we may reach out to ask that you close the PR so that we can prioritize reviewing the most critical feature requests and bug fixes. Ideally, any pull request should be capable of resolution within 6 weeks of being opened. This timeline helps to keep our pull request queue small and allows Yellowbrick to maintain a robust release schedule to give our users the best experience possible. However, the most important thing is to keep the dialogue going! And if you're unsure whether you can complete your idea within 6 weeks, you should still go ahead and open a PR and we will be happy to help you scope it down as needed. -If we have comments or questions when we evaluate your pull request and receive no response, we will also close the PR after this period of time. 
Please know that this does not mean we don't value your contribution, just that things go stale. If in the future you want to pick it back up, feel free to address our original feedback and to reference the original PR in a new pull request. +If we have comments or questions when we evaluate your pull request and receive no response, we will also close the PR after this period of time. Please know that this does not mean we don't value your contribution, just that things go stale. If in the future you want to pick it back up, feel free to address our original feedback and to reference the original PR in a new pull request. ### Forking the Repository @@ -104,7 +103,7 @@ Once forked, use the following steps to get your development environment set up $ git checkout develop ``` -At this point you're ready to get started writing code. If you're going to take on a specific task, we'd strongly encourage you to check out the issue on [Waffle](https://waffle.io/DistrictDataLabs/yellowbrick) and create a [pull request](https://github.com/DistrictDataLabs/yellowbrick/pulls) **before you start coding** to better foster communication with other contributors. +At this point you're ready to get started writing code! ### Branching Conventions @@ -144,7 +143,7 @@ $ git branch -d feature-myfeature $ git push origin --delete feature-myfeature ``` -Head back to Waffle and checkout another issue! +Head back to GitHub and check out another issue! ## Developing Visualizers @@ -159,12 +158,12 @@ There are two basic types of Visualizers: - **Feature Visualizers** are high dimensional data visualizations that are essentially transformers. - **Score Visualizers** wrap a scikit-learn regressor, classifier, or clusterer and visualize the behavior or performance of the model on test data.
-These two basic types of visualizers map well to the two basic objects in scikit-learn: +These two basic types of visualizers map well to the two basic estimator objects in scikit-learn: - **Transformers** take input data and return a new data set. -- **Estimators** are fit to training data and can make predictions. +- **Models** are fit to training data and can make predictions. -The scikit-learn API is object oriented, and estimators and transformers are initialized with parameters by instantiating their class. Hyperparameters can also be set using the `set_attrs()` method and retrieved with the corresponding `get_attrs()` method. All scikit-learn estimators have a `fit(X, y=None)` method that accepts a two dimensional data array, `X`, and optionally a vector `y` of target values. The `fit()` method trains the estimator, making it ready to transform data or make predictions. Transformers have an associated `transform(X)` method that returns a new dataset, `Xprime` and models have a `predict(X)` method that returns a vector of predictions, `yhat`. Models also have a `score(X, y)` method that evaluate the performance of the model. +The scikit-learn API is object oriented, and estimators are initialized with parameters by instantiating their class. Hyperparameters can also be set using the `set_params()` method and retrieved with the corresponding `get_params()` method. All scikit-learn estimators have a `fit(X, y=None)` method that accepts a two dimensional data array, `X`, and optionally a vector `y` of target values. The `fit()` method trains the estimator, making it ready to transform data or make predictions. Transformers have an associated `transform(X)` method that returns a new dataset, `Xprime` and models have a `predict(X)` method that returns a vector of predictions, `yhat`. Models may also have a `score(X, y)` method that evaluates the performance of the model.
Visualizers interact with scikit-learn objects by intersecting with them at the methods defined above. Specifically, visualizers perform actions related to `fit()`, `transform()`, `predict()`, and `score()` then call a `draw()` method which initializes the underlying figure associated with the visualizer. The user calls the visualizer's `poof()` method, which in turn calls a `finalize()` method on the visualizer to draw legends, titles, etc. and then `poof()` renders the figure. The Visualizer API is therefore: @@ -185,14 +184,13 @@ class MyVisualizer(Visualizer): super(MyVisualizer, self).__init__(ax, **kwargs) def fit(self, X, y=None): + super(MyVisualizer, self).fit(X, y) self.draw(X) return self def draw(self, X): - if self.ax is None: - self.ax = self.gca() - self.ax.plot(X) + return self.ax def finalize(self): self.set_title("My Visualizer") @@ -212,28 +210,23 @@ Score visualizers work on the same principle but accept an additional required ` The test package mirrors the `yellowbrick` package in structure and also contains several helper methods and base functionality. To add a test to your visualizer, find the corresponding file to add the test case, or create a new test file in the same place you added your code. -Visual tests are notoriously difficult to create --- how do you test a visualization or figure? Moreover, testing scikit-learn models with real data can consume a lot of memory. Therefore the primary test you should create is simply to test your visualizer from end to end and make sure that no exceptions occur. To assist with this, we have two primary helpers, `VisualTestCase` and `DatasetMixin`. Create your unit test as follows:: +Visual tests are notoriously difficult to create --- how do you test a visualization or figure? Moreover, testing scikit-learn models with real data can consume a lot of memory. Therefore the primary test you should create is simply to test your visualizer from end to end and make sure that no exceptions occur. 
To assist with this, we have a helper, `VisualTestCase`. Create your unit test as follows:: ```python import pytest +from yellowbrick.datasets import load_occupancy + from tests.base import VisualTestCase -from tests.dataset import DatasetMixin -class MyVisualizerTests(VisualTestCase, DatasetMixin): +class MyVisualizerTests(VisualTestCase): def test_my_visualizer(self): """ Test MyVisualizer on a real dataset """ - # Load the data from the fixture - dataset = self.load_data('occupancy') - - # Get the data - X = dataset[[ - "temperature", "relative_humidity", "light", "C02", "humidity" - ]] - y = dataset['occupancy'].astype(int) + # Load the data + X,y = load_occupancy() try: visualizer = MyVisualizer() @@ -255,7 +248,7 @@ You can also run your own test file as follows:: $ pytest tests/test_your_visualizer.py ``` -The Makefile uses the pytest runner and testing suite as well as the coverage library, so make sure you have those dependencies installed! The `DatasetMixin` also requires [requests.py](http://docs.python-requests.org/en/master/) to fetch data from our Amazon S3 account. +The Makefile uses the pytest runner and testing suite as well as the coverage library, so make sure you have those dependencies installed! **Note**: Advanced developers can use our _image comparison tests_ to assert that an image generated matches a baseline image. Read more about this in our [testing documentation](http://www.scikit-yb.org/en/latest/contributing.html#testing) @@ -303,8 +296,4 @@ class MyVisualizer(Visualizer): """ ``` -This is a very good start to producing a high quality visualizer, but unless it is part of the documentation on our website, it will not be visible. For details on including documentation in the `docs` directory see the [Contributing Documentation](http://www.scikit-yb.org/en/latest/contributing.html#documentation) section in the larger contributing guide. 
- -## Throughput - -[![Throughput Graph](https://graphs.waffle.io/DistrictDataLabs/yellowbrick/throughput.svg)](https://waffle.io/DistrictDataLabs/yellowbrick/metrics/throughput) +This is a very good start to producing a high quality visualizer, but unless it is part of the documentation on our website, it will not be visible. For details on including documentation in the `docs` directory see the [Contributing Documentation](http://www.scikit-yb.org/en/latest/contributing.html#documentation) section in the larger contributing guide. \ No newline at end of file diff --git a/DESCRIPTION.md b/DESCRIPTION.md new file mode 100644 index 000000000..7a862aea7 --- /dev/null +++ b/DESCRIPTION.md @@ -0,0 +1,70 @@ +# Yellowbrick + +[![Visualizers](docs/images/readme/banner.png)](https://www.scikit-yb.org/) + +Yellowbrick is a suite of visual analysis and diagnostic tools designed to facilitate machine learning with scikit-learn. The library implements a new core API object, the `Visualizer` that is a scikit-learn estimator — an object that learns from data. Similar to transformers or models, visualizers learn from data by creating a visual representation of the model selection workflow. + +Visualizers allow users to steer the model selection process, building intuition around feature engineering, algorithm selection and hyperparameter tuning. For instance, they can help diagnose common problems surrounding model complexity and bias, heteroscedasticity, underfit and overtraining, or class balance issues. By applying visualizers to the model selection workflow, Yellowbrick allows you to steer predictive models toward more successful results, faster. + +The full documentation can be found at [scikit-yb.org](https://scikit-yb.org/) and includes a [Quick Start Guide](https://www.scikit-yb.org/en/latest/quickstart.html) for new users.
+ +## Visualizers + +Visualizers are estimators — objects that learn from data — whose primary objective is to create visualizations that allow insight into the model selection process. In scikit-learn terms, they can be similar to transformers when visualizing the data space or wrap a model estimator similar to how the `ModelCV` (e.g. [`RidgeCV`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html), [`LassoCV`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html)) methods work. The primary goal of Yellowbrick is to create a sensical API similar to scikit-learn. Some of our most popular visualizers include: + +### Classification Visualization + +- **Classification Report**: a visual classification report that displays a model's precision, recall, and F1 per-class scores as a heatmap +- **Confusion Matrix**: a heatmap view of the confusion matrix of pairs of classes in multi-class classification +- **Discrimination Threshold**: a visualization of the precision, recall, F1-score, and queue rate with respect to the discrimination threshold of a binary classifier +- **Precision-Recall Curve**: plot the precision vs recall scores for different probability thresholds +- **ROCAUC**: graph the receiver operator characteristic (ROC) and area under the curve (AUC) + +### Clustering Visualization + +- **Intercluster Distance Maps**: visualize the relative distance and size of clusters +- **KElbow Visualizer**: visualize cluster according to the specified scoring function, looking for the "elbow" in the curve. 
+- **Silhouette Visualizer**: select `k` by visualizing the silhouette coefficient scores of each cluster in a single model + +### Feature Visualization + +- **Manifold Visualization**: high-dimensional visualization with manifold learning +- **Parallel Coordinates**: horizontal visualization of instances +- **PCA Projection**: projection of instances based on principal components +- **RadViz Visualizer**: separation of instances around a circular plot +- **Rank Features**: single or pairwise ranking of features to detect relationships + +### Model Selection Visualization + +- **Cross Validation Scores**: display the cross-validated scores as a bar chart with the average score plotted as a horizontal line +- **Feature Importances**: rank features based on their in-model performance +- **Learning Curve**: show if a model might benefit from more data or less complexity +- **Recursive Feature Elimination**: find the best subset of features based on importance +- **Validation Curve**: tune a model with respect to a single hyperparameter + +### Regression Visualization + +- **Alpha Selection**: show how the choice of alpha influences regularization +- **Cook's Distance**: show the influence of instances on linear regression +- **Prediction Error Plots**: find model breakdowns along the domain of the target +- **Residuals Plot**: show the difference in residuals of training and test data + +### Target Visualization + +- **Balanced Binning Reference**: generate a histogram with vertical lines showing the recommended value points at which to bin the data into evenly distributed bins +- **Class Balance**: show the relationship of the support for each class in both the training and test data by displaying, as a bar graph, how frequently each class is represented in the dataset +- **Feature Correlation**: visualize the correlation between the independent variables and the target + +### Text Visualization + +- **Dispersion Plot**: visualize how key terms
are dispersed throughout a corpus +- **PosTag Visualizer**: plot the counts of different parts-of-speech throughout a tagged corpus +- **Token Frequency Distribution**: visualize the frequency distribution of terms in the corpus +- **t-SNE Corpus Visualization**: use stochastic neighbor embedding to project documents +- **UMAP Corpus Visualization**: plot similar documents closer together to discover clusters + +... and more! Yellowbrick is adding new visualizers all the time so be sure to check out our [examples gallery](https://www.scikit-yb.org/en/latest/api/index.html) — or even the [develop](https://github.com/districtdatalabs/yellowbrick/tree/develop) branch — and feel free to contribute your ideas for new Visualizers! + +## Affiliations +[![District Data Labs](docs/images/readme/affiliates_ddl.png)](https://www.districtdatalabs.com/) [![NumFOCUS Affiliated Project](docs/images/readme/affiliates_numfocus.png)](https://numfocus.org/) + diff --git a/DESCRIPTION.rst b/DESCRIPTION.rst deleted file mode 100644 index 1b42fad27..000000000 --- a/DESCRIPTION.rst +++ /dev/null @@ -1,84 +0,0 @@ -.. -*- mode: rst -*- - -|Visualizers|_ - -.. |Visualizers| image:: http://www.scikit-yb.org/en/latest/_images/visualizers.png - :width: 800 px -.. _Visualizers: http://www.scikit-yb.org/ - -Yellowbrick -=========== - -Yellowbrick is a suite of visual analysis and diagnostic tools designed to facilitate machine learning with Scikit-Learn. The library implements a new core API object, the "Visualizer" that is an Scikit-Learn estimator: an object that learns from data. Like transformers or models, visualizers learn from data by creating a visual representation of the model selection workflow. - -Visualizers allow users to steer the model selection process, building intuition around feature engineering, algorithm selection, and hyperparameter tuning.
For example, visualizers can help diagnose common problems surrounding model complexity and bias, heteroscedasticity, underfit and overtraining, or class balance issues. By applying visualizers to the model selection workflow, Yellowbrick allows you to steer predictive models to more successful results, faster. - -Please see the full documentation at: http://scikit-yb.org/ particularly the `quick start guide `_ - -Visualizers ------------ - -Visualizers are estimators (objects that learn from data) whose primary objective is to create visualizations that allow insight into the model selection process. In Scikit-Learn terms, they can be similar to transformers when visualizing the data space or wrap an model estimator similar to how the “ModelCV” (e.g. RidgeCV_, LassoCV_) methods work. The primary goal of Yellowbrick is to create a sensical API similar to Scikit-Learn. Some of our most popular visualizers include: - -.. _RidgeCV: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html -.. 
_LassoCV: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html - -Feature Visualization -~~~~~~~~~~~~~~~~~~~~~ - -- **Rank Features**: single or pairwise ranking of features to detect relationships -- **Parallel Coordinates**: horizontal visualization of instances -- **Radial Visualization**: separation of instances around a circular plot -- **PCA Projection**: projection of instances based on principal components -- **Manifold Visualization**: high dimensional visualization with manifold learning -- **Feature Importances**: rank features based on their in-model performance -- **Recursive Feature Elimination**: find the best subset of features by importance -- **Joint Plots**: direct data visualization with feature selection - -Classification Visualization -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- **Class Balance**: see how the distribution of classes affects the model -- **Class Prediction Error**: shows error and support in classification -- **Classification Report**: visual representation of precision, recall, and F1 -- **ROC/AUC Curves**: receiver operator characteristics and area under the curve -- **Precision-Recall Curves**: precision vs recall for different probability thresholds -- **Confusion Matrices**: visual description of class decision making -- **Discrimination Threshold**: find a threshold that best separates binary classes - -Regression Visualization -~~~~~~~~~~~~~~~~~~~~~~~~ - -- **Prediction Error Plots**: find model breakdowns along the domain of the target -- **Residuals Plot**: show the difference in residuals of training and test data -- **Alpha Selection**: show how the choice of alpha influences regularization - -Clustering Visualization -~~~~~~~~~~~~~~~~~~~~~~~~ - -- **K-Elbow Plot**: select k using the elbow method and various metrics -- **Silhouette Plot**: select k by visualizing silhouette coefficient values -- **Intercluster Distance Maps**: show relative distance and size of clusters - -Model Selection 
Visualization -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- **Validation Curve**: tune a model with respect to a single hyperparameter -- **Learning Curve**: show if a model might benefit from more data or less complexity - -Text Visualization -~~~~~~~~~~~~~~~~~~ - -- **Term Frequency**: visualize the frequency distribution of terms in the corpus -- **t-SNE Corpus Visualization**: use stochastic neighbor embedding to project documents -- **Dispersion Plot**: visualize how key terms are dispersed throughout a corpus - -Target Visualization -~~~~~~~~~~~~~~~~~~~~ - -- **Feature Correlation**: visualize the correlation between the dependent variables and the target - -... and more! Visualizers are being added all the time; be sure to check the examples_ (or even the develop_ branch) and feel free to contribute your ideas for new Visualizers! - -.. _examples: http://www.scikit-yb.org/en/latest/api/index.html -.. _develop: https://github.com/districtdatalabs/yellowbrick/tree/develop diff --git a/MANIFEST.in b/MANIFEST.in index 1dbdb8797..e3ac99581 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,11 +4,24 @@ include *.txt include *.yml include *.cfg include Makefile -recursive-include docs *.rst -recursive-include docs *.jpg -recursive-include docs *.png -recursive-include docs *.py -recursive-include docs Makefile -recursive-include tests *.py -recursive-include examples *.py -recursive-include examples *.ipynb +include MANIFEST.in + +include examples/*.ipynb +include examples/*.md + +graft docs +prune docs/_build + +graft tests +prune tests/fixtures +prune tests/actual_images + +graft yellowbrick +prune yellowbrick/datasets/fixtures + +global-exclude __pycache__ +global-exclude *.py[co] +global-exclude .ipynb_checkpoints +global-exclude .DS_Store +global-exclude .env +global-exclude .coverage.* \ No newline at end of file diff --git a/README.md b/README.md index 6c43e72ce..2cf88fdbe 100644 --- a/README.md +++ b/README.md @@ -5,83 +5,26 @@ [![Coverage 
Status](https://coveralls.io/repos/github/DistrictDataLabs/yellowbrick/badge.svg?branch=master)](https://coveralls.io/github/DistrictDataLabs/yellowbrick?branch=master) [![Total Alerts](https://img.shields.io/lgtm/alerts/g/DistrictDataLabs/yellowbrick.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/DistrictDataLabs/yellowbrick/alerts/) [![Language Grade: Python](https://img.shields.io/lgtm/grade/python/g/DistrictDataLabs/yellowbrick.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/DistrictDataLabs/yellowbrick/context:python) - [![PyPI version](https://badge.fury.io/py/yellowbrick.svg)](https://badge.fury.io/py/yellowbrick) [![Documentation Status](https://readthedocs.org/projects/yellowbrick/badge/?version=latest)](http://yellowbrick.readthedocs.io/en/latest/?badge=latest) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1206239.svg)](https://doi.org/10.5281/zenodo.1206239) +[![JOSS](http://joss.theoj.org/papers/10.21105/joss.01075/status.svg)](https://doi.org/10.21105/joss.01075) [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/DistrictDataLabs/yellowbrick/develop?filepath=examples%2Fexamples.ipynb) **Visual analysis and diagnostic tools to facilitate machine learning model selection.** -![Follow the yellow brick road](docs/images/yellowbrickroad.jpg) -Image by [Quatro Cinco](https://flic.kr/p/2Yj9mj), used with permission, Flickr Creative Commons. - -This README is a guide for developers, if you're new to Yellowbrick, get started at our [documentation](http://www.scikit-yb.org/). +[![Banner](docs/images/readme/banner.png)](https://www.scikit-yb.org/en/latest/gallery.html) ## What is Yellowbrick? -Yellowbrick is a suite of visual diagnostic tools called "Visualizers" that extend the scikit-learn API to allow human steering of the model selection process. In a nutshell, Yellowbrick combines scikit-learn with matplotlib in the best tradition of the scikit-learn documentation, but to produce visualizations for _your_ models! 
- -![Visualizers](docs/images/visualizers.png) - -### Visualizers - -Visualizers are estimators (objects that learn from data) whose primary objective is to create visualizations that allow insight into the model selection process. In scikit-learn terms, they can be similar to transformers when visualizing the data space or wrapping a model estimator similar to how the "ModelCV" (e.g. RidgeCV, LassoCV) methods work. The primary goal of Yellowbrick is to create a sensical API similar to scikit-learn. Some of our most popular visualizers include: - -#### Feature Visualization - -- **Rank Features**: single or pairwise ranking of features to detect relationships -- **Parallel Coordinates**: horizontal visualization of instances -- **Radial Visualization**: separation of instances around a circular plot -- **PCA Projection**: projection of instances based on principal components -- **Manifold Visualization**: high dimensional visualization with manifold learning -- **Feature Importances**: rank features based on their in-model performance -- **Recursive Feature Elimination**: find the best subset of features by importance -- **Joint Plots**: direct data visualization with feature selection - -#### Classification Visualization - -- **Class Balance**: see how the distribution of classes affects the model -- **Class Prediction Error**: shows error and support in classification -- **Classification Report**: visual representation of precision, recall, and F1 -- **ROC/AUC Curves**: receiver operator characteristics and area under the curve -- **Precision-Recall Curves**: precision vs recall for different probability thresholds -- **Confusion Matrices**: visual description of class decision making -- **Discrimination Threshold**: find a threshold that best separates binary classes - -#### Regression Visualization - -- **Prediction Error Plots**: find model breakdowns along the domain of the target -- **Residuals Plot**: show the difference in residuals of training and test 
data -- **Alpha Selection**: show how the choice of alpha influences regularization - -#### Clustering Visualization - -- **K-Elbow Plot**: select k using the elbow method and various metrics -- **Silhouette Plot**: select k by visualizing silhouette coefficient values -- **Intercluster Distance Maps**: show relative distance and size of clusters - -#### Model Selection Visualization - -- **Validation Curve**: tune a model with respect to a single hyperparameter -- **Learning Curve**: show if a model might benefit from more data or less complexity +Yellowbrick is a suite of visual diagnostic tools called "Visualizers" that extend the scikit-learn API to allow human steering of the model selection process. In a nutshell, Yellowbrick combines scikit-learn with matplotlib in the best tradition of the scikit-learn documentation, but to produce visualizations for _your_ machine learning workflow! -#### Text Visualization - -- **Term Frequency**: visualize the frequency distribution of terms in the corpus -- **t-SNE Corpus Visualization**: use stochastic neighbor embedding to project documents. -- **Dispersion Plot**: visualize how key terms are dispersed throughout a corpus - -#### Target Visualization - -- **Feature Correlation**: visualize the correlation between the dependent variables and the target - -And more! Visualizers are being added all the time, so be sure to check the examples (or even the develop branch) and feel free to contribute your ideas for Visualizers! +For complete documentation on the Yellowbrick API, a gallery of available visualizers, the contributor's guide, tutorials and teaching resources, frequently asked questions, and more, please visit our documentation at [www.scikit-yb.org](https://www.scikit-yb.org/). ## Installing Yellowbrick -Yellowbrick is compatible with Python 2.7 or later but it is preferred to use Python 3.5 or later to take full advantage of all functionality. 
Yellowbrick also depends on scikit-learn 0.18 or later and matplotlib 1.5 or later. The simplest way to install Yellowbrick is from PyPI with pip, Python's preferred package installer. +Yellowbrick is compatible with Python 3.4 or later and also depends on scikit-learn and matplotlib. The simplest way to install Yellowbrick and its dependencies is from PyPI with pip, Python's preferred package installer. $ pip install yellowbrick @@ -95,8 +38,6 @@ If you're using Anaconda (recommended for Windows users), you can take advantage conda install -c districtdatalabs yellowbrick -Note, however, that there is a [known bug](https://github.com/DistrictDataLabs/yellowbrick/issues/205) installing Yellowbrick on Linux with Anaconda. - ## Using Yellowbrick The Yellowbrick API is specifically designed to play nicely with scikit-learn. Here is an example of a typical workflow sequence with scikit-learn and Yellowbrick: @@ -108,10 +49,12 @@ In this example, we see how Rank2D performs pairwise comparisons of each feature ```python from yellowbrick.features import Rank2D -visualizer = Rank2D(features=features, algorithm='covariance') +visualizer = Rank2D( + features=features, algorithm='covariance' +) visualizer.fit(X, y) # Fit the data to the visualizer visualizer.transform(X) # Transform the data -visualizer.poof() # Draw/show/poof the data +visualizer.poof() # Show the data ``` ### Model Visualization @@ -129,23 +72,19 @@ visualizer.score(X,y) visualizer.poof() ``` -For additional information on getting started with Yellowbrick, check out our [examples notebook](https://github.com/DistrictDataLabs/yellowbrick/blob/develop/examples/examples.ipynb). - -We also have a [quick start guide](https://github.com/DistrictDataLabs/yellowbrick/blob/master/docs/quickstart.rst). 
+For additional information on getting started with Yellowbrick, view the quickstart guide in the [documentation](https://www.scikit-yb.org/en/latest/) and check out our [examples notebook](https://github.com/DistrictDataLabs/yellowbrick/blob/develop/examples/examples.ipynb). ## Contributing to Yellowbrick Yellowbrick is an open source project that is supported by a community who will gratefully and humbly accept any contributions you might make to the project. Large or small, any contribution makes a big difference; and if you've never contributed to an open source project before, we hope you will start with Yellowbrick! -Principally, Yellowbrick development is about the addition and creation of *visualizers* -- objects that learn from data and create a visual representation of the data or model. Visualizers integrate with scikit-learn estimators, transformers, and pipelines for specific purposes and as a result can be simple to build and deploy. The most common contribution is therefore a new visualizer for a specific model or model family. We'll discuss in detail how to build visualizers later. - -Beyond creating visualizers, there are many ways to contribute: +If you are interested in contributing, check out our [contributor's guide](https://www.scikit-yb.org/en/latest/contributing.html). Beyond creating visualizers, there are many ways to contribute: - Submit a bug report or feature request on [GitHub Issues](https://github.com/DistrictDataLabs/yellowbrick/issues). - Contribute a Jupyter notebook to our examples[ gallery](https://github.com/DistrictDataLabs/yellowbrick/tree/develop/examples). -- Assist us with [user testing](http://www.scikit-yb.org/en/latest/evaluation.html). -- Add to the documentation or help with our website, [scikit-yb.org](http://www.scikit-yb.org). -- Write unit or integration tests for our project. +- Assist us with [user testing](https://www.scikit-yb.org/en/latest/evaluation.html). 
+- Add to the documentation or help with our website, [scikit-yb.org](https://www.scikit-yb.org). +- [Write unit or integration tests](https://www.scikit-yb.org/en/latest/contributing.html#testing) for our project. - Answer questions on our issues, mailing list, Stack Overflow, and elsewhere. - Translate our documentation into another language. - Write a blog post, tweet, or share our project with others. @@ -153,30 +92,28 @@ Beyond creating visualizers, there are many ways to contribute: As you can see, there are lots of ways to get involved and we would be very happy for you to join us! The only thing we ask is that you abide by the principles of openness, respect, and consideration of others as described in the [Python Software Foundation Code of Conduct](https://www.python.org/psf/codeofconduct/). -For more information, checkout the `CONTRIBUTING.md` file in the root of the repository or the detailed documentation at [Contributing to Yellowbrick](http://www.scikit-yb.org/en/latest/contributing.html) +For more information, check out the `CONTRIBUTING.md` file in the root of the repository or the detailed documentation at [Contributing to Yellowbrick](https://www.scikit-yb.org/en/latest/contributing.html) -## Development Scripts +## Yellowbrick Datasets -Yellowbrick contains scripts to help with development, including downloading fixture data for tests and managing images for comparison. +Yellowbrick gives easy access to several datasets that are used for the examples in the documentation and testing. These datasets are hosted in our CDN and must be downloaded for use. Typically, when a user calls one of the data loader functions, e.g. `load_bikeshare()`, the data is automatically downloaded if it's not already on the user's computer. However, for development and testing, or if you know you will be working without internet access, it might be easier to simply download all the data at once. 
-### Images +The data downloader script can be run as follows: -The image comparison helper script manages the test directory's `baseline_images` folder by copying files from the `actual_images` folder to setup baselines. To use this script, first run the tests (which will cause image not found errors) then copy the images into baseline as follows: + $ python -m yellowbrick.download -``` -$ python -m tests.images tests/test_visualizer.py -``` +This will download the data to the fixtures directory inside of the Yellowbrick site packages. You can specify the location of the download either as an argument to the downloader script (use `--help` for more details) or by setting the `$YELLOWBRICK_DATA` environment variable. This is the preferred mechanism because this will also influence how data is loaded in Yellowbrick. -Where `tests/test_visualizer.py` is the test file that contains the image comparison tests. All related tests will be discovered, validated, and copied to the baseline directory. To clear out images from both actual and baseline to reset tests, use the `-C` flag: +_Note: Developers who have downloaded data from Yellowbrick versions earlier than v1.0 may experience some problems with the older data format. If this occurs, you can clear out your data cache as follows:_ -``` -$ python -m tests.images -C tests/test_visualizer.py -``` + $ python -m yellowbrick.download --cleanup -Glob syntax can be used to move multiple files. For example to reset all the classifier tests: +_This will remove old datasets and download the new ones. You can also use the `--no-download` flag to simply clear the cache without re-downloading data. Users who are having difficulty with datasets can also use this or they can uninstall and reinstall Yellowbrick using `pip`._ -``` -$ python -m tests.images tests/test_classifier/* -``` +## Citing Yellowbrick + +We would be glad if you used Yellowbrick in your scientific publications! 
If you do, please cite us using the [citation guidelines](https://www.scikit-yb.org/en/latest/about.html#citing-yellowbrick). + +## Affiliations -Though it is recommended that specific test cases are targeted, rather than updating entire directories. +[![District Data Labs](docs/images/readme/affiliates_ddl.png)](https://districtdatalabs.com/) [![NumFOCUS Affiliated Project](docs/images/readme/affiliates_numfocus.png)](https://numfocus.org) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000..7de9bbe02 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,87 @@ +# Yellowbrick Documentation + +*Welcome to the Yellowbrick docs!* + +If you're looking for information about how to use Yellowbrick, for our contributor's guide, for examples and teaching resources, for answers to frequently asked questions, and more, please visit the latest version of our documentation at [www.scikit-yb.org](https://www.scikit-yb.org/). + +## Building the Docs + +To build the documents locally, first install the documentation-specific requirements with `pip` using the `requirements.txt` file in the `docs` directory: + +```bash +$ pip install -r docs/requirements.txt +``` + +You will then be able to build the documentation from inside the `docs` directory by running `make html`; the documentation will be built and rendered in the `_build/html` directory. You can view it by opening `_build/html/index.html` then navigating to your documentation in the browser. + +## reStructuredText + +Yellowbrick uses [Sphinx](http://www.sphinx-doc.org/en/master/index.html) to build our documentation. The advantages of using Sphinx are many; we can more directly link to the documentation and source code of other projects like Matplotlib and scikit-learn using [intersphinx](http://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html). 
In addition, docstrings used to describe Yellowbrick visualizers can be automatically included when the documentation is built via [autodoc](http://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html#sphinx.ext.autodoc). + +To take advantage of these features, our documentation must be written in reStructuredText (or "rst"). reStructuredText is similar to markdown, but not identical, and does take some getting used to. For instance, styling for things like codeblocks, external hyperlinks, internal cross references, notes, and fixed-width text are all unique in rst. + +If you would like to contribute to our documentation and do not have prior experience with rst, we recommend you make use of these resources: + +- [A reStructuredText Primer](http://docutils.sourceforge.net/docs/user/rst/quickstart.html) +- [rst notes and cheatsheet](https://cheat.readthedocs.io/en/latest/rst.html) +- [Using the plot directive](https://matplotlib.org/devel/plot_directive.html) + +## Adding New Visualizers to the Docs + +If you are adding a new visualizer to the docs, there are quite a few examples in the documentation on which you can base your files of similar types. + +The primary format for the API section is as follows: + +``` +.. -*- mode: rst -*- + +My Visualizer +============= + +A brief introduction to my visualizer and how it is useful in the machine learning process. + +.. plot:: + :context: close-figs + :include-source: False + :alt: Example using MyVisualizer + + visualizer = MyVisualizer(LinearRegression()) + + visualizer.fit(X, y) + g = visualizer.poof() + +Discussion about my visualizer and some interpretation of the above plot. + + +API Reference +------------- + +.. 
automodule:: yellowbrick.regressor.mymodule + :members: MyVisualizer + :undoc-members: + :show-inheritance: +``` + +This is a pretty good structure for a documentation page; a brief introduction followed by a code example with a visualization included using [the plot directive](https://matplotlib.org/devel/plot_directive.html). This will render the `MyVisualizer` image in the document along with links for the complete source code, the png, and the pdf versions of the image. It will also have the "alt-text" (for screen-readers) and will not display the source because of the `:include-source:` option. If `:include-source:` is omitted, the source will also be included. + +The primary section is wrapped up with a discussion about how to interpret the visualizer and use it in practice. Finally the `API Reference` section will use `automodule` to include the documentation from your docstring. + +There are several other places where you can list your visualizer, but to ensure it is included in the documentation it *must be listed in the TOC of the local index*. Find the `index.rst` file in your subdirectory and add your rst file (without the `.rst` extension) to the `.. toctree::` directive. This will ensure your documentation is included when it is built. + +## Generating the Gallery + +In v1.0, we have adopted Matplotlib's [plot directive](https://matplotlib.org/devel/plot_directive.html) which means that the majority of the images generated for the documentation are generated automatically. One exception is the gallery; the images for the gallery must still be generated manually. + +If you have contributed a new visualizer as described in the above section, please also add it to the gallery, both to `docs/gallery.py` and to `docs/gallery.rst`. (Make sure you have already installed Yellowbrick in editable mode, from the top level directory: `pip install -e .`) + +If you want to regenerate a single image (e.g. 
the elbow curve plot), you can do so as follows: + +```bash +$ python docs/gallery.py elbow +``` + +If you want to regenerate them all (note: this takes a long time!) + +```bash +$ python docs/gallery.py all +``` diff --git a/docs/_static/theme_overrides.css b/docs/_static/theme_overrides.css index 63ee6cc74..88c13bfd2 100644 --- a/docs/_static/theme_overrides.css +++ b/docs/_static/theme_overrides.css @@ -11,3 +11,13 @@ overflow: visible !important; } } + +/* Add padding to the parameters list on RTD theme */ +dl.simple dt { + padding-left: 4px; +} + +dl.simple dt span.classifier { + padding-left: 6px; + font-weight: normal; +} \ No newline at end of file diff --git a/docs/about.rst b/docs/about.rst index d11fc67e7..9ee452210 100644 --- a/docs/about.rst +++ b/docs/about.rst @@ -45,33 +45,75 @@ From Wikipedia_: Team ---- -Yellowbrick is developed by data scientists who believe in open source and the project enjoys contributions from Python developers all over the world. The project was started by `@rebeccabilbro`_ and `@bbengfort`_ as an attempt to better explain machine learning concepts to their students; they quickly realized, however, that the potential for visual steering could have a large impact on practical data science and developed it into a high-level Python library. +Yellowbrick is developed by volunteer data scientists who believe in open source and the project enjoys contributions from Python developers all over the world. The project was started by `@rebeccabilbro`_ and `@bbengfort`_ as an attempt to better explain machine learning concepts to their students at Georgetown University where they teach a data science certificate program. They quickly realized, however, that the potential for visual steering could have a large impact on practical data science and developed it into a production-ready Python library. -Yellowbrick is incubated by `District Data Labs`_, an organization that is dedicated to collaboration and open source development. 
As part of District Data Labs, Yellowbrick was first introduced to the Python Community at `PyCon 2016 `_ in both talks and during the development sprints. The project was then carried on through DDL Research Labs (semester-long sprints where members of the DDL community contribute to various data-related projects). +Yellowbrick was then incubated by District Data Labs (DDL) in partnership with Georgetown University. District Data Labs is an organization that is dedicated to open source development and data science education and provided resources to help Yellowbrick grow. Yellowbrick was first introduced to the Python Community at `PyCon 2016 `_ in both talks and during the development sprints. The project was then carried on through DDL Research Labs -- semester-long sprints where members of the DDL community contribute to various data-related projects. + +Since then, Yellowbrick has enjoyed the participation of a large number of contributors from around the world and growing support in the PyData community. Yellowbrick has been featured in talks at PyData, Scipy, NumFOCUS, and PSF organized events as well as blog posts and Kaggle competitions. We are so thrilled to have such a dedicated community involved in active contributions both large and small. For a full list of current maintainers and core contributors, please see `MAINTAINERS.md `_ in the root of our GitHub repository. Thank you so much to everyone who has `contributed to Yellowbrick `_! +Affiliations +------------ + +Yellowbrick is proud to be affiliated with several organizations that provide institutional support to the project. Such support is sometimes financial, often material, and always in the spirit of free and open source software. We can't thank them enough for their role in making Yellowbrick what it is today. + +`District Data Labs`_: District Data Labs incubated Yellowbrick and sponsors research labs by purchasing food and organizing events. 
Research labs are semester long sprints that allow Yellowbrick contributors to meet in person, share a meal, and hack on the project. DDL also sponsors travel to PyCon and PyData conferences for Yellowbrick maintainers and helps us buy promotional material such as stickers and t-shirts. + +`NumFOCUS`_: Yellowbrick is a NumFOCUS affiliated project (not a fiscally sponsored project). Our relationship with NumFOCUS has given us a lot of data science cred in the community by being listed on their website. We are also eligible to apply for small development grants and infrastructure support. We often participate in the project developers mailing list and other activities such as Google Summer of Code. + +`Georgetown University`_: Georgetown primarily provides space for Yellowbrick events including the research labs. Additionally, Georgetown Data Science Certificate students are introduced to Yellowbrick at the beginning of their machine learning education and we often perform user testing of new features on them! + +How to Support Yellowbrick +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Yellowbrick is developed by volunteers who work on the project in their spare time and not as part of their regular full-time work. If Yellowbrick has become critical to the success of your organization, please consider giving back to Yellowbrick. + + "... open source thrives on human rather than financial resources. There + are many ways to grow human resources, such as distributing the + workload among more contributors or encouraging companies to + make open source part of their employees’ work. An effective + support strategy must include multiple ways to generate time and + resources besides directly financing development. It must start from + the principle that the open source approach is not inherently flawed, + but rather under-resourced." + + -- `Roads and Bridges: The Unseen Labor Behind our Digital Infrastructure `_ + +The main thing that the Yellowbrick maintainers need is *time*. 
There are many ways to provide that time through non-financial mechanisms such as: + +- Create a written policy in your company handbook that dedicates time for your employees to contribute to open source projects like Yellowbrick. +- Interact with our community giving encouragement and advice, particularly for long term planning and non-code related activities like design and documentation. +- Advocate and evangelize your use of Yellowbrick and other open source software through blog posts and social media. +- Consider long term support strategies rather than ad hoc or one-off actions. +- Teach your students Machine Learning with Yellowbrick. + +More concrete and financial support is also welcome, particularly if it's directed through a specific effort. If you are interested in this kind of support consider: + +- Making a donation to NumFOCUS on behalf of Yellowbrick. +- Engaging District Data Labs for corporate training on visual machine learning with Yellowbrick (which will directly support Yellowbrick maintainers). +- Supporting your employee's continuing professional education in the Georgetown Data Science Certificate. +- Providing long term support for fixed costs such as hosting. + +Yellowbrick's mission is to enhance the machine learning workflow through open source visual steering and diagnostics. If you're interested in a more formal affiliate relationship to support this mission, please get in contact with us directly. + License ------- Yellowbrick is an open source project and its `license `_ is an implementation of the FOSS `Apache 2.0 `_ license by the Apache Software Foundation. `In plain English `_ this means that you can use Yellowbrick for commercial purposes, modify and distribute the source code, and even sublicense it. We want you to use Yellowbrick, profit from it, and contribute back if you do cool things with it. -There are, however, a couple of requirements that we ask from you. 
First, when you copy or distribute Yellowbrick source code, please include our copyright and license found in the `LICENSE.txt `_ at the root of our software repository. In addition, if we create a file called "NOTICE" in our project you must also include that in your source distribution. The "NOTICE" file will include attribution and thanks to those who have worked so hard on the project! Finally you can't hold District Data Labs or any Yellowbrick contributor liable for your use of our software, nor use any of our names, trademarks, or logos. +There are, however, a couple of requirements that we ask from you. First, when you copy or distribute Yellowbrick source code, please include our copyright and license found in the `LICENSE.txt `_ at the root of our software repository. In addition, if we create a file called "NOTICE" in our project you must also include that in your source distribution. The "NOTICE" file will include attribution and thanks to those who have worked so hard on the project! Note that you may not use our names, trademarks, or logos to promote your work or in any other way than to reference Yellowbrick. Finally, we provide Yellowbrick with no warranty and you can't hold any Yellowbrick contributor or affiliate liable for your use of our software. We think that's a pretty fair deal, and we're big believers in open source. If you make any changes to our software, use it commercially or academically, or have any other interest, we'd love to hear about it. - -.. _SIGMOD: http://cseweb.ucsd.edu/~arunkk/vision/SIGMODRecord15.pdf -.. _Wikipedia: https://en.wikipedia.org/wiki/Yellow_brick_road -.. _`@rebeccabilbro`: https://github.com/rebeccabilbro -.. _`@bbengfort`: https://github.com/bbengfort -.. _`District Data Labs`: http://www.districtdatalabs.com/ - Presentations ------------- Yellowbrick has enjoyed the spotlight in several presentations at recent conferences. 
We hope that these notebooks, talks, and slides will help you understand Yellowbrick a bit better. +Papers: + - `Yellowbrick: Visualizing the Scikit-Learn Model Selection Process `_ + Conference Presentations (videos): - `Visual Diagnostics for More Informed Machine Learning: Within and Beyond Scikit-Learn (PyCon 2016) `_ - `Yellowbrick: Steering Machine Learning with Visual Transformers (PyData London 2017) `_ @@ -85,9 +127,6 @@ Slides: - `Visualizing Model Selection with Scikit-Yellowbrick `_ - `Visual Pipelines for Text Analysis (Data Intelligence 2017) `_ -.. _QuatroCinco: https://flic.kr/p/2Yj9mj -.. _API: http://scikit-learn.org/stable/modules/classes.html - Citing Yellowbrick ------------------ @@ -95,6 +134,9 @@ Citing Yellowbrick .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.1206239.svg :target: https://doi.org/10.5281/zenodo.1206239 +.. image:: http://joss.theoj.org/papers/10.21105/joss.01075/status.svg + :target: https://doi.org/10.21105/joss.01075 + We hope that Yellowbrick facilitates machine learning of all kinds and we're particularly fond of academic work and research. If you're writing a scientific publication that uses Yellowbrick you can cite *Bengfort et al. (2018)* with the following BibTex: .. 
code-block:: bibtex @@ -112,18 +154,33 @@ We hope that Yellowbrick facilitates machine learning of all kinds and we're par performance, stability, and predictive value of machine learning models, and assist in diagnosing problems throughout the machine learning workflow.}, - version = {0.6}, - author = {Bengfort, Benjamin and Danielsen, Nathan and - Bilbro, Rebecca and Gray, Larry and {McIntyre}, Kristen and - Richardson, George and Miller, Taylor and Mayfield, Gary and - Schafer, Phillip and Keung, Jason}, - date = {2018-03-17}, + version = {0.9.1}, + author = {Bengfort, Benjamin and Bilbro, Rebecca and Danielsen, Nathan and + Gray, Larry and {McIntyre}, Kristen and Roman, Prema and Poh, Zijie and + others}, + date = {2018-11-14}, + year = {2018}, doi = {10.5281/zenodo.1206264} } You can also find DOI (digital object identifiers) for every version of Yellowbrick on `zenodo.org `_; use the BibTeX on this site to reference specific versions or changes made to the software. -We're also currently working on a scientific paper that describes Yellowbrick in the context of *steering the model selection process*. Stay tuned for a pre-release of this paper on arXiv. +We've also published a paper in the `Journal of Open Source Software (JOSS) `_ that discusses how Yellowbrick is designed to influence the model selection workflow. You may cite this paper if you are discussing Yellowbrick more generally in your research (instead of a specific version) or are interested in discussing visual analytics or visualization for machine learning. Please cite *Bengfort and Bilbro (2019)* with the following BibTex: + +.. 
code-block:: bibtex + + @article{bengfort_yellowbrick_2019, + title = {Yellowbrick: {{Visualizing}} the {{Scikit}}-{{Learn Model Selection Process}}}, + journaltitle = {The Journal of Open Source Software}, + volume = {4}, + number = {35}, + series = {1075}, + date = {2019-03-24}, + year = {2019}, + author = {Bengfort, Benjamin and Bilbro, Rebecca}, + url = {http://joss.theoj.org/papers/10.21105/joss.01075}, + doi = {10.21105/joss.01075} + } Contacting Us ------------- @@ -139,3 +196,13 @@ The best way to contact the Yellowbrick team is to send us a note on one of the .. _`mailing list`: http://bit.ly/yb-listserv .. _`Stack Overflow`: https://stackoverflow.com/questions/tagged/yellowbrick .. _`Twitter`: https://twitter.com/scikit_yb + +.. _QuatroCinco: https://flic.kr/p/2Yj9mj +.. _API: http://scikit-learn.org/stable/modules/classes.html +.. _SIGMOD: http://cseweb.ucsd.edu/~arunkk/vision/SIGMODRecord15.pdf +.. _Wikipedia: https://en.wikipedia.org/wiki/Yellow_brick_road +.. _`@rebeccabilbro`: https://github.com/rebeccabilbro +.. _`@bbengfort`: https://github.com/bbengfort +.. _`District Data Labs`: http://www.districtdatalabs.com/ +.. _`Georgetown University`: https://scs.georgetown.edu/programs/375/certificate-in-data-science/ +.. _`NumFOCUS`: https://numfocus.org/ \ No newline at end of file diff --git a/docs/api/anscombe.py b/docs/api/anscombe.py index d8f1b8826..5f04cec09 100644 --- a/docs/api/anscombe.py +++ b/docs/api/anscombe.py @@ -1,4 +1,4 @@ -# Creates the anscombe visualization. +# Creates the anscombe visualization. 
import yellowbrick as yb import matplotlib.pyplot as plt diff --git a/docs/api/classifier/class_prediction_error.py b/docs/api/classifier/class_prediction_error.py deleted file mode 100644 index de6bab33b..000000000 --- a/docs/api/classifier/class_prediction_error.py +++ /dev/null @@ -1,63 +0,0 @@ -# class_prediction_error.py - -""" -Creates the visualizations for the class_prediction_error.rst documentation -""" - -########################################################################## -## Imports -########################################################################## - -import pandas as pd -import matplotlib.pyplot as plt - -from sklearn.datasets import make_classification -from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import train_test_split as tts - -from yellowbrick.classifier import ClassPredictionError - - -def make_fruit_dataset(): - X, y = make_classification( - n_samples=1000, n_classes=5, n_informative=3, n_clusters_per_class=1 - ) - - classes = ['apple', 'kiwi', 'pear', 'banana', 'orange'] - return tts(X, y, test_size=0.20, random_state=42), classes - - -def load_credit_dataset(): - data = pd.read_csv("../../../examples/data/credit/credit.csv") - target = "default" - features = list(data.columns) - features.remove(target) - - X = data[features] - y = data[target] - - classes = ["default", "current"] - return tts(X, y, test_size=0.2, random_state=53), classes - - -def make_cb_pred_error(dataset="fruit", path=None, clf=None): - clf = clf or RandomForestClassifier() - - loader = { - 'fruit': make_fruit_dataset, - 'credit': load_credit_dataset, - }[dataset] - - (X_train, X_test, y_train, y_test), classes = loader() - - _, ax = plt.subplots() - viz = ClassPredictionError(clf, ax=ax, classes=classes) - viz.fit(X_train, y_train) - viz.score(X_test, y_test) - - return viz.poof(outpath=path) - - -if __name__ == '__main__': - make_cb_pred_error("fruit", "images/class_prediction_error.png") - make_cb_pred_error("credit", 
"images/class_prediction_error_credit.png") diff --git a/docs/api/classifier/class_prediction_error.rst b/docs/api/classifier/class_prediction_error.rst index b7e2ddf85..9e9696616 100644 --- a/docs/api/classifier/class_prediction_error.rst +++ b/docs/api/classifier/class_prediction_error.rst @@ -2,16 +2,25 @@ Class Prediction Error ====================== + +The Yellowbrick ``ClassPredictionError`` plot is a twist on other and sometimes more familiar classification model diagnostic tools like the :doc:`confusion_matrix` and :doc:`classification_report`. Like the :doc:`classification_report`, this plot shows the support (number of training samples) for each class in the fitted classification model as a stacked bar chart. Each bar is segmented to show the proportion of predictions (including false negatives and false positives, like a :doc:`confusion_matrix`) for each class. You can use a ``ClassPredictionError`` to visualize which classes your classifier is having a particularly difficult time with, and more importantly, what incorrect answers it is giving on a per-class basis. This can often enable you to better understand strengths and weaknesses of different models and particular challenges unique to your dataset. + The class prediction error chart provides a way to quickly understand how good your classifier is at predicting the right classes. -.. code:: python +.. 
plot:: + :context: close-figs + :alt: Class Prediction Error plot on Fruit from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split + from sklearn.ensemble import RandomForestClassifier + from yellowbrick.classifier import ClassPredictionError + # Create classification dataset X, y = make_classification( - n_samples=1000, n_classes=5, n_informative=3, n_clusters_per_class=1 + n_samples=1000, n_classes=5, n_informative=3, n_clusters_per_class=1, + random_state=36, ) classes = ["apple", "kiwi", "pear", "banana", "orange"] @@ -19,16 +28,44 @@ The class prediction error chart provides a way to quickly understand how good y # Perform 80/20 training/test split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42) + # Instantiate the classification model and visualizer + visualizer = ClassPredictionError( + RandomForestClassifier(random_state=42, n_estimators=10), classes=classes + ) -.. code:: python + # Fit the training data to the visualizer + visualizer.fit(X_train, y_train) - from sklearn.ensemble import RandomForestClassifier + # Evaluate the model on the test data + visualizer.score(X_test, y_test) + # Draw visualization + visualizer.poof() + +In the above example, while the ``RandomForestClassifier`` appears to be fairly good at correctly predicting apples based on the features of the fruit, it often incorrectly labels pears as kiwis and mistakes kiwis for bananas. + +By contrast, in the following example, the ``RandomForestClassifier`` does a great job at correctly predicting accounts in default, but it is a bit of a coin toss in predicting account holders who stayed current on bills. + +.. 
plot:: + :context: close-figs + :alt: Class Prediction Error on account standing + + from sklearn.model_selection import train_test_split + from sklearn.ensemble import RandomForestClassifier from yellowbrick.classifier import ClassPredictionError + from yellowbrick.datasets import load_credit + + X, y = load_credit() + + classes = ['account in default', 'current with bills'] + # Perform 80/20 training/test split + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, + random_state=42) + # Instantiate the classification model and visualizer visualizer = ClassPredictionError( - RandomForestClassifier(), classes=classes + RandomForestClassifier(n_estimators=10), classes=classes ) # Fit the training data to the visualizer @@ -38,10 +75,10 @@ The class prediction error chart provides a way to quickly understand how good y visualizer.score(X_test, y_test) # Draw visualization - g = visualizer.poof() - -.. image:: images/class_prediction_error.png + visualizer.poof() + + API Reference ------------- diff --git a/docs/api/classifier/classification_report.py b/docs/api/classifier/classification_report.py deleted file mode 100644 index 2cabec28a..000000000 --- a/docs/api/classifier/classification_report.py +++ /dev/null @@ -1,58 +0,0 @@ -# classification_report -# Generates images for the classification report documentation. -# -# Author: Benjamin Bengfort -# Created: Sun Mar 18 16:35:30 2018 -0400 -# -# ID: classification_report.py [] benjamin@bengfort.com $ - -""" -Generates images for the classification report documentation. 
-""" - -########################################################################## -## Imports -########################################################################## - -import pandas as pd -import matplotlib.pyplot as plt - -from sklearn.naive_bayes import GaussianNB -from sklearn.model_selection import train_test_split as tts - -from yellowbrick.classifier import ClassificationReport - - -########################################################################## -## Quick Methods -########################################################################## - -def make_dataset(): - data = pd.read_csv("../../../examples/data/occupancy/occupancy.csv") - - X = data[["temperature", "relative humidity", "light", "C02", "humidity"]] - y = data.occupancy - - return tts(X, y, test_size=0.2) - - -def make_gb_report(path="images/classification_report.png"): - X_train, X_test, y_train, y_test = make_dataset() - - _, ax = plt.subplots() - - bayes = GaussianNB() - viz = ClassificationReport(bayes, ax=ax, classes=['unoccupied', 'occupied']) - - viz.fit(X_train, y_train) - viz.score(X_test, y_test) - - viz.poof(outpath=path) - - -########################################################################## -## Main Method -########################################################################## - -if __name__ == '__main__': - make_gb_report() diff --git a/docs/api/classifier/classification_report.rst b/docs/api/classifier/classification_report.rst index 8ac9a8a65..3c357b63a 100644 --- a/docs/api/classifier/classification_report.rst +++ b/docs/api/classifier/classification_report.rst @@ -5,42 +5,35 @@ Classification Report The classification report visualizer displays the precision, recall, F1, and support scores for the model. In order to support easier interpretation and problem detection, the report integrates numerical scores with a color-coded heatmap. 
All heatmaps are in the range ``(0.0, 1.0)`` to facilitate easy comparison of classification models across different classification reports. -.. code:: python - - from sklearn.model_selection import train_test_split - - # Load the classification data set - data = load_data("occupancy") - - # Specify the features of interest and the classes of the target - features = [ - "temperature", "relative humidity", "light", "C02", "humidity" - ] - classes = ["unoccupied", "occupied"] - - # Extract the instances and target - X = data[features] - y = data.occupancy - - # Create the train and test data - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) - -.. code:: python +.. plot:: + :context: close-figs + :alt: Classification Report + from sklearn.model_selection import TimeSeriesSplit from sklearn.naive_bayes import GaussianNB + from yellowbrick.classifier import ClassificationReport + from yellowbrick.datasets import load_occupancy - # Instantiate the classification model and visualizer - bayes = GaussianNB() - visualizer = ClassificationReport(bayes, classes=classes, support=True) + # Load the classification dataset + X, y = load_occupancy() - visualizer.fit(X_train, y_train) # Fit the visualizer and the model - visualizer.score(X_test, y_test) # Evaluate the model on the test data - g = visualizer.poof() # Draw/show/poof the data + # Specify the target classes + classes = ["unoccupied", "occupied"] + # Create the training and test data + tscv = TimeSeriesSplit() + for train_index, test_index in tscv.split(X): + X_train, X_test = X.iloc[train_index], X.iloc[test_index] + y_train, y_test = y.iloc[train_index], y.iloc[test_index] + # Instantiate the classification model and visualizer + model = GaussianNB() + visualizer = ClassificationReport(model, classes=classes, support=True) -.. 
image:: images/classification_report.png + visualizer.fit(X_train, y_train) # Fit the visualizer and the model + visualizer.score(X_test, y_test) # Evaluate the model on the test data + visualizer.poof() # Draw/show/poof the data The classification report shows a representation of the main classification metrics on a per-class basis. This gives a deeper intuition of the classifier behavior over global accuracy which can mask functional weaknesses in one class of a multiclass problem. Visual classification reports are used to compare classification models to select models that are "redder", e.g. have stronger classification metrics or that are more balanced. @@ -60,6 +53,7 @@ The metrics are defined in terms of true and false positives, and true and false **support** Support is the number of actual occurrences of the class in the specified dataset. Imbalanced support in the training data may indicate structural weaknesses in the reported scores of the classifier and could indicate the need for stratified sampling or rebalancing. Support doesn't change between models but instead diagnoses the evaluation process. +.. note:: This example uses ``TimeSeriesSplit`` to split the data into the training and test sets. For more information on this cross-validation method, please refer to the scikit-learn `documentation `_. 
API Reference ------------- diff --git a/docs/api/classifier/confusion_matrix.py b/docs/api/classifier/confusion_matrix.py deleted file mode 100644 index c5a0952c4..000000000 --- a/docs/api/classifier/confusion_matrix.py +++ /dev/null @@ -1,36 +0,0 @@ -from sklearn.datasets import load_digits, load_iris -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import train_test_split as tts - -from yellowbrick.classifier import ConfusionMatrix - - -if __name__ == '__main__': - digits = load_digits() - digit_X = digits.data - digit_y = digits.target - d_X_train, d_X_test, d_y_train, d_y_test = tts( - digit_X, digit_y, test_size=0.2 - ) - model = LogisticRegression() - digit_cm = ConfusionMatrix(model, classes=[0,1,2,3,4,5,6,7,8,9]) - digit_cm.fit(d_X_train, d_y_train) - digit_cm.score(d_X_test, d_y_test) - d = digit_cm.poof(outpath="images/confusion_matrix_digits.png") - - - iris = load_iris() - iris_X = iris.data - iris_y = iris.target - iris_classes = iris.target_names - i_X_train, i_X_test, i_y_train, i_y_test = tts( - iris_X, iris_y, test_size=0.2 - ) - model = LogisticRegression() - iris_cm = ConfusionMatrix( - model, classes=iris_classes, - label_encoder={0: 'setosa', 1: 'versicolor', 2: 'virginica'} - ) - iris_cm.fit(i_X_train, i_y_train) - iris_cm.score(i_X_test, i_y_test) - i = iris_cm.poof(outpath="images/confusion_matrix_iris.png") diff --git a/docs/api/classifier/confusion_matrix.rst b/docs/api/classifier/confusion_matrix.rst index 878b910a8..c286543ef 100644 --- a/docs/api/classifier/confusion_matrix.rst +++ b/docs/api/classifier/confusion_matrix.rst @@ -3,29 +3,28 @@ Confusion Matrix ================ -The ``ConfusionMatrix`` visualizer is a ScoreVisualizer that takes a -fitted scikit-learn classifier and a set of test X and y values and +The ``ConfusionMatrix`` visualizer is a ``ScoreVisualizer`` that takes a +fitted scikit-learn classifier and a set of test ``X`` and ``y`` values and returns a report showing how each of the 
test values predicted classes compare to their actual classes. Data scientists use confusion matrices to understand which classes are most easily confused. These provide -similar information as what is available in a ClassificationReport, but +similar information as what is available in a ``ClassificationReport``, but rather than top-level scores, they provide deeper insight into the classification of individual data points. -Below are a few examples of using the ConfusionMatrix visualizer; more +Below are a few examples of using the ``ConfusionMatrix`` visualizer; more information can be found by looking at the scikit-learn documentation on `confusion matrices `_. -.. code:: python +.. plot:: + :context: close-figs + :alt: ConfusionMatrix plot of sklearn Digits dataset from sklearn.datasets import load_digits - from sklearn.model_selection import train_test_split + from sklearn.model_selection import train_test_split as tts from sklearn.linear_model import LogisticRegression - from yellowbrick.classifier import ConfusionMatrix -.. code:: python - # We'll use the handwritten digits data set from scikit-learn. # Each feature of this dataset is an 8x8 pixel image of a handwritten number. # Digits.data converts these 64 pixels into a single array of features @@ -33,9 +32,9 @@ scikit-learn documentation on `confusion matrices `_ (or anything with an `inverse_transform` method that performs the mapping), or a `dict` with the encoding-to-string mapping as in the example below: +Class names can be added to a ``ConfusionMatrix`` plot using the ``label_encoder`` argument. The ``label_encoder`` can be a `sklearn.preprocessing.LabelEncoder `_ (or anything with an ``inverse_transform`` method that performs the mapping), or a ``dict`` with the encoding-to-string mapping as in the example below: -.. code:: python +.. 
plot:: + :context: close-figs + :alt: ConfusionMatrix plot with class names + from sklearn.datasets import load_iris + from sklearn.model_selection import train_test_split as tts + from sklearn.linear_model import LogisticRegression + from yellowbrick.classifier import ConfusionMatrix + iris = load_iris() X = iris.data y = iris.target @@ -68,7 +71,7 @@ Class names can be added to a `ConfusionMatrix` plot using the `label_encoder` a X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2) - model = LogisticRegression() + model = LogisticRegression(multi_class="auto", solver="liblinear") iris_cm = ConfusionMatrix( model, classes=classes, @@ -81,9 +84,6 @@ Class names can be added to a `ConfusionMatrix` plot using the `label_encoder` a iris_cm.poof() -.. image:: images/confusion_matrix_iris.png - - API Reference ------------- diff --git a/docs/api/classifier/images/binary_precision_recall.png b/docs/api/classifier/images/binary_precision_recall.png deleted file mode 100644 index e3602a65e..000000000 Binary files a/docs/api/classifier/images/binary_precision_recall.png and /dev/null differ diff --git a/docs/api/classifier/images/churn_discrimination_threshold.png b/docs/api/classifier/images/churn_discrimination_threshold.png deleted file mode 100644 index c66bbe3ef..000000000 Binary files a/docs/api/classifier/images/churn_discrimination_threshold.png and /dev/null differ diff --git a/docs/api/classifier/images/class_prediction_error.png b/docs/api/classifier/images/class_prediction_error.png deleted file mode 100644 index ed7855f0d..000000000 Binary files a/docs/api/classifier/images/class_prediction_error.png and /dev/null differ diff --git a/docs/api/classifier/images/class_prediction_error_credit.png b/docs/api/classifier/images/class_prediction_error_credit.png deleted file mode 100644 index 0edb9b9b1..000000000 Binary files a/docs/api/classifier/images/class_prediction_error_credit.png and /dev/null differ diff --git 
a/docs/api/classifier/images/classification_report.png b/docs/api/classifier/images/classification_report.png deleted file mode 100644 index 81e8a3b2b..000000000 Binary files a/docs/api/classifier/images/classification_report.png and /dev/null differ diff --git a/docs/api/classifier/images/confusion_matrix_digits.png b/docs/api/classifier/images/confusion_matrix_digits.png deleted file mode 100644 index 120c56d8d..000000000 Binary files a/docs/api/classifier/images/confusion_matrix_digits.png and /dev/null differ diff --git a/docs/api/classifier/images/confusion_matrix_iris.png b/docs/api/classifier/images/confusion_matrix_iris.png deleted file mode 100644 index f536db142..000000000 Binary files a/docs/api/classifier/images/confusion_matrix_iris.png and /dev/null differ diff --git a/docs/api/classifier/images/multiclass_precision_recall.png b/docs/api/classifier/images/multiclass_precision_recall.png deleted file mode 100644 index 923c9b915..000000000 Binary files a/docs/api/classifier/images/multiclass_precision_recall.png and /dev/null differ diff --git a/docs/api/classifier/images/multiclass_precision_recall_full.png b/docs/api/classifier/images/multiclass_precision_recall_full.png deleted file mode 100644 index ec5e7ca51..000000000 Binary files a/docs/api/classifier/images/multiclass_precision_recall_full.png and /dev/null differ diff --git a/docs/api/classifier/images/rocauc.png b/docs/api/classifier/images/rocauc.png deleted file mode 100644 index 145a993c2..000000000 Binary files a/docs/api/classifier/images/rocauc.png and /dev/null differ diff --git a/docs/api/classifier/images/rocauc_binary.png b/docs/api/classifier/images/rocauc_binary.png deleted file mode 100644 index 2e9e7ce6e..000000000 Binary files a/docs/api/classifier/images/rocauc_binary.png and /dev/null differ diff --git a/docs/api/classifier/images/rocauc_multiclass.png b/docs/api/classifier/images/rocauc_multiclass.png deleted file mode 100644 index 0da15e398..000000000 Binary files 
a/docs/api/classifier/images/rocauc_multiclass.png and /dev/null differ diff --git a/docs/api/classifier/images/spam_discrimination_threshold.png b/docs/api/classifier/images/spam_discrimination_threshold.png deleted file mode 100644 index 5267df270..000000000 Binary files a/docs/api/classifier/images/spam_discrimination_threshold.png and /dev/null differ diff --git a/docs/api/classifier/prcurve.py b/docs/api/classifier/prcurve.py deleted file mode 100644 index f4ac5224a..000000000 --- a/docs/api/classifier/prcurve.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python - -import os -import pandas as pd -import matplotlib.pyplot as plt - -from sklearn.naive_bayes import MultinomialNB -from sklearn.linear_model import RidgeClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelEncoder - -from yellowbrick.classifier import PrecisionRecallCurve - - -# Location of downloaded datasets from Yellowbrick -FIXTURES = os.path.join( - os.path.dirname(__file__), "..", "..", "..", "yellowbrick", "datasets", "fixtures" -) - - -def load_binary(split=True): - data = pd.read_csv(os.path.join(FIXTURES, "spam", "spam.csv")) - - target = "is_spam" - features = [col for col in data.columns if col != target] - - X = data[features] - y = data[target] - - if split: - return train_test_split(X, y, test_size=0.2, shuffle=True) - return X, y - - -def load_multiclass(split=True): - data = pd.read_csv(os.path.join(FIXTURES, "game", "game.csv")) - - # Encode the categorical variables - data.replace({'x':0, 'o':1, 'b':2}, inplace=True) - - # Extract the numpy arrays from the data frame - X = data.iloc[:, data.columns != 'outcome'] - y = LabelEncoder().fit_transform(data['outcome']) - - if split: - return train_test_split(X, y, test_size=0.2, shuffle=True) - return X, y - - -def draw_binary(outpath=None): - _, ax = plt.subplots(figsize=(9,6)) - - X_train, X_test, y_train, y_test = 
load_binary(split=True) - - oz = PrecisionRecallCurve(RidgeClassifier(), ax=ax) - oz.fit(X_train, y_train) - oz.score(X_test, y_test) - oz.poof(outpath=outpath) - - -def draw_multiclass(outpath=None, simple=True): - _, ax = plt.subplots(figsize=(9,6)) - - X_train, X_test, y_train, y_test = load_multiclass() - - if simple: - oz = PrecisionRecallCurve(RandomForestClassifier(), ax=ax) - else: - oz = PrecisionRecallCurve(MultinomialNB(), ax=ax, per_class=True, iso_f1_curves=True, fill_area=False, micro=False) - - oz.fit(X_train, y_train) - oz.score(X_test, y_test) - oz.poof(outpath=outpath) - - - -if __name__ == '__main__': - draw_binary(outpath="images/binary_precision_recall.png") - draw_multiclass(simple=True, outpath="images/multiclass_precision_recall.png") - draw_multiclass(simple=False, outpath="images/multiclass_precision_recall_full.png") diff --git a/docs/api/classifier/prcurve.rst b/docs/api/classifier/prcurve.rst index d1762b80e..c0954029e 100644 --- a/docs/api/classifier/prcurve.rst +++ b/docs/api/classifier/prcurve.rst @@ -14,16 +14,17 @@ for the majority of classes it selects. Binary Classification --------------------- -.. code:: python +.. plot:: + :context: close-figs + :alt: PrecisionRecallCurve with Binary Classification from sklearn.linear_model import RidgeClassifier from sklearn.model_selection import train_test_split as tts from yellowbrick.classifier import PrecisionRecallCurve + from yellowbrick.datasets import load_spam # Load the dataset and split into train/test splits - data = load_spam() - X = data[[col for col in data.columns if col != "is_spam"]] - y = data["is_spam"] + X, y = load_spam() X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, shuffle=True) @@ -34,8 +35,6 @@ Binary Classification viz.poof() -.. image:: images/binary_precision_recall.png - The base case for precision-recall curves is the binary classification case, and this case is also the most visually interpretable. 
In the figure above we can see the precision plotted on the y-axis against the recall on the x-axis. The larger the filled in area, the stronger the classifier is. The red line annotates the *average precision*, a summary of the entire plot computed as the weighted average of precision achieved at each threshold such that the weight is the difference in recall from the previous threshold. Multi-Label Classification @@ -43,45 +42,59 @@ Multi-Label Classification To support multi-label classification, the estimator is wrapped in a `OneVsRestClassifier `_ to produce binary comparisons for each class (e.g. the positive case is the class and the negative case is any other class). The Precision-Recall curve is then computed as the micro-average of the precision and recall for all classes: -.. code:: python +.. plot:: + :context: close-figs + :alt: PrecisionRecallCurves with Multi-label Classification from sklearn.ensemble import RandomForestClassifier - from sklearn.preprocessing import LabelEncoder + from sklearn.preprocessing import LabelEncoder, OrdinalEncoder + from sklearn.model_selection import train_test_split as tts + from yellowbrick.classifier import PrecisionRecallCurve + from yellowbrick.datasets import load_game # Load dataset and encode categorical variables - data = load_game() - data.replace({'x':0, 'o':1, 'b':2}, inplace=True) - - # Create train/test splits - X = data.iloc[:, data.columns != 'outcome'] - y = LabelEncoder().fit_transform(data['outcome']) + X, y = load_game() + X = OrdinalEncoder().fit_transform(X) + y = LabelEncoder().fit_transform(y) X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, shuffle=True) # Create the visualizer, fit, score, and poof it - viz = PrecisionRecallCurve(RandomForestClassifier()) + viz = PrecisionRecallCurve(RandomForestClassifier(n_estimators=10)) viz.fit(X_train, y_train) viz.score(X_test, y_test) viz.poof() -.. 
image:: images/multiclass_precision_recall.png A more complex Precision-Recall curve can be computed, however, displaying the each curve individually, along with F1-score ISO curves (e.g. that show the relationship between precision and recall for various F1 scores). -.. code:: python +.. plot:: + :context: close-figs + :alt: PrecisionRecallCurves displaying each curve individually from sklearn.naive_bayes import MultinomialNB + from sklearn.preprocessing import LabelEncoder, OrdinalEncoder + from sklearn.model_selection import train_test_split as tts + from yellowbrick.classifier import PrecisionRecallCurve + from yellowbrick.datasets import load_game - oz = PrecisionRecallCurve( + # Load dataset and encode categorical variables + X, y = load_game() + X = OrdinalEncoder().fit_transform(X) + encoder = LabelEncoder() + y = encoder.fit_transform(y) + + X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, shuffle=True) + + # Create the visualizer, fit, score, and poof it + viz = PrecisionRecallCurve( MultinomialNB(), per_class=True, iso_f1_curves=True, - fill_area=False, micro=False + fill_area=False, micro=False, classes=encoder.classes_ ) viz.fit(X_train, y_train) viz.score(X_test, y_test) viz.poof() -.. image:: images/multiclass_precision_recall_full.png - .. 
seealso:: `Scikit-Learn: Model Selection with Precision Recall Curves `_ diff --git a/docs/api/classifier/rocauc.py b/docs/api/classifier/rocauc.py deleted file mode 100644 index 98708dd32..000000000 --- a/docs/api/classifier/rocauc.py +++ /dev/null @@ -1,77 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt - -from sklearn.svm import LinearSVC -from sklearn.linear_model import LogisticRegression, RidgeClassifier -from sklearn.model_selection import train_test_split - -from yellowbrick.classifier import ROCAUC - - -def load_occupancy(): - # Load the binary classification data set - room = pd.read_csv("../../../examples/data/occupancy/occupancy.csv") - - features = ["temperature", "relative humidity", "light", "C02", "humidity"] - classes = ['unoccupied', 'occupied'] - - # Extract the numpy arrays from the data frame - X = room[features].values - y = room.occupancy.values - - return X, y, classes - - -def load_game(): - # Load multi-class classification dataset - game = pd.read_csv('../../../examples/data/game/game.csv') - - classes = ["win", "loss", "draw"] - game.replace({'loss':-1, 'draw':0, 'win':1, 'x':2, 'o':3, 'b':4}, inplace=True) - - # Extract the numpy arrays from the data frame - X = game.iloc[:, game.columns != 'outcome'] - y = game['outcome'] - - return X, y, classes - - -def rocauc(X, y, model, outpath, **kwargs): - # Create a new figure and axes - _, ax = plt.subplots() - - # Instantiate the classification model and visualizer - visualizer = ROCAUC(model, ax=ax, **kwargs) - - # Create the train and test data - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) - - visualizer.fit(X_train, y_train) - visualizer.score(X_test, y_test) - - # Save to disk - visualizer.poof(outpath=outpath) - -if __name__ == '__main__': - - # Occupancy data visualization - X, y, classes = load_occupancy() - - # Draw the binary rocauc - rocauc( - X, y, LogisticRegression(), "images/rocauc_binary.png", classes=classes - ) - - # Draw a single 
binary decision curve - rocauc( - X, y, LinearSVC(), "images/rocauc_binary.png", - micro=False, macro=False, per_class=False - ) - - # Game data visualization - X, y, classes = load_game() - - # Draw the multiclass roc_auc - rocauc( - X, y, RidgeClassifier(), "images/rocauc_multiclass.png", classes=classes - ) diff --git a/docs/api/classifier/rocauc.rst b/docs/api/classifier/rocauc.rst index 5291dc0d0..35fc89023 100644 --- a/docs/api/classifier/rocauc.rst +++ b/docs/api/classifier/rocauc.rst @@ -9,87 +9,79 @@ The Receiver Operating Characteristic (ROC) is a measure of a classifier's predi This leads to another metric, area under the curve (AUC), which is a computation of the relationship between false positives and true positives. The higher the AUC, the better the model generally is. However, it is also important to inspect the "steepness" of the curve, as this describes the maximization of the true positive rate while minimizing the false positive rate. -.. code:: python +.. plot:: + :context: close-figs + :alt: ROCAUC Binary Classification + from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split - # Load the classification data set - data = load_data("occupancy") - - # Specify the features of interest and the classes of the target - features = ["temperature", "relative humidity", "light", "C02", "humidity"] - classes = ["unoccupied", "occupied"] - - # Extract the instances and target - X = data[features] - y = data.occupancy - - # Create the train and test data - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) - -.. 
code:: python - from yellowbrick.classifier import ROCAUC - from sklearn.linear_model import LogisticRegression + from yellowbrick.datasets import load_spam - # Instantiate the visualizer with the classification model - visualizer = ROCAUC(LogisticRegression(), classes=classes) + # Load the classification dataset + X, y = load_spam() - visualizer.fit(X_train, y_train) # Fit the training data to the visualizer - visualizer.score(X_test, y_test) # Evaluate the model on the test data - g = visualizer.poof() # Draw/show/poof the data + # Create the training and test data + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + # Instantiate the visualizer with the classification model + model = LogisticRegression(multi_class="auto", solver="liblinear") + visualizer = ROCAUC(model, classes=["not_spam", "is_spam"]) -.. image:: images/rocauc_binary.png + visualizer.fit(X_train, y_train) # Fit the training data to the visualizer + visualizer.score(X_test, y_test) # Evaluate the model on the test data + visualizer.poof() # Draw/show/poof the data .. warning:: - Binary classification using a Scikit-learn-style estimator with only a - ``decision_function``, triggers an ``IndexError`` because the predictions - will be a 1D array, meaning there is only sufficient information to plot a - single curve. More on this bug can be found in this `notebook `_. The bug was addressed in a `July 2018 PR `_ - and will be fixed in v0.9, where the solution will be to set the ``micro``, - ``macro``, and ``per-class`` parameters of ``ROCAUC`` to ``False``. + Versions of Yellowbrick <= v0.8 had a `bug `_ + that triggered an ``IndexError`` when attempting binary classification using + a Scikit-learn-style estimator with only a ``decision_function``. This has been + fixed as of v0.9, where the ``micro``, ``macro``, and ``per_class`` parameters of + ``ROCAUC`` are set to ``False`` for such classifiers.
Multi-class ROCAUC Curves -######################### +------------------------- Yellowbrick's ``ROCAUC`` Visualizer does allow for plotting multiclass classification curves. ROC curves are typically used in binary classification, and in fact the Scikit-Learn ``roc_curve`` metric is only able to perform metrics for binary classifiers. Yellowbrick addresses this by binarizing the output (per-class) or to use one-vs-rest (micro score) or one-vs-all (macro score) strategies of classification. -.. code:: +.. plot:: + :context: close-figs + :alt: ROCAUC multiclass classification curves - # Load multi-class classification dataset - game = load_game() + from sklearn.linear_model import RidgeClassifier + from sklearn.model_selection import train_test_split + from sklearn.preprocessing import OrdinalEncoder, LabelEncoder + + from yellowbrick.classifier import ROCAUC + from yellowbrick.datasets import load_game - classes = ["win", "loss", "draw"] + # Load multi-class classification dataset + X, y = load_game() # Encode the non-numeric columns - game.replace({'loss':-1, 'draw':0, 'win':1, 'x':2, 'o':3, 'b':4}, inplace=True) - - # Extract the instances and target - X = game.iloc[:, game.columns != 'outcome'] - y = game['outcome'] + X = OrdinalEncoder().fit_transform(X) + y = LabelEncoder().fit_transform(y) # Create the train and test data - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) -.. 
code:: + # Instantiate the classification model and visualizer + model = RidgeClassifier() + visualizer = ROCAUC(model, classes=["win", "loss", "draw"]) - from sklearn.linear_model import RidgeClassifier - - visualizer = ROCAUC(RidgeClassifier(), classes=classes) + visualizer.fit(X_train, y_train) # Fit the training data to the visualizer + visualizer.score(X_test, y_test) # Evaluate the model on the test data + visualizer.poof() # Draw/show/poof the data - visualizer.fit(X_train, y_train) # Fit the training data to the visualizer - visualizer.score(X_test, y_test) # Evaluate the model on the test data - g = visualizer.poof() # Draw/show/poof the data +.. warning:: + The target ``y`` must be numeric for this figure to work; otherwise, update to the latest version of sklearn. By default with multi-class ROCAUC visualizations, a curve for each class is plotted, in addition to the micro- and macro-average curves for each class. This enables the user to inspect the tradeoff between sensitivity and specificity on a per-class basis. Note that for multi-class ``ROCAUC``, at least one of the ``micro``, ``macro``, or ``per_class`` parameters must be set to ``True`` (by default, all are set to ``True``). -.. 
image:: images/rocauc_multiclass.png - - API Reference ------------- diff --git a/docs/api/classifier/threshold.py b/docs/api/classifier/threshold.py deleted file mode 100644 index 056360be9..000000000 --- a/docs/api/classifier/threshold.py +++ /dev/null @@ -1,81 +0,0 @@ -import os -import pandas as pd -import matplotlib.pyplot as plt - -from yellowbrick.classifier import DiscriminationThreshold -from sklearn.linear_model import LogisticRegression -from sklearn.ensemble import RandomForestClassifier -from sklearn.preprocessing import LabelEncoder - -from functools import partial - -BASE = os.path.join("..", "..", "..", os.path.dirname("__file__")) -EXAMPLES = os.path.join(BASE, "examples", "data") - -# TODO: Make these examples part of the code base -CHURN_DATASET = os.path.join(EXAMPLES, "churn", "churn.txt") -SPAM_DATASET = os.path.join(EXAMPLES, "spam", "spam.csv") - - -def load_spam(): - df = pd.read_csv(SPAM_DATASET) - - target = 'is_spam' - features = [col for col in df.columns if col != target] - - X = df[features] - y = df[target] - - return X, y - - -def load_churn(): - df = pd.read_csv(CHURN_DATASET) - df.columns = [ - c.lower().replace(' ', '_').replace('?', '').replace("'", "") - for c in df.columns - ] - - state_encoder = LabelEncoder() - df.state = state_encoder.fit_transform(df.state) - - del df['phone'] - - for col in ['intl_plan', 'vmail_plan', 'churn']: - df[col] = df[col].map({'no': 0, 'False.': 0, 'yes': 1, 'True.': 1}) - - X = df[[c for c in df.columns if c != 'churn']] - y = df['churn'] - - return X, y - - -def plot_discrimination_threshold(clf, data='spam', outpath=None): - if data == 'spam': - X, y = load_spam() - elif data == 'churn': - X, y = load_churn() - else: - raise ValueError("no dataset loader '{}'".format(data)) - - _, ax = plt.subplots() - - visualizer = DiscriminationThreshold(clf, ax=ax) - visualizer.fit(X, y) - visualizer.poof(outpath=outpath) - - -plot_churn = partial( - plot_discrimination_threshold, data='churn', - 
outpath="images/churn_discrimination_threshold.png" -) - -plot_spam = partial( - plot_discrimination_threshold, data='spam', - outpath="images/spam_discrimination_threshold.png" -) - - -if __name__ == '__main__': - plot_churn(RandomForestClassifier()) - plot_spam(LogisticRegression()) diff --git a/docs/api/classifier/threshold.rst b/docs/api/classifier/threshold.rst index f287f1e5c..443d728a2 100644 --- a/docs/api/classifier/threshold.rst +++ b/docs/api/classifier/threshold.rst @@ -7,30 +7,24 @@ Discrimination Threshold A visualization of precision, recall, f1 score, and queue rate with respect to the discrimination threshold of a binary classifier. The *discrimination threshold* is the probability or score at which the positive class is chosen over the negative class. Generally, this is set to 50% but the threshold can be adjusted to increase or decrease the sensitivity to false positives or to other application factors. -.. code:: python - - # Load a binary classification dataset - data = load_data("spam") - target = "is_spam" - features = [col for col in data.columns if col != target] - - # Extract the instances and target from the dataset - X = data[features] - y = data[target] - -.. code:: python +.. plot:: + :context: close-figs + :alt: Discrimination Threshold of a binary classifier from sklearn.linear_model import LogisticRegression + from yellowbrick.classifier import DiscriminationThreshold + from yellowbrick.datasets import load_spam - # Instantiate the classification model and visualizer - logistic = LogisticRegression() - visualizer = DiscriminationThreshold(logistic) + # Load a binary classification dataset + X, y = load_spam() - visualizer.fit(X, y) # Fit the training data to the visualizer - visualizer.poof() # Draw/show/poof the data + # Instantiate the classification model and visualizer + model = LogisticRegression(multi_class="auto", solver="liblinear") + visualizer = DiscriminationThreshold(model) -.. 
image:: images/spam_discrimination_threshold.png + visualizer.fit(X, y) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data One common use of binary classification algorithms is to use the score or probability they produce to determine cases that require special treatment. For example, a fraud prevention application might use a classification algorithm to determine if a transaction is likely fraudulent and needs to be investigated in detail. In the figure above, we present an example where a binary classifier determines if an email is "spam" (the positive case) or "not spam" (the negative case). Emails that are detected as spam are moved to a hidden folder and eventually deleted. diff --git a/docs/api/cluster/elbow.py b/docs/api/cluster/elbow.py deleted file mode 100644 index fd362308d..000000000 --- a/docs/api/cluster/elbow.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -""" -Generate images for the elbow plot documentation. -""" - -# Import necessary modules -import matplotlib.pyplot as plt - -from sklearn.cluster import KMeans -from sklearn.datasets import make_blobs -from yellowbrick.cluster import KElbowVisualizer - - -def draw_elbow(path="images/elbow.png"): - # Generate synthetic dataset with 8 blobs - X, y = make_blobs( - centers=8, n_features=12, n_samples=1000, - shuffle=True, random_state=42 - ) - - # Create a new figure to draw the clustering visualizer on - _, ax = plt.subplots() - - # Instantiate the clustering model and visualizer - model = KMeans() - visualizer = KElbowVisualizer(model, ax=ax, k=(4,12)) - - visualizer.fit(X) # Fit the data to the visualizer - visualizer.poof(outpath=path) # Draw/show/poof the data - - -def draw_calinski_harabaz(path="images/calinski_harabaz.png"): - # Generate synthetic dataset with 8 blobs - X, y = make_blobs( - centers=8, n_features=12, n_samples=1000, - shuffle=True, random_state=42 - ) - - # Create a new figure to draw the clustering visualizer on - _, ax = plt.subplots() - 
- # Instantiate the clustering model and visualizer - model = KMeans() - visualizer = KElbowVisualizer( - model, ax=ax, k=(4,12), - metric='calinski_harabaz', timings=False - ) - visualizer.fit(X) # Fit the data to the visualizer - visualizer.poof(outpath=path) # Draw/show/poof the data - - -if __name__ == '__main__': - draw_elbow() - draw_calinski_harabaz() diff --git a/docs/api/cluster/elbow.rst b/docs/api/cluster/elbow.rst index 50e3d8277..aed5d0c6d 100644 --- a/docs/api/cluster/elbow.rst +++ b/docs/api/cluster/elbow.rst @@ -3,53 +3,80 @@ Elbow Method ============ -The ``KElbowVisualizer`` implements the "elbow" method to help data scientists select the optimal number of clusters by fitting the model with a range of values for :math:`K`. If the line chart resembles an arm, then the "elbow" (the point of inflection on the curve) is a good indication that the underlying model fits best at that point. +The ``KElbowVisualizer`` implements the "elbow" method to help data scientists select the optimal number of clusters by fitting the model with a range of values for :math:`K`. If the line chart resembles an arm, then the "elbow" (the point of inflection on the curve) is a good indication that the underlying model fits best at that point. In the visualizer "elbow" will be annotated with a dashed line. -To demonstrate, in the following example the ``KElbowVisualizer`` fits the ``KMeans`` model for a range of :math:`K` values from 4 to 11 on a sample two-dimensional dataset with 8 random clusters of points. When the model is fit with 8 clusters, we can see an "elbow" in the graph, which in this case we know to be the optimal number. +To demonstrate, in the following example the ``KElbowVisualizer`` fits the ``KMeans`` model for a range of :math:`K` values from 4 to 11 on a sample two-dimensional dataset with 8 random clusters of points. 
When the model is fit with 8 clusters, we can see a line annotating the "elbow" in the graph, which in this case we know to be the optimal number. -.. code:: python +.. plot:: + :context: close-figs + :alt: KElbowVisualizer on synthetic dataset with 8 random clusters + from sklearn.cluster import KMeans from sklearn.datasets import make_blobs - # Create synthetic dataset with 8 random clusters - X, y = make_blobs(centers=8, n_features=12, shuffle=True, random_state=42) - -.. code:: python - - from sklearn.cluster import KMeans from yellowbrick.cluster import KElbowVisualizer + # Generate synthetic dataset with 8 random clusters + X, y = make_blobs(n_samples=1000, n_features=12, centers=8, random_state=42) + # Instantiate the clustering model and visualizer model = KMeans() visualizer = KElbowVisualizer(model, k=(4,12)) - visualizer.fit(X) # Fit the data to the visualizer - visualizer.poof() # Draw/show/poof the data - -.. image:: images/elbow.png + visualizer.fit(X) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data By default, the scoring parameter ``metric`` is set to ``distortion``, which computes the sum of squared distances from each point to its assigned center. -However, two other metrics can also be used with the ``KElbowVisualizer`` -- ``silhouette`` and ``calinski_harabaz``. The ``silhouette`` score calculates the mean Silhouette Coefficient of all samples, while the ``calinski_harabaz`` score computes the ratio of dispersion between and within clusters. +However, two other metrics can also be used with the ``KElbowVisualizer`` -- ``silhouette`` and ``calinski_harabasz``. The ``silhouette`` score calculates the mean Silhouette Coefficient of all samples, while the ``calinski_harabasz`` score computes the ratio of dispersion between and within clusters. -The ``KElbowVisualizer`` also displays the amount of time to train the clustering model per :math:`K` as a dashed green line, but is can be hidden by setting ``timings=False``. 
In the following example, we'll use the ``calinski_harabaz`` score and hide the time to fit the model. +The ``KElbowVisualizer`` also displays the amount of time to train the clustering model per :math:`K` as a dashed green line, but it can be hidden by setting ``timings=False``. In the following example, we'll use the ``calinski_harabasz`` score and hide the time to fit the model. -.. code:: python +.. plot:: + :context: close-figs + :alt: KElbowVisualizer on synthetic dataset with 8 random clusters from sklearn.cluster import KMeans + from sklearn.datasets import make_blobs + from yellowbrick.cluster import KElbowVisualizer - + + # Generate synthetic dataset with 8 random clusters + X, y = make_blobs(n_samples=1000, n_features=12, centers=8, random_state=42) + # Instantiate the clustering model and visualizer model = KMeans() visualizer = KElbowVisualizer( - model, k=(4,12), metric='calinski_harabaz', timings=False + model, k=(4,12), metric='calinski_harabasz', timings=False ) - visualizer.fit(X) # Fit the data to the visualizer - visualizer.poof() # Draw/show/poof the data + visualizer.fit(X) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data + +By default, the parameter ``locate_elbow`` is set to ``True``, which automatically finds the "elbow" that likely corresponds to the optimal value of k using the "knee point detection algorithm". However, users can turn off the feature by setting ``locate_elbow=False``. You can read about the implementation of this algorithm at "`Knee point detection in Python `_" by Kevin Arvai. + +In the following example, we'll use the ``calinski_harabasz`` score and turn off the ``locate_elbow`` feature. +.. 
plot:: + :context: close-figs + :alt: KElbowVisualizer on synthetic dataset with 8 random clusters + + from sklearn.cluster import KMeans + from sklearn.datasets import make_blobs + + from yellowbrick.cluster import KElbowVisualizer + + # Generate synthetic dataset with 8 random clusters + X, y = make_blobs(n_samples=1000, n_features=12, centers=8, random_state=42) + + # Instantiate the clustering model and visualizer + model = KMeans() + visualizer = KElbowVisualizer( + model, k=(4,12), metric='calinski_harabasz', timings=False, locate_elbow=False + ) -.. image:: images/calinski_harabaz.png + visualizer.fit(X) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data It is important to remember that the "elbow" method does not work well if the data is not very clustered. In this case, you might see a smooth curve and the optimal value of :math:`K` will be unclear. diff --git a/docs/api/cluster/icdm.py b/docs/api/cluster/icdm.py deleted file mode 100644 index 0438a90c2..000000000 --- a/docs/api/cluster/icdm.py +++ /dev/null @@ -1,27 +0,0 @@ -# Clustering Evaluation Imports -from functools import partial - -from sklearn.cluster import KMeans -from sklearn.datasets import make_blobs as sk_make_blobs - -from yellowbrick.cluster import InterclusterDistance - -# Helpers for easy dataset creation -N_SAMPLES = 1000 -N_FEATURES = 12 -SHUFFLE = True - -# Make blobs partial -make_blobs = partial(sk_make_blobs, n_samples=N_SAMPLES, n_features=N_FEATURES, shuffle=SHUFFLE) - - -if __name__ == '__main__': - # Make 8 blobs dataset - X, y = make_blobs(centers=12) - - # Instantiate the clustering model and visualizer - # Instantiate the clustering model and visualizer - visualizer = InterclusterDistance(KMeans(9)) - - visualizer.fit(X) # Fit the training data to the visualizer - visualizer.poof(outpath="images/icdm.png") # Draw/show/poof the data diff --git a/docs/api/cluster/icdm.rst b/docs/api/cluster/icdm.rst index 42af8e868..a7c1a70dd 100644 --- 
a/docs/api/cluster/icdm.rst +++ b/docs/api/cluster/icdm.rst @@ -5,26 +5,24 @@ Intercluster Distance Maps Intercluster distance maps display an embedding of the cluster centers in 2 dimensions with the distance to other centers preserved. E.g. the closer to centers are in the visualization, the closer they are in the original feature space. The clusters are sized according to a scoring metric. By default, they are sized by membership, e.g. the number of instances that belong to each center. This gives a sense of the relative importance of clusters. Note however, that because two clusters overlap in the 2D space, it does not imply that they overlap in the original feature space. -.. code:: python +.. plot:: + :context: close-figs + :alt: Intercluster Distance Visualizer on dataset with 12 random clusters + from sklearn.cluster import KMeans from sklearn.datasets import make_blobs - # Make 12 blobs dataset - X, y = make_blobs(centers=12, n_samples=1000, n_features=16, shuffle=True) - -.. code:: python - - from sklearn.cluster import KMeans from yellowbrick.cluster import InterclusterDistance - # Instantiate the clustering model and visualizer - visualizer = InterclusterDistance(KMeans(9)) - - visualizer.fit(X) # Fit the training data to the visualizer - visualizer.poof() # Draw/show/poof the data + # Generate synthetic dataset with 12 random clusters + X, y = make_blobs(n_samples=1000, n_features=12, centers=12, random_state=42) + # Instantiate the clustering model and visualizer + model = KMeans(6) + visualizer = InterclusterDistance(model) -.. 
image:: images/icdm.png + visualizer.fit(X) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data API Reference ------------- diff --git a/docs/api/cluster/images/calinski_harabaz.png b/docs/api/cluster/images/calinski_harabaz.png deleted file mode 100644 index 6cddfeeea..000000000 Binary files a/docs/api/cluster/images/calinski_harabaz.png and /dev/null differ diff --git a/docs/api/cluster/images/elbow.png b/docs/api/cluster/images/elbow.png deleted file mode 100644 index decf30c5f..000000000 Binary files a/docs/api/cluster/images/elbow.png and /dev/null differ diff --git a/docs/api/cluster/images/icdm.png b/docs/api/cluster/images/icdm.png deleted file mode 100644 index 51c5b5a0d..000000000 Binary files a/docs/api/cluster/images/icdm.png and /dev/null differ diff --git a/docs/api/cluster/images/silhouette.png b/docs/api/cluster/images/silhouette.png deleted file mode 100644 index b2d5d538b..000000000 Binary files a/docs/api/cluster/images/silhouette.png and /dev/null differ diff --git a/docs/api/cluster/silhouette.py b/docs/api/cluster/silhouette.py deleted file mode 100644 index c62f31751..000000000 --- a/docs/api/cluster/silhouette.py +++ /dev/null @@ -1,27 +0,0 @@ -# Clustering Evaluation Imports -from functools import partial - -from sklearn.cluster import MiniBatchKMeans -from sklearn.datasets import make_blobs as sk_make_blobs - -from yellowbrick.cluster import SilhouetteVisualizer - -# Helpers for easy dataset creation -N_SAMPLES = 1000 -N_FEATURES = 12 -SHUFFLE = True - -# Make blobs partial -make_blobs = partial(sk_make_blobs, n_samples=N_SAMPLES, n_features=N_FEATURES, shuffle=SHUFFLE) - - -if __name__ == '__main__': - # Make 8 blobs dataset - X, y = make_blobs(centers=8) - - # Instantiate the clustering model and visualizer - model = MiniBatchKMeans(6) - visualizer = SilhouetteVisualizer(model) - - visualizer.fit(X) # Fit the training data to the visualizer - visualizer.poof(outpath="images/silhouette.png") # Draw/show/poof the 
data diff --git a/docs/api/cluster/silhouette.rst b/docs/api/cluster/silhouette.rst index feee9af63..bfe237329 100644 --- a/docs/api/cluster/silhouette.rst +++ b/docs/api/cluster/silhouette.rst @@ -7,29 +7,28 @@ The Silhouette Coefficient is used when the ground-truth about the dataset is un The Silhouette Visualizer displays the silhouette coefficient for each sample on a per-cluster basis, visualizing which clusters are dense and which are not. This is particularly useful for determining cluster imbalance, or for selecting a value for :math:`K` by comparing multiple visualizers. -.. code:: python +.. plot:: + :context: close-figs + :alt: SilhouetteVisualizer on the nfl dataset with 4 clusters - from sklearn.datasets import make_blobs - - # Make 8 blobs dataset - X, y = make_blobs(centers=8) - -.. code:: python - - from sklearn.cluster import MiniBatchKMeans + from sklearn.cluster import KMeans from yellowbrick.cluster import SilhouetteVisualizer + from yellowbrick.datasets import load_nfl - # Instantiate the clustering model and visualizer - model = MiniBatchKMeans(6) - visualizer = SilhouetteVisualizer(model) - - visualizer.fit(X) # Fit the training data to the visualizer - visualizer.poof() # Draw/show/poof the data + # Load a clustering dataset + X, y = load_nfl() + # Specify the features to use for clustering + features = ['Rec', 'Yds', 'TD', 'Fmb', 'Ctch_Rate'] + X = X.query('Tgt >= 20')[features] -.. 
image:: images/silhouette.png + # Instantiate the clustering model and visualizer + model = KMeans(5, random_state=42) + visualizer = SilhouetteVisualizer(model, colors='yellowbrick') + visualizer.fit(X) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data API Reference diff --git a/docs/api/contrib/boundaries.py b/docs/api/contrib/boundaries.py deleted file mode 100644 index 4cd5ac823..000000000 --- a/docs/api/contrib/boundaries.py +++ /dev/null @@ -1,32 +0,0 @@ -import numpy as np - -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import StandardScaler -from sklearn.datasets import make_moons, make_classification -from sklearn.neighbors import KNeighborsClassifier -from sklearn.svm import SVC - -from yellowbrick.contrib.classifier import DecisionViz - -X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, - random_state=1, n_clusters_per_class=1) - -rng = np.random.RandomState(2) -X += 2 * rng.uniform(size=X.shape) -linearly_separable = (X, y) - -data_set = make_moons(noise=0.3, random_state=0) - -X, y = data_set -X = StandardScaler().fit_transform(X) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42) - -viz = DecisionViz(KNeighborsClassifier(3), title="Nearest Neighbors", features=['Feature One', 'Feature Two'], classes=['A', 'B']) -viz.fit(X_train, y_train) -viz.draw(X_test, y_test) -viz.poof(outpath="images/knn_decisionviz.png") - -viz = DecisionViz(SVC(kernel="linear", C=0.025), title="Linear SVM", features=['Feature One', 'Feature Two'], classes=['A', 'B']) -viz.fit(X_train, y_train) -viz.draw(X_test, y_test) -viz.poof(outpath="images/svc_decisionviz.png") diff --git a/docs/api/contrib/boundaries.rst b/docs/api/contrib/boundaries.rst index 2dbc68dc3..21d677033 100644 --- a/docs/api/contrib/boundaries.rst +++ b/docs/api/contrib/boundaries.rst @@ -5,37 +5,56 @@ DecisionBoundaries Vizualizer The DecisionBoundariesVisualizer is a bivariate data 
visualization algorithm that plots the decision boundaries of each class. -.. code:: python +.. plot:: + :context: close-figs + :alt: DecisionBoundariesVisualizer Nearest Neighbors - from sklearn.model_selection import train_test_split + from sklearn.model_selection import train_test_split as tts from sklearn.preprocessing import StandardScaler - from sklearn.datasets import make_moons, make_classification - - # Create dummy data - X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, - random_state=1, n_clusters_per_class=1) - - rng = np.random.RandomState(2) - X += 2 * rng.uniform(size=X.shape) - linearly_separable = (X, y) + from sklearn.datasets import make_moons + from sklearn.neighbors import KNeighborsClassifier + from yellowbrick.contrib.classifier import DecisionViz data_set = make_moons(noise=0.3, random_state=0) X, y = data_set X = StandardScaler().fit_transform(X) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42) + X_train, X_test, y_train, y_test = tts(X, y, test_size=.4, random_state=42) + + viz = DecisionViz( + KNeighborsClassifier(3), title="Nearest Neighbors", + features=['Feature One', 'Feature Two'], classes=['A', 'B'] + ) + viz.fit(X_train, y_train) + viz.draw(X_test, y_test) + viz.poof() + -.. code:: python +.. 
plot:: + :context: close-figs + :alt: DecisionBoundariesVisualizer Linear SVM + from sklearn.svm import SVC + from sklearn.model_selection import train_test_split as tts + from sklearn.preprocessing import StandardScaler + from sklearn.datasets import make_moons from sklearn.neighbors import KNeighborsClassifier from yellowbrick.contrib.classifier import DecisionViz - viz = DecisionViz(KNeighborsClassifier(3), title="Nearest Neighbors", features=['Feature One', 'Feature Two'], classes=['A', 'B']) + data_set = make_moons(noise=0.3, random_state=0) + + X, y = data_set + X = StandardScaler().fit_transform(X) + X_train, X_test, y_train, y_test = tts(X, y, test_size=.4, random_state=42) + + + viz = DecisionViz( + SVC(kernel="linear", C=0.025), title="Linear SVM", + features=['Feature One', 'Feature Two'], classes=['A', 'B'] + ) viz.fit(X_train, y_train) viz.draw(X_test, y_test) - viz.poof(outpath="images/knn_decisionviz.png") - -.. image:: images/knn_decisionviz.png + viz.poof() API Reference diff --git a/docs/api/contrib/images/knn_decisionviz.png b/docs/api/contrib/images/knn_decisionviz.png deleted file mode 100644 index 94ab3d0e0..000000000 Binary files a/docs/api/contrib/images/knn_decisionviz.png and /dev/null differ diff --git a/docs/api/contrib/images/scatter.png b/docs/api/contrib/images/scatter.png deleted file mode 100644 index b71e17e9b..000000000 Binary files a/docs/api/contrib/images/scatter.png and /dev/null differ diff --git a/docs/api/contrib/images/svc_decisionviz.png b/docs/api/contrib/images/svc_decisionviz.png deleted file mode 100644 index b01cb867f..000000000 Binary files a/docs/api/contrib/images/svc_decisionviz.png and /dev/null differ diff --git a/docs/api/contrib/missing/bar.py b/docs/api/contrib/missing/bar.py deleted file mode 100644 index 51695b844..000000000 --- a/docs/api/contrib/missing/bar.py +++ /dev/null @@ -1,23 +0,0 @@ -import numpy as np -from sklearn.datasets import make_classification - -# Create dummy data -X, y = 
make_classification( - n_samples=400, n_features=10, n_informative=2, n_redundant=3, - n_classes=2, n_clusters_per_class=2, random_state=854 - ) - -# assign some NaN values -X[X > 1.5] = np.nan -features = ["Feature {}".format(str(n)) for n in range(10)] - -from yellowbrick.contrib.missing import MissingValuesBar - -viz = MissingValuesBar(features=features) -viz.fit(X) -viz.poof(outpath="images/missingbar.png") - - -viz = MissingValuesBar(features=features) -viz.fit(X, y=y) -viz.poof(outpath="images/missingbar_with_targets.png") diff --git a/docs/api/contrib/missing/bar.rst b/docs/api/contrib/missing/bar.rst index 1a77ea633..dd876f33e 100644 --- a/docs/api/contrib/missing/bar.rst +++ b/docs/api/contrib/missing/bar.rst @@ -1,54 +1,67 @@ .. -*- mode: rst -*- MissingValues Bar -============================= +================= -The MissingValues Bar visualizer creates a bar graph that counts the number of missing values per feature column. +The MissingValues Bar visualizer creates a bar graph that counts the number of missing values per feature column. If the target ``y`` is supplied to fit, a stacked bar chart is produced. -If the target y is supplied to fit, then produces a stacked bar chart. -**Setup** +Without Targets Supplied +------------------------ -.. code:: python +.. 
plot:: + :context: close-figs + :alt: MissingValues Bar visualization on a dataset with no targets supplied import numpy as np + from sklearn.datasets import make_classification + from yellowbrick.contrib.missing import MissingValuesBar + # Make a classification dataset X, y = make_classification( - n_samples=400, n_features=10, n_informative=2, n_redundant=3, - n_classes=2, n_clusters_per_class=2, random_state=854 - ) - # assign some NaN values + n_samples=400, n_features=10, n_informative=2, n_redundant=3, + n_classes=2, n_clusters_per_class=2, random_state=854 + ) + + # Assign NaN values X[X > 1.5] = np.nan features = ["Feature {}".format(str(n)) for n in range(10)] -------------------------------------------- -Without Targets Supplied -------------------------------------------- - -.. code:: python + # Instantiate the visualizer + visualizer = MissingValuesBar(features=features) - from yellowbrick.contrib.missing import MissingValuesBar + visualizer.fit(X) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data - viz = MissingValuesBar(features=features) - viz.fit(X) - viz.poof() -.. image:: images/missingbar.png +With Targets (``y``) Supplied +----------------------------- -------------------------------------------- -With Targets (y) Supplied -------------------------------------------- +.. plot:: + :context: close-figs + :alt: MissingValuesBar visualization on a dataset with targets supplied -.. 
code:: python + import numpy as np + from sklearn.datasets import make_classification from yellowbrick.contrib.missing import MissingValuesBar - viz = MissingValuesBar(features=features) - viz.fit(X, y=y) # supply the targets via y - viz.poof() + # Make a classification dataset + X, y = make_classification( + n_samples=400, n_features=10, n_informative=2, n_redundant=3, + n_classes=2, n_clusters_per_class=2, random_state=854 + ) + + # Assign NaN values + X[X > 1.5] = np.nan + features = ["Feature {}".format(str(n)) for n in range(10)] + + # Instantiate the visualizer + visualizer = MissingValuesBar(features=features) -.. image:: images/missingbar_with_targets.png + visualizer.fit(X, y=y) # Supply the targets via y + visualizer.poof() # Draw/show/poof the data API Reference diff --git a/docs/api/contrib/missing/dispersion.py b/docs/api/contrib/missing/dispersion.py deleted file mode 100644 index e09cdb6dd..000000000 --- a/docs/api/contrib/missing/dispersion.py +++ /dev/null @@ -1,23 +0,0 @@ -import numpy as np -from sklearn.datasets import make_classification - -# Create dummy data -X, y = make_classification( - n_samples=400, n_features=10, n_informative=2, n_redundant=3, - n_classes=2, n_clusters_per_class=2, random_state=854 - ) - -# assign some NaN values -X[X > 1.5] = np.nan -features = ["Feature {}".format(str(n)) for n in range(10)] - -from yellowbrick.contrib.missing import MissingValuesDispersion - -viz = MissingValuesDispersion(features=features) -viz.fit(X) -viz.poof(outpath="images/missingdispersion.png") - - -viz = MissingValuesDispersion(features=features) -viz.fit(X, y=y) -viz.poof(outpath="images/missingdispersion_with_targets.png") diff --git a/docs/api/contrib/missing/dispersion.rst b/docs/api/contrib/missing/dispersion.rst index 3c36e6139..791c2006e 100644 --- a/docs/api/contrib/missing/dispersion.rst +++ b/docs/api/contrib/missing/dispersion.rst @@ -1,54 +1,64 @@ .. 
-*- mode: rst -*- MissingValues Dispersion -============================= +======================== The MissingValues Dispersion visualizer creates a chart that maps the position of missing values by the order of the index. -**Setup** -.. code:: python +Without Targets Supplied +------------------------ + +.. plot:: + :context: close-figs + :alt: MissingValues Dispersion visualization on a dataset with no targets supplied import numpy as np + from sklearn.datasets import make_classification + from yellowbrick.contrib.missing import MissingValuesDispersion X, y = make_classification( - n_samples=400, n_features=10, n_informative=2, n_redundant=3, - n_classes=2, n_clusters_per_class=2, random_state=854 - ) + n_samples=400, n_features=10, n_informative=2, n_redundant=3, + n_classes=2, n_clusters_per_class=2, random_state=854 + ) + # assign some NaN values X[X > 1.5] = np.nan features = ["Feature {}".format(str(n)) for n in range(10)] -------------------------------------------- -Without Targets Supplied -------------------------------------------- - -.. code:: python - - from yellowbrick.contrib.missing import MissingValuesDispersion + visualizer = MissingValuesDispersion(features=features) - viz = MissingValuesDispersion(features=features) - viz.fit(X) - viz.poof() + visualizer.fit(X) + visualizer.poof() -.. image:: images/missingdispersion.png -------------------------------------------- With Targets (y) Supplied -------------------------------------------- +------------------------- + +.. plot:: + :context: close-figs + :alt: MissingValues Dispersion visualization on a dataset with targets supplied -.. 
code:: python + import numpy as np + from sklearn.datasets import make_classification from yellowbrick.contrib.missing import MissingValuesDispersion - viz = MissingValuesDispersion(features=features) - viz.fit(X, y=y) # supply the targets via y - viz.poof() + X, y = make_classification( + n_samples=400, n_features=10, n_informative=2, n_redundant=3, + n_classes=2, n_clusters_per_class=2, random_state=854 + ) -.. image:: images/missingdispersion_with_targets.png + # assign some NaN values + X[X > 1.5] = np.nan + features = ["Feature {}".format(str(n)) for n in range(10)] + # Instantiate the visualizer + visualizer = MissingValuesDispersion(features=features) + visualizer.fit(X, y=y) # supply the targets via y + visualizer.poof() API Reference diff --git a/docs/api/contrib/missing/images/missingbar.png b/docs/api/contrib/missing/images/missingbar.png deleted file mode 100644 index b4cb4b7ff..000000000 Binary files a/docs/api/contrib/missing/images/missingbar.png and /dev/null differ diff --git a/docs/api/contrib/missing/images/missingbar_with_targets.png b/docs/api/contrib/missing/images/missingbar_with_targets.png deleted file mode 100644 index 9b8af9eb5..000000000 Binary files a/docs/api/contrib/missing/images/missingbar_with_targets.png and /dev/null differ diff --git a/docs/api/contrib/missing/images/missingdispersion.png b/docs/api/contrib/missing/images/missingdispersion.png deleted file mode 100644 index 662729b5b..000000000 Binary files a/docs/api/contrib/missing/images/missingdispersion.png and /dev/null differ diff --git a/docs/api/contrib/missing/images/missingdispersion_with_targets.png b/docs/api/contrib/missing/images/missingdispersion_with_targets.png deleted file mode 100644 index 0729760d7..000000000 Binary files a/docs/api/contrib/missing/images/missingdispersion_with_targets.png and /dev/null differ diff --git a/docs/api/contrib/scatter.py b/docs/api/contrib/scatter.py deleted file mode 100644 index 63cc234e6..000000000 --- 
a/docs/api/contrib/scatter.py +++ /dev/null @@ -1,34 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt - -from yellowbrick.contrib.scatter import ScatterVisualizer - - -def scatter(data, target, outpath, **kwargs): - # Create a new figure and axes - _, ax = plt.subplots() - - # Create the visualizer - visualizer = ScatterVisualizer(ax=ax, **kwargs) - visualizer.fit(data, target) - visualizer.transform(data) - - # Save to disk - visualizer.poof(outpath=outpath) - print(outpath) - - -if __name__ == '__main__': - # Load the classification data set - data = pd.read_csv("../../../examples/data/occupancy/occupancy.csv") - - # Specify the features of interest and the classes of the target - features = ["temperature", "relative humidity", "light", "C02", "humidity"] - classes = ['unoccupied', 'occupied'] - - # Extract the numpy arrays from the data frame - X = data[features] - y = data.occupancy - - # Draw the scatter visualizer - scatter(X, y, "images/scatter.png", x='light', y='C02', classes=classes) diff --git a/docs/api/contrib/scatter.rst b/docs/api/contrib/scatter.rst index 58e4f08fc..4ca7a4c27 100644 --- a/docs/api/contrib/scatter.rst +++ b/docs/api/contrib/scatter.rst @@ -7,31 +7,25 @@ Sometimes for feature analysis you simply need a scatter plot to determine the d A scatter visualizer simply plots two features against each other and colors the points according to the target. This can be useful in assessing the relationship of pairs of features to an individual target. -.. code:: python +.. 
plot:: + :context: close-figs + :alt: ScatterVisualizer on occupancy dataset - # Load the classification data set - data = load_data("occupancy") + from yellowbrick.contrib import ScatterVisualizer + from yellowbrick.datasets import load_occupancy - # Specify the features of interest and the classes of the target - features = ["temperature", "relative humidity", "light", "C02", "humidity"] - classes = ["unoccupied", "occupied"] - - # Extract the numpy arrays from the data frame - X = data[features] - y = data.occupancy - -.. code:: python + # Load the classification dataset + X, y = load_occupancy() - from yellowbrick.contrib.scatter import ScatterVisualizer - - visualizer = ScatterVisualizer(x="light", y="C02", classes=classes) - - visualizer.fit(X, y) - visualizer.transform(X) - visualizer.poof() + # Specify the target classes + classes = ["unoccupied", "occupied"] + # Instantiate the visualizer + visualizer = ScatterVisualizer(x="light", y="CO2", classes=classes) -.. image:: images/scatter.png + visualizer.fit(X, y) # Fit the data to the visualizer + visualizer.transform(X) # Transform the data + visualizer.poof() # Draw/show/poof the data API Reference diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst deleted file mode 100644 index b8e4a0618..000000000 --- a/docs/api/datasets.rst +++ /dev/null @@ -1,81 +0,0 @@ -.. -*- mode: rst -*- - -Example Datasets -================ - -Yellowbrick hosts several datasets wrangled from the `UCI Machine -Learning Repository `__ to present the -examples used throughout this documentation. If you haven't downloaded the data, you can do so by -running: - -:: - - $ python -m yellowbrick.download - -This should create a folder named ``data`` in your current working directory that contains all of the datasets. You can load a specified dataset with ``pandas.read_csv`` as follows: - -.. 
code:: python - - import pandas as pd - - data = pd.read_csv('data/concrete/concrete.csv') - -The following code snippet can be found at the top of the ``examples/examples.ipynb`` notebook in Yellowbrick. Please reference this code when trying to load a specific data set: - -.. code:: python - - import os - - from yellowbrick.download import download_all - - ## The path to the test data sets - FIXTURES = os.path.join(os.getcwd(), "data") - - ## Dataset loading mechanisms - datasets = { - "bikeshare": os.path.join(FIXTURES, "bikeshare", "bikeshare.csv"), - "concrete": os.path.join(FIXTURES, "concrete", "concrete.csv"), - "credit": os.path.join(FIXTURES, "credit", "credit.csv"), - "energy": os.path.join(FIXTURES, "energy", "energy.csv"), - "game": os.path.join(FIXTURES, "game", "game.csv"), - "mushroom": os.path.join(FIXTURES, "mushroom", "mushroom.csv"), - "occupancy": os.path.join(FIXTURES, "occupancy", "occupancy.csv"), - "spam": os.path.join(FIXTURES, "spam", "spam.csv"), - } - - - def load_data(name, download=True): - """ - Loads and wrangles the passed in dataset by name. - If download is specified, this method will download any missing files. - """ - - # Get the path from the datasets - path = datasets[name] - - # Check if the data exists, otherwise download or raise - if not os.path.exists(path): - if download: - download_all() - else: - raise ValueError(( - "'{}' dataset has not been downloaded, " - "use the download.py module to fetch datasets" - ).format(name)) - - - # Return the data frame - return pd.read_csv(path) - - -Unless otherwise specified, most of the examples currently use one or more of the listed datasets. Each dataset has a ``README.md`` with detailed information about the data source, attributes, and target. 
Here is a complete listing of all datasets in Yellowbrick and their associated analytical tasks: - -- **bikeshare**: suitable for regression -- **concrete**: suitable for regression -- **credit**: suitable for classification/clustering -- **energy**: suitable for regression -- **game**: suitable for classification -- **hobbies**: suitable for text analysis -- **mushroom**: suitable for classification/clustering -- **occupancy**: suitable for classification -- **spam**: suitable for binary classification diff --git a/docs/api/datasets/bikeshare.rst b/docs/api/datasets/bikeshare.rst new file mode 100644 index 000000000..1686ce52a --- /dev/null +++ b/docs/api/datasets/bikeshare.rst @@ -0,0 +1,33 @@ +.. -*- mode: rst -*- + +Bikeshare +========= + +This dataset contains the hourly and daily count of rental bikes between years 2011 and 2012 in Capital bikeshare system with the corresponding weather and seasonal information. + +================= =============== +Samples total 17379 +Dimensionality 12 +Features real, positive +Targets ints, 1-977 +Task(s) regression +================= =============== + +Description +----------- + +Bike sharing systems are new generation of traditional bike rentals where whole process from membership, rental and return back has become automatic. Through these systems, user is able to easily rent a bike from a particular position and return back at another position. Currently, there are about over 500 bike-sharing programs around the world which is composed of over 500 thousands bicycles. Today, there exists great interest in these systems due to their important role in traffic, environmental and health issues. + +Apart from interesting real world applications of bike sharing systems, the characteristics of data being generated by these systems make them attractive for the research. Opposed to other transport services such as bus or subway, the duration of travel, departure and arrival position is explicitly recorded in these systems. 
This feature turns bike sharing system into a virtual sensor network that can be used for sensing mobility in the city. Hence, it is expected that most of important events in the city could be detected via monitoring these data. + +Citation +-------- + +Downloaded from the `UCI Machine Learning Repository `_ on May 4, 2017. + +Fanaee-T, Hadi, and Gama, Joao, 'Event labeling combining ensemble detectors and background knowledge', Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg + +Loader +------ + +.. autofunction:: yellowbrick.datasets.loaders.load_bikeshare diff --git a/docs/api/datasets/concrete.rst b/docs/api/datasets/concrete.rst new file mode 100644 index 000000000..4ea4771e8 --- /dev/null +++ b/docs/api/datasets/concrete.rst @@ -0,0 +1,31 @@ +.. -*- mode: rst -*- + +Concrete +======== + +Concrete is the most important material in civil engineering. The concrete compressive strength is a highly nonlinear function of age and ingredients. + +================= =============== +Samples total 1030 +Dimensionality 9 +Features real +Targets float, 2.3-82.6 +Task(s) regression +================= =============== + +Description +----------- + +Given are the variable name, variable type, the measurement unit and a brief description. The concrete compressive strength is the regression problem. The order of this listing corresponds to the order of numerals along the rows of the database. + +Citation +-------- + +Downloaded from the `UCI Machine Learning Repository `_ on October 13, 2016. + +Yeh, I-C. "Modeling of strength of high-performance concrete using artificial neural networks." Cement and Concrete research 28.12 (1998): 1797-1808. + +Loader +------ + +.. autofunction:: yellowbrick.datasets.loaders.load_concrete diff --git a/docs/api/datasets/credit.rst b/docs/api/datasets/credit.rst new file mode 100644 index 000000000..8b609a7c8 --- /dev/null +++ b/docs/api/datasets/credit.rst @@ -0,0 +1,31 @@ +.. 
-*- mode: rst -*- + +Credit +====== + +This research aimed at the case of customers' default payments in Taiwan and compares the predictive accuracy of probability of default among six data mining methods. + +================= =============== +Samples total 30000 +Dimensionality 24 +Features real, int +Targets int, 0 or 1 +Task(s) classification +================= =============== + +Description +----------- + +This research aimed at the case of customers' default payments in Taiwan and compares the predictive accuracy of probability of default among six data mining methods. From the perspective of risk management, the result of predictive accuracy of the estimated probability of default will be more valuable than the binary result of classification - credible or not credible clients. Because the real probability of default is unknown, this study presented the novel "Sorting Smoothing Method" to estimate the real probability of default. With the real probability of default as the response variable (Y), and the predictive probability of default as the independent variable (X), the simple linear regression result (Y = A + BX) shows that the forecasting model produced by artificial neural network has the highest coefficient of determination; its regression intercept (A) is close to zero, and regression coefficient (B) to one. Therefore, among the six data mining techniques, artificial neural network is the only one that can accurately estimate the real probability of default. + +Citation +-------- + +Downloaded from the `UCI Machine Learning Repository `_ on October 13, 2016. + +Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for the predictive accuracy of probability of default of credit card clients. Expert Systems with Applications, 36(2), 2473-2480. + +Loader +------ + +.. 
autofunction:: yellowbrick.datasets.loaders.load_credit diff --git a/docs/api/datasets/energy.rst b/docs/api/datasets/energy.rst new file mode 100644 index 000000000..06fb40475 --- /dev/null +++ b/docs/api/datasets/energy.rst @@ -0,0 +1,70 @@ +.. -*- mode: rst -*- + +Energy +====== + +The dataset was created by Angeliki Xifara (angxifara '@' gmail.com, Civil/Structural Engineer) and was processed by Athanasios Tsanas (tsanasthanasis '@' gmail.com, Oxford Centre for Industrial and Applied Mathematics, University of Oxford, UK). + +================= ========================== +Samples total 768 +Dimensionality 8 +Features real, int +Targets float, 6.01-43.1 +Task(s) regression, classification +================= ========================== + +Description +----------- + +We perform energy analysis using 12 different building shapes simulated in Ecotect. The buildings differ with respect to the glazing area, the glazing area distribution, and the orientation, amongst other parameters. We simulate various settings as functions of the afore-mentioned characteristics to obtain 768 building shapes. The dataset comprises 768 samples and 8 features, aiming to predict two real valued responses. It can also be used as a multi-class classification problem if the response is rounded to the nearest integer. + +Example +------- + +The energy dataset contains a multi-target supervised dataset for both the heating and the cooling load of buildings. By default only the heating load is returned for most examples. To perform a multi-target regression, simply access the dataframe and select both the heating and cooling load columns as follows: + +.. 
code:: python + + from yellowbrick.datasets import load_energy + from sklearn.ensemble import RandomForestRegressor + from sklearn.model_selection import train_test_split as tts + + features = [ + "relative compactness", + "surface area", + "wall area", + "roof area", + "overall height", + "orientation", + "glazing area", + "glazing area distribution", + ] + target = ["heating load", "cooling load"] + + df = load_energy(return_dataset=True).to_dataframe() + X, y = df[features], df[target] + + X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2) + + model = RandomForestRegressor().fit(X_train, y_train) + model.score(X_test, y_test) + + +Note that not all regressors support multi-target regression, one simple strategy in this case is to use a :class:`sklearn.multioutput.MultiOutputRegressor`, which fits an estimator for each target. + + +Citation +-------- + +Downloaded from the `UCI Machine Learning Repository `_ March 23, 2015. + +A. Tsanas, A. Xifara: 'Accurate quantitative estimation of energy performance of residential buildings using statistical machine learning tools', Energy and Buildings, Vol. 49, pp. 560-567, 2012 + +For further details on the data analysis methodology: + +A. Tsanas, 'Accurate telemonitoring of Parkinson's disease symptom severity using nonlinear speech signal processing and statistical machine learning', D.Phil. thesis, University of Oxford, 2012 + +Loader +------ + +.. autofunction:: yellowbrick.datasets.loaders.load_energy diff --git a/docs/api/datasets/game.rst b/docs/api/datasets/game.rst new file mode 100644 index 000000000..6c4388625 --- /dev/null +++ b/docs/api/datasets/game.rst @@ -0,0 +1,45 @@ +.. -*- mode: rst -*- + +Game +==== + +The dataset was created and donated to the UCI ML Repository by John Tromp (tromp '@' cwi.nl). 
+ +================= ============================== +Samples total 67557 +Dimensionality 42 +Features categorical +Targets str: {"win", "loss", "draw"} +Task(s) classification +================= ============================== + +Description +----------- + +This database contains all legal 8-ply positions in the game of connect-4 in which neither player has won yet, and in which the next move is not forced. + +The symbol x represents the first player; o the second. The dataset contains the state of the game by representing each position in a 6x7 grid board. The outcome class is the game theoretical value for the first player. + +Example +------- + +Note that to use the game dataset the categorical data in the features array must be encoded numerically. There are a number of numeric encoding mechanisms such as the :class:`sklearn.preprocessing.OrdinalEncoder` or the :class:`sklearn.preprocessing.OneHotEncoder` that may be used as follows: + +.. code:: python + + from sklearn.preprocessing import OneHotEncoder + from yellowbrick.datasets import load_game + + X, y = load_game() + X = OneHotEncoder().fit_transform(X) + + +Citation +-------- + +Downloaded from the `UCI Machine Learning Repository `_ on May 4, 2017. + +Loader +------ + +.. autofunction:: yellowbrick.datasets.loaders.load_game diff --git a/docs/api/datasets/hobbies.rst b/docs/api/datasets/hobbies.rst new file mode 100644 index 000000000..4c1db2de1 --- /dev/null +++ b/docs/api/datasets/hobbies.rst @@ -0,0 +1,92 @@ +.. -*- mode: rst -*- + +Hobbies +======= + +The Baleen hobbies corpus contains 448 files in 5 categories. 
+ +================= ========================================================== +Samples total 448 +Dimensionality 23738 +Features strings (tokens) +Targets str: {"books", "cinema", "cooking", "gaming", "sports"} +Task(s) classification, clustering +================= ========================================================== + +Description +----------- + +The hobbies corpus is a text corpus wrangled from the `Baleen RSS Corpus `_ in order to enable students and readers to practice different techniques in Natural Language Processing. For more information see `Applied Text Analysis with Python: Enabling Language-Aware Data Products with Machine Learning `_ and the associated `code repository `_. It is structured as: + +Documents and File Size +~~~~~~~~~~~~~~~~~~~~~~~ + +- books: 72 docs (4.1MiB) +- cinema: 100 docs (9.2MiB) +- cooking: 30 docs (3.0MiB) +- gaming: 128 docs (8.8MiB) +- sports: 118 docs (15.9MiB) + +Document Structure +~~~~~~~~~~~~~~~~~~ + +Overall: + +- 7,420 paragraphs (16.562 mean paragraphs per file) +- 14,251 sentences (1.921 mean sentences per paragraph). + +By Category: + +- books: 844 paragraphs and 2,030 sentences +- cinema: 1,475 paragraphs and 3,047 sentences +- cooking: 1,190 paragraphs and 2,425 sentences +- gaming: 1,802 paragraphs and 3,373 sentences +- sports: 2,109 paragraphs and 3,376 sentences + +Words and Vocabulary +~~~~~~~~~~~~~~~~~~~~ + +Word count of 288,520 with a vocabulary of 23,738 (12.154 lexical diversity). + +- books: 41,851 words with a vocabulary size of 7,838 +- cinema: 69,153 words with a vocabulary size of 10,274 +- cooking: 37,854 words with a vocabulary size of 5,038 +- gaming: 70,778 words with a vocabulary size of 9,120 +- sports: 68,884 words with a vocabulary size of 8,028 + +Example +------- + +The hobbies corpus loader returns a ``Corpus`` object with the raw text associated with the data set. This must be vectorized into a numeric form for use with scikit-learn. 
For example, you could use the :class:`sklearn.feature_extraction.text.TfidfVectorizer` as follows: + +.. code:: python + + from yellowbrick.datasets import load_hobbies + + from sklearn.naive_bayes import MultinomialNB + from sklearn.preprocessing import LabelEncoder + from sklearn.feature_extraction.text import TfidfVectorizer + from sklearn.model_selection import train_test_split as tts + + corpus = load_hobbies() + X = TfidfVectorizer().fit_transform(corpus.data) + y = LabelEncoder().fit_transform(corpus.target) + + X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2) + + model = MultinomialNB().fit(X_train, y_train) + model.score(X_test, y_test) + +For more detail on text analytics and machine learning with scikit-learn, please refer to `"Working with Text Data" `_ in the scikit-learn documentation. + +Citation +-------- + +Exported from S3 on: Jan 21, 2017 at 06:42. + +Bengfort, Benjamin, Rebecca Bilbro, and Tony Ojeda. Applied Text Analysis with Python: Enabling Language-aware Data Products with Machine Learning. " O'Reilly Media, Inc.", 2018. + +Loader +------ + +.. autofunction:: yellowbrick.datasets.loaders.load_hobbies diff --git a/docs/api/datasets/index.rst b/docs/api/datasets/index.rst new file mode 100644 index 000000000..55c3b67f5 --- /dev/null +++ b/docs/api/datasets/index.rst @@ -0,0 +1,199 @@ +.. -*- mode: rst -*- + +Example Datasets +================ + +Yellowbrick hosts several datasets wrangled from the `UCI Machine Learning Repository `__ to present the examples used throughout this documentation. These datasets are hosted in our CDN and must be downloaded for use. Typically, when a user calls one of the data loader functions, e.g. ``load_bikeshare()`` the data is automatically downloaded if it's not already on the user's computer. However, for development and testing, or if you know you will be working without internet access, it might be easier to simply download all the data at once. 
+ +The data downloader script can be run as follows: + +:: + + $ python -m yellowbrick.download + +This will download all of the data to the ``fixtures`` directory inside of the Yellowbrick site packages. You can specify the location of the download either as an argument to the downloader script (use ``--help`` for more details) or by setting the ``$YELLOWBRICK_DATA`` environment variable. This is the preferred mechanism because this will also influence how data is loaded in Yellowbrick. + +.. NOTE:: Developers who have downloaded data from Yellowbrick versions earlier than v1.0 may experience some problems with the older data format. If this occurs, you can clear out your data cache by running ``python -m yellowbrick.download --cleanup``. This will remove old datasets and download the new ones. You can also use the ``--no-download`` flag to simply clear the cache without re-downloading data. Users who are having difficulty with datasets can also use this or they can uninstall and reinstall Yellowbrick using ``pip``. + +Once you have downloaded the example datasets, you can load and use them as follows: + +.. code:: python + + from yellowbrick.datasets import load_bikeshare + + X, y = load_bikeshare() # returns features and targets for the bikeshare dataset + + +Each dataset has a ``README.md`` with detailed information about the data source, attributes, and target as well as other metadata. To get access to the metadata or to more precisely control your data access you can return the dataset directly from the loader as follows: + +.. code:: python + + dataset = load_bikeshare(return_dataset=True) + print(dataset.README) + + df = dataset.to_dataframe() + df.head() + + +Datasets +-------- + +Unless otherwise specified, most of the documentation examples currently use one or more of the listed datasets. Here is a complete listing of all datasets in Yellowbrick and the analytical tasks with which they are most commonly associated: + +.. 
Below is a custom ToC, please add new datasets both to the list with a link and + the file containing the dataset information to the toctree directive below. + +- :doc:`bikeshare`: suitable for regression +- :doc:`concrete`: suitable for regression +- :doc:`credit`: suitable for classification/clustering +- :doc:`energy`: suitable for regression +- :doc:`game`: suitable for multi-class classification +- :doc:`hobbies`: suitable for text analysis/classification +- :doc:`mushroom`: suitable for classification/clustering +- :doc:`occupancy`: suitable for classification +- :doc:`spam`: suitable for binary classification +- :doc:`walking`: suitable for time series analysis/clustering +- :doc:`nfl`: suitable for clustering + +.. toctree:: + :hidden: + + bikeshare + concrete + credit + energy + game + hobbies + mushroom + occupancy + spam + walking + nfl + + +Yellowbrick has included these datasets in our package for demonstration purposes only. The datasets have been repackaged with the permission of the authors or in accordance with the terms of use of the source material. If you use a Yellowbrick wrangled dataset, please be sure to cite the original author. + +API Reference +------------- + +By default, the dataset loaders return a features table, ``X``, and a target vector ``y`` when called. If the user has Pandas installed, the data types will be a ``pd.DataFrame`` and ``pd.Series`` respectively, otherwise the data will be returned as numpy arrays. This functionality ensures that the primary use of the datasets, to follow along with the documentation examples, is as simple as possible. However, advanced users may note that there does exist an underlying object with advanced functionality that can be accessed as follows: + +.. code:: python + + dataset = load_occupancy(return_dataset=True) + + +There are two basic types of dataset, the ``Dataset`` which is used for :ref:`tabular data <tabular-data>` loaded from a CSV and the ``Corpus``, used to load :ref:`text corpora <text-corpora>` from disk.
Both types of dataset give access to a readme file, a citation in BibTex format, json metadata that describe the fields and target, and different data types associated with the underlying dataset. Both objects are also responsible for locating the dataset on disk and downloading it safely if it doesn't exist yet. For more on how Yellowbrick downloads and stores data, please see :ref:`local-storage`. + +.. _tabular-data: + +Tabular Data +~~~~~~~~~~~~ + +Most example datasets are returned as tabular data structures loaded either from a .csv file (using Pandas) or from dtype encoded .npz file to ensure correct numpy arrays are returned. The ``Dataset`` object loads the data from these stored files, preferring to use Pandas if it is installed. It then uses metadata to slice the DataFrame into a feature matrix and target array. Using the dataset directly provides extra functionality, and can be retrieved as follows: + +.. code:: python + + from yellowbrick.datasets import load_concrete + dataset = load_concrete(return_dataset=True) + +For example if you wish to get the raw data frame you can do so as follows: + +.. code:: python + + df = dataset.to_dataframe() + df.head() + +There may be additional columns in the DataFrame that were part of the original dataset but were excluded from the featureset. For example, the :doc:`energy dataset <energy>` contains two targets, the heating and the cooling load, but only the heating load is returned by default. The API documentation that follows describes in detail the metadata properties and other functionality associated with the ``Dataset``: + +.. autoclass:: yellowbrick.datasets.base.Dataset + :show-inheritance: + :members: + :inherited-members: + +.. _text-corpora: + +Text Corpora +~~~~~~~~~~~~ + +Yellowbrick supports many text-specific machine learning visualizations in the :doc:`yellowbrick.text <../text/index>` module.
To facilitate these examples and show an end-to-end visual diagnostics workflow that includes text preprocessing, Yellowbrick supports a ``Corpus`` dataset loader that provides access to raw text data from individual documents. This loader is most notably used with the :doc:`hobbies corpus <hobbies>`, a collection of blog posts from different topics that can be used for text classification tasks. + +A text corpus is composed of individual documents that are stored on disk in a directory structure that also identifies document relationships. The file name of each document is a unique file ID (e.g. the MD5 hash of its contents). For example, the hobbies corpus is structured as follows: + +:: + + data/hobbies + ├── README.md + └── books + | ├── 56d62a53c1808113ffb87f1f.txt + | └── 5745a9c7c180810be6efd70b.txt + └── cinema + | ├── 56d629b5c1808113ffb87d8f.txt + | └── 57408e5fc180810be6e574c8.txt + └── cooking + | ├── 56d62b25c1808113ffb8813b.txt + | └── 573f0728c180810be6e2575c.txt + └── gaming + | ├── 56d62654c1808113ffb87938.txt + | └── 574585d7c180810be6ef7ffc.txt + └── sports + ├── 56d62adec1808113ffb88054.txt + └── 56d70f17c180810560aec345.txt + +Unlike the ``Dataset``, corpus dataset loaders do not return ``X`` and ``y`` specially prepared for machine learning. Instead, these loaders return a ``Corpus`` object, which can be used to get a more detailed view of the dataset. For example, to list the unique categories in the corpus, you would access the ``labels`` property as follows: + +.. code:: python + + from yellowbrick.datasets import load_hobbies + + corpus = load_hobbies() + corpus.labels + +Additionally, you can access the list of the absolute paths of each file, which allows you to read individual documents or to use scikit-learn utilities that read the documents streaming one at a time rather than loading them into memory all at once. + +..
code:: python + + with open(corpus.files[8], 'r') as f: + print(f.read()) + +To get the raw text data and target labels, use the ``data`` and ``target`` properties. + +.. code:: python + + X, y = corpus.data, corpus.target + +For more details on the other metadata properties associated with the ``Corpus``, please refer to the API reference below. For more detail on text analytics and machine learning with scikit-learn, please refer to `"Working with Text Data" `_ in the scikit-learn documentation. + +.. autoclass:: yellowbrick.datasets.base.Corpus + :show-inheritance: + :members: + :inherited-members: + +.. _local-storage: + +Local Storage +~~~~~~~~~~~~~ + +Yellowbrick datasets are stored in a compressed format in the cloud to ensure that the install process is as streamlined and lightweight as possible. When you request a dataset via the loader module, Yellowbrick checks to see if it has been downloaded already, and if not, it downloads it to your local disk. + +By default the dataset is stored, uncompressed, in the ``site-packages`` folder of your Python installation alongside the Yellowbrick code. This means that if you install Yellowbrick in multiple virtual environments, the datasets will be downloaded multiple times, once in each environment. + +To clean up downloaded datasets, you may use the download module as a command line tool. Note, however, that this will only clean up the datasets in the yellowbrick package that is on the ``$PYTHONPATH`` of the environment you're currently in. + +.. code:: + + $ python -m yellowbrick.download --cleanup --no-download + +Alternatively, because the data is stored in the same directory as the code, you can simply ``pip uninstall yellowbrick`` to clean up the data. + +A better option may be to use a single dataset directory across all virtual environments.
To specify this directory, you must set the ``$YELLOWBRICK_DATA`` environment variable, usually by adding it to your bash profile so it is exported every time you open a terminal window. This will ensure that you have only downloaded the data once. + +.. code:: + + $ export YELLOWBRICK_DATA="$HOME/.yellowbrick" + $ python -m yellowbrick.download -f + $ ls $YELLOWBRICK_DATA + +To identify the location where the Yellowbrick datasets are stored for your installation of Python/Yellowbrick, you can use the ``get_data_home`` function: + +.. autofunction:: yellowbrick.datasets.path.get_data_home diff --git a/docs/api/datasets/mushroom.rst b/docs/api/datasets/mushroom.rst new file mode 100644 index 000000000..5617bc30d --- /dev/null +++ b/docs/api/datasets/mushroom.rst @@ -0,0 +1,37 @@ +.. -*- mode: rst -*- + +Mushroom +======== + +From Audubon Society Field Guide; mushrooms described in terms of physical characteristics; classification: poisonous or edible. + +================= ============================== +Samples total 8124 +Dimensionality 4 (reduced from 22) +Features categorical +Targets str: {"edible", "poisonous"} +Task(s) classification +================= ============================== + +Description +----------- + +This data set includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family (pp. 500-525). Each species is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended. This latter class was combined with the poisonous one. The Guide clearly states that there is no simple rule for determining the edibility of a mushroom; no rule like "leaflets three, let it be" for Poisonous Oak and Ivy. + +Citation +-------- + +Downloaded from the `UCI Machine Learning Repository `_ on February 28, 2017. + +Schlimmer, Jeffrey Curtis. "Concept acquisition through representational adjustment." (1987). + +Langley, Pat.
"Trading off simplicity and coverage in incremental concept learning." Machine Learning Proceedings 1988 (2014): 73. + +Duch, Włodzisław, Rafał Adamczak, and Krzysztof Grabczewski. "Extraction of logical rules from training data using backpropagation networks." The 1st Online Workshop on Soft Computing. 1996. + +Duch, Wlodzislaw, Rafal Adamczak, and Krzysztof Grabczewski. "Extraction of crisp logical rules using constrained backpropagation networks." (1997). + +Loader +------ + +.. autofunction:: yellowbrick.datasets.loaders.load_mushroom diff --git a/docs/api/datasets/nfl.rst b/docs/api/datasets/nfl.rst new file mode 100644 index 000000000..c2e032dc2 --- /dev/null +++ b/docs/api/datasets/nfl.rst @@ -0,0 +1,32 @@ +.. -*- mode: rst -*- + +NFL +=== + +This dataset is comprised of statistics on all eligible receivers from the 2018 NFL regular season. + +================= ================== +Samples total 494 +Dimensionality 20 +Features str, int +Targets N/A +Task(s) clustering +================= ================== + +Description +----------- + +The dataset consists of an aggregate of all relevant statistics for eligible receivers that played in at least 1 game and had at least 1 target throughout the season. This is not limited to players specifically designated as wide-receivers, but may include other positions such as running-backs and tight-ends. + +Citation +-------- + +Redistributed with the permission of Sports Reference LLC on June 11, 2019 via email. + +Sports Reference LLC, "2018 NFL Receiving," Pro-Football-Reference.com - Pro Football Statistics and History. +[Online]. Available `here `_. [Accessed: 18-Jun-2019] + +Loader +------ + +.. autofunction:: yellowbrick.datasets.loaders.load_nfl diff --git a/docs/api/datasets/occupancy.rst b/docs/api/datasets/occupancy.rst new file mode 100644 index 000000000..23977a4f8 --- /dev/null +++ b/docs/api/datasets/occupancy.rst @@ -0,0 +1,29 @@ +.. 
-*- mode: rst -*- + +Occupancy +========= + +Experimental data used for binary classification (room occupancy) from Temperature, Humidity, Light and CO2. Ground-truth occupancy was obtained from time stamped pictures that were taken every minute. + +================= ========================================== +Samples total 20560 +Dimensionality 6 +Features real, positive +Targets int: {1 for occupied, 0 for not occupied} +Task(s) classification +Samples per class imbalanced +================= ========================================== + +Description +----------- + +Three data sets are submitted, for training and testing. Ground-truth occupancy was obtained from time stamped pictures that were taken every minute. For the journal publication, the processing R scripts can be found on `GitHub `_. + +Citation +-------- + +Downloaded from the `UCI Machine Learning Repository `_ on October 13, 2016. + +Candanedo, Luis M., and Véronique Feldheim. "Accurate occupancy detection of an office room from light, temperature, humidity and CO 2 measurements using statistical learning models." Energy and Buildings 112 (2016): 28-39. + +.. autofunction:: yellowbrick.datasets.loaders.load_occupancy diff --git a/docs/api/datasets/spam.rst b/docs/api/datasets/spam.rst new file mode 100644 index 000000000..0661d0614 --- /dev/null +++ b/docs/api/datasets/spam.rst @@ -0,0 +1,38 @@ +.. -*- mode: rst -*- + +Spam +==== + +Classifying Email as Spam or Non-Spam. + +================= ===================================== +Samples total 4601 +Dimensionality 57 +Features real, integer +Targets int: {1 for spam, 0 for not spam} +Task(s) classification +================= ===================================== + + +Description +----------- + +The "spam" concept is diverse: advertisements for products/web sites, make +money fast schemes, chain letters, pornography... + +Our collection of spam e-mails came from our postmaster and individuals who had filed spam. 
Our collection of non-spam e-mails came from filed work and personal e-mails, and hence the word 'george' and the area code '650' are indicators of non-spam. These are useful when constructing a personalized spam filter. One would either have to blind such non-spam indicators or get a very wide collection of non-spam to generate a general purpose spam filter. + +Determine whether a given email is spam or not. + +~7% misclassification error. False positives (marking good mail as spam) are very undesirable. If we insist on zero false positives in the training/testing set, 20-25% of the spam passed through the filter. + +Citation +-------- +Downloaded from the `UCI Machine Learning Repository `_ on March 23, 2018. + +Cranor, Lorrie Faith, and Brian A. LaMacchia. "Spam!." Communications of the ACM 41.8 (1998): 74-83. + +Loader +------ + +.. autofunction:: yellowbrick.datasets.loaders.load_spam diff --git a/docs/api/datasets/walking.rst b/docs/api/datasets/walking.rst new file mode 100644 index 000000000..335ec7768 --- /dev/null +++ b/docs/api/datasets/walking.rst @@ -0,0 +1,31 @@ +.. -*- mode: rst -*- + +Walking +======= + +The dataset collects data from an Android smartphone positioned in the chest pocket. Accelerometer Data are collected from 22 participants walking in the wild over a predefined path. The dataset is intended for Activity Recognition research purposes. It provides challenges for identification and authentication of people using motion patterns. Sampling frequency of the accelerometer: DELAY_FASTEST with network connections disabled. + +================= =========================== +Samples total 149331 +Dimensionality 4 +Features real +Targets int, 1-22 +Task(s) classification, clustering +================= =========================== + +Description +----------- + +In this article, a novel technique for user's authentication and verification using gait as a biometric unobtrusive pattern is proposed. The method is based on a two stages pipeline. 
First, a general activity recognition classifier is personalized for a specific user using a small sample of her/his walking pattern. As a result, the system is much more selective with respect to the new walking pattern. A second stage verifies whether the user is an authorized one or not. This stage is defined as a one-class classification problem. In order to solve this problem, a four-layer architecture is built around the geometric concept of convex hull. This architecture allows to improve robustness to outliers, modeling non-convex shapes, and to take into account temporal coherence information. Two different scenarios are proposed as validation with two different wearable systems. First, a custom high-performance wearable system is built and used in a free environment. A second dataset is acquired from an Android-based commercial device in a 'wild' scenario with rough terrains, adversarial conditions, crowded places and obstacles. Results on both systems and datasets are very promising, reducing the verification error rates by an order of magnitude with respect to the state-of-the-art technologies. + +Citation +-------- + +Downloaded from the `UCI Machine Learning Repository `_ on August 23, 2018. + +Casale, Pierluigi, Oriol Pujol, and Petia Radeva. "Personalization and user verification in wearable systems using biometric walking patterns." Personal and Ubiquitous Computing 16.5 (2012): 563-580. + +Loader +------ + +.. 
autofunction:: yellowbrick.datasets.loaders.load_walking diff --git a/docs/api/features/images/concrete_isomap_manifold.png b/docs/api/features/images/concrete_isomap_manifold.png index 186ac4e41..124264e9d 100644 Binary files a/docs/api/features/images/concrete_isomap_manifold.png and b/docs/api/features/images/concrete_isomap_manifold.png differ diff --git a/docs/api/features/images/concrete_tsne_manifold.png b/docs/api/features/images/concrete_tsne_manifold.png index f0e3d3bd6..44529db55 100644 Binary files a/docs/api/features/images/concrete_tsne_manifold.png and b/docs/api/features/images/concrete_tsne_manifold.png differ diff --git a/docs/api/features/images/fast_vs_slow_parallel_coordinates.png b/docs/api/features/images/fast_vs_slow_parallel_coordinates.png deleted file mode 100644 index d7d021b7b..000000000 Binary files a/docs/api/features/images/fast_vs_slow_parallel_coordinates.png and /dev/null differ diff --git a/docs/api/features/images/feature_importances.png b/docs/api/features/images/feature_importances.png deleted file mode 100644 index e179794db..000000000 Binary files a/docs/api/features/images/feature_importances.png and /dev/null differ diff --git a/docs/api/features/images/feature_importances_coef.png b/docs/api/features/images/feature_importances_coef.png deleted file mode 100644 index de8af8640..000000000 Binary files a/docs/api/features/images/feature_importances_coef.png and /dev/null differ diff --git a/docs/api/features/images/jointplot.png b/docs/api/features/images/jointplot.png deleted file mode 100644 index 89e116936..000000000 Binary files a/docs/api/features/images/jointplot.png and /dev/null differ diff --git a/docs/api/features/images/jointplot_hex.png b/docs/api/features/images/jointplot_hex.png deleted file mode 100644 index ee5964296..000000000 Binary files a/docs/api/features/images/jointplot_hex.png and /dev/null differ diff --git a/docs/api/features/images/normalized_sampled_parallel_coordinates.png 
b/docs/api/features/images/normalized_sampled_parallel_coordinates.png deleted file mode 100644 index fbefb8dec..000000000 Binary files a/docs/api/features/images/normalized_sampled_parallel_coordinates.png and /dev/null differ diff --git a/docs/api/features/images/occupancy_select_k_best_isomap_manifold.png b/docs/api/features/images/occupancy_select_k_best_isomap_manifold.png index 62fe1a378..b15feed80 100644 Binary files a/docs/api/features/images/occupancy_select_k_best_isomap_manifold.png and b/docs/api/features/images/occupancy_select_k_best_isomap_manifold.png differ diff --git a/docs/api/features/images/occupancy_tsne_manifold.png b/docs/api/features/images/occupancy_tsne_manifold.png index 4fe31fc8d..ed2f566eb 100644 Binary files a/docs/api/features/images/occupancy_tsne_manifold.png and b/docs/api/features/images/occupancy_tsne_manifold.png differ diff --git a/docs/api/features/images/parallel_coordinates.png b/docs/api/features/images/parallel_coordinates.png deleted file mode 100644 index 459030b96..000000000 Binary files a/docs/api/features/images/parallel_coordinates.png and /dev/null differ diff --git a/docs/api/features/images/pca_biplot_2d.png b/docs/api/features/images/pca_biplot_2d.png deleted file mode 100644 index 594072c2a..000000000 Binary files a/docs/api/features/images/pca_biplot_2d.png and /dev/null differ diff --git a/docs/api/features/images/pca_biplot_3d.png b/docs/api/features/images/pca_biplot_3d.png deleted file mode 100644 index a5c1abc61..000000000 Binary files a/docs/api/features/images/pca_biplot_3d.png and /dev/null differ diff --git a/docs/api/features/images/pca_projection_2d.png b/docs/api/features/images/pca_projection_2d.png deleted file mode 100644 index 71da82205..000000000 Binary files a/docs/api/features/images/pca_projection_2d.png and /dev/null differ diff --git a/docs/api/features/images/pca_projection_3d.png b/docs/api/features/images/pca_projection_3d.png deleted file mode 100644 index 47f22fa8b..000000000 Binary 
files a/docs/api/features/images/pca_projection_3d.png and /dev/null differ diff --git a/docs/api/features/images/radviz.png b/docs/api/features/images/radviz.png deleted file mode 100644 index 679a6ffce..000000000 Binary files a/docs/api/features/images/radviz.png and /dev/null differ diff --git a/docs/api/features/images/rank1d_shapiro.png b/docs/api/features/images/rank1d_shapiro.png deleted file mode 100644 index 5d7bcb29d..000000000 Binary files a/docs/api/features/images/rank1d_shapiro.png and /dev/null differ diff --git a/docs/api/features/images/rank2d_covariance.png b/docs/api/features/images/rank2d_covariance.png deleted file mode 100644 index d7a3bf57c..000000000 Binary files a/docs/api/features/images/rank2d_covariance.png and /dev/null differ diff --git a/docs/api/features/images/rank2d_pearson.png b/docs/api/features/images/rank2d_pearson.png deleted file mode 100644 index 54916b3e7..000000000 Binary files a/docs/api/features/images/rank2d_pearson.png and /dev/null differ diff --git a/docs/api/features/images/rfecv_credit.png b/docs/api/features/images/rfecv_credit.png deleted file mode 100644 index e324f8d64..000000000 Binary files a/docs/api/features/images/rfecv_credit.png and /dev/null differ diff --git a/docs/api/features/images/s_curve_hessian_manifold.png b/docs/api/features/images/s_curve_hessian_manifold.png deleted file mode 100644 index bc55f5454..000000000 Binary files a/docs/api/features/images/s_curve_hessian_manifold.png and /dev/null differ diff --git a/docs/api/features/images/s_curve_isomap_manifold.png b/docs/api/features/images/s_curve_isomap_manifold.png deleted file mode 100644 index b15f6a9ab..000000000 Binary files a/docs/api/features/images/s_curve_isomap_manifold.png and /dev/null differ diff --git a/docs/api/features/images/s_curve_lle_manifold.png b/docs/api/features/images/s_curve_lle_manifold.png deleted file mode 100644 index f9d271e3f..000000000 Binary files a/docs/api/features/images/s_curve_lle_manifold.png and 
/dev/null differ diff --git a/docs/api/features/images/s_curve_ltsa_manifold.png b/docs/api/features/images/s_curve_ltsa_manifold.png deleted file mode 100644 index 45aac6f49..000000000 Binary files a/docs/api/features/images/s_curve_ltsa_manifold.png and /dev/null differ diff --git a/docs/api/features/images/s_curve_mds_manifold.png b/docs/api/features/images/s_curve_mds_manifold.png deleted file mode 100644 index f72621153..000000000 Binary files a/docs/api/features/images/s_curve_mds_manifold.png and /dev/null differ diff --git a/docs/api/features/images/s_curve_modified_manifold.png b/docs/api/features/images/s_curve_modified_manifold.png deleted file mode 100644 index 33b1e6f64..000000000 Binary files a/docs/api/features/images/s_curve_modified_manifold.png and /dev/null differ diff --git a/docs/api/features/images/s_curve_spectral_manifold.png b/docs/api/features/images/s_curve_spectral_manifold.png deleted file mode 100644 index 973b45bff..000000000 Binary files a/docs/api/features/images/s_curve_spectral_manifold.png and /dev/null differ diff --git a/docs/api/features/images/s_curve_tsne_manifold.png b/docs/api/features/images/s_curve_tsne_manifold.png deleted file mode 100644 index 9dc07573d..000000000 Binary files a/docs/api/features/images/s_curve_tsne_manifold.png and /dev/null differ diff --git a/docs/api/features/importances.py b/docs/api/features/importances.py deleted file mode 100644 index 34abe033a..000000000 --- a/docs/api/features/importances.py +++ /dev/null @@ -1,51 +0,0 @@ -import os -import pandas as pd -import matplotlib.pyplot as plt - -from yellowbrick.features.importances import FeatureImportances -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.linear_model import Lasso - - -DATA_DIR = os.path.relpath(os.path.join( - os.path.dirname(__file__), "..", "..", "..", "examples", "data" -)) - - -def feature_importances_(outpath): - occupancy = pd.read_csv(os.path.join(DATA_DIR, "occupancy", "occupancy.csv")) - - feats = [ 
- "temperature", "relative humidity", "light", "C02", "humidity" - ] - - X = occupancy[feats] - y = occupancy['occupancy'].astype(int) - - fig = plt.figure() - ax = fig.add_subplot() - - viz = FeatureImportances(GradientBoostingClassifier(), ax=ax) - viz.fit(X, y) - viz.poof(outpath=outpath) - - -def coef_(outpath): - concrete = pd.read_csv(os.path.join(DATA_DIR, "concrete", "concrete.csv")) - - feats = ['cement','slag','ash','water','splast','coarse','fine','age'] - X = concrete[feats] - y = concrete['strength'] - - fig = plt.figure() - ax = fig.add_subplot() - - feats = list(map(lambda s: s.title(), feats)) - viz = FeatureImportances(Lasso(), ax=ax, labels=feats, relative=False) - viz.fit(X, y) - viz.poof(outpath=outpath) - - -if __name__ == '__main__': - feature_importances_("images/feature_importances.png") - coef_("images/feature_importances_coef.png") diff --git a/docs/api/features/index.rst b/docs/api/features/index.rst index b9b769a75..63879a5fd 100644 --- a/docs/api/features/index.rst +++ b/docs/api/features/index.rst @@ -20,8 +20,6 @@ At the moment we have the following feature analysis visualizers implemented: detect classes or clusters - :doc:`pca`: project higher dimensions into a visual space using PCA - :doc:`manifold`: visualize high dimensional data using manifold learning -- :doc:`importances`: rank features by relative importance in a model -- :doc:`rfecv`: select a subset of features by importance - :doc:`jointplot`: (aka Jointplots) plot 2D correlation between features and target Feature analysis visualizers implement the ``Transformer`` API from @@ -41,8 +39,6 @@ is called which displays the image. from yellowbrick.features.jointplot import JointPlotVisualizer from yellowbrick.features.pca import PCADecomposition from yellowbrick.features.manifold import Manifold - from yellowbrick.features.importances import FeatureImportances - from yellowbrick.features.rfecv import RFECV .. toctree:: @@ -53,6 +49,4 @@ is called which displays the image. 
pcoords pca manifold - importances - rfecv jointplot diff --git a/docs/api/features/jointplot.py b/docs/api/features/jointplot.py deleted file mode 100644 index d86dff126..000000000 --- a/docs/api/features/jointplot.py +++ /dev/null @@ -1,34 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt - -from yellowbrick.features import JointPlotVisualizer - - -def jointplot(X, y, outpath, **kwargs): - # Create the visualizer - visualizer = JointPlotVisualizer(**kwargs) - visualizer.fit(X, y) - visualizer.transform(X) - - # Save to disk - visualizer.poof(outpath=outpath) - plt.savefig(outpath) - - -if __name__ == '__main__': - - # Load the regression data set - data = pd.read_csv("../../../examples/data/concrete/concrete.csv") - - feature = 'cement' - target = 'strength' - - # Get the X and y data from the DataFrame - Xs = data[feature] - ys = data[target] - - # Draw the joint plot visualizer - jointplot(Xs, ys, "images/jointplot.png", feature=feature, target=target) - - # Draw the joint plot visualizer with hexadecimal scatter plot - jointplot(Xs, ys, "images/jointplot_hex.png", feature=feature, target=target, joint_plot='hex') diff --git a/docs/api/features/jointplot.rst b/docs/api/features/jointplot.rst index 90847fda3..cda14ccc9 100644 --- a/docs/api/features/jointplot.rst +++ b/docs/api/features/jointplot.rst @@ -8,49 +8,69 @@ Sometimes for feature analysis you simply need a scatter plot to determine the d Joint Plot Visualization ------------------------ -A joint plot visualizer plots a feature against the target and shows the distribution of each via a histogram on each axis. +The ``JointPlotVisualizer`` plots a feature against the target and shows the distribution of each via a histogram on each axis. -.. code:: python - # Load the data - df = load_data("concrete") - feature = "cement" - target = "strength" +.. 
plot:: + :context: close-figs + :alt: JointPlot - # Get the X and y data from the DataFrame - X = df[feature] - y = df[target] + from yellowbrick.datasets import load_concrete + from yellowbrick.features import JointPlotVisualizer + + # Load the dataset + X, y = load_concrete() + + # Instantiate the visualizer + visualizer = JointPlotVisualizer(columns="cement") + + visualizer.fit_transform(X, y) # Fit and transform the data + visualizer.poof() # Draw/show/poof the data -.. code:: python +The ``JointPlotVisualizer`` can also be used to compare two features. + +.. plot:: + :context: close-figs + :alt: JointPlot comparing two features + + from yellowbrick.datasets import load_concrete from yellowbrick.features import JointPlotVisualizer - visualizer = JointPlotVisualizer(feature=feature, target=target) + # Load the dataset + X, y = load_concrete() - visualizer.fit(X, y) - visualizer.poof() + # Instantiate the visualizer + visualizer = JointPlotVisualizer(columns=["cement", "ash"]) + visualizer.fit_transform(X, y) # Fit and transform the data + visualizer.poof() # Draw/show/poof the data -.. image:: images/jointplot.png -The joint plot visualizer can also be plotted with hexbins in the case of many, many points. +In addition, the ``JointPlotVisualizer`` can be plotted with hexbins in the case +of many, many points. -.. code:: python +.. plot:: + :context: close-figs + :alt: JointPlot + + from yellowbrick.datasets import load_concrete + from yellowbrick.features import JointPlotVisualizer - visualizer = JointPlotVisualizer( - feature=feature, target=target, joint_plot='hex' - ) + # Load the dataset + X, y = load_concrete() - visualizer.fit(X, y) - visualizer.poof() + # Instantiate the visualizer + visualizer = JointPlotVisualizer(columns="cement", kind="hexbin") -.. image:: images/jointplot_hex.png + visualizer.fit_transform(X, y) # Fit and transform the data + visualizer.poof() # Draw/show/poof the data API Reference ------------- .. 
automodule:: yellowbrick.features.jointplot - :members: JointPlotVisualizer + :members: JointPlot :undoc-members: :show-inheritance: diff --git a/docs/api/features/manifold.py b/docs/api/features/manifold.py index faff912fc..eb02263b6 100644 --- a/docs/api/features/manifold.py +++ b/docs/api/features/manifold.py @@ -2,9 +2,12 @@ # manifold.py # Produce images for manifold documentation. # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Sat May 12 11:26:18 2018 -0400 # +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# # ID: manifold.py [] benjamin@bengfort.com $ """ @@ -16,84 +19,60 @@ ########################################################################## import os - -import pandas as pd import matplotlib.pyplot as plt from sklearn import datasets from sklearn.pipeline import Pipeline +from sklearn.feature_selection import f_classif from sklearn.feature_selection import SelectKBest -from sklearn.feature_selection import f_classif#, mutual_info_classif + +from yellowbrick.datasets import load_occupancy, load_concrete from yellowbrick.features.manifold import Manifold, MANIFOLD_ALGORITHMS + SKIP = ( - 'ltsa', # produces no result - 'hessian', # errors because of matrix - 'mds', # uses way too much memory + "ltsa", # produces no result + "hessian", # errors because of matrix + "mds", # uses way too much memory ) -FIXTURES = os.path.normpath(os.path.join( - os.path.dirname(__file__), - "..", "..", "..", "examples", "data" -)) - - -def load_occupancy_data(): - # Load the classification data set - data = pd.read_csv(os.path.join(FIXTURES, 'occupancy', 'occupancy.csv')) - - # Specify the features of interest and the classes of the target - features = ["temperature", "relative humidity", "light", "C02", "humidity"] - - X = data[features] - y = pd.Series(['occupied' if y == 1 else 'unoccupied' for y in data.occupancy]) - - return X, y - - -def load_concrete_data(): - # Load a regression data set - data = 
pd.read_csv(os.path.join(FIXTURES, 'concrete', 'concrete.csv')) - - # Specify the features of interest - feature_names = ['cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'] - target_name = 'strength' - - # Get the X and y data from the DataFrame - X = data[feature_names] - y = data[target_name] - - return X, y +FIXTURES = os.path.normpath( + os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples", "data") +) -def dataset_example(dataset="occupancy", manifold="all", path="images/"): +def dataset_example(dataset="occupancy", manifold="all", path="images/", **kwargs): if manifold == "all": if path is not None and not os.path.isdir(path): "please specify a directory to save examples to" for algorithm in MANIFOLD_ALGORITHMS: - if algorithm in SKIP: continue + if algorithm in SKIP: + continue print("generating {} {} manifold".format(dataset, algorithm)) fpath = os.path.join(path, "{}_{}_manifold.png".format(dataset, algorithm)) try: dataset_example(dataset, algorithm, fpath) except Exception as e: - print("could not visualize {} manifold on {} data: {}".format(algorithm, dataset, e)) + print( + "could not visualize {} manifold on {} data: {}".format( + algorithm, dataset, e + ) + ) continue - # Break here! 
return # Create single example - _, ax = plt.subplots(figsize=(9,6)) - oz = Manifold(ax=ax, manifold=manifold) + _, ax = plt.subplots(figsize=(9, 6)) + oz = Manifold(ax=ax, manifold=manifold, **kwargs) if dataset == "occupancy": - X, y = load_occupancy_data() + X, y = load_occupancy() elif dataset == "concrete": - X, y = load_concrete_data() + X, y = load_concrete() else: raise Exception("unknown dataset '{}'".format(dataset)) @@ -101,17 +80,23 @@ def dataset_example(dataset="occupancy", manifold="all", path="images/"): oz.poof(outpath=path) -def select_features_example(algorithm='isomap', path="images/occupancy_select_k_best_isomap_manifold.png"): - _, ax = plt.subplots(figsize=(9,6)) +def select_features_example( + algorithm="isomap", + path="images/occupancy_select_k_best_isomap_manifold.png", + **kwargs +): + _, ax = plt.subplots(figsize=(9, 6)) - model = Pipeline([ - ("selectk", SelectKBest(k=3, score_func=f_classif)), - ("viz", Manifold(ax=ax, manifold=algorithm)), - ]) + model = Pipeline( + [ + ("selectk", SelectKBest(k=3, score_func=f_classif)), + ("viz", Manifold(ax=ax, manifold=algorithm, **kwargs)), + ] + ) - X, y = load_occupancy_data() + X, y = load_occupancy() model.fit(X, y) - model.named_steps['viz'].poof(outpath=path) + model.named_steps["viz"].poof(outpath=path) class SCurveExample(object): @@ -131,7 +116,7 @@ def _make_path(self, path, name): if not os.path.exists(path): os.mkdirs(path) - if os.path.isdir(path) : + if os.path.isdir(path): return os.path.join(path, name) return path @@ -146,13 +131,10 @@ def plot_manifold_embedding(self, algorithm="lle", path="images"): """ Draw the manifold embedding for the specified algorithm """ - _, ax = plt.subplots(figsize=(9,6)) + _, ax = plt.subplots(figsize=(9, 6)) path = self._make_path(path, "s_curve_{}_manifold.png".format(algorithm)) - oz = Manifold( - ax=ax, manifold=algorithm, - target='continuous', colors='nipy_spectral' - ) + oz = Manifold(ax=ax, manifold=algorithm, colors="nipy_spectral") 
oz.fit(self.X, self.y) oz.poof(outpath=path) @@ -165,11 +147,12 @@ def plot_all_manifolds(self, path="images"): self.plot_manifold_embedding(algorithm) -if __name__ == '__main__': +if __name__ == "__main__": # curve = SCurveExample() # curve.plot_all_manifolds() - - dataset_example('occupancy', 'tsne', path="images/occupancy_tsne_manifold.png") - # dataset_example('concrete', 'all') - - # select_features_example() + dataset_example("concrete", "tsne", path="images/concrete_tsne_manifold.png") + dataset_example("occupancy", "tsne", path="images/occupancy_tsne_manifold.png") + dataset_example( + "concrete", "isomap", path="images/concrete_isomap_manifold.png", n_neighbors=10 + ) + select_features_example(algorithm="isomap", n_neighbors=10) diff --git a/docs/api/features/manifold.rst b/docs/api/features/manifold.rst index c05147517..e45ba0f81 100644 --- a/docs/api/features/manifold.rst +++ b/docs/api/features/manifold.rst @@ -59,27 +59,22 @@ discrete labels - the classes or categories in the supervised problem, or the clusters they belong to in the unsupervised version. The manifold visualizes this by assigning a color to each label and showing the labels in a legend. -.. code:: python - - # Load the classification data set - data = load_data('occupancy') +.. note to contributors: the below code takes a long time to run so has not been + modified with a plot directive. See manifold.py to regenerate images. - # Specify the features of interest - features = [ - "temperature", "relative humidity", "light", "C02", "humidity" - ] +.. code:: python - # Extract the instances and target - X = data[features] - y = data.occupancy + from yellowbrick.features import Manifold + from yellowbrick.datasets import load_occupancy -.. 
code:: python + # Load the classification dataset + X, y = load_occupancy() - from yellowbrick.features.manifold import Manifold + # Instantiate the visualizer + visualizer = Manifold(manifold="tsne") - visualizer = Manifold(manifold='tsne', target='discrete') - visualizer.fit_transform(X,y) - visualizer.poof() + visualizer.fit(X, y) # Fit the data + visualizer.poof() # Draw/show/poof the data .. image:: images/occupancy_tsne_manifold.png @@ -91,35 +86,32 @@ another is to sample your instances (e.g. using ``train_test_split`` to preserve class stratification) or to filter features to decrease sparsity in the dataset. -One common mechanism is to use `SelectKBest` to select the features that have +One common mechanism is to use ``SelectKBest`` to select the features that have a statistical correlation with the target dataset. For example, we can use the ``f_classif`` score to find the 3 best features in our occupancy dataset. +.. note to contributors: the below code takes a long time to run so has not been + modified with a plot directive. See manifold.py to regenerate images. + .. 
code:: python from sklearn.pipeline import Pipeline - from sklearn.feature_selection import SelectKBest - from sklearn.feature_selection import f_classif + from sklearn.feature_selection import f_classif, SelectKBest - model = Pipeline([ - ("selectk", SelectKBest(k=3, score_func=f_classif)), - ("viz", Manifold(manifold='isomap', target='discrete')), - ]) + from yellowbrick.features import Manifold + from yellowbrick.datasets import load_occupancy # Load the classification dataset - data = load_data("occupancy") - - # Specify the features of interest - features = [ - "temperature", "relative humidity", "light", "CO2", "humidity" - ] + X, y = load_occupancy() - # Extract the instances and target - X = data[features] - y = data.occupancy + # Create a pipeline + model = Pipeline([ + ("selectk", SelectKBest(k=3, score_func=f_classif)), + ("viz", Manifold(manifold="isomap", n_neighbors=10)), + ]) - model.fit(X, y) - model.named_steps['viz'].poof() + model.fit(X, y) # Fit the data to the model + model.named_steps['viz'].poof() # Draw/show/poof the data .. image:: images/occupancy_select_k_best_isomap_manifold.png @@ -127,27 +119,26 @@ Continuous Target ----------------- For a regression target or to specify color as a heat-map of continuous -values, specify ``target='continuous'``. Note that by default the param -``target='auto'`` is set, which determines if the target is discrete or +values, specify ``target_type="continuous"``. Note that by default the param +``target_type="auto"`` is set, which determines if the target is discrete or continuous by counting the number of unique values in ``y``. +.. note to contributors: the below code takes a long time to run so has not been + modified with a plot directive. See manifold.py to regenerate images. + .. 
code:: python - # Specify the features of interest - feature_names = [ - 'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age' - ] - target_name = 'strength' + from yellowbrick.features import Manifold + from yellowbrick.datasets import load_concrete - # Get the X and y data from the DataFrame - X = data[feature_names] - y = data[target_name] + # Load the regression dataset + X, y = load_concrete() -.. code:: python + # Instantiate the visualizer + visualizer = Manifold(manifold="isomap", n_neighbors=10) - visualizer = Manifold(manifold='isomap', target='continuous') - visualizer.fit_transform(X,y) - visualizer.poof() + visualizer.fit(X, y) # Fit the data + visualizer.poof() # Draw/show/poof the data .. image:: images/concrete_isomap_manifold.png diff --git a/docs/api/features/pca.py b/docs/api/features/pca.py deleted file mode 100644 index 4daed6d5f..000000000 --- a/docs/api/features/pca.py +++ /dev/null @@ -1,57 +0,0 @@ -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt - -from yellowbrick.features.pca import PCADecomposition - - -def pca(X, y, outpath, **kwargs): - # Create a new figure and axes - _, ax = plt.subplots() - - viz = PCADecomposition(ax=ax, **kwargs) - viz.fit_transform(X, y) - viz.poof(outpath=outpath) - - -def load_credit(): - # Load the credit data set - data = pd.read_csv("../../../examples/data/credit/credit.csv") - - # Specify the features of interest - target = "default" - features = [col for col in data.columns if col != target] - - # Extract the numpy arrays from the data frame - X = data[features] - y = data[target] - return X, y - - -def load_concrete(): - # Load the credit data set - data = pd.read_csv("../../../examples/data/concrete/concrete.csv") - - # Specify the features of interest - feature_names = ['cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'] - target_name = 'strength' - - # Get the X and y data from the DataFrame - X = data[feature_names] - y = data[target_name] - - 
return X, y - - -if __name__ == '__main__': - - # Draw PCA with credit data set - X, y = load_credit() - colors = np.array(['r' if yi else 'b' for yi in y]) - pca(X, y, "images/pca_projection_2d.png", scale=True, color=colors) - pca(X, y, "images/pca_projection_3d.png", scale=True, color=colors, proj_dim=3) - - # Draw biplots with concrete data set - X, y = load_concrete() - pca(X, y, "images/pca_biplot_2d.png", scale=True, proj_features=True) - pca(X, y, "images/pca_biplot_3d.png", scale=True, proj_features=True, proj_dim=3) diff --git a/docs/api/features/pca.rst b/docs/api/features/pca.rst index d10f66522..d3f799dcc 100644 --- a/docs/api/features/pca.rst +++ b/docs/api/features/pca.rst @@ -5,87 +5,79 @@ PCA Projection The PCA Decomposition visualizer utilizes principal component analysis to decompose high dimensional data into two or three dimensions so that each instance can be plotted in a scatter plot. The use of PCA means that the projected dataset can be analyzed along axes of principal variation and can be interpreted to determine if spherical distance metrics can be utilized. -.. code:: python +.. plot:: + :context: close-figs + :alt: PCA Projection, 2D - # Load the classification data set - data = load_data('credit') + from yellowbrick.datasets import load_credit + from yellowbrick.features.pca import PCADecomposition # Specify the features of interest and the target - target = "default" - features = [col for col in data.columns if col != target] - - # Extract the instance data and the target - X = data[features] - y = data[target] + X, y = load_credit() # Create a list of colors to assign to points in the plot colors = np.array(['r' if yi else 'b' for yi in y]) -.. code:: python - - from yellowbrick.features.pca import PCADecomposition - visualizer = PCADecomposition(scale=True, color=colors) visualizer.fit_transform(X, y) visualizer.poof() -.. 
image:: images/pca_projection_2d.png - The PCA projection can also be plotted in three dimensions to attempt to visualize more principal components and get a better sense of the distribution in high dimensions. -.. code:: python +.. plot:: + :context: close-figs + :alt: PCA Projection, 3D + + from yellowbrick.datasets import load_credit + from yellowbrick.features.pca import PCADecomposition + + X, y = load_credit() + + colors = np.array(['r' if yi else 'b' for yi in y]) visualizer = PCADecomposition(scale=True, color=colors, proj_dim=3) visualizer.fit_transform(X, y) visualizer.poof() -.. image:: images/pca_projection_3d.png - Biplot ------ -The PCA projection can be enhanced to a biplot whose points are the projected instances and whose vectors represent the structure of the data in high dimensional space. By using the ``proj_features=True`` flag, vectors for each feature in the dataset are drawn on the scatter plot in the direction of the maximum variance for that feature. These structures can be used to analyze the importance of a feature to the decomposition or to find features of related variance for further analysis. - -.. code:: python +The PCA projection can be enhanced to a biplot whose points are the projected instances and whose vectors represent the structure of the data in high dimensional space. By using ``proj_features=True``, vectors for each feature in the dataset are drawn on the scatter plot in the direction of the maximum variance for that feature. These structures can be used to analyze the importance of a feature to the decomposition or to find features of related variance for further analysis. - # Load the classification data set - data = load_data('concrete') +.. 
plot:: + :context: close-figs + :alt: PCA biplot projection, 2D - # Specify the features of interest and the target - target = "strength" - features = [ - 'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age' - ] - - # Extract the instance data and the target - X = data[features] - y = data[target] + from yellowbrick.datasets import load_concrete + from yellowbrick.features.pca import PCADecomposition -.. code:: python + # Load the concrete dataset + X, y = load_concrete() visualizer = PCADecomposition(scale=True, proj_features=True) visualizer.fit_transform(X, y) visualizer.poof() -.. image:: images/pca_biplot_2d.png +.. plot:: + :context: close-figs + :alt: PCA biplot projection, 3D -.. code:: python + from yellowbrick.datasets import load_concrete + from yellowbrick.features.pca import PCADecomposition + + X, y = load_concrete() visualizer = PCADecomposition(scale=True, proj_features=True, proj_dim=3) visualizer.fit_transform(X, y) visualizer.poof() - -.. image:: images/pca_biplot_3d.png - - API Reference ------------- .. 
automodule:: yellowbrick.features.pca - :members: PCADecomposition + :members: PCA :undoc-members: :show-inheritance: diff --git a/docs/api/features/pcoords.py b/docs/api/features/pcoords.py deleted file mode 100644 index e65454dee..000000000 --- a/docs/api/features/pcoords.py +++ /dev/null @@ -1,129 +0,0 @@ -import time -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt - -from yellowbrick.features import ParallelCoordinates -from sklearn.datasets import load_iris - - -def load_occupancy_data(): - # Load the classification data set - data = pd.read_csv("../../../examples/data/occupancy/occupancy.csv") - - # Specify the features of interest and the classes of the target - features = ["temperature", "relative humidity", "light", "C02", "humidity"] - classes = ['unoccupied', 'occupied'] - - # Extract the instances and target - X = data[features] - y = data.occupancy - - return X, y, features, classes - - -def pcoords(X, y, outpath, **kwargs): - # Create a new figure and axes - _, ax = plt.subplots() - - # Create the visualizer - visualizer = ParallelCoordinates(ax=ax, **kwargs) - visualizer.fit_transform(X, y) - - # Save to disk - visualizer.poof(outpath=outpath) - - -def plot_fast_vs_slow(): - data = load_iris() - - _, axes = plt.subplots(nrows=2, figsize=(9,9)) - - for idx, fast in enumerate((False, True)): - title = "Fast Parallel Coordinates" if fast else "Standard Parallel Coordinates" - oz = ParallelCoordinates(ax=axes[idx], fast=fast, title=title) - oz.fit_transform(data.data, data.target) - oz.finalize() - - plt.tight_layout() - plt.savefig("images/fast_vs_slow_parallel_coordinates.png") - - -def plot_speedup(trials=5, factors=np.arange(1, 11)): - - def pcoords_time(X, y, fast=True): - _, ax = plt.subplots() - oz = ParallelCoordinates(fast=fast, ax=ax) - - start = time.time() - oz.fit_transform(X, y) - delta = time.time() - start - - plt.cla() # clear current axis - plt.clf() # clear current figure - plt.close("all") # close all 
existing plots - - return delta - - def pcoords_speedup(X, y): - fast_time = pcoords_time(X, y, fast=True) - slow_time = pcoords_time(X, y, fast=False) - - return slow_time / fast_time - - data = load_iris() - - speedups = [] - variance = [] - - for factor in factors: - X = np.repeat(data.data, factor, axis=0) - y = np.repeat(data.target, factor, axis=0) - - local_speedups = [] - for trial in range(trials): - local_speedups.append(pcoords_speedup(X, y)) - - local_speedups = np.array(local_speedups) - speedups.append(local_speedups.mean()) - variance.append(local_speedups.std()) - - speedups = np.array(speedups) - variance = np.array(variance) - - series = pd.Series(speedups, index=factors) - _, ax = plt.subplots(figsize=(9,6)) - series.plot(ax=ax, marker='o', label="speedup factor", color='b') - - # Plot one standard deviation above and below the mean - ax.fill_between( - factors, speedups - variance, speedups + variance, alpha=0.25, - color='b', - ) - - ax.set_ylabel("speedup factor") - ax.set_xlabel("dataset size (number of repeats in Iris dataset)") - ax.set_title("Speed Improvement of Fast Parallel Coordinates") - plt.savefig("images/fast_parallel_coordinates_speedup.png") - - -if __name__ == '__main__': - # plot_fast_vs_slow() - # plot_speedup() - - # Occupancy data visualizations - X, y, features, classes = load_occupancy_data() - - # Draw the full, original parallel coordinates - pcoords( - X, y, "images/parallel_coordinates.png", - classes=classes, features=features, - sample=0.05, shuffle=True, random_state=19, - ) - - # Draw the noramlized, sampled parallel coordinates - pcoords( - X, y, "images/normalized_sampled_parallel_coordinates.png", - classes=classes, features=features, - normalize='standard', sample=0.05, shuffle=True, random_state=19, - ) diff --git a/docs/api/features/pcoords.rst b/docs/api/features/pcoords.rst index 8cc96bc86..723942115 100644 --- a/docs/api/features/pcoords.rst +++ b/docs/api/features/pcoords.rst @@ -5,32 +5,29 @@ Parallel 
Coordinates Parallel coordinates is multi-dimensional feature visualization technique where the vertical axis is duplicated horizontally for each feature. Instances are displayed as a single line segment drawn from each vertical axes to the location representing their value for that feature. This allows many dimensions to be visualized at once; in fact given infinite horizontal space (e.g. a scrolling window), technically an infinite number of dimensions can be displayed! -Data scientists use this method to detect clusters of instances that have similar classes, and to note features that have high variance or different distributions. We can see this in action after first loading our occupancy classification dataset: +Data scientists use this method to detect clusters of instances that have similar classes, and to note features that have high variance or different distributions. We can see this in action after first loading our occupancy classification dataset. -.. code:: python +.. note:: These visualizations can be produced with either the ``ParallelCoordinates`` visualizer or by using the ``parallel_coordinates`` quick method. + +.. plot:: + :context: close-figs + :alt: Parallel Coordinates + + from yellowbrick.features import ParallelCoordinates + from yellowbrick.datasets import load_occupancy # Load the classification data set - data = load_data("occupancy") + X, y = load_occupancy() # Specify the features of interest and the classes of the target features = [ - "temperature", "relative humidity", "light", "C02", "humidity" + "temperature", "relative humidity", "light", "CO2", "humidity" ] classes = ["unoccupied", "occupied"] - # Extract the instances and target - X = data[features] - y = data.occupancy - -The visualization can be drawn with either the ``ParallelCoordinates`` visualizer or using the ``parallel_coordinates`` quick method: - -.. 
code:: python - - from yellowbrick.features import ParallelCoordinates - # Instantiate the visualizer visualizer = ParallelCoordinates( - classes=classes, features=features, sample=0.5, shuffle=True + classes=classes, features=features, sample=0.05, shuffle=True ) # Fit and transform the data to the visualizer @@ -40,17 +37,27 @@ The visualization can be drawn with either the ``ParallelCoordinates`` visualize visualizer.poof() -.. image:: images/parallel_coordinates.png - By inspecting the visualization closely, we can see that the combination of transparency and overlap gives us the sense of groups of similar instances, sometimes referred to as "braids". If there are distinct braids of different classes, it suggests that there is enough separability that a classification algorithm might be able to discern between each class. Unfortunately, as we inspect this class, we can see that the domain of each feature may make the visualization hard to interpret. In the above visualization, the domain of the ``light`` feature is from in ``[0, 1600]``, far larger than the range of temperature in ``[50, 96]``. To solve this problem, each feature should be scaled or normalized so they are approximately in the same domain. -Normalization techniques can be directly applied to the visualizer without pre-transforming the data (though you could also do this) by using the ``normalize`` parameter. Several transformers are available; try using ``minmax``, ``minabs``, ``standard``, ``l1``, or ``l2`` normalization to change perspectives in the parallel coordinates as follows: +Normalization techniques can be directly applied to the visualizer without pre-transforming the data (though you could also do this) by using the ``normalize`` parameter. Several transformers are available; try using ``minmax``, ``maxabs``, ``standard``, ``l1``, or ``l2`` normalization to change perspectives in the parallel coordinates as follows: -.. code:: python +.. 
plot:: + :context: close-figs + :alt: Parallel Coordinates with Normalization from yellowbrick.features import ParallelCoordinates + from yellowbrick.datasets import load_occupancy + + # Load the classification data set + X, y = load_occupancy() + + # Specify the features of interest and the classes of the target + features = [ + "temperature", "relative humidity", "light", "CO2", "humidity" + ] + classes = ["unoccupied", "occupied"] # Instantiate the visualizer visualizer = ParallelCoordinates( @@ -62,7 +69,6 @@ Normalization techniques can be directly applied to the visualizer without pre-t visualizer.fit_transform(X, y) visualizer.poof() -.. image:: images/normalized_sampled_parallel_coordinates.png Now we can see that each feature is in the range ``[-3, 3]`` where the mean of the feature is set to zero and each feature has a unit variance applied between ``[-1, 1]`` (because we're using the ``StandardScaler`` via the ``standard`` normalize parameter). This version of parallel coordinates gives us a much better sense of the distribution of the features and if any features are highly variable with respect to any one class. @@ -78,7 +84,26 @@ Parallel coordinates can take a long time to draw since each instance is represe The "fast" drawing mode vastly improves the performance of the parallel coordinates drawing algorithm by drawing each line segment by class rather than each instance individually. However, this improved performance comes at a cost, as the visualization produced is subtly different; compare the visualizations in fast and standard drawing modes below: -.. image:: images/fast_vs_slow_parallel_coordinates.png +.. 
plot:: + :include-source: False + :context: close-figs + :alt: Parallel Coordinates in fast drawing mode + + import matplotlib.pyplot as plt + from sklearn.datasets import load_iris + from yellowbrick.features import ParallelCoordinates + + data = load_iris() + + _, axes = plt.subplots(nrows=2, figsize=(9,9)) + + for idx, fast in enumerate((False, True)): + title = "Fast Parallel Coordinates" if fast else "Standard Parallel Coordinates" + oz = ParallelCoordinates(ax=axes[idx], fast=fast, title=title) + oz.fit_transform(data.data, data.target) + oz.finalize() + + plt.tight_layout() As you can see the "fast" drawing algorithm does not have the same build up of color density where instances of the same class intersect. Because there is only one line per class, there is only a darkening effect between classes. This can lead to a different interpretation of the plot, though it still may be effective for analytical purposes, particularly when you're plotting a lot of data. Needless to say, the performance benefits are dramatic: diff --git a/docs/api/features/pcoords_benchmark.py b/docs/api/features/pcoords_benchmark.py new file mode 100644 index 000000000..d47d7a3a5 --- /dev/null +++ b/docs/api/features/pcoords_benchmark.py @@ -0,0 +1,66 @@ +import time +import matplotlib.pyplot as plt +from sklearn.datasets import load_iris +from yellowbrick.features import ParallelCoordinates +import pandas as pd +import numpy as np + + +def plot_speedup(trials=5, factors=np.arange(1, 11)): + def pcoords_time(X, y, fast=True): + _, ax = plt.subplots() + oz = ParallelCoordinates(fast=fast, ax=ax) + + start = time.time() + oz.fit_transform(X, y) + delta = time.time() - start + + plt.cla() # clear current axis + plt.clf() # clear current figure + plt.close("all") # close all existing plots + + return delta + + def pcoords_speedup(X, y): + fast_time = pcoords_time(X, y, fast=True) + slow_time = pcoords_time(X, y, fast=False) + + return slow_time / fast_time + + data = load_iris() + + 
speedups = [] + variance = [] + + for factor in factors: + X = np.repeat(data.data, factor, axis=0) + y = np.repeat(data.target, factor, axis=0) + + local_speedups = [] + for trial in range(trials): + local_speedups.append(pcoords_speedup(X, y)) + + local_speedups = np.array(local_speedups) + speedups.append(local_speedups.mean()) + variance.append(local_speedups.std()) + + speedups = np.array(speedups) + variance = np.array(variance) + + series = pd.Series(speedups, index=factors) + _, ax = plt.subplots(figsize=(9, 6)) + series.plot(ax=ax, marker="o", label="speedup factor", color="b") + + # Plot one standard deviation above and below the mean + ax.fill_between( + factors, speedups - variance, speedups + variance, alpha=0.25, color="b" + ) + + ax.set_ylabel("speedup factor") + ax.set_xlabel("dataset size (number of repeats in Iris dataset)") + ax.set_title("Speed Improvement of Fast Parallel Coordinates") + plt.savefig("images/fast_parallel_coordinates_speedup_benchmark.png") + + +if __name__ == "__main__": + plot_speedup() diff --git a/docs/api/features/radviz.py b/docs/api/features/radviz.py deleted file mode 100644 index 45de47299..000000000 --- a/docs/api/features/radviz.py +++ /dev/null @@ -1,21 +0,0 @@ -import pandas as pd -from yellowbrick.features import RadViz - - -# Load the classification data set -data = pd.read_csv("../../../examples/data/occupancy/occupancy.csv") - -# Specify the features of interest and the classes of the target -features = ["temperature", "relative humidity", "light", "C02", "humidity"] -classes = ['unoccupied', 'occupied'] - -# Extract the instances and target -X = data[features] -y = data.occupancy - -# Instantiate the visualizer -visualizer = RadViz(classes=classes, features=features) - -visualizer.fit(X, y) -visualizer.transform(X) -visualizer.poof(outpath="images/radviz.png") diff --git a/docs/api/features/radviz.rst b/docs/api/features/radviz.rst index 425b097da..dc0529f98 100644 --- a/docs/api/features/radviz.rst +++ 
b/docs/api/features/radviz.rst @@ -3,7 +3,7 @@ RadViz Visualizer ================= -RadViz is a multivariate data visualization algorithm that plots each +``RadViz`` is a multivariate data visualization algorithm that plots each feature dimension uniformly around the circumference of a circle then plots points on the interior of the circle such that the point normalizes its values on the axes from the center to each arc. This @@ -16,43 +16,33 @@ just too much noise? If your data contains rows with missing values (``numpy.nan``), those missing values will not be plotted. In other words, you may not get the entire -picture of your data. RadViz will raise a DataWarning to inform you of the +picture of your data. ``RadViz`` will raise a ``DataWarning`` to inform you of the percent missing. If you do receive this warning, you may want to look at imputation strategies. A good starting place is the `scikit-learn Imputer. `_ -.. code:: python +.. plot:: + :context: close-figs + :alt: RadViz on the Occupancy Dataset - # Load the classification data set - data = load_data("occupancy") - - # Specify the features of interest and the classes of the target - features = ["temperature", "relative humidity", "light", "C02", "humidity"] - classes = ["unoccupied", "occupied"] - - # Extract the instances and target - X = data[features] - y = data.occupancy - -.. code:: python - - # Import the visualizer + from yellowbrick.datasets import load_occupancy from yellowbrick.features import RadViz - # Instantiate the visualizer - visualizer = RadViz(classes=classes, features=features) - - visualizer.fit(X, y) # Fit the data to the visualizer - visualizer.transform(X) # Transform the data - visualizer.poof() # Draw/show/poof the data - + # Load the classification dataset + X, y = load_occupancy() + # Specify the target classes + classes = ["unoccupied", "occupied"] -.. 
image:: images/radviz.png + # Instantiate the visualizer + visualizer = RadViz(classes=classes) + visualizer.fit(X, y) # Fit the data to the visualizer + visualizer.transform(X) # Transform the data + visualizer.poof() # Draw/show/poof the data -For regression, the RadViz visualizer should use a color sequence to +For regression, the ``RadViz`` visualizer should use a color sequence to display the target information, as opposed to discrete colors. API Reference diff --git a/docs/api/features/rankd.py b/docs/api/features/rankd.py deleted file mode 100644 index 75eca93dd..000000000 --- a/docs/api/features/rankd.py +++ /dev/null @@ -1,56 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt - -from yellowbrick.features.rankd import Rank1D, Rank2D - -def rank1d(X, y, outpath, **kwargs): - # Create a new figure and axes - _, ax = plt.subplots() - - # Create the visualizer - visualizer = Rank1D(ax=ax, **kwargs) - visualizer.fit(X, y) - visualizer.transform(X) - - # Save to disk - visualizer.poof(outpath=outpath) - - -def rank2d(X, y, outpath, **kwargs): - # Create a new figure and axes - _, ax = plt.subplots() - - # Create the visualizer - visualizer = Rank2D(ax=ax, **kwargs) - visualizer.fit(X, y) - visualizer.transform(X) - - # Save to disk - plt.tight_layout() - visualizer.poof(outpath=outpath) - - -if __name__ == '__main__': - # Load the regression data set - data = pd.read_csv("../../../examples/data/credit/credit.csv") - - # Specify the features of interest - features = [ - 'limit', 'sex', 'edu', 'married', 'age', 'apr_delay', 'may_delay', - 'jun_delay', 'jul_delay', 'aug_delay', 'sep_delay', 'apr_bill', 'may_bill', - 'jun_bill', 'jul_bill', 'aug_bill', 'sep_bill', 'apr_pay', 'may_pay', 'jun_pay', - 'jul_pay', 'aug_pay', 'sep_pay', - ] - - # Extract the instances and target - X = data[features] - y = data.default - - # Instantiate the visualizer with the Shapiro-Wilk ranking algorithm - rank1d(X, y, "images/rank1d_shapiro.png", features=features, 
algorithm='shapiro') - - # Instantiate the visualizer with the Covariance ranking algorithm - rank2d(X, y, "images/rank2d_covariance.png", features=features, algorithm='covariance') - - # Instantiate the visualizer with the Pearson ranking algorithm - rank2d(X, y, "images/rank2d_pearson.png", features=features, algorithm='pearson') diff --git a/docs/api/features/rankd.rst b/docs/api/features/rankd.rst index f51b785d2..f0d437828 100644 --- a/docs/api/features/rankd.rst +++ b/docs/api/features/rankd.rst @@ -3,81 +3,76 @@ Rank Features ============= -Rank1D and Rank2D evaluate single features or pairs of features using a variety of metrics that score the features on the scale [-1, 1] or [0, 1] allowing them to be ranked. A similar concept to SPLOMs, the scores are visualized on a lower-left triangle heatmap so that patterns between pairs of features can be easily discerned for downstream analysis. +``Rank1D`` and ``Rank2D`` evaluate single features or pairs of features using a variety of metrics that score the features on the scale [-1, 1] or [0, 1] allowing them to be ranked. A similar concept to SPLOMs, the scores are visualized on a lower-left triangle heatmap so that patterns between pairs of features can be easily discerned for downstream analysis. In this example, we'll use the credit default data set from the UCI Machine Learning repository to rank features. The code below creates our instance matrix and target vector. -.. 
code:: python - - # Load the dataset - data = load_data('credit') - - # Specify the features of interest - features = [ - 'limit', 'sex', 'edu', 'married', 'age', 'apr_delay', 'may_delay', - 'jun_delay', 'jul_delay', 'aug_delay', 'sep_delay', 'apr_bill', 'may_bill', - 'jun_bill', 'jul_bill', 'aug_bill', 'sep_bill', 'apr_pay', 'may_pay', 'jun_pay', - 'jul_pay', 'aug_pay', 'sep_pay', - ] - - # Extract the instances and target - X = data[features] - y = data.default - Rank 1D ------- -A one dimensional ranking of features utilizes a ranking algorithm that takes into account only a single feature at a time (e.g. histogram analysis). By default we utilize the Shapiro-Wilk algorithm to assess the normality of the distribution of instances with respect to the feature. A barplot is then drawn showing the relative ranks of each feature. +A one-dimensional ranking of features utilizes a ranking algorithm that takes into account only a single feature at a time (e.g. histogram analysis). By default we utilize the Shapiro-Wilk algorithm to assess the normality of the distribution of instances with respect to the feature. A barplot is then drawn showing the relative ranks of each feature. -.. code:: python +.. plot:: + :context: close-figs + :alt: Rank1D on the credit dataset with the Shapiro ranking algorithm + from yellowbrick.datasets import load_credit from yellowbrick.features import Rank1D - # Instantiate the 1D visualizer with the Sharpiro ranking algorithm - visualizer = Rank1D(features=features, algorithm='shapiro') - - visualizer.fit(X, y) # Fit the data to the visualizer - visualizer.transform(X) # Transform the data - visualizer.poof() # Draw/show/poof the data + # Load the credit dataset + X, y = load_credit() + # Instantiate the 1D visualizer with the Shapiro ranking algorithm + visualizer = Rank1D(algorithm='shapiro') -.. 
image:: images/rank1d_shapiro.png + visualizer.fit(X, y) # Fit the data to the visualizer + visualizer.transform(X) # Transform the data + visualizer.poof() # Draw/show/poof the data Rank 2D ------- -A two dimensional ranking of features utilizes a ranking algorithm that takes into account pairs of features at a time (e.g. joint plot analysis). The pairs of features are then ranked by score and visualized using the lower left triangle of a feature co-occurence matrix. +A two-dimensional ranking of features utilizes a ranking algorithm that takes into account pairs of features at a time (e.g. joint plot analysis). The pairs of features are then ranked by score and visualized using the lower left triangle of a feature co-occurrence matrix. -The default ranking algorithm is covariance, which attempts to compute the mean value of the product of deviations of variates from their respective means. Covariance loosely attempts to detect a colinear relationship between features. +By default, the ``Rank2D`` visualizer utilizes the Pearson correlation score to detect colinear relationships. -.. code:: python +.. plot:: + :context: close-figs + :alt: Rank2D on the credit dataset using Pearson ranking algorithm + from yellowbrick.datasets import load_credit from yellowbrick.features import Rank2D - # Instantiate the visualizer with the Covariance ranking algorithm - visualizer = Rank2D(features=features, algorithm='covariance') - - visualizer.fit(X, y) # Fit the data to the visualizer - visualizer.transform(X) # Transform the data - visualizer.poof() # Draw/show/poof the data + # Load the credit dataset + X, y = load_credit() + + # Instantiate the visualizer with the Pearson ranking algorithm + visualizer = Rank2D(algorithm='pearson') + visualizer.fit(X, y) # Fit the data to the visualizer + visualizer.transform(X) # Transform the data + visualizer.poof() # Draw/show/poof the data -.. 
image:: images/rank2d_covariance.png -Alternatively, we can utilize a linear correlation algorithm such as a Pearson score to similarly detect colinear relationships. Compare the output from Pearson below to the covariance ranking above. +Alternatively, we can utilize the covariance ranking algorithm, which attempts to compute the mean value of the product of deviations of variates from their respective means. Covariance loosely attempts to detect a colinear relationship between features. Compare the output from Pearson above to the covariance ranking below. -.. code:: python +.. plot:: + :context: close-figs + :alt: Rank2D on the credit dataset with the covariance algorithm - # Instantiate the visualizer with the Pearson ranking algorithm - visualizer = Rank2D(features=features, algorithm='pearson') + from yellowbrick.datasets import load_credit + from yellowbrick.features import Rank2D - visualizer.fit(X, y) # Fit the data to the visualizer - visualizer.transform(X) # Transform the data - visualizer.poof() # Draw/show/poof the data + # Load the credit dataset + X, y = load_credit() + # Instantiate the visualizer with the covariance ranking algorithm + visualizer = Rank2D(algorithm='covariance') -.. image:: images/rank2d_pearson.png + visualizer.fit(X, y) # Fit the data to the visualizer + visualizer.transform(X) # Transform the data + visualizer.poof() # Draw/show/poof the data API Reference diff --git a/docs/api/figures.rst b/docs/api/figures.rst new file mode 100644 index 000000000..f80f4c6b7 --- /dev/null +++ b/docs/api/figures.rst @@ -0,0 +1,81 @@ +.. -*- mode: rst -*- + +Figures and Axes +================ + +This document is an open letter to the PyData community, particularly those that are involved in matplotlib development. We'd like to get some advice on the API choice we've made and thoughts about our use of the matplotlib Axes objects. 
+ +One of the most complex parts of designing a visualization library around matplotlib is working with figures and axes. As defined in `The Lifecycle of a Plot `_, these central objects of matplotlib plots are as follows: + +- A Figure is the final image that may contain 1 or more Axes. +- An Axes represents an individual plot + +Based on these definitions and the advice to "try to use the object-oriented interface over the pyplot interface", the Yellowbrick interface is designed to wrap a matplotlib ``axes.Axes``. We propose the following general use case for most visualizers: + +.. code:: python + + import matplotlib.pyplot as plt + from yellowbrick import Visualizer, quick_visualizer + + fig, ax = plt.subplots() + + # Object oriented approach + viz = Visualizer(ax=ax) + viz.fit(X, y) + viz.poof() + + # Quick method approach + viz = quick_visualizer(X, y, ax=ax) + viz.poof() + +This design allows users to more directly control the size, style, and interaction with the plot (though YB does provide some helpers for these as well). For example, if a user wanted to generate a report with multiple visualizers for a classification problem, it may look something like: + +.. 
code:: python + + import matplotlib.pyplot as plt + + from yellowbrick.features import FeatureImportances + from yellowbrick.classifier import ConfusionMatrix, ClassificationReport, ROCAUC + from sklearn.linear_model import LogisticRegression + + fig, axes = plt.subplots(2, 2) + + model = LogisticRegression() + visualgrid = [ + FeatureImportances(ax=axes[0][0]), + ConfusionMatrix(model, ax=axes[0][1]), + ClassificationReport(model, ax=axes[1][0]), + ROCAUC(model, ax=axes[1][1]), + ] + + for viz in visualgrid: + viz.fit(X_train, y_train) + viz.score(X_test, y_test) + viz.finalize() + + plt.show() + +This is a common use case and we're working on the idea of "visual pipelines" to support this type of development because, for machine learning, users generally want a suite of visualizers or a report, not just a single visualization. The API requirement to support this has therefore been that visualizers use the ``ax`` object passed to them and not ``plt``. If the user does not pass a specific ``ax`` then the global current axes is used via ``plt.gca``. Generally, visualizers should behave as though they are a plot that is part of a larger figure. + +Visualizers are getting more complex, however, and some are becoming multi-axes plots in their own right. 
For example: + +- The ResidualsPlot has a scatter plot axes and a histogram axes +- The JointPlot has a scatter plot and two histogram axes +- Data driven scatter plot axes often have colorbar axes +- The PCA plot has scatter plot, color bar, and heatmap axes +- The confusion matrix probability histogram is a grid of axes for each class pair +- The ICDM has an inset axes that acts as a dynamic legend + +Although it would have been easier to simply embed the figure into the visualizer and use a ``GridSpec`` or other layout tool, the focus on ensuring visualizers are individual plots that wrap an Axes has made us bend over backward to adjust the plot inside of the axes area that was originally supplied, primarily by using ``make_axes_locatable``, which is part of the AxesGrid toolkit. + +Generally, it appears that the `AxesGrid Toolkit `_ is the right tool for Yellowbrick - many of the examples shown are similar to the things that Yellowbrick is trying to do. However, this package is not fully documented with examples and some helper utilities that would be useful, for example the ``ImageGrid``, still require a ``figure.Figure`` object. + +At this point we are left with some important questions about Yellowbrick's development roadmap: + +1. Like Seaborn, should YB have two classes of visualizer, one that wraps an axes and one that wraps a figure? +2. Should we go all in on the AxesGrid toolkit and continue to restrict our use of the figure? Will this method be supported in the long run? + + +Other notes and discussion: + +- `Create equal aspect (square) plot with multiple axes when data limits are different? `_ diff --git a/docs/api/index.rst b/docs/api/index.rst index 6ce5d79f6..708b40e4f 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -8,7 +8,7 @@ Welcome to the API documentation for Yellowbrick! This section contains a comple .. 
toctree:: :maxdepth: 2 - datasets + datasets/index anscombe features/index target/index @@ -19,8 +19,9 @@ Welcome to the API documentation for Yellowbrick! This section contains a comple text/index contrib/index palettes + figures -.. note:: Many examples utilize data from the UCI Machine Learning repository. In order to run the accompanying code, make sure to follow the instructions in :doc:`datasets` to download and load the required data. +.. note:: Many examples utilize data from the UCI Machine Learning repository. In order to run the accompanying code, make sure to follow the instructions in :doc:`datasets/index` to download and load the required data. A guide to finding the visualizer you're looking for: generally speaking, visualizers can be data visualizers which visualize instances relative to the model space; score visualizers which visualize model performance; model selection visualizers which compare multiple model forms against each other; and application specific-visualizers. This can be a bit confusing, so we've grouped visualizers according to the type of analysis they are well suited for. 
diff --git a/docs/api/model_selection/cross_validation.py b/docs/api/model_selection/cross_validation.py deleted file mode 100644 index a2f60534b..000000000 --- a/docs/api/model_selection/cross_validation.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python3 -# cross_validation.py - -""" -Generates a CVScores image -""" - -########################################################################## -## Imports -########################################################################## - -import pandas as pd -import matplotlib.pyplot as plt - -from sklearn.linear_model import Ridge -from sklearn.naive_bayes import MultinomialNB -from sklearn.model_selection import KFold, StratifiedKFold - -from yellowbrick.model_selection import CVScores - - -########################################################################## -## Helper Methods -########################################################################## - -def load_occupancy(): - # Load the classification data set - room = pd.read_csv("../../../examples/data/occupancy/occupancy.csv") - - features = ["temperature", "relative humidity", "light", "C02", "humidity"] - - # Extract the numpy arrays from the data frame - X = room[features].values - y = room.occupancy.values - - return X, y - - -def load_energy(): - # Load regression dataset - energy = pd.read_csv('../../../examples/data/energy/energy.csv') - - targets = ["heating load", "cooling load"] - features = [col for col in energy.columns if col not in targets] - - X = energy[features] - y = energy[targets[1]] - - return X, y - -def classification_cvscores(outpath="images/cv_scores_classifier.png", **kwargs): - X, y = load_occupancy() - - # Create a new figure and axes - _, ax = plt.subplots() - - cv = StratifiedKFold(12) - - oz = CVScores( - MultinomialNB(), ax=ax, cv=cv, scoring='f1_weighted' - ) - - oz.fit(X, y) - - # Save to disk - oz.poof(outpath=outpath) - - -def regression_cvscores(outpath="images/cv_scores_regressor.png", **kwargs): - X, y = 
load_energy() - - # Create a new figure and axes - _, ax = plt.subplots() - - cv = KFold(12) - - oz = CVScores( - Ridge(), ax=ax, cv=cv, scoring='r2' - ) - - oz.fit(X, y) - - # Save to disk - oz.poof(outpath=outpath) - - -if __name__ == '__main__': - classification_cvscores() - regression_cvscores() diff --git a/docs/api/model_selection/cross_validation.rst b/docs/api/model_selection/cross_validation.rst index b1479b5bf..5ab30977d 100644 --- a/docs/api/model_selection/cross_validation.rst +++ b/docs/api/model_selection/cross_validation.rst @@ -6,7 +6,7 @@ Cross Validation Scores Generally we determine whether a given model is optimal by looking at it's F1, precision, recall, and accuracy (for classification), or it's coefficient of determination (R2) and error (for regression). However, real world data is often distributed somewhat unevenly, meaning that the fitted model is likely to perform better on some sections of the data than on others. Yellowbrick's ``CVScores`` visualizer enables us to visually explore these variations in performance using different cross validation strategies. Cross Validation -################ +---------------- Cross-validation starts by shuffling the data (to prevent any unintentional ordering errors) and splitting it into `k` folds. Then `k` models are fit on :math:`\frac{k-1} {k}` of the data (called the training split) and evaluated on :math:`\frac {1} {k}` of the data (called the test split). The results from each evaluation are averaged together for a final score, then the final model is fit on the entire dataset for operationalization. @@ -17,90 +17,63 @@ In Yellowbrick, the ``CVScores`` visualizer displays cross-validated scores as a Classification -------------- -In the following example we show how to visualize cross-validated scores for a classification model. 
After loading a ``DataFrame``, we create a ``StratifiedKFold`` cross-validation strategy to ensure all of our classes in each split are represented with the same proportion. We then fit the ``CVScores`` visualizer using the ``f1_weighted`` scoring metric as opposed to the default metric, accuracy, to get a better sense of the relationship of precision and recall in our classifier across all of our folds. +In the following example, we show how to visualize cross-validated scores for a classification model. After loading our occupancy data as a ``DataFrame``, we created a ``StratifiedKFold`` cross-validation strategy to ensure all of our classes in each split are represented with the same proportion. We then fit the ``CVScores`` visualizer using the ``f1_weighted`` scoring metric as opposed to the default metric, accuracy, to get a better sense of the relationship of precision and recall in our classifier across all of our folds. -.. code:: python +.. plot:: + :context: close-figs + :alt: Cross validation on the occupancy data set using StratifiedKFold - import pandas as pd - import matplotlib.pyplot as plt - - from sklearn.naive_bayes import MultinomialNB from sklearn.model_selection import StratifiedKFold + from sklearn.naive_bayes import MultinomialNB + from yellowbrick.datasets import load_occupancy from yellowbrick.model_selection import CVScores - - # Load the classification data set - data = load_data("occupancy") - - # Specify the features of interest - features = ["temperature", "relative humidity", "light", "C02", "humidity"] - - # Extract the instances and target - X = data[features] - y = data.occupancy - - # Create a new figure and axes - _, ax = plt.subplots() + # Load the classification dataset + X, y = load_occupancy() # Create a cross-validation strategy - cv = StratifiedKFold(12) + cv = StratifiedKFold(n_splits=12, random_state=42) - # Create the cv score visualizer - oz = CVScores( - MultinomialNB(), ax=ax, cv=cv, scoring='f1_weighted' - ) - - 
oz.fit(X, y) - oz.poof() + # Instantiate the classification model and visualizer + model = MultinomialNB() + visualizer = CVScores(model, cv=cv, scoring='f1_weighted') + visualizer.fit(X, y) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data Our resulting visualization shows that while our average cross-validation score is quite high, there are some splits for which our fitted ``MultinomialNB`` classifier performs significantly less well. -.. image:: images/cv_scores_classifier.png - - Regression ---------- -In this next example we show how to visualize cross-validated scores for a regression model. After loading our energy data into a ``DataFrame``, we instantiate a simple ``KFold`` cross-validation strategy. We then fit the ``CVScores`` visualizer using the ``r2`` scoring metric, to get a sense of the coefficient of determination for our regressor across all of our folds. +In this next example we show how to visualize cross-validated scores for a regression model. After loading our energy data as a ``DataFrame``, we instantiated a simple ``KFold`` cross-validation strategy. We then fit the ``CVScores`` visualizer using the ``r2`` scoring metric, to get a sense of the coefficient of determination for our regressor across all of our folds. -.. code:: python +.. 
plot:: + :context: close-figs + :alt: Cross validation on the energy data set using KFold from sklearn.linear_model import Ridge from sklearn.model_selection import KFold + from yellowbrick.datasets import load_energy + from yellowbrick.model_selection import CVScores - # Load the regression data set - data = load_data("energy") - - # Specify the features of interest and the target - targets = ["heating load", "cooling load"] - features = [col for col in data.columns if col not in targets] - - # Extract the instances and target - X = data[features] - y = data[targets[1]] - - # Create a new figure and axes - _, ax = plt.subplots() - - cv = KFold(12) + # Load the regression dataset + X, y = load_energy() - oz = CVScores( - Ridge(), ax=ax, cv=cv, scoring='r2' - ) + # Instantiate the regression model and visualizer + cv = KFold(n_splits=12, random_state=42) - oz.fit(X, y) - oz.poof() + model = Ridge() + visualizer = CVScores(model, cv=cv, scoring='r2') + visualizer.fit(X, y) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data As with our classification ``CVScores`` visualization, our regression visualization suggests that our ``Ridge`` regressor performs very well (e.g. produces a high coefficient of determination) across nearly every fold, resulting in another fairly high overall R2 score. -.. 
image:: images/cv_scores_regressor.png - - API Reference ------------- diff --git a/docs/api/model_selection/images/cv_scores_classifier.png b/docs/api/model_selection/images/cv_scores_classifier.png deleted file mode 100644 index 7a557a14a..000000000 Binary files a/docs/api/model_selection/images/cv_scores_classifier.png and /dev/null differ diff --git a/docs/api/model_selection/images/cv_scores_regressor.png b/docs/api/model_selection/images/cv_scores_regressor.png deleted file mode 100644 index 9fd292f18..000000000 Binary files a/docs/api/model_selection/images/cv_scores_regressor.png and /dev/null differ diff --git a/docs/api/model_selection/images/learning_curve_classifier.png b/docs/api/model_selection/images/learning_curve_classifier.png deleted file mode 100644 index 43740aa5a..000000000 Binary files a/docs/api/model_selection/images/learning_curve_classifier.png and /dev/null differ diff --git a/docs/api/model_selection/images/learning_curve_clusterer.png b/docs/api/model_selection/images/learning_curve_clusterer.png deleted file mode 100644 index 6cb8c9c97..000000000 Binary files a/docs/api/model_selection/images/learning_curve_clusterer.png and /dev/null differ diff --git a/docs/api/model_selection/images/learning_curve_regressor.png b/docs/api/model_selection/images/learning_curve_regressor.png deleted file mode 100644 index d76462474..000000000 Binary files a/docs/api/model_selection/images/learning_curve_regressor.png and /dev/null differ diff --git a/docs/api/model_selection/images/rfecv_credit.png b/docs/api/model_selection/images/rfecv_credit.png new file mode 100644 index 000000000..57c77c012 Binary files /dev/null and b/docs/api/model_selection/images/rfecv_credit.png differ diff --git a/docs/api/features/images/rfecv_sklearn_example.png b/docs/api/model_selection/images/rfecv_sklearn_example.png similarity index 100% rename from docs/api/features/images/rfecv_sklearn_example.png rename to docs/api/model_selection/images/rfecv_sklearn_example.png 
diff --git a/docs/api/model_selection/images/validation_curve_classifier_alt.png b/docs/api/model_selection/images/validation_curve_classifier_knn.png similarity index 100% rename from docs/api/model_selection/images/validation_curve_classifier_alt.png rename to docs/api/model_selection/images/validation_curve_classifier_knn.png diff --git a/docs/api/model_selection/images/validation_curve_classifier.png b/docs/api/model_selection/images/validation_curve_classifier_svc.png similarity index 100% rename from docs/api/model_selection/images/validation_curve_classifier.png rename to docs/api/model_selection/images/validation_curve_classifier_svc.png diff --git a/docs/api/model_selection/images/validation_curve_regressor.png b/docs/api/model_selection/images/validation_curve_regressor.png deleted file mode 100644 index d796e662e..000000000 Binary files a/docs/api/model_selection/images/validation_curve_regressor.png and /dev/null differ diff --git a/docs/api/model_selection/images/validation_curve_sklearn_example.png b/docs/api/model_selection/images/validation_curve_sklearn_example.png deleted file mode 100644 index cbbe3905b..000000000 Binary files a/docs/api/model_selection/images/validation_curve_sklearn_example.png and /dev/null differ diff --git a/docs/api/features/importances.rst b/docs/api/model_selection/importances.rst similarity index 65% rename from docs/api/features/importances.rst rename to docs/api/model_selection/importances.rst index 4fa7a8767..a9e6666f5 100644 --- a/docs/api/features/importances.rst +++ b/docs/api/model_selection/importances.rst @@ -17,44 +17,30 @@ Random Forest, Gradient Boosting, and Ada Boost provide a ``feature_importances_`` attribute when fitted. The Yellowbrick ``FeatureImportances`` visualizer utilizes this attribute to rank and plot relative importances. Let's start with an example; first load a -classification dataset as follows: +classification dataset. -.. 
code:: python - - # Load the classification data set - data = load_data("occupancy") - - # Specify the features of interest - features = [ - "temperature", "relative humidity", "light", "C02", "humidity" - ] - - # Extract the instances and target - X = data[features] - y = data.occupancy - -Once the dataset has been loaded, we can create a new figure (this is +Then we can create a new figure (this is optional, if an ``Axes`` isn't specified, Yellowbrick will use the current figure or create one). We can then fit a ``FeatureImportances`` visualizer -with a ``GradientBoostingClassifier`` to visualize the ranked features: - -.. code:: python +with a ``RandomForestClassifier`` to visualize the ranked features. - import matplotlib.pyplot as plt +.. plot:: + :context: close-figs + :alt: Feature importances of Random Forest classifier - from sklearn.ensemble import GradientBoostingClassifier + from sklearn.ensemble import RandomForestClassifier - from yellowbrick.features.importances import FeatureImportances + from yellowbrick.datasets import load_occupancy + from yellowbrick.model_selection import FeatureImportances - # Create a new matplotlib figure - fig = plt.figure() - ax = fig.add_subplot() + # Load the classification data set + X, y = load_occupancy() - viz = FeatureImportances(GradientBoostingClassifier(), ax=ax) + model = RandomForestClassifier(n_estimators=10) + viz = FeatureImportances(model) viz.fit(X, y) viz.poof() -.. image:: images/feature_importances.png The above figure shows the features ranked according to the explained variance each feature contributes to the model. In this case the features are plotted @@ -64,22 +50,7 @@ most important feature. The visualizer also contains ``features_`` and For models that do not support a ``feature_importances_`` attribute, the ``FeatureImportances`` visualizer will also draw a bar plot for the ``coef_`` -attribute that many linear models provide. First we start by loading a -regression dataset: - -.. 
code:: python - - # Load a regression data set - data = load_data("concrete") - - # Specify the features of interest - features = [ - "cement","slag","ash","water","splast","coarse","fine","age" - ] - - # Extract the instances and target - X = data[features] - y = data.strength +attribute that many linear models provide. When using a model with a ``coef_`` attribute, it is better to set ``relative=False`` to draw the true magnitude of the coefficient (which may @@ -87,30 +58,51 @@ be negative). We can also specify our own set of labels if the dataset does not have column names or to print better titles. In the example below we title case our features for better readability: -.. code:: python +.. plot:: + :context: close-figs + :alt: Coefficient importances for LASSO regression - import matplotlib.pyplot as plt - from sklearn.linear_model import Lasso - - from yellowbrick.features.importances import FeatureImportances + from yellowbrick.datasets import load_concrete + from yellowbrick.model_selection import FeatureImportances - # Create a new figure - fig = plt.figure() - ax = fig.add_subplot() + # Load the regression dataset + dataset = load_concrete(return_dataset=True) + X, y = dataset.to_data() # Title case the feature for better display and create the visualizer - labels = list(map(lambda s: s.title(), features)) - viz = FeatureImportances(Lasso(), ax=ax, labels=labels, relative=False) + labels = list(map(lambda s: s.title(), dataset.meta['features'])) + viz = FeatureImportances(Lasso(), labels=labels, relative=False) # Fit and show the feature importances viz.fit(X, y) viz.poof() +.. NOTE:: The interpretation of the importance of coefficients depends on the model; see the discussion below for more details. -.. image:: images/feature_importances_coef.png +Stacked Feature Importances +--------------------------- + +Some estimators return a multi-dimensional array for either ``feature_importances_`` or ``coef_`` attributes. 
For example the ``LogisticRegression`` classifier returns a ``coef_`` array in the shape of ``(n_classes, n_features)`` in the multiclass case. These coefficients map the importance of the feature to the prediction of the probability of a specific class. Although the interpretation of multi-dimensional feature importances depends on the specific estimator and model family, the data is treated the same in the ``FeatureImportances`` visualizer -- namely the importances are averaged. + +Taking the mean of the importances may be undesirable for several reasons. For example, a feature may be more informative for some classes than others. Multi-output estimators also do not benefit from having averages taken across what are essentially multiple internal models. In this case, use the ``stack=True`` parameter to draw a stacked bar chart of importances as follows: + +.. plot:: + :context: close-figs + :alt: Stacked per-class importances with Logistic Regression + + from yellowbrick.model_selection import FeatureImportances + from sklearn.linear_model import LogisticRegression + from sklearn.datasets import load_iris + + data = load_iris() + X, y = data.data, data.target + + model = LogisticRegression(multi_class="auto", solver="liblinear") + viz = FeatureImportances(model, stack=True, relative=False) + viz.fit(X, y) + viz.poof() -.. NOTE:: The interpretation of the importance of coeficients depends on the model; see the discussion below for more details. Discussion ---------- @@ -138,7 +130,7 @@ This method may also be used for instances; but generally there are very many in API Reference ------------- -.. automodule:: yellowbrick.features.importances +.. 
automodule:: yellowbrick.model_selection.importances :members: FeatureImportances :undoc-members: :show-inheritance: diff --git a/docs/api/model_selection/index.rst b/docs/api/model_selection/index.rst index f0744341d..126490862 100644 --- a/docs/api/model_selection/index.rst +++ b/docs/api/model_selection/index.rst @@ -12,6 +12,8 @@ The currently implemented model selection visualizers are as follows: - :doc:`validation_curve`: visualizes how the adjustment of a hyperparameter influences training and test scores to tune the bias/variance trade-off. - :doc:`learning_curve`: shows how the size of training data influences the model to diagnose if a model suffers more from variance error vs. bias error. - :doc:`cross_validation`: displays cross-validated scores as a bar chart with average as a horizontal line. +- :doc:`importances`: rank features by relative importance in a model +- :doc:`rfecv`: select a subset of features by importance Model selection makes heavy use of cross validation to measure the performance of an estimator. Cross validation splits a dataset into a training data set and a test data set; the model is fit on the training data and evaluated on the test data. This helps avoid a common pitfall, overfitting, where the model simply memorizes the training data and does not generalize well to new or unknown input. @@ -23,3 +25,5 @@ There are many ways to define how to split a dataset for cross validation. 
For m validation_curve learning_curve cross_validation + importances + rfecv diff --git a/docs/api/model_selection/learning_curve.py b/docs/api/model_selection/learning_curve.py deleted file mode 100644 index af4673820..000000000 --- a/docs/api/model_selection/learning_curve.py +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/env python3 -# learning_curve.py - -""" -Generates the learning curve visualizations for the documentation -""" - -########################################################################## -## Imports -########################################################################## - -import os -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt - -from sklearn.svm import SVC -from sklearn.cluster import KMeans -from sklearn.datasets import make_blobs -from sklearn.datasets import load_digits -from sklearn.linear_model import RidgeCV -from sklearn.naive_bayes import GaussianNB -from sklearn.naive_bayes import MultinomialNB -from sklearn.model_selection import ShuffleSplit -from sklearn.model_selection import StratifiedKFold - -from yellowbrick.model_selection import LearningCurve - - -FIXTURES = os.path.join("..", "..", "..", "examples", "data") - - -########################################################################## -## Helper Methods -########################################################################## - -def learning_curve_sklearn_example(path="images/learning_curve_sklearn_example.png"): - digits = load_digits() - X, y = digits.data, digits.target - - _, ax = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(9,4)) - - cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0) - oz = LearningCurve(GaussianNB(), ax=ax[0], cv=cv, n_jobs=4) - oz.fit(X, y) - oz.finalize() - - cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0) - oz = LearningCurve(SVC(gamma=0.001), ax=ax[1], cv=cv, n_jobs=4) - oz.fit(X, y) - oz.poof(outpath=path) - - -def 
learning_curve_classifier(path="images/learning_curve_classifier.png"): - - data = pd.read_csv(os.path.join(FIXTURES, "game", "game.csv")) - - target = "outcome" - features = [col for col in data.columns if col != target] - - X = pd.get_dummies(data[features]) - y = data[target] - - _, ax = plt.subplots() - cv = StratifiedKFold(12) - sizes = np.linspace(0.3, 1.0, 10) - - oz = LearningCurve( - MultinomialNB(), ax=ax, cv=cv, n_jobs=4, - train_sizes=sizes, scoring='f1_weighted' - ) - - oz.fit(X, y) - oz.poof(outpath=path) - - -def learning_curve_regressor(path="images/learning_curve_regressor.png"): - - data = pd.read_csv(os.path.join(FIXTURES, "energy", "energy.csv")) - - targets = ["heating load", "cooling load"] - features = [col for col in data.columns if col not in targets] - - X = data[features] - y = data[targets[0]] - - _, ax = plt.subplots() - sizes = np.linspace(0.3, 1.0, 10) - - oz = LearningCurve(RidgeCV(), ax=ax, train_sizes=sizes, scoring='r2') - oz.fit(X, y) - oz.poof(outpath=path) - - -def learning_curve_clusterer(path="images/learning_curve_clusterer.png"): - - X, y = make_blobs(n_samples=1000, centers=5) - - _, ax = plt.subplots() - sizes = np.linspace(0.3, 1.0, 10) - - oz = LearningCurve( - KMeans(), ax=ax, train_sizes=sizes, scoring="adjusted_rand_score" - ) - oz.fit(X, y) - oz.poof(outpath=path) - -########################################################################## -## Main Method -########################################################################## - -if __name__ == '__main__': - learning_curve_sklearn_example() - learning_curve_classifier() - learning_curve_regressor() - learning_curve_clusterer() diff --git a/docs/api/model_selection/learning_curve.rst b/docs/api/model_selection/learning_curve.rst index cd6b860ea..e903e4296 100644 --- a/docs/api/model_selection/learning_curve.rst +++ b/docs/api/model_selection/learning_curve.rst @@ -3,7 +3,7 @@ Learning Curve ============== -A learning curve shows the relationship of the training 
score vs the cross validated test score for an estimator with a varying number of training samples. This visualization is typically used two show two things: +A learning curve shows the relationship of the training score versus the cross validated test score for an estimator with a varying number of training samples. This visualization is typically used to show two things: 1. How much the estimator benefits from more data (e.g. do we have "enough data" or will the estimator get better if used in an online fashion). 2. If the estimator is more sensitive to error due to variance vs. error due to bias. @@ -12,7 +12,7 @@ Consider the following learning curves (generated with Yellowbrick, but from `Pl .. image:: images/learning_curve_sklearn_example.png -If the training and cross validation scores converge together as more data is added (shown in the left figure), then the model will probably not benefit from more data. If the training score is much greater than the validation score (as shown in the right figure) then the model probably requires more training examples in order to generalize more effectively. +If the training and cross-validation scores converge together as more data is added (shown in the left figure), then the model will probably not benefit from more data. If the training score is much greater than the validation score then the model probably requires more training examples in order to generalize more effectively. The curves are plotted with the mean scores, however variability during cross-validation is shown with the shaded areas that represent a standard deviation above and below the mean for all cross-validations. If the model suffers from error due to bias, then there will likely be more variability around the training score curve. If the model suffers from error due to variance, then there will be more variability around the cross validated score. 
@@ -21,41 +21,40 @@ The curves are plotted with the mean scores, however variability during cross-va Classification -------------- -In the following example we show how to visualize the learning curve of a classification model. After loading a ``DataFrame`` and performing categorical encoding, we create a ``StratifiedKFold`` cross-validation strategy to ensure all of our classes in each split are represented with the same proportion. We then fit the visualizer using the ``f1_weighted`` scoring metric as opposed to the default metric, accuracy, to get a better sense of the relationship of precision and recall in our classifier. +In the following example, we show how to visualize the learning curve of a classification model. After loading a ``DataFrame`` and performing categorical encoding, we create a ``StratifiedKFold`` cross-validation strategy to ensure all of our classes in each split are represented with the same proportion. We then fit the visualizer using the ``f1_weighted`` scoring metric as opposed to the default metric, accuracy, to get a better sense of the relationship of precision and recall in our classifier. -.. code:: python +.. 
plot:: + :context: close-figs + :alt: Learning Curve on the Game dataset using StratifiedKFold import numpy as np - from sklearn.naive_bayes import MultinomialNB from sklearn.model_selection import StratifiedKFold - from yellowbrick.model_selection import LearningCurve + from sklearn.naive_bayes import MultinomialNB + from sklearn.preprocessing import OneHotEncoder, LabelEncoder - # Load a classification data set - data = load_data('game') + from yellowbrick.datasets import load_game + from yellowbrick.model_selection import LearningCurve - # Specify the features of interest and the target - target = "outcome" - features = [col for col in data.columns if col != target] + # Load a classification dataset + X, y = load_game() - # Encode the categorical data with one-hot encoding - X = pd.get_dummies(data[features]) - y = data[target] + # Encode the categorical data + X = OneHotEncoder().fit_transform(X) + y = LabelEncoder().fit_transform(y) # Create the learning curve visualizer - cv = StratifiedKFold(12) + cv = StratifiedKFold(n_splits=12) sizes = np.linspace(0.3, 1.0, 10) - viz = LearningCurve( - MultinomialNB(), cv=cv, train_sizes=sizes, - scoring='f1_weighted', n_jobs=4 + # Instantiate the classification model and visualizer + model = MultinomialNB() + visualizer = LearningCurve( + model, cv=cv, scoring='f1_weighted', train_sizes=sizes, n_jobs=4 ) - # Fit and poof the visualizer - viz.fit(X, y) - viz.poof() - -.. image:: images/learning_curve_classifier.png + visualizer.fit(X, y) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data This learning curve shows high test variability and a low score up to around 30,000 instances, however after this level the model begins to converge on an F1 score of around 0.6. We can see that the training and test scores have not yet converged, so potentially this model would benefit from more training data. 
Finally, this model suffers primarily from error due to variance (the CV scores for the test data are more variable than for training data) so it is possible that the model is overfitting. @@ -64,27 +63,24 @@ Regression Building a learning curve for a regression is straight forward and very similar. In the below example, after loading our data and selecting our target, we explore the learning curve score according to the coefficient of determination or R2 score. -.. code:: python +.. plot:: + :context: close-figs + :alt: Learning Curve on the Energy dataset using RidgeCV from sklearn.linear_model import RidgeCV - # Load a regression dataset - data = load_data('energy') - - # Specify features of interest and the target - targets = ["heating load", "cooling load"] - features = [col for col in data.columns if col not in targets] + from yellowbrick.datasets import load_energy + from yellowbrick.model_selection import LearningCurve - # Extract the instances and target - X = data[features] - y = data[targets[0]] + # Load a regression dataset + X, y = load_energy() - # Create the learning curve visualizer, fit and poof - viz = LearningCurve(RidgeCV(), train_sizes=sizes, scoring='r2') - viz.fit(X, y) - viz.poof() + # Instantiate the regression model and visualizer + model = RidgeCV() + visualizer = LearningCurve(model, scoring='r2') -.. image:: images/learning_curve_regressor.png + visualizer.fit(X, y) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data This learning curve shows a very high variability and much lower score until about 350 instances. It is clear that this model could benefit from more data because it is converging at a very high score. Potentially, with more data and a larger alpha for regularization, this model would become far less variable in the test data. 
@@ -93,23 +89,24 @@ Clustering Learning curves also work for clustering models and can use metrics that specify the shape or organization of clusters such as silhouette scores or density scores. If the membership is known in advance, then rand scores can be used to compare clustering performance as shown below: -.. code:: python +.. plot:: + :context: close-figs + :alt: Learning Curve on clustering models from sklearn.cluster import KMeans from sklearn.datasets import make_blobs - # Create a dataset of blobs - X, y = make_blobs(n_samples=1000, centers=5) - - viz = LearningCurve( - KMeans(), train_sizes=sizes, scoring="adjusted_rand_score" - ) + from yellowbrick.model_selection import LearningCurve - viz.fit(X, y) - viz.poof() + # Generate synthetic dataset with 5 random clusters + X, y = make_blobs(n_samples=1000, centers=5, random_state=42) + # Instantiate the clustering model and visualizer + model = KMeans() + visualizer = LearningCurve(model, scoring="adjusted_rand_score", random_state=42) -.. image:: images/learning_curve_clusterer.png + visualizer.fit(X, y) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data Unfortunately, with random data these curves are highly variable, but serve to point out some clustering-specific items. First, note the y-axis is very narrow, roughly speaking these curves are converged and actually the clustering algorithm is performing very well. Second, for clustering, convergence for data points is not necessarily a bad thing; in fact we want to ensure as more data is added, the training and cross-validation scores do not diverge. 
diff --git a/docs/api/features/rfecv.py b/docs/api/model_selection/rfecv.py similarity index 54% rename from docs/api/features/rfecv.py rename to docs/api/model_selection/rfecv.py index 2cd17b13c..05e7a5d06 100644 --- a/docs/api/features/rfecv.py +++ b/docs/api/model_selection/rfecv.py @@ -1,50 +1,57 @@ #!/usr/bin/env python3 +# rfecv.py # Generates RFECV visualizations for the documentation +# +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: rfecv.py [] $ import os -import pandas as pd import matplotlib.pyplot as plt from sklearn.svm import SVC -from yellowbrick.features import RFECV from sklearn.datasets import make_classification from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import StratifiedKFold -CWD = os.path.dirname(__file__) -DATA = os.path.join(CWD, "..", "..", "..", "examples", "data") +from yellowbrick.model_selection import RFECV +from yellowbrick.datasets import load_credit + + +CWD = os.path.dirname(__file__) IMAGES = os.path.join(CWD, "images") def rfecv_sklearn_example(image="rfecv_sklearn_example.png"): X, y = make_classification( - n_samples=1000, n_features=25, n_informative=3, n_redundant=2, - n_repeated=0, n_classes=8, n_clusters_per_class=1, random_state=0 + n_samples=1000, + n_features=25, + n_informative=3, + n_redundant=2, + n_repeated=0, + n_classes=8, + n_clusters_per_class=1, + random_state=0, ) _, ax = plt.subplots() - oz = RFECV(SVC(kernel='linear', C=1), ax=ax) + oz = RFECV(SVC(kernel="linear", C=1), ax=ax) oz.fit(X, y) oz.poof(outpath=os.path.join(IMAGES, image)) def rfecv_credit_example(image="rfecv_credit.png"): - data = pd.read_csv(os.path.join(DATA, "credit", "credit.csv")) - - target = "default" - features = [col for col in data.columns if col != target] - - X = data[features] - y = data[target] + X, y = load_credit() _, ax = plt.subplots() cv = StratifiedKFold(5) - oz = RFECV(RandomForestClassifier(), ax=ax, cv=cv, scoring='f1_weighted') + oz 
= RFECV(RandomForestClassifier(), ax=ax, cv=cv, scoring="f1_weighted") oz.fit(X, y) oz.poof(outpath=os.path.join(IMAGES, image)) -if __name__ == '__main__': +if __name__ == "__main__": rfecv_sklearn_example() rfecv_credit_example() diff --git a/docs/api/features/rfecv.rst b/docs/api/model_selection/rfecv.rst similarity index 77% rename from docs/api/features/rfecv.rst rename to docs/api/model_selection/rfecv.rst index 39b61b12a..9d4228319 100644 --- a/docs/api/features/rfecv.rst +++ b/docs/api/model_selection/rfecv.rst @@ -9,12 +9,15 @@ RFE requires a specified number of features to keep, however it is often not kno To show how this works in practice, we'll start with a contrived example using a dataset that has only 3 informative features out of 25. +.. note to contributors: the below code takes a long time to run so has not been + modified with a plot directive. See rfecv.py to regenerate images. + .. code:: python from sklearn.svm import SVC from sklearn.datasets import make_classification - from yellowbrick.features import RFECV + from yellowbrick.model_selection import RFECV # Create a dataset with only 3 informative features X, y = make_classification( @@ -22,10 +25,11 @@ To show how this works in practice, we'll start with a contrived example using a n_repeated=0, n_classes=8, n_clusters_per_class=1, random_state=0 ) - # Create RFECV visualizer with linear SVM classifier - viz = RFECV(SVC(kernel='linear', C=1)) - viz.fit(X, y) - viz.poof() + # Instantiate RFECV visualizer with a linear SVM classifier + visualizer = RFECV(SVC(kernel='linear', C=1)) + + visualizer.fit(X, y) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data .. image:: images/rfecv_sklearn_example.png @@ -33,24 +37,25 @@ This figure shows an ideal RFECV curve, the curve jumps to an excellent accuracy Exploring a real dataset, we can see the impact of RFECV on a credit default binary classifier. +.. 
note to contributors: the below code takes a long time to run so has not been + modified with a plot directive. See rfecv.py to regenerate images. + .. code:: python from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import StratifiedKFold - df = load_data('credit') - - target = 'default' - features = [col for col in data.columns if col != target] + from yellowbrick.model_selection import RFECV + from yellowbrick.datasets import load_credit - X = data[features] - y = data[target] + # Load classification dataset + X, y = load_credit() cv = StratifiedKFold(5) - oz = RFECV(RandomForestClassifier(), cv=cv, scoring='f1_weighted') + visualizer = RFECV(RandomForestClassifier(), cv=cv, scoring='f1_weighted') - oz.fit(X, y) - oz.poof() + visualizer.fit(X, y) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data .. image:: images/rfecv_credit.png @@ -61,7 +66,7 @@ In this example we can see that 19 features were selected, though there doesn't API Reference ------------- -.. automodule:: yellowbrick.features.rfecv +.. 
automodule:: yellowbrick.model_selection.rfecv :members: RFECV :undoc-members: :show-inheritance: diff --git a/docs/api/model_selection/validation_curve.py b/docs/api/model_selection/validation_curve.py index e4f26ad22..be6b99b44 100644 --- a/docs/api/model_selection/validation_curve.py +++ b/docs/api/model_selection/validation_curve.py @@ -9,27 +9,27 @@ ## Imports ########################################################################## -import os import numpy as np -import pandas as pd import matplotlib.pyplot as plt from sklearn.svm import SVC from sklearn.datasets import load_digits -from sklearn.tree import DecisionTreeRegressor +from sklearn.preprocessing import OneHotEncoder from sklearn.neighbors import KNeighborsClassifier from sklearn.model_selection import StratifiedKFold +from yellowbrick.datasets import load_game from yellowbrick.model_selection import ValidationCurve -FIXTURES = os.path.join("..", "..", "..", "examples", "data") - ########################################################################## ## Helper Methods ########################################################################## -def validation_curve_sklearn_example(path="images/validation_curve_sklearn_example.png"): + +def validation_curve_sklearn_example( + path="images/validation_curve_sklearn_example.png" +): digits = load_digits() X, y = digits.data, digits.target @@ -37,71 +37,61 @@ def validation_curve_sklearn_example(path="images/validation_curve_sklearn_examp param_range = np.logspace(-6, -1, 5) oz = ValidationCurve( - SVC(), ax=ax, param_name="gamma", param_range=param_range, - logx=True, cv=10, scoring="accuracy", n_jobs=4 + SVC(), + ax=ax, + param_name="gamma", + param_range=param_range, + logx=True, + cv=10, + scoring="accuracy", + n_jobs=4, ) oz.fit(X, y) oz.poof(outpath=path) -def validation_curve_classifier(path="images/validation_curve_classifier.png"): - data = pd.read_csv(os.path.join(FIXTURES, "game", "game.csv")) - - target = "outcome" - features = [col for col 
in data.columns if col != target] - - X = pd.get_dummies(data[features]) - y = data[target] +def validation_curve_classifier_svc(path="images/validation_curve_classifier_svc.png"): + X, y = load_game() + X = OneHotEncoder().fit_transform(X) _, ax = plt.subplots() cv = StratifiedKFold(12) param_range = np.logspace(-6, -1, 12) + print("warning: generating the SVC validation curve can take a very long time!") + oz = ValidationCurve( - SVC(), ax=ax, param_name="gamma", param_range=param_range, - logx=True, cv=cv, scoring="f1_weighted", n_jobs=8, + SVC(), + ax=ax, + param_name="gamma", + param_range=param_range, + logx=True, + cv=cv, + scoring="f1_weighted", + n_jobs=8, ) oz.fit(X, y) oz.poof(outpath=path) -def validation_curve_classifier_alt(path="images/validation_curve_classifier_alt.png"): - data = pd.read_csv(os.path.join(FIXTURES, "game", "game.csv")) - - target = "outcome" - features = [col for col in data.columns if col != target] - - X = pd.get_dummies(data[features]) - y = data[target] +def validation_curve_classifier_knn(path="images/validation_curve_classifier_knn.png"): + X, y = load_game() + X = OneHotEncoder().fit_transform(X) _, ax = plt.subplots() cv = StratifiedKFold(4) param_range = np.arange(3, 20, 2) - oz = ValidationCurve( - KNeighborsClassifier(), ax=ax, param_name="n_neighbors", - param_range=param_range, cv=cv, scoring="f1_weighted", n_jobs=8, - ) - oz.fit(X, y) - oz.poof(outpath=path) - - -def validation_curve_regressor(path="images/validation_curve_regressor.png"): - - data = pd.read_csv(os.path.join(FIXTURES, "energy", "energy.csv")) - - targets = ["heating load", "cooling load"] - features = [col for col in data.columns if col not in targets] - - X = data[features] - y = data[targets[1]] - - _, ax = plt.subplots() - param_range = np.arange(1, 11) + print("warning: generating the KNN validation curve can take a very long time!") oz = ValidationCurve( - DecisionTreeRegressor(), ax=ax, param_name="max_depth", - param_range=param_range, cv=10, 
scoring="r2", n_jobs=8, + KNeighborsClassifier(), + ax=ax, + param_name="n_neighbors", + param_range=param_range, + cv=cv, + scoring="f1_weighted", + n_jobs=8, ) oz.fit(X, y) oz.poof(outpath=path) @@ -111,8 +101,7 @@ def validation_curve_regressor(path="images/validation_curve_regressor.png"): ## Main Method ########################################################################## -if __name__ == '__main__': - validation_curve_sklearn_example() - # validation_curve_classifier() - validation_curve_classifier_alt() - validation_curve_regressor() +if __name__ == "__main__": + # validation_curve_sklearn_example() + validation_curve_classifier_svc() + validation_curve_classifier_knn() diff --git a/docs/api/model_selection/validation_curve.rst b/docs/api/model_selection/validation_curve.rst index bd9c339fb..2c152c8a3 100644 --- a/docs/api/model_selection/validation_curve.rst +++ b/docs/api/model_selection/validation_curve.rst @@ -9,23 +9,19 @@ In order to maximize the score, the hyperparameters of the model must be selecte In our first example, we'll explore using the ``ValidationCurve`` visualizer with a regression dataset and in the second, a classification dataset. Note that any estimator that implements ``fit()`` and ``predict()`` and has an appropriate scoring mechanism can be used with this visualizer. -.. code:: python +.. 
plot:: + :context: close-figs + :alt: Validation Curve for Max Depth of Decision Tree regressor import numpy as np - from sklearn.tree import DecisionTreeRegressor + from yellowbrick.datasets import load_energy from yellowbrick.model_selection import ValidationCurve - # Load a regression dataset - data = load_data('energy') - - # Specify features of interest and the target - targets = ["heating load", "cooling load"] - features = [col for col in data.columns if col not in targets] + from sklearn.tree import DecisionTreeRegressor - # Extract the instances and target - X = data[features] - y = data[targets[0]] + # Load a regression dataset + X, y = load_energy() viz = ValidationCurve( DecisionTreeRegressor(), param_name="max_depth", @@ -36,29 +32,26 @@ In our first example, we'll explore using the ``ValidationCurve`` visualizer wit viz.fit(X, y) viz.poof() -.. image:: images/validation_curve_regressor.png - After loading and wrangling the data, we initialize the ``ValidationCurve`` with a ``DecisionTreeRegressor``. Decision trees become more overfit the deeper they are because at each level of the tree the partitions are dealing with a smaller subset of data. One way to deal with this overfitting process is to limit the depth of the tree. The validation curve explores the relationship of the ``"max_depth"`` parameter to the R2 score with 10 shuffle split cross-validation. The ``param_range`` argument specifies the values of ``max_depth``, here from 1 to 10 inclusive. We can see in the resulting visualization that a depth limit of less than 5 levels severely underfits the model on this data set because the training score and testing score climb together in this parameter range, and because of the high variability of cross validation on the test scores. After a depth of 7, the training and test scores diverge, this is because deeper trees are beginning to overfit the training data, providing no generalizability to the model. 
However, because the cross validation score does not necessarily decrease, the model is not suffering from high error due to variance. In the next visualizer, we will see an example that more dramatically visualizes the bias/variance tradeoff. +.. note to contributors: the below code takes a long time to run so has not been + modified with a plot directive. See validation_curve.py to regenerate images. + .. code:: python from sklearn.svm import SVC + from sklearn.preprocessing import OneHotEncoder from sklearn.model_selection import StratifiedKFold # Load a classification data set - data = load_data('game') - - # Specify the features of interest and the target - target = "outcome" - features = [col for col in data.columns if col != target] + X, y = load_game() # Encode the categorical data with one-hot encoding - X = pd.get_dummies(data[features]) - y = data[target] + X = OneHotEncoder().fit_transform(X) # Create the validation curve visualizer cv = StratifiedKFold(12) @@ -73,16 +66,19 @@ In the next visualizer, we will see an example that more dramatically visualizes viz.poof() -.. image:: images/validation_curve_classifier.png +.. image:: images/validation_curve_classifier_svc.png After loading data and one-hot encoding it using the Pandas ``get_dummies`` function, we create a stratified k-folds cross-validation strategy. The hyperparameter of interest is the gamma of a support vector classifier, the coefficient of the RBF kernel. Gamma controls how much influence a single example has, the larger gamma is, the tighter the support vector is around single points (overfitting the model). In this visualization we see a definite inflection point around ``gamma=0.1``. At this point the training score climbs rapidly as the SVC memorizes the data, while the cross-validation score begins to decrease as the model cannot generalize to unseen data. -.. warning:: Note that running this example may take a long time. 
Even with parallelism using n_jobs=8, this can take several hours. +.. warning:: Note that running this and the next example may take a long time. Even with parallelism using n_jobs=8, it can take several hours to go through all the combinations. Reducing the parameter range and minimizing the amount of cross-validation can speed up the validation curve visualization. Validation curves can be performance intensive since they are training ``n_params * n_splits`` models and scoring them. It is critically important to ensure that the specified hyperparameter range is correct, as we will see in the next example. +.. note to contributors: the below code takes a long time to run so has not been + modified with a plot directive. See validation_curve.py to regenerate images. + .. code:: python from sklearn.neighbors import KNeighborsClassifier @@ -99,8 +95,7 @@ Validation curves can be performance intensive since they are training ``n_param oz.fit(X, y) oz.poof() - -.. image:: images/validation_curve_classifier_alt.png +.. image:: images/validation_curve_classifier_knn.png The k nearest neighbors (kNN) model is commonly used when similarity is important to the interpretation of the model. Choosing k is difficult, the higher k is the more data is included in a classification, creating more complex decision topologies, whereas the lower k is, the simpler the model is and the less it may generalize. Using a validation curve seems like an excellent strategy for choosing k, and often it is. However in the example above, all we can see is a decreasing variability in the cross-validated scores. 
diff --git a/docs/api/regressor/alphas.py b/docs/api/regressor/alphas.py deleted file mode 100644 index 40780c166..000000000 --- a/docs/api/regressor/alphas.py +++ /dev/null @@ -1,25 +0,0 @@ -import numpy as np -import pandas as pd - -from sklearn.linear_model import LassoCV - -from yellowbrick.regressor import AlphaSelection - - -if __name__ == '__main__': - # Load the regression data set - df = pd.read_csv("../../../examples/data/concrete/concrete.csv") - - feature_names = ['cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'] - target_name = 'strength' - - # Get the X and y data from the DataFrame - X = df[feature_names] - y = df[target_name] - - # Instantiate the linear model and visualizer - alphas = np.logspace(-10, 1, 400) - visualizer = AlphaSelection(LassoCV(alphas=alphas)) - - visualizer.fit(X, y) - g = visualizer.poof(outpath="images/alpha_selection.png") diff --git a/docs/api/regressor/alphas.rst b/docs/api/regressor/alphas.rst index a94d889c2..dcbfa7624 100644 --- a/docs/api/regressor/alphas.rst +++ b/docs/api/regressor/alphas.rst @@ -7,27 +7,18 @@ Regularization is designed to penalize model complexity, therefore the higher th The AlphaSelection Visualizer demonstrates how different values of alpha influence model selection during the regularization of linear models. Generally speaking, alpha increases the affect of regularization, e.g. if alpha is zero there is no regularization and the higher the alpha, the more the regularization parameter influences the final model. -.. code:: python - - # Load the regression data set - df = load_data('concrete') - - # Specify the features of interest and the target - features = [ - 'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age' - ] - target = 'strength' - - # Extract the instances and target - X = df[features] - y = df[target] - -.. code:: python +.. 
plot:: + :context: close-figs + :alt: Alpha selection on the concrete data set import numpy as np from sklearn.linear_model import LassoCV from yellowbrick.regressor import AlphaSelection + from yellowbrick.datasets import load_concrete + + # Load the regression dataset + X, y = load_concrete() # Create a list of alphas to cross-validate against alphas = np.logspace(-10, 1, 400) @@ -35,12 +26,9 @@ The AlphaSelection Visualizer demonstrates how different values of alpha influen # Instantiate the linear model and visualizer model = LassoCV(alphas=alphas) visualizer = AlphaSelection(model) - visualizer.fit(X, y) - g = visualizer.poof() - + visualizer.poof() -.. image:: images/alpha_selection.png API Reference diff --git a/docs/api/regressor/images/alpha_selection.png b/docs/api/regressor/images/alpha_selection.png deleted file mode 100644 index 4a515cbc2..000000000 Binary files a/docs/api/regressor/images/alpha_selection.png and /dev/null differ diff --git a/docs/api/regressor/images/prediction_error.png b/docs/api/regressor/images/prediction_error.png deleted file mode 100644 index a649e12b8..000000000 Binary files a/docs/api/regressor/images/prediction_error.png and /dev/null differ diff --git a/docs/api/regressor/images/residuals.png b/docs/api/regressor/images/residuals.png deleted file mode 100644 index eb6f17fb1..000000000 Binary files a/docs/api/regressor/images/residuals.png and /dev/null differ diff --git a/docs/api/regressor/images/residuals_no_hist.png b/docs/api/regressor/images/residuals_no_hist.png deleted file mode 100644 index 48cf08ba0..000000000 Binary files a/docs/api/regressor/images/residuals_no_hist.png and /dev/null differ diff --git a/docs/api/regressor/index.rst b/docs/api/regressor/index.rst index 388de01b7..35898d16e 100644 --- a/docs/api/regressor/index.rst +++ b/docs/api/regressor/index.rst @@ -36,3 +36,4 @@ the final step in a ``Pipeline`` or ``VisualPipeline``. 
residuals peplot alphas + influence \ No newline at end of file diff --git a/docs/api/regressor/influence.rst b/docs/api/regressor/influence.rst new file mode 100644 index 000000000..6aeafb5ea --- /dev/null +++ b/docs/api/regressor/influence.rst @@ -0,0 +1,36 @@ +.. -*- mode: rst -*- + +Cook's Distance +=============== + +Cook's Distance is a measure of the influence of an observation (or instance) on a linear +regression. Instances with a large influence may be outliers, and datasets that have a +large number of highly influential points might not be suitable for fitting linear +models. The ``CooksDistance`` visualizer shows a stem plot of all instances by index +and their associated distance score, along with a heuristic threshold to quickly show +what percent of the dataset may be impacting OLS regression models. + +.. plot:: + :context: close-figs + :alt: Cook's distance using concrete dataset + + from yellowbrick.regressor import CooksDistance + from yellowbrick.datasets import load_concrete + + # Load the regression dataset + X, y = load_concrete() + + # Instantiate and fit the visualizer + visualizer = CooksDistance() + visualizer.fit(X, y) + visualizer.poof() + + +API Reference +------------- + +.. 
automodule:: yellowbrick.regressor.influence + :members: CooksDistance + :undoc-members: + :show-inheritance: + diff --git a/docs/api/regressor/peplot.py b/docs/api/regressor/peplot.py deleted file mode 100644 index f341f5a88..000000000 --- a/docs/api/regressor/peplot.py +++ /dev/null @@ -1,29 +0,0 @@ -import pandas as pd - -from sklearn.linear_model import Lasso -from sklearn.model_selection import train_test_split - -from yellowbrick.regressor import PredictionError - - -if __name__ == '__main__': - # Load the regression data set - df = pd.read_csv("../../../examples/data/concrete/concrete.csv") - - feature_names = ['cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'] - target_name = 'strength' - - # Get the X and y data from the DataFrame - X = df[feature_names] - y = df[target_name] - - # Create the train and test data - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) - - # Instantiate the linear model and visualizer - lasso = Lasso() - visualizer = PredictionError(lasso) - - visualizer.fit(X_train, y_train) # Fit the training data to the visualizer - visualizer.score(X_test, y_test) # Evaluate the model on the test data - g = visualizer.poof(outpath="images/prediction_error.png") # Draw/show/poof the data diff --git a/docs/api/regressor/peplot.rst b/docs/api/regressor/peplot.rst index 0988c93e7..a42a74cb6 100644 --- a/docs/api/regressor/peplot.rst +++ b/docs/api/regressor/peplot.rst @@ -5,42 +5,28 @@ Prediction Error Plot A prediction error plot shows the actual targets from the dataset against the predicted values generated by our model. This allows us to see how much variance is in the model. Data scientists can diagnose regression models using this plot by comparing against the 45 degree line, where the prediction exactly matches the model. -.. code:: python +.. 
plot:: + :context: close-figs + :alt: Prediction Error plot on the Concrete dataset using a linear model from sklearn.model_selection import train_test_split + from sklearn.linear_model import Lasso + from yellowbrick.datasets import load_concrete + from yellowbrick.regressor import PredictionError - # Load the regression data set - data = load_data('concrete') - - # Specify the features of interest and the target - features = ['cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age'] - target = 'strength' - - # Extract the instances and target - X = data[features] - y = data[target] + # Load a regression dataset + X, y = load_concrete() # Create the train and test data - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) - -.. code:: python - - from sklearn.linear_model import Lasso - - from yellowbrick.regressor import PredictionError + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Instantiate the linear model and visualizer - lasso = Lasso() - visualizer = PredictionError(lasso) + model = Lasso() + visualizer = PredictionError(model) visualizer.fit(X_train, y_train) # Fit the training data to the visualizer visualizer.score(X_test, y_test) # Evaluate the model on the test data - g = visualizer.poof() # Draw/show/poof the data - - - -.. 
image:: images/prediction_error.png - + visualizer.poof() # Draw/show/poof the data API Reference ------------- @@ -49,3 +35,4 @@ API Reference :members: PredictionError :undoc-members: :show-inheritance: + :noindex: \ No newline at end of file diff --git a/docs/api/regressor/residuals.py b/docs/api/regressor/residuals.py deleted file mode 100644 index 2b6236b39..000000000 --- a/docs/api/regressor/residuals.py +++ /dev/null @@ -1,44 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt - -from sklearn.linear_model import Ridge -from sklearn.model_selection import train_test_split - -from yellowbrick.regressor import ResidualsPlot - - -def plot_residuals(X, y, model, outpath="images/residuals.png", **kwargs): - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) - - _, ax = plt.subplots() - - visualizer = ResidualsPlot(model, ax=ax, **kwargs) - visualizer.fit(X_train, y_train) - visualizer.score(X_test, y_test) - visualizer.poof(outpath=outpath) - - -def load_concrete(): - # Load the regression data set - df = pd.read_csv("../../../examples/data/concrete/concrete.csv") - - feature_names = [ - 'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age' - ] - target_name = 'strength' - - # Get the X and y data from the DataFrame - X = df[feature_names] - y = df[target_name] - - return X, y - - - -if __name__ == '__main__': - # Draw the default residuals graph - X, y = load_concrete() - plot_residuals(X, y, Ridge()) - - # Draw the residuals graph with no histogram - plot_residuals(X, y, Ridge(), "images/residuals_no_hist.png", hist=False) diff --git a/docs/api/regressor/residuals.rst b/docs/api/regressor/residuals.rst index 5689696d1..5adb3f045 100644 --- a/docs/api/regressor/residuals.rst +++ b/docs/api/regressor/residuals.rst @@ -3,58 +3,46 @@ Residuals Plot ============== -Residuals, in the context of regression models, are the difference between the observed value of the target variable (y) and the predicted value (ŷ), e.g. 
the error of the prediction. The residuals plot shows the difference between residuals on the vertical axis and the dependent variable on the horizontal axis, allowing you to detect regions within the target that may be susceptible to more or less error. +Residuals, in the context of regression models, are the difference between the observed value of the target variable (y) and the predicted value (ŷ), i.e. the error of the prediction. The residuals plot shows the difference between residuals on the vertical axis and the dependent variable on the horizontal axis, allowing you to detect regions within the target that may be susceptible to more or less error. -.. code:: python +.. plot:: + :context: close-figs + :alt: Residuals Plot on the Concrete dataset using a linear model from sklearn.model_selection import train_test_split + from sklearn.linear_model import Ridge + from yellowbrick.datasets import load_concrete + from yellowbrick.regressor import ResidualsPlot - # Load the data - df = load_data('concrete') - - # Identify the feature and target columns - feature_names = [ - 'cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age' - ] - target_name = 'strength' - - # Separate the instance data from the target data - X = df[feature_names] - y = df[target_name] + # Load a regression dataset + X, y = load_concrete() # Create the train and test data - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) - -.. 
code:: python - - from sklearn.linear_model import Ridge - from yellowbrick.regressor import ResidualsPlot + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Instantiate the linear model and visualizer - ridge = Ridge() - visualizer = ResidualsPlot(ridge) + model = Ridge() + visualizer = ResidualsPlot(model) - visualizer.fit(X_train, y_train) # Fit the training data to the model + visualizer.fit(X_train, y_train) # Fit the training data to the visualizer visualizer.score(X_test, y_test) # Evaluate the model on the test data visualizer.poof() # Draw/show/poof the data -.. image:: images/residuals.png - A common use of the residuals plot is to analyze the variance of the error of the regressor. If the points are randomly dispersed around the horizontal axis, a linear regression model is usually appropriate for the data; otherwise, a non-linear model is more appropriate. In the case above, we see a fairly random, uniform distribution of the residuals against the target in two dimensions. This seems to indicate that our linear model is performing well. We can also see from the histogram that our error is normally distributed around zero, which also generally indicates a well fitted model. Note that if the histogram is not desired, it can be turned off with the ``hist=False`` flag: -.. code:: python +.. plot:: + :context: close-figs + :alt: Residuals Plot on the Concrete dataset without a histogram - visualizer = ResidualsPlot(ridge, hist=False) + visualizer = ResidualsPlot(model, hist=False) visualizer.fit(X_train, y_train) visualizer.score(X_test, y_test) visualizer.poof() -.. image:: images/residuals_no_hist.png - -.. warning:: The histogram on the residuals plot requires matplotlib 2.0.2 or greater. If you are using an earlier version of matplotlib, simply set the ``hist=False`` flag so that the histogram is not drawn. +.. warning:: The histogram on the residuals plot requires matplotlib 2.0.2 or greater. 
If you are using an earlier version of matplotlib, simply set the ``hist=False`` flag so that the histogram is not drawn. API Reference ------------- diff --git a/docs/api/target/binning.py b/docs/api/target/binning.py deleted file mode 100644 index 1d799033b..000000000 --- a/docs/api/target/binning.py +++ /dev/null @@ -1,36 +0,0 @@ -# yellowbrick.target.binning -# Generates images for the balance binning reference documentation. -# -# Author: Kristen McIntyre -# Created: Tue Sept 11 12:09:40 2018 -0400 -# -# ID: binning.py [] kautumn06@gmail.com $ - -""" -Generates images for the balanced binning reference documentation. -""" - -########################################################################## -## Imports -########################################################################## - -from yellowbrick.target import BalancedBinningReference -from sklearn.datasets import load_diabetes - - -def balanced_binning_reference(path="images/balanced_binning_reference.png"): - # Load a regression data set - data = load_diabetes() - - # Extract the target variable - y = data['target'] - - # Instantiate and fit the visualizer - visualizer = BalancedBinningReference() - visualizer.fit(y) - return visualizer.poof(outpath=path) - - - -if __name__ == '__main__': - balanced_binning_reference() diff --git a/docs/api/target/binning.rst b/docs/api/target/binning.rst index a716cfffe..d87218458 100644 --- a/docs/api/target/binning.rst +++ b/docs/api/target/binning.rst @@ -10,24 +10,22 @@ Sometimes when the your target variable is continuously-valued, there simply are To help the user select the optimal number of bins, the ``BalancedBinningReference`` visualizer takes the target variable ``y`` as input and generates a histogram with vertical lines indicating the recommended value points to ensure that the data is evenly distributed into each bin. -.. code:: python +.. 
plot:: + :context: close-figs + :alt: BalancedBinningReference on concrete dataset + from yellowbrick.datasets import load_concrete from yellowbrick.target import BalancedBinningReference - # Load the a regression data set - data = load_data("concrete") - - # Extract the target of interest - y = data["strength"] + # Load the concrete dataset + X, y = load_concrete() # Instantiate the visualizer visualizer = BalancedBinningReference() - visualizer.fit(y) # Fit the data to the visualizer - visualizer.poof() # Draw/show/poof the data - + visualizer.fit(y) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data -.. image:: images/balanced_binning_reference.png .. seealso:: diff --git a/docs/api/target/class_balance.py b/docs/api/target/class_balance.py deleted file mode 100644 index 6a7fae6fc..000000000 --- a/docs/api/target/class_balance.py +++ /dev/null @@ -1,54 +0,0 @@ -# class_balance -# Generates images for the class balance documentation. -# -# Author: Benjamin Bengfort -# Created: Thu Jul 19 12:09:40 2018 -0400 -# -# ID: class_balance.py [] benjamin@bengfort.com $ - -""" -Generates images for the class balance documentation. 
-""" - -########################################################################## -## Imports -########################################################################## - -from yellowbrick.target import ClassBalance -from yellowbrick.datasets import load_occupancy, load_game - -from sklearn.model_selection import train_test_split - -def compare_class_balance(path="images/class_balance_compare.png"): - data = load_occupancy() - - features = ["temperature", "relative_humidity", "light", "C02", "humidity"] - classes = ['unoccupied', 'occupied'] - - # Extract the numpy arrays from the data frame - X = data[features] - y = data["occupancy"] - - # Create the train and test data - _, _, y_train, y_test = train_test_split(X, y, test_size=0.2) - - # Instantiate the classification model and visualizer - visualizer = ClassBalance(labels=classes) - - visualizer.fit(y_train, y_test) - return visualizer.poof(outpath=path) - - -def balance_class_balance(path="images/class_balance.png"): - data = load_game() - y = data["outcome"] - - oz = ClassBalance(labels=["draw", "loss", "win"]) - oz.fit(y) - return oz.poof(outpath=path) - - - -if __name__ == '__main__': - compare_class_balance() - balance_class_balance() diff --git a/docs/api/target/class_balance.rst b/docs/api/target/class_balance.rst index 7dd9bf3f0..1661f93d6 100644 --- a/docs/api/target/class_balance.rst +++ b/docs/api/target/class_balance.rst @@ -7,61 +7,59 @@ One of the biggest challenges for classification models is an imbalance of class There are several techniques for dealing with class imbalance such as stratified sampling, down sampling the majority class, weighting, etc. But before these actions can be taken, it is important to understand what the class balance is in the training data. The ``ClassBalance`` visualizer supports this by creating a bar chart of the *support* for each class, that is the frequency of the classes' representation in the dataset. -.. code:: python +.. 
plot:: + :context: close-figs + :alt: ClassBalance Visualizer on the game dataset from yellowbrick.datasets import load_game from yellowbrick.target import ClassBalance - # Load the classification data set - data = load_game() - - # Specify the target - y = data["outcome"] + # Load the classification dataset + X, y = load_game() + # Instantiate the visualizer visualizer = ClassBalance(labels=["draw", "loss", "win"]) - visualizer.fit(y) - visualizer.poof() -.. image:: images/class_balance.png + visualizer.fit(y) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data The resulting figure allows us to diagnose the severity of the balance issue. In this figure we can see that the ``"win"`` class dominates the other two classes. One potential solution might be to create a binary classifier: ``"win"`` vs ``"not win"`` and combining the ``"loss"`` and ``"draw"`` classes into one class. .. warning:: - The ``ClassBalance`` visualizer interface has changed in version 0.9, a classification model is no longer required to instantiate the visualizer, it can operate on data only. Additionally the signature of the fit method has changed from ``fit(X, y=None)`` to ``fit(y_train, y_test=None)``, passing in ``X`` is no longer required. + The ``ClassBalance`` visualizer interface has changed in version 0.9, a classification model is no longer required to instantiate the visualizer, it can operate on data only. Additionally, the signature of the fit method has changed from ``fit(X, y=None)`` to ``fit(y_train, y_test=None)``, passing in ``X`` is no longer required. If a class imbalance must be maintained during evaluation (e.g. the event being classified is actually as rare as the frequency implies) then *stratified sampling* should be used to create train and test splits. This ensures that the test data has roughly the same proportion of classes as the training data. 
While scikit-learn does this by default in ``train_test_split`` and other ``cv`` methods, it can be useful to compare the support of each class in both splits. The ``ClassBalance`` visualizer has a "compare" mode, where the train and test data can be passed to ``fit()``, creating a side-by-side bar chart instead of a single bar chart as follows: -.. code:: python - - from sklearn.model_selection import train_test_split - from yellowbrick.model_selection import ClassBalance +.. plot:: + :context: close-figs + :alt: ClassBalance Visualizer on the occupancy dataset - # Load the classification data set - data = load_data('occupancy') + from sklearn.model_selection import TimeSeriesSplit - # Specify the features of interest and the target - features = ["temperature", "relative_humidity", "light", "C02", "humidity"] - classes = ["unoccupied", "occupied"] + from yellowbrick.datasets import load_occupancy + from yellowbrick.target import ClassBalance - # Extract the instances and target - X = data[features] - y = data["occupancy"] + # Load the classification dataset + X, y = load_occupancy() - # Create the train and test data - _, _, y_train, y_test = train_test_split(X, y, test_size=0.2) + # Create the training and test data + tscv = TimeSeriesSplit() + for train_index, test_index in tscv.split(X): + X_train, X_test = X.iloc[train_index], X.iloc[test_index] + y_train, y_test = y.iloc[train_index], y.iloc[test_index] - # Instantiate the classification model and visualizer - visualizer = ClassBalance(labels=classes) + # Instantiate the visualizer + visualizer = ClassBalance(labels=["unoccupied", "occupied"]) - visualizer.fit(y_train, y_test) - return visualizer.poof() + visualizer.fit(y_train, y_test) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data -.. image:: images/class_balance_compare.png This visualization allows us to do a quick check to ensure that the proportion of each class is roughly similar in both splits. 
This visualization should be a first stop particularly when evaluation metrics are highly variable across different splits. +.. note:: This example uses ``TimeSeriesSplit`` to split the data into the training and test sets. For more information on this cross-validation method, please refer to the scikit-learn `documentation <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html>`_. API Reference ------------- diff --git a/docs/api/target/feature_correlation.py b/docs/api/target/feature_correlation.py deleted file mode 100644 index e451b4371..000000000 --- a/docs/api/target/feature_correlation.py +++ /dev/null @@ -1,71 +0,0 @@ -# feature_correlation -# Generates images for the feature correlation documentation. -# -# Author: Zijie (ZJ) Poh -# Created: Tue Jul 31 20:21:32 2018 -0700 -# -# -""" -Generates images for the feature correlation documentation. -""" - -########################################################################## -## Imports -########################################################################## - -import numpy as np -import pandas as pd -from sklearn import datasets - -from yellowbrick.target import FeatureCorrelation - - -########################################################################## -## Plotting Functions -########################################################################## - -def feature_correlation_pearson( - path="images/feature_correlation_pearson.png"): - data = datasets.load_diabetes() - X, y = data['data'], data['target'] - feature_names = np.array(data['feature_names']) - - visualizer = FeatureCorrelation(labels=feature_names) - visualizer.fit(X, y) - visualizer.poof(outpath=path, clear_figure=True) - - -def feature_correlation_mutual_info_classification( - path="images/feature_correlation_mutual_info_classification.png"): - data = datasets.load_wine() - X, y = data['data'], data['target'] - feature_names = np.array(data['feature_names']) - X_pd = pd.DataFrame(X, columns=feature_names) - - feature_to_plot = ['alcohol', 'ash', 'hue', 'proline', 'total_phenols'] - - 
visualizer = FeatureCorrelation(method='mutual_info-classification', - feature_names=feature_to_plot) - visualizer.fit(X_pd, y, random_state=0) - visualizer.poof(outpath=path, clear_figure=True) - - -def feature_correlation_mutual_info_regression( - path="images/feature_correlation_mutual_info_regression.png"): - data = datasets.load_diabetes() - X, y = data['data'], data['target'] - feature_names = np.array(data['feature_names']) - - discrete_features = [False for _ in range(len(feature_names))] - discrete_features[1] = True - - visualizer = FeatureCorrelation(method='mutual_info-regression', - labels=feature_names, sort=True) - visualizer.fit(X, y, discrete_features=discrete_features, random_state=0) - visualizer.poof(outpath=path, clear_figure=True) - - -if __name__ == '__main__': - feature_correlation_pearson() - feature_correlation_mutual_info_classification() - feature_correlation_mutual_info_regression() diff --git a/docs/api/target/feature_correlation.rst b/docs/api/target/feature_correlation.rst index c1c4166e5..b44836970 100644 --- a/docs/api/target/feature_correlation.rst +++ b/docs/api/target/feature_correlation.rst @@ -11,21 +11,25 @@ Pearson Correlation The default calculation is Pearson correlation, which is perform with ``scipy.stats.pearsonr``. -.. code:: python +.. plot:: + :context: close-figs + :alt: FeatureCorrelation on the diabetes dataset using Pearson correlation from sklearn import datasets from yellowbrick.target import FeatureCorrelation - # Load the regression data set + # Load the regression dataset data = datasets.load_diabetes() X, y = data['data'], data['target'] - feature_names = np.array(data['feature_names']) - visualizer = FeatureCorrelation(labels=feature_names) - visualizer.fit(X, y) - visualizer.poof() + # Create a list of the feature names + features = np.array(data['feature_names']) + + # Instantiate the visualizer + visualizer = FeatureCorrelation(labels=features) -.. 
image:: images/feature_correlation_pearson.png + visualizer.fit(X, y) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data Mutual Information - Regression ------------------------------- @@ -34,25 +38,30 @@ Mutual information between features and the dependent variable is calculated wit It is very important to specify discrete features when calculating mutual information because the calculation for continuous and discrete variables are different. See `scikit-learn documentation `_ for more details. -.. code:: python +.. plot:: + :context: close-figs + :alt: FeatureCorrelation on the diabetes dataset using mutual_info-regression from sklearn import datasets from yellowbrick.target import FeatureCorrelation - # Load the regression data set + # Load the regression dataset data = datasets.load_diabetes() X, y = data['data'], data['target'] - feature_names = np.array(data['feature_names']) - discrete_features = [False for _ in range(len(feature_names))] - discrete_features[1] = True + # Create a list of the feature names + features = np.array(data['feature_names']) + + # Create a list of the discrete features + discrete = [False for _ in range(len(features))] + discrete[1] = True - visualizer = FeatureCorrelation(method='mutual_info-regression', - labels=feature_names) - visualizer.fit(X, y, discrete_features=discrete_features, random_state=0) + # Instantiate the visualizer + visualizer = FeatureCorrelation(method='mutual_info-regression', labels=features) + + visualizer.fit(X, y, discrete_features=discrete, random_state=0) visualizer.poof() -.. image:: images/feature_correlation_mutual_info_regression.png Mutual Information - Classification ----------------------------------- @@ -60,25 +69,31 @@ Mutual Information - Classification By fitting with a pandas DataFrame, the feature labels are automatically obtained from the column names. 
This visualizer also allows sorting of the bar plot according to the calculated mutual information (or Pearson correlation coefficients) and selecting features to plot by specifying the names of the features or the feature index. -.. code:: python +.. plot:: + :context: close-figs + :alt: FeatureCorrelation on the wine dataset using mutual_info-classification + + import pandas as pd from sklearn import datasets from yellowbrick.target import FeatureCorrelation - - # Load the regression data set - data = datasets.load_diabetes() + + # Load the classification dataset + data = datasets.load_wine() X, y = data['data'], data['target'] - feature_names = np.array(data['feature_names']) - X_pd = pd.DataFrame(X, columns=feature_names) + X_pd = pd.DataFrame(X, columns=data['feature_names']) - feature_to_plot = ['alcohol', 'ash', 'hue', 'proline', 'total_phenols'] + # Create a list of the features to plot + features = ['alcohol', 'ash', 'hue', 'proline', 'total_phenols'] - visualizer = FeatureCorrelation(method='mutual_info-classification', - feature_names=feature_to_plot, sort=True) - visualizer.fit(X_pd, y, random_state=0) - visualizer.poof() + # Instantiate the visualizer + visualizer = FeatureCorrelation( + method='mutual_info-classification', feature_names=features, sort=True + ) + + visualizer.fit(X_pd, y) # Fit the data to the visualizer + visualizer.poof() # Draw/show/poof the data -.. 
image:: images/feature_correlation_mutual_info_classification.png API Reference ------------- diff --git a/docs/api/target/images/balanced_binning_reference.png b/docs/api/target/images/balanced_binning_reference.png deleted file mode 100644 index f287419a2..000000000 Binary files a/docs/api/target/images/balanced_binning_reference.png and /dev/null differ diff --git a/docs/api/target/images/class_balance.png b/docs/api/target/images/class_balance.png deleted file mode 100644 index 1393e7497..000000000 Binary files a/docs/api/target/images/class_balance.png and /dev/null differ diff --git a/docs/api/target/images/class_balance_compare.png b/docs/api/target/images/class_balance_compare.png deleted file mode 100644 index 59fdd2210..000000000 Binary files a/docs/api/target/images/class_balance_compare.png and /dev/null differ diff --git a/docs/api/target/images/feature_correlation_mutual_info_classification.png b/docs/api/target/images/feature_correlation_mutual_info_classification.png deleted file mode 100644 index 5bbb2cf0a..000000000 Binary files a/docs/api/target/images/feature_correlation_mutual_info_classification.png and /dev/null differ diff --git a/docs/api/target/images/feature_correlation_mutual_info_regression.png b/docs/api/target/images/feature_correlation_mutual_info_regression.png deleted file mode 100644 index 3d00a22b9..000000000 Binary files a/docs/api/target/images/feature_correlation_mutual_info_regression.png and /dev/null differ diff --git a/docs/api/target/images/feature_correlation_pearson.png b/docs/api/target/images/feature_correlation_pearson.png deleted file mode 100644 index 40dc20e89..000000000 Binary files a/docs/api/target/images/feature_correlation_pearson.png and /dev/null differ diff --git a/docs/api/text/corpus.py b/docs/api/text/corpus.py deleted file mode 100644 index 2c8942f99..000000000 --- a/docs/api/text/corpus.py +++ /dev/null @@ -1,44 +0,0 @@ -# Corpus loader for generating example images. 
- -import os -from sklearn.datasets.base import Bunch - -def load_corpus(path): - """ - Loads and wrangles the passed in text corpus by path. - """ - - # Check if the data exists, otherwise download or raise - if not os.path.exists(path): - raise ValueError(( - "'{}' dataset has not been downloaded, " - "use the yellowbrick.download module to fetch datasets" - ).format(path)) - - # Read the directories in the directory as the categories. - categories = [ - cat for cat in os.listdir(path) - if os.path.isdir(os.path.join(path, cat)) - ] - - files = [] # holds the file names relative to the root - data = [] # holds the text read from the file - target = [] # holds the string of the category - - # Load the data from the files in the corpus - for cat in categories: - for name in os.listdir(os.path.join(path, cat)): - files.append(os.path.join(path, cat, name)) - target.append(cat) - - with open(os.path.join(path, cat, name), 'r') as f: - data.append(f.read()) - - - # Return the data bunch for use similar to the newsgroups example - return Bunch( - categories=categories, - files=files, - data=data, - target=target, - ) diff --git a/docs/api/text/corpus.rst b/docs/api/text/corpus.rst deleted file mode 100644 index 2c0c91f87..000000000 --- a/docs/api/text/corpus.rst +++ /dev/null @@ -1,95 +0,0 @@ -.. -*- mode: rst -*- - -Loading a Text Corpus -===================== - -As in the previous sections, Yellowbrick has provided a sample dataset to run the following cells. In particular, we are going to use a text corpus wrangled from the `Baleen RSS Corpus `_ to present the following examples. If you haven't already downloaded the data, you can do so by running: - -:: - - $ python -m yellowbrick.download - -Note that this will create a directory called ``data`` in your current working directory that contains subdirectories with the provided datasets. - -.. 
note:: If you've already followed the instructions from :doc:`downloading example datasets <../datasets>`, you don't have to repeat these steps here. Simply check to ensure there is a directory called ``hobbies`` in your data directory. - -The following code snippet creates a utility that will load the corpus from disk into a scikit-learn Bunch object. This method creates a corpus that is exactly the same as the one found in the `"working with text data" `_ example on the scikit-learn website, hopefully making the examples easier to use. - -.. code:: python - - import os - from sklearn.datasets.base import Bunch - - def load_corpus(path): - """ - Loads and wrangles the passed in text corpus by path. - """ - - # Check if the data exists, otherwise download or raise - if not os.path.exists(path): - raise ValueError(( - "'{}' dataset has not been downloaded, " - "use the yellowbrick.download module to fetch datasets" - ).format(path)) - - # Read the directories in the directory as the categories. - categories = [ - cat for cat in os.listdir(path) - if os.path.isdir(os.path.join(path, cat)) - ] - - files = [] # holds the file names relative to the root - data = [] # holds the text read from the file - target = [] # holds the string of the category - - # Load the data from the files in the corpus - for cat in categories: - for name in os.listdir(os.path.join(path, cat)): - files.append(os.path.join(path, cat, name)) - target.append(cat) - - with open(os.path.join(path, cat, name), 'r') as f: - data.append(f.read()) - - - # Return the data bunch for use similar to the newsgroups example - return Bunch( - categories=categories, - files=files, - data=data, - target=target, - ) - -This is a fairly long bit of code, so let's walk through it step by step. 
The data in the corpus directory is stored as follows: - -:: - - data/hobbies - ├── README.md - └── books - | ├── 56d62a53c1808113ffb87f1f.txt - | └── 5745a9c7c180810be6efd70b.txt - └── cinema - | ├── 56d629b5c1808113ffb87d8f.txt - | └── 57408e5fc180810be6e574c8.txt - └── cooking - | ├── 56d62b25c1808113ffb8813b.txt - | └── 573f0728c180810be6e2575c.txt - └── gaming - | ├── 56d62654c1808113ffb87938.txt - | └── 574585d7c180810be6ef7ffc.txt - └── sports - ├── 56d62adec1808113ffb88054.txt - └── 56d70f17c180810560aec345.txt - -Each of the documents in the corpus is stored in a text file labeled with its hash signature in a directory that specifies its label or category. Therefore the first step after checking to make sure the specified path exists is to list all the directories in the `hobbies` directory---this gives us each of our categories, which we will store later in the bunch. - -The second step is to create placeholders for holding filenames, text data, and labels. We can then loop through the list of categories, list the files in each category directory, add those files to the files list, add the category name to the target list, then open and read the file to add it to data. - -To load the corpus into memory, we can simply use the following snippet: - -.. code:: python - - corpus = load_corpus("data/hobbies") - -We'll use this snippet in all of the text examples in this section! diff --git a/docs/api/text/dispersion.py b/docs/api/text/dispersion.py deleted file mode 100644 index 66dfab43a..000000000 --- a/docs/api/text/dispersion.py +++ /dev/null @@ -1,48 +0,0 @@ -# ID: dispersion.py [] lwgray@gmail.com $ - -""" -Generate figures for Dispersion Plot documentation. 
-""" - -########################################################################## -## Imports -########################################################################## - -import matplotlib.pyplot as plt - -from corpus import load_corpus -from yellowbrick.text.dispersion import DispersionPlot - -########################################################################## -## Generate -########################################################################## - -def dispersion(target_words, text, outpath, **kwargs): - # Create a new figure and axes - _, ax = plt.subplots() - - # Visualize the Dispersion of target words - visualizer = DispersionPlot(target_words, ax=ax, **kwargs) - visualizer.fit(text) - visualizer.poof(outpath=outpath) - - -########################################################################## -## Main Method -########################################################################## - -if __name__ == '__main__': - - # Load the corpus - corpus = load_corpus("../../../examples/data/hobbies") - - # Convert corpus into a list of all words from beginning to end - # text = [word for doc in corpus.data for word in doc.split()] - text = [doc.split() for doc in corpus.data] - # Select target words to visualize - target_words = ['Game', 'player', 'score', 'oil', 'Man'] - - # Display dispersion of target words throughout corpus - dispersion(target_words, text, "images/dispersion_docs.png") - - diff --git a/docs/api/text/dispersion.rst b/docs/api/text/dispersion.rst index 800440492..4c6ae7167 100644 --- a/docs/api/text/dispersion.rst +++ b/docs/api/text/dispersion.rst @@ -5,16 +5,15 @@ Dispersion Plot A word's importance can be weighed by its dispersion in a corpus. Lexical dispersion is a measure of a word's homogeneity across the parts of a corpus. This plot notes the occurrences of a word and how many words from the beginning of the corpus it appears. -.. code:: python +.. 
plot:: + :context: close-figs + :alt: Dispersion Plot from yellowbrick.text import DispersionPlot - -After importing the visualizer, we can :doc:`load the corpus ` - -.. code:: python + from yellowbrick.datasets import load_hobbies # Load the text data - corpus = load_corpus("hobbies") + corpus = load_hobbies() # Create a list of words from the corpus text text = [doc.split() for doc in corpus.data] @@ -28,8 +27,6 @@ After importing the visualizer, we can :doc:`load the corpus ` visualizer.poof() -.. image:: images/dispersion_docs.png - API Reference ------------- diff --git a/docs/api/text/freqdist.py b/docs/api/text/freqdist.py deleted file mode 100644 index 3ba42c61d..000000000 --- a/docs/api/text/freqdist.py +++ /dev/null @@ -1,45 +0,0 @@ -import matplotlib.pyplot as plt - -from corpus import load_corpus -from collections import defaultdict -from yellowbrick.text.freqdist import FreqDistVisualizer -from sklearn.feature_extraction.text import CountVectorizer - - -def freqdist(docs, outpath, corpus_kwargs={}, **kwargs): - # Create a new figure and axes - fig = plt.figure() - ax = fig.add_subplot(111) - - # Vectorize the corpus - vectorizer = CountVectorizer(**corpus_kwargs) - docs = vectorizer.fit_transform(docs) - features = vectorizer.get_feature_names() - - # Visualize the frequency distribution - visualizer = FreqDistVisualizer(ax=ax, features=features, **kwargs) - visualizer.fit(docs) - visualizer.poof(outpath=outpath) - - -if __name__ == '__main__': - - # Load the corpus - corpus = load_corpus("../../../examples/data/hobbies") - - # Whole corpus visualization - freqdist(corpus.data, "images/freqdist_corpus.png", orient='v') - - # Stopwords removed - freqdist(corpus.data, "images/freqdist_stopwords.png", {'stop_words': 'english'}, orient='v') - - # Specific categories - hobbies = defaultdict(list) - for text, label in zip(corpus.data, corpus.target): - hobbies[label].append(text) - - # Cooking Category - freqdist(hobbies["cooking"], 
"images/freqdist_cooking.png", {'stop_words': 'english'}, orient='v') - - # Gaming Category - freqdist(hobbies["gaming"], "images/freqdist_gaming.png", {'stop_words': 'english'}, orient='v') diff --git a/docs/api/text/freqdist.rst b/docs/api/text/freqdist.rst index 279e28973..39bccc586 100644 --- a/docs/api/text/freqdist.rst +++ b/docs/api/text/freqdist.rst @@ -5,74 +5,121 @@ Token Frequency Distribution A method for visualizing the frequency of tokens within and across corpora is frequency distribution. A frequency distribution tells us the frequency of each vocabulary item in the text. In general, it could count any kind of observable event. It is a distribution because it tells us how the total number of word tokens in the text are distributed across the vocabulary items. -.. code:: python +.. NOTE:: The ``FreqDistVisualizer`` does not perform any normalization or vectorization, and it expects text that has already been count vectorized. - from yellowbrick.text import FreqDistVisualizer - from sklearn.feature_extraction.text import CountVectorizer +We first instantiate a ``FreqDistVisualizer`` object, and then call ``fit()`` on that object with the count vectorized documents and the features (i.e. the words from the corpus), which computes the frequency distribution. The visualizer then plots a bar chart of the top 50 most frequent terms in the corpus, with the terms listed along the x-axis and frequency counts depicted at y-axis values. As with other Yellowbrick visualizers, when the user invokes ``poof()``, the finalized visualization is shown. Note that in this plot and in the subsequent one, we can orient our plot vertically by passing in ``orient='v'`` on instantiation (the plot will orient horizontally by default): -Note that the ``FreqDistVisualizer`` does not perform any normalization or vectorization, and it expects text that has already be count vectorized. +.. 
plot:: + :context: close-figs + :alt: Frequency Distribution Plot -We first instantiate a ``FreqDistVisualizer`` object, and then call ``fit()`` on that object with the count vectorized documents and the features (i.e. the words from the corpus), which computes the frequency distribution. The visualizer then plots a bar chart of the top 50 most frequent terms in the corpus, with the terms listed along the x-axis and frequency counts depicted at y-axis values. As with other Yellowbrick visualizers, when the user invokes ``poof()``, the finalized visualization is shown. + from sklearn.feature_extraction.text import CountVectorizer -.. code:: python + from yellowbrick.text import FreqDistVisualizer + from yellowbrick.datasets import load_hobbies + + # Load the text data + corpus = load_hobbies() vectorizer = CountVectorizer() docs = vectorizer.fit_transform(corpus.data) features = vectorizer.get_feature_names() - visualizer = FreqDistVisualizer(features=features) + visualizer = FreqDistVisualizer(features=features, orient='v') visualizer.fit(docs) visualizer.poof() -.. image:: images/freqdist_corpus.png It is interesting to compare the results of the ``FreqDistVisualizer`` before and after stopwords have been removed from the corpus: -.. code:: python + +.. plot:: + :context: close-figs + :include-source: False + :alt: Frequency Distribution Plot without Stopwords + + from sklearn.feature_extraction.text import CountVectorizer + + from yellowbrick.text import FreqDistVisualizer + from yellowbrick.datasets import load_hobbies + + # Load the text data + corpus = load_hobbies() vectorizer = CountVectorizer(stop_words='english') docs = vectorizer.fit_transform(corpus.data) features = vectorizer.get_feature_names() - visualizer = FreqDistVisualizer(features=features) + visualizer = FreqDistVisualizer(features=features, orient='v') visualizer.fit(docs) visualizer.poof() -.. 
image:: images/freqdist_stopwords.png +It is also interesting to explore the differences in tokens across a corpus. The hobbies corpus that comes with Yellowbrick has already been categorized (try ``corpus.target``), so let's visually compare the differences in the frequency distributions for two of the categories: *"cooking"* and *"gaming"*. -It is also interesting to explore the differences in tokens across a corpus. The hobbies corpus that comes with Yellowbrick has already been categorized (try ``corpus['categories']``), so let's visually compare the differences in the frequency distributions for two of the categories: *"cooking"* and *"gaming"*. +Here is the plot for the cooking corpus (oriented horizontally this time): -.. code:: python +.. plot:: + :context: close-figs + :include-source: False + :alt: Frequency Distribution Plot for Cooking Corpus from collections import defaultdict + from sklearn.feature_extraction.text import CountVectorizer + + from yellowbrick.text import FreqDistVisualizer + from yellowbrick.datasets import load_hobbies + + # Load the text data + corpus = load_hobbies() + + # Create a dict to map target labels to documents of that category hobbies = defaultdict(list) - for text, label in zip(corpus.data, corpus.label): + for text, label in zip(corpus.data, corpus.target): hobbies[label].append(text) -.. code:: python - vectorizer = CountVectorizer(stop_words='english') docs = vectorizer.fit_transform(text for text in hobbies['cooking']) features = vectorizer.get_feature_names() - visualizer = FreqDistVisualizer(features=features) + visualizer = FreqDistVisualizer( + features=features, size=(1080, 720) + ) visualizer.fit(docs) visualizer.poof() -.. image:: images/freqdist_cooking.png +And for the gaming corpus (again oriented horizontally): + +.. plot:: + :context: close-figs + :include-source: False + :alt: Frequency Distribution Plot for Gaming Corpus -.. 
code:: python + from collections import defaultdict + + from sklearn.feature_extraction.text import CountVectorizer + + from yellowbrick.text import FreqDistVisualizer + from yellowbrick.datasets import load_hobbies + + # Load the text data + corpus = load_hobbies() + + # Create a dict to map target labels to documents of that category + hobbies = defaultdict(list) + for text, label in zip(corpus.data, corpus.target): + hobbies[label].append(text) vectorizer = CountVectorizer(stop_words='english') docs = vectorizer.fit_transform(text for text in hobbies['gaming']) features = vectorizer.get_feature_names() - visualizer = FreqDistVisualizer(features=features) + visualizer = FreqDistVisualizer( + features=features, size=(1080, 720) + ) visualizer.fit(docs) visualizer.poof() -.. image:: images/freqdist_gaming.png API Reference ------------- diff --git a/docs/api/text/images/dispersion_docs.png b/docs/api/text/images/dispersion_docs.png deleted file mode 100644 index b2f623209..000000000 Binary files a/docs/api/text/images/dispersion_docs.png and /dev/null differ diff --git a/docs/api/text/images/freqdist_cooking.png b/docs/api/text/images/freqdist_cooking.png deleted file mode 100644 index d734b5264..000000000 Binary files a/docs/api/text/images/freqdist_cooking.png and /dev/null differ diff --git a/docs/api/text/images/freqdist_corpus.png b/docs/api/text/images/freqdist_corpus.png deleted file mode 100644 index 2c8f75a72..000000000 Binary files a/docs/api/text/images/freqdist_corpus.png and /dev/null differ diff --git a/docs/api/text/images/freqdist_gaming.png b/docs/api/text/images/freqdist_gaming.png deleted file mode 100644 index c06bff85f..000000000 Binary files a/docs/api/text/images/freqdist_gaming.png and /dev/null differ diff --git a/docs/api/text/images/freqdist_stopwords.png b/docs/api/text/images/freqdist_stopwords.png deleted file mode 100644 index 91699c746..000000000 Binary files a/docs/api/text/images/freqdist_stopwords.png and /dev/null differ diff 
--git a/docs/api/text/images/tsne_all_docs.png b/docs/api/text/images/tsne_all_docs.png deleted file mode 100644 index c057210e6..000000000 Binary files a/docs/api/text/images/tsne_all_docs.png and /dev/null differ diff --git a/docs/api/text/images/tsne_kmeans.png b/docs/api/text/images/tsne_kmeans.png deleted file mode 100644 index a571f80d0..000000000 Binary files a/docs/api/text/images/tsne_kmeans.png and /dev/null differ diff --git a/docs/api/text/images/tsne_no_labels.png b/docs/api/text/images/tsne_no_labels.png deleted file mode 100644 index ccfc0d38a..000000000 Binary files a/docs/api/text/images/tsne_no_labels.png and /dev/null differ diff --git a/docs/api/text/images/umap.png b/docs/api/text/images/umap.png new file mode 100644 index 000000000..10eaaf7b1 Binary files /dev/null and b/docs/api/text/images/umap.png differ diff --git a/docs/api/text/images/umap_cosine.png b/docs/api/text/images/umap_cosine.png new file mode 100644 index 000000000..50b115f66 Binary files /dev/null and b/docs/api/text/images/umap_cosine.png differ diff --git a/docs/api/text/images/umap_kmeans.png b/docs/api/text/images/umap_kmeans.png new file mode 100644 index 000000000..f360fe763 Binary files /dev/null and b/docs/api/text/images/umap_kmeans.png differ diff --git a/docs/api/text/images/umap_no_labels.png b/docs/api/text/images/umap_no_labels.png new file mode 100644 index 000000000..4d50694b5 Binary files /dev/null and b/docs/api/text/images/umap_no_labels.png differ diff --git a/docs/api/text/index.rst b/docs/api/text/index.rst index 33a1d2281..5166c6aed 100644 --- a/docs/api/text/index.rst +++ b/docs/api/text/index.rst @@ -5,19 +5,23 @@ Text Modeling Visualizers Yellowbrick provides the ``yellowbrick.text`` module for text-specific visualizers. 
The ``TextVisualizer`` class specifically deals with datasets that are corpora and not simple numeric arrays or DataFrames, providing utilities for analyzing word dispersion and distribution, showing document similarity, or simply wrapping some of the other standard visualizers with text-specific display properties. -We currently have three text-specific visualizations implemented: +We currently have five text-specific visualizations implemented: - :doc:`freqdist`: plot the frequency of tokens in a corpus - :doc:`tsne`: plot similar documents closer together to discover clusters +- :doc:`umap_vis`: plot similar documents closer together to discover clusters - :doc:`dispersion`: plot the dispersion of target words throughout a corpus +- :doc:`postag`: plot the counts of different parts-of-speech throughout a tagged corpus -Note that the examples in this section require a corpus of text data, see :doc:`loading a text corpus ` for more information. +Note that the examples in this section require a corpus of text data, see :doc:`the hobbies corpus <../datasets/hobbies>` for a sample dataset. .. code:: python from yellowbrick.text import FreqDistVisualizer from yellowbrick.text import TSNEVisualizer + from yellowbrick.text import UMAPVisualizer from yellowbrick.text import DispersionPlot + from yellowbrick.text import PosTagVisualizer from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import CountVectorizer @@ -25,7 +29,8 @@ Note that the examples in this section require a corpus of text data, see :doc:` .. toctree:: :maxdepth: 2 - corpus freqdist tsne + umap_vis dispersion + postag \ No newline at end of file diff --git a/docs/api/text/postag.rst b/docs/api/text/postag.rst new file mode 100644 index 000000000..ace5548dd --- /dev/null +++ b/docs/api/text/postag.rst @@ -0,0 +1,233 @@ +.. -*- mode: rst -*- + +PosTag Visualization +==================== + +Parts of speech (e.g. 
verbs, nouns, prepositions, adjectives) indicate how a word is functioning within the context of a sentence. In English as in many other languages, a single word can function in multiple ways. Part-of-speech tagging lets us encode information not only about a word’s definition, but also its use in context (for example the words “ship” and “shop” can be either a verb or a noun, depending on the context). + +The ``PosTagVisualizer`` is intended to support grammar-based feature extraction techniques for machine learning workflows that require natural language processing. The visualizer reads in a corpus that has already been sentence- and word-segmented, and tagged, creating a bar chart to visualize the relative proportions of different parts-of-speech in a corpus. + +.. note:: + The ``PosTagVisualizer`` currently works with both Penn-Treebank (e.g. via NLTK) and Universal Dependencies (e.g. via SpaCy)-tagged corpora, but expects corpora that have already been tagged, and which take the form of a list of (document) lists of (sentence) lists of ``(token, tag)`` tuples, as in the example below. + +Penn Treebank Tags +------------------ + +.. 
plot:: + :context: close-figs + :alt: PosTag plot with Penn Treebank tags + + from yellowbrick.text import PosTagVisualizer + + + tagged_stanzas = [ + [ + [ + ('Whose', 'JJ'),('woods', 'NNS'),('these', 'DT'), + ('are', 'VBP'),('I', 'PRP'),('think', 'VBP'),('I', 'PRP'), + ('know', 'VBP'),('.', '.') + ], + [ + ('His', 'PRP$'),('house', 'NN'),('is', 'VBZ'),('in', 'IN'), + ('the', 'DT'),('village', 'NN'),('though', 'IN'),(';', ':'), + ('He', 'PRP'),('will', 'MD'),('not', 'RB'),('see', 'VB'), + ('me', 'PRP'),('stopping', 'VBG'), ('here', 'RB'),('To', 'TO'), + ('watch', 'VB'),('his', 'PRP$'),('woods', 'NNS'),('fill', 'VB'), + ('up', 'RP'),('with', 'IN'),('snow', 'NNS'),('.', '.') + ] + ], + [ + [ + ('My', 'PRP$'),('little', 'JJ'),('horse', 'NN'),('must', 'MD'), + ('think', 'VB'),('it', 'PRP'),('queer', 'JJR'),('To', 'TO'), + ('stop', 'VB'),('without', 'IN'),('a', 'DT'),('farmhouse', 'NN'), + ('near', 'IN'),('Between', 'NNP'),('the', 'DT'),('woods', 'NNS'), + ('and', 'CC'),('frozen', 'JJ'),('lake', 'VB'),('The', 'DT'), + ('darkest', 'JJS'),('evening', 'NN'),('of', 'IN'),('the', 'DT'), + ('year', 'NN'),('.', '.') + ] + ], + [ + [ + ('He', 'PRP'),('gives', 'VBZ'),('his', 'PRP$'),('harness', 'NN'), + ('bells', 'VBZ'),('a', 'DT'),('shake', 'NN'),('To', 'TO'), + ('ask', 'VB'),('if', 'IN'),('there', 'EX'),('is', 'VBZ'), + ('some', 'DT'),('mistake', 'NN'),('.', '.') + ], + [ + ('The', 'DT'),('only', 'JJ'),('other', 'JJ'),('sound', 'NN'), + ('’', 'NNP'),('s', 'VBZ'),('the', 'DT'),('sweep', 'NN'), + ('Of', 'IN'),('easy', 'JJ'),('wind', 'NN'),('and', 'CC'), + ('downy', 'JJ'),('flake', 'NN'),('.', '.') + ] + ], + [ + [ + ('The', 'DT'),('woods', 'NNS'),('are', 'VBP'),('lovely', 'RB'), + (',', ','),('dark', 'JJ'),('and', 'CC'),('deep', 'JJ'),(',', ','), + ('But', 'CC'),('I', 'PRP'),('have', 'VBP'),('promises', 'NNS'), + ('to', 'TO'),('keep', 'VB'),(',', ','),('And', 'CC'),('miles', 'NNS'), + ('to', 'TO'),('go', 'VB'),('before', 'IN'),('I', 'PRP'), + ('sleep', 'VBP'),(',', ','),('And', 
'CC'),('miles', 'NNS'), + ('to', 'TO'),('go', 'VB'),('before', 'IN'),('I', 'PRP'), + ('sleep', 'VBP'),('.', '.') + ] + ] + ] + + # Create the visualizer, fit, score, and poof it + viz = PosTagVisualizer() + viz.fit(tagged_stanzas) + viz.poof() + +Universal Dependencies Tags +--------------------------- + +Libraries like SpaCy use tags from the Universal Dependencies (UD) framework. The ``PosTagVisualizer`` can also be used with text tagged using this framework by specifying the ``tagset`` keyword as "universal" on instantiation. + +.. code:: python + + tagged_speech = [ + [ + [ + ('In', 'ADP'),('all', 'DET'),('honesty', 'NOUN'),(',', 'PUNCT'), + ('I', 'PRON'),('said', 'VERB'),('yes', 'INTJ'),('to', 'ADP'), + ('the', 'DET'),('fear', 'NOUN'),('of', 'ADP'),('being', 'VERB'), + ('on', 'ADP'),('this', 'DET'),('stage', 'NOUN'),('tonight', 'NOUN'), + ('because', 'ADP'),('I', 'PRON'),('wanted', 'VERB'),('to', 'PART'), + ('be', 'VERB'),('here', 'ADV'),(',', 'PUNCT'),('to', 'PART'), + ('look', 'VERB'),('out', 'PART'),('into', 'ADP'),('this', 'DET'), + ('audience', 'NOUN'),(',', 'PUNCT'),('and', 'CCONJ'), + ('witness', 'VERB'),('this', 'DET'),('moment', 'NOUN'),('of', 'ADP'), + ('change', 'NOUN') + ], + [ + ('and', 'CCONJ'),('I', 'PRON'),("'m", 'VERB'),('not', 'ADV'), + ('fooling', 'VERB'),('myself', 'PRON'),('.', 'PUNCT') + ], + [ + ('I', 'PRON'),("'m", 'VERB'),('not', 'ADV'),('fooling', 'VERB'), + ('myself', 'PRON'),('.', 'PUNCT') + ], + [ + ('Next', 'ADJ'),('year', 'NOUN'),('could', 'VERB'),('be', 'VERB'), + ('different', 'ADJ'),('.', 'PUNCT') + ], + [ + ('It', 'PRON'),('probably', 'ADV'),('will', 'VERB'),('be', 'VERB'), + (',', 'PUNCT'),('but', 'CCONJ'),('right', 'ADV'),('now', 'ADV'), + ('this', 'DET'),('moment', 'NOUN'),('is', 'VERB'),('real', 'ADJ'), + ('.', 'PUNCT') + ], + [ + ('Trust', 'VERB'),('me', 'PRON'),(',', 'PUNCT'),('it', 'PRON'), + ('is', 'VERB'),('real', 'ADJ'),('because', 'ADP'),('I', 'PRON'), + ('see', 'VERB'),('you', 'PRON') + ], + [ + ('and', 'CCONJ'), 
('I', 'PRON'), ('see', 'VERB'), ('you', 'PRON') + ], + [ + ('—', 'PUNCT') + ], + [ + ('all', 'ADJ'),('these', 'DET'),('faces', 'NOUN'),('of', 'ADP'), + ('change', 'NOUN') + ], + [ + ('—', 'PUNCT'),('and', 'CCONJ'),('now', 'ADV'),('so', 'ADV'), + ('will', 'VERB'),('everyone', 'NOUN'),('else', 'ADV'), ('.', 'PUNCT') + ] + ] + ] + + # Create the visualizer, fit, score, and poof it + viz = PosTagVisualizer(tagset="universal") + viz.fit(tagged_speech) + viz.poof() + + ++-------------------+------------------------------------------+----------------------+--------------------------+ +| Penn-Treebank Tag | Description | Universal Tag | Description | ++===================+==========================================+======================+==========================+ +| CC | Coordinating conjunction | ADJ | adjective | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| CD | Cardinal number | ADP | adposition | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| DT | Determiner | ADV | adverb | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| EX | Existential *there* | AUX | auxiliary | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| FW | Foreign word | CONJ | conjunction | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| IN | Preposition or subordinating conjunction | CCONJ | coordinating conjunction | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| JJ | Adjective | DET | determiner | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| JJR | Adjective, comparative | INTJ | interjection | 
++-------------------+------------------------------------------+----------------------+--------------------------+ +| JJS | Adjective, superlative | NOUN | noun | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| LS | List item marker | NUM | numeral | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| MD | Modal | PART | particle | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| NN | Noun, singular or mass | PRON | pronoun | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| NNS | Noun, plural | PROPN | proper noun | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| NNP | Proper noun, singular | PUNCT | punctuation | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| NNPS | Proper noun, plural | SCONJ | subordinating conjunction| ++-------------------+------------------------------------------+----------------------+--------------------------+ +| PDT | Predeterminer | SYM | symbol | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| POS | Possessive ending | VERB | verb | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| PRP | Personal pronoun | X | other | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| PRP$ | Possessive pronoun | SPACE | space | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| RB | Adverb | | | 
++-------------------+------------------------------------------+----------------------+--------------------------+ +| RBR | Adverb, comparative | | | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| RBS | Adverb, superlative | | | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| RP | Particle | | | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| SYM | Symbol | | | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| TO | *to* | | | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| UH | Interjection | | | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| VB | Verb, base form | | | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| VBD | Verb, past tense | | | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| VBG | Verb, gerund or present participle | | | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| VBN | Verb, past participle | | | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| VBP | Verb, non-3rd person singular present | | | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| VBZ | Verb, 3rd person singular present | | | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| WDT | Wh-determiner | | | 
++-------------------+------------------------------------------+----------------------+--------------------------+ +| WP | Wh-pronoun | | | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| WP$ | Possessive wh-pronoun | | | ++-------------------+------------------------------------------+----------------------+--------------------------+ +| WRB | Wh-adverb | | | ++-------------------+------------------------------------------+----------------------+--------------------------+ + + +API Reference +------------- + +.. automodule:: yellowbrick.text.postag + :members: PosTagVisualizer + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/text/tsne.rst b/docs/api/text/tsne.rst index 09aa71654..837bb8495 100644 --- a/docs/api/text/tsne.rst +++ b/docs/api/text/tsne.rst @@ -7,59 +7,86 @@ One very popular method for visualizing document similarity is to use t-distribu Unfortunately, ``TSNE`` is very expensive, so typically a simpler decomposition method such as SVD or PCA is applied ahead of time. The ``TSNEVisualizer`` creates an inner transformer pipeline that applies such a decomposition first (SVD with 50 components by default), then performs the t-SNE embedding. The visualizer then plots the scatter plot, coloring by cluster or by class, or neither if a structural analysis is required. -.. code:: python - - from yellowbrick.text import TSNEVisualizer - from sklearn.feature_extraction.text import TfidfVectorizer +After importing the required tools, we can use the :doc:`hobbies corpus <../datasets/hobbies>` and vectorize the text using TF-IDF. Once the corpus is vectorized we can visualize it, showing the distribution of classes. -After importing the required tools, we can :doc:`load the corpus ` and vectorize the text using TF-IDF. -.. code:: python +.. 
plot:: + :context: close-figs + :alt: TSNE Plot - # Load the data and create document vectors - corpus = load_corpus('hobbies') - tfidf = TfidfVectorizer() + from sklearn.feature_extraction.text import TfidfVectorizer - docs = tfidf.fit_transform(corpus.data) - labels = corpus.target + from yellowbrick.text import TSNEVisualizer + from yellowbrick.datasets import load_hobbies -Now that the corpus is vectorized we can visualize it, showing the distribution of classes. + # Load the data and create document vectors + corpus = load_hobbies() + tfidf = TfidfVectorizer() -.. code:: python + X = tfidf.fit_transform(corpus.data) + y = corpus.target # Create the visualizer and draw the vectors tsne = TSNEVisualizer() - tsne.fit(docs, labels) + tsne.fit(X, y) tsne.poof() -.. image:: images/tsne_all_docs.png +Note that you can pass the class labels or document categories directly to the ``TSNEVisualizer`` as follows: + +.. code:: python + + labels = corpus.labels + tsne = TSNEVisualizer(labels=labels) + tsne.fit(X, y) + tsne.poof() If we omit the target during fit, we can visualize the whole dataset to see if any meaningful patterns are observed. -.. code:: python +.. plot:: + :context: close-figs + :include-source: False + :alt: TSNE Plot without Class Coloring - # Don't color points with their classes + from sklearn.feature_extraction.text import TfidfVectorizer + + from yellowbrick.text import TSNEVisualizer + from yellowbrick.datasets import load_hobbies + + # Load the data and create document vectors + corpus = load_hobbies() + tfidf = TfidfVectorizer() + + X = tfidf.fit_transform(corpus.data) tsne = TSNEVisualizer(labels=["documents"]) - tsne.fit(docs) + tsne.fit(X) tsne.poof() -.. image:: images/tsne_no_labels.png - This means we don't have to use class labels at all. Instead we can use cluster membership from K-Means to label each document. This will allow us to look for clusters of related text by their contents: -.. code:: python +.. 
plot:: + :context: close-figs + :include-source: False + :alt: TSNE Plot with K-Means Clustering - # Apply clustering instead of class names. from sklearn.cluster import KMeans + from sklearn.feature_extraction.text import TfidfVectorizer + + from yellowbrick.text import TSNEVisualizer + from yellowbrick.datasets import load_hobbies + + # Load the data and create document vectors + corpus = load_hobbies() + tfidf = TfidfVectorizer() + + X = tfidf.fit_transform(corpus.data) clusters = KMeans(n_clusters=5) - clusters.fit(docs) + clusters.fit(X) tsne = TSNEVisualizer() - tsne.fit(docs, ["c{}".format(c) for c in clusters.labels_]) + tsne.fit(X, ["c{}".format(c) for c in clusters.labels_]) tsne.poof() -.. image:: images/tsne_kmeans.png API Reference ------------- diff --git a/docs/api/text/tsne.py b/docs/api/text/umap_vis.py similarity index 53% rename from docs/api/text/tsne.py rename to docs/api/text/umap_vis.py index c85d6b206..114e42ea4 100644 --- a/docs/api/text/tsne.py +++ b/docs/api/text/umap_vis.py @@ -1,59 +1,58 @@ -# ID: tsne.py [] benjamin@bengfort.com $ +#!/usr/bin/env python3 +# ID: umap_vis.py [73a44e5] jchealy@gmail.com $ """ -Generate figures for TSNE documentation. +Manually generate figures for the UMAP documentation. 
""" - ########################################################################## ## Imports ########################################################################## import matplotlib.pyplot as plt -from corpus import load_corpus -from yellowbrick.text import TSNEVisualizer - from sklearn.cluster import KMeans from sklearn.feature_extraction.text import TfidfVectorizer +from yellowbrick.text import UMAPVisualizer +from yellowbrick.datasets import load_hobbies ########################################################################## ## Generate ########################################################################## -def tsne(docs, target, outpath, **kwargs): + +def umap(docs, target, outpath, **kwargs): # Create a new figure and axes fig = plt.figure() ax = fig.add_subplot(111) # Visualize the frequency distribution - visualizer = TSNEVisualizer(ax=ax, **kwargs) + visualizer = UMAPVisualizer(ax=ax, **kwargs) visualizer.fit(docs, target) visualizer.poof(outpath=outpath) -########################################################################## -## Main Method -########################################################################## - -if __name__ == '__main__': +if __name__ == "__main__": # Load and vectorize the corpus - corpus = load_corpus("../../../examples/data/hobbies") + corpus = load_hobbies() tfidf = TfidfVectorizer() - docs = tfidf.fit_transform(corpus.data) - target = corpus.target + docs = tfidf.fit_transform(corpus.data) + labels = corpus.target + + # Whole corpus visualization + umap(docs, labels, "images/umap.png") # Whole corpus visualization - tsne(docs, target, "images/tsne_all_docs.png") + umap(docs, labels, "images/umap_cosine.png", metric="cosine") # No labels - tsne(docs, None, "images/tsne_no_labels.png", labels=["documents"]) + umap(docs, None, "images/umap_no_labels.png", labels=["documents"], metric="cosine") - # Apply clustering instead of class names. 
+ # Apply clustering instead of class names clusters = KMeans(n_clusters=5) clusters.fit(docs) centers = ["c{}".format(c) for c in clusters.labels_] - tsne(docs, centers, "images/tsne_kmeans.png") + umap(docs, centers, "images/umap_kmeans.png") diff --git a/docs/api/text/umap_vis.rst b/docs/api/text/umap_vis.rst new file mode 100644 index 000000000..8867d550c --- /dev/null +++ b/docs/api/text/umap_vis.rst @@ -0,0 +1,134 @@ +.. -*- mode: rst -*- + +UMAP Corpus Visualization +========================== + +`Uniform Manifold Approximation and Projection (UMAP) `__ is a nonlinear +dimensionality reduction method that is well suited to embedding in two +or three dimensions for visualization as a scatter plot. UMAP is a +relatively new technique but is very effective for visualizing clusters or +groups of data points and their relative proximities. It does a good job +of learning the local structure within your data but also attempts to +preserve the relationships between your groups as can be seen in its +`exploration of +MNIST `__. +It is fast, scalable, and can be applied directly to sparse matrices, +eliminating the need to run ``TruncatedSVD`` as a pre-processing step. +Additionally, it supports a wide variety of distance measures allowing +for easy exploration of your data. For a more detailed explanation of the algorithm +the paper can be found `here `__. + +In this example, we represent documents via a `term frequency inverse +document +frequency `__ (TF-IDF) +vector and then use UMAP to find a low dimensional representation of these +documents. Next, the Yellowbrick visualizer plots the scatter plot, +coloring by cluster or by class, or neither if a structural analysis is +required. + +After importing the required tools, we can use the :doc:`hobbies corpus <../datasets/hobbies>` and vectorize the text using TF-IDF. Once the corpus is vectorized we can visualize it, showing the distribution of classes. + +.. 
note to contributors: the below code requires an additional dependency on umap-learn + that is not part of the core requirements, so has not been modified with a plot + directive. See umap_vis.py to regenerate images. + +.. code:: python + + from sklearn.feature_extraction.text import TfidfVectorizer + + from yellowbrick.datasets import load_hobbies + from yellowbrick.text import UMAPVisualizer + + # Load the text data + corpus = load_hobbies() + + tfidf = TfidfVectorizer() + docs = tfidf.fit_transform(corpus.data) + labels = corpus.target + + # Instantiate the text visualizer + umap = UMAPVisualizer() + umap.fit(docs, labels) + umap.poof() + + +.. image:: images/umap.png + +Alternatively, if we believed that cosine distance was a more +appropriate metric on our feature space we could specify that via a +``metric`` parameter passed through to the underlying UMAP function by +the ``UMAPVisualizer``. + +.. code:: python + + umap = UMAPVisualizer(metric='cosine') + umap.fit(docs, labels) + umap.poof() + + +.. image:: images/umap_cosine.png + +If we omit the target during fit, we can visualize the whole dataset to +see if any meaningful patterns are observed. + +.. code:: python + + # Don't color points with their classes + umap = UMAPVisualizer(labels=["documents"], metric='cosine') + umap.fit(docs) + umap.poof() + + +.. image:: images/umap_no_labels.png + +This means we don’t have to use class labels at all. Instead, we can use +cluster membership from K-Means to label each document. This will allow +us to look for clusters of related text by their contents: + +.. 
code:: python + + from sklearn.cluster import KMeans + from sklearn.feature_extraction.text import TfidfVectorizer + + from yellowbrick.datasets import load_hobbies + from yellowbrick.text import UMAPVisualizer + + # Load the text data + corpus = load_hobbies() + + tfidf = TfidfVectorizer() + docs = tfidf.fit_transform(corpus.data) + + # Instantiate the clustering model + clusters = KMeans(n_clusters=5) + clusters.fit(docs) + + umap = UMAPVisualizer() + umap.fit(docs, ["c{}".format(c) for c in clusters.labels_]) + umap.poof() + +.. image:: images/umap_kmeans.png + +On one hand, these clusters aren’t particularly well concentrated by the +two-dimensional embedding of UMAP; while on the other hand, the true labels +for this data are. That is a good indication that your data does indeed +live on a manifold in your TF-IDF space and that structure is being +ignored by the K-Means algorithm. Clustering can be quite tricky in high +dimensional spaces and it is often a good idea to reduce your dimension +before running clustering algorithms on your data. + +UMAP, it should be noted, is a manifold learning technique and as such +does not seek to preserve the distances between your data points in high +space but instead to learn the distances along an underlying manifold on +which your data points lie. As such, one shouldn’t be too surprised when +it disagrees with a non-manifold based clustering technique. A detailed +explanation of this phenomenon can be found in this `UMAP +documentation `__. + +API Reference +------------- + +.. 
automodule:: yellowbrick.text.umap_vis + :members: UMAPVisualizer + :undoc-members: + :show-inheritance: diff --git a/docs/changelog.rst b/docs/changelog.rst index ebb822017..0b4786591 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -3,6 +3,87 @@ Changelog ========= +Version 1.0 +----------- + +* Tag: v1.0_ +* Deployed: Not yet deployed +* Contributors: Benjamin Bengfort, Rebecca Bilbro, Nathan Danielsen, Kristen McIntyre, Larry Gray, Prema Roman, Carl Dawson, Daniel Navarrete, Francois Dion, Halee Mason, Jeff Hale, Jiayi Zhang, Jimmy Shah, John Healy, Justin Ormont, Kevin Arvai, Michael Garod, Mike Curry, Nabanita Dash, Naresh Bachwani, Nicholas A. Brown, Piyush Gautam, Pradeep Singh, Rohit Ganapathy, Ry Whittington, Sangarshanan, Sourav Singh, Thomas J Fan, Zijie (ZJ) Poh, Zonghan, Xie + +.. warning:: **Python 2 Deprecation**: Please note that this release deprecates Yellowbrick's support for Python 2.7. After careful consideration and following the lead of our primary dependencies (NumPy, scikit-learn, and Matplotlib), we have chosen to move forward with the community and support Python 3.4 and later. + +Major Changes: + - New ``JointPlot`` visualizer that is specifically designed for machine learning. The new visualizer can compare a feature to a target, features to features, and even feature to feature to target using color. The visualizer gives correlation information at a glance and is designed to work on ML datasets. + - New ``PosTagVisualizer`` is specifically designed for diagnostics around natural language processing and grammar-based feature extraction for machine learning. This new visualizer shows counts of different parts-of-speech throughout a tagged corpus. + - New datasets module that provides greater support for interacting with Yellowbrick example datasets including support for Pandas, npz, and text corpora. + - Management repository for Yellowbrick example data, ``yellowbrick-datasets``. 
+ - Add support for matplotlib 3.0.1 or greater. + - ``UMAPVisualizer`` as an alternative manifold to TSNE for corpus visualization that is fast enough to not require preprocessing PCA or SVD decomposition and preserves higher order similarities and distances. + - Added ``..plot::`` directives to the documentation to automatically build the images along with the docs and keep them as up to date as possible. The directives also include the source code making it much simpler to recreate examples. + - Added ``target_color_type`` functionality to determine continuous or discrete color representations based on the type of the target variable. + - Added alpha param for both test and train residual points in ``ResidualsPlot``. + - Added ``frameon`` param to ``Manifold``. + - Added frequency sort feature to ``PosTagVisualizer``. + - Added elbow detection using the "kneedle" method to the ``KElbowVisualizer``. + - Added governance document outlining new Yellowbrick structure. + - Added ``CooksDistance`` regression visualizer. + - Updated ``DataVisualizer`` to handle target type identification. + - Extended ``DataVisualizer`` and updated its subclasses. + - Added ``ProjectionVisualizer`` base class. + - Restructured ``yellowbrick.target``, ``yellowbrick.features``, and ``yellowbrick.model_selection`` API. + - Restructured regressor and classifier API. + +Minor Changes: + - Updated ``Rank2D`` to include Kendall-Tau metric. + - Added user specification of ISO F1 values to ``PrecisionRecallCurve`` and updated the quick method to accept train and test splits. + - Added code review checklist and conventions to the documentation and expanded the contributing docs to include other tricks and tips. + - Added polish to missing value visualizers code, tests, and documentation. + - Improved ``RankD`` tests for better coverage. + - Added quick method test for ``DispersionPlot`` visualizer. 
+ - BugFix: fixed resolve colors bug in TSNE and UMAP text visualizers and added regression tests to prevent future errors. + - BugFix: Added support for Yellowbrick palettes to return ``colormap``. + - BugFix: fixed ``PrecisionRecallCurve`` visual display problem with multi-class labels. + - BugFix: fixed the ``RFECV`` step display bug. + - BugFix: fixed error in distortion score calculation. + - Extended ``FeatureImportances`` documentation and tests for stacked importances and added a warning when stack should be true. + - Improved the documentation readability and structure. + - Refreshed the ``README.md`` and added testing and documentation READMEs. + - Updated the gallery to generate thumbnail-quality images. + - Updated the example notebooks and created a quickstart notebook. + - Fixed broken links in the documentation. + - Enhanced the ``SilhouetteVisualizer`` with ``legend`` and ``color`` parameters, while also moving labels to the y-axis. + - Extended ``FeatureImportances`` docs/tests for stacked importances. + - Documented the ``yellowbrick.download`` script. + - Added JOSS citation for "Yellowbrick: Visualizing the Scikit-Learn Model Selection Process". + - Added new pull request (PR) template. + - Added ``alpha`` param to PCA Decomposition Visualizer. + - Updated documentation with affiliations. + - Added a ``windows_tol`` for the visual unittest suite. + - Added stacked barchart to ``PosTagVisualizer``. + - Let users set colors for ``FreqDistVisualizer`` and other ``ax_bar`` visualizers. + - Updated ``Manifold`` to extend ``ProjectionVisualizer``. + - Check if an estimator is already fitted before calling ``fit`` method. + - Ensure ``poof`` returns ``ax``. + +Compatibility Notes: + - This version provides support for matplotlib 3.0.1 or greater and drops support for matplotlib versions less than 2.0. + - This version drops support for Python 2 + +.. 
_v1.0: https://github.com/DistrictDataLabs/yellowbrick/releases/tag/v1.0 + + +Hotfix 0.9.1 +------------ + +This hotfix adds matplotlib3 support by requiring any version of matplotlib except for 3.0.0 which had a backend bug that affected Yellowbrick. + +* Tag: v0.9.1_ +* Deployed: Tuesday, February 5, 2019 +* Contributors: Benjamin Bengfort, Rebecca Bilbro, Ian Ozsvald, Francois Dion + +.. _v0.9.1: https://github.com/DistrictDataLabs/yellowbrick/releases/tag/v0.9.1 + + Version 0.9 ----------- * Tag: v0.9_ diff --git a/docs/conf.py b/docs/conf.py index d1e700567..88b151555 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,81 +1,100 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +# conf +# Yellowbrick documentation build config file, created by sphinx-quickstart # -# yellowbrick documentation build configuration file, created by -# sphinx-quickstart on Tue Jul 5 19:45:43 2016. +# Created: Tue Jul 05 19:45:43 2016 -0400 +# Copyright (C) 2016-2019 The scikit-yb developers +# For license information, see LICENSE.txt # -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. +# ID: conf.py [6d697b7] lalorenz6@gmail.com $ + +""" +Yellowbrick documentation build config file, created by sphinx-quickstart. + +This file is executed with the current directory set to its containing dir +by ``execfile()``, e.g. the working directory will be yellowbrick/docs. +Ensure that all specified paths relative to the docs directory are made +absolute by using ``os.path.abspath``. + +Note that not all possible configuration values are present in this +autogenerated file. + +All configuration values have a default; values that are commented out +serve to show the default. 
+ +See: https://www.sphinx-doc.org/en/master/usage/configuration.html +for more details on configuring the documentation build. +""" + +########################################################################## +## Imports +########################################################################## + +import os +import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -# -import os -import sys -sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath("..")) # Set the backend of matplotlib to prevent build errors. import matplotlib -matplotlib.use('agg') +matplotlib.use("agg") + +# Import yellowbrick information. import yellowbrick as yb -# -- General configuration ------------------------------------------------ +########################################################################## +## General configuration +########################################################################## # If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' +# needs_sphinx = '1.8' + +# General information about the project. +project = "Yellowbrick" +copyright = "2016-2019, The scikit-yb developers." +author = "The scikit-yb developers" + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. + +# The short X.Y version. +version = yb.get_version(short=True) +# The full version, including alpha/beta/rc tags. +release = "v" + yb.get_version(short=False) # Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. +# extensions coming with Sphinx (named 'sphinx.ext.*') or custom ones. 
extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.intersphinx', - 'sphinx.ext.coverage', - 'sphinx.ext.mathjax', - 'sphinx.ext.viewcode', - 'sphinx.ext.todo', - 'numpydoc', + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.intersphinx", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.viewcode", + "sphinx.ext.todo", + "numpydoc", + "matplotlib.sphinxext.plot_directive", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: -# # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -# # source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' - -# General information about the project. -project = 'yellowbrick' -copyright = '2016, District Data Labs' -author = 'District Data Labs' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = yb.__version__ -# The full version, including alpha/beta/rc tags. -release = yb.__version__ +master_doc = "index" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -86,39 +105,32 @@ # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -# # today = '' -# + # Else, today_fmt is used as the format for a strftime call. -# # today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] -# The reST default role (used for this markup: `text`) to use for all -# documents. -# +# The reST default role (used for this markup: `text`) for all docs. # default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -# # add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -# # add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -# # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -126,16 +138,63 @@ # If true, keep warnings as "system message" paragraphs in the built documents. # keep_warnings = False +########################################################################## +## Extension Configuration +########################################################################## + # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False +# Auto-plot settings either as extension or (file format, dpi) +plot_formats = [ + "png", + "pdf", + # ('hires.png', 350), +] + +# By default, include the source code generating plots in documentation +plot_include_source = True + +# Whether to show a link to the source in HTML. +plot_html_show_source_link = True + +# Code that should be executed before each plot. +plot_pre_code = ( + "import numpy as np\n" + "import matplotlib.pyplot as plt\n" + "from yellowbrick.datasets import *\n" +) + +# Whether to show links to the files in HTML. 
+plot_html_show_formats = True -# -- Options for HTML output ---------------------------------------------- +# A dictionary containing any non-standard rcParams that should be applied before each plot. +plot_rcparams = {"figure.figsize": (9, 6), "figure.dpi": 128} + +# Autodoc requires numpy to skip class members otherwise we get an exception: +# toctree contains reference to nonexisting document +# See: https://github.com/phn/pytpm/issues/3#issuecomment-12133978 +numpydoc_show_class_members = False + +# Locations of objects.inv files for intersphinx extension that auto-links +# to external api docs. +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "matplotlib": ("http://matplotlib.org/", None), + "scipy": ("http://docs.scipy.org/doc/scipy/reference", None), + "numpy": ("https://docs.scipy.org/doc/numpy/", None), + "cycler": ("http://matplotlib.org/cycler/", None), + "sklearn": ("http://scikit-learn.org/stable/", None), +} + +########################################################################## +## Options for HTML output +########################################################################## # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -164,16 +223,18 @@ # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. # -# html_favicon = None +html_favicon = "images/favicon.ico" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
-html_static_path = ['_static'] +html_static_path = ["_static"] + def setup(app): app.add_stylesheet("theme_overrides.css") + # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. @@ -252,34 +313,36 @@ def setup(app): # html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. -htmlhelp_basename = 'yellowbrickdoc' +htmlhelp_basename = "yellowbrickdoc" -# -- Options for LaTeX output --------------------------------------------- +########################################################################## +## Options for LaTeX output +########################################################################## latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # - # 'preamble': '', - - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', } -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). +# Grouping the document tree into LaTeX files. List of tuples. 
latex_documents = [ - (master_doc, 'yellowbrick.tex', 'yellowbrick Documentation', - 'District Data Labs', 'manual'), + ( + master_doc, # source start file + "yellowbrick.tex", # target name + "{} Documentation".format(project), # title + author, # author + "manual", # documentclass [howto,manual, or own class] + ) ] # The name of an image file (relative to this directory) to place at the top of @@ -308,15 +371,13 @@ def setup(app): # # latex_domain_indices = True - -# -- Options for manual page output --------------------------------------- +########################################################################## +## Options for manual page output +########################################################################## # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'yellowbrick', 'yellowbrick Documentation', - [author], 1) -] +man_pages = [(master_doc, project, "{} Documentation".format(project), [author], 1)] # If true, show URL addresses after external links. # @@ -329,9 +390,15 @@ def setup(app): # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'yellowbrick', 'yellowbrick Documentation', - author, 'yellowbrick', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "yellowbrick", + "{} Documentation".format(project), + author, + "yellowbrick", + "machine learning visualization", + "scientific visualization", + ) ] # Documents to append as an appendix to all manuals. @@ -349,16 +416,3 @@ def setup(app): # If true, do not generate a @detailmenu in the "Top" node's menu. 
# # texinfo_no_detailmenu = False - -# Autodoc requires numpy to skip class members otherwise we get an exception: -# toctree contains reference to nonexisting document -# See: https://github.com/phn/pytpm/issues/3#issuecomment-12133978 -numpydoc_show_class_members = False - -# Locations of objects.inv files for intersphinx extension that auto links to external api docs. -intersphinx_mapping = {'python': ('https://docs.python.org/3', None), - 'matplotlib': ('http://matplotlib.org/', None), - 'scipy': ('http://scipy.github.io/devdocs/', None), - 'numpy': ('https://docs.scipy.org/doc/numpy-dev/', None), - 'cycler': ('http://matplotlib.org/cycler/', None), - 'sklearn': ('http://scikit-learn.org/stable/', None)} diff --git a/docs/contributing.rst b/docs/contributing.rst deleted file mode 100644 index 9765ec1ed..000000000 --- a/docs/contributing.rst +++ /dev/null @@ -1,426 +0,0 @@ -.. -*- mode: rst -*- - -Contributing -============ - -Yellowbrick is an open source project that is supported by a community who will gratefully and humbly accept any contributions you might make to the project. Large or small, any contribution makes a big difference; and if you've never contributed to an open source project before, we hope you will start with Yellowbrick! - -Principally, Yellowbrick development is about the addition and creation of *visualizers* --- objects that learn from data and create a visual representation of the data or model. Visualizers integrate with scikit-learn estimators, transformers, and pipelines for specific purposes and as a result, can be simple to build and deploy. The most common contribution is a new visualizer for a specific model or model family. We'll discuss in detail how to build visualizers later. - -Beyond creating visualizers, there are many ways to contribute: - -- Submit a bug report or feature request on `GitHub issues`_. -- Contribute an Jupyter notebook to our `examples gallery`_. -- Assist us with :doc:`user testing `. 
-- Add to the documentation or help with our website, `scikit-yb.org`_ -- Write unit or integration tests for our project. -- Answer questions on our `GitHub issues`_, `mailing list`_, `Stack Overflow`_, and `Twitter`_. -- Translate our documentation into another language. -- Write a blog post, tweet, or share our project with others. -- Teach someone how to use Yellowbrick. - -As you can see, there are lots of ways to get involved and we would be very happy for you to join us! The only thing we ask is that you abide by the principles of openness, respect, and consideration of others as described in our :doc:`code_of_conduct`. - -.. note:: If you're unsure where to start, perhaps the best place is to drop the maintainers a note via our mailing list: http://bit.ly/yb-listserv. - -.. _`examples gallery`: https://github.com/DistrictDataLabs/yellowbrick/tree/develop/examples -.. _`scikit-yb.org`: http://www.scikit-yb.org -.. _`GitHub issues`: https://github.com/DistrictDataLabs/yellowbrick/issues -.. _`mailing list`: http://bit.ly/yb-listserv -.. _`Stack Overflow`: https://stackoverflow.com/questions/tagged/yellowbrick -.. _`Twitter`: https://twitter.com/scikit_yb - -Getting Started on GitHub -------------------------- - -Yellowbrick is hosted on GitHub at https://github.com/DistrictDataLabs/yellowbrick. - -The typical workflow for a contributor to the codebase is as follows: - -1. **Discover** a bug or a feature by using Yellowbrick. -2. **Discuss** with the core contributors by `adding an issue `_. -3. **Assign** yourself the task by pulling a card from our `Waffle Kanban `_ -4. **Fork** the repository into your own GitHub account. -5. Create a **Pull Request** first thing to `connect with us `_ about your task. -6. **Code** the feature, write the tests and documentation, add your contribution. -7. **Review** the code with core contributors who will guide you to a high quality submission. -8. **Merge** your contribution into the Yellowbrick codebase. - -.. 
note:: Please create a pull request as soon as possible, even before you've started coding. This will allow the core contributors to give you advice about where to add your code or utilities and discuss other style choices and implementation details as you go. Don't wait! - -We believe that *contribution is collaboration* and therefore emphasize *communication* throughout the open source process. We rely heavily on GitHub's social coding tools to allow us to do this. - -Forking the Repository -~~~~~~~~~~~~~~~~~~~~~~ - -The first step is to fork the repository into your own account. This will create a copy of the codebase that you can edit and write to. Do so by clicking the **"fork"** button in the upper right corner of the Yellowbrick GitHub page. - -Once forked, use the following steps to get your development environment set up on your computer: - -1. Clone the repository. - - After clicking the fork button, you should be redirected to the GitHub page of the repository in your user account. You can then clone a copy of the code to your local machine.:: - - $ git clone https://github.com/[YOURUSERNAME]/yellowbrick - $ cd yellowbrick - -2. Create a virtual environment. - - Yellowbrick developers typically use `virtualenv `_ (and `virtualenvwrapper `_), `pyenv `_ or `conda envs `_ in order to manage their Python version and dependencies. Using the virtual environment tool of your choice, create one for Yellowbrick. Here's how with virtualenv:: - - $ virtualenv venv - -3. Install dependencies. - - Yellowbrick's dependencies are in the ``requirements.txt`` document at the root of the repository. Open this file and uncomment the dependencies that are for development only. Then install the dependencies with ``pip``:: - - $ pip install -r requirements.txt - - Note that there may be other dependencies required for development and testing; you can simply install them with ``pip``. 
For example to install - the additional dependencies for building the documentation or to run the - test suite, use the ``requirements.txt`` files in those directories:: - - $ pip install -r tests/requirements.txt - $ pip install -r docs/requirements.txt - -4. Switch to the develop branch. - - The Yellowbrick repository has a ``develop`` branch that is the primary working branch for contributions. It is probably already the branch you're on, but you can make sure and switch to it as follows:: - - $ git fetch - $ git checkout develop - -At this point you're ready to get started writing code. If you're going to take on a specific task, we'd strongly encourage you to check out the issue on `Waffle `_ and create a `pull request `_ *before you start coding* to better foster communication with other contributors. More on this in the next section. - -Pull Requests -~~~~~~~~~~~~~ - -A `pull request (PR) `_ is a GitHub tool for initiating an exchange of code and creating a communication channel for Yellowbrick maintainers to discuss your contribution. In essenence, you are requesting that the maintainers merge code from your forked repository into the develop branch of the primary Yellowbrick repository. Once completed, your code will be part of Yellowbrick! - -When starting a Yellowbrick contribution, *open the pull request as soon as possible*. We use your PR issue page to discuss your intentions and to give guidance and direction. Every time you push a commit into your forked repository, the commit is automatically included with your pull request, therefore we can review as you code. The earlier you open a PR, the more easily we can incorporate your updates, we'd hate for you to do a ton of work only to discover someone else already did it or that you went in the wrong direction and need to refactor. - -.. note:: For a great example of a pull request for a new feature visualizer, check out `this one `_ by `Carlo Morales `_. 
- -When you open a pull request, ensure it is from your forked repository to the develop branch of `github.com/districtdatalabs/yellowbrick `_; we will not merge a PR into the master branch. Title your Pull Request so that it is easy to understand what you're working on at a glance. Also be sure to include a reference to the issue that you're working on so that correct references are set up. - -After you open a PR, you should get a message from one of the maintainers. Use that time to discuss your idea and where best to implement your work. Feel free to go back and forth as you are developing with questions in the comment thread of the PR. Once you are ready, please ensure that you explicitly ping the maintainer to do a code review. Before code review, your PR should contain the following: - -1. Your code contribution -2. Tests for your contribution -3. Documentation for your contribution -4. A PR comment describing the changes you made and how to use them -5. A PR comment that includes an image/example of your visualizer - -At this point your code will be formally reviewed by one of the contributors. We use GitHub's code review tool, starting a new code review and adding comments to specific lines of code as well as general global comments. Please respond to the comments promptly, and don't be afraid to ask for help implementing any requested changes! You may have to go back and forth a couple of times to complete the code review. - -When the following is true: - -1. Code is reviewed by at least one maintainer -2. Continuous Integration tests have passed -3. Code coverage and quality have not decreased -4. Code is up to date with the yellowbrick develop branch - -Then we will "Squash and Merge" your contribution, combining all of your commits into a single commit and merging it into the develop branch of Yellowbrick. Congratulations! Once your contribution has been merged into master, you will be officially listed as a contributor. 
- -Developing Visualizers ----------------------- - -In this section, we'll discuss the basics of developing visualizers. This of course is a big topic, but hopefully these simple tips and tricks will help make sense. First thing though, check out this presentation that we put together on yellowbrick development, it discusses the expected user workflow, our integration with scikit-learn, our plans and roadmap, etc: - -.. raw:: html - - - -One thing that is necessary is a good understanding of scikit-learn and Matplotlib. Because our API is intended to integrate with scikit-learn, a good start is to review `"APIs of scikit-learn objects" `_ and `"rolling your own estimator" `_. In terms of matplotlib, use Yellowbrick's guide :doc:`matplotlib`. Additional resources include `Nicolas P. Rougier's Matplotlib tutorial `_ and `Chris Moffitt's Effectively Using Matplotlib `_. - -Visualizer API -~~~~~~~~~~~~~~ - -There are two basic types of Visualizers: - -- **Feature Visualizers** are high dimensional data visualizations that are essentially transformers. -- **Score Visualizers** wrap a scikit-learn regressor, classifier, or clusterer and visualize the behavior or performance of the model on test data. - -These two basic types of visualizers map well to the two basic objects in scikit-learn: - -- **Transformers** take input data and return a new data set. -- **Estimators** are fit to training data and can make predictions. - -The scikit-learn API is object oriented, and estimators and transformers are initialized with parameters by instantiating their class. Hyperparameters can also be set using the ``set_attrs()`` method and retrieved with the corresponding ``get_attrs()`` method. All scikit-learn estimators have a ``fit(X, y=None)`` method that accepts a two dimensional data array, ``X``, and optionally a vector ``y`` of target values. The ``fit()`` method trains the estimator, making it ready to transform data or make predictions. 
Transformers have an associated ``transform(X)`` method that returns a new dataset, ``Xprime`` and models have a ``predict(X)`` method that returns a vector of predictions, ``yhat``. Models also have a ``score(X, y)`` method that evaluate the performance of the model. - -Visualizers interact with scikit-learn objects by intersecting with them at the methods defined above. Specifically, visualizers perform actions related to ``fit()``, ``transform()``, ``predict()``, and ``score()`` then call a ``draw()`` method which initializes the underlying figure associated with the visualizer. The user calls the visualizer's ``poof()`` method, which in turn calls a ``finalize()`` method on the visualizer to draw legends, titles, etc. and then ``poof()`` renders the figure. The Visualizer API is therefore: - -- ``draw()``: add visual elements to the underlying axes object -- ``finalize()``: prepare the figure for rendering, adding final touches such as legends, titles, axis labels, etc. -- ``poof()``: render the figure for the user (or saves it to disk). - -Creating a visualizer means defining a class that extends ``Visualizer`` or one of its subclasses, then implementing several of the methods described above. A barebones implementation is as follows:: - - import matplotlib.pyplot as plt - - from yellowbrick.base import Visualizer - - class MyVisualizer(Visualizer): - - def __init__(self, ax=None, **kwargs): - super(MyVisualizer, self).__init__(ax, **kwargs) - - def fit(self, X, y=None): - self.draw(X) - return self - - def draw(self, X): - if self.ax is None: - self.ax = self.gca() - - self.ax.plt(X) - - def finalize(self): - self.set_title("My Visualizer") - -This simple visualizer simply draws a line graph for some input dataset X, intersecting with the scikit-learn API at the ``fit()`` method. 
A user would use this visualizer in the typical style:: - - visualizer = MyVisualizer() - visualizer.fit(X) - visualizer.poof() - -Score visualizers work on the same principle but accept an additional required ``model`` argument. Score visualizers wrap the model (which can be either instantiated or uninstantiated) and then pass through all attributes and methods through to the underlying model, drawing where necessary. - -Testing -~~~~~~~ - -The test package mirrors the yellowbrick package in structure and also contains several helper methods and base functionality. To add a test to your visualizer, find the corresponding file to add the test case, or create a new test file in the same place you added your code. - -Visual tests are notoriously difficult to create --- how do you test a visualization or figure? Moreover, testing scikit-learn models with real data can consume a lot of memory. Therefore the primary test you should create is simply to test your visualizer from end to end and make sure that no exceptions occur. To assist with this, we have two primary helpers, ``VisualTestCase`` and ``DatasetMixin``. Create your unittest as follows:: - - import pytest - from tests.base import VisualTestCase - from tests.dataset import DatasetMixin - - class MyVisualizerTests(VisualTestCase, DatasetMixin): - - def test_my_visualizer(self): - """ - Test MyVisualizer on a real dataset - """ - # Load the data from the fixture - dataset = self.load_data('occupancy') - - # Get the data - X = dataset[[ - "temperature", "relative_humidity", "light", "C02", "humidity" - ]] - y = dataset['occupancy'].astype(int) - - try: - visualizer = MyVisualizer() - visualizer.fit(X) - visualizer.poof() - except Exception as e: - pytest.fail("my visualizer didn't work") - -Tests can be run as follows:: - - $ make test - -The Makefile uses the pytest runner and testing suite as well as the coverage library, so make sure you have those dependencies installed! 
The ``DatasetMixin`` also requires `requests.py `_ to fetch data from our Amazon S3 account. - -Image Comparison Tests -~~~~~~~~~~~~~~~~~~~~~~ - -Writing an image based comparison test is only a little more difficult than the simple testcase presented above. We have adapted matplotlib's image comparison test utility into an easy to use assert method : ``self.assert_images_similar(visualizer)`` - -The main consideration is that you must specify the “baseline”, or expected, image in the ``tests/baseline_images/`` folder structure. - -For example, create your unittest located in ``tests/test_regressor/test_myvisualizer.py`` as follows:: - - from tests.base import VisualTestCase - ... - def test_my_visualizer_output(self): - ... - visualizer = MyVisualizer() - visualizer.fit(X) - visualizer.poof() - self.assert_images_similar(visualizer) - -The first time this test is run, there will be no baseline image to compare against, so the test will fail. Copy the output images (in this case ``tests/actual_images/test_regressor/test_myvisualizer/test_my_visualizer_output.png``) to the correct subdirectory of baseline_images tree in the source directory (in this case ``tests/baseline_images/test_regressor/test_myvisualizer/test_my_visualizer_output.png``). Put this new file under source code revision control (with git add). When rerunning the tests, they should now pass. - -We also have a helper script, ``tests/images.py`` to clean up and manage baseline images automatically. It is run using the ``python -m`` command to execute a module as main, and it takes as an argument the path to your *test file*. To copy the figures as above:: - - $ python -m tests.images tests/test_regressor/test_myvisualizer.py - -This will move all related test images from ``actual_images`` to ``baseline_images`` on your behalf (note you'll have had to run the tests at least once to generate the images). 
You can also clean up images from both actual and baseline as follows:: - - $ python -m tests.images -C tests/test_regressor/test_myvisualizer.py - -This is useful particularly if you're stuck trying to get an image comparison to work. For more information on the images helper script, use ``python -m tests.images --help``. - -Documentation -~~~~~~~~~~~~~ - -The initial documentation for your visualizer will be a well structured docstring. Yellowbrick uses Sphinx to build documentation, therefore docstrings should be written in reStructuredText in numpydoc format (similar to scikit-learn). The primary location of your docstring should be right under the class definition, here is an example:: - - class MyVisualizer(Visualizer): - """ - This initial section should describe the visualizer and what - it's about, including how to use it. Take as many paragraphs - as needed to get as much detail as possible. - - In the next section describe the parameters to __init__. - - Parameters - ---------- - - model : a scikit-learn regressor - Should be an instance of a regressor, and specifically one whose name - ends with "CV" otherwise a will raise a YellowbrickTypeError exception - on instantiation. To use non-CV regressors see: - ``ManualAlphaSelection``. - - ax : matplotlib Axes, default: None - The axes to plot the figure on. If None is passed in the current axes - will be used (or generated if required). - - kwargs : dict - Keyword arguments that are passed to the base class and may influence - the visualization as defined in other Visualizers. - - Examples - -------- - - >>> model = MyVisualizer() - >>> model.fit(X) - >>> model.poof() - - Notes - ----- - - In the notes section specify any gotchas or other info. - """ - -When your visualizer is added to the API section of the documentation, this docstring will be rendered in HTML to show the various options and functionality of your visualizer! 
- -To add the visualizer to the documentation it needs to be added to the ``docs/api`` folder in the correct subdirectory. For example if your visualizer is a model score visualizer related to regression it would go in the ``docs/api/regressor`` subdirectory. If you have a question where your documentation should be located, please ask the maintainers via your pull request, we'd be happy to help! - -There are two primary files that need to be created: - -1. **mymodule.rst**: the reStructuredText document -2. **mymodule.py**: a python file that generates images for the rst document - -There are quite a few examples in the documentation on which you can base your files of similar types. The primary format for the API section is as follows:: - - .. -*- mode: rst -*- - - My Visualizer - ============= - - Intro to my visualizer - - .. code:: python - - # Example to run MyVisualizer - visualizer = MyVisualizer(LinearRegression()) - - visualizer.fit(X, y) - g = visualizer.poof() - - - .. image:: images/my_visualizer.png - - Discussion about my visualizer - - - API Reference - ------------- - - .. automodule:: yellowbrick.regressor.mymodule - :members: MyVisualizer - :undoc-members: - :show-inheritance: - -This is a pretty good structure for a documentation page; a brief introduction followed by a code example with a visualization included (using the ``mymodule.py`` to generate the images into the local directory's ``images`` subdirectory). The primary section is wrapped up with a discussion about how to interpret the visualizer and use it in practice. Finally the ``API Reference`` section will use ``automodule`` to include the documentation from your docstring. - -At this point there are several places where you can list your visualizer, but to ensure it is included in the documentation it *must be listed in the TOC of the local index*. Find the ``index.rst`` file in your subdirectory and add your rst file (without the ``.rst`` extension) to the ``..toctree::`` directive. 
This will ensure the documentation is included when it is built. - -Speaking of, you can build your documentation by changing into the ``docs`` directory and running ``make html``, the documentation will be built and rendered in the ``_build/html`` directory. You can view it by opening ``_build/html/index.html`` then navigating to your documentation in the browser. - -There are several other places that you can list your visualizer including: - - - ``docs/index.rst`` for a high level overview of our visualizers - - ``DESCRIPTION.rst`` for inclusion on PyPI - - ``README.md`` for inclusion on GitHub - -Please ask for the maintainer's advice about how to include your visualizer in these pages. - -Advanced Development --------------------- - -In this section we discuss more advanced contributing guidelines including setting up branches for development as well as the release cycle. This section is intended for maintainers and core contributors of the Yellowbrick project. If you would like to be a maintainer please contact one of the current maintainers of the project. - -Branching Convention -~~~~~~~~~~~~~~~~~~~~ - -The Yellowbrick repository is set up in a typical production/release/development cycle as described in "`A Successful Git Branching Model `_." The primary working branch is the ``develop`` branch. This should be the branch that you are working on and from, since this has all the latest code. The ``master`` branch contains the latest stable version and release_, which is pushed to PyPI_. No one but core contributors will generally push to master. - -.. note:: All pull requests should be into the ``yellowbrick/develop`` branch from your forked repository. - -You can work directly in your fork and create a pull request from your fork's develop branch into ours. We also recommend setting up an ``upstream`` remote so that you can easily pull the latest development changes from the main Yellowbrick repository (see `configuring a remote for a fork `_). 
You can do that as follows:: - - $ git remote add upstream https://github.com/DistrictDataLabs/yellowbrick.git - $ git remote -v - origin https://github.com/YOUR_USERNAME/YOUR_FORK.git (fetch) - origin https://github.com/YOUR_USERNAME/YOUR_FORK.git (push) - upstream https://github.com/DistrictDataLabs/yellowbrick.git (fetch) - upstream https://github.com/DistrictDataLabs/yellowbrick.git (push) - -When you're ready, request a code review for your pull request. Then, when reviewed and approved, you can merge your fork into our main branch. Make sure to use the "Squash and Merge" option in order to create a Git history that is understandable. - -.. note:: When merging a pull request, use the "squash and merge" option. - -Core contributors have write access to the repository. In order to reduce the number of merges (and merge conflicts) we recommend that you utilize a feature branch off of develop to do intermediate work in:: - - $ git checkout -b feature-myfeature develop - -Once you are done working (and everything is tested) merge your feature into develop.:: - - $ git checkout develop - $ git merge --no-ff feature-myfeature - $ git branch -d feature-myfeature - $ git push origin develop - -Head back to Waffle and checkout another issue! - -Releases -~~~~~~~~ - -When ready to create a new release we branch off of develop as follows:: - - $ git checkout -b release-x.x - -This creates a release branch for version x.x. At this point do the version bump by modifying ``version.py`` and the test version in ``tests/__init__.py``. Make sure all tests pass for the release and that the documentation is up to date. There may be style changes or deployment options that have to be done at this phase in the release branch. At this phase you'll also modify the ``changelog`` with the features and changes in the release. 
- -Once the release is ready for prime-time, merge into master:: - - $ git checkout master - $ git merge --no-ff --no-edit release-x.x - -Tag the release in GitHub:: - - $ git tag -a vx.x - $ git push origin vx.x - -You'll have to go to the release_ page to edit the release with similar information as added to the changelog. Once done, push the release to PyPI:: - - $ make build - $ make deploy - -Check that the PyPI page is updated with the correct version and that ``pip install -U yellowbrick`` updates the version and works correctly. Also check the documentation on PyHosted, ReadTheDocs, and on our website to make sure that it was correctly updated. Finally merge the release into develop and clean up:: - - $ git checkout develop - $ git merge --no-ff --no-edit release-x.x - $ git branch -d release-x.x - -Hotfixes and minor releases also follow a similar pattern; the goal is to effectively get new code to users as soon as possible! - -.. _release: https://github.com/DistrictDataLabs/yellowbrick/releases -.. _PyPI: https://pypi.python.org/pypi/yellowbrick diff --git a/docs/contributing/advanced_development_topics.rst b/docs/contributing/advanced_development_topics.rst new file mode 100644 index 000000000..6f32446c2 --- /dev/null +++ b/docs/contributing/advanced_development_topics.rst @@ -0,0 +1,233 @@ +.. -*- mode: rst -*- + +Advanced Development Topics +=========================== + +In this section we discuss more advanced contributing guidelines such as code conventions, the release life cycle or branch management. This section is intended for maintainers and core contributors of the Yellowbrick project. If you would like to be a maintainer please contact one of the current maintainers of the project. + +Reviewing Pull Requests +----------------------- + +We use several strategies when reviewing pull requests from contributors to Yellowbrick.
If the pull request affects only a single file or a small portion of the code base, it is sometimes sufficient to review the code using `GitHub's lightweight code review feature `_. However, if the changes impact a number of files or modify the documentation, our convention is to add the contributor's fork as a remote, pull, and check out their feature branch locally. From inside your fork of Yellowbrick, this can be done as follows:: + + $ git remote add contribsusername https://github.com/contribsusername/yellowbrick.git + $ git fetch contribsusername + $ git checkout -b contribsfeaturebranch contribsusername/contribsfeaturebranch + +This will allow you to inspect their changes, run the tests, and build the docs locally. If the contributor has elected to allow reviewers to modify their feature branch, you will also be able to push changes directly to their branch:: + + $ git add filethatyouchanged.py + $ git commit -m "Adjusted tolerance levels to appease AppVeyor" + $ git push contribsusername contribsfeaturebranch + +These changes will automatically go into the pull request, which can be useful for making small modifications (e.g. visual test tolerance levels) to get the PR over the finish line. + + +Visualizer Review Checklist +--------------------------- + +As the visualizer API has matured over time, we've realized that there are a number of routine items that must be in place to consider a visualizer truly complete and ready for prime time. This list is also extremely helpful for reviewing code submissions to ensure that visualizers are consistently implemented, tested, and documented. Though we do not expect these items to be checked off on every PR, the below list includes some guidance about what to look for when reviewing or writing a new Visualizer. + +.. note:: The ``contrib`` module is a great place for work-in-progress Visualizers! + +Code Conventions +~~~~~~~~~~~~~~~~ + +- Ensure the visualizer API is met. 
+ + The basic principle of the visualizer API is that scikit-learn methods such as ``fit()``, ``transform()``, ``score()``, etc. perform interactions with scikit-learn or other computations and call the ``draw()`` method. Calls to matplotlib should happen only in ``draw()`` or ``finalize()``. + +- Create a quick method for the visualizer. + + In addition to creating the visualizer class, ensure there is an associated quick method that returns the visualizer and creates the visualization in one line of code! + +- Subclass the correct visualizer. + + Ensure that the visualizer is correctly subclassed in the class hierarchy. If you're not sure what to subclass, please ping a maintainer, they'd be glad to help! + +- Ensure numpy array comparisons are not ambiguous. + + Often there is code such as ``if y:`` where ``y`` is an array. However this is ambiguous when used with numpy arrays and other data containers. Change this code to ``y is not None`` or ``len(y) > 0`` or use ``np.all`` or ``np.any`` to test if the contents of the array are truthy/falsy. + +- Add ``random_state`` argument to visualizer. + + If the visualizer uses/wraps a utility that also has ``random_state``, then the visualizer itself needs to also have this argument which defaults to ``None`` and is passed to all internal stochastic behaviors. This ensures that image comparison testing will work and that users can get repeated behavior from visualizers. + +- Use ``np.unique`` instead of `set`. + + If you need the unique values from a list or array, we prefer to use numpy methods wherever possible. We performed some limited benchmarking and believe that ``np.unique`` is a bit faster and more efficient. + +- Use sklearn underscore suffix for learned parameters. + + Any parameters that are learned during ``fit()`` should only be added to the visualizer when ``fit()`` is called (this is also how we determine if a visualizer is fitted or not) and should be identified with an underscore suffix. 
For example, in classification visualizers, the classes can be either passed in by the user or determined when they are passed in via fit, therefore it should be ``self.classes_``. This is also true for other learned parameters, e.g. ``self.score_``, even though this is not created during ``fit()``. + +- Correctly set the title in finalize. + + Use the ``self.set_title()`` method to set a default title; this allows the user to specify a custom title in the initialization arguments. + +Testing Conventions +~~~~~~~~~~~~~~~~~~~ + +- Ensure there is an image comparison test. + + Ensure there is at least one image comparison test per visualizer. This is the primary regression testing of Yellowbrick and these tests catch a lot when changes occur in our dependencies or environment. + +- Use pytest assertions rather than ``unittest.TestCase`` methods. + + We prefer ``assert 2+2 == 4`` rather than ``self.assertEqual(2+2, 4)``. As a result, test classes should not extend ``unittest.TestCase`` but should extend the ``VisualTestCase`` in the tests package. Note that if you're writing tests that do not generate matplotlib figures you can simply extend ``object``. + +- Use test fixtures and sklearn dataset generators. + + Data is the key to testing with Yellowbrick; often the test package will have fixtures in ``conftest.py`` that can be directly used (e.g. binary vs. multiclass in the ``test_classifier`` package). If one isn't available feel free to use randomly generated datasets from the ``sklearn.datasets`` module e.g. ``make_classification``, ``make_regression``, or ``make_blobs``. For integration testing, please feel free to use one of the Yellowbrick datasets. + +- Fix all ``random_state`` arguments. + + Be on the lookout for any method (particularly sklearn methods) that have a ``random_state`` argument and be sure to fix them so that tests always pass! + +- Test a variety of inputs.
+ + Machine learning can be done on a variety of inputs for ``X`` and ``y``; ensure there is a test with numpy arrays, pandas DataFrame and Series objects, and with Python lists. + +- Test that ``fit()`` returns self. + + When doing end-to-end testing, we like to ``assert oz.fit() is oz`` to ensure the API is maintained. + +- Test that ``score()`` is between zero and one. + + With visualizers that have a ``score()`` method, we like to ``assert 0.0 <= oz.score() <= 1.0`` to ensure the API is maintained. + +Documentation Conventions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Visualizer DocString is correct. + + The visualizer docstring should be present under the class and contain a narrative about the visualizer and its arguments with the numpydoc style. + +- API Documentation. + + All visualizers should have their own API page under ``docs/api/[yb-module]``. This documentation should include an ``automodule`` statement. Generally speaking there is also an image generation script of the same name in this folder so that the documentation images can be generated on demand. + +- Listing the visualizer. + + The visualizer should be listed in a number of places including: ``docs/api/[yb-module]/index.rst``, ``docs/api/index.rst``, ``docs/index.rst``, ``README.md``, and ``DESCRIPTION.rst``. + +- Include a gallery image. + + Please also add the visualizer image to the gallery! + +- Update added to the changelog. + + To reduce the time it takes to put together the changelog, we'd like to update it when we add new features and visualizers rather than right before the release. + +Merging Pull Requests +--------------------- + +Our convention is that the person who performs the code review should merge the pull request (since reviewing is hard work and deserves due credit!). Only core contributors have write access to the repository and can merge pull requests.
Some preferences for commit messages when merging in pull requests: + +- Make sure to use the "Squash and Merge" option in order to create a Git history that is understandable. +- Keep the title of the commit short and descriptive; be sure it includes the PR #. +- Craft a commit message body that is 1-3 sentences, depending on the complexity of the commit; it should explicitly reference any issues being closed or opened using `GitHub's commit message keywords `_. + +.. note:: When merging a pull request, use the "squash and merge" option. + + +Releases +-------- + +To ensure we get new code to our users as soon and as bug free as possible we periodically create major, minor, and hotfix version releases that are merged from the ``develop`` branch into ``master`` and pushed to PyPI and Anaconda Cloud. Our release cycle ensures that stable code can be found in the master branch and pip installed and that we can test our development code thoroughly before a release. + +.. note:: The following steps must be taken by a maintainer with access to the primary (upstream) Yellowbrick repository. Any reference to ``origin`` refers to github.com/DistrictDataLabs/yellowbrick. + +The first step is to create a release branch from develop - this allows us to do "release-work" (e.g. a version bump, changelog stuff, etc.) in a branch that is neither ``develop`` nor ``master`` and to test the release before deployment:: + + $ git checkout develop + $ git pull origin develop + $ git checkout -b release-x.x + +This creates a release branch for version ``x.x`` where ``x`` is a digit. Release versions are described as a number ``x.y.z`` where ``x`` is the major version, ``y`` is the minor version and ``z`` is a patch version. Generally speaking most releases are minor version releases where ``x.y`` becomes ``x.y+1``.
Patch versions are infrequent but may also be needed where very little has changed or something quick has to be pushed to fix a critical bug, e.g. ``x.y`` becomes ``x.y.1``. Major version releases where ``x.y`` becomes ``x+1.0`` are rare. + +At this point do the version bump by modifying ``version.py`` and the test version in ``tests/__init__.py``. Make sure all tests pass for the release and that the documentation is up to date. To build the docs see the :ref:`documentation notes `. There may be style changes or deployment options that have to be done at this phase in the release branch. At this phase you'll also modify the ``changelog`` with the features and changes in the release that have not already been marked. + +.. note:: Before merging the release to master make sure that the release checklist has been completed! + +Once the release is ready for prime-time, merge into master:: + + $ git checkout master + $ git merge --no-ff --no-edit release-x.x + $ git push origin master + +Tag the release in GitHub:: + + $ git tag -a vx.x + $ git push origin vx.x + +Now go to the release_ page to convert the tag into a release and add a Markdown version of the changelog notes for those that are accessing the release directly from GitHub. + +Deploying to PyPI +~~~~~~~~~~~~~~~~~ + +Deploying the release to PyPI is fairly straightforward. Ensure that you have valid PyPI login credentials in ``~/.pypirc`` and use the Makefile to deploy as follows:: + + $ make build + $ make deploy + +The build process should create ``build`` and ``dist`` directories containing the wheel and source packages as well as a ``.egg-info`` file for deployment. The deploy command registers the version in PyPI and uploads it with Twine. + +Deploying to Anaconda Cloud +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +These instructions follow the tutorial `"Building conda packages with conda skeleton" `_.
To deploy the release to Anaconda Cloud you first need to have Miniconda or Anaconda installed along with ``conda-build`` and ``anaconda-client`` (which can be installed using ``conda``). Make sure that you run the ``anaconda login`` command using the credentials that allow access to the Yellowbrick channel. If you have an old skeleton directory, make sure to save it with a different name (e.g. yellowbrick.old) before running the skeleton command:: + + $ conda skeleton pypi yellowbrick + +This should install the latest version of yellowbrick from PyPI - make sure the version matches the expected version of the release! There are some edits that must be made to the ``yellowbrick/meta.yaml`` that is generated as follows:: + + about: + home: http://scikit-yb.org/ + license_file: LICENSE.txt + doc_url: https://www.scikit-yb.org/en/latest/ + dev_url: https://github.com/DistrictDataLabs/yellowbrick + +In addition, you must remove the entire ``test:`` section of the yaml file and add the following to the ``requirements:`` under both ``host:`` and ``run:``. See `example meta.yaml `_ for a detailed version. Note that the description field in the metadata is pulled from the ``DESCRIPTION.rst`` in the root of the Yellowbrick project. However, Anaconda Cloud requires a Markdown description - the easiest thing to do is to copy it from the existing description. + +With the ``meta.yaml`` file set up you can now run the build command for the various Python distributions that Yellowbrick supports:: + + $ conda build --python 3.6 yellowbrick + $ conda build --python 3.7 yellowbrick + +After this command completes you should have build files in ``$MINICONDA_HOME/conda-bld/[OS]/yellowbrick-x.x-py3.x_0.tar.bz2``.
You can now run conda convert for each of the Python versions using this directory as follows:: + + $ conda convert --platform all [path to build] -o $MINICONDA_HOME/conda-bld + +At this point you should have builds for all the versions of Python and all platforms Yellowbrick supports. Unfortunately at this point you have to upload them all to Anaconda Cloud:: + + $ anaconda upload $MINICONDA_HOME/conda-bld/[OS]/yellowbrick-x.x-py3.x_0.tar.bz2 + +Once uploaded, the Anaconda Cloud page should reflect the latest version, you may have to edit the description to make sure it's in Markdown format. + +Finalizing the Release +~~~~~~~~~~~~~~~~~~~~~~ + +The last steps in the release process are to check to make sure the release completed successfully. Make sure that the `PyPI page`_ and the `Anaconda Cloud Page`_ are correctly updated to the latest version. Also ensure that ReadTheDocs has correctly built the "latest" documentation on `scikit-yb.org `_. + +Make sure that you can update the package on your local machine, either in a virtual environment that does not include yellowbrick or in a Python install that is not used for development (e.g. not in the yellowbrick project directory):: + + $ pip install -U yellowbrick + $ python -c "import yellowbrick; print(yellowbrick.__version__)" + +After verifying that the version has been correctly updated you can clean up the project directory:: + + $ make clean + +After this, it's time to merge the release into develop so that we can get started on the next version! :: + + $ git checkout develop + $ git merge --no-ff --no-edit release-x.x + $ git branch -d release-x.x + $ git push origin develop + +Make sure to celebrate the release with the other maintainers and to tweet to everyone to let them know it's time to update Yellowbrick! + +.. _release: https://github.com/DistrictDataLabs/yellowbrick/releases +.. _PyPI Page: https://pypi.org/project/yellowbrick/ +.. 
_Anaconda Cloud Page: https://anaconda.org/DistrictDataLabs/yellowbrick diff --git a/docs/contributing/developing_visualizers.rst b/docs/contributing/developing_visualizers.rst new file mode 100644 index 000000000..2b1af4539 --- /dev/null +++ b/docs/contributing/developing_visualizers.rst @@ -0,0 +1,449 @@ +.. -*- mode: rst -*- + +Developing Visualizers +====================== + +In this section, we'll discuss the basics of developing visualizers. This of course is a big topic, but hopefully these simple tips and tricks will help make sense. First thing though, check out this presentation that we put together on yellowbrick development, it discusses the expected user workflow, our integration with scikit-learn, our plans and roadmap, etc: + +.. raw:: html + + + +One thing that is necessary is a good understanding of scikit-learn and Matplotlib. Because our API is intended to integrate with scikit-learn, a good start is to review `"APIs of scikit-learn objects" `_ and `"rolling your own estimator" `_. In terms of matplotlib, use Yellowbrick's guide :doc:`../matplotlib`. Additional resources include `Nicolas P. Rougier's Matplotlib tutorial `_ and `Chris Moffitt's Effectively Using Matplotlib `_. + +Visualizer API +-------------- + +There are two basic types of Visualizers: + +- **Feature Visualizers** are high dimensional data visualizations that are essentially transformers. +- **Score Visualizers** wrap a scikit-learn regressor, classifier, or clusterer and visualize the behavior or performance of the model on test data. + +These two basic types of visualizers map well to the two basic estimator objects in scikit-learn: + +- **Transformers** take input data and return a new data set. +- **Models** are fit to training data and can make predictions. + +The scikit-learn API is object oriented, and estimators are initialized with parameters by instantiating their class. 
Hyperparameters can also be set using the ``set_params()`` method and retrieved with the corresponding ``get_params()`` method. All scikit-learn estimators have a ``fit(X, y=None)`` method that accepts a two dimensional data array, ``X``, and optionally a vector ``y`` of target values. The ``fit()`` method trains the estimator, making it ready to transform data or make predictions. Transformers have an associated ``transform(X)`` method that returns a new dataset, ``Xprime`` and models have a ``predict(X)`` method that returns a vector of predictions, ``yhat``. Models may also have a ``score(X, y)`` method that evaluates the performance of the model. + +Visualizers interact with scikit-learn objects by intersecting with them at the methods defined above. Specifically, visualizers perform actions related to ``fit()``, ``transform()``, ``predict()``, and ``score()`` then call a ``draw()`` method which initializes the underlying figure associated with the visualizer. The user calls the visualizer's ``poof()`` method, which in turn calls a ``finalize()`` method on the visualizer to draw legends, titles, etc. and then ``poof()`` renders the figure. The Visualizer API is therefore: + +- ``draw()``: add visual elements to the underlying axes object +- ``finalize()``: prepare the figure for rendering, adding final touches such as legends, titles, axis labels, etc. +- ``poof()``: render the figure for the user (or save it to disk). + +Creating a visualizer means defining a class that extends ``Visualizer`` or one of its subclasses, then implementing several of the methods described above. A barebones implementation is as follows: + +..
code:: python + + import matplotlib.pyplot as plt + + from yellowbrick.base import Visualizer + + class MyVisualizer(Visualizer): + + def __init__(self, ax=None, **kwargs): + super(MyVisualizer, self).__init__(ax, **kwargs) + + def fit(self, X, y=None): + self.draw(X) + return self + + def draw(self, X): + self.ax.plot(X) + return self.ax + + def finalize(self): + self.set_title("My Visualizer") + +This simple visualizer simply draws a line graph for some input dataset X, intersecting with the scikit-learn API at the ``fit()`` method. A user would use this visualizer in the typical style: + +.. code:: python + + visualizer = MyVisualizer() + visualizer.fit(X) + visualizer.poof() + +Score visualizers work on the same principle but accept an additional required ``estimator`` argument. Score visualizers wrap the model (which can be either fitted or unfitted) and then pass all attributes and methods through to the underlying model, drawing where necessary. + +.. code:: python + + from yellowbrick.base import ScoreVisualizer + + class MyScoreVisualizer(ScoreVisualizer): + + def __init__(self, estimator, ax=None, **kwargs): + super(MyScoreVisualizer, self).__init__(estimator, ax=ax, **kwargs) + + def fit(self, X_train, y_train=None): + # Fit the underlying model + super(MyScoreVisualizer, self).fit(X_train, y_train) + self.draw(X_train, y_train) + return self + + def score(self, X_test, y_test): + # Score the underlying model + super(MyScoreVisualizer, self).score(X_test, y_test) + self.draw(X_test, y_test) + return self.score_ + + def draw(self, X, y): + self.ax.scatter(X, c=y) + return self.ax + + def finalize(self): + self.set_title("My Score Visualizer") + +Note that the calls to ``super`` in the above code are required: they ensure that the base functionality (e.g. fitting a model and computing the score) is performed and that the visualizer is consistent with other visualizers.
+ +Datasets +-------- + +Yellowbrick gives easy access to several datasets that are used for the examples in the documentation and testing. These datasets are hosted in our CDN and must be downloaded for use. Typically, when a user calls one of the data loader functions, e.g. ``load_bikeshare()`` the data is automatically downloaded if it's not already on the user's computer. However, for development and testing, or if you know you will be working without internet access, it might be easier to simply download all the data at once. + +The data downloader script can be run as follows:: + + $ python -m yellowbrick.download + +This will download the data to the fixtures directory inside of the Yellowbrick site packages. You can specify the location of the download either as an argument to the downloader script (use ``--help`` for more details) or by setting the ``$YELLOWBRICK_DATA`` environment variable. This is the preferred mechanism because this will also influence how data is loaded in Yellowbrick. + +Note that developers who have downloaded data from Yellowbrick versions earlier than v1.0 may experience some problems with the older data format. If this occurs, you can clear out your data cache as follows:: + + $ python -m yellowbrick.download --cleanup + +This will remove old datasets and download the new ones. You can also use the ``--no-download`` flag to simply clear the cache without re-downloading data. Users who are having difficulty with datasets can also use this or they can uninstall and reinstall Yellowbrick using ``pip``. + +Testing +------- + +The test package mirrors the yellowbrick package in structure and also contains several helper methods and base functionality. To add a test to your visualizer, find the corresponding file to add the test case, or create a new test file in the same place you added your code. + +Visual tests are notoriously difficult to create --- how do you test a visualization or figure? 
Moreover, testing scikit-learn models with real data can consume a lot of memory. Therefore the primary test you should create is simply to test your visualizer from end to end and make sure that no exceptions occur. To assist with this, we have a helper, ``VisualTestCase``. Create your tests as follows: + +.. code:: python + + import pytest + + from tests.base import VisualTestCase + from yellowbrick.datasets import load_occupancy + + class MyVisualizerTests(VisualTestCase): + + def test_my_visualizer(self): + """ + Test MyVisualizer on a real dataset + """ + # Load the occupancy dataset + X, y = load_occupancy() + + try: + visualizer = MyVisualizer() + assert visualizer.fit(X, y) is visualizer, "fit should return self" + visualizer.poof() + except Exception as e: + pytest.fail("my visualizer didn't work: {}".format(e)) + +This simple test case is an excellent start to a larger test package and we recommend starting with this test as you develop your visualizer. Once you've completed the development and prototyping you can start to include :ref:`test fixtures ` and test various normal use cases and edge cases with unit tests, then build :ref:`image similarity tests ` to more thoroughly define the integration tests. + + +Running the Test Suite +~~~~~~~~~~~~~~~~~~~~~~ + +To run the test suite, first install the testing dependencies that are located in the `tests` folder as follows:: + + $ pip install -r tests/requirements.txt + +The required dependencies for the test suite include testing utilities and libraries such as pandas and nltk that are not included in the core dependencies. + +Tests can be run as follows from the project root:: + + $ make test + +The Makefile uses the pytest runner and testing suite as well as the coverage library. + +.. _assert_images_similar: + +Image Comparison Tests +~~~~~~~~~~~~~~~~~~~~~~ + +Writing an image based comparison test is only a little more difficult than the simple testcase presented above. 
We have adapted matplotlib's image comparison test utility into an easy to use assert method : ``self.assert_images_similar(visualizer)`` + +The main consideration is that you must specify the “baseline”, or expected, image in the ``tests/baseline_images/`` folder structure. + +For example, create your test function located in ``tests/test_regressor/test_myvisualizer.py`` as follows: + +.. code:: python + + from tests.base import VisualTestCase + + class MyVisualizerTests(VisualTestCase): + + def test_my_visualizer_output(self): + visualizer = MyVisualizer() + visualizer.fit(X) + visualizer.poof() + self.assert_images_similar(visualizer) + +The first time this test is run, there will be no baseline image to compare against, so the test will fail. Copy the output images (in this case ``tests/actual_images/test_regressor/test_myvisualizer/test_my_visualizer_output.png``) to the correct subdirectory of baseline_images tree in the source directory (in this case ``tests/baseline_images/test_regressor/test_myvisualizer/test_my_visualizer_output.png``). Put this new file under source code revision control (with git add). When rerunning the tests, they should now pass. + +We also have a helper script, ``tests/images.py`` to clean up and manage baseline images automatically. It is run using the ``python -m`` command to execute a module as main, and it takes as an argument the path to your *test file*. To copy the figures as above:: + + $ python -m tests.images tests/test_regressor/test_myvisualizer.py + +This will move all related test images from ``actual_images`` to ``baseline_images`` on your behalf (note you'll have had to run the tests at least once to generate the images). You can also clean up images from both actual and baseline as follows:: + + $ python -m tests.images -C tests/test_regressor/test_myvisualizer.py + +This is useful particularly if you're stuck trying to get an image comparison to work. 
For more information on the images helper script, use ``python -m tests.images --help``. + +.. _fixtures: + +Test Fixtures +~~~~~~~~~~~~~ + +Often, you will need a controlled dataset to test your visualizer as specifically as possible. To do this, we recommend that you make use of `pytest fixtures `_ and `scikit-learn's generated datasets `_. Together these tools ensure that you have complete control over your test fixtures and can test different user scenarios as precisely as possible. For example, consider the case where we want to test both a binary and a multiclass dataset for a classification score visualizer. + +.. code:: python + + from tests.fixtures import Dataset, Split + + from sklearn.datasets import make_classification + from sklearn.model_selection import train_test_split as tts + + @pytest.fixture(scope="class") + def binary(request): + """ + Creates a random binary classification dataset fixture + """ + X, y = make_classification( + n_samples=500, + n_features=20, + n_informative=8, + n_redundant=2, + n_classes=2, + n_clusters_per_class=3, + random_state=2001, + ) + + X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42) + + dataset = Dataset(Split(X_train, X_test), Split(y_train, y_test)) + request.cls.binary = dataset + +In this example, we make use of :func:`sklearn.datasets.make_classification` to randomly generate exactly the dataset that we'd like, in this case a dataset with 2 classes and enough variability so as to be interesting. Because we're using this with a score visualizer, it is helpful to divide this into train and test splits. The ``Dataset`` and ``Split`` objects in ``tests.fixtures`` are namedtuples that allow you to easily access ``X`` and ``y`` properties on the dataset and ``train`` and ``test`` properties on the split. Creating a dataset this way means we can access ``dataset.X.train`` and ``dataset.y.test`` easily in our test functions. 
+ +Similarly, we can create a custom multiclass function as well: + +.. code:: python + + @pytest.fixture(scope="class") + def multiclass(request): + """ + Creates a random multiclass classification dataset fixture + """ + X, y = make_classification( + n_samples=500, + n_features=20, + n_informative=8, + n_redundant=2, + n_classes=6, + n_clusters_per_class=3, + random_state=87, + ) + + X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=93) + + dataset = Dataset(Split(X_train, X_test), Split(y_train, y_test)) + request.cls.multiclass = dataset + +.. note:: Fixtures that are added to ``conftest.py`` are available to tests in the same directory or a subdirectory as ``conftest.py``. This is special pytest magic since fixtures are identified by strings. Note that the two above example fixtures are in ``tests/test_classifier/conftest.py`` so you can use these exactly in the ``tests/test_classifier`` directory without having to create new fixtures. + +To use these fixtures with a ``VisualTestCase`` you must decorate the test class with the fixture. Once done, the fixture will be *generated once per class* and stored in the ``request.cls.`` variable. Here's how to use the above fixtures: + +.. code:: python + + @pytest.mark.usefixtures("binary", "multiclass") + class TestMyScoreVisualizer(VisualTestCase): + + def test_binary(self): + oz = MyScoreVisualizer() + assert oz.fit(self.binary.X.train, self.binary.y.train) is oz + assert 0.0 <= oz.score(self.binary.X.test, self.binary.y.test) <= 1.0 + oz.finalize() + + self.assert_images_similar(oz) + +In the above test examples, we showed the use of the yellowbrick dataset loaders, e.g. ``load_occupancy()``. You should feel free to use those datasets and the scikit-learn datasets for tests, particularly for integration tests (described next). 
The use of the generated datasets and fixtures allows a lot of control over what is being tested and ensures that the tests run as quickly as possible, therefore please use fixtures for the majority of test cases. + +Integration Tests +~~~~~~~~~~~~~~~~~ + +The majority of test cases will use generated test fixtures as described above. But as a visualizer is concluded, it is important to create two "integration tests" that use real-world data in the form of Pandas and numpy arrays from the yellowbrick datasets loaders. These tests often take the following form: + +.. code:: python + + try: + import pandas as pd + except ImportError: + pd = None + + class MyVisualizerTests(VisualTestCase): + + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def test_pandas_integration(self): + """ + Test with Pandas DataFrame and Series input + """ + X, y = load_occupancy(return_dataset=True).to_pandas() + oz = MyScoreVisualizer().fit(X, y) + oz.finalize() + self.assert_images_similar(oz) + + def test_numpy_integration(self): + """ + Test with numpy arrays + """ + X, y = load_occupancy(return_dataset=True).to_numpy() + oz = MyScoreVisualizer().fit(X, y) + oz.finalize() + self.assert_images_similar(oz) + +These tests often offer the most complications with your visual test cases, so be sure to reserve them for the last tests you create! + +.. _documentation: + +Documentation +------------- + +Yellowbrick uses `Sphinx `_ to build our documentation. The advantages of using Sphinx are many; we can more directly link to the documentation and source code of other projects like Matplotlib and scikit-learn using `intersphinx `_. In addition, docstrings used to describe Yellowbrick visualizers can be automatically included when the documentation is built via `autodoc `_. + +To take advantage of these features, our documentation must be written in reStructuredText (or "rst"). reStructuredText is similar to markdown, but not identical, and does take some getting used to.
For instance, styling for things like codeblocks, external hyperlinks, internal cross references, notes, and fixed-width text are all unique in rst. + +If you would like to contribute to our documentation and do not have prior experience with rst, we recommend you make use of these resources: + +- `A reStructuredText Primer `_ +- `rst notes and cheatsheet `_ +- `Using the plot directive `_ + +Docstrings +~~~~~~~~~~ + +The initial documentation for your visualizer will be a well structured docstring. Yellowbrick uses Sphinx to build documentation, therefore docstrings should be written in reStructuredText in numpydoc format (similar to scikit-learn). The primary location of your docstring should be right under the class definition, here is an example: + +.. code:: python + + class MyVisualizer(Visualizer): + """Short description of MyVisualizer + + This initial section should describe the visualizer and what + it's about, including how to use it. Take as many paragraphs + as needed to get as much detail as possible. + + In the next section describe the parameters to __init__. + + Parameters + ---------- + model : a scikit-learn regressor + Should be an instance of a regressor, and specifically one whose name + ends with "CV" otherwise it will raise a YellowbrickTypeError exception + on instantiation. To use non-CV regressors see: + ``ManualAlphaSelection``. + + ax : matplotlib Axes, default: None + The axes to plot the figure on. If None is passed in the current axes + will be used (or generated if required). + + kwargs : dict + Keyword arguments that are passed to the base class and may influence + the visualization as defined in other Visualizers. + + Attributes + ---------- + score_ : float + The coefficient of determination that is learned during the visual + diagnostic, saved for reference after the image has been created.
+ + Examples + -------- + >>> model = MyVisualizer() + >>> model.fit(X) + >>> model.poof() + + Notes + ----- + In the notes section specify any gotchas or other info. + """ + +When your visualizer is added to the API section of the documentation, this docstring will be rendered in HTML to show the various options and functionality of your visualizer! + +API Documentation Page +~~~~~~~~~~~~~~~~~~~~~~ + +To add the visualizer to the documentation it needs to be added to the ``docs/api`` folder in the correct subdirectory. For example if your visualizer is a model score visualizer related to regression it would go in the ``docs/api/regressor`` subdirectory. Add your file named after your module, e.g. ``docs/api/regressor/mymodule.rst``. If you have a question where your documentation should be located, please ask the maintainers via your pull request, we'd be happy to help! + +There are quite a few examples in the documentation on which you can base your files of similar types. The primary format for the API section is as follows: + +.. code:: rst + + .. -*- mode: rst -*- + + My Visualizer + ============= + + A brief introduction to my visualizer and how it is useful in the machine learning process. + + .. plot:: + :context: close-figs + :include-source: False + :alt: Example using MyVisualizer + + visualizer = MyVisualizer(LinearRegression()) + + visualizer.fit(X, y) + g = visualizer.poof() + + Discussion about my visualizer and some interpretation of the above plot. + + + API Reference + ------------- + + .. automodule:: yellowbrick.regressor.mymodule + :members: MyVisualizer + :undoc-members: + :show-inheritance: + +This is a pretty good structure for a documentation page; a brief introduction followed by a code example with a visualization included using `the plot directive `_. This will render the ``MyVisualizer`` image in the document along with links for the complete source code, the png, and the pdf versions of the image. 
It will also have the "alt-text" (for screen-readers) and will not display the source because of the ``:include-source:`` option. If ``:include-source:`` is omitted, the source will be included by default. + +The primary section is wrapped up with a discussion about how to interpret the visualizer and use it in practice. Finally the ``API Reference`` section will use ``automodule`` to include the documentation from your docstring. + +At this point there are several places where you can list your visualizer, but to ensure it is included in the documentation it *must be listed in the TOC of the local index*. Find the ``index.rst`` file in your subdirectory and add your rst file (without the ``.rst`` extension) to the ``..toctree::`` directive. This will ensure the documentation is included when it is built. + +Building the Docs +~~~~~~~~~~~~~~~~~ + +Speaking of, you can build your documentation by changing into the ``docs`` directory and running ``make html``, the documentation will be built and rendered in the ``_build/html`` directory. You can view it by opening ``_build/html/index.html`` then navigating to your documentation in the browser. + +There are several other places that you can list your visualizer including: + + - ``docs/index.rst`` for a high level overview of our visualizers + - ``DESCRIPTION.rst`` for inclusion on PyPI + - ``README.md`` for inclusion on GitHub + +Please ask for the maintainer's advice about how to include your visualizer in these pages. + + +Generating the Gallery +~~~~~~~~~~~~~~~~~~~~~~ + +In v1.0, we have adopted Matplotlib's `plot directive `_ which means that the majority of the images generated for the documentation are generated automatically. One exception is the gallery; the images for the gallery must still be generated manually. + +If you have contributed a new visualizer as described in the above section, please also add it to the gallery, both to docs/gallery.py and to docs/gallery.rst. 
(Make sure you have already installed Yellowbrick in editable mode, from the top level directory: pip install -e .) + +If you want to regenerate a single image (e.g. the elbow curve plot), you can do so as follows: :: + + $ python docs/gallery.py elbow + +If you want to regenerate them all (note: this takes a long time!) :: + + $ python docs/gallery.py all diff --git a/docs/contributing/getting_started.rst b/docs/contributing/getting_started.rst new file mode 100644 index 000000000..27a59369d --- /dev/null +++ b/docs/contributing/getting_started.rst @@ -0,0 +1,147 @@ +.. -*- mode: rst -*- + +Getting Started on GitHub +========================= + +Yellowbrick is hosted on GitHub at https://github.com/DistrictDataLabs/yellowbrick. + +The typical workflow for a contributor to the codebase is as follows: + +1. **Discover** a bug or a feature by using Yellowbrick. +2. **Discuss** with the core contributors by `adding an issue `_. +3. **Fork** the repository into your own GitHub account. +4. Create a **Pull Request** first thing to `connect with us `_ about your task. +5. **Code** the feature, write the tests and documentation, add your contribution. +6. **Review** the code with core contributors who will guide you to a high quality submission. +7. **Merge** your contribution into the Yellowbrick codebase. + +We believe that *contribution is collaboration* and therefore emphasize *communication* throughout the open source process. We rely heavily on GitHub's social coding tools to allow us to do this. For instance, we use GitHub's `milestone `_ feature to focus our development efforts for each Yellowbrick semester, so be sure to check out the issues associated with our `current milestone `_! + +Once you have a good sense of how you are going to implement the new feature (or fix the bug!), you can reach out for feedback from the maintainers by creating a `pull request `_. Ideally, any pull request should be capable of resolution within 6 weeks of being opened. 
This timeline helps to keep our pull request queue small and allows Yellowbrick to maintain a robust release schedule to give our users the best experience possible. However, the most important thing is to keep the dialogue going! And if you're unsure whether you can complete your idea within 6 weeks, you should still go ahead and open a PR and we will be happy to help you scope it down as needed. + +If we have comments or questions when we evaluate your pull request and receive no response, we will also close the PR after this period of time. Please know that this does not mean we don't value your contribution, just that things go stale. If in the future you want to pick it back up, feel free to address our original feedback and to reference the original PR in a new pull request. + +.. note:: Please note that if we feel your solution has not been thought out in earnest, or if the PR is not aligned with our `current milestone `_ goals, we may reach out to ask that you close the PR so that we can prioritize reviewing the most critical feature requests and bug fixes. + +Forking the Repository +---------------------- + +The first step is to fork the repository into your own account. This will create a copy of the codebase that you can edit and write to. Do so by clicking the **"fork"** button in the upper right corner of the Yellowbrick GitHub page. + +Once forked, use the following steps to get your development environment set up on your computer: + +1. Clone the repository. + + After clicking the fork button, you should be redirected to the GitHub page of the repository in your user account. You can then clone a copy of the code to your local machine.:: + + $ git clone https://github.com/[YOURUSERNAME]/yellowbrick + $ cd yellowbrick + +2. Create a virtual environment. + + Yellowbrick developers typically use `virtualenv `_ (and `virtualenvwrapper `_), `pyenv `_ or `conda envs `_ in order to manage their Python version and dependencies. 
Using the virtual environment tool of your choice, create one for Yellowbrick. Here's how with virtualenv:: + + $ virtualenv venv + + To develop with a conda environment, the conda-forge channel is needed to install some testing dependencies. The following command adds the channel with the highest priority:: + + $ conda config --add channels conda-forge + +3. Install dependencies. + + Yellowbrick's dependencies are in the ``requirements.txt`` document at the root of the repository. Open this file and uncomment any dependencies marked as for development only. Then install the package in editable mode:: + + $ pip install -e . + + This will add Yellowbrick to your PYTHONPATH so that you don't need to reinstall it each time you make a change during development. + + Note that there may be other dependencies required for development and testing; you can simply install them with ``pip``. For example to install + the additional dependencies for building the documentation or to run the + test suite, use the ``requirements.txt`` files in those directories:: + + $ pip install -r tests/requirements.txt + $ pip install -r docs/requirements.txt + +4. Switch to the develop branch. + + The Yellowbrick repository has a ``develop`` branch that is the primary working branch for contributions. It is probably already the branch you're on, but you can make sure and switch to it as follows:: + + $ git fetch + $ git checkout develop + +At this point you're ready to get started writing code. + +Branching Convention +-------------------- + +The Yellowbrick repository is set up in a typical production/release/development cycle as described in "`A Successful Git Branching Model `_." The primary working branch is the ``develop`` branch. This should be the branch that you are working on and from, since this has all the latest code. The ``master`` branch contains the latest stable version and release_, which is pushed to PyPI_. No one but core contributors will generally push to master. 
+ +You should work directly in your fork. In order to reduce the number of merges (and merge conflicts) we kindly request that you utilize a feature branch off of ``develop`` to work in:: + + $ git checkout -b feature-myfeature develop + +We also recommend setting up an ``upstream`` remote so that you can easily pull the latest development changes from the main Yellowbrick repository (see `configuring a remote for a fork `_). You can do that as follows:: + + $ git remote add upstream https://github.com/DistrictDataLabs/yellowbrick.git + $ git remote -v + origin https://github.com/YOUR_USERNAME/YOUR_FORK.git (fetch) + origin https://github.com/YOUR_USERNAME/YOUR_FORK.git (push) + upstream https://github.com/DistrictDataLabs/yellowbrick.git (fetch) + upstream https://github.com/DistrictDataLabs/yellowbrick.git (push) + +When you're ready, request a code review for your pull request. + +Pull Requests +------------- + +A `pull request (PR) `_ is a GitHub tool for initiating an exchange of code and creating a communication channel for Yellowbrick maintainers to discuss your contribution. In essence, you are requesting that the maintainers merge code from your forked repository into the develop branch of the primary Yellowbrick repository. Once completed, your code will be part of Yellowbrick! + +When starting a Yellowbrick contribution, *open the pull request as soon as possible*. We use your PR issue page to discuss your intentions and to give guidance and direction. Every time you push a commit into your forked repository, the commit is automatically included with your pull request, therefore we can review as you code. The earlier you open a PR, the more easily we can incorporate your updates, we'd hate for you to do a ton of work only to discover someone else already did it or that you went in the wrong direction and need to refactor. + +.. note:: For a great example of a pull request for a new feature visualizer, check out `this one `_ by `Carlo Morales `_.
+ +Opening a Pull Request +~~~~~~~~~~~~~~~~~~~~~~ + +When you open a pull request, ensure it is from your forked repository to the develop branch of `github.com/districtdatalabs/yellowbrick `_; we will not merge a PR into the master branch. Title your Pull Request so that it is easy to understand what you're working on at a glance. Also be sure to include a reference to the issue that you're working on so that correct references are set up. + +.. note:: All pull requests should be into the ``yellowbrick/develop`` branch from your forked repository. + +After you open a PR, you should get a message from one of the maintainers. Use that time to discuss your idea and where best to implement your work. Feel free to go back and forth as you are developing with questions in the comment thread of the PR. Once you are ready, please ensure that you explicitly ping the maintainer to do a code review. Before code review, your PR should contain the following: + +1. Your code contribution +2. Tests for your contribution +3. Documentation for your contribution +4. A PR comment describing the changes you made and how to use them +5. A PR comment that includes an image/example of your visualizer + +At this point your code will be formally reviewed by one of the contributors. We use GitHub's code review tool, starting a new code review and adding comments to specific lines of code as well as general global comments. Please respond to the comments promptly, and don't be afraid to ask for help implementing any requested changes! You may have to go back and forth a couple of times to complete the code review. + +When the following is true: + +1. Code is reviewed by at least one maintainer +2. Continuous Integration tests have passed +3. Code coverage and quality have not decreased +4. 
Code is up to date with the yellowbrick develop branch + +Then we will "Squash and Merge" your contribution, combining all of your commits into a single commit and merging it into the ``develop`` branch of Yellowbrick. Congratulations! Once your contribution has been merged into master, you will be officially listed as a contributor. + +After Your Pull Request is Merged +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +After your pull request is merged, you should update your local fork, either by pulling from ``upstream`` ``develop``:: + + $ git checkout develop + $ git pull upstream develop + $ git push origin develop + +or by manually merging your feature into your fork's ``develop`` branch.:: + + $ git checkout develop + $ git merge --no-ff feature-myfeature + $ git push origin develop + +Then you can safely delete the old feature branch, both locally and on GitHub. Now head back to `the backlog `_ and checkout another issue! + +.. _release: https://github.com/DistrictDataLabs/yellowbrick/releases +.. _PyPI: https://pypi.python.org/pypi/yellowbrick \ No newline at end of file diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst new file mode 100644 index 000000000..d132dfb23 --- /dev/null +++ b/docs/contributing/index.rst @@ -0,0 +1,39 @@ +.. -*- mode: rst -*- + +Contributing +============ + +Yellowbrick is an open source project that is supported by a community who will gratefully and humbly accept any contributions you might make to the project. Large or small, any contribution makes a big difference; and if you've never contributed to an open source project before, we hope you will start with Yellowbrick! + +Principally, Yellowbrick development is about the addition and creation of *visualizers* --- objects that learn from data and create a visual representation of the data or model. Visualizers integrate with scikit-learn estimators, transformers, and pipelines for specific purposes and as a result, can be simple to build and deploy. 
The most common contribution is a new visualizer for a specific model or model family. We'll discuss in detail how to build visualizers later. + +Beyond creating visualizers, there are many ways to contribute: + +- Submit a bug report or feature request on `GitHub issues`_. +- Contribute a Jupyter notebook to our `examples gallery`_. +- Assist us with :doc:`user testing <../evaluation>`. +- Add to the documentation or help with our website, `scikit-yb.org`_ +- Write unit or integration tests for our project. +- Answer questions on our `GitHub issues`_, `mailing list`_, `Stack Overflow`_, and `Twitter`_. +- Translate our documentation into another language. +- Write a blog post, tweet, or share our project with others. +- Teach someone how to use Yellowbrick. + +As you can see, there are lots of ways to get involved and we would be very happy for you to join us! The only thing we ask is that you abide by the principles of openness, respect, and consideration of others as described in our :doc:`../code_of_conduct`. + +.. note:: If you're unsure where to start, perhaps the best place is to drop the maintainers a note via our mailing list: http://bit.ly/yb-listserv. + +.. _`examples gallery`: https://github.com/DistrictDataLabs/yellowbrick/tree/develop/examples +.. _`scikit-yb.org`: http://www.scikit-yb.org +.. _`GitHub issues`: https://github.com/DistrictDataLabs/yellowbrick/issues +.. _`mailing list`: http://bit.ly/yb-listserv +.. _`Stack Overflow`: https://stackoverflow.com/questions/tagged/yellowbrick +.. _`Twitter`: https://twitter.com/scikit_yb + +..
toctree:: + :caption: Contributing Guide + :maxdepth: 2 + + getting_started + developing_visualizers + advanced_development_topics diff --git a/docs/faq.rst b/docs/faq.rst index 036ad10c3..f8b231278 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -27,7 +27,7 @@ You can change the ``size`` of a plot by passing in the desired dimensions in pi Note: we are considering adding support for passing in ``size`` in inches in a future Yellowbrick release. For a convenient inch-to-pixel converter, check out `www.unitconversion.org `_. How can I change the title of a Yellowbrick plot? ---------------------------------------------------- +------------------------------------------------- You can change the ``title`` of a plot by passing in the desired title as a string on instantiation: @@ -51,23 +51,17 @@ You can change the ``title`` of a plot by passing in the desired title as a stri How can I change the color of a Yellowbrick plot? ------------------------------------------------- -To customize coloring in your plot, use the ``colors`` or ``cmap`` (or ``colormap``) arguments. Note that different visualizers may require slightly different arguments depending on how they construct the plots. - -For instance, the :doc:`api/features/manifold` accepts a ``colors`` argument, for which ``discrete`` targets should be the name of one of the :doc:`api/palettes` or a list of `matplotlib colors `_ represented as strings: -For instance, the :doc:`api/features/manifold` accepts a ``colors`` argument, for which ``discrete`` targets should be the name of a palette from the Yellowbrick :doc:`api/palettes` or a list of `matplotlib colors `_ represented as strings: - -.. code:: python +Yellowbrick uses colors to make visualizers as interpretable as possible for intuitive machine learning diagnostics. Generally, color is specified by the target variable, ``y`` that you might pass to an estimator's fit method.
Therefore Yellowbrick considers color based on the datatype of the target: - from yellowbrick.features.manifold import Manifold +- **Discrete**: when the target is represented by discrete classes, Yellowbrick uses categorical colors that are easy to discriminate from each other. +- **Continuous**: when the target is represented by continuous values, Yellowbrick uses a sequential colormap to show the range of data. - visualizer = Manifold( - manifold="tsne", target="discrete", colors=["teal", "orchid"] - ) +*Most* visualizers therefore accept the ``colors`` and ``colormap`` arguments when they are initialized. Generally speaking, if the target is discrete, specify `colors` as a list of valid matplotlib colors; otherwise if your target is continuous, specify a matplotlib colormap or colormap name. Most Yellowbrick visualizers are smart enough to figure out the colors for each of your data points based on what you pass in; for example if you pass in a colormap for a discrete target, the visualizer will create a list of discrete colors from the sequential colors. - ... +.. note:: Although most visualizers support these arguments, please be sure to check the visualizer as it may have specific color requirements. E.g. the :doc:`ResidualsPlot ` accepts the ``train_color``, ``test_color``, and ``line_color`` to modify its visualization. To see a visualizer's arguments you can use ``help(Visualizer)`` or ``visualizer.get_params()``. -... whereas for ``continuous`` targets, ``colors`` should be a colormap: +For example, the :doc:`Manifold ` can visualize both discrete and sequential targets. In the discrete case, pass a list of `valid color values `_ as follows: .. code:: python @@ -75,55 +69,31 @@ For instance, the :doc:`api/features/manifold` accepts a ``colors`` argument, fo from yellowbrick.features.manifold import Manifold visualizer = Manifold( - manifold="isomap", target="continuous", colors="YlOrRd" + manifold="tsne", target="discrete", colors=["teal", "orchid"] ) ...
-Other visualizers accept a ``cmap`` argument: - -.. code:: python - - from sklearn.linear_model import LogisticRegression - from yellowbrick.classifier import ConfusionMatrix - - visualizer = ConfusionMatrix( - LogisticRegression(), cmap="YlGnBu" - ) - - ... +... whereas for ``continuous`` targets, it is better to specify a `matplotlib colormap `_: -Or a ``colormap`` argument: .. code:: python - from yellowbrick.features import ParallelCoordinates + from yellowbrick.features.manifold import Manifold - # Instantiate the visualizer - visualizer = ParallelCoordinates( - classes=classes, features=features, colormap="PRGn" + visualizer = Manifold( + manifold="isomap", target="continuous", colormap="YlOrRd" ) ... -The :doc:`api/regressor/residuals` accepts color argument for the training and test points, ``train_color`` and ``test_color``, respectively: - -.. code:: python - - from yellowbrick.regressor import ResidualsPlot - from sklearn.linear_model import ElasticNet - visualizer = ResidualsPlot( - model=ElasticNet() - train_color=train_color, # color of points model was trained on - test_color=train_color, # color of points model was tested on - line_color=line_color # color of zero-error line - ) +Finally please note that you can manipulate the default colors that Yellowbrick uses by modifying the `matplotlib styles `_, particularly the default color cycle. Yellowbrick also has some tools for style management, please see :doc:`api/palettes` for more information. How can I save a Yellowbrick plot? ------------------------------------ +---------------------------------- Save your Yellowbrick plot by passing an ``outpath`` into ``poof()``: @@ -143,7 +113,7 @@ Most backends support png, pdf, ps, eps and svg to save your work! How can I make overlapping points show up better? 
----------------------------------------------------- +------------------------------------------------- You can use the ``alpha`` param to change the opacity of plotted points (where ``alpha=1`` is complete opacity, and ``alpha=0`` is complete transparency): @@ -157,6 +127,6 @@ You can use the ``alpha`` param to change the opacity of plotted points (where ` How can I access the sample datasets used in the examples? ---------------------------------------------------------------- +---------------------------------------------------------- -Visit the :doc:`api/datasets` page. +Visit the :doc:`api/datasets/index` page. diff --git a/docs/gallery.py b/docs/gallery.py new file mode 100644 index 000000000..ff7b185a8 --- /dev/null +++ b/docs/gallery.py @@ -0,0 +1,649 @@ +#!/usr/bin/env python3 +# Generates images for the gallery + +import os +import argparse +import numpy as np +import os.path as path +import matplotlib.pyplot as plt + +from yellowbrick.datasets import load_occupancy, load_credit, load_concrete +from yellowbrick.datasets import load_spam, load_game, load_energy, load_hobbies + +from yellowbrick.model_selection import RFECV, FeatureImportances + +from yellowbrick.features import PCA, Manifold, JointPlot +from yellowbrick.features import RadViz, Rank1D, Rank2D, ParallelCoordinates + +from yellowbrick.contrib.scatter import ScatterVisualizer + +from yellowbrick.regressor import ResidualsPlot, PredictionError, AlphaSelection + +from yellowbrick.classifier import DiscriminationThreshold +from yellowbrick.classifier import ClassificationReport, ConfusionMatrix +from yellowbrick.classifier import ROCAUC, PRCurve, ClassPredictionError + +from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer +from yellowbrick.cluster import InterclusterDistance + +from yellowbrick.model_selection import ValidationCurve, LearningCurve, CVScores +from yellowbrick.contrib.classifier import DecisionViz + +from yellowbrick.text import ( + FreqDistVisualizer, + 
TSNEVisualizer, + DispersionPlot, + PosTagVisualizer, +) + +from yellowbrick.target import ( + BalancedBinningReference, + ClassBalance, + FeatureCorrelation, +) + +from sklearn.tree import DecisionTreeRegressor +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import RidgeClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.cluster import KMeans, MiniBatchKMeans +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier +from sklearn.naive_bayes import GaussianNB, MultinomialNB +from sklearn.model_selection import train_test_split as tts +from sklearn.preprocessing import OrdinalEncoder, LabelEncoder +from sklearn.linear_model import Ridge, Lasso, LassoCV, RidgeCV +from sklearn.datasets import load_iris, load_digits, load_diabetes +from sklearn.datasets import make_classification, make_blobs, make_moons +from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer + + +GALLERY = path.join(path.dirname(__file__), "images", "gallery") + + +########################################################################## +## Helper Methods +########################################################################## + + +def newfig(): + """ + Helper function to create an axes object of the gallery dimensions. 
+ """ + # NOTE: this figsize generates a better thumbnail + _, ax = plt.subplots(figsize=(8, 4)) + return ax + + +def savefig(viz, name, gallery=GALLERY): + """ + Saves the figure to the gallery directory + """ + if not path.exists(gallery): + os.makedirs(gallery) + + # Must save as png + if len(name.split(".")) > 1: + raise ValueError("name should not specify extension") + + outpath = path.join(gallery, name + ".png") + viz.poof(outpath=outpath) + print("created {}".format(outpath)) + + +########################################################################## +## Feature Analysis +########################################################################## + + +def radviz(): + X, y = load_occupancy() + oz = RadViz(ax=newfig()) + oz.fit_transform(X, y) + savefig(oz, "radviz") + + +def rank1d(): + X, y = load_credit() + oz = Rank1D(algorithm="shapiro", ax=newfig()) + oz.fit_transform(X, y) + savefig(oz, "rank1d_shapiro") + + +def rank2d(): + X, y = load_credit() + oz = Rank2D(algorithm="covariance", ax=newfig()) + oz.fit_transform(X, y) + savefig(oz, "rank2d_covariance") + + +def pcoords(): + X, y = load_occupancy() + oz = ParallelCoordinates(sample=0.05, shuffle=True, ax=newfig()) + oz.fit_transform(X, y) + savefig(oz, "parallel_coordinates") + + +def pca(): + X, y = load_credit() + colors = np.array(["r" if yi else "b" for yi in y]) + oz = PCA(scale=True, color=colors, proj_dim=3) + oz.fit_transform(X, y) + savefig(oz, "pca_projection_3d") + + +def manifold(dataset, manifold): + if dataset == "concrete": + X, y = load_concrete() + elif dataset == "occupancy": + X, y = load_occupancy() + else: + raise ValueError("unknown dataset") + + oz = Manifold(manifold=manifold, ax=newfig()) + oz.fit_transform(X, y) + savefig(oz, "{}_{}_manifold".format(dataset, manifold)) + + +def scatter(): + X, y = load_occupancy() + oz = ScatterVisualizer(x="light", y="CO2", ax=newfig()) + oz.fit_transform(X, y) + savefig(oz, "scatter") + + +def jointplot(): + X, y = load_concrete() + oz = 
JointPlot(columns=["cement", "splast"], ax=newfig()) + oz.fit_transform(X, y) + savefig(oz, "jointplot") + + +########################################################################## +## Regression +########################################################################## + + +def residuals(): + X, y = load_concrete() + X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2) + oz = ResidualsPlot(Ridge(), ax=newfig()) + oz.fit(X_train, y_train) + oz.score(X_test, y_test) + savefig(oz, "residuals") + + +def peplot(): + X, y = load_concrete() + X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2) + oz = PredictionError(Lasso(), ax=newfig()) + oz.fit(X_train, y_train) + oz.score(X_test, y_test) + savefig(oz, "prediction_error") + + +def alphas(): + X, y = load_concrete() + alphas = np.logspace(-10, 1, 400) + oz = AlphaSelection(LassoCV(alphas=alphas), ax=newfig()) + oz.fit(X, y) + savefig(oz, "alpha_selection") + + +########################################################################## +## Classification +########################################################################## + + +def classreport(): + X, y = load_occupancy() + X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2) + oz = ClassificationReport(GaussianNB(), support=True, ax=newfig()) + oz.fit(X_train, y_train) + oz.score(X_test, y_test) + savefig(oz, "classification_report") + + +def confusion(dataset): + if dataset == "iris": + data = load_iris() + elif dataset == "digits": + data = load_digits() + else: + raise ValueError("uknown dataset") + + X_train, X_test, y_train, y_test = tts(data.data, data.target, test_size=0.2) + oz = ConfusionMatrix(LogisticRegression(), ax=newfig()) + oz.fit(X_train, y_train) + oz.score(X_test, y_test) + savefig(oz, "confusion_matrix_{}".format(dataset)) + + +def rocauc(dataset): + if dataset == "binary": + X, y = load_occupancy() + model = GaussianNB() + elif dataset == "multiclass": + X, y = load_game() + X = OrdinalEncoder().fit_transform(X) + 
model = RidgeClassifier() + else: + raise ValueError("uknown dataset") + + X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2) + oz = ROCAUC(model, ax=newfig()) + oz.fit(X_train, y_train) + oz.score(X_test, y_test) + savefig(oz, "rocauc_{}".format(dataset)) + + +def prcurve(dataset): + if dataset == "binary": + X, y = load_spam() + model = RidgeClassifier() + kws = {} + elif dataset == "multiclass": + X, y = load_game() + X = OrdinalEncoder().fit_transform(X) + y = LabelEncoder().fit_transform(y) + model = MultinomialNB() + kws = { + "per_class": True, + "iso_f1_curves": True, + "fill_area": False, + "micro": False, + } + else: + raise ValueError("uknown dataset") + + X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, shuffle=True) + oz = PRCurve(model, ax=newfig(), **kws) + oz.fit(X_train, y_train) + oz.score(X_test, y_test) + savefig(oz, "precision_recall_{}".format(dataset)) + + +def classprede(): + X, y = make_classification( + n_samples=1000, n_classes=5, n_informative=3, n_clusters_per_class=1 + ) + + classes = ["apple", "kiwi", "pear", "banana", "orange"] + + # Perform 80/20 training/test split + X_train, X_test, y_train, y_test = tts(X, y, test_size=0.20) + oz = ClassPredictionError(RandomForestClassifier(), classes=classes, ax=newfig()) + oz.fit(X_train, y_train) + oz.score(X_test, y_test) + savefig(oz, "class_prediction_error") + + +def discrimination(): + X, y = load_spam() + oz = DiscriminationThreshold(LogisticRegression(solver="lbfgs"), ax=newfig()) + oz.fit(X, y) + savefig(oz, "discrimination_threshold") + + +########################################################################## +## Clustering +########################################################################## + + +def elbow(): + X, _ = make_blobs(centers=8, n_features=12, shuffle=True) + oz = KElbowVisualizer(KMeans(), k=(4, 12), ax=newfig()) + oz.fit(X) + savefig(oz, "elbow") + + +def silhouette(): + X, _ = make_blobs(centers=8) + oz = 
SilhouetteVisualizer(MiniBatchKMeans(6), ax=newfig()) + oz.fit(X) + savefig(oz, "silhouette") + + +def icdm(): + X, _ = make_blobs(centers=12, n_samples=1000, n_features=16, shuffle=True) + oz = InterclusterDistance(KMeans(9), ax=newfig()) + oz.fit(X) + savefig(oz, "icdm") + + +########################################################################## +## Model Selection +########################################################################## + + +def validation(): + X, y = load_energy() + oz = ValidationCurve( + DecisionTreeRegressor(), + param_name="max_depth", + param_range=np.arange(1, 11), + cv=10, + scoring="r2", + ax=newfig(), + ) + oz.fit(X, y) + savefig(oz, "validation_curve") + + +def learning(): + X, y = load_energy() + sizes = np.linspace(0.3, 1.0, 10) + oz = LearningCurve(RidgeCV(), train_sizes=sizes, scoring="r2", ax=newfig()) + oz.fit(X, y) + savefig(oz, "learning_curve") + + +def cvscores(): + X, y = load_energy() + oz = CVScores(Ridge(), scoring="r2", cv=10, ax=newfig()) + oz.fit(X, y) + savefig(oz, "cv_scores") + + +def decision(): + X, y = make_moons(noise=0.3) + X = StandardScaler().fit_transform(X) + X_train, X_test, y_train, y_test = tts(X, y, test_size=0.20) + + oz = DecisionViz(KNeighborsClassifier(3), ax=newfig()) + oz.fit(X_train, y_train) + oz.draw(X_test, y_test) + savefig(oz, "decision_boundaries") + + +def importances(): + X, y = load_occupancy() + oz = FeatureImportances(RandomForestClassifier(), ax=newfig()) + oz.fit(X, y) + savefig(oz, "feature_importances") + + +def rfecv(): + X, y = load_credit() + model = RandomForestClassifier(n_estimators=10) + oz = RFECV(model, cv=3, scoring="f1_weighted", ax=newfig()) + oz.fit(X, y) + savefig(oz, "rfecv_sklearn_example") + + +########################################################################## +## Text Model Diagnostics +########################################################################## + + +def freqdist(): + corpus = load_hobbies() + vecs = CountVectorizer() + docs = 
vecs.fit_transform(corpus.data) + + oz = FreqDistVisualizer(features=vecs.get_feature_names(), ax=newfig()) + oz.fit(docs) + savefig(oz, "freqdist") + + +def tsne(): + corpus = load_hobbies() + docs = TfidfVectorizer().fit_transform(corpus.data) + + oz = TSNEVisualizer(ax=newfig()) + oz.fit(docs, corpus.target) + savefig(oz, "corpus_tsne") + + +def dispersion(): + corpus = load_hobbies() + target_words = ["Game", "player", "score", "oil", "Man"] + + oz = DispersionPlot(target_words, ax=newfig()) + oz.fit([doc.split() for doc in corpus.data]) + savefig(oz, "dispersion") + + +def postag(): + tagged_stanzas = [ + [ + [ + ("Whose", "JJ"), + ("woods", "NNS"), + ("these", "DT"), + ("are", "VBP"), + ("I", "PRP"), + ("think", "VBP"), + ("I", "PRP"), + ("know", "VBP"), + (".", "."), + ], + [ + ("His", "PRP$"), + ("house", "NN"), + ("is", "VBZ"), + ("in", "IN"), + ("the", "DT"), + ("village", "NN"), + ("though", "IN"), + (";", ":"), + ("He", "PRP"), + ("will", "MD"), + ("not", "RB"), + ("see", "VB"), + ("me", "PRP"), + ("stopping", "VBG"), + ("here", "RB"), + ("To", "TO"), + ("watch", "VB"), + ("his", "PRP$"), + ("woods", "NNS"), + ("fill", "VB"), + ("up", "RP"), + ("with", "IN"), + ("snow", "NNS"), + (".", "."), + ], + ], + [ + [ + ("My", "PRP$"), + ("little", "JJ"), + ("horse", "NN"), + ("must", "MD"), + ("think", "VB"), + ("it", "PRP"), + ("queer", "JJR"), + ("To", "TO"), + ("stop", "VB"), + ("without", "IN"), + ("a", "DT"), + ("farmhouse", "NN"), + ("near", "IN"), + ("Between", "NNP"), + ("the", "DT"), + ("woods", "NNS"), + ("and", "CC"), + ("frozen", "JJ"), + ("lake", "VB"), + ("The", "DT"), + ("darkest", "JJS"), + ("evening", "NN"), + ("of", "IN"), + ("the", "DT"), + ("year", "NN"), + (".", "."), + ] + ], + [ + [ + ("He", "PRP"), + ("gives", "VBZ"), + ("his", "PRP$"), + ("harness", "NN"), + ("bells", "VBZ"), + ("a", "DT"), + ("shake", "NN"), + ("To", "TO"), + ("ask", "VB"), + ("if", "IN"), + ("there", "EX"), + ("is", "VBZ"), + ("some", "DT"), + ("mistake", "NN"), + 
(".", "."), + ], + [ + ("The", "DT"), + ("only", "JJ"), + ("other", "JJ"), + ("sound", "NN"), + ("’", "NNP"), + ("s", "VBZ"), + ("the", "DT"), + ("sweep", "NN"), + ("Of", "IN"), + ("easy", "JJ"), + ("wind", "NN"), + ("and", "CC"), + ("downy", "JJ"), + ("flake", "NN"), + (".", "."), + ], + ], + [ + [ + ("The", "DT"), + ("woods", "NNS"), + ("are", "VBP"), + ("lovely", "RB"), + (",", ","), + ("dark", "JJ"), + ("and", "CC"), + ("deep", "JJ"), + (",", ","), + ("But", "CC"), + ("I", "PRP"), + ("have", "VBP"), + ("promises", "NNS"), + ("to", "TO"), + ("keep", "VB"), + (",", ","), + ("And", "CC"), + ("miles", "NNS"), + ("to", "TO"), + ("go", "VB"), + ("before", "IN"), + ("I", "PRP"), + ("sleep", "VBP"), + (",", ","), + ("And", "CC"), + ("miles", "NNS"), + ("to", "TO"), + ("go", "VB"), + ("before", "IN"), + ("I", "PRP"), + ("sleep", "VBP"), + (".", "."), + ] + ], + ] + oz = PosTagVisualizer(ax=newfig()) + oz.fit(tagged_stanzas) + savefig(oz, "postag") + + +########################################################################## +## Target Visualizations +########################################################################## + + +def binning(): + _, y = load_concrete() + oz = BalancedBinningReference(ax=newfig()) + oz.fit(y) + savefig(oz, "balanced_binning_reference") + + +def balance(): + X, y = load_occupancy() + _, _, y_train, y_test = tts(X, y, test_size=0.2) + + oz = ClassBalance(ax=newfig(), labels=["unoccupied", "occupied"]) + oz.fit(y_train, y_test) + savefig(oz, "class_balance") + + +def featcorr(): + data = load_diabetes() + + oz = FeatureCorrelation(ax=newfig()) + oz.fit(data.data, data.target) + savefig(oz, "feature_correlation") + + +########################################################################## +## Main Method +########################################################################## + +if __name__ == "__main__": + plots = { + "all": None, + "radviz": radviz, + "rank1d": rank1d, + "rank2d": rank2d, + "pcoords": pcoords, + "pca": pca, + 
"concrete_tsne": lambda: manifold("concrete", "tsne"), + "occupancy_tsne": lambda: manifold("occupancy", "tsne"), + "concrete_isomap": lambda: manifold("concrete", "isomap"), + "importances": importances, + "rfecv": rfecv, + "scatter": scatter, + "jointplot": jointplot, + "residuals": residuals, + "peplot": peplot, + "alphas": alphas, + "classreport": classreport, + "confusion_digits": lambda: confusion("digits"), + "confusion_iris": lambda: confusion("iris"), + "rocauc_binary": lambda: rocauc("binary"), + "rocauc_multi": lambda: rocauc("multiclass"), + "prcurve_binary": lambda: prcurve("binary"), + "prcurve_multi": lambda: prcurve("multiclass"), + "classprede": classprede, + "discrimination": discrimination, + "elbow": elbow, + "silhouette": silhouette, + "icdm": icdm, + "validation": validation, + "learning": learning, + "cvscores": cvscores, + "freqdist": freqdist, + "tsne": tsne, + "dispersion": dispersion, + "postag": postag, + "decision": decision, + "binning": binning, + "balance": balance, + "featcorr": featcorr, + } + + parser = argparse.ArgumentParser(description="gallery image generator") + parser.add_argument( + "plots", + nargs="+", + choices=plots.keys(), + metavar="plot", + help="names of images to generate", + ) + args = parser.parse_args() + + queue = frozenset(args.plots) + if "all" in queue: + queue = frozenset(plots.keys()) + + for item in queue: + method = plots[item] + if method is not None: + method() diff --git a/docs/gallery.rst b/docs/gallery.rst index f78f56a4d..ade6cb809 100644 --- a/docs/gallery.rst +++ b/docs/gallery.rst @@ -6,73 +6,61 @@ Gallery Feature Analysis ---------------- -.. image:: api/features/images/radviz.png +.. image:: images/gallery/radviz.png :width: 200px :height: 100px - :alt: Features Analysis adViz Visualizer + :alt: Features Analysis RadViz Visualizer :target: api/features/radviz.html#radviz-visualizer -.. image:: api/features/images/rank1d_shapiro.png +.. 
image:: images/gallery/rank1d_shapiro.png :width: 200px :height: 100px - :alt: Features Analysis rank1d shapiro\ + :alt: Features Analysis Shapiro Rank1D :target: api/features/rankd.html#rank-1d -.. image:: api/features/images/rank2d_covariance.png +.. image:: images/gallery/rank2d_covariance.png :width: 200px :height: 100px - :alt: Features Analysis rank2d covariance + :alt: Features Analysis Covariance Rank2D :target: api/features/rankd.html#rank-2d -.. image:: api/features/images/parallel_coordinates.png +.. image:: images/gallery/parallel_coordinates.png :width: 200px :height: 100px :alt: Parallel Coordinates for 5 features :target: api/features/pcoords.html#parallel-coordinates -.. image:: api/features/images/pca_projection_3d.png +.. image:: images/gallery/pca_projection_3d.png :width: 200px :height: 100px - :alt: Principal Component Plot + :alt: Principal Component Plot 3D :target: api/features/pca.html#pca-projection -.. image:: api/features/images/concrete_tsne_manifold.png +.. image:: images/gallery/concrete_tsne_manifold.png :width: 200px :height: 100px :alt: t-SNE Manifold Visualization :target: api/features/manifold.html#manifold-visualization -.. image:: api/features/images/occupancy_tsne_manifold.png +.. image:: images/gallery/occupancy_tsne_manifold.png :width: 200px :height: 100px :alt: t-SNE Manifold Visualization Discrete Target :target: api/features/manifold.html#discrete-target -.. image:: api/features/images/concrete_isomap_manifold.png +.. image:: images/gallery/concrete_isomap_manifold.png :width: 200px :height: 100px :alt: Isomap Manifold Visualization :target: api/features/manifold.html#continuous-target -.. image:: api/features/images/feature_importances.png - :width: 200px - :height: 100px - :alt: Feature Importance using Gradient Boosting Classifier - :target: api/features/importances.html#feature-importances - -.. 
image:: api/features/images/rfecv_sklearn_example.png - :width: 200px - :height: 100px - :alt: Feature Importance using Gradient Boosting Classifier - :target: api/features/rfecv.html#recursive-feature-elimination - -.. image:: api/contrib/images/scatter.png +.. image:: images/gallery/scatter.png :width: 200px :height: 100px :alt: Scatter Visualization :target: api/contrib/scatter.html#scatter-visualization -.. image:: api/features/images/jointplot.png +.. image:: images/gallery/jointplot.png :width: 200px :height: 100px :alt: Joint Point Visualization @@ -82,98 +70,98 @@ Feature Analysis Regression Visualizers ---------------------- -.. image:: api/regressor/images/residuals.png +.. image:: images/gallery/residuals.png :width: 200px :height: 100px :alt: Residuals for Ridge Model :target: api/regressor/residuals.html#residuals-plot -.. image:: api/regressor/images/prediction_error.png +.. image:: images/gallery/prediction_error.png :width: 200px :height: 100px :alt: Prediction Error for Lasso :target: api/regressor/peplot.html#residuals-plot -.. image:: api/regressor/images/alpha_selection.png +.. image:: images/gallery/alpha_selection.png :width: 200px :height: 100px - :alt: Lasso Alpha Error + :alt: Alpha Selection for Lasso :target: api/regressor/alphas.html#alpha-selection Classification Visualizers -------------------------- -.. image:: api/classifier/images/classification_report.png +.. image:: images/gallery/classification_report.png :width: 200px :height: 100px :alt: GaussianNB Classification Report :target: api/classifier/classification_report.html#classification-report -.. image:: api/classifier/images/confusion_matrix_digits.png +.. image:: images/gallery/confusion_matrix_digits.png :width: 200px :height: 100px :alt: Logistic Regression Confusion Matrix with Numeric Labels :target: api/classifier/confusion_matrix.html#confusion-matrix -.. image:: api/classifier/images/confusion_matrix_iris.png +.. 
image:: images/gallery/confusion_matrix_iris.png :width: 200px :height: 100px :alt: Logistic Regression Confusion Matrix with Class Name Labels :target: api/classifier/confusion_matrix.html#plotting-with-class-names -.. image:: api/classifier/images/rocauc_binary.png +.. image:: images/gallery/rocauc_binary.png :width: 200px :height: 100px - :alt: Binary ROC Curves for Logistic Regression + :alt: Binary ROC Curves :target: api/classifier/rocauc.html#rocauc -.. image:: api/classifier/images/rocauc_multiclass.png +.. image:: images/gallery/rocauc_multiclass.png :width: 200px :height: 100px :alt: Multiclass ROC Curves :target: api/classifier/rocauc.html#multi-class-rocauc-curves -.. image:: api/classifier/images/binary_precision_recall.png +.. image:: images/gallery/precision_recall_binary.png :width: 200px :height: 100px :alt: Precision-Recall Curves :target: api/classifier/prcurve.html -.. image:: api/classifier/images/multiclass_precision_recall_full.png +.. image:: images/gallery/precision_recall_multiclass.png :width: 200px :height: 100px - :alt: Multi-Label Precision-Recall Curves + :alt: Multiclass Precision-Recall Curves :target: api/classifier/prcurve.html#multi-label-classification -.. image:: api/classifier/images/class_prediction_error.png +.. image:: images/gallery/class_prediction_error.png :width: 200px :height: 100px - :alt: Class Prediction Error for Random Forest Classifier + :alt: Class Prediction Error of Classifier :target: api/classifier/class_prediction_error.html#class-prediction-error -.. image:: api/classifier/images/spam_discrimination_threshold.png +.. image:: images/gallery/discrimination_threshold.png :width: 200px :height: 100px - :alt: Threshold Plot for Logistic Regression + :alt: Discrimination Threshold Plot :target: api/classifier/threshold.html#discrimination-threshold Clustering Visualizers ---------------------- -.. image:: api/cluster/images/elbow.png +.. 
image:: images/gallery/elbow.png :width: 200px :height: 100px :alt: Distortion Score Elbow for Mini Batch Means Clustering :target: api/cluster/elbow.html#elbow-method -.. image:: api/cluster/images/silhouette.png +.. image:: images/gallery/silhouette.png :width: 200px :height: 100px :alt: Silhoutte Plot of Mini Batch Kmeans Clustering :target: api/cluster/silhouette.html#silhouette-visualizer -.. image:: api/cluster/images/icdm.png +.. image:: images/gallery/icdm.png :width: 200px :height: 100px :alt: Intercluster Distance Maps @@ -182,95 +170,87 @@ Clustering Visualizers Model Selection Visualizers --------------------------- -.. image:: api/model_selection/images/validation_curve_regressor.png +.. image:: images/gallery/validation_curve.png :width: 200px :height: 100px - :alt: Validation Curve for Decision Tree Regresor + :alt: Validation Curve Hyperparameter Tuning :target: api/model_selection/validation_curve.html#validation-curve -.. image:: api/model_selection/images/learning_curve_classifier.png +.. image:: images/gallery/learning_curve.png :width: 200px :height: 100px - :alt: Learning Curve for MultinomialNB - :target: api/model_selection/learning_curve.html#classification + :alt: Learning Curves for Data Sufficiency + :target: api/model_selection/learning_curve.html -.. image:: api/model_selection/images/learning_curve_clusterer.png +.. image:: images/gallery/cv_scores.png :width: 200px :height: 100px - :alt: Learning Curve for KMeans - :target: api/model_selection/learning_curve.html#clustering + :alt: Cross Validation Scores + :target: api/model_selection/cross_validation.html + +.. image:: images/gallery/decision_boundaries.png + :width: 200px + :height: 100px + :alt: Nearest Neighbor Decision Boundary Prototype + :target: api/contrib/boundaries.html#decisionboundaries-vizualizer -.. image:: api/model_selection/images/cv_scores_classifier.png +.. 
image:: images/gallery/feature_importances.png :width: 200px :height: 100px - :alt: CV Scores for MultinomialNB Classification - :target: api/model_selection/cross_validation.html#classification + :alt: Feature Importance using Ensemble Classifier + :target: api/model_selection/importances.html#feature-importances -.. image:: api/model_selection/images/cv_scores_regressor.png +.. image:: images/gallery/rfecv_sklearn_example.png :width: 200px :height: 100px - :alt: CV Scores for Ridge Regression - :target: api/model_selection/cross_validation.html#regression + :alt: Recursive Feature Elimination with Ensemble Classifier + :target: api/model_selection/rfecv.html#recursive-feature-elimination Text Modeling Visualizers --------------------------- -.. image:: api/text/images/freqdist_corpus.png +.. image:: images/gallery/freqdist.png :width: 200px :height: 100px - :alt: Validation Curve for Decision Tree Regresor + :alt: Token Frequency Distribution :target: api/text/freqdist.html#token-frequency-distribution -.. image:: api/text/images/tsne_all_docs.png +.. image:: images/gallery/corpus_tsne.png :width: 200px :height: 100px :alt: TSNE Projection of Documents :target: api/text/tsne.html#t-sne-corpus-visualization -.. image:: api/text/images/dispersion_docs.png +.. image:: images/gallery/dispersion.png :width: 200px :height: 100px - :alt: Dispersion Plot + :alt: Dispersion of Words in a Corpus :target: api/text/dispersion.html#dispersion-plot -Decision Boundaries Visualizer ------------------------------- - -.. image:: api/contrib/images/knn_decisionviz.png +.. image:: images/gallery/postag.png :width: 200px :height: 100px - :alt: Nearest Neighbor Boundary Visualizer - :target: api/contrib/boundaries.html#decisionboundaries-vizualizer + :alt: Parts-of-Speech in a Tagged Corpus + :target: api/text/postag.html#postag-visualization + Target Visualizers ------------------ -.. image:: api/target/images/balanced_binning_reference.png +.. 
image:: images/gallery/balanced_binning_reference.png :width: 200px :height: 100px :alt: Balanced Binning Reference :target: api/target/binning.html#balanced-binning-reference -.. image:: api/target/images/class_balance_compare.png +.. image:: images/gallery/class_balance.png :width: 200px :height: 100px :alt: Class Balance :target: api/target/class_balance.html#class-balance -.. image:: api/target/images/feature_correlation_pearson.png +.. image:: images/gallery/feature_correlation.png :width: 200px :height: 100px :alt: Feature Correlation Pearson Correlation Coefficients :target: api/target/feature_correlation.html#pearson-correlation - -.. image:: api/target/images/feature_correlation_mutual_info_regression.png - :width: 200px - :height: 100px - :alt: Feature Correlation Mutual Information - Regression - :target: api/target/feature_correlation.html#mutual-information-regression - -.. image:: api/target/images/feature_correlation_mutual_info_classification.png - :width: 200px - :height: 100px - :alt: Feature Correlation Mutual Information - Classification - :target: api/target/feature_correlation.html#mutual-information-classification diff --git a/docs/governance/index.rst b/docs/governance/index.rst new file mode 100644 index 000000000..d27f18f26 --- /dev/null +++ b/docs/governance/index.rst @@ -0,0 +1,218 @@ +.. -*- mode: rst -*- + +Governance +========== + +**Version 1.1 on May 15, 2019** + +Purpose +------- +Yellowbrick has grown from a project with a handful of people hacking on it in their spare time into one with thousands of users, dozens of contributors, and several organizational affiliations. The burden of the effort to maintain the project has so far been borne by a few individuals dedicated to its success. However, as the project continues to grow, we believe it is important to establish a system that clearly defines how the project will be administered, organized, and maintained in the coming years. 
We hope that this clarity will lighten the administrative load of the maintainers and core-developers, streamline decision making, and allow new people to participate meaningfully. Most importantly, we hope that the product of these effects is to ensure the members of the Yellowbrick project are able to maintain a healthy and productive relationship with the project, allowing them to avoid burnout and fatigue. + +The purpose of this document is to establish a system of governance that will define the interactions of Yellowbrick members and facilitate decision-making processes. This governance document serves as guidance to Yellowbrick members, and all Yellowbrick members must agree to adhere to its guidance in order to participate in the project. Although not legally binding, this document may also serve as bylaws for the organization of Yellowbrick contributors. We believe this document should be treated as a living document that is amended regularly in order to support the future objectives of the project. + +Organization +------------ +The Yellowbrick project, also referred to as scikit-yellowbrick or scikit-yb, is an open source Python library for machine learning visual analysis and diagnostics. The library is licensed by the Apache 2.0 open source license, the source code is version controlled on GitHub, and the package is distributed via PyPI. The project is maintained by volunteer contributors who refer to themselves corporately as “the scikit-yb developers”. + +Founders and Leadership +~~~~~~~~~~~~~~~~~~~~~~~ +Yellowbrick was founded by Benjamin Bengfort and Rebecca Bilbro, who were solely responsible for the initial prototypes and development of the library. In the tradition of Python, they are sometimes referred to as the “benevolent dictators for life”, or BDFLs, of the project. For the purpose of this document and to emphasize the role of maintainers and core-contributors, they are simply referred to as “the founders”. 
From a governance perspective, the founders have a special role in that they provide vision, thought leadership, and high-level direction for the project’s members and contributors. + +The Yellowbrick project is bigger than two people, however; therefore, the primary mechanism of governance will be a collaboration of contributors and advisors in the tradition of the Apache Software Foundation. This collaboration ensures that the most active project contributors and users (the primary stakeholders of the project) have a voice in decision-making responsibilities. Note that Yellowbrick defines who active project contributors are (e.g. advisors, maintainers, core-contributors, and contributors) in a very specific and time-limited fashion. Depending on the type of required action, a subset of the active project contributors will deliberate and make a decision through majority voting consensus, subject to a final say of the founders. To ensure this happens correctly, contributors must be individuals who represent themselves and not companies. + +Contributors and Roles +~~~~~~~~~~~~~~~~~~~~~~ +There are many ways to participate as a contributor to the Yellowbrick community, but first and foremost, all Yellowbrick contributors are users. Only by using Yellowbrick do contributors gain the requisite understanding and context to meaningfully contribute. Whether you use Yellowbrick as a student to learn machine learning (or as a teacher to teach it) or you are a professional data scientist or analyst who uses it for visual analytics or diagnostics, you cannot be a contributor without some use-case driven motivation. This definition also specifically excludes other motivations, such as financial motivations or attention-seeking. The reason for this is simple: Yellowbrick is a technical library that is intended for a technical audience. + +You become a contributor when you give something back to the Yellowbrick community. 
A contribution can be anything large or small—related to code or something unique to your talents. In the end, the rest of the community of contributors will decide what constitutes a meaningful contribution, but some of the most common contributions include: + +- Successfully merging a pull request after a code review. +- Contributing to the scikit-yb documentation. +- Translating the scikit-yb documentation into other languages. +- Submitting detailed, reproducible bug reports. +- Submitting detailed, prototyped visualizer or feature requests. +- Writing blog posts or giving talks that demonstrate Yellowbrick’s use. +- Answering questions on Stack Overflow or on our GitHub issues. +- Organizing a Yellowbrick users meetup or other events. + +Once you have contributed to Yellowbrick, you *will always be a contributor*. We believe this is a badge of honor and we hold all Yellowbrick contributors in special esteem and respect. + +If you are routinely or frequently contributing to Yellowbrick, you have the opportunity to assume one of the roles of an active project contributor. These roles allow you to fundamentally guide the course of the Yellowbrick project but are also time-limited to help us avoid burnout and fatigue. We would like to plan our efforts as fixed-length sprints rather than as an open-ended run. + +Note that the roles described below are not mutually exclusive. A Yellowbrick contributor can simultaneously be a core-contributor, a maintainer, and an advisor if they would like to take on all of those responsibilities at the same time. None of these roles “outranks” another, they are simply a delineation of different responsibilities. In fact, they are designed to be fluid so that members can pick and choose how they participate at any given time. A detailed description of the roles follows. 
+ +Core Contributor +^^^^^^^^^^^^^^^^ +A core-contributor commits to actively participate in a 4-month *semester* of Yellowbrick development, which culminates in the next release (we will discuss semesters in greater detail later in this document). At the beginning of the semester, the core-contributor will outline their participation and goals for the release, usually by accepting the assignment of 1-5 issues that should be completed before the semester is over. Core-contributors work with the maintainers over the course of the semester to ensure the library is at the highest quality. The responsibilities of core-contributors are as follows: + +- Set specific and reasonable goals for contributions over the semester. +- Work with maintainers to complete issues and goals for the next release. +- Respond to maintainer requests and be generally available for discussion. +- Participating actively on the #yellowbrick Slack channel. +- Join any co-working or pair-programming sessions required to achieve goals. + +Core-contributors can reasonably set as high or as low a challenge for themselves as they feel comfortable. We expect core-contributors to work on Yellowbrick for a total of about 5 hours per week. If core-contributors do not meet their goals by the end of the semester, they will likely not be invited to participate at that level in the next semester, though they may still volunteer subject to the approval of the maintainers for that semester. + +Maintainer +^^^^^^^^^^ +Maintainers commit to actively manage Yellowbrick during a 4-month semester of Yellowbrick development and are primarily responsible for building and deploying the next release. Maintainers may also act as core-contributors and use the dedicated semester group to ensure the release objectives set by the advisors are met. This primarily manifests itself in the maintenance of GitHub issues and code reviews of Pull Requests. 
The responsibilities of maintainers are as follows: + +- Being active and responsive on the #yellowbrick and #oz-maintainers Slack channels. +- Being active and responsive on the team-oz/maintainers GitHub group. +- Respond to user messages on GitHub and the ListServ (Stack Overflow, etc). +- Code review pull requests and safely merge them into the project. +- Maintain the project’s high code quality and well-defined API. +- Release the next version of Yellowbrick. +- Find and share things of interest to the Yellowbrick community. + +The maintainer role is an exhausting and time-consuming role. We expect maintainers to work on Yellowbrick 10-20 hours per week. Maintainers should have first been core-contributors so they understand what the role entails (and to allow a current maintainer to mentor them to assume the role). Moreover, we recommend that maintainers periodically/regularly take semesters off to ensure they don’t get burnt out. + +Coordinator +^^^^^^^^^^^ +If a semester has a large enough group of maintainers and core-contributors, we may optionally appoint a contributor as a coordinator. Coordinators specialize in the project management aspects of a version release and also direct traffic to the actively participating group. The coordinator may or may not be a maintainer. If the coordinator is nominated from the maintainers, the coordination role is an auxiliary responsibility. Otherwise, the coordinator may be a dedicated contributor that focuses only on communication and progress. The coordinator’s primary responsibilities are as follows: + +- Ensure that the features, bugs, and issues assigned to the version release are - progressing. +- Be the first to respond welcomingly to new contributors. +- Assign incoming pull requests, issues, and other responses to maintainers. +- Communicate progress to the advisors and to the Yellowbrick community. +- Encourage and motivate the active contributors group. +- Coach core-contributors and maintainers. 
+- Organize meetups, video calls, and other social and engagement activities. + +The coordinator’s role is mostly about communication and the amount of dedicated effort can vary depending on external factors. Based on our experience it could be as little as a couple of hours per week to as much work as being a maintainer. The coordinator’s responsibilities are organizational and are ideal for someone who wants to be a part of the community but has less of a software background. In fact, we see this role as being similar to a tech project management role, which is often entry level and serves as experience to propel coordinators into more active development. Alternatively, the coordinator can be a more experienced maintainer who needs a break from review but is willing to focus on coaching. + +Mentor-Contributor +^^^^^^^^^^^^^^^^^^ + +A mentor-contributor assumes all the responsibilities of a core contributor but commits 25-50% of their time toward mentoring a new contributor who has little-to-no experience. The purpose of the mentor-contributor role is to assist in recruiting and developing a pipeline of new contributors for Yellowbrick. + +A mentor-contributor would guide a new contributor in understanding the community, roles, ethos, expectations, systems, and processes in participating and contributing to an open source project such as Yellowbrick. A new contributor is someone who is interested in open source software projects but has little-to-no experience working on an open source project or participating in the community. + +The mentor-contributor would work in tandem with the new contributor during the semester to complete assigned issues and prepare the new contributor to be a core contributor in a future semester. A mentor-contributor would mostly likely work on fewer issues during the semester in order to allocate time for mentoring. 
Mentor-contributors would be matched with new contributors prior to the start of each semester or recruit their own contributor (eg. colleague or friend). The responsibilities of mentor-contributors are as follows: + +- When possible, recruit new contributors to Yellowbrick. +- Schedule weekly mentoring sessions and assign discrete yet increasingly complex tasks to the new contributor. +- Work with the new contributor to complete assigned issues during the semester. +- Set specific and reasonable goals for contributions over the semester. +- Work with maintainers to complete issues and goals for the next release. +- Respond to maintainer requests and be generally available for discussion. +- Participating actively on the #yellowbrick Slack channel. +- Join any co-working or pair-programming sessions required to achieve goals. +- Make a determination at the end of the semester on the readiness of the new contributor to be a core-contributor. + +The mentor-contributor role may also appeal to students who use Yellowbrick in a machine learning course and seek the experience of contributing to an open source project. + +Advisor +^^^^^^^ +Advisors are the primary decision-making body for Yellowbrick. They serve over the course of 3 semesters (1 calendar year) and are broadly responsible for defining the roadmap for Yellowbrick development for the next three releases. Advisors meet regularly, at least at the beginning of every semester, to deliberate and discuss next steps for Yellowbrick and to give guidance for core-contributors and maintainers for the semester. The detailed responsibilities of the advisors are as follows: + +- Contribute dues based on the number of advisors in a year to meet fixed running costs. +- Make decisions that affect the entire group through votes (ensure a quorum). +- Meet at least three times a year at the beginning of each semester (or more if required). +- Elect officers to conduct the business of Yellowbrick. 
+- Ensure Yellowbrick’s financial responsibilities are met. +- Create a roadmap for Yellowbrick development. +- Approve the release of the next Yellowbrick version. +- Approve core-contributor and maintainer volunteers. +- Ensure Yellowbrick is growing by getting more users. +- Ensure Yellowbrick is a good citizen of the PyData and Python communities. +- Recruit more contributors to participate in the Yellowbrick community. + +The advisor role is primarily intended to allow members to guide the course of Yellowbrick and actively participate in decisions without making them feel that they need to participate as maintainers (seriously, take a break - don’t burn out!). As such, the role of advisors is limited to preparing for and participating in the semester meetings or any other meetings that are called. Therefore, we assume that the time commitment of an advisor is roughly 30-40 hours per year (less than 1 hour per week). + +The board of advisors is open to every contributor, and in fact, is open to all users because joining the board of advisors requires a financial contribution. Yellowbrick has limited fixed running costs every year, for example, $10 a month to Read The Docs and $14.99 for our domain name. When you volunteer to be an advisor for a year, you commit to paying an equal portion of those fixed running costs, based on the number of advisor volunteers. This also ensures that we have a fixed quorum to ensure votes run smoothly. + +Affiliations +~~~~~~~~~~~~ +Yellowbrick may be affiliated with one or more organizations that promote Yellowbrick and provide financial assistance or other support. Yellowbrick affiliations are determined by the advisors who should work to ensure that affiliations are in both the organization’s and Yellowbrick’s best interests. Yellowbrick is currently affiliated with: + +- District Data Labs +- NumFOCUS +- Georgetown University + +The advisors should update this section to reflect all current and past affiliations. 
+ +Operations +---------- +The primary activity of Yellowbrick contributors is to develop the Yellowbrick library into the best tool for visual model diagnostics and machine learning analytics. A secondary activity is to support Yellowbrick users and to provide a high level of customer service. Tertiary activities include being good citizens of the Python and PyData communities and to the open source community at large. These activities are directed to the primary mission of Yellowbrick: to greatly enhance the machine learning workflow with visual steering and diagnostics. + +Although Yellowbrick is not a commercial enterprise, it does not preclude its members from pursuing commercial activities using Yellowbrick subject to our license agreement. + +In this section, we break down the maintenance and development operations of the project. These operations are a separate activity from administration and decision making, which we will discuss in the following section. + +Semesters and Service Terms +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +In order to ensure that maintainers are able to contribute meaningfully and effectively without getting burned out, we divided the maintenance and development life cycle into three semesters a year as follows: + +- Spring semester: January - April +- Summer semester: May - August +- Fall semester: September - December + +Every maintainer and core-contributor serves in these terms and is only expected to participate at the commitment level described by the roles section for the term they have joined. At the end of the term, there is no expectation that the maintainer must continue to the next term. If they wish to do so, they must volunteer to participate in the next term. We hope that this allows maintainers to be more strategic in their participation, e.g. participating every other semester, or taking off a semester where they know they will be busy with other obligations. 
+ +An advisor’s service term is 1 year, and they must accept the advisory role by the end of January in that calendar year by paying the dues computed by the number of participating advisors. Advisors can remain advisors so long as they wish to participate by paying the dues, though if advising meetings are unable to achieve quorum; absent advisors may be asked to step down. + +The goal of the advising service term is to allow maintainers who wish to take a semester off to still have a meaningful say in the development of the project. We hope that this will also facilitate maintainers feeling that they can take a break without being cut out of the loop, and allowing them to rejoin the project as maintainers or core-contributors in a meaningful way when they are ready again. + +Release Guidelines +~~~~~~~~~~~~~~~~~~ +The focus of the semester is to release the next version of Yellowbrick. The advisors set a roadmap based on the current issues and features they wish to address. Core-contributors volunteer to take on specific issues and maintainers are responsible for organizing themselves and the core-contributors to complete the work in the version, releasing it at the end of the semester. + +Maintainers may also include in the semester release any other contributions or pull requests made by members of the community at their discretion. Maintainers should address all pull-requests and opened issues (as well as emails on the listserv and questions on Stack Overflow, Twitter, etc.) - facilitating their inclusion in the release, or their closure if they become stale or are out of band. + +During the middle of a semester, maintainers may be required to issue a hotfix release for time-critical or security-related fixes. Hotfix releases should be small, incremental improvements on the previous release. 
We hope that the expected release schedule of April, August, and December also assists the user community in giving feedback and guidance about the direction of Yellowbrick. + +Advisors +-------- +Yellowbrick advisors are contributors who take on the responsibility of setting the vision and direction for the development of the project. They may, for example, make decisions about which features are focused on during a semester or to apply for a small development grant and use the funds to improve Yellowbrick. They may also ask to affiliate with other projects and programs that are tangential to but crucial to the success of Yellowbrick - e.g. organizing workshops or talks, creating t-shirts or stickers, etc. + +Advisors influence Yellowbrick primarily through advisor meetings. This section describes advisor interactions, meetings, and decision-making structure. + +Officers +~~~~~~~~ +During the first advisor meeting in January, three officers should be elected to serve in special capacities over the course of the year. The officer positions are as follows: + +**Chair**: the chair’s responsibility is to organize and lead the advisor meetings, to ensure good conduct and order, and to adjudicate any disputes. The chair may call additional advisor meetings and introduce proposals for the advisors to make decisions upon. The chair calls votes to be held and may make a tie-breaking vote if a tie exists. At the beginning of every meeting, the chair should report the status of Yellowbrick and the state of the current release. + +**Secretary**: the secretary’s responsibility is to create an agenda and record the minutes of advisor meetings, including all decisions undertaken and their results. The secretary may take the place of the chair if the chair is not available during a meeting. + +**Treasurer**: the treasurer assumes responsibility for tracking cash flow—both dues payments from the advisors as well as any outgoing payments for expenses. 
The treasurer may also be responsible to handle monies received from outside sources, such as development grants. The treasurer should report on the financial status of the project at the beginning of every advisor meeting. + +Officers may either self-nominate or be nominated by other advisors. Nominations should be submitted before the first advisor meeting in January, and the first order of business for that meeting should be to vote the officers into their positions. Officer votes follow the normal rules of voting as described below; in the case of a tie, the founders cast the tie-breaking decision. + +Meetings +~~~~~~~~ +Advisors must schedule at least three meetings per year in the first month of each semester. At the advisor’s discretion, they may schedule additional meetings as necessary. No vote may take place without a meeting and verbal deliberation (e.g. no voting by email or by poll). Meetings must be held with a virtual component. E.g. even if the meeting is in-person with available advisors, there must be some teleconference or video capability to allow remote advisors to participate meaningfully in the meetings. + +Scheduling +^^^^^^^^^^ +To schedule a meeting, the chair must prepare a poll to find the availability of all advisors with at least 6 options over the next 10 days. The chair must ensure that every reasonable step is taken to include as many of the advisors as possible. No meeting may be scheduled without a quorum attending. + +Meetings must be scheduled in January, May, and September; as close to the start of the semester as possible. It is advisable to send the scheduling poll for those meetings in December, April, and August. Advisors may hold any number of additional meetings. + +Voting +^^^^^^ +Voting rules are simple—a proposal for a vote must be made by one of the advisors, submitted to the chair who determines if a vote may be held. The proposal may be seconded to require the chair to hold a vote on the issue. 
Votes may only be held if a quorum of all advisors (half of the advisors plus one) is present or accounted for in some way (e.g. through a delegation). Prior to the vote, a period of discussion of no less than 5 minutes must be allowed so that members may speak for or against the proposal. + +Votes may take place in two ways, the mechanism of which must be decided by the chair as part of the vote proposal. The vote can either be performed by secret ballot as needed or, more generally, by counting the individual votes of advisors. Advisors can either submit a “yea”, “nay”, or “abstain” vote. A proposal is passed if a majority (half of the advisors plus one) submit a “yea” vote. A proposal is rejected if a majority (half of the advisors plus one) submit a “nay” vote. If neither a majority “yea” or “nay” votes are obtained, the vote can be conducted again at the next advisors meeting. + +The proposal is not ratified until it is agreed upon by the founders, who have the final say in all decision making. This can be interpreted as either a veto (the founders reject a passed proposal) or as a request for clarification (the founders require another vote for a rejected proposal). There is no way to overturn a veto by the founders, though they recognize that by “taking their ball and going home”, they are not fostering a sense of community and expect that these disagreements will be extraordinarily rare. + +Agenda and Minutes +^^^^^^^^^^^^^^^^^^ +The secretary is responsible for preparing a meeting agenda and sending it to all advisors at least 24 hours before the advisors meeting. Minutes that describe in detail the discussion, any proposals, and the outcome of all votes should be taken as well. Minutes should be made public to the rest of the Yellowbrick community. + +Amendments +---------- +Amendments or changes to this governance document can be made by a vote of the advisors. 
Proposed changes must be submitted to the advisors for review at least one week prior to the vote taking place. Prior to the vote, the advisors should allow for a period of discussion where the changes or amendments can be reviewed by the group. Amendments are ratified by the normal voting procedure described above. + +Amendments should update the version and date at the top of this document. + +Board of Advisors Minutes +------------------------- + +.. toctree:: + :maxdepth: 1 + + minutes/2019-05-15.rst \ No newline at end of file diff --git a/docs/governance/minutes/2019-05-15.rst b/docs/governance/minutes/2019-05-15.rst new file mode 100644 index 000000000..6da4070d3 --- /dev/null +++ b/docs/governance/minutes/2019-05-15.rst @@ -0,0 +1,287 @@ +.. -*- mode: rst -*- + +May 15, 2019 +============ + +Yellowbrick Advisory Board Meeting Held on May 15, 2019 from 2030-2230 EST via Video Conference Call. Minutes taken by Benjamin Bengfort. + +Attendees: Edwin Schmierer, Kristen McIntyre, Larry Gray, Nathan Danielsen, Adam Morris, Prema Roman, Rebecca Bilbro, Tony Ojeda, Benjamin Bengfort. + +Agenda +------ + +A broad overview of the topics for discussion in the order they were presented: + +1. Welcome and introduction (Benjamin Bengfort) +2. Officer nominations +3. Advisor dues and Yellowbrick budget +4. PyCon Sprints debrief (Larry Gray) +5. Summer 2019 contributors and roles +6. Google Summer of Code (Adam Morris) +7. Yellowbrick v1.0 milestone planning +8. Project roadmap through 2020 +9. Proposed amendment to governance: mentor role (Edwin Schmierer) +10. Other business + +Votes and Resolutions +--------------------- + +Officer Elections +~~~~~~~~~~~~~~~~~ + +During the first advisory board meeting of the year, officers are nominated and elected to manage Yellowbrick for the year. 
The following nominations were proposed: + +Nominations for Chair: + +- Rebecca Bilbro by Benjamin Bengfort + +Nominations for Secretary: + +- Benjamin Bengfort by Rebecca Bilbro + +Nominations for Treasurer: + +- Edwin Schmierer by Rebecca Bilbro + +Motion: a vote to elect the nominated officers: Rebecca Bilbro as Chair, Benjamin Bengfort as Secretary, and Edwin Schmierer as Treasurer. Moved by Nathan Danielsen, seconded by Adam Morris. + +*The motion was adopted unanimously*. + +Operating Budget +~~~~~~~~~~~~~~~~ + +The following operating budget is proposed for 2019. + +Last year’s budget consisted of only two line items, which we believe will also primarily contribute to this year’s budget. The total budget is **$266.49** we have 9 advisors this year, therefore the dues are $29.61 per advisor. + +Budget breakdown: + +============================================= ================ ========= + Description Frequency Total +============================================= ================ ========= +Name.com domain registration (scikit-yb.org) annually $12.99 +Read the Docs Gold Membership $10 monthly $120.00 +Stickers (StickerMule.com) annually $133.50 +Datasets hosting on S3 monthly (DDL) $10-15 +Cheatsheets annually (DDL) $70 +============================================= ================ ========= + +Based on this budget, it is proposed that the dues are rounded to **$30.00** per advisor. Further, dues should be submitted to the treasurer via Venmo or PayPal no later than May 30, 2019. The treasurer will reimburse the payees of our expenses directly. + +A special thank you to District Data Labs for covering travel costs to PyCon, the printing of cheatsheets, and hosting our datasets on S3. + +Motion: confirm the 2019 budget and advisor dues. Moved by Rebecca Bilbro, seconded by Nathan Danielsen. + +*The motion was adopted unanimously*. 
+ +Governance Amendment +~~~~~~~~~~~~~~~~~~~~ + +A proposed amendment to the governance document adding a "mentor-contributor" role as follows: + +.. code-block:: text + + Mentor-Contributor + ^^^^^^^^^^^^^^^^^^ + + A mentor-contributor assumes all the responsibilities of a core contributor but + commits 25-50% of their time toward mentoring a new contributor who has + little-to-no experience. The purpose of the mentor-contributor role is to assist in + recruiting and developing a pipeline of new contributors for Yellowbrick. + + A mentor-contributor would guide a new contributor in understanding the community, + roles, ethos, expectations, systems, and processes in participating and + contributing to an open source project such as Yellowbrick. A new contributor is + someone who is interested in open source software projects but has little-to-no + experience working on an open source project or participating in the community. + + The mentor-contributor would work in tandem with the new contributor during the + semester to complete assigned issues and prepare the new contributor to be a core + contributor in a future semester. A mentor-contributor would mostly likely work on + fewer issues during the semester in order to allocate time for mentoring. + Mentor-contributors would be matched with new contributors prior to the start of + each semester or recruit their own contributor (eg. colleague or friend). The + responsibilities of mentor-contributors are as follows: + + - When possible, recruit new contributors to Yellowbrick. + - Schedule weekly mentoring sessions and assign discrete yet increasingly complex tasks to the new contributor. + - Work with the new contributor to complete assigned issues during the semester. + - Set specific and reasonable goals for contributions over the semester. + - Work with maintainers to complete issues and goals for the next release. + - Respond to maintainer requests and be generally available for discussion. 
+ - Participating actively on the #yellowbrick Slack channel. + - Join any co-working or pair-programming sessions required to achieve goals. + - Make a determination at the end of the semester on the readiness of the new contributor to be a core-contributor. + + The mentor-contributor role may also appeal to students who use Yellowbrick in a + machine learning course and seek the experience of contributing to an open source + project. + +Motion: accept the amendment adding a mentor contributor role to the governance document. Moved by Edwin Schmierer, seconded by Rebecca Bilbro. + +*The motion was adopted unanimously without modifications*. + +Semester and Roadmap +-------------------- + +The summer semester will be dedicated to completing **Yellowbrick Version 1.0**. The issues associated with this release can be found in the `v1.0 Milestone on GitHub `_. This release has a number of deep issues: including figuring out the axes/figure API and compatibility. It is a year behind and it'll be good to move forward on it! + +We have a critical need for maintainers, we only have 2-3 people regularly reviewing PRs and far more PRs than we can handle. So far there have been no volunteers for summer maintainer status. We will address this in two ways: + +1. Maintainer mentorship +2. Redirect non v1.0 PRs to the fall + +Rebecca will take the lead in mentoring new maintainers so that they feel more confident in conducting code reviews. This will include peer-reviews and a deep discussion of what we're looking for during code reviews. These code reviews will only be conducted on PRs by the summer contributors. + +Otherwise PRs will be handled as follows this semester: + +- Triage new PRs/Issues. If it is critical to v1.0 a bugfix or a docfix, assign a maintainer. +- Otherwise we will request the contributor join the fall semester. +- If a PR is older than 30 days it is "gone stale" and will be closed. 
+- Recommend contributors push to ``yellowbrick.contrib`` or write blog post or notebook. + +We likely will need at least three maintainers per semester moving forward. + +Summer 2019 Contributors +~~~~~~~~~~~~~~~~~~~~~~~~ + +=================== ================= +Name Role +=================== ================= +Lawrence Gray Coordinator +Prashi Doval Core Contributor +Benjamin Bengfort Core Contributor +Saurabh Daalia Core Contributor +Bashar Jaan Khan Core Contributor +Piyush Gautam Core Contributor +Naresh Bachwani GSoC Student +Xinyu You Core Contributor +Sandeep Banerjee Core Contributor +Carl Dawson Core Contributor +Mike Curry Core Contributor +Nathan Danielsen Maintainer +Rebecca Bilbro Mentor +Prema Roman Maintainer +Kristen McIntyre Maintainer +=================== ================= + +Project Roadmap +~~~~~~~~~~~~~~~ + +The v1.0 release has a number of significant changes that may not be backward compatible with previous versions (though for the most part it will be). Because of this, and because many of the issues are *contagious* (e.g. affect many files in Yellowbrick), we are reluctant to plan too much into the future for Yellowbrick. Instead we have created a `v1.1 milestone in GitHub `_ to start tracking issues there. + +Broadly some roadmap items are: + +- *Make quick methods primetime*. Our primary API is the visualizer, which allows for the most configuration and customization of visualizations. The quick methods, however, are a simple workflow that is in demand. The quick methods will do all the work of the most basic visualizer functions in one line of code and return the visualizer for further customization. +- *Add a neural package for ANN specific modeling*. We already have a text package for natural language processing, as deep learning is becoming more important, Yellowbrick should help with the interpretability of these models as well. 
+ +Other roadmap ideas and planning discussed included: + +- A yb-altair prototype potentially leading to an Altair backend side-by-side with matplotlib. +- Devops/engineering focused content for Yellowbrick (e.g. model management and maintenance). +- Fundraising to pay for more ambitious YB development. +- Having Yellowbrick attendees at more conferences. +- Determining who is really using Yellowbrick to better understand the community we're supporting. + +Minutes +------- + +Comments on the Inaugural Advisors Meeting +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +*Remarks delivered by Benjamin Bengfort* + +Welcome to the inaugural meeting of the Yellowbrick Board of Advisors. Due to the number of agenda items and limited time, this meeting will have a more formal tone than normal if simply to provide structure so that we can get to everything. Before we start, however, I wanted to say a few words to mark this occasion and to remark on the path that led us here. + +When Rebecca and I started Yellowbrick 3 years ago this month, I don’t think that we intended to create a top tier Python library, nor did we envision how fast the project would grow. At the time, we wanted to make a statement about the role of visual analytics - the iterative interaction between human and computer - in the machine learning workflow. Though Rebecca and I communicated to different audiences, we were lucky enough to say this at a time when our message resonated with both students and professionals. Because of this, Yellowbrick has enjoyed a lot of success across a variety of different metrics: downloads, stars, blog posts. But the most important metric to me personally is the number of contributors we have had. + +Yellowbrick has become more than a software package, it has also become a community. And it is in no small part due to the efforts of the people here tonight. 
I think it is not inappropriate and perhaps can serve as an introduction for me to individually recognize your contributions to the rest of the group. + +Starting at the beginning, *Tony Ojeda* has uniquely supported Yellowbrick in a way no one else could. He incubated the project personally through his company, District Data Labs, even though there was no direct market value to him. He put his money where his heart was and to me, that will always make him the model of the ideal entrepreneur. + +*Larry Gray* joined Yellowbrick by way of the research labs. In a way, he was exactly the type of person we hoped would join the labs - a professional data scientist who wanted to contribute to an open source data project. Since then he has become so much more, working hard to organize and coordinate the project and investing time and emotional energy which we have come to rely on. + +*Nathan Danielsen* represents a counterpoint to the data science contributor and instead brings a level of software engineering professionalism that is sorely needed in every open source project. Although we often mention his work on image similarity tests - what that effort really did was to open the door to many new contributors, allowing us to review and code with confidence, knowing the tests had our backs. + +*Prema Roman* has brought a thoughtful, measured approach to the way she tackles issues and is personally responsible for the prototyping and development of several visualizers including JointPlot and CrossValidation. We have relied on her to deliver new, high-impact features to the library. + +*Kristen McIntyre* is the voice of our users, an essential role we could not do without. Without her, we would be envious of scikit-learn’s documentation. Instead, we are blessed to have clear, consistent, and extensive descriptions of the visualizers which unlock their use to new and veteran users of our software. + +*Adam Morris* has become the voice of our project, particularly on Twitter. 
I’m joyously surprised every time I read one of his tweets. It is not the voice I would have used and that’s what makes it so sweet. He has taken to heart our code of conduct’s admonishments to over-the-top positivity and encouragement, and I’m always delighted by it. + +*Edwin Schmierer* many of you may not have met yet - but without him, this project would never have happened. He is responsible for creating the Data Science program at Georgetown and he’s been a good friend and advisor ever since. During our lunches and breakfasts together, we have often discussed the context of Yellowbrick in the bigger picture. I’m very excited to have him be able to give us advice more directly. + +Every time I use Yellowbrick I can see or sense your individual impact and signature on the project. Thank you all so much for joining Rebecca and me in building Yellowbrick into what it is today. It is not an overstatement to say that Yellowbrick has been successful beyond what we expected or hoped. + +With success comes responsibility. I know that we have all felt that responsibility keenly, particularly recently as the volume of direct contacts with the community has been increasing. We have reached a stage where that responsibility cannot be borne lightly by a few individuals, because the decisions we make reach and affect a much larger group of people than we may intend. To address this, we, as a group, have ratified a new governance document whose primary purpose is to help us organize and act strategically as a cohesive group. I believe that the structures we have put into place will allow us to move Yellowbrick even further toward being a professional grade software library and will help us minimize risks to the project - particularly the risk of maintainer burnout. + +We are now making another statement at another pivotal moment because we are not alone in formalizing or changing our mode of governance. 
Perhaps the most public example of this is Python itself, but the message of maintainer-exhaustion and burnout has been discussed for the past couple of years in a variety of settings - OSCON, PyCON, video and podcast interviews, surveys and reports. It has caused growing concern and an essential question: “how do we support open source projects that are critical to research and infrastructure when they are not supported by traditional commercial and economic mechanisms”? + +Many of you have heard me say that “I believe in open source the way I believe in democracy”. But like democracy, open source is evolving as new contributors, new technology, and new cultures start to participate more. Yellowbrick is representative of what open source can achieve and what it does both for and to the people who maintain it. As we move forward from here, the choices we make may have an impact beyond our project and perhaps even beyond Python and I hope that you, like me, find that both exciting and terrifying. + +Today we are here to discuss how we will conduct the project toward a version 1.0 release, a significant milestone, while also mentoring a Google Summer of Code Student and managing the many open pull requests and issues currently outstanding. The details matter, but they must also be considered within a context. Therefore I would like to personally commission you all toward two goals that set that context: + +1. **To be a shining example of what an open source project should be.** +2. **To think strategically of how we can support a community of both users and maintainers.** + +I’m very much looking forward to the future, and am so excited to be doing it together with you all. + +PyCon 2019 Debrief +~~~~~~~~~~~~~~~~~~ + +Yellowbrick had an open source booth at PyCon 2019 for the second time, manned by Larry, Nathan, Prema, and Kristen. There were lots of visitors to the booth who were genuinely excited about using Yellowbrick. 
Next year it would be great to have a large banner and vertical display of visualizers to help people learn more. Guido van Rossum even had lunch at the booth, which sparked a lot of attention! Special thank you to Tony for his financial support including shirts, stickers, transportation, and supplies. + +We also hosted two days of development sprints with 13 sprinters and 9 PRs. The majority of sprinters were there for both days. The sprints included a wide variety of people including a high schooler and even a second year sprinter. Guido van Rossum also stopped by during the sprints. Prema and Kristen announced the sprints on stage and there is a great picture of them that we tweeted. Special thank you to Daniel for creating the cheatsheets, which cut down a lot of explaining, and to Nathan for the appetizers. Beets are delicious! + +Hot takes from PyCon: + +- Cheatsheets were a huge success and very important +- It would be good to have a "Contributing to Yellowbrick" cheatsheet for next year +- Have to capitalize on attention before the conference starts +- More visual stand out: e.g. metal grommets and banner with visualizers +- NumFOCUS has an in-house designer who may be able to help + +Important request: we know that there will be a large number of PRs after PyCon or during Hacktoberfest; we need to *plan* for this and ensure we know who/how is triaging them and handling them ahead of the event. The maintainers who did not attend PyCon ended up slammed with work even though they couldn't enjoy participating in the event itself. + +Google Summer of Code +~~~~~~~~~~~~~~~~~~~~~ + +We applied to GSoC under the NumFOCUS umbrella and have been given a student, Naresh, to work with us over the summer. Adam has already had initial communication with him and he's eager to get started. So far we've already interacted with him on GitHub via some of his early PRs and he's going to be a great resource. 
+ +In terms of the GSoC actions, the coding period starts May 27 and ends August 26 when deliverables are due. Every 4 weeks we do a 360 review where he evaluates us and we evaluate him (pass/fail). NumFOCUS requires a blog post and will work with Naresh directly on the posts. + +We need to determine what he should work on. His proposal was about extending PCA, but Visual Pipelines are also extremely important. Proposal: break his work into three phases. First, get the first introductory PR across the finish line, then scope the ideas in his proposal and involve him in the visualizer audit. After working on blog posts and using YB on his own dataset, get started on actual deliverables. + +Action Items: + +- Take note that Naresh is in India and is 9.5 hours ahead (time zone). +- Schedule introduction to team and maintainers. +- Coding period begins May 27. +- How do we want to manage communications (Slack)? +- 360 evaluations due every 4 weeks +- Determine final deliverables due on Aug 19-26 + +Other Topics +~~~~~~~~~~~~ + +Yellowbrick as a startup. Perhaps we can think big and pay for a full-time developer? There are many potential grant sources. The problem is that more money means more responsibility and we can't keep things together as it is now. + +It would be good to send updates to NumFOCUS and District Data Labs to keep them apprised of what we're doing. For example: + +- JOSS paper releases +- Sprints and conference attendance +- Version releases +- Talks or presentations + +Hopefully this will allow them to also spread the word about what we're doing. 
+ +Action Items +------------ + +- Send dues to Edwin Schmierer via Venmo/PayPal (all) +- Prepare more budget options for December board meeting (Treasurer) +- Create PR with Governance Amendment (Secretary) +- Modify documentation to change language about "open a PR as early as possible" (Larry) +- Add Naresh to Slack (Adam) +- Let Adam know what talks are coming up so he can tweet them (all) +- Send updates big and small to NumFOCUS (all) diff --git a/docs/images/favicon.ico b/docs/images/favicon.ico new file mode 100644 index 000000000..7b9871483 Binary files /dev/null and b/docs/images/favicon.ico differ diff --git a/docs/images/gallery/alpha_selection.png b/docs/images/gallery/alpha_selection.png new file mode 100644 index 000000000..3081dbcaf Binary files /dev/null and b/docs/images/gallery/alpha_selection.png differ diff --git a/docs/images/gallery/balanced_binning_reference.png b/docs/images/gallery/balanced_binning_reference.png new file mode 100644 index 000000000..76115b0ed Binary files /dev/null and b/docs/images/gallery/balanced_binning_reference.png differ diff --git a/docs/images/gallery/class_balance.png b/docs/images/gallery/class_balance.png new file mode 100644 index 000000000..95dccd4c7 Binary files /dev/null and b/docs/images/gallery/class_balance.png differ diff --git a/docs/images/gallery/class_prediction_error.png b/docs/images/gallery/class_prediction_error.png new file mode 100644 index 000000000..da0af1721 Binary files /dev/null and b/docs/images/gallery/class_prediction_error.png differ diff --git a/docs/images/gallery/classification_report.png b/docs/images/gallery/classification_report.png new file mode 100644 index 000000000..d879ec371 Binary files /dev/null and b/docs/images/gallery/classification_report.png differ diff --git a/docs/images/gallery/concrete_isomap_manifold.png b/docs/images/gallery/concrete_isomap_manifold.png new file mode 100644 index 000000000..7a500632d Binary files /dev/null and 
b/docs/images/gallery/concrete_isomap_manifold.png differ diff --git a/docs/images/gallery/concrete_tsne_manifold.png b/docs/images/gallery/concrete_tsne_manifold.png new file mode 100644 index 000000000..803970533 Binary files /dev/null and b/docs/images/gallery/concrete_tsne_manifold.png differ diff --git a/docs/images/gallery/confusion_matrix_digits.png b/docs/images/gallery/confusion_matrix_digits.png new file mode 100644 index 000000000..15d3cb1b7 Binary files /dev/null and b/docs/images/gallery/confusion_matrix_digits.png differ diff --git a/docs/images/gallery/confusion_matrix_iris.png b/docs/images/gallery/confusion_matrix_iris.png new file mode 100644 index 000000000..d94adb000 Binary files /dev/null and b/docs/images/gallery/confusion_matrix_iris.png differ diff --git a/docs/images/gallery/corpus_tsne.png b/docs/images/gallery/corpus_tsne.png new file mode 100644 index 000000000..6d51af342 Binary files /dev/null and b/docs/images/gallery/corpus_tsne.png differ diff --git a/docs/images/gallery/cv_scores.png b/docs/images/gallery/cv_scores.png new file mode 100644 index 000000000..52dbce3c6 Binary files /dev/null and b/docs/images/gallery/cv_scores.png differ diff --git a/docs/images/gallery/decision_boundaries.png b/docs/images/gallery/decision_boundaries.png new file mode 100644 index 000000000..993d6bba0 Binary files /dev/null and b/docs/images/gallery/decision_boundaries.png differ diff --git a/docs/images/gallery/discrimination_threshold.png b/docs/images/gallery/discrimination_threshold.png new file mode 100644 index 000000000..2378d752b Binary files /dev/null and b/docs/images/gallery/discrimination_threshold.png differ diff --git a/docs/images/gallery/dispersion.png b/docs/images/gallery/dispersion.png new file mode 100644 index 000000000..a14821ac4 Binary files /dev/null and b/docs/images/gallery/dispersion.png differ diff --git a/docs/images/gallery/elbow.png b/docs/images/gallery/elbow.png new file mode 100644 index 000000000..2c6708533 Binary 
files /dev/null and b/docs/images/gallery/elbow.png differ diff --git a/docs/images/gallery/feature_correlation.png b/docs/images/gallery/feature_correlation.png new file mode 100644 index 000000000..d6671c580 Binary files /dev/null and b/docs/images/gallery/feature_correlation.png differ diff --git a/docs/images/gallery/feature_importances.png b/docs/images/gallery/feature_importances.png new file mode 100644 index 000000000..80f958483 Binary files /dev/null and b/docs/images/gallery/feature_importances.png differ diff --git a/docs/images/gallery/freqdist.png b/docs/images/gallery/freqdist.png new file mode 100644 index 000000000..22f9f5eb5 Binary files /dev/null and b/docs/images/gallery/freqdist.png differ diff --git a/docs/images/gallery/icdm.png b/docs/images/gallery/icdm.png new file mode 100644 index 000000000..b1746433e Binary files /dev/null and b/docs/images/gallery/icdm.png differ diff --git a/docs/images/gallery/jointplot.png b/docs/images/gallery/jointplot.png new file mode 100644 index 000000000..572669956 Binary files /dev/null and b/docs/images/gallery/jointplot.png differ diff --git a/docs/images/gallery/learning_curve.png b/docs/images/gallery/learning_curve.png new file mode 100644 index 000000000..1d9ee09d5 Binary files /dev/null and b/docs/images/gallery/learning_curve.png differ diff --git a/docs/images/gallery/occupancy_tsne_manifold.png b/docs/images/gallery/occupancy_tsne_manifold.png new file mode 100644 index 000000000..a0e0a9c1e Binary files /dev/null and b/docs/images/gallery/occupancy_tsne_manifold.png differ diff --git a/docs/images/gallery/parallel_coordinates.png b/docs/images/gallery/parallel_coordinates.png new file mode 100644 index 000000000..16f1f0641 Binary files /dev/null and b/docs/images/gallery/parallel_coordinates.png differ diff --git a/docs/images/gallery/pca_projection_3d.png b/docs/images/gallery/pca_projection_3d.png new file mode 100644 index 000000000..fdadf4db9 Binary files /dev/null and 
b/docs/images/gallery/pca_projection_3d.png differ diff --git a/docs/images/gallery/postag.png b/docs/images/gallery/postag.png new file mode 100644 index 000000000..6f375fb13 Binary files /dev/null and b/docs/images/gallery/postag.png differ diff --git a/docs/images/gallery/precision_recall_binary.png b/docs/images/gallery/precision_recall_binary.png new file mode 100644 index 000000000..0cc48363c Binary files /dev/null and b/docs/images/gallery/precision_recall_binary.png differ diff --git a/docs/images/gallery/precision_recall_multiclass.png b/docs/images/gallery/precision_recall_multiclass.png new file mode 100644 index 000000000..b9e3835ec Binary files /dev/null and b/docs/images/gallery/precision_recall_multiclass.png differ diff --git a/docs/images/gallery/prediction_error.png b/docs/images/gallery/prediction_error.png new file mode 100644 index 000000000..a7e16c323 Binary files /dev/null and b/docs/images/gallery/prediction_error.png differ diff --git a/docs/images/gallery/radviz.png b/docs/images/gallery/radviz.png new file mode 100644 index 000000000..d158884f8 Binary files /dev/null and b/docs/images/gallery/radviz.png differ diff --git a/docs/images/gallery/rank1d_shapiro.png b/docs/images/gallery/rank1d_shapiro.png new file mode 100644 index 000000000..aae7a05dd Binary files /dev/null and b/docs/images/gallery/rank1d_shapiro.png differ diff --git a/docs/images/gallery/rank2d_covariance.png b/docs/images/gallery/rank2d_covariance.png new file mode 100644 index 000000000..e385c96f5 Binary files /dev/null and b/docs/images/gallery/rank2d_covariance.png differ diff --git a/docs/images/gallery/residuals.png b/docs/images/gallery/residuals.png new file mode 100644 index 000000000..3cb152a46 Binary files /dev/null and b/docs/images/gallery/residuals.png differ diff --git a/docs/images/gallery/rfecv_sklearn_example.png b/docs/images/gallery/rfecv_sklearn_example.png new file mode 100644 index 000000000..6e323f7d9 Binary files /dev/null and 
b/docs/images/gallery/rfecv_sklearn_example.png differ diff --git a/docs/images/gallery/rocauc_binary.png b/docs/images/gallery/rocauc_binary.png new file mode 100644 index 000000000..3328eb327 Binary files /dev/null and b/docs/images/gallery/rocauc_binary.png differ diff --git a/docs/images/gallery/rocauc_multiclass.png b/docs/images/gallery/rocauc_multiclass.png new file mode 100644 index 000000000..42434391d Binary files /dev/null and b/docs/images/gallery/rocauc_multiclass.png differ diff --git a/docs/images/gallery/scatter.png b/docs/images/gallery/scatter.png new file mode 100644 index 000000000..3751a780d Binary files /dev/null and b/docs/images/gallery/scatter.png differ diff --git a/docs/images/gallery/silhouette.png b/docs/images/gallery/silhouette.png new file mode 100644 index 000000000..8cc2f4b9c Binary files /dev/null and b/docs/images/gallery/silhouette.png differ diff --git a/docs/images/gallery/validation_curve.png b/docs/images/gallery/validation_curve.png new file mode 100644 index 000000000..7c4d1bf6b Binary files /dev/null and b/docs/images/gallery/validation_curve.png differ diff --git a/docs/images/matplotlib_anatomy.png b/docs/images/matplotlib/anatomy.png similarity index 100% rename from docs/images/matplotlib_anatomy.png rename to docs/images/matplotlib/anatomy.png diff --git a/docs/images/matplotlib_pbpython_example.png b/docs/images/matplotlib/pbpython_example.png similarity index 100% rename from docs/images/matplotlib_pbpython_example.png rename to docs/images/matplotlib/pbpython_example.png diff --git a/docs/images/matplotlib_single.png b/docs/images/matplotlib/single.png similarity index 100% rename from docs/images/matplotlib_single.png rename to docs/images/matplotlib/single.png diff --git a/docs/images/quickstart/bikeshare_ols_residuals.png b/docs/images/quickstart/bikeshare_ols_residuals.png deleted file mode 100644 index 05e6005f0..000000000 Binary files a/docs/images/quickstart/bikeshare_ols_residuals.png and /dev/null differ 
diff --git a/docs/images/quickstart/bikeshare_rank2d.png b/docs/images/quickstart/bikeshare_rank2d.png deleted file mode 100644 index 6f1c5a559..000000000 Binary files a/docs/images/quickstart/bikeshare_rank2d.png and /dev/null differ diff --git a/docs/images/quickstart/bikeshare_ridge_alphas.png b/docs/images/quickstart/bikeshare_ridge_alphas.png deleted file mode 100644 index d831e9e50..000000000 Binary files a/docs/images/quickstart/bikeshare_ridge_alphas.png and /dev/null differ diff --git a/docs/images/quickstart/bikeshare_ridge_prediction_error.png b/docs/images/quickstart/bikeshare_ridge_prediction_error.png deleted file mode 100644 index f012da42f..000000000 Binary files a/docs/images/quickstart/bikeshare_ridge_prediction_error.png and /dev/null differ diff --git a/docs/images/quickstart/temp_feelslike_jointplot.png b/docs/images/quickstart/temp_feelslike_jointplot.png deleted file mode 100644 index db70dbe8e..000000000 Binary files a/docs/images/quickstart/temp_feelslike_jointplot.png and /dev/null differ diff --git a/docs/images/readme/affiliates_ddl.png b/docs/images/readme/affiliates_ddl.png new file mode 100644 index 000000000..6dfc88b48 Binary files /dev/null and b/docs/images/readme/affiliates_ddl.png differ diff --git a/docs/images/readme/affiliates_numfocus.png b/docs/images/readme/affiliates_numfocus.png new file mode 100644 index 000000000..299e45f7e Binary files /dev/null and b/docs/images/readme/affiliates_numfocus.png differ diff --git a/docs/images/readme/banner.png b/docs/images/readme/banner.png new file mode 100644 index 000000000..916526abd Binary files /dev/null and b/docs/images/readme/banner.png differ diff --git a/docs/images/readme/readme_imgs.py b/docs/images/readme/readme_imgs.py new file mode 100755 index 000000000..c246c2049 --- /dev/null +++ b/docs/images/readme/readme_imgs.py @@ -0,0 +1,295 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +import numpy as np +import matplotlib.pyplot as plt + +from 
yellowbrick.regressor import AlphaSelection +from yellowbrick.regressor import ResidualsPlot +from yellowbrick.regressor import PredictionError +from yellowbrick.cluster import KElbowVisualizer +from yellowbrick.cluster import SilhouetteVisualizer +from yellowbrick.cluster import InterclusterDistance +from yellowbrick.classifier import PrecisionRecallCurve +from yellowbrick.classifier import ClassPredictionError +from yellowbrick.classifier import DiscriminationThreshold +from yellowbrick.datasets import load_spam, load_concrete, load_game + + +from sklearn.cluster import KMeans +from sklearn.datasets import make_blobs +from sklearn.naive_bayes import GaussianNB +from sklearn.preprocessing import OneHotEncoder +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import train_test_split +from sklearn.linear_model import LogisticRegression, Lasso, LassoCV, Ridge + + +# Figure size passed to plt.subplots for each one-row, three-column banner +FIGSIZE = (20, 4) + +# Normalized path of the directory one level above this script's directory +IMAGES = os.path.normpath(os.path.join(os.path.dirname(__file__), "..")) +# Default logo file read by yb_logo +YB_LOGO_PATH = os.path.join(IMAGES, "yb-fc.png") + + +def tts_plot(viz, X, y, test_size=0.20, random_state=42, score=True, finalize=True): + """ + Helper function to plot model visualizers with train_test_split. + + Splits X and y with the given test_size and random_state, fits viz on the + training split, calls viz.score on the test split when score is true, + calls viz.finalize() when finalize is true, and returns viz. + """ + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_size, random_state=random_state + ) + + viz.fit(X_train, y_train) + if score: + viz.score(X_test, y_test) + if finalize: + viz.finalize() + + return viz + + +def class_prediction_error(ax=None): + """ + Builds a ClassPredictionError wrapping GaussianNB on the data returned by + load_game; the features are one-hot encoded and converted to a dense array + before being handed to tts_plot. Returns the visualizer from tts_plot. + """ + data = load_game(return_dataset=True) + X, y = data.to_numpy() + + X = OneHotEncoder().fit_transform(X).toarray() + + viz = ClassPredictionError(GaussianNB(), ax=ax) + return tts_plot(viz, X, y) + + +def confusion_matrix(ax=None): + """ + NOTE: despite its name, this function builds a PrecisionRecallCurve (not a + confusion matrix) wrapping LogisticRegression on the data returned by + load_spam. Returns the visualizer from tts_plot. + """ + data = load_spam(return_dataset=True) + X, y = data.to_pandas() + + viz = PrecisionRecallCurve(LogisticRegression(), ax=ax) + return tts_plot(viz, X, y) + + +def discrimination_threshold(ax=None): + """ + Builds a DiscriminationThreshold wrapping RandomForestClassifier with + n_estimators=10 on the data returned by load_spam. tts_plot is called with + score=False, so the visualizer is fit and finalized but viz.score is not + called. Returns the visualizer from tts_plot. + """ + data = load_spam(return_dataset=True) + X, y = data.to_pandas() + + viz
= DiscriminationThreshold(RandomForestClassifier(n_estimators=10), ax=ax) + return tts_plot(viz, X, y, score=False) + + +def classification_visualizers(saveto=None): + """ + Draws class_prediction_error, confusion_matrix, and + discrimination_threshold side by side on a 1x3 grid of axes sized FIGSIZE. + The figure is saved to saveto when it is not None; otherwise plt.show() is + called. + """ + _, (axa, axb, axc) = plt.subplots(nrows=1, ncols=3, figsize=FIGSIZE) + + class_prediction_error(axa) + confusion_matrix(axb) + discrimination_threshold(axc) + + plt.tight_layout(pad=1.5) + + if saveto is not None: + plt.savefig(saveto) + else: + plt.show() + + +def residuals_plot(ax=None): + """ + Builds a ResidualsPlot wrapping Ridge on the data returned by + load_concrete. Returns the visualizer from tts_plot. + """ + data = load_concrete(return_dataset=True) + X, y = data.to_pandas() + + viz = ResidualsPlot(Ridge(), ax=ax) + return tts_plot(viz, X, y) + + +def prediction_error(ax=None): + """ + Builds a PredictionError wrapping Lasso on the data returned by + load_concrete. Returns the visualizer from tts_plot. + """ + data = load_concrete(return_dataset=True) + X, y = data.to_pandas() + + viz = PredictionError(Lasso(), ax=ax) + return tts_plot(viz, X, y) + + +def alpha_selection(ax=None): + """ + Builds an AlphaSelection wrapping LassoCV on the data returned by + load_concrete, with the candidate alphas given by + np.logspace(-10, 1, 400). Returns the visualizer from tts_plot. + """ + data = load_concrete(return_dataset=True) + X, y = data.to_pandas() + + alphas = np.logspace(-10, 1, 400) + viz = AlphaSelection(LassoCV(alphas=alphas), ax=ax) + return tts_plot(viz, X, y) + + +def regression_visualizers(saveto=None): + """ + Draws residuals_plot, prediction_error, and alpha_selection side by side + on a 1x3 grid of axes sized FIGSIZE. The figure is saved to saveto when it + is not None; otherwise plt.show() is called. + """ + _, (axa, axb, axc) = plt.subplots(nrows=1, ncols=3, figsize=FIGSIZE) + + residuals_plot(axa) + prediction_error(axb) + alpha_selection(axc) + + plt.tight_layout(pad=1.5) + + if saveto is not None: + plt.savefig(saveto) + else: + plt.show() + + +def intercluster_distance(ax=None): + """ + Fits and finalizes an InterclusterDistance wrapping KMeans(9) on make_blobs + data (12 centers, 1000 samples, 16 features) and returns the visualizer. + No random_state is passed to make_blobs or KMeans here. + """ + X, y = make_blobs(centers=12, n_samples=1000, n_features=16, shuffle=True) + + viz = InterclusterDistance(KMeans(9), ax=ax) + viz.fit(X) + viz.finalize() + + return viz + + +def k_elbow(ax=None): + """ + Fits and finalizes a KElbowVisualizer wrapping KMeans with k=(4, 12) and + locate_elbow=False on make_blobs data (12 centers, 1000 samples, 16 + features) and returns the visualizer. + """ + X, y = make_blobs(centers=12, n_samples=1000, n_features=16, shuffle=True) + + viz = KElbowVisualizer(KMeans(), k=(4, 12), ax=ax, locate_elbow=False) + viz.fit(X) + viz.finalize() + + return viz + + +def silhouette(ax=None): + """ + Fits and finalizes a SilhouetteVisualizer wrapping KMeans(9) on make_blobs + data (12 centers, 1000 samples, 16 features) and returns the visualizer. + """ + X, y = make_blobs(centers=12, n_samples=1000, n_features=16, shuffle=True) + + viz = SilhouetteVisualizer(KMeans(9), ax=ax) + viz.fit(X) + viz.finalize() + + return viz + + +def clustering_visualizers(saveto=None): + """ + Draws intercluster_distance, k_elbow, and silhouette side by side on a 1x3 + grid of axes sized FIGSIZE, removing the legend from the silhouette axes. + The figure is saved to saveto when it is not None; otherwise plt.show() is + called. + """ + _, (axa, axb, axc) =
plt.subplots(nrows=1, ncols=3, figsize=FIGSIZE) + + intercluster_distance(axa) + k_elbow(axb) + silhouette(axc).ax.get_legend().remove() + + plt.tight_layout(pad=1.5) + + if saveto is not None: + plt.savefig(saveto) + else: + plt.show() + + +def yb_logo(path=YB_LOGO_PATH, ax=None): + """ + Reads the YB image logo from the specified path and writes it to the axes. + + A new figure and axes are created when ax is None. The grid, ticks, and + all four spines are hidden, and the axes is returned. + """ + # Load image + with open(path, "rb") as fobj: + img = plt.imread(fobj, format="png") + + if ax is None: + _, ax = plt.subplots() + + # Draw image + ax.imshow(img, interpolation="nearest") + + # Remove spines, ticks, grid, and other marks + ax.grid(False) + ax.set_xticks([]) + ax.set_yticks([]) + for pos in ["right", "top", "bottom", "left"]: + ax.spines[pos].set_visible(False) + + return ax + + +def full_image(saveto=None, center_logo=False): + """ + Draws all of the visualizers on one 3x3 grid of axes (figsize 21x14): + classifiers on the top row, regressors in the middle, and clusterers on + the bottom. When center_logo is true the middle panel shows the YB logo + instead of the prediction error plot. The figure is saved to saveto when + it is not None; otherwise plt.show() is called. + """ + _, axes = plt.subplots(nrows=3, ncols=3, figsize=(21, 14)) + + # Top row: classifiers + class_prediction_error(axes[0][0]) + confusion_matrix(axes[0][1]) + discrimination_threshold(axes[0][2]) + + # Middle row: regressors + residuals_plot(axes[1][0]) + alpha_selection(axes[1][2]) + + if center_logo: + yb_logo(ax=axes[1][1]) + else: + prediction_error(axes[1][1]) + + # Bottom row: clusterers + intercluster_distance(axes[2][0]) + k_elbow(axes[2][1]) + silhouette(axes[2][2]).ax.get_legend().remove() + + plt.tight_layout(pad=1.5) + + if saveto is not None: + plt.savefig(saveto) + else: + plt.show() + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + description="generates images for the README.md banner" + ) + + parser.add_argument( + "-c", + "--classifiers", + type=str, + metavar="PATH", + default="classifiers.png", + help="path to save the classifiers banner image", + ) + + parser.add_argument( + "-r", + "--regressors", + type=str, + metavar="PATH", + default="regressors.png", + help="path to save the regressors banner image", + ) + + parser.add_argument( + "-C", + "--clusterers", + type=str, + metavar="PATH", + default="clusterers.png", +
help="path to save the clusterers banner image", + ) + + parser.add_argument( + "-b", + "--banner", + type=str, + metavar="PATH", + default="", + help="make full banner image and save to disk", + ) + + parser.add_argument( + "-y", + "--yb", + action="store_true", + help="replace middle image of banner with logo", + ) + + args = parser.parse_args() + + # A non-empty --banner path produces only the full 3x3 banner and then + # exits, so the three single-row banners below are skipped in that case. + if args.banner: + full_image(args.banner, args.yb) + sys.exit(0) + + # Each single-row banner is written only when its path option is non-empty + if args.classifiers: + classification_visualizers(args.classifiers) + + if args.regressors: + regression_visualizers(args.regressors) + + if args.clusterers: + clustering_visualizers(args.clusterers) diff --git a/docs/images/tutorial/modelselect_bagging_classifier.png b/docs/images/tutorial/modelselect_bagging_classifier.png index 8edcb5625..4d0a7a643 100644 Binary files a/docs/images/tutorial/modelselect_bagging_classifier.png and b/docs/images/tutorial/modelselect_bagging_classifier.png differ diff --git a/docs/images/tutorial/modelselect_extra_trees_classifier.png b/docs/images/tutorial/modelselect_extra_trees_classifier.png index 68e1405de..226e7cccb 100644 Binary files a/docs/images/tutorial/modelselect_extra_trees_classifier.png and b/docs/images/tutorial/modelselect_extra_trees_classifier.png differ diff --git a/docs/images/tutorial/modelselect_kneighbors_classifier.png b/docs/images/tutorial/modelselect_kneighbors_classifier.png index e978edc3c..4c8d0d57d 100644 Binary files a/docs/images/tutorial/modelselect_kneighbors_classifier.png and b/docs/images/tutorial/modelselect_kneighbors_classifier.png differ diff --git a/docs/images/tutorial/modelselect_linear_svc.png b/docs/images/tutorial/modelselect_linear_svc.png index 1fec34311..db98db518 100644 Binary files a/docs/images/tutorial/modelselect_linear_svc.png and b/docs/images/tutorial/modelselect_linear_svc.png differ diff --git a/docs/images/tutorial/modelselect_logistic_regression.png b/docs/images/tutorial/modelselect_logistic_regression.png index 0468dea79..665d0126e 100644 Binary files
a/docs/images/tutorial/modelselect_logistic_regression.png and b/docs/images/tutorial/modelselect_logistic_regression.png differ diff --git a/docs/images/tutorial/modelselect_logistic_regression_cv.png b/docs/images/tutorial/modelselect_logistic_regression_cv.png index ce541c63c..1900f7798 100644 Binary files a/docs/images/tutorial/modelselect_logistic_regression_cv.png and b/docs/images/tutorial/modelselect_logistic_regression_cv.png differ diff --git a/docs/images/tutorial/modelselect_nu_svc.png b/docs/images/tutorial/modelselect_nu_svc.png index 1e5feb1c9..ebc4cd533 100644 Binary files a/docs/images/tutorial/modelselect_nu_svc.png and b/docs/images/tutorial/modelselect_nu_svc.png differ diff --git a/docs/images/tutorial/modelselect_random_forest_classifier.png b/docs/images/tutorial/modelselect_random_forest_classifier.png index 5c49ab188..acce5fa7d 100644 Binary files a/docs/images/tutorial/modelselect_random_forest_classifier.png and b/docs/images/tutorial/modelselect_random_forest_classifier.png differ diff --git a/docs/images/tutorial/modelselect_sgd_classifier.png b/docs/images/tutorial/modelselect_sgd_classifier.png index f75a94f79..153c029b6 100644 Binary files a/docs/images/tutorial/modelselect_sgd_classifier.png and b/docs/images/tutorial/modelselect_sgd_classifier.png differ diff --git a/docs/images/tutorial/modelselect_svc.png b/docs/images/tutorial/modelselect_svc.png index 2d51721e5..16206c69f 100644 Binary files a/docs/images/tutorial/modelselect_svc.png and b/docs/images/tutorial/modelselect_svc.png differ diff --git a/docs/images/visualizers.png b/docs/images/visualizers.png deleted file mode 100644 index 342b6ba86..000000000 Binary files a/docs/images/visualizers.png and /dev/null differ diff --git a/docs/images/yb-bw.png b/docs/images/yb-bw.png new file mode 100644 index 000000000..ef5a5e158 Binary files /dev/null and b/docs/images/yb-bw.png differ diff --git a/docs/images/yb-fc.png b/docs/images/yb-fc.png new file mode 100644 index 
000000000..57d118252 Binary files /dev/null and b/docs/images/yb-fc.png differ diff --git a/docs/images/yb-lego.png b/docs/images/yb-lego.png new file mode 100644 index 000000000..fd3c36c50 Binary files /dev/null and b/docs/images/yb-lego.png differ diff --git a/docs/index.rst b/docs/index.rst index 8fb3ebee9..7ca743ad2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,16 +7,48 @@ Yellowbrick: Machine Learning Visualization =========================================== -.. image:: images/visualizers.png +.. image:: images/readme/banner.png -Yellowbrick is a suite of visual diagnostic tools called "Visualizers" that extend the Scikit-Learn API to allow human steering of the model selection process. In a nutshell, Yellowbrick combines scikit-learn with matplotlib in the best tradition of the scikit-learn documentation, but to produce visualizations for *your* models! For more on Yellowbrick, please see the :doc:`about`. +Yellowbrick extends the Scikit-Learn API to make model selection and hyperparameter tuning easier. Under the hood, it's using Matplotlib. -If you're new to Yellowbrick, checkout the :doc:`quickstart` or skip ahead to the :doc:`tutorial`. Yellowbrick is a rich library with many Visualizers being added on a regular basis. For details on specific Visualizers and extended usage head over to the :doc:`api/index`. Interested in contributing to Yellowbrick? Checkout the :ref:`contributing guide ` . If you've signed up to do user testing, head over to the :doc:`evaluation` (and thank you!). +Recommended Learning Path +------------------------- + +1. Check out the :doc:`quickstart`, try the :doc:`tutorial`, and check out the :doc:`oneliners`. + +2. Use Yellowbrick in your work, referencing the :doc:`api/index` for assistance with specific visualizers and detailed information on optional parameters and customization options. + +3. 
Star us on `GitHub `_ and follow us on `Twitter (@scikit_yb) `_ so that you'll hear about new visualizers as soon as they're added. + +Contributing +------------ + +Interested in contributing to Yellowbrick? Yellowbrick is a welcoming, inclusive project and we would love to have you. +We follow the `Python Software Foundation Code of Conduct `_. + +No matter your level of technical skill, you can be helpful. We appreciate bug reports, user testing, feature requests, bug fixes, product enhancements, and documentation improvements. + +Check out the :doc:`contributing/index` guide! + +If you've signed up to do user testing, head over to the :doc:`evaluation`. + +Please consider joining the `Google Groups Listserv `_ so you can respond to questions. + +Thank you for your contributions! + +Concepts & API +-------------- Visualizers ----------- +The primary goal of Yellowbrick is to create a sensical API similar to Scikit-Learn. + +Visualizers are the core objects in Yellowbrick. +They are similar to transformers in Scikit-Learn. +Visualizers can wrap a model estimator - similar to how the "ModelCV" (e.g. `RidgeCV `_, `LassoCV `_) methods work. + -Visualizers are estimators (objects that learn from data) whose primary objective is to create visualizations that allow insight into the model selection process. In Scikit-Learn terms, they can be similar to transformers when visualizing the data space or wrap an model estimator similar to how the "ModelCV" (e.g. `RidgeCV `_, `LassoCV `_) methods work. The primary goal of Yellowbrick is to create a sensical API similar to Scikit-Learn.
Some of our most popular visualizers include: +Some of our most popular visualizers include: Feature Visualization ~~~~~~~~~~~~~~~~~~~~~ @@ -26,8 +58,6 @@ Feature Visualization - :doc:`Radial Visualization `: separation of instances around a circular plot - :doc:`api/features/pca`: projection of instances based on principal components - :doc:`api/features/manifold`: high dimensional visualization with manifold learning -- :doc:`api/features/importances`: rank features by importance or linear coefficients for a specific model -- :doc:`api/features/rfecv`: find the best subset of features based on importance - :doc:`Joint Plots `: direct data visualization with feature selection Classification Visualization @@ -36,7 +66,7 @@ Classification Visualization - :doc:`api/classifier/class_prediction_error`: shows error and support in classification - :doc:`api/classifier/classification_report`: visual representation of precision, recall, and F1 - :doc:`ROC/AUC Curves `: receiver operator characteristics and area under the curve -- :doc:`api/classifier/prcurve`: precision vs recall for different probability thresholds +- :doc:`api/classifier/prcurve`: precision vs recall for different probability thresholds - :doc:`Confusion Matrices `: visual description of class decision making - :doc:`Discrimination Threshold `: find a threshold that best separates binary classes @@ -46,6 +76,7 @@ Regression Visualization - :doc:`api/regressor/peplot`: find model breakdowns along the domain of the target - :doc:`api/regressor/residuals`: show the difference in residuals of training and test data - :doc:`api/regressor/alphas`: show how the choice of alpha influences regularization +- :doc:`api/regressor/influence`: show the influence of instances on linear regression Clustering Visualization ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -59,6 +90,8 @@ Model Selection Visualization - :doc:`api/model_selection/validation_curve`: tune a model with respect to a single hyperparameter - 
:doc:`api/model_selection/learning_curve`: show if a model might benefit from more data or less complexity +- :doc:`api/model_selection/importances`: rank features by importance or linear coefficients for a specific model +- :doc:`api/model_selection/rfecv`: find the best subset of features based on importance Target Visualization ~~~~~~~~~~~~~~~~~~~~ @@ -73,20 +106,32 @@ Text Visualization - :doc:`Term Frequency `: visualize the frequency distribution of terms in the corpus - :doc:`api/text/tsne`: use stochastic neighbor embedding to project documents - :doc:`api/text/dispersion`: visualize how key terms are dispersed throughout a corpus +- :doc:`api/text/umap_vis`: plot similar documents closer together to discover clusters +- :doc:`api/text/postag`: plot the counts of different parts-of-speech throughout a tagged corpus -... and more! Visualizers are being added all the time; be sure to check the examples (or even the `develop branch `_) and feel free to contribute your ideas for new Visualizers! +... and more! Visualizers are being added all the time. Check the examples (or even the `develop branch `_). Feel free to contribute your ideas for new Visualizers! Getting Help ------------ -Yellowbrick is a welcoming, inclusive project in the tradition of matplotlib and scikit-learn. Similar to those projects, we follow the `Python Software Foundation Code of Conduct `_. Please don't hesitate to reach out to us for help or if you have any contributions or bugs to report! +Can't get something to work? Here are places you can find help. + +1. The docs (you're here!). +2. `Stack Overflow `_. If you ask a question, please tag it with "yellowbrick". +3. The Yellowbrick `Google Groups Listserv `_. +4. You can also Tweet or direct message us on Twitter `@scikit_yb `_. + + +Find a Bug? +----------- + +Check if there's already an open `issue `_ on the topic. If needed, file an `issue `_.
-The primary way to ask for help with Yellowbrick is to post on our `Google Groups Listserv `_. This is an email list/forum that members of the community can join and respond to each other; you should be able to receive the quickest response here. Please also consider joining the group so you can respond to questions! You can also ask questions on `Stack Overflow `_ and tag them with "yellowbrick". Or you can add issues on GitHub. You can also tweet or direct message us on Twitter `@scikit_yb `_. Open Source ----------- -The Yellowbrick `license `_ is an open source `Apache 2.0 `_ license. Yellowbrick enjoys a very active developer community; please consider joining them and `contributing `_! +The Yellowbrick `license `_ is an open source `Apache 2.0 `_ license. Yellowbrick enjoys a very active developer community; please consider :doc:`contributing/index`! Yellowbrick is hosted on `GitHub `_. The `issues `_ and `pull requests `_ are tracked there. @@ -94,23 +139,23 @@ Yellowbrick is hosted on `GitHub NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/matplotlib.rst b/docs/matplotlib.rst index 82484c081..552b486ea 100644 --- a/docs/matplotlib.rst +++ b/docs/matplotlib.rst @@ -5,14 +5,14 @@ Effective Matplotlib Yellowbrick generates visualizations by wrapping `matplotlib `_, the most prominent Python scientific visualization library. 
Because of this, Yellowbrick is able to generate publication-ready images for a variety of GUI backends, image formats, and Jupyter notebooks. Yellowbrick strives to provide well-styled visual diagnostic tools and complete information. However, to customize figures or roll your own visualizers, a strong background in using matplotlib is required. -With permission, we have included part of `Chris Moffitt's `_ `Effectively Using Matplotlib `_ as a crash course into Matplotlib terminology and usage. For a complete example, please visit his excellent post on creating a visual sales analysis! Additionally we recommend `Nicolas P. Rougier's Matplotlib tutorial `_ for an in-depth dive. +With permission, we have included part of `Chris Moffitt's `_ `Effectively Using Matplotlib `_ as a crash course into Matplotlib terminology and usage. For a complete example, please visit his excellent post on creating a visual sales analysis! This post was also adapted for matplotlib's documentation, `The Lifecycle of a Plot `_, which gives another good perspective. Additionally we recommend `Nicolas P. Rougier's Matplotlib tutorial `_ for an in-depth dive. Figures and Axes ---------------- This graphic from the `matplotlib faq is gold `_. Keep it handy to understand the different terminology of a plot. -.. image:: images/matplotlib_anatomy.png +.. image:: images/matplotlib/anatomy.png Most of the terms are straightforward but the main thing to remember is that the ``Figure`` is the final image that may contain 1 or more axes. The ``Axes`` represent an individual plot. Once you understand what these are and how to access them through the object oriented API, the rest of the process starts to fall into place. @@ -64,7 +64,7 @@ Therefore you have complete control of the style and customization of a Yellowbr Creating a Custom Plot ---------------------- -.. image:: images/matplotlib_pbpython_example.png +..
image:: images/matplotlib/pbpython_example.png The first step with any visualization is to plot the data. Often the simplest way to do this is using the standard pandas plotting function (given a ``DataFrame`` called ``top_10``): @@ -173,7 +173,7 @@ For this example, we’ll draw a line showing an average and include labels show # Hide the legend ax.legend().set_visible(False) -.. image:: images/matplotlib_single.png +.. image:: images/matplotlib/single.png While this may not be the most exciting plot it does show how much power you have when following this approach. diff --git a/docs/oneliners.rst b/docs/oneliners.rst new file mode 100644 index 000000000..274978d89 --- /dev/null +++ b/docs/oneliners.rst @@ -0,0 +1,347 @@ +.. -*- mode: rst -*- + +Oneliners +========= + +Yellowbrick's quick methods are visualizers in a single line of code! + +Yellowbrick is designed to give you as much control as you would like over the plots you create, offering parameters to help you customize everything from color, size, and title to preferred evaluation or correlation measure, optional bestfit lines or histograms, and cross validation techniques. To learn more about how to customize your visualizations using those parameters, check out the :doc:`api/index`. + +But... sometimes you just want to build a plot with a single line of code! + +On this page we'll explore the Yellowbrick quick methods (aka "oneliners"), which return a fully fitted, finalized visualizer object in only a single line. + +.. note:: This page illustrates oneliners for some of our most popular visualizers for feature analysis, classification, regression, clustering, and target evaluation, but is not a comprehensive list. Nearly every Yellowbrick visualizer has an associated quick method! + +Feature Analysis +---------------- + +Rank2D +~~~~~~ + +The ``rank1d`` and ``rank2d`` plots show pairwise rankings of features to help you detect relationships. More about :doc:`api/features/rankd`. + +.. 
plot:: + :context: close-figs + :alt: Rank2D Quick Method + + from yellowbrick.features import rank2d + from yellowbrick.datasets import load_credit + + + X, _ = load_credit() + visualizer = rank2d(X) + +.. plot:: + :context: close-figs + :alt: Rank1D Quick Method + + from yellowbrick.features import rank1d + from yellowbrick.datasets import load_energy + + + X, _ = load_energy() + visualizer = rank1d(X, color="r") + + +Parallel Coordinates +~~~~~~~~~~~~~~~~~~~~ + +The ``parallel_coordinates`` plot is a horizontal visualization of instances, disaggregated by the features that describe them. More about :doc:`api/features/pcoords`. + +.. plot:: + :context: close-figs + :alt: Parallel Coordinates Quick Method + + from sklearn.datasets import load_wine + from yellowbrick.features import parallel_coordinates + + + X, y = load_wine(return_X_y=True) + visualizer = parallel_coordinates(X, y, normalize="standard") + + +Radial Visualization +~~~~~~~~~~~~~~~~~~~~ + +The ``radviz`` plot shows the separation of instances around a unit circle. More about :doc:`api/features/radviz`. + +.. plot:: + :context: close-figs + :alt: Radviz Quick Method + + from yellowbrick.features import radviz + from yellowbrick.datasets import load_occupancy + + + X, y = load_occupancy() + visualizer = radviz(X, y, colors=["maroon", "gold"]) + + +PCA +~~~ + +A ``pca_decomposition`` is a projection of instances based on principal components. More about :doc:`api/features/pca`. + +.. plot:: + :context: close-figs + :alt: PCA Quick Method + + from yellowbrick.datasets import load_spam + from yellowbrick.features import pca_decomposition + + + X, y = load_spam() + visualizer = pca_decomposition(X, y) + + +Manifold +~~~~~~~~ + +The ``manifold_embedding`` plot is a high dimensional visualization with manifold learning, which can show nonlinear relationships in the features. More about :doc:`api/features/manifold`. + +.. 
plot:: + :context: close-figs + :alt: Manifold Quick Method + + from sklearn.datasets import load_iris + from yellowbrick.features import manifold_embedding + + + X, y = load_iris(return_X_y=True) + visualizer = manifold_embedding(X, y) + + +Classification +-------------- + +Class Prediction Error +~~~~~~~~~~~~~~~~~~~~~~ + +A ``class_prediction_error`` plot illustrates the error and support in a classification as a bar chart. More about :doc:`api/classifier/class_prediction_error`. + +.. plot:: + :context: close-figs + :alt: Class Prediction Error Quick Method + + from yellowbrick.datasets import load_game + from sklearn.preprocessing import OneHotEncoder + from sklearn.ensemble import RandomForestClassifier + from yellowbrick.classifier import class_prediction_error + + + X, y = load_game() + X = OneHotEncoder().fit_transform(X) + visualizer = class_prediction_error( + RandomForestClassifier(n_estimators=10), X, y + ) + + +Classification Report +~~~~~~~~~~~~~~~~~~~~~ + +A ``classification_report`` is a visual representation of precision, recall, and F1 score. More about :doc:`api/classifier/classification_report`. + +.. plot:: + :context: close-figs + :alt: Classification Report Quick Method + + from yellowbrick.datasets import load_credit + from sklearn.ensemble import RandomForestClassifier + from yellowbrick.classifier import classification_report + + + X, y = load_credit() + visualizer = classification_report( + RandomForestClassifier(n_estimators=10), X, y + ) + + +Confusion Matrix +~~~~~~~~~~~~~~~~ + +A ``confusion_matrix`` is a visual description of per-class decision making. More about :doc:`api/classifier/confusion_matrix`. + +.. 
plot:: + :context: close-figs + :alt: Confusion Matrix Quick Method + + from yellowbrick.datasets import load_game + from sklearn.preprocessing import OneHotEncoder + from sklearn.linear_model import RidgeClassifier + from yellowbrick.classifier import confusion_matrix + + + X, y = load_game() + X = OneHotEncoder().fit_transform(X) + visualizer = confusion_matrix(RidgeClassifier(), X, y, cmap="Greens") + + +Precision Recall +~~~~~~~~~~~~~~~~ + +A ``precision_recall_curve`` shows the tradeoff between precision and recall for different probability thresholds. More about :doc:`api/classifier/prcurve`. + +.. plot:: + :context: close-figs + :alt: Precision Recall Quick Method + + from sklearn.naive_bayes import GaussianNB + from yellowbrick.datasets import load_occupancy + from yellowbrick.classifier import precision_recall_curve + + + X, y = load_occupancy() + visualizer = precision_recall_curve(GaussianNB(), X, y) + + +ROCAUC +~~~~~~ + +A ``roc_auc`` plot shows the receiver operator characteristics and area under the curve. More about :doc:`api/classifier/rocauc`. + +.. plot:: + :context: close-figs + :alt: ROCAUC Quick Method + + from yellowbrick.classifier import roc_auc + from yellowbrick.datasets import load_spam + from sklearn.linear_model import LogisticRegression + + + X, y = load_spam() + visualizer = roc_auc(LogisticRegression(), X, y) + + +Discrimination Threshold +~~~~~~~~~~~~~~~~~~~~~~~~ + +A ``discrimination_threshold`` plot can help find a threshold that best separates binary classes. More about :doc:`api/classifier/threshold`. + +.. 
plot:: + :context: close-figs + :alt: Discrimination Threshold Quick Method + + from yellowbrick.classifier import discrimination_threshold + from sklearn.linear_model import LogisticRegression + from yellowbrick.datasets import load_spam + + X, y = load_spam() + visualizer = discrimination_threshold( + LogisticRegression(multi_class="auto", solver="liblinear"), X, y + ) + + +Regression +---------- + +Residuals Plot +~~~~~~~~~~~~~~ + +A ``residuals_plot`` shows the difference in residuals between the training and test data. More about :doc:`api/regressor/residuals`. + +.. plot:: + :context: close-figs + :alt: Residuals Quick Method + + from sklearn.linear_model import Ridge + from yellowbrick.datasets import load_concrete + from yellowbrick.regressor import residuals_plot + + + X, y = load_concrete() + visualizer = residuals_plot( + Ridge(), X, y, train_color="maroon", test_color="gold" + ) + +Prediction Error +~~~~~~~~~~~~~~~~ + +A ``prediction_error`` helps find where the regression is making the most errors. More about :doc:`api/regressor/peplot`. + +.. plot:: + :context: close-figs + :alt: Prediction Error Quick Method + + from sklearn.linear_model import Lasso + from yellowbrick.datasets import load_bikeshare + from yellowbrick.regressor import prediction_error + + + X, y = load_bikeshare() + visualizer = prediction_error(Lasso(), X, y) + + +Cooks Distance +~~~~~~~~~~~~~~ + +A ``cooks_distance`` plot shows the influence of instances on linear regression. More about :doc:`api/regressor/influence`. + +.. plot:: + :context: close-figs + :alt: Cooks Distance Quick Method + + from sklearn.datasets import load_diabetes + from yellowbrick.regressor import cooks_distance + + + X, y = load_diabetes(return_X_y=True) + visualizer = cooks_distance(X, y) + + +Clustering +---------- + +Silhouette Scores +~~~~~~~~~~~~~~~~~ + +A ``silhouette_visualizer`` can help you select ``k`` by visualizing silhouette coefficient values. More about :doc:`api/cluster/silhouette`. + +.. 
plot:: + :context: close-figs + :alt: Silhouette Scores Quick Method + + from sklearn.cluster import KMeans + from yellowbrick.datasets import load_nfl + from yellowbrick.cluster import silhouette_visualizer + + X, y = load_nfl() + visualizer = silhouette_visualizer(KMeans(5, random_state=42), X) + + +Intercluster Distance +~~~~~~~~~~~~~~~~~~~~~ + +An ``intercluster_distance`` shows size and relative distance between clusters. More about :doc:`api/cluster/icdm`. + +.. plot:: + :context: close-figs + :alt: ICDM Quick Method + + from yellowbrick.datasets import load_nfl + from sklearn.cluster import MiniBatchKMeans + from yellowbrick.cluster import intercluster_distance + + + X, y = load_nfl() + visualizer = intercluster_distance(MiniBatchKMeans(5, random_state=777), X) + + +Target Analysis +--------------- + +ClassBalance +~~~~~~~~~~~~ + +The ``class_balance`` plot can make it easier to see how the distribution of classes may affect the model. More about :doc:`api/target/class_balance`. + +.. plot:: + :context: close-figs + :alt: ClassBalance Quick Method + + from yellowbrick.datasets import load_game + from yellowbrick.target import class_balance + + + X, y = load_game() + visualizer = class_balance(y, labels=["draw", "loss", "win"]) diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 957e95c6c..96553f382 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -5,16 +5,16 @@ Quick Start If you're new to Yellowbrick, this guide will get you started and help you include visualizers in your machine learning workflow. Before we begin, however, there are several notes about development environments that you should consider. -Yellowbrick has two primary dependencies: `scikit-learn `_ and `matplotlib `_. If you do not have these Python packages, they will be installed alongside Yellowbrick. Note that Yellowbrick works best with scikit-learn version 0.18 or later and matplotlib version 2.0 or later.
Both of these packages require some C code to be compiled, which can be difficult on some systems, like Windows. If you're having trouble, try using a distribution of Python that includes these packages like `Anaconda `_. +Yellowbrick has two primary dependencies: `scikit-learn `_ and `matplotlib `_. If you do not have these Python packages, they will be installed alongside Yellowbrick. Note that Yellowbrick works best with scikit-learn version 0.20 or later and matplotlib version 3.0.1 or later. Both of these packages require some C code to be compiled, which can be difficult on some systems, like Windows. If you're having trouble, try using a distribution of Python that includes these packages like `Anaconda `_. Yellowbrick is also commonly used inside of a `Jupyter Notebook `_ alongside `Pandas `_ data frames. Notebooks make it especially easy to coordinate code and visualizations; however, you can also use Yellowbrick inside of regular Python scripts, either saving figures to disk or showing figures in a GUI window. If you're having trouble with this, please consult matplotlib's `backends documentation `_. -.. NOTE:: Jupyter, Pandas, and other ancillary libraries like NLTK for text visualizers are not installed with Yellowbrick and must be installed separately. +.. NOTE:: Jupyter, Pandas, and other ancillary libraries like the Natural Language Toolkit (NLTK) for text visualizers are not installed with Yellowbrick and must be installed separately. Installation ------------ -Yellowbrick is compatible with Python 2.7 or later, but it is preferred to use Python 3.5 or later to take full advantage of all functionality. The simplest way to install Yellowbrick is from PyPI_ with pip_, Python's preferred package installer. +Yellowbrick is a Python 3 package and works well with 3.4 or later. The simplest way to install Yellowbrick is from PyPI_ with pip_, Python's preferred package installer. .. 
code-block:: bash @@ -23,7 +23,7 @@ Yellowbrick is compatible with Python 2.7 or later, but it is preferred to use P .. _PyPI: https://pypi.python.org/pypi/yellowbrick .. _pip: https://docs.python.org/3/installing/ -Note that Yellowbrick is an active project and routinely publishes new releases with more visualizers and updates. In order to upgrade Yellowbrick to the latest version, use pip as follows. +Note that Yellowbrick is an active project and routinely publishes new releases with more visualizers and updates. In order to upgrade Yellowbrick to the latest version, use ``pip`` as follows. .. code-block:: bash @@ -37,12 +37,13 @@ If you're using Anaconda, you can take advantage of the `conda `_ installing matplotlib on Linux with Anaconda. If you're having trouble please let us know on GitHub. +If you're having trouble with installation, please let us know on GitHub. -Once installed, you should be able to import Yellowbrick without an error, both in Python and inside of Jupyter notebooks. Note that because of matplotlib, Yellowbrick does not work inside of a virtual environment without jumping through some hoops. +Once installed, you should be able to import Yellowbrick without an error, both in Python and inside of Jupyter notebooks. Note that because of matplotlib, Yellowbrick does not work inside of a virtual environment on macOS without jumping through some hoops. Using Yellowbrick ----------------- + The Yellowbrick API is specifically designed to play nicely with scikit-learn. The primary interface is therefore a ``Visualizer`` -- an object that learns from data to produce a visualization. Visualizers are scikit-learn `Estimator `_ objects and have a similar interface along with methods for drawing. 
In order to use visualizers, you simply use the same workflow as with a scikit-learn model, import the visualizer, instantiate it, call the visualizer's ``fit()`` method, then in order to render the visualization, call the visualizer's ``poof()`` method, which does the magic! For example, there are several visualizers that act as transformers, used to perform feature analysis prior to fitting a model. The following example visualizes a high-dimensional data set with parallel coordinates: @@ -63,9 +64,14 @@ The ``poof()`` method finalizes the drawing (adding titles, axes labels, etc) an visualizer.poof(outpath="pcoords.png") -The extension of the filename will determine how the image is rendered. In addition to the .png extension, .pdf is also commonly used. +The extension of the filename will determine how the image is rendered. In addition to the ``.png`` extension, ``.pdf`` is also commonly used for high-quality publication ready images. + +.. NOTE:: -.. NOTE:: Data input to Yellowbrick is identical to that of scikit-learn: a dataset, ``X``, which is a two-dimensional matrix of shape ``(n,m)`` where ``n`` is the number of instances (rows) and ``m`` is the number of features (columns). The dataset ``X`` can be a Pandas DataFrame, a NumPy array, or even a Python list of lists. Optionally, a vector ``y``, which represents the target variable (in supervised learning), can also be supplied as input. The target ``y`` must have length ``n`` -- the same number of elements as rows in ``X`` and can be a Pandas Series, NumPy array, or Python list. + Data input to Yellowbrick is identical to that of scikit-learn. Datasets are + usually described with a variable ``X`` (sometimes referred to simply as data) and an optional variable ``y`` (usually referred to as the target). The required data ``X`` is a table that contains instances (or samples) which are described by features. 
``X`` is therefore a *two-dimensional matrix* with a shape of ``(n, m)`` where ``n`` is the number of instances (rows) and ``m`` is the number of features (columns). ``X`` can be a Pandas DataFrame, a NumPy array, or even a Python list of lists. + + The optional target data, ``y``, is used to specify the ground truth in supervised machine learning. ``y`` is a vector (a one-dimensional array) that must have length ``n`` -- the same number of elements as rows in ``X``. ``y`` can be a Pandas Series, a NumPy array, or a Python list. Visualizers can also wrap scikit-learn models for evaluation, hyperparameter tuning and algorithm selection. For example, to produce a visual heatmap of a classification report, displaying the precision, recall, F1 score, and support for each class in a classifier, wrap the estimator in a visualizer as follows: @@ -105,26 +111,37 @@ These quick functions give you slightly less control over the machine learning w Walkthrough ----------- -Consider a regression analysis as a simple example of the use of visualizers in the machine learning workflow. Using a `bike sharing dataset `_ based upon the one uploaded to the `UCI Machine Learning Repository `_, we would like to predict the number of bikes rented in a given hour based on features like the season, weather, or if it's a holiday. +Let's consider a regression analysis as a simple example of the use of visualizers in the machine learning workflow. Using a bike sharing dataset based upon the one uploaded to the `UCI Machine Learning Repository `_, we would like to predict the number of bikes rented in a given hour based on features like the season, weather, or if it's a holiday. -.. note:: We have updated the dataset from the UCI ML repository to make it a bit easier to load into Pandas; make sure you download the `Yellowbrick version of the dataset `_. +..
NOTE:: We have updated the dataset from the UCI ML repository to make it a bit easier to load into Pandas; make sure you download the Yellowbrick version of the dataset using the ``load_bikeshare`` method below. Please also note that Pandas is required to follow the supplied code. Pandas can be installed using ``pip install pandas`` if you haven't already installed it. -After downloading the dataset and unzipping it in your current working directory, we can load our data as follows: +We can load our data using the ``yellowbrick.datasets`` module as follows: .. code-block:: python import pandas as pd + from yellowbrick.datasets import load_bikeshare + + X, y = load_bikeshare() + print(X.head()) - data = pd.read_csv('bikeshare.csv') - X = data[[ - "season", "month", "hour", "holiday", "weekday", "workingday", - "weather", "temp", "feelslike", "humidity", "windspeed" - ]] - y = data["riders"] +This prints out the first couple lines of our dataset which looks like:: -The machine learning workflow is the art of creating *model selection triples*, a combination of features, algorithm, and hyperparameters that uniquely identifies a model fitted on a specific data set. As part of our feature selection, we want to identify features that have a linear relationship with each other, potentially introducing covariance into our model and breaking OLS (guiding us toward removing features or using regularization). We can use the Rank2D_ visualizer to compute Pearson correlations between all pairs of features as follows: + season year month hour holiday weekday workingday weather temp \ + 0 1 0 1 0 0 6 0 1 0.24 + 1 1 0 1 1 0 6 0 1 0.22 + 2 1 0 1 2 0 6 0 1 0.22 + 3 1 0 1 3 0 6 0 1 0.24 + 4 1 0 1 4 0 6 0 1 0.24 -.. 
_Rank2D: http://www.scikit-yb.org/en/latest/api/yellowbrick.features.html#module-yellowbrick.features.rankd + feelslike humidity windspeed + 0 0.2879 0.81 0.0 + 1 0.2727 0.80 0.0 + 2 0.2727 0.80 0.0 + 3 0.2879 0.75 0.0 + 4 0.2879 0.75 0.0 + +The machine learning workflow is the art of creating *model selection triples*, a combination of features, algorithm, and hyperparameters that uniquely identifies a model fitted on a specific data set. As part of our feature selection, we want to identify features that have a linear relationship with each other, potentially introducing covariance into our model and breaking OLS (guiding us toward removing features or using regularization). We can use the :doc:`api/features/rankd` visualizer to compute Pearson correlations between all pairs of features as follows: .. code-block:: python @@ -134,21 +151,45 @@ The machine learning workflow is the art of creating *model selection triples*, visualizer.fit_transform(X) visualizer.poof() -.. image:: images/quickstart/bikeshare_rank2d.png +.. plot:: + :include-source: False + :context: close-figs + :alt: Rank2D of Bikeshare Features + + import matplotlib.pyplot as plt + from yellowbrick.features import Rank2D + from yellowbrick.datasets import load_bikeshare + + X, y = load_bikeshare() + visualizer = Rank2D(algorithm="pearson") + visualizer.fit_transform(X) + visualizer.poof() + plt.tight_layout() This figure shows us the Pearson correlation between pairs of features such that each cell in the grid represents two features identified in order on the x and y axes and whose color displays the magnitude of the correlation. A Pearson correlation of 1.0 means that there is a strong positive, linear relationship between the pairs of variables and a value of -1.0 indicates a strong negative, linear relationship (a value of zero indicates no relationship). Therefore we are looking for dark red and dark blue boxes to identify further. 
-In this chart, we see that the features ``temp`` and ``feelslike`` have a strong correlation and also that the feature ``season`` has a strong correlation with the feature ``month``. This seems to make sense; the apparent temperature we feel outside depends on the actual temperature and other airquality factors, and the season of the year is described by the month! To dive in deeper, we can use the `JointPlotVisualizer `_ to inspect those relationships. +In this chart, we see that the features ``temp`` and ``feelslike`` have a strong correlation and also that the feature ``season`` has a strong correlation with the feature ``month``. This seems to make sense; the apparent temperature we feel outside depends on the actual temperature and other air quality factors, and the season of the year is described by the month! To dive in deeper, we can use the :doc:`api/features/jointplot` (``JointPlotVisualizer``) to inspect those relationships. .. code-block:: python from yellowbrick.features import JointPlotVisualizer - visualizer = JointPlotVisualizer(feature='temp', target='feelslike') - visualizer.fit(X['temp'], X['feelslike']) + visualizer = JointPlotVisualizer(columns=['temp', 'feelslike']) + visualizer.fit_transform(X, y) visualizer.poof() -.. image:: images/quickstart/temp_feelslike_jointplot.png +.. plot:: + :include-source: False + :context: close-figs + :alt: JointPlot of temp vs feelslike + + from yellowbrick.features import JointPlotVisualizer + from yellowbrick.datasets import load_bikeshare + + X, y = load_bikeshare() + visualizer = JointPlotVisualizer(columns=['temp', 'feelslike']) + visualizer.fit_transform(X, y) + visualizer.poof() This visualizer plots a scatter diagram of the apparent temperature on the y axis and the actual measured temperature on the x axis and draws a line of best fit using a simple linear regression. Additionally, univariate distributions are shown as histograms above the x axis for temp and next to the y axis for feelslike.
The ``JointPlotVisualizer`` gives an at-a-glance view of the very strong positive correlation of the features, as well as the range and distribution of each feature. Note that the axes are normalized to the space between zero and one, a common technique in machine learning to reduce the impact of one feature over another. @@ -176,9 +217,29 @@ At this point, we can train our model; let's fit a linear regression to our mode visualizer.score(X_test, y_test) visualizer.poof() -.. image:: images/quickstart/bikeshare_ols_residuals.png +.. plot:: + :include-source: False + :context: close-figs + :alt: ResidualsPlot of a simple LinearRegression -The residuals plot shows the error against the predicted value (the number of riders), and allows us to look for heteroskedasticity in the model; e.g. regions in the target where the error is greatest. The shape of the residuals can strongly inform us where OLS (ordinary least squares) is being most strongly affected by the components of our model (the features). In this case, we can see that the lower predicted number of riders results in lower model error, and conversely that the the higher predicted number of riders results in higher model error. This indicates that our model has more noise in certain regions of the target or that two variables are colinear, meaning that they are injecting error as the noise in their relationship changes. 
+ from yellowbrick.datasets import load_bikeshare + from yellowbrick.regressor import ResidualsPlot + from sklearn.linear_model import LinearRegression + from sklearn.model_selection import train_test_split + + X, y = load_bikeshare() + + # Create training and test sets + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.1 + ) + + visualizer = ResidualsPlot(LinearRegression()) + visualizer.fit(X_train, y_train) + visualizer.score(X_test, y_test) + visualizer.poof() + +The residuals plot shows the error against the predicted value (the number of riders), and allows us to look for heteroskedasticity in the model; e.g. regions in the target where the error is greatest. The shape of the residuals can strongly inform us where OLS (ordinary least squares) is being most strongly affected by the components of our model (the features). In this case, we can see that the lower predicted number of riders results in lower model error, and conversely that the higher predicted number of riders results in higher model error. This indicates that our model has more noise in certain regions of the target or that two variables are colinear, meaning that they are injecting error as the noise in their relationship changes. The residuals plot also shows how the model is injecting error, the bold horizontal line at ``residuals = 0`` is no error, and any point above or below that line indicates the magnitude of error. For example, most of the residuals are negative, and since the score is computed as ``actual - expected``, this means that the expected value is bigger than the actual value most of the time; e.g. that our model is primarily guessing more than the actual number of riders. Moreover, there is a very interesting boundary along the top right of the residuals graph, indicating an interesting effect in model space; possibly that some feature is strongly weighted in the region of that model. 
@@ -198,7 +259,22 @@ Along with generating the residuals plot, we also measured the performance by "s visualizer.fit(X, y) visualizer.poof() -.. image:: images/quickstart/bikeshare_ridge_alphas.png +.. plot:: + :include-source: False + :context: close-figs + :alt: AlphaSelection for L2 Regularization using RidgeCV + + import numpy as np + from yellowbrick.datasets import load_bikeshare + from sklearn.linear_model import RidgeCV + from yellowbrick.regressor import AlphaSelection + + X, y = load_bikeshare() + + alphas = np.logspace(-10, 1, 200) + visualizer = AlphaSelection(RidgeCV(alphas=alphas)) + visualizer.fit(X, y) + visualizer.poof() When exploring model families, the primary thing to consider is how the model becomes more *complex*. As the model increases in complexity, the error due to variance increases because the model is becoming more overfit and cannot generalize to unseen data. However, the simpler the model is the more error there is likely to be due to bias; the model is underfit and therefore misses its target more frequently. The goal therefore of most machine learning is to create a model that is *just complex enough*, finding a middle ground between bias and variance. @@ -218,8 +294,27 @@ We can now train our final model and visualize it with the ``PredictionError`` v visualizer.score(X_test, y_test) visualizer.poof() -.. image:: images/quickstart/bikeshare_ridge_prediction_error.png +.. 
plot:: + :include-source: False + :context: close-figs + :alt: PredictionError for L2 Regularization using Ridge + + from yellowbrick.datasets import load_bikeshare + from sklearn.linear_model import Ridge + from yellowbrick.regressor import PredictionError + from sklearn.model_selection import train_test_split + + X, y = load_bikeshare() + # Create training and test sets + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.1 + ) + + visualizer = PredictionError(Ridge(alpha=3.181)) + visualizer.fit(X_train, y_train) + visualizer.score(X_test, y_test) + visualizer.poof() The prediction error visualizer plots the actual (measured) vs. expected (predicted) values against each other. The dotted black line is the 45 degree line that indicates zero error. Like the residuals plot, this allows us to see where error is occurring and in what magnitude. diff --git a/docs/requirements.txt b/docs/requirements.txt index bc5a0d332..0ac9e363c 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,11 +1,11 @@ # Library Dependencies -matplotlib>=1.5.1,!=3.0.0 -scipy>=0.19 -scikit-learn>=0.19 +matplotlib>=3.0.2,!=3.1.1 +scipy>=1.0.0 +scikit-learn>=0.20 numpy>=1.13.0 cycler>=0.10.0 # Documentation Dependencies -Sphinx>=1.7.5 -sphinx-rtd-theme>=0.4.0 -numpydoc>=0.8.0 +Sphinx>=2.1 +sphinx-rtd-theme>=0.4.3 +numpydoc>=0.9.0 diff --git a/docs/teaching.rst b/docs/teaching.rst index a9cd2063d..b59eabf44 100644 --- a/docs/teaching.rst +++ b/docs/teaching.rst @@ -18,7 +18,7 @@ The following slide deck presents an approach to teaching students about the mac .. raw:: html - + Teachers are welcome to `download the slides `_ via SlideShare as a PowerPoint deck, and to add them to their course materials to assist in teaching these important concepts. 
diff --git a/docs/tutorial.py b/docs/tutorial.py index b49f67962..b1f98544a 100644 --- a/docs/tutorial.py +++ b/docs/tutorial.py @@ -1,111 +1,89 @@ #!/usr/bin/env python # Generate the classification report images for the tutorial -import os -import pandas as pd import matplotlib.pyplot as plt -from yellowbrick.classifier import ClassificationReport - from sklearn.pipeline import Pipeline -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.preprocessing import LabelEncoder, OneHotEncoder - from sklearn.svm import LinearSVC, NuSVC, SVC from sklearn.neighbors import KNeighborsClassifier +from sklearn.preprocessing import OneHotEncoder, LabelEncoder from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier -from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier - - -DATA = os.path.join( - os.path.dirname(__file__), "..", "examples", "data", "mushroom", "mushroom.csv" +from sklearn.ensemble import ( + BaggingClassifier, + ExtraTreesClassifier, + RandomForestClassifier, ) +from yellowbrick.datasets import load_mushroom +from yellowbrick.classifier import ClassificationReport + ESTIMATORS = { - LinearSVC: "images/tutorial/modelselect_linear_svc.png", - NuSVC: "images/tutorial/modelselect_nu_svc.png", - SVC: "images/tutorial/modelselect_svc.png", - SGDClassifier: "images/tutorial/modelselect_sgd_classifier.png", - KNeighborsClassifier: "images/tutorial/modelselect_kneighbors_classifier.png", - LogisticRegressionCV: "images/tutorial/modelselect_logistic_regression_cv.png", - LogisticRegression: "images/tutorial/modelselect_logistic_regression.png", - BaggingClassifier: "images/tutorial/modelselect_bagging_classifier.png", - ExtraTreesClassifier: "images/tutorial/modelselect_extra_trees_classifier.png", - RandomForestClassifier: "images/tutorial/modelselect_random_forest_classifier.png", + "SVC": {"model": SVC(gamma="auto"), "path": "images/tutorial/modelselect_svc.png"}, + "NuSVC": { + 
"model": NuSVC(gamma="auto"), + "path": "images/tutorial/modelselect_nu_svc.png", + }, + "LinearSVC": { + "model": LinearSVC(), + "path": "images/tutorial/modelselect_linear_svc.png", + }, + "SGD": { + "model": SGDClassifier(max_iter=100, tol=1e-3), + "path": "images/tutorial/modelselect_sgd_classifier.png", + }, + "KNN": { + "model": KNeighborsClassifier(), + "path": "images/tutorial/modelselect_kneighbors_classifier.png", + }, + "LR": { + "model": LogisticRegression(solver="lbfgs"), + "path": "images/tutorial/modelselect_logistic_regression.png", + }, + "LRCV": { + "model": LogisticRegressionCV(cv=3), + "path": "images/tutorial/modelselect_logistic_regression_cv.png", + }, + "Bags": { + "model": BaggingClassifier(), + "path": "images/tutorial/modelselect_bagging_classifier.png", + }, + "XTrees": { + "model": ExtraTreesClassifier(n_estimators=100), + "path": "images/tutorial/modelselect_extra_trees_classifier.png", + }, + "RF": { + "model": RandomForestClassifier(n_estimators=100), + "path": "images/tutorial/modelselect_random_forest_classifier.png", + }, } - -class EncodeCategorical(BaseEstimator, TransformerMixin): - """ - Encodes a specified list of columns or all columns if None. - """ - - def __init__(self, columns=None): - self.columns = [col for col in columns] - self.encoders = None - - def fit(self, data, target=None): - """ - Expects a data frame with named columns to encode. - """ - # Encode all columns if columns is None - if self.columns is None: - self.columns = data.columns - - # Fit a label encoder for each column in the data frame - self.encoders = { - column: LabelEncoder().fit(data[column]) - for column in self.columns - } - return self - - def transform(self, data): - """ - Uses the encoders to transform a data frame. 
- """ - output = data.copy() - for column, encoder in self.encoders.items(): - output[column] = encoder.transform(data[column]) - - return output - - -def load_data(path=DATA): - dataset = pd.read_csv(path) - features = ['shape', 'surface', 'color'] - target = ['target'] - - X = dataset[features] - y = dataset[target] - - y = LabelEncoder().fit_transform(y.values.ravel()) - - return X, y - - -def visual_model_selection(X, y, estimator, path): +def visualize_model(X, y, estimator, path, **kwargs): """ Test various estimators. """ - model = Pipeline([ - ('label_encoding', EncodeCategorical(X.keys())), - ('one_hot_encoder', OneHotEncoder()), - ('estimator', estimator) - ]) + y = LabelEncoder().fit_transform(y) + model = Pipeline([("one_hot_encoder", OneHotEncoder()), ("estimator", estimator)]) _, ax = plt.subplots() # Instantiate the classification model and visualizer - visualizer = ClassificationReport(model, ax=ax, classes=['edible', 'poisonous']) + visualizer = ClassificationReport( + model, + classes=["edible", "poisonous"], + cmap="YlGn", + size=(600, 360), + ax=ax, + **kwargs + ) visualizer.fit(X, y) visualizer.score(X, y) visualizer.poof(outpath=path) -if __name__ == '__main__': - X, y = load_data() +if __name__ == "__main__": + X, y = load_mushroom() - for clf, path in ESTIMATORS.items(): - visual_model_selection(X, y, clf(), path) + for clf in ESTIMATORS.values(): + visualize_model(X, y, clf["model"], clf["path"]) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 80dc12f9f..ebebd930e 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -3,10 +3,7 @@ Model Selection Tutorial ======================== -In this tutorial, we are going to look at scores for a variety of -`Scikit-Learn `__ models and compare them using -visual diagnostic tools from `Yellowbrick `__ -in order to select the best model for our data. 
+In this tutorial, we are going to look at scores for a variety of `Scikit-Learn `__ models and compare them using visual diagnostic tools from `Yellowbrick `__ in order to select the best model for our data. The Model Selection Triple -------------------------- @@ -29,112 +26,54 @@ The Yellowbrick library is a diagnostic visualization platform for machine learn About the Data -------------- -This tutorial uses a modified version of the mushroom dataset_ from -the `UCI Machine Learning Repository `__. -Our objective is to predict if a mushroom is poisonous or edible based on -its characteristics. +This tutorial uses the mushrooms data from the Yellowbrick :doc:`api/datasets/index` module. Our objective is to predict if a mushroom is poisonous or edible based on its characteristics. -.. _dataset: https://github.com/rebeccabilbro/rebeccabilbro.github.io/blob/master/data/agaricus-lepiota.txt +.. NOTE:: The YB version of the mushrooms data differs from the mushroom dataset from the `UCI Machine Learning Repository `__. The Yellowbrick version has been deliberately modified to make modeling a bit more of a challenge. -The data include descriptions of hypothetical samples corresponding to -23 species of gilled mushrooms in the Agaricus and Lepiota Family. Each -species was identified as definitely edible, definitely poisonous, or of -unknown edibility and not recommended (this latter class was combined -with the poisonous one). +The data include descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family. Each species was identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended (this latter class was combined with the poisonous one). -Our file, "agaricus-lepiota.txt," contains information for 3 nominally -valued attributes and a target value from 8124 instances of mushrooms -(4208 edible, 3916 poisonous). 
+Our data contains information for 3 nominally valued attributes and a target value from 8124 instances of mushrooms (4208 edible, 3916 poisonous). -Let's load the data with Pandas. +Let's load the data: .. code:: python - import os - import pandas as pd + from yellowbrick.datasets import load_mushroom - names = [ - 'class', - 'cap-shape', - 'cap-surface', - 'cap-color' - ] - - mushrooms = os.path.join('data','agaricus-lepiota.txt') - dataset = pd.read_csv(mushrooms) - dataset.columns = names - dataset.head() - -= ========= ========= =========== ========= -. class cap-shape cap-surface cap-color -= ========= ========= =========== ========= -0 edible bell smooth white -1 poisonous convex scaly white -2 edible convex smooth gray -3 edible convex scaly yellow -4 edible bell smooth white -= ========= ========= =========== ========= + X, y = load_mushroom() + print(X[:5]) # inspect the first five rows -.. code:: python +.. parsed-literal:: - features = ['cap-shape', 'cap-surface', 'cap-color'] - target = ['class'] + shape surface color + 0 convex smooth yellow + 1 bell smooth white + 2 convex scaly white + 3 convex smooth gray + 4 convex scaly yellow - X = dataset[features] - y = dataset[target] Feature Extraction ------------------ -Our data, including the target, is categorical. We will need to change -these values to numeric ones for machine learning. In order to extract -this from the dataset, we'll have to use Scikit-Learn transformers to -transform our input dataset into something that can be fit to a model. -Luckily, Sckit-Learn does provide a transformer for converting -categorical labels into numeric integers: -`sklearn.preprocessing.LabelEncoder `__. -Unfortunately it can only transform a single vector at a time, so we'll -have to adapt it in order to apply it to multiple columns. +Our data, including the target, is categorical. We will need to change these values to numeric ones for machine learning. 
In order to extract this from the dataset, we'll have to use scikit-learn transformers to transform our input dataset into something that can be fit to a model. Luckily, scikit-learn does provide transformers for converting categorical labels into numeric integers: +`sklearn.preprocessing.LabelEncoder `__ and `sklearn.preprocessing.OneHotEncoder `__. + +We'll use a combination of scikit-learn's ``Pipeline`` object (`here's `__ a great post on using pipelines by `Zac Stewart `__), ``OneHotEncoder``, and ``LabelEncoder``: .. code:: python - from sklearn.base import BaseEstimator, TransformerMixin - from sklearn.preprocessing import LabelEncoder, OneHotEncoder + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import OneHotEncoder, LabelEncoder + # Label-encode targets before modeling + y = LabelEncoder().fit_transform(y) - class EncodeCategorical(BaseEstimator, TransformerMixin): - """ - Encodes a specified list of columns or all columns if None. - """ - - def __init__(self, columns=None): - self.columns = [col for col in columns] - self.encoders = None - - def fit(self, data, target=None): - """ - Expects a data frame with named columns to encode. - """ - # Encode all columns if columns is None - if self.columns is None: - self.columns = data.columns - - # Fit a label encoder for each column in the data frame - self.encoders = { - column: LabelEncoder().fit(data[column]) - for column in self.columns - } - return self - - def transform(self, data): - """ - Uses the encoders to transform a data frame. - """ - output = data.copy() - for column, encoder in self.encoders.items(): - output[column] = encoder.transform(data[column]) - - return output + # One-hot encode columns before modeling + model = Pipeline([ + ('one_hot_encoder', OneHotEncoder()), + ('estimator', estimator) + ]) Modeling and Evaluation ----------------------- @@ -174,314 +113,121 @@ diagnostics from the Yellowbrick library). 
from sklearn.metrics import f1_score from sklearn.pipeline import Pipeline + from sklearn.svm import LinearSVC, NuSVC, SVC + from sklearn.neighbors import KNeighborsClassifier + from sklearn.preprocessing import OneHotEncoder, LabelEncoder + from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier + from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier - def model_selection(X, y, estimator): + models = [ + SVC(gamma='auto'), NuSVC(gamma='auto'), LinearSVC(), + SGDClassifier(max_iter=100, tol=1e-3), KNeighborsClassifier(), + LogisticRegression(solver='lbfgs'), LogisticRegressionCV(cv=3), + BaggingClassifier(), ExtraTreesClassifier(n_estimators=300), + RandomForestClassifier(n_estimators=300) + ] + + + def score_model(X, y, estimator, **kwargs): """ Test various estimators. """ - y = LabelEncoder().fit_transform(y.values.ravel()) + y = LabelEncoder().fit_transform(y) model = Pipeline([ - ('label_encoding', EncodeCategorical(X.keys())), - ('one_hot_encoder', OneHotEncoder()), - ('estimator', estimator) + ('one_hot_encoder', OneHotEncoder()), + ('estimator', estimator) ]) # Instantiate the classification model and visualizer - model.fit(X, y) + model.fit(X, y, **kwargs) expected = y predicted = model.predict(X) - # Compute and return the F1 score (the harmonic mean of precision and recall) - return (f1_score(expected, predicted)) - -.. code:: python - - # Try them all! - from sklearn.svm import LinearSVC, NuSVC, SVC - from sklearn.neighbors import KNeighborsClassifier - from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier - from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier - -.. code:: python - - model_selection(X, y, LinearSVC()) - - - - -.. parsed-literal:: - - 0.65846308387744845 - - - -.. code:: python - - model_selection(X, y, NuSVC()) - - - - -.. parsed-literal:: - - 0.63838842388991346 - - - -.. 
code:: python - - model_selection(X, y, SVC()) - - - - -.. parsed-literal:: - - 0.66251459711950167 - - - -.. code:: python - - model_selection(X, y, SGDClassifier()) - - - - -.. parsed-literal:: - - 0.69944182052382997 - - - -.. code:: python - - model_selection(X, y, KNeighborsClassifier()) - - - - -.. parsed-literal:: - - 0.65802139037433149 - - - -.. code:: python - - model_selection(X, y, LogisticRegressionCV()) - - - - -.. parsed-literal:: - - 0.65846308387744845 - - - -.. code:: python - - model_selection(X, y, LogisticRegression()) - - - - -.. parsed-literal:: - - 0.65812609897010799 - - - -.. code:: python - - model_selection(X, y, BaggingClassifier()) - + # Compute and return F1 (harmonic mean of precision and recall) + print("{}: {}".format(estimator.__class__.__name__, f1_score(expected, predicted))) + for model in models: + score_model(X, y, model) .. parsed-literal:: - 0.687643484132343 - - - -.. code:: python - - model_selection(X, y, ExtraTreesClassifier()) - - - - -.. parsed-literal:: - - 0.68713648045448383 - - - -.. code:: python - - model_selection(X, y, RandomForestClassifier()) - - - - -.. parsed-literal:: - - 0.69317131158367451 - + SVC: 0.6624286455630514 + NuSVC: 0.6726016476215785 + LinearSVC: 0.6583804143126177 + SGDClassifier: 0.5582697992842696 + KNeighborsClassifier: 0.6581185045215279 + LogisticRegression: 0.6580434509606933 + LogisticRegressionCV: 0.6583804143126177 + BaggingClassifier: 0.6879633373770051 + ExtraTreesClassifier: 0.6871364804544838 + RandomForestClassifier: 0.687643484132343 Preliminary Model Evaluation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Based on the results from the F1 scores above, which model is performing -the best? +Based on the results from the F1 scores above, which model is performing the best? Visual Model Evaluation ----------------------- -Now let's refactor our model evaluation function to use Yellowbrick's -``ClassificationReport`` class, a model visualizer that displays the -precision, recall, and F1 scores. 
This visual model analysis tool -integrates numerical scores as well as color-coded heatmaps in order to -support easy interpretation and detection, particularly the nuances of -Type I and Type II error, which are very relevant (lifesaving, even) to -our use case! - -**Type I error** (or a **"false positive"**) is detecting an effect that -is not present (e.g. determining a mushroom is poisonous when it is in -fact edible). +Now let's refactor our model evaluation function to use Yellowbrick's ``ClassificationReport`` class, a model visualizer that displays the precision, recall, and F1 scores. This visual model analysis tool integrates numerical scores as well as color-coded heatmaps in order to support easy interpretation and detection, particularly the nuances of Type I and Type II error, which are very relevant (lifesaving, even) to our use case! -**Type II error** (or a **"false negative"**) is failing to detect an -effect that is present (e.g. believing a mushroom is edible when it is -in fact poisonous). +**Type I error** (or a **"false positive"**) is detecting an effect that is not present (e.g. determining a mushroom is poisonous when it is in fact edible). -.. note:: When running in a Jupyter Notebook, be sure to add the following line at the top of the notebook: ``%matplotlib notebook``. This will ensure the figures are rendered correctly. For those running this code with a Python script, the figure should appear in a secondary window. +**Type II error** (or a **"false negative"**) is failing to detect an effect that is present (e.g. believing a mushroom is edible when it is in fact poisonous). .. code:: python - import matplotlib.pyplot as plt - from sklearn.pipeline import Pipeline from yellowbrick.classifier import ClassificationReport - def visual_model_selection(X, y, estimator): + def visualize_model(X, y, estimator, **kwargs): """ Test various estimators. 
""" - y = LabelEncoder().fit_transform(y.values.ravel()) + y = LabelEncoder().fit_transform(y) model = Pipeline([ - ('label_encoding', EncodeCategorical(X.keys())), - ('one_hot_encoder', OneHotEncoder()), - ('estimator', estimator) + ('one_hot_encoder', OneHotEncoder()), + ('estimator', estimator) ]) - # Create a new figure to draw the classification report on - _, ax = plt.subplots() - # Instantiate the classification model and visualizer visualizer = ClassificationReport( - model, ax=ax, classes=['edible', 'poisonous'] + model, classes=['edible', 'poisonous'], + cmap="YlGn", size=(600, 360), **kwargs ) visualizer.fit(X, y) visualizer.score(X, y) - - # Note that to save the figure to disk, you can specify an outpath - # argument to the poof method! visualizer.poof() - -.. code:: python - - visual_model_selection(X, y, LinearSVC()) - - - -.. image:: images/tutorial/modelselect_linear_svc.png - - -.. code:: python - - visual_model_selection(X, y, NuSVC()) - - - -.. image:: images/tutorial/modelselect_nu_svc.png - - -.. code:: python - - visual_model_selection(X, y, SVC()) + for model in models: + visualize_model(X, y, model) .. image:: images/tutorial/modelselect_svc.png +.. image:: images/tutorial/modelselect_nu_svc.png -.. code:: python - - visual_model_selection(X, y, SGDClassifier()) - - +.. image:: images/tutorial/modelselect_linear_svc.png .. image:: images/tutorial/modelselect_sgd_classifier.png - -.. code:: python - - visual_model_selection(X, y, KNeighborsClassifier()) - - - .. image:: images/tutorial/modelselect_kneighbors_classifier.png - -.. code:: python - - visual_model_selection(X, y, LogisticRegressionCV()) - - - -.. image:: images/tutorial/modelselect_logistic_regression_cv.png - - -.. code:: python - - visual_model_selection(X, y, LogisticRegression()) - - - .. image:: images/tutorial/modelselect_logistic_regression.png - -.. code:: python - - visual_model_selection(X, y, BaggingClassifier()) - - +.. 
image:: images/tutorial/modelselect_logistic_regression_cv.png .. image:: images/tutorial/modelselect_bagging_classifier.png - -.. code:: python - - visual_model_selection(X, y, ExtraTreesClassifier()) - - - .. image:: images/tutorial/modelselect_extra_trees_classifier.png - -.. code:: python - - visual_model_selection(X, y, RandomForestClassifier()) - - - .. image:: images/tutorial/modelselect_random_forest_classifier.png diff --git a/examples/README.md b/examples/README.md index d798ae2cb..745980248 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,66 +1,43 @@ -# Yellowbrick Examples +# Yellowbrick Examples -[![Visualizers](../docs/images/visualizers.png)](../docs/images/visualizers.png) +[![Visualizers](../docs/images/readme/banner.png)](../docs/images/readme/banner.png) -Welcome to the yellowbrick examples directory! This directory contains a gallery of visualizers and their application to classification, regression, clustering, and other machine learning techniques with Scikit-Learn. Examples have been submitted both by the Yellowbrick team and also users like you! The result is a rich gallery of tools and techniques to equip your machine learning with visual diagnostics and visualizer workflows! +Welcome to the yellowbrick examples directory! This directory contains a gallery of visualizers and their application to classification, regression, clustering, and other machine learning techniques with scikit-learn. Examples have been submitted both by the Yellowbrick team and also users like you! The result is a rich gallery of tools and techniques to equip your machine learning with visual diagnostics and visualizer workflows! -## Getting Started +## Getting Started -The notebook to explore first is the `examples.ipynb` Jupyter notebook. This notebook contains the executable examples from the tutorial in the documentation. However, before you can successfully run this notebook, you must first download the sample datasets. 
To download the samples run the downloader script: +The notebook to explore first is the `examples.ipynb` Jupyter notebook. This notebook contains the executable examples from the tutorial in the documentation. You can run the notebook as follows: ``` -$ python download.py +$ jupyter notebook examples.ipynb ``` -This should create a directory called `examples/data`, which in turn will contain CSV or text datasets. There are two primary problems that the download script may have: first, you may get the error `"The requests module is required to download data"`. To fix this problem: +If you don't have jupyter installed, or other dependencies, you may have to `pip install` them. -``` -$ pip install requests -``` - -The second problem may be `"Download signature does not match hardcoded signature!"` This problem means that the file you're trying to download has changed. Either download a more recent version of Yellowbrick, or use the URLs in the `download.py` script to fetch the data manually. If there are any other problems, please notify us via [GitHub Issues](https://github.com/DistrictDataLabs/yellowbrick/issues). - -Once the example data has been downloaded, you can run the examples notebook as follows: - -``` -$ jupyter notebook examples.ipynb -``` - -If you don't have jupyter installed, or other dependencies, you may have to `pip install` them. - -## Organization +## Organization The examples directory contains many notebooks, folders and files. At the top level you will see the following: -- examples.ipynb: a notebook with executable versions of the tutorial visualizers -- download.py: a script to download the example data sets -- palettes.ipynb: a visualization of the Yellowbrick palettes -- data: a directory containing the example datasets. +- examples.ipynb: a notebook with executable versions of the tutorial visualizers +- palettes.ipynb: a visualization of the Yellowbrick palettes +- regression.ipynb: a notebook exploring the regression model visualizers. 
-In addition to these files and directory, you will see many other directories, whose names are the GitHub usernames of their contributors. You can explore these user submitted examples or submit your own! +In addition to these files, you will see many other directories, whose names are the GitHub usernames of their contributors. You can explore these user-submitted examples or submit your own! -### Contributing +### Contributing To contribute an example notebook of your own, perform the following steps: -1. Fork the repository into your own account -2. Checkout the develop branch (see [contributing to Yellowbrick](http://www.scikit-yb.org/en/latest/about.html#contributing) for more. -3. Create a directory in the repo, `examples/username` where username is your GitHub username. -4. Create a notebook in that directory with your example. See [user testing](http://www.scikit-yb.org/en/latest/evaluation.html) for more. -5. Commit your changes back to your fork. -6. Submit a pull-request from your develop branch to the Yellowbrick develop branch. -7. Complete the code review steps with a Yellowbrick team member. - -That's it -- thank you for contributing your example! - -A couple of notes. First, please make sure that the Jupyter notebook you submit is "run" -- that is it has the output saved to the notebook and is viewable on GitHub (empty notebooks don't serve well as a gallery). Second, please do not commit datasets, but instead provide instructions for downloading the dataset. You can create a downloader utility similar to ours. - -One great tip, is to create your PR right after you fork the repo; that way we can work with you on the changes you're making and communicate about how to have a very successful contribution! +1. Fork the repository into your own account +2. Check out the develop branch (see [contributing to Yellowbrick](http://www.scikit-yb.org/en/latest/about.html#contributing) for more). +3. 
Create a directory in the repo, `examples/username` where username is your GitHub username. +4. Create a notebook in that directory with your example. See [user testing](http://www.scikit-yb.org/en/latest/evaluation.html) for more. +5. Commit your changes back to your fork. +6. Submit a pull-request from your develop branch to the Yellowbrick develop branch. +7. Complete the code review steps with a Yellowbrick team member. -### User Examples +That's it -- thank you for contributing your example! -In this section we want to thank our examples contributors, and describe their notebooks so that you can find an example similar to your application! +A couple of notes. First, please make sure that the Jupyter notebook you submit is "run" -- that is, it has the output saved to the notebook and is viewable on GitHub (empty notebooks don't serve well as a gallery). Second, please do not commit datasets, but instead provide instructions for downloading the dataset. You can create a downloader utility similar to ours. -- [bbengfort](https://github.com/bbengfort): visualizing text classification -- [rebeccabilbro](https://github.com/rebeccabilbro): visualizing book reviews data -- [nathan](https://github.com/ndanielsen/): visualizing the Iris dataset +One great tip is to create your PR right after you fork the repo; that way we can work with you on the changes you're making and communicate about how to have a very successful contribution! 
diff --git a/examples/Sangarshanan/comparing_corpus_visualizers.ipynb b/examples/Sangarshanan/comparing_corpus_visualizers.ipynb new file mode 100644 index 000000000..ab17641f9 --- /dev/null +++ b/examples/Sangarshanan/comparing_corpus_visualizers.ipynb @@ -0,0 +1,242 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Comparing Corpus Visualizers on Yellowbrick\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "##### Import all the necessary Libraries\n", + "\n", + "from yellowbrick.text import TSNEVisualizer\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from yellowbrick.text import UMAPVisualizer\n", + "from yellowbrick.datasets import load_hobbies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### UMAP vs t-SNE\n", + "\n", + "Uniform Manifold Approximation and Projection (UMAP) is a dimension reduction technique that can be used for visualisation similarly to t-SNE, but also for general non-linear dimension reduction. The algorithm is founded on three assumptions about the data:\n", + "\n", + "1. The data is uniformly distributed on a Riemannian manifold;\n", + "2. The Riemannian metric is locally constant (or can be approximated as such);\n", + "3. The manifold is locally connected.\n", + "\n", + "From these assumptions it is possible to model the manifold with a fuzzy topological structure. 
The embedding is found by searching for a low dimensional projection of the data that has the closest possible equivalent fuzzy topological structure.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "corpus = load_hobbies()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Writing a Function to quickly Visualize Corpus \n", + "\n", + "Which can then be used for rapid comparison" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def visualize(dim_reduction,encoding,corpus,labels = True,alpha=0.7,metric=None):\n", + " if 'tfidf' in encoding.lower():\n", + " encode = TfidfVectorizer()\n", + " if 'count' in encoding.lower():\n", + " encode = CountVectorizer()\n", + " docs = encode.fit_transform(corpus.data)\n", + " if labels is True:\n", + " labels = corpus.target\n", + " else:\n", + " labels = None\n", + " if 'umap' in dim_reduction.lower():\n", + " if metric is None:\n", + " viz = UMAPVisualizer()\n", + " else:\n", + " viz = UMAPVisualizer(metric=metric)\n", + " if 't-sne' in dim_reduction.lower():\n", + " viz = TSNEVisualizer(alpha = alpha)\n", + " viz.fit(docs,labels)\n", + " viz.poof()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Quickly Comparing Plots by Controlling \n", + "\n", + "- The Dimensionality Reduction technique used \n", + "- The Encoding Technique used \n", + "- The dataset to be visualized \n", + "- Whether to differentiate Labels or not \n", + "- Set the alpha parameter\n", + "- Set the metric for UMAP " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "visualize('t-sne','tfidf',corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "visualize('t-sne','count',corpus,alpha = 0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "visualize('t-sne','tfidf',corpus,labels =False)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWUAAAD1CAYAAACIlORMAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzsvXmcHFd57/09VdVdPd2z9kgzI8nWbh0ZG9tgMLIkr4SAwQnhJgECOLZfbEdgErgQsty8JCR5b24SskDAwewmgIHcJITNBhuzWLK8gLG8SkerZS2j2bpn7bWW94/qHvXMdM/0NjM9o/p+Pv541F1bV3c95zm/ZznCdV18fHx8fBoDbbEvwMfHx8fnLL5R9vHx8WkgfKPs4+Pj00D4RtnHx8engfCNso+Pj08D4RtlHx8fnwbCWOwLqBYppQusVEoNFrz2W8D7lFLXSilvAb4E/LVS6s8LthHAESChlLq44PWXA88Af6qU+tuC128BPgEcA1xAABPAHyqlHp12Tetzx3624GUBfEIp9cUKP9/ngW8opX5UyX65fduAbymlrs/9ex9wrVJquNJjVXjePwN+D3hIKXVriW3eD9xeeO8L3vsv4LRS6n25f6/G+w578ByIv1NKfbXIfj8F1gEjuZeCwM+AP1JKjdX6uRYSKeUG4B+UUr+52Nfiszgsd0/5JeCd0167CggX2fY9wNeAO6WU0wer3Uqpy5RSr1BKXQb8LfBfRbYDSOa2vSy37RuBf5RSXlLJhSulbqvGIOfoAK4oONZl822Qc7wbeMcsBnkH8Mcl3vsjvO+mkL8BHldKXQq8Afi0lLKnxLk/XHDPL829dm+lH6ABWAfIxb4In8VjyXrKZfIscL6UcrtSam/utZuBr+I95ABIKVuAdwGvAS4Dfhv4+izHfQjPe2sHBmfZDqXUKSnlIWCLlPKVeIYrAowopa6TUn4E+B3AAg7iefpnct7fp5RS/yGl3A78XW4/B/ioUup7uWv/09xnsoBDwC143mVTzkO+PPfeSqXU4BznexTYAawFdgM3K6Wcws8jpTwP+DSwHm8W8GWl1MeklN8EzgO+IKX8c6XUN6ft1w3cBXwY+NNp712H933cjTeg5NGBttzsJpy75inXU+KeZ6WUHwTOSCm3KqUOSCnvAP4AsIG+3Oc+KKVsBj6Z+9wW8N/An+Xu4XNKqX/IXeM9+X9LKV/EM/hvAjqBv8jtfzmQBX5dKXVaSrkG+FTufgbwZj5/k5tRPQTch/ebi+bO+R/A54E1Usof5o7/SWAnkAGOArcqpcbnugc+S5fl7ikD/BtwE4CUMoznjf1g2jbvAg4qpfYDXwY+UOpgOQNxB94DOqtBzm1/JbAZeDz30kV4UsJ1UspbgRuAVyulLgGeA+6Ztn8HnoG4SSn1SuDX8TzGtVLKX8czwlfm5IBjwPuAWznrsdsFx5rrfJuAa4GXA9cD1xT5SF8DfqKUejmeIXqXlPLtSqm3AaeBdxYxyDqeEfswcGrae6vx5KF34hnMQv4093lPAS8Af6GU6i9yTTNQSiXxBp2XSymvB/4IuC7ndd8L/Hfuu/wrIARciDcg7yjxuacTyh3rQ8Bn8SSqS4ETeN8JwFeALyqlLsebufyKlPKtufc2Aj9USl2BN3v4+9x3dRtwRCn1euBKvO/jktwxjgIVzbh8lh5L2SgXqw/XmPlgfw34H1LKAPAW4Dt4HlEh78
EzxuB50ZfnvNM8V0kp90kpnwKeB64GSml+Tblt90kpnwP+D56hOpF7/xml1Gju7xuALymlJnL//gTwWillsOB4VwKr8IzIPjzvysV7OH8F+L9KqTiAUuqDSqn/XeK6yjnfd5VSTk6HPYznwU0ipYzgGa27cucbwTPqN8xyTnL34GGl1IPTjhcAvgF8QCnVW2S/r+EZq9XAy4A/llJeUWS7UrhAAs8L/6ZSaiB33fcAa/C8/V8BvqCUspVSGaXUNUqpn5Zx7P/M/f8IcEYp9XTBv6O5e3UN8Ne57+0xPI/5stx2WbzvEuCXTLvXOZ7F+z0/LqX8a+A/C2Z8PsuUpSxfDOJNHQu91W5gqHCj3NT8l3ja7s3AB4EV+fellDuBi4E/klJ+KPdyBs9bzj8Au5VSN5Z5XcmcrlmKwqnn9EFRw/tORMFrOrBfKfWagmteDQzgebNuwevteJJKKeY6X7LgPZep15HfvthrgVnOCd5MpV9K+RagGW96vg9vMNwA/JOUEjxJSJdShoA/wZu2vxZAKXVISvkg3oD4xBzny8+KLsSbDVxXZBORu26LqffwfDxDPv3zB5lKuuDvbJHj67n9tyulErljrwBSeL+/TIE0VOxeo5QallJeijcQXg98U0r5L0qpfy72mX2WB0vZU74f+AMppQaT0/ybOet9FPJveNPMNqXUc9Peey/wFaXU+Uqp9Uqp9cCNeN712nm7eo8fArfmvCrwNM+HlVKFD/xjwAVSyqsBpJSX4WnHq4Ef5a6zNbftR/EGHQvPuE1/0Ms5X0lyHvRjwJ25a2kDfhd4cI79VimlLs0NVvnp+WVKqUdz9z0foLsbz6O9DW9wPQn8Vu5cK/AM8uMlTjOJlLIJ+Dhwv1LqeO5zv01KuTL3/q254x/Gu4c3Syk1KaWJp+tegzfovarg3NODkLOSmw09hvd95AfMR4A3z7GrRW6Qk1LeiKc971VKfRTvd3xp6V19lgNL2Si/H08LfE5K+QzwMPBNzsoQhfw33o/5K4Uv5h7S/wF8rPB1pdSP8YJev1//y57CF/CMwhNSyv3AK5mWLZKbcv8m8DEp5dN4n+EmpdRxpdR9eHrzI1LKZ/E8zT8DevGmxPullJ2VnK8M3okneTyL57H+J9N08HqglHLx9OT3SimfB34C/B+l1O4Su3wsJxn9Es9wj+MN0uRkk38Gfpw71s3AjTlP9S/xZkZPA08B9yml/gsvwLZKSqnwZJSfVvEx3gFsy92rx4GvK6W+Nsc+zwO2lPIJPMfjebzf+C+A7XgDr88yRvitOxuTnHH5M6XU/Yt9LT4+PgvHUtaUlyW53Of9eIUQjyzy5fj4+Cwwvqfs4+Pj00AsZU3Zx8fHZ9lRUr548sknTeDVeEGj6bm/Pj4+PoXoePn0P7/88svLyubxKc5smvKr8UptfXx8fMrlKmDPYl/EUmY2o9wLsGXLFoLB6XnzPj4+PmfJZDIcPHgQcnbDp3pmM8o2QDAYxDTNBbocHx+fJY4vddaIH+jz8fHxaSB8o+zj4+PTQPhG2cfHx6eB8I2yj4+PTwPhG2Uf7HSWZG8cO12sA+XyPbePTyPi9744h3Fsh8N3P0j/7gOkYxOY0QhdV21l867XoenzO14v5rl9fBoZ3yifwxy++0FO3bcPoWvooQBWIsOp+/YBsOXO1y/bc/v4NDK+S3KOYqez9O/ej5jmlQpdo3/3/nmVExbz3D4+jY5vlM9RMrFx0rFE8ffiCTKx+VsweTHP7ePT6PhG+RwlGG3GjEaKv9cRJhhtnrdz6xEToymAazsz3pvvc/v4NDq+pnyOopsBuq7aOqnr5nFth66rLkQ351oLtXIKg3tjB3vJjCYJRpuJrF+JEGJez+3js1TwjfI5zOZdrwOgf/d+MvEEwY4wXVddOPl6vSkM7jVv7mHi+ACpoTFc26HjkvPn9dw+PksF3yifw2i6xpY7X8+m264nExsnGG2eNy91RnBPCCLru4isXYHQNa747B
0EW8Pzcm4fn6WErylXwXIreNDNAE2rOuZVNigZ3NM07FQWe8Lvi+7jA76nXBF+wUP15AOLViIz8z0/uOfjM4lvSSogr4laicyUgofDdz+42JfW8OQDi9MzLvzgno/PVHyjXCZLpeChkaWVzbtex5o3XoYRDuCksxjhAGveeJkf3PPxKcCXL8okr4nqoZkeXb7goWlVxyJcmcdSkFYWMrBYC3Y629DX57O88Y1ymTS6JrqUeknkA4uNxlIY2HyWP8v2l1bvaXwja6JLRVppdPyYgU8jsOw85fn0dha62KJcGl1amQ/qLTHMNbBtuu16X8rwWRCWnVGez2l8o2qijS6t1JP5GnQrHdh83dlnvlhWRnmhvJ350ERrecgXo4/FYjFfg265A5uvO/vMN8vKKC/FaXylD3kp492o0ko9yYwm6H3g6Rmv12PQLXdgW0oBVZ+lybIyyktxGl/uQz6X8a5EWqnn1HshpvH5z977wDMMPHIQzQxM6S4H9Rl05xrYfN3ZZyFYVkZ5qU3jK3nIyzXes0kr9Zx6L+Q0fvKzC9BCXh/m1MAIAM0buoD6DLpzDWxLcSbms/RYdiLYUqoaK3cFjnqlvNUz5Wuh0semfHZNw4w2gwtCCDKxcVzbqfugW6pBUz0XBmjkykufxWVZecrQuBkSxShXbqmHh1bPqfdCTuOnf/bIupUApGPjOGkLzRCs+tVLF2TQrcdMzA8U+szFsjPKeRq1aqyQch/yemjl9Zx6L+Q0fsZnL+jDjBBc9vfvonn9ygUzaLUGVP1Aoc9cLFujvFQo5yGvh4dWzyDoQgZUi35212X8xQE0M8Avfv+eBfU2a5mJ+YFCn3LwjfIiU+5DXquHVs8g6EIHVKd/9vTQKC7QtLoDIcSieJvVzMT8QKFPOfhGuUGY6yGvh1Zez1zmhcyLLvzsyd5hnvrwV7DT9pRtloK3uRRTNn0WHt8oLzFq0crrGQRdjICqbgbQTYPMSGpJeptLLWXTZ3HwjfI5SD2DoAtdcr7Uvc1zofLSpzZ8o1yA32SmfObjXpWTLrbUvc2llLLpszj4Rhk/d7QS5vNelZsuthy8zaWQsumzOPhGmcbNHbUdi7SVwDTC6FpjfFXzda8qSRdrVG/Tn2n51IPGeNIXkUbMHXVcB9X7GH2jx0hnk5iBJrpbNyBXbUMTtXmjtRiO+bxX1aSLNYq36c+0fOrJOW+UGzF3VPU+xsnYAYTQ0DUDy85yMnYAgAtXb5+ybblGth6GYz7v1VIO4DXqTMtnaXLOG+VGMwa2Y3Fm5BhimkcshMaZkWNs6bkCXTPIJtKoj99HbN+LZIeTcxrZehiO+bxXSzWA14gzLZ+lzTk/t2q0BVHTVoKMlSz6XsZKkUyPc/CuH/LgVR9l/8fvY3DvQVJn4lgT6ZJd2urVZW6+79VS6vCXp9xOf42E36GusTnnPWVorGi+aYQxA01Y9swHJmiEOPmlRzn9vX2kzgyj6ZrXW7h/FIDI+q6i3lk9ZYdy7lW1AcpGDeDNRqPNtGbD176XBr5RprGMga4ZdLdumNSU87iuQ1dkLYMPP4lru7iWDZq36gbCa2UZWbti0sgGo82T/y/XcJRjTGe7V/UKUDZKAK8clpLs4mvfSwPfKBfQKMZArtoGwJmRY2SsFEEjRE/bBtYiORPbjRbQEYaO65yVEdysjZO1CLSHefHfH2Xo0UNTvKGVO7Zw+gfPFDUcIqiz//TeioxpsXtVSYByOdFIM61S+Nr30mHJGuVGzAmt1zVpQuPC1dvZ0nPFFM/VTmcnPd5gtJnUwMjkGnUioCM0DRCceeDZGd7Qqjdcwqo3XcKZX7yAfSaF2RqZNBz1MKblBiiXI4000ypFI2YZ+RRnyT0ljaiLzdc16ZpBONh69t8FU+XIem8FjkxsHCdrEV7TwarXX8rAowdneEMYgsOjvyTyG+tI72zFsDvp6NzE5vN34LpOXYxpPkBZbNuMlSJtJaZ8llpoxAEZGmemVYylpH
2f6yw5o7xQulglD/5CanWFU+Xw6g7atq6i4xUbkB94I9ZIglP37ZvhDY1dlmJidZpgOokR0HHcNKeHFZqhs27FxXUxpnMFKE0jXOUnPst8DH6NauDrzVLSvs91lpRRXghdrNIHv9xrqpu0MctUWdO1Gd6Qq7ukzrPQDZ1s7yms4RhksxAIcLztFBtfd2ldjOlsAcqetg11kS7qOfg14oxrvlkK2rfPEjPKC6GLVfrgl7wmxyLbFyNxaoDe+5+tv7RRZKpczBuyQy626RASKazBBK4QuAEdYdskh/s585Uv0P3GV9XFmJYKUOZfr4V6D8jnYibCUtC+fZaYUZ5vXayaB3/GNbkOxsnH0UaOoTkpDr/jUcaT7dhrr1yQh3+6N2Q2N9Hc3o499hKptiCWqePqIGwIJLKkHnqUC951O1C7MS0VoKwH9RyQz/VMhEbWvn2WmFGeb12s2qY4hddknHwcfegAIAiubCfdP4Rh9SM0Dev8K4H5ffiLeUP7jz3E8y+8RLbJQADCBVcDR9foXauxYXi4rsZ0eoCyHtRzQG6ETIRzRcv2qZwlZZRhfnWxah/8yWv66bNosSMIQ8fsbCHU3UaqfwShaegjx7DWvBpyxm6+H/5Cb+iCtTvYv/8HCNfF1QTCcQmkbEJjWcY2dCDaPANaT2PqpNNY8RhGRxTNNGs+Xj0H5MXMRJgvLds38suHJWeU51MXq/bBz1/T2hsv5Ogd96M3RxCawHVctIDh9YrIphDZBK7pGb2FTEOyDZdQqI1A/yCuLhAOCNcF14XuFWR1m3o9xq5t0/+5uxnbuwd7OIbeHqVl+066bt+F0PWz11RFKXa9BuTFzESot5Z9LgYslztLzijnmS9drJYHP9jTTbC7CyfpNagRmsDsbCbVNwKBEG7Ay2RY6DQk0wjTsv4CJlyBFY+B7WVfGB1Rwus31yVdLU//5+5m+IH7EbqOMEM4yQTDD9wPQPeuO2sqxa7ngLwYmQjzoWWfiwHL5c6SMcoLNT2r5cHXTJOW7TsnjRJAZN0KXMchZZyHk3UJdgQWPA1J1wy62zZycm0G8/y1ONksWiCAK6CnbWPdgnFOOs3Y3j1TPGIAoeuM7d3DyltvQw09WXP1YD0G5MXIRKi3ll0PI+84aaxMLwLQg6vQtNqlJp/aaHijvFjTs2of/K7bdwF40/eROHpbB2tuv4nO33032ZHkoml+helqNi66FiCqrWJz9PK6ncOKx7CHYwgzNOM9eyROOtY/r6XY9dax633semvZtRh517WJn/lXRga+TDZz2jPKgdW0dd1MtOe9CKEX3c9n/ml4o7zUpmdC1+nedScrb71txkNshGcaq4Uin662eeWrUF/8AfGfHWVw8ASPR5+v2yBndETR26OT8k0helsHdnMTmcH6l2KXq2MXUu5gX82xS1FvLbsWIx/vu5t432exrUEEXv8UK3OK4b7PIoRGtOfOiq7Fp340dCSgXs3ZFwPNNAn2rKq7x1Yrxz77Ewa+sx97PDtlkCvWHL9S8vKNa9tTXndtm5btO2mKtGMGmoruW0spdl7HdpKJKTp2/+fuLrlPfrC3EplZ70M1x55OZjRBfN+LZEYTdW3kX+2iA46TZmL4x9jWELhnXxdC4FhxJkZ+huOkK74en/rQ0J5yI+STNgJpyyaWSBMNm5iGXvVUeiGKJorJN5OepabXvRS7HB17+j0q9z5Uc+xCrIzFozd9isHHDmFPpNEjJiu2XcCVX3lf3bTszbteh5O16fvJc1iJDGZn86wxC9e1iZ3+R5Ljj+BYI4CG0AII0YQQAte1cKwBHCuGFlxV9XX5VE9DG+VG6my1GHmgtuNw996D7D7a7xnlpiCX9Sre8sJPoIqp9EIMcrPJN1D/Uuy5dGwrHiPYM9W4lHsfqjl2IY/e9Cn6fvoCQhOIgI6Tsej76Qs8etOnuOqbH6j5XuclmMHHDpFNZAiETTqv3DKrFBXvu5vE2B6EMEBogIvrpEEDIcIIYa
AZK9GMaE3X5lM9DW2UG6Gz1WLmgd699yD37T+FrglCAZ34kSP8cCBGQl/Fu8zEjHSzuVjIQS4v38x4vc6l2HPp2EbHTONS7n2o5th5MqMJBh87hMivDpNDaILBxw6RGU0QbK0tFbEw3mI0mbgunHngWXRDLxpvcZw0idE9CBFANzpx7HFcNwNC4DpZHByMYCeRtmv8LIxFpKE1ZVj8xTTL1R7rTdqy2X2kHz3/UDsOdiyGJgRPmivI5L66/FTaSc+tATbSIrH56sFa0/Hm0rGLyQvl3odqjp1n4mg/9kTx78ROZJg42l/W5ytFNfEWx4rh2DEAAuY6AuZaNJGfBTgYwW7au++go3tXTdfmUxsN7SnD4na2WszGNbFEmlgyTSjgyRJuNoubzYKuMaYFGNaCdDkp7zrLmErnWY7tG2fTsUtR7n2o5tgAkY1d6BETJ2PNeE8PB4ls7KroM06nGilKM6JoehTXSQCCgLmBgLkO10khRBOrLvgqhlHfniU+ldPwRjnPYnS2qlWDdZy0FzAxohVPB6Nhk2jYJJH1HmoRCCACAVzHpsXJ0u6cnXrPNZUuZDm2b5xLxy5GufehmmMDBFvDrNh2waSmnMd1XFZsu6Bm6aIaKUrTTMKtOxmP31+Qh6yBMIl0vN43yA1CQ8gXdjpLsjfecClu+R9+0fdm0WBd1yZ25i5OH76F00du4fThW4iduQvXtYtuXwzT0LlqYxe2k8tZ0jT0aBTHdbk8PUgQb+pdzlS6GPlBbikbZNuxSGRGsR1v4KomDbHc+1DNsa/8yvvovvZlaEED13LQggbd176MK7/yvrKPUYpqpaiO7l00d9yA0MK4bhqhhWnuuMGXLBqIRfWU6xFEm8+siGoDjfG+uye9ESFCuE6C8bgXkKskKX/X9i0A7D7STzyZpmPTJi4LW7zlhadxM+myp9LLjVr6ZywkRtDgqm9+gMxogomj/UQ2dtXsIRdSjRQlhE60507au26rehbnM78I13WLvvHkk0+uB45dfPHFmPNUAHHwrh8WNXhr3njZnNV6C5UVcfY8M3/4xc7jOGlOH74lp9tNRWhhVm++p+KHoF55ysuF/af3Fs11Pi+6tez+GcuJRmjbmU6nee655wA2XH755S8uykUsExbNU641iLZQ5deVarD5CLcQM3NbHTteUVJ+3vgGOqKsKvCwSqWbnQvYjjWv/TOWIv5KIsuLRfv11hJEW4ysiHJ/+FMj3NPe0zvKSsqvZ7+F5UbaStRl9W0fn0Zl0QS4aoNocNagF30vZ9ArpV7BxnyEe3pQz3Vtwq07y5Iu6tFvYbliGuF56Z/h49MoLJqnXEu1Xj0r0+ZDm85HshOje3DsOJreQaR1Z1kR7lr7LSxHpmrqRt37Z/j4NBKL+guutpChnuXX86FNVxLhnr4sUq39FhqF6cHJapjR+yNsctXGLu648gqgfv0zpp6z8mWqfHzqyaL+6mopZKhHZdp8a9OaZpYM6pVK67og+oqq+y0Uo9zIfD2MKJQ2pLu2b0HXpt7nuc45vfdHImtx3/5TANy5czvrVlzOmdERelrbCAdrmz3UM83uXM+O8amNBTXKpQxENdHjelSmLWZrUNX7WMllkaLTlpSCyotEypVlKjGi5TC7Id1a9jln9P7IoWuChw/3Ydkujx4fqMs1w+zfR7lpdn6Adnb0D31FBzbV+bBH7H+8qfyqrCXAghjl+cwpriUdaLFag2asFCfjB2e8nk/ruuDdtwGV91sopFxZphwjWi6FhtR2XLK2Q0DX0DXB7iP93LbtAkxDL+uc03t/FPLsmTixZJqmoFHzNUP90uzmWjTWh02AqvMxJTDzYSrcQMpbgK1KqT+p6gRS3gN8Qyn1g2r2r5QFyb5YrE5rc1FL1zQnnSZzpres7myT+7gO+0/v5WH1DQZGjzM0cZrxVIzCAp6MlSLjpunedScbP3cPGz7zJTZ+7h66d91ZtrdVbgex2bzR3Uf6SVuVOSCxRJqhRI
pjQ+PsOxXj6VNx9p2KcWxonFgiRSyRLvuc+d4fMz6b4zKRsTGnGetqrxnOptkVI59mNxdzBWgr+Z1A47Ye8Jl/5t1TXsxOa+Ww4Y7ryBhp4j87QnYoOac2XcsUNT9FRgh0zcDFIZn10veaQ55WXJjWVW2RSLmyzGzeaDyZJpZITylamYto2CSWyDAwnkIIgaYJHBcGxlMEdS33fnnnzPf+yHvUeVKWTThooAkx6/6VkE+zs+yZBrDcNLt6BWgXs3/3MudKKeVDQCvwUSAD/H9AChgC/h+l1LCU8h+Bnbl97lVKfSJ/ACnla4B/AX4beBXwx0AWOA28XSk11burknk3yo26pNOUwM72JMGdK+nUenjZhqsIhErrttVOUadPkc1AmFRmHCEEKStBxG0HqEtaV7myzPROdIV0NBX3VOeiiK2c8nol55zR+6PJ5PrNPTx6fIBkdqZHXLh/JYFLXas9za6WhviFLLWFgpcQE8CbgJXA47nXdiqlTkkp3w/8v1LKnwIbgG14tnGPlPLHuW23A68Ffk0p1Z8z3h9TSv2HlPJ38Yz9cD0udN6H3lqKROaTvNdq2Vl0zcDGot9+icOxJ0vuU8sUdfoUudnsIBRsRqDhODYIjfOiW+uS1lWuLGMaOjs2rCBrZadIKLbjctWmroqzMPJGsKslhC4EjuuiC0FXS4hoOMip4SEMzZ3a/W6Wc+qaxp07t/Lld+7gnnfs4Mvv3MEHrnkZV2/sLrm/oQnu2nOAm+99ZPK/u/YcwHZmd2Lkqm2cF92KrgWwHRtdC1T0fdTSEH/yMyzhhYKXAHuUUq5Sqh9IAAml1Kncew8DFwEXArtz22WBx4CX5bb5VaAdzzMG+CBwvZTyZ3gGuy5eMiyAp9wISzpNp9rATi1T1GJT5Gazg4jZjkBwtXw7QWPmcatlrpTB/EzhZZ1HOdCRYX9/gKwdYnV7F1dv6p70UiuhLaTTGoRg2GBtaxNZIKBpJDLDWNYIL5z8Di8NNnHN+vW47mr2HB2Y9ICv2tRV8pymoU+RJIp50Pn9Zwsi3rbtgpLecz2Wqaq2IX6eRp1VLhNeDSCl7AFCgCalXKWU6gWuwQsW7gduBf5ZShnAM7ZfBm7AkzzOA/4V+B3gDuCjOa/5M8BbctvWzIJkXzTaahfV9k+oZYpaaoqM67ImKutikKdP2UulDNqOxXMnH6Zv5CiaZvCWizTetNVmNDXGhT2ruXRtZRkMjutw4NReXtr3ENFRnaeGWjAtl0iglURXK4nsBFec5xIKeKlmvcOK127ayu1X7qgqNzrvQU83sqWCiJqALz5+mJ8d6WM4mZk1hS6/TFU1VNsQP089s4EaoXNcg9GUkyKa8QyqAP5LSukAceAWpdSglPLyCeEAAAAgAElEQVRaKeWjQBD4d6XUL6WUACilPi+l/G0p5TuAJ4DvSSnHgHHge/W60AUxyo222kW1gZ38FLXaHOJ6r+ScZ9a834KUwbx3fGbkaG6mIAgZYSJmB0FdsCKiEZt4Edt5TUVeoup9jKPPPIg9NMAbOgTCdnhhooX4+AhN42mu2NTEmwrsfOGMpNKgXCHTPehSQcQXYxP0jSVZ0WzWJYVuLooFaMupFKzHrLLBA4VH8FLY6n3MWVFK3QPcU+StHxXZ9g+LvHZLwd+Fwv53y7nASlnQ4pFGaTFYS2Cnlinq9CmyoQWxnAyu6+SWe68c27H4xO7n+JEaxNC1WY1OXkd3cABPk52e/VFppzXbsTgTP4wdj3tZJcAbV/bxus5+xiyDcBN0br0Epnmk89HRrVgQ0XZcYok0QV0nUGCUpudOzxeVVgrWOqts5EBhrshj1pxinyW0Rl+9qdZrrXWKCp6neHzwuZpKevMP+4n4Ub7ztEXW1TGNMM1mB4iZRqdQR9cQaELHxZmS/SGEVnGntbSVIJUah2x2iuENaC7tTRZaxsLJZmfco/no6FYsjS5rO2Rtl55Wc0YaXbUpdJVQaaVgLbPKRk8/9SmPc9
Yo1xrY0UwTvWMF6dg4wahW0Y+9HiW9+WPEkxrjWY2g7pLKeb0Rsx3HtYklrUmjU6ijC8RkSh7CmyE4roNG5Sl5phEmFGohEwjAtMwDI2XTOmiTmeaJzmdHt+lBwDYzyNqOcFHDW0naXzW9QWqpFKxmVukHCpcH56xRLgyEhM3KptC16Ha1PKh5XdLQgpPHaDGhJeiStgUgmEgPk84mcFybpoBO//BTdLdcOUNHbza9hzOdTeDiENBNVrVvqljf1jWDno5NTHQcxh4YmExIdoGWk+PI7tcQX/GyeenoVvx6ZgYBP//YIc97LvCUy037q6U3yEI35F+stgE+9eWcM8r1CITMpdvNFtSp5kGdrkvqusFoYojWpk6COry8x+WJkwLHzeI42Zw0oXFxt03/qEL1Ci5cvX2Gjt5sdhAOttLduoGLz7u6as9VrtqG6zi8tO8hkmMxjIkUbYMOW7pfQ/ft76FH1+eckdSrS12ewiDgbCl0c1FLb5B6VApWQiOmn/pUzpI0yrWk+9QaCJlNt+vb8wLZXwszkHippFZczYM6Xe5wHYeMlWQ8HafZ7OBNW8HF5smTWRK25z2/vMfhTVuneuBy1TYc1+b08BFsO4sZCLOmbUvNq0BrQuNl5+1Ert5GcmIYfTyJGe2aoiOXSjWrd5e6YhR6z72jScBlVWu4DE939j4dcwUJ61EpWCmNln5ayD17/mReusTdsvNv/S5xi0WtXm49AiGz6XZDG4dJDOxHDwVLasWVPqjF5A4hNEKBMKnMBBGzHV0T3HihzatW9eHQRndrkGCBrchYKZLZcU4MvcDA2EvYThZDD9DVsrZmg1yIrhk0t6yAlvL3qWeXutmwHYfPP3aoIuM/W5+OWDLFqeEh1kWjsxrXSgPKtfZibrT002ksSpe4KRtL+QZgrVLqs3W+jroxr0a53gnstXq59QiElNLtXN0luxkiwanHLqYVV/KglpI7ImYHKcsllhBEAjYRM0RHpIOmQMuM/hNBI8TxgWc5PXwIITQMzbvG08OH0DS97OBivanFE61U7qjG+Bft0+HCeDoObmKyQnG2zJlyA8r17sXcKOmnjcZCtd+shXkxyvORwF4PL7cegZBSup0VsDG6wwhtZkee6VpxJZkfxeQO24HvHxA819eFYXTRFjK4etMqrtsQp3dY4RUrebiuQ1frWvpHX6qpX3B+gHVbmxixnaLGsFIvr5ouddXIHdUa/2IpduPpOInM+JQKxXIyZ+aqFPR7Mc8PUsom4EvAOrwqvf8A2oC7ga8DJ/A8+CeUUu+RUrYBXwA6c4f4A6XUs1LKw8BeYAvwUO4YVwBKKXWTlPJi4J8AHVgBvEcptbeaa54Xo1zvBHY7nWV0/ylSg+MYRVKYyvVy6xUIKabb9Vx9Mb0bhrCdmd3PSmnF5ZT0FpM7vn8AnjgpCAfDhAIB0jb84EAvgtW8dtPWGR74+Z0v42RMVZUFkB9ge3cf4D8RPN8RJhNtZvWWVZM9MjTXrcrLq6ZLXTUeby0tSguDhLFkCtdNcMV5bskKxWp04kZaLHcZlmfvAl5USr1dSnkBXqe4ttx7W/AaDSWAo7m+GP8TeEgp9enc9l/Ca+W5Hrge6AViwGuA38/t147X0OhDOQP+DrweGo1hlOuZwF7ocaeGxhg71EugNUxk/UpEwRy9knSfegRCSul27um98xLUKZQ7JtIpnu0LEg6GJ9PawPP69hwd4PYrd8zwwG3HmjW4aNgBkr3xog9ifoD9dkeEn0dMNMeFwTEGNcF9Gc+Y/tZzD1Xl5ZXqmVwqXa1aj7eWFqWFQcJTw0O8cPI7hALlD25zlVfbjsX4wAmyY3F0o0hT/wVaLLfBy7NrQQL3AyilDkkph4Ge3HuHlVJjAFLKXrxGRS/H6/72ttw2+YY2Q0qpl3LbTiilXsj9PZLb7xTwESllEi+qMlrtBdfdKNczgb3Q4zaaTIJtYZJ9IwA0b+gCKvdy6xkIma7b1au3heOkp6yCXS
h3HI/FCBj7ihqGQq+v0DjMFlwUT6d44i8/W/RBzA+wWUPjmXBwSp/X9NA44bUrePhQL9ft3YtRpZdXSbpatR5vpca/GKahsy4a5aXB8jJn5iqvLnw/lZnAevNmml8aY9Vzw4iCrqTVLJZbDY1cnl0j+/E6xH1bSrkR+Bvg33LvuUW2PwB8VSl1r5SyC7htlm0L+RfgnUqp/VLKv8TzrKui7ka5XgnsxTzuyLqVAGRGEtjJNGZnc9XpPuUGQiqZztVaJei6NvG+u0mM7sGxY2h6lHDrTjq6dyGEjq4ZrGnvJBoOVez1FRswxNMpnK8OYGl60QcxP8CORUzGNUEg97N0AMuysdJZ4kmb2NgEXQZk0BjWgrQ7GYI4ZXl5pTq+FaOUx6uLDOc3J+ko0mgvHxC89QovE6uaXOWz11p+5sxcVZuF7xt6ELtrBcPCa8m7+lmvV3qli+VWyzIvz/4M8MVc32MdT/ddMcv2/xv4gpTyDs6uUlIOXwX+r5QyDpyc4xyzUnejXC/dtqjHLQSR9V2EUhle+ffvpPXCNSWPV6s2Vst0rtr2j/G+uxmP348QOkKEcJ0E43FPBoj2eDJAtV7fjGZIdoAn/vIzWNp0D/fsg5gfYFuSGVocl6QQ9Bkao7qGrWmcHhijpyVEc0uEr+rdPGmuYFQL0upkuDw9yDvt3rK9vOkd30ptU/jZBTbbOr7H+qZnWduWZfDFf58cxBxXFA0IfvF3rmQkla26SKWc2dBcVZubul45431z3ToAxtxB7H39BJrbK14st1oWsDx7wbvEKaVSwDtKvL2tYLvC6exvFDlOT4m/L8v9+U+5/2pmXgJ99dBtZ/O4zWikpEGulza20NM5x0mTGN2DENOMpNBJjO6hves2NM3zmGqpUMsPGMneeFkPYn6AfXkiw7fbmhgxdARgBHRs1yXtOPzF+usZHRhAE4IgNilNZ3eoi/C6rfxJnb28ws9+UfhbXNT6JB3hJtZGO6YMYl8//Np5yX8uZzY0V9XmWCpW5H2BuW499nlrWP0rH6Bl5fkLFtxbqPLsXJGH3yVuDubFKNdDt63W466HMV2M6ZxjxXDsGC4mWdshoGuTXc0cO+5pzEFPBqhkyl+Kch/E/ED6pt37uc9xMQQ4AYNgxCQaMTm/PcK+VJYtKwRuPIZrZRFGgGA0yr5Vm0hbdl1bY+Y/+7uvWMvJQ58koK2Y0v1NCJ2JkYfZe1QWaWBfv3ads82G5qrabAlFS78fCNO8ej1aqf4n85Ad4ZdnNxbzWjxSqNtW82Oq1OOulzFdjG5brtbO0XiAkURs0ihHwybroxE0vQPNmCkDlDPlL3Xfy3kQ8/tuuu16wm+9knX3PsIFoQAW7uSgkbZsJjI24rzzaVq7FjebRQQCoGnEk5l5a41pMEJQG0WImUJyKhMnnRkCsXLGe7Fkmv19w1zY3T5vfZTn0p6DRqji8uv5zo5o5PLsc426GuViBqCWH1OlHne9jOlidNv67GPHGenbxIWtT6BpGrbr0j+eAhwu2vj6SemiXMq576UexI23v5aDd/1wyr7tOyXR9jBJy6bQlAV0jUjQ8BrIawJRMOWudkXsctCMKJoexXWmLs3luC6uaMUIRMkUFuK5cDw+zkgyyx9++0lWNIfq3mOjkLm050ozdeZbTmvw8uxziroY5dkMQD1+TOVmStTLmC70dC6ff5u0bkQI2Bh+lpA+Tspp5omBS3n1q2+v+Jjl3PdSD+LBu344Y9+B+59G7tjCkytbpwQYXRe2rVvBeMZCFxnC+hgJu4WMHahqRexiTE8RBNA0k3DrzsnAKLi8GJsgnkjydPxyDg+lSWVtNnQ2I4TgeHycvtEkXS1NhE1j/peEKtCeR1NjTGQMVjaHJ0uxK8nUWUg5zS/PXnzqYpRLGQDbshl69OCCabP1NKYLOZ0rzL99NP5mnhi+YdK4JTI670harAoEyz5epQ/xdJmp1L6vfeY4K3e9jkeODxFPpmlvCvCa9e3s2n
YBD/z878gm9hJghCxtBMLbufHKj9RwV+ZOEezo9jITEqN7OD54ilOjBscSr+YXozeyqlXnWGyM0yNJOiMmI8ksK5pD9LSEsB0XXRPzviSUVxJ+eNaS8HIydZZL8/qfNxvz0iXu1eOW3yWukFkNwI+fJzuRrqk0ulLqZUwXcjo3Pf/WdoOMWV7pfUeTUbEEUMtDPNu+djzBbXI1t+3Yws+PPUoq+xKuc5hnD3wcGTlMuDNK1g4R0DUEv2R04LOTqXxzUcwbnitFUAidaM+dhKO38tEnfshQpgnb9QYvIWBjZwumofPBq7dy69cfZTiZYXA8jaGLSb1+PpeEqlcHvGXUvH7Ru8RVdGAp1wPfKEyXy5Vi/7lS6r3zcU6og1Ge7SG2khkCERO3SC1MsR9TPSLL9Tam9ZjOzVVqW4+Ks0KmP8Su7eBkbbSATrAjjGUaHDvUS/fqDsKR0Kz7Tnkv950dHPg56cxhbyouXAKZA6TIQFqjOZS/VzNT+YpRyhtuW3lr2SmC8RS8NN48pdLPcV2ytkMqa/PgoTOMZbI4LmiawHFhYDwFwEXd7XSEwMr0ThkQaqXWXsxT9vGzIxoGpdQZYN4MMtTBKM+VT7xi2xZ6H3x21h/TfESWG0Ebq2Ql41pyj6eTf4hPfv8pEieGvO5ulo0T0PjJKzbwF3/2DUZtaNUFO87v5CMf/jUCubLtuQwAATGl8EFzJ9BI4GKQthI0047Xpc4lmx0km+nHDJ1f8lpLecOONYJjx4pmV0xPESycaeQDerFEhqztYOoaAxMpOsMm/eOpyZ4pQgjiE0l+e+MTDL746aLySC3U0gSpGH52RHUU6RL3AeD3gI3kKvyUUt+UUr4C+CRgAyng9oJj6MA9wPPAN8h5z1LKZ4CfAZfglWG/Ga/nxV3Aq4AzwAbg15RSL5Z7zTUb5bke4s27XocW0Gf9MS3Vuvu5PGDV+xgnhl7AxQvszNbmsZLc47nOC7Dh1ms5/cOnSQ2NImwXEdB5YHMPvwybBCYyBJuCpByXB48Pwse+y1/9r7dM7jubAUhZ41MKHxwRwRFhhJvBcW1sxyKZGSdtJbBcjePHHqK7xOomsxXMpCb2IbQ2cNMzPtv0FMHCmcaJ4Qn6xzzjK4Bw0ODUcJKelhBdLSFiExmyjkNA03jbph/z8rYTOLaG64JljTI08H0s26Frze8Xva/FKCa91NIEqRh+dkTVTO8S9zZgQCn1LillC/BLKeVDwOeA25RS+6SUb8arzvtDPBv5NeBhpdS/5iSNPK3A15VSvy+l/BpwA5AEOpVSV0gpVwKHKr3gugT6ZnuI5/oxLcW6+3I84Kyd4eCZX5DMjnqNf4RGyAgTMTsm2zxajphhgGfLPS7nvPlZx5mfvcDQz4+gaRqBziYC61eyPxJCEwInY6E3edqrBjxyYpDERGpSyij2nREQpKxxDC04tfBBBEhrWwhZz6BpBsnMGKnsBLgOWeMSLIeSA1G+YKaoN+yM0tRyNcmxvVOMtuvaRFp3zpAZdm3fgmW7/NPPvEFQFxCNhDi/PcxYOk4skeGy8zpY2xEhazuEDIsruw+hOYOkJ2IksinSlsZYtpl4b4r+Y9fwe9svnjVdbrZAZL0lqcmBOBCu+wxwGbbrLGR6l7hVwI9y/x6TUr6Ap3WvVkrty+3zMPC3ub8vxfN+Swn3T+X+fwKvW9x64NHc8QeklAcqveC6GOVyRvFScsJSjCzP1WwG4IVTe5hIx9E0LTdldklmxwEIGm18YvdzPHF8tKJ16co5b37W4VoWruMCDunYODHTYHzreQRcF9d1cR13siH/mO3SdzrOhgumNg7SzQBmT9uMgcB1mRxoACaC1wIObdoZ0tl+T8owLsq9XrrfcKlcY/C84eiqDzBitOWMXhxN7yCSM3rT0TWNt75iHd99/gSaJgjo2qQxjIZN+saSnpxh6JiGTpMWZ33zSzjZCZKWTcYCIRxagyMY2lH+49ABsnaAt75iXclZy1yByH
pIUpVIYJWyjNt1FjK9S9zvAGngWzlP+eXAMeC0lPISpdQzwDWcDR4+ideD+Qkp5Q+Y2ZJzesTsOeAm4ONSyg68ns0VUdfikWp03KUWWZ6r2cyWnisAiE2c9hY5xSnYRpCyEvz46EoOxwcxdK3sqHxZ5826k7MOIQy0gI5rOwghCA2O0Zy1SBu6N7Uv8N5adEH36uLf25SBQLg42QEsN0Qg4C3Ely98aO/+PdqdRzjd++/oZDCdI5DRPcMstKL9hmfmGnvkvWFdDxPtuZP2rttmyAPFiIZNVjTP7KC3PhohqGt0NIGdGSAY7GTnxo20mVlcV5C1nYLFWgRBLcXRmMOjJ17gu8+fKFpoUk6vEl0zay6HL2cgrpalKhtWyPQucW8A7pRS7gGagL9USvVLKW8HPiWlFIAFvDt/AKVUUkr5HryWn2+bcYapfB+4QUq5F09TTgAz6+lnYdEXTl1qkeW5ms2krcTk32YgTCozXrg6E5mszYGBIGawsqh8OecVMfvsrEPTMKPNpPpHQYCRsbl4LMnP2yMECu6pA+w4f8WMLAwoGAjcLM2ZBwk6J9DcJI4Ik3W2ctlFd2G7NqYRZqT/M4yP7kbXNFw3iHAzhKxnAJgwry+5+kphrnEpb1jTzMmgXjH9Nk8pycB1bT7ymie4fOVBUukhQmYnTZFLGO4PYztxHJeCdQ1dJrIm46kRUlYHmiaKDpqzSi/TApHllMMXo5yBuNrFExZJNmyULnFPFNnuKeDqIofYlnt/D3DZtNfWF+z/JwBSyq3AbqXUnVLKTrzg4GA5HyTPohtlqD2yvJCa2FzNZvKGxww0TT4w6WwCx7VxXJuxjMHQRJJQ9jSmkVs9JGcQZm/UXsZ5o+6UWUe+/3Q6No6Ly9tammiKmDxjGozZLi26YMf5K/jIh3+t6GdNZccIJu+j2dqL4cRAGNhEcDWDQOYp4n1303Pehwq8RgPTCJPKyTQIDdM+yLizg56OLeiaMcOo5nON5/KG8/rtxMjPcKx+NKOLSNs1MzIlikkGb9/0U17Z+SS4BqFgGNwkibE9aHqEgBaA1BkEFrZrkLBaOTzazbjVgpGTQWDmoDmX9FKsV0ml5AfifFN8TWiTBnquZbzmYjFkw1yRx3LvEncC+Dsp5QfwPPM/VkrNjFbPQkMY5Wojy4uhiZXb6Dy/TbPZQcRsZyw5RCo7QVdzhNaQIG27k8Yrn9s7W1S+2HkzNoymHC7sWe+d12TqrCPXfzq8JsrKq7Zy4Ydu5DozQGIiRd/peNE85UJSQ18lYj2P7owCGrgOOmPggKN3kZ14YtLI5r3G5lA74BkUx7XRSHFe+yq29Lya2Jm7SlbnFXrDxYid+VeG+z6LY8VxXQshDpNJ7sd1HTpXnc2UmJ7F0hGCwRc/jetM/akLEUAI0APnkdE7GZoYx8HAcVyeGtpKxjZY2WxO8bgLB825pJd65DsH9BBpK0EiMzNYXGrmUS5LTTZcKiilJvBS46qmIYxynko16cXSxMppJlO4TdpKYjlZwsEWImYHL+9xeeKk8LqsZRNEzHYclzmj8vljnoof5VvPZ9nfb5CxI6zpsLh64wF2bd9SYtZxyZSBKhwJzQjqTcdx0iTH92IGQjhpm0INRnMnCBghXGcEx4qBFkETTbjYgFdA0kw7tmOj682sWfN64n3/ynjsuwgtVLKB/2zXMtL/ZezsoKeHCw1wsLODjPR/mY7uO4pKGataw1iZ3pIygx7opKllJ2u1p9BI0jse4oXRi3ng9NWsbPYq/gqZPmiWI73UwuG+X3gL8brulGCxi8vWVVfWtO7jUpMNzyUayihXwmJoYoX5wXM1kylsODOc6OfnR7+PoXvX86atnkf17BnBeAZCBly7ec2cUfn8MX90pJ2j8RNEQgbNQpDM2lM0z2pmHdMloLz3Gwl1MpE1cdys131ICHQBkWAETW9jZOjfSY49Rjp1ENsaxQh0EjDXAd6gE27dSfzM3cR7/xnHSSGEgR6IYg
TXF63OK3ptmV7s7Okpi+UCIASZzGkSiZM0NxdvqTCbzCC0DjQtgiY0VrearG5v5pWb19LStZUHD/ZPOV8+lQ2gdzQxGbQrJr14S1Alql7dxDufpye3hKKI3ODtSRg6uhZgc/eriu9XgZTnF6Q0JkvWKC+kJpZPSzozcoRkZoKmYISetk3IVdvm1PR0zaA93EUoGMays7iuSzITZ8e6BK9e45C0TK5Yt5FL120pK8Upbdk8cnSQgDH1cwsBD6jT3PSqjbSGgmXPOkpJQBvvuHrSmJlmN1ZmAAQIBAjd+w+YGH4QIXQC5maEeBE7O4jrWJiRS4i07gTXYSz+fRw3NenhWpkBAALmhhlBsWK4eGXTAgdyy7cmrSxZ28V2BP/8nV9wydps0ZTC2WQGTXOZGHkAK3MS24rhullIPMVvdv+CgP7n7D4yNKlL79i4Esd1ufneR4qkMXrSi+043LXnwKwNiMqlMLCbl8Ac10YTOo7jkLVTBPSzTaqqkfL8gpTGZMka5YXUxA6cfhR15jHSVhLXdRhNaQwn+nFdl5et2THn/oV68ER6mGR2HCEEQV3Q2hRkcPwwqtcoK8VpevluYVlxKmtz09f28KtyddmGYDYJaMVvesbMCK4HwM56hisQXE2k7fUkxh6dNHRCCALmBozgWhAGPRs/g6aZnD58C5puIkQAXHtyWzsbQw+ch9BCoEVmXFdeqxZ6G2OxbwNZHHsccLFdF80VmJpg3G5jINU8a0phMZkh3LKN5NherMxJrGw/AoHAGzRS43t4+6Yfcdu2900a188/dmjO5kL1akAEMwO7AoEuvMe1mJ5ci5S3UC0J7tV/Z166xL3D/vqy6hK3ZDPE85qYaztTXq+3JmY7Fof6nvSq1Dir7aWyExzqe9LT/MpArtrG6o4LyNhpyBmAULCZZrNjMsWpnGNFwybtTUHSlo3juhyPj9M/lsJ2XExDw3Jc7tt/irv3zh3knksCam1/N80dN6DpEYzgakIt2+lY9X7WXvQwbSveiuvEZxxTCB3cFDgTkxIIaOhGFDeXZ++6LrY9RjrxFNnkQc4c/T1iZ+7CdW1c1yZ25i5OH76F00du4fhzVzPc/3kCgdV4P1cbDQdd2GjCIWQkeOvqf8DQHHYf6SddpItjPsNj9eZ7WL3pS6zefA9tnW/FtuLYVszz/gtwXZuJkZ8Q0KzJTJhizYWCepanXtxPKpOYswFRseuajfxA7rrTft9FVieZ63u007OnydrpLMne+Jzb1YF8l7h6/ldvI18SKeXHpZRr5/s8S9ZThoXRxBKZURKZkRl6pkCQSA0zNhajva1rzuNoQmPDyks5ETtbdakXTKfLSXGyHYfPP3aII4NjnBieIKBrJDI2oYCG67pEIybgTXHL6UQ2lwSUjSeJriqeruYIHU1rx7FHEFqAwvG9MCUsL4F4OjPYVgzHGQXXwgisJGCunxL0AwqkhgB29gwuNrrRidBCuI6dCygKso6J5ZpsiDzH9o5v8+P+N8/a6GdKhocRRdfCuG425yGfRQgD10lMyirTZyf5VbQ3hJ8jpI1y8tC9GOHtxJMXYgaCkx3q8ktmVdsedEqwOJvAsA1Wd14wY3WSaqW8c6Sir24opT6wEOdZ0kZ5YTQxwTRHCjuRwc7auLbDUx/6CmtecXFZP+SAHiJjJatOccpPj1e3NZGxHQYnUiSyWVwMupoFLcFRYuNxNE1nmBCDEynWtM2UBvKUKwFNT1dzXZvh/s+TSR0hmzmBpgXRjSgBcx2u62BGttM3bhMNT9VzA+Z6jOAaUuP7MEIrCJgbzt5loTMx8jNE7m/vPNlc+pvmebTCwBUGjuN9IZbrfdcaDpsjT/FM+DfKbvSjaSZNrdeRGP8FFFRduq6LEYyiG52TA8v05kLbOr7H1pYncNEQwiSgpXGTP+JXV73Elw9eN9mhzltnMciFXW1VLYulCQ3ZvQ3x3yOc+UUfVm+SeNsoh69KTPm9VSvlnQsVfbkucf8GrM
bLIb4ar9T6L/A8iWa84pIM8M3cNuvxusFdDLwC+L5S6n9JKX+K1+Do7Xjd37rwus/9T6XUD6WUNwJ/BYwAceAZpdRHK73mZTEc5jWx2Qyyk06TOdOLk64oj9tLYwu2TVa424kMdtpLUzImdBhIc/pbD3PoU/fNeaxSKU7j6VjJBTPzFE6PhRCsj0a4ZFUr0bBJUHeINqXx5BUN23GxnQlODz1V8nhQvQSU7/lgBFcTCHaD62JlzpBNn+aXQ6/k/Q9dzM33PsLN9z7C1w9fS6TtDQgtjON4mrymtxIIrZ9xXMcawLb6J//t5RJ798R1bSvvcjUAACAASURBVMgFCoUQuHj/ATjoBESaazdWlu0QXfU+wq07AJGTCTSM4Er0wPmEC3KN85WCtuOiiwwbw8/i4nWWi4ZNNCHQNYN1Tc8QGx/Fdlw0IbAdl77R5OQxquHw3Q/S+/1nYNDGCAQnDefhux+c3Kaa77FWyWMJcQdwTCm1A/go0A1cBLxLKXUt8F/Ab+e23YhXXn0j8NfAB4HXUFByXUBaKXUD8H7gf+bae/4LcINS6jq8bnFVsSCe8mJ2oXJtm/7P3c3Y3j3YwzH09igt23fSdfsuhD73g6JrBpu7LkedeZyMlcDOWuCCloTW3acxn92HyCaJH/8Bvfoxet7z3qLHrTbFKc/kFNrQGU/HveIMxyYc0BlKOFiOIKBB3ziMpiBiavzxfcd4y6VR3rtja8mgX6US0PSeD/ngnutmOTas8cUDV4JwCwJdZ4DrePtGm+ToT7CdcXBTZNMv5tLiCvKfjZUIBK7rNaAXQkcPRLEyA2haEE1vw7FG0QW4GAgEjuuSsltpbe7mhitK38PCSkLwmtoLYNWmLzHc/zkmRn6C6yTQjc7JJvuFje/z6YpPvbgfUxtDCJNo5Gwus+O6mNo4Gzosjg6bWI6LoQlWtDSBy6SmPFcPDC+dztvGsJ2y0z4r/R6XYiOwKrkQ+AGAUuqAlHIAOAX8i5RyHFgDPJLb9qhSakRKmQb6lFIxACllkWU6ZnSHWwmMKqX6cq/vBnqqueB5NcqNoFn1f+5uhh+4H6HrCDOEk0ww/ICnX3bvKm+poq2rr0QIwcm+Awz2H0ZP6YSfOUn02497eqRm4KaSDN//fTRDK3rctJXw+lMIQcRsnzPFaTr5KXT/2ECuEtArolgZsXGdLKYOJ0dMJjLQFoKuZkhmHe574QSa8KrcCh/4vFGoVAIq1vNBCB0XjfFEPy2Bccass1N1XROkhj7PWPQgumagaU0IPUw24/128xKG69o0t18DMCV9zQiux3VdT0IxOnHsMRx7lKAwCQYM0DpYaZ5PW+cb0TVtxgoihe01bWsIOxvDsQZx3AwC0AOraeu6mTUX3ItrjyD0NkYGvsTpw7fMKOe+c+dWUles5eShewloabSCASVrO4xmw6xo7aE1EpiUL3TN05T/8afP88zp4ZKpct56fgenpNO9prOZi2IJgmUYzqLtVoF0/0jR7/Qcquh7DrgS+G8p5SZgBV7v5E251p1f5qxAWcz4lmL6tv1Ai5RypVJqAK8/xovVXPC8GuXF1qycdJqxvXtmeK5C1xnbu4eVt96GZs6t9eWLNjZ1vIK9n/w07lCK0LP7KFR/REBHCwWLHtdxHY4NPM1ochDbyaIJHTOQ63tB8RSn6ZiGzo4NK7j3F8enBR0F12yc4Op1E3zuyfNwXEE+AUCgYegGDx/pI2s7PHZ8sPQCnmWmRZUqxsjaDiPZJhJ2y5RVrQFWBfdhOWGc7IuT+cDCzZJNn0IzujECK2ZUwhWmr3X07KJt5a2TRnOGZ9uyHdd1PEM6rYS7sL2mnT1DJnUc3AxCMxFaGCtziuG+zyKERrTnToZ6PzlrOXcoGCbaeW0uMHn2d2VoLmcyl2G7QXQNdO3se0OJDLuP9hOYpStgsXS6B0/GOHFelF8fHJvxPZQynLoZwOxqm9MZmq2ib8W2Lcspb/kLwD1SyoeB43irinwV2C
2lnAD68PTmmlBKOVLK9wH3SSlH8IxDxQ3uYR6NciM0r7fiMezhGMIs0gFtJI4VjxHsmb3cuJBgUxOrL7+I0996GLJJyGvArovZ2YrQRNHjqt7HOB0/RFA3STpZXByvexz8/+y9eZAk133n93kvr7r6qu7pY+4L0zNDAiAFEOfg4CVIJCWKluTQYYoCxQMU5ZDC61iFZcd67XA4dkXLkr2ixVPcXdmiRC7lQwREAiKJG0MQFzEABj33DGam7+qurjMrM9/zH5mVXVVd1dc0gAHR3whyMDXZ2dlVnb/8vd/7HqTtnhXnyXX87nu28er4UV6ZNCjWBBlbc+2w5q7diskC1HyNbS4+9B0rjRCCYxPzzJRd0rZ5xfzZTmKMsChdx029/8Se1EtxUb5c3UO3VUL4s/j+9CIfWCZA+6QyNzKw418jpROPGLoHPk6m98Phpp+ZBVUKRxnRZmP/yB/SN3RfPI6Yn/oapTa+xlr7VApHw05e+3jua6ArgEarAIVGyDTKn6OUf4TugY+vSs7djvfc3XuERP/dBFMTTbQ4z1cIQWxsVEejwRG0p9yZpmRsexZ3Ko8jmwvncjP/1TZDrSMPqzcJCKafOsmlB154PVa2b7hLHOFG3dfHxsYejJJHbhsbG/uvOhxbd3+rEm72Ef19OPrz7uilf93wb68C9dffBRwZGxtzR0dH/0/C0caa8boV5athZmX2ZTF6s6jKUomt0dOH2bfUyatd1JJyXfy5HGZflv33fRDte8yd/x66WkFYBk5/N+ldA23P22i/mI4646pfRqOo+VWuGbxxCcWpE1J2ml+/zqFc8yi4mi4HbAO07kNj0p0wqHihm5hjpck4fSitKbk+CdMgULppWb3WAM86OhWlff3jqPIPQEh8LGyjyp7UMQZT5ZA610JjEcLGrZ6Oucml/GPUKi+igzKIJNJIh6ZBEROinYHRcr7G9W5ayiSeexatwoIcQqNVjZBdk0D503iVV9vIuTUIjV+7TFAbRyZ2d3S2u29QAUaTQ911u3p5+NQk7VCnygEd8/z8rX2k+zPw1IlVzYrX0gy1jjzOfespJh489rqtbCORxxvtEncG+Obo6Oh/D1jA6maW60MBODo6OlomHF38/XpO8roV5athZiUdh67bjsQz5Tp0ENB125ElI4bWhIfBzC6y332B4pNPNG0SHviD+5gwzoYz5IQdG8a3O2+jXFYIQSaRJa17UVqhtWbP4PWrTpBoVAZ2OZKCC10OWFLzjm038rHre3ngldcwDTMuLG4tIGlJLsyVyJVd/EBjGoJsymGkK7Eu/my7ogRwQ/4TnJtNkSu7i5SwdIZsqkLguUix+OtWp55ptUBu/C8idd0FAm8uVPwFc+CBNEIusSUTbQ2MlvM11qoccpEJCPzI6Q5FWJgFIevCQ4g0yAwaE6UDBGFajFLlUHqNAi2Zm/4btmz/k/gB0EoVbJezCPDi5fkVs/qWy/N7z6feh3nfB1c1UlhPM2Q4FnY2w+xTJ95SsWyrQZQ+/d436Hv9JfCXV3qe160oXy0uVIOfDru6wpOPE+TnMHr6YvZFI9olPJx58SGmJs+wtVJeskk4/LnfR5pyxfO280EOUzwkhrTWbL+4f+gm/vb5eZ44O8tCVdGdkNy+Z5B/degmDgxLpJBNndr794/wH585xaV8BSEEUgqUhuliFduQ6+LP1tFYlGrVcyh/gt3ZrjgHry6eCILoT282mtOamHZoSiRlgmop7MYCLxfS3bQG5ccU8cCfxbQGETIRGxgBiw51HQyHDLOfZOYWinP3Az5C2uhYNRl1w9pDqSJu8RkunfgYOijGysOwSxagBULaVApPMTf5pRVd7VpN7VeT1bfiMaaxpJg2ruDqjcB6m6GrYWW7iRCv60bf1eBCJQyDofs+z5Z7P7XkF7iOtgkPShHMzVHYnkEdLyADHZ+vvpm30nlh9f7Lq8VXnjrFc5cd0omtJJ2QvfHc5fD1zx85yOePHOTem/YwsZ
BnuLsHQ5r8zbPtx26tpmutaDWkb5f60Wg+X6ucAGFgmFmcyCkOwLQGSPZ9hOL894Eg4h8baB2QSL+LcuGRKPfPj+e4dUFHvVsN060TSNnFzKUvUKu8HG/qgYqL/eL7G/oa92y5lyBYwK2MAWEuYTi20BApA6VMh3/XHkKaaOURJvgIwAJhYljbgIBS/pEVXe1asZqsvrXk+S1H81xvM3Q1rGw3EeJ1LcpXkwuVdJyOm3rtopaU54Hn4ScMfMfALi8uLRs385Y7bx2r8V9eDVr9FeoGNYYIb+ZP3ryPc9M/iUcwF2aSSLmDbMqmFihypRqeUlhSks2ES+d244vWlGYh+xAiXPKHM+T27AbDGsCvTeN7oQDEcnbHxbFv6D6ENOM5tBApkpl30TP0OarlY6igEBXVuhOcAB2gtRu50pmgA3xvnIWZb+Ik98ebekr5GGYGkA2GQ7eBVoyf/j1UkMO0B1B+CdPZA/igfdzyq5hWPyqYp/4QEDINlKPQWRCiJ1xd+HO4/gxCSHKX/4z+bX+M1v6qsgPbjTVa5/irOaaOlWie62mGrpaV7SbeIPHIG+VCtV60GzFIywLLwqx6mG6zmUynTcJOaPRWLtcWAEHK7uo4S+4ktmn0YNBaxzxnEfkr/OTsU7i1U00jGNc9hSV62Z3tZWdfsydDyjTbji9aU5q96it4tSkse6jJq0Irn0rxaBOnGMIxhO9NYydGSfcuRjZlhz9Pz5Z7yY3/BdXSC5QLj1AtH6M+5zWsLJ47RchaCqh3smgZNrJR4dRBHq2DhtmuCUiG934ZVClmZDT+DJa9lUrtOWqln4AQCJFASIHpjOCWZuJVjACQCYRMorWPYXaj/Lr3iQRhUFp4jGr5p4QPgaUpKp1yBFeT1bfSMaulea6nGXq9V7avHjVfF5e4g7es0e3pKsdV4X2x0Yq/tZ6v7YhBSoy+PjI/OROPLiBcOmZuumXZkUU7KK04MfH0slHxK4ltsimHvpTNdGEmVvRJaeCYKQYyWSq1Cxgthd4xJfv7S5zMdWMaMu6+Wmea8XW2sBm0Dgi8HDLyn7CcnYCMDOp/hFIlpAyLSKN9p1Ylhnb96RI5dX76G1QKTzbR1+qdrpU4jO/NoHwPhAO6LolXaF3FtLbi+zNo7UcbdIvXroI5UCXMDowMz70AKIRMYCcPI6SJW3oRvzbR0KET/RwWhtmHX5tF+YV401SjMc0sgXcJt3yMZObdgEXg5yjk7getQMiOkVcbgbXQPNfaDL0BK9u6S9xGYpQ3iNExOjp6LdA3Njb26Ov5fd7UorzRir+1nK91k6TdiGHvdR8ke+EFiqefIMjPIbt7EVpT/PFT5B98oK1kux2lDlYXFb8Sv9QxDQ70lzg/W0RGPFqtNeVakX1ZK7TMFEs/0nuuqbFrfoCnzy2sOK9sZTMoVQsFL9IA7aGVh6g7xalKtNxvFjcJYSCt4ZhXHJ+7A30t7nT3/O9MnPl0yHjQNdzK8dDzAo3AwErsQpUXIu+Q5mLR6Ey3lJGhYotOrf2oU7ZCCbeXw4yk3KGnRlh4TXs7prObavEZNAohrPh1t/RTtPaoVc6ggnw8z56tvIyVOIyQFl5gYVFaVeRVO6VlJ6yH5rlWXO0r2zcRvwpMAD+7RXmjFX+rOd9ymyRtI57uO4K699P4czly/+lb5H/4UNtZ3pbPfq5jOonWqmkjUSuN9nyEZcZR8Xh6RX4pluCu3TnyZc2xCSjWBClLs69fc8euPLbpEKilK7mkneAP73gn/u1ixZu/rthTfpFzMwXm3RqDjsKUAaZhkWgo+oaZJdl1S5w+Uken8NAw1mkcITNNnbjWHqgKgXcJrSthMRUmhtkfGdDL2GtZGmEw63Lfr1V1qJUXW3QKYUYFXWGYIyjlY9p70DpABSUMmcZOHiTVfSc9W+5l/NS9TfakSlXDwq5rBP4sIk6YDgi8WfKuxaXycINLnMNu+VjbzcF20uqVkkrqNM/cD75HkLIx3QAZ6L
Z0zE3A6OjoAeAbgE+4WfEV4HcIl0bDwFfGxsa+ODo6+m7g3xHOzKrAp6Pj/xGYBX4E/C5QGx0dfY4wHPW9hDX0O2NjY/92o675TSvKG634W+35VtokMaS5xNNYOg5mX5bi00c7zvJmfvGdnJh5pm06ye4t18YbiaVz07i5IsoLkJaB1Z+isruIMa9XpCTpfgM/qPDhg4IP7pf8P69ITs0KXpyQnMkF3Lo7y127ZjCN9iwPQ7IyJ1lbFF/o57x+mTkR8nVLPWm6kwvMVdMU5irszmaaN/CEuWx4aLxxmH8Ut3ICgUSafYBA+Tm09pEyQXH+BwjZF6nuaPJgFoA0eugb+ggaqBSe7Pj9WlWHQloIYaGVj2ln8WuvLcq9RYJUz5309H8RYXTHM+l6AU313Bl1u9HMWViE8mpBE5sGTaAhCPIovQUpJYHWTBWrSC4zsmdp5NV6kkqUVsx+5HoubJ+hUshhFqv0zCoO9F6/hI65CQA+CDwN/EvgDuAwoQnRuwk/1GOjo6PfJvTD+NTY2NgLo6OjHwX+V+C/JizcN4yNjdVGR0cFMDE2Nvb06OjotwiVfOOExXrD8KYV5Y3mRa7mfE42tW4vjOVmeV5xnpOTz1BVpUie25xOsn/oBhwrSf70ZapTCyH/Vgp0oPAuFrn4jae45vc+sCwlyexLcWLqaRaqoX/Gg6e6OTaRwTYtbAO8wOCn4ylMOcz79s4ty/JYbrl86ksPcf5723jqve9k7/ApUnaFSzNbmHC2YqYd0kEJ+gfI9N7ZtIHXqm5TyiXwppZsupnWFnxvKprxgmGkQQik2UN54UdII41S9U08EXswp7qO0Dv8B3HR7Bv6zLLMh1bVoWkNoZSL1hD4YfwTWmBY3ZTzP0LKRDRi6G5/nvwjBP40hrmFRNcNVAtHm7+hhpqyolQUH1+H5lJCwHjRwqcnvtlcP2B8ocLDJyc7JpV0UlqOjR/l0vwJzD276FI7UJ6HaxrMDRxmeBWuh29DfB34Y0KnuDzwIPDk2NiYCzA6OvoS4ax769jY2AvR1zwK/Jvov8+OjY0tvSnht6NjhoF/avPv68abVpQ3mhe5mvP5czPr9sJYbpYXDPVTCcqxsq8OIQTlWh7XL7MltZPJ3FiTYb5Gk7xkMzM2xjW/94FlKUmncs9yef4ktpmgUPEZmw5TRvwATMPCsdKYhuTkTIb/5oPvB9wlc+2Vlsv11UbRtHngxHtJnjxC2ilTclMEwmHg57ZiscD/duPPk+1tfmDWhSR1yXS82SV7qVVPY9qh50vdCF8FZwjFGRLTXEzABkh1v5dK4bFoEzFNqvuDSGkyceazK26gNTIfGh8WwuhhfvKr5Mb/PErlNmIBixBixVRtHX9i4fUFtcv43mTUbZsgepivJUmZRQK9+J4LFMfzh7m9CoPm4vs/UagwNrXAQGT/2Sjt7pRUsoRPL2XcRNTHYGvlvb8N8FHgsbGxsf9hdHT0N4H/GZiN/I8dQm/lk8Dl0dHR68bGxl4E7mJx87DRpFoBcnR01CH0YP7N6PVXRkdH/25sbOz8Rlzwm/YJbjQvcjXnE1ewSbKcZDt943tALnT4ylDOu9s8zMnj/0xtDwQJhVGVJC9adL2QoFYNO/lOlKQ9n3kvj5/6NkJIMk4fcxVBuWZgGhqNin0uILyh89WAke6lsVIrLZfrq42upEWX0lSFRb7SA4BSAcWSpq97mP5M58iquckvsZB7AF8JTGmivSlq7gWgFtl0Ckx7K4E3BWjsxGg0EgjLnlt6iSCo4lXH0EEJIdN47kW0drETe5YYDvX0/+dIM4sQZhO3urFwSzMbGh1lP0px/rthTUYjZCIuhvVUbaJj6x14Iz1QGj2gXcr5H2Al9mGYg3i1c6hgAfQcaTMg72WpBQkSRolqkOFM+VpeKYeJKI3vf8YxMaRguhh6R+/pX2xCGuXXjWjHp69jNXFib1M8A/yH0dHR/45w7vTvgE8Qdrf9wP80NjY2Mzo6+mngL6
MRhU97Y/tngS8Ax4EccJTQzP5B4MJGXfCb+lhdLS9ytRS3lc63Fi+Mdugk2e7/nU9x7KUvU3bzzdFRGtJONym7C/o1A2eyeMddgoTGqApEEB5sRZ18J0pSubbQdDMOdfXSlxJUAw2a8EaMvm/nG3r5YM9P3XJN02rj2nKNH6cdBDBpShZMAzW1wC4/4GtHT7bdjPL8Ci+eeYB8eZ6MHKfLKuKYAY508dzXMKwdSGlGij4LpcrUqmMReyEszFppVGUMFcyH/hN+jsCfRMgEnisjQYrGr11g7vKfU5y7H8PsJ2RYFKPzh4W7kPsupfnvI0QSpeYRooda+QQQxIwJw4q6ZdlLfuZbVIpH46KezNzSlNhdR6hGFJHDXJ0zbYbJ25U+zlXewYsLd1EOuqgFFh86FK6+Gt9/KQTZlM1UoUqu7LKzL40hRUeqIrTn09exGvvXqwBvuEvc2NjYaeBI/e+jo6N3AzeNjY39RstxzxNGRbXiloZj7gfuj/76I8Lopw3Hm1qUV+JFrpUytxqeZf/v/B7eQpnKT3+CKi509Kxoh+Uk24vpJJUoUURiW0n2D94QFlOHuJM3g+VXBq2UpNab0Tbg2mHN0xclhhAhh1j5aC25Y7T9Dd0a/tmIxuVy/Rp/KYoxur8nybhpoE0DSykmC1X+6okxlNb8l3ccajrPvz/6DIOVKbqtHBkzDwhqvsAwJaaoUKuexk7sCjfd0CHdTauIXeGj/FnAhGAOCKLeOWxctA7wmcFyduLXzuPXpkHo8GtVkUrxeUwri3T2ABrPPY/nXkCrKobVHxXucYKo2IfcaoVfm0apADu5j1L++9EDo96N34/vXcROXLPkPQtHMymc9LujEYbFcEZSpYTiOI9M/SLdiRQfOBCOh6aK1SXv/66+sDueLlYpuR7DXamOVEXYeMn+G41I5PFGu8S95XBVfIqdeJHrpcy1O19zgfdwem5m4Jat7P7DX8FMJdd0ve2k1fV0kon8GSpeiaSVZrhnb9Mm23oVU+1uxg8fDFkNL02ZTC6MkzID3rXN4K7dyfih0IjW8M9GNHbXjdf4CzMLPNCbJGkZWCkn7Fw1zJRc/v3Tp/nMrQfiB4DrBzx8xuWXBzMMJ84Tt+4CqoFFxgwIvItUg9moIBpYzq6YfaFUNbqauieFpon/rMso38Utn0IFC+HYQZgIaUVeFT6Bl8O0d+LXXsOrTUYeF6GnhlebBF3DMNJoVQUhIxFKjcCboOrlEIaJNHox7RGEsJGGg6qWm9SDdYSZg6EtaOO/7c5m2NFb5dYbDjHQsyt+f9q9/0KEx48O9vCFX/o5RrpTy/KUA9djJ6Oo7oCp0oUrkuy/XTE2NvYw8PCbfBnL4qooyu2w0ZS5JQXeVYw/fgHZ/eiGeMVKIRntv4HdYgdBJkky3bukc7kSxVQ7cct/di3c41YpeUbkrRwwPj+GIUUsSKmjHv65kltZ4zWePDtF4W+fwBZLNzAv58uML5TZnQ3TRXJll+my5lJ1H/vSP0U3pLKYuICJNJI4yUMIYVIpvQhorOQh0Ipa5ThKeaDrBbkVIaMl8GfR2kUaKUwzC8gGypuH1m5EoQvn7WHYqkCgwsIvHZA2TvIQXu1SRI0LUMpHBBUCbxbPvYBhdGFYWYRIoVUVYSymgmsdkO79IJXC0bbudKaZZaRvWyi4WcX7f/e+ofh9bId2K8aROw+w/d5bSTqZ+PfszczC3MTG4aotyhtJmXu9U1DWGs66WsVUqzqwUdxiSpsnTn4Hx5JYpo8Ui9zZTjvx9912AD/Q/PDUOBUvILuMss9wLBKDPc0z8kaI+P+AxU7w8dzHuLb7cbqtHIbwCbQRsgTMFEKYSCMTsh+0h1c9g1+bQgiJUqWwsGpNo0tcCBl/LyHCsYdhDkSMDYVWHobZR+DNghaRB3JUjIUVKvmih4QmUgMKOzRBiiTjASW0ciPrPB8I8GvTmPZWMtlfolI4uoQXPSfMJQksnYQz9fcfVucE14h2K8
bx+19E6vABejVkYW5i43DVFuWNpMy93l6xGxHO2gilFccvP8F4/jS+75GwU7FPRl3cUnTnmStN4AXV2JionvvXbie+Tod76vw0Jdcn7ZjcsntgWfXYSHeSbT0pLs6XmyhbWmu296YYziwGlTqmE3WCHs/M38OhrqNIoUEH7MqETCEj6my92jnqhTc+rfZBWBhWH4E3D9TtNQnHFMJGyAyJ1CE89yKmNYTnno9FIGBiWsMhQ4JQZWeYfSgdub8JgZBOaK5vZSMHOg+twLB6ISix+JBRIfkt4pz3Dd3XFD9VL7jtElhahSyNWIsTXPy5raKhOP21H656zLfZTV/9uGqL8kZS5joWeKUwEhZGev3S1I0KZ43PpxWPvPpNJvNnI88FScJLUfPCmWt9LHF+5hg1vxJudgnRlPvXkxxcshPfSMdKOSYaeOjEOJYhO6rHHNPgE+/Zx5efOsF82Vu0/Uwb/Lc3/ZiZc/9XE/3ss7d+BoDHT/8KfqA41PMKIxmXlJ3BMHvizjbww00yhQRhERrQO6AhkXo3nnsBrzaBVsVwNBEZHplWP0KmcNLXoXUQBZoGMYvCsIZJdR8hkXl35LFh4tfOEXhh4bacvdjJ/QD43lxkeB8Q+LOgo98NTVSMQ5m3YfU3pas0fcYtwhlkGlQpYnZ0LraOacS2qSsV5pUaisr43KpWgZvd9FsHV21Rho2zElxS4LWmdH6a6mwBuzvN05/56rp/QTc6nPX4pSeYzJ+NmAXhHLXihcW2PpYIlM/l+dM4VpKqV1rsYgVUayWuGdzZNLpoR4cLxwQBj56eWDan7/dvH0UKwcOnJ5kpVhnIJPjtfT/g+r7nqNYEluHEvGGAzx/5fNQJ3kxfAkzy5Ge+RSn/EBAazIeGQwLL2R6KSZQHwsRzz4IwMZ0RhLRDZoSuRgUyGwlPAlKZ26gUjzYxH+pFsFI8ysi+r2OYPZQXHse0t2IlDpJIv4vsyB9hGCmUcsld/jMC73JYkAEtROjyJm0sext2Yh/hvDpBfvZb0fhi8QFUT9euc6QXcv+wKne4tfpdrLRiBLGqVWDTCMQ2qM0VufTd54A3Jll+E6vHVV2UN9JKsLHAz794AXe+jNPfRXr3lisyQtpI165A+YznT4cdcsO8VghB1S/j1Mq8dPFRpgsXmF44jxASiQxHtFFXbZlJdm25tum8TXQ4DUV3Lrb+nNYGPz7zBEeuOdLW37l1yd1tK37y3J/y4ni+2XQnm45VcY7pNKjRUmRHZS9B0wAAIABJREFU/mDR4F5VESKBYXVHFp9exITwsJPvYGTf12IptdYBc5H3slILCJkm3X2Err6PUso/EHoityr6gjl0kG8r/W5EtXwMO7EXzzXCjUFhovFDNzpnD6H5kIdAU8j9E0KEs2gVlJib+BL5qW9gWP1x8kkrR7qTO9xKAp7WfYSVVozJkd4Vx3zxCEQKSuemmrxXqtN59tx7N9YVxIJtYmNxVRflOjbCSrBe4Hd//A6e+vhfkvZ10y/5ejf91iNI6WTv6fplfOUhhREbusfn04qqX2Ry4Wxs/6lRKK1ImGlSTg9SSEzDIWk1z9sb6VhFd46qV6S+KZg2FYXKKcbGzSWMjUbUzde/+vhjDFamCLCRUsSmOwC7+mphEWwx3mld5udn/o75qb+Oim0hYlxInOQBFmb+pqnDHNj+J21jqTrl8knZjfIXqCkX0x7BtJeuUhqtPS1nN5azE61q+LVxfH8GFZRQQRHQ1Kqno3m3gZQZhCAMVxUSwxpqw5Fe/JlbpdvLCXgePT3JkZ2zzJXPLfHbXm7FKA254pivMj6HmytTnZhb4r1SujDL2F88wDv/5GMdP/tNvLF4SxTljURQcvEr/oZu+q02nLVdYnaj0b1jpkhYKapWKpwPN9y7glAkIqNi5VgpKrUCAK5fIZMIr7mdiKBOx7r/lYu4Xpn6iZUORSiO2Zmx0Yg6F/mXBruxjWr8uhBhN767f6Dt7LWOOGg1sv/Uqg
QqjHsKvaFrq/IfbnWBC6FxK2fQQYHzr7wXDVj2Vnq2fIK+4d+Pj1PKRSkXIXsaTPTDMYWV2IMtDuKk3kmleBTPPRceI8JoqrBQV6PvJnFLzyPNXsCLOdKtxvuND6nlBDyX56c4PlFgIN3eb7veUJTOTJHeO4jd4Iux0pjPzmawepPkX3ltCZtGWiZzz58lcL3Njb+rBG+7ovx6BESuJpwVVja6r4tEPD8sFq5XRulwlDGQ2YavG7ySddixhUo+TdHNMzp8U8xnbu3GP33LXkpunvtffo2yZ5CxNdcOaz58EGoB5ItV5spzpGwn/ppWN7k6F/ls+Z0c7Hq6iYvsBz6BfcuKgaJKuVQKTy4KR6RD+MgRBP4clrMr7jCX87NoZT74tSkCfwatw5GLALzaJeYmvwJCxnmC9XMF3ixKudGoQkUMDoNM3/uoFMKIKxUsRDzneqp1PaZKxq8Hfi4awYglaSiNxvvQWcCjtcY0qnQnWjbrInrj/i03cvYrP+q4SbfSmM9wLLLv2s3kw6807ZlorXGyGbyF6mZa9VWEt11R3khWR2t6yXIhqm0Ts1m88epdaqNIxLUqmNJkpGcfB0Zu5omT38YPvHgEEXokm2gNKTsTb/gdv/xk3I3bZoJwvaq5brDCttQMtSDJYFcPphTc/yocmxDMV+GrP/kehwY9PnrY4rHz/ZyYTTNXrsWbUffetI9syuHo3EcA2Js6RsIoUg0yXHSv58PbV6b/1UcHYXp1EL8fWmu0dlGBi9Aeys+xkPuHppy91lltnPl3+X9hfvJr6CDMP1RYCJEMFYjBPOX8I2jtx2b8QiQw7a24lTO4pZ8CXtxZK78QMimilJJQlFLvqOsPRR06w0UPkxAGjWko7fjKnQQkfuBzeNDDNmQoealnLyKo+VXG/vp7TD9wfEXK23JjvtE/+hAXvvM01Yl5tB8gTAMnmyG9ewtW2t5Mq76K8LYrynDlrI61ikVg9Q5fjSGrrXPnoe49vJY7Ho4gontaa03SymBIi4n8WZQKuDx/Mu7G58tTVGoFknYXXYksaSeBdBfwfMX3z2R5+qJEqRqWhKqvefaSycuTAVV/mpQdjkUaN6PqReWpuY/y9PwvkjIKFLwM9xzcQ8Kym36udgGicbJJUAAMVKTCQ/uAwi0fw7BG0CLRNjqqdVY7P/lVSvkfoHWFsKMlLKKSUI2nPXxvilL+Ry3nEkgp8QOfROodCJkAJOXCkyh/FtMaXmSKRMGuWkMQfgsqvoVl+KQsEy0skl03IGV6Rb5yOwHJ7QeGOdQ/RbE6E62OFnnn3U4/c4+cvmLhk5Vy2P/Ju7n43efQgUZaBsKQm2nVVyF+5oryasjxV8rqWI9YpNFUqLUbaufw1S4BZXTkFqpemdnCJUAjpEHSSpOObDtdr8x4/nTcfRaqOQrVWUBTqIZmRvXZc8mt8tNxjSEEWggMufjzH5+S7MmG50s7vQghYje5v/7NW4F6UQlwkkPcc7BZlRYnjbSz0ZQOya7bmJ/8ClqXw7kyirDUSVQwj9ZVxk9/ApQbFUsVZQNGkUzBHIE3zVzuO+TG/xytKygVFWVEOEpQHkqEjBTDSIeKQZlqusbAyxF2vwb1ZBEpLQINNfcc9WKMMFBKobTGVzauSoPQ4HuUEHSndrDtmr8LRx7LGO+Hn2t7AcmPjj9PxS0gpIh55xW3QIZeajOVDdkDaW1GrC5nQ9OqN7Ex+Jkpyushx6+H1bFesYghTQa7djc4yYVF2TaTjA7fvKLDV32TcL48gRbhwtkxkqSdvnhsYRgWvvIwpRWOOGoFtNbRvyuqXhEhBBkn7H4NYwvphMVcaTz+Pr4CNwj/FFKhdIARbcyFXs3eiqq0Rg/idmOH+sJdyHTEoAj52OFrDlImqRSexbKHAb8husnCMLPYyYMs5L5DMfePoS+FMEJKm468M4QBeOhgAS1MvNoEIBB2Mn6vtPbC8YS0omK/CGlmCfxJpMwQBAXAxwtMVORuVw7SpM
wiQmg838dwdiOlE3out2F7tEOdzQLhaEsHPo5MUNO1mN6YtNNI28AaSBEU29h1rnEP5A1Iq97EBuBnRspTJ8f75VrT3O3Ulx7a0O9TF4u0Q10ssjx0lF9Rz7FoZ76zFPVNwkD5pOwuBIKqX6LkzoXn0oqtvftIWCk0OhpxCBbl0eH80/XKaDT96QRbukIzm0bjHFOCY4R/NrI9oNlNrlGV5vqLG5CdEqvrYwffX6BceBI7sZdE6hDS6A5VfVH2nRDJ6PiAwJsMlX3ai+Kbgsj9TYWbcTIRzXZD1zYhkxAdF3a4DnYijJPS2g3FKfH1WLEopfU2MIwUpjWIk34Xya5bkMk7OL5wgBdzo5R9C0fOY4sKlnSxpEtl4UkunfytKHl7bdBBwMW//j/IH3sO4/hZEqenSM8F9KdHyCSyeNqj76696KCFInkFY4d6M7JZkK9O/EwU5ZX8AQJ3aZexXtTFIu2wnFgkUD6TC+foSvTTn95KNr2V/vRWuhL9TC6cI1BLLTUbv7ZxkzDj9JGwM0hhUPFKSGGwPXuQQ1tvZ6h7D4HyIy6tCDnPWoXFVYQdd6B8dmb3cufeYZQmGp0sPhwODYbLdsdKxUW90U0uUIovPv4qn/jbJ+L/ffHxVwmUijfy2kEFc/jumfjfhUyExVhHYwfqZkSgtUHgF0C7KH+BwM+jVBXLHgztNv0ZhDAwrGykToxQ30gVKSxnWxz3ZCf2hCMF4UQuc10ku2/HtLc3XaPWAanu92KYA2HnKxNMFGpUfBOlwVcWC7Uu3MCk4ifxdQIpobLwOLmJv+z4GXbC1Fe/RPn7D2GWayAlIgjQU9PULrwGhOb1o5/8BbZ96F2YKQvlepgpi20fetfm2OFnFG/J8UXr3Pj1NhxqxHrTSxo3+oSQGA0sjJWifNptEmacPtJOL15Q4z17P0JXNCseHbkFpQIWKmHAqiEtLGmHPblWGNJiR99BRkdu4cBweK5HT5tcnleYRpXDgx4fHpU8eq6f4zOptm5yy6nSPnfbHoTsw/WKWIZENhgZSaMP09nbIPyQmNYAtSBPfR6stY4VjRoPKXvQIkloEiSiY6oImQY0pr0bAM99LWJJSJAOUqQIvNlQ5RjFUBnWAEO7/jTkSy+JkGpxgJsMHeA0krlKDcuQoKq4yiZjlUPfDg0JI6TfaR1Qyv8onpuvBvVRmIGk61KZ+d2ZyHxPhPL87dsZ7juAZdmbY4e3Ed5SRbnT3HjPvXdvOPd4OawnveRKonyW/VrDwTEXTfqlkLxj+x0g4LXZ4/FDQEcd8o7+Q7xj2x3hwYJ4PjxTqjI1/yxThTP8fy8HvDo9R8UvM9A10OQm5/oBD5+exFfh3LNedA0pePTUJH6g8edH2J14CtMwYwm21op09xFMs7tJ+GE5u/G9WZQ/A4BWxbCbRYUmRYjISc5AAIGfw06Mkuy+Paa4hcb2s2BYmNYgKshHIwyaRB3S6IsVfu3CVds5wOVmH0apGqaR4bm5azmoXuBw7ykUAilEPE8XwkSrcltFYyc0+qaMvDQPQGFbCj9hYFZcRuztTeb1jXsgm25vP7t4SxXl5ZJINjKEtROuJL3kSqJ82n1tuJFXwjITPHnqO03KQIBDW29HCqPJFH979mDbhArHNFgo/ZSZwinuPy559rIV2m5SoVid46ETGsuQ3HfbAf7s4Zd5/MwkShP5Xtjs6gvlx8cm5shVXFL2h6j1KfamjpErFghIcN3eD5Hq/zTjC2X6+j8NLAo/TGsYXxMyGsLAYFRQRMoUWqsm21ClaqG50PAfIETkp+FNgNBY1giWswvPPY/vTYXdtvYjWhuku25jfuprbVkhrXLsujQ8lb2Xf/PC95l1kwTapi9pc1icwxIaIcKsPa01pp3FMPuXVTS2otE3RWjYemwe9Uoe3zGwhc3+j793iR/Jptvbzz7eMkV5pbnxzX/9OeDKHeWWw0rpJSt1L+3SQ1aK8qkr8/YP3Rh/7XxpAtevkLBSpJ2+Jc
pAYAnf2ZQ2vqqhtVqcuzZ8j4n8WTwlOTYhWNQ1iJgW99jpKbxA8djpqbg7DpRmqhDKjnf0pinVAhzLQCOaeMxa9HKTt42j5482OKO9n8/c8knwJ5k8/y+xnO3UqW8IiVt6EQ2Y9pbIejMMOrXsrfSN/FGTn0ZQG2fy/B+jdXgtoUUoUQIJSKOHdM9daK0oLcMKaYeEneLduw5F4xo4Ov8r7Em/wp70SySMMCPQsLMY1g5SHcztO6HdKEwGGqvg0vPz72s7CltvRNom3jp4yxTllebGfr78us7dlnsoTD56HOUFzBw9uWz3spwwpBWdfDJu3f8xHjvxLTJaNXXcnRJHhJCcn3mpo98GLM6sC65JoSawjebrUDogV1H86OQElhm6wk0Xq3EHO1106U86pGyzaYYcaJuC38+pmQKTpYukbXPJDPqzN/WgVB4hQvGGiIqaYWXxapOY1tbQTa4ug+69Bx3kUcIIZ8PSQSZ2k+q5k+Lc/WEQq7SwnJB1keo6QnbrvwDg8qnfXVGM0g7Ngo+A/3fyX/Eb+x7iXb0vg65gmP3Y6duoJn8H1w+aKIKtUvVWrNY3BV7/BJ1NXB14yxTlJs8KpVCej7RMkLJpbrwRjnLtsNxDIX/sAu5METPtrKp7aScMaUUnn4yaX0Upf0Vl4ErngcWuuj6z7tIeXbbGDRo25yJaXNISlFyflGOyOxtusr02X8b1A7SG6ZJLyjLQkU1EHUprSq5PoqUgSQEPn7rE7964ta3jW7iBZyONbpRawDB7ASgXnqKUf6Bp9ADhGMivTRF4l5eYEWntU6scJ/BnotDWZrQaB7WiveDjvSjl4tVm+Pozszx6Zp5c+WgsSf/0Lfv56tFTK/omt/NN0ZZBxS/hiOaH9hu5ob2JNw9vmaJsOBYDtx/g1Jd/QG2+FPvB2r1p9n/2/WvqENa6SRK4HoHrYfUmUdUW6ppSeCWXdMuNciXdy3I+GbPFi0gReiQ0ei7D0g3D1fptNM6srx2WPH2xPsLQOFYapeED14zw1Plpyp4f5k0rjW0IbMPGkoLd/WnOzRY5myuwt38xBNT1AlK2sej10ODn7HoB33tlnMOpraT1qzHnOHpj6R26d9Huc/ZbTd4VjaMHgNL89yMK3HDcVWuI2RWBP4tXOYk0u2OaXB1146B2svBGNAo+IHSr+9ozBR44Pr2EifLo6UmKNb+jb3IrpONgDg0t6yL4ephpbeLqw1umKEOD7ivyH0DHwtpVYa2bJK3Hu7MFlOuR2TMYt4OBW8NMJZYsKWH93Usnn4yiO0fFK2FKmyCokbDTZCKJtdaKwe6dTWOR1fptwOK8+5cPnyHQHq9MmfhBki2ZAe7cN8R9tx1ASvjyUyeYK9fIlV2EEFhSM9yXRgrBnv4Ml/OVcOOwUqMv6fD+/SM8eW6KSiQwafRz7nIESdPnoneI7RZ0yctLqGlCGGBmY+e2RghhUMo/EhLlhBG/Vv/v/NR/wLQHEcJCyiTS7MGrTQJENLnIOGiZDcDlYp06+SMLAUfPz/DOkd6m1+tS9XZJL4Hr8dKZR5j0ziMNo72L4AaaaW3i6sVbpigHrsf0E2N07R1CByrulIUhmX5ijP2f+cCKv5Rr3SRpPT65tY/i2Skq4zmc/m7svhRD738HM0+ewC+6TdcE6+9e2lHgQrZFKJPuSfRTruWp1kpopehJDSGkYKpwnou5sbjD2j9046ppeI3z7tuuKQMO+WqwdBaqwy5Z66UPQyEEA2mHL/zSz8WKP8c0MA3BA8cvIQWxn3PdyzmcX5tM6J9j/97/EaEWlnSqys+FUmtMDBmmdi/+2zSgo8DUhsvUAYF3GdMaiA9f3ADMY1jhLDjdfWRdG4DQ2R/ZCxSlmo8XqOh6FzFXccmV3bjjrj/4J584zuW7phEpM3ZvC9/T5lXNRkWkbeLqxVumKDfO04QhMRo6hdV0pGvdJGl3vBCCrr1DGI7Bu7/wcZIjvQjTYObJE8y9cC62RLSzGVI7+h
m847pVdS+tFqCtFLi6bFqjSZpppDTIJLKknV4QksHuHYzPn247N14rDa9x3p1qMH1z/YAnzkyzd6CLHb1pXrg8F/tq5Mo1dmY1Ugj6kmEUVGMhr2+UPXzqEq4f0OWI2Mu5jppfxVMBqZa5rtKKk9NjVMsFtK6E7mlmikyiFxBIc0tEfatGx2u8QGGKWriKavK1EFjObgyrytCuf4udPASsfwOwkz+yZUjSthkKTlrQKFWHxQd/0A0qJZCBCtNBIC7MTS6Cm/4VP/N4yxTlK52nrXWTZLnjvYUqhhPmp5344vfxC1XsbJraTAHlB7i5Aj2Ht63YvSxnAdpIn6t4RZRWJK1M7AgHYRflB17T3LjRgW4if5YjB349Ps9qaXjt0NgVWqZkIB0yMAA8pcJCKGUsxW5EfaPs3pv28E8vfZuk6TcxPKCzgGZs/CgX586QkftJ+C+Gqd1RkGza6SbTexcAxbkHOD9XJVd28QKFYyiGUlmSbYZboanRIaR08GvjcTRUK1baAOzkj6w13LJrgGKtuVg3StWh+cFvVDVGVaKtcAni5oqkdg4gZAcXwddpQ3sTbz7eMkX5Sudpay3qqzk+cD0mHz1O+eIstbkSKlBIQ8bn0n4AyxD6V7IArY8TyrUCz5z9LoFaanjT6gzX6MdrGwlcv7xqGt5y6EkY9CQkVT/sjkMGRlisAboTNndHs+dOSNkOh4f3Rl38yp1740Zlyb4bACc4gaRM1fcZHLwnZl88emqSavFxkkaNQGd4aeFajuUD7uBldme7G75Xs/l83d+5bd5fS3JIO7TzR75jXwP7ouX1xvenafUXCBIXTcp7aggEygvQng+2saK4aBM/W3hLfdJXMk9ba1FfzfGV8TnyL17AnSuFNr6GRAPudIH8sQvLjlRWawFqSJOuRB/DPfvajiG29l3DdOEC8+WpONev7sfr+hXOzxzjHdvuWBUNr+11NvClB1Muz122SFopMk4fe/ozbO9Jcce+Qf7F3e9oy8NtxVoENE0blUJSct5HSd+B1CU8nWB04LcQwsD1A/7u9Adwg9tIGQXKQReBthEEOKbF7v7LaJVvaz7fPu+vfXJIh3eIe2/ayr037Yln8KbUuH6Z+27bv6zFaeuDv+uFsFuvbvchJbCcBMN9+9a8qtnEWxtvqaJ8pfO0tRb1lY430g5e2W2z4wVeycVId76hG30PWlG3AG2Mllq2mF2CifmzTdcRJpKkmVq4wMGR9rzm1aCR5/zLhySGCDg2UWCupNjWO8Qdo83c205J3XWsRUDT1vNDWCjRiy2teEm/OFoJxSrxe4DBg+Mf4jfvupHBVBWfHuaqYSah0/Ate3rvJVjI4wYvoFhYNjmkjnbinsGu3cwVYKqwNJG6VS4NSx/8Qgu6n0+SeT5g8MOHOXToQ5sd8tsQb8lPfL3ztLUW9ZWOD0ouZiqB6xaaeK9aa8x0gqDkQnd7o6FG34P465RG13yM7u4lFqDLFbNdW67l1Ykf4wXVyCtCxokkKznQLYdWnrMh4ZcPwy+MQtlz+dC1N5OywwfPSkndjeesX/9K17Rav5BOG24Qbqz1pjJ8+elJHjtzsknI8dmb9zH79a/EM32Z7SFz5E4GP/FHGFbz59a6GdtOlDM2cRQQYbqLAC+oLRHqtKL9g/869n/qg0jZefS1iZ9dvCWL8pVirUW90/F2NkPvdTvIv3KJWq7YFEjZc3jrspuPTb4HUlI6P4M7W0TVarDtWvTXHm7Ln243hkhaGfozI3i+i9IKKWRcxGxjeQe65RAKPMqRL3PjOcEQLuACYVFeSTm42qLditWMOzptuNU31r7x9Om2VqOFxx/lY08vzvR1yWXhgSeRfk8c69VuMzZ1++1M3Nm75EHh+hUC5VHzy9HnEObsTeTPLJG/17HJpthEK96WRXmjYDgWQ3cewi+6pHcOxDxlgKE7D694c9X9DS7/x29THc+BnURtOYw/eOOaTGaEFzBAP+PqIoax+JGuxoGuE5
RWnJ36KQvV0JdZCEnCTMXxU42MgNUoB09MPM3F3Kt4SlJwTbr0Url3O6x23NG64dbrmNw+kuET1+/gU//w7BKBh4Hm8dfm+IhhYbOY6tE602+3GTv7+D9T3Hk9yZ17mt4vL6ihlR9mL0Zz/WqtiFZqxdXKJptiE3VsFuUrxJWEUQrDYODez3DyBxWCrjzaSkFUcITBijLtxi7OzOdwbt1J+Z3bEbu2YZvJdVHf6hgbP8rl+ZPYZoJqzQc0Fa+I1pASGUa27IuLY+OGnNaqqVuv+VXKtQUuzZ3hH4+HLnSFmqDL1lw7LPnlw527yEastFFZp9393nv2cuKrX8N88gmM+Rle/NYIE9vvpHv3riZTDu155AOYlzaDqhq/XkMyX6wwND1DestA281YywMxOQvbd0E0YhBCRA58oln+LsBTNUzZnPS9iU10wmZRvkJc6fKzlivi5msYiaUFZyVRTGMXJ+0Ew89OETw7QeqePrZ/8nfXvUnU2PlmGpKy/UqVYnUanpgnd7rMiSNl9t/3QUxpI6VBoTqL61fiuXbCTNGdHAQE//fLNZ69ZCJFOP5wA8HTFwWB9rjtmvXNvNsh/42v4vwwssJ0EnSX8qSnxnElOLt2x8cJy6LHgF4VMh8C4JuZfTzrDLBgJtj60AluH7rEB+bnMFssNGWgyZydw722hpEIN2qVVouxVi0broYR2qbaLN3UXQ6bRvZvT2wW5Q3CepefMS2qWG1yvoPlRTGdKHUGEu/RpxAf/yyBxbq4ya2eGRmnDyY89GwFLQy6TqcJSh4X/+l5LvdfhHclmClcjL/GlBagqdSK9KSGMGSK41N14/xFSAGvTJnU59Lt0InN0e71du+JjeIGb5bHcw7Ojp3xexsguGNHH9a4B4bBNzP7eCw5hNSaVLaXitJ8/7U8ueHr+a25V5dc17ZzCm/LYSaKF5grTxIoNwqgFfiBhxQGUhokrTTdycE1zfU3jezf3tgsyhuI9XQ2wjTQWjP7/Fm0r5BWuFGY2r68THs5Sp2/MMcrZ3/EDLNtN9VWoq21UtG00tRmS4hAID2BUQ1bweINNSreefpquxBCYEgTpQM8FcZUJZwwkDVXrlELEkDoe7EIjR8kyVeDWNJdZznI3l5O5p5fsjF4zfBNnJx4uu2GYaf35DeLp8H3Oa6vZcHTsZDjszffyezXPWaffJJnrD4MaWBkszg7Q48MwzR4YXiUX505ji1A13yEbYJW9Nx6hKGdd6IvPkYtCB9GJXc+nCFrjW2l6E6EDJqR3r1reihuGtm/vbFZlDcAV9LZnPrSQ3iFKk62K2RweAHVmQLdh5aXabej1NUxdcsO3NpFhGE2MSHqXhWtBW3/0I14QTUu0ku8Nzwf5QUgIXkxVJ9pQ1Pd4YU+xp6L1grTsIDwIdKXHkEKSc13Sds+2/oGmS7MRIrDcObsWGm2ZAbIppwlLIfJ23aycN1WErt2N/0ME/mz+IHbluUx2n9D2/fEAH5HTLPtt+9gPqBJyDF03+fxf/W38b/5OKlkIu6k6yhv3c70mX2kfvwIqlxCptKk33MbA5/8DIHymSpciFYGNI16wlWDxUjv2sQfm0b2m9gsyhuA9XY29RtQmgaZPYPoBgaHEMvLtDulagcoyu/cjmk0f7RCSE5NPUvCTCHlYrF+dfwoJyZ+gmU4JO00wz1hEWmkovmmhyEMnLMGXS8k0EKTv7FCdbsHUrDg5Qh0gBn5IWsNOsras80E3Yku7tw7xAPHfdJObywDVxru3DeEYxpMfumL8c+iU0nyW0zU9DQuYnEWLATThQv0pYaW/Gx1lsdySePJdIp2SYoDfV1ke7vb8pyty/N488NUR38d4ZXRVopKXnL6qz9k+6dvXXXK+GpXUZtG9pvYLMpXiNpCmfEHf7rk9dV0Nq03YKP73WpuwHZRQs6dtyJ2LS09Gk3JXWiabRaqOUruPEorLGmxUDWYL0+htebwttubqGgXHn+C8WdeRBiChXdXqGyrgR
YYlhkGniqN59ewTDtOK2mk5DVT1hR9SSv2gmidBfuOgZ8wkIHGn8vFs2ClA/yghtIKo4V+VxfJrCVeqY5OPGffVxy8OIsjJSDRTrgZKQiZMbs/eWdba1SBIGllSNlda15FbRrZb2KzKK8T9Ztt/MEXmX7iBNKxsCMf3Lq6b6XCeqU3YKcoobMn/n5JoVA6AHQs1AgIX0qQAAALRklEQVSL9DyB8kOmmAgjBKpeiZOTzzI6cnM8ykjZ3Rz47D1ILZl84hUqW10Mw0QaBiqq/4Y0CZQPOnR8M6XdRMlrH6lkRO9T8yzYdAPMaoCyJHgeyvOQjoMUBqZhtxWb2KZDoHyUqZe8J+0CSFtRf2g8enqCXKVKNpngluFe3vGdOejQtQbz1RUVhye++P01raI2jew3sVmU14l4ZCFAJix0oKhO5wHCZBJWLqwbdQNKx2nyyWhXKASCtNMTvxYon0AF8QOkzq0VCMpunkI1R29qcPGahGL7p29l6OPvpnDiO1hOAgSU3DmqfhnQGNJidORWrhm+kaSV6bCJ2BypBEvn4zLQdF0qM787g7AspBW9D1qzpWsnfuA2fX2hPIvQ8MSJ/0TCTscbf43vyUoQAt63N8c1fReZLVbpzyTY1rWLXH+KoNQmJCD6bEftzorD9c6HN43s397YLMrrQOvN5mQzVKcWEEKEm3U7BwBWVVhfjxuwnTR5W88BtNZcmhtrUd5pZDQL9gMPpQO01vzk7P1s7xtdwnawTQff8LBl2CJnElnSujccgRgO1+24e8386Hbz8ZGX5kErqjfsJ0BjS4vhnkX2xUT+LK5fpjBxjqBcJjFXxbUsgr4sF3eGYpDllIKtqMvEpZBsyZiAz+WFk1gf7UL/zcyyD81OisNKLr+u+fCm9Prtjc2ivA60zoLTu8KECDdXRLk+0hSM/Pz1qyqsr8cN2EmarLRCCBExGDxMwwpZEzL0ZFbaBwSGYYHWbdkOgQoIlEehmqMronwJIZHASO++NRXkRmpeu1nwoaFb6L/nU9S0G87CPU1tssD+7A0oHXD2hX+malSRtsTtdXAWPPzpKQAmjMSKSsH65pvRm+goE9fXm4zkrmPm0bFlH5rtFIdXOp7alF6/PbFZlNeBJTebEKR3D5LeOYAwJLf+zeexO7jDdcLrcQO2ForWYn1m6gVOTj6D65ejmbNAIMnYPXEMVTu2Q8bJUq4VMKRJzXfXnGbS0Zzos59rOwuWgcWpv1rcLKvcpfGvN9DFeUTWAg1eMvxVTix4+HM5atvLHf0mWjffjO0J5n/No2vX0JJja77Lzk/dzjW/94H2ToHLsCo258ObWA82i/I60PFm07D1569bc0FuxFoFKOsRrNSL9eFtRzCkycW5MaYWzmFIKzYdAjqyHYQQJKw0N+4J/X7rXaw7kV/VdazkKNc6C26kHMq0SXmwiJ6sYfpFRG9v5DcBfsJAL3gIz8MK5BIVXV2Ycvbvn+Xygy/Hm296zse7WKSkZZyLV0fdeMmQZtNDc7Wsis358CbWis2ivE5s9M22VurURkhx653zvsGf49ETf4dWqmkJvzzbIUHK7kZXPU7+2beZee4ybt5d8TpWcpTb1/dugvlqXNxb5/dBQhMkFDIwURUwqwFe0kQAWoI2BBgWw/3XxKOLRmGKP5tj/sQMVmYX/vabQcgwiumSRSVZiHPxYHmXvdVy0zfnw5tYKzaL8jqx0TfbWgUoGynFtc0E23oPxN1qjA5sB60VQ127mPnKl0Pb0QvjSDuF07MH37l52eto9dVoxPzZizz5hb8iuFiNi/u2j97YNL83qiIOGNUygTMfXpufMECDUfUZzuzm4I4j8XkbjZuQBqpawXDDn9XfcSsQRjEpv4R4JwRmsOxIZj2sCiOiTG4W5k2shM2ifIXYiFnwWm/yK5Xitht5dDKTb2Q71PwqlmHT37WT3vufZ+7738OdmAXDQgQexuxioet0HW0jnoDSuWm8S0X0XFfTQ0Z5QdP8vilgtK
sfKysR8zlU3qVnRnEwcy0jv/a5uBNvFaYI20RaJjpQGPmz+NveA9JEaMHA6T5uOvgb+Ia3rInTWlV3mwZDm1gLNovyVYC13uTrleKuVBw6UbsObb2N/UM38sqlx8mVLjM+f4rXnDEy12axnj27ON4QMi50tTmv7XW0i3jSSlPNFUheCn016hCGZOboCfpvPcDEg8fih1DXCwm00sibM5g7e0nt3ssWY5CDu+/CTDSrGVtNioQUOP0ZqpN58KqhdNrpjjff7GQSu60YexFrZVVsGgxtYi3YLMpXAdZ6k6+XarWa4tDJTP7U5DNMLZwL6W++JiBgfn839j176Xvw3OKBUaGz+/o7XkdrV274Eue4oOvlFsc7pahO5tn+kRsAmPrhy/iVGk42zaHh69lzz3vxdHXZrradcVN6V8gjd+erBMrCTllr2g9YC6ti02BoE2vFZlG+CrBW6tR6qFZXUhxaN+ekZYFlIYIA97pB1A/OI4PIK9lKoGSCkWUoX63UPDOwePrMV/B19JDRmtL5adxcER0oHv21P0caAqs3hZVOMHDLgbi7t1g+0aOtcZMQpHZkGf7EPfT+6n+xrhnvajd6Nw2GNrFWbBblqwRrZXN0On7PvXdTGZ9bUmiupDgs2ZyTErMvGwo1skmsPf2os7PomocePsi2j9zY8Toa0diVNz5kSuenqU4toNEIaVAZzyEQJGo+5u4E4w8dQ1rGqpf+y5kUtYYErBar3ejdNBjaxFqxWZSvEqyVzdF6vNmT4uw3HubHn/xS23nxlRSHdptzzq7/v717aW0iCsMA/E5mJremte2iJRZKbHFRCNhSLLjoxpXU3yBupCD+IG87/4YbQRCkdBdcFLVdqbQSqE0vM8lkXEwn5DZzZpK5nIT3WRVT6pnNlznv+c45zkHw7dO/mL1TgPZoA4UHD7H08hVOPnz2HIcX90vmz6dvuDm7gKJmkF2YgVlvdM7nMOqNzgadMFP/YQc39R9SNOrVS6KFXm4gobBYlCUTtpvD/X3RaWTjFIdhi3OAguzqKlaqj7H+5H6n0IU9Fc3lfsncfbqFL89fQy/l0W5aMM/+OfdGAbCblnNlVi470tS//+AmIJnOCG4goTBYlKdA0Lx4nOLg1TLnXjEVZhx+CuUFFMvzaF2ZyOD2uqx22/k7uurcYYjopv5JdEZwAwmFwaI8BYLmxeMUB9u0UFE2sF7Z8uzjjWJRq/+NPrtYws3ZORQoyC2WgEwmsql/0p0RPGCIgmBRlogo1/T6PGxeHKY4+E3vB/6vMXLr7mfrfqMvlOeRyTrXY2UX56CFbF/zw84IkhGLsgREuabo8zgXk8JM70cZh9+zdb/RA4h86s/OCJIR93hKwC18rSuzp/B9f/Mx0OeAs5i0srcJraijbTShFXWs7G2O9UYpmt5bxuCNHGHH4fds7hu9mtN7fo6K+yViW+2ef2dnBKWJb8opExW+yrPdQLlnHItJo0zvw4xDht1u7Iwg2bAop0xU+C5/noYqjFEdkGTWG1Bncp7Te30uD8towTKaQwtnkHHIkOmyM4Jkw6KcMlGuObO2lFjuOSzftW0bdsuCcnvztG3baByfQsvr+Lr/fqy+XpkyXXZGkCyYKadMlGtm54qJ5Z7D8t3mxQ202XwnI77+VYcCp5/YK98Oipku0SAWZQmIFsfiWMTr55XvZjSnHW3n3T523r5AaW0ZpbVlQOk9YtNr4U8kiWcjmiSMLyQgyjWTyD1F+a51aUDN6WieX0eaATPTJerFoiyRIIfbxJV7Bs1348qAmekSORhfEIBg+S4zYKL48U2ZOoL07LKvlyheim3bQz84PDysADiuVqvI9Z09S9MtyNnCo54/TNPJMAzUajUAuLe9vX2S8nAmGt+UaUCQfJcZMFE8mCkTEUmERZmISCIsykREEmFRJiKSiN9CnwoApjm4UYCIqFtXnVDTHMc08CvKZQA4OjpKaChENAXKAH6kPYhJ5leUDwDsAvgNwEpmOE
Q0oVQ4Bfkg7YFMOs/NI0RElDwu9BERSYRFmYhIIizKREQSYVEmIpLIf+9v7eriPhbKAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "visualize('umap','tfidf',corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "visualize('umap','tfidf',corpus,labels = False)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWUAAAD1CAYAAACIlORMAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzsvXmUJNdd5/u5N5bcas2q7urqvVtShyRbuxdtrcXG2PKCMTAGDB5ZWDaNJcAHBvM4nDfDPN5jhjEMMBiQFzDGGMYeDDbgNraQLam1elNrbUWrW713ddeSWVtusdz7/ojIrMysrK03dcvxPUdHXZkRN25EZn7v735/m9BakyBBggQJzg/IV3oCCRIkSJBgDgkpJ0iQIMF5hISUEyRIkOA8QkLKCRIkSHAeISHlBAkSJDiPkJByggQJEpxHMF/pCZwqHMfRwCrXdcebXvsp4F7XdW9zHOcDwGeB33Vd9z83HSOA/UDZdd3XNr1+BfAM8Fuu6/73ptc/APwJcADQgABKwH9yXffxtjltjsd+tullAfyJ67p/tcL7+wzwv13X/feVnBef2wv8k+u6b4r/3g3c5rru5ErHWuF1fxv4ReAB13XvWuCYXwU+1Pzsm977R+C467r3xn+vJfoM1xAZEL/vuu7fdjjvQWATMBW/ZAMPAR9zXXfmdO/rXMJxnC3AH7iu+5Ov9FwSvDJ4tVvKh4Gfa3ttO5DtcOwvAV8A7nEcp32x2uW67tWu617juu7VwH8H/rHDcQCV+Nir42PfDvyh4zhXrmTiruvefSqEHKMfeEPTWFefbUKO8UHgfYsQ8k3Aby7w3seIPptm/B7wpOu6VwFvA/7CcZw1C1z7N5qe+VXxa3+30hs4D7AJcF7pSSR45XDBWsrLxLPABsdxbnRd97H4tTuBvyX6kQPgOE438PPAG4Grgf8A/P0i4z5AZL31AeOLHIfrusccx3kJ2OY4zrVExJUDplzXvd1xnP8b+FkgAPYSWfonYuvvE67r/oPjODcCvx+fp4DfcV33X+O5/1Z8TwHwEvABIusyE1vI18XvrXJdd3yJ6z0O3ARsBHYBd7quq5rvx3Gc9cBfAJuJdgGfc133447jfBFYD/yl4zj/2XXdL7adNwT8GfAbwG+1vXc70edxH9GCUocB9Ma7m2w855b5LPDMfcdxfg044TjOpa7rvug4zoeBXwFC4GR833sdx+kC/jS+7wD4CvDb8TN8znXdP4jn+Nf1vx3HOUhE+O8ABoD/Ep9/HeADP+a67nHHcdYBn4ifp0W08/m9eEf1ALCT6DuXj6/5D8BngHWO43wjHv9PgZsBD3gZuMt13dmlnkGCCxevdksZ4G+A9wM4jpMlssb+re2Ynwf2uq67B/gc8NGFBosJ4sNEP9BFCTk+/gbgYuDJ+KXXEEkJtzuOcxdwB/B613WvBJ4D/rrt/H4igni/67rXAj9GZDFudBznx4hI+IZYDjgA3AvcxZzFHjaNtdT1LgJuA64A3gTc2uGWvgB823XdK4iI6Ocdx/kZ13V/GjgO/FwHQjaISOw3gGNt760lkod+jogwm/Fb8f0eA14A/ovruqMd5jQPrutWiBadKxzHeRPwMeD22Or+O+Ar8Wf5/wBp4DKiBfmmBe67Hel4rF8HPkUkUV0FHCH6TAA+D/yV67rXEe1cfsRxnPfG720FvuG67huIdg//I/6s7gb2u677VuAGos/jyn
iMl4EV7bgSXHi4kEm5U364ZP4P+wvATziOYwHvAf6ZyCJqxi8RkTFEVvR1sXVax3bHcXY7jvMU8DxwC7CQ5peJj93tOM5zwH8jIqoj8fvPuK47Hf/7DuCzruuW4r//BHiz4zh203g3AMNEJLKbyLrSRD/OHwH+j+u6RQDXdX/Ndd3/b4F5Led6/+K6rop12H1EFlwDjuPkiEjrz+LrTRGR+h2LXJP4GTzsuu79beNZwP8GPuq67kiH875ARFZrgcuB33Qc5w0djlsIGigTWeFfdF13LJ73XwPriKz9HwH+0nXd0HVdz3XdW13XfXAZY385/v9+4ITruk83/Z2Pn9WtwO/Gn9sTRBbz1fFxPtFnCfAD2p51jGeJvs9POo7zu8CXm3Z8CV6luJDli3GirWOztToETDQfFG/Nf0Ck7d4J/BowWH/fcZybgdcCH3Mc59fjlz0ia7n+A9jluu47lzmvSqxrLoTmrWf7oiiJPhPR9JoB7HFd941Nc14LjBFZs7rp9T4iSWUhLHW9StN7mtZ51I/v9Jq1yDUh2qmMOo7zHqCLaHu+m2gx3AL8T8dxIJKEDMdx0sD/RbRtfzOA67ovOY5zP9GC+J0lrlffFV1GtBu4vcMhIp53QOsz3EBE5O33b9OKWtO//Q7jG/H5N7quW47HHgSqRN8/r0ka6vSscV130nGcq4gWwjcBX3Qc53+5rvtHne45wasDF7Kl/HXgVxzHkdDY5t/JnPXRjL8h2mb2uq77XNt7HwE+77ruBtd1N7uuuxl4J5F1vfGszT7CN4C7YqsKIs3zYdd1m3/wTwCXOI5zC4DjOFcTacdrgX+P59kTH/s7RItOQERu7T/05VxvQcQW9BPAPfFceoH/CNy/xHnDruteFS9W9e351a7rPh4/97qD7j4ii/ZuosX1KPBT8bUGiQj5yQUu04DjOBngj4Gvu657KL7vn3YcZ1X8/l3x+PuInuGdjuNIx3FSRLrurUSL3uuart3uhFwU8W7oCaLPo75gPgq8e4lTA+JFznGcdxJpz4+5rvs7RN/jqxY+NcGrARcyKf8qkRb4nOM4zwAPA19kToZoxleIvsyfb34x/pH+BPDx5tdd1/0WkdPrl8/8tFvwl0Sk8B3HcfYA19IWLRJvuX8S+LjjOE8T3cP7Xdc95LruTiK9+VHHcZ4lsjR/Gxgh2hLvcRxnYCXXWwZ+jkjyeJbIYv0ybTr4mYDruppIT/6I4zjPA98G/pvrursWOOXjsWT0AyLiniVapIllkz8CvhWPdSfwzthS/a9EO6OngaeAna7r/iORg23YcRyXSEZ58BRu433A9fGzehL4e9d1v7DEOc8DoeM43yEyPJ4n+o5/D7iRaOFN8CqGSEp3np+IyeW3Xdf9+is9lwQJEpw7XMia8qsScezzHqJEiEdf4ekkSJDgHCOxlBMkSJDgPMKFrCknSJAgwasOC8oX3//+91PA64mcRu2xvwkSJEjQDIMonv6711133bKieRJ0xmKa8uuJUm0TJEiQYLnYDjzySk/iQsZipDwCsG3bNmy7PW4+QYIECebgeR579+6FmDcSnDoWI+UQwLZtUqnUOZpOggQJLnAkUudpInH0JUiQIMF5hISUEyRIkOA8QkLKCRIkSHAeISHlBAkSJDiPkJByggQJEpxHSEg5QYIECc4jJKScIEGCBOcRElJO8EMDpWoE3ghK1Tr+nSDB+YCkdGeCVz20DimevI/y9COosICQ/QihQSvCYAzDHCLbewv9QzsQwnilp5vghxwJKSd41aN48j5mi19HCAMh0niVF/Brh5DCAGkjxH5qlRdAK/LDZ7vZTIIEiyORLxK8qqFUjfL0Iw0LWOuQwDsC2kfpGgIBOiQMxpka+1wiZSR4xZGQcoJXFebpxkEBFRYa72tVRata3DtaoeNG1gKB7x0n8JJ6OgleWSTyRYJXBdp1Y2nkyfbcTO+qu5BGHq3K0XE0N/iWkaUcQ8T/JUjwSiIh5QSvCrTrxl
qVmS1GPWezPTc33pMyhZAptKoipE2dhrXWmPY6DHv4FbyLBAkS+SLBqwDtunH8KuiA8tRDdA+8j0z3jSBSQIBpb0WaeaTsQmsFSAxrkN7VdyJlUqY2wSuLxFJOcMGjrhsLkQY0fu0QYVBAKQ+ta3jP345pDyGNPjLdt9B3yS8zPf43lKYeQgVjSHMVud5b6R/a8UrfSoIECSmfS4Q1H68wi53vwkhZr/R0XjWQZr6hG/u1QwT+aBxVUQPlEfgn0HjYaZvyzGNIs5f8mnvoW303KijEVvOchaxUrePrCRKcC/zQknItCCmUa+SzKVLm2U0YUKFi3333M7rrRaoTM1jZFKvf9Bqce9+GNM6+glS/15xtUvKCFd3zuSKoUAXUgjIpM4shV/a1lDIV68ZfIwwKCARaa7TyQIBSM6jaNCqcxjAHKE8/TN/qu5EyhWzSkBdyFiZJJQnOJX7oSDlUivse28uul0cbpLx962p23LgNQ54dgtx33/0c+9pTlI9OUCvMovyQ8e/tZ/yxvdz0d7+CDsKzYkHX7/Xh/Sd5ZqRI2QvJ2iZXru3jlq1Di97zuSIopRXuyBOcnD5Aza+QsjIM9WzBGb4eKZb/efQP7UAFU9RKu2OdOI6j0IDQgEbrgMAfRasgWmjanHqLOQvza+45MzecIMES+KEj5fse28vOPccwpCBtGZT9gJ17jgFwz82XnvHrhTWf0V17KB+doDo6DQKEFKA0o7te5JGf/mOkaVArlEjlc6zefikX73jLaVnQdcv4i08d5P69IxwuliiWPYQQ1Mo19pycYrYWALDjxos7WqjniqDckSc4WngRISSGNAlCn6OFFwG4bO2Nyxqjbs33rbmXSmk3KpxCa0F15hEQKj6qHv4mUKoEMjdvjPnOQhDCoDz9SMOyTpDgbOOCJ+VOOu1C2m0tCNm1fxRDtkajGlKwa/8od19/yWlLGe3X9gqzVCdmqRVm5wXBBrNVRne9yMDrLsJIWwRlj2M7dwOw7Z63rvzaTbuAiXKVvaMzdKdMCuUaQkQXF0JQKHls7Mvxtedf4JL+76FVtcVCRfvnhKBCFXBi6gCizSIWQjIyuZ/1+cvI2t0LyhmdrHkhBEKYQADCAB2CEAhhQSxrGGYOVAnoaYzV6ixshQqLHS3rBAnOBi5YUm7WaetW5qqbtqGB8Uf3drQ8C+UahUqNtDWfeIuVGoVyjeGe7Bmbz+rtl7Llrtsw0xZh1Ueaxhwxa9BBCFqj/BAjtoyFIRndtYeL7n7TiqWM5l2AFIJqEFLxA6p+SK5pLF8pJstFakGJYlkzkG21UC8Z3HLGCaqTNl0LynhBpYV0tdaUakWqfplde79I1u5eUM7oZM0rFWCYXWitkTKL1hIQCGEDEtPOY6UvR5r5lrGanYXtkEb/vOOXc38JEpwKLlhS3nff/RzbuRthyIaVufeTDyCArq1DHS3PfDZFPpui7AeNcQzhkTVm6Lb6ydkmI9PlU3L+dZrP0a89xcmH91A+ViCYrSKkQFomRtZGK4UwJdIykW2LhFcs4xVmyQz3L/v67bsAy5BYhiQIFYHSaK0b1rIlBaGu0GVDdxN/CCE5MXWAi1dfdVoE1YzFtOmUmSVlZQhCv3F8qVak4s8ihYFlpBaUMxaSG6Q0AcnwRZ9k8sQnKM88Aki09mNrGXK92ztGW2S6r6c0eX/LmFqH5HpuXpBotQ4pnPgEpalvo1UJwxxMnIMJTgsXJCnXdVrRpLvqUOEXS5ElqhTI+ZZnKmWxfetqdu45hikV1/f/K1uyz5GR03j08ulvfI1vjryV/mx2Rc6/lvkYATJdRlWzlA8UqBVmyF+9merJaWqFGYKajxaQ2zBAdWIGO9/Vch8Adn8WO9/VMn6zJNIpcqR9FyCFIJ+1GZ2pYhqCUGlMI9q+9+VslCpxxTqw23jDC6r4KmzJgms84yUIqhOW0qaHerY0NGWtFdWgjECQsrKNFOj6YrFtzRsaVvVScgOqRP/wR9FAtb
Qb1DTS6G4QZv1+WheMfqSRQ2uBVpNIo59c0/Ht0Drk2EvvozL9KFoHCGFiWHnCYLpxfwkSrBQXJCl7hVlqhTJGem5LrvwwkgMA5QfIlD13fJPluePGbQBUJz7J5vSTmIYJpJCqwtbsE9y2KuSxk3ew8/kjwPKcf5WJScr+NL2vewp7/UFEuoIup9GP91DbuQ0dhOSv3cLsgVFqEzNoDb2XrqVXSvyZastYOlSs3n4ZRsqaJ4mY+RwPXLkRd12eYsVriRzptAvY1B8R+2TFI2sZVHxFLmVy2VAP67pmuGNbOO9ebDNNysySiYkoIqzikgTVCctxnjnD1wNwYuoAFW8WrTUZu4uuVOsuwQuq1IIyWTvSgReTG4TsY2r8S1Rmn4jIVkZJI/3DH8Uw5uSp+QtGJVp4et9C7+B7l5QiCiOfoDz9KAIV6+KKwBtrPLfEOZjgVHBBkrKd7yKVzxGUvcZr0jIQsWYrrdbbarY8DSn5pRu3cOyl49T8QQwpeOZ4EQGE5Rob/V08+OwqQiPD/ceO8MHXbyXdRPDNqIdznSi+jPzAE4jUfsKqxJwywfLouvowAOLkJSAEXVuH6Nq8iqBU5ZqPv5/shoGYdPfgFcvY/VlWb7+Mi3e8BZgvifyTafDkoXGyFY/c5lXzIkfqu4C6hCEEbOjL8cE3XMJ7r9nUEqf88uiTsSzQtNvQijW9WxrW6GIJFsvBcpxnpj3MZWtvZNuaN1D2pvnegZ2EKph3vG0YGGoKpVJRfHEjNnm+NS+lpjR1/xzZ6mpL0ggsvmBUZp+gf80OpEwtqBVH538bCFqeoRCC0C8QWhOJczDBKeGCJGUjZbF6+6UNwoJIprD6c9GGV7bKGnXLsw4VFNBqkpSZphaE+KGCahXt1chYIdlslemKzcTYOHs//RmuvPcjHefRCOcipKd7BO0LwmwUgmVOShCS7qvHmf2WgjCek5Skh3rJDPchDcm2e97KRXe/qWMESbNE4wl4JmsjgdrELNmNgwgpWiJH6ruAXftHKVZq9GdSbL+oVYbpSUcLTLOF6gVVbDPNmt4tjdfraE+wWC6UqqFUDSF7o8w6FFr5CGkBcp42bUiT7nSeNb1bG3JG9AEqct636DNOcvLA37do0v0drPls9/VUZh5bMnJkqQUj9MeYLH51wThtFRRQqhzp1Lp1x6F1gJDZZWvviZMwQTMuSFIGGtZks5W57RffHEdfuHjFMmZPhvw1m9ly120t5zZvfS1DYkmJ50dWd8VPU65FW9weHWB+51FU7YPIVOuPpTmcS6pp7FxIUDFRXkCYUliWiZ3PoZUfacylaNvdaZEwUtY8p167RDMjJbNSYGnQfoj2AkT8XqFc5fiBUTZuHOSemy/l7usvWTJbUQrZsFBPNZOuE9p12tCfwPdG0BrARwgbw+ynf+jDHQmofbHoC3fRIw7SlcoDYp4mnV9zDz2D7yeovYyZ2gqqRGlyZwvZah2itQ+q0rBel4q2mC58ueH0a9bCtQ7oHXgvyByGOYBh5ufSumMIYZDrvX1Jgk0yCBN0wgVLyotZmbWfv4m9H/8HCnvGGPnmMxR2H2TNrZc3QuOat75SGORTBiNKI4Vi34nNBMpEAdfVxjGmJgiKBew1rdZicziXEjmUyGJkPIxMFFnRu24thmFSOjDNrOwlqHnz5InF0C7RdCtFt9JUhUBYBsI2QUPp0BiMz/DiV77Hsf65EMDlhvYZ0mzotGcC7TptZAUWgSjLTmMAGqVVx/ObF4uqV6Rw6JugB1qOqVu9vavuYmrssy2klum6HiH7QVfQWhN4Bwn9AloHSJlmavxL5IfvXVT+yHZdT2XmibbXNYF3mOLxP2K2+DUMcwBQGNY6AMKgEBE/Jtmem8ivubdlzp2s4SSDMEEnXLCkXEezlanDkNFP38fxz/0fZvceRocGoRxiJPdaDu47STUMufJX3g7QsvXdkK9QPqr5/uglfOulG+hSIdfVxvnZ2f0Yvf2Y/fl5tRlawr
mERU1uIx08A0IiDRPDNNFasfra/8Alb/7wgmnUCyW6tEs0toYryh5P5lJkB7oQUlA6OEZ5dJo3lmpkUqeffHK6CILp2LqsW42KwDsK6Oi5yG4QErRieuzz5GPdthMMaZKSAVpNLigxFEf+mHIsVdRJrTR1P9LIoVRI4B0m8Mai+QiBNHqYLX4NgIG1v9pR/sj13Ex3/7spTbVa24F3MHLiCY1AolUZrQMMsxvDvJwwKCBlhmzP7eSH721pP7VQ8f0kgzBBJ5xzUu5EQqpWIygWMPvz82SChc6po/ncsc9+huI3dlJ++Tg60GgR8o1ta3lqeIjJ9AC9393PTz34PB+55TIMabQ4stKPfom1Dz/ALdYP6FMeNgodhnTdeBPuxPc71mZoDucq2bcBYIcuWUMgZK4RrSCEMU+eWCjZpDnFul2ieU9g0bNpXRR9Ua7BeETI75quNMY9neSTU0WdeGYn76c68yhCpjHMPIa5Om69JEDrKLMOAQJC/zihN4JMb15w3MUjLHqolnZ3JDWAbO/tFEf+NKp7IYzIkRsWCYJRvOpeEJBfc29HZ6ZStdZuJTok9AsxuZuxLg5CmGitWbXhv2JIG8MenkekC1nDKphKMggTdMQ5I+XOGXiX0Gu+zOzjjxJOFjD68nTfeDOrP7QDYRiLEpdAM/rp+5h57BHCyQKyp4/agf3I3kGCqg8avnrZ6/nOhm0IFJZSlH3BzmcOI02jEepWd2QN/sIOaoYg9cjj6GIF2dtP9403M/HOqzi2QG2Gdv2zmnkrfT2/wEWDl2Bag4taOp2STdqt3E4Sza1xnPLxA6O8+JXvkelAvKeSfHI6qBMPgBAp0GFc+GcuOgYRZdbVoeP/FsNiEkOm62oqMw8tQGpTdPW+mdLkNxBIAv84gT9GtCRItKoyW/gXhDA7Rpe0X1drH60DEALTzBMlo9SlkXFOHvgwhjU8Tw9eLMKjWtrd5ARtu+/YCZo4AH84cc5IuRMJjX7mU5TME3RtHUKk0qhKmclvRj/uoR33LEpcvca+xrFaQ1gsUjtyBHV0HB0oPGHwzNBGhFb1MjRoDKRorXPRUqXsjVnsG97OKmMVF22+FWFbvOB+saU2g1Ya7YecKO5n25o3nJKzrFPyCyxs5bY7AlOmwcaNgxzrbw0LrKM9+eRsop14DCvfkAzCcKal9VLdGabRWPY6zGVYggtJDL2r7mKk/OyCjjoztRXDHECrWcKg2OaIMxEyxdToZylNPYRWUy2ygg6n6F11V+O6qApSppFmL1ZqEzAnZwhpIo3ujnrwohEeappM9y3zIkW0Dsl238jk6GcSB+APKc4JKXckIRVgzhzCI0ArHVVOA4RhMPPYI+Tfd+fCxPXgs2A9iXf0MEGhgPYDhGWiZssoVUWIPmYyWWbtDJYK0UKgERgpC5m2WupctFcpCwkZCY5jFJ5i0+BrW2ozlA6ONUpvipRkzyM7ufzud2IYK3OWdUp+aby3TCu3U1ggdI7uOJtoJx7T3gxA6BdAe1j2BoJgIs7Yi1KdDaOP3lXLa70khLFgvPRiWYem2UO252ZmJr4aXTeOJY568eUJvKME3gmkOYCUacJgmsLxP2Vy9K+iXU5MhENb/pzQO8Ts5ANxXLJoyBkIMGLLuT7XZj14qQiP/PBHmTJ75y04aHVKDsDEsn514JyQsleYpTo+i5AySvIQClEaBa+MFkZLeBdAOFWkcuDIgsTlj09QGX+GcGY6qgomJYQKHQSAxshIumsVumsVqpZFqFNI2yK3IY+Qgn47yn5brErZiakDXLT62oYzr3RwrKX0pijD6NdewA5SK3aqdUp+aby3Aiu3U1jgcqM7zhTaiUcIgZXagmlvBGEyfNFnmZn4O0pT3ybwRzCtYXK9t6+49VKneOmFrOj66/1DO9AqwKvujax1YWLaeQxrHbXy00RffxO/dgC/dgStKgjfBOVjpW2KJ+5j8uRnMe0BhOxHyAwoDxXOolGY1uqG5VxHsx68mP
yS67kZw8jOW3AAju/7wIocgIs5E3U4lZD0BYazTsoqVBz80uPMvDSCqtToVntJG+NYVoiojCOsNKKtII/R209my4YFicvq70UdqzZ5+UErhTZsUCFGNkdWl7li7CiPr78cJbvpWpcnt3k1odJsv2g1KdOg7E3Pq1JWhxdUCZTHUM8WjozvaSm9qdFkjlpIjFNyqp0pK7euOW/+hVuYHR+na3AQO5NZ9jzOBBYiHoCuvrdgmvUdhESKKHHkTGExK7r+/sDaXwUBs4V/AZEi9I/glZ9GBQWEzOCVdxOGAWgPhEQTEvgnCcMCWocIDAxriKD6QuTsk11YmUsw7WEsex3t9Vjbk2KWWjjqz7C+4ATeCGEwDkiEsFqe6UIOwPnOxFLLgpLIHxcWzjop77vvfk5881ns3izmzNOkg6NoIQhCA1OmMFQF78hhUps3A1FYW/eNN2P1dC1IXIM3bMI7kCMsFiPPPiCkRAhQdjfeth8DafAuI40yuni+L4uxaZCsZTYy3ICOVcrqqNeAcIavx5sqUyy/jMoKjKokc9Sie3e0XT9Vp9qZsHLnde0onVrXjqWwVKumxYinmTCkETm2znQsrpQpMPOooAAdrML8mnsRwmTy5GfxvZNIaSFkFiFThEERhEk9jhqiTMwwmIy0Yh0QeC8T+pMIIdC6hBQQKA+vegA7vTW+ikKFNbp63jRvYVgqXb0uOwijl6nxL+FXXopluKjAkWlvRgjRsUJfJ2di1Dh2HIGBaa9J4p8vMJxVUm7WknMb+jFOTqD8iHjDIETkclhdaYLpSaxKFTM/F30BCxPX1rtu4eALX6Xy4h7CYgEd+AjTwugfJAi70KlukCYG8J6ZKh++2WHg526el+FmSLMlrK2O9hoQr916K9O/+xw1XcOoCkQ4Zx0tJDcsRWSLJb8sF2eia8diWG6rpoWI51x081hOVpwQBn2r76Y89RCmNYiQFn7tMIF3griwNXWLNyrvqePUaY0QBqE/3diVae2jlY+V2kLgHQdh45VfQKkS0shSnnkcccKcZ5V2kl/a5x54E2hdQ5o9KD9KTa8XODLtjR0r9M13JqqmPoVBrOMbSfzzBYSzSsrNDi0RVCCsgRRIaaCJtvF+YCO602z4vd8n41yGb5icKM2lCC9EXN03bScszSI2bkR7PsK20Fphp7cwpTLzrM+F2istpwaEkbIYuumy2GpvkkxiuUFYisAbiawYYa2o51ynFOvloEUP1z5Sl1AihxDWsrp2LAedSP/IxAt4QZXXrr9l3tjtxHM2u3nUF73K+OcoTX1jSadYVKtiCiGjuURasMavltA6BJmCuDB+lOxioJVCmn3ocJI5Z54VxylrDLObVOYyVFBEyHREwrqybKu0eRfEzR79AAAgAElEQVQBFqF/Ak2Iaa3CtFYTBgXAJwym6V39lo46fLumr5XfcGwKYVKvIX0mnnmCc4OzSsrNDi0l0+jAQBJVABNCgIyC1fyyxth4EX/x3QOdG5p2IK66NT3z2COEQRGZ62pY2SpQy7Y+l1sDopPVvmr7Ngbe8xLH9/11w0qbUWs56l+GkOZZsV7rqAVlPL9ET/AIKbUXqcuEIsuMWscJddWSXTvqWMiib3eCNjqCBGUmSiMUSsdZ07t10bFPt5tHJ7RY7940q72/J22adKX70Fo1IjzarcL5c4kckmgI/HFSuasI/WNRhbdwOr7nalwbuQYiA2hMsx+/djhKq1YBXvVlDLOvxeHXySptj4xo30XUY6GFkIRBkXTuaqzUxohk0fQOvLejHjxP0xcGIFFaYdmDrdZ62zNPojXOT5xVUm52aKlQEMghMsHRKOXVNgABWuGn1/MXj77Evx8vLruhqTAMhnbcw6q77p6XDWgY8zPolpzrEjUgOskNU8VPMVv8txYHS23m23QZY5RSb5qba4ci7aeLlJmlTz+GHad2a0x0WCKjn2MQH228Z9EFYSlpor1VU70jSH0b74XVJRebpaIPToUImq13S9
QQepaqZ0BwHJNqo9i8NHsJ/TFkav2iczFTG7CzlxE51jRgIowurNTWiKSDAmEQosMKdnojoAn8UdBgWH2EfjH6G7BSm+eeb2yVCmt1R3mlu//dLbuIyKkXadt1iUTIFEJaCAyQuQVJtH9oB2jF1Njn8L3j6LAMImpoUO840/zMk0JI5zfOuqOvbmGefPB5puzLEYYkI8cxbAWGRdi7BX/bzTwxOn1KDU1lKjWvWNCZwEIWZN1q76SXhipEoUmFeynp7dC0dWwv0n66EIT0cIxqk5Ua6hAw6BFHmdYBCGvBBWEpPbrZCdroCBITshQSKQwEYsnFZjnRB8tFu/VeLwRlquOoYAbMDPVi82EwxfTEl6Poixi9q+4iDKaolnaj1XTLXCKH3ggnD32skWUnU5uxUhtRoUcYjGOlt0TJHhgYdh7D2oAKnqau41qpjdRljrpV2pztqDWocCaqNqeCtlBCYy7xRkYE7dcOEvgTSKObw8+/CSHAMAeQZiuJRhayRJqrSZkDCGFGcdj+BKBIZa9seeZJIaTzG2edlJstzD1/+K+M7dpCaIDyy2gramxp33gpxaqHrTTCNhuJJHD6DU1XiuU6tzrppYY0kMIAXY413r44AzDATkXRHGdsnkGBrCVBd1ELyo3C8IYwMGXQuD7MXxCWis+uk2zdCaq0QmsVkbKGlD3XqmmpxWY50QfLxbxGq8KiJi7C1nuBKFRRINBoTGsg6jyidiCE2WoZduhEIoQRWZFqqk0Dl0gjjZCrGFz7MUYPfwwhu+ZlMMKcdVu3SgFKU7sIvMONSnX1iIryzONke25o6Qlo2pvjhq8pvOoBVDiJaQ3EKd3Hoo2l9rFkuoVE6waClBYQGQJWvKCAwZqtn2yEJp4L52uC08OZi5taAkbK4vLffDfr3nktZsZCeQHSNhm84WK6AkX4zGEKuw9RfOoAsy8dR4dRacf+TJToca5QtyCD0G+xIN2RJ1qOq2uUDSiFrnmkjDQhGZTIUTo4RnH3QSaePkjpK4fZ/xcPoMLOJStXCmnmMcw8Xel+BrrWMtC1lpSZwZAWiixK5BrH1sP76qiTWyfUSRYiJ+j6/KVYRipyHCFJt7Vqah97wfnKFGaHgj0rQd16b0bFug5FJtZSIwddPamjLiHULUOtyjR3Ipka+2zrHNs/0+b3jH6szKUY1nCr/GFvxrRXASmUqoJI0dV/B/1DO1BBAa/yTEzarS2jvMqz9OR/kq7+OxAyG0VdGDn61+xg/WX3Y2UuIZW7BtPeGDkS4wUx8McaERXl6UcakoYKC51mjdZVUKXGKwsfOye5JHhlcU6rxAk0vcY+hP1dpsb24h+RnPxOP1NqK9uu3szTq0xEqYxfCJk5eQhr/TA/cvvrUWPThKcQMrZYdbmOxy/TgoQmjbKwE//IUYJiAe37CMtEdl9HRU1RLk5h1CS5oylyuy2OBa0Fh1Y6v2a0a6SGtEiZOareNDVzW0M6aQ/vg+XFZ9flm21r3sC2NW/guaMPc3L6QLQTiNFp7LOJTiGMSnbjmxuwTJO03dXobAIRkSJzy7YMl9LA66nb89/XmPYgQtBSYwOZQ4Vzss/ctQUqLCGMno67iMAbAV0hqkddJQw9oBqH7mlqpd0Y1iCGNTx33jIdqmfD+ZrgzOKckvLop+9j8ptfp3ykgD8dZerZM/vpMqq868k9ZC5dx7PrtjJjpUiXS1z71GNc882v81jvNR1LWy6E5ZTF7IR52+MmdNqm9w/tYGbXw/iz45AJITThUJ7MD3z6TzxBZtPrW+OaDcHorj1sues2Dnz2wZb5DV5/CRt+6o2kV/Usm6Db9dru7FpIv55JdTVh6C3Y4mmx+Oyhns3sPfGdefLNa9bfgn0ivWT7qLON+SGMWbJdN5HTL8aOsvq9xBKCKq0oLG8xDVypGt3970brgMrME6iwSOCNAzSy+7SuNqSFnvxPIGWWMKy1kLVGY8hcbMH2zAslrBOnCktRLLSeJuoFGCW3aF
2PX7YbRL5ch+rZcL4mOLM4d6U7azVmHnsEhKQ2MRN5h1W0lU+FI6AFP/78Md59/AWm7Qy54iRp20DYKcJV166ogPtyymJ2QicL0gthpgZ9mVTLNj2s+VRHJvC+ohHhNZD2oWohQgPl+8hjL2Hmr4M2gveKZdw/3snYo3sRhkSmTCafP8rorhfZ98l/p++qTctefBbSa7ctkbgCC8dna60XdQCe6fZRK0WnEEYpfqZJMy7Oc+CtxDLs9EznadJxh5Pu/ncyevi30G3lN+tWeM/g+0nlrsSr7Gl0JhHCwjTz2JlLF7RK68RZPHEfgT9GVOS0TuoarasImaHZAF9qMWn+fpxJ52uCM49z9qsKigXCyQJaG1GVNSmpd4QwVDWqratt7CBgUM3i+z4ibYJfRfhldKpnWQXcV1oWsxmtzi3J116EZ08IZmow1J1l3+Q+PvzGiznwqQcY3fUi3okTZA8+S2pVP7lNg3Mp37aJFF5j3s2wetIUnjrYmF/p4BjVsSmEEHgzFYLZ6oq7h7RbWstp8dSJ3AAebitVCvPlmzPZPupU0T6PhZyJQhgrsgybCaxeWrRw4s/mRSuUpu5Hq1KUkNKhH6AOywS1l8l03YAKZxsxx0JaaK3J9tyyqFXau+oupkY/ixASTVMtaiHQ2sc0N2BYAw1Lf7mLST1q40w5XxOceZwzUjb78xh9ecJSCWmZkSNPCKRl4HvpqPy4UCBFVMrTMCKSM9Noa85CXazWRFjzmd5zjOr4LGYH5+By6lTULcj7HtvPk0cUppT0ZLIYRh879xzj5MN7uL1u5Xb1oGSa6skpAHKbVwFRFbnU2iEqMo0AtKEJ0xpZ0gxeczEnH9yDkbbQYZTk0kjh9UOUHyBT9oILyFLp2ytFM7ktVaDpTIb0nSrqOrzuyTAVqpbU+YU6by/HMjyVtk3Nheqb+wGG4QwQcnTvz2JnrkQaufgaUwjZtSyrVIdTGNYAhjVErfQU9ZYAGg1aY9prkUZ+nrXd/Aw6LSbNURun2qk8wdnFOSNlmUrRfePNTH7z66QGuiIiEwIjY1HLrCMoVclxHGFIUoNdeMVZdBAS9m5pkQA61Zpo1pCrEzPMvDSC1ZMlt3lVi5NlOWUxpZBsXf1GTpQ8Bru9KB63Hp+r4dEj49xsRj3zkCaqdwvGxIvUJmbIbhxESIEOQ4bf/1Nk1cW8NPl9SgMV6JZke3rhigHsZ7OEJR/lh+ggjDMbQVgG0orutX0BWSxUzw91x+7VK3UkLscB+Eqh/hmP7HqRLyN4vj+Ll+9i7bZhbrloKMr8lJ3lHiEMevs/TFr9JEaPh5UdOjNtm5oK1df7ASpVjuKcRQoVTBLU9mDaG8n23k5X75sxU1ubKuctjGaHnGGtanTMjtppSSDaASxk4c4PfVMNSz0JfTu/cU5FwXpqtHx0FzrYjzcb4qfXk7r8Nta+8RJy3h5qz34PNTuNSKWYLfURrn9j4/yFSls2a8hmJoXdm6USW69dW1Yvem4nFMo1ihWftNXkOFKaYKbKdKCZkZKBOLQtiOcnC/sJS2Xs1asa6d7Fk09ij6/BDhTCiuKvR2b3Yb27G/358ai2tBnVWEBDKt8FMbG0LyCdkj0Ojb/I3z01yUsTXS2p6c0Sy0ocncst0PRKoP4Zf7U/x3dzKaTSMD7DuBTs9KIY7U6Zn8tx+p5O26b88EeZNHJMHPufKF0GHYca6gCtK/i1cbTWTI78KaXJb2CYAx2z5+qyCbEDsJ4gMlv8eiOFOwwKKOVh2Wvpzr9jUWt7Lo4+FVeNm9O0pexpyXZMcH7hrPzKVK2Gd2IEAVhrhhvpz+2p0SLbTVDymyy5dzUaocqePl7+7MPzKsRtues2Zg+OAoLMcJQc0a4h5zZFMoI3VSas1EgNdK2oLGY+G8VGl/0ANJQOjVGbmEX5AempMnL/SfSmwVgTlwQbbsC8+Ea2/sF7sdcMIVNzBfRlLMMo30NYFkJK9FUmw4UrGX/YxerN4k2VSA
90N+bdvoAsFKq305V89+gEg92pltT0ZollJY5OWF6BpnONup/ATykODlSwPUmgomdTm5glu3FwwczP5Th9T6Vtk1I+2a4rIit84L0UT9wXdbgWHpH+q6OmsXgEfogQ9f6ArRLCnGyyi1rpmSiW2shiZ64k23MTXX1vpTzzGKY9jJ12SOeubkl6WQh1S9urvNBkZUvQISqcZLrwZQaGf3XRMRK8MjijpKzDkJOf/HPGv/A5/JHjAFjDaxn8+TsZ+vBHIp2YudRoVauhyzMI5r5gzWnTzbUmzN4sL//lt7h/++9QOV4EILuun/U//jpqhRJG2p6biBDkNq8mVa5y+cd+jIE3Xoy9gozAlGmwfetqdu45RvXweNRxBNBS8trZGuHJSUpat1rht11NetPGxhhReF2Z4MgxgmIBfB8sC7M/j7lhPRvvvolLPvgj1MamOfTlJ5l4fO+CdZU7hep5YeSEBIXSIUYcDiZpk1jqj2SZXa6XW6DpXKI2MYVx8bfouegoP9fjUanl2HvyYh50b0H7Gu0FFEM1L/NzuU7flbRtCoMCoT+BEFCZfojj5WdJ516HVmWkNAlDyVxLWAH4gGCuulxrjPTk6GeYLX6dwDtCGEa9BJVfwxcvUFKzdPXfwdqL/3rFDjkpU2S6rqc8vWuuN6KK/DimPRiF9A3tSCSM8xBn9Nc2+un7GP3LTxFMjDd0WP/YMUY/8ymEkAztiPLqdRi2dKJu72LdjHqtib1/9g32fvIBauNztW3LRwsc/LvHMLM2mbVNDg+tKR0aw5sq8/zv/zPpwe5lh5nVsePGbagg5Cs/eBlfQJfSXFn2eJchqQ71zlnhfWkGr93I1rtuaTk/ZWbRh44TTIxGDkspIQwJxkYxlCD1mojssusHuOxX30644y0L6r+dtN6ZGsx4gpQhWhM6vGCexFLHSgryny9RFgAV9ffkLt9HiEKENrbpccW65wB48KU3IWyTftual/m53F6IK2nbVDj+h5RnHqFeElOrMqXJr6N19NkIYaF1vVtOnZxVSy8/II5xHollE9GogRyNIQj9Aqa9sUHey2ky246egZ+kePKTBDNjqMBDBQK8HMrMYWwsJGU8z1OcMVJWtRrTux4inCq2ZjAJgZqaZHrXQ6y6625kKtVIIhGG0bGLdTvCms/JB5/HL5baxgZ/uoyRNhvNUyGSGyonp0iv7sXMpla0fa/DkJK7nbVc/PIY5WyKbqUiyzO2wtOVKluvVwQHvot6/H4O7vnnloVF+CHZ545SXWu3NAzSQpB97ijiLSGk5hc66jyX+Vpvdwq6bYWQ3S3PRNgmPaakW81P5z4bXa5PJytxWeOHZaYmPoe9fozQr7FZmRT9LiZm82wbeonvTt6BgkaLr2aspBficmN3q+Vnaa5RDCBkGilspNkT6b7BTBy7HJOy8uN/z8UbS6MfAVHKsxYtzV2BRoH6pWogL1Z+07BWEYyspjqhwdCgJGhJoKdBpZCXJtl75yPOGCkHxQLB+GhEjm1ecB34BBNjjRKbM489Ms8irnexrhN3M7zCLJXRmZZIhcbYfojVk2HV9kuZfOYQtUIJb6pMenVvI0QtGn952/dm2PkuuvtzZMoeqCCKO7ayIE0y009Te66CNM2OC0tQLLD68cOo69Ywsy5LkDYwqyHdJyr0vTBOrTBKZnjDsp/vxfnr8KbKFNQInvLIWGlu3trPD461xdkCN20YwDo8AWexy/WpZk2uFMWRP8b3jmBkLahoMl6ATE0hBUz6A6y/JMu1m9c1Wnw1YyW9EJfVtmkB7VkIA2HkMMw1WKlNeNX9sWNNI4UZZeD5Y0Q1nDfPWeD2cCybzEZEr8NofrpO3lELrU5JJsspv6l9yczuQayNoxA2/d6kZmb3IPrNEhL14rzDGSNlsz+PObgaYe2Dtm2zMC3MgVWY/flGEolIzXeqhFNFgmJhXilOO99FZnUPs/tONLIAG2NbBqlVvVz26+8EYHrPMb7/n/52XpyyDh
WVE1PMHhzD7snMs+w6WXxGymLVTZcw+plPYcwcQvgVtJUh7FpPOjeNNPta59K0sJj9eYzuPoa/e5I1z9n4WYvxi7qYWZuluPUiJgoPskZftGQ/vXbyswezrLp1K84vvI1bHJP7HtvLrv2jFCs1+jMptl+0mg9/4FYO9D7AiQdfwBufxh7sYc1tlzd06uXEOi91zKlmTa4EStWolnYjpQ06xMjYGBkbSym6hGJjZgvv2PY20vbC/oKV9kJcLHZ3Me3ZzlxBtvsGKjOPEgaTCOqdszcR+ocI/QKBP4addsj13dogz7psYph5fO8k6CpaeSBMvPIzZHpuakkfr6NzCN/XUMEU+bW/HlWaK8wy/dDVdN2gsDceQKSq6Foa7/AWSk9cjXfXyntLJjj7OGOkLFMperbfSuXFPS2aMloje/vo2X4rMpVqJJGoyvwvttHbj9k/3yowUhZDt13O5J5jLZoyGuy+HGtuu7xBpD2XrSM92N3YsmqtKR0cwyvM4s9WeOBH/1+sbIa+KzcwdMtlbP3Qm3n50wuHj/WaL1MyT+ARoJFIAlLiGGa1CvTNm2s4VcQfH2PyX75K7cB+vCNHkLbNxDuvZmZTF0Jr7N5+QsIFi8TXgrAR4nbok//eQn7hrM/YP+/BDlJsu+et3HPzpdx9/SWN481QUTs5FfVABJSaS9BVWuEeX7ws6XJKl55O1uRKUG/hZJj5RgQBRE1ytfbp63vdooQMp9cLsV0aWEx77urbTn7NPfjVd3N8/wdaynvK1BZMeyNalRja9D+w0psb587JJg/Hcc5RLLE0ejDMPCqcpXjyvtbWVvNC+HQj7K1W2k2ltJtc76309H+QVH83lR/cQOXp1yPTZVQ1C6GJ3WedcSkrwZnBGXX0rf7QDrRSjH/hc3gjx6OQuOF1DL7/zrkY5aYkkmYJo97Ful26qOPiHW9BK8X+v36IyvFJALLr+9hy560tVk/7lrWexhxW/KjWrhdSq80w9cIxgtkaJx/eQzBb62jxXXz3bcw+/ihdW4eiushegLBN0IrK889FjNcm1Ri9/RT/6ctMfet+rOG1aN/Dm55kel0GqlWs9RtIbYriTtvTl0OlIss3bonVn7JY9/2XeYchaRZ72skvZRoM5dINi7r49GG86bkwu7AWcGznbo4PHMV/rViwrgUsrxlrJweaDhXKDwkrtYUzLleYjVi3TK34O9Eca2vZa8kPf3TJMRqfywp6IS4mDSylPRv2MIY1PM+aFsJAWmsw2qzwumzSM/h+Rvb9x8hhqMOWanfzWku1ySh+7VBj0dJaocKpRtjd3G+h6f7OsJSV4MzijJKyMAzWfOSXWf3BD3eMU66jpb/eVBGjt7+li3UnSEPi/PIdXPzhH6EyUqQep9zpi9Xc7aQ2PoM0JFoKZLoekiTwCrNk1+cZf+Il+q/Y2HJ+nfQ2vvOyhtQipEA0SMhA5nKoag2Znavvq8OQrjdcz+x3nmgsOKlNWyAtYM0AUpikNmyEJtdfc/ryfY/tZeeeY42WWKVyjccMA9WT4cenW+sft0dS1OUEiJyfKN0I5cttXo3OwJh3iD69mVY/7NzCACyrdGmzA62+E6lNz6LSCiMwOfilx3HufdtccsYyGwfM+8ybLNN60XatfBCSrv53LBmre6pYqjPHYtrzKVdhUyV0XK6TNrmi3dnXKqOotsiNqFlqPexuy4fuRK36F6qVx8GYhbCLdOYGLvqJXzuDTyzBmcRZCUCVqRTpTZsXfH+x/npLwUhZdG1evfj14y3r2ndcy+Mf+HOkZVB85nDLMToICcoeYanWqDfRDK9YJtT2glJL5jVX0P3GG5j9zhMtC0vfu97N1Dd3tmjmlg+mB8oIogpyTfdaT1+uBSG79o+2tMQStolpGTwjbN4+U2mJO26OHmiWE8KqP+cQFVAtzhL+eBeVLQHVNT5q9hjpVGuh+ubC9l5QQUoDpcNGy6fmY7J2T8tuZPbIODPXVlGXp9FdAsM32Tv+Xe
R9EueetwHLs74XQrNlGvoFZJghm7/9jFY0a5Yp6tdaqv7yYtrzqVRhW1FN5CbiJ47SEMi4rnO+MXcVFpkc/V9krzhITq+LvueWiRYHmRr/VNL66TzFK5oVcLb669WRGe4js6aPYLaKtAx0oBqNJIVpYGZtjFyqUW+iGXZ/lvTwwIJSS+9N2xnacQ+qtqNlYVG12jwil6Gm+1iZyYt6kVbTlr8pfXl0tkyhUiNtzV1HSIE9kGNmYoZpSzDoxUVp2rafzXJCS+o2UHuTibelhhASGcZE7c0CNIi5vjAoragFZSreTIOUU1aWrlT/vNoXF+94C8oP2f3cNwmusJAIDGli9NlU+nxeOvZ9Lq69GSyx7MYBnSCEQf+qHXhfDag89W3CkyVq3U8Q3Gh2jGtfCTrJFOnsFYTBBFJm5h2/VHha85ybrel66nTUDmqBXpMrtLAbxD/1EJHMITHtPKa9uWnMHiqz348kEWEh4++LIGn9dD7jnLWDWgnCmk9lpEhYm18YZyXn1i06rSIy86fLBFNlvKkSOo45Hrz+kujfTWgmvdUf2kHfj96BzGTRXg2ZydL3o3e0aOR2k0RT18x1GDbGU4Ygv2+KNeYwhpkiVCGGtFifv7SRvlxP7Z6bBMxWi9QGfNKbFP4ds0xeMYuRM1n39qtbdPS6nACR9GLnu6JOxiaEr7Eii1cLbCMTVRkTUPPLaHTLwrDv5PcIlY/SUVqwRlH1ZpmpFubVvpCGZP17Xw+vz2F3Z7B6sxiZaLchEJQGy1QmJpfdemoxjH76Pqa+cT96QiPNbCP8cPTT9y157mKY1yZKlSnPRBZ5J6y0M4cQJtOFf+TEy7/I8f0f4Pi+D1A48WdoHXY8vn9oR0t7KCGzjdZSdShVI/BG0Dogv+Ye1l7yefrX3IudvRIrtaXhBFcqQOsqtdnHqZWeplbajV87EIfbJa2fzme8svmzbTid2NeFzt36oTdz8uE9UTlQ00AHIdKUaBRWd5obPn8vL3/6AY7u2sPETJWB7jTrm0KmTkVqqRP29OOPcGyzZHZLP3pogO7NG1jdvZFNq64gY3W1tWiaS+02pGC2VqTqz6K04LqtKQYv3YS62ie/6jK2bdzecr1252Y9PrsSziJ7zajyXr6L7NpBSrUi1bjRqkCwLu/gDF/fqK/RlcojEFSDctwsVSKFycVDr5t3n7rbQHQZ4Ot579FloLuN0648V2+OULeIlSEIUgZmTSwY174cLFyEyEKIqLaFlM27mrlmqIE3sqyU55V2jV4sVlrrkMLIJ6hMf5tQlVsKG+WHP4oAqqXdqLhLtyFV1OFbGLEAVe9WAlZqS9L66TzGeUXKpxP7utC5YRAiBOSv2YLyQ6QRdTyRlhn9+MKQ+6/axMPZFIXpCvmeDLdcsoaLWnNUViS11Il84u1XUBt7gbRtg5QEKuD45EtIaXTUUusJEA/vP8GJWpmcBVesUbzj0kjKMNI2o6XDXKqCeVv+9njcvtesY+sNF3P89WNoi0aH8K50npzuAyG5xfkZbDPSvsv+bKO+Rv0YpRVSSJTW+GEVy2jV3bO5HrI9vZTHJ5t9l6Ah29tLNtdz2pXn6nHtpNOMvLavNRHn8DTrC+Okh9ct63NpxmJFiAxrgEz3zdTKzzY04Wz3jaAVx/d9YMFkjZbxT6NrdLterXXIsZfeR3n6USBACCsOl5uiPP0wIKN7kb1kum+hb+iXOHngIwhptYQT1tO3DWs93X1J66fzFecNKZ9O7GtY8zn56AuEPWBUdaMnnjAko996Hr9Uw8ymMOKx6/zhFcv82bee59+PFzGkINOVoqIUO/ccAzqXglz2/aiAk6UjGOn27K+FtVRDSu65+VLed+0Q//bsPnozJnbb732hYvMLxeOK44/FTrXWVWZ9/7YGIcP8+hpCSIyYRG3D6mjRGtLkomuuZ+/Tj+IVStGiZxnY+RwXXXV94/7asxFXUnmuHtd+9GKbyc1dUZe6UKMsyeS2AV7293M5Ky
flxR1reQbW/jpAw2KtFw5artW7aOW5ZWrTdRROfILK9CMINMSV3gJ/lDCYAK1J5a6JrqNrVGYei68RXbu57Ge9Pkc2af10XuO8IeXlFo9ph9KK515+iOO3TKByYFQl6aMm3bvTCC0IKh5WLjVPNwagP8sTo9MtEQ8AhhQLloJcLlbahLUZPeluhntPbcvfHo+73FKcp2rRXrruBoQUnCjup1opkc7kWNMfZSoulo1oWXbH8dohUymyN93EtPEcKD2XZq81Vn+ek6UjOB12DkuOu0zHmrSHCYJpSpP3M78r9cJW75nqGq1UjdLUt9E6bOR55G4AACAASURBVPlcBBAGk0iZi2O3jcacoizIPrSuUk/trocTSqOHgbW/vqDDMcErj/OGlFdSPKYZ7sgTnPQPIXIGMlRoS1PeEo3R81Qm7hS9jZH7n51X/8C+YRvFmt8S8fD/s/fmYW6d15nn77sbtlpRG3cW10vKWq2NWkjJsmVLsh0ncZw4TpxYtmTLW8edZJ7O08/MdHqmZyY9me5O3F4UL5GdeIkTO+50LEuybEsiqX2lFlKX+14rgSrsF3f55o8LoAAUgEIVi5Ro1atHD8kCcO/FLeB85zvnPe9bHhJJKO4sKcj54GxqqQsJkM0GM+YjxbkQLeVWx99/70MtpxHbge/5nPFXktf2o6QygI8aDhFatZzQ2rVnZVM1F3WtzM7ITD1MIf04QgmjavFS9lmq1DbJehfLNdp3E0g/Wxq1npEYCGyhPECdJZAkSxrQufQTQcO2NCWI0Ih23zQzhNJCzGgJbxzeNEG5XfGYao2KMt1KUVVC8Y5gWEIE3f/CKpeOFz0Gt18aNAp1tUb/oO+6zSz7wFX0PrKXnOvh2y75kSTFZLANj2gKZ7ydDH363ZUm43wU0c62lmou34Z0XUbPHMBRfQw92jBAtjuYsVAz1XYz0PrjL9Yo9sF7H2biIQvx/h78ri7wPXyhoMoYIcRZ2VTVN9ak0oXje/hSooqZRl3w3FClbACgh4aB1lnvYrhGK1ocVetH1eO4xYlKth7wxxVUvW9W1quovfQs+zyF3B7y6aeDoK7EiHReS8/g3W2JGS3hjcObJihDa/GYRuyKnnesw74+j6ZoFcaBncjgOx5EBYPvvYiNd91aU28tTKQ48cOnmXzyAKd+8iJ9YZ0jfR0I18d3PVRDQ0QMLkkXmHhgDweFYOM9ty6IFbJQFw/peUx8/V6MJ3azPJ1EDvbT8/ZtLL/rw7P4vq0GMzb3Xb0gWc3F0FJeaDmqGuXArqASPqmRW1dElBYJO5EhsjrOyt7NZy/CL3QOTB6pXdg6V9KZ3VUJUtVB0XMTQTlAypZZbzvKc3OhnHF7bjCd6TmJCt/ZiGxB1ZfVPL+ciacm/w7fyxKKXlwZTfe9LFPjXweYV318CecXb6qg3Eo8Zv+XH5rFrhi/fy/OCh1tfR8QuElH1/QjHRc9FGbr1jtQqrQp1JDOqX95rlLKKIxOcfPkNMWNy9nb34EdDhHK2lwymeb9mlrJ6nzHq7xmPqyQhWae1XrTqhaCRJr0Qw+iSlGjN93MJkoIhYMvPMnk956hOJk7Z7KarbDQclQ1qgN750tBw6ywysUL+yg5lyF9bUDnO0tN50YL2+kzL7DCO0pHZAigMpThOYmSzZNKR++tbWW9Z+saXTPVqJ9BKFFi3e+gd+jTTI1/fVYm3j1wJyOHPlFTZ4YgYKeTD6IIY0GskCWcH7ypgnIZ9c2qZlthBRXxcgF/rRd44VGifhkqy3o3zAqA1ceRnk8xkUFFcJt1mh0vFCgOdNPhuBhCoF4+DIqCncgy9sir896GVyu9hbT2M896Xu7M+WbrTTdrJmaPTpBLTBORnWjnSFazGo3q2fPRMm6G6sAupKDrxQidL0u8sCQkDC766HYOfqV2B9O/YzOr7ryOSKijrQWw2cImlU4KRYWOkjC9EAK9pPaG0Fi+8e/acqVeDLTKuBv93C2O1DA/pJS4xaN4Tg
Jf5hFCR9MH0YzhmublfFkhSzg3eFMG5Xq02gpHdkH/b64lwdicJYLq4/iON6MRoQh0H2L5IigBj7msh6FFjAqlbtbxGmzD65Xeyg7T91y/GVWZO0ttpTftZKZIT5ygY8UwqqI1bCZKX2InMqi2glqo0tFYZFlNz3YonElx1N3LRO54w3r2fLWM69EosAtPoKYkQ3dcxJH7Hq08pkQ0EuY0p7Un2PfIy/SuX9WW6FFTlozQySnrcL0EWh0/u6Pn1nMSkOdqvDXLuOt/Xs/8cItHK6WX8vh49SBJ5XVLAyVvClwQQbnVVjjUE+Pi9TeBLuYsEVQfp1ojQgiBGtIrZj1CVwPhFs9n6N2XcObJA21vw+uV3soO0zDDe2613W6kNy0FjFzcQ2ZNJ8cSvySciVUCTn0zUTounusSOxmq8LXLmI9HXzNU1/Yn1yewt0rC8U5iwwOzhIbORsu4jGaBfd2dN/P0x79aCdbpywtBzRmBcyaHs7rYluhRK5ZMMXwbXT1T5DNPLLhRNxd838ZzJkglfhSYmS5C461GsIig5BKMzUs0LSj1uc54xQdQCHXerJCFQP2Tv1eBDYt82EPef/lo47n1CxQXRFBudys8V4mg/jhGvIPCxDQCQXR1HBDYZ9IYXRG0jlAlqzuoqW1twxspvcEM7/njV2/gxDd+2bJh2EhveuTiHqbWxtAH+tFUoyb4zWomhsJ0no4Se2l28CsvIuVyg+bpeFOFpsGy0eJRkQg1BMV1ErxqidCBhsMx89EyrkezwJ4fSVZ2PVKVFFa5FUU73/GQjosS0ucUPWrJkundTN+K6/H9Ty46dayaAWFn9+B7KVQ9EBRaaOOtOtMuLxyZqYdLOhphtAqdL4DrTCD9LIq+bNEXmybYAFiLfEwT2N/yCab5MWCLZVl/tqATmOa3gH+wLOvBhbx+vrgggjLMZEzNLI7me5zxXfuILO9BMVSEACPeRSgepX/bZlb/1rWEB7oqgajdbXgiZ89Seisjmbd5/t6HcX42d8OwWm/ayUyRXtOJPtBfEceH2snA+mbioUd/wSn3JVBnFgfp+fTt2MyrY7s4kznJ9NERvLE82gFJ35Fehm7cWlkcmumIrLvz5kpN3g37QcOtpDxnJzJE1/QjFHFW3OFmqA/s1bseLyxnrgWCXVBJ+a/6WpqVB+ZiyZxto64RZnQxBL6fpl6bYj6Nt1kUN6WHcOwyepd/oSSe//tIP49QwpT51XpoGCNsMrT2/w28Apeae28aXDBBuYx6i6O5UJ/tNcq8gJbb63a34WWlt5zjznqsJ6RTfPS1WeyHRrXeahGk9MQJjiV+OauuCXUax1U0tvpFRI9H8D7Qxf7NB8idmsLPe8i8iyYU3M2Q1FO4VYtDMx2R4nSukp2qBYFaUJB6SXWslJ2KkN42d3i+rIma5mnVrqdyLYZEKpJQX6yi9WFoYQw1RGL0y015uWfDz14IqnUxpG9X9JDL2hTlkkK7jbeZAK/gFkfx3L1kU4+SOvPPGJENOPZpnOIJFMWoDL9I6RPruanGmupXHNeZpvkLoAv4c6AI/CegAJwBPm5Z1pRpmv8FuLH0mu9ZlvXX5QOYpnkt8EXgQ8BVwL8DHOA08GHLsmZbyC8AF0xQrg4URne0YnEEjRkFcynO1Wde7Wyv59qG1yu9leH5km3LupCJHMyDt6uEQnSsGCacic1rMrB+ETnsvMqp1H4K2QxCKHiujewIVjY9pVJY5dL5cpjxXfsY/uj2pkMfiRePYnRH8GwX4YkZ7jCikp22MxxT/bspTKbRYyGG3vE2Nn/+toaUvWbN009+8p34jsfoo68gPQ97yEcJadhRB+wkMaObZd3rSE20p1uxGPzsdlCtiyEUnVona7cyNt1O4606wDv20Yr4kCJUHPswRfsEujGIbgwF5q3FUYTQ6R68862mf5EF3gsMAE+XfnajZVmnTNP8I+B/NU3zUWAdsI0gNu42TfOXpedeD7wTeL9lWeOl4P2Xlm
X90DTNPyAI9lOLcaFvSj3lesw1HdZId7kcxN1csSbbO3jvw+fkGm3XYySV485rNnDH1pVENQ3b8YhqGndsXclnb3lbRfO4Hq14u+Wap5S1i3A7wU8N6RhDnYxnjyOR+NILtKV9iUDgRySSYOvvhSXFZI7s4XHsRGONYzeVp/eKYWTJrbzzpTDRIwaiCHpfFE0zajSim+HgvQ9z8v4XmXrtJKl9p5h86gB7//JfefwjX8T3Zicb5eZptuigq5Js0eGne0/xf/7lvzL51AGmtmbxwxBSwmjREL70KBSzaGqITUNXtFRr83275bWeC5TZEaV/oWrxYGyaGTsnKT2ibTTeygG+3hYKJNIvoAjw3SSasYZQ7HJCscvRQuvpGbzrrTa9t9uyLGlZ1jiQA3KWZZ0qPbYTeBuwFdhVep4DPAVcVHrOuwmcksvB5o+BW0zTfIwgYC9KlgwXSKY83+mw8+W2DM2zuL/93euYLjglnnLw4V8ob3ehk4EwQ/lSFBVFqEjFD2iAMtjqowYiTmpBoPdGia0fbDn0YX7hDozuaKU0Ere66B8yWfWO9rjB5d9N7sSZoMkqRIWGOL77dawvPcjWP7qj6vo9dh4aI18MBPN930NRVJj22XWiwA22TXE9KChwyiXsRIisic84bzuTi6bWtlio18UoN99c5wyq1o2idrat5FYO8L6brJRBgEqQB6WSfStKuFQySb0V+chXA5imuQwIA4ppmsstyxoBbiJoFu4D7gT+m2maOkGw/TZwO0HJYxXwFeB3gU8Cf17Kmv8G+I3Sc88aF0RQnu902GKM+LaLdihwZSyUt3s2Nc9qyldIj1IoZlB1NShBSIH0JOGTGhQlg+/aitEVbbl46NHQWdHciokMhckMxURmluoars/4L19j8z23Vo6ZyNmcmhrHlzmCIQ4F6fsUZY58j8JUTuBXNRyLiSyxNQMIETQcXcILUms712I99boYRuQiuge20dX3QVR9oO1zzgT4++vKIACh0j2uFS2qf9/zdRm/QBEplSI6CAKqAP7ZNE0fSAIfsyxr0jTNm03TfBIwgH+0LOsF0zQBsCzrG6Zpfsg0zY8AzwA/MU0zDWSAnyzWhV4Qv4H5ToctxohvO5iLAlcv/Xm2vN2F1DyrKV9lTz5b5PDJoyYk2n4N/1gXA7dvqSwO7SweC6W5GfGOQEq1PLhTBaGruPlizaLZHVYx1AIFt4pJIiX4EI56dNs+qRYNx7DROy+1tvMl1rMYuhhllAO854zj2Cfw8z5uysD3wmgdGRTRj4gplfdXft8LdRk/CxwioLAt9jFbwrKsbwHfavDQzxs8908b/OxjVX+vbmD9azsXOF9cEEEZ5pdlLsaIbzuYiwLXTPqzUUCrHcte3FpfdfkjonfRHRmge2g5O+UAjw+PM73aJx4Ls/3J/cHk4SIMfTTLvtSQztA73saZ5w5XzF0BkBAqLaa1i6bN1kGH509plRguhEAKMDuzqIo/Z8NxPmpt87VwOhsEv3OPeHQQTVn477wc4LsH7uTAjz9LPvU8IuQg7RDupEDggzNCx7qNNe/7bFzGF4LSkEdLTvESLqCgPN8s82xHfMtoRdtqRYHrjdQZoTY7/lmOZbeDRuWPe584yMOHTqGqCmFVaVh2aZUNNwu67WRfmz9/GxNP7md89+vg+gg9kF6NruqbtWiGtCi/ebGBKlxeGRVkioKwa3NpOMXNzgipp22M1zuI/HoH9ioXfVXQcKyuubeblZ6NhVPDe9Tks7MYv/OG5RVXZ/KHl+Ham1HCOfxCFDwNVJdcr8OGL/8xeqSjdA3NxazacRlfwrnDBXfX2902n22poB0T11YUuO0bBtvKeFvVpO+5fuOCan3NgkG5/DHfskvNfZkj6LaTfSmqwrbvfIa9X7uf5C8O4GWc0v2dvWiqisaKnvW8f+vr3GYqjB2cRE9Oo6k+6nM6wrOxR6eI3q9ifuKmlg3HuYZAFsvCaa7Pznz6EPVoVV6p0XbJVp
W5PA37lMSdctAD6YuzcsZZwrnFBReUy2h38GChtc9WJq7Vgb5sdrrr0DjJvE1vJMQNa/v4w3WDeLbT8tqaBkchuP+1vWzqfQ7pF9qu9bXrBr7QsgvMod+87Jo5sy9RCtxjqSPYO/IYN6+gTw6wIXY50RXxhjzlSvkleYhofhrhQ/i4QeeJLsTl/fiOi9YVZdMn3jX/MkvV50jRF8fCqdVnZ+2n3rXgBRFal1e6459su5dyti7jSzh3uOCCcruB52zQjFKHIjj4t48w+thenKl85dyfvudW7tq2icl0geR3dzF174s83ca1NQuOGTtJupAhmZP0Rduv9bXrBr7QsstcW95V8S1zZl/HJl+tCeqpoyOMJ/ZzdN8T9B+ON7xf5fLLajbxxI+OoUtjRmxJESghAzeVnxerptnnqO83biA7/eCCLZzmomNGf/u6BS+I7ZRX2u2lnK0zzhLOHS64O99u4DkbNKPUZY9OUBibItzfiRo2Zp07+73dTDywp+1raxQcpZTYTo4OAzqrYsBctb6Wjt513OyFll3m2vKCaJl9aYpRE9SzRycqFl7FdeDssxver0o22xEjanQuCqum2edIcgkDH7y9pikY1q+BkdsoRnMYc3g2zkXH7CgUiUdDZG0HWXQRhlYZCZ+rD9FOeWU+vZRZ/HdVZVlXP5uGrmj5HheKb+3+s3OiEvexG/9iSSXujcL5GgppRKkri+Irhoaiz9y28rlbjSc3u7ZGwdGXHq7vceVKMOpiY7Na31yO3o242Y3KLts3DFZ+3ghzbXmjRmfL7Mv1i5WgXtZ9Lg+glacKNW/mfglNnZXNSinxXQ+lauGYL6um1edoYtd+Nt4VNAWL+XGevfsHTD5+EC/7/6DGQvRv28R1f/85NGNuedhZj/VGifV1svnkGX5+MgGOh6KrhPo6CK/uZ7vZug/RjkO2orTfSynvQDYNXcnkyJdws8/iZ5KMHvr+ufLse0NU4mqebJq3AWssy/raIl/HouGCCsrnayikEaXOdzx8xyWyrAfqOuTV48nzvbb64NgTMdi0VuX2zbMX/2a1vrkcvRtlkaqi8Nkbt3DXtk1tU/Ha2fI2mz7cGL8SezKFoYTxcJGOi+94lSyxPFVYfb9O/PMzNdmsky7gFYoYvTEUTWmYCbYz9NHu5+jZj/+IsUf3IxSB0FX8osvo7r3s+tRfs+Obf9RwxzAXHfPIfY+y4/H9pHpjvBw1yPg++ugUV0YM7rn+HS3v/3wcsufTS0lNfINi+pG3hGff+ZLfPBtcUEH5fA2FQANKXXeY2Jr+hh90o43x5ObaFrOD4+Hxp0s15LlrfeU671yO3s2ypZCmNq1hNsKcMpd19DtdhDnytUd4ete92Iks+Zsk7uU6sdX9KLqK9HwkkshJvVJyMXqjqLHQjHWXlGSPTlBMZJCuhxLWMT9/G8Mfvp5QSWJVSq+lElzN76ONz1ExlWPyqQNBQNY8lC6b/LVdeKbByc7TPPba91jet7Fh83UuYX5NVfj1VJ470nnSikKn7xNN58HxINS6L7IYDtnVWGwa4JsNpmlGgPuAtQRTej8EuoF7ge8DJwgy+Gcsy/q0aZrdwDeBvtIh/o1lWa+YpnkQeALYDPyidIxrAMuyrI+apnkx8F8BFegHPm1Z1hMLueYLKiifr6EQaEypO/SNX5bOXatTPLh97vHkua6tOjjOR+uius7bytF7sdDuyHeZfldveBt7RpIq5CnIJHpfFOdkhsgpo2KMWr5fXtauZLPZoxM1Ohl+wWHkoZfQIkal9pwcu5dU4qe4vkBXQy2zvXY+R6l9p/DyeeLv20906xhirYMbDZNOr2Ri9BLsTJaT4nV832Pd4GU196EdYX4AQ0JfSYCp3Z3eYk4CwuLRAN/EuAc4alnWh03T3ESgFNddemwzgdBQDjhc0sX4t8AvLMv6aun59xFIeQ4DtwAjQAK4Fvh86XU9BIJGf1IK4B8h0ND41Q/KsHhDIe2iehs417kX69
rmo3VRX+etdvTW9BDDfdcFI80LZKY0mzRsZ+S7Ue1WSEH3nijqIYXLvvoJTn/3WSYti2Ihh151v6TrEYrHcNKFWToZQldRQ0al9uyrLi8f/inTuWkcz0dXFeLREMPxWNNsby7ThNj6Qfp+/SAdl55EIvBjGqri0tNzFCEEInoJ2eI0r53ezYnE64SN6CzaYith/nrMd6e3WML77dSpL3CYwAMAlmUdME1zClhWeuygZVlpANM0RwiEii4hUH/7ndJzyjfgjGVZx0vPzVqWtbf09+nS604B/5tpmnmgE0gt9IIvuKC8GL5v7cC3bdxkAq03XnGPnuvci31t7QS+cp33RGIfEokiVIQiyI4k0V5yePqxry+INthq6kzgNM3SbNdjIpMjZrjoU27T2q1zJo+eU9nymdvxPvGu2fdLVRjcvoXj//xsrU5GaRwbRalkl9859CqD+XE8DBRF4EnJeKYAwNreYstsr5lpgtah0nt9hmJKBJteNTg3CLr7xhhzExScQqDGJsB285xI7AOa0xbP506vXcynTn2BYh+BQty/mKa5Hvi/gb8rPSYbPP914DuWZX3PNM1B4K4Wz63GF4Hfsyxrn2ma/5Egs14QLrigXMbZ+L61gvQ8xr9+L+knduNNJVB74nRefyODd99T8cyb69zn6toaIe84jKcd0oUsRXcahECdlhhP+8T2RBBhsSDaYKOpswf2HWcl3+OqwQOz6ra+FHz1cYsH9u5jIpslqntctkzllh2SrmclQtaGverMsNn92njPrfiOR/rgCH7BqYxjx9YOVI4huyI8etjm/YNdGGqh8lohAh74cF9/w2xvLtME303Qe/UAU3scilNZcAE10NbQuwKZTUQMz/eYyo4h8RFSkLfTbBy6Cr2BU0z5PcH52+k1Qv3g1WLXqd9k+Bvgb0u6xypB3be/xfP/L+Cbpml+khmXknbwHeCfTNNMAifnOEdLXLBB+Vxh/Ov3VkxLRSiMn88x9bMHkK5L/Ld+uyZzfiNQ/kKpPVG+/vwRfvraXsbSaTpD3Vw81Mltm1zSY8dRhF4TCBtR85qVJppNGt7Qdz9K4Vk8rx+lrkv//YPv5Id7XqPoZtEUQdFTePqkJN/fzXsvPUP3nplmYruZoaIqbPnCHUgBp3/yPGrIqDBfyseY9nwmcpIjuYvZ0vkMsqo56nounrFtVrbXDrVS0eOoepz428NI1yOVmaSo2IFlExoOBo5bDP5dsPEcD3yJrWR57Ef3ccsH7264KzlfO71GaDV4tZh16hY47ypxlmUVgI80eXhb1fOqGza/3uA4y5r8/fLSX/9r6f+zxlJQroJv26Sf2F3JiAGQkuKJE4z+9//G1IM/QYv3VzJn6bqzShxtn2ueer31X6j7V8V5YUU3Tr+LoQlsD549FbAZbnRkyeJpZogEZppJxlB3S0GcRpOGqiiyPvoKRQ8cz68EcSFUstM7efzwJhw3T3UhQBFwwOnEu9pGPaTgnMnPmRk2KhuZn7sNVVMZ37UPO5VFXRZm+VUXs/FTt+JISTwa4qnk+wBYH32FsJqh4HVw0r6M966aTelqlxJX2dZrKt09g2QKU9hOhoK6CUWGEX4BbB/P9oK3LUB4gjPTJ9j/Nw+x5TO3N/19ns/dVBlzDV6dC4PYapSGPJZU4ubAUlCugptM4E0lEKGZTrR97BjuxHig4SsU/HyO5EM/JbV7J0JRmpY4mmGher3VXygvovOSplBMpHFjAi1SqnkLeG1C47puFal7wTBGdiZIlksGcwniNJo0jKppImoGKQz0ugywUExSsCfxpTdrBDtTFNhDPVz+N7+BlhFNM8NWZSNFVdn4mVtx3h9l5MwhCqrDqJFAjj2FuXxbZQDnyeQHeGbqdqJqmrTTwXu2rCOszy4jtNtwq9/Wd0ZXMthxPeG+3yc2+jz7Tj+O67iVdUgiUfMKfhhGn9zLJnv+WhznCufTjWcJZ4cLwqPvfEHrjaP2lOqPvo+fy+EmzoAQCF
1D6MGH1jl1gsyTj+NnMzUljvGv3zvnOcqCMtLP1RD1k2PNX1v/hUorChlFIHyBtL2yzQQAGUdQ7OxCzYvKMAbMbPddVWkpiGO7XmXS0PNnjpvzOsm6HcSjIZQ6x5Cw0Us43B/YNNWhw5DEYyGisS4iy3sbfvF932b0vv9M8hf34+dzDe+pNfIUp1MHwABN1St6INbIU9xz/eaKL2KuqOIxxHu2rKuZTvRsh/xIsiISNbh9S8VrsP4eVVzFS/SzFRu/xYoN97Fi47foW/F5YqFeLl61g6jaCW7As8YHNaugpRTUgoI3GrBGzjV838YtjszpNVjeHTSCncgGFMAGXpdLOP9YypSroIRCdFx3PePf+Br+9BS+XcDPZkHXMdasRagq0vNwEwnwXGTRQYSDLFWoKukndjNw511NSxllon7w9wJC6KUpqtZE/frtdqfv0+lLCkIgsiA7ZYU73WFIBjf0Eiv2oIXSs5pJY9kCk9kCiiLQVaUmOFcL4swew44S6byRtb3P11yblB4dPTu4Yd0aTk+nKLpZyqmjL+HiIZ818U0NKX2VXcPUTjJdu+C9Cv7JOOLlYYQUlXva/eHf49SYBZqooUkIoTCaPMiwWM2nr17XcDqxWR117R/soDidI/niEZxUoWVZpdG2XlcNNi27mlesnwESvGBgRxLYa4W66sX620O76ofz3XE13B1ISfbYBMXpHM//6XcI93cuurjXEuaPpaBcDwng43seCCVo4888gHQcpOOi6DrCqP3SeNNJ3GQCY1njupznTGDn9uC7KaR0EUJD1eNoxjCyBVG//gtlSLgkV+TpWAgjp6OHOii6eVzf59JVCusHNmFeug35G17NF9zzff7xxWMcmExTcDw0VVT4vEKIGkGcRpOGhnpDVSCo7dLfMyiQUvLTvfuYzGaJah6XrVC5e9vmpgav5V0DRRdsH3Rg3QQSEHvWgZSkXjnIU5/9aybeXURVtIB9MTwASOxjxyhOJznwP75P2A/RddM7WPbpz9WUkOrrqE7Wxvrqwxy671GMeCd6T4TBm7ZifuEO9DZMCaqxdfUNTOj7GU8dxYtIlIIgclKn43mDwdvnR3Gbr/rhfB1SGtHxsscmyI9NEx7sRouGzom41xLmj6WgXAUvl2PyO9/Gn06B5yFUFaW7B+H7eMkkcrWH0HWEpqH2xmdpYKjdvWi9zcn2qTM/wnNTCPxS7dXHLU4AYETe1pSo3+gL9f5UHiklBzcM4WrdDHb2c+1wD5+74M82igAAIABJREFU4SIMrVRHDSk1zaR7n9jPz/afpjuiY7sevoSJEp93TW+MG9b24U+k8KqytPox7GZdelXA57Zv5e7rNld4yl3hzqZDLzXjvbpA6DrS8wABqxLIV9eQO5SgkHSQGR216CF1PxgjBzSRxZkYR6QyOHtew80Xyb7wHOmnnmDjt7+HUNWGddTs0QnsyRSOphJZ1oNfcJl4fD9Gd3TegUgRCjs+dCf7/+YhRh/ZizdaINQVY/D25o3MZpnwfNQPFzoaXU3HsxNZitM5woPdlUlQOLc15mc7tHOiEnd1xl1SiftVxdiX/oriyRMBHa6cbfk+qGqpxpxFH1xGx3U34Odq64XS8+i8/saWpYt85ik0vQ/XGUeU9uFCCDznDJHB2dStatTzW0O9UT57/WZW33ULU7Yzp6BQNc1tbW+wrU5kizhSMl0octm4y6W/eJUnWmRpNRS6Jl36kKayqqez5XUkcjZdWtV4r6Kg9cZxJsaDnUnYRRpF7Mlp/F4TIXXCJ93Ah08I7DMpUKbw7Twde46j2G6wQPo+mScfZ+yrX2LZ5/5oVtmnrPQnhECWBKaUkHFWgUhRFbZ85nY22TMDMAD2+HRN4G2VCUvXm1cTbmY0OoTne6iKSqVk1GLHVU3HS+07xfN/+h20BruDxXZ8r8IbrhI3rwOb5jDwD9V0udIo9v9uWdZnzsU5YSkoV+DbNtk9LwUliaoGkBACIQTRa65nzV/8fx
jLliM0bYYpMJ1E7e6tMAWaHr/0RdJDawHw3ARSOgihoyhddMU/2PL6WvFbI20EkmqamxAwHO9gTa8Matujx7nyuSmkpzXM0srTfTsPjJJI5Yl3Rdixadm8POXqJwQHYoK7N+hsiEtAYKwJ7ouTTOAVBYofpRjbgL/qWoCKNkZhlYunFSGfp+P5Y/T/8kDdiVxSOx9h8O57ZpV9fMerTAcKXa2RYD3bQKSGdEKD3U0Db6tMePVvXjM/hUG1h2zRx3ZO40sPRaiEtCgd4Z62RqPVkE7X1pWE+xdHm/qtBMuyRoFzFpBhKShX4CYT+OlptHgcZ3yiRmvBd4rErng74bXDlZ8N3fNZBu68q22ecrXGgB4aRg+tQfoOQtERSgeqPtDy9WUslN9aT3MTeNwQ/wnDkVcI942zZpWOe3wd+RevBanUZGlfffogP/rZHpxEFt/xyOsqPzw0iue53HXdmrY8BOtpeOkiPDGyAZWXGY53BUL3Q93YvQoZTIpX3E7uf54k9oxAyEAzo+vFSMC97oKhwj7yT+1F1C0KQtPxc/lKbb+67KPoKkILuNzlUe0yFiMQVQdeJaaRV/Mc2/08ruuQePJg00x4+KPb56WJcWDsRaa8IcL+BEIoSOlTcDIgfYaWv6ct3vubceT7XKCBStwXgE8B6ylN+FmW9QPTNK8A/jvgAQXg7qpjqMC3gNeAf6CUPZum+TLwGHApQdPpAwSaF18GrgJGgXXA+y3LOtruNS+1WEso0+GMNcPogwOgKkjfB1XBWLWaZZ/7wqzXKKEQxrLlbQ2OlDUGpCyXvxSEEkJKSXSBGgOe75IrpvD82bZO9QhpKjvW9xBVJlFFkW29P2FL5zMYIk/EVlB1h9DG14lc8XTlNcVkjtTYOLt37cQ/k0B6PkIRSN8nZ0/xT08+yS/3/iO79v+AfaefwJd+w3M3mxB8eup9PDNxOYgwmfw4Occmr11CIfpufCWQ+Exdmq89WFGy4prL6N1+C0KrWwikRO2No8Xjldr+xntuZeUdl6NFdaTrEV7WTai/szKqDeDj0fOO9aDXK2C0j3L9Gk0wfUWO0x+Z5vTvTzHyhyleXPEEE+sTSDFbPqGYzOFl7bYoejAj1Zo1bqGgXYrEQOAiMUiJjXQN3FV/iqaovjde3kYIyfJbLzmvI9/nAWWVuOuADwM3AROWZV0PvAv4T6Zp9gNfBz5nWdZNwFeYmc7TgO8CT1qW9Rd1x+4Cvl96zSngduDXgD7Lsq4BPgGsnu8FL2XKJSihEJ3X38jUzx4gtHYdxqo1SMcBVaX3tveiRs/eSHKxNAbmcpSuR5k+9WsDu7hMOcVIRqdLPUVeDtDfadCVyAbcEqlgrDlCfs/V4Ct03vQCE8nn+ODWA+TXRdk/tpFHrR04XeBFJem8R8YOEdZbewhWl058KStKbopQ+dnIHfz29isYH/kBjm6AmAlAHcOD5OWZhtOAgneRfuoJMk8+Dp6L0HTU3jjG6lU1tf36so/WHeXIfY8Gza6pLPntIC8Nc3J4nMn9P2Coax0bh67C8QrzchEv16+z17mkL7PxYn6lb+B1+mSvdlD1Al0vRmpeV86E29XEqJZqzYZuISu3o8gsvojhSYWiZ6Op7S3wiqqw8Z5b8VyP8V++hpOzmXzqAIqu/irR4upV4pYDPy/9O22a5l6CWvcKy7JeKr1mJ1AOwJcRZL/NtlEvlv48QaAWNww8WTr+hGmar8/3gpeCchXKNeFyrVjrjc9ZK54PFksLt5WjdKOgWE2fWhOPs7I7TyGbRDdCGOF1ZNMFCuOBmJEIFVDCOUKbX6Hz7aPoshccFUMrcsnKV0HAQ4kbAYgqHh2KC+gtPQTj0RC9UYN9Y9MkcsUqeU2DrYPddIRVCn6oYQA01jafBtz47e8x9tUvkdr5CH4ujxZv/vuqLvuUg/Srhx9jzDmGUmrqOm6R10ee5MDYcxVJ1KGudQwPXM1UvnUz1Yh3YPRHmVx9Bj8ig4BcToyFQH
Rr5FcV6Xw5XBl9r8+E29HEmGXJJXR80RNcg6rP24X64L0PM/qzVxCqghb5laTF1avE/S5gAz82TbOTQKrzCHDaNM1LLct6mSCbLjcPnyfQYH7GNM0HmS3JWb/9eRX4KPBXpmn2Emg2zwtLQbkKQlXnXSteCOajMeD5bo2m8lyO0vVBsRF9SlEMVMXA95LAWmJrA0GrYiKNmzNQtBjd104RWzsIMtijJQGJwqZlB/h56lqKns7WrgKh8EwQaOYhGNJUkDCWyqMoCooQeL5kLJVn62A3XeHO1t5/sS7UztkfVaGqLPvcHzF49z3z/33pggRjlYAMkLWTFJwsilCJGJ3YjsNXdlscTo7iyM5ZGiHVUEM6vTet50RkAqlI8CT4MuBcKwrSkGgrY4heFf+U3TQTnlOBcBFdqN8io9f1KnG3AZ81TXM3EAH+o2VZ46Zp3g18yTRNQaAJ+InyASzLypum+WkCyc/fmXWGWtwP3G6a5hMENeUcMK9Rybd8UC5TtGKGRrboBtlQqVb8RqJZiWJ130UtHaXrg2IjZwkhVFQ9jlMcKzUbQ4E4/upeQtrN9Hz4dxg78UhATxOwpjuCTGZJqwoRPU+vlmV5VOe9m2XFYw+aewjargcSBjsjJHI2ri/RFEF/ZwQkuL44q0CjtPh91S9qM9dU68wtpU/BzSGEwJc+vvS4/3WNZ0+pqCJLvKN7lkZIPcyP38bhB/eTdCaRyECjSBEIVeAXPHRf5/ovfwZvqnBW6nAbh66i6BY4kzmF4xVbOtO0wvnyvKzCm0Ul7pkGz3sR2NHgENtKj+8GLq/72XDV6/8MwDTNLcAuy7I+a5pmH0FzcLKdN1LGWzYoV2heh8Z4eSRJrugRNTQuXdHDjvVD86J7nQs0K1F4joPqakitNiACIEIkcgJV8Srb7FpnCVnhtWrGMGCgqF34fiqob/cE9W0p3Ro3io61AwwjKCTSFNIqn5JF2KrSMTxYOXWrAJrI2SQLRdb1dbCmN1YpX6iKYKpQJJGz52WB1Q7mqrvXlwF86SOljxACRSi4vsorowJFUAnSqtAqGiF3bds0q5Sh6wYbLruWF194AC9MDYNHKYCy10a9VcNYYKCrf0/le3TRyhub6je3wvn0vAQoDXn8qqvEnQD+s2maXyDIzP+dZVmthUnq8JYNymWK1vFklmSuNJSQs9k3Nk3GDtgMjbKh84FmJYrcsTPsGz+GcsLHWQeReBex4QE8H36yT3I4GcWRT9Vts0NEOm9gfPQfsL3CDK9VDTO47GP0Dt09q74thFrrRiFETSZ95dp/w8HE820H0Go6nqqI0rBDgPJo93wssNrBXHX3+jKAIpTgfktJyIiSsQXposBQg8k9par8U60RUo9h7SL2P/MwuSs83E4PAagZlY5XDCJPQ/HjC88+69+T57uMp45iaOGmbiet8FahxZ1PWJaVJaDGLRhvyaBcpmhBkMWVMxohBIlskTW9sabZ0Pm5vtysEkX26AQZO4m33EcLqUgdMrkk3jGPx3IreWU8Qle4B1VQ2Wb7rsdd5gpGixeTksOE5EEUivgyxJQcBv8y+prUt8sZczbxczw7hR4ZItb37orgzXwCaFl1rsxTLsPzJds3DDb1/mtXnKce7dTdAVb3XYQvPcZTxym6BSJ6F7506Qj1YnjQaUhsD0J6rCbrrdYIqb/OcF8XA4f7cPbZuNHAZ0rLKghPoPXoC84+59tLaBdvBieUJdTiLRmUyxQtANeTKFWBwvF9HM8n6TbPhs416rfW0pdk3Cm8mARfoBQFoiiQQiIPO5weXElXNdNKQuH4JP/jhSNsODZG9kNFjMHLiK29qUKfQugUUsfZvNyd9WWW0iM58hWm9vw9Tvo05H2K4xm0zu0Bpb4UQ9vxECxjtupciO0bBmvkNcuYrzhPPRotapXHnDyvntxJMjdSKWsMdq5h7cAlhLQo+0ef5vTUIRQcLl0eZs9ohI7QTGbr+ZIdG3pQ/XFcp4fDX9s56z
r7b9jMyIMvo3tVnndnmX22ek/NGqzt4I10QllCY7wlg3J5O522HTRVUCUbjK4o6KpCh67XZENni0aOGs1Qv7X2HQfP8JCAmldmdDOkYKLDIZkqEO2YaeRlj02QG09RUCDRa6BqefySkE+1+EyzL3Ny7F4Sr3wN98xk0OzTFOSK00wd/hri6wpD98xWIZsLjVTnmu1C5iPO0wizaGNVsN0cY6kjKEKtlDVOTx1ACAUhBBPp43i+g6bqfOKatew8NsDuwxMk8zbxiMbvbHiUqwb2c/pQktwRm+nxHtz8dTXXueK2S1l5x+Xzzj5b7QxavadmDdb54I1wQllCY7wlg3L1djoeDTGRKQQiNVIS7wghJbO21QtFO0asjVDT+FLygSFqVqClajPFji5Bt6PjlBYW3/M5OpUjFdLwEXwj1s2G04J3DU5jJzJE1/RXGoSNvsy+b5NLPoaXnKqSLQUQsDpJ6tHHWmpGz4V61bl6tEvTauYvCM1pY77vIgQ19WEISgAHxp4nYsRQhIqmBAFxLHWQd27QuPu6G0jkbNT0NymkXkBKFSFDFKdOE9o0DkIh/8J1leuceNzium9/tmX2Wc0KEVKZc2ewmFS4NwrfU3/3nKjEfcT7/pJK3K8CytvmnQfHeMX3yRY9YiGNrYPd7Ngw1HBbvRA0M2IFWmac1Y2vXDHNIye+SX56ukbkHQndvV3c3L2SB18fQVUExybTJIVACOjxfBxFsCfRga/Be3qnkI6LCOlNv8y+m8AtjAfTjPXsk7CLV5xoqhndKlC2i7loWvnJNN8+Mt7UX7CMRmyOgc41jEwfrj2odBB+mnxxmrBRu1hU12uHOlROjz5R4Xv7jovneAglmILMvXoVnq6iFkQNnaw++2zECuGlAv4DEyiK2nJnMB+GSqusu94fcqG1+wXgglKJm3Ui0/wrAq2M4+fyPG/ZoFy/na7hKS9Sc6+hESuzXUpalTZURaMz3Mv6S65h/57Hcafz+K6PoqsY8RgbLtvGHSu2oAiFRw+MMVlw0BTodH36og7FsI9U4fVChJu7pvA10BW96ZdZ0eJokSGEfqikb1yFgoZqDMzSjK5XgItHQ1y3doAPXraGgY7wvO7nXDSt+w6M8OCB0YqwUbbo8K+vHqXg5PjCTZdWFplGbA6AZG4kKAFIn1jxUUL+foSfIYqCLF5KzrglMDcooVziMcjW8L0VXUPVNXzPxx/IknzvJLYWRS0oRMdDaL2NdwP1DArHKZJwjhG5UqsZwW40wNEOQ6VVPV4oksTIV8ikHgM/jar2ktnTz+SPN2Cfyc+7dv9Wg2VZswVwzgHeskG5jOrtdFd4/lzPVmhkxFqGN53EmZxg6l//paa0Eb3hBrr/4PcIhwKB+PKXLLl7H8q6AmKjwOiP0bN+JcvjGzGXb0MWPT6+YYjbNw7x8R89g/AlOTuJFw1qGkJC3lbxlG6W92zg4lU7mm53FSVEtGcHhd69uBOTtc4rJ3rp2nbTrIWjWgEupKnsHZti1+Ex7n3S4rIV8aZTcI3QiqbVc+MWHj92JmBwSMjYSbL2FJ7v8sOXjjDcuZuty69ky4rrKhog9c3Icgmgo/goYfdlPOnh+T4CF6X4HKqbp6jdhqJrCEVUSjyCUA13GyUY+87YSZyQjquEUDyB1AIhpYOJ52fR1OoZFNKXuJkCsuhRWAWdL0soSnzHQ9HVpgMcrRqszerxEon/zoex04/gI4MSTmoc7D3o5mncF677VRyxLqvE/R2wgoBDvINg1Po/EAiydRAMlxSBH5SeM0ygBncxcAVwv2VZ/940zUcJBI4+TKD+NkigPvdvLct6yDTN9wH/BzBNMAT7smVZfz7fa37LB+VzibLynJ+fbVipdveS/PGPmP7lw0EmHQ5zcqNBWn0VHv0rOtdtZqhrHcqPU5z+6R6EqtDzagdyn8TVPVbcPMDmT23j4FdmsiIRjxHbvByxpo/cZBrhuEg/GDLpjCks2zhAIjsy53X3Dt0Dl/
gk93wbJ3UasqBMrqRr4A9n6UrUK8AdS2YYTwc1+lTBIW07LafgGqEZTSv2kRtJ/OAJwrpKxk6SKSTxpYcmHELkmUjnUcTTCCGa8nbN5dtAFimMfANXesFQiKIiUfHcIobzOqdPbEDPRTDiMTZfdkNpAdNqudtAdG0fmYlpcslhfE9F0ZWKXVUjmlqZQaEIhczxCYpnssiih5u1ESGF9Hge92gG6XoITSW8rButu/0GXqt6/IHpp1me3oVCyUtQ+hT9PGq3mBGh8rRftRFrgE8CRyzL+lBp2u414G3A71uWddo0zX8PfIhACW498G6C8esjwEqCMeljwL+vO65tWdbtpmneCvyJaZo/B74IXGdZ1phpmt9d6AUvBeVziGrlueoShvQ8Oq7ZRuaZpyo/H7m4h6nhjqBknEjirrI5MbmP4tQoHVXTWsIT6J7G5E4LipKRh1+pZEXkimw8NMZL0kWJa6gRHSmDgd8rVksMtT36lBAq8RWfp2fZJ3FzI8g06PHGEqX1CnCJbLHC6XX9QBEurKvsOjTOndesA+w5ec3NaFq26xGPhsgWHWwnh5QO71q2E7P7EJ16luVhlbwzzOhUf1PeriIUNg2YnJzuIJktEBgDgpcvggeqWkDpyEEyhHgqj3o0DaXSf73KnxQd2B2X4/fcQny1HzielxanRvdZV8PYbo50chJXLSL6BUpegK3iTRbI7ZtGD4VAEUjPx7ddjtz3aNtZa7N6vFQl9lACxctB+bPkS5ASPwxaOI8SzuFng2s9h84jbwS2Ag8CWJb1ummaEwQym180TTNDEHgfLz33sGVZ06Zp2sCYZVkJANM0Z2uuzlaHGwBSlmWNlX6+C1i2kAteKhydYwzefQ89774dJRJFFm2USJSed99O7298EG8qAYCvCtIrozM9PMfBdxxwfbL9OaQ6+zNhJ7KMPfLqrKzo19IF3j4yRVhTcXxBWBdcucrjji3BMeZDn1KUEEbHMKHlww0Dsue7hDWb3mgQBBzPx/FnNIG1kmM2Ek5NjfHAq//ELmtu/eUyyjStar/A7esHcT0XT7q8c9lOLou/RkgpEtYVNOHSyX70/APY7uzdSeV9aXGE0oWsEvjyiy7CE8hCGM7E6H0kRveeKBO7XsezAxpaWeVvxcZvsWLDfazY+G2c6B2Bl2NIb6gDUq15fXDsOVzXwXOdgNaogBfzkcsUlL0O2H6ws1EVwoNddKwbZHzXvsr550K5Hl8PLyzxwjF8tWpwRQkcdaQi8Z0wfmHmM/Er5jzyKnAdgGmaG4CydvKdlmV9DDjNTPu8UfBthvrnjgOdpmmWOacL0wdgKVM+52imPOfbdqW04YZU3LCK4pV+z7qOouuBpE2HiheWaNmS3KMqg39rOk6qOMtjTQXef2KK3/3CRl6d2ouhZFEVj+mciqFFMJddu2D6VJld0RPROTrxbIVBsDwW4qWRYMhCVxU8Xwb0wlgYVRFkCkk0JUtEoy2p0Va45/rN+NLnf758is1dhxAodIY9BqIz5gExeRRdad5cVJQQHV03kUwfLpmXB9eLgGxuDWo2hJYLFrtGWWO1yl8zmtpQ1zD7R5+p0qkIkS4kiSmd5NJn8KMS1GDnI1wI7XQRsRDdW1eid4QrzJf5ZK3N6vFKVhLp6qOobibsvlxpZCqGhl9wcI6tA68kzPSrN2L9TeBbpmnuJChDFIDvALtM08wCYwT15rOCZVm+aZqfA35qmuY0QcJ7YI6XNcRSUD5PqFcyqy5taLZAK3j4eqC9oJWcsgUQ7e5GyTpIIUlfXgg86kI+sZ4elFcEsWckQtYKExm9UaKdYXptj4LjI6XEx2d+icAM6tkVukizvnea920VqIrG7Zs9PC/NoaRCV1hnKlekvyPMcDyGlJJ8McfVq8CoipMLHQ9WFYXPb7+IHasOkh9JIwUo1cI/QiWsSoSfAmJNaXrx5Z9hPHWUXHo3isgjMchkVjE5dQXRk1pF83iurLEZTU1KWcOyKLo22U
IST+/EyOnIjI9UQPjgC4no0lCmZU1Abuf89Whcj78U94oOTiW7AQh5+1HIISIxZGojzr4r8e3C+RixPu8qcQSNum9alvUz0zQ3AddblvXHTZ5bVn8rEDT7KP17WenPm0s/+vOqx14Hyj+/HLjRsizbNM3vEJQ25o2loPwGolpUv/N4iqnNfei9cUJrAxNRKX02XL4N7T0Z9k89R3aFjappROPdRIf7yajjpAp5uvfMbD2l59O/w+R09jii9F9gTRr8Nzp9BHP5tRVBm3a0K2rZFQqJTJZncypC+PzaRaAq8IG3CTyZY8uK9/PjV07x1NFJknmbrrDC21c6vHfL7EpZQ6nROg5tNaqv921rbuVA6otkC+N4MhCQUhWNmNFNZ3QlUunhy7tfb8pnFkLFNP8S6/ROxqdeIzk5RXG8QPSUXjFpLWeN6IJcMdXwPjWj3u20flCTPStCQSgqRS9PqC+GPZam7A6lFhUMPYrSQ01AXkjW2qwe78vAymt0OkLKuZ6w6jHY8zbMq3Ygb/XOC0+5NORxvlXiDgPfN03zPxA0EOY/jto+0sBTpmnmgKMEbI55Q0jZOHt6/vnnh4EjF198MaFzIPS+hBn4tk0xMclh5xBj2ROzBgOk9Nm57/s4dqGmmQSQP3KGnu/V2iWt+Pi1PPTaNyiW9IHLkFJiaDHec8knOHFmb1t2Urbr8YfffZycGwQ+z3dJZE4jEBiKz/9yE4RK3nae77Hd/BBRo6uSoXaHVZ45/MMZHQ/p40sfRShoaogd5u+gKlrFsipooiVQ1DjRklWWRDSU4RwUL5CdegC/9BkOjuPT0Xs73z/4zoYCSHdsXTmLBeL5Lnk7w8n7nmRyp1XJMAe2b8H7QCfjmaNt2W6VkSum2GX946wAnikkyBUz9HUsp3B8CjuRwXNdOk9H2dT1diQw+bg1azR7MTnD7S7E84Vt27z66qsA66688sqji3bgtyCWMuU3AZRQiPDylVzESswGX5qck6Ho25UsRiIrEpzVdklqTxhXdRAoeL5TE5AhUMHz/CJHJ/YwMnWoLTupanYFgIKKn3fxHZecJzn5wgTLhiLEhgdqmojV/O+hrnWcOLOPXHGagpsLdItRGOpeV8kmqy2rhAgj/RyZZDD5OOZf0ViGs/cyhnpFhQ0hlBixrhuJ9t3NroeemmXU2kwLWVU0OiI9bPnM7XifeBf5kSQgOKa8zukpq23brTI0T2+oeR0L9SKEhqFFYI1C1/Ay4spyLlq3HT0cJD7eJ991TrPW+YhILeGNwVJQfpOh/KXxbId8Ihl01PUZMZqMncR2cpWgHDE6MSJRDqdfYOxYkEmWSxPIOiF8CarQOJU8gFLVCJMENefR6cOzarzVWsgAueOTUHCRMUlU84nhUhhPIZGYV9zYMPsyl29jdPoIhewIsjS4ENKjuJ6NNfIU5rIrZ1lWQcB2yE7vZNSLNZasTB1ns/kpegbvmqHu9S5nLO/WLCTVaKWF7Hs+h77xS8Z3vU5hOkPyt4sYgx01Ik6tauHV03SZ9QnsrZJwvLPq9ZLNy65qOZG3JAy0hKWg/CZDszHZwQ8MY409je1kQVCyLfLwfIcnDvwI17MrGV3ATBaIUn3Sl4GzshACV9qcyZxEVWZMNm03CPIIhVdP7uSS1TdXtufV4k0KYJ/JoPkqPj5b+7Oouo8oKGgvOWx855UN35MsUd/6OldWFpOy0t3o9BHWx1fPsqwqw3UTeO4Z0PpmPVZ0CxTsNJlvf69mKtLYdiPx2MXk3Nk6NfVayNWonoajR8Odp7pe9eu7X4uR1gsUVqbwhU/PulWVcpQilKVsdQlNsRSU32RoNia7TFyCsk0DIUq2RQoRPUbU6GYifZze2AxPXSAIGzEKxSzxjuVIJDk7RaGYIWx0YLs5JD5pO4EANNWolDrGpg/PcrIoizM9uvcktuPRKeDqEYf3HfQh3IlaEMici/vxHPry2QGvWgtYFbUfuaJbwCVcO8JcBU2Loyl9uA
0ozYYWZvrvvkv6Zw/WCD7lf/4Al1+js6tvw5yi+pXH6qbh1IJALShIXbalrlf/eiEFXS9GAvfqXpXrv/ybGJEIb2W8/pR2TlTitmxrsPpewFgKym8itBqTHXtuH6Ebu4gaHZVGmRAKru/geDae71YkJwE6Qr1I30cIFd8LnhMJdRIL9VDM5Ck4QWAunQFVaIF0paJLCyLYAAAY0UlEQVTN2p6XxZvuvGKYhx/dSyRrY8jgdZT403oL6tZcWsBho3fWCDOAlC5GxzUMiA2cTh6YzQWOrSb3+E9qpyV9iXR8fv2Vn9Nx9/XsPpaoiOpvG+7nAxevxna9WYG5fhpOeILwSY3cuiLS8eZU12s2TSc8gX/KxpsqwFs8KHPhq8RdAvRalrXzXJ5nKSi/idBKttIbLaB5faCCWgpOGTtJoZjF9Rwm02M4soOhzp4KH7gntowbNn2QdCHBs0fuR1N0MnYS3/dQFBWvNH3n+S66ZhArOWw0255HY2G23mAGojXz8HRrRwu4eoTZ8xLkij4pVjKV6MQwjqGpIaSUOJ5dYaasE2s5VhJ88hVITU7jnpiCfBFFlbzr4lf4xB//FmfyRX605zhPHpuoaGjXiyQ1UqcrU+PsYW9Odb3zbUK6hDcEHwRGgaWg/FZBqy92qCtGb98GTqeCjDEIyBk8KXnsyACvTxpkixpdYYcrVujcYfqs7F2HoYXpiQ4S1qM4XhHbySEUgY6B7we7Pl0NEcyfBIzmVqPYC/V0m0sLuDzC3DN4F9aphzg5NYJQQqiA57lI6bOiZxPrBi+rNMh820bpjXNqg0GyV8VR+1CyDlErwf/f3p0Gx3nfhx3//p9rbwC7IEAAPENSXFISRcluZdkULcUR5ZQe133R6TSTSaPUcquxkhc9Jn3TF8lMpuM2nXY6qqYaO540k6nbNFU6dmrJsaayTMq6ZVGiKHIJHqJ4AARx72Kv5/j3xYNdLhaLgyxILri/zxuBwGL5AAJ+++P/+R09P7vCyJFzGF2v8sr+bfz09JX6uM/aDkO4PiSpVTec0orUexF2b3yArXsOLFtGJktI21M2m90N/CngEXbZfRf4R0BAOJviu7lc7vlsNvsQ8BzgE3b9fWv+8X8NTAA/A54Cqtls9peEy1F/lTCGvpjL5f7tWl2zBOU2stIv9q4tBzBGTEZnzlGqzqGU4ufnezlxLYHWLrbpU3I1714y6Y738+S+MODVMtULEycItF8/PzaUiVIKwzDQOsD3fZTnMdi3sx58mutab3an22q3VWtMRgt5VFPjiFIGY/nP2D3wObR3jcAK29WnDu9nqvwZ3mwJ5YGOWBQe7MfLDJAatrl09CRH4pFVlcct+YLzT1dXKyxLSNvSIeAd4PeBg8C9hEOIHiIMusez2exfEs7DeDqXyx3LZrPfAP4D8C8JA/fnc7lcNZvNKmA0l8u9k81m/ydhJ98IYbBeMxKU28xyv9iur+lJPkhf1z2Uqi+icRgeN+cDjgNoAq1JJ/oZHo/h+prI/P/h7OAjBIHPbGl8vobZIBVNo4GqV8IrlZk9dRHntGLyfJHco3PLNk6spnSrscUZqL+9XOXBkgtCdYBTeonLZ/4K7c9gWxuIpQ4wt28I+8wc3sxlIMDtS+CnopR22ZR25ymes5mYLS7YYVjTXB7X+IKTH5uhEHXoSydW3bwhS0jb0veBf0U4KW4G+CnwRi6XqwBks9mPCc+6h3K53LH5zzkCfGf+7fO5XG7xP13hN+cfMwC8vJYXLEG5zbT6xcY2+S8NsyfScZvBRIIvbPbJV1XDTAmFOV8WtyjgKIM9Q1+k6lcYnTmLZdj18938has475v0/DKO8hU+LqdG30Z/GCO5vf+Ghwg1z8qYLFbQGjJxh95EdNmh90vdFExUX8N2P2S6EJm/0TmNPXcBmx1EfuUJitMKN1nFT4TfB9DoiMbO+jilEqYyiJt5in4KX4fjK1uVx1VLFZ5/9QRvjc0yVXGXXDe1HKk1bivfAI
7mcrk/zGazvwH8G2Aim82aQIRwtvIwcCWbzT6Qy+U+Ah7j+s3DxrqfADCy2WyEcAbzb8y//5NsNvs/crnchbW4YAnKbarxF/v510/VW4ajtknJDTg2EqPq5kk5mop/ffJgxE6glFoQcBr3wpWrRVy/guuHc40dI4L9gUfivXh9sJE2NZWtPkzOobdeb0BZaYhQ7ajj+29fqu8MHM2XGMuXAaj6ATHHWnbofcubgtrFcD9GEy63rQ9p98s4wSnKzuM4G5KU1VT9eVSgwIdUX5y/Z/yIFBdJWAWKforzxfv5xcTXOLhzqH50UasPf+H987xhGli2SaQ3SXFb3w0P6Rdt5T3gz7LZ7L8mHKL4HPDbhNltL/BHuVxuPJvNfgv4z/NHFB7wzRbP9T7wx8BJYBJ4CygRZt9rtrdPgnKba97sUdMVzfDpjMG9G+d471KAZRhE7ATJSHpRPW7jXjjLtLFMmyDw2Ni9g53WPt7++fcgAV40wCyr+fm7AcqlXgpW03KIUEPQL5RL/O8PI6DixJ2eBUPvJ4sVtqYTS7Y71zTfFIwaFUxdwjSbS8oUhi5BkCe2JU1hPE/g+uggwKqYxPq66R88zmb3QwpehumSg6VK3Nf1Lnv7u/n6l/6g/kxnXniF8y8f49hQGlOFQ+bLDY0jy12vWLXbPiUul8udBR6t/TmbzT4OPJzL5f5h0+M+IFwV1eyRhsf8GPjx/B9/Rrj6ac1JUG5zzbMn6hT4OsXvP/EELx4/x9ufTjNdconbFgd39tcbPpr3wtUYhsXk3AjZbV+g9Jim2F/AjwaYZYPIZROjrFBRA2Uv/BFpVZnRGPTnXIvZisYxC1Q9jRsE9dGatU0kpmEu2+7cfFPQ8+Y4feLPgRbD3s0kfd33Mlkcx4xHsDQ4KkIy1oth+ERKpzFMmx3dKQIdDuK3TQPTHEbhApF6fXjBtigYCrthRldlImwcWe56xerMN3nc7ilx644E5TbXPHuiUToWYaArwT9/7EEqB1rPDV7yxhlh1psbfxvvQRs9Vg4Xf9qa0nYXVQUnk1gwO6NV40Rz0E9FqB+p+LqEbUbx50/l6ptIWL7duaY+B8SK4zt7MSvHFmyaRgf4zr08tOUJAD6+dISrM+cwahutg1kMPUfETgMKQ9FwXDEVjgh1Buv14amYTSrQlBsn67k+uuqRTsZWvF7R/nK53GvAa3f4MpYl66DaXG32hB8sHLHafERRm8rW/M/r2o2zVhwrwkThMsnt/UT7u1CmgQ40hmkSH0qz64FHMA0bP/AxDZvNmT2LGidqQb/+nCbsG9AEGtABPbU9gVqTmS9Na9Xu3Lg2qZlpWHRt+CZl6340DgoPjUPZup+uvm+G7duGxb4tj7Ol9976NSuzh0hkgGS0Z9FzGmYaw8qE1zxfH+5o2FesLrizo2yTwDKXbM8WYq1JprwO1I4ijp4dq7cMNx5RLGe5brpMYojRmXPhcPjtfcS3bgjPkG2LgIAdGx9k76YvLVtX3Kpa4mt7AAKOXzVJxeJELQutoTfuELcWHq80nkcvN7M4O3SAnDIZnT6N705i2hkGenYveJFoVQs9MzbXon3bJ9H1aH2IfmN9+NdnwxeYj+IOBaXo25Di8H2bV/W9FmItyJD7dWSp1UYrqQW+5m66XRv/Fr8Y/suWMylMw64PoG/W3FBy8sobLYN+f1eW/p6HSDgWk8UKoBjsii249qU+d3NmT8vSuxsd0r5weP4UhpmuD89vDNS16ovR1z6hOj4LG7qIPprl888cIia1xiuSIfdrRzLldaRxcPyNWK6bbqWZFI2WymrvGXgYWNxCvWvjw3z3zTMtVzJ5geZaochnk+fqszxqliu9u9Eh7Y3t20utmVrweCAIIAb0W+aiFz+/4kpjiLilJCh3kFYBbaWZFI0aqyxaNZQ0B/3m+uqi6/HjTy5x5OxVlFKMzxXxPI8HBsIjj8bGuaWGIt2sxg3UrTSOTHW64/gVLxy8BOx+9qtLzrle63
VNQkhQ7nCrnUmxVGldc1ZbC6JL1VdfnC4yOVfhoS0ZYrbNZNXknUsaCJewAlR9KLoRwoarW2+5kaljR0+y8+mvcPZPXm055xrCoC3EWpGXeAFcz6KXOqdtrrJoVMtqG9Xqqxv5gWayWMHTYb2yUoqIFcdQmuOjipILP/oE/vjn8B9fd/jtH7zOc0c/qY8YvVVqJXEtPzZVpDQytWzQ9ist6qeFuEkSlMWqLF9at7ihpFZf3cj1AzxfYxtGvV45GUkTtZPMuQZ/9XE44a7qm0CFS1MX+cH77/MHL/+EQN+6wFwriWv5sXQcUMsG7epk4ZZdm+g8EpTFqtRK63RTcFzqpmCr+mrbNLAMRSbh1Lv8UJCMptma3sxMdQtRO4YmCGdcKAOF5vVzYxy/+Oat+9rmS+K03/S1zY9MjQ32LBu0ZYC9WEsSlMWqZQcfYXNmz4oNJTXPfGk3h/duIm5Z4W4/x+bRHf1s7l4Y4PxA87ktvRSqwfwRycJz6ELVYPja+ZaNJWtl1zOH2HT4Qay4TVBxseI2mw4/yK5nDq0YtKUKQ6wludEnVm21NwVrarv9nn7knnpJnGWocKxnUyPM7zy8kw8uXSNf8hfdTEw6mqhZWdNqjEVf2wqzkGWAvbhdJCiLG3ajtcLN9dXNgbpWC/zlnYP84P0rhGupQoEO27YTkaVXVK2lpWYhywB7cbvI8YW4I1rN6vj2gT08cU8vETPA9SFiah7eHHA42/rc+k6oBW0JyOJWufM/5ULMMw2DPzz86xy/+CbD184TNSskIks3swhxN5KgLNqKoQz2bz3A/Zu/cEMzLoS4W8jxhWhLKzWzQNg1ODJbpOL5t/HKhLi1JAUR607zYtabWW4qRLuSn2ABrK+s84U3TvPSycsUXa8+6Oilk5d54Q3ZNCTWP8mUO9x6yzqXGnS00jJWIdaL9vutE7fVeso6K57PyavTTJTKLT9eW24qxHommXIHWy9ZZ2M2P14oMzyepztmsy2dpGHH6aqWsQrR7iRT7mCtxmvW3Kqs82bOrhuz+XjEoitqc3W2xIWp69PZWi1jFWI9kky5g9XGaxbdxYN+1jrrvNmz61bZ/Pb5iW0zJZey65O5gUWyQrQ7CcodrDZes7ayqeZWZJ21bLdxNdRLJy8D4SyMpdSy+ah9/VqUUmxNJ8iXq/zR33mQ/ZsykiGLu4YcX3S45vGaccvi8N5Na5p1rnR2vdxRRvOwfK015ycKHLs8yfB4nn//2if8yVvDt3w7iRC3i2TKHa7VeM21zjpbZbs1tbPrpbZ0N2fzn07Oca0QVl/0p6KUPX9VGbcQ64VkygJoPbVtrbRaDVWz0tl1xfP5xv1bOLR7EAuDq4USpqHoT0XZlg43fqwm4xZivZBMWdxyN3N23XxjcLJYoeIFuF6A7SzOJVbKuIVYLyQoi9uidkbdvHFkqbPrxhuDo/kSY/kyWmu0Dgffj+XDI4zt8/vxpEZZ3C0kKIvb4kbOrhtvDAZaMzlXRSmFUgpUQBAEGIbB5FyVrekwUEuNsrhbyJmyuK1Wc3bd2NTi+gFuQ2VFxDRJxyMYKgzeplJrXi0ixJ0kmbJoO41NLbZpYJsGfhDu7bNMxc4NqfBtpfjz33yUrqhzJy9XiDUlmbJoO7Ubg36gMZQiE3fmz5M1mXikfrPwyT1DEpDFXUcyZdGWGm8MDqRiOKaB1tAbd4hblrRVi7uWBGXRllrdGARuWYOLEO1CgrJoa7UbgzVShyzudnKmLIQQbUSCshBCtBEJykII0UYkKAshRBuRoCyEEG1EgrIQQrQRCcpCCNFGJCgLYHVbpv2KS2lkCr/i3vDzzparN7zFWohOJM0jHW41W6YDP+DMC68wdvQUlck5IpkE/Qf3sOuZQxhm69f12vMeOXuVj0amKFZ94o7FA0M9fHnHxhW3WAvRqSQod7jVbJk+88IrXH7pGMo0MKM2XrHK5ZeOAbD72a8CEAQVAm8Sw8pgGBFeeOM0/+eTS1yYLD
BZrGAog0qxwsmrMxQq3oLnF0JcJ6lKB1vNlmm/4jJ29CSqKSNWpsHY0ZN45TKTo89z5cxTXDn7FFfOPMXopef4r+/k+OjKFOcnC+QrHkXXQyk1P7Ae2aknxBIkKHewxmHyzWo776qTBSqTxfr7qwomTIOqgupUkYkLz1GYehkdFFEqig6KnPz0f/FQ6odUXR8dhJtBql5A0fVwgwDXD+rPL4RYSI4vOljjMPlm6ViEhGMxUfFQmQR+scpfd8U4HnfIG4pUoHkoKHNP8DpKXZ/YFgSaiYki+1MneHHmYQIzijIClGHgAknHwjYNkrYtO/WEaEGCcgdbasu05wd4ZsBv/bdfUHRdrN0DzIxM4xkGNmBrKCvF8Dabz6ZG2JrJ1D93+sI18hWXuOMTSVaYKcdAg+EHaKA76shOPSGWIUG5w9UGxR8ZHmUqX6I7FeXydImR2RJVP6ivYSp2xXE8n37XZwCI9iZRAymuzNhs6g4wTQMCzchsEd8wKFdjFEsxTKXxlEIHGscP2NuX4ivZIRlQL8QSJCh3OKXh0IcX2Hv0FOOzJV4b6OFv0gk808ALAgINhgIU+BGLfDJCvCtGtFBh/IMRfpkf4FemPyKZ7sLZ2MWsVtiBz1uj96Jckzg+WoHyA/7u1Bz/6V98jZ4tG4Cw7rk6WcDJJDEj9p39RgjRJiQod7hauZuhoCvw+UnUohxoDMLjBoD5ZDn8r2FybaJAcqaEAbyT+zJfz5cpbT5HRVco4XDq7DZePXcA0w6zZNMPiFU8DnxwntEX3yH57Sc5973/e0N1z0J0CgnKHcyvuFw9cpLixXEqkwVGHZOpHQMoHaC1gVbXz5k1gAIDjfbKJOIzVMpx7s0HjBw7QOrDv43T4/Li5h1cPT9Nb7VKOqjgKTB9TdT36YtGGHnlONfePI1XqKBMAz9mc7XiMffywrpnITqVBOUOVfF8rnw2zvjHFwkmCqBAKQuFxvQ1HhrMhfXLmajJN7b+lPuSJxi0ChQrcc5e3cW/G36MpK/YM1tim28yYhroIAAvwFIKbRrsL/n0bNsAgWb8rWG69m1dVM2x//1zfKdUwYlJVYboXBKUO8yCtup8idJ9m8mOTvPVM6Okyy7pYpWRVAyNrh9fAERMg9/d/yZ7kieIzpRRrolru+zdfAJPKf778Fc4tiFFYrpILBFBx2ziU0USrse+fJm/71gowHc9/LkKP0xGeTcRweB6Ncebpsnzr57gn33tc3fouyPEnScHeB2m1lZddD0coOrYfDCY5m92DWAHmo35EigwUZiEPyAGsCEBu1Mn6Esl2NYVIz+fRfsBbOk/Td5wCQJNEUXGC0gHcN94nt97a5gnj3+G4QcAGLZJkIrycSq66IfPtE3eGpuVTj/R0SQod5DmtmrlWNhdMeyIzan+HoqGQimDjWWXlNZ0RR36U1F293dxcKvD54dMtmeS2JsylGyLOUNRtAzseIVYvIwGqmhmqx6Ggosbu0GD9nwC93qgdR65h3yLYUSR3iTTFVc6/URHk+OLDlJrq47aYdOGMhSR3iTaD6jGHIqf30G1N8kQ4KSi2Jsz2KaBaSimqiV81Y1FldHZEoFpQBAAilk3wTU/RSViAYozlokDZEyT8lCayLVZ0AFW3Kb/4F4e/p3H+d53fsjMeB7t+ijbJNKbJLGtj7htSaef6GgSlDtIq7bqxLY+AKLjswy5Hl0GsKErfH/Dfb6uaIquni9TnPkJU6UqtqEoA4qA96/tpqLt8Gah1qAUFaWYsDTvZYd49rcOsv0ffHFBPfLhJx/gxycuYXg+yrFQhsIPtHT6iY4nQbmDtGyrVhDduoFf/7X7eSI7xKe5K/xkeLQekHWg8SouB+7fSN/Qt7lQ9Si4PyLlVCkWbN67tpu/OP+r9b/D0PWnRSmDszv62f7MIWJNzSG1jr6jZ8eYKlVIOxEO7uyXTj/R8SQod5hFwTB2PRiahsG3N3ZjWCZHzl5l5PQI9mSB+6aKPPB6juETl9
jy9O/y0uv7IJhi/FyF8aslohEomWEts1IqPBaxTSK2SWljN9MVd1FQNg2DZx/dw9OP3FMfri8ZshASlDvOSsGw9vHHjn3KqeMX6VLg6DDg1gbbf2n/Nl46qYhsUXT5Y0TyZapaYwFRy8SKOSilMBT0JWLLnhFHLJPBrvgt/qqFWD8kKHeo5YKhX3GZfv0UG4DGYuXaYPtv/ePHgflB9Zt72Vooc2WmSKAUxnwXoNaadDzC4/dslAxYiBsgQVksUhtsb0YXDwmqThXxp4sLsu3uqM333z7Dn717lsszRdCwqTvOUw/vlDNiIW6QBGWxiJNJEskk8IrVxR9Lx3EySWBhtv17B/fyT764m5HZEqAZ7IpLhizETZDmEbGIGbHpP7gHPd+FV6P9gP6De5ccsxmxwuaS7ZmUBGQhbpJkyqKlXc8cAmDs6EmqU0WcdJz+g3vr7xdC3BoSlEVLhmmw+9mvsvPpr8ggeiFuIwnKYllmxCY2mL7TlyFEx5AzZSGEaCMSlIUQoo1IUBZCiDYiQVkIIdrIcjf6TIBqdXEDgRBCNGqIE1Kg/v9puaA8CHD69OnbdClCiLvAIHD2Tl/EerZcUH4XOAiMALI0TQixHJMwIL97py9kvVNa65UfJYQQ4raQG31CCNFGJCgLIUQbkaAshBBtRIKyEEK0kf8HfNOE6BB7ewcAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "visualize('umap','count',corpus,metric= 'cosine')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/balavenkatesan/testing.py b/examples/balavenkatesan/testing.py index 0793d55ed..35a67117f 100644 --- a/examples/balavenkatesan/testing.py +++ b/examples/balavenkatesan/testing.py @@ -18,31 +18,64 @@ def load_adm_sat_school_data(return_X_y=False): n_features = int(temp[1]) target_names = np.array(temp[2:]) - - df = pd.read_csv("./merged_adm_sat_data.csv", sep=",", usecols=(0, 1, 2, 3), skiprows=0) + df = pd.read_csv( + "./merged_adm_sat_data.csv", sep=",", usecols=(0, 1, 2, 3), skiprows=0 + ) data = np.empty((n_samples, n_features), dtype=int) target = np.ma.empty((n_samples,), dtype=int) for index, row in df.iterrows(): - data[index] = np.asarray([df.iloc[index][0], df.iloc[index][1], df.iloc[index][2]], dtype=np.float) + data[index] = np.asarray( + [df.iloc[index][0], df.iloc[index][1], df.iloc[index][2]], dtype=np.float + ) target[index] = np.asarray(df.iloc[index][3], dtype=np.int) - feature_names = np.array(['ACT_AVG','SAT_AVG','GRAD_DEBT','REGION']) + feature_names = np.array(["ACT_AVG", "SAT_AVG", "GRAD_DEBT", "REGION"]) if return_X_y: return data, target - return datasets.base.Bunch(data=data, target=target, - target_names=target_names, - DESCR='School Data set', - feature_names=feature_names) + return datasets.base.Bunch( + data=data, + target=target, + target_names=target_names, + DESCR="School Data set", + feature_names=feature_names, + ) + def show_plot(X, y, n_neighbors=10, 
h=0.2): # Create color maps - cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF','#FFAAAA', '#AAFFAA', '#AAAAFF','#FFAAAA', '#AAFFAA', '#AAAAFF','#AAAAFF']) - cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF','#FF0000','#FF0000','#FF0000','#FF0000','#FF0000','#FF0000','#FF0000',]) - - for weights in ['uniform', 'distance']: + cmap_light = ListedColormap( + [ + "#FFAAAA", + "#AAFFAA", + "#AAAAFF", + "#FFAAAA", + "#AAFFAA", + "#AAAAFF", + "#FFAAAA", + "#AAFFAA", + "#AAAAFF", + "#AAAAFF", + ] + ) + cmap_bold = ListedColormap( + [ + "#FF0000", + "#00FF00", + "#0000FF", + "#FF0000", + "#FF0000", + "#FF0000", + "#FF0000", + "#FF0000", + "#FF0000", + "#FF0000", + ] + ) + + for weights in ["uniform", "distance"]: # we create an instance of Neighbours Classifier and fit the data. clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights) clf.fit(X, y) @@ -52,8 +85,7 @@ def show_plot(X, y, n_neighbors=10, h=0.2): # point in the mesh [x_min, x_max]x[y_min, y_max]. x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 - xx, yy = np.meshgrid(np.arange(x_min, x_max, h), - np.arange(y_min, y_max, h)) + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) # Put the result into a color plot @@ -65,20 +97,21 @@ def show_plot(X, y, n_neighbors=10, h=0.2): plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold) plt.xlim(xx.min(), xx.max()) plt.ylim(yy.min(), yy.max()) - plt.title("3-Class classification (k = %i, weights = '%s')" - % (n_neighbors, weights)) + plt.title( + "3-Class classification (k = %i, weights = '%s')" % (n_neighbors, weights) + ) plt.show() -if __name__ == '__main__': +if __name__ == "__main__": school = load_adm_sat_school_data() X = school.data[:, :2] # we only take the first two features. 
y = school.target - #show_plot(X,y,3) + # show_plot(X,y,3) model = neighbors.KNeighborsClassifier(10) - model.fit(X,y) + model.fit(X, y) model.predict(X) - #visualizer = KnnDecisionBoundariesVisualizer(model, classes=school.target_names, features=school.feature_names[:2]) + # visualizer = KnnDecisionBoundariesVisualizer(model, classes=school.target_names, features=school.feature_names[:2]) visualizer = KnnDecisionBoundariesVisualizer(model) - visualizer.fit_draw_poof(X, y) \ No newline at end of file + visualizer.fit_draw_poof(X, y) diff --git a/examples/bbengfort/classifiers.ipynb b/examples/bbengfort/classifiers.ipynb new file mode 100644 index 000000000..ac0590abf --- /dev/null +++ b/examples/bbengfort/classifiers.ipynb @@ -0,0 +1,435 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Classifiers and Class Labels\n", + "\n", + "This notebook explores how the classification score visualizers handle constraints when it comes to different types of models, data, class labeling schemes, and other parameters. 
In particular we explore the following:\n", + "\n", + "Target Types:\n", + "\n", + "- binary\n", + "- multiclass (3 classes)\n", + "\n", + "Target Encoding:\n", + "\n", + "- integers\n", + "- labels \n", + "\n", + "Labeling:\n", + "\n", + "- list of classes\n", + "- LabelEncoder\n", + "- dict encoding\n", + "- list of more classes than values in y" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Ensure we're importing the development version of Yellowbrick\n", + "import sys\n", + "sys.path.append(\"../..\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Use inline so that we can run the notebook multiple times\n", + "%matplotlib inline\n", + "\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Import all of the Yellowbrick classifiers\n", + "from yellowbrick.classifier import *\n", + "from yellowbrick.exceptions import YellowbrickError\n", + "from yellowbrick.datasets import load_game, load_occupancy\n", + "\n", + "# Import scikit-learn utilities\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.naive_bayes import MultinomialNB, GaussianNB\n", + "from sklearn.model_selection import train_test_split as tts\n", + "from sklearn.preprocessing import OneHotEncoder, LabelEncoder\n", + "from sklearn.datasets import make_classification\n", + "\n", + "from collections import namedtuple\n", + "from functools import partial\n", + "\n", + "Dataset = namedtuple(\"Dataset\", \"X,y,classes,encoder\")\n", + "Split = namedtuple(\"Split\", \"train,test\")\n", + "\n", + "\n", + "make_binary = partial(make_classification,\n", + " n_samples=500,\n", + " n_features=20,\n", + " n_informative=8,\n", + " n_redundant=2,\n", + " n_classes=2,\n", + " n_clusters_per_class=3,\n", + " )\n", + "\n", + 
"make_multiclass = partial(make_classification, \n", + " n_samples=500,\n", + " n_features=20,\n", + " n_informative=8,\n", + " n_redundant=2,\n", + " n_classes=6,\n", + " n_clusters_per_class=3,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Select the parameters to run against all models \n", + "# Then restart kernel and run all\n", + "MODEL = LogisticRegression(solver='lbfgs', multi_class='auto')\n", + "IS_FITTED = False\n", + "DATASET = \"multiclass\" \n", + "TARGET = \"integers\"\n", + "ENCODER = \"labelencoder\"\n", + "USE_PANDAS = False\n", + "\n", + "\n", + "def make_dataset(name=DATASET, target=TARGET, encoder=ENCODER, use_pandas=USE_PANDAS):\n", + " loader = {\n", + " 'game': load_game, \n", + " 'occupancy': load_occupancy,\n", + " 'binary': make_binary,\n", + " 'multiclass': make_multiclass,\n", + " }.get(name)\n", + " \n", + " if name in {'game', 'occupancy'}:\n", + " dataset = loader(return_dataset=True)\n", + " labels = sorted(dataset.meta['labels'].items(), key=lambda i: i[1])\n", + "\n", + " if use_pandas:\n", + " X, y = dataset.to_pandas()\n", + " else:\n", + " X, y = dataset.to_numpy()\n", + " else:\n", + " X, y = loader()\n", + " labels = zip(list('abcdefghijk'), np.unique(y))\n", + " \n", + " if name == 'game':\n", + " X = OneHotEncoder().fit_transform(X)\n", + " \n", + " # game target is string encoded, occupancy is integer encoded\n", + " if target == \"integers\":\n", + " if y.dtype.kind != 'i':\n", + " y = LabelEncoder().fit_transform(y)\n", + " elif target == \"labels\":\n", + " if y.dtype.kind == 'i':\n", + " rv = {i[1]: i[0] for i in labels.items()}\n", + " y = np.array([rv[yi] for yi in y])\n", + " else:\n", + " raise ValueError(f\"unknown target type '{target}', use integers or labels\")\n", + " \n", + " c, le = None, None\n", + " X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, shuffle=True, stratify=y)\n", + " \n", + " if encoder == 'list':\n", + " 
c = [l[0] for l in labels]\n", + " elif encoder == 'labelencoder':\n", + " le = LabelEncoder().fit([l[0] for l in labels])\n", + " elif encoder == 'dict':\n", + " le = {l[1]: l[0] for l in labels}\n", + " elif encoder is None:\n", + " c, le = None, None\n", + " else:\n", + " raise ValueError(f\"unknown encoder type '{encoder}', see make_dataset for choices\")\n", + " \n", + " return Dataset(Split(X_train, X_test), Split(y_train, y_test), c, le)\n", + " \n", + "\n", + "def visualize(visualizer, model=MODEL, is_fitted=IS_FITTED, score=True):\n", + " if is_fitted:\n", + " # This includes both auto and True; fit the model manually if you want the exception raised\n", + " model = model.fit(dataset.X.train, dataset.y.train)\n", + " _, ax = plt.subplots(figsize=(9,6)) \n", + " \n", + " try:\n", + " oz = visualizer(model, ax=ax, classes=dataset.classes, encoder=dataset.encoder, is_fitted=is_fitted)\n", + " oz.fit(dataset.X.train, dataset.y.train)\n", + "\n", + " if score:\n", + " oz.score(dataset.X.test, dataset.y.test)\n", + "\n", + " oz.finalize()\n", + " except YellowbrickError as e:\n", + " print(e)\n", + " except Exception as e:\n", + " print(\"A NON YB ERROR OCCURRED:\")\n", + " print(e)\n", + " return oz\n", + " \n", + " \n", + " \n", + "dataset = make_dataset()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = visualize(ClassPredictionError)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.43" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "oz.score_" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ClassificationReport(ax=,\n", + " classes=None,\n", + " cmap=,\n", + " encoder=LabelEncoder(), force_model=False, is_fitted=False,\n", + " model=None, support=None)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "visualize(ClassificationReport)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ConfusionMatrix(ax=,\n", + " classes=None,\n", + " cmap=,\n", + " encoder=LabelEncoder(), fontsize=None, force_model=False,\n", + " is_fitted=False, model=None, percent=False, sample_weight=None)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "visualize(ConfusionMatrix)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PrecisionRecallCurve(ap_score=True,\n", + " ax=,\n", + " classes=None, encoder=LabelEncoder(), fill_area=True,\n", + " fill_opacity=0.2, force_model=False, is_fitted=False,\n", + " iso_f1_curves=False, iso_f1_values={0.2, 0.4, 0.6, 0.8},\n", + " line_opacity=0.8, micro=True, model=None, per_class=False)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "visualize(PRCurve)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ROCAUC(ax=,\n", + " classes=None, encoder=LabelEncoder(), force_model=False, is_fitted=False,\n", + " macro=True, micro=True, model=None, per_class=True)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "visualize(ROCAUC)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "multiclass format is not supported\n" + ] + }, + { + "data": { + "text/plain": [ + "DiscriminationThreshold(argmax='fscore',\n", + " ax=,\n", + " cv=0.1, exclude=None, fbeta=1.0, force_model=None,\n", + " is_fitted=False, model=None, n_trials=50,\n", + " quantiles=array([0.1, 0.5, 0.9]), random_state=None)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAiAAAAFoCAYAAABuakCAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAQi0lEQVR4nO3cf6jldZ3H8deMo3egxn4gtBIsFeSH4JLBWI6u5QbpZiRI9EcYxApWUrBtBq6ykAVtS5Rb9IdEG7L/7BJCiLaFEsSyNirJNRYv5VsmMCIqKkwr6k7OzP5x79BBZu45d5zzvsd7Hw8YuN/z/Z5z3vC5d87zfs+53z0nTpwIAECnvds9AACw+wgQAKCdAAEA2gkQAKCdAAEA2gkQAKDdTAEyxrh0jPE/p7j92jHGo2OMh8cYHzzr0wEAO9LUABlj3JLka0n2P+/2c5N8McnVSa5M8qExxqvmMSQAsLPMcgbkx0nec4rb35DkSFU9XVVHk3wvydvO5nAAwM60b9oBVfWNMcZrTrHr/CTPTGz/LsnLNnuslZWVpSRvTvLzJMdmHxMAWEDnJLkwyaMHDx5c28odpwbIJp5NcmBi+0CS3065z5uTPPgCnhMAWDxvzfo7ITN7IQHyoySvH2O8Msnvs/72yxem3OfnSXLRRRflvPPOewFPzdm0urqa5eXl7R6DDdZjsViPxWNNFsfRo0fz5JNPJhuv71ux5QAZY1yf5KVV9dUxxs1JHsj6Z0nuqqqfTbn7sSQ577zzsrS0tNWnZo6sx2KxHovFeiwea7JwtvyxipkCpKqeSnJo4+v/mrj9m0m+udUnBQB2NxciAwDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDaCRAAoJ0AAQDa7Zt2wBhjb5I7k1ycZC3JjVV1ZGL/J5Jcn+R4ks9W1T1zmhUA2CFmOQNyXZL9VXVZkluT3HFyxxjj5
Uk+luSyJFcn+dI8hgQAdpZZAuSKJPcnSVU9kuSSiX1/SPKTJC/Z+Hf8bA8IAOw8swTI+Umemdg+NsaYfOvmp0l+mOSxJF8+i7MBADvU1M+AJHk2yYGJ7b1V9dzG19ckuTDJaze2HxhjHK6q72/2gKurq1selPlaWVnZ7hGYYD0Wi/VYPNbkxW+WADmc5Nokd48xDiV5fGLf00n+mGStqk6MMX6b5OXTHnB5eTlLS0tnMi9zsLKykoMHD273GGywHovFeiwea7I41tbWzvikwiwBck+Sq8YYDyXZk+SGMcbNSY5U1X1jjHckeWSMcTzJ95J854wmAQB2jakBUlXHk9z0vJufmNh/e5Lbz/JcAMAO5kJkAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtBMgAEA7AQIAtNs37YAxxt4kdya5OMlakhur6sjE/muS3J5kT5KVJB+tqhPzGRcA2AlmOQNyXZL9VXVZkluT3HFyxxjjQJLPJ3l3VV2a5KkkF8xhTgBgB5klQK5Icn+SVNUjSS6Z2Hd5kseT3DHGeDDJL6vqV2d9SgBgR5n6FkyS85M8M7F9bIyxr6qey/rZjrcneVOS3yd5cIzxcFU9udkDrq6unum8zMnKysp2j8AE67FYrMfisSYvfrMEyLNJDkxs792IjyT5TZJHq+oXSTLG+N+sx8imAbK8vJylpaUzGJd5WFlZycGDB7d7DDZYj8ViPRaPNVkca2trZ3xSYZa3YA4neVeSjDEOZf0tl5MeS7I8xrhgjLEvyaEkPzyjSQCAXWOWMyD3JLlqjPFQ1v/S5YYxxs1JjlTVfWOM25I8sHHs3VXl/RUAYFNTA6Sqjie56Xk3PzGx/+tJvn6W5wIAdjAXIgMA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2u2bdsAYY2+SO5NcnGQtyY1VdeQUx3wryb1V9ZV5DAoA7ByznAG5Lsn+qrosya1J7jjFMZ9J8oqzORgAsHPNEiBXJLk/SarqkSSXTO4cY7w3yfGTxwAATDP1LZgk5yd5ZmL72BhjX1U9N8ZYTnJ9kvcm+eSsT7q6urq1KZm7lZWV7R6BCdZjsViPxWNNXvxmCZBnkxyY2N5bVc9tfP2BJK9O8t0kr0lydIzxVFVtejZkeXk5S0tLZzAu87CyspKDBw9u9xhssB6LxXosHmuyONbW1s74pMIsAXI4ybVJ7h5jHEry+MkdVXXLya/HGJ9K8otp8QEAMEuA3JPkqjHGQ0n2JLlhjHFzkiNVdd9cpwMAdqSpAVJVx5Pc9LybnzjFcZ86SzMBADucC5EBAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAA
O0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO0ECADQToAAAO32TTtgjLE3yZ1JLk6yluTGqjoysf/jSd63sfntqvr0PAYFAHaOWc6AXJdkf1VdluTWJHec3DHGeF2S9ye5PMmhJFePMd44j0EBgJ1jlgC5Isn9SVJVjyS5ZGLfT5O8s6qOVdWJJOcm+dNZnxIA2FGmvgWT5Pwkz0xsHxtj7Kuq56rqz0l+PcbYk+TzSX5QVU/OY1AAYOeYJUCeTXJgYntvVT13cmOMsT/JXUl+l+Qjszzp6urqVmakwcrKynaPwATrsVisx+KxJi9+swTI4STXJrl7jHEoyeMnd2yc+bg3yXer6nOzPuny8nKWlpa2OitzsrKykoMHD273GGywHovFeiwea7I41tbWzvikwiwBck+Sq8YYDyXZk+SGMcbNSY4kOSfJlUmWxhjXbBx/W1U9fEbTAAC7wtQAqarjSW563s1PTHy9/6xOBADseC5EBgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC0EyAAQDsBAgC02zftgDHG3iR3Jrk4yVqSG6vqyMT+Dyb5cJLnknymqv57TrMCADvELGdArkuyv6ouS3JrkjtO7hhj/FWSf0jyN0n+Lsm/jjGW5jEoALBzzBIgVyS5P0mq6pEkl0zse0uSw1W1VlXPJDmS5I1nfUoAYEeZ+hZMkvOTPDOxfWyMsa+qnjvFvt8ledkmj3VOkhw9enSrczJna2tr2z0CE6zHYrEei8eaLIaJ1/NztnrfWQLk2SQHJrb3bsTHqfYdSPLbTR7rwiR58skntzIjDVZXV7d7BCZYj8ViPRaPNVk4Fyb58VbuMEuAHE5ybZK7xxiHkjw+se/7Sf5ljLE/yVKSNyTZ7Lvi0SRvTfLzJMe2MigAsHDOyXp8PLrVO+45ceLEpgdM/BXMG5PsSXJDknclOVJV9238FcyHsv55ks9W1Te2OgQAsLtMDRAAgLPNhcgAgHYCBABoJ0AAgHaz/BXMGXEJ98Uyw3p8PMn7Nja/XVWf7p9y95i2HhPHfCvJvVX1lf4pd5cZfkauSXJ71j+Mv5Lko1XlQ3RzMsN6fCLJ9UmOZ/0PIO7ZlkF3mTHGpUk+V1V/+7zbr03yyay/pt9VVf8+7bHmeQbEJdwXy2br8bok709yeZJDSa4eY7ii7Xyddj0mfCbJK1qn2t02+xk5kOTzSd5dVZcmeSrJBdsx5C6y2Xq8PMnHklyW5OokX9qWCXeZMcYtSb6WZP/zbj83yRezvhZXJvnQGONV0x5vngHiEu6LZbP1+GmSd1bVsY3f6M5N8qf+EXeVzdYjY4z3Zv03u/v7R9u1NluTy7N+DaQ7xhgPJvllVf2qf8RdZbP1+EOSnyR5yca/4+3T7U4/TvKeU9z+hqxfmuPpqjqa5HtJ3jbtweYZIKe8hPtp9k27hDsv3GnXo6r+XFW/HmPsGWN8IckPqsrlaufrtOsxxljO+qnlT27HYLvYZv9nXZDk7
Un+Kck1Sf5xjHFR83y7zWbrkaz/4vTDJI8l+XLnYLvVxnW+/nyKXWf0mj7PADmbl3DnhdtsPbJxNdv/3DjmI82z7UabrccHkrw6yXeT/H2Sm8cY7+wdb1fabE1+k+TRqvpFVf0+yf8meVP3gLvMZutxTdavvvnaJH+d5Loxxlua5+Mvzug1fZ4BcjjrV0zNaS7h/tYxxv4xxssy/RLuvHCnXY8xxp4k9yb5v6r6cFW5TP78nXY9quqWqrp040Ne/5Hk36rKWzHzt9n/WY8lWR5jXLDxW/ihrP/2zfxsth5PJ/ljkrWq+lPWX+xe3j4hJ/0oyevHGK8cY5yX9bdfHp52p7n9FUySe5JcNcZ4KBuXcB9j3Jy/XML9y0kezHoE/fPGNxHzc9r1yPq1/K9MsrTxSf8kua2qpn4DccY2/fnY3tF2rWn/Z92W5IGNY++uKr80zde09XhHkkfGGMez/pmD72zjrLvSGOP6JC+tqq9urM0DWX9Nv6uqfjbt/i7FDgC0cyEyAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2gkQAKCdAAEA2v0/KnfaJ4RqHpYAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "visualize(DiscriminationThreshold, score=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/bbengfort/cooks_distance.ipynb b/examples/bbengfort/cooks_distance.ipynb new file mode 100644 index 000000000..c352805c2 --- /dev/null +++ b/examples/bbengfort/cooks_distance.ipynb @@ -0,0 +1,2004 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cooks Distance\n", + "\n", + "See more at [Cook's Distance](https://en.wikipedia.org/wiki/Cook%27s_distance). A very good description of influence and leverage can be found at [Linear Regression in Python, Outliers / Leverage Detect](https://songhuiming.github.io/pages/2016/11/27/linear-regression-in-python-outliers-leverage-detect/) (this post should make it into the Yellowbrick documentation in some form). \n", + "\n", + "A good description of outlier detection that also includes Cook's Distance is [How to Make Your Machine Learning Models Robust to Outliers](https://heartbeat.fritz.ai/how-to-make-your-machine-learning-models-robust-to-outliers-44d404067d07). 
Original motivating post is at [How do you check the quality of your regression model in Python?](https://towardsdatascience.com/how-do-you-check-the-quality-of-your-regression-model-in-python-fa61759ff685) - source code for this post is at [\n", + "Visual analytics and diagnostics of model fit for linear regression](https://github.com/tirthajyoti/Machine-Learning-with-Python/blob/master/Regression/Regression_Diagnostics.ipynb).\n", + "\n", + "The statsmodels source code for Cook's Distance is at:\n", + "\n", + "- [Outliers Influence](http://www.statsmodels.org/0.6.1/_modules/statsmodels/stats/outliers_influence.html)\n", + "- [Linear Model](https://www.statsmodels.org/dev/_modules/statsmodels/regression/linear_model.html)\n", + "- [NumPy Linear Algebra](https://docs.scipy.org/doc/numpy-1.13.0/reference/routines.linalg.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib notebook\n", + "\n", + "import scipy as sp\n", + "import numpy as np\n", + "import pandas as pd \n", + "import matplotlib.pyplot as plt \n", + "\n", + "# Note: statsmodels requires scipy 1.2\n", + "import statsmodels.formula.api as sm\n", + "\n", + "from sklearn.datasets import make_regression\n", + "from sklearn.linear_model import LinearRegression\n", + "from statsmodels.stats.outliers_influence import OLSInfluence as influence\n", + "\n", + "from yellowbrick.base import Visualizer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Random Test Data\n", + "\n", + "For the purpose of generating tests for the visualizer, I'm using random test data. The compressive concrete strength dataset would also be good for integration testing." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
X0X1X2X3X4X5X6X7X8X9X10X11X12X13y
0-0.0790990.774174-0.389897-1.009309-0.0823650.6373090.2369021.618119-0.520738-0.2108910.654572-0.366485-0.8592110.14597842.341339
1-0.8963111.928183-0.5387420.5881170.724919-0.8566560.842047-1.796532-0.5345442.421266-0.1152431.585129-0.1034420.169937-64.162092
20.479350-0.544794-0.1807320.899390-0.8744540.9077071.0580481.2662990.033375-0.2419880.161981-0.7708800.785022-2.043742-63.616222
3-0.7931361.422011-1.109729-1.4342481.3923040.622966-0.4724420.075291-0.077669-1.725702-0.0756340.147481-0.918953-0.566963-84.649787
40.6708280.1496580.1169480.011745-2.450315-0.529813-1.586794-0.337791-1.360943-1.209053-0.0292310.909747-0.142993-1.622250-166.153271
\n", + "
" + ], + "text/plain": [ + " X0 X1 X2 X3 X4 X5 X6 \\\n", + "0 -0.079099 0.774174 -0.389897 -1.009309 -0.082365 0.637309 0.236902 \n", + "1 -0.896311 1.928183 -0.538742 0.588117 0.724919 -0.856656 0.842047 \n", + "2 0.479350 -0.544794 -0.180732 0.899390 -0.874454 0.907707 1.058048 \n", + "3 -0.793136 1.422011 -1.109729 -1.434248 1.392304 0.622966 -0.472442 \n", + "4 0.670828 0.149658 0.116948 0.011745 -2.450315 -0.529813 -1.586794 \n", + "\n", + " X7 X8 X9 X10 X11 X12 X13 \\\n", + "0 1.618119 -0.520738 -0.210891 0.654572 -0.366485 -0.859211 0.145978 \n", + "1 -1.796532 -0.534544 2.421266 -0.115243 1.585129 -0.103442 0.169937 \n", + "2 1.266299 0.033375 -0.241988 0.161981 -0.770880 0.785022 -2.043742 \n", + "3 0.075291 -0.077669 -1.725702 -0.075634 0.147481 -0.918953 -0.566963 \n", + "4 -0.337791 -1.360943 -1.209053 -0.029231 0.909747 -0.142993 -1.622250 \n", + "\n", + " y \n", + "0 42.341339 \n", + "1 -64.162092 \n", + "2 -63.616222 \n", + "3 -84.649787 \n", + "4 -166.153271 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Make Test Dataset\n", + "X, y = make_regression(\n", + " n_samples=100, n_features=14, n_informative=6, bias=1.2, noise=49.8, tail_strength=0.6, random_state=637\n", + ")\n", + "\n", + "# Convert to a DataFrame for statsmodels\n", + "data = pd.DataFrame(X)\n", + "data.columns = [f\"X{i}\" for i in range(X.shape[1])]\n", + "data[\"y\"] = y\n", + "data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Statsmodels Results\n", + "\n", + "For comparison to the custom computation, this section computes the statsmodels OLS model and Cook's Distance values. In a later section we will compare this to scikit-learn linear regression and the Yellowbrick cooks function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " OLS Regression Results \n", + "==============================================================================\n", + "Dep. Variable: y R-squared: 0.881\n", + "Model: OLS Adj. R-squared: 0.862\n", + "Method: Least Squares F-statistic: 45.14\n", + "Date: Sun, 09 Jun 2019 Prob (F-statistic): 2.75e-33\n", + "Time: 16:26:30 Log-Likelihood: -513.54\n", + "No. Observations: 100 AIC: 1057.\n", + "Df Residuals: 85 BIC: 1096.\n", + "Df Model: 14 \n", + "Covariance Type: nonrobust \n", + "==============================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "------------------------------------------------------------------------------\n", + "Intercept -9.9575 5.033 -1.978 0.051 -19.965 0.050\n", + "X0 101.3951 4.852 20.897 0.000 91.748 111.042\n", + "X1 7.3318 5.241 1.399 0.165 -3.089 17.752\n", + "X2 -3.2906 4.839 -0.680 0.498 -12.912 6.331\n", + "X3 -3.7461 5.237 -0.715 0.476 -14.158 6.666\n", + "X4 28.0759 4.432 6.335 0.000 19.265 36.887\n", + "X5 3.0478 5.184 0.588 0.558 -7.259 13.355\n", + "X6 3.3205 4.966 0.669 0.505 -6.552 13.193\n", + "X7 15.3175 4.014 3.816 0.000 7.336 23.299\n", + "X8 49.7685 5.584 8.912 0.000 38.666 60.872\n", + "X9 14.7939 4.486 3.298 0.001 5.874 23.713\n", + "X10 -0.6941 5.275 -0.132 0.896 -11.183 9.795\n", + "X11 -10.1851 4.921 -2.070 0.042 -19.969 -0.402\n", + "X12 -8.5387 5.249 -1.627 0.108 -18.976 1.898\n", + "X13 25.9747 4.656 5.579 0.000 16.717 35.232\n", + "==============================================================================\n", + "Omnibus: 0.261 Durbin-Watson: 2.027\n", + "Prob(Omnibus): 0.878 Jarque-Bera (JB): 0.418\n", + "Skew: 0.093 Prob(JB): 0.811\n", + "Kurtosis: 2.744 Cond. No. 
2.30\n", + "==============================================================================\n", + "\n", + "Warnings:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" + ] + } + ], + "source": [ + "# Compute an OLS model \n", + "cols = data.columns\n", + "model = sm.ols(formula=f\"{cols[-1]} ~ {' + '.join(cols[:-1])}\", data=data)\n", + "model = model.fit()\n", + "\n", + "print(model.summary())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the influence to get Cook's distance\n", + "inf = influence(model)\n", + "\n", + "# cooks_distance is an attribute of incluence, here C, not sure about P (p-value maybe?)\n", + "C, P = inf.cooks_distance" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "/* Put everything inside the global mpl namespace */\n", + "window.mpl = {};\n", + "\n", + "\n", + "mpl.get_websocket_type = function() {\n", + " if (typeof(WebSocket) !== 'undefined') {\n", + " return WebSocket;\n", + " } else if (typeof(MozWebSocket) !== 'undefined') {\n", + " return MozWebSocket;\n", + " } else {\n", + " alert('Your browser does not have WebSocket support.' +\n", + " 'Please try Chrome, Safari or Firefox ≥ 6. ' +\n", + " 'Firefox 4 and 5 are also supported but you ' +\n", + " 'have to enable WebSockets in about:config.');\n", + " };\n", + "}\n", + "\n", + "mpl.figure = function(figure_id, websocket, ondownload, parent_element) {\n", + " this.id = figure_id;\n", + "\n", + " this.ws = websocket;\n", + "\n", + " this.supports_binary = (this.ws.binaryType != undefined);\n", + "\n", + " if (!this.supports_binary) {\n", + " var warnings = document.getElementById(\"mpl-warnings\");\n", + " if (warnings) {\n", + " warnings.style.display = 'block';\n", + " warnings.textContent = (\n", + " \"This browser does not support binary websocket messages. 
\" +\n", + " \"Performance may be slow.\");\n", + " }\n", + " }\n", + "\n", + " this.imageObj = new Image();\n", + "\n", + " this.context = undefined;\n", + " this.message = undefined;\n", + " this.canvas = undefined;\n", + " this.rubberband_canvas = undefined;\n", + " this.rubberband_context = undefined;\n", + " this.format_dropdown = undefined;\n", + "\n", + " this.image_mode = 'full';\n", + "\n", + " this.root = $('
');\n", + " this._root_extra_style(this.root)\n", + " this.root.attr('style', 'display: inline-block');\n", + "\n", + " $(parent_element).append(this.root);\n", + "\n", + " this._init_header(this);\n", + " this._init_canvas(this);\n", + " this._init_toolbar(this);\n", + "\n", + " var fig = this;\n", + "\n", + " this.waiting = false;\n", + "\n", + " this.ws.onopen = function () {\n", + " fig.send_message(\"supports_binary\", {value: fig.supports_binary});\n", + " fig.send_message(\"send_image_mode\", {});\n", + " if (mpl.ratio != 1) {\n", + " fig.send_message(\"set_dpi_ratio\", {'dpi_ratio': mpl.ratio});\n", + " }\n", + " fig.send_message(\"refresh\", {});\n", + " }\n", + "\n", + " this.imageObj.onload = function() {\n", + " if (fig.image_mode == 'full') {\n", + " // Full images could contain transparency (where diff images\n", + " // almost always do), so we need to clear the canvas so that\n", + " // there is no ghosting.\n", + " fig.context.clearRect(0, 0, fig.canvas.width, fig.canvas.height);\n", + " }\n", + " fig.context.drawImage(fig.imageObj, 0, 0);\n", + " };\n", + "\n", + " this.imageObj.onunload = function() {\n", + " fig.ws.close();\n", + " }\n", + "\n", + " this.ws.onmessage = this._make_on_message_function(this);\n", + "\n", + " this.ondownload = ondownload;\n", + "}\n", + "\n", + "mpl.figure.prototype._init_header = function() {\n", + " var titlebar = $(\n", + " '
');\n", + " var titletext = $(\n", + " '
');\n", + " titlebar.append(titletext)\n", + " this.root.append(titlebar);\n", + " this.header = titletext[0];\n", + "}\n", + "\n", + "\n", + "\n", + "mpl.figure.prototype._canvas_extra_style = function(canvas_div) {\n", + "\n", + "}\n", + "\n", + "\n", + "mpl.figure.prototype._root_extra_style = function(canvas_div) {\n", + "\n", + "}\n", + "\n", + "mpl.figure.prototype._init_canvas = function() {\n", + " var fig = this;\n", + "\n", + " var canvas_div = $('
');\n", + "\n", + " canvas_div.attr('style', 'position: relative; clear: both; outline: 0');\n", + "\n", + " function canvas_keyboard_event(event) {\n", + " return fig.key_event(event, event['data']);\n", + " }\n", + "\n", + " canvas_div.keydown('key_press', canvas_keyboard_event);\n", + " canvas_div.keyup('key_release', canvas_keyboard_event);\n", + " this.canvas_div = canvas_div\n", + " this._canvas_extra_style(canvas_div)\n", + " this.root.append(canvas_div);\n", + "\n", + " var canvas = $('');\n", + " canvas.addClass('mpl-canvas');\n", + " canvas.attr('style', \"left: 0; top: 0; z-index: 0; outline: 0\")\n", + "\n", + " this.canvas = canvas[0];\n", + " this.context = canvas[0].getContext(\"2d\");\n", + "\n", + " var backingStore = this.context.backingStorePixelRatio ||\n", + "\tthis.context.webkitBackingStorePixelRatio ||\n", + "\tthis.context.mozBackingStorePixelRatio ||\n", + "\tthis.context.msBackingStorePixelRatio ||\n", + "\tthis.context.oBackingStorePixelRatio ||\n", + "\tthis.context.backingStorePixelRatio || 1;\n", + "\n", + " mpl.ratio = (window.devicePixelRatio || 1) / backingStore;\n", + "\n", + " var rubberband = $('');\n", + " rubberband.attr('style', \"position: absolute; left: 0; top: 0; z-index: 1;\")\n", + "\n", + " var pass_mouse_events = true;\n", + "\n", + " canvas_div.resizable({\n", + " start: function(event, ui) {\n", + " pass_mouse_events = false;\n", + " },\n", + " resize: function(event, ui) {\n", + " fig.request_resize(ui.size.width, ui.size.height);\n", + " },\n", + " stop: function(event, ui) {\n", + " pass_mouse_events = true;\n", + " fig.request_resize(ui.size.width, ui.size.height);\n", + " },\n", + " });\n", + "\n", + " function mouse_event_fn(event) {\n", + " if (pass_mouse_events)\n", + " return fig.mouse_event(event, event['data']);\n", + " }\n", + "\n", + " rubberband.mousedown('button_press', mouse_event_fn);\n", + " rubberband.mouseup('button_release', mouse_event_fn);\n", + " // Throttle sequential mouse events to 1 every 
20ms.\n", + " rubberband.mousemove('motion_notify', mouse_event_fn);\n", + "\n", + " rubberband.mouseenter('figure_enter', mouse_event_fn);\n", + " rubberband.mouseleave('figure_leave', mouse_event_fn);\n", + "\n", + " canvas_div.on(\"wheel\", function (event) {\n", + " event = event.originalEvent;\n", + " event['data'] = 'scroll'\n", + " if (event.deltaY < 0) {\n", + " event.step = 1;\n", + " } else {\n", + " event.step = -1;\n", + " }\n", + " mouse_event_fn(event);\n", + " });\n", + "\n", + " canvas_div.append(canvas);\n", + " canvas_div.append(rubberband);\n", + "\n", + " this.rubberband = rubberband;\n", + " this.rubberband_canvas = rubberband[0];\n", + " this.rubberband_context = rubberband[0].getContext(\"2d\");\n", + " this.rubberband_context.strokeStyle = \"#000000\";\n", + "\n", + " this._resize_canvas = function(width, height) {\n", + " // Keep the size of the canvas, canvas container, and rubber band\n", + " // canvas in synch.\n", + " canvas_div.css('width', width)\n", + " canvas_div.css('height', height)\n", + "\n", + " canvas.attr('width', width * mpl.ratio);\n", + " canvas.attr('height', height * mpl.ratio);\n", + " canvas.attr('style', 'width: ' + width + 'px; height: ' + height + 'px;');\n", + "\n", + " rubberband.attr('width', width);\n", + " rubberband.attr('height', height);\n", + " }\n", + "\n", + " // Set the figure to an initial 600x600px, this will subsequently be updated\n", + " // upon first draw.\n", + " this._resize_canvas(600, 600);\n", + "\n", + " // Disable right mouse context menu.\n", + " $(this.rubberband_canvas).bind(\"contextmenu\",function(e){\n", + " return false;\n", + " });\n", + "\n", + " function set_focus () {\n", + " canvas.focus();\n", + " canvas_div.focus();\n", + " }\n", + "\n", + " window.setTimeout(set_focus, 100);\n", + "}\n", + "\n", + "mpl.figure.prototype._init_toolbar = function() {\n", + " var fig = this;\n", + "\n", + " var nav_element = $('
')\n", + " nav_element.attr('style', 'width: 100%');\n", + " this.root.append(nav_element);\n", + "\n", + " // Define a callback function for later on.\n", + " function toolbar_event(event) {\n", + " return fig.toolbar_button_onclick(event['data']);\n", + " }\n", + " function toolbar_mouse_event(event) {\n", + " return fig.toolbar_button_onmouseover(event['data']);\n", + " }\n", + "\n", + " for(var toolbar_ind in mpl.toolbar_items) {\n", + " var name = mpl.toolbar_items[toolbar_ind][0];\n", + " var tooltip = mpl.toolbar_items[toolbar_ind][1];\n", + " var image = mpl.toolbar_items[toolbar_ind][2];\n", + " var method_name = mpl.toolbar_items[toolbar_ind][3];\n", + "\n", + " if (!name) {\n", + " // put a spacer in here.\n", + " continue;\n", + " }\n", + " var button = $('');\n", + " button.click(method_name, toolbar_event);\n", + " button.mouseover(tooltip, toolbar_mouse_event);\n", + " nav_element.append(button);\n", + " }\n", + "\n", + " // Add the status bar.\n", + " var status_bar = $('');\n", + " nav_element.append(status_bar);\n", + " this.message = status_bar[0];\n", + "\n", + " // Add the close button to the window.\n", + " var buttongrp = $('
');\n", + " var button = $('');\n", + " button.click(function (evt) { fig.handle_close(fig, {}); } );\n", + " button.mouseover('Stop Interaction', toolbar_mouse_event);\n", + " buttongrp.append(button);\n", + " var titlebar = this.root.find($('.ui-dialog-titlebar'));\n", + " titlebar.prepend(buttongrp);\n", + "}\n", + "\n", + "mpl.figure.prototype._root_extra_style = function(el){\n", + " var fig = this\n", + " el.on(\"remove\", function(){\n", + "\tfig.close_ws(fig, {});\n", + " });\n", + "}\n", + "\n", + "mpl.figure.prototype._canvas_extra_style = function(el){\n", + " // this is important to make the div 'focusable\n", + " el.attr('tabindex', 0)\n", + " // reach out to IPython and tell the keyboard manager to turn it's self\n", + " // off when our div gets focus\n", + "\n", + " // location in version 3\n", + " if (IPython.notebook.keyboard_manager) {\n", + " IPython.notebook.keyboard_manager.register_events(el);\n", + " }\n", + " else {\n", + " // location in version 2\n", + " IPython.keyboard_manager.register_events(el);\n", + " }\n", + "\n", + "}\n", + "\n", + "mpl.figure.prototype._key_event_extra = function(event, name) {\n", + " var manager = IPython.notebook.keyboard_manager;\n", + " if (!manager)\n", + " manager = IPython.keyboard_manager;\n", + "\n", + " // Check for shift+enter\n", + " if (event.shiftKey && event.which == 13) {\n", + " this.canvas_div.blur();\n", + " event.shiftKey = false;\n", + " // Send a \"J\" for go to next cell\n", + " event.which = 74;\n", + " event.keyCode = 74;\n", + " manager.command_mode();\n", + " manager.handle_keydown(event);\n", + " }\n", + "}\n", + "\n", + "mpl.figure.prototype.handle_save = function(fig, msg) {\n", + " fig.ondownload(fig, null);\n", + "}\n", + "\n", + "\n", + "mpl.find_output_cell = function(html_output) {\n", + " // Return the cell and output element which can be found *uniquely* in the notebook.\n", + " // Note - this is a bit hacky, but it is done because the \"notebook_saving.Notebook\"\n", + " // 
IPython event is triggered only after the cells have been serialised, which for\n", + " // our purposes (turning an active figure into a static one), is too late.\n", + " var cells = IPython.notebook.get_cells();\n", + " var ncells = cells.length;\n", + " for (var i=0; i= 3 moved mimebundle to data attribute of output\n", + " data = data.data;\n", + " }\n", + " if (data['text/html'] == html_output) {\n", + " return [cell, data, j];\n", + " }\n", + " }\n", + " }\n", + " }\n", + "}\n", + "\n", + "// Register the function which deals with the matplotlib target/channel.\n", + "// The kernel may be null if the page has been refreshed.\n", + "if (IPython.notebook.kernel != null) {\n", + " IPython.notebook.kernel.comm_manager.register_target('matplotlib', mpl.mpl_figure_comm);\n", + "}\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "class CooksDistance(Visualizer):\n", + " \n", + " def fit(self, X, y):\n", + " # Leverage is computed as the diagonal of the projection matrix of X \n", + " # TODO: whiten X before computing leverage\n", + " self.leverage_ = (X * np.linalg.pinv(X).T).sum(1)\n", + " \n", + " # Compute the MSE\n", + " rank = np.linalg.matrix_rank(X)\n", + " df = X.shape[0] - rank\n", + " \n", + " resid = y - LinearRegression().fit(X, y).predict(X)\n", + " mse = np.dot(resid, resid) / df \n", + " \n", + " resid_studentized_internal = resid / np.sqrt(mse) / np.sqrt(1-self.leverage_)\n", + " \n", + " self.distance_ = resid_studentized_internal**2 / X.shape[1]\n", + " self.distance_ *= self.leverage_ / (1 - self.leverage_)\n", + "\n", + " self.p_values_ = sp.stats.f.sf(self.distance_, X.shape[1], df)\n", + " \n", + " self.draw()\n", + " return self\n", + " \n", + " \n", + " def draw(self):\n", + " self.ax.stem(self.distance_, markerfmt=\",\", label=\"influence\")\n", + " 
self.ax.axhline(4/len(self.distance_), c='r', ls='--', lw=1, label=\"$\\frac{4}{n}$\")\n", + " \n", + " def finalize(self):\n", + " self.ax.legend()\n", + " self.ax.set_xlabel(\"instance\")\n", + " self.ax.set_ylabel(\"influence\")\n", + " self.ax.set_title(\"Cook's Distance Outlier Detection\")\n", + " \n", + " \n", + "viz = CooksDistance().fit(X, y)\n", + "viz.finalize()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/download.py b/examples/download.py deleted file mode 100644 index ec82a6685..000000000 --- a/examples/download.py +++ /dev/null @@ -1,149 +0,0 @@ -#!/usr/bin/env python -# download -# Downloads the example datasets for running the examples. -# -# Author: Rebecca Bilbro -# Author: Benjamin Bengfort -# Created: Wed May 18 11:54:45 2016 -0400 -# -# Copyright (C) 2016 District Data Labs -# For license information, see LICENSE.txt -# -# ID: download.py [1f73d2b] benjamin@bengfort.com $ - -""" -Downloads the example datasets for running the examples. -""" - -########################################################################## -## Imports -########################################################################## - -import os -import sys -import hashlib -import zipfile - -try: - import requests -except ImportError: - print(( - "The requests module is required to download data --\n" - "please install it with pip install requests." 
- )) - sys.exit(1) - - -########################################################################## -## Links and MD5 hash of datasets -########################################################################## - -DATASETS = { - 'concrete': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/concrete.zip', - 'signature': 'b9ea5f26a7bb272a040e2f1a993b26babbf8dc4a04ab8198bb315ca66d71f10d', - }, - 'energy': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/energy.zip', - 'signature': '19fb86f3bcdde208eed46944172cb643ef6a7d58da103fb568fae43205ed89d3', - }, - 'credit': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/credit.zip', - 'signature': '4a91339c69f55e18f3f48004328fbcb7868070b618208fed099920427b084e5e', - }, - 'occupancy': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/occupancy.zip', - 'signature': '429cfe376dc9929a1fa528da89f0e1626e34e19695f3f555d8954025bbc522b8', - }, - 'mushroom': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/mushroom.zip', - 'signature': '884c43cb70db35d211c67b1cf6a3683b2b4569393d2789d5c07840da4dc85ba8', - }, - 'hobbies': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/hobbies.zip', - 'signature': '415c8f68df1486d5d84a1d1757a5aa3035aef5ad63ede5013c261d622fbd29d8', - }, - 'game': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/game.zip', - 'signature': 'b1bd85789a014a898daa34cb5f89ceab6d2cd6488a2e572187e34aa4ec21a43b', - }, - 'bikeshare': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/bikeshare.zip', - 'signature': 'a9b440f65549746dff680c92ff8bdca3c7265f09db1cf09e708e6e26fc8aba44', - }, - 'spam': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/spam.zip', - 'signature': '65be21196ba3d8448847409b70a67d761f873f30719c807600eb516d7aef1de1', - }, -} - - -########################################################################## -## Download functions 
-########################################################################## - -def sha256sum(path, blocksize=65536): - """ - Computes the SHA256 signature of a file to verify that the file has not - been modified in transit and that it is the correct version of the data. - """ - sig = hashlib.sha256() - with open(path, 'rb') as f: - buf = f.read(blocksize) - while len(buf) > 0: - sig.update(buf) - buf = f.read(blocksize) - return sig.hexdigest() - - -def download_data(url, path='data', signature=None, extract=True): - """ - Downloads the zipped data set specified at the given URL, saving it to - the output path specified. This function verifies the download with the - given signature (if supplied) and extracts the zip file if requested. - """ - # Create the output directory if it does not exist - if not os.path.exists(path): - os.mkdir(path) - - # Get the name of the file from the URL - name = os.path.basename(url) - dlpath = os.path.join(path, name) - - # Fetch the response in a streaming fashion and write it to disk. - response = requests.get(url, stream=True) - with open(dlpath, 'wb') as f: - for chunk in response.iter_content(65536): - f.write(chunk) - - # If verify, compare the signature - if signature is not None: - dlsignature = sha256sum(dlpath) - if signature != dlsignature: - raise ValueError( - "Download signature does not match hardcoded signature!" - ) - - # If extract, extract the zipfile. - if extract: - zf = zipfile.ZipFile(dlpath) - zf.extractall(path) - - -def download_all(path='data', verify=True, extract=True): - """ - Downloads all the example datasets. If verify is True then compare the - download signature with the hardcoded signature. If extract is True then - extract the contents of the zipfile to the given path. 
- """ - for name, meta in DATASETS.items(): - url = meta['url'] - signature = meta['signature'] if verify else None - - download_data(url, path=path, signature=signature, extract=extract) - - -if __name__ == '__main__': - path='data' - download_all(path) - print("Downloaded datasets to {}".format(os.path.abspath(path))) diff --git a/examples/mike-curry00/Silhouette_Plot_Data.ipynb b/examples/mike-curry00/Silhouette_Plot_Data.ipynb new file mode 100644 index 000000000..5988e2220 --- /dev/null +++ b/examples/mike-curry00/Silhouette_Plot_Data.ipynb @@ -0,0 +1,339 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visuals for NFL Receiver Clusters\n", + "- 2018 regular season statistics for all players targeted at least once.\n", + "- Data sourced from [Pro-Football-Reference](https://www.pro-football-reference.com/years/2018/receiving.htm)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.cluster import KMeans\n", + "from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer\n", + "import yellowbrick.datasets as ybdata\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "nfl_df = (ybdata.load_nfl(return_dataset=True)\n", + " .to_dataframe()\n", + " .query('Tgt >= 20'))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RkPlayerIdTmAgeGGSTgtRecCtch_Rate...FirstTeamAllProC_posCB_posDT_posFB_posQB_posRB_posT_posTE_posWR_pos
01Michael ThomasThomMi05NOR2516161471250.850...1000000001
12Zach ErtzErtzZa00PHI2816161561160.744...0000000010
23DeAndre HopkinsHopkDe00HOU2616161631150.706...1000000001
34Julio JonesJoneJu02ATL2916161701130.665...0000000001
45Adam ThielenThieAd00MIN2816161531130.739...0000000001
\n", + "

5 rows × 29 columns

\n", + "
" + ], + "text/plain": [ + " Rk Player Id Tm Age G GS Tgt Rec Ctch_Rate \\\n", + "0 1 Michael Thomas ThomMi05 NOR 25 16 16 147 125 0.850 \n", + "1 2 Zach Ertz ErtzZa00 PHI 28 16 16 156 116 0.744 \n", + "2 3 DeAndre Hopkins HopkDe00 HOU 26 16 16 163 115 0.706 \n", + "3 4 Julio Jones JoneJu02 ATL 29 16 16 170 113 0.665 \n", + "4 5 Adam Thielen ThieAd00 MIN 28 16 16 153 113 0.739 \n", + "\n", + " ... FirstTeamAllPro C_pos CB_pos DT_pos FB_pos QB_pos RB_pos \\\n", + "0 ... 1 0 0 0 0 0 0 \n", + "1 ... 0 0 0 0 0 0 0 \n", + "2 ... 1 0 0 0 0 0 0 \n", + "3 ... 0 0 0 0 0 0 0 \n", + "4 ... 0 0 0 0 0 0 0 \n", + "\n", + " T_pos TE_pos WR_pos \n", + "0 0 0 1 \n", + "1 0 1 0 \n", + "2 0 0 1 \n", + "3 0 0 1 \n", + "4 0 0 1 \n", + "\n", + "[5 rows x 29 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nfl_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "features = ['Rec', 'Yds', 'TD', 'Fmb', 'Ctch_Rate']\n", + "X = nfl_df[features].values" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAhsAAAFqCAYAAACzsZhEAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nOzdeXxU1f3/8VcWEiAssu87clgCQcFaRVREWYyAQlIVq1bFKq2iotatbmhr2x+lLrVVrFbUr1XBBRQEFFGwImLEINsnArLJJvsiJCHw++PepCFkmUgmd5K8n49HHmTu+jkzQ+Yz53POvVFHjx5FREREJFyigw5AREREKjclGyIiIhJWSjZEREQkrJRsiIiISFgp2RAREZGwUrIhIiIiYRUbdAAiZcU5dxRYDeQACcDXwB/MbIG//jFgnZk9U8wxBgIrzGx9Kc99OnDQzJY4524CmpjZ/T+xKQWP3QH4G+D8RT8CD5vZO2Vx/FLEcS4wG1hTcJ2ZdXbOPQS0NLNRzrm1wC/N7NMwxtPej2e/mfU8geOsJV+szrlzgFeB/ma20l+/3cx6F9jv98AjQDszW/tTz3+inHPNgT8BZwJHgf3AE2b2or/+KNDKzDb+hGM7vPfyvFLuV+L/NalalGxIZXOumW10zkUBKcBU51yKmc0zs3tC2P824FGgVMkGcA3wKbDEzP5eyn1L8n/Ay2Y2FPISm4+cc53NbEMZn6sk682sczmfsyh9gM1m1resDuic64aXaIwws5X5VjV2znUys4x8yy4Bfiirc/8UzrlawDy898g1ZpbjnOsMzHDOVTOz507wFJfgfU6UKtkI8f+aVCFKNqRSMrOjwGTnXF38b33OuReBVWb2qN/78FsgCtiLlyxcBvQHujjnfgdMBR4H+gFHgBnA7/w/6GuBF4Ar8P7QXwUMdc41Burwv2/4rYHngLZANvAXM3vJOdcWWAA8BlwP1AfGmtnrhTSnO/B5vrYtdM51AjYBOOeuAn7vr14IjDKzTOdcKvAg3v/zTcD1Zrba74FoASThfbA+Adzvt6U68I4fS05pnvNCnOecewpoCEwys9/78R4Xl//8PGxmZ/nbzAB2mdkV/uMlwK/M7Cv/8RnAX4A6zrl0M0sKtb1m9nhhwfo9BNOAX5vZ5wVWvw9cDjzsb9sd2AU0yLd/H7z3Sz1gOzDSzNY456KBp4DzgTi8pPRaM8v235Pr8HolOgEZwDAz+7Gw96iZLSsQ19XANjN7MHeB3xtzCZBVoH2/wuvBOb/gY7835294r38U8ABwCLgHyHLO1TOz251zvwbG+tst8Ntx0G/HTr+NjwDJ/O//2lq89/l1QCv/Nbjdj+Fe4Fb/Ofg33v+vtse9OFLhacyGVHbTgNOdczVyFzjnauP9QfyZ/y39/wHJftnje+AK/0P/Vrw/jt2AU4G+eB84uVqamTOzccAXeH8oJxQ4/0TgYzNzeH+An/QTDfA+hI+YWXf/XI8W0YYZwBTn3BjnXBcAM/vezI76xxoPnItXZkkAxuRLci722zgdeDbfMS8ELvQ/eH8J/AL4GdDB/xld1BNaCr2A3v6/v3HOJRUT12dAonOumnMuBmgEdAFwzp0ENMMri+G3fwHeB+ECP9EoTXsLUwfvef6jmU0vZP1kjn3tL/eX4cdYG3gXuNfMOuIlcG/4qy/Be+8k+m3qBVya71ip/uMOfrsvKeo9Wkhc5/htPYaZpZvZiiLaWpjxwG1m1hUYClxiZu8Cb+OVZG53zvX1YzrPTwj2+I9z9ffjnczxzgbOwGv7zc65ln4v0u/wksC+eO9BqaSUbEhltxfvfV4737JDeLXt65xzTcxsspn9pZB9k4GJZnbYzA7i9WAMyLf+veJO7JyrBlwA/APAzNYBc4Hz/E1i8b7NAXwFtC7iUFcCT+P1PCx1zq11zt3orxsAfGZmm/zenJF431AvAOaa2Sp/u38B/Zxzub2ZC81su//7EOA
FM9tjZof9bYcXEUtr59zKAj9/LWLb/zOzHDPbBnyC92FTaFx4vT7pwCl4Hz4rgR3OuRZ45ZJPzOxIEeehlO0tzD/weimaFLF+FbDPOdfLfzwCeDPf+r7ARjP7AMDM/gN0dM61NrM3gd5mlm1mh4BFQPt8+043s53+c/8N3vsg1PdofWBrMe0K1TbgKr80962ZjSxkmyHA62a2yX/8DMe+T+b47SvMq/57YZMfbyu8BORjM9vs7/dCGbRDIpTKKFLZtcX7INudu8Dvvu4P3As87HfR/8bMvimwbyO8rvJcu4DG+R7vLOHcDYAoM9tTxDFyzOxA7u9ATGEH8f8QjwfG+9/yU4HHnXPf4fWO7C6wLc65Y2I3sz3+OJaGhcR+EnCH30UO3t+FosYilGbMRv5j7MErLxwtJq65eAlJFF5PRzO8RONUYE4J5ypNewvzF7wegkXOuUW5SUMB/wEu93te1prZdm/8JOA9hx2cc/nHeWQCjZxzB4GnnHOn4pXjmuKVW3Llf3/kADGleI9uxysRnahr8UpxH/rx3mNmUwpscxJer0tuwh2NVxbKVdxzfFwb8d4P+ff5/qcELhWDkg2p7FLwvj1l5ftgwMwWA6nOuTi8rtxn8D7Y8ttKvpq8/3tpvkVuB4749e7cD8JSHcM51xA4Jd835t3Ac865QXhjObbj1ftzt68D1PDPcUa+5fXwPugK+3a/CZgWhoGt9fP9nvvBklVMXHPxyjfV8MZGNAcG43W9l/SttzTtLcwSM1vnnLsG+D/nXG87fkbS68B8vITptQLrNuHNYupdYDnOuYl4CW93fyzN/4USUIjv0bl4Japxfs9W7jnPxJslk/9cBRPaevnOtRW4Ga/EMQB4yzk3s5A2TjKzO0KJPwR7gVr5Hjcro+NKBFIZRSol51yUcy4FbyzEvQXWdXfOTXbOxZlZFvAl3gcIeB8KJ/m/v4fXjR3jnEvAK2cUVs8vuB8Afrf4LOAG/7wd8LqOPyxFU2rijdcYmC/+jsDpeB98M4A+zrm2/jf5Z/AG4n0AnO286aEANwKz/ZgKmgpc6Zyr6R//Bufc1aWIsSiXOeei/UGzff14i4vrc7wSSiKw1H98Ft7Uy4zjjn6s0rS3SP54jReAN51z8QXWfQ9swBtf8XaBXRcCzZw3UwjnXHvn3Mv+a9IY+MZPNJLwEoZaFKOE92h+L+H1LjzhJyU457oCr+AlF/lt9la76v5rneJvX80597FzLvfDPg3v/XyEY9/X04Dhfq8Zzrlhzrm7imtHCb7AK3U19J/rsnjPSYRSz4ZUNh875w4DdYHleAM/vyywzVLgO2CZcy4L2Ic36h9gCvCac+4BvBkE7YFleH/oJ5NvUGABbwP/z/+w25tv+Y14PRG/wvtWP8rMNuQbJFosM1vvnBsKjHPezI7cmQm3mdlCAL/88RHeh8sXwAQzO+ScG4U39bea395fF3oSb/ZJN+Arv/dnNV7CUpjWBUoFua4qZNkiP57GwN/MbLkfb6Fx+R/G3+OVl44Au/0Poc+KiCWPedOdQ21vSe7D6yX5O95Mmfz+g/ee2p1/oT8jIwWvXFIb77W+3x/E+1dgkt9rMh+4HXjeObewmBiKe48WPO+5eGUgc84dwiur3Wpm0wpsPhcvKcrwjz0VGOCXbP4FzPFf/yPAzf6MmHeBV51zbc0sxTn3R7z/Y9F44zxuKKYNxTKzL5xzk4DFeFPNX8ebei6VUNTRo4UlyyIiIuHlnIvKLf8455KBR83slIDDkjBQz4aIiJQ7vxyz0h84ux5v6uuCYKOScNGYDRERKXdm9gNeyWoOXmmnPvBQkDFJ+KiMIiIiImGlng0REREJK43ZyCctLS0eOA1vitiJ3hdCRESkKonBu17Kol69emXmX6Fk41in4U1NExERkZ+mL94NB/Mo2TjWZoBOnToRFxdX0rYhW7p0KYmJiWV2vEhXldo7ePBgsrOz+fDD0lynq+KrSq8xVL32QtVrs9p74rK
yssjIyAD/szQ/JRvHygGIi4sjPj6+pG1LpayPF+mqSnt37NhBVlZWlWlvflWtzVWtvVD12qz2lpnjhiFogKiIiIiElZINERERCSuVUUROwH/+8x+WL18edBgiIhFNyYbICejatSsHDx4MOgwRkYimMoqIiIiElXo2RE5AUlISWVlZrFixIuhQREQilno2wuzhWelMXLIt6DBEREQCo56NMHp4VjrjZi8BoPmsdB4cmBRwRCIiIuVPPRthkj/RABg3ewkPz0oPMCIRCdqCBQu44447WLhwIXfeeWe5n3/Lli3MmDGjyPULFiwos7hycnK4+OKLueGGG8rkeEHYvHkzV155JRdeeCHJyclMmjQpb928efMYOHAgF1xwARMnTixxOcDevXsZM2YMgwYNYvDgwSxevBiAF198keTkZC666CLGjh1LZqZ3W5FJkyZx0UUXkZyczIsvvnjMsYo7D0BmZiYpKSkMHTqU5ORknnzyyeO2Kew1KiqWE6VkIwwKJhq5lHCIVG0rV66kS5curFy5kq5du5b7+RcsWMCyZcuKXF+Wcb300kt06NChTI5VGjk5ZXcPzZiYGO6++25mzJjB66+/zquvvsqqVavIyclh3Lhx/Otf/2L69Om89957xS7P9Yc//IG+ffsyc+ZMpk6dSocOHdi6dSsvvfQSb775Ju+99x45OTlMnz6djIwMJk+ezOTJk5k6dSoff/wx69aty2tjcecB70rYkyZNYtq0abzzzjvMnz+fr7/++phtCr5GRcVSFpRslLGiEo1cSjhEKr8NGzYwevRohg8fTkpKCmvWrAG8D/POnTuzcuVKtm7dSmpqKv3792fhwoV5+65evZqrrrqKYcOG8atf/YqdO3eyceNGrrjiCgCWLVuGc46dO3eSk5PDkCFDjpt+PXPmTH7xi18wdOhQLr/8cnbu3MmXX37Jn/70J2bNmsWwYcPYsGHDcXHnJkNZWVk888wzTJgwgaNHj5a6/Vu2bOHjjz8mJSWlxG3ffvtthg8fzpAhQ7j88ssB70Pv5ptv5uKLL2bQoEEsWbKkyOcGYMyYMTzwwAP84he/4Nlnny3y+S+txo0b061bNwBq1apF+/bt2bp1K0uWLKFNmza0atWKuLg4kpOTmTNnTpHLAfbt28eiRYvynpO4uDjq1KkDeMnDoUOHOHz4MIcOHaJx48asXr2aHj16UKNGDWJjYznttNOYPXs2QLHnyRUVFUVCQgIAhw8f5vDhw0RFReWt37FjR6GvUWGxlAWN2RA5ATfffDPr168POgwpQlJS4eOkbr75ZkaNGgXAjTfeyIIFC47bpnfv3jz//PMAzJgxg2uvvZb09JK/KGRnZ/P73/+eRx55hNatW/PJJ5/w3HPP8dhjj7Fy5Uruuusuxo8fz3nnncfkyZP59NNPeeKJJ3j11VfJyspizJgxjB8/ni5dujBx4kQmTZrEddddx48//gjAK6+8Qs+ePdm3bx+LFy/mjDPOoEaNGsfEcPrppzNo0CAA/v73v/P+++9zxRVXkJiYyF133UWnTp0Kjd3MaNCgAddddx29e/fmlltuyVs3cuRIDhw4cNw+d911F2eeeeYxy/74xz9y5513Frp9fvv37+e5557jnXfeIS4ujr1793L48GGuv/56brvtNvr168fBgwfJyckp8rm57bbbyMjIYPDgwbzxxhtkZ2czatSoQp//4tpy8OBBHnrooePakmvjxo2sWLGCpKQkPv30U5o2bZq3rkmTJixZsoStW7cWujx3//r163PPPfewcuVKunXrxn333UeTJk249tpr6devH/Hx8fTp04ezzjqL1atX8/jjj7Nr1y6qV6/OvHnz8m6cVtx58svJyWH48OGsX7+ekSNHHvP/4eWXX+auu+465nkoKpayoGSjjOUOAi2qd+OBAT00ULQSGTVqFGlpaUGHIRHkww8/ZNWqVdx8882A9we/V69eZGdns2/fPmrXrs2uXbvy6uRdunRh165defv
26tWLLl26ANCxY0c++ugjatWqxcGDB9m5cyfbtm3j1FNPZc+ePbzxxhvcfffdx8Xw9ttvM2PGDLKysti+fTu33XYbAN999x3t27cvNO7s7Gw2bNjA2LFjGTduHEeOHDlm/auvvhpS++fOnUv9+vVJTEw8psemMDExMRw6dIg///nPXHzxxXTv3p2ZM2fSoUMH+vXrB5CXSM2YMaPQ5yYzM5M9e/bw29/+Nu85LOz5L6ktaWlpx22X68CBA4wZM4Z7772XWrVqhfQ8FHT48GGWL1/O/fffT1JSEo8++igTJ07kmmuuYc6cOcyZM4fatWtzyy23MHXqVIYNG8aoUaO47rrrqFGjBp07dyY6unTFiJiYGKZOncrevXv57W9/S0ZGBp06dWLu3LnUqVPnuNdoz549RcZyopRshEFRCYcSDZHyFUpPxDPPPFPiNhdeeCH3339/SOdcuXIlt956K6mpqcct79ChA2vWrKF169bExcUBXlmkc+fOAKxateqYXoeMjAw6duxIdHQ0UVFRTJkyhZSUFFavXo2ZkZOTQ7t27Y45zzvvvMOSJUuYNGkSCQkJXHHFFZx88sns3LmT2rVrExtb+J/91atX0717d/bs2UNMTMxxyUaoPRtfffUVH330EfPmzSMzM5P9+/dzxx13MH78+OP2rVGjBu+99x5z587lgQceICUlhW3bthXaI1XUc/Ptt9+SlJSU166inv+S2lJUz0Z2djZjxoxhyJAhDBgwAPB6ALZs2ZK3zdatW2nSpEmRywGaNm1K06ZN89o2aNAgJk6cyGeffUbLli2pX78+AAMGDGDx4sUMGzaM1NTUvHZMmDAh71jFnacwderU4fTTT2f+/Pl06tSJr776iq+++orzzjvvmNeof//+RcZyopRshEnBhCMuJpq7+ycGGZKEwY033siOHTuYPHly0KFIhGjcuDGffvopI0aMIDo6GjOjU6dOeeM1VqxYwcaNG8nKyiI7O5unn36ae+65B/A+RHIvELdhwwamTp2a9y08Ojqajz76iFdeeYWtW7fywgsvcPvttx93fjPjlFNOISEhgVmzZrF48WI6derE6tWri62/r1y5klNOOYWhQ4dy0003HXfsUHs2br/99rx9Fy5cyAsvvJCXaFx99dX85S9/yftgXLt2LW3btiU5OZlVq1aRlZVFo0aNWLlyZd7xdu7cSf369Yt8bubOnYtzrsTnP/94hVB7No4ePcp9991H+/btueaaa/KWd+/enbVr17JhwwaaNGnC9OnT+etf/0q7du0KXQ7QqFEjmjZtypo1a2jfvj0LFiygQ4cONG/enPT0dA4ePEj16tVZsGBBXrlkx44dNGjQgE2bNjF79mzeeOONYs+f386dO4mNjaVOnTocOnSIzz77jOuvvz7vNTr33HPp1avXMa9Renp6kbGcKCUbYZSbcLydlsE3Ow4x2zYxpFurgKOSsrRgwQKysrKCDkMiyIgRI1i4cCGDBw+mevXqnHzyyYwfPx4zo3v37nzzzTcMGDCAyy67jEOHDvGb3/yGnj17AjBs2DA++eQThgwZQnx8PH/84x+pV68eALGxsfTt25fY2FgSEhI4dOhQXqkhv+HDh3PTTTfx7rvv0qdPH1q1akXNmjVp3749u3bt4qKLLmLcuHGceuqpx+yXG1+7du244447ePzxx+nbty/VqlUrk+flyJEjrF+/nrp16+Yte+aZZ1i8eDE1a9akY8eOPProoxw+fJjbb7+d5ORkYmNjGTNmDP379y/yucnIyKBHjx4lPv8/RVpaGlOnTqVTp0553+7Hjh3LOeecwwMPPMCoUaPIyclhxIgRnHzyyQBFLge4//77ueOOO8jOzqZVq1Y89thj1K1bl4EDB3LJJZcQGxtLly5duPTSSwFvbNHu3buJjY3lwQcfzBtQGhsbW+R5rr/+eh599FF27drF3XffTU5ODkePHmXQoEGFvl/yS0pKKjKWExX1U0YaV1ZpaWltge8SExO
Jj48vs+NOmjWfa2ev5Ze92jNpZJ8yO26kKq72WdlU1cuVV6XXGKpee6Hs25yRkcGbb76Z14sTaaraaxyO9mZmZrJ06VKAdr169Vqbf52mvpaDbg1q0LpeAtOWbeBQdtnNARcRqSg6deoUsYmGhJ+SjXIQFRVFalIb9h7KZrZtCjocERGRcqVko5ykJrUBYMqSdQFHIiIiUr40QLSc9G7VgLb1E5i2dCOHsnOoXi0m6JCkDPTu3TvvGgkiIlI49WyUk6ioKFJ6tGFfZjazVEqpNJ5//nnuu+++oMMQEYloSjbKUWrPtgBM/nptoHGIiIiUJyUb5ahXy/q0rZ/Au8s3cjD7cNDhSBmYNGlSsbfsFhERJRvlypuV0pb9mYeZtVKllMpgwoQJIV9ZUUSkqlKyUc5yZ6VMTtesFBERqRqUbJSzU1vWp139WrynUoqIiFQRSjbKWe4FvvZnHmamSikiIlIFKNkIQGpPv5TytUopIiJS+SnZCMApLerToUFtlVJERKRKULIRgKioKFKSWnMg6zDvr1AppSJbtGgR//73v4MOQ0QkoinZCEhqUlsAJqevDTQOOTFxcXFUq1Yt6DBERCKako2A9GxRj44NvVLKj1kqpVRUGRkZrF+/PugwREQimpKNgHillDb8mJXD+yu/Dzoc+YlSU1O59957gw5DRCSiKdkIUN4FvjQrRUREKjElGwFKal6PkxvWZvoKlVJERKTyUrIRoPyllBkrVEoREZHKSclGwPIu8KV7pYiISCWlZCNgPZrVo1OjOkxfvpEDmdlBhyMiIlLmlGwELPdeKQezc5ihe6VUOE888QRjx44NOgwRkYimZCMCpOTNSlkbbCBSaueeey6nnnpq0GGIiEQ0JRsRoHuzk3CN6jBjxffsVylFREQqGSUbESAqKorUnl4pZfpyzUqpSM4//3xuuummoMMQEYloSjYiRG4pZcoSzUqpSH744Qd2794ddBgiIhFNyUaESGx6Ep0b12HGcpVSRESkclGyESG8WSltOXQ4h/eWbww6HBERkTITG64DO+dqAS8B9YB44GFgC/BP4CiwxMxG+9veCaT6yx82sxnOubrAq0BdYD8w0sx2OufOB/4I5AAzzOwR/xh/A37uH+MWM1vknGsFvAzEAJuBK80sM1xtPlEpSa155IMlTElfz2WntAs6HBERkTIRzp6NXwFmZv2AFOAJ4HG8RKAPUNc5N9g51w64DDgLuAiY4JyLAW4FPjazs4C3gLv84z4JjAD6AAOcc12dc+cAJ5vZGcB1/jYA44CnzawvsAq4NoztPWHdmp5ElyZ1eX/F9+w7pFKKiIhUDuFMNrYDDfzf6wE7gXZmtshf9i5wPtAPeN/MsszsB2Ad0BXoD7ydf1vnXHtgp5ltMLMjwAx/u/7AOwBmtgKo55yrA5wLTCtwvoiVe4EvlVIqjssuu4wLLrgg6DBERCJa2JINM3sNaO2cWwXMA+4AduXbZBvQDGgK/FDC8tJsi/97UyAhX9kkd9uIlneBL90rpUK45557uPrqq4MOQ0QkooVzzMYvgfVmNsg5l4TXS7En3yZRRexa2PLSbFvaYxxn6dKloW4asrS0tJC3bVc3nveXb2Te51+QUC2mzGMpD6Vpb2VQ1doLVa/NVa29UPXarPaGT9iSDbwxFbMAzCzdOVcDqJZvfQtgk//jiljeFC9BKbis4LZZBZY3xxsQut85V8PMDubbtkSJiYnEx8eH1soQpKWl0atXr5C3v2p7LA/PXsLGuEaMPLXiDRQtbXsrsvvvv5+tW7cyceLEoEMpV1XpNYaq116oem1We09cZmZmkV/WwzlmYxVwOoBzrg2wD1jhnDvLXz8cmAl8BCQ75+Kcc83xkoLlwGy8GSrgDQidaWZrgTrOubbOuVi8AaWz/Z8U/1ynApvMbB/wob9v3jHC19yyo3ulVBzTpk1j/vz5QYchIhLRwtmz8SzwgnPuE/88N+JNfX3WORcNLDSzDwGcc8/hjes
4Cow2syPOuSeBV5xz84HdwC/9444G/uP//rqZZQAZzrk059xnwBHgt/76B4GXnHM34A08nRTG9paZrk1PolvTusyyTew9lEWd6nFBhyQiIvKThS3ZMLP9wC8KWdW3kG2fAp4qZP+LC9l2HnBGIcvvLmTZZqBCThVITWrLQ7PSeXfZRq7o1T7ocERERH4yXUE0QmlWioiIVBZKNiJUlyZ1SWx6ErNWeqUUERGRikrJRgRL7dmGrJwjTFumC3xFqjZt2tC0adOSNxQRqcKUbESwlB65s1JUSolU06ZNY/z48UGHISIS0ZRsRLDOTerSvdlJzLZN7DmoUoqIiFRMSjYiXGqSSimRbMaMGXz22WdBhyEiEtGUbES4/81KWRtsIFKoe+65h3/84x9BhyEiEtGUbEQ417guSc3rMds2s1ulFBERqYCUbFQAKUltyM45wrRlG4IORUREpNSUbFQA/7tXimaliIhIxaNkowLo1KgOPZvX44MMlVJERKTiUbJRQeSWUqYuVSlFREQqFiUbFYTulRKZZsyYwd/+9regwxARiWhKNiqIkxvV4ZQW9fnANrHrx8ygwxFfixYtaNSoUdBhiIhENCUbFUhKUmsOHznKOyqlRIzdu3ezb9++oMMQEYloSjYqkNxSyhSVUiLGOeecw+jRo4MOQ0QkoinZqEA6NqzDqS3r82HGZnaqlCIiIhWEko0KJjWpjVdK+UalFBERqRiUbFQweaWUJSqliIhIxaBko4Jp36A2vVrWZ07GZnYcUClFREQin5KNCig1qa0/K2V90KGIiIiUSMlGBZSS1BrQvVIiwf3338+1114bdBgiIhEtNugApPTaNahN71YN+GjVFnYcyKRBQnzQIVVZKSkppKWlBR2GiEhEU89GBZWa1IacI0d5+xuVUkREJLIp2aigRvTwSym6wFegLrvsMn7/+98HHYaISERTGaWCategNqe1asDcVVvYvv8QDWtVDzqkKmnFihVkZWUFHYaISERTz0YFlldK0b1SREQkginZqMDybjv/9dpgAxERESmGko0KrE39WvysdQM+Xr2VH/YfCjocERGRQinZqOBSk9pqVoqIiEQ0JRsVXO6sFN12Phj9+/end3vKmzYAACAASURBVO/eQYchIhLRNBulgmtTvxant27I3FVb2bbvII1r1wg6pCplwoQJuqiXiEgJ1LNRCaT2bMORo5qVIiIikUnJRiUwood/23ndK6XcPfnkk7z++utBhyEiEtGUbFQCresl8PM2Dfl4tVdKkfLz/PPP8+677wYdhohIRFOyUUmkJnmllLe+USlFREQii5KNSiKvlJK+NthAREREClCyUUm0qpfAmW0b8cnqbWxVKUVERCKIko1KJCWvlKILfImISORQslGJ5F3gS7NSyk3NmjWpXl133BURKY4u6lWJtDwpgT5tG/HJmq1s2XuQpnV0ga9wW7BggS7qJSJSAvVsVDIpSW04ehTeWqJSioiIRAYlG5XMCP+281OWqJRSHhYtWsTy5cuDDkNEJKKpjFLJtKhbk7PaNWbemq1s3vsjzerUDDqkSm3UqFFkZWVx5ZVXBh2KiEjEUs9GJZSS1FqlFBERiRhKNiqh4T3aEBWl286LiEhkULJRCeWWUuZ/t41Ne34MOhwREanilGxUUqmalSIiIhFCyUYlNbxHa6+UolkpIiISMM1GqaSa1alJX7+U8v2eH2lRV7NSwmHSpEmsWLEi6DBERCKaejYqsdSktn4pRb0b4dKzZ086deoUdBgiIhFNyUYllltKmax7pYiISIBURqnEmtapwdntm/DJ6q0qpYRJ7969OXToEEuXLg06FBGRiKWejUou1b98+Zu65kZYZGdnk5OTE3QYIiIRLaw9G865K4DfAYeBB4AlwMtADLAZuNLMMv3tbgWOABPN7HnnXDXgRaANkANcY2ZrnHNJwD+Bo8ASMxvtn+tOINVf/rCZzXDO1QVeBeoC+4GRZrYznG2ONJd0b83Nb3/
B5PR1jDm7S9DhiIhIFRS2ng3nXAPgQeAs4CJgGDAOeNrM+gKrgGudcwl4icj5wLnAbc65+sBIYLeZnQX8AXjMP/TjwC1m1geo65wb7JxrB1yW71wTnHMxeAnMx/4x3gLuCld7I1XTOjU4p30TPlv7Axt3Hwg6HBERqYLCWUY5H/jQzPaZ2WYz+zVeMjHNX/+uv83pwCIz22NmB4H/An2A/sDb/rYfAn2cc3FAOzNbVOAY/YD3zSzLzH4A1gFdCxwjd9sqJ6WnX0rRBb5ERCQAIZVRnHPJeB/yf3fOdQDWmNnREnZrC9R0zk0D6gEPAQlmlumv3wY0A5oCP+Tb77jlZnbEOXfUX7arkG13lHSMfMtKFI7BfmlpaWV+zFB1OHKY6Cj493+XcVZC+Vy+PMj2lqesrCyg6rQ3v6rW5qrWXqh6bVZ7w6fEZMM592fgZLyxE3/HK280Bm4uYdcooAFwib/vXH9Z/vVF7Rfq8rLY9jiJiYnEx8eHunmJ0tLS6NWrV5kd76c455s9zF21lcbtO9OqXkJYzxUJ7S0vY8aMYePGjVWmvbmq0msMVa+9UPXarPaeuMzMzCK/rIdSRjnHzIYDewHM7BHg1BD22wp8ZmaHzWw1sA/Y55yr4a9vAWzyf5rm2++45f5g0Si8QaUNitu2mOW5y6qklKS2ALypC3yVqdGjRzN8+PCgwxARiWihJBsH/X+PAvgDL0Mpv8wGznPORfuDRWvhjb0Y4a8fAcwEFgKnOedOcs7VwhuvMd/fP9Xfdggw18yygZXOubP85cP9Y3wEJDvn4pxzzfESi+UFjpF7vippePdWREdFMVlTYEVEpJyFkmx85pz7N9DcOTcW+AT4uKSdzOx7YArwOfA+XtnlQeBq59x8oD4wyR8UejcwCy8ZedjM9gCvAzHOuU+B3wL3+Ie+FXjMOfdfYLWZfWhm64HngHnAm8BoMzsCPAn09s/XD/h/IbS3UmpcuwbndmjC5+u2s36XZqWUlZtvvpm//vWvQYchIhLRSuyhMLP7nHMpwI9AS2CCmb0VysHN7Fng2QKLLyhkuyl4iUn+ZTnANYVsuxzoW8jyp4CnCizbD1wcSqxVQUrPNny0agtvLlnHbed0DTqcSmHevHl5g0RFRKRwoQwQvdvM/kSBZEAqnuHdW3PTm18w+WslGyIiUn5CKaMkOuc6hj0SCbtGtarTr2MTFq7fzrqd+4MOR0REqohQko0ewArn3Bbn3Hrn3AbnnK4OVUGlJOkCXyIiUr5CSTaGAB3xrvTZF++S4MeNmZCK4ZLurYmJjmJy+tqgQxERkSoilCmsG/Eu5HUa3vTXz83sP2GNSsLGK6U05cOMzazduZ+29WsFHVKFlpSUxK5du0reUESkCgulZ+NJYChgwLfAL5xzT4Q1Kgmr3FLKFF1z44S99NJLPPTQQ0GHISIS0UIaIGpmqWb2tJn93cwuIbQriEqEuiSxFTHRUUo2RESkXISSbMQ55/K2K8UVRCVCNaxVnfM6NmXRhh18t2Nf0OFUaK+++iqzZs0KOgwRkYgWSrIxHVjknJvgnJsAfAm8E96wJNz+V0rRrJQT8ec//5mXX3456DBERCJaicmGmT2Kd7nwdcBa4AYz+3OY45Iwy52VMkU3ZhMRkTArMdlwzjUDfmZmT5jZk8BQ51yL8Icm4dQgIZ7+Jzfjyw07WKNSioiIhFEoZZR/A1vyPf4GeCE84Uh5SklqDWhWioiIhFcoyUZ1M3sj94GZvQ5UC19IUl4uTmxNrGaliIhImIUyq+Soc24Q3q3lo4FB4Q1JykuDhHj6d2rGrJWbWL19Hx0a1g46JBERqYRC6dm4HrgD2AZsBkYBvw5nUFJ+UnWBrxPy3//+l+eeey7oMEREIloos1FWmdn5ZlbbzOoAyWa2qhxik3IwLLEVsdFRTFay8ZPUqlWLGjVqBB2GiEhEC2U2yq+cc79xzsU45z4FvnPOjS6H2KQc1K8Zz/mdmrH4+52s2r436HA
qnLVr17J58+agwxARiWihlFFuAJ4HLgGWAu2AS8MZlJSv1KS2gEopP8WwYcO48847gw5DRCSihZJsHDSzTOBC4A0zO4J391epJIYltqRaTDSTv1ayISIiZS+UZAPn3NNAH+AT59wZQPWwRiXlqp5fSvl60y6+/UGlFBERKVuhJBtX4N1afqiZ5QBtgRvDGZSUP81KERGRcCnxOhtmthl4PN/j/4Q1IgnEsMRWXiklfR33nN896HBERKQSCamMIpXfSTXiuKBTM9I37SJDpRQRESlDSjYkT2pPlVJKa/z48YwZMyboMEREIloolyvHOZcE1AeicpeZ2UfhCkqCMbRbK+L8WSn3qpQSkgsuuID69esHHYaISEQrMdlwzr0JJAEb8i0+CijZqGROqhHHBa4Z05d/j23bg2tcN+iQRESkEgilZ6OtmXUMeyQSEVKT2jJ9+fdMSV/HfRf0CDqciDd48GD279/P/Pnzgw5FRCRihTJmw5xzcWGPRCLC0G4tvVKKxm2EZNOmTWzfvj3oMEREIlooPRs5wHLn3BfA4dyFZnZV2KKSwNStEccA15z3lm9k5dY9dG6iUoqIiJyYUHo2PgQeAWYBc/L9SCWVNytliXo3RETkxIVyi/lJwCfAPmAvMNdfJpXU0G4tiY/VvVJERKRshHKL+RuBucBleJcu/9g5d3W4A5Pg1KnulVKWbtnN8i27gw5HREQquFDKKFcCXczsF2aWAnRH90ap9HSvlNCMGDGCfv36BR2GiEhECyXZOGxmh3IfmNkBICt8IUkkGOKXUjRuo3gPPPAA1113XdBhiIhEtFBmo2xwzj0FfOA/HgisD19IEgnqVI9jUOcWTF26gWVbdtOt6UlBhyQiIhVUKD0bvwa+B64BfgWs85dJJZeiUkqJxo0bx/PPPx90GCIiEa3Ing3nXJSZHQUOAX8pv5AkUgzp6pdS0tfx4MCkoMOJSG+++SZZWaoqiogUp7iejdxraRwGsvP95D6WSq529WoM7tKC5Vv3sEyzUkRE5CcqMtkws/P8XxuYWUy+n2jg5PIJT4KW0sMrpeiaGyIi8lMVO2bDORcNvOWci3LORfv/xgNTyyc8CdpFXVtSPTaGKUvWcfTo0aDDERGRCqjIZMM5dzmwEjgHr3RyGO8+KQfQbJQqI7eUskKlFBER+YmKHCBqZv8B/uOce8jMHiq/kCTSpCS15u1v1jM5fR2JzeoFHU5Ead68Ofv37w86DBGRiBbK1NcuYY9CIlpuKWXy1yqlFPT+++/z+OOPBx2GiEhEC+WiXt85564FPiPflUPNbE3YopKIUiu+Ghd2bcFbS9azdMtuuqt3Q0RESiGUZOPSQpYdBdqXcSwSwVKT2vDWkvVM/nqdko18PvjgA1atWkWvXr2CDkVEJGKVmGyYWbvyCEQiW3KXFtSoFsPk9HU8PCiJqKiooEOKCHfccQdZWVmMHj066FBERCJWicmGc64Z8ChwGl6PxufA783shzDHJhEkIb4aF3ZpwZtL1vPN5t30aK7eDRERCU0oA0QnAl8BlwNXACsA3QyiCkrt2RaAyelrA41DREQqllDGbNQ0s6fzPV7qnBsaroAkcl3YublXSvl6HeMG9VQpRUREQhJKz0aCX0oBwDnXEqgevpAkUiXEVyO5a0u+3b6P9E27gg5HREQqiFCSjUeANOfcV865xXhjNh4Ob1gSqVJ123kRESmlUMoo/wU6AJ3wBohmAM2K3UMqrQu7tKBmnDcr5ZHBKqVMnTqVpUuXBh2GiEhEC+VGbG8Dh4BvgKV4Cce08IcmkahmXCzJXVqyavs+vv5epZS2bdvSrJlybxGR4hTZs+HfiO1hoCPeDdiOAlH+vzNDObhzrgZegvIIMAd4GYgBNgNXmlmmc+4K4FbgCDDRzJ53zlUDXgTa+Oe+xszWOOeSgH/6MSwxs9H+ee4EUv3lD5vZDOdcXeBVoC6wHxhpZjtDfWKkaKk92zA5fR1TlqzjlJb1gw4nUPv
37+fgwYNBhyEiEtGK7Nkws/+YWSdgnJlFm1lMvn+TQzz+74HcD/hxwNNm1hdYBVzrnEsAHgDOB84FbnPO1QdGArvN7CzgD8Bj/jEeB24xsz5AXefcYOdcO+Ay4CzgImCCcy4GL4H52D/GW8BdIcYsJRjcuQUJcbG6VwrQp08frr/++qDDEBGJaKEMEH3ROdcHwDl3vXPueedciTdnc851BroC0/1F5/K/8su7eAnG6cAiM9tjZgfxxof0AfrjlW8APgT6OOfigHZmtqjAMfoB75tZln+hsXX+efMfI3dbKQM142JJ7tqC1Tv2sfh7dRaJiEjxQkk2/g1kOedOAa4H3gSeDGG/vwJj8z1OMLNM//dteINMmwL5r0R63HIzO4JXHmkK7Cpu22KW5y6TMpKa1BbQrBQRESlZKLNRjprZIufcOOApfzzE2OJ2cM5dBSwws++cc4VtUtQUhtIsL4ttCxWO2QVpaWllfswgNTl8hBqxUbzyRQbDmxw5blZKZWtvUbKyvBshV5X25lfV2lzV2gtVr81qb/iEkmzUcs6dBqQA5zjn4oGSboyRDLR3zl0EtAQygf3OuRp+uaQFsMn/aZpvvxZ41/HIXZ7uDxaNwhtU2qDAtrnHcEUsbwrsybcsJImJicTHx4e6eYnS0tIq5V1Bh2Yc4vWv1xLVpB29Wv3vpams7S1MXFwcWVlZVaa9uarSawxVr71Q9dqs9p64zMzMIr+sh1JG+SvwHN5MkR+Ah/BmeRTJzC41s9PM7OfAv/Bmo3wIjPA3GYE3o2UhcJpz7iTnXC288Rrzgdl4s0sAhgBzzSwbWOmcO8tfPtw/xkdAsnMuzjnXHC+xWF7gGLnnkzKU2tO7wNdklVJERKQYodxi/nXg9XyL7jWznzIF4UHgJefcDXiDOCeZWbZz7m5gFv+btrrHOfc6cIFz7lO8XpFf+ce4FXjWv/7HQjP7EMA59xwwzz/GaDM74px7EnjFOTcf2A388ifELMUY1Lk5teJjmZK+jseST6mSF/i66667WLt2bdBhiIhEtOKus/G6mV3qnNuA9yGefx1m1jqUE5jZQ/keXlDI+inAlALLcoBrCtl2OdC3kOVPAU8VWLYfuDiUGOWnqVEtlou6tuS1xWtJ27iT3q0alLxTJTNy5MgqV+cVESmt4no2xvj/nlXMNlLFpSa14bXFa5n89doqmWyIiEjJiks2BhYxkyTXS2Uci1RAgzq38EopS9bxp4tOrXKllKuuuopdu3bx7rvvBh2KiEjEKi7ZyC15NASS8AZzxuBdiOszlGwIUL1aDEO7teLVr77jyw07OK11w6BDKlfp6el5019FRKRwxV2u/EozuxLvviIdzOwSMxuKd68U/XWVPClJmpUiIiJFC2Xqaxv/2hgAmNk+vBukiQAw0DWndnw1pqTrXikiInK8UC7qtcw591+80skR4OfAt2GNSiqU6tViGJrYkv9L+45FG3YQE3RAIiISUULp2bgW70Jem/HuMfIn4OowxiQVUEoPr7PrtncWMXHJtoCjERGRSBLKRb2OAh/4PyKFGuCaExcTzefrtvM50HxWOg8OTAo6rLA7++yz2bFjR9BhiIhEtFDKKCIl+vNHS8nKOZL3eNzsJQCVPuF46qmndFEvEZEShFJGESnWw7PS85KL/MbNXsLDs9IDiEhERCKJejbkhBSVaOSq7D0c//znP9m4cWOVulukiEhpqWdD5AQ888wzvPXWW0GHISIS0ZRsyAl5cGASDwzoUeT601o14J7+ieUYkYiIRBolG3LCiko4GiXEs2jDDvr9YzYbdh0IIDIREYkESjakTBRMOB4Y0IPV913CyFPb8fm67fT+23Q+sE0BRigiIkHRAFEpM7mDQDdt2pT3+0sj+3Bmu0bc9s6XDH5uDg8NTOLe/t2Jjq5ad4cVEanK1LMhZerBgUn8ukfjvMdRUVGMPtMx76aBtDopgQdnpjPk+Y/Y+WNmgFGWnWrVqhETowu0i4gUR8m
GlIuftW7IolsvZIBrzsyVm+g9YTpfbqj4V9788ssvmTRpUtBhiIhENCUbUm4a1qrOe6P68eCAHqzffYC+T83k2QUZulOsiEglp2RDylVMdDQPDExi+qj+1IqP5TdTFnLNa5/xY9bhoEP7Sb7++msyMjKCDkNEJKIp2ZBADOzcnC9vS+a0Vg14+cs1nPnk+3z7w96gwyq1q6++mnHjxgUdhohIRFOyIYFpU78Wn9w0kNFnduKbzbv52eMzeGvJ+qDDEhGRMqZkQwIVHxvD30eczksj+3D4yBFSJ33CndPSyM53B1kREanYlGxIRLiiV3sWjBlMp0Z1mPDJci545gM27/0x6LBERKQMKNmQiJHYrB4Lbx3MiB6tmb9mG70mTOeT1VuDDktERE6Qkg2JKHWqx/H6VWczYVhvdhzI5IJnPmD83GWaHisiUoHpcuUScaKiorjl7C70btWAy16ax13vfcVna3/g35edSd0acUGHd4x//etfrFy5MugwREQimno2JGL1adeYL8cm069jE6Yu3cDPHp9B+qadQYd1jNNOO42uXbsGHYaISERTsiERrUntGsz89fnc3T+RVdv3ceYTM3nxi9VBhyUiIqWgZEMiXmxMNH+48BTeufZc4mOjue71z7hh8gIOZecEHRpnnHEGo0aNCjoMEZGIpmRDKowh3Vqx6LZkejavx78+X0Xfv8/kux37Ao3pxx9/5NChQ4HGICIS6ZRsSIXSoWFtPh0ziGt/1pGvNu7ktL/NYPryjUGHJSIixVCyIRVOjWqxPHfpGTz3izM4mJ3D0Ofncv/7i8k5oquOiohEIiUbUmFde3pHPr15EO0b1OKPHy5l8MQ5/LBfJQ0RkUijZEMqtFNa1mfRbckM6daSOd9uodeE6SxY+0PQYYmISD5KNqTCO6lGHG/96lweSz6FzXsPcu7Ts3hq/opyueroddddx5AhQ8J+HhGRikzJhlQK0dFR/O68RGbfeD71a8Zz6ztfcvnL89l3KDus5x0zZgyXXnppWM8hIlLRKdmQSqVfx6akjU2mT9tGTE5fx8+fmMHyLbuDDktEpEpTsiGVTvO6NZnzmwHcdk4XVm7by8+feJ/XFn8XlnONHTuWxx9/PCzHFhGpLJRsSKVULSaa8UN78/pVZxMdFcUVr3zKmLe+IOtw2V51dM6cOXz55ZdlekwRkcpGyYZUailJbVh462C6Na3L0/81+v1jNht2HQg6LBGRKkXJhlR6rnFdFowZzBW92vH5uu30/tt0PrBNQYclIlJlKNmQKiEhvhqTLu/D30f8jL2Hshn83Bwe/WAJR46Ef3qsiEhVp2RDqoyoqChGn+mYd9NAWp2UwIMz0xny/EfsOJAZdGgiIpWakg2pck5r3ZAvb0tmYOfmzFy5idP+Np0vN+z4Scfq0qULbdu2LdsARUQqGSUbUiU1SIjnvevO46GBSazffYC+T83k2QUZpb7q6Guvvcajjz4apihFRCoHJRtSZUVHR3H/gB5MH9Wf2vHV+M2UhVzz2mf8mHU46NBERCoVJRtS5Q3s3Jwvxybzs9YNePnLNZz55Pt8+8PekPadMmUKH330UZgjFBGp2JRsiACt6yXw8W8HMvrMTnyzeTc/e3wGby1ZX+J+jzzyCC+88EI5RCgiUnEp2RDxxcfG8PcRp/PyFWdx+MgRUid9wp3T0sjOORJ0aCIiFZqSDZECRp7ajs9vuRDXqA4TPlnOBc98wOa9PwYdlohIhaVkQ6QQ3ZqexMJbLyQlqQ3z12yj14TpfLJ6a9BhiYhUSEo2RIpQu3o1XruyLxOG9WbHgUwueOYDxs9ddsz02N0Hs9iXVbY3dxMRqWyUbIgUIyoqilvO7sJHvxlAk1rVueu9rxjx4ifsOZjFw7PS2XMom/3ZR3h4VnrQoYqIRKzYcB7cOfcXoK9/nseARcDLQAywGbjSzDKdc1cAtwJHgIlm9rxzrhrwItAGyAGuMbM1zrkk4J/AUWCJmY32z3UnkOo
vf9jMZjjn6gKvAnWB/cBIM9sZzjZL5dSnXWPSxiZzxSufMnXpBjr84W12HcyCoXcCMG72EgAeHJgUZJgiIhEpbD0bzrl+QKKZnQEMAh4HxgFPm1lfYBVwrXMuAXgAOB84F7jNOVcfGAnsNrOzgD/gJSv4x7nFzPoAdZ1zg51z7YDLgLOAi4AJzrkYvATmY/8YbwF3hau9Uvk1rl2DmTf056x2jbxEAyC+pveDl3Coh0NE5HjhLKPMw+tpANgNJOAlE9P8Ze/iJRinA4vMbI+ZHQT+C/QB+gNv+9t+CPRxzsUB7cxsUYFj9APeN7MsM/sBWAd0LXCM3G1FfrJHP/iGT7/7Ie9x1IHdRB3YnfdYCYeIyPHClmyYWY6ZHfAfXgfMABLMLPcWm9uAZkBT4Id8ux633MyO4JVHmgK7itu2mOW5y0TKTK0ZT1BrxhPHLNu672Cp77EiIlKZhXXMBoBzbhhesjEA+DbfqqgidinN8rLY9jhLly4NddOQpaWllfkxI1llbe9FDWFTYkP+tXR7kds8u+Bbpi1Zyzkta3N2y9r0bFST2OiQ334VRmV9jYtS1doLVa/Nam/4hHuA6EDgPmCQme1xzu13ztXwyyUtgE3+T9N8u7UAPs+3PN0fLBqFN6i0QYFtc4/hiljeFNiTb1mJEhMTiY+PL2Vri5aWlkavXr3K7HiRrrK399le0HxWet6g0Fy/69eNpBb1mLZ0I++v/J7XbCev2U7q1YgjuWtLhia2ZKBrTq34agFFXnYq+2tcUFVrL1S9Nqu9Jy4zM7PIL+thSzb8mSD/Dzg/3wyQD4ERwCv+vzOBhcC/nHMnAYfxxmvcCtTBG/MxCxgCzDWzbOfcSufcWWb2KTAceArIAMY65x4EGuIlFsuB2f4xHs13PpETljvr5G+TvccPDOiRt+yyU9qRdTiHj1dvZdrSDUxbtpFX0tbwStoa4mOjOe/kZgxLbMWQri1pWqdGUE0QESk34ezZuBTvg/8N5/I6Ha7GSyxuwBvEOclPIO7GSypyp63ucc69DlzgnPsUyAR+5R/jVuBZ51w0sNDMPgRwzj2HNyj1KDDazI44554EXnHOzccbpPrLMLZXqpgHBybxQvVq5OTkHDflNS42hgGuOQNcc54a/jO+2riTqUs3MG3ZBt5f8T3vr/ie0VFweuuGDO3WiqGJrejcuA5RUZWv3CIiErZkw8wmAhMLWXVBIdtOAaYUWJYDXFPItsvxrt1RcPlTeL0c+ZftBy4uVeAipXBSjTiysrKK3SYqKoperRrQq1UDxg3uyZod+3h32UamLd3A/O+28fm67dw7YzEnN6zNsEQv8fh5m4bEROuaeyJSOYR9gKhIZfbYY4+xevXqUu3TvkFtbjm7C7ec3YUdBzKZvmIj05ZuZLZtYvzHyxn/8XIa1Yrnoq4tGZbYivM7NaNGNf1XFZGKS3/BRE7AhRdeeEIjuhskxHNV7w5c1bsDh7JzmPPtZqYu3cB7yzfy7y9W8+8vVlMzLoYLOjVnaLdWXNS1BQ1rVS/DFoiIhJ+SDZEIUb1aDMldW5LctSVHjhxl4frt/gDTDUxd6v1ER0VxVrtGDE1sxdBurejQsHbQYYuIlEjJhsgJGDp0KPv27WPu3Llletzo6CjOaNuIM9o24rGLTsW27WHa0o1MW+aN85i3Zht3TEujW9O6DO3WimGJrejVsgHRlfB6HiJS8SnZEDkB69atK3GAaFlwjety53l1ufO8bmzdd5D3lnvjPD7M2Mxjc5by2JylNK9TgyHdWjE0sSX9OjYlPjYm7HGJiIRCyYZIBdOkdg2uO/1krjv9ZA5kZjM7YzPTlm5g+vLveXZBBs8uyKB2fDUGdW7O0MRWXNilBSfViAs6bBGpwpRsiFRgCfHVuKR7ay7p3prDOUf479of8sZ5TE5fx+T0dcRGR3FOhyZ51/NoXS8h6LBFpIpRsiFSScTGRHNOhyac06EJ44f2YtmW3UxbtpGpSzcw59stzPl
2C7e8s4hTWtRnaLeWDE1sRVLzerqQmIiEnZINkUooKiqKxGb1SGxWj3vP0X32IwAADsFJREFU7873e35k2rINTFu6kbmrtrD4+508PHsJbeol+DNbWtK3fROqxZR8IbGHZ6WzadM2nq06t5EQkROkZEPkBPz/9u4/Nu76vuP483zns+9H/DNuSDD1hZa8gVIiCBkJG5QhBkww0CYYf3Qt1RhT2drxQ1qlrYyfmti6TRrKVjENWkAVypYOiaijBbKKAi0B8qMQ0uRDCLGDnZBfd7bjn+c43h93vpzxb5+/97XvXg/Jurvv+e7eX/nO99L7+/l8PzfffDNHjhzxu4xpnV0b5e4rjLuvMLoH0vx0z6HcqdM3vLGXDW/spT4S5vcvOJtbLjqH620FS6rHLxj3SN4CdCtefm/cadpFRCaisCFSgMcee2zRLUtdUx3m9ksS3H5JgvSpYV7/+GhunMfzOw7w/I4DhIMVXHPeWbmux/Ka6JigAeSuK3CIyHQUNkTKWDgU5NpVy7l21XKe+MO17OxI5s7n8bO9h/jZ3kP8xY/f5uyaCB3d/eMer8AhIjOhsCFSgMcff5zDhw+zZs3iH8AQCAS4tLmRS5sbefiG1bQme9j8wSc88foeWlO9kz5OgUNEpqOwIVKAjRs3FuWkXn5INMT5q6suINWfHnP4ZCIbdx6gOhRkXaKJy5obiFWNH+8hIuVLYUNEpjTasZgscNRUhfjw2En+9qWdAAQrAqxeUc+6libWtSxlfaKJlQ1xTbEVKWMKGyIyrckCx4PXXcxD16+mo6uPt1qPsbXtGFtbj7O9/QQ72pN8/5cOgM/Fq1nXsjQTQBJNrD2nkWhY/35EyoU+7SIyI58NHKNBAzJTa29d3cKtq1sAGDw1zK87kmxtO54LIZt3t7N5dzug7odIuVHYEJEZGw0Xhw4dmnJAaFUoyOUtTVze0sQ9V10AMOPux/pEE+tamrhM3Q+RkqFPskgBmpqa6O2dfKZGKXro+tVs335q1o+bqPuxsyPJ1tZjbG07Pq77Ecp2Py5X90Nk0VPYECnAli1bFt1JvRaKqlAwexilKbdtou7HdnU/RBY9fUpFZMGYqvvxVttxtrZO3P0YHXi6rmWpuh8iC5DChkgBXnvtNfbt21cSJ/VaiPK7H/dmt7V39uYOu+R3P/492/1YtqSayz+v7ofIQqJPoEgB7rnnHtLpNHfddZffpZSN5roYt9bF5tz9WN+ylMQsuh9a5VakcAobIrKoTdb9GA0eW9uOsaM9Oafuh1a5FZkfChsiUnKa62LcVhfjtgK6H8++u5/HXt2Ve06tASMydwobIlLy5tL9mIgCh8jcKGyISFmaqPuxoz3JIy+/x6sfHp70cY++8j7Pvbufa85bzsrGOC31MVY2xFnZGGdZPEJFhWbCiHyWwoaICJnux/pEE+sTTVOGDYDWVC8/eOejCZ6jgkR9nJaGeCaANMRpaYjlrjfGqjQtV8qSwoZIATZt2sTu3bv9LkPm0XSr3D543cV855ov0Zbs5UCyh9bsz4FkD22pXg6c6MEd657wsfGqEIn6OImGOIlsCElkf1Y2xKmNhD3bLxE/KWyIFGDVqlWcPHnS7zJknk23yi3A+ctqOX9Z7YSPPzkwRGuqhwMnsmEkdz0TUD74tHPCx9VFwnkBZGwYSdTHiFVVzuNeTk7TfWW+KWyIFCCdTjM0NOR3GeKBqVa5nc6S6kq+vLyeLy+vH3ffyMgIqf50rhvSeqKH1tSZLsneo13s7EhO+Lyfi1eTaIhlw0ecRGM8F0ha6mNUhYJz3NszNN1XvKCwIVKAtWvXkk6n2bNnj9+liAdmusrtbAQCARqiVTREq7i0uXHc/SMjIxztGRh3iKY12UtrsoedHSneOXhigueFFTVREvWxXAhpqc8MXF3ZEKe5NkooWDFlbflBAzT7RuaPwoaIyBTmusrtXAUCAZYtibBsSWTMInWjhk+f5nB3/5gAciDZQ1v28q224/y
y9di4xwUrApxTF82NGcnMpInnZtL851sfjjmvyCgFDpkPChsiIotIsKKC5roYzXUxrjx3/P1Dw6dp7+ydNIy8tv8I7D8yq9d89JX3aUv2cN/VF9IQraI+EiZSGSyZmTUao+I9hQ0RkRJSGaxgZeMSVjYumfD+gaFhDnZmZs2MhpCf/Kad3xzpmvJ5n932Mc9u+zh3OxysyASPaJj6SDh7WUVD/u1sMBkNKKO/G56HsSXzRWNUikNhQ0SkjFRXBlnVVMOqpprctsdvunTceI18v7dqOb/1+aUk+wZJ9adJ9qXp7B8k1Zfm6MkB3NFuTo+MzLiGWDh0JoREz4SQ0bBSl73dkB9WomFqqysJVkw97mQ2NEaleBQ2RERkRtN9JzMyMsLJwSGSfWlSfelcKEn1p0n1ZUJJMhtOOrNhJdU/SFuqh/cPz3w2VyAAtdXhMV2S+mheNyVyJrzkuinZsBKvCo057DNZuFLg8IbChkgB7r//fg4ePOh3GSLzYq7TfQOBADXVYWqqwyQaZveap4ZP0zUwdCagZMNKZzasJEfDyphtafYe7aIvPTzj1wlVBKjLBo/ewSE6uvsn/d1HX3mfQ119/PU1X6KuOkxtJEzlNDN5FhM/xqgobIgU4I477mD79u1+lyEyb7yY7juVULCCxlgVjbGqWT928NQwqb68UJINK6m8zspoWElluymp/jTHegamfe6n3v6Ip94+c0r6aDhIbXWYukjmcE5t9jJzO0xdpJLabDCpG3Nf5jIWDi2IAbV+jVFR2BARkTGKPd13rqpCQc6qiXBWTWRWjxsZGeGBl3byDz+feKmBr3xhGZec3UDXQJrO/iG6BzKHf7oGhjjWM8C+Y92cOj3zMSqQmXpcW105fWD5THjJ/91Cuyt+jlFR2BApwJ133kkqleKFF17wuxQRmaFAIMDf33gp4VBwzmNU+oeGcwEkc3kmkHTlb+9P0zkwRHd/ms6BNF39Q+w73k3P4OzDXDQczB3WqasOUxOpzN7OXNZF8reFqcuGlLpImO+/uXfCcFWswKGwIVKAbdu2kU6n/S5DROagkDEq0XCIaDjEiomXx5nWqeHTdA+OBpKhbBAZ30kZG2Ay98+1uzKZYgQOhQ0RESlbxR6jMiqUPU9JQ3T2Y1Ug013pS58aE0g68wJJfmB54+Mj7P506vOoeE1hQ0REytpiGaOSLxAIEKuqJFZVyYra6LS/P9V5VGazyOBclc5cHhEREZnQQ9ev5sHrLh63vRhBAxQ2REREysJnA0exggboMIpIQdavX8+JE+OX+xYRWYh8G6NStFcSKUFPPvmkTuolIouKH2NUdBhFREREPKWwIVKAp556ihdffNHvMkREFjSFDZECbNiwgU2bNvldhojIgqawISIiIp5S2BARERFPKWyIiIiIpxQ2RERExFM6z8ZYQcCTVTwHBwfn/TkXsnLZ38bGRoaGhspmf/OV2z6X2/5C+e2z9rcwed+dwc/eFxgZmZ8lakvB9u3bfwd4w+86REREFrEr16xZ82b+BnU2xnoXuBI4DAz7XIuIiMhiEgSWk/kuHUOdDREREfGUBoiKiIiIpxQ2RERExFMKGyIiIuIphQ0RERHxlGajeMjMvkdmdksIeNw594LPJXnKzKLAM8AyoBp4zDn3E1+LKgIziwAfkNnfZ3wux1NmdjWwCdid3bTLOfdt/yrynpl9FfgOcAp40Dn3vz6X5BkzuxP4Wt6my5xzcb/qKQYziwPPAfVAFfCIc+5lf6vyjplVAE8CFwFp4JvOub1ev67ChkfM7HeBi5xz682sEdgJlHTYAP4A2Oac+56ZtQCvAiUfNoAHgKTfRRTRL5xzt/pdRDFkP7sPAWuAOPAIULJhwzn3NPA0gJl9Bfhjfysqim8Azjn3N2a2Avg5cL6/JXnqFqDWOXeFmX0BeAK4yesXVdjwzuvAO9nrnUDMzILOuZI9f4dz7r/ybp4DtPtVS7GY2fnAhZTwF1CZuxbY4pw7CZw
E/tzneorpQeCrfhdRBMeBi7PX67O3S9l5ZL+bnHP7zaylGN9NChseyf7herM37wReKuWgkc/MfgU0U4S0vAD8C/At4A6/CymiC81sM9BApuX8qt8FeSgBRLP7Ww887Jz7P39L8p6ZrQU+cc596nctXnPObTSzb5jZR2T+xjf6XZPHdgH3mdm/Al8EzgWWAke8fFENEPWYmd1CJmx8y+9aisU5dwVwM/AjMwv4XY9XzOzrwFvOuQN+11JE+8gcSriFTMB62szC/pbkqQDQCPwRmXb7D0v5PZ3nz8iMvyp5ZvYnwEHn3BeBa4B/87kkTznnfkqms/E6cC+wh8z73FPqbHjIzK4Hvgvc4Jzr8rser5nZGuCoc+4T59yvzSwENAFHfS7NKzcC55rZTWQ6OYNm1u6c2+JzXZ5xznUAo4fL9pvZp8DZQKkGriPAr5xzp8js70lK+z096mqgpAf+5vlt4GUA59x7ZraiDA55PzB63cz2U4T3s8KGR8ysFvgn4FrnXLkMHrwKaAHuNbNlZAbUlezxT+fc7aPXzexhoLWUgwbkZmYsd879s5mdRWbmUYfPZXnpFeAZM/tHMi32kn5PA2QHSfY45+Z/+euF6SPgcuB/sgPbe0o5aJjZauAe59yfmtkNwA7n3GmvX1dhwzu3kzkO9t9mNrrt6865g/6V5LknybTV3wAiwF8W400sRbUZeD57eDAM3F3KX0rOuQ4z+zGwNbvp22Xwnl5O6Xdu8v0H8AMz+wWZ78Rv+lyP13YBFWb2DjBAkQYBayE2ERER8ZQGiIqIiIinFDZERETEUwobIiIi4imFDREREfGUwoaIiIh4SmFDRBY8M7vazN70uw4RmRuFDREREfGUwoaILCpmdrGZ7TKzZr9rEZGZUdgQkUUjGzCeA25zzrX7XY+IzIxOVy4ii8US4CXg75xze/0uRkRmTp0NEVksEsAW4D4z0/8ukUVEH1gRWSx2OefuJ7PK7Hf9LkZEZk5hQ0QWm7uBr5nZFX4XIiIzo1VfRURExFPqbIiIiIinFDZERETEUwobIiIi4imFDREREfGUwoaIiIh4SmFDREREPKWwISIiIp5S2BARERFP/T9Urh/oKpVQ3gAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "elbow_visualizer = KElbowVisualizer(KMeans(random_state=42), k=(2,10), timings=False)\n", + "elbow_visualizer.fit(X)\n", + "elbow_visualizer.size = (600, 400)\n", + "elbow_visualizer.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "model = KMeans(4, random_state=42)\n", + "silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')\n", + "\n", + "silhouette_visualizer.fit(X)\n", + "elbow_visualizer.size = (600, 400)\n", + "silhouette_visualizer.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/rebeccabilbro/barchart_colors.ipynb b/examples/rebeccabilbro/barchart_colors.ipynb new file mode 100644 index 000000000..c09e85a60 --- /dev/null +++ b/examples/rebeccabilbro/barchart_colors.ipynb @@ -0,0 +1,383 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Coloring bars\n", + "\n", + "Demo of using color params for barchart-based visualizers" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/sklearn/externals/joblib/__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. 
If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n", + " warnings.warn(msg, category=DeprecationWarning)\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.datasets import make_classification\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import StratifiedKFold\n", + "from sklearn.ensemble import GradientBoostingClassifier\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.preprocessing import LabelEncoder, OrdinalEncoder\n", + "\n", + "from yellowbrick.features import Rank1D\n", + "from yellowbrick.datasets import load_game\n", + "from yellowbrick.datasets import load_hobbies\n", + "from yellowbrick.datasets import load_concrete\n", + "from yellowbrick.datasets import load_occupancy\n", + "from yellowbrick.text import FreqDistVisualizer\n", + "from yellowbrick.model_selection import CVScores\n", + "from yellowbrick.features import FeatureImportances\n", + "from yellowbrick.contrib.missing import MissingValuesBar\n", + "from yellowbrick.target import ClassBalance, FeatureCorrelation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Rank1D\n", + "\n", + "has a `color` param now" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYUAAAEFCAYAAAAMk/uQAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAZ4klEQVR4nO3deZgdZZn38W8ISQdkE3HAoING4UY2cVohLCFBWcRxBMlEYuA1iSOEkYiIGTKCI4qoCBHnBRkWASPKABOi4wYEVFZRlkYNEbhRBAYR2QQJDtlI5o+qLo5Nd6fXU+nk+7muvvqc2s5dT3fXr556zqketmrVKiRJAliv7gIkSWsOQ0GSVDEUJEkVQ0GSVDEUJEkVQ0GSVFm/7gK0ZoqIscAXgVdRnDw8AszKzF9HxATgq5m50wC91nuB/TLz2H5s4zPAMcCj5aRhwCbAd4BPZGaf3nsdETdQ7OuVHaZfCFyemT/qa809fP0DgK8BTwD7ZOYLDfP2Bv6d4u94CTAzM+/sZBsPAUuBFxom/yEz393Hmt4AzMnMiX1ZX2s2Q0EvExEtwA+AAzLzrnLaEcDV5QFhQGXm94DvDcCmrsjMme1PIuKVwEJgQfk1YDLzwwO5vW5MBr6Wmad2Mu9bwIcy8ycR8T7gG8COXWzn8M4Co4+2AWKAtqU1jKGgzmwIbAZs1DDtUuA5YHj5fKOIuBzYHhgFHJmZN0fEdsA55bqjgV8Ch2XmkohYQXFmuy/wCuDEzPx2REwD/jEz31Oemf+p3O65FGf65wKvpzj7/0ZmntHD/diy3JdnACLiQ8AMYCSwOXBaZp5bvv77gJXAtsAy4IOZuah9QxGxPvCfwHJgKvAj4KvAncCPgauA3cvtnpSZV0TEhsB5wFjgWeAegMyc1lhkRIwAzgTeCbwI3AZ8HDgaOAR4ISI2zcx/6bB/w4FXlo83pugt9EpEbF3ux98CIyh6P18o551Yvv4oip/XLIrwvhDYOiIWULTnoszcqFzn9e3Py3b9p3LdP2fmvhHxT8BHKHqfT1P0bu4rez1nlvu0CvhiZs7v7f6o/xxT0Mtk5jPACcA1EfG7iPgmMB34UWYuKxd7LfCVzNwVOB/4TDn9SIoD9x7Am4A3AH9fzhsO/CkzW4H3AxdHxKs7KeGZzNwhM8+mCKPrM3NnYC/giIiY3EXph0XELyPi/oh4GjgbmJGZt0fERmVt787MtwKHAac3rDse+Gh5SeynQOMBeCQwj+ISzhGZuaLD644BFmTmbsDshu3+G8WJ1/bAfsBbu6j7UxQB+pbyaz3gjDL8vkfRzh0DAeBDwCUR8XvgP4CZnSzT7tKybdq/di2nfxO4uPyZ7AbsFxHvj4htyprHZ+YuwEnAKZn5IvBh4IHMPLCb12u3IzChDITxFIE6rvwZnA58u1zus8CZZR0fAt7Rg21rEBgK6lRmnklxpn0s8BjFwe4XEbFpucgDmXlb+fiXwN+Uj2cDT0bECRRn+KP56x7HV8vtLwTuBvbp5OVvBoiIV1AEwTnlOn8G5gIHdVH2FWVI7QTMpzhDvbpc93ngPcDfR8TnKA5yjXW1Zebvy8d3UZzxt/sysD/wuS7GJpZT9BQ6rvtu4KLMXJmZz1Fc3unMQcB5mbk8M1dShFlX+whARGxJMdYwPjNfCxwBXFm2WWcOz8xdG75+WS47HvhcRPwS+DlFj2HXzHyY4gB+eEScRtFr2aiLbXdnYbnvUJwcvAm4tXy904HNI2Jz4L+AcyLiUqAVOLEPr6UBYCjoZSJir4j4l8xcnJk/yMwTKM74VlIcHKE4ELZbRXFpB+Ay4CjgYeArFAfJYQ3LNp5lr0dxuaSj5xvmD+swbz2KyxxdKnszMykuqZxe7tNrKcJrG+AWirPzRo2DsI37A8XZ9LkUB+HOLCsP5h3XXdFhO53tK7z873C1+wiMAx5uHyfIzP+m+Jm8eTXrNRpe1rdne1hQXOr6QkT8HXA
rxWD9tcCXePnPAl7eViM7zH++4fFw4JsNr/V3wNsoeobnAzsD1wEHAgsbTkDURIaCOvMk8KnyOm+711Cced+9mnUPpLjMcAXFAWN3XhqHAPggQHnQ2R64sasNZeZiirPXY8p1Ni3Xv251O1AGwz8DM8rXelu5X6dm5gKKXgMRMbzrrVRup7gU9KaIOLIHy7f7ITA9ItYrxxemULRJRwuAoyNiRESsR7G/q9vHhcBO5RgOEbE7xfjJ/T0trjyD/zlwfLmNzSgunR1M0YO7s+wx3kgxttDeVit4KbSeBUZGxA7l8/d185LXAh+IiNeUz4+mGI8hIm4F3pqZcylOKjbjpfESNZGhoJfJzPspDgJfKMcU7qHo3h+Vmbma1U8EvhMRd1IMst5Iccmg3V4RcRdwMcUA9DOr2d7hwDsj4m6Kg/N8iktIPdmPWyjGJL5KcZD9PZAR8QuKyyRPdqitu20tAaYBZ0TEG3uyDsVbepdQBOmPKMYk/reT5U4F/kjRk7mX4oD7sdXUcz/FQXV+RCwE/j9waMOlmp6aAowt2/c24LLMvJSix7dF+bNvozjj3zwiNgZ+DbwYEbdTvPngBIp3pt1B56HXXvMCih7HdWXNU8qaV5XbOKX82VwPfDYzH+rlvmgADPPW2WqWiFgFvDozn6q7lmYoB8Sfy8yryh7AfODazDy35tKkLtlTkAbPIuCkclB1EfAHirdzSmssewqSpIo9BUlSZch+ormtra0FeDvFe+i7equfJOmvDad4N+Edra2tSzvOHLKhQBEIN9ddhCQNUeMoPrPzV4ZyKDwGsN122zFyZMfPy6w7Fi1axE47DcjNSocs28A2ANsAetYGy5Yt4/7774fyGNrRUA6FFwFGjhxJS0tL3bXUal3ff7ANwDYA2wB61QadXnZ3oFmSVDEUJEkVQ0GSVDEUJEkVQ0GSVDEUJEkVQ0GSVBnKn1MAYNa8+Sxevnz1C67NFi5a/TJrO9vANoC1pg3mTp9a22vbU5AkVQwFSVLFUJAkVQwFSVLFUJAkVQwFSVLFUJAkVdbIUIiIURHx4brrkKR1zRoZCsBWgKEgSU222k80R8QGwNeBbYCRwHHANGBbilD5VGbeEBF3AzcBuwD3AY8D+wBLgXcDGwIXAa8qN31sZt4dEb8BfgpEuc5E4CRgh4j4dGaeMjC7KklanZ70FI4GHsrMPYDJwN7AU5m5D3AwcE653MbAf2bmOIp/CH1rucxIYEfgRODHmbkvcBRwbrneGODfyu2/Gng78HngHgNBkpqrJ/c+CuBqgMz8TUS8DhgXEbu3byMitigf31V+fxa4p3z8DDAK2Bl4R0QcVk7fvPz+VGY+Uj5+pFxWklSDnvQU7qU4eycixlD0Fi7LzAnAQcA84E/lsqu62c59wFfK9d4PfKubdVb2sDZJ0gDqyYH3fGBMRNwIXEIRBNuXz28FHs7MlT3YzueB90fEDcA1QHe3M3wCGBkRX+rBdiVJA2S1l48ycwkwpcPk2zpZ7vUNj8c2PD6kYbHGx+3zt2p4PLlh1q6rq02SNLC8RCNJqhgKkqSKoSBJqhgKkqSKoSBJqvTkw2trtDmTJtLS0lJ3GbVpa2ujtbW17jJqZRvYBmAbDBR7CpKkiqEgSaoYCpKkiqEgSaoM+YHmWfPms3j58rrLqNfC7m4jtY6wDWwD+Ks2mDt9ao2FDF32FCRJFUNBklQxFCRJFUNBklQxFCRJFUNBklRpWihExISIuLxZrydJ6j17CpKkyqB9eC0itgO+DqygCJ8LGubNBA4FXgE8BbwPGA5cAowGHgH2yczRg1WfJOnlBrOnsD9wO7AfcDKwKUBErAe8CtgvM3enCKa3A0cBD2bmXsBngC0HsTZJUicGMxQuAp4FrgFmUvQYyMyVwDLgsoi4CHgtMAJ4M3Brucx9wJODWJskqRODGQoHAzdn5juBecBsgIjYBTgkMw8DPlrWMAxYBOxRLvNGYItBrE2S1InBvCHencA3IuJ
TFOMFZwO7Ab8F/hIRPy2Xe4xiHOEiYG5E3AQ8DCwZxNokSZ0YtFDIzAeAvbuY/Y6OEyJiT+CizLw2IrYF9hys2iRJnVuTbp39O4pxhpMpxhiOqbkeSVrnrDGhkJl/BPatuw5JWpf54TVJUsVQkCRVDAVJUmWNGVPoqzmTJtLS0lJ3GbVpa2ujtbW17jJqZRvYBmAbDBR7CpKkiqEgSaoYCpKkiqEgSaoM+YHmWfPms3j58rrLqNfCRXVXUD/bYK1qg7nTp9ZdwjrLnoIkqWIoSJIqhoIkqWIoSJIqhoIkqWIoSJIqa0woRMTciHhX3XVI0rpsjQkFSVL9Bv3DaxGxCXAhsBkwGjgHGAZMBVYCd2TmseXiMyLiBGBT4J8z8/bBrk+S9JJm9BTeBFyemQcABwDHA9OBmZm5B3BvRLSHU1tmvgM4G5jWhNokSQ2acZuLx4HjIuJQ4DlgBEUozIqINwA/o+g5ALSV3/8IbNiE2iRJDZrRU/gE8LPMPAKYRxEARwJHZ+Z44K3AnuWyq5pQjySpC83oKXwfODsiJgPPAiuAe4CbI2Ix8ChwG0XvQZJUo0EPhcy8Htipk1nnd3g+rWGda4BrBrEsSVInfEuqJKliKEiSKoaCJKliKEiSKoaCJKky5P9H85xJE2lpaam7jNq0tbXR2tpadxm1sg1sAw0cewqSpIqhIEmqGAqSpIqhIEmqDPmB5lnz5rN4+fK6y6jXwkV1V1A/28A2gHWiDeZOnzqo27enIEmqGAqSpIqhIEmqGAqSpIqhIEmqGAqSpIqhIEmqNDUUImLziJjSzNeUJPVcs3sKuwDvbfJrSpJ6qF+faI6INuAg4BngaWBCZt4VEXcBC4C3Aa8CfpWZ04GTgLdExFHA1cAFwAbAC8BRwHDg++W2rsrM0/tTnySpd/rbU/gucCCwN/AgsF9E7AA8BDyTmftTBMPYiNga+Dzwk8y8AJgDnJWZE8rHp5Xb3Ao4wECQpObr772Pvk1x9v8/5fdjKYLmMmD3iLgMeB7YCBjRYd2dgRMjYjYwDGi/gdGDmbmsn3VJkvqgXz2FzFwEjAF2A66iOPgfDCwDXpeZHwBOpLhENAxY2fCa9wGzy57CDGBeOX1lf2qSJPXdQAw03wA8mZkrgRuBJ4DbgDERcRNwJfA7YDTwALBzRBwHzAJOjogbgUuAhQNQiySpH/p96+zMnN3w+JMNs97exSpvbnh8YCfzx/a3JklS3/jhNUlSxVCQJFUMBUlSxVCQJFUMBUlSpd/vPqrbnEkTaWlpqbuM2rS1tdHa2lp3GbWyDWwDsA0Gij0FSVLFUJAkVQwFSVLFUJAkVYb8QPOsefNZvHz56hdcmy1cVHcF9bMNbAMY8DaYO33qgG5vKLCnIEmqGAqSpIqhIEmqGAqSpIqhIEmqGAqSpMqgh0JE3BAR2/dynZ0jYp/BqkmS1Lk1tacwEdih7iIkaV3T5w+vRcR2wNeBFRThcgHwQWAlsBVwQWae07D8a4FzgVHAa4BPZeZ/R8TngX3LWuYD3wKmAcsi4q7MvL2vNUqSeqc/PYX9gduB/YCTgU2BrYH3AmOBj0fE3zQsvz3w5czcHzgKOKacfjgwBRgHPJuZjwJzgTMNBElqrv6EwkXAs8A1wEyKHsOtmbk0M18AFgFvbFj+MWBGRHwTOBoYUU4/HDgNWABs1o96JEn91J9QOBi4OTPfCcwDZgO7RsTwiNgQ2BH4TcPynwMuycz/B1wPDIuIFmAS8AGKS0jTImIbiktQa+p4hySttfpz4L0TOCUifkJx5n82xdn/1cDNwKmZ+VTD8vOAORFxE8Wlpy0ycynwJ+DnFEFxLfA/QBswMyL27Ud9kqRe6vNAc2Y+AOzd/jwiJgC7ZebkDstNKB/eB1zWyXZOAU7pMPmH5ZckqYm8RCNJqgzY/1PIzBuAGwZqe5Kk5rOnIEmqGAqSpIqhIEmqDPn/0Txn0kRaWlrqLqM2bW1ttLa21l1GrWwD2wBsg4F
iT0GSVDEUJEkVQ0GSVDEUJEmVIT/QPGvefBYvX153GfVauKjuCupnG9gGsM60wdzpUwdt2/YUJEkVQ0GSVDEUJEkVQ0GSVDEUJEkVQ0GSVDEUJEkVQ0GSVOnxh9ciYgPg68A2wEjgOGAGMAYYDpyZmVdExHjgZIrA2QiYAiwDvg88DVwFPA9MBVYCd2TmsRHxOuACYAPgBeCozHxkIHZSktQzvekpHA08lJl7AJOB8cCTmbknsB9wakRsAewIHJGZE4BvA5PK9bcCDsjM04HpwMxyW/dGxPrAHOCscr05wGn93TlJUu/0JhQC+BlAZv4GeA1wU/l8MXAP8EbgUeCsiJgL7AuMKNd/MDOXlY+nA8dExI0UPY9hwM7AiRFxA/BpYMs+75UkqU96Ewr3Am8HiIgxwAeAceXzjSkO6g8CXwOmZ+Y04A8UB3woLhW1OxI4OjPHA28F9gTuA2aXPYUZwLw+7ZEkqc96c0O884GLy7P74cC7KM72b6EYB/hsZj4REd8Cbo6IvwCPA6M72dbd5TKLKXoWtwGzgHMjYlS5vY/1dackSX3T41DIzCUUg8aNbu9kueO72MTYhmUuBC7sMP93wIE9rUeSNPB8S6okqWIoSJIqhoIkqWIoSJIqhoIkqTLk/0fznEkTaWlpqbuM2rS1tdHa2lp3GbWyDWwDsA0Gij0FSVLFUJAkVQwFSVLFUJAkVYb8QPOsefNZvHx53WXUa+Giuiuon21gG8A60QZzp08d1O3bU5AkVQwFSVLFUJAkVQwFSVLFUJAkVQwFSVLFUJAkVQb9cwoRsT5wHRDAJzPzG4P9mpKkvmnGh9dGA5tk5ugmvJYkqR+aEQrnAdtGxPnAL4D7gNnAMmAMcHlmfj4iXgdcAGwAvAAclZmPNKE+SVKpGWMKHwHuAR5rmLYNMBEYC5xQTpsDnJWZE8rHpzWhNklSg7rufXR3Zq4AVkTEC+W0nYETI2I2MAxYx29oJEnNV9e7j1Z1Mu0+YHbZU5gBzGtqRZKkNeouqbOAcyNiFMW4wsdqrkeS1jmDHgqZ+RDF2EGjGxrmb1V+/x1w4GDXI0nqmh9ekyRVDAVJUsVQkCRVDAVJUsVQkCRV1qS3pPbJnEkTaWlpqbuM2rS1tdHa2lp3GbWyDWwDsA0Gij0FSVLFUJAkVQwFSVLFUJAkVYb8QPOsefNZvHwdv6HqwkV1V1C/dbAN5k6fWncJWgvZU5AkVQwFSVLFUJAkVQwFSVLFUJAkVQwFSVLFUJAkVQwFSVJl0D+8FhGbABcCmwGjgXOAtvL7YuAJYElmTouIjwJTgFXA5Zl51mDXJ0l6STN6Cm+iOMAfABwAHA+cB0zLzHcADwBExA7AYcDewDjgkIiIJtQnSSo14zYXjwPHRcShwHPACGB0Zv66nH8zMBnYCdgG+HE5/ZXAtkA2oUZJEs3pKXwC+FlmHgHMA4YBj5Q9A4Cx5fcEfg3sm5kTgLnAwibUJ0kqNaOn8H3g7IiYDDwLrABmAhdHxPPAMuDRzPxVRPwYuCUiWoDbgUebUJ8kqTTooZCZ11NcGqpExDHAP2TmkxFxKkUwkJlnAGcMdk2SpM7Vdevsx4Fry57CnwHvASxJa4BaQiEzrwSurOO1JUld88NrkqSKoSBJqhgKkqTKkP8fzXMmTaSlpaXuMmrT1tZGa2tr3WXUyjaQBo49BUlSxVCQJFUMBUlSxVCQJFUMBUlSxVCQJFUMBUlSxVCQJFWG8ofXhgMsW7as7jpqt3Tp0rpLqJ1tYBuAbQCrb4OGY+bwzuYPW7Vq1QCX1BxtbW17U/wrT0lS741rbW29pePEodxTuAMYBzwGvFhzLZI0VAwHXkNxDH2ZIdtTkCQNPAeaJUkVQ0GSVDEUJEkVQ0GSVDEUJEmVIfGW1IhYD/gP4C3AUuDDmfnbhvlHAjOAFcCpmfmDWgodJD3Y/48Dk8unV2XmZ5tf5eBaXRs0LPND4LuZeV7zqxxcPfg9OAg4GRgGtAHHZOZ
a9fbCHrTBJ4ApwErgC5n5nVoKbYKI2B34UmZO6DD9H4BPUxwPL87Mr/Vmu0Olp3AIMCoz9wD+Ffhy+4yI2Ao4FtgLOBD4YkSsbf+fs7v9HwMcDuwJjAUOiIhdaqlycHXZBg1OBV7Z1Kqaq7vfg42BM4D3ZObuwEPAFnUUOci6a4PNgI8BewAHAP9eS4VNEBEnABcCozpMHwF8hWL/xwNHRcSWvdn2UAmFvYFrADLz58DbGubtBvw0M5dm5p+B3wJr20Gxu/1/BHhXZr5YnhWOAJY0v8RB110bEBH/SHF2eE3zS2ua7tpgT+Bu4MsRcTPweGY+2fwSB113bfAX4GHgFeXXyqZX1zwPAId2Mv3NwG8z85nMXAbcAuzTmw0PlVDYBPhzw/MXI2L9LuYtBjZtVmFN0uX+Z+byzHwqIoZFxBzgF5l5fy1VDq4u2yAidqK4ZPDpOgprou7+DrYA9gVmAwcBx0XEdk2urxm6awMoTpLuAe4CzmpmYc2UmfOB5Z3M6vfxcKiEwnPAxg3P18vMFV3M2xh4tlmFNUl3+09EjAIuLZf5SJNra5bu2uCDwNbAT4BpwPER8a7mltcU3bXB08AdmfnHzHweuAnYtdkFNkF3bXAQxe0b3gD8LXBIROzW5Prq1u/j4VAJhZ8C7waIiLEU3eR2twPjImJURGxK0X1a1PwSB1WX+x8Rw4DvAr/KzBmZubbeB6rLNsjMEzJz93LAbS5wZmaujZeRuvs7uAvYKSK2KM+cx1KcMa9tumuDZ4AXgKWZuYTiYLhZ0yus173AthGxeUSMpLh09LPebGBIvPsI+A6wf0TcSvHOiukRcTzFtbPvRcRZFHdMXQ84qfyFWJt0uf8UN7caD7SU7z4B+GRm9uoXYQjo9neg3tKaZnV/B58EFpTL/ldmrm0nR7D6NtgP+HlErKS4nn5djbU2TURMATbKzAvK9lhAcTy8ODMf7c22vCGeJKkyVC4fSZKawFCQJFUMBUlSxVCQJFUMBUlSxVCQJFUMBUlS5f8A6Sz9yqPH6RYAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "X, y = load_concrete()\n", + "\n", + "visualizer = Rank1D(algorithm='shapiro', color=[\"cadetblue\"])\n", + "\n", + "visualizer.fit(X, y)\n", + "visualizer.transform(X)\n", + "visualizer.poof()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## FreqDistVisualizer\n", + "\n", + "has a `color` param now" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "corpus = load_hobbies()\n", + "y = LabelEncoder().fit_transform(corpus.target)\n", + "\n", + "vectorizer = CountVectorizer(stop_words='english')\n", + "docs = vectorizer.fit_transform(corpus.data)\n", + "features = vectorizer.get_feature_names()\n", + "visualizer = FreqDistVisualizer(\n", + " features=features, orient='v', size=(600, 300), color=[\"crimson\"]\n", + ")\n", + "visualizer.fit(docs)\n", + "visualizer.poof()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CVScores\n", + "\n", + "has a `color` param now" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "X, y = load_occupancy()\n", + "\n", + "cv = StratifiedKFold(n_splits=12, random_state=42)\n", + "visualizer = CVScores(\n", + " GaussianNB(), cv=cv, scoring='f1_weighted', color=\"goldenrod\"\n", + ")\n", + "\n", + "visualizer.fit(X, y)\n", + "visualizer.poof()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## FeatureImportances\n", + "\n", + "has a `colors` and a `colormap` param now; \n", + "`colors` is for `stack==False` and `colormap` is for `stack==True`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "X, y = load_occupancy()\n", + "\n", + "model = RandomForestClassifier(n_estimators=10)\n", + "colors = [\"lightpink\", \"pink\", \"hotpink\", \"crimson\", \"orchid\"]\n", + "viz = FeatureImportances(model, colors=colors)\n", + "viz.fit(X, y)\n", + "viz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAagAAAEYCAYAAAAJeGK1AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3debxc8/3H8VcSgtpqqTVqz0dTS0gQQkTLzxqq6KJ2sRRNqq1dUNWipUUXSsWeIkhLF7Gm9i3WUB8k4tciPzuxy/L74/MdmYx775yZO2fuyZ338/HII/fOcs5nZs6d7/l+v5/z+faYPXs2IiIiRdOzqwMQERFpixooEREpJDVQIiJSSGqgRESkkNRAiYhIIamBEhGRQpqv2Ts0s9nAJGBm2c0Pu/vwOre3IXCAux/SiPja2cds4Evu/npe+2hnv8OB3u7+h2butzPMbDHgn8AXgRPd/bo2HtMbuBO41t3PTLdtCJwNLAz0As5w9yvaeO4lwNbAaxV3be/uL9cZ883AHs3+fGtlZgOBY9x9twZtbyqwm7s/3IBtrUB8npt28JhVgTPdfdeMjx9KHEtedvOiwNPAvu7+RmfjbjQzOwV43t0v6+pYuoOmN1DJlg38Mvgq0KdB2yqazYjGfF7SH1jW3dfo4DFnA6uXfjGzHsB1wP7ufquZ9QEeMbMH3P25Np7/m1LD1iBbN3BbuUkNSUMap0ZLJwftNjbJyoDV8HiAye7ev/SLmfUijpWfAMfWF21+3P3Ero6hO+mqBqpNZvYV4BxgKeIs+lx3H21mPYHfAIOIM6gewHDgf4FTgMXN7GLgUuB37r522t7Q0u9mdjKwCbA88IS772lmxwO7EkOdU4FDOzoLN7NVgNvTv02A+Yk/lIOBtYCHge8CXwb+BUwA1kvxHu7ud5nZ/MCvga8TvcgHgCPcfXo6o30AWBc4DtgJ2NrMPgSuBf4ILAssB7wIfMvdX03PuyRt88vA1e5+VIp5f+DHaV+vA/u4+3/MbBhwAtAb+AD4ibvfZ2ZrARcBC6a4/9RWD87MvgGclD6nd4EfAe8Ao4EVzewxYBN3/7DieXsBiwN/L7t5AeCn7n4rgLv/18xeJ0482mqg2pR6ZmcAW6S4HgVGuPu7ZrZjek97A8sAl7r7qHTcANxhZtsDd1HWqyj1MtJ7dxfwb2CVtI9V0/4WBmYBJ7v738xsOeAyYOm07b+7+6g24p2rZ176HfgIuBhYM213InGMDWHO8XwJ8b6vA6wEPAN8x93fS6/jDOIzfwzYCtjM3afW8F6OIo7lGcCzxPE7zczWID7jJYFXiGPkCuJYn+Tui7R1DBHH7p+IY2N8ej2lx88H/BLYMe3vXuDQdkJbLL1H96Q4Fye+M9Yh/h5vA4509xntvQ/AUOAA4nN7x923NLMD0j57Am+k1/uMmW1G/L32AmYDp7n7dR3cfkl6XWea2ebAr4AvAJ8AJ7j7TWa2L7
AL8dmume7b293ntZPR3HXVHNQdZvZY2b9l0kF6LTGEMYD4AviJmQ0CNgZWIL7w+hEN0THu/h/gROAud98vw35XBjZIjdPexEG9UTpD+wfxB1TNqsAN7v5V4o/hHOIP+avA5kQjCtFQjE/bPga4OjVOJ6TXsl7615M4iEsmuftX3H0ccAPRW/g98B3gPnffBFiNaFT2KnveIu6+OXFW+gMzW9XM1iP+QLd193XT9o43szWBXxDDYusDBwHXm9nCwJHAjekz2B4Ykk4QPpO+gM4Hdk3bPRH4K/GFNZx01ttG47QOMDLt7zPu/pG7X1T2uIOARYD72/kMjqg4fkrDw8cQX3AD3H094GXg9NRD+zHROA8kPqNjzWzpsuNmy3Q8daQP8DN378ucRmQvd9+AOJk4z8y+DBwITEm3bw6smb5Is9oFWDQdOxum21Zr43EDgG2BrxDH1O5mthRwObBnev4dwIo17Bsz2w/YDtgwfb6TiBMg0rb/nE4CRxAnapU+dwwRX+KlY2Obiscfml7LesDaxEnot9N9q6fPeJKZvUr8zd1A9MIhTlwnpn2tT5wU/CjD+/BVYGhqnLYA9gE2T38PvwSuT4/7KfDrtP39ga9Vub30Hi5FfJ+NTO/hPsAVaZgT4vvtB+l9vCe9Z1KhMEN8ZtaPGPYZbWalmxcC1nf388zsBOBgM1udOAOaXsd+73f3GennHYGNgIfT/noRZzrVfArcmH6eDNzr7u+m1/AycWb5MvCWu48BcPd/mtlMome0HXC8u3+anvNb4C9l27+rrZ26+zlmtrmZ/Yg461qb6G2V/DU97qX0h7wk8UcwvvTF6+5np30eSvQkbyt7r2cBawDjgMvMbCPgVqIHMqsinK8Bt7n7lLTd29M+BxBfRJ+TvqAvJ+Z63i/bb+XjjiEasW0rG7gy7Q3x7UjMfW2dtt8beNXdZ6ce445mtgfxhd6DOIOuZah5BnBf+rnUG/9L2WuZTXzGNwH/SI3VrcTJ1Ds17Odu4BdmNgG4BTjb3Z9PQ5/lbnL3jwHM7EniMx8CPO3ujwO4+6Vmdm4N+4Y4Ri929/fT7+cQJzbLEn8zQ9K2/21mt7Xx/DaPofY+c6Jnc3nZ5/3t9JqGUjbElxrOXxAniJ+mx+4IbJR6QBDfGVD9fXii9HcL7EAc+/eWxbikmS0JXAP8Ph0/txK9cDq4vWRjYi7qgbT/p8zsHuK7azbRqP43PfYR4JvtvTmtrEhZfL2At9OZd/90UA4CLjazHZgzJPRX4uy9RxvbmF1xe++K+9+r2N8ZZfsaCAzOEOcn7l7+JfxpO4+bUfF7T2KoofI970kMTbQV42fM7AxiOPM14ALgZuZ+reVf5qX3YQZlDYaZLZR6P72IBqbyvZ7k7n8jGsBriDPSJ9NJQWXMlSpfR6VtiMZjTBr+24noCZ2SYlvAzP5M9EY3KX2x1KgXccZaek0bAbulnuGjwAbEl8GRxOdW6zH0cdkJTi/g3228h+Pd/SGip30BMRz4oJm1N9/SAz4bngTA3V8gvjBPI4a0bjWztuae2vvMK19X5QlGNW0do/MRvcbPYk5mVjyWjMdQucrjdFkzW76N7V5M9J7+nEZcID6H3cs+g42Bw6n+PlR+F1xeto0NiO+Dt9z9j8RIyy3EMfyEmS3e3u1l26z2N9LWZycVitRAOfCRme0JYGYrEUMLA4hJ7Bvd/TzgIeAbxEEFcSCWPvTXgC+nIcMe6XHtGQ8Mt8g6g/jyv7yBr+dLZrZtei3DiC/EJ9N+DzGz+dPQ2WHEQd6W8te2DXEmfTnwKvGe9GrneSV3AFuV/bEfTAxf3A78T2qsSGP1TwALmtkY4NvufhUx9PIuMcdRrvT81dLzv5Ye8wDtcPdr3H2Vsi+B0vBlaVJ5LPFlvKnXMFdSYTxwuJn1Tu/thcSX/Jpp2ye4+41Ez3IB5rx/M5n7GBqYXtcgopfUlvuJobsh6bH9ifmyFczsdGCUu/+F6A0+BfRtYxuf7Y
uyM2gz+z4xfHizux+dXtfaGd+De4C+ZrZu2tauxIlBLVWhxwP7pYYdYijvztQLvAfYL217VWLec65td3AMlR/P5W4F9kgnKT2B84gTlbYcQwxnHl4W6xFm1sPMFiCOq8NrfB9uBr5b9ndyCDGUiJndS4ziXEIMTX8RWKK928u2eX883TZK2/kq0aub0M7rkjYUpoFy90+AnYlG4wnioBnl7vcQPaYt0u33EUNrq6aD+T5gLTMb5+5PE5OxDxMHyCsd7PJPwN+A+83sKWJoZt8GvqSPgL3M7HHgeOAb7j4TOBWYRkza/pv4gx3Zzjb+CYwws2OJBvRMM5tIjI/fTZxlt8vdnyR6CzelOLYFDnH3p4g/qqvS7T8DdkpDOj8Dvpduf4AYrvlXxXafJr54rjezScDpwLAah7E+Y2aDgWHp9dxjc+aWKucqqvkZkezyKJGKXJp7eoL4rJ8xs0eI3tvTzHn/rgfuNrO1gaOBkamXdyCRoPA57v4akWDzq/ReXU7MR71IzI/0T+/Nw8ALwJ/b2MwIYpjoEaKnUTpeLyMaz6fN7GGicT0nyxvg7m8SX+6Xpe1uQzQMH7TzlDvN7L2yf4cSCQ63Ej2/fxM9iu+lx+8NfCu95t+n11a57faOoaeAmWb2IHP3GP5IvM8TiZO4V4A2hyXd/S3iMzo5DTmOIIZqnyQ+5yeBX9byPrj7eGKu9pb0HbMH8M00UnIUcIqZPUqc8P00nUC1d3tpm68DuwO/TcOvY4D93P3Ztl6XtK2HlttoPItsv0nuvkhXxyKtJY0InEBkFH5gZhsQw+MrVAxN17v944HrPDLcFicahe3SSUth5P0+SHMUKs1cRDrHI6X+E+AhM/uUGFr+VgO/lJ8lMlJnEd8fpxetcYKmvA/SBOpBiYhIIRVmDkpERKRc7kN8EydOXIC42PAV2khJFRGRufQiskcfGjBgwMddHUxXasYc1Ia0c/GpiIi0a3MiW7dlNaOBegWgb9++9O5ded1sPiZNmsTaa2e9bKS5ihqb4qpdUWNTXLUpWlyffPIJzz77LHR8mUxLaEYDNROgd+/eLLDAAk3YXWjmvmpV1NgUV+2KGpviqk1B42r5KRElSYiISCGpgRIRkUJSAyUiIoWkBkpERApJDZSIiBSSGigRESkkNVAiIlJIHV4HZWbzA6OJVUEXINYyep5YKbQHsUDb8LJVRkVERBqi2oW6ewJvuPteZrYkscjeI8Bx7n6nmV1CLDQ3Lt8wpehWO/es/DZ+z4T8tt0Jq//w/q4OoeWNf/nxz93WczmtCdhdVGugxgLXpp97ECtS7uruM82sN7AcUNcqqiIiIh3psIFy9/cAzGxRoqE6ITVOKxNLQr8DfP4URkREpJOq1uIzs5WIIbw/uPsYAHd/EVjTzIYDvwb2yTVKKbwpI36cy3YnTpzIgAEDctl2Z00cXMzYivqeFTUuKa5qSRLLAjcDU4EDzGx/Yunkg9z9OWBdYLu8gxSpxaxpfZuyn2M2XK/qYyafPaiubefV4IvMS6r1oI4j5pm+CHj6fzpwSZqDWhV4MdcIRUSkJXV4HZS7j3T3JYCV3H0o8BtgCrAT8DqwJdGjEhERaahM60G5+wwzuxTYBdgduAj4EfBhjrGJiEgLy7xgobvvY2ZHAy8A04AJwBeAhczsUndXooQUQrOugzn9IU36i+QpSxbfXkAfdz8N+IBonI4ielJHAf8EFskzSJFG6iiJYo1rDs68nWoX6t4ya2zmbYnI52WpxXc9sL6Z3QmMB34ITCIatx5ALzQPJSIiDVa1B+Xu7wPfKr8tXRu1CtFgLQ2cm0dwIiLSuuqtZn4EMN7d+wLrAZea2YKNC0tERFpd5iSJCm8Bg83sPmBBYAliqE+k8DpKopgyIvt2ilpJQqS7qLeBehj4PjCZKCD7rzQUKFK3tpIX1l8RZk3rgmA6sM0KUUFi8tmDClFpXVUnpLuqd4hvCHAJ8CbwHnBaowISERGB+huopYGBRK
r5IcCVZtajYVGJiEjLq3eI7w3gGXf/BHAz+wj4EvBqwyITEZGWVvOS7+5+A3A38Hszex+4AViYaLRE6tZW8kIRl2i4ZVb8X8TYRLqTmpd8T5l7hwGLAccABwKHufvMfEMVaYxGLcex+z0Ht5skocQFkc6rZ8n3RYCTiXWgprn7+blFJyIiLaueJd9fAF4wMy1UKCIiuamaxZfKGt0BXF5a8l1ERCRvWZd8/x2wNzDazNYgroFaCZhqZhe4+6y8AxVplEYtxzFWlSREcpVlyfc+wBnALDObQFyYewIwFPgasDMwLr8QRerTqGSI9nSUJNEWJU6I1Kbqku/AvsD6wNNp2ff1idJGJwNnAVvlG6KIiLSiqnNQ7n4dc6/31MPdZ6efpwOL5xGYiIi0tnpKHZXPNy0KvN2gWERERD6TtdRRf6Bf+nmymb0GPAX0Ba7JIzCRzmpUMkR7lCQhkq+qDZSZHQXsT1yoC5HV14cofXQToJlf6XKrnXtW0/e5+g/v7/D+8S8/3ubteTecIt1FliG+ycAwoscE0Ti9SMxLzQa+kE9oIiLSyupJkngQONLdhwBTgJNyik1ERFpYPcttjHP3UmLEOOC3DYxHpC5dcY2RlnwXyVeWOaj5gV8D/czsQWBxM3sR6A2sCCxjZle5+3fyDVWkdnnOTbU1B1Wad9I8k0jnZZmD2pNIJX8a2JZYZmPhdN8TxHzUEblEJyIiLStLAzUWGOnug4hMvg/dfXCqKvEScI67v5JjjCIi0oKqDvG1teRG+n0Z4Ouo9yQiIjnIlCRhZsOAMcBIYCczOwhYAfgScCWg+ScppDyTJ5QkIZKvLEkSpwBHAVPcfTQwOt1+I3EdlHpQUrOakhdqqBjeTNUu1K1Fexf1tkUJGNIqsvSgNiCKwvZJy21ALPe+KXCS5p9ERCQPWS7U3RHYkLTcRkqOWBSYBpyXb3giItKq6qlmDrAbMMbdZzYyGBERkZJ6KklALFI4y8zuAz4Ghrv7840LS7q7rMkLEycWNxFBSRIi+aq3gRoI3O3um5jZIGJl3Z0bF5bMyxq51Pr6K8KsaQ3bHADbrLBeQ7Yz+exBhU3gmKKGU7qBTA2Uu08FBpXddC1RNBZ3v9/MBjY+NBERaWX1zkEtBrxT9vtMM6u3NyYiIvI59TZQ7xKZfJ9tx91nNCAeERERIHs189HAKsQquqcCLwO/N7MfAK8AT+YYo8xjGnkhaR5JErfMasx2iprAMXHixK4OQaQhsgzL7Qm84e57mdmSwGPAq8DdwLLEBbvn5BeitLLd75lQ2ESERlWS0BIdIm3L0kCNJZIiIKqZzwD6uPtAADPbDmXwiYhIg2WpJPGeu0+vqGY+xcy2SA8Zxpz1oURERBoiU5KEma0E3AFc7u5jgP2AY83sNmK47/X8QhQRkVaUJUliWeBO4B13H21m/YHriUSJ94E+6XeRhhs7eGghExFAlSRE8pZlDuo6ohFaKlUz7w+cCOwPLA980d2H5xahSJlGVqkoqbeyRGcqSeS5TpVId5FliO83QD9SNXPgK+5+rrv3B34N3JZjfCIi0qKyJElcB3xa9vsrAGa2KXA40YCJiIg0VF2VJMzs28D5wA7u/lpjQxIREakyB1VWRcKAfma2E7A2cAzwNHCCmQ139wZdmy/SsTwuZq23skRRK0mIdBfVkiT2BN4AvkVcsPs7YDngP8BHwA7AlcB3c4xRWlyjEiMatcxGSSOW21CyhEj7qg3xjQVGpeU2tiWqSJwKnARsCTwAXJpngCIi0po67EG5+3sAFVUkZgO/Tz+/A0zIN0QREWlFWS7UXQkYB/zB3ceY2avA5u7+lJkdRqyme1jOcUoLa9S8U6OqmJdoDkokX9WSJJYFbgYOd/fS9U5vEutBQdTg2w01UNIkq517VlP2k6VS+fiXH8+8HL0qlYvUrloP6jhgCWCUmY1Ktx0IXGVmywNfAp7PMT4REWlR1eagRgIj27hrsJntCjwBXJ5HYCIi0t
rqXfL9cxUmREREGilLkkQv4ELiYt3ZwCFEBt9ywILA+mZ2lbt/J89ARaCJ1w2NqP4QJUmI5CtLNfNhAO4+2MyGAj93950BzGxd4D7giNwiFKF5yRGVOkqWqCVJokTJEiLZZSkW+xfgoPTrysDbZXcfAUwrFZAVERFplCw9KNx9hpldCuxCpJVjZssAGwGNX6BHRERaXuYkCXffh2iMLjSz0vVPY9x9Zl7BiYhI68qSJLE3MWX8MdGg9QLWISpIPGVmWwPnufvVeQYqra3Liqp2kCyhJAmRfGUZ4vsYWIqoYP5F4CViyY13gK3c/e0Onpubjipcr78iHU5er3HNwTlEVINOVsDOjeKqSZZqE9Kabpk1tqtD6BayJElcDazp7kOAXwLPAAOI9aBuMLOLUjFZERGRhsk0B1WWJPFbYv2nB4EjU6M1hVh+Q0REpGHqSpIAbnb3iemuccD6OcQmIiItLEuSxF7AJkA/YKd08yQzmwp8ANwKTGzzyTnq6ILHapPXUzJUCchLUSfWFVftJg4uZmxFfc8Ul9QqS5LEl4E9iN7WeKJ6+c1EY7UwMJRUbUKksyqTX6olvLSn0cu7t0VLvovkK8sQ3zPAhsDT7r4J0WB9CnwIPAxs7+7vdvB8ERGRmmXJ4qusWr4K8Ja7bwX8L3B0PqGJiEgrq2e5jTeAG9LPNwIDGxeOiIhIyFSLr8LdwPbEQoXbA5uZ2Vru/kxDI5OWVJn8Uu8E9i2zGhVR+zS5LvOSrXvu3gtYvcGbnXzLrLG5lburp4H6MfAnMzsUWBV4tbEhiWTTUTWRcnklTFQmSSjhQQpudcAbvE0DcltDJms186nAoPTzi8DWZnYOcDJwbF7BiYjIvMvMegJ/ANYjyuYNd/fnsz6/riXfzWxf4DV3H1/P80VEpCV8A1gwZYAfQxQZz6yuBgrYn+hFTQD6A5eZ2XJ1bktERLqnzYCbANz9fmpMqstSSaIXUd7IgNnAIcRaUBcCSwA9gOPdvY7LKUXql3X59LwSJpQkIVLVYsTKFyUzzWw+d5+R5clZ5qCGAbj7YDMbCvwceAu40t2vMbPHgNVqi1lkjs4snVKuGdUjyjWikgQouUK6tXeB8tUuemZtnCDbhbp/AQ5Kv64MvA0MBvqY2a1ENYkrMocrIiKt4h7iciTMbBDwZC1PzprFV1puYxdieG8PUjUJMzuRqCZxYi07FhGRpppMTNU0epsdGUfkK9xLTAftV8vGM18H5e77mNnRwANEL6q8msTPa9mpSLnOVKYv14yLc8tpDkrmJemC2tyuWWqLu88i8hbqknW5DQP2IVIGexIt4b1mdjtx4ddT9QYgrSnrRbalOahmzy9loTkokXxl6UHdQDRASwJ/Al4hctmHE0N+LwNfzytAERFpTVmugzoFOIBY5v27wIrANe6+NXAwcJ+7v5VfiCIi0oo6bKDaqRjRw91np5+nA4vnFJuIiLSwakN8+wOzzWwrUsUIYJmy+xclEiZEapL1IttSIkKzEyCyUJKEzEtWO/esXKqZTxnx466pZu7uQ8oqSawP9ALuNrPvAd8nXuyTZtYzZWuIdNpq51aU66ojEWH1H97fmGA6MP7lxzNdRJy1MRbJ2TxXzTzLHNSw9P+jwDnAjPT/EsA/gReAnXOJTkRE5nlmtnGq3VqTzJUk3H0okV7+ElE2fW133x/4B7BVrTsWEZHuz8yOIjLAF6z1uZmqmZdVkvgtcCVKlBARkWwmA9+s54mZKkmY2cZEHb6+RCWJhczsN8R45isoUUIaqPzC1boTEUY0MKB2KElCpDp3v87MVqnnuVkqSVwFDAWmAh+km2cC3wJ+BmwH3FHPzqV1fC7xoRYNqNYAjU+cyJokkZWSKUTmlmWI7wbgEaAfMB44lait9ClwFNAbuDavAEVEpDVV7UG5+5hUifaqtGwvAGa2NjDN3c/PM0AREWmIrqhm3imZq5mLiMi8K11Q2yXjyO4+FRhU6/M6bKDMbH5gNNHq9j
Ozndy9tMzGOsDegHpQUlW9FbsbmojQ4MQJJUmI5KtaD2pP4A0iIWIs8DvgBjNbn6gsIdJwqiQhIlA9SWIsMCp1z7YFZpjZUsAviOoRr+YbnoiItKpqtfjeAzCzRYlMvVHARcCPgA9zj05ERFpWluugViLWlf8D8BywJnAeUbain5md7e4/zDVKERHplFnT+uZSzbzncs92TTVzM1sWuBk43N1vSzd/NSVPXE1cG7VpRfKESKeokoRILua5aubVelDHEVXLR5nZqHTbdsB3iPJGTwPbA48RF/RKC+pUlYgsOllJIq+EiUZXkuiIEi1kXlSWCb4KsABwai2dmWpzUCOBkW3sdCxwrbtPT0kTM2oJWkREWsKewBvuvpeZLUmNnZm6LtRtI3nihHq2IyIi3dpY5pTC60GNnZm6K0mUJ0+4+5h6tyMiIt1TZzszdTVQZrY88CTwX2B/M3vQ3SfVsy2Z99VbJSKLhiQi5JQwoSQJkeo605mptwd1AVHF/HXgi8AEM1vJ3XVtlMylIQkUnUiSyLOihJbbEOlYO5ngmdU7BzXMzOZLK+3uA3xNjZOISKF1RTXzNjPBs7YXdc9BlS0DvwuwW73bERGR/KULapvaTW8vEzyrTi234e77mNnRwANm1s/d3+/M9qT76ez8VKfneXK8YFdzUCL5ylLqqBdwIdE1nE2sprszMBx4kTlpg7NyilFa0FxzVxVzUM2oVJ5Fe3NQmksSaYwsS74PA3D3wUSK4M+JyubPpeevCjymOSgREWmkLEu+/8XM/pZ+XZkocXSIu78CYGaHASvmF6KIiLSiTHNQlQkRZY3TpsDhwJD8QhQRkVaUOUmiMiEC2BE4HtjB3V/LK0BpTaXkijYTEZpQqTwLJUmI5CtLksTexFfCx8ScUy/gh8DRRDXzE8xsuLsrSaJFFaGaeVckTjSjmrkSLqSVZUmS+BhYCpgJLAa8BJwEvAZ8BOwAXJlXgCIi0pqqNlDufjWwprsPAX4JPAOcSjRSWwIPAJfmGaSIiLSeLD2o8iSJ3xK9peeAc4F/A8sCE/IKUEREWlMtF+quQTRIlwALEZXM3yaG/s4CDsstSim0QlQz74LECSVJiOQrSxbfmcC67j7QzLYnFqDqBYxK10iNBdbPM0hpvtwTH2qRsZp5sxMlmrnke3uURCHdWZYhvhOAF8zsTmJY7wHgfeBIM/sXsDrwQn4hiohIK8qSJPG+u+9ONELLAKcDDhzn7lsA96M6fCIi0mCZkiQgLtQF+hLzUd8HjjWz24BXiYULRUREGiZLksRewCZAP2Anore0I3A9sB9xjdQ/c4xRukCeiQ+1qCkRocmJEkqSEMlXliSJLwN7EL2t8UQViVWIzD2A2939H7lEJ4XS6MSJoiybUa88kiSU9CAyR5YhvmeADYGn3X0T4G5iuY1Ngafc/fgc4xMRkRaVJUniOuBT+OyaqIuAHwHT8w1NRERaWeYkiWQAsCZwHnAV0M/Mzm54VCIi0vKyLrfRH+jn7g+mpInz0+0fEL0paQENT5zIkNRQ5ESEIscm0h1kyeI7Ctgf6JFuuoD4anmZmI/aA7girwCl6+VVVUJJEu1TsoRItiG+ycAw4Kn0ex93v9fdpwIHAgA81M4AABB/SURBVJvlFJuIiLSwmpIkkilmtkX6eRiwcB6BiYhIa6s1SQLi4lxVkRARkVxlTZIotwPwPeAWYDjwppld7O77NTQyKYzcqkooSUJEOlBPA/UccDtRxfwcXag776ia7JBxWYvOmNcTI8oVYbmNckqskO4m64q6U919UPr5RuAg4CVgQzO73cwG5RijiIi0oHrmoCCufzoT2AY4BLjSzOrpjYmIiLSp3kblWeB5d58NPGtmbwDLA/9pWGTScB3NJTVtPqXGiuNFnucpcmwi3UGmBsrMlgEmAlu7+zPEhbuHmNl5wA3AYsAruUVZIdOFo02YT6lbUWNTXDXpTvNp87rxLz+e+bGaq5t3VB3iM7P5gT8CH6
bfvwTsDKwG/AS4Gtjf3WfkGKeIiLSYLD2oM4nae8em3xcBTgK2A6a5+/ntPVFERKReHfagzGxf4DV3H1+6zd1fcPcH8g5MRERaW7Ue1P7AbDPbiqhofpmZ7eTuXXr1R7ULR4s8eV3U2BRX7SYOLmZsRX3PihqXFFeHDZS7D0lzUKOJBmpBYCMz+w+xzMY0M/sOcJ67X517tJJbZXEg92SE7pZUULQLdUvWX5EO41KSgMwrssxB7Qm8ATwGHA2MBU4B7gPGaQ5KRETykOVC3bHAKHcfCjwPzCBW1p0f2MPMLjKzRfMLUUREWlGW5Tbec/fpqRG6FjgBeBA40t2HAFOIrD4REZGGyXqh7jBgDDASeBq4EPjYzJ4FzgLOzS1CmUtelcWbMoFdYxUJKPbEelFjK2pcIrXKsuT7KcBRwBR3H21m44BFgcOIBmsEUWVCulBDkicamCTR3RIi2lLUJAn4c1cHINIQWXpQGwDTgT5mNgFYGbgM+A1gwCRgp7wCFBGR1pRlDmpHYEPg6ZQocRzwA2ApYDKwrbu/m2eQIiLSeupZbuMcYHN3X4voSeV4YY6IiLSqepbbeBP4yMzGEOnmi5nZ2e7+XGNDk1p0Nnmi4RPrdSREtKXIE/6Fje0lTQlL91BPAzUcuBlYCHgKOA34HbF4oeQk1woSJUqSqElRkyQ6qiShKhIyL8nUQLn7VKC05PvdZjYeuMXdxwGY2SV5BSgiIq2p3iXfHwN2NLMeZjYIWNHMejUwLhERaXH1NlCjgXeBu4BdgInuPrNhUYmISMvLWkliY+AMdx9qZv2IKwHnJ0oeXU9cGyU5yquCRImSJGpX1NiKGpdIrbJUkjgK2At4P930C6Ie30hge2BdYMe8AmwVTUmCqKaOJIlWSIZoz7yYJFFJSRNSZFmG+CYD3yz7fVd3v5FonB4BfuLuL+cRnIiItK4slSSuAz4t+32mma1MpJgvDTyeX3giItKq6kqScPcX3X1N4Hzg140NSUREpMocVNly7wb0M7OdgEPT83oDywJP5h1kK8g7CaKauifWG5QM0Z4iT/gXNbaixiVSq2pJEqXl3r9FrKz7O6JBWgV4BXgL+EuO8XVrhUiMKNeJShKtmCzRHZIkyilhQoqm2hBfabn3qcC2xHLvfYELgJnEUhvj8gxQRERaU4cNVDvLva8CvOXuWwH/Cxyde5QiItJyqiZJmNlKwB3A5e4+hhjyuyHdfSMwML/wRESkVVVLkliWqFw+FTjAzPYHniCugbqcyOD7tN0NCNDGXFMDq4Z3pVacd+rOZk3r+9nPmo+SIqjWgzoOWA7oTzREi6TbDzCzt4CNgZvyC09ERFpVtTmoke6+BLBSWu79N0T23n5E0sQvgQ/yDlJERFpPpgt13X2GmV0K/Ba40t1fcPcH8g1NRERaWeYVdd19HzM7GnjAzPq5+/tVnyTA3BfhFvUiyrriyvkiXSju+wXFja2ocYnUKks1832Jr6L3iWXeFwRWMbM/AisBU83sAneflWegUr/MFwSn5A0lP2STx4W6Sk4QmSPLEN8CRHLE7PSvJ3AacU3UxUAPYOe8AhQRkdaUpYG6Ehjg7kOIxIjpwADgX+5+MnAWsFVuEYqISEuqOsTn7u8BVFSTONPdZ6eHTAcWzy1CERFpSVnmoHoBY4AdgJeJC3V7mtndxPDeR8AzeQYpnZOlUvpcE+tNSH7IqsgT/kWOTaQ7yJLFtyewDTHPNBP4OTEXdY27n2tmDjyaX4jznqpJCUWtJFElLiVPzK27VTOvpIQN6WpZGqgNiF7SKGL9pyWA7YCzzWwPYGFUTUJERBosy5LvI919OeBFYEVgb3d/FNgbWAqYhnpQIiLSYJmXfHf3fYi1oC40s4W17LuIiOQpy5LvdxA9p9eIlPL5gP+a2bPA28B1gC7SLdNRUkJRJ9YzxdUFyRNFfb+guLEVNS6RWmVZ8v0xInuvD3AZ8BxR5fwgYvHCY4DB+YUobSlfGqHSNius18
RIWtfkswc1POElS8alSKuo1kCNBa5Nq+ouBTwEbO3urwCXmtlhwIrpdxERkYbpsIFq6yLdUmNkZpsChwND8g5SRERaTz1LvmNm3yaSI3Zw99fyDVFERFpR1iXfD3f329JtexLXRL0GjDazBYkVd5dz97dzjleSji6ivKWOlJWiTqwXNS4odmwi3UG1OajjiAtzR5nZKKAXsDZxTdSM9JiewIg8G6fMy0WUK2q1BlDFhm6iu1eSaLw/d3UAMo+pNgc1EhjZ3v1mNpAoHHtBowMTEZHWlvlC3XYcB/y0EYGIiIiUq7uBMrMvAubudzQwHhERESBbsdj2DAFua1QgHan14sUiT16rYkNtihoXFDe2osbFSxO7OgKZx3RmiM+AKY0KREREpFzdPSh3/1UjAxERESnX2SQJERGRXKiBEhGRQlIDJSIihaQGSkRECkkNlIiIFJIaKBERKaTOXKibVS+ATz75pAm7muPjjz9u6v5qUdTYFFftihqb4qpNkeIq+67s1ZVxFEGP2bNn57qDiRMnbgbcletORES6n80HDBhwd1cH0ZWa0YN6CNgceAWY2YT9iYjMy3oByxPfnS0t9x6UiIhIPZQkISIihaQGSkRECkkNlIiIFJIaKBERKSQ1UCIiUkjNSDPPnZktBFwBLANMB/Zx99cqHvNrYDNgFvBjd7+nIHHtC3yfSC39q7v/LO+4ssaWHvcF4F7gGHe/qQhxmdmviM9yPuACd78wx3h6An8A1gM+Boa7+/Nl9x8IHAzMAE5197/lFUuNcR0BfCf9+g93/2kz4soSW9lj/k4c8+cXIS4z2w44CegBTAQOc3elOXeh7tKD+j7wpLtvDlwGnFB+p5mtB2wKbAzsBZxbkLhWT48ZCmwE9Daz+YsQW5nfA838I632nm0JrOHumxCN1NFmtkSO8XwDWDDt7xjgrLJYlgNGAIOBbYDTzGyBHGPJGtdqwPeIY34Q8D9mtm6T4uowtjKnAnl+bm3p6D1bFPgVsKO7bwxMBZZucnxSobs0UJsBpbP7fwJbVdz/EvABsACwGPBpQeLaCngYuBT4F3CPuxclNszsJ0Tv6fEmxZQlrvuA/dPPs4meZ57v2WfxuPv9wMCy+zYiPrOP3f0d4HmgWQ1BR3H9B9jW3WemHsD8wEdNiqtabJjZbsRIRu498hri2hR4EjjLzO4C/q+tEQVprnluiM/MDgCOqLj5/4B30s/TgcUr7p9B/EE8k+47sCBxLQ0MIf44FgLuNrON3P3tro7NzL4OrOnuB5vZ4EbG05m43P0j4KPU07yUGOJ7L4/4ksXK4gGYaWbzufuMNu5r6zNuelzpJOd1M+tB9AoedfdnmxRXh7GZ2drAHsBuwIlNjKnDuIi/xS2B/sB7wF1mdl+T3zepMM81UO5+EXBR+W1mdj2waPp1UaDyC35vYBoxDLMo0RDc7+7/7eK43gAmuPt0YLqZ/RvoCzzYqLg6EdsBwMpmNgFYC9jAzKa5+2NdHBdpSO9a4r07rVHxtOPdsngAeqYvtLbuazPeLogLM1sQGE00moc2KaYsse0NrAjcDqwCfGJmU5sxv1klrjeAh9x9GoCZ3Uk0VmqgutA810C14x5ge+KLfTs+X5z2LeA9d59pZtOJCdKFCxDXPcBh6cukF9CPGCZqhg5jc/c9Sj+b2SXAVY1snOqNKyVR3Aac5e5XNimeYcA1ZjaIGAYqeRD4efr8FgC+AkxqQkwdxpV6Tn8Fbnf3M5oUT6bY3P2o0s9mdjIwrUmNU4dxAY8Aa5vZ0sRJxiAgt+Qbyaa7NFDnAZea2d3AJ8QQAmb2S+JMewww2MzuJRqCK93duzoud3/QzC4i/nB6AD9z9zebEFem2JoUR01xEQkJqwEHpgw6gP3c/YWc4hkHbJ2OnR7Afmb2I+B5d7/BzM4lGtGewPFpCLIZ2o2LOMa3ABZImWkAx7r7fV0dm7vf0KQYao7LzI4FxqfHXuPuzTrZkHaoWKyIiBRSd8niEx
GRbkYNlIiIFJIaKBERKSQ1UCIiUkhqoEREpJC6S5q5dHNmthhxced8wLeJeop9gIuBtdx9eDvPGwgc0t79Vfa5EbCrux9dd+CxnQnAye4+oTPbEWk1aqBkXtEf+MTdB5rZl4F13H2Fak9y94eBmhunpB+wbJ3PFZFO0nVQkqtU1eB0YBeiJuIf3f0cM+sLXAAsCbwPjHD3h8xsWeCPwEpE/cRjgSeIorXLMadEzlrp9p8QvZOhZtY/PfcLwJtERe81yu5fg7gQeCmiePAP3P3RVCnjHWAA0Sv7KXFR5xPAIkTVip+XvaZHgIPc/WEz6wW8CGxAXBz7Y6Ku4kLEcg53lnpQ6eknu/vQtJ1LiHJNl5jZ3sAPiWH30lIPzSzwKlI4moOSvO1GVH9Yh6j+vV9apuIK4Fx3X5coGHttWqriHGC0uw8AdiIanA+JXtDD7r5Tuv1ldx9Ysa8riWoc6wBXASMr7r8UOMrdNwAOSo8pWQnYnCiFc2Yq2HsicEN545Rczpy1lr5GNGSvA4cQyzWsRzTKR2Z5g8zsq0QB403dvT/wKtHwirQ0DfFJ3rYgysZ8TNRA7G9mixBrOl0PsfSBmb0JGLG8xlpmdkp6/vzA6tV2kmqoLV9aMNDdz0u3D03/LwJsCFxsZqWnLWJmS6Wfb3b32WY2iejVdeTPwL1mdiTwXeAKd59lZrsAwyx2MBSYWS3uZEtgTeD+FFtvojacSEtTAyV5m2utJjNbhSje26PicT2I47EX8LVSTUIzW4FYgmPzGvezIFA+R9UL+Cj1UEqP6UMMBUJaLyk1Uh3uyN2nmdmzRCO0FXB4agAfInpXdxK9qsMrnjqbuV93aXHKXkQjPiLFtQj62xTREJ/k7k7gm2Y2v8Xy8TcRiQeTzeybAKmy9HJEJfDbSctDmFk/4ov+C9V2khYM/I+ZbZ1u2gs4peL+58xsz7TtrVNsHZlB+w3F5cSKrBPc/QNimZRZwC/Sa9iOaHjKvQ6sZmYLmtmSzGl0JwC7mNkyac7uPGI+SqSlqYGSXLn7OKJa+yNED+OctAjcnsAIM3sS+B3wTXf/BPgBMMjMngCuBvZK62VlsSdwkpk9RqSiV84BfQ8YnrZ9GvDttOJsex5MsZzexn3jiGG5K9LvjwOPEYtiPkIserdy+RPc/Sng78BTwFjSUiLu/jiRmHF7uq8nMYcl0tKUxSciIoWkHpSIiBSSGigRESkkNVAiIlJIaqBERKSQ1ECJiEghqYESEZFCUgMlIiKF9P+z3ahu7/fvXQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "X, y = load_game()\n", + "X = OrdinalEncoder().fit_transform(X)\n", + "y = LabelEncoder().fit_transform(y)\n", + "\n", + "model = LogisticRegression(multi_class=\"auto\", solver=\"liblinear\")\n", + "viz = FeatureImportances(model, stack=True, relative=False, colormap=\"viridis\")\n", + "viz.fit(X, y)\n", + "viz.poof()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## MissingValuesBar" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No handles with labels found to put in legend.\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Make a classification dataset\n", + "X, y = make_classification(\n", + " n_samples=400, n_features=10, n_informative=2, n_redundant=3,\n", + " n_classes=2, n_clusters_per_class=2, random_state=854\n", + ")\n", + "\n", + "# Assign NaN values\n", + "X[X > 1.5] = np.nan\n", + "features = [\"Feature {}\".format(str(n)) for n in range(10)]\n", + "\n", + "visualizer = MissingValuesBar(features=features, color=\"lime\")\n", + "\n", + "visualizer.fit(X)\n", + "visualizer.poof()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ClassBalance" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "X, y = load_game()\n", + "\n", + "visualizer = ClassBalance(\n", + " labels=[\"draw\", \"loss\", \"win\"],\n", + " colormap=\"copper\"\n", + ")\n", + "\n", + "visualizer.fit(y)\n", + "visualizer.poof()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## FeatureCorrelation" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "X, y = load_concrete(return_dataset=True).to_pandas()\n", + "\n", + "# Create a list of the feature names\n", + "features = [\n", + " \"cement\", \"slag\", \"ash\", \"water\", \"splast\", \"coarse\", \"fine\", \"age\"\n", + "]\n", + "\n", + "# Instantiate the visualizer\n", + "visualizer = FeatureCorrelation(labels=features, color=\"rebeccapurple\")\n", + "\n", + "visualizer.fit(X, y)\n", + "visualizer.poof()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/rebeccabilbro/check_is_fitted.ipynb b/examples/rebeccabilbro/check_is_fitted.ipynb new file mode 100644 index 000000000..c64681a6f --- /dev/null +++ b/examples/rebeccabilbro/check_is_fitted.ipynb @@ -0,0 +1,854 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.model_selection import train_test_split as tts\n", + "from sklearn.linear_model import LogisticRegression, Lasso, LassoCV\n", + "\n", + "from yellowbrick.cluster import *\n", + "from yellowbrick.features import FeatureImportances\n", + "from yellowbrick.classifier import ROCAUC, DiscriminationThreshold\n", + "from yellowbrick.classifier import ClassPredictionError, ConfusionMatrix\n", + "from yellowbrick.datasets import load_occupancy, load_energy, load_credit\n", + "from yellowbrick.classifier import ClassificationReport, 
PrecisionRecallCurve\n", + "from yellowbrick.regressor import PredictionError, ResidualsPlot, AlphaSelection" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check if fitted on Classifiers" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "X, y = load_occupancy(return_dataset=True).to_numpy()\n", + "X_train, X_test, y_train, y_test = tts(X, y, test_size=0.20)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "unfitted_model = LogisticRegression(solver='lbfgs')\n", + "fitted_model = unfitted_model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEYCAYAAABSnD3BAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3deZwcVb338c9kGwQCgvIgIotsX9QBgUEWCRJZRUC8cq/4EpBF5OESfUBwByQgLijkiiKL7CJRgRC8oCwqIGGXEYQB/EWQxQVFuBcSUSckmeePcxqaZqa7ZtI9PVPzfb9eeaX79KmqX3cqvzp16tSpjv7+fszMrLwmtDsAMzNrLSd6M7OSc6I3Mys5J3ozs5JzojczKzknejOzkpvU7gDaTdJE4Ejgw6TfYwpwNfDFiOiTdBHQGxGntjCGx4E+4J9Af47hBuCYiFi6DOv9FNAVEQdJOg/4YUT8vE79c4GzI6KnSP0hxDEduBaImo+eiYidl3X9dba7Ut7ua0n/nnOGuZ5+YLWIeKYJMW0JfC4i/r1OnXcAH42IwwvWPwg4HXgsF3UAKwHzgMMi4l/LGnezNXP/ssbGfaIHzgJWAXaKiOclrQBcCpwHHDCCcewXEfcASJoC/BI4AjijGSuPiEMLVNsFOGcI9Yfi0YjYrMnrbGQzYPWI2GCEtzuo/G88aNLO3ga8aQj1AeZFxJ6VN5KWA24FDiT/m44mLdi/rI5xneglvRnYD1gjIhYARMQLkg4H3jlA/UOA/0tqca8KfC0izpL0BuB7wOtz1Z9ExPGDlTeKKyIWSZoHbCxpXVLL7GFgXWAH4M3AKcAKwFJgZkRcI2ky8C1Swn4a+CvwfI79ZuCMiLhC0p7AyaSuuxeAw4EPAm8ELpX0kbz+Sv33AycAE4EFwNERcbekmTmmNYB1gL8B+0bEnxt9x2p5Pdvm9dwPPFLz/mBgFrATsAS4C/hkRCzMZ0N3AZsCX4iIuXmdAi4A1pR0X17fbnW+x0vbi4j9C8Y9uU5cWwFnkvaVR/Pvc3Re9IyI6JI0LS8/kXQm91XgbuAkYGVJFwIXV9VfEfg2sB2wGLgKOHaQ8F4HrAz8T451TVKjYW1gMqk1/ZX82UHA50hnlDcCR0bEpIF+F0nHAvuQ9p3HgSMi4s+SPgAcR9oflwCfjohb6pTfz
AjtX+Y++i2ABytJviIi/hIRV1aX5f9kHwPeGxGbA/sCX88ffwz4fURsAWwPbChp5TrldUl6I7AXcFMuehPwpYjYCPgXcCFwQF7v+4CzJK1NOgPYCHgrKdmvPcC6Vwe+DxwUEZsC3yAdsI4F/kw6s7irqv7GwNnAPrn+F4Ef524R8vf6j4jYGPhf0oFwIOtLuq/mT3WSWgfYoirJVr8/jnQQenv+MyHHXdEbEW+pJHmAiAjgUF4+k1inwfeo3X4RA8YlaRIwBzg+b+tbpLOLWicCsyKiGzgE2DEi/pBjmxcRB9fUPwlYDnhLXt92pAM/wPb5N31Y0t+Ay4BTI+Ly/PklwAV5W1sBO0v6oKS3kg7qO+f9egEp4Va89LvkBsAmwFb5N/0p6cwX0r/HERGxJXA8ML1BOdDU/cvqGNctelIro9DBLiL+nlvCe0jakPQfbcX88XXAT3Oy/TmpT/V5SQOWD7KJSyX9M8fzInBeRMzJLfrFwB25XqWFdVVqtAKpNbgpsDMwOyIWAYskXZrLq21HSoz35e91JXAlg9sR+EVE/D7Xv1HS00B3/vzmqgPlvaQznYE06rq5MyIWD/J+d+DYiHgRQNK3Sa3Zinl11lv0e9Ruv4jB4tokb+Pa/PdNknoHWP4y4DuS9iLtH19osL2dSa3dJaTW8Q55uweRu24kTSAdgPYDfpw/XyHXXVXSl/K6ViTtw2sCN0TEH3P5t4GZVdus/l32JB0k7sn73kRg+fzZD4G5kn4C/IyXG0GDlVc0a/+yOsZ7i/5u4C2SplYXSlpT0k8kvaaq7E3AfaQWzq2k/0wARMSvSN0p3yWdat4t6Z2DlQ8Sy34RsVlEbBoR3RFxWtVnfVX/2SYCD+e6m+XkuQ1wPSnhd1QtN1DiWpzrVb5Xh6Tag0G1gfaRCaTTf0in+xW12x+Kv9d5XxtD9fYHWnYgjb5HkXU0WmdlfYt59e+wpHbhiDiHdFD4Galb6f4GZ3y1/3ZrSXpdzTqXRsRJpAuz5+fiiTmed9bsM18ZINbaOKt/l4nAKVXr2JLUcCCfEW4H3AMcBNwhacJg5VXrHKn9a1wb14k+Iv5EuvB6QeVUMf99JvBsRFTvZFuS+ghPjojrSa0bJE2U9DXSafpVpBE8DwIbDVa+jGHfSeoCelfe/mbA70hdCNcBH5G0XL4Yt+8Ay99FOri9Lb/fm9SVA+k//eSa+jcCu0paL29vR2CtvJ6Rcj1wuKTJOUnMICXHoWjF9xgsroeBPknvydvaipTQXzGDoKTbgc0j4iLgMNLooFUY+N8BUqv/QEkTJHUCV/By102tGcBOkt6fW8R3kq8RSHotcBvp3/56UjfOmnm5ehdJrwcOrepWOQm4RNKkfK1khYg4m9SF+BZg8mDlVescDftX6Y3rRJ8dATwE3J4v2t2V39fu8DcAfwRC0r2k/u+/ARsA3wQ2y6fn95BaUz+oUz5sEfE30sWwb0j6Danv9YCIeII0uuIeoJc0auexAZb/K+m0/uL8fY8GPpQ/vgr4kaRdq+o/RPqNrszf42vAXnW6oAYzUB/9fZL+T4FlTwb+QjqjepiUKI4cysab8D0el/T3qj+VC9qviiuffe0DzMz7yjG53j9q1vkZ4KRc5ybgxIh4nNRNt7GkuTX1TwQWAb8hdWP8tPZaUtX3fZTU9z4rH/Q/DGwj6QHSPv6DiLg0IuYDnwSul3QPKRHXxllxHnANcKekB0ndggfl73sUMFvSr4HLgUMioq9OeSXOZu1fVkeHpyk2az5J3yBdDP2rpLVIyXm9iHiuzaG9gtLIs4+QLvYvVRol89mI2LrNoVkTjfeLsWat8gTwC0kvkvqVDx1tST77I6nb7wFJi0nDcQ9pb0jWbG7Rm5mVnPvozcxKrhRdNz09PZ3AO4CnGGAYm5mNaxNJ9578qru7u69R5TIqRaInJfkiN82Y2fi1PekemHGnLIn+KYCNNtqIKVOmtDuWturt7aWrq6vdYdgoMt73iUWLF
jF//nzIeWI8KkuiXwIwZcoUOjs72x1L2/k3sFreJ4Bx3K3ri7FmZiXnRG9mVnJO9GZmJedEb2ZWck70ZmYl50RvZlZyTvRmZiXnRG9mVnJO9GZmJedEb2ZWck70ZmYl50RvZlZyTvRmZiXnRG9mVnJO9GZmJedEb2ZWck70ZmYl50RvZlZyTvRmZiXnRG9mVnJleTj4kE085pJ2h9A6sx9qdwRNteS0A9odgtmY5ha9mVnJOdGbmZWcE72ZWck50ZuZlZwTvZlZybVs1I2kicC5gIB+4HDgX8BF+X0vMCMilko6AdgDWAwcFRF3S9pgoLqtitfMrKxa2aLfCyAitgOOA74MzAKOi4jtgQ5gb0lbADsAWwMfAr6Tl39V3RbGamZWWi1L9BFxFXBYfrsO8BzQDfwyl10L7AxMA26IiP6IeBKYJGm1QeqamdkQtfSGqYhYLOli4N+Afwd2iYj+/PFCYGVgJeDZqsUq5R0D1K2rt7e3WaHbKNLT09PuEMY8/4bjW8vvjI2IAyV9FrgLeE3VR1NJrfwF+XVt+dIByurq6uqis7OzWGAlu3u0zLq7u9sdwpjW09Mzrn/Dvr6+cd8IbFnXjaQDJH0+v/0HKXHfI2l6LtsdmAfcBuwmaYKktYEJEfEMcO8Adc3MbIha2aK/ErhQ0i3AZOAo4GHgXElT8usrImKJpHnAHaQDz4y8/DG1dVsYq5lZabUs0UfEC8AHB/hohwHqzgRm1pTNH6iumZkNjW+YMjMrOSd6M7OSc6I3Mys5J3ozs5JzojczKzknejOzknOiNzMrOSd6M7OSc6I3Mys5J3ozs5JzojczKzknejOzknOiNzMrOSd6M7OSc6I3Myu5hvPRS1of2AaYDZwDbA58MiJubXFsZmbWBEVa9BcCi4C9gY2Ao4FTWxmUmZk1T5FEv1xEXA7sCVwaEfNIjwY0M7MxoEiiXyJpH1Kiv0bS+4ElrQ3LzMyapUiiPwzYA5gREU8BHwIObWlUZmbWNA0TfUQ8ABwbEXMkbQ/MAx5teWRmZtYUDRO9pLOA4yS9lTTyZgvge60OzMzMmqNI181WwMeBDwLnR8RHgXVaGpWZmTVNkUQ/MdfbG7hW0vLA8i2NyszMmqZIov8e8BTweETcBfQA321pVGZm1jQN74yNiFmSTo+IypDK7SPimUbLSZoMXACsC3QCJwN/AK4BfpernRURP5J0Amlkz2LgqIi4W9IGwEVAP9BLGvWzdChfzszMik2BMA34tKQVgQ5goqR1ImLdBovuDzwbEQdIWhW4DzgJmBURp1WtfwtgB2BrYC1gDvAOYBZwXETcLOlsUtfR3KF+QTOz8a5hogfOA04BDgK+BewO/LrAcpcDV+TXHaTWejcgSXuTWvVHAdOAGyKiH3hS0iRJq+W6v8zLXwvsSoNE39vbWyAsG2t6enraHcKY599wfCuS6P8ZERdKWhf4X+BjpH76uiLi7wCSppIS/nGkLpzzIqJH0rHACcBzwLNViy4EVgY6cvKvLqurq6uLzs7OAl8JmP1QsXrWdt3d3e0OYUzr6ekZ179hX1/fuG8EFrkY+6/c9RLANjn5rlBk5ZLWAm4CLomI2cDciKgcJOaSZsJcAEytWmwqKfkvHaDMzMyGqEiinwX8CLga+IikBynQope0OnAD8NmIuCAXXy9pq/x6p7ye24DdJE2QtDYwIV/svVfS9Fx3d9IduWZmNkRFRt1cLumKiOiX1E2aqvi+Auv+ArAKcLyk43PZ0cB/SXoR+AtwWEQskDQPuIN04JmR6x4DnCtpCvAwL/f3m5nZEAya6CVdSBraWHlfW+WQeiuOiCOBIwf4aLsB6s4EZtaUzSeNxjEzs2VQr0V/80gFYWZmrTNoH31EXBwRFwNXAivm1z8H1icNnTQzszGgyMXYS4E18uuFeZlLWhaRmZk1VZFx9OtExPsAImIBacriIhdjzcxsFCjSou+XtEnljaSNgRdbF5KZmTVTkRb9p4CfSfoja
SqD15PmsTEzszGgyDj6n+cbmTYhteQjIvpaHpmZmTVFkRY9EbGIAnfDmpnZ6FOkj97MzMYwJ3ozs5IrPAVCrYioOwWCmZmNDvVa9DeTHvwxFXgjcCNpNspVGixnZmajyKAt+jzlAZKOALatPK9V0mXAnSMTnpmZLasiLfOVgVWr3q8OrNiacMzMrNmKDK/8MnC/pNuAiaSHeH+ipVGZmVnTNGzRR8QlpAd1/xD4PrB5RFzZ6sDMzKw5Gib6/ISng4G9gV8Ah+cyMzMbA4r00X+H1Ce/BWkKhA2A81sZlJmZNU+RRN8dEV8AXoyIfwAHApu3NiwzM2uWotMUT+Hlm6deT50bqczMbHQpkuhPJz1C8A2SvgncA3yzpVGZmVnTFJmm+HuS7gHeTRpeuVdE3N/yyMzMrCkaJnpJcyJiH+ChqrJfRMROLY3MzMyaot6kZnOBtwNrSvp91UeTgSdbHZiZmTVHvRb9gaSpD04n3QnbkcsXA39ttGJJk4ELgHWBTuBk0lnBRaSLub3AjIhYKukEYI+87qMi4m5JGwxUd2hfz8zMBr0YGxELIuJx4AhSkn0i1/8qr5z7ZjD7A89GxPbAe4AzgFnAcbmsA9hb0hbADqSpFT5EGrfPQHWH/vXMzKzIqJvvA5Wumz8D83JZI5cDx+fXHaTWejdp6mOAa4GdgWnADRHRHxFPApMkrTZIXTMzG6Iik5q9LiLOAcgPBT9X0n82Wigi/g4gaSpwBXAccGpEVMbgLyTNjLkS8GzVopXyjgHq1tXb21vg69hY09PjxxUvK/+G41uRRP8PSbtHxLUAknYCXiiycklrAXOBMyNitqSvV308FXgOWJBf15YvHaCsrq6uLjo7O4uEBrMfalzHRoXu7u52hzCm9fT0jOvfsK+vb9w3Aot03RwOfEPSM5KeBU4FGrboJa1OeiLVZyPiglx8r6Tp+fXupG6g24DdJE2QtDYwISKeGaSumZkNUZEbpu4DuiS9jjTfzYKC6/4C6bGDx0uq9NUfCXwrT6nwMHBFRCyRNA+4g3TgmZHrHkPqJnqpbtEvZWZmL6s3jv67EXGYpJuomttGEgARsWO9FUfEkaTEXmuHAerOBGbWlM0fqK6ZWTtNPOaSicD6TV7to0tOO2BJk9f5knot+nPy3zNbtXEzszFofSCavE4B85u8zpfUS/QrSHoXnqnSzKytJE0AziTNVtAHHBoRjxRdvl6iPzH//TrSEex2YAnwTuABYLvhBGxmZkP2fmC5iNhW0jbAaQzhJtJ6d8a+OyLeDfwReHtE7BIR7wE2IY1rNzOzkTENuA4gIu4EthzKwkWGV65Tc4rwJLDOUDZiZmbLZCXg+ar3SyQVuQ8KKHbDVI+ki4HLSAeGD+Mx7WZmI6n2xtIJEbG46MJFWvSHAveTbpz6GGm8+xFDidDMzJbJbcB7AXIf/QNDWbjIDVOLJM0BfgtcD6w1lCOJmVnJPEoaDtnsddYzF9hF0u2kSSIPHsrKizxhal/ShGSvIY24uUPSpyKiyAyWZmalkm9satmY94HkZ3EcPtzli3TdfJaU4BdGxNPA5sDnh7tBMzMbWUUS/ZKIeGk4ZUQ8xStnljQzs1GsyKibByV9HJgsaTPShdj7WhuWmZk1S5EW/QxgTeCfpGfALsCjbszMxowiLfozIuJg3C9vZsZFt36uJbNXHjTta22ZvbKiS9KKlUcDmpmNc6WavbJiKfCkpCB13wCN56M3M7PmkrQ1cEpETB/KckUS/WeGFZGZmTWNpM8AB1Dwmd3VGl6MjYhfkh4JuA9pWswpuczMzEbOo8AHhrNgw0Qv6VTg08DvgCeAL0nyhVkzsxEUEXOAF4ezbJGum72At1Xmt5F0DnAv8NXhbNDMzEZWkXH0TwOvrXo/GXimNeGYmVmzFWnR/w/wG0n/DSwGdgeelnQBQEQc0sL4zMxGm3bMXrlMiiT6K/OfintaFIuZ2aiXb2wa0dkrKyLicWCboS5XZD76i4cTk
JmZjQ6Fnzk4HNWD+yVtDlxDGr0DcFZE/EjSCcAepG6hoyLibkkbABcB/UAvMCPPx2xmZkM06MXYnGyHLQ/uPw9YLhd1A7MiYnr+8yNJWwA7AFsDHwK+k+vOAo6LiO1JT1PZe1liMTMbz+qNurkMQNJVw1x37eD+bmAPSbdIOl/SVGAacENE9EfEk8AkSavlupWbsq4Fdh5mDGZm4169rpslkm4FNpV0Y+2Hjea6iYg5ktatKrobOC8ieiQdC5wAPAc8W1VnIbAy0BER/TVlDfX29hapZmNMT09Pu0MY8/wbjm/1Ev2OpMcGng+c2IRtzY2I5yqvgW8DPwamVtWZSkr+Swcoa6irq4vOzs5i0cx+qFg9a7vu7u52hzCm9fT0jOvfsK+vb9w3AgftuomIhRFxC+l5sQ+REu4qwG+HOdfN9ZK2yq93AnqA24DdJE2QtDYwISKeAe6VND3X3R2YN4ztmZkZxe6M3YL06MCDgQOB+yXtOYxt/SfwX5JuBrYDTo6IHlISvwOYQ3qaFcAxwImS7gCmAFcMY3tmZkax4ZVfBqZFxGMAktYj3UB1TaMFqwf3R8SvSQm+ts5MYGZN2XzSaBwzM1tGRVr0kytJHiAifl9wOTMzGwWKtOiflHQU6aIswKGk6YrNzGwMKNIy/yiwLfB74LH8+rBWBmVmZs1TZK6bp4F9RyAWMzNrAfe1m5mVnBO9mVnJFXlm7MkjEYiZmbVGkRb9XpI6Wh6JmZm1RJHhlc8Cv5X0a+CflUI/QtDMbGwokuj9hCkzszGs0KME83TDbwOuB9aqvlPWzMxGtyIXY/cFrgZOB1YF7pC0f6sDMzOz5ihyMfazpKmKF+abpzYHPt/SqMzMrGmKJPolEbGw8iYinuKVDwYxM7NRrMjF2AclfRyYLGkz4AjS/PRmZjYGFGnRzwDWJA2tvABYQEr2ZmY2BhQZdfOCpC8CPwAWAb+LiCUtj8zMzJqiyKibHYBHSa352aSbp7ZsdWBmZtYcRfroZwF7RMQDADnJnwlsVXcpMzMbFQrNXllJ8vn1PRQ7QJiZ2SgwaMKW9K788reSziY9SnAxsB9w9wjEZmZmTVCvZX5izfuvV73ub0EsZmbWAoMm+oh490gGYmZmrdGwr13S9sBRwCrV5RGxY6uCMjOz5ilyUfUiUjfOE60NxczMWqFIov9TRHxvOCuXtDVwSkRMl7QB6aDRD/QCMyJiqaQTgD1IF3qPioi7B6s7nBjMzMa7Ion+W5K+D9xISsYANEr+kj4DHAC8kItmAcdFxM15FM/ekp4AdgC2BtYC5gDvGKguMHdI38zMzIBiib4yr832VWX9QKNW/qPAB4BL8vtu4Jf59bXArkAAN0REP/CkpEmSVhukbsNE39vb26iKjUE9PT3tDmHM8284vhVJ9GtExFuGuuKImJOfTFXRkRM6wEJgZWAl0jNpqSkfqG5DXV1ddHZ2Fgtw9kPF6lnbdXd3tzuEMa2np2dc/4Z9fX3jvhFYJNHPk7QncF1ELG5Ye3DVfexTgedIM2FOHaB8oLpmLXfRrZ9rdwgt8cCtl7c7hKY6aNrX2h3CmFJkCoS9gP8GFklamv8MZ/bKeyVNz693B+YBtwG7SZogaW1gQkQ8M0hdMzMbhiLTFK/RpG0dA5wraQrwMHBFRCyRNA+4g3TQmTFY3SbFYGY27hS5YeqLA5VHxEmNlo2Ix4Ft8uv5pBE2tXVmAjNrygasa2ZmQ1ek66aj6s8U4H3A6q0MyszMmqdI180rJjeT9CXghpZFZGZmTVVoPvoaKwJrNzsQMzNrjSJ99I/x8rTEE4DXAqe2MigzM2ueIuPop1e97geei4gFrQnHzMyardCkZsBuwKqkC7JIajjXjZmZjQ5FEv1sYB3SePZKF06RuW7MzGwUKJLoN42IjVseiZmZtUSRUTcPS2rW3bFmZjbCirTolwdCUi/wr0qhHyVoZjY2FEn0X2l5FGZm1jJF7oz9ZaM6ZmY2eg3nzlgzM
xtDnOjNzErOid7MrOSc6M3MSs6J3sys5JzozcxKzonezKzknOjNzErOid7MrOSc6M3MSs6J3sys5JzozcxKrsjslU0l6ddA5ZmzjwHnAKcDi4EbIuJESROAM4G3A33AoRHxyEjHamZWBiOa6CUtB3RExPSqsvuAfYDfAz+RtDnwZmC5iNhW0jbAacDeIxmrmVlZjHSL/u3A8pJuyNueCXRGxKMAkq4HdgbWAK4DiIg7JW05wnGamZXGSCf6fwCnAucBGwLXAs9Vfb4QWA9YCXi+qnyJpEkRsbjeynt7e5sbrY0KPT097Q7BRhnvE0Mz0ol+PvBIRPQD8yU9D6xa9flUUuJfPr+umNAoyQN0dXXR2dlZLJLZDxWN2dqsu7t7xLb1wK2Xj9i2bPiGsk/09fWN+0bgSI+6OYTU346kN5IS+guS1pfUAewGzANuA96b620DPDDCcZqZlcZIt+jPBy6SdCvQT0r8S4FLgYmkUTd3SfoVsIuk24EO4OARjtPMrDRGNNFHxCLgwwN8tE1NvaXA4SMSlJlZyfmGKTOzknOiNzMrOSd6M7OSc6I3Mys5J3ozs5JzojczKzknejOzknOiNzMrOSd6M7OSc6I3Mys5J3ozs5JzojczKzknejOzknOiNzMrOSd6M7OSc6I3Mys5J3ozs5JzojczKzknejOzknOiNzMrOSd6M7OSc6I3Mys5J3ozs5JzojczK7lJ7Q5gMJImAGcCbwf6gEMj4pH2RmVmNvaM2kQPvB9YLiK2lbQNcBqwd5tjspL76Ny3tTsEK+Cgae2OYGwZzYl+GnAdQETcKWnLOnUnAixatKjwytdYYfIyBWcjp6+vb8S25f1ibBjKPlGVFya2JJgxYDQn+pWA56veL5E0KSIWD1B3DYD58+cXXvmP995w2aKzEdPb2zti2/J+MTYMc59YA3i0yaGMCaM50S8Apla9nzBIkgf4FbA98BSwpNWBmdmYMpGU5H/V7kDaZTQn+tuAvYDLch/9A4NV7O7u7gNuHanAzGzMGZct+YrRnOjnArtIuh3oAA5uczxmZmNSR39/f7tjMDOzFvINU2ZmJedEb2ZWck70ZmYl50RfEnnKCDOzV/HF2DFM0nrALGBLYDHpwP0A8MmIKH73mJmV2mgeXmmNnQd8PiLuqhTkew4uBLZrW1RmNqr4dH9sW646yUOaF6hdwZjZ6OQW/dj2G0kXkCZ/e540ZcR7gfvbGpW1laSbgM6a4g6gPyLe2YaQrM2c6Me2I0jTOU8jTQK3ALiGdFexjV+fA84F/o107cbGOV+MNSshSZ8GHokIH/TNid7MrOx8MdbMrOSc6M3MSs6J3kYlSXtJOnqYy06XdHOz65qNVR51Y6NVd7sDMCsLJ3obEZImAWcBXcDqQAAfiIh/SvokcDjpMZBXAxfn90h6AlgHICJm5rLHgenA/wDnA28C3gjcAnykTgybAecAy+dl96v5fAfgy/nzVYDPRMTlkj4MfCbH9xiwP/B64FJgBWAp8P98s5qNVu66sZHyTmBRRGwLbAC8BnivpK1I9wNsBWxKasm/BjgbODsiLqyzzj2A+/I6NwS2BbaoU/9S4EsRsQnwQ+DIms8/ARwaEVsAHwW+mMtPBnaNiG7gt8DG+fNrImJL0kFgWuOfwKw93KK3ERERt0h6VtIMUqLcEFgReBdwdUQ8n6vuDKmPvsA6fyBpK0lHAW8BXpfX+SqSXg+sERHX5GXPyuXTq6rtD+wp6T+AbarWdTVwm6SrgDkRcZ+kFYArJW0O/AQ4o+BPYTbi3KK3ESHpfaQW9T9Ik67dQrot/8Waem+U9Nqaxftz3YrJue4ngG8AfwO+DTxUU69a7XaWy7N/VptHOrPoIXXhdABExJHAPqTunu9L2j8ibgPeClwP7Es6GJiNSk70NlJ2Bi7LXVIamnMAAAEQSURBVDF/IbXkJ5KS6+6SVsz9+D/g5WmXK2ecz5CSKrmrZ41cvgtwTkRcSjoYbJbX+Sr5j
OEPknbJRQcAJ1U+l7QqsBHwxYj4KbArMFHSJEm/A56JiK8C3wM2l/R14ICIuBj4OPW7jMzaynfG2oiQtAkwm5TA+4A/AQ9HxHG5O+dwUsPjyog4XtK7SBdlZ+XlLgfeQGptv5XUwl6PdIH3BWAhsAj4EfAIMDMipg8Qw1mkLplnSMlelbqSTiPNHbQAuIPUUl8beB9wPOls5DngwBzrbNJEckuAUyLisub9YmbN40RvZlZy7roxMys5J3ozs5JzojczKzknejOzknOiNzMrOSd6M7OSc6I3Myu5/w/y2CkuwvgHfwAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = ClassPredictionError(fitted_model)\n", + "oz.fit(X_train, y_train)\n", + "oz.score(X_test, y_test)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = ClassPredictionError(unfitted_model)\n", + "oz.fit(X_train, y_train)\n", + "oz.score(X_test, y_test)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = ClassificationReport(fitted_model)\n", + "oz.fit(X_train, y_train)\n", + "oz.score(X_test, y_test)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = ClassificationReport(unfitted_model)\n", + "oz.fit(X_train, y_train)\n", + "oz.score(X_test, y_test)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = ConfusionMatrix(fitted_model)\n", + "oz.fit(X_train, y_train)\n", + "oz.score(X_test, y_test)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = ConfusionMatrix(unfitted_model)\n", + "oz.fit(X_train, y_train)\n", + "oz.score(X_test, y_test)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = PrecisionRecallCurve(fitted_model)\n", + "oz.fit(X_train, y_train)\n", + "oz.score(X_test, y_test)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = PrecisionRecallCurve(unfitted_model)\n", + "oz.fit(X_train, y_train)\n", + "oz.score(X_test, y_test)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = ROCAUC(fitted_model)\n", + "oz.fit(X_train, y_train)\n", + "oz.score(X_test, y_test)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = ROCAUC(unfitted_model)\n", + "oz.fit(X_train, y_train)\n", + "oz.score(X_test, y_test)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = DiscriminationThreshold(fitted_model)\n", + "oz.fit(X, y)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = DiscriminationThreshold(unfitted_model)\n", + "oz.fit(X, y)\n", + "oz.poof()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check if fitted on Feature Visualizers*\n", + "Just the ones that inherit from `ModelVisualizer`" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/rbilbro/pyjects/my_yb/yellowbrick/features/importances.py:159: YellowbrickWarning: detected multi-dimensional feature importances but stack=False, using mean to aggregate them.\n", + " ), YellowbrickWarning)\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "viz = FeatureImportances(fitted_model)\n", + "viz.fit(X, y)\n", + "viz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/rbilbro/pyjects/my_yb/yellowbrick/features/importances.py:159: YellowbrickWarning: detected multi-dimensional feature importances but stack=False, using mean to aggregate them.\n", + " ), YellowbrickWarning)\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "viz = FeatureImportances(unfitted_model)\n", + "viz.fit(X, y)\n", + "viz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# NOTE: Not sure how to deal with Recursive Feature Elimination" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check if fitted on Regressors" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "X, y = load_energy(return_dataset=True).to_numpy()\n", + "X_train, X_test, y_train, y_test = tts(X, y, test_size=0.20)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "unfitted_nonlinear_model = RandomForestRegressor(n_estimators=10)\n", + "fitted_nonlinear_model = unfitted_nonlinear_model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "unfitted_linear_model = Lasso()\n", + "fitted_linear_model = unfitted_linear_model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = PredictionError(unfitted_linear_model)\n", + "oz.fit(X_train, y_train)\n", + "oz.score(X_test, y_test)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = PredictionError(fitted_linear_model)\n", + "oz.fit(X_train, y_train)\n", + "oz.score(X_test, y_test)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = ResidualsPlot(unfitted_linear_model)\n", + "oz.fit(X_train, y_train)\n", + "oz.score(X_test, y_test)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = ResidualsPlot(fitted_linear_model)\n", + "oz.fit(X_train, y_train)\n", + "oz.score(X_test, y_test)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = ResidualsPlot(unfitted_nonlinear_model)\n", + "oz.fit(X_train, y_train)\n", + "oz.score(X_test, y_test)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = ResidualsPlot(fitted_nonlinear_model)\n", + "oz.fit(X_train, y_train)\n", + "oz.score(X_test, y_test)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "unfitted_cv_model = LassoCV(alphas=[.01,1,10], cv=3)\n", + "fitted_cv_model = unfitted_cv_model.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = AlphaSelection(unfitted_cv_model)\n", + "oz.fit(X, y)\n", + "oz.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAETCAYAAADZHBoWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3deZwcVbn/8c/sSUjCko3FSNjyBAmLDFvY70WEQMCfXLhwEQggCogK5qoRDKJxQRSiGC9wJYAsRmW9CLwiCCLIJjIQcJA8AzEBAiEzJCQzWWZ6lv79UTWdzmSWmslU9/TU9/165UV31amq53RCP33OqTqnKJ1OIyIiAlCc7wBERGTgUFIQEZEMJQUREclQUhARkQwlBRERyVBSEBGRjNJ8ByDJZWYTgGp3H57HGIYA3wamAUVACXAX8BPgSOD/gB3cvTHrmApgGXCcu7/cyTlHA+8Ct7v7RVnbzwVOdfdp3cQzgS38TMzsaGAB4B12fejun+rreSUZlBQkscysiOBLvwaY4u6NZjYKeAQY7u5Xmtl7wH8Av8k69BTgzc4SQuh84EHgv8zsCndfFV8turTY3ffLw3WlwCkpyIBkZhOB/wGGAzsCC4HTwy/u7wGfBVLASuBcd1/ezfYjgJ8Cw8J9s9z9jwQtgT2BE929FcDdV5rZ2cCEMJQbCL7ks5PCF8PYOou7GLgQuCSM/ULg6k7K/QX4J3AAMBq4092vCneXmNlNwEHANsA33P0+MxsH/C8wDtgeeBv4T3ev7fkT3eTa3wWmADsArwFvdXh/HjAHOAZoBf4GfM3dG8xsafh+H+AKd3+gN9eWgU9jCjJQfYGg+2UKsDuwC3CimY0HLgMOdPcDgMeAg7vZPgq4F7jU3fcBpgN3mdkuBF/If2tPCO3c/U13/1P49k6gMiyPme1BkEju6SLu44CtgMeB24FLzKysi7I7A4cB+wOnm1l7t9IQ4E/uvj/w3wRdWQBnAM+Hn8muwHrg7C7OvZuZLezw59sdrr2/u5/VyftZBIl43/BPMUFSbVft7nsqIQxOSgoyUM0E6szsm8CNBF9Sw4H3gFeBl83sWmChu/9fN9sPBt5y978BuPvrwLPA0UAbPfw/4O4NBK2E88JNXwTmuXuqi0O+BPzG3VuAPxC0Tk7rouz/unuzu68mSDLHhdtT7n5f+HohMDaM5XrgOTObQdCCmRx+Jp1Z7O77dfjzw6z9L4QxdvZ+KnBTGFsbMDfc1u6vXVxTBgElBRmofkvwBfw28DPgZaAo/JI6CjiXoIvoZ2Z2fVfb6fzfeDFQBrwAHGhmJdk7zexAM7sza9MNwLnhoPTngJs6C9jMdgZOAM4Iu1mcoIv2si7qmP2lXEzQVQPQnLU9TTAAjpldA8wG6oBfEbSGiro4d0/WdvO+42fW/nl1dawMIkoKMlAdB8x2998TfDEeTNDXvi9QDbzh7lcTJIx9u9pO8MVvZnYQwYu9CMYS/uLuzwOLgDnhFz5hv/1cYEl7IGHr4l8EYwPPu/uyLmK+EHjG3Xdy9wnuPgGoBPY3s8M6KX+WmRWb2bbAfwIPRfhMfu7udwK1wLEEd0v1t0eBi8ysLBwjuQT4Uw/HyCChgWbJt63MrOMvzynAFcADZraKoO/8KWB3d7/FzO4GXgqP2wB81
d1f7WL7h2Z2GjDXzIYRdBmd5+414bX+A/gRUGVmLQRfsrcD13aI6X+A3xMMvm7GzMqBzxMMSme4+5tm9luC1sIjHQ4bCrwIjABucPcnwltSuzIbuNbMvkPQyniGYLylM7uZ2cJOtn+6m/O3+wFB/RcSfEe8CHwlwnEyCBRp6myR3AvvPvqlu9+b71hEsqn7SEREMtRSEBGRDLUUREQko6AHmquqqiqAA4HlbLydT0REuldC8AT73ysrK5uydxR0UiBICHqQRkSkb44guIsto9CTwnKAiRMnUl5e3qsDp06dSnNzM48//ngsgQ1U1dXVTJ48Od9h5EzS6guqc1JsSZ1TqRQ1NTUQfodmK/Sk0ApQXl5ORUVFrw5cuXIlqVSq18cNBkmrc9LqC6pzUvRDnTfrdtdAs4iIZBR6S6HPjjnmGOrq6vIdhojIgJLYpDBnzhyqqqryHYaIyICi7iMREclIbEvhF7/4BcuWLaOysjLfoYiIDBiJbSnccsstPPRQTzMVi4gkS2JbCvmUTqdpn3IqTTrcBums/cE+Oi2XOUem/MYyG193Xm51Uwsfrm3s9Fob4+hwrexynW3r4lo91q/LOkeIKWKd3/hwPS1v1236WUb5fLuqXw+fb3cxbXbebq8V9e9h889yyZI1LOJfXdav82M3v1Z39Wvf2Hmdu//73RhHlDpH+8zfe7+ORz58tes69/nfdC/r19v/j7PP28s6lzXWc/P+aYqK+rrOUueUFHLs4ntf4FfPv5nfIO6r6bnMYPLY0nxHkHvPv5fvCHLvH8m6m3BoaRHXNTaz9dDePbjbEyWFHHuw+l22Ki/lwPGjAGhP8kUU0THhFxUVZdZabP81sLF81rZMmeA8XZYrgtUfrWbbbbfdZP/m19oY08bzbh5Ht9fqLqZOj+3mWpsc28k2uo5pxYoV7LD99pvF1F2do9c/Wkybnzfr+hH/Hjo9bxfXevedd9h55503q3O3MfUQZ3f16/W/w+xym23r6e+383I1NW9iNrH7a3USU/u5uv976Kn+0eLc/LzdbOvq30vW66X+er8nBFBSyKk1G1KsaGjk07YjC77Y6QJesauqqkrU4HpQ3/3zHUZOVZU3UFm5R77DyKmt69+ncvft8x1GTtWVxbESa4xJIVwM/WbACLrGLgIagV+H76uBS8IF19uPGQrcBYwFGoDp7h5Lm3DYsGEUF+d2nN3r6gGYNHZkTq8rIhJVnN+KJwG4+2HALOCHwBxglrsfQdAq+kyHYy4G/hHuvyM8LhbPP/888+bNi+v0nfLaIClMVFIQkQEqtpaCu/+fmT0cvt0ZWA18imABdoAFBIuIP5B12OHAT7L2XxnlWtXV1X2OM5dPNT/1Wi0ARR+toKqq41r1uZO0J7mTVl9QnZMijjrHOqbg7i1mdjvwWeBU4Fh3b7+zqgHYusMhI4E13ezv1OTJk3s9W+Df//53Fi1axNlnn92r47bEmuogH550+AHstPWwnF03WzLHFJJTX1Cdk2JL6tzU1NTlj+nYO9XdfTowkWB8YWjWrhEErYds9eH2rvb3mwsuuIAf/ehHcZ2+UzW19QyvKGXHkUN7LiwikgexJQUzO9vMLg/frgfagJfM7Ohw21Q2XzXtWeCEbvYXrNa2Nt78sB4bM7LfHzYREekvcXYf3Q/cZmZPA2XAZcAbwM1mVh6+vhfAzB4DpgE3Areb2TNACjgzxvhy6p2P1tHU0sbEMRpkFpGBK86B5nXAf3ay66hOyn46fJkCTosrpnxqvx3VxkYaJhERyYvEToiXazXtt6OqpSAiA5iSQo4sqm1vKSgpiMjAldhpLm6//XbeeOONnF2vpi6401YtBREZyBKbFPbbbz9aW1tzdj2vrefj227FsPLEfuQiUgDUfZQDDY3NvF+/Qa0EERnwEvuz9YADDqCxsXGLpsiIqqb9ziMlBREZ4BLbUmhubs5Z99Gi2mA8QYPMIjLQJTYp5FKNnlEQkQKhpJAD7VNmq/tIR
AY6JYUcqKmrZ1h5Sd5mRhURiUpJIWZtbWlq6uqZOHokxcWaCE9EBrbE3n100UUXsWzZstivs2zNejY0t2q1NREpCIlNChdffHFOVmrK3Hk0RoPMIjLwqfsoZjWa80hECkhiWwpf+cpXWLlyJfPnz4/1OhunzFZSEJGBL7FJ4emnnyaVSsV+Ha/VRHgiUjjUfRSzmrp6dtp6GMMryvIdiohIj5QUYrSuqZl3V6/XQ2siUjBi6z4yszLgVmACUAH8gGDN5e3DIhOAF9z9jKxjioBlwJvhpufd/fK4YoxbTV0DoPEEESkccY4pnAWsdPezzWw7YKG7fxzAzLYFngS+1uGY3YCX3f2kGOPKGa/TRHgiUljiTAr3APeGr4uAlqx93wPmuvvyDsdUAjuZ2ZPABuBr7u5xBLfvvvvy0UcfxXHqjI3rMusZBREpDEXpdDrWC5jZCOAPwM3uPt/MxhK0EvZx99YOZY8Exrn7PWZ2OPAzdz+wq3NXVVVNAJbEF/2WmfXsMh57u57/O3l3dhxenu9wREQ62qWysnJp9oZYb0k1s/HAA8AN7t7+QMCpwPyOCSH0EmGLwt2fMbMdzazI3bvNXJMnT6aioqLX8VVVVVFZWdnr46Kqe+oDhpSWcOIRhwyYeY/irvNAk7T6guqcFFtS56ampi4XGIvt7iMzGwc8Bsx091uzdn0KWNDFYVcBl4XH7wu821NC6Kv58+fz6KOPxnFqANLpcCK8MZoIT0QKR5wthSuAbYErzezKcNtUwIB/ZRc0s8eAacCPgbvM7ESCFsO5cQV3zTXXkEqluOKKK2I5/3tr1rMu1aKJ8ESkoMSWFNz9UuDSTnbt1UnZT4cvU8CJccWUS1pYR0QKkR5ei0n7EpxqKYhIIVFSiElmIjy1FESkgCgpxGTRCj24JiKFR0khJjV19ewwcigjh+j5BBEpHImdOvvZZ59l4cKFsZx7Q3ML76xex1G7jovl/CIicUlsS2H48OEMHTo0lnO/WddAOq1BZhEpPIlNCkuXLmX58o5TL/UPDTKLSKFKbPfRZz7zGVKpFNOmTev3c2dWWxurifBEpLAktqUQp/YH1yap+0hECoySQgxq6uqpKC1m5223yncoIiK9oqTQz9LpNF5bz+6jR1BSrI9XRAqLvrX62QcNG2hoatbCOiJSkJQU+llmIjyNJ4hIAUrs3UfXXnstb731Vr+fd5GSgogUsMQmhWOPPZbtttuu389bUxfOeaRnFESkAKn7qJ9t7D7SmIKIFJ7EthSmTp3K2rVr+etf/9qv562pq2fs8CFsM1QT4YlI4UlsUnj//fdJpVL9es7G5laWrlrHYbuM6dfziojkirqP+tFbH9bTlk5rkFlEClZsLQUzKwNuBSYAFcAPgHeBh4E3w2I3uvvvs44ZCtwFjAUagOnuXhdXjP1t40R4Gk8QkcIUZ0vhLGClux8BHA/8EqgE5rj70eGf33c45mLgH+ExdwCzYoyv39XUal1mESlscY4p3APcG74uAloIkoKZ2WcIWguXuXtD1jGHAz8JXy8Arowxvn6nKbNFpNDFlhTcfS2AmY0gSA6zCLqR5rl7lZl9G7gK+HrWYSOBNeHrBiBSP0x1dXWv4zvssMMAqKqq6vWxXXllyXJKi2HV0hqq3inqt/P2t/6scyFIWn1BdU6KOOoc691HZjYeeAC4wd3nm9k27r463P0AMLfDIfXAiPD1CGA1EUyePJmKiopexVZZWUlVVRWVlZW9Oq4r6XSaZQ+8xR5jtubgAw/ol3PGoT/rXAiSVl9QnZNiS+rc1NTU5Y/p2MYUzGwc8Bgw091vDTc/amYHha+PATqmuWeBE8LXU4H+fYggRrVrG1m9IcVEdR2JSAGLs6VwBbAtcKWZtY8NzAB+ZmbNwAfAFwHM7DFgGnAjcLuZPQOkgDPjCm727Nl88MEH/fbrIvMks5KCiBSwOMcULgUu7WTXYZ2U/XT4MgWcFldM2e67775+fXitfZBZdx6JS
CGL3H1kZtuZ2TZxBlPIatRSEJFBoNuWgpntBXwDOCnc1GJmEDyANsfdX483vMLh7bOjaiI8ESlgXbYUzOwagnGBe4AJ7j7K3ccBuwH3A98zs2tzE+bA57X1jN6qglFb9e4uKBGRgaS7lsLv3f3ljhvD5w8eAR4xs4F772UOpVpaWbJqLYd8fHS+QxER2SJdJoXshGBmE4C9gD8CH3f3JWGZl+IOMC477rgja9eu7ZdzLV65lta2tAaZRaTg9TjQbGanAw8BvwBGAc+b2VlxBxa3BQsW8POf/7xfzuW17autaTxBRApblLuPZgKHAvXuXgt8Erg81qgKTI1uRxWRQSJKUmjNnrTO3ZcDbfGFlBt/+tOfePHFF/vlXIvC21EnKSmISIGL8vDa62b2ZaDMzPYDvgQsjDes+H39618nlUpx8cUXb/G5amrrKS0uYtdRI3ouLCIygEVpKVwC7ARsIFg0p54gMUjI69aw66gRlJVoITsRKWxRWgq/dPfz0DhCpz5c28iq9SkOnTA236GIiGyxKD9tJ5vZ8NgjKVCZhXU0niAig0CUlkIb8I6ZOUEXEgDu/u+xRVVAMrOjKimIyCAQJSl8M/YoCpieURCRwaTHpODuT5nZVIJFcUqBJ939wdgji9mDDz7Yp2U8O1L3kYgMJlGeaP4m8F3gHWAJ8G0zuyLmuGI3YcIEdthhhy0+T01tPdsOLWe0JsITkUEgSvfRWcDB7r4BwMxuJlhG80dxBha3tWvXsmHDhp4LdqO5tY3FKxs4YPwoioqK+ikyEZH8iZIUitsTQqgRaIkpnpw57LDDSKVSvPHGG30+x5JVa2lpS2sNBREZNKIkhSfM7D7g1+H7c4E/xxVQIVm0on2QWeMJIjI4REkKlwEXAecQjEE8AfyquwPMrIzg6ecJQAXwA4IxiblAK9AEnOPuKzoc9zLBE9MAS8KH5gYsTYQnIoNNlKSwFUEX0mlmthNwIVBO911IZwEr3f1sM9uOYK6kJcBX3H2hmV1IMPvqjPYDzGwIUOTuR/etKrnnWpdZRAaZKElhPvBa+LqBoLVwJ/Af3RxzD3Bv+LqIIIGcEc6w2n7dxg7H7AsMM7PHwv1XuPsLEeLr062lqVQKgKqqql4f265qyXsUF8Gad96k6r3CmfdoS+pciJJWX1CdkyKOOkdJCju7+8kA7l4PzDKzbmdJDZfsxMxGECSHWe0JwcwOBb4MHNnhsPXAtcA8YA9ggZmZu/c4qD158mQqKnp3S2h5eTmpVIrKyspeHZft/T8sZtdRI5hy0IF9PkeuVVVVbVGdC03S6guqc1JsSZ2bmpq6/DEd5edt2sz2bn9jZpOA5p4OMrPxwJPAne4+P9x2OnATcKK713U4pAa4y93T7l4DrAS2/EGCLsycOZOzzz67z8evWt9E3domJqrrSEQGkSgtha8DfzKzZQRdQaMJxgy6ZGbjgMeAL7v7E+G2swjGI45291WdHHY+sDfwJTPbERgJLO+kXL8488wzt6jppTmPRGQwijLNxeNm9nGCL+xmYJG7p3o47ApgW+BKM7sSKAEmA28D95sZwFPufpWZ3QHMAm4Bfm1mzwBp4PwoXUf50p4U1FIQkcGkx6RgZgcBhwO/BB4GPmlmF7n7fV0d4+6XApdGCcDdz8l6e2aUY/rDOeecw0cffcRDDz3Up+Nr6sJnFPTgmogMIlHGFH5BMK3FqQSDwZXAt+IMKhdeffVV3nrrrT4fn5kITy0FERlEoiSFYnd/CjgRuM/d3yHaWMSgVlNbz9ZDyhg3Yki+QxER6TdRksJ6M/tv4N+Bh83sUoLnFRKrpbWNNz9swMaO1ER4IjKoREkKnyN4qvk/3P0jYEdy2Pc/EC39aC3NrW1M1MI6IjLIRLn76D1gdtb7mbFGVAB0O6qIDFaJHRs48sgjWblyZZ+OzUyEp0FmERlkotySWuLurbkIJpfmzp3b54fX2lsKk9RSEJFBJsqYwt9jj6LA1NTVU1QEu49WU
hCRwSVKUlhhZkeY2aBahPjGG2/k/vvv79Oxi2rXMGHb4QwpK+nnqERE8itKUjgAeArYYGZt4Z+C70666aab+pQU1mxIsaKhUQvriMigFOXuozG5CKRQ6ElmERnMogw0DwOuAo4Jy/8ZuNLd18Uc24C08XZUPaMgIoNPlO6jXxI8vHY+MJ1gKc6b4gxqINs4EZ5aCiIy+ER5TqHS3ffNev9lM/tnXAENdIu0LrOIDGKRJsQzs23a34SvB+w6B1GVlZVRUtL7u4dqausZXlHKDiOHxhCViEh+RWkpzAFeNLP2hQdOBq6OL6TceOmll3r98FprWxtvfljP5O230UR4IjIo9dhScPfbgFOAfwFLgVPc/daY4xqQ3vloHU0tbRpkFpFBq8uWgpl9GbjR3VvdvRqo7rC/BPiSu8+NOcZYLFy4kJqaGiorKyMfk7kdVYPMIjJIddd99DbwtJk9BTwNLCMYS9iZYG2FfwN+2NXBZlYG3ApMACqAHwD/BH5NsAZzNXCJu7dlHTMUuAsYS7Bmw3R3r+tb1bo3ffp0UqkU//Vf/xX5GK3LLCKDXZfdR+7+EMEX/5vAhcDvgLuBiwAHjnD3B7s591nASnc/Ajie4NbWOcCscFsR8JkOx1wM/CPcfwcwqy+VioumzBaRwa7bgWZ3TwG3hX966x7g3vB1EUEro5JgygyABcCngQeyjjkc+EnW/iv7cN3YtD+jsIcmwhORQSq29RTcfS2AmY0gSA6zgGvdPR0WaQA6jtiOBNZ0s79T1dXVPRfqIJVKAfTqDqTq91ay/bAy3vjHq72+3kDS1ynDC1XS6guqc1LEUedYF9kxs/EELYEb3H2+mf0ka/cIYHWHQ+rD7V3t79TkyZOpqOjdJK7l5eWkUqnIA80Njc3Uzf8nx07coVeD0wNNVVVVQcffW0mrL6jOSbEldW5qauryx3SPt6Sa2UV9uaiZjQMeA2Zm3cL6ipkdHb6eCvy1w2HPAid0sz9vanTnkYgkQJSWwpfp21xHVwDbAleaWfvYwKXAL8ysHHiDcMzBzB4DpgE3Areb2TNACjizD9eNZN68eSxatChy+UW14ZxHY/SMgogMXlGSwrtm9mfgb8CG9o3uPru7g9z9UoIk0NFRnZT9dPgyBZwWIaYtduCBB1JcHGWWj0BmXWa1FERkEIuSFF7Iep3YuR1cE+GJSAJEWWTne2Y2Bjg4LP+8u6+IPbKYTZkyhcbGRl555ZVI5Wvq6tmqvJSdth4Wc2QiIvkTZaD5OGAhcB7Begqvmdm0uAOL2/r162lsbIxUtq0tTU1dPRPHjKS4OLGNJRFJgCjdRz8EDnf3JQBmtitwP/BwnIENJO+uXseG5lZNbyEig16Ukday9oQA4O7/injcoKGJ8EQkKaK0FN4xs8uAW8L3FxBMlpcYNZoIT0QSIsov/s8DUwjWU1gSvv5inEENNO0thUlaR0FEBrkoLYWvuvvpsUeSY5///OdZtmxZpLIePrg2ccyIHkqKiBS2KC2Fk8xs0N1y89WvfpXTT4+W62rq6vnY1sPYqqIs5qhERPIrSkthJbDIzF5m0yeaz48tqgFkXVMz765ezzF7bJ/vUEREYhclKdweexR5MGPGDOrq6rjzzju7LVdT1wBokFlEkiFKUvhc1txEg8YTTzyRWVOhOx4urKNBZhFJgihjCkPCdRESKXM7qp5REJEEiNJSGAMsNbNagjGFIiDt7rvGGtkAkXlwTd1HIpIAUZLC8bFHMYB5bT1Dy0oYv81W+Q5FRCR2PXYfufvbwGEED6zVAUeF2wa9dDqYCG+P0ZoIT0SSIcosqT8mWCLzFIKWxXlmdl3cgcVtzz33ZMKECd2WeW/NetalWjSeICKJEaX76Dhgf+Bld683s2OB14D/jjWymP3ud7+jqqqq2zLtC+tMUlIQkYSIcvdRW/jfdPjfiqxtg1pmCU4NMotIQkRpKdwN/B7YLpwt9WxgfpSTm9nBwDXufrSZ/Q5ofyx4AvCCu
5+RVbYIWAa8GW563t0vj1SLPrj33ntZsmQJlZWVXZbZOGW2nlEQkWSIshznNeHqa28DHweucvceF9gxs28SJJB14XnOCLdvCzwJfK3DIbsRdFGd1Ksa9NH3v/99UqkU3/jGN7oss2iFJsITkWSJ0lLA3R8FHu3luRcTDE53nEfie8Bcd1/eYXslsJOZPUnwPMTX3N17ec1+VVNXzw4jhzJySHk+wxARyZlISaEv3P0+M5uQvc3MxgLHsHkrAWA5cLW732NmhwN3AQdGuVZ1dXWv42uf4qKrwebGljbe+Wgd+48b1uOAdKEZbPXpSdLqC6pzUsRR59iSQhdOBea7e2sn+14CWgDc/Rkz29HMitw93UnZTUyePJmKiopeBVJeXk4qlepyTOG19z8izSIO2PVj3Y47FJqqqqpBVZ+eJK2+oDonxZbUuampqcsf07lea/lTwIIu9l0FXAZgZvsC70ZJCHHRuswikkS5bikYwbKeGzeYPQZMA34M3GVmJxK0GM7NcWyb2LjampKCiCRHrEnB3ZcCh2S936uTMu3TcqeAE+OMJ9tTTz3FwoULu9zf/uCaWgoikiS57j4aMLbZZhtGjOj6VtOaunoqSovZeVtNhCciyZHYpPDee+9RV1fX6b50Oo3XBhPhlRQn9iMSkQTK9ZjCgHHCCSeQSqU4/vjNZwb/oGEDDU3NmghPRBJHP4M7sahWC+uISDIpKXTCtQSniCSUkkInauqC21EnaSI8EUkYJYVOuLqPRCShlBQ6UVNXz7gRQ9h6qCbCE5FkSezdR1dffTWLFy/ebHtjcytLV63j8F3G5CEqEZH8SmxSOOGEEzqdYfCtD+tpS6c1yCwiiaTuow4yE+GN0SCziCRPYlsKJ598Mg0NDTz55JObbK/RnEcikmCJTQpvv/12ZqGdbJoyW0SSTN1HHdTU1lNWUsyEbYfnOxQRkZxTUsiSTqdZVLuG3UePoLREH42IJI+++bLUrm1kTWOzFtYRkcRSUsjS/iTzJI0niEhCJXag+eSTT2bFihWbbGsfZJ6o21FFJKESmxS+//3vb/bwmm5HFZGkizUpmNnBwDXufrSZfRJ4GHgz3H2ju/8+q+xQ4C5gLNAATHf3zpdGi4mHs6MqKYhIUsWWFMzsm8DZwLpwUyUwx92v6+KQi4F/uPt3zewMYBZwaVzxXX311SxfvpzKysrMNq+tZ/RWFWw3rCKuy4qIDGhxthQWA6cAd4bvKwEzs88QtBYuc/eGrPKHAz8JXy8Arox6oerq6l4Hd8cddwBkupCaW9MsWdnA5NFDO50TaTAZ7PXrKGn1BdU5KeKoc2xJwd3vM7MJWZteBOa5e5WZfRu4Cvh61v6RwJrwdQMQebR38uTJVFT07td9eXk5qVQq01J4Y8UaWtNvULnLTpu0HgabqqqqQV2/jpJWX1Cdk2JL6tzU1HMnf7QAAA2+SURBVNTlj+lc3pL6gLu3p7UHgE922F8PjAhfjwBW5yowAK/VeIKISC7vPnrUzL7i7i8CxwAd2z3PAicQtCimAn/NYWzUZG5HVVIQybeWlhba2tp6dUxnc5kNdj3Vubi4mNLS3n3N5zIpXAzMNbNm4APgiwBm9hgwDbgRuN3MngFSwJk5jI1Fuh1VZEBoaGigpKSkV19mu+22W4wRDUxR6pxKpdiwYQMjRozosWy7WJOCuy8FDglfvwwc1kmZT4cvU8BpccaTbcyYMaxbty7zvqa2ntLiInYdFf3DE5H+1dLSQklJCcOGDevVcc3NzZSXJ2v53Ch1Li8vZ/369bS0tEROsol9eO3xxx/fZOTe69aw26gRlGkiPJG8aWtr63V3h3SvpKSkV11x+gYEPlzbyKr1KS3BKSKDTlFRUa/KJzYp/OUvf+Hll18GspfgVFIQkWRLbFK49NJLmTNnDrBxdlS1FETk/vvv59prr813GHmjzjuynlHQ7KgiA8o3H6ri3lff7rFcOp2O3E1y6r4785OTkvWgW28oKbCx+
0jrKIhIu+uuu47q6mpWr17NpEmTuPrqq6mqquKaa66htLSUoUOHcv3111NXV8fll19OaWkpbW1tXHfddeywww78+Mc/ztzMMm3aNKZPn57nGkWjpEBwO+p2w8oZPXxIvkMRkSw/Oaky0q/6devWsdVWW/XbdZubmxk9ejS33XYbbW1tnHjiiaxYsYLHH3+cqVOnMn36dP785z9TX1/Pc889xz777MM3vvENXnrpJRoaGli0aBHLli3j7rvvpqWlhTPPPJNDDjkEM+u3GOOS2DGFds2tbSxe2aCuIxHJKCoqYtWqVcyYMYPvfOc7rF+/nubmZi666CJqa2uZPn06f/zjHyktLeXUU09l5MiRXHDBBfzmN7+hpKSExYsXc8ABB1BUVERZWRn77rsvixcvzne1Ikl8Uliyai0tbWkNMotIxt/+9jeWL1/OnDlzmDFjBo2NjaTTaf7whz/w2c9+ljvvvJM99tiDu+++myeeeILKykpuv/12jj/+eObNm8duu+22cQbm5mZeeeUVdt555zzXKprEdh/dc889vP766yxa0T7IrKQgIoG9996b119/nc997nMUFRUxfvx4amtr2WeffZg1axZDhw6luLiY2bNnk06nmTlzJjfeeCNtbW1cfvnl7LXXXrz44oucfvrpNDc3c/zxx7PXXnvlu1qRJDYpTJw4kYaGBp6s05xHIrLRKaecwimnnNLl/rvvvnuzbb/97W832zZz5sx+jStXEpsUUqkUzc3NeG0wy6CN1ZiCiEhik8KBBx5IKpVi7CU/p6S4iN1GDc93SCIieZf4gWavW8Mu2w2nvLQk36GIiORdopNCWzpN3domLawjIhJKdFJoCWeTnaTxBBERIOlJIZ0GNBGeiEi7ZCeFtiAp6BkFEZFArHcfmdnBwDXufrSZ7QfMBVqBJuAcd1/RofzLQH34dom7nxdXbDNmzGDO4wv5AD2jICLSLraWgpl9E5gHtM8ydz3wFXc/GrgfmNmh/BCgyN2PDv/ElhAApk+fzro9prD1kDLGaiI8EYnJ008/zXHHHcexxx7Lr371q16Xu/zyy5kyZQrTpk3r87l7I87uo8VA9mOBZ7j7wvB1KdDYofy+wDAze8zM/mxmh8QYGy2tbbzb0MSksVv3erk6EZEoWltbmT17NvPmzeORRx7h4Ycf5q233upVuVNOOYV58+b1+dy9FVv3kbvfZ2YTst4vBzCzQ4EvA0d2OGQ9cC1B62IPYIGZmbu39HSt6urqXsf37e9+n7L3Gxh13qWZiauSQvUd/Aq5zrvtthvNzc2Z91OmTOm03IUXXsi5554LBCspvvjii5uV+eQnP8kNN9wAwPz585k7dy7PP/98pDgWL17MT3/6Uz744ANOPPFEVq1axbRp03o1h9Grr77KTjvtxHbbbUdzczPHHnssCxYs4Pzzz49c7hOf+ATvv/8+bW1trFu3LnNMdXV1pHM3Nzf3aobWnD7RbGanA98GTnT3ug67a4C33D0N1JjZSmAH4N2ezjt58mQqKip6Fcu/Fr9F6dpGDp20C5WVe/fq2EJWVVVFZWVyVp1KWn2hsOucSgXTzpSXl2e2ddWSr6io2GQNhc7KlZaWZsqUl5dTVFQUad2FpqYmvvWtb3H99dczfvx4pk6dyl577cVBBx2UKXPmmWdu8iXdbubMmRx66KEA1NfX87GPfSxzzfHjx/Paa69tFkNP5don4Ms+rq6uLtK5U6kUe++99yafaVNTU5c/pnOWFMzsLOBC4Gh3X9VJkfOBvYEvmdmOwEhgeVzxNLcGDynodlSRge3VV1/tscz111/f45f99OnTI69+9txzz7Hnnnuyxx57AMGv7fPO23SYc/78+ZHOVWhykhTMrAT4BfAOcH+4+tBT7n6Vmd0BzAJuAX5tZs8AaeD8KF1HfdXcFiQF3Y4qIh298cYbfOITnwBgxYoVDBs2bLPWV5SWwrhx4/jggw8y+1asWMG4ceM2OyZquWxjxozp9TFRx
JoU3H0p0D5gvF0XZc7JentmnPFka28p7D5aSUFENlVWVsaKFcEd83PmzNlkjKNdlJbC3nvvzdKlS3n33XcZN24cjzzyCNddd12fy2Xba6+9en1MFIl9eK2lNU1JEQwp00R4IrKpk046iZdeeonjjjuOSZMmsd9++/HDH/6w1+cpLS3lO9/5DhdccAEnnHACU6dOzXRJAXzhC19gxYoV3ZabMWMGZ5xxBkuWLOHII4/knnvuiXTuvkrs1Nm77f1Jilqa8h2GiAxA22+/Pffff3+/nOuoo47iqKOO6nTfzTff3GO5OXPm9OncfZXYpPDiw3cX9G17IiJxSGz3kYhIEqTDiT+jSmxSmDdvHg8++GC+wxCRLMXFxbS0xHbTYSK1trZSXBz9qz6x3Udz584llUoxe/bsfIciIqHS0lI2bNjA+vXrKSkpiTwFTXNzc+bBt6Toqc7pdJrW1lZaW1spLY3+VZ/YloKIDEwjRozIPH0cVW+mcRgseqpzUVER5eXljBgxolfnTWxLQUQGrt78sm2XPY1DUsRRZ7UUREQkQ0lBREQyCr37qATo0wDTqFGjaG5upqkpeQ+wJa3OSasvqM5J0dc6Z31nbjalQ1Fv72EdSKqqqg4H/prvOERECtQRlZWVz2RvKPSWwt+BIwim2G7NcywiIoWihGC9mr933FHQLQUREelfGmgWEZEMJQUREclQUhARkQwlBRERyVBSEBGRjEK/JbXXzKwYuAHYF2gCLnD3t/IbVbzMrAy4FZgAVAA/cPc/5DWoHDGzsUAVcKy7L8p3PHEzs8uBk4Fy4AZ3vyXPIcUq/Ld9O8G/7VbgC4P579nMDgaucfejzWx34NdAGqgGLnH3ti29RhJbCv8PGOLuU4BvAVu+0vXAdxaw0t2PAI4HfpnneHIi/ML4X2BDvmPJBTM7GjgUOAw4Chif14By4wSg1N0PBWYDvV9IuUCY2TeBecCQcNMcYFb4/3UR8Jn+uE4Sk8LhwB8B3P0F4ID8hpMT9wBXhq+LgKSsYnItcBPwfr4DyZHjgH8ADwAPAQ/nN5ycqAFKwx6AkUBznuOJ02LglKz3lcBT4esFwKf64yJJTAojgTVZ71vNbFB3o7n7WndvMLMRwL3ArHzHFDczOxeoc/dH8x1LDo0m+JFzGnAR8Bszi74oQWFaS9B1tAi4GfhFXqOJkbvfx6ZJr8jd258+bgC27o/rJDEp1APZq04Uu/ug/+VsZuOBJ4E73X1+vuPJgfOBY83sL8B+wB1mtn1+Q4rdSuBRd0+5uwONwJg8xxS3rxHUeSLBOOHtZjakh2MGi+zxgxHA6v44aRKTwrME/ZCY2SEEze1BzczGAY8BM9391nzHkwvufqS7H+XuRwMLgXPc/YM8hxW3Z4DjzazIzHYEtiJIFIPZR2xs+a8Cyuhk5s9B6pVwHAlgKv00Oeig7jbpwgMEvyCfI+hfPy/P8eTCFcC2wJVm1j62MNXdEzEAmxTu/rCZHQm8SPCD7xJ3H+wTRf4MuNXM/kpwx9UV7r4uzzHlyn8DN5tZOfAGQdfwFtOEeCIikpHE7iMREemCkoKIiGQoKYiISIaSgoiIZCgpiIhIhpKCSB+Z2QQzW9pDme+a2XdzEpBIP1BSEBGRjCQ+vCbSa+H8WDcCk4FxgAMzsvb/mmDagb0J5qD5vrvfGe4+KHxYcifgNnf/rpmNBG4BPgbsCDxN8NS1HhySvFJLQSSaQ4FUOOX67sBQwulSsnwsLPfvwLVZcy2NA/6NYFbLb4QTE54ILAzPtwcwBdg/9lqI9EAtBZEI3P1pM1tpZpcAkwi+yId3KHabuzcDy8zsWYJp2gEWuHsT0GRmHwLbuftvzewgM7sM2BMY1cn5RHJOLQWRCMzsZOA3wHrgNoLunrc7FMuebbc463329jRQZGZfAX4K1AFzgX8SzMUlkldKCiLRfAq4291vAz4AjmTz2Tj/M5yhdGfgYLqftfJY4H/d/TcEiWK/Ts4nknPqPhKJ5
mZgvpmdRrC29wsE4wTZhgEvEayD/UV3X2lmXZ3v58CNZvZ1ggVSngN2iSNwkd7QLKki/SC8++gv7v7rPIciskXUfSQiIhlqKYiISIZaCiIikqGkICIiGUoKIiKSoaQgIiIZSgoiIpLx/wEmRQcFMIP8sAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "oz = AlphaSelection(fitted_cv_model)\n", + "oz.fit(X, y)\n", + "oz.poof()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check if fitted on Clusterers" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "X, _ = load_credit(return_dataset=True).to_numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "unfitted_cluster_model = KMeans(6)\n", + "fitted_cluster_model = unfitted_cluster_model.fit(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "# NOTE: Not sure how to deal with K-Elbow and prefitted models...\n", + "\n", + "# visualizer = KElbowVisualizer(unfitted_cluster_model, k=(4,12))\n", + "# visualizer.fit(X)\n", + "# visualizer.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# visualizer = KElbowVisualizer(fitted_cluster_model, k=(4,12))\n", + "# visualizer.fit(X)\n", + "# visualizer.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# NOTE: Silhouette Scores doesn't have a quick method\n", + "visualizer = SilhouetteVisualizer(unfitted_cluster_model)\n", + "visualizer.fit(X)\n", + "visualizer.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "visualizer = SilhouetteVisualizer(fitted_cluster_model)\n", + "visualizer.fit(X)\n", + "visualizer.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXIAAAEKCAYAAAAPVd6lAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nOy9d5wkdZ3//6zOuacn57xTszkCu+wCS5IgoGJAQYJwcqDenXiecl9Pf5ze9xT1q57oGUBFQAE9MJ0iSFxy2Bxrd2cn59w51++P6llmZydv93T3TD0fDx7sdFd4V1fVq971/rw/77cgyzIqKioqKtmLJt0GqKioqKicGaqQq6ioqGQ5qpCrqKioZDmqkKuoqKhkOaqQq6ioqGQ5qpCrqKioZDm6dBuQbYiiWA0ckCTJNu6z64AfAtcBTUAz8LIkSedPWPcXwC1AgSRJAwtl8wQb7gHyJUn6zAzLfQXYK0nSH5K03xbgQ5IkvTOPdWuAb0uS9MEk2BACAoCAcv3/EfiyJElRURSvAS6RJOkfp9nGe4FzJEn6ypnYciYkjqMAKJIkyTvu85uBB4EPS5L0P0nc38PAvZIkHZji+xl/t0nWeRC4GbhYkqTnx31eDZwA/luSpM8klrsU6E8sYgD2AJ+TJKknsU4j8P+AisQyw8CXJEl6RRRFLfAH4FZJkvpma1+2oXrkZ4goin+PchFdIknSc4mPg0CDKIpV45azAtvSYOJ8uQjQp9uIBFWAmKRt3SBJ0jpJktYCZwPLge8CSJL0x1mI0VlAbpJsORMGgGsnfHYz0JvMnYii+BFgdCoRh1n/bpPRBnx8wmc3ARMF97uJc7YOWAkcAP6aEGmAJ4AHJElaI0nSGuDLwJ9FUcyVJCkGfBP473nYlzWoHvkZIIri3Sge9jZJklrGfRUDHgduAP4z8dm1KJ7BP49b/2rg31C8DD/weUmSXhdFsQj4CVAEFAOtwEckSepLeGMPAhcDlcDjkiR9QRRFG/ALYBkQB3YCfy9JUnwa+x8E3MBqFG/mCPBRFEHYBHxLFMUY8GfgXuACQAvsBv5RkiR3wp43gTXA/wEOJmwvTNjxH5IkPT5un9uBH0iStGri3wnP6meACcVjfiCxrQeAMlEUn5Yk6TJRFM9N2GNN7OMeSZL+VxTFW4DbEp+PSpJ04VTHDiBJkk8Uxc8ATaIofgnlHH1IkqSrRFG8NnFu4ijn819QvPk7AK0oiqMo5/ZHQAOKuHuA6yVJkkRRfBF4HdiaOE8vAzdLkhQXRfEq4D9QHCkfcIckSXunOq4pzH8ERQQfSvyOVYAN5RyO/da3An+Pcn3lAt+QJOlHid/pY4n9lwGdCdu6JtnPvwMfTmzvNeA7Y96+KIrfQDlPh8f9bptRhNMIlAB/kyTptimO4THgNlEUTZIkBROfXQf8himcTEmSZOA/E8dwKfDXxH6s45bZkXgAxcb9/WNRFNdKkrR3CluyGtUjnyeiKH4T+Drw/QkiPsZDnOptjL32jq2/DEUIrpQkaT1wO/BkwnP/KPC6JElbgFoUkb9x3LZskiSdB5wL/EMi9PABwJ7wWs5KLFc7i0PZCFyO4pmWoryW/xB4B/gXSZJ+B9wNRIGNCU+2C/jGuG0
ckCRpeWLZx4DfSpK0ErgS5aZzzMIOUMTyT5IkbUysez4gA38HNCVE3IXywLpRkqQNwDXAj0RRrExsYyWwfSYRH0OSpA6Uh9lEj/9bwKckSdqE4uFtlyTpTeDHKA/PLwFXACOSJG2WJKkBeBsYH7KqA7ajPCgvAi5IPKQfAW5JeI/fAr4xi+OayJ+BdaIoliT+vpGEqAMkHuyf5N3r6zoUgR1jK/BpSZJWoDz0vz9xB6IorgLM47zx+1EcFxLe8MdRHrLj+SfgK5IknQOsAK4RRXHjFMfQj/Kwe19im9tQHgpDUyw/nr0ovyvAp4H7RFHsEkXxN4mH89uSJI2OW/7PnP4Gs2hQhXx+WFEuoitRbsJ1ExeQJGknEBdFcaMoihUoIjv+9fRSFE/iOVEU9wC/QvHC6iVJ+i/gNVEUP4fySrgKxdsa4w+JfXSivIbmAq8AKxOe4N3A9yRJOj6LY/mrJEkhSZIiwH4mDxtchXKz7U7Y+n6Um3SMlwFEUcwF1pK4uSVJapckqU6SJPcs7AD4HfAFURSfRLnp/nGSN4otKL/b7xO2/AVF7Nckvt83h/2NIaM8LMfzGPA7URQfAFycKoIAJDzTB0VR/AdRFP8LRbTHn6c/SZIUlyTJAxxH+W23ojz49iS28aQkSVfM4rgmEgZ+C1yf+PujwK/H2eZFOW/vFUXxa8CXJtj2jCRJRxP/vh+4bJJ9NCbsHuM3wBZRFIsTyx+XJOnYhHVuBnJEUfw/KNeuZcJ+JzLe4TnF2ZmBk+dMkqRHUX67m1DeSG4FDiXi7WM0kbzwXMahCvn8CADXSJL0FIpX/ruEiE3kYZSL9MbEv8ejBZ4bi/0lPOnNwAFRFO8FvorisfwUeAblFXb8/seQAUGSpGagPmGPA3hWFMUPzfJYTtnWJMtogX8aZ+fZwPhtjw24RcdtBwBRwTzNPgxj/0iEEZahCMZ6YL8oinWT2HJ4kt/t6Qm2zIpxIYmm8Z8nPO6tKG8mtwCvi6KombDunSihID+KiD7KDOcJ5Tca//sIoiiumcVxTcZDwMcTIZkjkiSd9GRFUSxHGRSsQnnI/9uEdaPj/q0hEYaYQDxhF6CEonj34fEJlAfARF5GcXCOoFzDHUx+TY3xR+CchLNzPkqoZFpEURRQ3iT3i6LYKIriNyRJCkqS9KwkSV9JvNHs59RrVDvFMS4KVCGfH/GEBwtKiOEQ8OjEGx3lFfrDKK+1v57w3fPAexJxYURRvBLYhxIfvgzFo34YxeO+lHE31GQkROUXKJ7WF1EEYNX8Dg9QbvSxwc6ngc+IomhIHOP9KA+MU0h4wjtRPCsSN+ergHPcYv1ApSiKhYkb8v3jjuHXwHWSJD0GfAol5FExwZY3gGWiKJ6fWGcdcAwlLDQnRFHMAe5DidEHx32uS8T+rZIk/Thhy/KEDeNtuQx4UJKknwEScDUznCeU8YTloiiuTPz9PpTrZM7HlQj1mIH/y+me7CaU3/o/JEl6GsU7HwuJAFwsimJZ4t93AH+aZBdHOT08NxZeORdlkPEkifDQJuCLkiQ9iRJ/r2ea30SSpBDKm9hDKG8w0amWHWf/V4ABSZJ2oAzu3j7eaUk4VUXArnGr1jJu/GCxoQr5GZIYfLkJ5Ub/jwnfdaLE/I6N95YS3x1EiYs/JoriXuBrKF6+D8WT+bYoijuBJ1E8qvoZTHkI5YY5JIriOyhe+X+dwaH9KWHDzQnbWlAGOQ+heFj/PMV61wMfSRzTn4C/kxJpYgCSJB1CGcB8B0W8uset+zXghsS6b6Lc4C+hDKDGRFF8CyVb44MoA7F7Ud50bpQkqXWWx/UrURT3JH7bF1Hi2nePXyAhJp8Ffi2K4i4UL/TWhOg8hxL3vQ/4NvD3iVDIcyjCMe15kiSpF2UQ/JeJ9T4HfFSSpP55HtfDKCGQiZ7sMyjesCSK4m6UAdf+cfZ1AA+LongYqE4c70RbDwABURSXj/tsJ8rD7InxD7/Ed8MoD/hdiWvwX1E
e5LO5drczdVjlrsQ5241yDVaieP1j+7wIZdC0RRTFg8CzwLekcWmNKA/dpKVkZhqCWsZWRWVpkcj4+JAkSVfNYtnrUbKyPpVyw1KEqGRGfVqSpA+n25ZUoXrkKioqUyJJ0q+BPFEUV8+4cAaSCMV8AZhPnnvWoHrkKioqKlmO6pGrqKioZDkLOrNz586dRpTJKt0s4lQgFRUVlSSjRcmVf3vjxo2hiV8u9BT9s0hMHlFRUVFRmTPnoWSxncJCC3k3QENDAwaDYaZlk87R7gF2HGxlY92cU45V0siB1h5WVhaxoXbq8ybLMt/6wytcvLoWjSZ5EcPm5mZqamqStr1sIxSJEAhHCYWjBCPKf75whGA4wtHOQfQ6LWaDDkEADQIajQaDXotRr8Og02DQKv836nVok3he5kPXsBuHxci66iy8/+Mx9IERODVd9yQLLeQxAIPBgNFoXOBdQ/eIH6PBAJqZ5myoZBJms5m2QS9blk99zfiCYcJx0OiSW7AxEosvmevFH4ow5PUz6PbTN+qj3+0jFI1h1GnR67TotBr0Wg0GnRZBEOhx+6kpdBGVBWRZBlkmHo0SDYSJxOLE4sp/0ZhMJBbDZNCRYzGTYzPhMBmxm43odQsn7nE0DPnC2X4+Jw1JL6nqh20Do7js5pkXVMko8uxmDrRNX0raGwxj0i+py/mMicbi9I54aRsYoblvhHAkisVowGLUYTcZaSwvwKRXvO2J9Ax7ybGaybGaZrUvGQiFFW++e8hDS2SYYCSKxainOMdOodOKw2JCM91k/jPEoNMy6AnMvGAWsqSu/EGPn6qCnHSboTJH7GYjQ94AsiwjTKYqgDsQwqDLak9rQQhGonQOjNLcP0LHwChmgw6n1URDSR4Wo35S0Z6MYZ8fq3H2bz8CYDLoMBnelRwZ8IfCDHsDdAyMEkemKMdGkdNGvt2CJsmqbtTr8IciMy+YhSwZIZdlGXcghNmQKb0SVGaLVqNBqxHwhyJYTZOPrXiDYYyqRz4psiwz6PFzuLOfpp4h7CYDLpuZDbWl8w5tjHiDOCyz88anQgCsRgNWo3JOQ9EYo/4ghzr6iMbiVOQ5Kc93zumBMR0GnRZ/ODKtQ5CtLJkrPxiJggw6rZo6n42Y9Dq8wfDUQq565KcRjcVp6RvmYFsfo4EghU4r62tKMZxhXDoeB3cgTLHLniRLFYw6LYUOK4UOK6FojEGPn5b+EfLsZirzc8h3WM8o9KLc+zKRaAzDInvoL66jmQZPIITZsGQOd9Fh0GnxBEMUTVHaOhyNpT0rIlPweL187z+/SvORAxitdrZ94AY2nbV51mGTmfCFwmi1AtoUBrSNOi2lLjtFOTZGfEEOtPWh0UBjaT5FOfZ5H4tGoyEal1n4nLnUsmSufG8wvOiewksJg06LNxCe8vtYPJ70mGq2EY/LHOsa4Hs//Antzce5/l++RsPqdbz8218kTcQBPP7Qgg0sawWBPJuZ+mIXBXYrhzoGePVIGwMeP/OpLiIA8UVYlmTJCLk/FMGgXbyv3k1Hj3DrBy7n5Wen60OQvRh0Wnyh6YRcTmnGQyYjyzJt/SM88cZBdp3o4vzLr+LWu79GWUUFJrOFeCy5k6hHA4G0ZAjZzQbqil04zAb2NHfz1rEORnzBmVcchyAkUiUXGUvGRV3MHpvP6+FH3/wPYtFpa/JnNYIgEI9PfQMuvltzdviCYV453MqA20dVYQ65NstJ7/vIrjd569m/sOWK9yV1n8kY6JwvApBjNeG0mhjyBHjreCdlLjtiWf6sxr8EWJRCvmQ88mk0IOu5/3vf5Kyt56fbjJQiCAKxaW5ArSDM61U7W5FlJYzy5BsHicky62pKybO/K+IH33qVP/zsPho3nsPWK5PXc1iWlYHOdGd/CSjzC5YVuxj2BXnlSOuscsRlWUazyDJWYAkJuZJylG4rks9Tv/st7pFhPnjjrek2JaUIAtO63RqNsChjn5PhD4Z5Zu9x3jreiVhWQE1hDuP
HeTtPHOPPv/wx9as3cOl1txAOJm8STCweR0ZGp82Mm0mr0VCZ7yDfbmFnUxeH2pXUxamIyyAswjfzJRNa0czwap6tPPu/f8A9Msw/fPyDADz0o+9TVFJKw8qs7AMwJXJcnjY0pl0iQt42MMJLB5vJs1tZV13CZIk6bzzzR+LxGMf2vsN/7X0HgM9//0F0+jPP1QhFYhmZwuswG7GW6Okc8vLKkVY21pZhN092vIvTI186Qq4RFmUc9Uv3fpdYYjDr83/3cT5ww81ULxPTbFXyicO0N6DVZCQSXbyVkWVZZn9rL7ubuxHL8nFapq4788E7pmqneuaEotGMFHJ41zsf8QV542g7a6qKKcqxnvxelmWisThG3eKTvcV3RFNg0uuITfPKla3k5hec8rfd4UxLZclUE4vFps2UsJsMhBepkEdicV453Er3kIe11cUY9enLvgpFougyPF8/x2rCoNeyr7WHmoCLuuJcBEGZa2DU6xZl0sOSEXKbyUBokd7oY/zyT8+m24SUEYrGsE36qqxgMxkIRRZf1o4vGOZve48Tl2XWVhelXYRC0VhKJwIlC4tBT22Ri7b+UTyBEKurighHY1iSNN0/08jsR2sSsZmNyjR9lawkHI1hM00dTrCbjYvuQT3iC/KHtw9jNuhpLCtIu4iDUsFQlyXzMfRaDTVFLvyhCG8d68AXjGAxLr63VVhKQm40EInGlsSA2GIkGI7imM4jNxsIhqOLJkd42Bvgz+8coSTHRmWBM2MyrgKRKPoMjZFPhkaA8nwHGo3A7ubuM64zk6kszqOaBI1GwGLUEwyrXnm2EZflxGvx1EKu12ox6rWLIk4+7A3w550SZXnOpBemOlMi0RjaLBJyUHLOS1x2BOBI5wDB8MKWso3H4/ziB9/ljuuu4f+761P0dHYkfR/ZdUbOEJfVjC849TRvlcwkEI5gMepnzJZwWrL//I74gvxlp0R5nvOUjItMIS6DQIa8HswBAbBbjNhMBv66+9iCjqfsfP0VXn3hb/yfb3wXu8PBrx/4UdL3saSEvCLfyZB3cXYIWcwMeQKU5zlnXK4iz5HV59cXDPOXXRKlufaMFHEA5HgWyrhCKBJlWUkuWo2Gp/ccW7AstmOHD1JUUkplTR0r1m7guHQo6ftYUkJenudg1D+3Ijsq6WfIG6AyfxZCnu+ccxGlTCEai/O3vcfJtVkyLpwynniWzpAeC8/ZzUbqil1EY3FeOdK6IGMqAb8Pg1GpTWM0Ggn4fEnfx5IS8hKXHU8glG4zVOaIJxCiNNcx43IlLjueYPadX1mWeeVwK/F4fFYPLJW5EwxHsZkMaDUaBEGgoTSPjkH3jL1gk4HJbCEcVq7LUCiE2Zr8t60lJeSFThv+UGRRTgxarIy16CudhZda7LLjDYSzrhTD/tZeuobciGUFGe/tCllanMwfiuAcV7FRq9GworyAXSe66BgcTem+68Tl9HZ20HriOAf37GRZ48qk72NJCblOq6HQac3qOOpSwx+OoNNocEwzJX0Mg05Lrt3MSBaFz9oHRtnV3M3yisKMyBOfCUHQZGWpi0AkSo711NK7JoMOsTSPFw40M+JLnSacvfV8tl1yGV//18/h9Xj42G13JH0fS2Zm5xjLywto6R+hwJmhg0kqp9A95KGhNG/WzXKr8nMY8vjJtZlTbNmZ4w+GefHgCRrL8jClcdr9XBAEkLNQykOR6KTOgNNqotRl5/n9J3jf2ctT0i5Qo9Vyy6c+yy2f+mzSt31yHynbcoYilhXQP5r8wQaV1NA74mVFReGsl68tzmXA40+hRclBlmVePtJKns16yit/pqPXaIhlWegqGosTjcWxTzEzuMRlIxaX2dvSs8CWJY8lJ+QVeU6isfi0bcNUMoNoLM6wL0B9cd6s16kvyWPA7c/4OHlTzxD9oz6qCnLSbcqcMBt0RLNs0pXbH6LAaZ0ydCUIAstKctnf2sNgFjgBk7HkhFyjEVhZWUTHgDvdpqjMQM+Ih8q
CnDkVOnKYjZTk2Ogb9abQsjPDHwzz+tE2GkrzJ60nnsmYDHoiSe4Bmmo8wTBFObZplzHqdVTmO3npYDOxePYlQ2TZZZQcVlYUZPSNrqLQPexhTVXxnNdbXV1E17AnBRadObIs88qRNvJs1ikaH2Q2Rp2WbEr6isdl/KEw+XbLjMsW5WRviGVJCnldcR7eYFgNr2QwkViMvlEfy8sKZl54AmKpMg6SiQW02gdH6Rv1Zl1IZQyDXkcsnj0euTcYxmExYdDNPJg8FmLZ19qDO8vmmyxJITfotGyqL6OpZyjdpqhMQWvfCA2l+Titcx8ILHRaMeq1GTeLNx6XeftYB1UTemxmE0a9dtqemJmGOxCixDV9WGU8Rr2OIqeN3U1dKbQq+WTp5XTmnLOsnM5Bd8YPii1FZFmmtX+ELWLFvNYXBIE1VcW0DaR2osdcae4dIhyLk2eb+TU/UzHodFkj5LIs4w2G55xqXJ7noKV/mKEsGvhcskJe6LRRnuekPcWzulTmzoDHj16npa4od97b2FhXllEP6mgszltNnVQX5mT87M3pGKtFHsvAsNVERvxBnBYj1jk2k9BpNZS4HLx9vDNFliWfJSvkAOc2VtDSN5xuM1Qm0NQzxLbGqllPApqMohwbpbmOlE+/ni1HuwbQaTTkZFHO+GQIgtLoOrDANb3nw7A3SHWha17rluXa6R310pOhg+YTWdJC3pgYSOsdUTNYMgV3IMSIL8i6mpIz3tbWxkqaM+BBHY3F2XWii+rC7BzgnIjLasIfyuwGLYFwhJgsU+CY3wxujUagIs/BzhPZEStf0kKu1Wi4Yn0Dh9r7MjLDYSlysK2Xi1bVJqVJbmNZAeFoLO2Dnm39Ixh0Wmym7Es3nAyn1ZTxnbYGPQGq8p1nVL+mMMfGgNvPcBbUZlrSQg6wqrIIs0FPx6A6QSjdDHr8+EMRNs9zkHMiOq2Gc8XKtGcnHezonXFCSjbhMBsIZ3Aj81g8jjcYnlUzkunQCAIFTgtSZ3+SLEsdS17INRqBKzc2cKSzX23MnGYOtPXynnX1s8r5nS2b6sroHvakTXgGPX6GPUHy5/mKn4nYTEbCsRgZMo58GkPeIPkOCybDmdcELHHZOdo9SCTDyxIseSEHWFaSR5HTxgk1rzxtdA970AgCG2pLk7pdp9XEhpoSjnQOJHW7s+VIRz/5TitZUKF21mg0AjaTEX848ybUxeMyQ14/dcXzz3gaj0mvw2LU09Sb2dqgCjlK3vFVm0SOdg1mxWj8YiMSi7G3pSdlZUQvWl1Hx5CbQGhhz20oEuVYzxBlGdy6bb5k6oBnv8dPgSO5FSVLXHYOtmX2OJoq5AlKcx2cv7KKnU1dGX3CFiP7WnpZVVlEQ2l+SrbvtJrY0lDBwfbUt/UaT+eQG6tRjyFLao3PhRyrmWCGOT3RWJxhb4BlpbOvljkbXFYT3lAYtz9zp+2rQj6O7Str0Ws1am75AtI74mXUH+C9GxtSup8LVlYz4PEvaA2N1v4RXNbMb3AxH/IcFnzBSEbFyftGfZTlOeY8AWgmBEEgx2LK6MmDqpCPQ6fV8OFzV3G4Y0ANsSwAkViM3c3dfGjLKsyGM083nA6L0cCFq2o42Nab0v2MEYvFaR8YJd+RvdPxp8Ok12I3GfAGMyNOHo7G8ARCSYuNTyTPbs6IOQlToQr5BEpzHVywspq3jnWoWSwpRJZldjV1sboqdSGViWwRK/GHIvS7U98hqs/tw6DTYlyEYZUxinPteAKZUZisZ8RLVaELkz413StdVjMDbn/GhZPGUIV8Ei5cVUuh08re5u50m7JoOdI1gEYjcPWmxgXbp0Gn5QPnrGBXUxexFBd+au0fxpHl0/FnosBhxReMpL2D56g/SDQep7ZoftPxZ4NGI+AwG+kcyswp+6qQT4JGI/DRbWvwhyJqSmIK6Bpy0zkwyo0XrEtqzvhsWFFRiFiWz/621DYPaO0bmVUzg2zGZjKg1WrSOsszGovTPexlTVU
xOm1q5cxlM9Han5nhFVXIp8Bs0HPThes51jOoNmtOIm5/kL0tPdy0fT05aRoIvHpTI/1uf8pCLMFIFH84smim5E+FIEBxjpWRNGZzdA17qMh3kGtL/bXktJjoHclMLVCFfBoKHFauP28t7zR1Zl3HkEwkGI7wutTO+85qpDKNHXKsJgMf3LwyZSGWIU8Ai1Gf1eVqZ0uh044vTQOeYyGVZSULM8ZiNugIRiIZGSdXhXwGlpXkcc1Zjbx2pDVjRuizkVAkysuHW9m2vIoNdWXpNielIZZBjw9LirNwMoWx9Er/AodXFjKkMoYgCFiNBoYysIiWKuSzYGNdGZeva+CVwy1p8z6ymVAkysuHWjm7vpyLVtem25yTXL2pkSFvgPYkdxLqG/VhMxuTus1MRaOB6oIchtwL101HlmXaB0epyHcuSEhlPGaDnoEM7BykCvks2SxWcNm6Zew41IJHDbPMmmA4wo5DLWyqL+M96+rPqFlEsrGaDNy0fQP723oZ8SXPy+p3+7CbloaQA5TlOfAEw0QXaHZQ94gXs0GPWLYwIZXx2M0G+jJwzEwV8jmwRazkqk0irxxuzap+funCEwix41ALWxsruSzDRHyM8jwHH9y8gjeOthNKQoXESDSGPxRJSj31bMGo11LksjHkTf09MeQLEoxEWVdTgiYN15PNbGAwA+99VcjnyFn15Xx022reOt6hTuWfhp5hD68cbuWKDQ1ctLouI0V8jLXVJWxpqOSNo+1n3OPTH4pg0GmXxEDneKrycxjxBVOaU+4Lhekf9bKptnTB01bHMOp0Su58hk0WVIV8HiwvL+TOy86htX+EvS3dGXdS04ksy0id/exv6+UTF23grPrydJs0Ky5dW09Rjo3dZzgJLBCOoE+TyKSTHKsJo06LJ5CaMaRINEbHoJu11SVpHX/QaTUIgpxx9clVIZ8nRTk2PnPFZjSCwKtH2ghn2IlNB7FYnLePdzLiC/KZKzbPu/FtOtBoBK7buppwJMrhjvl3hPGHIyc7zS8lBAGqCl0pCTtEY3FaBkapK86l0Jn+Bh0GnY5AhrW6W3pXXBKxmgx84qKNNJbn89y+JjqHlm67uAG3j+f2N1HktHLHZWfjWuBsgmRgNui57ZKN9I36ODrPRhSBcHTB0uEyjdJcB9GYMkaQLGLxOC39I5S57NRkiGOg12rwZ1gu+dK84pKITqvhqo2N3HLhBo53D/LWsY6M7meYbGKxOHuau09WMbxu2xqMKSpctBA4LCZuv3QTncNujncPznl9fyictvhtutFpBJaV5tM76k3K9hQRH6U4x4ZYlp8x4yw6rSbjqqOqQp4kaopcfPaqc6kuzOG5/SfoXALNnPtHFS88x2ric1dvZUVFYbpNSgoum5nbLz2L9sFR2kfmFirwByPotUtLyN95/q98487riUbClOc6kAH3GcbKY/E4zX0jFDgsLC8vyBgRB9BptWmtL9qxgHoAACAASURBVDMZqpAnEaNexzVnLeeWCzfQ0jfMSwebGVjAiRILxag/yKtH2jjQ1suHtqzio9vWYF1kdUXy7BbueM/ZDPtCHJ5DF/W4LKNZTA06pyESDvP8E7/iuf95+ORnGo1AY1kBfSPeeWewRGOKiJe4bKysKMwoEQdlPCDTSlyrQp4Caopc3HX1Vi5ZU8e+1m5ek9pw+zOjbvOZ4AuFeetYB28ebWfzsnI+//5ti8YLnwyXzcw1qysY9gTY2dQ5q5s3JstoyCzhSRW+0WGGers598oPnPJ5kdOGUa9j1Df3az4YiXKid5jyXAeNZZnliY8nw3Sc7A1mZjgajcDGujLWVBXz5rEOJQRhMVFXnJd1XWNGfUGO9wzSN+rjvBVVbG2sSnlHn0zBatRx5+UbePzV/bx8qIXNDRXTjgHIsswS0XFyCor40Kc+z77XXzrlc0EAsTyfPSe6cVhNs/YW3f4QXcMeVlYUUpbnSL7BSUIA5LRXYT8VVchTjF6nZdvyKjbWlbKrqYt
XjrQSi8epKnBRXZCDNkMzHOJxmY7BUZr7holEY2wRK7lx+3ocSc7hlWUZTyBE97CXQa+fUV+QEX8Qtz+I2x8iGIkSl2VkWUYraNBoBCxGPQ6LiRyriRyLCYfFSKHTRnGOLSUDrWaDnpsuWM8ze4/zwoFmtjRU4LRO3jRCIwgZ562lgzybBbvZxKDbT8EMjossy/S7/YwGQpy9rJycKX7bTEEGhAx7WqtCvkCYDXq2Lq9ii1hJU88gr0ntPL3nGEU5NspyHRQ6bWmPrcZlmUGPn85BN93DHsrznFy1SUQsLUhaSt3YA6KpZ4jW/hHaB0aJxGI4LSbMRj0mvQ6zQU9Rjo3qQhd6rRaNAAgCsiwTl2VCkRiBcIRAKMKQJ0AoEsEbDOMJhMm1manIz6G6MIdlJXlJS4PUaAQuX7+MEpeNJ984xOqqIsrznJMul2neWjoQBFhZWcCrR9pxWIwYp8jkicdlOobcaASBrWIFpmx405NJ+706EVXIFxhNIkVrWWk+w94A+9t62N/SyztNXeQ7LBTn2Ch12TEsUApfJBajZ9hL97CHAbePXJuF1VVFXLt5BYVOW1L2EYpEOdE7xMH2fg6196HRCOQ7LOTZLJy7vBKLQT+nWKjVCHC6QMfjMqP+IIMeP28cbed/3zmCy2ZmTVUxYlk+ZbmOM465rq0uIc9u4aEXdzPg9rO6suiUtyqdRiAUSW0buWzBZjKwrDiXtoERaopcp/mwgXCEjiEPeTYLqyoLsyb/XkZOS52X6VCFPI24bGbOX1HD+Stq8ARCHOse5EBbL3/b14RBp8VhMeEwG8i1Wci1mc946ncsFmfIF2DIE2DUH8QTCOEPRagpcnFOQwUNSfRgQam38uaxDnad6MJmMlDotHFuY2XKOudoNAIum/nkMciyzIDHz7GeQV6T2jDqtGxdXsX6mhIsxvnbUJ7n5J/eey5/fPsIz+1vYkNt2clxD4vRgCeYedXxUsmaLRewZssFk35XXeiid9TDoDtAvuPd89I36mPEH2J5eXIesAtJJBrHbMgs6cwsa5YwdrORDbWlbKgtJRaPM+D20zXsoWNwlLb+Ud4+3oFGo8Gk12HUaTHotYku7To0gnDSQxiLJ4ejMUKRKKFo7OS/I9E4hU4rVQU5rKospMRlp9BpS6onFIvHOdjex6tHWukZ9lKR72T7qpq0DI4KgkCBw0qBw4qcCBu9c7yTp3cfY011CVvECspy5zeoZjUZ+Oi21Rzq6OPJNw5R4LCyurIIi1GfcXU40olGA6urinn1SDt2i4F4PE7HkAen2ch5yyuzI5QygWgslnGD/aqQZyBajYaiHBtFOTbW15QAStjAGwrjDYTwBsN4g2Hc/iCeYBg5LhNLjLBpBQFBI2A16nGYTdjMBuxmI3aTAavJgFaTmtdXWZY51NHHU7uOnexovrY6PaVGJ0MQBPIdVvIdViXFrWeInz7zNstK8njPuvp5hZEEQWBlRRFVBS7++PbhRIkCO9GYKuTjsZkM1Be5ONjeh9moz0ovfDzhWBxzhpUpVoU8S9BoBBxmY9KzRs4UWZY50TvMU7uOMhoIsqJc8fQzGZNep7R6K83naPcAP/jLm6yvKebC1bXzaghtMxn42LY1HOro46EX99A+MEpNkSvjvLZ0oGSk+BjyBTDotdQXuSYdJM4Wxt52My20kh2jCyoZiTsQ4pEde3lkxx4KnFYuWlWb8SI+Hq1Ww/LyQi5ZW0e/2893/vQaLx9qmVdN8jHv/K6rzsVpNXGgrY9j3YNJaVaRrQx5A+xp6WHYG+KSNXXccuEGBrwBRv3Z22ErGo+j0wgZV4ZBWMha2jt37qwGmhdshyopQZZlTgx6eaWpj1yLkepca8alY82HQCTK0T4PFoOW7cuKyTHPfUA0Go/zi9eb2FDuonnIR+uQjxyznkK7Ge0i+I1mgy8cpccdQCMINBY5KLKbToZR+jxBdrcPUVdgyzgxnA3+cJQ+b5AL6osWdL96rYaGQgd
AzcaNG1smfp+W94NVq1ZhNGZWiEBldniDYX735iFa3DKXb1634M1vU82aFTLHuwfZ0THEJWtKMPkHOGvTpjlt47WeMGWl+axZZcEbDLPrRBcneofIs5kozXUsyjZwYxlCPSNeomi4cH01y0ryT3vANwCOvB4OtPWxtqIo6xyAziE3Yr6Ohobqhd1xPAa+qatxZlagRyWj6Rxy89ALu8m1m7lodW3KBk7TiSAoef4luQ5eOdKK7HezZm10TjNGK/OdDHkD5Nkt2EwGzl9RzYaaEqSuAQ539GPQaynOsZFvt2TtgN8YoUiU7mEv/W4fLquZTXVlVOVPP2N5dVURg94AUtdAop7KAhp8hviCYcpKMy/Gv/juRJWUsK+lm58+8zb1JXmsrS5ZlCI+HpvJwAUravCHo/z3X9+cU+ebivwcRn2BU7dnNrKxroyPbVvDproyRnwh3mnqoqVvJKmNGBaCeFxJ5Tzc0c/elh4cFiNXbRS5+qxGaotyZyw7IQgC25ZXoREE2gdHF8jq5OAPR8mzZ95bqOqRq0xLPC7z7L7jvCa1s7mhIis7/8wXjUZQ4pI2Cz986g0+fsE6aotyZ1yvNNc+ZT1urVZDbVEutUW5DHr8HO0aQOocII5MjsVEnt2M02LKOE89Eo0x4PEz7FNq4OTZzSwvK6CuOHdes5D1Wg2Xrq3nD28dxmLUk2/P/EJy8bhMIBTJyHtAFXKVKYnHZZ588yDHugfZvqoGUxZ3/jkTGkrzcVpM/PKF3Vy3dfWMpXuLnDZ8wTCxeHzaN5c8u4UtYiWbGyoY9ARoHxylpXcIqXMQp9WE06Kkm1qMcythkAyisTieQIhRfwh3IEQgFKE8z8HaqmLK8hxJSa20mgxcuraep3YfRafRZHyxLF8ojMNizMhB2qV5Z6rMSCwe57evHaRtYITzl1dnbJXGhaIox8bmhgoee2U/H9qygjXVJVMuq9dpKXbZGHD7KcqZeaKRMlnJQr7DwvqaErzBMJ2Do3QNeWjqHcYXDGM1GjAb9dhMeuxmIya9LikzcmVZJhKL4wuG8QRC+EIR/KEIkVicPLuZIqeN1VVFlOTYU3INFDitXLq2nmf2HEcsy8NpyVwxH/YFMza9VhVyldOIx2WeeF0R8a2NlYs+Hj5bXDYzWxsreeKNQ+i02mk989WVxRzu6JuVkE/EZjIglhUglhUAEI5EGfIGGPQG6Bvx0tI3jDcYAWT0Wi16nRa9VoNeq0Gn1aLRKEVWFS9eRpYVwY7FFdGOxGLK/6MxItE4ep2WHKuRQoeN+pJ8cm1mnFbTgs3KLXHZuXhNLc/tO0FjWT4OS2ZmtI14lQlvmYgq5CqnIMsyf3rnCM19w2xrrFJFfAJOq4nNDRU8/up+bty+jvrivEmXE8vy2XGoGVmWzzgsYtDrKHbZKXbZWZl4eMiyTCQaIxCOKiV9wxEC4Sj+UIS4LBOPKyV/EZSyDRqNgE6rwWIwYDYqpYKV/3QZcY7L85xcuLqGF/Y301CWR06GeeaRmFI6WfXIVbKCN462s6+1h+2rapZ8OGUqXDYlze7XO/byqcvPId9hPW2Z4hylGJk7EEpJuEAQBAx6HQa9bsomF9lGZX4Ol6yp49l9TYla8plzXINuP2V5jowttZuZVqmkhaaeIZ7adYwtYmVGDuhkEgVOK3VFufzyxd0EwqenDwqCwOqqIjoH3WmwLnspy3PwnnX1J1sLZgrDviA1ha50mzElqpCrADDo8fOrHXvYWFeasnrhi426kjxMeh2Pv7p/0vosy8sL6Rv1psGy7KbEZefKDQ10Drlp7htJe+u8eFzG7Q/Nu+TxQqAKuQrRWJxHXtpDTaFrXoNzS5n1taX0jnh56dDpJYRqCl3E4vK8uskvdfIdVt5/9gqisRiHO/qIzaOQWbLoHfVS7LJlXOna8ahCrsJLB5uJxOLUl0w+cKcyNRpB4Kz6cl480EzPsOeU77QaDeeKlRzvGUqTddmNxajnyg0iBU4re1t
6CKapkmTfqI+VFQtbJGuuqEK+xOkacrPjUAsba0szbjZhtmAx6mkozec3rx0gGju1X+fGulK6hz1q16B5otNqOG95Neuqi9nb0sPIAr/deAIhZFmJ3WcyqpAvYaKxOL997QCNZfkZ/dqYDdQWuQhFouw41HLK5w6LieXlBTT3DafHsEWAIAisrCzi0jXKIGhr/ygLFWnpGvKwsrIwYzpdTYUq5EuY1460EY7Fqc7g0fhsQRAENtaV8eLB5tMKbJ0rVtLWP8pC1v5fjJTlOXj/2SuIyzJ7W3rwBVNbbCwaizHiD7IsC0KOqpAvUfyhCM8fOMHa6mI1pJIkLEY9VQU5PL3n+CmfVxfmYDXp6ZoQQ1eZO3azkSvWL2NDbQkH2/tS6p13DHqoKcyOln2qkC9RXjrYTL7dknE9QLMdsTQfqbOfzqF388cFQeDy9Q0cbu9TvfIkIAgCjWUFXLs5dd55JBqjd9TL+trSpG43VahCvgQZ9QV5/Wj7yeneKslDp9VQX5zLX3cfPeXzxrJ8cu0WWvpG0mTZ4mOid97cNzKvfquT0do/ilianzWOjirkS5AXDzZTlmtXBzhTRF1xHh2Dbpp73x3gFASBKzc0IHUNEIvHp1lbZS6M985NOi1vN3XROeg5o3BLMBxh2Btg3TQVLjMNVciXGP5QhJ0nOhFL89NtyqJFoxGoLcrlVan1lM+rC11UF+TQ1K3mlScbu9nIRWvquGpjA5FYjF0nOukb8c1rVmhL/yirqoqyytFRhXyJsbelm1ybBVMWDOBkM9WFOUidA7j9p+Y9X75+Gcd7h9I2uWWxk++wcvn6ZVy8up5hX4A9zd0MeQIzr5jA7Q/hC4ZZlWVhR1XIlxCyLPPy4VbqimduV6ZyZui1SoPlt493nvJ5scvOuWIlu090pcmyxY8gCJTm2nnf2cs5t7GSziE3e1p66BnxThtDj8dljnUPsrWxcl7t69KJKuRLiKbeISLRWFb0R1wM1Bfn8brUdtpsz4vX1BKXZdr61YHPVCIIAtWFLj60ZSVbxUqC4ShvH++iqWeIYPj0N6KW/hGKXfasnFehCvkSYk9zN+V5TjVvfIFwWk3otBqae0+Nieu1Wj5y7moOtPURnKQErkpy0Wg0VBbkcMWGBj5wznIKHVb2t/VyoK2PQY8fWVZCKsPeAFsbK7Py/lCFfIkQj8scbOujPD+za0YsNgqdNg539p/2eUW+k3MbK9nd3J0Gq5YuTquJsxsq+Nh5a1lXU0L/qJ+3jnfwmtSGWJqPQZeddfhVIV8itA+OotdpsRrVWuMLSXmegwNtvZNOBLp4TS3I0NQ9mAbLljZ6rYZlJXm8/5zl1BXlsqqyCK1W4KldR3ldauNElg1IZ1dEX2XeHOnop2CSlmQqqcVhMRGNyXQPeyid0JhAr9Vy4/Z1/PCvb+KwmChwqudnoWnqGUKr0fD5923DbNDjC4Y51j3IgfY+nt/XhMVowGkx4rKZybWZsZuNGRl6UYV8ibC/rZcV5QXpNmNJUuCwIHUOnCbkoKTLXX/eWh5+aTcXrKxR35gWkL5RLyd6h/j0FZtP1lOxmgysqylhXU0JkViM9oFROgfdtA6MsutEN95gCIfFiN2kiLvTYsJs1GPUaZMm8LF4nEA4ii8YZsgbwO0PEo2E2V45deNnVciXAP6QMlPNZTOn25QlSYHTSnPfMBdO8f2ykjwuW7uMFw42s31lTcY2+F1M+IJhdjZ1ceP2deRNkcWl12qpLcqltiiX8xKf+UNhuoY9dA95aB0Y5VBHH55AiFAkhkmvw6jXYdRrMei0GHQ6dFoNGkFAEJQmJDJKGrAsy8RlmVAkRjgaIxSJEo7GCEaiRGNx7GYDORYz5fkO1lYXU2gz0dt+eheqMVQhXwL0jHhwWjLzlXApkGuzcLBNKZg11Tk4t7GS7hEPbx/vYHNDhXquUkgkFuM1qY3L1i2jvnhuJWotRgP1xXnUF+edFPexbfqCYTyBMN5gCE8gjCc
QIhiJEo/LxGSZWDyORhDQCgIarYBG0OAwG7GZDNjMBuwm5d8Wo/608x8Khehtn9quaYVcFMWvTPe9JElfnfnQVdJN95AHW5YU/1mMWIx6YrLMqD9IjnXytyJBEHjf2cv5xfO72NnUxcY6tWNTKojG4rx6uJU1VcVsESuStl29VkuO1Tzl+U01M73D6YAvAFpAmOQ/lSygpX8El9WUbjOWNE6Lke5h77TL6LVabt6+Ho1GYHdzt1ryNsnEYnFeO9JKfUkeV29qXFQPymk9ckmSviKKYingkyTpmwtkk0qS6Rpys7oqs5vHLnZsJgM9wx6WzzDgbNTr+MRFG/j5czvZdaKLDWov1aQQTYh4VaGLa89ZiUazuH7T2YyqfA5QZy1kKXLilV7NhkgvFqOBEf/sGgebDXpuvXgjWo2GnU1dqmd+hkRiMV453EJdcR4f3rJq0Yk4zELIJUlyS5L08EIYo5J8gpEocRn0WTpjbbFgMejn1AHebNDziYs2YDLoePVIG5FoLIXWLV68wTAvHmhmeXkhHzhnxaIUcZiFkIui+AlRFM8a9/fXRVG8NbVmqSQLbzCMOcsquS1GzAYdo7P0yMcw6nXccuEGlpXk8cKBZryBUIqsW5z0jnjZcbCFi1fXcc1ZjYtWxGEGIRdF8R+AOwD3uI+fAu4URfHOVBqWTXR0dPCRj3wkadu7++672bFjxymf9ff3c88998x5W95AGKMhu4X8/959FzdffQk3X30Jf3ftFek2Z16YDXo8/rkLsU6r4ZqzGrl8/TJePtxKj9rAeVYc7x5kb0s3N1+4nnOztBDWXJjpDr8NOF+SpJNCLknSDlEUrwCeA36USuNU3qWgoGBeQu4Ph9Fn8QSTeDxO24njfOoLX2LV+rPI1vvRaNDhC82v0qEgCJy9rJwCh5VHduxh1B+koTR/0YvTfIjF4+xp7iYUifHpKzZPOdlnsTGTkMfHi/gYkiQNiKKYlY0Hn3zySV544QWCwSD9/f3cdNNNPPfccxw7dowvfOELRCIRHnzwQTQaDRs3buTzn/889913H62trQwPDzMyMsINN9zAM888Q3NzM/feey/5+fkMDQ1xxx13MDg4yPbt2/n0pz9Nd3c3X/7ylwmFQhiNRr72ta8Ri8W48847ycnJ4fzzz8disfD73/8ejUbD6tWr+bd/+zcAHn/8cR544AG8Xi/33HMPubm5fO5zn+M3v/kNV155JZs2beLYsWM4nU6+853vYLFMfsHG4jKaLL7hezrbCQYCPPHIgzzxyIN88OOf4JzztqfbrDmjzOqTp50UNBM1RS4+c8Vmfv3KPnYcamFjXRk2kzqIPcaA28+uE53Ul+Rx7TkrMWX5m+hcmMlVi4qieFrPI1EUi1Byy7MSn8/H/fffzyc/+UkeffRRfvCDH/DVr36V//mf/+G+++7jwQcf5NFHH6W3t5dXX30VAJPJxM9+9jMuu+wyXnrpJX784x9z++238+c//xkAv9/Pt771LR577DFefvlljhw5wr333suNN97Iww8/zG233ca3v/1tQAmT/OxnP+OTn/wkTz75JF/+8pd5/PHHqa2tJRpVKq6tXLmShx56iI9//OM8+eSTp9gfDAa5+uqrefTRR6mtreXxxx+f8ljPRDgyAVmGCy67kk9+9gtsPu9C7v/uvYwOD8+8YgYiIBA7wy7vLpuZO99zNlsaKtlxqIXj3YNLPqtlzAvffaKLazev5GPb1iwpEYeZPfIfAH8RRfEu4C2USUCbgP8H/DTFtqWM5cuXA2C326mrq0MQBJxOJ36/n6GhIW6//XZAEfy2tjYAVqxYcXKd+vp6AJxOJ6GQEvdsbGzEbleK2qxevZrm5maOHj3KT37yEx544AFkWUanU37u8vJyDAbFk/r617/Oz3/+c775zW+ybt26kzflypUrAcjPzycYPHWQTKfTcdZZyvjzhg0bTounj0dAqe+QrZRVVvGxW+/AbLFgtdn5w+OP0NPVgdOVfV1
cgKSEhjQagfNXVtNYns9vXzuwpL3zk154cR4fv2DdkvwNYOYJQQ+JomgEHgbG5rOeAL4tSdJPUm1cqpjKQxUEgZKSEn7+85+j1+t58sknWb58Oc8+++yMXm1TUxM+nw+j0ci+ffu47rrrqK2t5dZbb2XDhg00NTXx9ttvA0rHkjF+85vf8O///u8YjUZuu+02du/ePa2NANFolCNHjtDY2MjOnTtPPlimOqZs9theee4Zfvb9b/Ole7+HdGAfBqOJsorKdJs1Z2RZRia5Ya5Cp407LzuHV4+08uy+JkpcdlaUF2Rdv8n54A2GOdjWizsQ4trNK1lZUZjVb55nyky1VkqBKwAv8AvgXyRJys732lmg0+m45ZZbuPHGG4nFYpSVlXHFFbPLknA6ndx1110MDQ1x5ZVXUl9fzxe/+EXuueceQqEQwWCQL33pS6etJ4oi119/PVarlaKiItauXXtaKGUy7r//frq6uigtLeWuu+6acjmDTkssnpXDGQBs2X4xRw/t51tfuRuny8Vn7v4yNocz3WbNmWg8jl6rSbrYaDQC562oZm11Mc/tP8Gz+5qoLnQhluajzeJB7qkIhiMc6uind9TLBSuq2dpYhXEJPLhmQpjOWxNF8WlgJ7ADuA6QJUmadw75zp07q4HmVatWYTSqRZzmy0UXXcRTTz01q9+wY9DNQy/u5qLVtQtg2eLj6NGjNDQ0nPF23IEQu0908cUPnJ8Eq6am3+3jb3uOc6RrgGUledQWuhaFoIciUY52DdI+MMLZDeVsX1m7pMIooVCIAwcOANRs3LixZeL3Mz3KyiRJugxAFMXngD1Jt1AlpdhNBkJZ1LJqsRIIRXAsQAXKAoeV689fS8egm2f3HufpPccpzbVTX5yblRUwBz1+mnoG6Xf7WVtdwoe2rFTr6k/CTEIeHvuHJEkRURTD0y2ssjA8//zzs17WajIQisSIy9mdhpjtBMIRHJaFq0BZnufglos2MOjx8/bxDl490obFpKe2MJeSXHtGXwuxWJyW/hFa+0fQagS2NlayvqYU6xLywOfKXINL2TtqNkdi8Ti7T3Tx29cO4guHCYVjdA97iMXjFDqtbF9Vy5qqYvLsZqoKcjJ2oEWn1WAx6glFoifbWaksPIFwlMI09OTMs1u4fH0DF6+p42BbH68eaWVvSzcFTivFOXaKXTb02vRnEgcjUToH3fSNehn0+KkvzuNDW1ZSX5y3qKfWJ4uZhHylKIonxv1dlvhbQImXL4rAqycQ4mB7H0+8fpA3j7UTjsaQZabsot017GFPS8/Jv3UaDYVOK1UFOVy5oYHtq2sX5DV6tuTbLbj9IVXI04gvGCY/jT1T9VrtyV6Uw94AR7sG2N/Wy96WHlxWpfFzgcOK02paEG89Fosz7AvSN+qlz+3DHwwjlhVw0epa6ovzVO97jswk5Gc+ypPBBMNRvvzo33jhQDOR2LuZHTqthgtX1bJteSW5NjPLSvJxWc2YDDoCYaWdUzga582j7fzpnSO09A3TNeyha9jD60fbaXx5L1/+8IVUFeRgNujT3oOxosDJgNtHUY4trXYsZTzBECWuqZvnLiQum5lzGio4p6GCQDhCU88Qhzv6OdDWy7AviMNswG42kmM147KZsRr1GObZXFjpSxnFGwwz5AngDgRx+0P4QxEKnVZqilyct6Ka6sKcjHgzyFZmyiNvXShDFopgOMqRzn7issyDz+/ipUMtgFJm9IoNy7jxgnXUFOVOub7ZYMCcmMxTvmUlH9yiTNwZ9QX46TPv8MSbB5E6B7jhe78lJ+HdXH/eWq7dvCJtdR8q8pw09QylZd8qSpjOGwhTnCFCPh6zQc+qyiJWVSqNR4LhKD0jHrqHPbT1j3K4ow+3P0Q4GsNk0GHUaTHqdei1WgTh1PkOsgwy8ruNhMNRQpEYFqMep8VIZUEOKysKKcm1U+i0qsKdRJZMAmYwHOWBZ9/hsVf34w+FicVlXDYzt128gYaSfLavqj2jab1
Oq5l/+cB5/PP7ttE15OZVqY37//Y2gXCUHzz1Bj9+5i3es7aez1517oJ7xiUu+7wq76kkh1F/iFy7GUMW1IQ3GXRUF7qoLnSxRXz380g0hjeoNBT2BsP4QuGTTYXlxEC6RhDQaTVYTQZsJiMOswGL0ZD2N9KlwJIQ8j3N3Xzlsedo7R9RGuEm6l389yevZkXFaaVkzgiNRqA838l1+aspy7XzhYeexqDXkmM28be9xzEbdHzlIxcldZ8zUeCwEooqXlI2iMliY9Djpyo/J91mnBF6nRaXzaym/mUoi/5R2TPs4dYf/o5wNMYN563FH4pQnGPjl//wwaSL+ES2La/md1+8gY21pfS5fURicY52DdI15Kalb5jekemb8SYLjUagrjiPbrWWdVroH/WxrDQv3Waohk8wrgAAIABJREFULGIWrZB3DSnVd21mI9UFTsrzHPzq5b2ct6KK3999A+tqShbEjqIcGz+6/Ro+dfnZAEhdA9z0/Sf410ee4dpv/prfvXloQWqhrKosVJsSpIFYLM6QV0mnU1FJFYtSyH+1Yy9X/efDvHyohdt/9Hta+kdoKMnjqk0i3/3ElQuehicIAn//nrP59s2X84vPXIssywy4/TSU5HHP489z9yPPpLwno1iaz4DbTzyLC2hlIz2jXsrznWo6nUpKWVQxclmW+ckzb/Ojp9/igpXV/PiZtzjaNcB3PnEl21fWpNs8Ll2rVCm87+/ey192HeVzV2/j58/v5L6/vIE3EObbt1yesoeMw2KiwGmhf1RNQ1xIuoc8rK9dmLc/laXLovHIZVnmO396lR89/RbXbGokHIlxqL0fm8mAJ8Oa1u441MojO/ayp6Wbohwbd111Lt5gKOUhlrVVJXQMjqZ0HyrvEovH6XP7aCxL30QglaXBovHIXz/azkMv7uG6rav54vvP42/7mvCFwhxq72dZSWbFJ2/avo4/vn2EL/3qbwx5A5y9rJyfffoD6LRaguEoGo2QkuyS9bUlPLu/iUg0hl7NXkk57QOjVOY7l0zfSJX0sWg88i0NFfzXre/li+8/D61Wg0GnZV9rL5+8dFPGeUQWo4F//+hFdA17WFFRwI5DLTy16xixeJzPPPAn/vWRZ4jGkl9D3GExsbysgOa+RVtSPqNo6Rtha2P2NcFQyT6yXsgPtffR3DuMIAict6KKO3/6Rx57ZR//+cSLiGX53HbJxnSbOCln1Zfz0W2r2XWiG7E0n3t//zKDHj/bV9bw7L4mfvH8rpTsd4tYSVv/aFZ3DcoGRnwB4nKchtL8dJuisgTIaiEPhCN88eGn+dyDTxGPy/xqx17ePPb/t3ff8VFWacPHf/fUdFJIAiGQQhkCoSUgINKLiIqIiG0fC7iyyuorumtblWWB3XV9Vl0fXRurri6slSaKigiKoCI9ITgQklDSQ3oymXq/f0wyJJDQTJmB6/sP5C5zn8knuXLmnOtc5zhFFTWUVNXy/6aN9OplwPOnDmfWyP48NH0UdoeTP324idvGDGLK4F68+uV2DuWfaPVnxkeF4mfQUVRR0+qvLU7KzC9lZJ8eaDU+/SsmfIRP/5S99NkPHC2p4IkbxlBttfHy5z8ytn88908bwad/uJ2RJu/+WBsS4MdTN45nRJ/uPHr9aK5J7QvA4zPHEuxv5On/foXd2bppiYqiMGlgTzKOF0mvvI1UW6yUVNVwWe/Yjm6KuET4bCDflZXH8i17uWnUAIb1iuWTn36mzuZg3uTLUBSFbuEhPlPHOO1IIV3Dgpk6pDeKohAe5M+TN4ylxmqnuA16zgPjuuCn15FXKguE2kL6sSLG9U+U3HHRbnwykFtsdha+t5GYsBAevGYkqqrywbY0BsZFs3bHAR5+e71P9TafWfUtf1u9hUqLlbe/3kVBWRWTBvXi49/fQkx4SKs/T6NRmJZi4oD0yltdWbWFKouVkabuHd0UcQnxyUCuoDC2fwJ/vGkCAUYDiqLwl9um8PD0K9h+6DhWh8Nrd+x
pzpTBvcgqLONYcTkvfLqNj37YD7gLFdVa7fx78+5WD7h9YiLoHBIoGSytLP1oIVMG9ZKd3UW78slA7mfQ8bvrrmgyBtmvexR9YiLILiqjf2zbFsNqbf3ri3eVVlsYnRTPyh8yPEv2v9qXyXNrt7I7O79Vn6koCtcMNfHz8ZIWd0IS5yenqAydRkNqz24d3RRxifG5QP7dgSN8nZbl6aHaHE6eWfUtGceK+Dm3BFWFpDauatja+naLRFEg43gxM0f040RVLbuy8gD3sv5gfyPvb01r9efGRnTi8r492F3/LHHh6mx2Mo4Xc+OoAVJ/W7Q7n/qJU1WVf6zbxqtfbPccO5Rfwoot+8gtreTA8WIA+nXg3ogXItDPQFxkKObcYobUV2XMqH8v/gY9113Wl6/2HaaksvUnPicOTMSlqhwtLm/1176U7MzK44qkHsRGtP6chhBn41OBfG9OAQfzTzB7VLJnDHz/sfrg3T2KkAAjN4zoR1Qn3ysK9fpvZvC326cSGuhPQlQY1XUn68PcODIZh9PFqh8PtPpz9Votsy8fQPrRIups9lZ//UtBTlEZGkVhfPJFsRe58EE+NSPz/tY0gvwMTEs5uSf0gWNFdAowEhMWTLehfbl2aN8ObOGFa1yRcNWjtzaZrHVvu9W9zQpede/cibH94/nh4DHG9EvwmbRNb1BeY+HA8WLmTRnm1YvPxMXNZwJ5eY2FDXszufHyZAKMJ/NzM44XkxQb5VNZKs35Zn82aUcK+e20Ec2+l/+be02bFroan5xIXlkVu7LyGNpLJuvOhdXu4IeDx5g5ol+bpIkKca58ZmiloLyaHpGhXDm4t+eYe4OGGpJ7uCc3n1i+gVnP/rejmviLfHfgCB9+nw7A4YJSbn/xI/YdKfCcbwjibZX3rdEozL48GbvTycG8kjZ5xsXE5VL53nyUkX16MChe6o2LjuUzPfK+3SJZ+citTY4pisLGRXMafQ01Vlt7N61VWB1OT+5xdZ2VvTkFp+18//R7Gyksr+a131zXJm0w6nXcPm4IL6//gWB/I13DgtvkOb5OVVV2ZeXRJSzYs1mIEB3JZ3rk5yI8yJ+SytpWr0/SHgrLqwkLdO9QbtC5A7r1lO3f/PQ69h0pwOVqu9WYEcEB3DE+hb05Be22ObQvUVWVvTn5KBqFm0cNlPkE4RV8JpDf8LcVp5V2rbXa+f07n7N5fzYASd0isTmcZBf61mpFVVU5cLzIkzaprQ8OLlfTmuT9ukdSa7VzpI1TBeMiQ7lj3BB2ZeVJMG/EHcQLsDlc3DU+BT+Dz3ygFRc5nwjkRRXVZBaUnrZrjqLAl3syySooBdwpiAD7jxW1ext/idJqC06X6lnIZLW7e+KGU5Z5N6xYzTje9u8vITqMO8YPYXd2HgVlUlxLVVV2Z+djdzq5e1IqAcb23cBbiDPxiUDesDim3ykrNo31QxANS8x7dA7lttGDSIwOa98G/kIRwQFsWfJrrr8sCQCDTsuAHtGeoZYGCdHh+Ol17RLIARKjw7lrQir7jhSQ2Qa10X2F3elkm/koWo3C3ZOGNsmaEsIb+MRnw5z6oZJeXcKbHNdoFLqGBXuGGjQahUeuH93u7fulVFVFo1HQaNyfOEzdOvOfB2887TqdVsM9k4cSFtR+e0DGRYYy/6oR/HvTbipqrQxJ7IrGx1M9z4fF7mBzejbJ3aO4dliSLL8XXsknfiobetyBzfSEkmIjyThlKCXjWFGbjyO3ljqbg1nP/pcv9hzyHDtTiuHcSUOZOaJfezTNIyI4gHunXkagn54tGTlYL5EiW4Xl1ezNLWfigJ7MGN5PgrjwWj7xk5kYHc60lD7NZggMju9CsL/RUy2w1mpjzsurWPbVjvZu5gXZsDeTzIJSQhsNo8x5eRXPrPq22esdTheWDlhK72/Qc/vYIQyO78rXaVnklla2exvai9PlYm9OPvtyCpiaFMPlfXv4/IIzcXHziUA+ZXAv/vKrKc2eu2N8CisWzPY
smAkwGrhmqIkvdh+ivMbSns28IO9vTSM+KpTL6ldT2hxO9h0p8KQgnup3/17P7f/4qD2b6KHRKFw5pDd3jk8hM/8E2w8dw3aR9c5PVNWycV8WgUYDC6aPolto+w1jCXGhfCKQn4vGueOzL0/G6nDy0ff7O7BFZ7c3J5+0o4XMvnyAp8d3KL8Eh9NFv+7NV3B0OF0d/hE/ITqMBdeOIiE6nI1pWRwtKff5nYbsDid7svPZeTiX64f347YxgwiSrdqEj/CJQL7sqx0Mf/TVFoPF39d+x53/t9Jzvk9MZ8b2j+eNDTu8dqzc5VJZ8uFmIkMCmT7sZKGvL3ZnotUopCTGNHtfnd3hFbvPGHRarh3al7smpFBQVs2m9GyfzDl3ulyYc4vZsO8wnUMCeGj6FQyIi5ahFOFTfCKQgzuA2RzNr9iMjehE+tFC0o+enPR8ctY4QgP9vHZxkEaj8IdZ41h66ySC/Y2Ae+Jz9fYMJgxIJDIksNn7rF4SyBvER4Vx/7QRXDPUhDm3mC0ZOZRW1XZ0s87KpapkFZayYW8miqIwf+pwZo1Mll648EneExHOIKqTO6gdP1FJz1NSEAGuTjXxwrptfLAtjQFx0fX3BLHuif9p04qBF6ohGA9OaFpsSUXl7klDGdxCESZVVcktrWJU3x7t0cxzpigKA+O60C82il1ZuXy17zCKohAfGUaPyE5oNd7TX6i12jlcUMrxExV0iwjhrgmpxEWGdnSzhPhFfCKQJzVa0dhcIA/yM3BNqonV2w/w8PRRngwQvU6Lqqqs3n6Anl3CGRjXpV3b3ZyyagtzXl7J1CF9mDdlWJNz/gY9t48b0uK9TpfKHeOHnJZP7y10Wg2X9e7O0J6xmPNK2PbzEb7YfYhuESH06BxKaKBfhwxZOF0uCsurOVJcTnlNHak9Y5g+rG+TGvBC+DKfCOQJUWH46XUcOFbc4sYRs0cN4INt6az+8QB3TkjxHK+stbLsqx1U1lp587cz6d01or2afZrqOhvz3/iE3BOVDO3ZdAw841gRh/JPcFVKn9NKETTQaTXccYZA7y00GoWk2EiSYiM5UVXL9kPH2ZtTgNXuoHNIAN3CQ4jqFNSmBaesdge5pZUUlldzospCt4hgRvV1l5z1pqEpIVqDT/xE67Qa5kxMbbY33qB31wgeu34MV6f2aXK8U6Afr/1mBnf+38f8+p+reGXedE8Pvz1V1NRx3xufYM4t4bm7rmqy07rN4eTJ/35FlcXKxIE9WwzkOUVl+Bv0PtWTjAgO4KqUPkwd0puiihrMecWk5RTyU2YuIQFGgv2NhAb6EREUQHCA8YJWjdqdTsqqLZyoslBZW0eVxYrN4aRPt0jGJyfSu2sEgTL2LS5iPhHIgdOGIZpzy+iBgPsXW6MonrHZ2IgQ/jX/eua9uoa7/7mav/5qCqP7xbdlc5tQVZV7X1/LofwT/P3OqxjbP6HJ+Ve/2M7hglJe/vW1Z5xs+98135FfVsXHp9Rl9wWKohAdGkR0aBBj+iVgsdkpKKsmr6ySoyUV7M3Jp6ymDr1Oi59eh1GvxajXoVEUFEVBwT1BCWB3urA5HFhtDs+q3+jQIOIiQxkU34WYsGA6hwR2eJqmEO3FZwJ5w0RfSICRkPosj+ZU1tZx9yuruSbV1GS8OS4ylLd/O5P7l62juLJ9siqcLpcnEF0/vB89IkMZ3ju2yTXpRwt56+tdzLgsiSuS4lp8LVVVyThe7HUTnRfK36AnITqMhOgwRtUfc7lUqq02qi1Wqiw2quvcPWuXqqKq7mqXWo0Gf4OOYH8jQX4GgvyM+Bt0ki4oLmk+E8hzisqZ8cxyHpkxmtvGDGrxumB/I11Dg3npsx8YnRRPQqNKiF3CglmxYLanp/bZTjMBRgPjkhNaerkLllVYylP/3cjt4wZz5eDe3Hh58mnXOJwunn5vI51DAnj4uivO+HqZ+Sc4UVVLco/oVm+rt9BoFEL8z/yHWghxOp8J5AnRYST3iOaj79O
5dfTAFntgiqLw5I3jmPXsf/ntsk94+/4bmuRkN9778v1t6ezJzqd/9yhuGjWAKwf3/kWbBbhcKj8cOsYHW9P4Zn8OIQFGdGdIvdNpNSy45nIMOu1Zg9cH29Ix6LRcObjlrcXsdjtPPPEEubm52Gw27r33XmJiYli8eDFarRaDwcAzzzxD586dASgtLeWWW25h7dq1GI1GqqqqWLBgAbW1tRgMBp599lkiI5tfYSqE8B4+NYh406hksgrL+Ckz94zXRYYE8tLd11JabeE3r66hrPr0miuKorDs3hk8dv0YLDY7T7+3kcmL3mLdDvN5talxaYBH3v2ce19by57sfO4cP4SVj9zKxIE9T7vH6XJ5NlYe3S+e4X26t/z6DicllbV8uvMgl5u6k5lfyld7M9mVdfr3YO3atYSGhrJixQqWLVvG4sWLWbp0KU899RTvvvsukydP5o033gBgy5YtzJkzh+LiYs/9K1eupE+fPqxYsYJp06bxr3/967y+F0KIjuEzPXKAKYN6879rvuODbWlcdspY86kGxEXzwpyref6TrS1eo9dpuWX0QG6+YgA7DufywdZ0utRnhOzIzOVPH26iX2wkPbtEEGDUo9UouFQVq93JsZIKDhx3pwx+8fSdhAQYmTywFymJMaQmxlBjtbEnO5+K2josNgdWuwOrw0md1c7X6VkcLalgXP94jHq9e+LO7sRav3rV5nB6rrfZndicDmxOJ5v357B5fw4Akwb2JCWxW5P3M3XqVK688krA/YlDq9Xy3HPPERXlztJxOp0Yje6ev0aj4a233uKGG27w3N+nTx+ysrIAqK6uRtdC4S4hhHfxqd9UP4OOGZf146Pv06mps501pWx471hWPDgbjUah1mrjSHEFSbGnDxUoisKwXrEM63Xyj4NepyExOoxd2fms3+2uFa5RFE/mhEGnJSzQn65hwSx46zMAnE4XNqeT979Lx+lyYne5cDhd2BsCs91J42oxDUH5QjQ3shQY6B5Cqq6u5oEHHuDBBx/0BPFdu3bxn//8h+XLlwMwatSo0+4PCwtj69atTJs2jYqKCs+1Qgjv5lOBHODOCSnMnZh6znnBDYtOXt+wg3c272buxKHcM3noWZfuD4rvygtzrgbcNVCsdnevWKfRYNRr8TfozylTwuVSqbPbWb/7EM9/spXaOhuzLx9Ivx6R1NTZPb1wa32v/NTeeVFFDTnFZcSGh6BCfS+95dKx+fn5zJ8/n1tvvZVrr70WgM8++4xXXnmF119/nfDwlnPxX3rpJe6++25uvvlmfv75Z+6//34++eSTs75HIUTH8rlAHh7kXn7vdLmw2BznXORozoQUSipreH3DT2xKz2LxLRPPeWGQn0F3wZOgGo1CgNFArdVOj86hLL5l0hkXNjXmcqnc/c9VKCi8df8N+BtObvjbXCXIkpIS5syZw9NPP83IkSMBWLNmDe+//z7vvvsuoaFnrikSEhJCcHAwABEREdTU1Jzr2xRCdCClPetI79y5Mx7ITk5O9ozVXgiXS+XXr6wm2N/I83dddV45xJv3Z7Pkw02UVlv4y6+mcOXg3hfcjjMpq7awevsBenTuxMSBPXG6XKgq57VIZcWWvTyzaguLbp7AjMvOvr3bkiVLWL9+PYmJiYB7TPzQoUPExMQQEhICwLBhw3jggQc890yYMIH169djNBopLCzkySefpLa2FofDwQMPPNDsEMylZOfOnaSmpnZ0M8Qlzmq1kp6eDpCQmpqac+p5n+uRg7uXOzopjufXbePz3Ye4KqXP2W+qN65/AkPiu/L8um2envGB48XknqhgbHICeu2FV0tUVZW0I4W8vy2NL/dkYnM4mTKoFxMH9jzvCoDHSir4x6ffMzopjuuGJZ3TPY8//jg1NTVkZ2ejKAqLFi1Co9Hw1FNPYbfbiY+P57777gPgzTffZN26dYSFhfHtt98yefJkoqOjeeONNzh8+DCzZ89m6NCh5/09EEK0P58M5AD/M24wX+0
7zF9WfsOwXt3o3EL97uZ0CvTjjzdN8Hy98of9fLAtnciQQEYnxZHUPYr+3SPp3/3Mi29cLpWjJeXERYaiKAovrPuetzftItCo5/rh/Zg9KpleXc69SFfDcJHFZue5T7aiqDB5YC/W/PQztVYbNrvTPUbucBAbHsL1I/o3uX/Tpk0AvPfee/z44488//zzKIrCQw89xLBhw3jsscfYtGkTw4cP55133uHLL7/EYrEwY8YMJk+eDLgnSp955hkMBqlNIoSv8NlArtVoWHzLJGb//T2WfPTNeQ+xNPbYzDGMSopj5Q/72Zh2mJU/ZhAXGcrax38FuHcgKq6oQVPfq66z2TlRVYs5rwSLzcFzd16FTquhzmZnyqBengqL63cd9ExaujfGcGHzpBW6//VMdp6aelif4fL0+xubbfPkQT1PC+STJk1i3LhxAOTl5RESEsKf//xntFotNpuN4uJigoKC8Pf3JyYmBovFgsVi8XzfVFXlqaee4qGHHvL03IUQ3s9nAzm4V3vOnzqcj3/cT3lNHWFB/me/qRlajYZx/RMY1z/BU9OltPpkPZajxRVkFZbiqK+dggqKBnpGhxPgp+frtMMY9DoU3BOjhwtKG2WhOE/PTKnPgGk435p0Oh2PPvooGzZs4MUXX0Sr1ZKbm8tdd91FUFAQffu6ywB37dqVq6++GqfTybx58wB31srYsWM91wghfINPTnY25nS5sDmcTTI6vJ3d6XQPoVjt1NTZKK+1UF5dR2ZhGW9u3EF0SBBTU3u7e+j2kz30ximKidHh/H7G6BafUVxczOzZs/n0008JCHDvBP/hhx+yY8cOpkyZwttvv82yZcsAmDt3Lo888ggPP/wwXbq4N9/Ys2cPAwcOvORzyWWyU3iDi3KyszF3NTwNdoeTRR98zdWpJkaavLtCoF6rRe/ftL5KQVkVz675Dj+9jufnTiMx+vx3AVq9ejWFhYXMmzcPf39/916U8+ezcOFC4uPjCQwMRKPR0KlTJ/z8/DAYDCiKQnBwMJWVlWzYsMHzWhMmTODNN99slfcrhGhbPh/IG1jsDsy5JXyxJ5Nn75jKuP6tX9GwrRwtLmfeq2uotFhZdt+MCwriAFOmTOHxxx/ntttuw+Fw8MQTTxAeHs5jjz2GXq/H39+fJUuWEBUVxbZt25g9ezYajYaUlJRLPs1QCF/m80MrjZXXWLjv9U/4ObeYxbdM4upUU6s/o7Wpqsrcl1eRVVjKP++ZTr/u7b97kWiZDK0Ib3DRD600Fhrozxv3zuCBf33KE8s3UFBWxdxJ3pkL7V667yDAqGfpbZOx2OwX3BMXQlzafKqM7bkI9DPwz3uu5VdjBpFSvy9me37qOBfHT1Ryz6ur+eP7XwPQNSxYgrgQ4oJdVD3yBka9rklGx7Orv8PhcvHgNSMJMHbcQheXS+XD79N5/pNtaBR4ePoVqKoq25QJIX6RizKQN6aqKhqNwgffpbHlQA4LrhnF+AG/bCn+hSgsr+YPKzbwU2YuI/p05483TaBrWHC7tkEIcXG66IZWTqUoCr+77grenD8TvVbL79/5nKsWv8MPB4+1+bMtNjs5RWUA+Bt0ZBeW8fSN43l13nQJ4kKIVnPR98gbpCTGsOrRW9mScYT3t6YRG+GuBphxrIjqOhvDenVrtSGOnKIyPtyWzprtB+gRGcqKBbMJCfDj86fvaNNPAitXrmTVqlWAe5b7wIEDbN26lcDAQBYsWMCsWbMYM2YM3377rWfLN1VV2blzJ+vWrcNms7Fw4UK0Wi3x8fEsXbrUU5ZACOG9LplADvVL8ZMTGJd8Msf835t38/nuQ4QG+pEUG0m/2CgGxEUzPjnxnF6zoqaOkAAjiqKwYste3v1mD3mlVei0GiYN7MlNowZ4xsHPFMRVVcXudFFrtWGxOai2WCmttlBeU0elpa7JZhNWh5MQfyNzJjZNi5s5cyYzZ84EYNGiRdxwww2Ul5dzzz33UFhYyKxZswAYM2YMY8aMAWD
ZsmWkpKTQs2dP5s+fz/z58xk7diwPP/wwmzdvZsKECQghvNslFcibs+imiVyRFMfOw3lkHCvi35t207NLuCeQ/23VFo6WlGPU69y7Dalgsduptdo5VlJBUUUNf7ltMlqNhoxjxQQbDVyR1IOEyDB0Wi0b9maybocZq8Ph3n/T0Wi5fUPxLLsLq8OBvb5oVp3dgcPpOmO7UxNjTgvkDdLS0sjMzGThwoX8/PPPLF261NMDb6ygoIA1a9bw8ccfA5CUlER5eTmqqlJTUyN7dgrhIy7531Q/g45rh/bl2qHuQlFWu4OSqpMFs46XVnCiqharw4nLdTKN0ely0SnASGxECNszj+Nv0BPop6d/XDQ2u5OCiuomZWcbetQNX9saFdRyuFo3PfK1115j/vz5AGcsgPXWW29x5513ekrWxsfH86c//YlXXnmF4OBghg8f3qrtEkK0jUs+kJ/KqNfRLTzE8/WLc69p9WeoqorV7qTWZsdis1NlsVJWbaGipo5Ki7VR0G9cAtfZJPiHtlDpsbKykuzsbEaMGHHGNrhcLjZv3syCBQs8x5YuXcry5cvp3bs3y5cv569//SsLFy5s1fcuhGh9Esg7gKIojfYBvbDSuy356aefPPt1nsnBgwdJSEjAz8/Pc6xTp04EBQUBEBUVxa5du1q1bUKItiGB/CKTnZ1NbGzsOV3XvXv3JseWLFnCggUL0Ol06PV6Fi9e3FbNFEK0oouqaJYQrU2KZglvcLaiWZIkLIQQPk4CuRBC+DgJ5EII4eMkkAshhI+TQC6EED7O69MPiytryD1RSWF5NTanE3+9ni5hQcRGhBAa2Lo52EII4Yu8MpCrqkr60SK+2Z9NcWUNYUH+BPkZ0Gs12BxO0o4WUFptISEqjHHJCbK7jhDikuZ1gbzSYmXl9/s5XlpJUmwkKT1j0DRTXtbpcnG0uJx3Nu9mUHxXrk41YdC172YRQgjhDbxqjLy8xsIrn2/H5nQyITmRbuEhzQZxcJekTYgOZ+KAnhwpLudfG3ditTvaucVCCNHxvCaQ2x1O3ty4iy6hgQyM6+IuGXsO9Dotw3vH4nS6eO+7NK/baFkIIdqa1wTyr/YdRlHA1C3yvO9VFIWUxBiOnahgd1ZeG7ROCCG8l1cE8kqLlW3mo6Qkxlzwa2g07mC+fvchnK4zb8oghBAXE68I5LsO59ElNAijvvm5V2udhX8+u5Tf3DSdRQ//loK8481eFx7kj06rwZxb0pbNFUIIr+IVgTzjWBGxEZ1aPP/Nl+s5lJHOohdexelw8OHby1q8tktoMOY8CeRCiEtHh6cfulwqeWWVDIiLbvGaKdNncvm4ifj5B6DRatEpXBNXAAAECklEQVTXb03WnIhgfw4XlLZFU4UQwit1eI+8zu4AFPRnyQEPCunEood/S/7xo1w96+YWrws0Gqi0WFu5lUII4b06PJArCuecMnjfI0+SMnwU/1ja8j6SknwohLjUdHgg99Pr0Gk19T3z5n2++iOeefL3aHVaDEYj1jpLi9dWWaxEBAW0RVOFEMIrdXggVxSF2IgQTlTVtnjNiLETAPjD/F9zMCONe3/3hxavPVFVS/fIlidOhRDiYtPhk50AA+O6sM18lG7hIc2eDw0L59Elz571dVRVJa+0inHJCa3dRCGE8Fod3iMHGBjfhfKaOqp+4SRlflkV/gYdCVFhrdQyIYTwfl4RyP0NeqYO6c2Ow7kXXCvF7nCy70gB1w1PQmmh0JYQQlyMvCKQAwzv3Z2okCB2ZeWddzB3Ol18f/AYqT1j6NUloo1aKIQQ3slrArlGo3DbmEFoNRq2mY+eMYulsSqLlW8ycoiPDOXqlL5t3EohhPA+XjHZ2cDPoGPuxFS+2neYTWlZxEWGktglHL9marBU19nIzD9BflkVU4f0Znjv7udc+lYIIS4m7R3ItQA2m+2MF03oH8eA2M7szMrl+4xsjHotAX4GtCg4XC5q6my4VJVB8V2ZeZmJTgF+2O1nfk0
hLpTVKiuFRcdqFDObXQKvtOdGDDt37rwC2NJuDxRCiIvL6NTU1O9OPdjePfKfgNFAPuBs52cLIYSv0gJdccfQ07Rrj1wIIUTr85qsFSGEEBdGArkQQvg4CeRCCOHjJJALIYSPk0AuhBA+TgK5EEL4OAnkQgjh4ySQCyGEj5NALoQQPs6rqh8K0VZMJlM8cBDIAFTAAOQBd5nN5uMmk+l24H5Aj7uDs8xsNr9Yf+8o4IX6cyeAOWaz+Ui7vwkhWiA9cnEpyTObzYPNZvMQs9ncH9gB/J/JZLoHeBCYbjabBwNjgF+ZTKa59fctB+bWn1sOvNgRjReiJRLIxaXsW6AP8CTwkNlszgcwm83lwB1AuslkMgJPms3mffX37AN6dERjhWiJDK2IS5LJZNIDNwE/AnfV/+thNpsPNPryP/X3aIA/Aqvbp5VCnBsJ5OJSEmMymfbU/98IbAd+hzuQn3F7KZPJZAD+jft35s9t2UghzpcEcnEpyasf527CZDJlAUNxD7U0HBsLXGU2mx8zmUxBwFrcE53Xmc1me3s1WIhzIWPkQsCzwN9NJlMXAJPJ1Bn4O5BZf/4/9f+fbTabZd834XVkYwlxSahPP9xsNpvjWzj/APBrwIW7g/Oa2Wx+yWQyDQF24U5bbOiJ55nN5mlt3mghzpEEciGE8HEytCKEED5OArkQQvg4CeRCCOHjJJALIYSPk0AuhBA+TgK5EEL4OAnkQgjh4/4/k1tJAeedP8AAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "visualizer = InterclusterDistance(unfitted_cluster_model)\n", + "visualizer.fit(X)\n", + "visualizer.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "visualizer = InterclusterDistance(fitted_cluster_model)\n", + "visualizer.fit(X)\n", + "visualizer.poof()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Check if fitted on Model Selection Visualizers\n", + "\n", + "_NOTE: Not sure how to proceed with multi-model visualizers -- is already fitted a real use case here?_" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/rebeccabilbro/tsne_resolve_colors.ipynb b/examples/rebeccabilbro/tsne_resolve_colors.ipynb new file mode 100644 index 000000000..def562d7b --- /dev/null +++ b/examples/rebeccabilbro/tsne_resolve_colors.ipynb @@ -0,0 +1,562 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TSNE Resolve Colors Bug Documentation\n", + "\n", + "Jerome Massot [reports](https://github.com/DistrictDataLabs/yellowbrick/pull/658) that there is a bug in TSNE that means that colors that are passed in on instantiation do not affect the colors of the plot.\n", + "\n", + "In this example, we'll validate that the bug exists, and that his proposed solution works" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys \n", + "\n", + "# Modify the path \n", + "sys.path.append(\"..\")\n", + "\n", + "import pandas as pd\n", + "import yellowbrick as yb\n", + "import matplotlib.pyplot as plt " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Validate 
Bug" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from download import download_all \n", + "from sklearn.datasets.base import Bunch\n", + "\n", + "## The path to the test data sets\n", + "FIXTURES = os.path.join(os.getcwd(), \"data\")\n", + "\n", + "## Dataset loading mechanisms\n", + "datasets = {\n", + " \"hobbies\": os.path.join(FIXTURES, \"hobbies\")\n", + "}\n", + "\n", + "\n", + "def load_data(name, download=True):\n", + " \"\"\"\n", + " Loads and wrangles the passed in text corpus by name.\n", + " If download is specified, this method will download any missing files. \n", + " \"\"\"\n", + " \n", + " # Get the path from the datasets \n", + " path = datasets[name]\n", + " \n", + " # Check if the data exists, otherwise download or raise \n", + " if not os.path.exists(path):\n", + " if download:\n", + " download_all() \n", + " else:\n", + " raise ValueError((\n", + " \"'{}' dataset has not been downloaded, \"\n", + " \"use the download.py module to fetch datasets\"\n", + " ).format(name))\n", + " \n", + " # Read the directories in the directory as the categories. 
\n", + " categories = [\n", + " cat for cat in os.listdir(path) \n", + " if os.path.isdir(os.path.join(path, cat))\n", + " ]\n", + " \n", + " \n", + " files = [] # holds the file names relative to the root \n", + " data = [] # holds the text read from the file \n", + " target = [] # holds the string of the category \n", + " \n", + " # Load the data from the files in the corpus \n", + " for cat in categories:\n", + " for name in os.listdir(os.path.join(path, cat)):\n", + " files.append(os.path.join(path, cat, name))\n", + " target.append(cat)\n", + " \n", + " with open(os.path.join(path, cat, name), 'r') as f:\n", + " data.append(f.read())\n", + " \n", + " \n", + " # Return the data bunch for use similar to the newsgroups example\n", + " return Bunch(\n", + " categories=categories,\n", + " files=files,\n", + " data=data,\n", + " target=target,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "\n", + "corpus = load_data('hobbies')\n", + "tfidf = TfidfVectorizer()\n", + "\n", + "docs = tfidf.fit_transform(corpus.data)\n", + "labels = corpus.target" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from yellowbrick.text import TSNEVisualizer\n", + "\n", + "tsne = TSNEVisualizer(colors=[\"purple\",\"blue\",\"orchid\",\"indigo\",\"plum\",\"navy\"])\n", + "tsne.fit(docs, labels)\n", + "tsne.poof()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Validate Solution" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "from collections import defaultdict\n", + "\n", + "from yellowbrick.draw import manual_legend\n", + "from yellowbrick.text.base import TextVisualizer\n", + "from yellowbrick.style.colors import resolve_colors\n", + "from yellowbrick.exceptions import YellowbrickValueError\n", + "\n", + "from sklearn.manifold import TSNE\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.decomposition import TruncatedSVD, PCA\n", + "\n", + "##########################################################################\n", + "## Quick Methods\n", + "##########################################################################\n", + "\n", + "def tsne(X, y=None, ax=None, decompose='svd', decompose_by=50, classes=None,\n", + " colors=None, colormap=None, alpha=0.7, **kwargs):\n", + " \"\"\"\n", + " Display a projection of a vectorized corpus in two dimensions using TSNE,\n", + " a nonlinear dimensionality reduction method that is particularly well\n", + " suited to embedding in two or three dimensions for visualization as a\n", + " scatter plot. 
TSNE is widely used in text analysis to show clusters or\n", + " groups of documents or utterances and their relative proximities.\n", + "\n", + " Parameters\n", + " ----------\n", + "\n", + " X : ndarray or DataFrame of shape n x m\n", + " A matrix of n instances with m features representing the corpus of\n", + " vectorized documents to visualize with tsne.\n", + "\n", + " y : ndarray or Series of length n\n", + " An optional array or series of target or class values for instances.\n", + " If this is specified, then the points will be colored according to\n", + " their class. Often cluster labels are passed in to color the documents\n", + " in cluster space, so this method is used both for classification and\n", + " clustering methods.\n", + "\n", + " ax : matplotlib axes\n", + " The axes to plot the figure on.\n", + "\n", + " decompose : string or None\n", + " A preliminary decomposition is often used prior to TSNE to make the\n", + " projection faster. Specify `\"svd\"` for sparse data or `\"pca\"` for\n", + " dense data. If decompose is None, the original data set will be used.\n", + "\n", + " decompose_by : int\n", + " Specify the number of components for preliminary decomposition, by\n", + " default this is 50; the more components, the slower TSNE will be.\n", + "\n", + " classes : list of strings\n", + " The names of the classes in the target, used to create a legend.\n", + "\n", + " colors : list or tuple of colors\n", + " Specify the colors for each individual class\n", + "\n", + " colormap : string or matplotlib cmap\n", + " Sequential colormap for continuous target\n", + "\n", + " alpha : float, default: 0.7\n", + " Specify a transparency where 1 is completely opaque and 0 is completely\n", + " transparent. 
This property makes densely clustered points more visible.\n", + "\n", + " kwargs : dict\n", + " Pass any additional keyword arguments to the TSNE transformer.\n", + "\n", + " Returns\n", + " -------\n", + " ax : matplotlib axes\n", + " Returns the axes that the parallel coordinates were drawn on.\n", + " \"\"\"\n", + " # Instantiate the visualizer\n", + " visualizer = TSNEVisualizer(\n", + " ax, decompose, decompose_by, classes, colors, colormap, alpha, **kwargs\n", + " )\n", + "\n", + " # Fit and transform the visualizer (calls draw)\n", + " visualizer.fit(X, y, **kwargs)\n", + " visualizer.transform(X)\n", + "\n", + " # Return the axes object on the visualizer\n", + " return visualizer.ax\n", + "\n", + "\n", + "##########################################################################\n", + "## TSNEVisualizer\n", + "##########################################################################\n", + "\n", + "class TSNEVisualizer(TextVisualizer):\n", + " \"\"\"\n", + " Display a projection of a vectorized corpus in two dimensions using TSNE,\n", + " a nonlinear dimensionality reduction method that is particularly well\n", + " suited to embedding in two or three dimensions for visualization as a\n", + " scatter plot. TSNE is widely used in text analysis to show clusters or\n", + " groups of documents or utterances and their relative proximities.\n", + "\n", + " TSNE will return a scatter plot of the vectorized corpus, such that each\n", + " point represents a document or utterance. The distance between two points\n", + " in the visual space is embedded using the probability distribution of\n", + " pairwise similarities in the higher dimensionality; thus TSNE shows\n", + " clusters of similar documents and the relationships between groups of\n", + " documents as a scatter plot.\n", + "\n", + " TSNE can be used with either clustering or classification; by specifying\n", + " the ``classes`` argument, points will be colored based on their similar\n", + " traits. 
For example, by passing ``cluster.labels_`` as ``y`` in ``fit()``, all\n", + " points in the same cluster will be grouped together. This extends the\n", + " neighbor embedding with more information about similarity, and can allow\n", + " better interpretation of both clusters and classes.\n", + "\n", + " For more, see https://lvdmaaten.github.io/tsne/\n", + "\n", + " Parameters\n", + " ----------\n", + "\n", + " ax : matplotlib axes\n", + " The axes to plot the figure on.\n", + "\n", + " decompose : string or None, default: ``'svd'``\n", + " A preliminary decomposition is often used prior to TSNE to make the\n", + " projection faster. Specify ``\"svd\"`` for sparse data or ``\"pca\"`` for\n", + " dense data. If None, the original data set will be used.\n", + "\n", + " decompose_by : int, default: 50\n", + " Specify the number of components for preliminary decomposition, by\n", + " default this is 50; the more components, the slower TSNE will be.\n", + "\n", + " labels : list of strings\n", + " The names of the classes in the target, used to create a legend.\n", + " Labels must match names of classes in sorted order.\n", + "\n", + " colors : list or tuple of colors\n", + " Specify the colors for each individual class\n", + "\n", + " colormap : string or matplotlib cmap\n", + " Sequential colormap for continuous target\n", + "\n", + " random_state : int, RandomState instance or None, optional, default: None\n", + " If int, random_state is the seed used by the random number generator;\n", + " If RandomState instance, random_state is the random number generator;\n", + " If None, the random number generator is the RandomState instance used\n", + " by np.random. The random state is applied to the preliminary\n", + " decomposition as well as tSNE.\n", + "\n", + " alpha : float, default: 0.7\n", + " Specify a transparency where 1 is completely opaque and 0 is completely\n", + " transparent. 
This property makes densely clustered points more visible.\n", + "\n", + " kwargs : dict\n", + " Pass any additional keyword arguments to the TSNE transformer.\n", + " \"\"\"\n", + "\n", + " # NOTE: cannot be np.nan\n", + " NULL_CLASS = None\n", + "\n", + " def __init__(self, ax=None, decompose='svd', decompose_by=50,\n", + " labels=None, classes=None, colors=None, colormap=None,\n", + " random_state=None, alpha=0.7, **kwargs):\n", + "\n", + " # Visual Parameters\n", + " self.alpha = alpha\n", + " self.labels = labels\n", + " self.colors = colors\n", + " self.colormap = colormap\n", + " self.random_state = random_state\n", + "\n", + " # Fetch TSNE kwargs from kwargs by popping only keys belonging to TSNE params\n", + " tsne_kwargs = {\n", + " key: kwargs.pop(key)\n", + " for key in TSNE().get_params()\n", + " if key in kwargs\n", + " }\n", + " self.transformer_ = self.make_transformer(decompose, decompose_by, tsne_kwargs)\n", + "\n", + " # Call super at the end so that size and title are set correctly\n", + " super(TSNEVisualizer, self).__init__(ax=ax, **kwargs)\n", + "\n", + " def make_transformer(self, decompose='svd', decompose_by=50, tsne_kwargs={}):\n", + " \"\"\"\n", + " Creates an internal transformer pipeline to project the data set into\n", + " 2D space using TSNE, applying an pre-decomposition technique ahead of\n", + " embedding if necessary. This method will reset the transformer on the\n", + " class, and can be used to explore different decompositions.\n", + "\n", + " Parameters\n", + " ----------\n", + "\n", + " decompose : string or None, default: ``'svd'``\n", + " A preliminary decomposition is often used prior to TSNE to make\n", + " the projection faster. Specify ``\"svd\"`` for sparse data or ``\"pca\"``\n", + " for dense data. 
If decompose is None, the original data set will\n", + " be used.\n", + "\n", + " decompose_by : int, default: 50\n", + " Specify the number of components for preliminary decomposition, by\n", + " default this is 50; the more components, the slower TSNE will be.\n", + "\n", + " Returns\n", + " -------\n", + "\n", + " transformer : Pipeline\n", + " Pipelined transformer for TSNE projections\n", + " \"\"\"\n", + "\n", + " # TODO: detect decompose by inferring from sparse matrix or dense or\n", + " # If number of features > 50 etc.\n", + " decompositions = {\n", + " 'svd': TruncatedSVD,\n", + " 'pca': PCA,\n", + " }\n", + "\n", + " if decompose and decompose.lower() not in decompositions:\n", + " raise YellowbrickValueError(\n", + " \"'{}' is not a valid decomposition, use {}, or None\".format(\n", + " decompose, \", \".join(decompositions.keys())\n", + " )\n", + " )\n", + "\n", + " # Create the pipeline steps\n", + " steps = []\n", + "\n", + " # Add the pre-decomposition\n", + " if decompose:\n", + " klass = decompositions[decompose]\n", + " steps.append((decompose, klass(\n", + " n_components=decompose_by, random_state=self.random_state)))\n", + "\n", + " # Add the TSNE manifold\n", + " steps.append(('tsne', TSNE(\n", + " n_components=2, random_state=self.random_state, **tsne_kwargs)))\n", + "\n", + " # return the pipeline\n", + " return Pipeline(steps)\n", + "\n", + " def fit(self, X, y=None, **kwargs):\n", + " \"\"\"\n", + " The fit method is the primary drawing input for the TSNE projection\n", + " since the visualization requires both X and an optional y value. 
The\n", + " fit method expects an array of numeric vectors, so text documents must\n", + " be vectorized before passing them to this method.\n", + "\n", + " Parameters\n", + " ----------\n", + " X : ndarray or DataFrame of shape n x m\n", + " A matrix of n instances with m features representing the corpus of\n", + " vectorized documents to visualize with tsne.\n", + "\n", + " y : ndarray or Series of length n\n", + " An optional array or series of target or class values for\n", + " instances. If this is specified, then the points will be colored\n", + " according to their class. Often cluster labels are passed in to\n", + " color the documents in cluster space, so this method is used both\n", + " for classification and clustering methods.\n", + "\n", + " kwargs : dict\n", + " Pass generic arguments to the drawing method\n", + "\n", + " Returns\n", + " -------\n", + " self : instance\n", + " Returns the instance of the transformer/visualizer\n", + " \"\"\"\n", + "\n", + " # Store the classes we observed in y\n", + " if y is not None:\n", + " self.classes_ = np.unique(y)\n", + " elif y is None and self.labels is not None:\n", + " self.classes_ = np.array([self.labels[0]])\n", + " else:\n", + " self.classes_ = np.array([self.NULL_CLASS])\n", + "\n", + " # Fit our internal transformer and transform the data.\n", + " vecs = self.transformer_.fit_transform(X)\n", + " self.n_instances_ = vecs.shape[0]\n", + "\n", + " # Draw the vectors\n", + " self.draw(vecs, y, **kwargs)\n", + "\n", + " # Fit always returns self.\n", + " return self\n", + "\n", + " def draw(self, points, target=None, **kwargs):\n", + " \"\"\"\n", + " Called from the fit method, this method draws the TSNE scatter plot,\n", + " from a set of decomposed points in 2 dimensions. This method also\n", + " accepts a third dimension, target, which is used to specify the colors\n", + " of each of the points. 
If the target is not specified, then the points\n", + " are plotted as a single cloud to show similar documents.\n", + " \"\"\"\n", + " # Resolve the labels with the classes\n", + " labels = self.labels if self.labels is not None else self.classes_\n", + " if len(labels) != len(self.classes_):\n", + " raise YellowbrickValueError((\n", + " \"number of supplied labels ({}) does not \"\n", + " \"match the number of classes ({})\"\n", + " ).format(len(labels), len(self.classes_)))\n", + "\n", + "\n", + " # Create the color mapping for the labels.\n", + " self.color_values_ = resolve_colors(\n", + " n_colors=len(labels), colormap=self.colormap, colors=self.colors)\n", + " colors = dict(zip(labels, self.color_values_))\n", + "\n", + " # Transform labels into a map of class to label\n", + " labels = dict(zip(self.classes_, labels))\n", + "\n", + " # Expand the points into vectors of x and y for scatter plotting,\n", + " # assigning them to their label if the label has been passed in.\n", + " # Additionally, filter classes not specified directly by the user.\n", + " series = defaultdict(lambda: {'x':[], 'y':[]})\n", + "\n", + " if target is not None:\n", + " for t, point in zip(target, points):\n", + " label = labels[t]\n", + " series[label]['x'].append(point[0])\n", + " series[label]['y'].append(point[1])\n", + " else:\n", + " label = self.classes_[0]\n", + " for x,y in points:\n", + " series[label]['x'].append(x)\n", + " series[label]['y'].append(y)\n", + "\n", + " # Plot the points\n", + " for label, points in series.items():\n", + " self.ax.scatter(\n", + " points['x'], points['y'], c=colors[label],\n", + " alpha=self.alpha, label=label\n", + " )\n", + "\n", + " def finalize(self, **kwargs):\n", + " \"\"\"\n", + " Finalize the drawing by adding a title and legend, and removing the\n", + " axes objects that do not convey information about TNSE.\n", + " \"\"\"\n", + " self.set_title(\n", + " \"TSNE Projection of {} Documents\".format(self.n_instances_)\n", + " )\n", + 
"\n", + " # Remove the ticks\n", + " self.ax.set_yticks([])\n", + " self.ax.set_xticks([])\n", + "\n", + " # Add the legend outside of the figure box.\n", + " if not all(self.classes_ == np.array([self.NULL_CLASS])):\n", + " box = self.ax.get_position()\n", + " self.ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])\n", + " manual_legend(\n", + " self, self.classes_, self.color_values_,\n", + " loc='center left', bbox_to_anchor=(1, 0.5)\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tsne = TSNEVisualizer(colors=[\"purple\",\"blue\",\"orchid\",\"indigo\",\"plum\",\"navy\"])\n", + "tsne.fit(docs, labels)\n", + "tsne.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/regression.ipynb b/examples/regression.ipynb index 180153532..6ceb97962 100644 --- a/examples/regression.ipynb +++ b/examples/regression.ipynb @@ -20,7 +20,7 @@ "import matplotlib as mpl \n", "import matplotlib.pyplot as plt \n", "\n", - "from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV\n", + "from sklearn.linear_model import Lasso, LassoCV, Ridge\n", "from sklearn.model_selection import cross_val_predict, train_test_split\n", "\n", "from yellowbrick.datasets import load_concrete\n", @@ -63,28 +63,10 @@ "outputs": [], "source": [ "# Use Yellowbrick to load the concrete dataset\n", - "data = load_concrete()\n", - "\n", - "# Save the data in a Pandas DataFrame\n", - "df = pd.DataFrame(data['data'], columns=data['feature_names'], dtype='float')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# Save feature names as a list and target variable as a string\n", - "feature_names = ['cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age']\n", - "target_name = 'strength'\n", - "\n", - "# Get the X and y data from the DataFrame \n", - "X = df[feature_names]\n", - "y = df[target_name]\n", + "X, y = 
load_concrete()\n", "\n", "# Create the train and test data \n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)" + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" ] }, { @@ -100,12 +82,12 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -133,12 +115,12 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAioAAAGACAYAAACDX0mmAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzsvXlglNW9//96lpnJvgIJkLDKIotBXFCrqLiAikhdagVrf+0Vr221l35r3RC5bRE3bK/aW3r12lZrXa7WIrV2UauiSKkCsgQhyJYQspJtMuuz/f4YZpgkM8lMMkkmyXn5h2Se7TznmXnO+3w+n/P5SJZlWQgEAoFAIBAkIXJ/N0AgEAgEAoEgGkKoCAQCgUAgSFqEUBEIBAKBQJC0CKEiEAgEAoEgaRFCRSAQCAQCQdIihIpAIBAIBIKkRQgVgSBJmDJlCldffTXXXHMNixcvZv78+Vx33XXs2rWr2+dcsWIFn3zySYfPd+3axbx587p93qNHj3L66afHdcy6deu46KKLuO+++7p93aeffppzzjmHa665hmuuuYZFixYxb948Hn74YYKZFq655hpaWlo6HPvcc89x7733dvva7Zk3bx6zZs3C5XK1+fyPf/wjU6ZM4a9//Wtc54u1fVOmTKGhoSGucwsEAxm1vxsgEAhO8vzzz5OXlxf6+7nnnmP16tW8+uqr3TrfQw89lKim9ZjXX3+dtWvXcuaZZ/boPFdeeSUPPvhg6O/m5mYWLVrE+eefzwUXXMCbb77Z06bGTG5uLu+88w6LFy8OffbHP/6RYcOG9VkbBILBjhAqAkGSous6VVVVZGdnhz5bt24df//73zFNk9GjR7Nq1SoKCgr4+9//zrp165AkCUVRuPvuuznrrLP4xje+wdKlS1mwYAEvvfQSzz//PBkZGUyePDl0zqeffprGxsbQ4B/+9+eff87jjz+O3++nrq6O8847jzVr1rRp54EDB1ixYgV+vx/Lsrj++utZunRpm32WL19OTU0NK1as4D/+4z+YPXs2//mf/0llZSWWZbF48WJuvfVWjh49ytKlS5k4cSKVlZX87ne/Y8SIEZ32U319PV6vN9RPU6ZMYfPmzWRmZrJ69Wo++eQT8vPzyc/PJzMzE4AjR45w//3309zczPDhw7Esi0WLFnHttdeybds21q5di8fjQZIk7rzzTi6++OKI1160aBEbNmwICZXKykrcbjcTJkwI7fPZZ5/x2GOP4fF4sNlsLF++nLlz56JpWtT2OZ1OHnroIcrKytA0jXPPPZe7774bVRWvbMHQQ3zrBYIk4pvf/CaSJNHQ0IDD4eDiiy/m4YcfBmD9+vWUlZXx2muvoaoqr776Kg888ADPPvssjz32GGvXrmXWrFl8/PHHbNmyhbPOOit03i+++IJf/OIXvPnmmwwfPryNRaIzXnjhBb7//e8zZ84cXC4Xl1xyCbt37yYnJye0z3PPPce8efO47bbbqKurY82aNdx0003I8knP8n/9138xb9481q5dy8yZM7n55pu55JJL+Na3voXT6WTp0qWMHDmSkpISqqureeKJJ6JaXt5++222bt2K1+ulqamJadOm8eMf/5jTTjutzX4vvfQShw8f5s9//jO6rnPzzTeHhMDdd9/NNddcw5IlSzhw4ADXXXcdixYtorm5mfvuu4/nnnuOoqIiampq+NrXvsaUKVMYNWpUh7ZceOGF/N///R+1tbWMGDGCN998k8WLF/O3v/0NgMbGRr7//e+zbt06SkpK2L9/PzfffDOvv/46//jHP6K2b82aNUyfPp1HHnkEwzC49957+c1vfsOyZct
iem4CwWBCCBWBIIkIun727NnDsmXLOP3008nPzwfg/fffZ9euXVx33XUAmKaJx+MB4KqrruKOO+7gwgsv5Ctf+UqHAW3z5s185StfYfjw4QDceOONfPzxx12255FHHmHjxo386le/4uDBg3i9Xtxudxuhctlll3HPPfewc+dOzj33XB544IE2IqU9brebbdu28etf/xqAzMxMrr32WjZu3EhJSQmqqjJr1qyoxwddP36/n5/+9Kfs37+fuXPndthv8+bNLFy4ELvdjt1u5+qrr2bfvn00Nzezc+dOXnzxRQAmTpzIOeecA8Dnn39OXV0d3/ve90LnkSSJffv2RRQqNpuNBQsW8NZbb/Htb3+bt99+mxdffDEkVHbu3MmYMWMoKSkBYNKkScyePZt//etfUdsH8MEHH7Br1y5ef/11ALxeb9T+EAgGO0KoCARJyLRp07jvvvt44IEHKCkpoaioCNM0ufXWW1myZAkAfr+f5uZmAH7wgx9w/fXX8/HHH/PGG2/wzDPP8MYbb4TOJ0kS4WW9FEWJuk3TtNC/ly5dytSpU7ngggu44oor2LFjB+3Lg1188cX87W9/45NPPmHz5s3893//N6+88gpjxoyJeG+maXY4h2ma6LoOgN1uj8nFYbfbWblyJddddx2PP/44q1at6nT/4D0H/x+pPwzDYOLEibz22muhbTU1NW3ihtqzePFiVq1axaxZs5gwYUIbEWeaZof9LcsK3Wuk9gWPe/LJJ5k4cSIALS0tSJLU6f0JBIMVsepHIEhSFi5cyKxZs0IxIeeffz6vv/46ra2tADz55JPcfffd6LrOvHnzcLvd3HTTTaxatYoDBw60GQzPO+88Nm3aRHV1NRAI+AySm5tLaWkplmXhdrtDlpbm5mZ2797NXXfdxeWXX05NTQ3l5eUdBt8f/vCHvP3221x11VWsWrWKjIwMqqqqot5XRkYGJSUl/P73vwcC8Rjr16/nvPPOi7uP7HY7q1at4tVXX6W0tLTNtgsuuID169fj8/nw+Xy8/fbboevPnj07JOQqKirYvHkzkiQxa9Ysjhw5wqeffgoEXGbz58+ntrY2ahtKSkrwer38/Oc/56tf/WqHbYcOHWLnzp0A7N+/n08//ZSzzz47avsg8Kx/+9vfYlkWfr+f73znOyELkEAw1BAWFYEgiVm5ciWLFi3io48+4oYbbgjFTEiSxMiRI3nkkUdQVZX777+fu+66C1VVkSSJNWvWYLfbQ+eZMmUKP/rRj/jmN79Jenp6m3iO4Pkvv/xyCgoKOP3007Esi+zsbG677Ta++tWvkpOTQ25uLrNnz+bIkSMUFxeHjv/ud7/LihUrePXVV1EUhUsvvZSzzz670/tau3YtP/nJT3jjjTfw+/1cffXVXHvttVRWVsbdR2eeeSZXX301P/3pT3n55ZdDn3/961+nvLychQsXkpOTw9ixY0PbHn30UVasWMFLL71EQUEBRUVFpKSkkJeXx1NPPcVjjz2Gz+fDsiwee+wxRo8e3WkbrrnmGn7/+99zwQUXtPk8Ly+PJ598kp/+9Kd4vV4kSeLhhx9m/PjxjBkzJmr7VqxYwUMPPcTVV1+Npmmcd9553HrrrXH3jUAwGJCs9jZYgUAgGOSsW7eOyy+/nIkTJ+J0Olm0aBHPPvssp5xySn83TSAQtENYVAQCwZBj3Lhx/OAHP0CWZQzDYNmyZUKkCARJirCoCAQCgUAgSFpEMK1AIBAIBIKkRQgVgUAgEAgEScuAi1ExTROXy4XNZhN5BQQCgUAgGOBYloWmaaSnp0dMFjnghIrL5aKsrKy/myEQCAQCgSCBTJ48OVRGIpwBJ1RsNhsQuKHwPBHdYffu3cyYMSMRzRrwiL5oi+iPk4i+OInoi5OIvmiL6I+TxNsXfr+fsrKy0PjengEnVILuHrvdjsPh6PH5EnGOwYLoi7aI/jiJ6IuTiL44ieiLtoj+OEl3+iJaOIcIphUIBAKBQJC0DDiLSmfouh6
xCFhn+P3+XmrNwCMZ+kKW5ZgK0gkEAoFgaDBoLCpOpzPugTZYmVSQPH3h9/txOp393QyBQCAQJAmDYuqq6zqKopCWlhbXcZqm9Tggd7CQLH1ht9txu93oui4sKwKBQCAYHBYV0zTFoDaIUBQlbheeQCAQCAYng0KoCAYXIpGfQCAQCIIIoSIQCAQCgSBpEUJFIBAIBAJB0tLngR2GYfDAAw9w6NAhJEnixz/+MQ6Hg3vvvRdJkpg0aRKrVq2KmO9f0DXvvvsuH3zwAa2trVx//fWcf/75/d0kgUAgEAi6TZ+rgffffx+AV155heXLl/Pzn/+chx9+mOXLl/PSSy9hWRbvvfdeXzdrQPHKK6/wla98hUWLFnHppZeyfv360LZLL72U1atX8+Mf/5i3336729fYuHEj8+fP57LLLuOZZ56JuE9LSwvf//73WbBgAVdccQXbt2/v9Hifz8f111/PokWLuOqqq3jqqae63T6BQCAQDA363KJy6aWXctFFFwFw7NgxsrKy+OSTTzj77LMBmDt3Lps2beKyyy7r66b1mEceeYTS0lLq6urwer0UFxeTm5sb04C8ceNGqqqquPHGG7vct6ysjDvuuIObbrqJnTt3smzZMhYvXtxmn3Xr1rF06dJu3YdhGPzkJz/hN7/5DQUFBVx//fXMmzePU045pc1+Dz30EBdccAFPPfUUfr8fr9fb6fETJ07k+eefJz09HU3TWLJkCXPnzmXWrFndaqdAIBAIBj/9sqZXVVXuuece3nnnHZ566ik2bdoUWumRnp4eU8Kv3bt3t/l74sSJaJoWVzs0w6SirpFMh4pN6blx6c477wRgw4YNHD58mO9///tAoOJzV5xxxhkx77tnzx7mzp2Ly+UiLy8PVVVDx1mWxVNPPcU555zDuHHjYjpfkOC+O3bsYPTo0eTl5aFpGpdddhl/+ctf+Pa3vx3a1+l08q9//YuVK1eGjlMUBZfL1eXxLpcLj8cTEjft26hpGgcOHIi53b3F1q1b+7sJSYPoi5OIvjiJ6Iu2ROoP0zLQLS+qlIIsKf3Qqv4hkd+Nfks+8uijj3LXXXfxta99DZ/PF/rc5XKRlZXV5fEzZswIFT0KZqSNNWGZaVpsKK3gs8O1aEik2VVKRuWyaHoxstzzpbEOhwObzUZ6ejoAb7zxBn/4wx8wTZN/+7d/409/+hNOp5Pa2lqWLFnCkiVLeOONNzh48CATJkzgww8/xOv1Ul5ezrJly7j22mvbnP/AgQOceuqppKWl8dxzz/H//t//C13rhRde4LPPPsPr9VJTU8NNN90UOm7JkiURhcs999xDSUlJ6BwtLS0UFRWF/i4uLmbnzp2hvwHKy8vJz89n9erV7N27l+nTp7NixQrS0tI6Pd4wDK699lrKy8tZsmQJ55xzTof2+P1+Zs6c2a8J6LZu3RoSj0Md0RcnEX1xEtEXbWnfH6Zl8PmRd6hq3Ifb7yTNnklR7hRmjb1s0AuWeL8bPp+vg/EhnD4XKuvXr6empoZ///d/JzU1FUmSmDFjBlu2bGHOnDls3Lgx4uCVSDaUVrDtaAMWFmn2QFnpbUcbAFg8c0yvXDMrK4t169ZRWlrKVVddxeWXX05NTQ3f+MY3WLJkSZt9W1tbee655zh8+DC33357G6FSVVWFy+Xitttuo6amhilTpoQsOQC33HILt9xyS8Q2vPTSS1HbF4/lBQLZgPfs2cPKlSspKSlh9erVPPPMMyxfvrzT4xRF4c0336SlpYXvfe97lJWVMXny5LiuLRAIBMnO50feoazmMyRJQpIkPForZTWfATB73IJ+bt3Aos+FyuWXX859993H0qVL0XWd+++/n4kTJ7Jy5Up+9rOfMWHCBObPn99r19cMkx3HGlFkCSPsc0WW2HGskaumFSXEDdSe8ePHAzBs2DCef/55/v73v5ORkYGu6x32nTp1KgAjR47sUL+orKyMM888kxdeeIHm5mYWLlzI9u3bmT17dpdt6Mq
iEqSgoIDq6urQ3zU1NRQUFLQ5prCwkMLCwtBxCxYsCAXNxnJ8VlYWc+bM4aOPPhJCRSAQDCp0Q+No474OySslSeJo4z5OK74EVbH1U+sGHn0uVNLS0njyySc7fP7iiy/2yfWdPg23XyfN3vHW3X4dp08jL82R8OsGl1v/+te/ZtasWSxZsoR//vOffPjhhx327Swz6759+5g2bRoA2dnZLFy4kA8//DAmoRKrRWXmzJkcPnyYiooKCgoK+POf/8wTTzzRZv/hw4dTWFgYcldt3rw5VNgw2vENDQ2oqkpWVhZer5dPPvmEZcuWddlugWAooBkmTp9GpsPWK5MlQd/h0Zy4/c6I73KP34lHc5Kp5PVDywYmQ65ATqbDFlGkAKTZVTIdvatyL774YlavXs3bb79NZmYmiqLEVfV53759zJ07N/T3vHnzeOihh/jBD36QsDaqqsqDDz7IrbfeimEYXHfddUyaNAmAZcuWsXr1agoKCli5ciV33XUXmqZRXFzMww8/3Onxe/fu5d5778UwDCzLYsGCBVx88cUJa7dA0F/0RGQEY+Z2HGsMTaISGTMn6HtSbZmk2TPxaK0dt9kzSbVl9kOrBi6SZVlWfzciHoJBNz0Jpl2/q5xtRxsw9JMVgw3TYnZRXq/FqCQ7LperTbBsfxLv8+wNRKDgSURfnKRDwGQCREbwfaSE7T8Q3kcD9XvRW5ar9v2x7fBfQzEqQSzLYnLBmYM+RqW7wbTh43o4Q86iArBoejEAnx2uDb1cZhflhT4XCASCWAgG5iuyFLLUxhOYHx4zF05vx8wNRbojKnsiamaNDeQCO9q4D4/fSWrYqh9BfAxJoSLLEotnjuGisbmYql34hAUCQdwkQmT0V8zcUCQeUZkIS5ksKcwet4DTii/BozlJtWWKANpuMqRHZ5sik5fmECJFIBDETYPbR4Pbh2F29J4HRUZX9HfM3FChK1GpGWabz4OiBmgjajaUVsR9bVWxkZmSJ0RKDxAjtEAgEMSBaVqs31XOf3+8j20Vx/noYA17a5oJD/eLVWTYFJmSUbkdxI5hWpSMyhWTqAQRtFxFor2ojFfUCHof8SsQCASCOAh3IRTlpGNaFlUtHvbVtgDxi4xF04uZXRRYqhocTEXMXGKJx3IVj6gR9A1DMkZFIBAIuoNuWm1m21NGBMp91LR6qWhyM3lEVtwiIxgzd9W0IpFHpZcIWq6ira4K7++euONELpzeQQgVgUAgXrAx4tLMNsGvkiQxtSCbScOzaPL4+e5XplCQmdqtcwdj5gS9Q1A8hgfIRhKV8YiaIKZp8UFFC2/V7xK5cHoBIVQEgn4gWYSBSDYWH+k2OeJsW5El8tMdQmgkMfFYrmIVNUE2lFawt8FD0Wi6tUxd0DlCqAgEfUiyCYOe5gEZaqiyFPdsW9B/RJoQxGK5ikfUBINvZUnkwukthFAZ4Lz77rt88MEHtLa2cv3113P++ef3d5MEnZBMwkAkG+se8c62BX1PoiYE0URNuACKJfhWWNp6hhAqA4RXXnmFp59+mvz8fNxuN3fccQeLFy/m0ksv5dJLL6W5uZlHH32020Jl06ZNPPHEE5imyQ033MBtt90Wcb+WlhYeeOABysrKkCSJNWvWcPrppzNv3jzS09ORZRlFUXjjjTfaHBes+VNQUMD//M//dKuNA51kEwYi2Vj3CM62L58yiqoWNyOz0qIGXwr6h96aEEQSQNMLs0m1qTRF2F/kwkkM4teVQB555BFKS0upq6vD6/VSXFxMbm4uTz31VEzH+3w+NmzYwA033NBhW1lZGXfccQc33XQTO3fuZNmyZSxevDi0fd26dSxdurRb7TYMg0cffZTf/va3FBQUcP311zNv3jxOOeWUDvs+9NBDXHDBBTz11FP4/X68Xm9o2/PPP09eXuSKoC+88AITJ06ktbVjka6hQrIJA5FsrHuYpsWbuw9RWl1Fq89Gqt3RK+67ZIljGmj05oQgkgDaeawJzTAxrY6
5cIQ7MDEMaaFimBpOb0PCUhvfe++9ALzxxhscPHiQu+66K67j6+rqeO211yIKlX379nH55ZcDUFRUhM0WaK9lWaxdu5a5c+cyffr0brV7586dFBUVUVwcMF1fddVVvPfeex2EitPp5NNPP+WRRx4BAkUDYykcWF1dzQcffMDtt9/Ob3/72261cTCQbMKgO6sbhjqWZfLqZ6/R4jlIYYoXw5GKxxjJtqMzgMS475Itjmmg0VsTgs4EkCzJTMpJwXPiGsIdmFiGpFAxLYPPj7xDef0XeA0XaWHFomRJSfj1NE1j1apVHDlyBNM0Wb58OSNGjOC+++5DVVVM0+SJJ57gV7/6FV9++SW/+MUvuOOOO9qco6ysjPHjx2NZFi+++CI/+MEPAPjd737H5s2bcTqdHDlyhJtuuqnNcUuWLMHlcnVo0z333MN5550HQE1NDYWFhaFtBQUF7Ny5s8MxR48eJS8vj/vuu4+9e/cyffp0VqxYQVpaGgD/9m//hiRJ3Hjjjdx4442h49asWcOPfvSjiO0YSiSjMBgq8RaJsk5U+Utx6YdRJAmQUCQvGepBAHYcK0mI+679rN0wLTYdqsUwLa4rGdujcw8FemtC0JkA8mgGcwszuGDOTGEF6wWGpFD5/Mg7lNV8BhZIsoRHaw38Db1Sfvu1114jNzeXNWvW0NjYyM0338ySJUs47bTT+NGPfsRnn32G0+nk9ttvD7l4wqmqqsLlcnHbbbdRU1PDlClTuPPOOwG45ZZbuOWWW6Je+6WXXkrYfei6zp49e1i5ciUlJSWsXr2aZ555huXLl/Pyyy9TUFDA8ePH+da3vsWECRM466yzeP/998nLy2PGjBls2bIlYW0ZqCSbMBjsycYSaZ3QDY0WswoLaHukRKpyjGrX1B6778Jn7ZZlsa+2hWqnB80w2Xq0AQuLa2eOTUrLSrK4qnprQtCVAEq3WSIXTi8x5ISKbmgcbdyHJEltanNIksTRxn2cVnxJwotHlZWVsXXr1pCVQtd1LrnkEl5//XVuvfVWMjMzQxaSaMefeeaZvPDCCzQ3N7Nw4UK2b9/O7Nmzu7x2LBaVgoICqqurQ9tqamooKCjocExhYSGFhYWUlJQAsGDBAp555pnQOQDy8/O57LLL2LlzJ2eddRbbtm3jH//4Bxs3bsTn89Ha2spdd93F2rVru2z7YCRZhcFgfcEmMqjSozkx8SJFqDyiSF4yHFqP3Xfhs/Z9tS1UtXiQpMDz8esmW47Uo8pyUi0dT6QYTJTY6Y0JQVcCSPXXdfvcgs4ZckLFozlx+51IUscfkMfvxKM5yVQiB4R2lwkTJlBYWMjtt9+O1+tl3bp1bNu2jTPOOIM77riDt956i//93//lzjvvxDQ7Frzat28f06ZNAyA7O5uFCxfy4YcfxiRUYrGozJw5k4qKCioqKigoKODPf/4zTzzxRIf9hg8fTmFhIQcPHmTChAls3ryZiRMn4na7MU2TjIwM3G43mzZt4rvf/S4AP/zhD/nhD38IwJYtW/j1r389ZEVKOINVGCQTiQ6qTLVlYpNSMRX/icJ0YYOVlcL0wpE9Fp3BWbthWlQ7AyIliF2VSbWpSbd0PBFiMNFxOb01IehMAG3fLoRKb5Ec3/Q+JNWWSZo9M/I2eyaptsjbesLXv/51Dh48yM0338zXv/51Ro8ezYwZM3jqqae45ZZbeOWVV7j55pvJz89H0zQef/zxNsfv27ePU089NfT3vHnz+PDDDxPWPlVVueeee7j11lu58sorueKKK5g0aVJo+7Jly6ipqQFg5cqV3HXXXVx99dV88cUX3H777Rw/fpwlS5awaNEibrjhBi688ELmzp2bsPYJBN0h0cXlVMVGplKIQ5FDg56FBVhkpU7gmhnje9rk0Kzdo+ltqvSalkVBRgqKLCWkMJ5mmDS4fT2uBJyoSsNBsQNtM7tuKK3oUfuCE4JEibqgALr/0pncc8kM7r90JotnjklKV9xgYshZVFTFRlH
ulFBMShDLsijKnZIQt8+1117b5m+73c5jjz3WYb+XX365w2dvvvlmh8/aWzfOOuss1q9f38NWtuX8889n/vz5Ebc9++yzoX+feuqpHXKkZGdns2HDhi6vMWfOHObMmdOzhgoEMdIbQZUF6nRGDivkaOM+3H4ndjWD4rypnDHusg6ZScOJx6WxaHoxhmmx9WgDft3ErsqMykwNFUDsSUBooi0XiVhhk2jLV1/EygiLaN8y5IQKwKyxlwFQXv8FPsNFatiqH4FAMDjojaBKSZKZPW4BpxVfgkdzdpnaoDvCQJYlrisZi4XFliP1pNrUUPt7GhCa6ERoiRCDiVpOLJZ1D16GpFCRJYXZ4xZwSv45SDYzYXlUBIJwkmUVxFCmt1ZZBdxAXcey9UQYXDtzLKosJ6ztvZEILRFiMFGWr2QqTyFILENSqARRZBvpKen93QxBOyzLihjsPFAQM7v+pb1AvGpaEeeOGw6Q0HiFWNrRE2GQ6IDQ3kqE1l4MOlSFScMzuWLq6JiOT4TYSbbyFILEMiiEiizL+P3+mLKkCpIfwzAG9LMUM7v+ob1ATLWp+HQDhyrj0Yw+F4yJEgaJioforURoQUF1xdTRvLbjCPvrW9hd1cTB47tj7u+eWr6SrTyFILEMCqGiqioejwe3242iKDHPxjVNw+/393LrBgbJ0BeWZWEYBoZhoKoD86spZnb9R3uBuLemmcpmN6Oz05hakA2cFIx9kcdmqJVM+MveSsrqWrol0HtqPUq2vhYklkHzxszMzMRut8flMjhw4EAvtmhgkQx9IUkSdrudzMzELxHvKxK9JFYQG+0FYjAPiSJL1LR6McxAckdZgle3H+anf9/Jo+/tZs27u1i/qxzTtDo7fbcICgPD7FisrmRUbr+VTJhdFIitCX5PExGzk6hlyt1dTpyMfS1IHANz2hqF7szCB7KLIdEM1r7QDS2mFRqJQMzs+of2pn+/YaAZZiijq98wSJUD2V7Lm1yMzUvvsVsulmDpoVIyIRlcL8nW14LEMaiEikAQTrD4ZDDnRW8Xn4TkLDw4FGgvEO2KEupruypjV5SQlcWhKtiVk88/XrdcPMHSQ6VkQqIEek8mFcna14KeI4SKYNASLD4pSRKS1PvFJ4OImV3fEy4QIWBRGZGRQlWLh8KsFPyGgWFa+HSDsbkZHVwU8cz6uxMsPdgThPVUoCdyUjHY+3ooIoSKYFASXnwynN4sPhlEzOx6l2gul4WnFrHlSD1byutx+XXSbDKaYVLr9HKk0UWaTUFxN1L5AAAgAElEQVSVJCYP7xgDFeusXzctESwdhZ4I9P6aVAgGBkKoCAYl/VF8sj1iZpdYunK5vPXFUWyKzPnjR+A3DA4db6WqxUNBZgrj8zOwKwr7apv5oraF6YU5ofPG45ZzaWa/x2IkK90V6P05qRAMDIam9BcMevqj+KSgd+mscF34qhNFlrArCnUuH4osUdvqxa4oKLLEqQXZKJKEYVrdWvWSbpNFsHQXxLtyJzipiLjtxKRCMLQRFhXBoCS8+GT4TC2RxScFsdPTlVddLX89d9zwDqt+/LqJTZHQjJOrfiRJYkxuOt87fwo2RY7bLafKkgiWTjDBSYVHa+24TUwqBAihIhjEBItMHm3ch8fvFMUn+4FEBUl2tfwV6LDqx67KWJaFTZHbrPJJs6s9SqUvgqUTi5hUCLpCCBXBoCVYfDLWSrfJwGArZJioIMmulr/mpTnaWDoUWaIgIyWUmTZR1YdBBEv3BmJSIegMIVQEg55YK932J4OxkGEigyRjWf7a3tIxZUQW4/IycKhKr1g+Ehks3ezx82V9C6cMyyI7dXAmXuyMgTipEPQdQqgIBEnAYCxkmOiVV4umF2OYFp9W1GOYkOFoKzyiWTpitVL1hzXL7ze46fcfsb2yAa9mkGJTOH10Hi8vvQC7vXeSEiYzA2FSIeh7+lSoaJrG/fffT2VlJX6/n+985zuccsop3HvvvUiSxKRJk1i1ahW
yLMyogthJBndJT9owWAsZJjJIMmhx2l3dhG5YqIrE9MLsiBan9paOriwf/WnNuun3H/FZeT2yLOE4EVPzWXk9N/3+I/7wrYt69doCwUChT4XKhg0byMnJ4fHHH6epqYnFixczdepUli9fzpw5c3jwwQd57733uOwy4ZcUdE0yuEvatyHFpjBpWBY3lIzFYYttRpwMdVJ6g0QGSYZbnDJTAsftPNaEKss9tjj1lzWr2eNne2VDxLT72ysbaPb4B5wbKBkmDYLBR58KlQULFjB//nwg8LJSFIXS0lLOPvtsAObOncumTZuEUBHERDK4S4JtkCUob3RR7fTwzr5j/HnPUW48fVxMomkwFzJMRJBkb1qcIlVd9hsGdkXpdWvWl/UteDUDh9rx/F7N4Mv6Fs4oHtYr1040yTBpEAxeJMuyEl/fvAtaW1v5zne+w9e+9jUeffRRPv74YwA2b97MH/7wB9auXRv1WJ/Px+7du/uqqYIkRTctfl1aB5G+vRJ8e/pw1AS8IHXTwqWZpNvkDucLb8PhFh/1Hp3wPUqGpzItP42LirO6vM4HFS3sbfAgh1keTMtial5qTMcnO6ZloFteVCkl7totzT6DF/bU41A6Pk+fYXHLtGFkO7oXzxE8t12GIy1+jnt1dAtUCbLsCvfPGUVuSuLnc7ppUePSuP29w0T6llrAy1dOJCMJ41Qi/SYG+/dX0DfMmDEDh6Oj9bjPg2mrqqr43ve+x5IlS7j66qt5/PHHQ9tcLhdZWbF9qaPdUDxs3bqVM844o0fnGCwMtL5ocPvIqd4d1V0yafqMHrlLPv3sMyodIzqdIQbb4FAVytw1ZIWFXGiGxfCC4bTaVE6bNbPLWfnppyfvjLS/vxuaYfKRc1fU7RfM6di/sSaYC557b00zXtVNZmbYiiLLojmjkEtLxoY+62lftLU8SOSkpVLn8pIRZjUzTYszxwzjwnPP7vZ1eoP2VpOm47VcccZ0rpg6mrfqd1M0uuMxrRDT938w0N+/k2Qi3r7oygDRp0Klvr6eb3/72zz44IOce+65AEybNo0tW7YwZ84cNm7cyDnnnNOXTRIMUHrbXbLxqJNmh9qpWynYBo+moxlmm5exXQ0kGYs1xkTk5ohOPJV5400wZ1Nkphdm8/6X1W2sAZYFo7LS2F3dxKIZxQl7Fu3dldeXjOG1HUdo8PhRJJkUm8KZY/J5eekFCbleImnf9iYr8Jto9emDMsZKkDz0qVD51a9+RUtLC7/85S/55S9/CcCKFStYvXo1P/vZz5gwYUIohkUg6IyelpXvDM0wKWvyMmpk5zERwTZ8Wn683WBpMSozNfRCj0c0iUKGkYk1G2x3EsxdNLGQl7YeotHjDwnOkVmpTBmRldCBNlKsjaIofH32BDx+g8Uzizm1IDspA2g7ixPaX9+CQ43sohroMVaC5KBPhcoDDzzAAw880OHzF198sS+bIRggdLWCoLdSmTt9Gj49cuhW+4EreK2Dx52UN7lwqAqjMgODnKj/kjhisTh1N8FcdqqdktF5bQJpgwNyIgfazlZ3WVhMHhE52VsyrKSJ1nYJHcNwMXlEAV/We0T9I0GvIBK+CZKOWFcQ9Ja7JNNhw6FGjgtJs6ukquD0NoTiHxbPHMMVU0fzfzsOs7/OiU83kCSJ2aNzuxRNPS3WN9TozOLU3QRz4da5VNvJV2KiB9p43ZXJtJKmY9tNivMOMTxlJ4rkIVvJ5dT8kZQ1TsLtN0X9I0FCEUJFkHTEu+w40e4SmyIzOSeFZtNqN0M0ODX/S94p/aBD/IPDpvCNMyfGPPtNVLE+wUl6kmCuLwoNxuuu7M7y+94Svu3bnm3bRZqjCllSsCkyXr0VySrjyslZTCy4WMRYCRKKECqCpCJZsrTOLcqk0pHXZuA6Nf9LJLMMjxY9/iFW0RQtlsK0DKaMPFdYWLpBTxLM9dQ6F6tAiFUQxfs76I7wjSaqo30ebOPOY3U45GOAhE2RSTkRnyJJElVN+zh
9zCWoQqQIEogQKoJ+IdrLMFmytMpS24ErVYV3Sj/Ao/W8wF6kWArLstANH3uObebL2m2k2bMYmTOF8SPmkp2SImanMdLTBHPxWufiFQixCqJ4fwfxBBFHcyktPLWIt744GtXVFGz7hRPS+csuDVOXsbXLvtxVDadkiLcRDDyEUBH0KV353WP14/fVCy84cDm9DQkrsBcplkI3fOimBifSfzW6m6l1/pMPvqzGbZ6eNDlVkp2+rsLbnVVG0LUgiieeJd4g4mgupS1H6rEpcpeupsyULDIcWbTojR3aFs3FlkzxNoKBh5C0gj4l+JIE2rwMN5RWACd94YbZdtWNYVqUjMpFkSTW7ypnzbu7ePS93ax5dxfrd5Vjmr2bYDkY/xBxW5wF9tqfy7IsDNMAAoOdT7fQDBOQyLJXI6G36aNEoRkmDW7fiWsNLlTFRmZKXtwiJZ4+6Uog6IYW17XD6ep3EC7Og8I3EkERHSSaSwlgS3l9h8+Crqbw/gi62Kx2aaE7c7F19bsXCDpDWFQEfUasfvfO/Pj9Vd8nkQX22p/LwsIiIEwUScWrWwQtK4rkRZa8KHJGwmJ0xOy2I93pk+6uMoqVWONZ4gkijuZS8hsGLr+O3zBIldtui+RqmjX2MqqqqzFszW1cbNOLLqHB7Wtj6UyWuDPBwEUIFUGfEavfXZYlrppWxLnjhgOQl+bApsj9/sJLRIG9SOdy+1qQJRVZklFkOxY60gmhYlgpmFYKkLgYnWQo5phsdKdPerLKKBZijWeJR0RHcynZFYV0u4pd6RhXE3Q1tXW3KhTaZlJy2ml4NCcOJYO391bzp/f2dBB6yRJ3Jhi4CKEi6DNi8bt3NrPt7xdeIuMf2p9rb9U/OVC7DenEfwEsPMYorBM/00QkH+up2BuMwZDd7ZNEWtk6I5YA31hFdLQl0gBzxnSs1GyYFqePzuPPezoG2Y62rICLTclj/a7yqELvqmlFg7Y6uKBvEEJF0GfEkkei/QvPMC02HarFMC0WzShOihde8OWcyHOdMW4+iiQHBhqtGZ9hx2OMolmbCSQu+Vh3xd5gche1F1vx9kn48Ym0svWEeER0NJdSpFU/s4vyMC2L7UcbO4iQAz4nZ50Zm9DrrXIXgqGBECqCPqUzv3v4C8+yLPbVtlDt9KAZJluPNmBhMXNkDp9XNvbJCy+a9aA3kmqFDzRun5P3vmxiZ5XzRB/JCUs+FotVK9J9DwZ3UTSxdcXU0TEJYNO0+KCihbfqd7UTa/P7bJVRV8QiojtzKbX/HGDNu7siipCyJm/ou9KV0OuLhHqCwYsQKoI+pbOXZJPbH3qJ7attoarFgyQFLDF+3WTLkXrOGzeC2UV5vfrCCx+QPH4fGQ6NKSMKuHDiSI7Uf8Sxxn24/C2k27MozktsNllVsZGVlsdXT8tj4fTEu1k6s2pFM/EXDJJgyM7EViwz/g2lFext8FA0mohiLVFWtr4imksp/PMGty+qCPHpVuj72ZXQE9XBBT1BCBVBv9D+JakZJpph4lAVDNOi2hkQKUHsqkyqTWVnVSP3XzqzV194gQHJzfRxFRSmHMOy3ByptfG7WpV0uxsI5Jpw+5todG/BtODM8dFzZnSX3qqkHG12G83Eb2ttwZ2SM6CDIbtyT9w7bwYQfcYfPF4O+1J6/T5cfif/OuIfMGItXjoTIQ5VCv3+YnXtiOrggu4ghIqgX2lvji9vdOHVdfy6gf1Eam7TshiVmXpCHJwcGHvjhRcckMbmHyFDrUU3LUwL0mwaDsWFaclopgQmqLKEblrsqdrFrDGxZ6btbyLNbiG6ib+81ceozMgWo4ESDNmZe6LVp3O02cVV04qiCuDg8QCGoVHduIninEYKs3VafCoPbNjGj6/8NimOjtWPBzKdiZDJOSczJgvXjqA3EUJF0K+0N8dPGZFFaXUTTp9OliRjV2VGZaYyZUQW0PsDo9On4fH7KEo/DoBhWUiAhIUkgYwJBMQLBPKd+HUXTm8LmSl
ZSRGnECuxmvg1AyYNy6KsrmXABkNGsgwE46BqWr2s22SRmWKLGiAcPL7Rsig//jGzCutAkrCALIdOdspRfvLX37Lmmtv68K76hmgiZLRPD+0jXDuC3kQIFUG/EckcL0kSM0bmIsswOiudDIcttL0vBsZMh40Mux9Z9uMzJAzzhFCRwLIC/5cA0wrKFNBMB7srP6HRdbDPKyEnarlwVyb+G0rG8pe9lQN2xhzJMrCvtoXKZjejs9PITAkIy2gBwsHjPyotY+7kJmif5M2CTHs1DS4XeenpEdsQ/qwkjAEjaqOJkK1b6zrsO9RcO+GB9YLeQwgVQb+gGSblja20+nQyHB2/huNyMpkxKoeDx1v7dGC0KTJuzY7dUslwGKGMJqYFuimjymZAoEjB3LEWYKOyYQeSHF+9l56Q6OXCXZn4HTZlwM+Ywy0DrT6dmlYvo7PTQtY66DxA+Iqpo/nvf2hkpZysyXQy4w2k2/zsq63m3PET2xzX9ln5GZn+BXkptdgVb5+K2p4SrwgZjDl3gkQqRqlo2ZjWrKR/jgMRIVQEfUr4S9vp1dhW2UBBRgpTRmS1SZqV5lD5+unjAfr0ZacZJnbVQW1LLhnDT84YZQk0U6bRm44qm2TYNQwrBbdeSF5KLZYkIZ0ofWICMvFXVY6H3lguHIuJfyDPmMMtA+WNrSF3T3uiBQi7NJ2C1HScPhtZDr3Dca1+G1NGFHb4PPxZjUz/ggz1IH4dLEtGknpf1PY1gynnTjQiFaP06w18fuSdQfMckwkhVAQxk4gZUvhLOzPFRkFGCpXNbgCmFmQDHV08fTkwOn0aHk3H6ToFX342KUoVkuXGpdk41JRHlWsqKarM+FwFm5qObrrJtB3Cf8JNBIFZuYSER2vG7XOSlZbYZau9VUogHhP/QMamyIzJzYgoUiB6HFSmw0a63U5Fcz7TR1Rz0p4CYOHURnZw+4Q/KwmdVOVY6DjNsEhRQULi8PG9nDrqYlLtA1MEhjMYcu50RtRilL08ORnKCKEi6JKuZkixJkCLNMAGze41rV6KctLJcHTPxaMbGq2+BrAkMlJyI7YjlnYGYzWakGjRZ+HUZyBLXjTDQUaaxO8Wl4Symf5jfxW7jlkYVhqW5ca0TppUVFnCZ9h578smvnpaXhuRBz2zEvV2KYGBbDWJlXiW1IYfMzknhQb1XEprN1OcfZxMh47Tp1LrGs7j1/x/HY4Jf1ay5EWRvJx0Glp4NAPdtLB8jTzxwWfMGFk8oC0P/V2Pqy/o7WKUgo4IoTJISWT21GgzJMsyGZu1t42ftjN/e6QBVpIkphZkMzo7jdvPm8SY3Iy4XmSmZbD98N/YX7MVn+4BLBy2NE4ZcSazx12OLCkR/cnR2hkcwI5WVgJgoWJYGeimyfTCLGyKjE2RyXTYKK1uRpZtuPVCHPKXBAeggGAx8Rij2HHMiWEdprS6GbdPp7zJBUBxThrpjuirTDojlgRbgq7pzpLauUWZVDpGsOPYXBrcHuq9Ls4eW8SK2ZMjPsPwZ2VaKRhW6gmxwolA7UDVbNNKxbRSBrzlob/rcfUFvV2MUtARIVQGGfEMyrHQ2QzpSP1GfO7ymINIOxtgM1NscYsUCPiK91RtxjBPBjh6NTdfHPsEWZKYPW5BW38yEi5/K2XV0du5aHoxBw4coBVw+TQqmgKuKd0wWXN8FyWjcjl//IjQC7nGMw2Px8WY7OOkqH48mh23XIzLnMnOYw24/BoZDhvlTS6qWjxYBCwvUwuyuzUwdccaEAuDOfgxEt1ZUitL8R3T9lmpeIyRZKgHw/aQCC8+qcgMaMvDUBDRUYtRkthilIKTCKEyyIgU5NWTYL3gDMmhKvgNA7uihPztDvlYIIg0bH9Jiu6nTfQAqxsaFQ17MU2DtvECYGJQ0bCPaaPmBvzJSHg1A82wsLCQkNh9bBczRs/DbmubpEuWJS4qzuK0WTN5dfthbIqCXT3Ztm1HGzBMK/RCtis2/lV3CqV140hR/Xh
1O+eNHwVAk1cj1aa2ybYrIVHT6mXS8Kxum8RjsQbEKjyGQvBjZ3TH1RXPMeHPqsp1KiPTIcdRg2W1YliONsUnYWBbHnpLRCcbkYpRZpjZfV6McqgghMogImqQVxTx0D5uIhLpNpXyRleoOKBNkSnMTGV6gYxN9iLT8djO/LQ9yWDZfuD1aE5c/hasE+bzcCzLwq210OSpwe134tVMdDOwX1Ba+XUXf9pTxnUlM6Je88BxZxuRAgFr0u7qJqYXZrPzWBOKLFGQkcKxFje6P5WRWYEsuq0+jeyUQB4Yj6aH+i9wbRO/YZAqq90amDqzBpimxR93lfNpRT2GCRmOzoXHYA9+7As6E4Udn9UZ6IbGEx98hmmlYLV7DQ90y8NQyFIbqVr1js93iqXJvYQQKoOIWIO8Is2gMzwtnH661WEg+8veSgzLwrSs0Au4qsUD2BhzSgZIvg7X6sxP2x1ze7QZ/5VTC0m3Z6Hp3pA7JYgkSaTaMtl40EejR8E0/YHMspKEKgdyzRpWCqXVPhbNMCO2oSt/+0UTC1FlmR3HGinOSQttG5MTWPlxztjhpIesLkqba9hVGbsSeKn1ZGBqP7M3TYsVb2/nk8O1GKYVEpa6YQIdhUcswY+C6MRjjQp/VjbFwYyRxScE4sl9BoPlYShlqY2lWrWg5wihMoiINcgr0gx6b4OHDaUVbQay4CA2rSAbRQq4K/y6ecLCoDK1YDoH67e29dNasflp4zGddzbjH5M3lRbv8TYxKgAyCgeO5/Lm3sPMHJHJlGEu5BPbdRNUGTzGKFx+olozuvK3Z6faI9bMCX85K7IUanthZmooRiVYuygRA1P4bP7N3eVsPlKHLEnISuB+A8ISVEXu4GKKJfhxoNObsTc9sUYNdsvDUFg9JugbhFAZREQN8goTD9Fm0LLUMVYifBCbWpDNpOFZoTgVn24wofBMVEVq46cNBu72hPCBRTNM/nmkLmR9CBKc8V9xySVYltlh1Y9uTeK321JwaW7Kmwpo9PiZNryFnFQDj2ZHVopp0WeSZpejWjNi9be3fyGH/zt8MBqdlYZPN7ApcsgC05OBqf1sPsWmsOtYI7rR1kIkSVDt9FCUk95BlA3m4Mfejr3p6VLcoWR5EAh6ghAqg4xIQV7h4iGe5YPtBzFFlkiVA3+n2VWyU1I6+Gm7sqR0NrsNH1hcPo3DjS48mk6t00u6w9Yhg63br+Pym5wx/kpKxlwWyqPisGVz8+830+ytPxH4K/PewSLe/VIjL81geEYeX5kwErsiMbsot9PBoaezXlmWWDS9GMO0+LSinonDskizK0walsUNJWNx2Lrv024/m/doOsda3Lj8BsPS285kNcNEkekgPAZz8GNvx94kaimusDwIBJ0jhMogo32Ql01OQTO9mKaJrCgdxIdhWvgNA9PqOIOOdRCLxU8by+w2OLDIEvzzcB0VzW5008JvmIzISMEwA3EWwQy24e1VFRs5aQUA1Dg9VDs9bdqcZlNwa1DdKpOdEogZiUVwJGLWG7zvVNvJfi+ra+Eveyu7PWBGms3bFYVUm4rbb2CYVpttiixxVvGwiG0fTC6IoBBOUZVeTzw2mK1Rg5FE5paKlaG25L+3EEJlkCLLMvur/xUxn0rJqFy2Vhzny3pnaDWPx93KsBEGSrtA3EQNYp3Nbq+aVkSD2xfavqe6ifJmNxKBDK9+w6LF6wdAkWUmDQ9ks+1sxi8DGQ4brWExFmk2Bbsic8tZE7ntvClxvTi6O+ttLyiCwtCuKD0aMCPN5oNxMD7dYFiGgyaPhl83URWJ88aN4KtRRNFgcEG0F8KSFKiOPKMwB800Q8vqIXHLf8OFvCobyJIX00pBN5UBb40aTCQ6t1RM1xziS/4TjRAqg5TO8qksmj6fLUfqqXJ60A0Lu6qQnmJDlaUOAbWJGMTCB+vwgVqW4JXth9h0qBaPZrCjsoGR2Wkca3ZjWYHYCgCHopBuV3H5ddJ8On7D4Jyxw6OKpbw0B0U
56cgtgURtrT4N07KQgPG56dx6zuQ+GUR0Q6Om5Tgev48Um519tS1tlnnnptpp9vgZlpES97mjzeaDJQkmDc/C7ddRFYkzi/O5dubYLl+QA9kF0V4I64bJ/roWDtQ7yUqxY1flkOswVmtHLLPhhdNG4fH8kxbPIWTJjWmlkZU6noXTrk30LQq6SaJzS8WCWPKfWIRQGYR0lU/l1FEXk2JTmDuhICQaaqqrUBU56iy/J4OY06eF0seHD9Q1Ti9On8aorFZSbSpNXg3DctHo8iPLEFxxrMgSIzJS8RsmMwqz+fdzJweESJSB16bI3DBrLK9sP4wiy2Sl2JAlKMxI5aYzxscVF9Id0237Gdz4LBtHW/KobikCSQ6dp8Hj54MD1VxfMi7u60Vzy5kW3Hj6uAFtHYmXSG6wL+udALT4NHLT7FiWxbEWN4Zl8bVZYzvtm85mw+3ZWfEuklVGToqEiR0ZHcsqY2fFu6KKbhIQb26pRDAU6h31NUKoDEK6yqdy3HXyBRwMjjUtC4+moxtWwrNiZoalj5ekwCBb1+rlWLMLmypjV2UkKbA6qdmr4TNN0mwqLp+GJElkOGwcd3upcXppdHnZU9NCcU4aX5s1jmtmRDalLp4xBlkKLA1u8vjJSbXH5bIyLYv1u8q7Zbptn7LfofgYmVHOrEKNz2smhs4/KiuN0upmrplhokhSXEnaoHO3nCxLA9Y6Eg+aYVLe2EqrTyfDEfguB7MAD89IwXR6MEwLAwtVlqhudrPzWBOflh+P+kw7mw2Hf3vaD4LBoUdU0U0e+qOA4FCod9TXCKHSB/R1QFVX+VTy03NJs9cAAXGwr7aF/bVuUlrrsKsy7++vZuG0IlyaHlebu7rPYOp607SodXrw6gaaaVHR5CbToTIs3cFxtx+HKpOmysiSDZAwTJN6lxeHqpCX7kCRA7lBXtl+CEmKbErtqctq41EnzQ41btNtaPAKS9lvWhamCYUZxzGrxqAodkZlpjJlRMA90+zx88QHe2JO0hbrPfZH8GBfEW71cHo1tlU2hFw7fsM4mUU5K41zxw7HsEwOHm/laJMLv2FEfaZdzYZH5p1MLCiq6CY//VFAUARZJx4hVHqR/gqo6iqfSqrdEXIb7K9r4diJWA5VlhiRkcL/fX6Y13ccYUxuekxt7uo+nT4tlDekptVLldONTzdRFQVVkrAsC6c3EPSanWLn3HHDmV6Yw6GGVpxenX8eqcXl07Cp0Oprwq3bSFHtgMX2yoZOTandcVlphklZk5dRI+M33QYHL59uohmBlP2yFBAVGXY/RdkyE4ePCA2EaXaV9/ZXxZWkrat77I/gwb4m3OqRmRJYul7ZHPgeTxoeqHJtWoHEenZVxjAl6l0+HKrSJidP+2fa1WzYpZmhv0UV3eQnltxSiWYwL/nvL4RQ6UViDajqDYtLV/lUFk0vRjdNPj4UmMUjwehsO3bFRXmjhSWpodU1XVkSurrPTIeNdIeNqQXZTMjP5EPdwG+YeDQDv26ccJFAq18nJ9XOOWOHc13JWDTD5NDxRioajzE1/yin5LeQaddw+m3sr8/mX0fH0uTxJ9yU6vRp+HQr4rauTLeptsAA1eprAoJWpEBCvVbNRq1LYuLwwL6GaXHaqBw+qziObljYlJMvtc6StHVFfwQP9iWRrB7BIOKaVi9FOekUZqZiWFboc79h4NMNxuSkd7CWhD/TrmbD6baT34v+GAQF8dPVu7A3GExL/pMBIVR6iVgCqhRJ6jWLS6SiWeEvTlmWmDdpJFuOHEeRTdKNQ4zI+RLDcDFjmJ3y5nz8Ri6psr1TS4JmmGw72hAIyuXkEtD2xwRnGIYVsDJkOmxYlkWqKiNLErppYhowa3QeX505BtMy2FXxDuUN+7h8fA2ybKKbMn5DIdOuM3tUPZIkkekYl3BTaqbDhkON3P9dmW5VxUZ+5ilUtWzBtAitXpIlaPYV4NEkmjx+hqUrlIxycP74fP55uB67KmNZbcVRtCRtnZG
I4MFkzzcRyeohSRJTC7IZnZ3G7edNoig7nb/srQz9thyqwpic9JBwCSf8mXY1G1b9dW2O7Y9BUBAfXb0Le+Wag2DJfzLRL0Jlx44drF27lt/97nccOXKEe++9F0mSmDRpEqtWrUKWB/4DjSWgauOBml5fwtZZMrZMh40Mh0q2bQdpchWSpKAjkWrTmDysCq+xh1ZjVps2ty+A9/utB/n7vo2H2WYAACAASURBVEoUWcahKm2yx4YfE5xJbDvagITFsHQHOal2FAmqnV4M0yLDIZOiKPxxVzljs77gy9rPMC2QZfNEEG7A7O43FEBi8rBmRuWnJfwFYFNkJuek0NwuaVqsptsq16nsqz9CUdZxUlQ/bs3O0ZZ8at2TuWRyFpdMOEZj6wE8Xif//DKTosxcjmWMobLFixwmMDpL0haNnsRNRHMZWVZ+zNePl+64RzuzemSm2BiTm4FNkTsMFH/ec5RPy49jWEYor0qkZ9rZbHj79rZCJXwQbHQ30eiRKcrJHjQutsFEfxQQHMhL/pOJPhcqzz77LBs2bCA1NRWAhx9+mOXLlzNnzhwefPBB3nvvPS67bODPRroyIfdF5syusCkyp43MpLbxGFgB94tFIPBTAjArKKsdw6ThuR0sCcEqvZsO11LV4kWRJdLtapvsseHHhM8wJg/P4ouaZuyqzJ7qJjTDosnjQ5YlNh+pY+vRGm6Yvou8VAkJk2AXWYAqm/gNBUmCnBSdq07tnUF0blEmlY68uEy3mmHS4Paxq6qFKvd03j90HPDi9KmYlkK6vZkrp9RwrHF/yC3j1VvJsTdx5mgLi/Ghwo9dJWmLRk/iJqK5jDKtIuCsuNoRpCtLSXfyTQStHgHRYXYqOoIDhWkGgpoPHndytMmFhRRaOdb+mcY7G9Z1k5V/3cWW8npcfp10u8qcMcP46YJZqOrAn3QJBP1NnwuVMWPG8PTTT3P33XcDUFpaytlnnw3A3Llz2bRp06AQKl2ZkL26kRRL2C45JYc3tvnw66CbAddDwFUhkSL5aPI0sadGZsns8dgUOTTw/H1fJZuP1KFIElkpgQywwUq7iiwzIT+T2UV5HV70NiVwrg2lFWyvbKCy2UPLiWXI+WkOJEkizebDMl14dRsOVUY6EXArnWhbqk0GJOxqOtmp2THfazzuBVmKfbAKtwo0uH1sqzgOWGiGjFt3YJogyyBhoGvl2OxtxWmqTWWYXMupBTMYnZ0WV5K2SPfWnbiJzlxGTr0a3dDiMpfHYinpbr4J07SwLDh43ElFkxsJi6KcdG6YNTaqkNxQWsHnlY1timsqUmBZfGf5eGL5Da786+dsOlQbEFs2Bcuy2HSolpV//ZyHF87u8niBQNA5fS5U5s+fz9GjR0N/W5YVejmmp6fjdDpjOs/u3bsT0p6tW7cm5DyRGG1ZHPA5KWvy4tMtHKrE5JwURvt09pfW0nS8jqZIMZsS7C/dhdoHqZZNy8CBHVnx4dFNbCdmppZl4fKr1DdrGLZGhrtTeWL9AcqavHg0k531blq8BnmpKg7AYxp4dJN6nx9Z93FI8VNXW8WfPi0N3ffcosyQa2OkaaHb/eyx+fG4fUhIOFs1TAtcionTr6JIOoolIyFjYQBgnVjerEoWedJwdny+s9P7000Lp99gW42LAy2+Ns8hvD2RiPW78UFFC3sbPMiShGmBx+3iWKufNJtMnl3BkEGRIMvux6+1oKKcuJcTlitAxs/Fua1oeirpNhlVP86nW+txaWbg72grriyLjUfbf8fsTCkootWsRrc8qFIqmUohZn0+W49Hvie/6aLF1xDWojZb+XTbZuxyekz90b5PAJqAo5WVHDhwgIuKA3EizT6DQxX1OJSO1/QZFh9t8ZHt6OhCCZ47S5I4NQM000LRWjl08CDbtfoO++umxV9K60IJBMP5S101I721EftXN62I/R/+vfDqJu+UHol47ndKW1k43CBlEFtVevP9ORAR/XGSRPZFvwfThsejuFwusrI
6BrtFYsaMGTgcPbM4bN26lTPOOKNH5+iKs86MPpOvSimPanGZE6PJP+hugEDq+HjcRcFZb03LaFKk/RiWhCJJ2NVAWliPMZb5Mybj0w0OKbk0O+yMGinh0XS+dNfQpHvwyTby0xykpptISPgNk5FZKYwvyD1xngDNpkWlI+A6Cc60W30S1boNl2kgSeDRDCxA1hUONeZxxqh6FJsNFRXd8GGYBrKkkJeez5i8zpfbhs/od1Q6afBojMxMZ0pRIH4m2J5o7oVYvxuaYbKhbif5I7JCLogGqZFaXz2GIpGRmX5CwFiMynZgSBXo+NFNK5RXxqZI5KRlc+5p56EqtrjiNtbvKqfZobZZSt1sWugZk7h+2siYgwd1Q6Nu5/aILiNLUzhr9rkxW1Q0w+St+l0Uje64rRU4bdbMkHXuI+euqOe5YM7MiMHbsZw7nAa3j5zq3VGtl5Omz+gQexWt/7dv39bme3GgvgVlcx1pEbIduzWDwolTmDis63daLNa+ZCtw19VvJNna29v0xXgyUIi3L3w+X6fGh34XKtOmTWPLli3MmTOHjRs3cs455/R3kxJONBNypKC900blcP74EaGEVdEwTYv1u8t57fMjHXzu0bK1tn9xnIwPmInT6yQttZEU1YfPSMWSinCZM1FkCYeq8P+z9+ZBkl3Xmd/v3rfknrWvXVW9oFf0hgYaBCmKFBdwFbhJIkWKtCfssMMTY9OaiLFC5ijECGkkOUxF6A8rRg57xhHyhIaiAJmiIFEkRwQBriCJBnoH0Hst3bVvmVm5vPX6j5fvVWZVVlVWd1UvYH/8A2Ch8tXLt9zz3XO+850rs/mIUJmaRtKAHVmXG/N2dZZOkLJPGJIhmawjKbCczvd8FXX/xHWJUjC2WAQhMLXAXj6hS348OkhXOs4TmRKWUyCb6KS/dS/7e58iabZsGDTD7wawULYREPnFHOxp2RItkO8rvnb6Bi9cngSWjdoOdbdweSZPvuJQcXxSMZ3+TIJ9XVnGCx3sbLkJSARhCcSj5PRF32kzbe0blU4y8ebEg+u12ma03k2VfZp15rwdv4nNun46XuBnE68SidpZU+H1XdlV1ej6vzI6R8Fy2L+iM6svmyRl6qs6tgBSpk5fNrnepWqKlD5oA+4etPN9iPsf95yo/O7v/i6///u/z5/92Z+xZ88ePvShD92zc7nbO4Ba0V6ubPP9a1NcmFzf3jvE8xfH+NrpYaYKZfTquda6tX7oQA9zxQU6Um3EdJO/v3CDi5MTLFkGCTPGkd5Wzk+EgwLhxsxuKuYh8uUcJcfgl/fswNAknq/Y15XhwsRiddH2aY+d52MHbqKJMotljWvzrbw0PICvIGUaLNku3oqOGYB82eY/nbrGVCHobilUhwUauoblevgqCCIJQ6MtGePqwgH+7YeewvWLm2oprA3gZceNSJ8UgqmlCvu6smhS3LEW6PmLY7w5la9rLQ6N2o70tnIrV+KpnZ0kjMDh1nZ9bizsoyMVI6GNo4kKnopT9vq5tbCvahBH07qNrbbqXqvV1p/dnGB5M86cm/WbaPbYK4PlyMISY4slzGomx9Ak3ek4nz2xu+5dX0n+QufmqaUK3782xaNpxUR8NHovk1XhbKhRCeH5infu7lrzXEM0Q0rvlwF3zbat3y/n+xBvHdwTojIwMMCzzz4LwO7du/mrv/qre3EaEe72DmAlITI0yY9uTEcL5EYvd+hdMlu06jQWQsB0ocip4W8zNT+PFGV8lcDzNaT06I1X8GIJyl4fPx05yNXZInFdY7JQZiFXxNVsCpaH53tU3Al2t6f5zGO7+OihHVyfC9JyLcZ50vp1ACoutMQ9Tu6YIxszeHVyN5qUXJzMUXI8+qo28UIIfN/nHy/eZHKpAhB18mRiBoaUGNXWZCECMW5fNslga5KyC+3JzbUU1gZwU9PqApHt+pHnixAQ12+vjTQMaOFU3vF8CSkEuvQoVOY53t/PrvY0cUOLzuVQTwuO55FzjpN
3DiNFBV/FUegRsQCaJh9bbdW9lt/EWrqWtbCZTMlmO2yaPfaqYKlgoRQMu0ybBroWyErUCnHJSvJ3aTof3VtQ2NWMICy/l//uw4/x+98+U9f1887dXfy7Dz+27nVqJiMGzRPX7cJm2tYfDuR7iO3APc+o3A+4WzuAtQjRRw7u2NTLXbAcFss2tuvXuZkC7Gu7Sl9qBik0BBJdFDB0D19pgIkmKqT16/SnFN96szUwN9M0lhwP1xVIIcgkdN61pxuzShxihsbx/jZO35wmoY0DItJX6JpAk4I97TnOzwQ/TZkajufXlVr++fIE8+XKcquxgopb1aQIQTZmMNSaCtqjFezuSJMwbm8uRm0A16SgN5OoGYgouDG3xES+TEvC4H//3gVO7GjfNCmtDWgHurMIfHpTb9CXniVp2KRiVzjSf5Sjg++naPvR97g2F4jFFTqeSkfHqyUWzZKP7bLq3gq/ic1mSjbjN7HRsVcGS89XTC0FLfRLlkNKD0o1mhCcG1/kY4cHo2tV++yEnws3A4YmMaS/6r3Udcn/9szjlGyXiXyJvmxyw0wKNJcRC/89/B2BGxHcos1d6Q7cTNv6gzCQ7xdNO/NWwC88UbmbO4C1CNFmX+5MzKA1Ya5yMxW47GpfiCzpQSFEUE6Q+Pg1fSZS3WK+aLJY0jE0Sb7ikU0awW8IEZUrzo4v8MED/bxjZxuWfYtCsVylKME1CrshDFkmaTgUHZ2eTILudJzppQo3F4vsak+zWLZJmka9bb4Ay/VIGhqmFpCkcDHURBCEy47F5akZBtu7yMYTTV3nlQE8dCMdzwczhi5OLgZZHqX4wbUp3pzKoRR86ljzpLQ2oAkheNvAKGl9snptNBK6zdXpU0hRb1vfDLHYDPm4X626t9OZc6Njr3yfbM9jqlCm7Hj4CkwjKANO5MtYrlf3ftU+O7bnRZsBpaAvm0B6RaDxe5k09aaEsyGazYiFJdcW4zwJbQJNlPFUgrLRR8o8tqlrt1lstm39fh7I91A78+DiF56o3O4OYLOsfD1CdGWmEIn9VqLRy21okscH2nlzOsdUYbkF1JA22ZiLFIHPCPjV3Acgqn7uCFxfocsynUkfH4O85WJ5PiXHozsdpyVuYHsecaFxfnyOv/j+X5EyJjFkGV0Gxze1GFZVqKiAkmNScc1oQT/Y08IBX5Gr2HziyCDfuzJBxfVJGsvXQwqB4/oII5CWji4WMTXBYzvaeWxHC9cmX+TaxBhx3aLixkAM8sX3/BamsfFiVxvAy47HwZ4WPnV0kK+dHma2KOvuw1ShwrNnhnnmcPOktNZ0zFcOvfFxgm8RiGoRIFhtW98MsdgM+bjfrbq305lzrWOvDJaakJQcD6UUskqCISiVzpdtSnb9lPCVLsog6MsGZcyJiYCobEXQbTYjdry/jRvTL5LWbxC81wJNlGk1b3Dx5gvbOr9pPadjV5VXOR3fzwP5HmpnHlz8whOVze4AbpeVr0eILNfjaF8bl2fyTb/cHz88iO15gaA2X0EIQW+mBU8liEu7+luiusNX+CpY4CBw6Cy7JpqWoCORoC3p49g2cUOrmq4FnT2XpvMMZS/RGpsCJCCrx3FwfYjrcSDoWhnLd+ApLVrQISBh7ckYh3paSMUMSk6wew27JJZsB9/z6UonaEvGMHVJW8Lg14/tZHTm+yT0q4QLc8Kwgav8+Utf5d984F/UXYtGpLFRAJ8vWfwfP7y0qlwmBIwtlpgvWfRkmsva1JqOLZTmGNhfRElJwtDqdC8rbeubIRa3Qz4eWnUvozZYSgFvTC9Sshws1ydu6syVLDqSJrNFC99X/OmLF2lPxqL32FOKdz/SwwcP9Ne5KAtcYnoF33c4MdC9JUG3GVL60YO9fK0wje2KmpZ2SVzXmp7fdLtYz+lYF4mGTsdbleXbyhLNW1E7cy9mct0r/MITlc3uAG6XlW9EiD59fGfdELX1Xu6QLL0xleeRjiyPdGTY15nml3f38trIDBX7dcI
Ar5SGEC5CaCgVOLwqpchZvRzo6mCyUEGXkpQhsautm73ZBL5STBUKvH3HIgFJqf5tpVf/qfCVT1uyhb7WA8QSe0knC3VtyZ6vON7fhqcUJweynL5ZQBOCvAVudXpyOmHSloyhSehOx3m0p4Vz4zO06KNRiWkZAtQY+UqZbDzRFGlcGcAFtTZrrPh58widdQ/2tOD7SaS8hC4r0WmGWMu2vhlicT+Tj7JtRV1lCfP+O8fwvfmb08NM5MukYgYJUyGlIFe2KVRsUjGD7kyclrgJwKtjc/xsZLZOAH20r5UnBtoYnfsBMTlO70CReOwaA+kj+GrHHc/0aYaUWt4SpmYR03R8qm9j9RnbaH7TneJ22tbvNMu3HSWaB0E70yzWEjev5yv1oOMXnqjA+juAWlYf/s7tsPKNCFHM0Jp+uevIkiF58cokz50dwdTeYKAly9sHdrCzNej68VSKhB6nJS4pOwVMLc1Irg3dPMbBnkAPMlkok9IljqtYKNs4nmJmqUJcs0gZFstEReH6Cs+XGJrgVuntHEw+whM7d/PETuoWl4Sh43g+58bneHX4O3QnpvjEQZu8pXN9vpUfjgzgK5N9XdloAQzKWGC7Od6z08LzRWSbH3ZdxPUKI7MTHB3Ys2nS2J6MMdCaquniCOCrwIK92YVq5e5MSgPL78eQ13E8RVwn4FQb2NY3wmYFmduJ8NkPRysAuJ7Lc69+nXz5BlKU8FWSbGI3n37i19C1+2c5kVLwq48OcPrWPDvbU9yYW2KqUEGhcDyfiXyZjmSMvmwyuo9XZwuM50v8yiO90bU/c2uBQx2X6U2OooTAczQM3WqoP7oTrEdKa7MaK1eEjeY3bQVut239don2dpRo7mftzGaxlrgZtu55vN9w/6ws9xCNdgCaEKtY/SMdGYqWQ6rBQ90MK28mJbrRy70ySL54ZZLRxSJCBNoTX0l+NDqEJx7ji+8aina8YZpQlyn+9txNZqfymLrgYE8LezszfON0jkzcpCVhIgX0phPMFj2KToyUEXQfuNXBbkKAIoHjt3P6Vh64wdP7W3nm0b7oGn7vygTnxhfx3VcYyI4RZnjaEz6dA/N0ZxK8PLaz/j4IwcXJHK1xwZJtkDFdIAj4mvTQZSAMvjr1PBXnIGfHe9BWTNpejzQamuTTj+2s6lQsbNeP2os//djOpnd9jXZnOecoADE5jq9cUjW7nGYQDLY7c18Mtlu5o12cm4m8Q5579esUKxerOg+JJioUKxd57lX43Ns+c1fPcyMULIeK40Wt4bJKyj1f4XiKznQsKlOGHT6hIVxCBvdWlx758nVa44H426seW4jV+qPtwnpZjc0S4dvBVrWtN4PtKtHcz9qZzWA9cfPdeh7vBR4SlRrUkoRvnB9dxerfmMoxtljiYM/qQXjNsPKtED7WBknb9RnLl6KH1vMVnvLRpeSVm4u0px4nUT13KXReuLLE2fExSpbL6GIgChxsTTK8UKTs+Ay2xqO/M12soHyN4YVWDndPAUGXTHAsKHv9KCTtsXNML0zwzbMOSTNDX+sBBjrexfmJRQQuLbFJamshvlJoUtIem6I3s5fpQoWEYVNxTVxfI1e22dfZwfRSF5n2oBXa1HwMza8eQcf2ilyZOkVSDlDyT6y6RuuRxk8eGUIKwWs351ks27QmzE3XzxvvziQ55ziCw3z2yV1k4tlNLRi/963T/OD6FDFNu+PBdnda23/+4lg0mThWNRx57eY8jmeTL1+PxKjLkOTL1ynb1n1VBlrZmRUOJCw7Lj8fneWRjkzgqCyo6fCRmJoWEZakXg78iDBXZTO2u+xSi7WyGs0S4TvF3dJDbGeJ5n7tkNsM1hM3383n8W7jIVFpgLVYfai/CHfiITbLyu9Ee1C7+C7ZDo6rInGoJpc7GorVEkLYLlmXTo3p7OnIsFCyeKQzHQxBHK9/FKQQoAmGc/vZ0ZIkoU+AKuGqBD47yLtHI/M3BSils1DKMV34Kd+9NMELl/vY1eb
zjh02tURFBS5bJAybk/1X0cUshihRdg3Gcu1MLw1wuLcVKZ7i2vzP6E5P05Nygs+hkY4FYldNCLLmJOWKi1rxGK9HGpsli66vmC9ZDf/7+ruzLtpSzTu5+r7i2bM3eO7sCJ4fdEKlYwadqRiaFPxsdHbNhbvRse60tm85Hn9zepippUr0nBu2RV8fnBsfZyhVhlUhG6QoM1dcYMDsbfq7bzca3ScpYGShiK/gJ8MzxHSNnnScvZ0ZdE3Qk45zZSYfff+EoXhmf4xsbLWG6W6UXUKsldXYbqynh9gObGeJ5n7vkGsG64mb7+bzeLfxkKg0wHqsfqg1xcGeLNfnljbNyrdCxV67+KZNA0MXoIJMRcrQoq6A2jkjtcTL931evDLJWL6E4yr+6Y1bdGdiyAZiUsdTHOhp41/9yntYLJf4f356DkUChY7AjczfBGBXW5WD8s4UCaOXWznFUpdJJhZY5SsVuIE6vo8AWswxlPKQwiduuLTGb9GbzlMROxFCpy37TmxvAV99DyFksEiF8VZATLNBlUEsv5zNksa1yGIY7L91cSYaZNco2G/V7uz5i2P86PpMnXnfUtXoqzMVW0U4NzrWndb2nzs7wsjCEjFdq/qHKGbLDpem8wy0xHBVAkNYqz7nqwQdqbam/sbdxMr7NLpQRADv29vL5Zk8k4UyIwvBov+OnV1cm81HLf+GJnA8wUyxk5bYRJSdhLtXdlmJrTDj2wzW00NA15b/vbtRormfReob4V6XAe8VHhKVBliX1cd0PntiN0DTpGOrVey1i29PKs71uQJCBhqQ8mKRhK7zq4d2UHE9DE3WEa9aTYuhCTxfMZkvoyuPVK0GRQU7kJODHSRNHUNLs6ujrzrbBqSooImgyyWYhLvcTaOJCgMtgmtzwRygYz1TKJZ3tKCQApTy0KQffS8poDtd4Or8z9HNt6NJgdQy+CSJa3bQ8VDTtNOWaqEl28+5icKmyILj+SyWS+jCWlWiiYYZKtYN9luxOwsJZEt8tXnfkuXQkYytO9iuNh2v0G6rtr9SLH5lNh+Ue2ogamYkZRN7KFsXqc+q+GQTe+6rsk+I2vs0X7L4ix9fioTUYSnI9jxiusbvvOcw//2zLwf+PtV5QH3ZBLp5kkX7Aq3JBRx7noSRvqtll3uFjfQQfer2CdN6m7a3QolmO3Gvy4D3Ag+JSgM0y+qbZeVbrWKvXXx3taX4v16+zGShgu35SBVkTa7N5vmTfz5PJm5wpLeVhKGt0rQA6Frg/TGxUCBeLDO7ZGN5HnEDTvQlQHn83blRzk0s1GlbhlpjdMUSxHULU5PYnhs1E3sqjuWaLJQLfOtyDyXH5VBnnmzcpeKYLDkdDGQnEPgNGoWhLT7Jt6+Nk40nONbXxpLdhyauY7k+UgT3J6ZJBtsO8PiuPTxzuLlMle8r/v7CCCOzQaupISuYeppH+47w+K4P4Pli08F+M7uzlYtzLYEcyCYYWSxGQdRXCsvzeM+u3lWkuVE6viO9j5LdRdI0V/3dRrX9RuT5kY4MZdutGzkQwnI99nVl+M0Tv8Zzr0K+fD2aJZVN7OHTT/zaut/1TrBVmUhDk5G4NoQmBQkZzFqaWioz1JaKyEs4YRlgoniYz7/tAFffeI0nj73jjnauD4qF+0Z6CDdsx98Emtm0vRVKNNuJOykDPijP3ko8JCproBGrP9bfyi/v7o52W81gK1XsjR6y4YUiHzq4A8fzyVdsXh2b41a+zKu3FpivOPRlEriej+srCpaD7QSzSmS17zdt6oDCJyj1KDx+df8kj3bnaU143Ji6wKLVgxl7kmRM52BPC7brc6inhQMdx7k2fQpR/V8Axc18B+N5h85UnGzc5Mr8fs5NWSQNB1NPYWqSjx+YI206dd9PAShB0vB4aiiF46coOQ5vTg+xI12ivzpHp2ibdGX3cnjg/ZGOpBmy8PzFMYZnvk/GWHb4tN0lLtz6KUtWif19T68r5Lu5WGS+ZLG3M0tLYjUhWAv
rzXgK/9Z79/Xy4pVJbubLkTbk3Xt6Gg62a5SOH5t7jb7UEDnn+Krfb1Tbb0SeQ7F42AkzWSgH5TwRlDw/c3wXuqbxubd9Zk0fla3MHtYea8ly0SQ8OdjJp44ONXWsle/LRvqH2pbwsOsnREzX8JWGJHnbJGW9a+Mpdd8FkI30ELoXb/Cp9bGZTduDXKK5G9hMGfBBHx/wkKisgVpWnyvbfP/aFBcmF3lldG5TN3krVOxrPWTv2NlGxV4EkcDQTCYLFcbz5ar/rMLzlwcD7u/KYmiCsuvh+j4SQTZu0JYwGFssEZM+ggIf2jvDkwMLCCRKQVy36TNGcVSMvBsETVOXXJsr8OnH3o8mqilIJ4flmZTcPn482o0QVX+SliSvT+UoWIqCpbOzzcBTcGOhjSPdxWjXHhY9PCWpuDEMLYWu6Xzj/CitCZPppT2cEzuJ6zZLto7lSt6YvYDt+U3dj8DTZYae+ATLQhcfpRw83+f6zM+5NHUJk06UeqLus57n8Q8Xxnj27DCW4xM3NE7saOevP/8uTHNjg6X1FuflzJ3k/Qf6sV2fXMXml/d08dkTe1Yda810vBS0x6dYsBykXA6kjWr7G4nFHU/VlUVmpqb42IldxGrGPCTMWEPh7FZmD5+/OMarY3NcnS1EpOnl4RlOjc3xxx89sea9Xm9RXi9TGv5e7X9XSvH6VA5D+vz5D15hbjYftWpvdoFvdG0amczdLwFkIz2Emtucudhb0R32QcGDPj7g4VOxAQxN8qMb05wdXwDqdQvPXxzb8PMb7eLiusZ8yaoKURsj0k2wPKDs+vSLfP21f89g6ru0G9+hUPwJFyfma1qIgw4gWdUXnB2fRxOCnkw8CtP5is3IQoF3Do7wxXde53986jLv3j1HTFfVKcbBnCBfCeLaOAI3OqeS7VK0fR7f9WE+euxf8WuP/090t32GqfIRio5CCEF/Nsn+rmzw91RwPE8F3/P05B5missiWKUEnq/h+JJFqwcpDcqOy3zZDqcV4SmNkUXJm9MlLk4u8r2rEwzPB6LmV0bn1r0fBcvBdouRriYYJeCgSX+5xKHK7MiM4dqn6j773NkRZooVUBCraklOjc7yuf/8wzX/Xoi1FmeAn47M8PS+Ph4faI+uqalLnt7fx2eO7254VgLhFAAAIABJREFUvDAd3wimZnGsPx4dC2hY2w/JcyMMtaY40J1EE0vYrk3C0Hm0I9G0WHy9QLTeM77Wsa7OFpjIl4HgXZRC8PLIDF8/P7LmZ1e/L8vv68cPD9Zdb6i/Rh85uIP9XVl8pSjZLpenF9nfdol3D/2MXZnvcXjHaW5Mv8jfX1j776/3fVZem6uzBV4emcHz1abXlruBx3Z+gP09J0kYaVCKhJFmf8/J29JDrPfc1U6LfoitxVa+l/cKDzMqG+BOdwFr6V3CcsyfvnhxVTmg6LhRqn6+ZHH6Vv1nW4zzxOU1fAVSSlKmw972CRZ22HzrSj9JQyMdM6IAbLkes64HAjqTMcq2S95y8Xx45+BNnhyYQxMCH9BltX8YsFxZzc4EAlkpKtheKhhYaGjROeqaQTbZzqeOtfOhgy5wNqrvl6vfxVOKQsXBcny0mKQvm2bOfpo+7RqWM4zjlam4MRatHjRjOaOh1QyRmy1a5CuBhkZKwXzRZjxX5vzEAj2ZBNfnCnzk4I66nX94Dx3PR6k4thfD1Cxc38eoEfIqJfBUQPC6UrPMzvVTsgPdzXzZJh0zUEoRmLQGXjCnb82TK9vrloFWZtSUUlyazjNRKLNkBd1Qv7Srm//1fUei+76e6DWhp0gYGUqhS2nNY5k0M3z0yAE+fkRbt4ywNnn26UtfpEVbYG9LAVNPM9h+EDHX0dTufis9MAqWw5LlMlmo18oAuJ7i1NgcnzgytOr7NfO+NtI/+L7iG+dHoyxMTNc42JOlN3mBjBF0t4HA1B3i8gYjswLHW/33N3NtPF8xWSjjevUmc5oUvDo2xa42xZ7O7qanhm8H1tJ
DOJ5PzvI2VQZ/K7nDPkh4K4wPeEhUNkAzNzkbk+uKmhrpXWzPx/ODSbvJ6pC+v37tBn9zepih1hTDC0s4nk9fJsHZiUUGW5Mc6M4ihUdCG8fzg84cTRPggw/s78jx3Wu9JM0YnanlB2++VMH3Av8IXQoycZOhtjSe73Cir1idtgwohfJBSDCkj139uRTgk+CNKZvx/BKW67GzLc03X79ZR6xKtsvV2TxH+lq4PB3UtQ0pyVs2s0sVfKWYyJdoSRj0puOkYnF+66nP4noOz515g4kZB9OUSFHG8WJ4vmKwNRmeGsXqbkwFsglKjosUgrLjoRSMLhZ59uww/9XJR4DlEsDpm3N859IEN+aXeHpPjLcPBoLg2qqNr2T1n4qEbvHxfUne/vgRLk/n+X9fuUrRdik5Hp7nowToQmBqksvTeZ7c2bnm87NycX5zOseFiUWKtouvFD8fmWNkvohS8Kljq1OwdWWMqph5b1uSoZZ5BMEog7getKX3tR4gb/nE9fVJxVrkOaufo9UcpuLqCCFwvCLXpk+RUQPAk+ses9F3rcVmA1EmZqBJGgZCU5e4nmq4wDa7KK/UPzTU7EzO05+8yerZUIKYHGexXKI1kWxKW9Lo2theEOhNXcPUgofR911ySz+jOzXDjy67vPD65qaGbxdCPUQtobsxNssPC+eb1tm8VdxhHzRs5Xt5r8S4D4nKBqi9yaFbZZgtSJqSa1MvMrG4/nColXqX714Z5/9++Sp2tX24N5NAoZgqVIJJvgtLjOVKeL6iJW4gpUTLB8c63KshRRlXBTNwBKBLCSjaEh6Huk2yiZaonLRYtlE+VFyXfMVFk6KaflXsbFWkTDuasIyQOL6PKVV1vk5wbE3CaK6D4YUKMV1jZ1ua/V0ZvlolVgMtSf7pjVvMliwkEDd1etNxPrC/lwuTOfJlGwWkqy/EkuVybmKR3R2ZajdGjN964kg0Q0b5RYqY2KKbgZZ9jOUqkVeLLgWmHuxuaztkQhfVKzOFKLiFweelK5NM5EskdMkL1/sDl9LORZK6C0LgK4njB/dLKfBUgjYzFU1+9iEiFlSvuesH2ZhLs4vrEpXaxdnzFefGFyjZQXYrGzcRAqYKFZ49M8wzh1dn52oD6OhikYl8mYn8Dt61Ewayc+BVECKBoe/km5e6OfviT8hVHFrjBsf623lsR2O9w0rynDKhIzFNbMW8HiEEBXcS13M2FJE2E4iaXegMTfLkYCcvD8/U/dxXiv5MgkzcaLjA3s6ivFYWJm06mLICaKwkK4as8OKV67wxrZrSljS6NuE60pOORz/LLf2MR6qOzFKsPzV8q7CZ4FP7PMaqvj+b0dk8bD2++9gKgnivxbgPicoGMDTJsb42vnb6BtNLlSgIdqfjfOLQFNemLzc9HMrQJD+4NsrrE9fAdzC0YEd3K1dioWzTmYoxkS8xu1TBq5qjlRyPlKnjV11L93d14KsEQoSD9ZYd0KRM0p1pY29XKyXbQwjFT25MownJXMkiVwkcXm1PMb1k0ZlM4agEMc1CAKaUuL7A8wMio0tJ0kxzsOcIY8Ue3rVHRYvrm1O5yBjrzPg8E7kyoDB1DZRiPFfi7MQiB7qzLJQs5ko2RdvF84OyiSYFulz2qzg39l2Eukxc97E9SYvm0hIfpysdw/IG0aTE9RVxQ8NyPG5W26Rr0ZOOY7letOieHV/A8xU388F5KqUwNZ0f3NjBj0d38PSeUU7056gNQkIosondmNWAnTR12pMmubK9ogQhSBgal6eXNkx/P3NogJeHZ3jp6iRjiyUMTZKN6XQkzerfhLHFEvMli57Mcpq/NoCGZYLgHDR+PLaTd+0+gaFZvDFls6u9letzBeZLFlII5ks2l6ZzkWZppWBuZQuoUEt854LFqjoL4Kpy09bcawWiZw4N1JVWmlnoPnV0iFNjc7w8MoPrKXRN0JGKsbs9zfH+ti3bta+ZhREJbD+OgbOCpigcP87FSRspl4nRRuLERtfml3Z
1o1fP03FtutMzgFj1rNVODd8qbDb4rKezWTnMca1r8bD1+N7gTgnivRbjPiQqTUCh8H2F6/soqBITD9cdIVabZq/qF8bmVw+Hcn2bb5/7jywWbrI74zN0VDJbTvHP107gIshXbNoSJvPFIBMiZaAP8ZVCoLBcG1MqcpUsbbF+0vo16nd5ipLbx6cfeyRaBEq2y09uzCKloDMVo2A55CsOisAqvTWZwGMHcX0Y3/PxEWhSIjDQ9QP82rEP0pJoIW/5WK9fqMssTS1VorLLZOS5IapaEIWUgjO3FujPJHB9RWcqRkcyhqd8NBGQjlzFiUpnNxcuEbQLL4tbXV+hvFHmix0IafDkYCeZmI4mBd+4MMr0UgWr6ui6WLbZkU2SMIJdc/j9K66H5Xg4flWnUr1PmbjOty4P0J5M052aIa7blF0TIYb4b058ipdPnYl2mu/f28fXlm5Qsj08pZBAJqazvyvLYtlet8br+4rf//YZfjoyg1/9rCEFCMFcyY5KdKKBM3DdXCevXg9guz6WJ1CkGF3I09/iRfcEAr4xWSizryu7rpYqLIG4nlyzFVUjTsU1iTehR1grEDWanbXRQiel4I8/eoK/PTfM114bZqpQZrpgEdOWeHKoE99XDQPqZhfltbIwCh0lBjDkMK5PJOrWpaDg9NV1V8HGurWNhp/mK3l6Wh2EWM4WhojpFmPzMxzu37qgsNngs5bOptEwx42uxcPW47uLOyGI90O31kOisgEsx+O5MyPMlW08P7g5XakYR3o0bLdIrEpGHM/CUy5KKSynxKkb/8jbHvl4VAL6zvn/yOzSaEQtNOnTnSrwgUdO8+2rJ4FAeKoIBLIhdAlPPzLO3vZFOlM+nak2Btr3I0QXb0xcwHaLOH4cy+9nV9e7o1pxiCAABhmfPR0ZZpYq5CsOnlLs7cyyu+tXGMz0cHX8DOhuJKB8omqAlrcc4rpWtzgtD3ATuL5fzZJU2zlhedCbGwT28CEWAvSq7sXUJa0Jk0zMYKm8wFRhHttTeL4izBNJKYjrNpmYy2JFJ6YH30bXJCnTIKbZdCRjdKXjSCEYz5ejclIYfHQpcZXCdr2AYAYXBYlAaDpm7CmmKg4xvcLRvn4MzeQrL74R1d8PdWdJmJLDvW3MLJVZLNtVe3VJ3nJpiZvr1ni/fn6El0dmkEIE52xoWK4H1WDUkYyhUAy0plYt3LUB1NS0usXA1IPhebbnRa6/tTb8wTPpB//dZUPBXMNWVAVlx2VktpPXvneJdEzncG8L73mkl5aEuWHpJvx7JdvlpyMzkQ4jRDMLnZQCU9M41NPK/u5slNE7fWseIRoH1M0uyutlYXZ1/Qo7sz2MzgcuoLgaQz3Hef7NLpINNNTNiBNXBunwXMdzeZ479ROSxuruF8uNMdi+dZb1txN81tLZ1A5zrMWDItT8RcLtEMT7QYz7kKisgXA3/fyFsWj2SazqMzFZKKPLGB2DcXw8PM/C9WsXF8GN2QuYeoLHd32Yil1ibinsHKglEdCZKCKxyMYNetIJppcsKq6H7QYD5T+yb5ITfXMoBKZm4PhFrs+8xv6ek/zWU/+aQiWPq2K0JpJ1u7OS7RI3tCgrE+7QutJx2pMxOtMx/uDDj1Ufvl2ouXYOHtlLwsgghc7zF+pTwhXHQ5cCvbogmXrQKbGzLcX1uaXI/l0Q2uRDwtT5lb295CpOpL+hej496XiUiv/KS9doN3WSxnKK3asKe23PpOKa6Jqo2v5L/vW7D1GyHW7MF5ktWtiuj6YLdrQkiOkyyjwc72/jldE5ErrEcpcXZFPXEEIw2JbiS08fpeJ6ZGIG33z9ZhSsTC0ob33vygSW6zOeK1FxgzIcBN07XjXLtpb2wvF8To3N4XrB4EghoDsdj0qItuvjKZ++bJJPP7Zzw+GHoWOsItBpaFKg+ZLB1iCTtNKGPwwemhRNCeZqrblLdoGFss7l2Ta+f6WLbGYG31d878oEX331Bsd3tG9YuglLCz8dmeH
Fq1OkTJ2edJwD3dmIDG200IUBNfB4Wb4+zZCczSzK62VhpNwVdb28fv4S+/v388L14QY5sNvvXjE0yc72VoQYAq6yMluKGNzSsk+j4CNwkaLCkmU0vCdr6Wx0LXg2V5Keh508bw3cD91aD4nKCqx0wzw1NkvBcjA1GS2uUggmCg4Vrw/UGJ6q9QZQaFJHyqCccWzw/SyWp/CVS5jZqA0mmvT55T0GR/sPIQT8wXfOocphitnnUFceEIHVd7XtNpy1cWzw/XWTehul13uzCW7lS8R0PXI87c3G+eyJXXUPnxQamXg7jufz1ddu8MZUDlOX0e/oUuD6Cl0L2p170nF8pTjU08IbU3nGcyVCjYoQAt9XnBzq4HMndpM0dJ49M8zYYglRzR58+rGdfPzwYLDbHl3keHcH+zomqkZx1XuhfMbyHUwWHKRweXl4FoGiP5sAJTjc27pK4Fwb+D5+eJCC5bCjNYW3UKRou0gBSUNnoDXJO3Z2UXE92pOxVTvMkbyFVfXtiOmSbFzHWvIo2i4JQ6clbvJoTwumJvj/zo5wYXJxVZ2/YDm4nqojEF3pOEIIcmWb7nSMd+/p5cmhjjXLErUBdKg1Ff087IZ6cqiDk4MdnL41T086zni+VNXjQF82CGzH+jJUnEUE61tt17aifv3cG/ztqzcDN1ZRYK5kkS87pGIGQgSamVdG51iyXH7zxK6GZCEsLZiaRtLQUEpFBoQHe1qAjRe6u7Wb2ygLI6XkyuTPuWGf4caFF9iViTNX7iLvHiMkUFvRvfLF9/wWf/7SV0GNEdMtLHe562crUR98fFqM8yS0CTRRxosluDY1T+uu+qYAqH8eLS/IpL5jZ1ekswnxsJPnrYP7oVvrIVFZgdq6bdgeqVTg4dGVXraMtlyPjuwvsavtIpcnfw4oBBJN6uhVkWzZLlB2CrQmepBCx1dBlqSWrCgkj+14hE8e3YmUQYD56ms3WCjbxLQSLXEXTQbzeGo3WeGxQ4HjWqncR3tb0aRkT0eGguXQmjAb1ut9pSKi8+KVCfRqN1K4+9U1ia7B77z3cJBZMHS+9eatYFd7aEdd148QgpNDHfz159+FlIJPHRvimcPBUDggag8FmMiXKNouZ6cCg7Md2Tnius1CRefiVIbTE73oOrQnzSpRFAzPFxldLHKwpyWa1RKiNvB5SvHevb2cubXAL+3qQpeS0gqvkvB3awOi5yvmKi6ZdHivfVKmQUdPHNdXPDXUSTpmoEnBmVvzlByPdKxeVOl4NicHU6RM6ggEBFOR25Im/+3b9vL5J/as+6I3CqCu5zBXXCCbaMH1JSlDj7q0Kq5HruLQljA40J3hQPtlpDfBN8+u3ZW2EgqN1245WK7E0ILs0fRSJXDNtRyWLAfLCbx0fnBtkssz+Zrsg2j4PIbZoNCAcF9XYNO/0UJ3t3dza2VhwrEFrnIxhEFMs2iLDSOEYKJ4eMu6V0zD4N984F+Qr5QZm59hsL1rVSZlq6ewt8fOk9avEy4wcd3i2vQpNLG6KaD2efzhzyze9dTRVZnch508m8ODMH/nXndrPSQqNVi5uIa6gK50nNmiFbTBVr1PhlpT/OZje9DkLqbzw5ScQjDxpkYElzAzxLQ0//TmJEWnlbg2C4Ri3KAM1JUe4Jnj+6PP/PqxoAQQmImV0bUUcd0mvmKibcLMkDCWnV0XyyUcN4epp1A1t1UIwVBbiv/53Qcj7Uajl+EHNwvkYnqd5iF0BA13v6E4NVzIawPonzzzeOSj0mgWjqHJuo6WEH3ZJKmqj8yZqUc4P72LuG5zM+czV/TZ2ZHAqGp2wiyBJgUFy6FouaRi9cK+xwfa0YTg786NRlmc+WIFKQVH+lo52N0SZQRqg2RtQLQ9j1qzxng1GyCrGhdTl1EnTq7ikDCWz0EpD+W+yuj0FIs5h3Yzxp7WTlD7mCo62NXBik8MtPO5E7ubXpgMTdKa0Dkz8l8YW7jEQnGRshtjye6lpI5zvL+DL73/KEX
HJa5rVFyPa1Mvcm36MhWnua60ECszQfOWh+WqiCdXXI/xfIls3KQlbmJ73ioRZsFyKFQcdC3QmNTODyraLrbn8fadXRsudPd6NxdO2h6dXzG2QASlzdbkAp9/2wFaE8ktPZdsPLFKOLsdU9iVcpleGCccSm5oMlhrBFHWtlEWztAkLbFl3dTDTp7N4163/G4G97pb6yFRqcHKNHOtLqAlbvK2oc4g0yIkTw4Fbp15y6e/7WAwnK/BPIx/enOyusi+l+7Yi8S0eVA+Qki6MoN86Oh/V3cOKx+Ia1M5rk2fqsumhMfWNSOapDs2f4ldmQU8laDs9ZFzjhKmpIMW29iaD5bj+VxerNDfJzDRogBV2zkSlpNW7mBrd6AtCZMnBld7iqy3Y0iaOk8NdfLjG9NIIbB8gesnyMR9BltNFspOpDnpSce4ubjEz0ZnKTse1+cKdCZjvH1nF6m4ETH85y+O8bXTN5gqVDA0QXcmzkzR4sL4IgXL4XBvK08N1QfJ2oAYlJGCn/tK0Z8NyiwT+XIkYoVAaNoaN+oDqPMqfelRlBIgJCnDZjB7EykEO1pOcH2+gKYJfOXyle+9zJG+fj5xZHdTC9OZkX/m8uQplmwX2/NRqkTKuEZ+yeHVsZPAMlEwNcXEYoOZQDVlw7XKQJmYQSZu0JOOcytXwnL9aOhkeDQpBEXbpSMVi8puoWZEE4IXr0zy2q35qNwYalPC+UFf/uDxNTMlK3EvdnO1QcRxc+zKLGBokpV5KMspENdtDC29becSYjumsD+9v5VvnnVRGKucjldmbTfCw06ezeFet/zeDu7VPX5IVGrQKM0c7gSnliooBQlD52hfK0rBn3z3fHXh7GF/234SxgQVu0CimmI/PPB+/uGF16uBTGfa+gDKryBYANHG59/xZNWsbTXC7Mfu7ncDMLEYdB0katL3UD9J19AkeOVqGhdyzvGmdp4Fy8Fyg1JUaEAVlirCzhETbdM72PWmBtfaxf/BB4/zuf/8Q07fmqdSLSmc2NHOf/rsO/njF84zV7JoT8b48fUpRheD84rrku50Atf3sX2fP376aCRqPX1rnumlZfGuEAIJTBXLzJct5oo2YwvBcT5xZHn3UhsQs4aGD/Rnk9EzEIqSLdcLCNbOTlKmUfN9HVpjU4Q+GGFo16XGQHYOI5ZidDHPjtQb9KfmSBk2o9MJvnZqL5998tfXLceUbYvhuTepeD52Nd0TtoS3xCY5PbWIrslIXBrOBFpJVIJjrR+AQtLmej6W53F1SmBIie15ZOJmFMtcz6c9aUZELdSM/ODaFGfHF+hJxwPxb402ZV9Xlrfv7GqapMC92c3VBhFTT+GpBHjloIut5vdWZjZvF67nrOtuvV0tohtNSN6K7/YQq3E/tPw+SHhIVGrQKM0shGBfV5bfOL6T9+7rXdUdEtltz+3nxI6TfPR4a7TYzJesKECHM16CKbAKWGBP5w0+//ieVbvp1QG+j2N9+/nI0VaSseWFbOUk3bA85HiKmBxHcJjHBzZOr2diRp0fTC05A0FMD4jDyuOUbYvJwjwxPUVXOr2uq+qqMQFtqYi4KAWHe4OyzJLtkDYNBIr/+ms/5vJMnvmShS4hX3ZprfaEhrOMDE1ybmIxyroslkuUrAV8f3mK8GyxwthiiYrrYmoaE/kSS1bQol3b4lobEN+ZrrCY7uHiZC66h597fPcqkqXLZQGzr0rEdYtl19waIznKvHD5BjvSw+zrCCc4CwytQr50gVeHszy5e3U5JnwWLkyM0RNbwPYVyldIudxDltBtcuUcS1Y2Epc2DEAKFD4JI7thAArvtZSCaxOzZNJpwhlHw/NFENCaMDnc0xp9JhyyGS7AteWe0G/j148P3XYm5G7t5kKyGxJ0TeqUvT7S+nVcVXViFPWZzdtFmBENO63W0hFtl6h4ownJd/LdthL3g45jIzK5GdwPLb8PEh4SlRVYv01RrMuEz00UeObwLvQGuoc3pnLcXCyha0HmQwj
Bm1N5nr84tirN1yglePpWHiF0Pnl0eRe8atcsAj1FXAdfuXz2yV11XUFrwdAk+1vj5Kp+KEIIDva0sKcjw8GeLJ9doaVwPZdnX/06M4Wr6KJMyTHJ272cGPpAJAoOr1NwngFBuDKTZ6oQ6EVCMeUro3NcnytEwth2PXg5X7g0zuhikZ3tQUp9sWRTcl0ow0Bris5UrGp372O7ivHcEvniTxmbv8SBtjn6kxpjuQ5OTz7CdKGC5QU+KpoUUdnijakcp2/NR7uX2sWwLa7z9PFdfOLI6gWydujhRw7uoGA5XJkpUHFiVPQYKdNZ3QWh4tzKOTzeO8dKO3bPh7G5NzkxtLocEz4LuozjqgSoIobmoVenPisFlqezZAXlqtpBkVEAQtT4/Pig4NzYC+uKamtJm16Yo5Jsx9Qlnq/oSC4wvVRhsDUVPeth5q7ietF7Ez5HYbnH8+F9+/o2LHNtZ1DaKNj4vuJrp2/wwuVJINCjBaLyIwBo3gh+XJFakdm8XdRmRNfTEW2nqLi2Lb1R1vZeYmUXpibhycFOPnV06K7pOJolk5vB/dDy+yDhIVFZgY3SzJthwoYmOdrXyl+/eoNTt+bwfZAyaI892tuKqctVab7NpATXTNsKSJkZMvFs09/73QMZbsXa6wha2Da7ckF49tWvs7B0Hgl4CuK6TcIY4dWR/4KUH+aTR4fIlW3O3ppnoWzjeD66JpkrWnSmYtiuH7lYespnbLEU6WAgMC4by5dQCHwVuNpmYjr56hj4toTJbNGKpg9rUvLytW+jqyt4ykaXLq1xm2zsJt2pHGfGd6NUMBLArBEl5ysOc0WLXNnmRzem6757upznxAm15i7ecjyeOzvCldl8VK46vqMLvH1UrNepJyM+CXM3CcMnodusJCoAFXd1Oab2WVDoVLw+0voltJqpz0IEAyRP9I3x5OCxumc1DDSXp17B9W0EEl0aIERToloInuEP727hVqwjaEl1PY71t2O5HjFdriLznlKr3o3azqz1FuDtFBc2G2yevzjGm1P5upbyZVH5cSYn2/mT33gHmXj2jnfVKzOiIRrpiLZTVLzWhOT7Ac9fHOPVsTmuzhaq2Wifl4dnODU2xx9/9MRdISvNksnN4F6LxB80PCQqa2CtALVZJiwQuCqw4A/+37I7KqwmN5shQluZtpUiIGgfPNDPRL5EXzbZ8BzKtsVM/mpk6hb+VV8JsuYkZ25N86uPDvDStUnmy3bUSeD6PrmyDQReIqEg1dQ0BPX220u2g+MGXSdSCGaLVjA5WSlKjse1uUKQlar+7Z0tJiXrMjHNApb9aiSKrlSBj+4b5+/f7CdpBn4ey1AkTY2Xrk1ybnyxLoP15ny5YbYrDKZ/c3qYm4s5WuIeLYkW9nW1cXkmz2P9T6EJQb58HSnK+CpBNrGHTz72Sf752o8pueYq59Hg79aXYxzPZ3RhiULFIROvtlC7j5LQriGVgxAKpZYHKu7vzPOxR/vq7tNccYF9vb/M2PybRE9fzXOykah25bOxkrw3ynxIxG0vwNspLmwm2NSay9XqtEJR+Z6ODHtbUk1lKZvBZnVEzYqKw/uS0MH1i00Tj3BC8v2C8H5cnS0wUR3TET4/L4/M8PXzI/zG8V3beg6bIZObxb1u+X2Q0BRRGR0d5cyZM3zsYx/jy1/+Mq+//jpf+tKXOHny5Haf332HzTBhx/M5N7HA0b42Fkp2NOdGCJhestjfpVaRm80Soa1K24Y+KhvtZicL8+iyjKeqGgmlIofOuG7zytgEs0sVLk7m6MskosVeE0FL75LlcKha5oEgSA+0ptDE8nVLmwa6JkiZOvOlYJaOAFoTBosVKFkumibIxk12tiR5374MmijjK69uRooQAk3A2wbLnJuOU7CWMxGa9OlL+Tyxo4WLk7lVGSwpGova/u78KD++MUlv8gIne+dI6DYl1yRn9aKbJzk/mePfPv0bkddJR6qNhBkQy08c3cVrI5cZMm6ilKh6nwTC4KH
2Ayg0ZpcqvHRtkouTOZYsl9eqRm4HurNo0q4a75n4ykeXWjV7JonpLpa3hJTZaAq1FCWUimFqJeJ6HLHiO95pV8daZP7QtizXAAAgAElEQVR2FuDtFBe6nsPo/CWUImozR6wONrWbhFqdlu36CAJzwwN4a57/ZstVmxWybpTtXc5IzZGUZ8maU8S0Cm2pVgbvsFSxFrazTFewHJYst2YY5zJcT3FqbI5PHBna1uzDnYjSN8K9bvl9kNAUUfnSl77EF77wBV544QWGh4f50pe+xFe+8hWeffbZ7T6/+xLNLsS1C19fNhHtCiAob5Qdl3fu7q57ODebEtyqtG3oo7LRbjampyg5JnHdriMpAHlLZ6oA37k0Tsl26xZ7x/NJxwxcz2d3+7LDqucrPnm0D8spcm3OoeIGf/+JgQ6mCmXG8+Xl+dBC8Gh3C6pqmPfevX0kTA2Fi6diaCIQstZCKYhpNpmYQ8HSUMrnvbvHOdiVoz3pYfgTJGUrJf84tRbtEGSw5ksWhhaYqv3dhRH+/Y8uc7z7Kvs7J6sBT5A0HJLGKJZvMlk6EmW9BsxeYDmAvD6Z58rcXizHY6hlgXTMIaanONR7lJH8Qf7hu+c5e2ue+bJNX9VsL2wRBjjUk8JTSTRZJq4ZmLoWtZQmjDQJI8Nzr36dYuVilfhJhLBRyqXiVkiY9T4229XVcTsL8HaJC31f8Y3zbzKbn4u8cUKX57iu1QWb2k3CSn2NqWn85oldnDuzsOr4K8tVzc5Dut2M6FoEMcxIBQZuw0DgJr1YylG276xUsRK+r3hpLM8/zp7f8jJdiEzMiEw3V15HU5e4ntp20end6Ip62Na9MZoiKpZl8ZGPfITf+73f42Mf+xgnT57Edd2NP/gWRbMLce3CV9sB4Xg+pq7x1M7OhrvM29mR3knattZHpRaNdrNd6TR5u5eEPoJf99uKy7Mt9LdkuTEfzEYKO6YGqnbvcV3j6myBmK5Tsl0ShqBFP89ifhIpSvQnE8TNXXzmiV/DkDr/yz+c4u/OjwUme7pgqCXFux/p4Scjs8EsJBHQJNuTLFR66E7m6yekKIVCR8gUuzu60TWLtw8Ms68j8EqJ6yaOV6QttojhSnLO8brPji4U+YsfX6Jsu/xkeJqpQgXLdfjII3PVDhrwUVXLekFcG0eKQ6vM+WpLGsd2dOJ47VzPl3nnrjS/8dgh/vH1CU7fCkjhQrVcFrbz1pK9HS1JskYfreaNyJQrPNeBtgM4nk++fL0uOxWMpdQDIzpfRVmVu9HVsZkFeLvEhc9fHONvz81wsteISm6+UpSdIDPSlmyJgk2jTYImxbqt+bX3NmFovDmV48Wrk03PQ9qqjGiYkdKlR0IL54pBMNFcEdfvvFRRi+cvjvHmfJmBHWybB4ihSZ4c7OTl4Zm6n/sqmHWViRvbLjp9ULqi3upoiqhomsZ3vvMdXnrpJX77t3+b7373u3UTfn9RsdFCvHLhC3doZcfl7Tu7+PXjOxt+7m6nBGt9VFaikUD4saGneeXGd0gbk2RiLnlL5/JsCyO5A7xvfwtlx+NwbyvffP1mNIDP0CTd6TifPbGbZw4H3+tb5/+esnWdIJMh0YSF47zBN858g8+97TP86cdPko4FKfm0aVQH0y1b0utC8t3LE9zKlXC8DP/DyTQ7skvVEQWg0PGVRtnrZ393O3s7HXoTZ8jEzLpUsqFJEmqcvHM4cvUdzlu0taUQwE9HJhidn2WhLMnEXFKmE2SSVLVPVYBC4fslRuZn+dMXL0YBylNqWRBb16Luc3FqCWSaN6cKaDKYnh1eq1qr+YM9LQy0pviXv7SPgdbjXLz5QsPANr44gxRlVmaGfKUHdEWL4/mV+6qrI8R2iAsdz+e1m/NML7ncynfUtIUHwc7xPPpa64PNZjYJK8tVl6bzUalzoWzj+WrD4L1VGdEwI5WJVdBEhdrMokLhc+elihDh95Zi68t0K/Gpo0OcGpvj5ZGZyC25P5Ngb2eG4/1
td6VUcj93Rf2ioCmi8od/+If85V/+JV/+8pfp7u7mm9/8Jn/0R3+03ef2QGJlzbbRwvfO3d1NCabuNCXYbN//Sh+VWjTazX7q6C4UH+b//PHroMo4foy+bIb37Q/s6ZOmTtyQgYCz6icihEARLJqGJknosFRZufsHkOTL1ynbFgkzxsnBjlXBa29nhl3tab5/fYrh+SWkUCQNg+9cO8HJ/mH2dy6iSxtfJSh7/VWXXjA0C12WUZh1mZe4rqGUjRQVClacmK4hERzqyeDaP+eDe8ZJH3LIVwwuTqXIVXSy8eWMoucHfiyuSnCguxtY3l2++5Ge6L5fms7XiQJt1+dH12cYz5U4vqM9GtkQorY7Kh3TGWoLvGrWCmwdqTZ8lawGqloIXJXhQ0f/JWDfV10dtdhqcWHBclgs29iuH82SGqjOkio7JnkGIkPFEJvZJKycDzW1VImCd2iUmDD0poL3nQpZw4yUr+J4KlF9BlRkjS+BuJHB8x1cz7mj+x9+70bYag8QKQV//NETfP38SDSJPBM3oo3A3cD93BX1i4J1icr4+DgAmUyGL37xi9HPfud3fmf7z+wBw3qtlXdbMLXZvv+VPioh1trN+srlQweyoA7ws9EFYrqG6/vB1GOlON7fxoXJRQ73tmK7fmTiZuqS8xOLfOzwIHPFhQa7/3BhLTNXXGDA7G0YvJ4Y7ODpfX189D98F9/3ydseCofpYoXRxR72tO/kPY+0ImSibu6Rr+L4KolkxSIroD3Vwm++7SRlt9pxc2ucFuM8pjmG6ysUgmzc5R07F5krmoSuX5oQSCkwJCy5AyCCBSzcXX7wQD9xQ2PJchjPl4J2agJBtalLWuImr0/l8KrXPhzZACCEQhOy4X1oFNgSZoxsYjfFysUV19Unm9hDJr79Nu93gq3OJGZiBq0JM2o1rp0lVXFN3rmnn5Z4vOFn19skhJuRuK7VzYeyXR9DE9Hnw862u2HgVZuRKrm9JLRr+CpI+gXZujK+8vn2+f9wxz4gISlahFXTy7fDA0RKwW8c38UnjgzdU9Hp/dYV9YuEdYnKF77whbpJv7UQQvDCCy9s24k9aNiotfJuCqZup++/kY/Kyt3sSgJkav8/e28eJelVnnn+7v2WWDNyXytr3xfVogWxGiEBFlgWAuwG240XZnyOD9Bu022fsWem+7Qtt00bLz2Gdh/PTLc9TUPLxrRBbAIJCZCE0FKqkqqySrVX5b5n7PGt984fX0TknpVVlSUkOZ9/pMqI+OLGt9z73vd93udJ0xFv4pHzXRQ9Rarq2/OOrR08e2WCgWy53jVRa/nc2JSk4PoLdv+aQGmU1mgNTmDzzJUiH27Uyy5e5yZynBzJUvaDeu1YaUGu4nFmQvGRW3cynKswdz4LlEEmsRWtz9a9a2C23pywYyTsaCFKWIq4MYJSkf5rTQUWBLah+HF/M/s7irQmQ1w/RpENFIID876v5Pr8/fHLvDw8zYnhLJNlF9uInLBjhuRgT0s1WLGo+AHpmMWu9gyDuTKD2RKWIXnmygR3bmrjvr29q7r2P3/bh/i75wOK7gWk8FA6RdzewgOHH1jV518LWKtnxTIkRzZkuDI9wkBWoTEJtUHRS9DZEOfIhmu3hFhIIHX8EFNGxou1gGiueSa8egJetWf1749tpj1RYlPjNCnbw5QQKoUbKBK2ecM6IJYhOdjdzFOnL3K2PLaotHuzgoh10uk/XawYqDz++OOvyiCUUvy7f/fvOHPmDLZt84d/+Ids3rw0f+O1iNeSb8P19v3P1crIVsqYwqUhnplHAqyZ4rmhwg81gZohZU3x5l6PQNxKzIxupx9eGmMgW65nEPxQIcSs30st2Kjt/gMVcQaicWqybie5kSKWsVjHpIZnLo9TCcJ5vzNUGiQ4geJDBzbxoysTiwKv+/Z9iJcHHlux3mwZkl3NElNW8HWkP6LnBCsNsZAnL7fw3NAm3rYlzTMDJWxpsalpmC3NPj2NPShi/PjyBD+
+MslEsULe8fDDkIoXkK2AKQVZx2O8UOGuHZ0c2tDKydEsJ0ai++iOTW1sbUnVz+k3Tg9elaQYKI/vnPx/CcNhYkYASCqB5NTkVv7kidPLEjvXUhr8tYJaUC3DM7xrc5Zcl8XF6SZeHNlCb1Oaf3Z4yzWXDpYikEbt4pq4FQWc02V3nj/UShybtW7tlVLwM/t6eXFwGi+8k3yocbwybbGnMYQkUNTl/29UB6T2PCxV2l3HOtYaQi+VLlmAixcv8qUvfYlyuYzWGqUUg4ODfPGLX1yTQXz3u9/l8ccf5zOf+QzHjx/nr//6r/nP//k/L/le13U5efIkv/M7v8PU1NQNfa/nedi2fUPHgGiRzTn+Enqj0bzQGLcWEc/W4juXOqbWirJXWPZzSbsBsYgXMnsuvKBCoPyqe7LAlBa2GbW1lr08qirBDvX/oLXAU7F5Bot+GO3gVK09RkRiYDFT0pqykSLafZa9MqEKZgXMMACLWh6jMWFT8QO8QNXr7bYhcYKQohssOS1KEe1oa+NZ6lxFgZFGLnEuAFzPxdduJDk/57dGvxcKnoUpIsMdpTRJ28cUqm7Ao7Sk6FnI6kIG1UBqDgRRgBgzJe3pOFpD1vGWvI8g8tZZCRWvQKgXcwe0Ngh1VOKwTUnSmt2frHS96++p3hvL3XOvRXhBBT/0Fv3dkDZxK7HEJ66ObMUjDEMMY3G5pDFuo9E4fhg5W1O9Vxec7xrKC+/pZd53rVg4Fwk0hohKiRoWXb/l5oOrYe65qP2GGq52n75RsVbryRsB13ouWltb+exnP8uBAweIxRZnzVb1ZHz605/mnnvu4ejRo3zwgx/khz/8ITt37lz9qK+Co0eP8o53vAOAw4cPc/Lkyat+xvd9PG/xRHStWItjaECFSwtBAfj+UqLp1/E9GiqBqnImogXZkpFg2PzRzP3vXAh832e50VTcImoOf0NrHXnEhCFSmGitWCqsFUIThCGi+qJG44eaeeuyhpgVYBuKsufUR1Aj3ColCZSJKUOkdGqvUKi4+OH827QchgRVAmstDlqIYsUjYcpFQlFOoPDVLMlw8fmrnSmBgSQUat4XaCBQEktE8v5oSNn+PFl7BEihSNk+Jc+qlwMWorYDDUKF63poNGGolg14HdddNlDQ6CWDlGg4IWEYKfZWwhBDR9/hhi6I2ftWaI2vo+ttiNlJxgkUBS9yDjaEWPacvXag8bXHUndGqDw8T3KtT6TS0XUSQLjgWddEga0UAhMwDOqBn9Bq0RxTuwfnohKGBH5ww+d1qblIGNSfA6VrmRC42nywHK52Lla6T9/oWIv15I2CazkX0X24PFYVqCil+M3f/E2CIGDfvn189KMf5aMf/eiqB3E1FItF0ulZop9hGARBgGkuP7zHHntsycjrWnD06FFuu+22GzpGDV890b9sa+WN6goopfnayQH+7Pt9nBnP4Vfb9Dob4uztaOSjt27lQwdnS2UvXn5kyb7/XZ23L1uTfv6F5xgxnlla2MhKc/feX+XbJ/4fcpUCtYnND0M0gopv8fUzh7lvzxQN9hhSlJkoSc5PNfH9yxsIQs0924a4tWcSEDQlBKHyAY0pLZzAiBZslcaSJZgXxmiKwfZ5Gieh0jx1aRwvCBjIlqn4IY4fYlTNEFtSMTY0JtncnOYjR7bUyx3Xco2OHj3KkVsPz+PkmEaKS9PNFMKDeKHmqYsTpCyfn9/3I6TUi6b7QAn+w5MHGcwF+HPiGEtA3DIRAhoTNh3pBF/62NvZ1Jzmjx47scxdAP/7u29ZtkQwmrvEIyf+uh4QzV0GBTDq3IOnOih7Af/bPQf4wfkhxmf+viqSV0PUkdWcbOT9Bz+BaVj8w0uX+cNvPkdemXWuUXcmzqffue+65cuvVvK40ZJIwZnmmy/91ZJqomjN+w99gob4tZEiy17AH3z3JabGx+nd0LPo9d95136cILzqmP1QXfc1Xi3m3uevjOXoSp5kZ+sIUshqllFjSsHB3juvi6N
S+w3Dw8P09Cw+F2vxG16PWMv15PWOaz0XtUrJclhVoJJIJPA8jy1bttDX18ftt9+O67pX/+AqkU6nKZVK9X8rpVYMUl6LuJm+DQ/3DfClFy9ydiJf5XtEC/J4wUFr+PLxK/zs/o31yaHGt+ifPkPJzZOKZdjUsnLff6CdpaWiNeTKEzx8/D/iBRWMOS8bJigNeTfGrT0DNMXGAIEhJQ12wJHuCbxQ8fiFNna1zVRlRzRK1XZhgkAFBEoQKk3MmCZUdp2QakqJrxQJY77GiSEFTXGLrS1NoCc5NZ6PzolSJCyDbS1pRFWH5Pn+qDz4M/t6V8UjqvE1Ijn+xW2J3zg1wouD09hGlK5vThSWDFKiY2tiMofWqXl/V1Dt+jFQWmObgu5M8oa0RFKxdpSWiKoM3zzhO6DinEdarVHruGnQNzpCV9xZ8M5IHKzkFjgzNkJvcxd/+YPTjJV84nEDy4iI9YPZMn/5g9PXLF9+NdPBtTIlXEs10bljOjOep3+8SNHMsbsjgxCCIIwynJ99om9VY75WBd7r4Q/V5pwXB6cZzJYYK2wlHbPozUwBDqGOU/R62N97z6rPw1zU7tPBoaF5f1831FvHzcKqooH777+f3/iN3+BP//RP+chHPsKTTz5JZ2fnmg3i1ltv5YknnuD9738/x48fZ9euXWt27FcLN0ukzQ8Vx4amGc6XI8+ROYGEpxRFL2AgW2K67NLZENXeVSj48skOnutXqLCMNJK8aVMHB3sFcpkrbor4kpO7H7qEOiCsKhELwbwyhtKSTKxMY9wBTCxDYBmSih8SKrhjwyT722foTHtoDaGWVfmpWYVUU0alFil1VLdXkrgpsY2oG8gQDlI4hHo263awp4UD3Y1YpoEbaqSEohvQkZ5tN/UCRagjovNbtrSvuEDkHIf+iR/WsycEJsblaQ5vfs+8tsTZgHSKN224REtsdH4Za85xAyWYKCejBSucPWlKR5N90ooClbdu6aiP63oDXl9ZVIImkub0oteUgs7UMCPFF7h9671Vfo9FGEss0FzRuIEi60g+9+WX0Jzg1GielBlp4tTKGVII+rMlxgoVeptSi75vOVytM26tTAmvRU30atmbuWM61NNMOZ9jpFDBDUIObWghUFF2Alan0LpaBd5rlRiYi9pc9JYt7eQcj8a4jSF7GHMCpHBQOk7Jg5KniF3nfvD+/Ru5cOECRVg31FvHTceqbtN//s//OQ888ADpdJovfOELnDhxgre//e1rNoj3vOc9PP3003z0ox9Fa80f/dEfrdmxX22sdQtdTbQqCPSiUrJS0UQbLKh3/5/fPsbX+waoBIpQCQzpMJrvR2vNZ3526XSc0pLW9E4Gpl6cNa/TUSdJhNnluBasRPwSm6QF4CKEhR9qvDBAa7CNEMtQxM2wXheXIuK5RMeo9Q7IahZFYhsSKSJ+gRdSba2UKD17q84t17y/2rYbKr1Iats2Iy2LshdUM1HU9UrmImmbXBr/IRfGqy3dCDztcHZ0cQtnbRHYkD5F39AoJS8k1GCIOo+2fraGcgkCbWMbCq1DlIp+d6ij6xYqxW0b2/j37zuy6Pg/s6+X6bJbb/0MtUauwCVoiFlMevfQIb5HzJiuvzNUUAkiD6f21CTv2dWBbVok7BiVsJu0ebF+bd1QobRiKN9KzLQpuwGOH+AFEFfUr5ttSCxpUHRXb6Nxtc649+7uWdPOuaupia4me1P2An58ZaKuhyKEYEtjjM6uTrww5Ld+ai//8YenF333SmNebdbseiQGFqIlGZs3F0Xt2VGwn7S5oZZpKQV3bcxw8PAtN0XbJO9UGJieYGNLO5n49ZGf1/HGwaoClc9//vOL/nbmzBk+9alPrckgpJT8wR/8wZoc642GmmhV3DaImRI3UAShIqymNcpeRAhtitvVfwd88/QQJT+sdpVEAUHRC3i4b4Dfe/ctNM5h5dcm7G/3TdDQ3EZnope25DgJyyNmJnGDCmoJkmbVgJaEFXn6OL6
LH4ZEvT0C25RIFFqBZSzONtSWdK0Naku8G7YQM6YQC0iQhlS0x57kcuFdi3ZuSdvkzZvbeb5/ar6TbNUPRAq4PFPkL588zbmJwrz2USGiktORDRlGst9HIHD8ED+MSK2GCjk5fIIDG+7GtmbPWRD6DM2coeSFmDJYUDwBX0HBbeKLJ3bQGBeUqu7P0e5b4ochCcugsyHBoZ5mvnF6cN4CqZTm630DfPn4FQazJTSCjU1J/tnhLXzgwNIlBcuQ7O9q5UcXb2d74+MYAiK6o8AyBKYUWIbLTDlLb3NXdbE8AEDCGEYKh5xj0J9t5+x0pOJqWxEZ2VcQq8XJOspUJeIGm5qXz6YszFRcreQxki+vqSnh1dREV8re3L9/Iw/3DfDjKxM8cX6MlG3WHawhCkR0AJMl57rGfLWs2fVKDCzEzbAlWOo71nJj5vk+n/v+l0APEDddnCAGYiP/4q5fxLbeGK3z67h2XHPiz/d9nnzySQ4dOnT1N6/jhhGJVrXwyliOyZLLYLYUBSkaDEPQELPoziT49itDPHDLJvpnSmTLHqYxW1op+yF+qJgpe/yf3zrGPbu66wvjw30DHB2Y4lLWYWR8lLzTgCGT7Gmz+Pib95FJfJtceXxJfQSBrGcggtCeF4mIaualaoWzBAShskAIQh2jEvZQ8HewIfnwku9NWVk+/c5ttKYagaiNd6FNwcWpAv3ZEjHToDdjs6dd8ti5AabKiqFsGVMKtIr0XGqp+1t7W7h7e5pvnSjgBgo/jMpSsy7XJb5+6iwfPnSgPpqyW2C8MI0hAyxjcfvyC4PNnJo6SGcDDM6U8Kq2vaYRdcw0xGNsb400apReXCp4uG+Ah45dZqxQwawuJiP5Cg8du4QQi0sKtWDz5EiOKzOK9rhNg+0jZVSmiUoTAqXjtKaagbmL5SFGS3tQqsQ3Tk/TkU7VF8iZslf/bTWycpRNEWxqSi250NXG8uLgNNmKR1PC5tbeFt63Z8OKJY/uTPKmmBIupSZ6texOqCJ/JtswSFoGWs9qAGXWYMxLlYlh9p52/MLSfDGu3a/nZnLnbgY+9/0vkTDPQzXITlgecJ7Pff9L/Ov3/MpPeHTr+ElhVYHKwszJJz/5ST7+8Y/flAGtYzHu378RreF/vHiRiUJEgLQMQVcmwYGuJvZ0NNZTzemYOW/HXfZDvKowmpRRpqO2MNYIpucnC1zIuoSGVc3AGJwaD/m7YyN8aH8PUk5Vu3QWTuxGPSsx423BlJK4MYwhIsKe1CGWuXQ2JmYmaUjfT99oAUQCpQ3G8xfZEBkt18sMsyZyATrM8s1ThWVtCt69s5svvHCOsvM8SXMEQZn37zQZLbbx0thWNNE56GyIs7U1ze+8az9J2yQIfRJWA0U3u+g3hjpO36jL/Qdmrea/e3aGvGPSnFhMKNcKNjWVOTHu8/Zt3Xyt6OCUFFpHHTW2pEr2nePlM8cPBqLAZbI0v8VTCBgvOhwbml5UUpjvztxOye0iExuoBim190Uy+gk72v0uXCwNIXhh6Pt1FerJkkvO8bENiVaKuGkQ6qjbbHdbhjdvbl8yY/DVk/08dOwykyW33iX0yngOVbVWWG53X7uWN3P3X8NK2Z2C4/P8wCSJqqZJzdKgZhKZjus1HXPU0ddP32iOshcQtwx2tCbJWA04wY2TgV9tg9MbQd6pgB5g8dZGgB4g71RIWLHX/O9Yx9rjuqhUpVKp7gO0jpsPKQUfPLiJt25t58HvvoxtyKqtvFmfIGup5s6GBJubkvRnoy6qWpcQQFPcImVb9Z3jW7a0U3B8RgoVnFBjz+HoaWC0WOH01A7u253m4vhR3KCCBgxpYkq7WvpJ05XZxTfOJjg1No3n9+CHITOOzc/sHuentozPy6oIAaa0ScUy3Lt/L4YxykvDU7juc2xrGpnfrVIPVkAKk2f6fY4P5xal62tCZC8Nz5CUx2iOXUIKSagkaTuouubC8bHt9QV/U3MaJwhJ2iamYdHWsJPx4rO
IBd5DlbCHkkd9UfZDxQuDWQIvQ0uyxEL4StAUV+zptLiSLdOajOGFKhK6S8YYzJaYrni0peJ1Ds3c6wcwVXIpuQFxK3JRNkRI3PQouCbZijcvQFgqO2DatzNaEjTFxmiI+WgSZBLbeODwA0yXXeKmMa+VtnasOze18fSlcaQQuIFHa8LF9SBlGuzsbK57uty1owtDikUZAz9UfPn4FcYK0cJe6xIaK1T48vEr/O0vvA1Yfnf/au3+VyK0moYgnNNKXiv3jBYqlLwAz9Y3POa5/JiXhqaZrnh0NcRBw1jR4dEzIfdsS7K9ORsFTDXK2BJk4NXi9SA/PzA9Qdx0WSoHGzNd/scLfYyWrBvqCFvH6xOrClTuvvvu+mKntSafz69nVNYQq9WNaEnGaE8vbaJWSzVbhuRfvnMff/GDPvqz5YgXYQgyMYs7N7fPC2wgmpgdP2QBHxcporJEthKwo+sebt38HoruNGhBOh6VD2q1/6/3DdGe+CY/t2+ChlhA3jE5MZbma6c72dZcYmNTtKDXlE9NadPbvBvbsnnglk10J/t4eWiIUEUE4TkCt9VOE2hJ9XB0tIIhBYLZ7gVDmnz5+BW2tTZgGyG2HMYLdURerX5WIOjNTHFifAuhNvBDhSHnkwlv3/qeaCddzQh5gYUyesj7e+el8QuuT6jgpdEdbGmcIGEGCFklNiuBF0ogzvv37eTkaJFtrWkuTRUZK0TdNYaUFL2ApoSitzE1zw8mZZl849QgfWNZRgsVDEPznm0j7GjJkrI8KoGNNDaRsmfJt0tlB4QwMO03Mea43Hugiy2tHTx6dpz/8PhpXh6eJuv4NFYdaA90N3HX9i4aEzYP3nuYf/PIi5Sd53jv1gka4j5l3+bKVIZXss0Y0sQPFRU/4G1bO+bdq36ouDBZoH+miG3O70qRQjCYLZF1vBV396/W7n8l7sbtG1vpG83NOZeCPZ2N7GzP4IUhP9uueNstm/BDRbYclWqudcwP9w1wbHAcQzjknQoCg5MjWZSG1pSNbRj8aD8RQp4AACAASURBVGAjTQmbdjlOzHCXtHp4o2FjSztOEKuWe+aj5NlkS4q4vbruqnW8sbCqQOULX/hC/f+FEGQymXkCbetYHRYGJNeqG7FactwHb9mEIQXPXpnk0bPDpGxznv8IRA97SzLG7Rtb+dGlCRZ+XTpmETMNmhI2DTEL05A0Jee3pDcYLZS9gPNj32dX6yhhVfG1IR7w1s0zSCH4m2N7+NC+cfa055DCpSnZMk/TJQh9hrJnoiBFa1xtEiOoBytKSdoaNvCmHb/MkwOn6U6dJmGMYIgKoU5QDroYyrazuSXF5elJ7uiqUOeYRHpxKDRx08OQDhU3gZRwx8a2eQtKzLTY0n4XxwZHabFfQuoRkkY/sfgEmcQ2DBlxVBpiFumYSVu6gQszPWxvGUIHVAO9iA/SnN7Ju3b08sJAH0nbZG9nI1IIRgsVEpZB0QtpT0XkzFBpKn7AnZvb+PYrQ7w0PEN3Q4LJosORrovsa59EIAi1IG76pOzL9A1+r975sVJ2IBWLsauzl2+eGuTFwWnOTeSZLnsIAVMlh8fPjvD9C2N86eglDm1o4VBPMx/eP8m5sWmKrkYIk4SlaLAnicdjPDe0Fds0ePPm9nrGYO49PFF0GMxVyMRNWpOxelnQC+e31V9td/9q7P5XyoSYcmDRMwbw5s3t2O44Xz3Rv+Qzu5oxu4HP5Ynv05McQYoyP73dZLjQykPjrbiBpuj6mEbUnj9e2U8pPMC/ePsWGuKZN4wP03LIxBMgNgI1jkoErTUjhVaaM/PP70/CS20dPxmsGKh89atfXfHDDzzw+nFk/UliuYBEazg2dG26EatJNc/dme7vauL0WA57jjT33MDmQ7ds5ujANENTWfxq6246ZtGStGlLxervWy7IevbKKM3mAGH18KIutCLY11Hg0fMuX+7r4E2bdrO12eBfHbqNnsbZgKniF3CDQt2UECSuskEphNZ
899IRvvix6D5rj/eRNi8BkQS6IRzS1kWOdJcwRDeDOc2BNpuEFZVQRHU8odbkHYMX+ssEuPRmkggdXZe5AeF9e3u5MPoEluxHAV4osA0Hoc5y/Mqj3Lrl3nqwGISKC5P7ANjQMBXtAkWSOzYd5vZt78Xx9bx26Nqu3A1CLs8U2NUW8Ypyjk9T3CJumpyfzLO7I8PujgwjuTx72nJQ7YGSIpqYa50fe3vexVQ5oOj67O3IcHI0t2TwWrtXICpf1OKFmYpH0Q3Y3JxmpuIRKs2xwXHyjSeImRGXqUYsNqVkd1sO02rmTZs7+fChWRXkufyYlmSMTNyi4ESaOUXXJ+8GhFXTvj974hR/9P4jmNchE38zDPzmtoFDlLGUUqz4jP3FwxfIxczr1np54dKjpMwL1RKjIGX7bG8e5l1bHL51rhuYJcBfni6xpSWNFuk3fJBSw7+46xfrXT8x08UNYvh6A7a9f8n3X09H2Gqw1vfbOm4MKwYqzz77LAD9/f1cuXKFd77znRiGwVNPPcWOHTvWA5VVYqlWyOf7p7g4VWBPZ+O8915tl3C19PjCB+wXb926KEiaG9hIKfj37z9Ck5fluZxktFBBAj2ZJD9/eDM/vauHL7xwgXMTBdwqp2NukJUwPdK2R1DTVan7/UAmFtCYCCn5iqMDOU6NGSRjF+vfL6UgYTUQMxuQYmZOsAIgKQUWQjTyP1/u5+LUDBljCC/USKHqnSwCwbaWLKMVn4ovGMy3VjkpEXE2VBqlFSfHGgkxaIpZtKVi/N1Ll5GG4L593bMlrFMDpO0RDGEQojCq59UN1by20Nq5Mw1J0b2Tad/jUBv87IGDxK0kD5+cVTKd2w5tSEHMNPiFI9sIlcYJwjrPqOIHdV7RzvYM6VhAUyJq94ZIE0YQKRJPl3J86G++xamxqE29OW6xsz3DLd3Ndd5N7RxnHY+yF0RtxlVNFlVtV1caQh21kHthSNp28YISMcMiXi3f+GHkU2RIhzs3p/jgLbNBykJ+jCEF+7uaODkyw1C2TKgVhpTETElHOs6Pr0zwbx45zh/fd+uqn521Uqtd7tjfPDW4LDl74TPmh4qzWYee7uvTeglCn4nCuTk8qKgrK0SwtyPPYxc663ICDXGLiZLL7o7GG9I7eb3Btiz+9Xt+ZZ6OSsKKLWs7cCMdYUvhZt5v67h+rBio/PEf/zEAH/vYx3j44YdpaYl2aLlcjk9+8pM3f3RvACzXCumFIZemi2xva1g0ua1ml7AwPb7SA3a1GrqUgndvbuRfHT5S3102xW2+/coQH/+7H9VbfmtaEj++PMm5yTz7Ohs5M+kTa7erJDjmaM5CwTUpuGa1rKSxqvWcuTtQ07DY2LKH6dIzBCoy+otItJqc24VpWLw4OEVT3I0Coup7AkVVG0TSKAOmXQ/blLw0FmmAbMhMkagSUE+ONfBU/wYa45G/znTFw5CaS+OP8dXKFL6qkLQbGcilSRoVQM7rOPJDTXlOW2gtWHzf3m5euPQoE4VzlL08j/Y9Rzno5vTUDgxpcKinmTPjeUYKFSp+wJ7ORu7Y2Mb79mzgM4+fJB2z6qUfQ0hipsFoocKGxiRXsoqca5K2I52WUEUKvoHSzFQE5yYCpIjItgXX55XxPDvaMvzuuw/Mu8a10lCodP1vodIoFS2whpAICbZhoHQcX8VRhEgBccsgboLraxKxRvZ2dNYDoej6LubH7OnIRO7quTINMRvLkDTEZktBz/ZPLttxsxRuVK12pZ3x1Y698BkruD5usLTZ/Gqe2YpfwPELWIaoZ6sif6ooqM/EAxAWnWlN3EzgBCE72xfPD/8UkIkn2N8ze31frY6wtVJHXsfaYlWzxfj4OE1NTfV/JxIJJiYmVvjEOmpYOJlrrTkznmc4X2YwV+YHF0bpbUzVBcjg+nYJ1zrpLgXLkHUZ/q+e6Of5/ilGC5WoRbWqJTGULaGAK9MlzoxHpMOORCvbW4ZRqk4LQaA5M9kIGCRjJunqb1r
YjmsZktu2vIfzE3nylYvY0sFXcSphN2cmN3J6Is9wziFhae7bFSNledTKIelYFHjErTSNmR7OTg5yZabI8bHtjJb3MFPK8txAiaIHUvhIERAzJYZUfGjP86Rsn2KVt+cFDgljCvRi4ReNxjbTi9pCTwx8j3NjzxOo6D1lb4ZATdFsV8gHhxFCsLsjg9Ka4XwFL9C8PDLDRMkhX/EYzlcYKzr1Nt4gCAm05vxknvGCz8mxDHf2TgKSQClAorTi9ESGUM9OzkIIKkHAcwOTxE1j3sQ9l9dUa7WNSkiQsmdbcA0p0Ji4qgeh+6vSweAEIQU34IlLkk9/6we0JGzev3cDf/i+I0vyY4QQdDcmSdkmm5sjrZW5bdalqrjb9rYMV8PV9E5WymBcbWd8PcduiFnEzBrvJuqAmkuGXqoLam6QVPMgEhSrr+vqOQPHt7l72wxbmi+TiQU4gU3O7eLnDl6fH0/BKTKaG6KrcQMN8dc/n/DV6Ai7kfttHTcXqwpU7rrrLn7t136N9773vSileOSRR3jf+953s8f2hsDCyfzMeL6uy9CSsBEIRvIVAPZ0Nl7XLmGtH7Da8UKt6uUCiNpm867PpqYUQkLBDZACHrvQgwI2ZqaImx55x+TcVCNPXOrGC0LCUNGajCFl1I4rCPCCEtlKmfZ0GikMPnL7z/O1k5foGx2h6FpcmXHwdEBDzCSS0xdcnmlid9sopjSgJkGnNRubd3PLxi28eXMX3zkzzKXpItmKx5OXpnECgRRV2X+icssv3XKWlD3fVlyjMISPwgZqIm66KjsFG1v2zOMJBKHPqZETVfuCqASl0YQKJEMIDqAxOTOeZ7RQQWvN5ZkCM2UPxw8ZyZdJxyza0/F6Gy8iKvRcmCqggO+cjUoB+9rzNMQCso7BZKmTxy62zzOHjMajmSw6XJwqcKC7ed5r9TKfELhBSNbx6WpIEDMl3ZlEnWQdKs3mtp9ic+YVBmfOMF3KMVkSnBht5sn+HiSQq7g83DeIEII/vu/WJXe6SSvKoMTMxZ40qapQ2mpwrQZ+c3G1wH3usRcGHssd2xACL9D84MJoPUPV1ZBgR1sDt21srT8nKwVJNQ+iWraq4oeEUqExONw9BQi0FqTtgLbkMKeHH78mh2MvcHnoub8iCCcQKDQS02jno2/6BLb52m5PXgmvRkfYjdxv67i5WFWg8nu/93t85zvf4bnnnkMIwcc//nHuuef6Iv1/api7o4VZQqPSmn2djXWX34FsmV0dmevaJaz1A1Y7XmzO7nyi6DCcrxAqRb+OFnGlNY7S5Byff+zrwDbaSJgeEyWBbcWwTYH0FXnH5+JUkbdvbaMl9nK9a+dHZ4+xsdoBJKXBBw9u4779W5guu/ynp17hlWrZpNayPFOOLOV3teWwpEvCStPbvJsr+T18/bETVDyXdMxnT0c7u9p6+drJfhriNiXXJ1AKW2o6Ug69jVFguFD4U4roNw0VukgY46TtgHJgI8Rmjmx69/xz5OTxghJz0y+iekyLCugKoU4zVnSQQjBZiQTQ4pZBzDSoBGG1xCVoS1VF2ISkNWkRamhXmsmSyzfOdPGdcx20JhWWmeTuHRsx5WBdmA0gW/FwgpBs2eOTX3mWt27p4MF7D9dJqwsn+bgZ+R/94MIYJ0ezi3aoUm5hb8+7+JPvPc9/f3GEqaKHFkHdBqESKn50eZyyFyy5071zcxtaa350eWJRqv5tW9tXXfZZrYHfQqwmcG+IWSQsg1fGcvOyWrXy5lLHfrhvAENoejLJ+mdGChW2tKTnPbMryvMfmPUgKnsF3NDGCTeRsIaIgpTZ+zJhmdckmQ/w0HN/RRiOzeoWoQjDMR567q/45bd+elXHeC3jZnaEXe/9to6bjxVnjL6+Pvbv38/zzz9PS0sL9947G9k///zz3HHHHTd9gG8E1CaxH1+ZoOyHUbtwQ6Je7tnZniFb8fjE23bXSy/XgrV+wOYer6shwYmRGfKuXxdWE2hStsVUycV
XCqWjrpSYaTGYC7FNSSZmUfZDTEPiBwo3DLmt+0q1ayeacJxgsdGaZUgsQ3JiJMt02SVtWxQcnzBUjOZ9/vvxVna29bKrw+SenTu4mLM4OTJNS+wknbFhtC4xOGlz9FI7JbcRISQpW/DOLWPs7yjSkvSw5um0zP5/ZBhok6sokunoN9mGJAC+3jfIBw9uqb830DF8FceSc9VpI3Jk0bOoBDaakCD0yJZzDOcUSkfE2ZRtYgpJKmZS8gIycQvbiLIbLYkYfWNZnECRsk1SREGtEwikNLilp4kfX5mISM9CkK14VY6LoCVlI4Xg6UvjS5JW507ySdvkw4c2c/+BjUvuUPOu4sKUZqLkEwQKy5q1RHT9kMFsuV7CWWqne9/eXv7NI8d5tn+SkheQsk3etrWdB+89vOr78Hq9alYbuLuBYihXxpCz4nRDuTJbWtKLjj0b/Mh5uiq1LIwThDhBSNw0rhok1TyIRvNTfP6pftIxn674ZUDWg0FNlNe7Fsn8glOsZlIWIwgnKDhF4lZyzTMSfqjmdU69Xssjr4Y30jquDysGKg899BAPPvggf/mXf7noNSEE/+2//bebNrA3Emo72vfu7gFemlfbhmgSa03FrnunsNYP2Nzj7Whr4ORoFkNEE7ltGWQSNi2JGNMVj4xlgpBsak7hBiEhFRDR76HkEipFWWvCMEAwUD9+ratkKaO1uGmQc/wqt0FTCUKKrk+oVNX7xuLKjOQ/PX2evOPzwb1jWLEBvCAqB9nSZVPTIPftdvjm2W7u2TbCnRtnqKXVa9ALeJFaQ9mTHOgcIbL008QooTnLlUnww031c9mUSOKqHiw56z4MkdvzjNtJwQ/pSZ7k3u0DpG2fnGtyejzDd853UXAik8Id7Q0EYSQy1hi3kQJOjWXJOj6uH2LIKGir+c2UvIBLU0V6MgnGChWKXkDFDzClpCVps7Ul4iIYcvWk1bnBix8qchWPJ86P8pWXr/D0pQlKbmRuqYMQy5B1K4YQTVsqvuRxAExT8sf33Vo3HFzJF2clXA83YTWBux8qYqbBhsYko4VKvcS5oTFJzDTmlTxhNvipwZCChDTRWnN8aJo/+O5L9WzImfE8h3qaF3n1zA2STMOiM9NBwh5DaYNQJzCEMycTEpUBr0UyfzQ3hEAt+ZpA8T9fOs7lbMN1d7PkKh7nJyPSdmPCRmnNP77cz98fv8xAtoxA09uU4ucPb+aBA5tel10yrzdvpH8qWHHmePDBB4H5gm8AxWJxXfDtOlBz+q2lgWu4kYi9Rth7354NwNo8YH6oePvWDkKleX5gkqRt0pqKVUmXYBkGgYpahCMJegOlIWYZJCwjEqsqOpS8aKefjlk0JhRp28cyDOLWfO7Cwl2jE4Q0xS3OTuQpeQEJU+IFEl3Vb5mpOJieQd7xKDoucXO4yhWZhdawpz3Ls4Pt7O0oIEQtIyBQSiHl4u6NQMWJGWUE4dwjIQhJGRfrnBqIFubNbT/F5QlN0hyp+xv1jTXwvYvtvGXjc6SbJlB6tlX7zo1TADx6voOGeIAfuthGnMa4jSEFp0az+KGiNW4zpVz8agszaGKGQcKUhFpzuLeVW3pauDJd5AcXRtnUnF60g78W0upcTsXxoUgYDiKysiEFgdL4oUYIjWUILCnpSifqDt4rIWmbqxrDcrgebsJqAvfpslvvxFqYHVmqXFoLfrILvqvWgq41dRPG6bLLmfH8IumBhdnNueOshN2kzVrQG7lsX6tkflfjBjRy2WBlsnAGuP2au1k8L+QXvvgkx4amcfyQuGVwZEMLd2QCjhVLjBUcLCNicw3nyzx07DJSiNdll8zryRvpnxJWtcV54okneOGFF/jEJz7Bz/3czzE9Pc1v/uZv8ku/9Es3e3xvOKxVxL4cYe937z5AyQ+u6wFb6phHNrSgtSZmRm3GZ8bzjBWdKDAxDW7paWF3ewZfKWzDQCvF6bE8OSciq0ohMAVsbmpAk8QPHeIm8zprFu4aG2IWB7q
bOTcZOcgGYZQpiRkSTyimyx4Jy0QKQSbuV/1samEIxEyFJRUNsYBfv+0sCcvDUxEp15ACJSxQfj1Y0UjcsIlJ5zZ6ko9W/zYfUrh878w5/tmRw/Wd4gcObObhvnfx8vAErl/k631TjBY9TOmwuWkmytjouY1Egjs3TrGvI0dDLIzk6bPN9I2Y7O9uRQrBga5mZsoepinJOx5+qJEC0nGLjlSs7g1kSEFvUwopZ7tqap5ATmBfE2m1xqmASAiu5EV8lLRt0hi3yFU8Qh0FsCnbZFd7hrt3dr+qNftr5SZc7Tmbm3WpZUdqSNkgdJEglPOChG2tafoHZ4OAUGmGciUuTZc4MZIlCKOMX8qOMi072zP1QGm5zcjsOG/BDxS2MUzMcPBVjKy3gVh+D4cXCBMuh4Z4GtNoJwzHFr2mFKStCUphgGb2d6+GbP8LX3ySF/onkVIQM6MA6vn+CV5UAVs6WuZxvaQQTJZcXhxcbJ75esLrwRvpnxJWFah8/vOf50/+5E/41re+xcGDB/m3//bf8rGPfWw9ULkOrFXEfjP6/Zc6Zt9oDimi8c31Pan4AYJIx6QWpEhBNbOhcfwAEJiGJJ2w6cxkqITdpMwLRI22EWq7RoCCM03CasAyLHa1Z+r6G24YMJgF0Cg/EmCLWxqBoBLY5B0r0qDQELMUtlHt2lGQimkMobGMoOr7E/XyhFqQc+I46k2EtKKIYcgcK+G7Z4aI26318zv3Wv5/z58n706Ssi0ERTKxYDbYqRIP4qbCMlTUgaQFzYmQ9tQkM84AzZmNZCseWgdsaAiZLoU4fkglUGgddYe0xq15Vgd2VUjNCwPu3HCF3qp2TEQA3kTcuvriNpd4WvGDqu9TxEUq+yHt6TiGVgjTpC0V4927uknZFkde4zX7qz1nS2ddFBnzZVoTE3znpEPSbqCnaTcDhT28PJKj7AYMFXzKYzk2NiWRUnBhskjJ8xEi0kTRWlNwPC5Mat66NUQHrLgZmTvOh45lODM2TdzyUDpOoAyGLk+itJinBrwS7jv4Szx8/C/mka2FiIJzAwcpHEI9mw2/Gtk+V/E4NjS9KFASQjBRCejwA5LW/GXEC9Qi88x1rONGsOqi8fbt2/nzP/9z7r//flKpFL7vX/1D61gWNxKx34x+/0DpZY8phcGB7sa6FX3SNnnrlg6U1vzDS1fq9WmtNQU3YH93M5eni5FEvRBkEjbjRYdpN/LLaU/P4PoFEnYDPU270FrxrZf/irJXIFk1X/vwwbv5Rt8AZyemCVWFUAkCFY1NVVuCQ6VoSSQ5Nd7AmzdOgxCYcnbH66tIb8UwohblGgQaU2q0juGo7mraX6NIYsgEWlcWnZ+Kb3J5WvCjy+O8d3fPPA5Ewanw/JVLaEK0luQqkpxjRsHTHJgyCjpE9byaUgIa2xjmoaNn2dZ0ke5Eljf3euxtM7gwleQbZ7oJtIUhBNOOv6ik8NO7u0mbL5EyRyL+jRA0xUPS9kBd9n8lzCWe2tWyXC1DEypNU8LGcRxCQ5K2TVK29bqq2a/0nC3MunSn+miyLxMzTBCCil/k5NCzzLhjwGGSMZPNjTHaWhvY29nIPTu7+L+fObeIixLp2oT8y3fsJWYZq96MXJwqYpk2gbLqbe1+qDg6OI1G86FbNl81s5JJNNOS6qLk5FEoJBIpJYEbEOoYSs83Nb0a2f78ZB7HD4ktsD2o3yOhggUft01Z9whbxzrWAqsKVNra2njwwQc5efIkn/3sZ/nMZz5DT0/PzR7bOpbBarsarsWvouSrZY9Z8QPu3tnNBw5sqh/vm6cGeWloZl6G5alL4/XdeOT5Eu00i65PJm5R8RVbeu+aJ1v/8sD3ODt2FCEiH5uKH3UCaa24o2eC27r6SVadg/vGGviHk604AXjFCnZVmfXxi92A5lB3IcpkKPCqTsZSKGyjWuKptVQQBV9JW/PI6RHyriZlm9y5qY1tbbdyZuwZJAoho88ESvDsYAsvj+TJuQopBG/
e3M69u7r43A+/BGqA/a0uG241eWUiw9deaefkeJq3bpqp95rWtFzQkLJDQKG1xlcSU1TY23aOXW0Rh8WSIV3pgJ6My+GePC8MtfF0fy8lN2CkUKmXFEKlObKhESPMUfbis11Z1bWsf/oMG9veTlMiuez1X1gCqRkilrygHkw12pJtXa184JZNfOTIlhvOpLxWfFTmZjOylTJPn30KN5hz/2sIFCTNEQrBgXrJxDYlF6YK7GhLg9ZVh+/5JnoCGM6XuW1j26rGMveZrmktCREFWl6gePbKJKaUV82WmoZV12oxxexvMSUU3O76b4DVceN2tGWIV8ncCxE3BBubU8yU/fo9p7SmMx1f75JZx5piVYHKn/3Zn/HYY4/xK7/yKySTSTZu3MinPvWpmz22dSyDlboaYqaB64d85aUr8zQyrsbwT1nyqp0Std3pUh4vhhQIwKmWKlqru9iiF+AHEc+i5rorpaDBaCEIfQZnziy5Iz07dpTGWECgINSCmOFxa88kvtL8w4k2VKjxZEjW8ehMxXnsQi/PDCo+cccZElYAQmIZUdYl0q2Bkm9iCDCkxJQSS7q8Y2sD5SBR536MlPdxamKA9sQYmXhI3jM5ORoFH6EOmSy7XJwqYgrB905/PfIVkpH+RSYe8KbeKfxQ8c0zkdP0gY4iTYmIVBz9ttqv1AgRSecXgzg9mTyGlBgiwDJmF4W0HfKm3ilMQ/Lo+R6Kjs90yaW9IVoM7t6e5lsnIi6PUaVRipqirDPD//WDF7HMxmWv/8ISyO6ODBrNydEsSmlCpWhLWvzCbVtvuJPjteqjEnWheTh+Yd69qIj0ggyxdMmksyFBY8Km5AX4oaq3FtumQco22XENJOK5VgdzzSMhCowWqjmvhJoz+eDMGSpelLnc3j6nhHUN3LjGhM2RDS11jkr93CjNwfYkv3z7jiW7fl4vGbd1vD6wqkAlnU4jpeQrX/kKv/Ebv0EqlVrv+vkJYqn6utaaU2M5DCH4xFeenWeGB1fnr5hSrKrF2Q8V/TNFCo5PQ3w2tWsbkTicKQVuGOlJtKXitCQ1fqj41Nv3LKqzV/wCZa+wKFDRSuMGZQJloWqEVAAEu9tyNMQ7AINARQJtpinpbkzyM3s3EAoH07gERJ5ASmu0Al9LtK42HYeaQIVYRgpEgsScGvux4SyXcnv49ukm4nGYKAqmyyGISJk0ZZmMFSqM5PK8d/sEYoFqhRaCA51FHj3fwcOvdPLDS93saBXcv+c8phGRfOdCCsV4oZFNzZPV6zD/dRFVc9jRnOWbYRumkcQ2JVuaU7xvzwYMqUjYDWTLuchAEE1Y7YAKdRxIXPX6zy2BVPyQvZ1N/NyhzbxtSwdSCi6/0sedB1fHkVgJr2UflZq8fcUv1v8W+RuLZUsmvU0pbu1t5YX+SZKWUdcT0hpu7W0laZtMl91r6lR6+tL4vNZopTU9VYuD1Yo3SmHUtVpqmUvTsLgduG//tWez/scvvWNR18/tm1r57f1J3nJwE/ft731D6Kis47WLVQUqf/qnf8ro6Ch9fX38+q//Ol/5yld45ZVX+N3f/d2bPb51LIOF9fX+mVKkzdHWwNOXJ5BivjT/Qv7KUun3lTol5u6Gi27Ai0PTdRVPgHMTeabLHhU/pOD6FN2AxriFaUjeub2Lnz+0ZdFvSFhRt0/ZL9bJtU4Q4ochkijAkdWSELrqKhsLaIgFKB1DE/mlNNgWLakYXqjIi1sYKzhsbJzCNlzyjsVkKUZbKppIa15EaE3e7ZqXCgcIFezuaKRcyDPsCrKVIoGKyjaGOet7O1POkYlFhGEhqnvpanq8KRFwS0+cjnQ7Co3j5UjaPl5oIBBIoRBCVxdBwaS7jc4gT9LyFqnlah1lg9IxnwbbxwlCXhic4tn+SR55ZZiPHNlC2e/GD6eBqAXbCxVaK54fivP0QD8bM0netbNr2R35csTT2j2yFlgNr0oQzltYepbjQwAAIABJREFUX03MLZnUA2dx9ZL
J3EXcq7Xu9rTwkSOb+aPHTlxT5uj+/RsJlebo4HRdKbcmDAnXLt5oGtYiobjr4cbZtsFXfu2uRToqR48erR/zeoQq17GO1WJVgcpTTz3FP/7jP/LBD36QdDrN3/zN33D//fevByo/QcxdXKbLLn/19BmkiDo3vEBhGRFXYXQOp6HsBeQqHk9dGl+Uft+g9YqdEl890V/fDadjJp3pOEO5cn08w/kyLUmbpoSNWe0g6UjH+cXbti5JAlRK841TI1zONpMyp4lCgFq7sCLQkqQVIoRGa4EhBE4gyTsmjm8TEuIGIX6oOT0e0Jq0SVkmhiEYzG7lpbFNFN0Cl6dCtJD89I5R9nbkaYqHFFyTs1ONtDfuIx1jntdLOmayv6uR6YkxUiSYKXuU/AC0xjYNin5IbqqI42nQmqQdVLt4IFASLzRwApv/9c230DdW4ulLE1Q8KHkWDbEAXxmAgUCTsEziVpKNzd1k3VES1pV5EuoQHVNryDsm40XBeKmAKSWZuIUlBT++PEn/TAdv6nVIGMP4YYmcY3B6vJnvXOikIQb92RJPnBvlLVs7VtyR1xYxpTRfPdFfv0eyUxOMxPtvqESzMq/K47mLjzBVPDePUH1483uQYrFf0PXgaryYIPTZ0Xk7SocMZ88vWzJBMK9kstQi/sT50evKHEkZdfdoNM9emSRhmVdtb3410ZiwV825Wcc61hKrClSknG1PBfA8r/63dSyNV4swWJOcd/yw3rlhV7UOauPwwpCENEnaJt+/MMrLw9lFk+gFt8Adt88ec+5ittRuuLbLGylUIv0HMzJoi9yCwQtDYqbBB5bhNcyWAW4BIG4ME4QlSp5NqC1aE0WErLnLaiwjKmucnmjGC0VkmBgowmp5p+D6vDKRZ09Hhtt6W0lYJj/un0AaWbwg5NvnuvnBlR7aU4qSZ9HRkKKn2eSVsVy9u8KQgrdu6eCB/Zu4eOEi50Y8TEMSq5aNkpZBxQ/xgpD37ZwgZqp6UBERHyNWw3BxI+UZj5Ib4gUKT0lGi200xEaoKaooLfCV4lD3ATINHRwfuo2BvGZjZoC46aO1iMS7hIkg5NR4A1qYWNXnLlfxuDBV4NCGFvqzDjvaD6LUXr5x6hwjefDCqONJVwm2A/kybxeRa/BC1dXlr010j2T1jZdoVuJVdadOMzDVj5DzCdXANRnyLQWlNP94op/nByYJVSRiNze7oXTI8SuP1r13knYD3U072dX1JlJ246KSybm+E9y5xDmoLeJr0ZH3oVs2Y0q5ZGZztfNKEPo/sezUOtax1lhVoHLvvffyW7/1W+RyOf72b/+Wr33ta9x33303e2yvS/wkCIMLOzc603GG82WkiHRObMMgVJpDPc2RHP4Sk+jZrLPsArbUbrimqdKaiuOHIZ3VOjqAISAhzWVr6vMnc0HOP8RTV3o4Oz6Gr0z+l9vO4oUGoDBlLRgQKGXw/HBvtXtGE+qofBI3DdIxi6LrM1ZwMKXkw4c2cWWmyGiuxHSk6o8pTbIObGhM8Ol37uPowDQjhQpBGGVLOtNxTCn4xulBbu9KM25IWhI2EyWHbMWn4Hg4foApQg52FQm1iR+GmGK2QyhUBqZ5EIDWVIy4ZTBWcPjexUg5uLfqMO0ENuVgI9u63smReJy3bGkH9hIzFccuP0L/zEX8oESgYvx4MM4Tl7sx57ZYC0HJC4kZUXbGC0MqvmayZINQQFh1fo6uSd7xeXlkhj///qkV78mbZXVvGZIDXZFP0dxMgVI+LfExxBI6HddqyLcQSmn+j28d40eXx+e5HQdhxAN64JZNHL/yaL3cUwuSLoy/iCHkvCCpFrybV3mG18IgdKnMpiHEquaVpQKvtc5OrWMdrzauGqhcvHiRD3zgA+zdu5eenh5GR0f51V/9VV544YVXY3yvO7wahMGFu6qlOjcgyna0JCJ59kM9zbx9awfP9U8uOYm6gV52El1pN9yStBFiVldhLparqS+czCMXWo+8F6clUSZleYR
a4IUGXhgRdC1DYpmCB/a389TlKLg4N5knbhr14ygdZV0GsmXesa2Tn923kb87fplvnh5gYKaCUorNLWk+cmQLP7O3l77RHD+1rXOefDpEHJ13Nwka4hYHe5p5ZSxHrpKrknehvUHTFI86i3yl8XRN9tyoqncGhESLe08mwUC2RMENODa6nRPjW4ibHo2JRna2N/OD8xO8NDxTl1+Pds4fQumAgpPn/KTHXzzzDDFT4Yb+PMlcKcANQ3qbUhhCkrajc+S7Eb8HIOt4CMA0BAe7m4lV7QuWuydvhtV9LXg/MTLD+ckCOcenKW5xsKeFgz1xfMcFFt8/12LItxT+54krPHMl4mtJIzp+jbdlGpKf3t25bNfZ9QZJa2kQOjezObf0utK8slTgtVbZqXWs4yeFFQOVz33uc/zX//pfgUid9rd/+7f5L//lv/D7v//7HDly5FUZ4OsJN2s3WsNK2ZqFnRt7Ohv58KFN3LW9i8aEXSdHLt/WLJadRK/mnQLUX6vxPQwhuWNT65K/tzaZa605M55nIFtiYKbET20ZZE9bFtvwEQgCLXADg5pFW8xM8vvveyv/4fFXmC67jJecetsvRMGSISShjhbpmGXwy3ds5xdu3bqoK2G67NbP4Vz5dIgWZDfU/z97bx4k13Xf937OXXqZ7unZMRgAg43YiH2hSEmBJGs1RdPUZkSyy0nxj1eKqqJ6ScWKU+WUrZId0klelMXOe7Kf5ZKVFz9bpG3JerZJiVZESxQoisS+kAAIApgBZl97777LeX/c7p7ume6enpnuWc/HVZY0M+g+fe7t+/ud3/L9FT6zEILWoI+w36B/KkHADBC3DCK+NEHDLdSoOC5MpPxcH82y3+tO5uHuFgank9ydjJO2HYKmTnOgnd0dzaQth29dvMtYIlMonrw+PMV0OsuvnNxNW6iDQ6bNtpYQY4kMGdslbTloOcetyacTNAzOHN+BJrx7zNQ0MrZDyG8QNLxOlOlUhq7mUMFJgcr3ZCNG3Rc778e3tuO4kpRlc7inhU8c3srfXS7ttsmzkIF8s7Eclzf6x7EdmZtD45Gv29rWGmI8MVm26wwW7yQ1YgJvrc+Vau3+S41O1bLG1aCNo1ifVHVUvvOd7/C9732PkZERfu/3fo+vf/3rjI2N8d/+23/jfe9733Ktcc3QiNNoMfNFa+aT5q/2EN3XGqj6gJlvdoqUzNFTONXb7qnTVtDu+LPzdxiOpdE1wUceGuB4zzggsF0NU3cxhEQzBSFfEAHs23yEkN/TD3m9b5zWgI9oOlt4MIf9JjL33sX7XK4rYT6DHDIlP3eoF9t1+fE7I4wl0qRtFxBMpSVpS6M9ONNKLAQYukvG1rg6FGdPV7unLSMEH9rXw/XhabZEgsQzDt3NAY5tbeMvLvUxFE3ltDoEo/EU74zHuD40xTvjcU5s9fb33Tu6+MmdEdqbfIzG06QsB9t16Y008e6dnYXw/8f2byGRsTh7d4QH0RSWLTE0iAR9bGoO4Liy5LqXuyfrbWjLGVmvINvk2tA0nzi8fW63DTOjFRZrWGMZK5fS0+aIlXn1SNARapvTkpxnKU7SQud5zWfka32uVGr3h8U5XrU4H6tVG0exvqjqqIRCITZt2sSmTZu4fPkyn/zkJ/n617+OrqtcZzkacRrNU+upar72w0oP0a0Zu+K/gflnpwgBuzua2dEeKqRRLj6YrDhF9eMHtvKtC3fRNIHj2OzvjHoheiFwpI4hBUJ4NSpNvmZ62/cXhKzyn+H2WJSLDybIuC5BQ6cz5KcrHODM8R1lH6yzH7yVDPLRLa0kEiM4UvKhvT38zzfeIev4MTUNIWAqlUQTDllHw8zX0EiBIzUCpiSeTZOybMK56207Lk5uREH+/aOpLJcejJO2XTThYIg0sayJ3/AExJJZu+CE/s7jx/nNFy/y6r1RmkyDzpDnrP3uEycJ+mfut7TtIITgo/u3krVd4lmLJtPgp/dGsRxZKKrOU+menH2PzO5
0WQi1GNlyAmX5uorF0uw3aQ6YJfVaeXRN8K7eToI+f6mTJMkJ5i3NSdI0wZMHe/jA7hC29FdUBq7VyNf6XCmnBZNnIY7XQpyP1ayNo1g/VHVUijt72traVDvyPDQi7JunXtGaSg7HuXOjNa2jnCOUd6J8hsbMuMHqKa+EZbO9LcTujmay9hTtTTaOq3l1JlLgYmIKgc8QfODA52ht6p7zGT5+YCvPX7rHWyPTRNNWUY1HqUGdrQGja/DItlY+sqcVKW0uD8ZIZm2Cpo7luFwdnObu/TF+HLvCw5siTKctfEXr3xoRRAI2GVvD1XW8CpVcVMdnsSnkFq5Lk8/gzZEowzm10bDPMyrfv/GA4USKTx8cY0/7FCGfRTRjcHOshVf6ekv27+MHtvLYjk50TTAaT9MVDvBIb0dJKgdKDZrP0Gg3vOu0uTnIYCxVUN+F6vfk7HukUqdLLdRiZLVc4epsgbKlkP8u5gtnh+NpsraLoXudXZ/KfZ7jOz6KK+H64BWydgLLDZBxtyxoanEx+WLW/om3SGSjhHwRetsPlC1mrdXI1/pcKasFw8KjU7Wuq9p8sHqkuhWKPFUdleKbPRAIVPlLRZ6Fhn1rpd7RmvkiLwvJOccyFqlshrDfwpWBEnGsSk5UyDTom0wwHE/jOBabgyYhn+cQSLzPKwQEzTBhf/lwtd/U+dVHds+71u9e6+dc/zhvj8UYjiU4sukOd0fGeT5qs6m5nSf27WfXpvcXCluFgKDhoos4b/R7qRa/4c0VmkwmGHFiTKU0wn5Px8Ub3Oi9V8r28VBnN195/LgX4ZDw/v/ze8QzVmEWT8hnkHFcHt8zxPGeSSAnwe+3eWSrV+wcNPcU9u+5S3e5NRorpLCyjsPrfd5coFoM2p7OZna2hwtaOrXek7V2ulTCdizi6Si72gPcGEnmHFmPco5SOYGypZD/fIausbWlCUMXPNLbUaLrowmd+/GD3J7uxNQzhfv3woMphJh/ts5sLtz9HtcHX8V1HSQulp0mmh5HSpdTu54o/N3sCGmxlk85I1/rc2Wp0amF1NlVmw9Wj1S3QpGnqqNy69YtPvzhDwMwPDxc+O/5IVw/+MEPGr/CNcZ8KZLF0shoTTGVwr4fP7CVhGXP+TyudLg9/EN2RS6hixSODJJyepi2jgBaRSfqhbceYLsuGdvB1AzuR9vZ2zGI7WoETT1XoFrbSbCa05V/8L49FmMwmuLE5jvefB4ElgNTyWlSWa8r4upQD+AS0i5zaFs/AZ9Dlz+AtS3Cvek9dDXdoHfnOM1+GwH4dYeMo+F6pStoGkymu/n5A9tzxa4GX/vJW0wms5i6KKQfohkLy7E41B0rtA8XFHMR7Gqb5MroJHu72giYOrdGY2iCEs0XU9d4ZzzGxw9sLYmslDdoLXx4TyumESJl0/CCR1c6nL/7EtcHr5K141huAM3t4MbwXra1hgn5l2cCcy3fxfz9oWkmjpy5zxYTFbAdi1vD53BcC3LN4d5IA+/nx7Z/FEM3sR2L4eg4qWyGgOnjxki0EPHxGRotAZPpVJbO8MzhsNbnSiX5/FpZSOS2lvlgCkU9qOqofO9731uudaw7FiNVPR+NitYUMzvsK6Xk/z1/h29duMv2ttCcfPXFey9xe+QNAoaL5YAu0oSNdwCYyBwt60RlLIdvXfC6XSaTGdK2y0i8B4CdbVO0Bl2CZnjJdQpAQc5/KJbC0By2RbyCXfAKgLOOS8DQ6R9/iysDBr3NN+luHwQ8VVhI8VBbnK6mSZr9qdwEZs+9EBporkAiSNk+pjLd3Jnah8Ukz/59nEObW+ifTM0p6NSFwDSztAZsTF3Hdl2QedMGzT6L6dQ0bw5r/OKhXq4MTtI3mSiZqAue4uxzl+7yTx55qCSqlDdo0+k0d0Z+xODUy/zdlVJNjUZy8d5LXH3wGrbrjRYwtQw7Wh7QGvTR3fq+ukxgXgjVvov1LICPZybI2CnmtloLMnaKaHqMu6OXChonuyIm96P
tDEa3IoSOqQuklEwkM/zD7eE5s7Hm+yzFLDY6tZDIba3zwRSKpVLVUdm6detyrUNRA42K1uQpF/a9kauv0IRgb1fpgMMnD/YU2iEDhp57DYkEgvoAJ7aenuNE2Y7Fty5c48F0FF0z6W4OIiVYrstA4hB+f5DPnNzO5khHXVopm/0muuZ9tlAgS9DIkjck+bZiV0qi6ShZO8q2yDhCCFxXevUNwtv3TaEEKVtHIhFSIjSB7ZpkHJMf3DkINHGgu5P93b7Ce792b4zbYzG2RYLcm0qUFHQmMgZp20/QdHClQGiekUJAyvKRcfw0C8EnD/dyayw6Z6IueJOyb45EK07K7hv9EbdHlldTw3Ys+iZuYLlyjrluMga4MjjKp49ub6gRW0jasq4p1ZyeToVf8ubAq/RPXC9cj4CRoSfcx4nNFheHH/L+SsKWSBNXh6Z46nDvshv7hUZul+PwpFDUpEyrWF00IloDc0+XjisZjqfRctLr+a6RfFj8A7tDM+2QAgKmTsDIdU5g85F9rYVagJkiwxtk0xN8Yr93mrw0vAuEhk/XGImnOdDdQndkE0adHtCmrvGu3k5evTtK2vaRsn0ETa9eBDyjlsjaJCwfnSETv5HNTV32TI6QYBoauuZiaDqW60VT8k5HwMiiCZ24pRH2myW1BkHTYCpt8f6HuvnR7WHuR1OF8P7erna6W3SS6eu5yIMn4GZoAqn18p6dPWRsh6zrsrczwks3BvAbMyme/FTdq0NTpG2HcJHBPX9/AiltdGd5NTVcV/LXV28wEZ8ga8sSIUDbdXFlkr+79ja3RpP86iO7+di+LQzHU/REmio6Cwt9/4W2ytYzpRoOtOE3m0hbyTm/85tNjMX6Sq6HT9dICY1tkXHOD25H00x6It4YipWs8ViI89How5NCAcpR2VAkszaD0WRFwzD7dJl1nMKAw7wUf/Fr2dI/tx1SeH0/+cnIefKKmRIvVRLyWblaEQqnyYztsLczUvcH3aeObOeN/nFevTdK33QHezsGAFFoh5ZI3hlvIWLcxafbGJrMRXk0sraWqx/R0ISGJlzPickND0zZPvxmGEPXuDVaWmvQHQ4Q8XuFsx/ev6XQMhw0DN69sxPHdXhzOEqzOUjYbxPNGNybaiea3cGBzaJwoj9zbAd/e/1+yWtvaQ4WJmUHzdJrqWuCa0OD7Gmpr5jZfHz3Wj+XB9JsaQoihGesPQfFi2HFswZp18dIPMW//dvzfPnFS55mjc/gse2d/NZHj5JynEUbu8W2ytYrKmDoJns2PcKbA2dxcQq1fBo6OzoOc2/8Wsn10ITA0DRaAg7vfyiCEJGCs1SPGo/FirAtxvlo1OFJoQDlqGwIbNvlN1+8yGt9YySydsEw/M7jxzGKOjFmny7zAw5dV9ITCZacOJt8Bq3BppraIYsVM4X05s8YmlcDsi0ywfnBHei6yY62MGfK5OWXiqYJnnniBH915R6v3Wvj1tjrbG+dIOSzECJI3OpB0+LsbhvOFbd60QCf7iIARwrSThumFsdA82pKcp9zOrOZz53Yw19evlc0X8lL4wxEk/REgrx7RxdXh6awXZf2Jj/HtrTx8/u28PN/+BJ9U+1knGaCRpaM48fUDCKBGLs7W2eUfXX4zLHtnL0ziq6JwqyceMaTop/doQEQz5j4jDCWk5j5YU4nZLYTWQ+Ki1JTzhYC2tsFh84z2HBrrIWA4eP+VJKplEVr0KSjyY/ruvz11X5euTPCe3d2LUo0bCmq0PMZ5uIBf7Pfc/bfn9z5MW8I5MQNklaUJjNCb/t+Dm/7OUZjfXOcelMXpG0/ph4qtLgvpcbDdiySmRg/eHuq0Ha/WBE25XwoVgvKUdkA/OaLF/nJnRHvpGnqSCn5yZ0RfvPFi/zukydL/rb4dJmxHbrDAVwpC/ODoPRBWks7ZIliZu7hbDkyd5q0ef9DzbiymXdtn6sNUi80TfBLx3byvt3dPPv3EHU0UlmvHRVgR+sLAGQdDR8STXj
GVQjJ5cFO3hzbw/HN9+htGaclYNPe1EpXZC//2+6fRxM6f3W5r9DBk0fk/u+pw708dbi3xKh942e3uDuVQBeCgOEjYelkbQdTd5BY7OkK89Sh3kI64+rgNHcn4kylLVoCJse2tPHYjk5COV2W4pSTrgmCPj+97Qe8GhUEadvBclwkLsOJrfzN9cGaDJftWGTdBLZjVU0VFacNp60jSEMi5H00UkQzBu9MtHJucCetAR8PplMgvDU70mUqlSWZtUjbTsE4n78/geNKPrh3c80t8kstip1tmMsN+NOtFmznGH9zfaBCiqly1005p96va/jM3QyljZqiOZWmIhevdSIxRcbx02JuAY4U9hPWhgibkuNXzGZFHJWXXnqJF198ka9+9asAXLx4kWeeeQZd1zl9+jRf/OIXV2JZ65Jk1ua1vrGyJ83X+sbmPNxnny5DpsELbz2oGBavpR1ytmJmceGt5QYw9RBHt3TUFGpf6kOsvclfMEZeO6pLm+88YcNbW66eFUQ+GgCv3e+kya9xZXQPY+mH2ddp0tm6hffv2w3ARDLD9tYQQEn7cE8kyPbWUMFINvtNYhmLgKFzfWi6xLEJGhohE8K+LC3BEE88vA1NEyXD6I4Vzck50tPGZ47tQBd9/PmFO4zEZ6ZfbwoH+NyJXRztPUjWdnl75Cq2k0QTXhdOUB/izsgP+Wv5AT51dGfZfSo2fNHMBKOXL1SdwlucNpRS8LMHuxiKdTISnWAkIdgUDrOpOUjKcnBciSa8e1AgiGdthBBkc9e2Lejj1miUV+6M8OrdUZpzjlk1x6oRqtDlBvxl7QmeP/dXvDm+r2qKqVzXTTWn3nFF1ft6vqnIhbUisF3QRabQfTdtHVsTImxKjl9RiWV3VP7dv/t3vPLKKzz88MOFn335y1/m93//9+nt7eXzn/88169f5+DBg8u9tHXJYDRJImvTVCZSkcjVrDzUGZnzu+LTZS35aomO5YYIMPd95ihm5gpv/bqkt+Moj+4+Me/Ds14PsdnprRbzCk16X0nKp4AAQ5N8YNckgcCBkgnLlwdjPHnIcwya/SZNfoMD3S3s7YrMmcYcMg2+c6WvsHYh4NZolGa/QSxjkbFtPrRrgIc3RYn4bbKOn3sjOq3Bj1Sck5PvCvHmRefFGT2DKqXk7J0RXusbJWN1obth9nbE0DUdSa6F3HyHu6NgOeU7cIqNNMzfMVS8rzdHp7k6OEXScoimDLKOy0g8jaYJ2oJ+tFyLdXPA5xUuu14hsS+3lzdGogxGU9iuxMgNFJwvIrCQothanN1895LES1fmfUqBIJp6B0N7qETYsKYUUxWnXtOpGvGpNhX5aO+HC6lVrwhc5u5nQVAfIGodQmKsehE2JcevqMSyOyonT57kIx/5CN/61rcAiMfjZLNZtm/3bsTTp09z9uxZ5ajUiZ5IE6GcHspsQj6DnkhTTa9TKV9dqwNR7TSpiflPePV8iOUjN5cHRvFrA4CGEDpSzp135LgavZFJok6pnPrsh/5DHc28OTyNz9AK83TyRvKFtx6UrN1xJZOpLK1Bk5FYkp/b9YBHez2FWoBmv831wZ9huZJktrtiOmMimeHK4BSHNrcWUj+mpvEPbw/xs/vjbI00YWoOjz80huWA5TrkbwMBOHYff3b+Fr96al/JZ1vsFN6nckMcv32lj+lUlozjomsafk0jlbXpm0zQ7DPpaQ4Cks6Q31Pr1UC6sK21CU2IQiu2z5gp4K7FEZivKLbWezXfvTQZn/ScQLy6o4DhOXqaSKGJNI4Mz7kmtTgCC9U4me967O46WUitarn15tFFurDW1SzC1ujJ84q1TcMcleeff55vfvObJT979tlneeKJJ3jttdcKP4vH44TDM1/4UChEf3//vK9/9erVuqzz3LlzdXmd1cwOn82l0VIdD1dKjnWFePPKpcLPFrMXL/dHeWsiVXjtKeD+gwfcvn2bn+udHanpoke2Y2tpDCeAHNe5MH5x3vewXckL10b
LSlS8MDxIT3pkwTLvvUBHW5w72RQaAgdtzstLqWE5On49y/BgH9ItmsAs4MbVy5wdiPPWZIpYxmEibaMh6A6bBAyNfa0BulMWf3J9bM7aA06GWxMphJvlSHe8YFwE4LouWVtyb/gi6eh+opYfV86KVAm4dMnmTv84fn3ms9+ZzvD2WBKJIKG5tAazBM0MthQFUbk8fiPDT69fZfTBQMm1yroJopkJiv/aylq5/5zg9fOv4tNCZfe1JWWj21lMHGzpFu4L0xTYruS9nQaf2tvJ16+McWU8RsaW6I5D2NQ5EHLpezDAVDSJhqQzaDI8NFh47Ywj+fFrGVr8leuYeoGedknCcgmZEiM7yoUL3hyrWu/Vl/uj3JyMc6RXx2dYSCDjgmM7+HRBJmvwYGASV0bnXJNb164seuRAJcpdjzxWdoJr166BbWDJNACalNg5jzRrmzwYmMR2pznQHuTyxQt1XVu9np/TGYc7/WMl93KeWq77amEj2JNaqedeNMxROXPmDGfOnJn378LhMInETGdCIpEgEpmbipjN4cOH8fuXFsI8d+4cp06dWtJrrAX+6Nj8XT+L2QvLcfmbsStsK6MLGAeOHj9Sl1PQRDJD69DVipGFvYcOLyqcbTsWk5cvkrLiGK4kaVmInEfhtVH70HVJPGvQvcnTuYCZSMmglLw2nWQsZZC1NXw+P+1NPj50aBu/cnI3pq55ax++it/QS1JCm7pd7l68S1fQpTVoI/HmBeWNqK65ZIlxYsdlMk6QlLOlMJYg//4fPLiNVxNXCp/HcSXX44OYPhtd02iJNDGVSjKdNon4c9GiIjsQy5h0bdpBXDNLrpXtWIxevlCoKbKyFqYvP6E3zLuOvqdiYe1wLEXo3DjTborALLtiSonW0s6jj5zkHz2mFdru4TaNAAAgAElEQVTlu8NB/v7WYGFoZFt2nO5wgP2bInOiCO97bHH3VK33av7vtgTbsbVJAto7hU1zAYlLR2Q/Xd3byqaYFju8sRqzr0cxQTPMY0dPE+zPFFJDpqRQPJ2SvWzu6W1IrUc9n5+W4/Lj2JWKv1/sdV9ONoo9qYWF7kUmk6kafFjxrp9wOIxpmvT19dHb28srr7yiimnrjGFo/O6TJ+foqFiOy0Qys+hwcD3lx6ux1ELJSvoxJbUzmkArSv9I6QXRDU0ihOek5D/r4c0t7OuK8FsvXmQ0J4iXb0kejad58c0BfuWkV2ibH75YXGS7uTnI9rYQnSE/Pt1ToW0yrcK6TM1BEw4CDb9hIkQWXbyD5bgk3ROFdIY2S8I86zieZklu8OFEMstU0uHqcJj3bs8NP2RmCs2t8RY6W0ATpddqKVN425v8dDcH6Z9KlhhyCUQCPqQUhfdq8hmF+qjiOqj/dWuQywNTJe+9VFn2Wu/V2d1L4Kks6yKN5QZo07r5xKlPz+n6aaQaay3XY3Zqta2phZ5Wb9hmSyCw6o38cs0yU6xNVtxRAfjKV77Cl770JRzH4fTp0xw7dmyll7QuyRsG15UlxZ1NPoNwKsqJEwsba9+ITotyLPYhZtsuv/HCeV69O0rWdgn5zTmRpOIHvOu6ZGwT23VxHBc0P5Hgbp7+R59GojGZyPCfXr7OH7x6i+lUlnfGYkSCJrvawwUDognB/akEw7EUTT6D/3VrEEdKXCkL6/QKRV22t3kpzwfRjsKgRO81XIQAXdMRmiCg6QQMnfbQFB8/coCgb8b5K67LsB2J39DpbQ0hpaR/yhNd+5ub3QAc6Y4TCXjCcncm23j57hZ+6Vj5a1W8L1Z2oubZS6au8bmTO7k5FiWZ9Tp8dE0QMnUObW4h7K98X+TroD59ZAeGptXVEaj1Xi39O41p6xhR6xCaSOPKACdDEkM3ll2NdT4ZgKUMI1wt7cBKjl9RiRVxVB577DEee+yxwv8+fvw4zz333EosZUNSrjD1rYkU373Wv6DC1OU8BS30Iea6ks/+Pz/iXP8YEoGuCZKWwyt3hkv0Y8o94C3H5ce
vn+V973pviVPwX370Jj+954muBUwdF8lUMsM7UpZ0To0nM/zej99ESsG5++NsCvnpiQQZiWcKyrKGpvGpo71cGZji9thBhBBsCY8R8mURQmJoJoZeFI0SkLXjjCcm6dY3FfZ2djv5D24O8vyle1wdnGIqlUUgcRz4m7e6+f7b3YTMLLGsid/wsTniw6frHNvSNudaFe/L6+dfrZrumc0vHd3JhfuT/OTOSGHoY0/EU9It914wVx+k3o5Arfdqub+TGGSdECe3tWNkR0tec7k6aGp1RBZSqLvS7cC2YxHPTIAUhANtDbnuivXBqoioKJaPStX1mlhcdf1ynYIWKuv9/KW7nLvvDRgUeGHyWMZLr5TTjyl+wBs6dAQiJU5KsR6NlJLpVBZHguNIRhMZwv40m8J+RuNpJGBqOo50ydoOA9EUnWE/79nRhSNdfLpOxnb40J4efLqOoWvE0u9i3HLp7TLx8w9k7FJF2bTtkLZ9/KeX79ASHOLE1pn0T/GJOH9dNzUHPH0SJI6bJutI0jYkLR8Cr+26JeDjXdur69cYuolPCyHRC2nCSvtevI5nnjjBt6/08Xr/GI4LYf+MESymmj6Iqet1dQRqvVer/d3r50bm3YdGstipyOVYqXZgVzpcuPs9bg2fy02blvjNJvZseoSTOz9W9+uuWPsoR2WDUe+6kuUeSlbLKdZyXM7eHcV2JGZRF4HA045pylgV9WMqUaxHM57MEM9YhHw6iYzEdiWTyTTxjEXGdugI+Xn13iidTT6mUlkSWZt7k3HGExm2RJrYvylCk8+gJegru3fn7w6X1COkbYeUZfPmWCdvPJjCZ0R5a3ga15FouigYVL+hc2s0ysHuFg5saqE96OPN4WmmNQ1TSAwhkBJ8umBPVwsf3LOZD+3tYmB6mI5QW4ljlsd1JS/3T/LC+KvEMyZBn3/OqbvSyfxTR7bPUeSdTTV9kHpPeK71Xi33d7oQfPdaPy9cGy0Udq8WMbLFpG5Wsh344r2XuD74Ko5rkU93pq0kbw6cRROiYZO9FWsX5ahsMBpVV7Ka5oLEMha60DANMact2HElPkOrWT8mT16PxnHdgpJqk2kAXhFr2G+SsV3amnx0NweRUnJ9eJpo2soZMoHjSgajKVwp+eWTu0rSDcV7V1yPkMzGGE0Iro908MM7mxAiRchn4ErJf/nxm7x/dzc+Q6PJZ5CybPqmvEjMge4WDm1u5a2RaRACHUEkYBL2G3SFA+hCYlk/409/+j0MLYUrm4gEd3Hm1KcxdO/+cKXDt974S/zhG7T4HBx/kJTTw/n7h4GZU/d8J/NK98Vi9VqWSq33avHf5RWCkdQ9+rDYGpGlpG4WemCpVx2L7Vj0T7yF6zrMbrd2ceifaNx1V6xdlKOywaiUq3elrFg/sNZo9pu0BE16I030TSVKDKFA8p7c4LuF0JRr6X757aGCkipA0NB5qCOMrgmcop9L6UVvdF0QMg2SlteRo2nexOaPHyjTJ5ujuB7hf75xlf/rtVvInOKv47rE0lYulWV7Lc+5wmCfruM3dIZiKfZ2RbBcl7YmPynLwZWSHW3hwjXf03qL3uYRTE0HNHSRJpG+xrfekHz8yCdp9ptc7HuJRPo6PsMFdE/RNifLfmngGL9wcFvuvy/uZF4yA2r27xo04XkxNCr6sNQakaWkbmo9sFRb42JIWTES2SgSl9mOipSSpBVdNde9FoodOEXjUI7KBqRcDv5Ae7Ch1fXlWoTr3W1Q/HrHtrRh2Q4/uj1MfzSJZXty7Kd6O3n24yfnf7Ey/M7jx/mNv7vAX1y+R9Zx8eka21qbeM+OTs7eHSVg6nSF/AzH0jjSxZUSpBfJOLi5ld0d4UJ9SsKy5x3AKNG5NuzgSI207ZC1nUJ7cdKyMbXSPdM1QXc4wL3JeEGzxW/ohHL7nTe0Apud7RPomsaMsZDYLgxN3+Lf//0lwn6d7eErhYCULMz29WTZhxIHCjU/i00llsyAkp5GiUADAUF
f7ROeG9210qg2/KU4Gkt1nmotLq62xsU8LYJmMyFfBMtO54Y/zOBFKSN1n+zdCMo5cIvpnFTUhnJUNiDlcvCXL15oyBfMtueKzT3a28GjvZ1cGZ6qS7dBuYfGkZ5WTvV2Yho60ykLR7q8d2cXZ47tXPTnNAyN//jUKR7pbedH74zQGvRh6lqhBTcvUqYJwWAsBbluox1tAQ506WScLIZwMHzBmk5gniPgrTVj2YXCYIC05eAPagTN0q9wfsq139BJW970686QN19nOJbGdS3agzFa/DaGpkPOXDiu10IdMDL4jTSa0LGcOJadmybkei3TmhAYWpqwf+YUudhUoqGbbGndx5sDZ3FxkFLmZOB1Huo6OW/4f7m6VhqRLl2qozGRzDCeyNAa9M15jVqdp/mKi+dbY097GanoeTB0k972A0TT4yU1KgAaOr3t1XV6Vgv16pxU1IZyVDYwy1FX8psvXuQnd0a8L7SpI6Xku9fu8+M7I3xs/5a65PvLPTQuPpjk5LZ2fuMjRxZ02s6fzm238kP4l47txGfoJQ/49+7chKF5BaH54YRtQZ2HO96hO/wmTXoChBeXcGQzl/pGObmz/CTiPJ6B1Gn2m6QsT2k0H1EJmDrbW5oKTlIeV8JnT+wsM/16nFOb7xD2DRE00rjSwZUuWUdHSi/1JwRkbB+aaMKVgqTlRxcpkJAP3rhSksj6ObS5p7Cf+ZO5oTkFvRHb1edtUbccl7Tlev9D5v6fFHhBlfmN4HJ1rQgcDm/WuTyQLvn5UtrwFxulyTtnFx5McP7+BD5Dm6PiW6vzNF9x8XxrTOSv3QI5vuOjSOmW7fqZT6dnNVDvzknF/ChHRdEwilt687hSkrJt0lGnoNQKi8/313IyrcUZm306nxofZTDQV/Z0Xq0rpNh5OXN4gkT6AUgLXfMe6gKJQYKrD15DE5U7W/IO0672MJGAiRCCeMbCdiS6BttbQzy2o5OHu1u4PR6bcyLWNFEy/Xpr+Dq3hu+jCwHCJJ620ISDqeE5KwBS0hftoL3ZxHYldydb2dOeLEyUllIghGQs1cUXHt5RWOuTB7eQSv2UaOoOmkgWCnOfPPjpqnt9eWCULv8VTM3A0Az8hlbo/nkwdYtjjlXxdL0cXSvFrdNWNsZDLQFMq4lYtpMmn29JbfiLjdIUO2e9rU0MRlMMRD1hvwPdLYtyniodWOZbY8hceEQFvBqsU7ue4Nj2j87RUakHszV56s1yKXIrZlCOiqJhFLf05nFcieuCI905X+jFfMnr9dCYfTqfkvOfzmc/4Iudl6AB37v6MkkEmiZLygYFDrZr0Femw2G2w+Q3NAwh6Az5iQRMNOF1IB3sbkEIwWdP7CzsQ/GJuPhhDTA4dWPGqEtPxMx1ZS5yIUlkTe5NtXFzYhenQ5KMbZHMWrgSmkxvrEDWMUm5D5HlUEmNzeX+v0fIm7QGBC4+NGykvMnl/r8v64jl99qnpzG1NOANLBSOJGB66y8upi1Xg7IcxmJ267TfyLCjPcbu7lEe3f34khyhajUiJ7ZGSFtTCEoN7WznLJ/mG4qluD+VYG9XZMkaRsX3jambVetYisXvFoOhm7Q2dS/pNYqppslTLXK5UJZLkVsxg3JUFA0j39Ir5czJS9cEmgY62pwv9GK+5PV4aNTzdJ53XmLpCZK57oa5aQzPOSjX2VIundHT0oQjJXs6mwtDDR1XcnLrTJdW3iiXe1h3Ne8gkYmi5fI33nA9gRB+wGUs8x7eHPUxGM3iuJKs42DKCxzuHkUIQdr2HCQJTCYtmny+wr7ObjHO75KgfItx8V67MoAjg+jCc1YsRxLwOr4J+prx6+E5ox7yNSiNNhYVW6cRjMdvIfho0addOJbjcnrXJhxXcnUoX6ul8XDHLTRnkL+9NNfQznbOitOM0+ks//z0frqbg/O8c3kqGfknD34EKF/Hkp9KvVpYLk2ejdA5udpQjoqiYeRbevM1KuDlcYOGQVuTr+QLvdh8fz1k/Bt
xOs93NySzk55TUOKseLoqsztbKjlMD3e3cGMkWugYyhuLjx/YOkcltdzD+t7Y1ZKIjtfr4/3EkUGE1sWBTTqCKMPxNLZj0xUcQRdeB450JTJnsFv8Q+zcFC6830JbjIv3WmKQcnpyLc8CmW9azQ3b+7u3hqrWoDRyfEOjWqfLFQAf2tzCzz20mTujL3N75CZpq7yhreSc6bk031IiSNWM/CePPL7qZe2XW5NnJTonNzLKUVE0lN95/Picrp+nDm2b0/WzlJD1UmX8G3E697ob9jOZfA3H1RDCKfxOomNogu2zOhwqOUxCCLa2NPErJ3fR1uSjNeDjhbce8O//19WSSMMTBzaXf1jnZP9d1/WiKgJMXWA5DilnCxIDIWBvV4TPHNvOw10aZ29ZCKFhuxIHiZQgBIRMi0d7Z8TySlqMZ1GuxXj2Xs+eUBw0w2xv38+hbR/m//vB9apRrkaOb1jo56qVchGzywNT6MJFd6ob2vlSMYt1IGox8qZuruq6i+XW5FnOzkmFclQUZbAcl4lkBvBSCks5QRmGxu8+ebKsjsovOtXl1WtlqTL+jRqueHzHR3ElXBu4jO1MA177raG3cLDnyJwOh3IOk5SSGyNepOOPX3ubsN8gbTkYmsDQPUVax5X85M4Ilj2NU+FhDYJdnYcYifWRysZobWohafXwYHLvHCOfsbO8cqsJnTSGJtCQCE1DAI4M0NU888A3dJNtbftLJP/z697WNrfVdO5eexOKJzMHObolwJOHvX8zkczUFOVq1PiGip+L8p+rFqqlGK8NDbKnZX5D2wjnbK0I71WjUY7lfKwmRe71jHJUFAVcV/Kdq308f/Ee96cSSLzOgn98fCefOLw0bYomnzFntk69v+RLeb3ZBgDBkg2AJnQe2fU4x7d/mFg6SsbR8OsuzYFIWUNXzmG6MRLlwXSSrS1NhP2eU3L27khhZtCNkShDsRSW43LxAfyTYwH8eqZYxw0X70H+yK4nAUo6IsoVqgZ9fiLBXSTS18jXYXipIpdIcPecmUDFkv+pbIxgUW1FLXvd5DM4sW1TSYfVQqJcjTIW5T5X2G1ZdAtttRRjPGPiM8JYTmLO74oNbSNma62Uka8nC3WYFWsL5agoCnz3Wj9/fuEuw7EURu7hNxhN8ecX7iBEYyeqrjSzDcCta1d4rE6f19BN2kIdNf1tsRGPpS2G42m2tjQVOjyyjoPjSobjaVwpGY552h5CQNKC4XgnXU33CJoGadvBcrzKj+FEG39zfdArRA3MnI4rGfkzpz7N8+cgmnoHZAJH+okEd3Pm1NyW42LJ/1raQmsxto2Kci2Ecp/r0sXLi+4gqeZ8BX1+etsPcHuk9shUvZyz9WLkF+owK9YOylFRAGC7kvP3JxhLZNCKZ+MIGImnufBgYkMIGeUNgLFCueZiI943GecPzt4i7J/5mvp0HVPXyNgOD6aTTKct4hkLV3rS3ecGd/DYNg1NG8ZyEjgyQMrZwrR1ZEFiaIZu8MuP/mNS2Qw/fv0s73vXe8tOV863sxpaiJQNzf7WgpM7H/MZ20bWoCwEQzfrkvqYz/k6tfMwulgZQ7sejPxCHWbF2kE5KgoAEpbLVCpL1nYx9VIjbTne75SQ0fJh6hrb28IlTgp49Qybm4P0TyUYT6RJ23khOQj7DIZiWf7h7jYe7j5AyGfhygCS/Kwfz+h/bP8W0rZTU9og6PPTEYjMcVLy7az9EzeYTE6RcfxEs5tJusc4tqWjLjL2jUhzrDTVnC9NiBUztAs18pbjMp0pFW1cLdTLsVSsHpSjogAgZGq0Bg18hlaiewKe0WwN+laNkFG1IXSNHlC3nFQ6ge/pbGZba5C/vNyfm8jspRU6mvwIAePJDBk7QsAMl7yelJLLA5P89vcvISVLmo1z/u5LXH3wGmnbxXElQiRpNt/BcATn7x8D6pcqrHcNykreI5om+IWD23jPzi6gfLH6Shra+d67uL36Tv8YP45dach8JYWiGOWoKAA
wNMHJbe28NTLNcCxVSP9ICd3NAU5sXZ66gGrMN3J+OQbULTflTuCnejs4vWsTb49NkcjEsF1/IWriSklHkx+fMfcz3xiJMpHKFkTjoLz67uzR9bNPzrZjcX3wCnZukGE+U+hK0HiAoR2eVyiv0TLn5ViuIYa1vH8sbWHogkd6O/j0kR1r5h4tbq/265XvIYWinihHRVHgqUO9uFKW7fpZDUJG1YbQ5f97owfULTdl5wppknN3X+Lxhy4iZZJE1kffdDtXR3ezudkrvD3S01bSCuu4koFoki2RppLoTLEuSfG8okTGon/KmyHjy8ZKTs6xdJSsnfD0dXP6KnlMkQKZImkFy6YKl0vmvBzLNcSw2vuf6x/n7bEYw/E0Wdvlp/fGONc/wTNPnFj1zspyzFdSKMqhHBVFAU0TfProDn7xUG/ddFTqRbWH5Pn7EwhB1QcozJ2Hs5YoTn+cv/sit0feIGC4WI5GS8DmcGCYne0hJrPHOL7Vq3nIf/5k1kYI71rmu4eKyeuS/Oj2cMFwvz0WYzSRRhcafsdiNzNG/R/tasdyA5hamtnSG0nLR8r2VRTKq1XmvN4Rl0Ya2VpSSfn3f3ssxkA0iSZEoRbs7N0Rvn2lj88c21H2364W1DA+xUqhHBXFHExdW/TMkEZR7SE5lcoWDPFskhmbPzt/i77JMeIZk6DPv6Bw/2qreSlWEQ0YXgTCcryaoiZjkJ1d7yt8tuJITMDQ+T9+eK3sazb5DAKGzsUHE9wajTIYS3FnPI6meQW6PtfCcWXBqH9s/xYy7hZM7R00IXALNU2SgXgnwYBZduZJLQqomqYVinQT2SgBo5nejgOc2rm0iEsjjOxCUkmxjFVoN9dmfX7HlbzeP8ZTh3tXxT1WCTWMT7FSKEdFsSao9pBsDfrmnOw9XHziPMnkOJsDaRx/kJTTw/n7h4Hq4X5XyooD8VYyRF+iIiogYOoEDE/UTWDzkX2tJesrjsRUa41N2w6XBiaZSGZmHA8JsbSF5jpkHYegZpDM2qRthx2d7+fuqKTJGEQjmZu+3M5Y+mH+8b6OsqnCWhRQbwz+rKRIFyYYiZ/l1miUzz1yZlF7bzkuluMSMMs7Oos1sgtJJTX7TQxdlO2qM3UNx2XVRyRWg7aNYmOiHBXFmmC+hyQw53dh/TKh5n4MoQMCXaRzA/Dg0sCxquH+H92PMe03VrTmpaxq7GwVUUnBsQjMoyJarTU2bTtMpy00IRAIb7qx6+JKyNiuN6CQGaP+icM7+O61D3J5YJSsncDQm3h0dztnju3AX8EhmE8B1dQCXB+8Qtp25xTpjkbf5q+v3uFTR3fXvH+zIx59kwlcKXm4u6XgLC3WyC40lWTqGo/0dvDTe2Mlfy8l9ESChP1rIyJRfA9lcpG8ldC2UWwslKOiWDPUIgCW/13IBx3BESxntgESBPUBhhIHKp5gLcfl5lSaLT0rUzRYLaVQUBEdeoOM45KynFzkQTI02o00BiuOO6imS5K2HVoDJhPJLCBxXJdo2kICmuvy5sgUh7pbOdXbUfg3C9U4mU8BNWWlyNrxolTSDKaW4srAAE8e2lnz3s+OeOzfFOH68DQ3RqJsbwstSUBuMamkTx/Zwbn+Cc7eHcFxJaau0RMJsqezuWyqbDVSfA/9+LUM73vsyJpYt2JtoxwVxZphPgGw4t8JGed7VzPYzlyDrYs0Yb9V8QQby1hk7LnGEpanaHC+lMLxHR/lxkiUqfjbGJpF2vZxP9rBxaFt3JyYf9xBOV2SZr/J0S3t3BiZ5srQJLbjYugaGqBpkrF4BrtLzjHqC9U4qaaAOp5IkXUCSJkqW6Q7ntRr3vtyEQ8hBIc2t+K4kn9+ev+iCsVd6RBLTxA0Qguu19A0wTNPnODbV/p4vX8Mx4Wwv7TFfq1g6hotfl05KYplQTkqijVHNeOY/53taDT5msnY01iOV8GRx5EBDm3uqfiQbfab+MvokEDjiwZrSSmA4M3xfbx6txm/niFt+3C
kDksYd5BPrV0dmmA0liZtOwgh8OkaXUGDDzzUjaFrOFKisfganWoKqK3BJjLuFvza21DyHl6RbktTsOa9rxbxyNgOpq4taH/ybdW3Mxe5fen7NPma2dfWw5vje9C1mVRXpVRScRrvM8d28NTh+kwOL2YltGkUiuVAOSqKdUk+zZDMeK2v3nA+iQAiwd184vCuiv/W1DX2tQaYznW65FmOosFaUgrgdTqlLIHtlnZnLWXcgUTSP5kg7bi59mHPXUhYLm+PxdjeFqpbNKmcAqqpa+zs+gDn7mZoDQwTNLKFaNFA4gC/vK/2va93h0q+rdqWNqYwSVlxhHuThzvg5uT+iqnIamm84n2cSCS4MTLE/k2baQ+FFrS2ldSmUSiWA+WoKNYNs0+UxWmGZDaGzwjT255vda0eFXj/tmYe+NuXfSBeNQPrN3Qsx6XZb9Ia9NV13IHluFx8MEnQZ9IW9CGl9JwVIJ3NMhhLsW9TpOEFn584vAN4nL+4dJvxxBRpy6SnJcJnT+xY0N7Xs0OlYlu1JmgyB/n1D34iN5BxbnRkvjReOpPlt1/8E5p9Q4R8WV5+y0csu5nfevxpAn5fTeurVZtGoVirKEdFseapdqJc7JA3TazMQLxyBlZKyfXhaXQh+M8vX6fJZ2A5Lh1NPkaKdDmWMu4glrGYSmWxHUmz3yCWtgrJF1d60Zy9nZGG74GmCT51dDtPHtq2ZNHBek1fnq+t2nYTtDfNnY9TSxrvt1/8EzaH+/BcQkHYZxHy9fHbL/4Jz37i8/OurRZtGpUGUqx1lKOiWPPMd6JcypC3eg/Eq4XZBrZvMoEA9m+KFAySoQke6mjG0LS6jDsojtJ05D5vPGvjup5E/q6OMGeWUTm1HqKD9Zq+PF9bdaWW8PnSePcmpmj2DeWclBkEgmbfEBOJxLxpoFq0aZZjwOFYPM3lwQmO9rTTGQ40/P0UGwvlqCjWNOvxRKlpgicP9vCB3SFStskfvvrOnFSVoWsYusYff+69hbqVpYw7MHWtZChlZyhAe5PEdiQBV/DLJ3ZV1EdZ7SzV2Sxuqy4m31Zd6f6ar05mYHqckC8LZYqTQ2aWGyNDvGfXQ1XXtlgnql6k0zYf/IPvc2M0Whhcub8rwg+/8DECAWVeFPVB3UmKNc1qOVHWi9lpLJ8RJqy3knSPAaVOSF4ltl7jDsoNpdzeFuJUs78utTnLNY6gEe+Tr3e6+eAiSLukrboS89XJHO7p4Ce3fIR91px/m7B87N+0ed51zadN02gn/YN/8H2uD02haQJDE16acmiKD/7B93n1Xz7R0PdWbByUo6JY01Q8UUowjTCGtrAOipVmdhrLshO0+acwbY1p61jJ39a7VbrSUMrLFy8saWzAQmbiLAXXlfzVlXu80T/u1drkZg7V433ybdXOWDsHDu+pud6pWp2Mpgli2c2EfH0l6R+JJJat3P0z2xGrpk3TSKbSNjdGo3P2VtMEN0ajjMXTKg2kqAvKUVGsaeacKKWnsmo5LpPRVv7jD9+q2VgVG4CVoGwaS3gn86AcIGodQua+so1slZ6vPmShEYuFzMRZLK4r+bd/d4FX741iOxKfodEdDmA7LlJ6M5DqoS+iCZ3mQO0RuvnqZH7r8af57Rf/hLBvkLBpEbdM4tkefuvxp8t+xkoO30KKxusVcbo5mcZyXIwy3yvLcbk8OMGH9m5Z9OsrFHmUo6JY8xSfKCcS02QcHylnC0n3CDC/USxnAMKpKCdOyGUdQFgpjRUwdKTMook0sUxg2VqlZ7OYyMhCZ+Islm9f6ePs3RE0ITB1LwUxGI2zp+0GI5MT/O0la0X1RSrVyfhMk0d3P87r94a4Fx3tTGwAABopSURBVJ2iM9TKo7s34zPnOhrzOXzzFY3XO7K1ry2Aqc9tkc9/3qM9ayflqljdKEdFsebJh+Uf3vJBvvryG7gyUIg8wPxGsZwBeGsixXev9S/bAEKoksYS0B5q4bOPPlJRr2M5WExkZDEzcRa
K5bg5SXqJVjSZ+MTmu/SEB9GEjsRclfoi+T31mX62tHYD5fe0Hg5fvSNbrQGD/V2RQo1KHteVHNzcotI+irqxrE+7WCzGF77wBX71V3+Vz372s1y4cAGAixcvcubMGT73uc/x3//7f1/OJSnWESkbYplSJyVPsaprMZUMgCY8A2A5LpbjMpHM5KT4G0c+jTX7hJovjAz6/Evq7FkK8xnKSntTb4XYcsQyFo5Lyb7owmFbZBwpvfXmf5PvBrOdufdCI6h27yxkT/MOXzkq3duLfa+F8MMvfIyDm1sRQmC7nkjgwc2t/PALH1vU6ykU5VjWiMo3vvEN3v3ud/P000/zzjvv8Gu/9mt8+9vf5stf/jK///u/T29vL5///Oe5fv06Bw8eXM6lKdYBizGKVU/8GZs/v3CHd8bjDS0CLWalCiPnY7GRkXoqxFai2W8S9htsbg4yGPUGGgaMLEEjC7l5RcUdwMvRDVZLmmUhe7pUh69Rka1AwODVf/mE0lFRNJRldVSefvppfD5PFtpxHPx+P/F4nGw2y/btXujx9OnTnD17VjkqigUzn1EUOMTSUyXFhtUMQN9UAkPX8Blaw4pAZ1NtaN9KshRDWS+F2NkUj0w4tqUNOxcVGIqliGUMkraP1oBDwCitR1kOfZFa0iwL2dOlOnyNjmx1hgOqcFbRMBrmqDz//PN885vfLPnZs88+y9GjRxkdHeVf/+t/zW/8xm8Qj8cJh8OFvwmFQvT398/7+levXq3LOs+dO1eX11kPrIe92ColtzMxbk6lydgSvyHY1+rDiF/kL346hC1TGCJIs76ZbuMQQmiEU1HemkiViKrZrmQiOkHEScx5jxeGB+lJj5TtdlivXL54oew+uVJyoD3I5YsXqv77XqCnXZKwXEKmxMiOcuHC6KLWIqXLsH2NmDNzPcPaZlqzvUTcLKbPRReCgNuFTxvAsmbSIhJJ2G3h0sXLi3pvKP89caWDLdMYIoArNV64Ngpza0zn3DsL2dPy93aArRmbc+fm7uVU2ubmZJp9bQFaA8aSrt9C9mIjo/ZjhnruRcMclTNnznDmzJk5P79x4wb/6l/9K37913+dRx99lHg8TiIxYwwSiQSRSGTe1z98+DB+/9KK8M6dO8epU6eW9BrrhfW0F+96pLQF80r/97k5fB+hC0x8gENM3qenczMndz7OiRNzw/RadIyOrk2Eypw0k1mbvYcOL7u0/kqRvzfK7VOjU2HlOH/3RWLD9xHmzPWMy/t88Phm/vfen2c6nebOyI8YmEwTTUtcaaMJQXOwk94ldv3M/p7kBfoGi+ZMdYT30trRRZNv7lDB2fdOuT09vLmVDzzUTUvQNydSMvveLhdJqaQW+4PPf5Tv3x6s2/VbT8+MeqD2Y4aF7kUmk6kafFjW1M/bb7/Nv/gX/4L/+l//KwcOHAAgHA5jmiZ9fX309vbyyiuv8MUvfnE5l6VYh+TbQatJ7PdPvMXurpOEA21ztC7Onz/PixPlw+H1FlpbK9Rrds5SqGVkQt/oj7g94unq+IwAUkpc6bK1dU/du33KzZnqHz9PT2j7HIE+mHvv5Pf0Y/u38GAqyfkH41wdmuJnfWMVHYn5RgJUUov98P/9Eq/+yydW9PopFIthWR2Vr371q2SzWZ555hnAc1K+9rWv8ZWvfIUvfelLOI7D6dOnOXZs7hdcUV+WS858pSmnTSKlxHYypLMJXrjyh4T8kULBat4AGJpoeBHoWmUlBjXmmW9kQjwzMceREUKgC52BqbexHatuNT8VnSZN0B4YZjJjoWkz71Xu3ikuur34YIKJZIYtkSb2b/KiygutiRqLp2tSi90o0UDF+mBZHZWvfe1rZX9+/PhxnnvuueVcyoalUjfC1jKiTeuBctoktpPBdi2E0NCEVlFfo1FFoIrFM98QPqRYttlP1Zwmn57h6JYAV4ecqvdOvugWYDptoQnBYDQFwIHulgUL410enFBqsYp1hxJ822BU6ka4nYnxrkd
WeHENYLbEvpQSx3UAgS6MQttquWnLqyHVoShlviF84UDbsk0TruY0NfmaeeLwfp46rFe8d4q1TVKWTdZ2MXWBEF7n0t6uCLomFtQ+fLSnXanFKtYd6qm7gagm+nRzKt1wQbOV4viOj7Kv+xGCZhgpXUBiaCamXvrgz5+4Z5NPdSgnZXVQfD2RkqAZZl/3Ixzf8dF5RfPq2epdy3tVu3eKRdx8uo7PmPkby3HJOg6wsJqoznCA/V0RXLd0Ta4r2d8VURonijWJiqhsIKqJPmVsWRc589VIsTZJPDPBP7z156Ttxp+4FY1hPq2Z5RTNW8p7FWub6JqgOxxgIJrMzSvS8On6omqifviFj83p+jm4uUWpxSrWLMpR2UBUE33yG2Ldd7IYuklrUze97ZVTB6tBXE1RG5WG8C2naN5S3mu2iFu+gHYwlqI96EPPFXQvtCZKqcUq1hvKUdlAVFO33Nca2DCpjdUqU6+oL/NNE14N71VcsJ2yHA50t/CZY9v5uYc2l9VRWQhKLVaxXlCOygajUifL1kz5gWfrkWqn4Hzbtu2uzy4oxepCFWwrFPOjHJUNRqUHYzkJ7vVO8Sl4dtv21Pgog4G+ZVddVWxMVlKbRqFY7SjXfYOiOllKKdazaPIZIL227e9em3/ulEKhUCgah7JSig1PtbbtSwOT67ZtW7G+sRyXiWRG3b+KNY9K/Sg2PNXathcitqVQrAYqqU+rNKZiraIiKooNT7W27Y0ygFCdvtcPc9KYqDSmYm2jIiqKDU+1tu31PoBQnb7XF/OlMWudGaRQrCbUHatQ4LVtn9zmdQAlszYINsQAwkaevlWUZvkpluWfTT6NqVCsNVRERaFgbtv2rWtXeOzI9pVeVkNp1OlbRWlWDpXGVKxHVERFUZGNeCLOt20bG8CgNur0rWokVo58GtOZJVjouJJjW9pU2kexJlERFcUc1Il4Y9CI07eqkVh5KqlPr/c0pmL9ohwVxRzyJ2JdEyUnYoBPrvN0yEaiEUXEG63VOz9yYTVJ3ytZfsV6QzkqihLUiXhjUe/T90apkVgLUUcly69YLyhHRVHCRjsRb3TqffreKK3eKuqoUCwf6+OpoagbG+VErCilnrOf5rR6s75avdXIBYVieVERFUUJ1U7ER7e0qpy3Yl7We42EijoqFMuLclQUc5hdtxA0dSzH5ergNK/3ja/KfLxi9bFeayRU1FGhWF6Uo6KYw+wT8Q9vDXFpYBIhUPl4xYan3nU4q7FzSKFYTShHRVERU9do9ptcHZpSXUAKRRH16JZaC51DCsVqQDkqiqqofLxCMZd61OGoziGFojbUUVhRFZWPVygqs9huKdU5pFDUjnJUFFVRs0MUivqjphwrFLWjrIxiXta7LoZCsdyoSKVCUTuqRkUxL+tdF0OhWG42ioKvQlEP1LdBUTP1VC9VKDY6KlKpUNSGiqgoFArFCqAilQpFbShHRaFQKFaQ9argq1DUC+W+KxQKhUKhWLUoR0WhUCgUCsWqRTkqCoVCoVAoVi3KUVEoFAqFQrFqUY6KQqFQKBSKVYtyVBQKhUKhUKxalrU9OZlM8mu/9mtEo1FM0+Q//If/QHd3NxcvXuSZZ55B13VOnz7NF7/4xeVclkKhUCgUilXKskZUnnvuOQ4dOsSf/umf8tRTT/FHf/RHAHz5y1/mq1/9Kn/2Z3/GpUuXuH79+nIuS6FQKBQKxSplWSMqTz/9NI7jADAwMEAkEiEej5PNZtm+fTsAp0+f5uzZsxw8eHA5l6ZQKBQKhWIV0jBH5fnnn+eb3/xmyc+effZZjh49yj/9p/+Umzdv8o1vfIN4PE44HC78TSgUor+/f97Xv3r1al3Wee7cubq8znpA7UUpaj9mUHsxg9qLGdRelKL2Y4Z67kXDHJUzZ85w5syZsr/7H//jf3D79m3+2T/7Z3znO98hkUgUfpdIJIhEIvO+/uHDh/H7lyY7fe7cOU6dOrWk11gvqL0oRe3HDGovZlB7MYP
ai1LUfsyw0L3IZDJVgw/LWqPyh3/4h3znO98BvMiJruuEw2FM06Svrw8pJa+88gqPPPLIci5LoVAoFArFKmVZa1Q+85nP8G/+zb/hL//yL3Ech2effRaAr3zlK3zpS1/CcRxOnz7NsWPHlnNZCoVCoVAoVinL6qh0dnbyx3/8x3N+fvz4cZ577rnlXIpCoVAoFIo1gBJ8UygUCoVCsWpRjopCoVAoFIpVi3JUFAqFQqFQrFqUo6JQKNY0luMykcxgOe5KL0WhUDSAZS2mVSgUinrhupLvXuvn0sAkyaxNk8/g2JY2njrUi6aJlV6eQqGoEyqiolAo1iTfvdbP+fsTADT5vDPX+fsTfPfa/MrWCoVi7aAcFYVCseawHJdLA5PosyInuia4NDCp0kAKxTpCOSoKhWLNEctYJLN22d8lszaxjLXMK1IoFI1COSoKhWLN0ew3C+me2TT5DJr95jKvSKFQNArlqCgUijWHqWsc29KG48qSnzuu5NiWNkxdPdoUivWC+jYrFIo1yVOHejm5rR2gkAY6ua2dpw71ruSyFApFnVHtyQqFYk2iaYJPHtnOLxzcRixj0ew3VSRFoViHKEdFoVCsaUxdo73Jv9LLUCgUDUIdPxQKhUKhUKxalKOiUCgUCoVi1aIcFYVCsS5QM38UivWJqlFRKBRrGjXzR6FY36iIikKhWNOomT8KxfpGOSoKhWLNomb+KBTrH+WoKBSKNYua+aNQrH+Uo6JQKNYsauaP4v9v796Doir/OI6/DxdJwUs3mxwHk0zTqMY01AatcUoSBVpMEipvZWoypimBhEoDXUgbZ3CaUbOyIWeITKFURqnGKU2UZtLSzMYRt/BCijTKYu66+/z+8McWkjP6k5+sns/rv3Me9uyzn3l29wvncL5y/VOhIiLXLPX8Ebn+6V0sItc09fwRub7p35NF5Jqmnj8i1zcVKiJyXVDPH5Hrk37tEBERkYClQkVEREQClgoVERERCVgqVERERCRgqVARERGRgKVCRURERAKWChUREREJWCpUREREJGBdczd8M+Z8Tw+3290qxzt79myrHOd6oCyaUx5/UxZ/UxZ/UxbNKY+/XU4WTd/nTd/vF7LMxUYC1OnTp/n111/behoiIiLSinr37k3Hjh1b7L/mChWfz4fL5SI0NBTLstp6OiIiInIFjDF4PB7Cw8MJCmp5Rco1V6iIiIiIfehiWhEREQlYKlREREQkYKlQERERkYClQkVEREQC1jV3H5UrtXv3bhYvXkxRURFOp5OsrCwsy+Kuu+5i4cKF/3rF8fXG4/GQnZ3N4cOHcbvdTJ8+nV69etkyCwCv10tOTg7V1dVYlsVrr71GWFiYbfOoq6sjOTmZDz74gJCQENvmAOBwOIiIiACge/fuPPXUU7z++usEBwcTGxtLenp6G8/w6lm+fDlff/01Ho+H1NRUYmJibLs21q5dy7p164Dz9wvZt28fRUVFtlwbHo+HrKwsDh8+TFBQEHl5ea3/uWFsZMWKFWb06NFm7Nixxhhjpk6daiorK40xxsyfP99s3ry5Lad31axZs8bk5+cbY4ypr683Dz/8sG2zMMaYiooKk5WVZYwxprKy0kybNs22ebjdbvPiiy+aESNGmAMHDtg2B2OM+euvv0xSUlKzfYmJicbpdBqfz2eef/55s3fv3jaa3dVVWVlppk6darxer2loaDCFhYW2Xhv/lJuba4qLi227NioqKszMmTONMcZs3brVpKent/rasEf5+1+RkZEsXbrUv713715iYmIAGDZsGN99911bTe2qevzxx3nppZeA8/+/HhwcbNssAB599FHy8vIAOHLkCJ06dbJtHgUFBYwbN46uXbsC9n2PAPzyyy+cOXOGyZMnM378eKqqqnC73URGRmJZFrGxsbbJY+vWrfTu3ZsZM2Ywbdo0HnnkEVuvjSY//fQTBw4cYNSoUbZdGz179sTr9eLz+WhoaCAkJKTV14atTv3ExcVRU1Pj3zb
G+G8aFx4ezunTp9tqaldVeHg4AA0NDcycOZNZs2ZRUFBgyyyahISEkJmZSUVFBYWFhWzbts12eaxdu5abbrqJoUOHsmLFCsC+7xGAG264geeee46xY8dy6NAhpkyZQqdOnfzj4eHh/P777204w6unvr6eI0eOsGzZMmpqapg+fbqt10aT5cuXM2PGDBoaGvynCMFea6NDhw4cPnyYkSNHUl9fz7Jly6iqqmrVtWGrQuVC/zxn5nK5mn0IXe+OHj3KjBkzSEtLIyEhgUWLFvnH7JZFk4KCAubOnUtKSkqzPhV2yeOzzz7Dsiy2b9/Ovn37yMzM5OTJk/5xu+TQpGfPnvTo0QPLsujZsycdO3bkzz//9I/bKY8uXboQFRVFu3btiIqKIiwsjGPHjvnH7ZRFk1OnTlFdXc3gwYNpaGjA5XL5x+yUx6pVq4iNjWXOnDkcPXqUCRMm4PF4/OOtkYWtTv1cqF+/fuzYsQOAb775hoEDB7bxjK6OEydOMHnyZDIyMnjyyScB+2YBUFpayvLlywFo3749lmURHR1tuzxWr17Nxx9/TFFREX379qWgoIBhw4bZLocma9as4a233gKgtraWM2fO0KFDB3777TeMMWzdutU2eQwYMIBvv/0WY4w/iyFDhth2bQBUVVUxZMgQACIiIggNDbXl2ujUqZO/P0/nzp05d+5cq3+f2O4W+jU1Nbz88suUlJRQXV3N/Pnz8Xg8REVFkZ+fT3BwcFtP8f8uPz+f8vJyoqKi/PteffVV8vPzbZcFQGNjI/PmzePEiROcO3eOKVOmcOedd9pybTR59tlnyc3NJSgoyLY5uN1u5s2bx5EjR7Asi7lz5xIUFMQbb7yB1+slNjaW2bNnt/U0r5q3336bHTt2YIxh9uzZdO/e3bZrA2DlypWEhIQwceJEAHbt2mXLteFyucjOzub48eN4PB7Gjx9PdHR0q64N2xUqIiIicu2w9akfERERCWwqVERERCRgqVARERGRgKVCRURERAKWChUREREJWCpURGyupqaG6OhokpKSeOKJJxg1ahSTJk1qdkOvy7V27VqysrIAmDJlCrW1tRf92cLCQr7//vvLOn6fPn2abTc0NNC/f/8Wz7Nz504cDsdlHUtEAosKFRGha9eulJWVUVpayoYNG4iOjvb3P7pS7733HrfddttFx6uqqvB6vVf0HBERETz22GNs2LCh2f7S0lLGjBlzRccWkbalQkVEWhg4cCCHDh0CYPjw4cyaNYu4uDjq6uooLS3F4XCQlJREdna2v91AaWkpcXFxjBkzhi1btviPNXz4cGpqajh79izZ2dnExcUxevRoNm7cSGlpKXv27CEnJ4f9+/fjdDqZNGkSDoeD1NRUfv75Z+D8X31SU1NJSkpiwYIF/zrnMWPGsH79ev/22bNn2bJlCwkJCQAsWbKElJQU4uLiGDduHMePH2/2+KVLlzZrWto0b6/Xy5tvvonD4SAxMZFVq1ZdabwichlUqIhIMx6Ph/Lych544AH/vmHDhrFp0yZOnjxJSUkJxcXFlJWVcfPNN/P+++9TW1vL4sWLWb16NZ988kmzvidNioqKaGxspLy8nA8//JB3332X+Ph4oqOjyc/Pp0+fPmRmZpKRkcG6devIy8vz390zLy+P5ORkysrKms3rn2JiYjh16hQHDx4E4Msvv2Tw4MF07twZp9PJwYMHKS4uZtOmTURGRvLFF19cUh4lJSUArFu3jjVr1vDVV19d9qkqEfnf2bopoYic98cff5CUlAScv3X8fffdx5w5c/zj999/PwA7duzA6XSSkpICnC9q+vXrxw8//ED//v255ZZbAEhISKCysrLZc1RVVZGSkkJQUBC33npri9M0LpeLPXv2MG/ePP++xsZG6uvr2blzJ++88w4AiYmJ5OTktHgNlmXhcDhYv349M2fOpKyszH978x49epCZmcmnn35KdXU1u3btIjIy8pKyaWrS2PR6Ghsb2b9/v216uYi0NRU
qIuK/RuViwsLCAPB6vYwcOdJfKLhcLrxeL9u3b8fn8/l/PiSk5UfLhfucTie33367f9vn89GuXbtm8zh27BhdunQBoKnbh2VZ/hbyF3I4HEyePJm0tDSqq6v9TeP27NnDnDlzmDhxInFxcQQFBXFh9xDLspq9hqYOsF6vl4yMDEaMGAHAyZMn6dChw0WzEpHWpVM/InLJBg0aREVFBXV1dRhjyM3N5aOPPmLAgAHs3r2b2tpafD4fGzdubPHYBx98kPLycowx1NXV8cwzz+B2uwkODsbr9dKxY0fuuOMOf6Gybds2nn76aQAeeughPv/8cwA2b96M2+3+1/l169aNbt26UVhYSFJSkr+gqaqqIiYmhtTUVHr16sW2bdtaXMB74403cuDAAQB+/PFH/zUsgwcPpqSkBI/Hg8vlIi0tjd27d7dCmiJyKfQXFRG5ZHfffTfp6elMmDABn89H3759eeGFFwgLCyMnJ4eJEyfSvn17evXq1eKxaWlp5Ofnk5iYCMD8+fOJiIhg6NChLFy4kIKCAhYtWkRubi4rV64kNDSUJUuWYFkWCxYsICMjg+LiYu69917Cw8MvOsfk5GReeeUVKioq/Pvi4+NJT08nISGB0NBQ+vTpQ01NTbPHxcfHs2nTJuLj47nnnnvo168fAOPGjcPpdOJwODh37hzJyckMGjSoNeIUkUug7skiIiISsHTqR0RERAKWChUREREJWCpUREREJGCpUBEREZGApUJFREREApYKFREREQlYKlREREQkYKlQERERkYD1H6Gc/ivcW+meAAAAAElFTkSuQmCC\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -168,12 +150,12 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -205,12 +187,12 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -249,7 +231,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.7" } }, "nbformat": 4, diff --git a/examples/walkthrough.ipynb b/examples/walkthrough.ipynb new file mode 100644 index 000000000..2cd3bac52 --- /dev/null +++ b/examples/walkthrough.ipynb @@ -0,0 +1,135 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Walkthrough\n", + "\n", + "This notebook contains the code for the [walkthrough in the quickstart guide](http://www.scikit-yb.org/en/latest/quickstart.html#walkthrough). We've purposefully omitted the text of the guide so that you can follow along in code using this notebook as a template! The scikit-yb developers also use this notebook to verify that the quickstart code is correct, so if this code doesn't match what's on the guide, please leave us a note on our [GitHub Issues](https://github.com/DistrictDataLabs/yellowbrick/issues)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from yellowbrick.datasets import load_bikeshare\n", + "\n", + "X, y = load_bikeshare()\n", + "print(X.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from yellowbrick.features import Rank2D\n", + "\n", + "visualizer = Rank2D(algorithm=\"pearson\")\n", + "visualizer.fit_transform(X)\n", + "visualizer.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from yellowbrick.features import JointPlotVisualizer\n", + "\n", + "visualizer = JointPlotVisualizer(feature='temp', target='feelslike')\n", + "visualizer.fit_transform(X['temp'], X['feelslike'])\n", + "visualizer.poof()" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from yellowbrick.regressor import ResidualsPlot\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Create training and test sets\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.1\n", + ")\n", + "\n", + "visualizer = ResidualsPlot(LinearRegression())\n", + "visualizer.fit(X_train, y_train)\n", + "visualizer.score(X_test, y_test)\n", + "visualizer.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "from sklearn.linear_model import RidgeCV\n", + "from yellowbrick.regressor import AlphaSelection\n", + "\n", + "alphas = np.logspace(-10, 1, 200)\n", + "visualizer = AlphaSelection(RidgeCV(alphas=alphas))\n", + "visualizer.fit(X, y)\n", + "visualizer.poof()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import Ridge\n", + "from yellowbrick.regressor import PredictionError\n", + "\n", + "visualizer = PredictionError(Ridge(alpha=3.181))\n", + "visualizer.fit(X_train, y_train)\n", + "visualizer.score(X_test, y_test)\n", + "visualizer.poof()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/paper/figures/figures.py b/paper/figures/figures.py index 6279dc18b..13c1741bf 100644 --- a/paper/figures/figures.py +++ b/paper/figures/figures.py @@ -31,8 +31,8 @@ ) # Quick reference dataset 
objects -Dataset = namedtuple('Dataset', 'X,y') -Split = namedtuple('Split', 'train,test') +Dataset = namedtuple("Dataset", "X,y") +Split = namedtuple("Split", "train,test") def _make_dataset(X, y, split=False): @@ -58,8 +58,8 @@ def load_concrete(split=False): path = os.path.join(DATA, "concrete", "concrete.csv") data = pd.read_csv(path) - X = data[['cement', 'slag', 'ash', 'water', 'splast', 'coarse', 'fine', 'age']] - y = data['strength'] + X = data[["cement", "slag", "ash", "water", "splast", "coarse", "fine", "age"]] + y = data["strength"] return _make_dataset(X, y, split) @@ -81,7 +81,7 @@ def feature_analysis(fname="feature_analysis.png"): """ # Create side-by-side axes grid - _, axes = plt.subplots(ncols=2, figsize=(18,6)) + _, axes = plt.subplots(ncols=2, figsize=(18, 6)) # Draw RadViz on the left data = load_occupancy(split=False) @@ -130,7 +130,7 @@ def regression(fname="regression.png"): def classification(fname="classification.png"): # Create side-by-side axes grid - _, axes = plt.subplots(ncols=2, figsize=(18,6)) + _, axes = plt.subplots(ncols=2, figsize=(18, 6)) # Add ClassificationReport to the reft data = load_spam(split=True) @@ -153,11 +153,11 @@ def classification(fname="classification.png"): def clustering(fname="clustering.png"): # Create side-by-side axes grid - _, axes = plt.subplots(ncols=2, figsize=(18,6)) + _, axes = plt.subplots(ncols=2, figsize=(18, 6)) X, y = make_blobs(centers=7) # Add K-Elbow to the left - oz = KElbowVisualizer(MiniBatchKMeans(), k=(3,12), ax=axes[0]) + oz = KElbowVisualizer(MiniBatchKMeans(), k=(3, 12), ax=axes[0]) oz.fit(X, y) oz.finalize() @@ -171,9 +171,10 @@ def clustering(fname="clustering.png"): plt.tight_layout() plt.savefig(path) + def hyperparameter_tuning(fname="hyperparameter_tuning.png"): # Create side-by-side axes grid - _, axes = plt.subplots(ncols=2, figsize=(18,6)) + _, axes = plt.subplots(ncols=2, figsize=(18, 6)) # Load the concrete dataset data = load_concrete(split=False) @@ -187,7 +188,7 @@ def 
hyperparameter_tuning(fname="hyperparameter_tuning.png"): oz.finalize() # Add LearningCurve to the right - oz = LearningCurve(RandomForestRegressor(), scoring='r2', ax=axes[1]) + oz = LearningCurve(RandomForestRegressor(), scoring="r2", ax=axes[1]) oz.fit(data.X, data.y) oz.finalize() @@ -197,8 +198,7 @@ def hyperparameter_tuning(fname="hyperparameter_tuning.png"): plt.savefig(path) - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( description="generate visualizations for JOSS paper" ) diff --git a/paper/paper.bib b/paper/paper.bib index cd1ed1f5c..59409d8fb 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -5,7 +5,7 @@ @article{zenodo Gray, Larry and others}, title = {Yellowbrick}, - month = Jul, + month = Nov, year = 2018, doi = {10.5281/zenodo.1206239}, url = {https://doi.org/10.5281/zenodo.1206239} @@ -44,7 +44,7 @@ @misc{scipy title = {{SciPy}: Open source scientific tools for {Python}}, year = {2001--}, url = "http://www.scipy.org/", - note = {[Online; accessed ]} + note = {[Online; accessed 2018-07-30]} } @article{kumar2016model, @@ -55,7 +55,8 @@ @article{kumar2016model number = {4}, pages = {17--22}, year = {2016}, - publisher = {ACM} + publisher = {ACM}, + doi = {10.1145/2935694.2935698} } @article{liu_wang_liu_zhu_2017, @@ -90,7 +91,8 @@ @inproceedings{kapoor2010interactive booktitle = {Proceedings of the SIGCHI Conference on Human Factors in Computing Systems}, pages = {1343--1352}, year = {2010}, - organization = {ACM} + organization = {ACM}, + doi = {10.1145/1753326.1753529} } @article{rajaraman2008more, diff --git a/requirements.txt b/requirements.txt index b7f02d62b..6e0f3680c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,29 +1,28 @@ ## Dependencies -matplotlib>=1.5.1,!=3.0.0 +matplotlib>=2.0.2,!=3.0.0 scipy>=1.0.0 scikit-learn>=0.20 numpy>=1.13.0 cycler>=0.10.0 -## Testing Requirements (uncomment for development) -#pytest>=3.4.1 -#pytest-cov>=2.5.1 -#pytest-flakes>=2.0.0 
-#pytest-spec>=1.1.0 -#coverage>=4.4.1 -#requests>=2.18.3 -#six==1.11.0 +## Optional Dependencies (uncomment to use) +#umap-learn>=0.3 -## Python 2 Testing Requirements -#mock>=2.0.0 +## Testing Requirements (pip install -r tests/requirements.txt) +#pytest>=4.2.0 +#pytest-cov>=2.6.1 +#pytest-flakes>=4.0.0 +#pytest-spec>=1.1.0 +#coverage>=4.5.2 -## Optional Testing Dependencies (uncomment for development) +## Optional Testing Dependencies (pip install -r tests/requirements.txt) #nltk>=3.2 #pandas>=0.20 +#umap-learn>=0.3 -## Documentation (uncomment to build documentation) -#Sphinx>=1.7.5 -#sphinx-rtd-theme>=0.4.0 +## Documentation (pip install -r docs/requirements.txt) +#Sphinx>=1.8.3 +#sphinx-rtd-theme>=0.4.2 #numpydoc>=0.8.0 ## Build Requirements (uncomment for deployment) diff --git a/setup.cfg b/setup.cfg index 3308f57b3..505034391 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,6 @@ [metadata] description-file = DESCRIPTION.txt +license_file = LICENSE.txt [wheel] universal = 1 @@ -12,17 +13,30 @@ tests = True test=pytest [tool:pytest] -addopts = --verbose --cov=yellowbrick --flakes --spec +# TODO: add --spec and --verbose back to addopts +addopts = --cov=yellowbrick --flakes python_files = tests/* flakes-ignore = __init__.py UnusedImport __init__.py ImportStarUsed test_*.py ImportStarUsed test_*.py ImportStarUsage + conftest.py UnusedVariable examples/* ALL tests/checks.py ALL + docs/_build ALL spec_header_format = {class_name} ({path}) filterwarnings = + once::UserWarning once::DeprecationWarning once::PendingDeprecationWarning + ignore::sklearn.exceptions.ConvergenceWarning ignore::FutureWarning + +[flake8] +# match black maximum line length +max-line-length = 88 +per-file-ignores = + __init__.py:F401 + test_*.py:F405,F403 + conftest.py:F841 \ No newline at end of file diff --git a/setup.py b/setup.py index bf144208f..06f23e4f4 100755 --- a/setup.py +++ b/setup.py @@ -2,10 +2,10 @@ # setup # Setup script for installing yellowbrick # -# Author: Benjamin 
Bengfort +# Author: Benjamin Bengfort # Created: Wed May 18 14:33:26 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt and NOTICE.md # # ID: setup.py [c4f3ba7] benjamin@bengfort.com $ @@ -29,69 +29,88 @@ ## Package Information ########################################################################## +## Basic information ## Basic information NAME = "yellowbrick" DESCRIPTION = "A suite of visual analysis and diagnostic tools for machine learning." -AUTHOR = "Rebecca Bilbro, Benjamin Bengfort" -EMAIL = "info@districtdatalabs.com" -MAINTAINER = "Benjamin Bengfort" +AUTHOR = "The scikit-yb developers" +EMAIL = "yellowbrick@googlegroups.com" +MAINTAINER = "The scikit-yb developers" LICENSE = "Apache 2" -REPOSITORY = "https://github.com/districtdatalabs/yellowbrick" +REPOSITORY = "https://github.com/DistrictDataLabs/yellowbrick" PACKAGE = "yellowbrick" +URL = "http://scikit-yb.org/" ## Define the keywords -KEYWORDS = ('visualization', 'machine learning', 'scikit-learn', 'matplotlib', 'data science') +KEYWORDS = ( + "visualization", + "machine learning", + "scikit-learn", + "matplotlib", + "data science", +) ## Define the classifiers ## See https://pypi.python.org/pypi?%3Aaction=list_classifiers CLASSIFIERS = ( - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: Apache Software License', - 'Natural Language :: English', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Topic :: Software Development', - 'Topic :: Software Development :: Libraries :: Python Modules', - 'Topic :: Scientific/Engineering :: Visualization', + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: 
Science/Research", + "License :: OSI Approved :: Apache Software License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Scientific/Engineering :: Visualization", ) ## Important Paths -PROJECT = os.path.abspath(os.path.dirname(__file__)) +PROJECT = os.path.abspath(os.path.dirname(__file__)) REQUIRE_PATH = "requirements.txt" VERSION_PATH = os.path.join(PACKAGE, "version.py") -PKG_DESCRIBE = "DESCRIPTION.rst" +PKG_DESCRIBE = "DESCRIPTION.md" ## Directories to ignore in find_packages -EXCLUDES = ( - "tests", "bin", "docs", "fixtures", "register", "notebooks", "examples", +EXCLUDES = ( + "tests", + "bin", + "docs", + "fixtures", + "register", + "notebooks", + "examples", + "binder", + "paper", ) ########################################################################## ## Helper Functions ########################################################################## + def read(*parts): """ Assume UTF-8 encoding and return the contents of the file located at the absolute path from the REPOSITORY joined with *parts. """ - with codecs.open(os.path.join(PROJECT, *parts), 'rb', 'utf-8') as f: + with codecs.open(os.path.join(PROJECT, *parts), "rb", "utf-8") as f: return f.read() def get_version(path=VERSION_PATH): """ - Reads the __init__.py defined in the VERSION_PATH to find the get_version - function, and executes it to ensure that it is loaded correctly. + Reads the python file defined in the VERSION_PATH to find the get_version + function, and executes it to ensure that it is loaded correctly. Separating + the version in this way ensures no additional code is executed. 
""" namespace = {} exec(read(path), namespace) - return namespace['get_version'](short=True) + return namespace["get_version"](short=True) def get_requires(path=REQUIRE_PATH): @@ -101,9 +120,19 @@ def get_requires(path=REQUIRE_PATH): """ for line in read(path).splitlines(): line = line.strip() - if line and not line.startswith('#'): + if line and not line.startswith("#"): yield line + +def get_description_type(path=PKG_DESCRIBE): + """ + Returns the long_description_content_type based on the extension of the + package describe path (e.g. .txt, .rst, or .md). + """ + _, ext = os.path.splitext(path) + return {".rst": "text/x-rst", ".txt": "text/plain", ".md": "text/markdown"}[ext] + + ########################################################################## ## Define the configuration ########################################################################## @@ -113,21 +142,30 @@ def get_requires(path=REQUIRE_PATH): "version": get_version(), "description": DESCRIPTION, "long_description": read(PKG_DESCRIBE), + "long_description_content_type": get_description_type(PKG_DESCRIBE), + "classifiers": CLASSIFIERS, + "keywords": KEYWORDS, "license": LICENSE, "author": AUTHOR, "author_email": EMAIL, + "url": URL, "maintainer": MAINTAINER, "maintainer_email": EMAIL, - "url": REPOSITORY, + "project_urls": { + "Documentation": URL, + "Download": "{}/tarball/v{}".format(REPOSITORY, get_version()), + "Source": REPOSITORY, + "Tracker": "{}/issues".format(REPOSITORY), + }, "download_url": "{}/tarball/v{}".format(REPOSITORY, get_version()), "packages": find_packages(where=PROJECT, exclude=EXCLUDES), - "install_requires": list(get_requires()), - "classifiers": CLASSIFIERS, - "keywords": KEYWORDS, + "package_data": {"yellowbrick": ["datasets/manifest.json"]}, "zip_safe": False, - "scripts": [], - "setup_requires":["pytest-runner"], - "tests_require":["pytest"], + "entry_points": {"console_scripts": []}, + "install_requires": list(get_requires()), + "python_requires": ">=2.7, !=3.0.*, 
!=3.1.*, !=3.2.*, !=3.3.*, <4", + "setup_requires": ["pytest-runner"], + "tests_require": ["pytest"], } @@ -135,5 +173,5 @@ def get_requires(path=REQUIRE_PATH): ## Run setup script ########################################################################## -if __name__ == '__main__': +if __name__ == "__main__": setup(**config) diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 000000000..f22f58eb0 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,94 @@ +# Yellowbrick Tests + +*Welcome to the Yellowbrick tests!* + +If you're looking for information about how to use Yellowbrick, for our contributor's guide, for examples and teaching resources, for answers to frequently asked questions, and more, please visit the latest version of our documentation at [www.scikit-yb.org](https://www.scikit-yb.org/). + +## Running Yellowbrick Tests + +To run the tests locally, first install the test-specific requirements with `pip` using the `requirements.txt` file in the `tests` directory: + +``` +$ pip install -r tests/requirements.txt +``` + +The required dependencies for the test suite include testing utilities and libraries such as `pandas` and `nltk` that are not included in the core dependencies. + +Tests can then be run as follows from the project `root`: + +```bash +$ make test +``` + +The Makefile uses the `pytest` runner and testing suite as well as the coverage library. + +## Adding a Test for Your Visualizer + +The `tests` package mirrors the yellowbrick package in structure and also contains several helper methods and base functionality. To add a test for your visualizer, find the corresponding file to add the test case, or create a new test file in the same place you added your code. + +### Visual Tests + +The primary test you should create is simply to test your visualizer from end to end and make sure that no exceptions occur. + +Visual tests are notoriously difficult to create --- how do you test a visualization or figure?
Moreover, testing scikit-learn models with real data can consume a lot of memory. To assist with this, we have two primary helpers, `VisualTestCase` and the `yellowbrick.datasets` module. + +Leverage these helpers to create your tests as follows: + +```python +import pytest + +from tests.base import VisualTestCase +from yellowbrick.datasets import load_occupancy + + +class MyVisualizerTests(VisualTestCase): + + def test_my_visualizer(self): + """ + Test MyVisualizer on a real dataset + """ + # Load the data using the Yellowbrick datasets module + X, y = load_occupancy() + + try: + visualizer = MyVisualizer() + visualizer.fit(X) + visualizer.finalize() + except Exception as e: + pytest.fail("my visualizer didn't work") +``` + +### Image Comparison Tests + +Writing an image-based comparison test is only a little more difficult than the simple test case presented above. We have adapted `matplotlib`'s image comparison test utility into an easy to use assert method: `self.assert_images_similar(visualizer)` + +The main consideration is that you must specify the “baseline” (i.e. expected) image in the `tests/baseline_images/` folder structure. + +For example, let's say you create your tests in `tests/test_regressor/test_myvisualizer.py` as follows: + +```python +from tests.base import VisualTestCase +... + def test_my_visualizer_output(self): + ... + visualizer = MyVisualizer() + visualizer.fit(X) + visualizer.finalize() + self.assert_images_similar(visualizer) +``` + +The first time this test is run, there will be no baseline image to compare against, so the test will fail. Alternatively, if you are making a correction to the existing test `test_my_visualizer_output`, and the correction modifies the resulting test image, the test may also fail to match the existing baseline image. The solution is to first run the tests, then copy the new output images to the correct subdirectory under source code revision control (with `git add`). 
When rerunning the tests, they should now pass! + +We have a helper script, `tests/images.py`, to clean up and manage baseline images automatically. It is run using the `python -m` command to execute a module as main, and it takes as an argument the path to **your** test file. To copy the figures as above: + +```bash +$ python -m tests.images tests/test_regressor/test_myvisualizer.py +``` + +This will move all related test images from `actual_images` to `baseline_images` on your behalf (note that you must already have run the tests at least once to generate the images). You can also clean up images from both actual and baseline as follows: + +```bash +$ python -m tests.images -C tests/test_regressor/test_myvisualizer.py +``` + +This is particularly useful if you're stuck trying to get an image comparison to work. For more information on the images helper script, use `python -m tests.images --help`. diff --git a/tests/__init__.py b/tests/__init__.py index eb9ab5155..4f88b2007 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -4,7 +4,7 @@ # Author: Rebecca Bilbro # Created: Wed May 18 10:48:46 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: __init__.py [0c5ba04] benjamin@bengfort.com $ @@ -17,31 +17,30 @@ ## Imports ########################################################################## -import unittest import matplotlib ## IMPORTANT! Set matplotlib to use the Agg backend before imported anywhere!
-matplotlib.use('Agg') +matplotlib.use("Agg") ########################################################################## ## Test Constants ########################################################################## -EXPECTED_VERSION = "0.9.1" +EXPECTED_VERSION = "1.0" ########################################################################## ## Initialization Tests ########################################################################## -class InitializationTests(unittest.TestCase): +class TestInitialization(object): def test_sanity(self): """ Test that tests work by confirming 7-3 = 4 """ - self.assertEqual(7-3, 4, "The world went wrong!!") + assert 7 - 3 == 4, "The world went wrong!!" def test_import(self): """ @@ -58,6 +57,7 @@ def test_version(self): """ try: import yellowbrick as yb - self.assertEqual(yb.__version__, EXPECTED_VERSION) + + assert yb.__version__ == EXPECTED_VERSION except ImportError: self.fail("Could not import the yellowbrick library!") diff --git a/tests/base.py b/tests/base.py index 449def1ef..09ae206b6 100644 --- a/tests/base.py +++ b/tests/base.py @@ -15,19 +15,32 @@ ########################################################################## import os +import sys import inspect -import unittest import matplotlib as mpl import matplotlib.pyplot as plt from matplotlib import ticker -from matplotlib import rcParams - from matplotlib.testing.compare import compare_images from yellowbrick.exceptions import ImageComparisonFailure +########################################################################## +## Environment +########################################################################## + + +def is_windows_or_conda(): + """ + Simple detection mechanism to determine if the tests are running in a + win32 or Anaconda/Miniconda environment. 
+ """ + is_windows = sys.platform == "win32" + is_conda = os.path.exists(os.path.join(sys.prefix, "conda-meta")) + return is_windows or is_conda + + ########################################################################## ## Module Constants ########################################################################## @@ -36,51 +49,49 @@ TESTS = os.path.dirname(__file__) ACTUAL_IMAGES = os.path.join(TESTS, "actual_images") BASELINE_IMAGES = os.path.join(TESTS, "baseline_images") +IS_WINDOWS_OR_CONDA = is_windows_or_conda() ########################################################################## ## Visual Test Case ########################################################################## -class VisualTestCase(unittest.TestCase): - @classmethod - def setUpClass(klass): - """ - In order for tests to pass on Travis-CI we must use the 'Agg' - matplotlib backend. This setup function ensures that all tests - that do visual work setup the backend correctly. +class VisualTestCase(object): + """ + The visual test case class ensures that all tests inside of the class + can execute image similarity tests inside of a clean matplotlib global + figure. + """ - Note: + def setup_method(self): """ - klass._backend = mpl.get_backend() - super(VisualTestCase, klass).setUpClass() + Before a visual test case method is run, ensure that the previous + figure is closed and the current axes are cleared. 
- def setUp(self): - """ - Assert tthat the backend is 'Agg' and close all previous plots + See: https://docs.pytest.org/en/latest/xunit_setup.html """ # Reset the matplotlib environment - plt.cla() # clear current axis - plt.clf() # clear current figure - plt.close("all") # close all existing plots - - # Travis-CI does not have san-serif - rcParams['font.family'] = 'DejaVu Sans' - - # Assert that the backend is agg - self.assertEqual(self._backend, 'agg') - super(VisualTestCase, self).setUp() - - def assert_images_similar(self, visualizer=None, ax=None, tol=0.01, **kwargs): + plt.cla() # clear current axis + plt.clf() # clear current figure + plt.close("all") # close all existing plots + + # Travis-CI does not have sans-serif so ensure standard fonts are used. + # Note that this must be set before each test otherwise it will be reset by + # the Yellowbrick styles. + mpl.rcParams["font.family"] = "DejaVu Sans" + + def assert_images_similar( + self, visualizer=None, ax=None, tol=0.01, windows_tol=None, **kwargs + ): """Accessible testing method for testing generation of a Visualizer. Requires the placement of a baseline image for comparison in the tests/baseline_images folder that corresponds to the module path of the VisualTestCase being evaluated. The name of the image corresponds to - the unittest function where "self.assert_images_similar" is called.
- For example, calling "assert_images_similar" in the unittest + For example, calling "assert_images_similar" in the test function "test_class_report" in tests.test_classifier.test_class_balance would require placement a baseline image at: @@ -92,18 +103,22 @@ def assert_images_similar(self, visualizer=None, ax=None, tol=0.01, **kwargs): actual_images/ - visualizer : yellowbrick visualizer + visualizer : yellowbrick visualizer, default: None An instantiated yellowbrick visualizer that has been fitted, transformed and had all operations except for poof called on it. ax : matplotlib Axes, default: None The axis to plot the figure on. - tol : float + tol : float, default: 0.01 The tolerance (a color value difference, where 255 is the maximal difference). The test fails if the average pixel difference is greater than this value. + windows_tol: float, default: None + Similar to the tol parameter, but targeted for testing on a + windows environment. + kwargs : dict Options to pass to the ImageComparison class. """ @@ -112,7 +127,12 @@ def assert_images_similar(self, visualizer=None, ax=None, tol=0.01, **kwargs): # Build and execute the image comparison compare = ImageComparison( - inspect.stack(), visualizer=visualizer, ax=ax, tol=tol, **kwargs + inspect.stack(), + visualizer=visualizer, + ax=ax, + tol=tol, + windows_tol=windows_tol, + **kwargs ) compare() @@ -124,6 +144,7 @@ def assert_images_similar(self, visualizer=None, ax=None, tol=0.01, **kwargs): ## Image Comparison Test ########################################################################## + class ImageComparison(object): """ An image comparison wraps a single ``assert_images_similar`` statement to @@ -153,6 +174,9 @@ class ImageComparison(object): difference. The test fails if the average pixel difference is greater than this value. + windows_tol : float, default: 0.01 + The tolerance (tol) parameter for the windows operating system environment. 
+ ext : string, default: ".png" The file extension to save the actual and baseline images as. @@ -164,19 +188,34 @@ class ImageComparison(object): remove_title : bool, default: True Remove the title since different OS may have varying fonts. + remove_labels : bool, default: True + Remove the x and y labels since different OS may have varying fonts. + + remove_legend : bool, default: True + Remove the legend since different OS may have varying fonts. + Raises ------ ValueError : at least one of visualizer or ax must be specified. """ - def __init__(self, stack, visualizer=None, ax=None, tol=0.01, ext=".png", - remove_ticks=True, remove_title=True, remove_legend=False): + def __init__( + self, + stack, + visualizer=None, + ax=None, + tol=0.01, + windows_tol=0.01, + ext=".png", + remove_ticks=True, + remove_title=True, + remove_labels=True, + remove_legend=True, + ): # Ensure we have something to draw on if visualizer is None and ax is None: - raise ValueError( - "at least one of visualizer or ax must be specified" - ) + raise ValueError("at least one of visualizer or ax must be specified") # Save the ax being drawn on self.ax = ax or visualizer.ax @@ -188,10 +227,8 @@ def __init__(self, stack, visualizer=None, ax=None, tol=0.01, ext=".png", # FrameInfo(frame, filename, lineno, function, code_context, index) self.test_func_name = frame[3] - if not self.test_func_name.startswith('test'): - raise ValueError( - "{} is not a test function".format(self.test_func_name) - ) + if not self.test_func_name.startswith("test"): + raise ValueError("{} is not a test function".format(self.test_func_name)) # Find the relative path to the Yellowbrick tests to compute the # module name for storing images in the actual and baseline dirs. 
@@ -199,11 +236,17 @@ def __init__(self, stack, visualizer=None, ax=None, tol=0.01, ext=".png", module_path = os.path.relpath(frame[1], root) self.test_module_path = os.path.splitext(module_path)[0] + # Set the error tolerance depending on the os + if os.name == "nt" and windows_tol is not None: + self.tol = windows_tol + else: + self.tol = tol + # Save other image comparison properties - self.tol = tol self.ext = ext self.remove_ticks = remove_ticks self.remove_title = remove_title + self.remove_labels = remove_labels self.remove_legend = remove_legend def __call__(self): @@ -271,7 +314,11 @@ def cleanup(self): except AttributeError: continue - if self.remove_legend: + if self.remove_labels: + self.ax.set_xlabel("") + self.ax.set_ylabel("") + + if self.remove_legend and self.ax.get_legend() is not None: self.ax.legend_.remove() def save(self): @@ -302,18 +349,18 @@ def compare(self): # Ensure we have an image to compare against (common failure) if not os.path.exists(expected): raise ImageComparisonFailure( - 'baseline image does not exist:\n{}'.format(os.path.relpath(expected)) + "baseline image does not exist:\n{}".format(os.path.relpath(expected)) ) - # Perform the comparison err = compare_images(expected, actual, self.tol, in_decorator=True) # Raise image comparison failure if not close if err: - for key in ('actual', 'expected'): + for key in ("actual", "expected"): err[key] = os.path.relpath(err[key]) - raise ImageComparisonFailure(( - "images not close (RMS {rms:0.3f})" - "\n{actual}\n\tvs\n{expected}" - ).format(**err)) + raise ImageComparisonFailure( + ( + "images not close (RMS {rms:0.3f})" "\n{actual}\n\tvs\n{expected}" + ).format(**err) + ) diff --git a/tests/baseline_images/test_base/test_draw_visualizer_grid.png b/tests/baseline_images/test_base/test_draw_visualizer_grid.png index 074c416ed..fc1735ca9 100644 Binary files a/tests/baseline_images/test_base/test_draw_visualizer_grid.png and b/tests/baseline_images/test_base/test_draw_visualizer_grid.png 
differ diff --git a/tests/baseline_images/test_base/test_draw_with_cols.png b/tests/baseline_images/test_base/test_draw_with_cols.png index 6188b4773..8a396f981 100644 Binary files a/tests/baseline_images/test_base/test_draw_with_cols.png and b/tests/baseline_images/test_base/test_draw_with_cols.png differ diff --git a/tests/baseline_images/test_base/test_draw_with_rows.png b/tests/baseline_images/test_base/test_draw_with_rows.png index 2c9af4d8b..fc82c6bd5 100644 Binary files a/tests/baseline_images/test_base/test_draw_with_rows.png and b/tests/baseline_images/test_base/test_draw_with_rows.png differ diff --git a/tests/baseline_images/test_classifier/test_class_balance/test_class_report.png b/tests/baseline_images/test_classifier/test_class_balance/test_class_report.png deleted file mode 100644 index 99aa71a13..000000000 Binary files a/tests/baseline_images/test_classifier/test_class_balance/test_class_report.png and /dev/null differ diff --git a/tests/baseline_images/test_classifier/test_class_prediction_error/test_class_prediction_error_quickmethod.png b/tests/baseline_images/test_classifier/test_class_prediction_error/test_class_prediction_error_quickmethod.png index 0aff817b3..44ccb4622 100644 Binary files a/tests/baseline_images/test_classifier/test_class_prediction_error/test_class_prediction_error_quickmethod.png and b/tests/baseline_images/test_classifier/test_class_prediction_error/test_class_prediction_error_quickmethod.png differ diff --git a/tests/baseline_images/test_classifier/test_class_prediction_error/test_integration_class_prediction_error_.png b/tests/baseline_images/test_classifier/test_class_prediction_error/test_integration_class_prediction_error_.png deleted file mode 100644 index 0cc3c3c7e..000000000 Binary files a/tests/baseline_images/test_classifier/test_class_prediction_error/test_integration_class_prediction_error_.png and /dev/null differ diff --git 
a/tests/baseline_images/test_classifier/test_class_prediction_error/test_numpy_integration.png b/tests/baseline_images/test_classifier/test_class_prediction_error/test_numpy_integration.png new file mode 100644 index 000000000..6163791df Binary files /dev/null and b/tests/baseline_images/test_classifier/test_class_prediction_error/test_numpy_integration.png differ diff --git a/tests/baseline_images/test_classifier/test_class_prediction_error/test_pandas_integration.png b/tests/baseline_images/test_classifier/test_class_prediction_error/test_pandas_integration.png new file mode 100644 index 000000000..6163791df Binary files /dev/null and b/tests/baseline_images/test_classifier/test_class_prediction_error/test_pandas_integration.png differ diff --git a/tests/baseline_images/test_classifier/test_classification_report/test_binary_class_report.png b/tests/baseline_images/test_classifier/test_classification_report/test_binary_class_report.png index cb2627f17..0b02c243d 100644 Binary files a/tests/baseline_images/test_classifier/test_classification_report/test_binary_class_report.png and b/tests/baseline_images/test_classifier/test_classification_report/test_binary_class_report.png differ diff --git a/tests/baseline_images/test_classifier/test_classification_report/test_multiclass_class_report.png b/tests/baseline_images/test_classifier/test_classification_report/test_multiclass_class_report.png index e8cc21ae8..42829ff4c 100644 Binary files a/tests/baseline_images/test_classifier/test_classification_report/test_multiclass_class_report.png and b/tests/baseline_images/test_classifier/test_classification_report/test_multiclass_class_report.png differ diff --git a/tests/baseline_images/test_classifier/test_classification_report/test_numpy_integration.png b/tests/baseline_images/test_classifier/test_classification_report/test_numpy_integration.png new file mode 100644 index 000000000..1689d989f Binary files /dev/null and 
b/tests/baseline_images/test_classifier/test_classification_report/test_numpy_integration.png differ diff --git a/tests/baseline_images/test_classifier/test_classification_report/test_pandas_integration.png b/tests/baseline_images/test_classifier/test_classification_report/test_pandas_integration.png index 92a9fe7a4..1689d989f 100644 Binary files a/tests/baseline_images/test_classifier/test_classification_report/test_pandas_integration.png and b/tests/baseline_images/test_classifier/test_classification_report/test_pandas_integration.png differ diff --git a/tests/baseline_images/test_classifier/test_classification_report/test_quick_method.png b/tests/baseline_images/test_classifier/test_classification_report/test_quick_method.png index aef0f8cbc..62b525b6e 100644 Binary files a/tests/baseline_images/test_classifier/test_classification_report/test_quick_method.png and b/tests/baseline_images/test_classifier/test_classification_report/test_quick_method.png differ diff --git a/tests/baseline_images/test_classifier/test_classification_report/test_support_count_class_report.png b/tests/baseline_images/test_classifier/test_classification_report/test_support_count_class_report.png index 31917cbc9..051058361 100644 Binary files a/tests/baseline_images/test_classifier/test_classification_report/test_support_count_class_report.png and b/tests/baseline_images/test_classifier/test_classification_report/test_support_count_class_report.png differ diff --git a/tests/baseline_images/test_classifier/test_classification_report/test_support_percent_class_report.png b/tests/baseline_images/test_classifier/test_classification_report/test_support_percent_class_report.png index e6a39c38d..5eb75929c 100644 Binary files a/tests/baseline_images/test_classifier/test_classification_report/test_support_percent_class_report.png and b/tests/baseline_images/test_classifier/test_classification_report/test_support_percent_class_report.png differ diff --git 
a/tests/baseline_images/test_classifier/test_confusion_matrix/test_class_filter_eg_zoom_in.png b/tests/baseline_images/test_classifier/test_confusion_matrix/test_class_filter_eg_zoom_in.png deleted file mode 100644 index d25e29d54..000000000 Binary files a/tests/baseline_images/test_classifier/test_confusion_matrix/test_class_filter_eg_zoom_in.png and /dev/null differ diff --git a/tests/baseline_images/test_classifier/test_confusion_matrix/test_confusion_matrix.png b/tests/baseline_images/test_classifier/test_confusion_matrix/test_confusion_matrix.png index 19cd37aa2..7bb3d23ef 100644 Binary files a/tests/baseline_images/test_classifier/test_confusion_matrix/test_confusion_matrix.png and b/tests/baseline_images/test_classifier/test_confusion_matrix/test_confusion_matrix.png differ diff --git a/tests/baseline_images/test_classifier/test_confusion_matrix/test_extra_classes.png b/tests/baseline_images/test_classifier/test_confusion_matrix/test_extra_classes.png deleted file mode 100644 index 8dce1720e..000000000 Binary files a/tests/baseline_images/test_classifier/test_confusion_matrix/test_extra_classes.png and /dev/null differ diff --git a/tests/baseline_images/test_classifier/test_confusion_matrix/test_fontsize.png b/tests/baseline_images/test_classifier/test_confusion_matrix/test_fontsize.png index dcf208634..594f8f2ef 100644 Binary files a/tests/baseline_images/test_classifier/test_confusion_matrix/test_fontsize.png and b/tests/baseline_images/test_classifier/test_confusion_matrix/test_fontsize.png differ diff --git a/tests/baseline_images/test_classifier/test_confusion_matrix/test_no_classes_provided.png b/tests/baseline_images/test_classifier/test_confusion_matrix/test_no_classes_provided.png index 13ac38f7c..e366a7234 100644 Binary files a/tests/baseline_images/test_classifier/test_confusion_matrix/test_no_classes_provided.png and b/tests/baseline_images/test_classifier/test_confusion_matrix/test_no_classes_provided.png differ diff --git 
a/tests/baseline_images/test_classifier/test_confusion_matrix/test_one_class.png b/tests/baseline_images/test_classifier/test_confusion_matrix/test_one_class.png deleted file mode 100644 index c8fbe9cca..000000000 Binary files a/tests/baseline_images/test_classifier/test_confusion_matrix/test_one_class.png and /dev/null differ diff --git a/tests/baseline_images/test_classifier/test_confusion_matrix/test_pandas_integration.png b/tests/baseline_images/test_classifier/test_confusion_matrix/test_pandas_integration.png index 657b2c678..87b947763 100644 Binary files a/tests/baseline_images/test_classifier/test_confusion_matrix/test_pandas_integration.png and b/tests/baseline_images/test_classifier/test_confusion_matrix/test_pandas_integration.png differ diff --git a/tests/baseline_images/test_classifier/test_confusion_matrix/test_percent_mode.png b/tests/baseline_images/test_classifier/test_confusion_matrix/test_percent_mode.png index c31609fcc..7b0acd674 100644 Binary files a/tests/baseline_images/test_classifier/test_confusion_matrix/test_percent_mode.png and b/tests/baseline_images/test_classifier/test_confusion_matrix/test_percent_mode.png differ diff --git a/tests/baseline_images/test_classifier/test_confusion_matrix/test_quick_method.png b/tests/baseline_images/test_classifier/test_confusion_matrix/test_quick_method.png new file mode 100644 index 000000000..326d33380 Binary files /dev/null and b/tests/baseline_images/test_classifier/test_confusion_matrix/test_quick_method.png differ diff --git a/tests/baseline_images/test_classifier/test_prcurve/test_binary_decision.png b/tests/baseline_images/test_classifier/test_prcurve/test_binary_decision.png index 6ae4fc789..ddff9660d 100644 Binary files a/tests/baseline_images/test_classifier/test_prcurve/test_binary_decision.png and b/tests/baseline_images/test_classifier/test_prcurve/test_binary_decision.png differ diff --git a/tests/baseline_images/test_classifier/test_prcurve/test_binary_probability.png 
b/tests/baseline_images/test_classifier/test_prcurve/test_binary_probability.png index 951ed3b3b..68ef5f09f 100644 Binary files a/tests/baseline_images/test_classifier/test_prcurve/test_binary_probability.png and b/tests/baseline_images/test_classifier/test_prcurve/test_binary_probability.png differ diff --git a/tests/baseline_images/test_classifier/test_prcurve/test_binary_probability_decision.png b/tests/baseline_images/test_classifier/test_prcurve/test_binary_probability_decision.png index 0dd8b102c..8395a9158 100644 Binary files a/tests/baseline_images/test_classifier/test_prcurve/test_binary_probability_decision.png and b/tests/baseline_images/test_classifier/test_prcurve/test_binary_probability_decision.png differ diff --git a/tests/baseline_images/test_classifier/test_prcurve/test_custom_iso_f1_scores.png b/tests/baseline_images/test_classifier/test_prcurve/test_custom_iso_f1_scores.png new file mode 100644 index 000000000..8e3cf2a1a Binary files /dev/null and b/tests/baseline_images/test_classifier/test_prcurve/test_custom_iso_f1_scores.png differ diff --git a/tests/baseline_images/test_classifier/test_prcurve/test_multiclass_decision.png b/tests/baseline_images/test_classifier/test_prcurve/test_multiclass_decision.png index e05a1bc9b..f886a9f46 100644 Binary files a/tests/baseline_images/test_classifier/test_prcurve/test_multiclass_decision.png and b/tests/baseline_images/test_classifier/test_prcurve/test_multiclass_decision.png differ diff --git a/tests/baseline_images/test_classifier/test_prcurve/test_multiclass_probability.png b/tests/baseline_images/test_classifier/test_prcurve/test_multiclass_probability.png index ea7734e76..b27b94c85 100644 Binary files a/tests/baseline_images/test_classifier/test_prcurve/test_multiclass_probability.png and b/tests/baseline_images/test_classifier/test_prcurve/test_multiclass_probability.png differ diff --git a/tests/baseline_images/test_classifier/test_prcurve/test_multiclass_probability_with_class_labels.png 
b/tests/baseline_images/test_classifier/test_prcurve/test_multiclass_probability_with_class_labels.png new file mode 100644 index 000000000..d28eded67 Binary files /dev/null and b/tests/baseline_images/test_classifier/test_prcurve/test_multiclass_probability_with_class_labels.png differ diff --git a/tests/baseline_images/test_classifier/test_prcurve/test_pandas_integration.png b/tests/baseline_images/test_classifier/test_prcurve/test_pandas_integration.png new file mode 100644 index 000000000..fdbfea5d7 Binary files /dev/null and b/tests/baseline_images/test_classifier/test_prcurve/test_pandas_integration.png differ diff --git a/tests/baseline_images/test_classifier/test_prcurve/test_quick_method.png b/tests/baseline_images/test_classifier/test_prcurve/test_quick_method.png index c9bec66f6..a0ceab7ab 100644 Binary files a/tests/baseline_images/test_classifier/test_prcurve/test_quick_method.png and b/tests/baseline_images/test_classifier/test_prcurve/test_quick_method.png differ diff --git a/tests/baseline_images/test_classifier/test_prcurve/test_quick_method_with_test_set.png b/tests/baseline_images/test_classifier/test_prcurve/test_quick_method_with_test_set.png new file mode 100644 index 000000000..4462ffa95 Binary files /dev/null and b/tests/baseline_images/test_classifier/test_prcurve/test_quick_method_with_test_set.png differ diff --git a/tests/baseline_images/test_classifier/test_rocauc/test_binary_decision.png b/tests/baseline_images/test_classifier/test_rocauc/test_binary_decision.png index 548ecdb5a..3d813ea40 100644 Binary files a/tests/baseline_images/test_classifier/test_rocauc/test_binary_decision.png and b/tests/baseline_images/test_classifier/test_rocauc/test_binary_decision.png differ diff --git a/tests/baseline_images/test_classifier/test_rocauc/test_binary_probability.png b/tests/baseline_images/test_classifier/test_rocauc/test_binary_probability.png index d839dfafa..6da7abe38 100644 Binary files 
a/tests/baseline_images/test_classifier/test_rocauc/test_binary_probability.png and b/tests/baseline_images/test_classifier/test_rocauc/test_binary_probability.png differ diff --git a/tests/baseline_images/test_classifier/test_rocauc/test_binary_probability_decision.png b/tests/baseline_images/test_classifier/test_rocauc/test_binary_probability_decision.png index 5f6a39544..68a7f131f 100644 Binary files a/tests/baseline_images/test_classifier/test_rocauc/test_binary_probability_decision.png and b/tests/baseline_images/test_classifier/test_rocauc/test_binary_probability_decision.png differ diff --git a/tests/baseline_images/test_classifier/test_rocauc/test_multiclass_rocauc.png b/tests/baseline_images/test_classifier/test_rocauc/test_multiclass_rocauc.png index 09f9cb975..244d17c49 100644 Binary files a/tests/baseline_images/test_classifier/test_rocauc/test_multiclass_rocauc.png and b/tests/baseline_images/test_classifier/test_rocauc/test_multiclass_rocauc.png differ diff --git a/tests/baseline_images/test_classifier/test_rocauc/test_pandas_integration.png b/tests/baseline_images/test_classifier/test_rocauc/test_pandas_integration.png new file mode 100644 index 000000000..4751eeb60 Binary files /dev/null and b/tests/baseline_images/test_classifier/test_rocauc/test_pandas_integration.png differ diff --git a/tests/baseline_images/test_classifier/test_rocauc/test_rocauc_no_classes.png b/tests/baseline_images/test_classifier/test_rocauc/test_rocauc_no_classes.png index 253c0e143..f1fda54fd 100644 Binary files a/tests/baseline_images/test_classifier/test_rocauc/test_rocauc_no_classes.png and b/tests/baseline_images/test_classifier/test_rocauc/test_rocauc_no_classes.png differ diff --git a/tests/baseline_images/test_classifier/test_rocauc/test_rocauc_no_macro.png b/tests/baseline_images/test_classifier/test_rocauc/test_rocauc_no_macro.png index a04d729be..d3a200f63 100644 Binary files a/tests/baseline_images/test_classifier/test_rocauc/test_rocauc_no_macro.png and 
b/tests/baseline_images/test_classifier/test_rocauc/test_rocauc_no_macro.png differ diff --git a/tests/baseline_images/test_classifier/test_rocauc/test_rocauc_no_macro_no_micro.png b/tests/baseline_images/test_classifier/test_rocauc/test_rocauc_no_macro_no_micro.png index 092254631..fc46463e0 100644 Binary files a/tests/baseline_images/test_classifier/test_rocauc/test_rocauc_no_macro_no_micro.png and b/tests/baseline_images/test_classifier/test_rocauc/test_rocauc_no_macro_no_micro.png differ diff --git a/tests/baseline_images/test_classifier/test_rocauc/test_rocauc_no_micro.png b/tests/baseline_images/test_classifier/test_rocauc/test_rocauc_no_micro.png index 8f45199a8..1dfbe770c 100644 Binary files a/tests/baseline_images/test_classifier/test_rocauc/test_rocauc_no_micro.png and b/tests/baseline_images/test_classifier/test_rocauc/test_rocauc_no_micro.png differ diff --git a/tests/baseline_images/test_classifier/test_threshold/test_binary_discrimination_threshold.png b/tests/baseline_images/test_classifier/test_threshold/test_binary_discrimination_threshold.png index aeb6d54ca..a709220d5 100644 Binary files a/tests/baseline_images/test_classifier/test_threshold/test_binary_discrimination_threshold.png and b/tests/baseline_images/test_classifier/test_threshold/test_binary_discrimination_threshold.png differ diff --git a/tests/baseline_images/test_classifier/test_threshold/test_binary_discrimination_threshold_alt_args.png b/tests/baseline_images/test_classifier/test_threshold/test_binary_discrimination_threshold_alt_args.png index 0c73c5e0f..4ef0de49e 100644 Binary files a/tests/baseline_images/test_classifier/test_threshold/test_binary_discrimination_threshold_alt_args.png and b/tests/baseline_images/test_classifier/test_threshold/test_binary_discrimination_threshold_alt_args.png differ diff --git a/tests/baseline_images/test_classifier/test_threshold/test_numpy_integration.png b/tests/baseline_images/test_classifier/test_threshold/test_numpy_integration.png new file 
mode 100644 index 000000000..661aaf2ea Binary files /dev/null and b/tests/baseline_images/test_classifier/test_threshold/test_numpy_integration.png differ diff --git a/tests/baseline_images/test_classifier/test_threshold/test_pandas_integration.png b/tests/baseline_images/test_classifier/test_threshold/test_pandas_integration.png index 438e7a90a..661aaf2ea 100644 Binary files a/tests/baseline_images/test_classifier/test_threshold/test_pandas_integration.png and b/tests/baseline_images/test_classifier/test_threshold/test_pandas_integration.png differ diff --git a/tests/baseline_images/test_classifier/test_threshold/test_quick_method.png b/tests/baseline_images/test_classifier/test_threshold/test_quick_method.png index ebf541f64..97b869729 100644 Binary files a/tests/baseline_images/test_classifier/test_threshold/test_quick_method.png and b/tests/baseline_images/test_classifier/test_threshold/test_quick_method.png differ diff --git a/tests/baseline_images/test_cluster/test_elbow/test_calinski_harabasz_metric.png b/tests/baseline_images/test_cluster/test_elbow/test_calinski_harabasz_metric.png new file mode 100644 index 000000000..071d1b8e0 Binary files /dev/null and b/tests/baseline_images/test_cluster/test_elbow/test_calinski_harabasz_metric.png differ diff --git a/tests/baseline_images/test_cluster/test_elbow/test_calinski_harabaz_metric.png b/tests/baseline_images/test_cluster/test_elbow/test_calinski_harabaz_metric.png deleted file mode 100644 index cedba48d3..000000000 Binary files a/tests/baseline_images/test_cluster/test_elbow/test_calinski_harabaz_metric.png and /dev/null differ diff --git a/tests/baseline_images/test_cluster/test_elbow/test_distortion_metric.png b/tests/baseline_images/test_cluster/test_elbow/test_distortion_metric.png index 0876dcb86..11d937448 100644 Binary files a/tests/baseline_images/test_cluster/test_elbow/test_distortion_metric.png and b/tests/baseline_images/test_cluster/test_elbow/test_distortion_metric.png differ diff --git 
a/tests/baseline_images/test_cluster/test_elbow/test_integrated_kmeans_elbow.png b/tests/baseline_images/test_cluster/test_elbow/test_integrated_kmeans_elbow.png index 764411681..95a602a93 100644 Binary files a/tests/baseline_images/test_cluster/test_elbow/test_integrated_kmeans_elbow.png and b/tests/baseline_images/test_cluster/test_elbow/test_integrated_kmeans_elbow.png differ diff --git a/tests/baseline_images/test_cluster/test_elbow/test_integrated_mini_batch_kmeans_elbow.png b/tests/baseline_images/test_cluster/test_elbow/test_integrated_mini_batch_kmeans_elbow.png index 5fe2017a2..d9342b40b 100644 Binary files a/tests/baseline_images/test_cluster/test_elbow/test_integrated_mini_batch_kmeans_elbow.png and b/tests/baseline_images/test_cluster/test_elbow/test_integrated_mini_batch_kmeans_elbow.png differ diff --git a/tests/baseline_images/test_cluster/test_elbow/test_locate_elbow.png b/tests/baseline_images/test_cluster/test_elbow/test_locate_elbow.png new file mode 100644 index 000000000..1ebd4b25b Binary files /dev/null and b/tests/baseline_images/test_cluster/test_elbow/test_locate_elbow.png differ diff --git a/tests/baseline_images/test_cluster/test_elbow/test_quick_method.png b/tests/baseline_images/test_cluster/test_elbow/test_quick_method.png new file mode 100644 index 000000000..bfc4d46f7 Binary files /dev/null and b/tests/baseline_images/test_cluster/test_elbow/test_quick_method.png differ diff --git a/tests/baseline_images/test_cluster/test_elbow/test_silhouette_metric.png b/tests/baseline_images/test_cluster/test_elbow/test_silhouette_metric.png index c0220b6e7..0b22bbe71 100644 Binary files a/tests/baseline_images/test_cluster/test_elbow/test_silhouette_metric.png and b/tests/baseline_images/test_cluster/test_elbow/test_silhouette_metric.png differ diff --git a/tests/baseline_images/test_cluster/test_elbow/test_timings.png b/tests/baseline_images/test_cluster/test_elbow/test_timings.png index b6dc5f622..540741d2f 100644 Binary files 
a/tests/baseline_images/test_cluster/test_elbow/test_timings.png and b/tests/baseline_images/test_cluster/test_elbow/test_timings.png differ diff --git a/tests/baseline_images/test_cluster/test_icdm/test_affinity_tsne_no_legend.png b/tests/baseline_images/test_cluster/test_icdm/test_affinity_tsne_no_legend.png index 0d0933d52..718577028 100644 Binary files a/tests/baseline_images/test_cluster/test_icdm/test_affinity_tsne_no_legend.png and b/tests/baseline_images/test_cluster/test_icdm/test_affinity_tsne_no_legend.png differ diff --git a/tests/baseline_images/test_cluster/test_icdm/test_kmeans_mds.png b/tests/baseline_images/test_cluster/test_icdm/test_kmeans_mds.png index 12cc087d3..609730905 100644 Binary files a/tests/baseline_images/test_cluster/test_icdm/test_kmeans_mds.png and b/tests/baseline_images/test_cluster/test_icdm/test_kmeans_mds.png differ diff --git a/tests/baseline_images/test_cluster/test_icdm/test_quick_method.png b/tests/baseline_images/test_cluster/test_icdm/test_quick_method.png index 08388f3fa..c807f2d62 100644 Binary files a/tests/baseline_images/test_cluster/test_icdm/test_quick_method.png and b/tests/baseline_images/test_cluster/test_icdm/test_quick_method.png differ diff --git a/tests/baseline_images/test_cluster/test_silhouette/test_colormap_as_colors_silhouette.png b/tests/baseline_images/test_cluster/test_silhouette/test_colormap_as_colors_silhouette.png new file mode 100644 index 000000000..b8ac7f86a Binary files /dev/null and b/tests/baseline_images/test_cluster/test_silhouette/test_colormap_as_colors_silhouette.png differ diff --git a/tests/baseline_images/test_cluster/test_silhouette/test_colormap_silhouette.png b/tests/baseline_images/test_cluster/test_silhouette/test_colormap_silhouette.png new file mode 100644 index 000000000..c4189cadc Binary files /dev/null and b/tests/baseline_images/test_cluster/test_silhouette/test_colormap_silhouette.png differ diff --git 
a/tests/baseline_images/test_cluster/test_silhouette/test_colors_silhouette.png b/tests/baseline_images/test_cluster/test_silhouette/test_colors_silhouette.png new file mode 100644 index 000000000..c47f335a6 Binary files /dev/null and b/tests/baseline_images/test_cluster/test_silhouette/test_colors_silhouette.png differ diff --git a/tests/baseline_images/test_cluster/test_silhouette/test_integrated_kmeans_silhouette.png b/tests/baseline_images/test_cluster/test_silhouette/test_integrated_kmeans_silhouette.png index 8de5e3a96..81647ec4e 100644 Binary files a/tests/baseline_images/test_cluster/test_silhouette/test_integrated_kmeans_silhouette.png and b/tests/baseline_images/test_cluster/test_silhouette/test_integrated_kmeans_silhouette.png differ diff --git a/tests/baseline_images/test_cluster/test_silhouette/test_integrated_mini_batch_kmeans_silhouette.png b/tests/baseline_images/test_cluster/test_silhouette/test_integrated_mini_batch_kmeans_silhouette.png index 5c9a89bb6..d2f717148 100644 Binary files a/tests/baseline_images/test_cluster/test_silhouette/test_integrated_mini_batch_kmeans_silhouette.png and b/tests/baseline_images/test_cluster/test_silhouette/test_integrated_mini_batch_kmeans_silhouette.png differ diff --git a/tests/baseline_images/test_cluster/test_silhouette/test_quick_method.png b/tests/baseline_images/test_cluster/test_silhouette/test_quick_method.png new file mode 100644 index 000000000..7ad3a33bd Binary files /dev/null and b/tests/baseline_images/test_cluster/test_silhouette/test_quick_method.png differ diff --git a/tests/baseline_images/test_contrib/test_classifier/test_boundaries/test_real_data_set_viz.png b/tests/baseline_images/test_contrib/test_classifier/test_boundaries/test_real_data_set_viz.png index 17934f217..d898a0e35 100644 Binary files a/tests/baseline_images/test_contrib/test_classifier/test_boundaries/test_real_data_set_viz.png and b/tests/baseline_images/test_contrib/test_classifier/test_boundaries/test_real_data_set_viz.png 
differ diff --git a/tests/baseline_images/test_contrib/test_missing/test_bar/test_missingvaluesbar_numpy.png b/tests/baseline_images/test_contrib/test_missing/test_bar/test_missingvaluesbar_numpy.png index e78b7b429..01dd7f82e 100644 Binary files a/tests/baseline_images/test_contrib/test_missing/test_bar/test_missingvaluesbar_numpy.png and b/tests/baseline_images/test_contrib/test_missing/test_bar/test_missingvaluesbar_numpy.png differ diff --git a/tests/baseline_images/test_contrib/test_missing/test_bar/test_missingvaluesbar_numpy_with_y_target.png b/tests/baseline_images/test_contrib/test_missing/test_bar/test_missingvaluesbar_numpy_with_y_target.png index 1f37cc733..9f23f28bb 100644 Binary files a/tests/baseline_images/test_contrib/test_missing/test_bar/test_missingvaluesbar_numpy_with_y_target.png and b/tests/baseline_images/test_contrib/test_missing/test_bar/test_missingvaluesbar_numpy_with_y_target.png differ diff --git a/tests/baseline_images/test_contrib/test_missing/test_bar/test_missingvaluesbar_numpy_with_y_target_with_labels.png b/tests/baseline_images/test_contrib/test_missing/test_bar/test_missingvaluesbar_numpy_with_y_target_with_labels.png index 75d04ce36..9f23f28bb 100644 Binary files a/tests/baseline_images/test_contrib/test_missing/test_bar/test_missingvaluesbar_numpy_with_y_target_with_labels.png and b/tests/baseline_images/test_contrib/test_missing/test_bar/test_missingvaluesbar_numpy_with_y_target_with_labels.png differ diff --git a/tests/baseline_images/test_contrib/test_missing/test_bar/test_missingvaluesbar_pandas.png b/tests/baseline_images/test_contrib/test_missing/test_bar/test_missingvaluesbar_pandas.png index 8a0f764d5..1fb2c743f 100644 Binary files a/tests/baseline_images/test_contrib/test_missing/test_bar/test_missingvaluesbar_pandas.png and b/tests/baseline_images/test_contrib/test_missing/test_bar/test_missingvaluesbar_pandas.png differ diff --git 
a/tests/baseline_images/test_contrib/test_missing/test_dispersion/test_missingvaluesdispersion_with_numpy.png b/tests/baseline_images/test_contrib/test_missing/test_dispersion/test_missingvaluesdispersion_with_numpy.png index 3ae92423d..3b6d089e2 100644 Binary files a/tests/baseline_images/test_contrib/test_missing/test_dispersion/test_missingvaluesdispersion_with_numpy.png and b/tests/baseline_images/test_contrib/test_missing/test_dispersion/test_missingvaluesdispersion_with_numpy.png differ diff --git a/tests/baseline_images/test_contrib/test_missing/test_dispersion/test_missingvaluesdispersion_with_numpy_with_y_targets.png b/tests/baseline_images/test_contrib/test_missing/test_dispersion/test_missingvaluesdispersion_with_numpy_with_y_targets.png index 569624b93..edadc8f09 100644 Binary files a/tests/baseline_images/test_contrib/test_missing/test_dispersion/test_missingvaluesdispersion_with_numpy_with_y_targets.png and b/tests/baseline_images/test_contrib/test_missing/test_dispersion/test_missingvaluesdispersion_with_numpy_with_y_targets.png differ diff --git a/tests/baseline_images/test_contrib/test_missing/test_dispersion/test_missingvaluesdispersion_with_pandas.png b/tests/baseline_images/test_contrib/test_missing/test_dispersion/test_missingvaluesdispersion_with_pandas.png index ddc0eb338..c82ecefaf 100644 Binary files a/tests/baseline_images/test_contrib/test_missing/test_dispersion/test_missingvaluesdispersion_with_pandas.png and b/tests/baseline_images/test_contrib/test_missing/test_dispersion/test_missingvaluesdispersion_with_pandas.png differ diff --git a/tests/baseline_images/test_contrib/test_missing/test_dispersion/test_missingvaluesdispersion_with_pandas_with_y_targets.png b/tests/baseline_images/test_contrib/test_missing/test_dispersion/test_missingvaluesdispersion_with_pandas_with_y_targets.png index 3b4d52b43..0d293e939 100644 Binary files 
a/tests/baseline_images/test_contrib/test_missing/test_dispersion/test_missingvaluesdispersion_with_pandas_with_y_targets.png and b/tests/baseline_images/test_contrib/test_missing/test_dispersion/test_missingvaluesdispersion_with_pandas_with_y_targets.png differ diff --git a/tests/baseline_images/test_contrib/test_scatter/test_scatter_image.png b/tests/baseline_images/test_contrib/test_scatter/test_scatter_image.png index 93839a259..561d91d88 100644 Binary files a/tests/baseline_images/test_contrib/test_scatter/test_scatter_image.png and b/tests/baseline_images/test_contrib/test_scatter/test_scatter_image.png differ diff --git a/tests/baseline_images/test_contrib/test_scatter/test_scatter_image_fail.png b/tests/baseline_images/test_contrib/test_scatter/test_scatter_image_fail.png deleted file mode 100644 index 04cde26de..000000000 Binary files a/tests/baseline_images/test_contrib/test_scatter/test_scatter_image_fail.png and /dev/null differ diff --git a/tests/baseline_images/test_draw/test_horizontal_bar_stack.png b/tests/baseline_images/test_draw/test_horizontal_bar_stack.png new file mode 100644 index 000000000..f7317dedd Binary files /dev/null and b/tests/baseline_images/test_draw/test_horizontal_bar_stack.png differ diff --git a/tests/baseline_images/test_draw/test_labels_horizontal.png b/tests/baseline_images/test_draw/test_labels_horizontal.png new file mode 100644 index 000000000..efa9982f6 Binary files /dev/null and b/tests/baseline_images/test_draw/test_labels_horizontal.png differ diff --git a/tests/baseline_images/test_draw/test_labels_vertical.png b/tests/baseline_images/test_draw/test_labels_vertical.png new file mode 100644 index 000000000..eecd0202e Binary files /dev/null and b/tests/baseline_images/test_draw/test_labels_vertical.png differ diff --git a/tests/baseline_images/test_draw/test_manual_legend.png b/tests/baseline_images/test_draw/test_manual_legend.png index f0682256a..65bf8b975 100644 Binary files 
a/tests/baseline_images/test_draw/test_manual_legend.png and b/tests/baseline_images/test_draw/test_manual_legend.png differ diff --git a/tests/baseline_images/test_draw/test_single_row_bar_stack.png b/tests/baseline_images/test_draw/test_single_row_bar_stack.png new file mode 100644 index 000000000..e4475de34 Binary files /dev/null and b/tests/baseline_images/test_draw/test_single_row_bar_stack.png differ diff --git a/tests/baseline_images/test_draw/test_vertical_bar_stack.png b/tests/baseline_images/test_draw/test_vertical_bar_stack.png new file mode 100644 index 000000000..8148df34f Binary files /dev/null and b/tests/baseline_images/test_draw/test_vertical_bar_stack.png differ diff --git a/tests/baseline_images/test_features/test_importances/test_integration_coef.png b/tests/baseline_images/test_features/test_importances/test_integration_coef.png deleted file mode 100644 index 67ca51bc1..000000000 Binary files a/tests/baseline_images/test_features/test_importances/test_integration_coef.png and /dev/null differ diff --git a/tests/baseline_images/test_features/test_importances/test_integration_feature_importances.png b/tests/baseline_images/test_features/test_importances/test_integration_feature_importances.png deleted file mode 100644 index 9879e019c..000000000 Binary files a/tests/baseline_images/test_features/test_importances/test_integration_feature_importances.png and /dev/null differ diff --git a/tests/baseline_images/test_features/test_importances/test_integration_quick_method.png b/tests/baseline_images/test_features/test_importances/test_integration_quick_method.png deleted file mode 100644 index c388bff47..000000000 Binary files a/tests/baseline_images/test_features/test_importances/test_integration_quick_method.png and /dev/null differ diff --git a/tests/baseline_images/test_features/test_importances/test_multi_coefs_stacked.png b/tests/baseline_images/test_features/test_importances/test_multi_coefs_stacked.png deleted file mode 100644 index 
7293b6dce..000000000 Binary files a/tests/baseline_images/test_features/test_importances/test_multi_coefs_stacked.png and /dev/null differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_columns_double_index_continuous_y.png b/tests/baseline_images/test_features/test_jointplot/test_columns_double_index_continuous_y.png new file mode 100644 index 000000000..2bc44d0c5 Binary files /dev/null and b/tests/baseline_images/test_features/test_jointplot/test_columns_double_index_continuous_y.png differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_columns_double_index_continuous_y_hist.png b/tests/baseline_images/test_features/test_jointplot/test_columns_double_index_continuous_y_hist.png new file mode 100644 index 000000000..ad00ec273 Binary files /dev/null and b/tests/baseline_images/test_features/test_jointplot/test_columns_double_index_continuous_y_hist.png differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_columns_double_index_discrete_y.png b/tests/baseline_images/test_features/test_jointplot/test_columns_double_index_discrete_y.png new file mode 100644 index 000000000..f4df04e2b Binary files /dev/null and b/tests/baseline_images/test_features/test_jointplot/test_columns_double_index_discrete_y.png differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_columns_double_index_discrete_y_hist.png b/tests/baseline_images/test_features/test_jointplot/test_columns_double_index_discrete_y_hist.png new file mode 100644 index 000000000..14c6ac802 Binary files /dev/null and b/tests/baseline_images/test_features/test_jointplot/test_columns_double_index_discrete_y_hist.png differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_columns_double_int_index_numpy_no_y.png b/tests/baseline_images/test_features/test_jointplot/test_columns_double_int_index_numpy_no_y.png new file mode 100644 index 000000000..f4df04e2b Binary files /dev/null and 
b/tests/baseline_images/test_features/test_jointplot/test_columns_double_int_index_numpy_no_y.png differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_columns_double_int_index_numpy_no_y_hist.png b/tests/baseline_images/test_features/test_jointplot/test_columns_double_int_index_numpy_no_y_hist.png new file mode 100644 index 000000000..14c6ac802 Binary files /dev/null and b/tests/baseline_images/test_features/test_jointplot/test_columns_double_int_index_numpy_no_y_hist.png differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_columns_double_str_index_pandas_no_y.png b/tests/baseline_images/test_features/test_jointplot/test_columns_double_str_index_pandas_no_y.png new file mode 100644 index 000000000..2bc44d0c5 Binary files /dev/null and b/tests/baseline_images/test_features/test_jointplot/test_columns_double_str_index_pandas_no_y.png differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_columns_double_str_index_pandas_no_y_hist.png b/tests/baseline_images/test_features/test_jointplot/test_columns_double_str_index_pandas_no_y_hist.png new file mode 100644 index 000000000..ad00ec273 Binary files /dev/null and b/tests/baseline_images/test_features/test_jointplot/test_columns_double_str_index_pandas_no_y_hist.png differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_columns_none_x.png b/tests/baseline_images/test_features/test_jointplot/test_columns_none_x.png new file mode 100644 index 000000000..f4df04e2b Binary files /dev/null and b/tests/baseline_images/test_features/test_jointplot/test_columns_none_x.png differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_columns_none_x_hist.png b/tests/baseline_images/test_features/test_jointplot/test_columns_none_x_hist.png new file mode 100644 index 000000000..14c6ac802 Binary files /dev/null and b/tests/baseline_images/test_features/test_jointplot/test_columns_none_x_hist.png differ diff --git 
a/tests/baseline_images/test_features/test_jointplot/test_columns_none_x_y.png b/tests/baseline_images/test_features/test_jointplot/test_columns_none_x_y.png new file mode 100644 index 000000000..0ab687b5a Binary files /dev/null and b/tests/baseline_images/test_features/test_jointplot/test_columns_none_x_y.png differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_columns_none_x_y_hist.png b/tests/baseline_images/test_features/test_jointplot/test_columns_none_x_y_hist.png new file mode 100644 index 000000000..abc472a97 Binary files /dev/null and b/tests/baseline_images/test_features/test_jointplot/test_columns_none_x_y_hist.png differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_columns_single_int_index_numpy.png b/tests/baseline_images/test_features/test_jointplot/test_columns_single_int_index_numpy.png new file mode 100644 index 000000000..5d30c2762 Binary files /dev/null and b/tests/baseline_images/test_features/test_jointplot/test_columns_single_int_index_numpy.png differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_columns_single_int_index_numpy_hist.png b/tests/baseline_images/test_features/test_jointplot/test_columns_single_int_index_numpy_hist.png new file mode 100644 index 000000000..9d2788c43 Binary files /dev/null and b/tests/baseline_images/test_features/test_jointplot/test_columns_single_int_index_numpy_hist.png differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_columns_single_str_index_pandas.png b/tests/baseline_images/test_features/test_jointplot/test_columns_single_str_index_pandas.png new file mode 100644 index 000000000..6ae4778a5 Binary files /dev/null and b/tests/baseline_images/test_features/test_jointplot/test_columns_single_str_index_pandas.png differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_columns_single_str_index_pandas_hist.png 
b/tests/baseline_images/test_features/test_jointplot/test_columns_single_str_index_pandas_hist.png new file mode 100644 index 000000000..0e04db612 Binary files /dev/null and b/tests/baseline_images/test_features/test_jointplot/test_columns_single_str_index_pandas_hist.png differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_jointplot_has_no_errors.png b/tests/baseline_images/test_features/test_jointplot/test_jointplot_has_no_errors.png deleted file mode 100644 index ec2dfd703..000000000 Binary files a/tests/baseline_images/test_features/test_jointplot/test_jointplot_has_no_errors.png and /dev/null differ diff --git a/tests/baseline_images/test_features/test_jointplot/test_jointplot_integrated_has_no_errors.png b/tests/baseline_images/test_features/test_jointplot/test_jointplot_integrated_has_no_errors.png deleted file mode 100644 index 3eb043bdb..000000000 Binary files a/tests/baseline_images/test_features/test_jointplot/test_jointplot_integrated_has_no_errors.png and /dev/null differ diff --git a/tests/baseline_images/test_features/test_manifold/test_manifold_classification.png b/tests/baseline_images/test_features/test_manifold/test_manifold_classification.png index d78bf790f..b144c8480 100644 Binary files a/tests/baseline_images/test_features/test_manifold/test_manifold_classification.png and b/tests/baseline_images/test_features/test_manifold/test_manifold_classification.png differ diff --git a/tests/baseline_images/test_features/test_manifold/test_manifold_classification_3d.png b/tests/baseline_images/test_features/test_manifold/test_manifold_classification_3d.png new file mode 100644 index 000000000..bc1b5ffea Binary files /dev/null and b/tests/baseline_images/test_features/test_manifold/test_manifold_classification_3d.png differ diff --git a/tests/baseline_images/test_features/test_manifold/test_manifold_pandas.png b/tests/baseline_images/test_features/test_manifold/test_manifold_pandas.png index c28d5137c..40abd6462 100644 Binary 
files a/tests/baseline_images/test_features/test_manifold/test_manifold_pandas.png and b/tests/baseline_images/test_features/test_manifold/test_manifold_pandas.png differ diff --git a/tests/baseline_images/test_features/test_manifold/test_manifold_regression.png b/tests/baseline_images/test_features/test_manifold/test_manifold_regression.png index 5b0ecae7e..d30fa174b 100644 Binary files a/tests/baseline_images/test_features/test_manifold/test_manifold_regression.png and b/tests/baseline_images/test_features/test_manifold/test_manifold_regression.png differ diff --git a/tests/baseline_images/test_features/test_manifold/test_manifold_regression_3d.png b/tests/baseline_images/test_features/test_manifold/test_manifold_regression_3d.png new file mode 100644 index 000000000..9550de9b5 Binary files /dev/null and b/tests/baseline_images/test_features/test_manifold/test_manifold_regression_3d.png differ diff --git a/tests/baseline_images/test_features/test_manifold/test_manifold_single.png b/tests/baseline_images/test_features/test_manifold/test_manifold_single.png index 716dd2073..4ff3243a8 100644 Binary files a/tests/baseline_images/test_features/test_manifold/test_manifold_single.png and b/tests/baseline_images/test_features/test_manifold/test_manifold_single.png differ diff --git a/tests/baseline_images/test_features/test_manifold/test_manifold_single_3d.png b/tests/baseline_images/test_features/test_manifold/test_manifold_single_3d.png new file mode 100644 index 000000000..6881c983b Binary files /dev/null and b/tests/baseline_images/test_features/test_manifold/test_manifold_single_3d.png differ diff --git a/tests/baseline_images/test_features/test_pca/test_biplot_2d.png b/tests/baseline_images/test_features/test_pca/test_biplot_2d.png index 3e2b58f6c..551dd59e2 100644 Binary files a/tests/baseline_images/test_features/test_pca/test_biplot_2d.png and b/tests/baseline_images/test_features/test_pca/test_biplot_2d.png differ diff --git 
a/tests/baseline_images/test_features/test_pca/test_biplot_3d.png b/tests/baseline_images/test_features/test_pca/test_biplot_3d.png index 0eafad7f5..93b79f1a1 100644 Binary files a/tests/baseline_images/test_features/test_pca/test_biplot_3d.png and b/tests/baseline_images/test_features/test_pca/test_biplot_3d.png differ diff --git a/tests/baseline_images/test_features/test_pca/test_colorbar.png b/tests/baseline_images/test_features/test_pca/test_colorbar.png new file mode 100644 index 000000000..000145934 Binary files /dev/null and b/tests/baseline_images/test_features/test_pca/test_colorbar.png differ diff --git a/tests/baseline_images/test_features/test_pca/test_colorbar_heatmap.png b/tests/baseline_images/test_features/test_pca/test_colorbar_heatmap.png new file mode 100644 index 000000000..af198ba5e Binary files /dev/null and b/tests/baseline_images/test_features/test_pca/test_colorbar_heatmap.png differ diff --git a/tests/baseline_images/test_features/test_pca/test_continuous.png b/tests/baseline_images/test_features/test_pca/test_continuous.png new file mode 100644 index 000000000..609714cef Binary files /dev/null and b/tests/baseline_images/test_features/test_pca/test_continuous.png differ diff --git a/tests/baseline_images/test_features/test_pca/test_discrete.png b/tests/baseline_images/test_features/test_pca/test_discrete.png new file mode 100644 index 000000000..eb66353c9 Binary files /dev/null and b/tests/baseline_images/test_features/test_pca/test_discrete.png differ diff --git a/tests/baseline_images/test_features/test_pca/test_heatmap.png b/tests/baseline_images/test_features/test_pca/test_heatmap.png new file mode 100644 index 000000000..e3c564521 Binary files /dev/null and b/tests/baseline_images/test_features/test_pca/test_heatmap.png differ diff --git a/tests/baseline_images/test_features/test_pca/test_pca_decomposition_quick_method.png b/tests/baseline_images/test_features/test_pca/test_pca_decomposition_quick_method.png index 
38ae6a50e..60db53d0f 100644 Binary files a/tests/baseline_images/test_features/test_pca/test_pca_decomposition_quick_method.png and b/tests/baseline_images/test_features/test_pca/test_pca_decomposition_quick_method.png differ diff --git a/tests/baseline_images/test_features/test_pca/test_scale_false_2d.png b/tests/baseline_images/test_features/test_pca/test_scale_false_2d.png index 7b478681f..83c7fa2f7 100644 Binary files a/tests/baseline_images/test_features/test_pca/test_scale_false_2d.png and b/tests/baseline_images/test_features/test_pca/test_scale_false_2d.png differ diff --git a/tests/baseline_images/test_features/test_pca/test_scale_false_3d.png b/tests/baseline_images/test_features/test_pca/test_scale_false_3d.png index cfc4858dc..6b18070fa 100644 Binary files a/tests/baseline_images/test_features/test_pca/test_scale_false_3d.png and b/tests/baseline_images/test_features/test_pca/test_scale_false_3d.png differ diff --git a/tests/baseline_images/test_features/test_pca/test_scale_true_2d.png b/tests/baseline_images/test_features/test_pca/test_scale_true_2d.png index 3d8902cde..141ce961b 100644 Binary files a/tests/baseline_images/test_features/test_pca/test_scale_true_2d.png and b/tests/baseline_images/test_features/test_pca/test_scale_true_2d.png differ diff --git a/tests/baseline_images/test_features/test_pca/test_scale_true_3d.png b/tests/baseline_images/test_features/test_pca/test_scale_true_3d.png index 17b3540f4..77c70c987 100644 Binary files a/tests/baseline_images/test_features/test_pca/test_scale_true_3d.png and b/tests/baseline_images/test_features/test_pca/test_scale_true_3d.png differ diff --git a/tests/baseline_images/test_features/test_pca/test_single.png b/tests/baseline_images/test_features/test_pca/test_single.png new file mode 100644 index 000000000..baf50882c Binary files /dev/null and b/tests/baseline_images/test_features/test_pca/test_single.png differ diff --git a/tests/baseline_images/test_features/test_pcoords/test_alpha.png 
b/tests/baseline_images/test_features/test_pcoords/test_alpha.png index bbf9bb84c..dc7cc9659 100644 Binary files a/tests/baseline_images/test_features/test_pcoords/test_alpha.png and b/tests/baseline_images/test_features/test_pcoords/test_alpha.png differ diff --git a/tests/baseline_images/test_features/test_pcoords/test_alpha_fast.png b/tests/baseline_images/test_features/test_pcoords/test_alpha_fast.png index 8c5e2fab2..0544ca9f7 100644 Binary files a/tests/baseline_images/test_features/test_pcoords/test_alpha_fast.png and b/tests/baseline_images/test_features/test_pcoords/test_alpha_fast.png differ diff --git a/tests/baseline_images/test_features/test_pcoords/test_labels.png b/tests/baseline_images/test_features/test_pcoords/test_labels.png index a27856219..a55d0b401 100644 Binary files a/tests/baseline_images/test_features/test_pcoords/test_labels.png and b/tests/baseline_images/test_features/test_pcoords/test_labels.png differ diff --git a/tests/baseline_images/test_features/test_pcoords/test_labels_fast.png b/tests/baseline_images/test_features/test_pcoords/test_labels_fast.png index 4dd06ca2a..6172c7042 100644 Binary files a/tests/baseline_images/test_features/test_pcoords/test_labels_fast.png and b/tests/baseline_images/test_features/test_pcoords/test_labels_fast.png differ diff --git a/tests/baseline_images/test_features/test_pcoords/test_normalized_l2.png b/tests/baseline_images/test_features/test_pcoords/test_normalized_l2.png index 16f4132f0..ef1cfd2cb 100644 Binary files a/tests/baseline_images/test_features/test_pcoords/test_normalized_l2.png and b/tests/baseline_images/test_features/test_pcoords/test_normalized_l2.png differ diff --git a/tests/baseline_images/test_features/test_pcoords/test_normalized_l2_fast.png b/tests/baseline_images/test_features/test_pcoords/test_normalized_l2_fast.png index a08cb3f4e..91d6b5609 100644 Binary files a/tests/baseline_images/test_features/test_pcoords/test_normalized_l2_fast.png and 
b/tests/baseline_images/test_features/test_pcoords/test_normalized_l2_fast.png differ diff --git a/tests/baseline_images/test_features/test_pcoords/test_normalized_minmax.png b/tests/baseline_images/test_features/test_pcoords/test_normalized_minmax.png index c94ba5be9..48ba143bf 100644 Binary files a/tests/baseline_images/test_features/test_pcoords/test_normalized_minmax.png and b/tests/baseline_images/test_features/test_pcoords/test_normalized_minmax.png differ diff --git a/tests/baseline_images/test_features/test_pcoords/test_normalized_minmax_fast.png b/tests/baseline_images/test_features/test_pcoords/test_normalized_minmax_fast.png index e0af88231..cc21f860c 100644 Binary files a/tests/baseline_images/test_features/test_pcoords/test_normalized_minmax_fast.png and b/tests/baseline_images/test_features/test_pcoords/test_normalized_minmax_fast.png differ diff --git a/tests/baseline_images/test_features/test_pcoords/test_numpy_integration_fast.png b/tests/baseline_images/test_features/test_pcoords/test_numpy_integration_fast.png new file mode 100644 index 000000000..3ea42d18e Binary files /dev/null and b/tests/baseline_images/test_features/test_pcoords/test_numpy_integration_fast.png differ diff --git a/tests/baseline_images/test_features/test_pcoords/test_numpy_integration_sampled.png b/tests/baseline_images/test_features/test_pcoords/test_numpy_integration_sampled.png new file mode 100644 index 000000000..2ea193b1f Binary files /dev/null and b/tests/baseline_images/test_features/test_pcoords/test_numpy_integration_sampled.png differ diff --git a/tests/baseline_images/test_features/test_pcoords/test_pandas_integration_fast.png b/tests/baseline_images/test_features/test_pcoords/test_pandas_integration_fast.png index e8eb1338f..3ea42d18e 100644 Binary files a/tests/baseline_images/test_features/test_pcoords/test_pandas_integration_fast.png and b/tests/baseline_images/test_features/test_pcoords/test_pandas_integration_fast.png differ diff --git 
a/tests/baseline_images/test_features/test_pcoords/test_pandas_integration_sampled.png b/tests/baseline_images/test_features/test_pcoords/test_pandas_integration_sampled.png index 41da2a3b7..2ea193b1f 100644 Binary files a/tests/baseline_images/test_features/test_pcoords/test_pandas_integration_sampled.png and b/tests/baseline_images/test_features/test_pcoords/test_pandas_integration_sampled.png differ diff --git a/tests/baseline_images/test_features/test_pcoords/test_parallel_coords.png b/tests/baseline_images/test_features/test_pcoords/test_parallel_coords.png index ecb9b2557..a55d0b401 100644 Binary files a/tests/baseline_images/test_features/test_pcoords/test_parallel_coords.png and b/tests/baseline_images/test_features/test_pcoords/test_parallel_coords.png differ diff --git a/tests/baseline_images/test_features/test_pcoords/test_parallel_coords_fast.png b/tests/baseline_images/test_features/test_pcoords/test_parallel_coords_fast.png index c4fd16d03..6172c7042 100644 Binary files a/tests/baseline_images/test_features/test_pcoords/test_parallel_coords_fast.png and b/tests/baseline_images/test_features/test_pcoords/test_parallel_coords_fast.png differ diff --git a/tests/baseline_images/test_features/test_projection/test_3d_continuous_plot.png b/tests/baseline_images/test_features/test_projection/test_3d_continuous_plot.png new file mode 100644 index 000000000..19b50f61f Binary files /dev/null and b/tests/baseline_images/test_features/test_projection/test_3d_continuous_plot.png differ diff --git a/tests/baseline_images/test_features/test_projection/test_colorbar_false.png b/tests/baseline_images/test_features/test_projection/test_colorbar_false.png new file mode 100644 index 000000000..8b1288b13 Binary files /dev/null and b/tests/baseline_images/test_features/test_projection/test_colorbar_false.png differ diff --git a/tests/baseline_images/test_features/test_projection/test_continuous_plot.png 
b/tests/baseline_images/test_features/test_projection/test_continuous_plot.png new file mode 100644 index 000000000..445753ae2 Binary files /dev/null and b/tests/baseline_images/test_features/test_projection/test_continuous_plot.png differ diff --git a/tests/baseline_images/test_features/test_projection/test_continuous_when_target_discrete.png b/tests/baseline_images/test_features/test_projection/test_continuous_when_target_discrete.png new file mode 100644 index 000000000..6d3bfad7e Binary files /dev/null and b/tests/baseline_images/test_features/test_projection/test_continuous_when_target_discrete.png differ diff --git a/tests/baseline_images/test_features/test_projection/test_discrete_3d.png b/tests/baseline_images/test_features/test_projection/test_discrete_3d.png new file mode 100644 index 000000000..b2170bc74 Binary files /dev/null and b/tests/baseline_images/test_features/test_projection/test_discrete_3d.png differ diff --git a/tests/baseline_images/test_features/test_projection/test_discrete_plot.png b/tests/baseline_images/test_features/test_projection/test_discrete_plot.png new file mode 100644 index 000000000..6fcd54ff5 Binary files /dev/null and b/tests/baseline_images/test_features/test_projection/test_discrete_plot.png differ diff --git a/tests/baseline_images/test_features/test_projection/test_single_plot.png b/tests/baseline_images/test_features/test_projection/test_single_plot.png new file mode 100644 index 000000000..febcfd9b1 Binary files /dev/null and b/tests/baseline_images/test_features/test_projection/test_single_plot.png differ diff --git a/tests/baseline_images/test_features/test_radviz/test_integrated_radiz_pandas_classes_features.png b/tests/baseline_images/test_features/test_radviz/test_integrated_radiz_pandas_classes_features.png deleted file mode 100644 index 98a11a9f0..000000000 Binary files a/tests/baseline_images/test_features/test_radviz/test_integrated_radiz_pandas_classes_features.png and /dev/null differ diff --git 
a/tests/baseline_images/test_features/test_radviz/test_integrated_radiz_with_pandas.png b/tests/baseline_images/test_features/test_radviz/test_integrated_radiz_with_pandas.png deleted file mode 100644 index 98607bc74..000000000 Binary files a/tests/baseline_images/test_features/test_radviz/test_integrated_radiz_with_pandas.png and /dev/null differ diff --git a/tests/baseline_images/test_features/test_radviz/test_integrated_radviz_numpy_classes_features.png b/tests/baseline_images/test_features/test_radviz/test_integrated_radviz_numpy_classes_features.png new file mode 100644 index 000000000..185306988 Binary files /dev/null and b/tests/baseline_images/test_features/test_radviz/test_integrated_radviz_numpy_classes_features.png differ diff --git a/tests/baseline_images/test_features/test_radviz/test_integrated_radviz_pandas_classes_features.png b/tests/baseline_images/test_features/test_radviz/test_integrated_radviz_pandas_classes_features.png new file mode 100644 index 000000000..185306988 Binary files /dev/null and b/tests/baseline_images/test_features/test_radviz/test_integrated_radviz_pandas_classes_features.png differ diff --git a/tests/baseline_images/test_features/test_radviz/test_integrated_radviz_with_numpy.png b/tests/baseline_images/test_features/test_radviz/test_integrated_radviz_with_numpy.png new file mode 100644 index 000000000..4804bd985 Binary files /dev/null and b/tests/baseline_images/test_features/test_radviz/test_integrated_radviz_with_numpy.png differ diff --git a/tests/baseline_images/test_features/test_radviz/test_integrated_radviz_with_pandas.png b/tests/baseline_images/test_features/test_radviz/test_integrated_radviz_with_pandas.png new file mode 100644 index 000000000..cf2d44dbd Binary files /dev/null and b/tests/baseline_images/test_features/test_radviz/test_integrated_radviz_with_pandas.png differ diff --git a/tests/baseline_images/test_features/test_radviz/test_radviz.png b/tests/baseline_images/test_features/test_radviz/test_radviz.png 
index fdbe4aeb3..5c644ad52 100644 Binary files a/tests/baseline_images/test_features/test_radviz/test_radviz.png and b/tests/baseline_images/test_features/test_radviz/test_radviz.png differ diff --git a/tests/baseline_images/test_features/test_radviz/test_radviz_alpha.png b/tests/baseline_images/test_features/test_radviz/test_radviz_alpha.png index d53445692..13677c268 100644 Binary files a/tests/baseline_images/test_features/test_radviz/test_radviz_alpha.png and b/tests/baseline_images/test_features/test_radviz/test_radviz_alpha.png differ diff --git a/tests/baseline_images/test_features/test_rankd/test_rank1d_integrated.png b/tests/baseline_images/test_features/test_rankd/test_rank1d_integrated_numpy.png similarity index 50% rename from tests/baseline_images/test_features/test_rankd/test_rank1d_integrated.png rename to tests/baseline_images/test_features/test_rankd/test_rank1d_integrated_numpy.png index 50cea16e4..dd611de43 100644 Binary files a/tests/baseline_images/test_features/test_rankd/test_rank1d_integrated.png and b/tests/baseline_images/test_features/test_rankd/test_rank1d_integrated_numpy.png differ diff --git a/tests/baseline_images/test_features/test_rankd/test_rank1d_integrated_pandas.png b/tests/baseline_images/test_features/test_rankd/test_rank1d_integrated_pandas.png new file mode 100644 index 000000000..dd611de43 Binary files /dev/null and b/tests/baseline_images/test_features/test_rankd/test_rank1d_integrated_pandas.png differ diff --git a/tests/baseline_images/test_features/test_rankd/test_rank1d_orientation.png b/tests/baseline_images/test_features/test_rankd/test_rank1d_orientation.png new file mode 100644 index 000000000..8b3b016df Binary files /dev/null and b/tests/baseline_images/test_features/test_rankd/test_rank1d_orientation.png differ diff --git a/tests/baseline_images/test_features/test_rankd/test_rank1d_random.png b/tests/baseline_images/test_features/test_rankd/test_rank1d_random.png deleted file mode 100644 index 
e44d850f9..000000000 Binary files a/tests/baseline_images/test_features/test_rankd/test_rank1d_random.png and /dev/null differ diff --git a/tests/baseline_images/test_features/test_rankd/test_rank1d_shapiro.png b/tests/baseline_images/test_features/test_rankd/test_rank1d_shapiro.png new file mode 100644 index 000000000..9c78a6219 Binary files /dev/null and b/tests/baseline_images/test_features/test_rankd/test_rank1d_shapiro.png differ diff --git a/tests/baseline_images/test_features/test_rankd/test_rank2d_covariance.png b/tests/baseline_images/test_features/test_rankd/test_rank2d_covariance.png new file mode 100644 index 000000000..f4ee0001a Binary files /dev/null and b/tests/baseline_images/test_features/test_rankd/test_rank2d_covariance.png differ diff --git a/tests/baseline_images/test_features/test_rankd/test_rank2d_integrated.png b/tests/baseline_images/test_features/test_rankd/test_rank2d_integrated_numpy.png similarity index 98% rename from tests/baseline_images/test_features/test_rankd/test_rank2d_integrated.png rename to tests/baseline_images/test_features/test_rankd/test_rank2d_integrated_numpy.png index 5ca032037..da30d76fd 100644 Binary files a/tests/baseline_images/test_features/test_rankd/test_rank2d_integrated.png and b/tests/baseline_images/test_features/test_rankd/test_rank2d_integrated_numpy.png differ diff --git a/tests/baseline_images/test_features/test_rankd/test_rank2d_integrated_pandas.png b/tests/baseline_images/test_features/test_rankd/test_rank2d_integrated_pandas.png new file mode 100644 index 000000000..da30d76fd Binary files /dev/null and b/tests/baseline_images/test_features/test_rankd/test_rank2d_integrated_pandas.png differ diff --git a/tests/baseline_images/test_features/test_rankd/test_rank2d_kendalltau.png b/tests/baseline_images/test_features/test_rankd/test_rank2d_kendalltau.png new file mode 100644 index 000000000..ee18a6627 Binary files /dev/null and b/tests/baseline_images/test_features/test_rankd/test_rank2d_kendalltau.png 
differ diff --git a/tests/baseline_images/test_features/test_rankd/test_rank2d_pearson.png b/tests/baseline_images/test_features/test_rankd/test_rank2d_pearson.png new file mode 100644 index 000000000..3c3497c78 Binary files /dev/null and b/tests/baseline_images/test_features/test_rankd/test_rank2d_pearson.png differ diff --git a/tests/baseline_images/test_features/test_rankd/test_rank2d_random.png b/tests/baseline_images/test_features/test_rankd/test_rank2d_random.png deleted file mode 100644 index cccb32171..000000000 Binary files a/tests/baseline_images/test_features/test_rankd/test_rank2d_random.png and /dev/null differ diff --git a/tests/baseline_images/test_features/test_rankd/test_rank2d_spearman.png b/tests/baseline_images/test_features/test_rankd/test_rank2d_spearman.png new file mode 100644 index 000000000..8b79bab35 Binary files /dev/null and b/tests/baseline_images/test_features/test_rankd/test_rank2d_spearman.png differ diff --git a/tests/baseline_images/test_features/test_rfecv/test_pandas_integration.png b/tests/baseline_images/test_features/test_rfecv/test_pandas_integration.png deleted file mode 100644 index c542e6100..000000000 Binary files a/tests/baseline_images/test_features/test_rfecv/test_pandas_integration.png and /dev/null differ diff --git a/tests/baseline_images/test_features/test_rfecv/test_quick_method.png b/tests/baseline_images/test_features/test_rfecv/test_quick_method.png deleted file mode 100644 index 85cef8a7d..000000000 Binary files a/tests/baseline_images/test_features/test_rfecv/test_quick_method.png and /dev/null differ diff --git a/tests/baseline_images/test_features/test_rfecv/test_rfecv_classification.png b/tests/baseline_images/test_features/test_rfecv/test_rfecv_classification.png deleted file mode 100644 index a6a2ffdc7..000000000 Binary files a/tests/baseline_images/test_features/test_rfecv/test_rfecv_classification.png and /dev/null differ diff --git a/tests/baseline_images/test_meta/test_random_visualizer.png 
b/tests/baseline_images/test_meta/test_random_visualizer.png index 53e9fe65a..f3f893197 100644 Binary files a/tests/baseline_images/test_meta/test_random_visualizer.png and b/tests/baseline_images/test_meta/test_random_visualizer.png differ diff --git a/tests/baseline_images/test_meta/test_random_visualizer_increased_tolerance.png b/tests/baseline_images/test_meta/test_random_visualizer_increased_tolerance.png index 4cf89a0fb..da6f804ae 100644 Binary files a/tests/baseline_images/test_meta/test_random_visualizer_increased_tolerance.png and b/tests/baseline_images/test_meta/test_random_visualizer_increased_tolerance.png differ diff --git a/tests/baseline_images/test_meta/test_random_visualizer_not_close.png b/tests/baseline_images/test_meta/test_random_visualizer_not_close.png index c9b9b1ff4..ab01c119a 100644 Binary files a/tests/baseline_images/test_meta/test_random_visualizer_not_close.png and b/tests/baseline_images/test_meta/test_random_visualizer_not_close.png differ diff --git a/tests/baseline_images/test_model_selection/test_cross_validation/test_classifier.png b/tests/baseline_images/test_model_selection/test_cross_validation/test_classifier.png index 1e2b6fd5d..7940183d1 100644 Binary files a/tests/baseline_images/test_model_selection/test_cross_validation/test_classifier.png and b/tests/baseline_images/test_model_selection/test_cross_validation/test_classifier.png differ diff --git a/tests/baseline_images/test_model_selection/test_cross_validation/test_numpy_integration.png b/tests/baseline_images/test_model_selection/test_cross_validation/test_numpy_integration.png new file mode 100644 index 000000000..49261aefe Binary files /dev/null and b/tests/baseline_images/test_model_selection/test_cross_validation/test_numpy_integration.png differ diff --git a/tests/baseline_images/test_model_selection/test_cross_validation/test_pandas_integration.png b/tests/baseline_images/test_model_selection/test_cross_validation/test_pandas_integration.png index 
d3b00b845..49261aefe 100644 Binary files a/tests/baseline_images/test_model_selection/test_cross_validation/test_pandas_integration.png and b/tests/baseline_images/test_model_selection/test_cross_validation/test_pandas_integration.png differ diff --git a/tests/baseline_images/test_model_selection/test_cross_validation/test_quick_method.png b/tests/baseline_images/test_model_selection/test_cross_validation/test_quick_method.png index 6e32d4d37..6027d5528 100644 Binary files a/tests/baseline_images/test_model_selection/test_cross_validation/test_quick_method.png and b/tests/baseline_images/test_model_selection/test_cross_validation/test_quick_method.png differ diff --git a/tests/baseline_images/test_model_selection/test_cross_validation/test_regression.png b/tests/baseline_images/test_model_selection/test_cross_validation/test_regression.png index cf63f267e..2d4464b17 100644 Binary files a/tests/baseline_images/test_model_selection/test_cross_validation/test_regression.png and b/tests/baseline_images/test_model_selection/test_cross_validation/test_regression.png differ diff --git a/tests/baseline_images/test_model_selection/test_importances/test_integration_coef.png b/tests/baseline_images/test_model_selection/test_importances/test_integration_coef.png new file mode 100644 index 000000000..894ceff20 Binary files /dev/null and b/tests/baseline_images/test_model_selection/test_importances/test_integration_coef.png differ diff --git a/tests/baseline_images/test_model_selection/test_importances/test_integration_feature_importances.png b/tests/baseline_images/test_model_selection/test_importances/test_integration_feature_importances.png new file mode 100644 index 000000000..ce10311b4 Binary files /dev/null and b/tests/baseline_images/test_model_selection/test_importances/test_integration_feature_importances.png differ diff --git a/tests/baseline_images/test_model_selection/test_importances/test_integration_quick_method.png 
b/tests/baseline_images/test_model_selection/test_importances/test_integration_quick_method.png new file mode 100644 index 000000000..780184bb0 Binary files /dev/null and b/tests/baseline_images/test_model_selection/test_importances/test_integration_quick_method.png differ diff --git a/tests/baseline_images/test_model_selection/test_importances/test_multi_coefs_stacked.png b/tests/baseline_images/test_model_selection/test_importances/test_multi_coefs_stacked.png new file mode 100644 index 000000000..4419a2747 Binary files /dev/null and b/tests/baseline_images/test_model_selection/test_importances/test_multi_coefs_stacked.png differ diff --git a/tests/baseline_images/test_model_selection/test_learning_curve/test_classifier.png b/tests/baseline_images/test_model_selection/test_learning_curve/test_classifier.png index d45011096..d60fe234f 100644 Binary files a/tests/baseline_images/test_model_selection/test_learning_curve/test_classifier.png and b/tests/baseline_images/test_model_selection/test_learning_curve/test_classifier.png differ diff --git a/tests/baseline_images/test_model_selection/test_learning_curve/test_clusters.png b/tests/baseline_images/test_model_selection/test_learning_curve/test_clusters.png index 0975c2712..8814d17f1 100644 Binary files a/tests/baseline_images/test_model_selection/test_learning_curve/test_clusters.png and b/tests/baseline_images/test_model_selection/test_learning_curve/test_clusters.png differ diff --git a/tests/baseline_images/test_model_selection/test_learning_curve/test_numpy_integration.png b/tests/baseline_images/test_model_selection/test_learning_curve/test_numpy_integration.png new file mode 100644 index 000000000..5aa360fdb Binary files /dev/null and b/tests/baseline_images/test_model_selection/test_learning_curve/test_numpy_integration.png differ diff --git a/tests/baseline_images/test_model_selection/test_learning_curve/test_pandas_integration.png 
b/tests/baseline_images/test_model_selection/test_learning_curve/test_pandas_integration.png index 6b2a2814a..5aa360fdb 100644 Binary files a/tests/baseline_images/test_model_selection/test_learning_curve/test_pandas_integration.png and b/tests/baseline_images/test_model_selection/test_learning_curve/test_pandas_integration.png differ diff --git a/tests/baseline_images/test_model_selection/test_learning_curve/test_quick_method.png b/tests/baseline_images/test_model_selection/test_learning_curve/test_quick_method.png index 7a4980fa2..b4f80a803 100644 Binary files a/tests/baseline_images/test_model_selection/test_learning_curve/test_quick_method.png and b/tests/baseline_images/test_model_selection/test_learning_curve/test_quick_method.png differ diff --git a/tests/baseline_images/test_model_selection/test_learning_curve/test_regressor.png b/tests/baseline_images/test_model_selection/test_learning_curve/test_regressor.png index f324fc2b5..07d4d9d34 100644 Binary files a/tests/baseline_images/test_model_selection/test_learning_curve/test_regressor.png and b/tests/baseline_images/test_model_selection/test_learning_curve/test_regressor.png differ diff --git a/tests/baseline_images/test_model_selection/test_rfecv/test_numpy_integration.png b/tests/baseline_images/test_model_selection/test_rfecv/test_numpy_integration.png new file mode 100644 index 000000000..2d91dc140 Binary files /dev/null and b/tests/baseline_images/test_model_selection/test_rfecv/test_numpy_integration.png differ diff --git a/tests/baseline_images/test_model_selection/test_rfecv/test_pandas_integration.png b/tests/baseline_images/test_model_selection/test_rfecv/test_pandas_integration.png new file mode 100644 index 000000000..2d91dc140 Binary files /dev/null and b/tests/baseline_images/test_model_selection/test_rfecv/test_pandas_integration.png differ diff --git a/tests/baseline_images/test_model_selection/test_rfecv/test_quick_method.png 
b/tests/baseline_images/test_model_selection/test_rfecv/test_quick_method.png new file mode 100644 index 000000000..553c18edc Binary files /dev/null and b/tests/baseline_images/test_model_selection/test_rfecv/test_quick_method.png differ diff --git a/tests/baseline_images/test_model_selection/test_rfecv/test_rfecv_classification.png b/tests/baseline_images/test_model_selection/test_rfecv/test_rfecv_classification.png new file mode 100644 index 000000000..6fde68771 Binary files /dev/null and b/tests/baseline_images/test_model_selection/test_rfecv/test_rfecv_classification.png differ diff --git a/tests/baseline_images/test_model_selection/test_rfecv/test_rfecv_step.png b/tests/baseline_images/test_model_selection/test_rfecv/test_rfecv_step.png new file mode 100644 index 000000000..7d20edc32 Binary files /dev/null and b/tests/baseline_images/test_model_selection/test_rfecv/test_rfecv_step.png differ diff --git a/tests/baseline_images/test_model_selection/test_validation_curve/test_classifier.png b/tests/baseline_images/test_model_selection/test_validation_curve/test_classifier.png index e04a81472..fd2652a81 100644 Binary files a/tests/baseline_images/test_model_selection/test_validation_curve/test_classifier.png and b/tests/baseline_images/test_model_selection/test_validation_curve/test_classifier.png differ diff --git a/tests/baseline_images/test_model_selection/test_validation_curve/test_numpy_integration.png b/tests/baseline_images/test_model_selection/test_validation_curve/test_numpy_integration.png new file mode 100644 index 000000000..d5d10835b Binary files /dev/null and b/tests/baseline_images/test_model_selection/test_validation_curve/test_numpy_integration.png differ diff --git a/tests/baseline_images/test_model_selection/test_validation_curve/test_pandas_integration.png b/tests/baseline_images/test_model_selection/test_validation_curve/test_pandas_integration.png index b5fc0f6b2..d5d10835b 100644 Binary files 
a/tests/baseline_images/test_model_selection/test_validation_curve/test_pandas_integration.png and b/tests/baseline_images/test_model_selection/test_validation_curve/test_pandas_integration.png differ diff --git a/tests/baseline_images/test_model_selection/test_validation_curve/test_quick_method.png b/tests/baseline_images/test_model_selection/test_validation_curve/test_quick_method.png index 685e71bc6..c62bb473c 100644 Binary files a/tests/baseline_images/test_model_selection/test_validation_curve/test_quick_method.png and b/tests/baseline_images/test_model_selection/test_validation_curve/test_quick_method.png differ diff --git a/tests/baseline_images/test_model_selection/test_validation_curve/test_regression.png b/tests/baseline_images/test_model_selection/test_validation_curve/test_regression.png index 52fd5295e..9cc41cf86 100644 Binary files a/tests/baseline_images/test_model_selection/test_validation_curve/test_regression.png and b/tests/baseline_images/test_model_selection/test_validation_curve/test_regression.png differ diff --git a/tests/baseline_images/test_regressor/test_alphas/test_similar_image.png b/tests/baseline_images/test_regressor/test_alphas/test_similar_image.png index 8b5c6406e..ab4bc45d4 100644 Binary files a/tests/baseline_images/test_regressor/test_alphas/test_similar_image.png and b/tests/baseline_images/test_regressor/test_alphas/test_similar_image.png differ diff --git a/tests/baseline_images/test_regressor/test_influence/test_cooks_distance.png b/tests/baseline_images/test_regressor/test_influence/test_cooks_distance.png new file mode 100644 index 000000000..b28b5e428 Binary files /dev/null and b/tests/baseline_images/test_regressor/test_influence/test_cooks_distance.png differ diff --git a/tests/baseline_images/test_regressor/test_influence/test_cooks_distance_quickmethod.png b/tests/baseline_images/test_regressor/test_influence/test_cooks_distance_quickmethod.png new file mode 100644 index 000000000..7e3f08e9b Binary files /dev/null 
and b/tests/baseline_images/test_regressor/test_influence/test_cooks_distance_quickmethod.png differ diff --git a/tests/baseline_images/test_regressor/test_influence/test_numpy_integration.png b/tests/baseline_images/test_regressor/test_influence/test_numpy_integration.png new file mode 100644 index 000000000..0ddac4f19 Binary files /dev/null and b/tests/baseline_images/test_regressor/test_influence/test_numpy_integration.png differ diff --git a/tests/baseline_images/test_regressor/test_influence/test_pandas_integration.png b/tests/baseline_images/test_regressor/test_influence/test_pandas_integration.png new file mode 100644 index 000000000..0ddac4f19 Binary files /dev/null and b/tests/baseline_images/test_regressor/test_influence/test_pandas_integration.png differ diff --git a/tests/baseline_images/test_regressor/test_residuals/test_peplot_no_lines.png b/tests/baseline_images/test_regressor/test_residuals/test_peplot_no_lines.png index 0621deca8..83979f5e6 100644 Binary files a/tests/baseline_images/test_regressor/test_residuals/test_peplot_no_lines.png and b/tests/baseline_images/test_regressor/test_residuals/test_peplot_no_lines.png differ diff --git a/tests/baseline_images/test_regressor/test_residuals/test_peplot_no_shared_limits.png b/tests/baseline_images/test_regressor/test_residuals/test_peplot_no_shared_limits.png index 70088ef7f..15f728b82 100644 Binary files a/tests/baseline_images/test_regressor/test_residuals/test_peplot_no_shared_limits.png and b/tests/baseline_images/test_regressor/test_residuals/test_peplot_no_shared_limits.png differ diff --git a/tests/baseline_images/test_regressor/test_residuals/test_pred_error.png b/tests/baseline_images/test_regressor/test_residuals/test_pred_error.png deleted file mode 100644 index 8a2b8fb8c..000000000 Binary files a/tests/baseline_images/test_regressor/test_residuals/test_pred_error.png and /dev/null differ diff --git a/tests/baseline_images/test_regressor/test_residuals/test_prediction_error.png 
b/tests/baseline_images/test_regressor/test_residuals/test_prediction_error.png index c12fe075f..b984640ff 100644 Binary files a/tests/baseline_images/test_regressor/test_residuals/test_prediction_error.png and b/tests/baseline_images/test_regressor/test_residuals/test_prediction_error.png differ diff --git a/tests/baseline_images/test_regressor/test_residuals/test_prediction_error_numpy.png b/tests/baseline_images/test_regressor/test_residuals/test_prediction_error_numpy.png new file mode 100644 index 000000000..f65d9341b Binary files /dev/null and b/tests/baseline_images/test_regressor/test_residuals/test_prediction_error_numpy.png differ diff --git a/tests/baseline_images/test_regressor/test_residuals/test_prediction_error_pandas.png b/tests/baseline_images/test_regressor/test_residuals/test_prediction_error_pandas.png index 334fa8073..ef9e0ae8f 100644 Binary files a/tests/baseline_images/test_regressor/test_residuals/test_prediction_error_pandas.png and b/tests/baseline_images/test_regressor/test_residuals/test_prediction_error_pandas.png differ diff --git a/tests/baseline_images/test_regressor/test_residuals/test_resid_plots.png b/tests/baseline_images/test_regressor/test_residuals/test_resid_plots.png deleted file mode 100644 index 0143bf10f..000000000 Binary files a/tests/baseline_images/test_regressor/test_residuals/test_resid_plots.png and /dev/null differ diff --git a/tests/baseline_images/test_regressor/test_residuals/test_residuals_plot.png b/tests/baseline_images/test_regressor/test_residuals/test_residuals_plot.png index a66872dda..498498898 100644 Binary files a/tests/baseline_images/test_regressor/test_residuals/test_residuals_plot.png and b/tests/baseline_images/test_regressor/test_residuals/test_residuals_plot.png differ diff --git a/tests/baseline_images/test_regressor/test_residuals/test_residuals_plot_no_histogram.png b/tests/baseline_images/test_regressor/test_residuals/test_residuals_plot_no_histogram.png index a9f309123..f637c067e 100644 
Binary files a/tests/baseline_images/test_regressor/test_residuals/test_residuals_plot_no_histogram.png and b/tests/baseline_images/test_regressor/test_residuals/test_residuals_plot_no_histogram.png differ diff --git a/tests/baseline_images/test_regressor/test_residuals/test_residuals_plot_numpy.png b/tests/baseline_images/test_regressor/test_residuals/test_residuals_plot_numpy.png new file mode 100644 index 000000000..fadd802b9 Binary files /dev/null and b/tests/baseline_images/test_regressor/test_residuals/test_residuals_plot_numpy.png differ diff --git a/tests/baseline_images/test_regressor/test_residuals/test_residuals_plot_pandas.png b/tests/baseline_images/test_regressor/test_residuals/test_residuals_plot_pandas.png index 9c21534af..fadd802b9 100644 Binary files a/tests/baseline_images/test_regressor/test_residuals/test_residuals_plot_pandas.png and b/tests/baseline_images/test_regressor/test_residuals/test_residuals_plot_pandas.png differ diff --git a/tests/baseline_images/test_regressor/test_residuals/test_residuals_quick_method.png b/tests/baseline_images/test_regressor/test_residuals/test_residuals_quick_method.png index 3ec28a8be..03a928039 100644 Binary files a/tests/baseline_images/test_regressor/test_residuals/test_residuals_quick_method.png and b/tests/baseline_images/test_regressor/test_residuals/test_residuals_quick_method.png differ diff --git a/tests/baseline_images/test_style/test_colors/test_integrated_yb_colormap.png b/tests/baseline_images/test_style/test_colors/test_integrated_yb_colormap.png new file mode 100644 index 000000000..565847cc8 Binary files /dev/null and b/tests/baseline_images/test_style/test_colors/test_integrated_yb_colormap.png differ diff --git a/tests/baseline_images/test_target/test_binning/test_balancedbinningreference.png b/tests/baseline_images/test_target/test_binning/test_balancedbinningreference.png deleted file mode 100644 index e359f6e40..000000000 Binary files 
a/tests/baseline_images/test_target/test_binning/test_balancedbinningreference.png and /dev/null differ diff --git a/tests/baseline_images/test_target/test_binning/test_numpy_bins.png b/tests/baseline_images/test_target/test_binning/test_numpy_bins.png new file mode 100644 index 000000000..6eb431663 Binary files /dev/null and b/tests/baseline_images/test_target/test_binning/test_numpy_bins.png differ diff --git a/tests/baseline_images/test_target/test_binning/test_pandas_bins.png b/tests/baseline_images/test_target/test_binning/test_pandas_bins.png new file mode 100644 index 000000000..6eb431663 Binary files /dev/null and b/tests/baseline_images/test_target/test_binning/test_pandas_bins.png differ diff --git a/tests/baseline_images/test_target/test_class_balance/test_binary_balance.png b/tests/baseline_images/test_target/test_class_balance/test_binary_balance.png index 096b59688..1d48f35f7 100644 Binary files a/tests/baseline_images/test_target/test_class_balance/test_binary_balance.png and b/tests/baseline_images/test_target/test_class_balance/test_binary_balance.png differ diff --git a/tests/baseline_images/test_target/test_class_balance/test_binary_compare.png b/tests/baseline_images/test_target/test_class_balance/test_binary_compare.png index 69837fd7f..4ec0e2427 100644 Binary files a/tests/baseline_images/test_target/test_class_balance/test_binary_compare.png and b/tests/baseline_images/test_target/test_class_balance/test_binary_compare.png differ diff --git a/tests/baseline_images/test_target/test_class_balance/test_multiclass_balance.png b/tests/baseline_images/test_target/test_class_balance/test_multiclass_balance.png index edce080ba..4a45f5817 100644 Binary files a/tests/baseline_images/test_target/test_class_balance/test_multiclass_balance.png and b/tests/baseline_images/test_target/test_class_balance/test_multiclass_balance.png differ diff --git a/tests/baseline_images/test_target/test_class_balance/test_multiclass_compare.png 
b/tests/baseline_images/test_target/test_class_balance/test_multiclass_compare.png index 7b051203f..bff3d1c4b 100644 Binary files a/tests/baseline_images/test_target/test_class_balance/test_multiclass_compare.png and b/tests/baseline_images/test_target/test_class_balance/test_multiclass_compare.png differ diff --git a/tests/baseline_images/test_target/test_class_balance/test_numpy_occupancy_balance.png b/tests/baseline_images/test_target/test_class_balance/test_numpy_occupancy_balance.png new file mode 100644 index 000000000..f4115587b Binary files /dev/null and b/tests/baseline_images/test_target/test_class_balance/test_numpy_occupancy_balance.png differ diff --git a/tests/baseline_images/test_target/test_class_balance/test_numpy_occupancy_compare.png b/tests/baseline_images/test_target/test_class_balance/test_numpy_occupancy_compare.png new file mode 100644 index 000000000..0ab35b080 Binary files /dev/null and b/tests/baseline_images/test_target/test_class_balance/test_numpy_occupancy_compare.png differ diff --git a/tests/baseline_images/test_target/test_class_balance/test_pandas_occupancy_balance.png b/tests/baseline_images/test_target/test_class_balance/test_pandas_occupancy_balance.png index 01cefff44..f4115587b 100644 Binary files a/tests/baseline_images/test_target/test_class_balance/test_pandas_occupancy_balance.png and b/tests/baseline_images/test_target/test_class_balance/test_pandas_occupancy_balance.png differ diff --git a/tests/baseline_images/test_target/test_class_balance/test_pandas_occupancy_compare.png b/tests/baseline_images/test_target/test_class_balance/test_pandas_occupancy_compare.png index 992914346..0ab35b080 100644 Binary files a/tests/baseline_images/test_target/test_class_balance/test_pandas_occupancy_compare.png and b/tests/baseline_images/test_target/test_class_balance/test_pandas_occupancy_compare.png differ diff --git a/tests/baseline_images/test_target/test_class_balance/test_quick_method.png 
b/tests/baseline_images/test_target/test_class_balance/test_quick_method.png index 26a60fb35..d19850334 100644 Binary files a/tests/baseline_images/test_target/test_class_balance/test_quick_method.png and b/tests/baseline_images/test_target/test_class_balance/test_quick_method.png differ diff --git a/tests/baseline_images/test_target/test_feature_correlation/test_feature_correlation_integrated_mutual_info_classification.png b/tests/baseline_images/test_target/test_feature_correlation/test_feature_correlation_integrated_mutual_info_classification.png index eae06d1d7..facea8b7e 100644 Binary files a/tests/baseline_images/test_target/test_feature_correlation/test_feature_correlation_integrated_mutual_info_classification.png and b/tests/baseline_images/test_target/test_feature_correlation/test_feature_correlation_integrated_mutual_info_classification.png differ diff --git a/tests/baseline_images/test_target/test_feature_correlation/test_feature_correlation_integrated_mutual_info_regression.png b/tests/baseline_images/test_target/test_feature_correlation/test_feature_correlation_integrated_mutual_info_regression.png index d0c4c759a..c39aa1463 100644 Binary files a/tests/baseline_images/test_target/test_feature_correlation/test_feature_correlation_integrated_mutual_info_regression.png and b/tests/baseline_images/test_target/test_feature_correlation/test_feature_correlation_integrated_mutual_info_regression.png differ diff --git a/tests/baseline_images/test_target/test_feature_correlation/test_feature_correlation_integrated_pearson.png b/tests/baseline_images/test_target/test_feature_correlation/test_feature_correlation_integrated_pearson.png index 0b8a87322..46de93cdf 100644 Binary files a/tests/baseline_images/test_target/test_feature_correlation/test_feature_correlation_integrated_pearson.png and b/tests/baseline_images/test_target/test_feature_correlation/test_feature_correlation_integrated_pearson.png differ diff --git 
a/tests/baseline_images/test_target/test_feature_correlation/test_feature_correlation_quick_method.png b/tests/baseline_images/test_target/test_feature_correlation/test_feature_correlation_quick_method.png index 0b8a87322..46de93cdf 100644 Binary files a/tests/baseline_images/test_target/test_feature_correlation/test_feature_correlation_quick_method.png and b/tests/baseline_images/test_target/test_feature_correlation/test_feature_correlation_quick_method.png differ diff --git a/tests/baseline_images/test_text/test_dispersion/test_dispersion_plot_annotate_docs.png b/tests/baseline_images/test_text/test_dispersion/test_dispersion_plot_annotate_docs.png new file mode 100644 index 000000000..381bfc6a1 Binary files /dev/null and b/tests/baseline_images/test_text/test_dispersion/test_dispersion_plot_annotate_docs.png differ diff --git a/tests/baseline_images/test_text/test_dispersion/test_dispersion_plot_color_by_class.png b/tests/baseline_images/test_text/test_dispersion/test_dispersion_plot_color_by_class.png new file mode 100644 index 000000000..6575bc1fb Binary files /dev/null and b/tests/baseline_images/test_text/test_dispersion/test_dispersion_plot_color_by_class.png differ diff --git a/tests/baseline_images/test_text/test_dispersion/test_dispersion_plot_generator_input.png b/tests/baseline_images/test_text/test_dispersion/test_dispersion_plot_generator_input.png new file mode 100644 index 000000000..679520992 Binary files /dev/null and b/tests/baseline_images/test_text/test_dispersion/test_dispersion_plot_generator_input.png differ diff --git a/tests/baseline_images/test_text/test_dispersion/test_dispersion_plot_ignore_case.png b/tests/baseline_images/test_text/test_dispersion/test_dispersion_plot_ignore_case.png new file mode 100644 index 000000000..679520992 Binary files /dev/null and b/tests/baseline_images/test_text/test_dispersion/test_dispersion_plot_ignore_case.png differ diff --git 
a/tests/baseline_images/test_text/test_dispersion/test_dispersionplot_annotate_docs.png b/tests/baseline_images/test_text/test_dispersion/test_dispersionplot_annotate_docs.png deleted file mode 100644 index 76c4c936f..000000000 Binary files a/tests/baseline_images/test_text/test_dispersion/test_dispersionplot_annotate_docs.png and /dev/null differ diff --git a/tests/baseline_images/test_text/test_dispersion/test_dispersionplot_color_words_by_class.png b/tests/baseline_images/test_text/test_dispersion/test_dispersionplot_color_words_by_class.png deleted file mode 100644 index 935b91f98..000000000 Binary files a/tests/baseline_images/test_text/test_dispersion/test_dispersionplot_color_words_by_class.png and /dev/null differ diff --git a/tests/baseline_images/test_text/test_dispersion/test_dispersionplot_generator_input.png b/tests/baseline_images/test_text/test_dispersion/test_dispersionplot_generator_input.png deleted file mode 100644 index 05c17a9e5..000000000 Binary files a/tests/baseline_images/test_text/test_dispersion/test_dispersionplot_generator_input.png and /dev/null differ diff --git a/tests/baseline_images/test_text/test_dispersion/test_dispersionplot_ignore_case.png b/tests/baseline_images/test_text/test_dispersion/test_dispersionplot_ignore_case.png deleted file mode 100644 index 05c17a9e5..000000000 Binary files a/tests/baseline_images/test_text/test_dispersion/test_dispersionplot_ignore_case.png and /dev/null differ diff --git a/tests/baseline_images/test_text/test_dispersion/test_integrated_dispersion_plot.png b/tests/baseline_images/test_text/test_dispersion/test_integrated_dispersion_plot.png new file mode 100644 index 000000000..809830a18 Binary files /dev/null and b/tests/baseline_images/test_text/test_dispersion/test_integrated_dispersion_plot.png differ diff --git a/tests/baseline_images/test_text/test_dispersion/test_integrated_dispersionplot.png b/tests/baseline_images/test_text/test_dispersion/test_integrated_dispersionplot.png deleted file 
mode 100644 index 11b347da2..000000000 Binary files a/tests/baseline_images/test_text/test_dispersion/test_integrated_dispersionplot.png and /dev/null differ diff --git a/tests/baseline_images/test_text/test_dispersion/test_quick_method.png b/tests/baseline_images/test_text/test_dispersion/test_quick_method.png new file mode 100644 index 000000000..809830a18 Binary files /dev/null and b/tests/baseline_images/test_text/test_dispersion/test_quick_method.png differ diff --git a/tests/baseline_images/test_text/test_freqdist/test_integrated_freqdist.png b/tests/baseline_images/test_text/test_freqdist/test_integrated_freqdist.png index f0368cd5e..43d09b8c9 100644 Binary files a/tests/baseline_images/test_text/test_freqdist/test_integrated_freqdist.png and b/tests/baseline_images/test_text/test_freqdist/test_integrated_freqdist.png differ diff --git a/tests/baseline_images/test_text/test_postag/test_frequency_mode.png b/tests/baseline_images/test_text/test_postag/test_frequency_mode.png new file mode 100644 index 000000000..7e57a2f13 Binary files /dev/null and b/tests/baseline_images/test_text/test_postag/test_frequency_mode.png differ diff --git a/tests/baseline_images/test_text/test_postag/test_quick_method.png b/tests/baseline_images/test_text/test_postag/test_quick_method.png new file mode 100644 index 000000000..50dd117ff Binary files /dev/null and b/tests/baseline_images/test_text/test_postag/test_quick_method.png differ diff --git a/tests/baseline_images/test_text/test_postag/test_spacy_tagged.png b/tests/baseline_images/test_text/test_postag/test_spacy_tagged.png new file mode 100644 index 000000000..c002661e5 Binary files /dev/null and b/tests/baseline_images/test_text/test_postag/test_spacy_tagged.png differ diff --git a/tests/baseline_images/test_text/test_postag/test_stack_frequency_mode.png b/tests/baseline_images/test_text/test_postag/test_stack_frequency_mode.png new file mode 100644 index 000000000..3746f30cf Binary files /dev/null and 
b/tests/baseline_images/test_text/test_postag/test_stack_frequency_mode.png differ diff --git a/tests/baseline_images/test_text/test_postag/test_stack_mode.png b/tests/baseline_images/test_text/test_postag/test_stack_mode.png new file mode 100644 index 000000000..281fd171b Binary files /dev/null and b/tests/baseline_images/test_text/test_postag/test_stack_mode.png differ diff --git a/tests/baseline_images/test_text/test_postag/test_word_tagged.png b/tests/baseline_images/test_text/test_postag/test_word_tagged.png new file mode 100644 index 000000000..50dd117ff Binary files /dev/null and b/tests/baseline_images/test_text/test_postag/test_word_tagged.png differ diff --git a/tests/baseline_images/test_text/test_postag/test_wordpunct_tagged.png b/tests/baseline_images/test_text/test_postag/test_wordpunct_tagged.png new file mode 100644 index 000000000..b36fce746 Binary files /dev/null and b/tests/baseline_images/test_text/test_postag/test_wordpunct_tagged.png differ diff --git a/tests/baseline_images/test_text/test_tsne/test_integrated_tsne.png b/tests/baseline_images/test_text/test_tsne/test_integrated_tsne.png index 59e831b36..39d843eb5 100644 Binary files a/tests/baseline_images/test_text/test_tsne/test_integrated_tsne.png and b/tests/baseline_images/test_text/test_tsne/test_integrated_tsne.png differ diff --git a/tests/baseline_images/test_text/test_tsne/test_make_classification_tsne.png b/tests/baseline_images/test_text/test_tsne/test_make_classification_tsne.png index 2341ba7a4..035d620ff 100644 Binary files a/tests/baseline_images/test_text/test_tsne/test_make_classification_tsne.png and b/tests/baseline_images/test_text/test_tsne/test_make_classification_tsne.png differ diff --git a/tests/baseline_images/test_text/test_tsne/test_make_classification_tsne_class_labels.png b/tests/baseline_images/test_text/test_tsne/test_make_classification_tsne_class_labels.png index 2341ba7a4..035d620ff 100644 Binary files 
a/tests/baseline_images/test_text/test_tsne/test_make_classification_tsne_class_labels.png and b/tests/baseline_images/test_text/test_tsne/test_make_classification_tsne_class_labels.png differ diff --git a/tests/baseline_images/test_text/test_tsne/test_no_target_tsne.png b/tests/baseline_images/test_text/test_tsne/test_no_target_tsne.png index 08eb3ca82..62e844b0f 100644 Binary files a/tests/baseline_images/test_text/test_tsne/test_no_target_tsne.png and b/tests/baseline_images/test_text/test_tsne/test_no_target_tsne.png differ diff --git a/tests/baseline_images/test_text/test_tsne/test_visualizer_with_pandas.png b/tests/baseline_images/test_text/test_tsne/test_visualizer_with_pandas.png index 25477dbf0..aaffd6dc7 100644 Binary files a/tests/baseline_images/test_text/test_tsne/test_visualizer_with_pandas.png and b/tests/baseline_images/test_text/test_tsne/test_visualizer_with_pandas.png differ diff --git a/tests/baseline_images/test_text/test_umap/test_integrated_umap.png b/tests/baseline_images/test_text/test_umap/test_integrated_umap.png new file mode 100644 index 000000000..156a4e7ae Binary files /dev/null and b/tests/baseline_images/test_text/test_umap/test_integrated_umap.png differ diff --git a/tests/baseline_images/test_text/test_umap/test_make_classification_umap.png b/tests/baseline_images/test_text/test_umap/test_make_classification_umap.png new file mode 100644 index 000000000..d16d8459c Binary files /dev/null and b/tests/baseline_images/test_text/test_umap/test_make_classification_umap.png differ diff --git a/tests/baseline_images/test_text/test_umap/test_make_classification_umap_class_labels.png b/tests/baseline_images/test_text/test_umap/test_make_classification_umap_class_labels.png new file mode 100644 index 000000000..d16d8459c Binary files /dev/null and b/tests/baseline_images/test_text/test_umap/test_make_classification_umap_class_labels.png differ diff --git a/tests/baseline_images/test_text/test_umap/test_no_target_umap.png 
b/tests/baseline_images/test_text/test_umap/test_no_target_umap.png new file mode 100644 index 000000000..87a244cdc Binary files /dev/null and b/tests/baseline_images/test_text/test_umap/test_no_target_umap.png differ diff --git a/tests/baseline_images/test_text/test_umap/test_visualizer_with_pandas.png b/tests/baseline_images/test_text/test_umap/test_visualizer_with_pandas.png new file mode 100644 index 000000000..f566446de Binary files /dev/null and b/tests/baseline_images/test_text/test_umap/test_visualizer_with_pandas.png differ diff --git a/tests/checks.py b/tests/checks.py index 851509e25..64ec00f14 100644 --- a/tests/checks.py +++ b/tests/checks.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Mon May 22 11:18:06 2017 -0700 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # # ID: checks.py [4131cb1] benjamin@bengfort.com $ @@ -18,6 +18,7 @@ ########################################################################## import sys + sys.path.append("..") import numpy as np @@ -35,6 +36,7 @@ ## Checking runable ########################################################################## + def check_visualizer(Visualizer): """ Check if visualizer adheres to Yellowbrick conventions. @@ -53,6 +55,7 @@ def check_visualizer(Visualizer): ## Generate the specific per-visualizer checking ########################################################################## + def _yield_all_checks(name, Visualizer): """ Composes the checks required for the specific visualizer. @@ -125,6 +128,7 @@ def _yield_text_checks(name, Visualizer): ## Checking Functions ########################################################################## + def check_instantiation(name, Visualizer, args, kwargs): # assert that visualizers can be passed an axes object. 
ax = plt.gca() @@ -135,7 +139,7 @@ def check_instantiation(name, Visualizer, args, kwargs): def check_estimator_api(name, Visualizer): X = np.random.rand((5, 10)) - y = np.random.randint(0,2, 10) + y = np.random.randint(0, 2, 10) # Ensure fit returns self. viz = Visualizer() @@ -143,8 +147,9 @@ def check_estimator_api(name, Visualizer): assert viz == self -if __name__ == '__main__': +if __name__ == "__main__": import sys + sys.path.append("..") from yellowbrick.classifier import * @@ -154,12 +159,24 @@ def check_estimator_api(name, Visualizer): from yellowbrick.text import * visualizers = [ - ClassBalance, ClassificationReport, ConfusionMatrix, ROCAUC, - KElbowVisualizer, SilhouetteVisualizer, - ScatterVisualizer, JointPlotVisualizer, Rank2D, RadViz, ParallelCoordinates, - AlphaSelection, ManualAlphaSelection, - PredictionError, ResidualsPlot, - TSNEVisualizer, FreqDistVisualizer, PosTagVisualizer + ClassBalance, + ClassificationReport, + ConfusionMatrix, + ROCAUC, + KElbowVisualizer, + SilhouetteVisualizer, + ScatterVisualizer, + JointPlotVisualizer, + Rank2D, + RadViz, + ParallelCoordinates, + AlphaSelection, + ManualAlphaSelection, + PredictionError, + ResidualsPlot, + TSNEVisualizer, + FreqDistVisualizer, + PosTagVisualizer, ] for visualizer in visualizers: diff --git a/tests/conftest.py b/tests/conftest.py index 8e39b0261..7fd1b4b69 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,13 +1,13 @@ # tests.conftest -# Global definitions for Yellowbrick PyTest +# Global definitions for Yellowbrick pytest # # Author: Benjamin Bengfort # Created: Fri Mar 02 11:53:55 2018 -0500 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: conftest.py [] benjamin@bengfort.com $ +# ID: conftest.py [957cd53] benjamin@bengfort.com $ """ Global definitions for Yellowbrick PyTest @@ -18,13 +18,51 @@ ########################################################################## import os 
+import matplotlib as mpl from pytest_flakes import FlakesItem + +########################################################################## +## Configure tests +########################################################################## + + +def pytest_configure(config): + """ + This function is called by pytest for every plugin and conftest file + after the command line arguments have been passed but before the + session object is created and all of the tests are created. It is used + to set a global configuration before all tests are run. + + Yellowbrick uses this function primarily to ensure that the matplotlib + environment is setup correctly for all tests. + """ + # This is redundant with the line in tests/__init__.py but ensures that + # the backend is correctly set across all tests and plugins. + mpl.use("Agg") + + # Travis-CI does not have san-serif so ensure standard fonts are used. + # TODO: this is currently being reset before each test; needs fixing. + mpl.rcParams["font.family"] = "DejaVu Sans" + + ########################################################################## ## PyTest Hooks ########################################################################## + +def docline(obj): + """ + Returns the first line of the object's docstring or None if + there is no __doc__ on the object. + """ + if not obj.__doc__: + return None + lines = list(filter(None, obj.__doc__.split("\n"))) + return lines[0].strip() + + def pytest_itemcollected(item): """ A reporting hook that is called when a test item is collected. 
@@ -38,7 +76,7 @@ def pytest_itemcollected(item): """ # Ignore Session and PyFlake tests that are generated automatically - if not hasattr(item.parent, 'obj') or isinstance(item, FlakesItem): + if not hasattr(item.parent, "obj") or isinstance(item, FlakesItem): return # Collect test objects to inspect @@ -49,12 +87,13 @@ def pytest_itemcollected(item): # or class name, and the docstring of the test case, then set the nodeid # so that pytest-spec will correctly parse the information. path = os.path.relpath(str(item.fspath)) - prefix = parent.__doc__ or getattr(parent, '__name__', parent.__class__.__name__) - suffix = node.__doc__.strip() if node.__doc__ else node.__name__ + prefix = docline(parent) or getattr(parent, "__name__", parent.__class__.__name__) + suffix = docline(node) or node.__name__ # Add parametrize or test generation id to distinguish it in output - if item._genid: + # TODO: this is broken with pytest 4.2 (no attribute _genid) + if hasattr(item, "_genid") and item._genid: suffix += " ({})".format(item._genid) if prefix or suffix: - item._nodeid = '::'.join((path, prefix.strip(), suffix.strip())) + item._nodeid = "::".join((path, prefix.strip(), suffix.strip())) diff --git a/tests/dataset.py b/tests/dataset.py deleted file mode 100644 index 5354d912a..000000000 --- a/tests/dataset.py +++ /dev/null @@ -1,264 +0,0 @@ -# tests.dataset -# Helper functions for tests that utilize downloadable datasets. -# -# Author: Benjamin Bengfort -# Created: Thu Oct 13 19:55:53 2016 -0400 -# -# Copyright (C) 2016 District Data Labs -# For license information, see LICENSE.txt -# -# ID: dataset.py [8f4de77] benjamin@bengfort.com $ - -""" -Helper functions for tests that utilize downloadable datasets. 
-""" - -########################################################################## -## Imports -########################################################################## - -import io -import os -import shutil -import hashlib -import zipfile -import numpy as np - -from collections import namedtuple -from sklearn.datasets.base import Bunch - - -try: - import requests -except ImportError: - requests = None - -try: - import pandas as pd -except ImportError: - pd = None - - -# Helpers for fixtures -Dataset = namedtuple('Dataset', 'X,y') -Split = namedtuple('Split', 'train,test') - - -########################################################################## -## Fixtures -########################################################################## - -DATASETS = { - 'concrete': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/concrete.zip', - 'signature': 'b9ea5f26a7bb272a040e2f1a993b26babbf8dc4a04ab8198bb315ca66d71f10d', - 'type': 'numpy', - }, - 'energy': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/energy.zip', - 'signature': '19fb86f3bcdde208eed46944172cb643ef6a7d58da103fb568fae43205ed89d3', - 'type': 'numpy', - }, - 'credit': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/credit.zip', - 'signature': '4a91339c69f55e18f3f48004328fbcb7868070b618208fed099920427b084e5e', - 'type': 'numpy', - }, - 'occupancy': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/occupancy.zip', - 'signature': '429cfe376dc9929a1fa528da89f0e1626e34e19695f3f555d8954025bbc522b8', - 'type': 'numpy', - }, - 'mushroom': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/mushroom.zip', - 'signature': '884c43cb70db35d211c67b1cf6a3683b2b4569393d2789d5c07840da4dc85ba8', - 'type': 'pandas', - }, - 'hobbies': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/hobbies.zip', - 'signature': '415c8f68df1486d5d84a1d1757a5aa3035aef5ad63ede5013c261d622fbd29d8', - 'type': 'corpus', - }, - 'game': { - 'url': 
'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/game.zip', - 'signature': 'b1bd85789a014a898daa34cb5f89ceab6d2cd6488a2e572187e34aa4ec21a43b', - 'type': 'pandas', - }, - 'bikeshare': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/bikeshare.zip', - 'signature': 'a9b440f65549746dff680c92ff8bdca3c7265f09db1cf09e708e6e26fc8aba44', - 'type': 'numpy', - }, - 'spam': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/spam.zip', - 'signature': '65be21196ba3d8448847409b70a67d761f873f30719c807600eb516d7aef1de1', - 'type': 'numpy', - }, -} - -FIXTURES = os.path.join(os.path.dirname(__file__), "fixtures") - - -########################################################################## -## Test Cases that Require Download -########################################################################## - -class DatasetMixin(object): - """ - Mixin for unittest.TestCase class to download datasets from S3 for - testing real world machine learning visual diagnostics. - """ - - @staticmethod - def sha256sum(path, blocksize=65536): - """ - Computes the SHA256 signature of a file to verify that the file has not - been modified in transit and that it is the correct version of the data. - """ - sig = hashlib.sha256() - with open(path, 'rb') as f: - buf = f.read(blocksize) - while len(buf) > 0: - sig.update(buf) - buf = f.read(blocksize) - return sig.hexdigest() - - - @staticmethod - def download_data(url, path=FIXTURES, signature=None, extract=True): - """ - Downloads the zipped data set specified at the given URL, saving it to - the output path specified. This function verifies the download with the - given signature (if supplied) and extracts the zip file if requested. - """ - if requests is None: - raise ImportError( - "The requests module is required to download data --\n" - "please install it with pip install requests." 
- ) - - # Create the output directory if it does not exist - if not os.path.exists(path): - os.mkdir(path) - - # Get the name of the file from the URL - name = os.path.basename(url) - dlpath = os.path.join(path, name) - - # Fetch the response in a streaming fashion and write it to disk. - response = requests.get(url, stream=True) - with open(dlpath, 'wb') as f: - for chunk in response.iter_content(65536): - f.write(chunk) - - # If verify, compare the signature - if signature is not None: - dlsignature = DatasetMixin.sha256sum(dlpath) - if signature != dlsignature: - raise ValueError( - "Download signature does not match hardcoded signature!" - ) - - # If extract, extract the zipfile. - if extract: - zf = zipfile.ZipFile(dlpath) - zf.extractall(path) - - - @staticmethod - def download_all(path=FIXTURES, verify=True, extract=True): - """ - Downloads all the example datasets. If verify is True then compare the - download signature with the hardcoded signature. If extract is True then - extract the contents of the zipfile to the given path. - """ - for name, meta in DATASETS.items(): - url = meta['url'] - signature = meta['signature'] if verify else None - - DatasetMixin.download_data( - url, path=path, signature=signature, extract=extract - ) - - @staticmethod - def remove_all(fixtures=FIXTURES): - """ - Removes all the downloaded datasets as clean up - """ - shutil.rmtree(fixtures) - - @staticmethod - def load_data(name, fixtures=FIXTURES): - """ - Loads the numpy matrix from the specified data set, downloads it if - it hasn't already been downloaded. - """ - # Just in case this is a corpus data set, then do that instead. 
- if DATASETS[name]['type'] == 'corpus': - return DatasetMixin.load_corpus(name, fixtures) - - if DATASETS[name]['type'] == 'pandas': - return DatasetMixin.load_pandas(name, fixtures) - - path = DatasetMixin._lookup_path(name, fixtures) - return np.genfromtxt(path, dtype=float, delimiter=',', names=True) - - @staticmethod - def load_corpus(name, fixtures=FIXTURES): - """ - Loads a sklearn Bunch with the corpus and downloads it if it hasn't - already been downloaded. Used to test text visualizers. - """ - path = DatasetMixin._lookup_path(name, fixtures, ext=None) - - # Read the directories in the directory as the categories. - categories = [ - cat for cat in os.listdir(path) - if os.path.isdir(os.path.join(path, cat)) - ] - - files = [] # holds the file names relative to the root - data = [] # holds the text read from the file - target = [] # holds the string of the category - - # Load the data from the files in the corpus - for cat in categories: - for name in os.listdir(os.path.join(path, cat)): - files.append(os.path.join(path, cat, name)) - target.append(cat) - - with io.open(os.path.join(path, cat, name), 'r', encoding='UTF-8', errors='ignore') as f: - data.append(f.read()) - - # Return the data bunch for use similar to the newsgroups example - return Bunch( - categories=categories, - files=files, - data=data, - target=target, - ) - - @staticmethod - def load_pandas(name, fixtures=FIXTURES): - """ - Loads a pandas Dataframe with the specified - """ - if pd is None: - raise ImportError("pandas is required to load this dataset") - - path = DatasetMixin._lookup_path(name, fixtures) - return pd.read_csv(path) - - @staticmethod - def _lookup_path(name, fixtures=FIXTURES, ext=".csv"): - """ - Looks up the path to the dataset, downloading it if necessary - """ - if ext is None: - path = os.path.join(fixtures, name) - else: - path = os.path.join(fixtures, name, "{}{}".format(name, ext)) - - if not os.path.exists(path): - DatasetMixin.download_all(path=fixtures) - 
return path diff --git a/tests/fixtures.py b/tests/fixtures.py new file mode 100644 index 000000000..449ed631b --- /dev/null +++ b/tests/fixtures.py @@ -0,0 +1,25 @@ +# tests.fixtures +# Helpers for pytest fixtures and data related testing. +# +# Author: Zijie (ZJ) Poh +# Created: Wed Feb 13 13:24:24 2019 -0400 +# +# Copyright (C) 2019 The scikit-yb developers. +# For license information, see LICENSE.txt +# +# ID: fixtures.py [eb9f8cc] 8103276+zjpoh@users.noreply.github.com $ + +""" +Helpers for pytest fixtures and data related testing. +""" + +########################################################################## +## Imports and Module Variables +########################################################################## + +from collections import namedtuple + + +## Used for wrapping an dataset into a single variable. +Dataset = namedtuple("Dataset", "X,y") +Split = namedtuple("Split", "train,test") diff --git a/tests/images.py b/tests/images.py index f4c945565..d0d46a11e 100644 --- a/tests/images.py +++ b/tests/images.py @@ -4,10 +4,10 @@ # Author: Benjamin Bengfort # Created: Fri Mar 02 21:51:56 2018 -0500 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: images.py [] benjamin@bengfort.com $ +# ID: images.py [ab37b18] benjamin@bengfort.com $ """ Helper utility to manage baseline images for image comparisons. Usage: @@ -37,6 +37,7 @@ ## Helper Methods ########################################################################## + def relpath(path): """ Compute the path relative to the test directory. 
@@ -100,7 +101,7 @@ def list_images(path): bname, bext = os.path.splitext(name) output.append(bname) - output.append("-"*len(bname)) + output.append("-" * len(bname)) # Handle base path base_path = os.path.join(BASELINE, path, name) @@ -117,7 +118,7 @@ def list_images(path): output.append(" - no actual image") # Handle diff path - diff_path = os.path.join(ACTUAL, path, '{}-failed-diff{}'.format(bname, bext)) + diff_path = os.path.join(ACTUAL, path, "{}-failed-diff{}".format(bname, bext)) if diff_path in diffs: output.append(" - {}".format(os.path.relpath(diff_path))) @@ -132,10 +133,10 @@ def sync(path): Move all non-diff images from actual to baseline """ for fname in os.listdir(os.path.join(ACTUAL, path)): - if fname.endswith('-diff.png'): + if fname.endswith("-diff.png"): continue - if fname.endswith('.png'): + if fname.endswith(".png"): src = os.path.join(ACTUAL, path, fname) dst = os.path.join(BASELINE, path, fname) shutil.copy2(src, dst) @@ -146,6 +147,7 @@ def sync(path): ## Main Method ########################################################################## + def main(args): # Get directories relative to test dir test_dirs = list(map(relpath, args.test_dirs)) @@ -170,28 +172,28 @@ def main(args): sync(path) -if __name__ == '__main__': +if __name__ == "__main__": args = { - ('-C', '--clear'): { - 'action': 'store_true', - 'help': 'clear actual, diffs, and baseline images for test', + ("-C", "--clear"): { + "action": "store_true", + "help": "clear actual, diffs, and baseline images for test", }, - ('-L', '--list'): { - 'action': 'store_true', - 'help': 'list images images for tests and exit', + ("-L", "--list"): { + "action": "store_true", + "help": "list images images for tests and exit", }, - 'test_dirs' : { - 'metavar': 'DIR', 'nargs': '+', - 'help': 'directories to move images from actual to baseline', + "test_dirs": { + "metavar": "DIR", + "nargs": "+", + "help": "directories to move images from actual to baseline", }, } - # Create the parser and add 
the arguments parser = argparse.ArgumentParser( description="utility to manage baseline images for comparisons", - epilog="report any issues on GitHub" + epilog="report any issues on GitHub", ) for pargs, kwargs in args.items(): diff --git a/tests/rand.py b/tests/rand.py index 15e2533ac..78b930d85 100644 --- a/tests/rand.py +++ b/tests/rand.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Wed Mar 21 17:51:15 2018 -0400 # -# ID: random.py [] benjamin@bengfort.com $ +# ID: rand.py [cc69b3c] davidwaterman@gmail.com $ """ A visualizer that draws a random scatter plot for testing. @@ -26,6 +26,7 @@ ## Random Visualizer ########################################################################## + class RandomVisualizer(Visualizer): """ Creates random scatter plots as a testing utility. @@ -49,16 +50,13 @@ class RandomVisualizer(Visualizer): Used to specify the seed of the random state to ensure tests work. """ - def __init__(self, ax=None, n_samples=100, n_blobs=3, - random_state=None, **kwargs): + def __init__(self, ax=None, n_samples=100, n_blobs=3, random_state=None, **kwargs): super(RandomVisualizer, self).__init__(ax=ax, **kwargs) if isinstance(random_state, (int, float)) or random_state is None: random_state = np.random.RandomState(random_state) - self.set_params( - n_samples=n_samples, n_blobs=n_blobs, random_state=random_state, - ) + self.set_params(n_samples=n_samples, n_blobs=n_blobs, random_state=random_state) def generate(self): """ @@ -79,8 +77,8 @@ def generate(self): def fit(self, *args, **kwargs): X, c = self.generate() - x = X[:,0] - y = X[:,1] + x = X[:, 0] + y = X[:, 1] self.draw(x, y, c) return self @@ -89,7 +87,7 @@ def draw(self, x, y, c): colors = resolve_colors(self.n_blobs) for i in np.arange(self.n_blobs): - mask = c==i + mask = c == i label = "c{}".format(i) self.ax.scatter(x[mask], y[mask], label=label, c=colors[i]) @@ -103,7 +101,7 @@ def finalize(self): return self.ax -if __name__ == '__main__': +if __name__ == "__main__": r = 
RandomVisualizer() r.fit() - r.poof(outpath='test.png') + r.poof(outpath="test.png") diff --git a/tests/requirements.txt b/tests/requirements.txt index 64bf01d3a..6ac14d583 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,22 +1,20 @@ # Library Dependencies -matplotlib>=1.5.1,!=3.0.0 -scipy>=0.19 -scikit-learn>=0.19 +matplotlib>=2.0.2,!=3.0.0,!=3.1.1 +scipy>=1.0.0 +scikit-learn>=0.20 numpy>=1.13.0 cycler>=0.10.0 # Testing Requirements -pytest>=3.4.1,<4.2.0 -pytest-cov>=2.5.1 -pytest-flakes>=2.0.0 -pytest-spec>=1.1.0 -coverage>=4.4.1 -requests>=2.18.3 -six==1.11.0 - -# Python 2 Testing Requirements -mock>=2.0.0 +pytest>=4.2.0, !=4.6.0 +pytest-cov>=2.6.1 +pytest-flakes>=4.0.0 +#pytest-spec>=1.1.0 +coverage>=4.5.2 # Optional Testing Dependencies nltk>=3.2 +# spacy>=2.0.18 pandas>=0.20 +umap-learn==0.3 +numba==0.42 diff --git a/tests/test_base.py b/tests/test_base.py index c599ee2e4..a05894fcc 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -1,11 +1,14 @@ # tests.test_base.py # Assertions for the base classes and abstract hierarchy. 
# -# Author: Rebecca Bilbro -# Author: Benjamin Bengfort +# Author: Rebecca Bilbro +# Author: Benjamin Bengfort # Author: Neal Humphrey # Created: Sat Oct 08 18:34:30 2016 -0400 # +# Copyright (C) 2016 The scikit-yb developers +# For license information, see LICENSE.txt +# # ID: test_base.py [83131ef] benjamin@bengfort.com $ """ @@ -16,24 +19,28 @@ ## Imports ########################################################################## -import sys import pytest import matplotlib.pyplot as plt from yellowbrick.base import * from yellowbrick.base import VisualizerGrid +from yellowbrick.datasets import load_occupancy +from yellowbrick.exceptions import YellowbrickWarning from yellowbrick.exceptions import YellowbrickValueError -from tests.base import VisualTestCase +from unittest.mock import patch +from unittest.mock import MagicMock from tests.rand import RandomVisualizer +from tests.base import IS_WINDOWS_OR_CONDA, VisualTestCase +from sklearn.svm import LinearSVC from sklearn.datasets import make_classification - ########################################################################## ## Base Cases ########################################################################## + class TestBaseClasses(VisualTestCase): """ Tests for the high-level API of Yellowbrick and base classes @@ -51,6 +58,38 @@ def test_visualizer_ax_property(self): assert viz._ax == "foo" assert viz.ax == "foo" + def test_visualizer_fig_property(self): + """ + Test the fig property on the Visualizer + """ + viz = Visualizer() + assert viz._fig is None + assert viz.fig is not None + + viz.fig = "foo" + assert viz._fig == "foo" + assert viz.fig == "foo" + + def test_size_property(self): + """ + Test the size property on the base Visualizer + """ + fig = plt.figure(figsize=(1, 2)) + viz = Visualizer() + + assert viz._size is None + assert viz.size is not None + + fsize = fig.get_size_inches() * fig.get_dpi() + assert all(viz.size) == all(fsize) + + viz.size = (1080, 720) + assert viz._size == (1080, 
720) + assert viz.size == (1080, 720) + + fsize = fig.get_size_inches() * fig.get_dpi() + assert all(viz.size) == all(fsize) + def test_visualizer_fit_returns_self(self): """ Assert that all visualizers return self @@ -73,58 +112,133 @@ def test_finalize_interface(self): viz = Visualizer() assert viz.finalize() is viz.ax - def test_size_property(self): + @patch("yellowbrick.base.plt") + def test_poof_show_interface(self, mock_plt): """ - Test the size property on the base Visualizer + Test poof calls plt.show and other figure finalization correctly """ - fig = plt.figure(figsize =(1,2)) - viz = Visualizer() - assert viz._size is None - assert viz.size is not None + class CustomVisualizer(Visualizer): + pass - fsize = fig.get_size_inches() * fig.get_dpi() - assert all(viz.size) == all(fsize) + _, ax = plt.subplots() + viz = CustomVisualizer(ax=ax) + viz.finalize = MagicMock() + assert viz.poof() is ax - viz.size = (1080, 720) - assert viz._size == (1080, 720) - assert viz.size == (1080, 720) + viz.finalize.assert_called_once_with() + mock_plt.show.assert_called_once_with() + mock_plt.savefig.assert_not_called() + + @patch("yellowbrick.base.plt") + def test_poof_savefig_interface(self, mock_plt): + """ + Test poof calls plt.savefig and other figure finalization correctly + """ + + class CustomVisualizer(Visualizer): + pass + + _, ax = plt.subplots() + viz = CustomVisualizer(ax=ax) + viz.finalize = MagicMock() + assert viz.poof(outpath="test.png") is ax + + viz.finalize.assert_called_once_with() + mock_plt.show.assert_not_called() + mock_plt.savefig.assert_called_once_with("test.png") + + @patch("yellowbrick.base.plt") + def test_poof_warns(self, mock_plt): + """ + Test poof issues a warning when no axes has been modified + """ + + class CustomVisualizer(Visualizer): + pass + + with pytest.warns(YellowbrickWarning): + viz = CustomVisualizer() + assert viz.poof() is not None + + +########################################################################## +## 
ScoreVisualizer Cases +########################################################################## - fsize = fig.get_size_inches() * fig.get_dpi() - assert all(viz.size) == all(fsize) + +class MockVisualizer(ScoreVisualizer): + """ + Mock for a downstream score visualizer + """ + + def fit(self, X, y): + super(MockVisualizer, self).fit(X, y) + + +class TestScoreVisualizer(VisualTestCase): + """ + Tests for the ScoreVisualizer + """ + + def test_with_fitted(self): + """ + Test that visualizer properly handles an already-fitted model + """ + X, y = load_occupancy(return_dataset=True).to_numpy() + + model = LinearSVC().fit(X, y) + classes = ["unoccupied", "occupied"] + + with patch.object(model, "fit") as mockfit: + oz = MockVisualizer(model, classes=classes) + oz.fit(X, y) + mockfit.assert_not_called() + + with patch.object(model, "fit") as mockfit: + oz = MockVisualizer(model, classes=classes, is_fitted=True) + oz.fit(X, y) + mockfit.assert_not_called() + + with patch.object(model, "fit") as mockfit: + oz = MockVisualizer(model, classes=classes, is_fitted=False) + oz.fit(X, y) + mockfit.assert_called_once_with(X, y) ########################################################################## ## Visual Grid Cases ########################################################################## + +@pytest.mark.filterwarnings("ignore:Matplotlib is currently using agg") class TestVisualizerGrid(VisualTestCase): """ Tests for the VisualizerGrid layout class """ @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", ) def test_draw_visualizer_grid(self): """ Draw a 4 visualizers grid with default options """ - visualizers = [ - RandomVisualizer(random_state=(1+x)**2) - for x in range(4) - ] + visualizers = [RandomVisualizer(random_state=(1 + x) ** 2) for x in range(4)] X, y = make_classification(random_state=78) grid = VisualizerGrid(visualizers) grid.fit(X, y) - 
grid.poof() + # poof is required here (do not replace with finalize)! + assert grid.poof() is not None self.assert_images_similar(grid) @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", ) def test_draw_with_rows(self): """ @@ -139,12 +253,14 @@ def test_draw_with_rows(self): grid = VisualizerGrid(visualizers, nrows=2) grid.fit(X, y) - grid.poof() + # poof is required here (do not replace with finalize)! + assert grid.poof() is not None self.assert_images_similar(grid) @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", ) def test_draw_with_cols(self): """ @@ -159,7 +275,8 @@ def test_draw_with_cols(self): grid = VisualizerGrid(visualizers, ncols=2) grid.fit(X, y) - grid.poof() + # poof is required here (do not replace with finalize)! + assert grid.poof() is not None self.assert_images_similar(grid) diff --git a/tests/test_bestfit.py b/tests/test_bestfit.py index 8641b4a91..178a6db26 100644 --- a/tests/test_bestfit.py +++ b/tests/test_bestfit.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Sun Jun 26 19:27:39 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_bestfit.py [56236f3] benjamin@bengfort.com $ @@ -35,49 +35,50 @@ ## Best fit tests ########################################################################## -class BestFitTests(VisualTestCase): +class TestBestFit(VisualTestCase): def test_bad_estimator(self): """ Test that a bad estimator name raises a value error. 
""" - fig, axe = plt.subplots() + fig, ax = plt.subplots() X, y = ANSCOMBE[1] - with self.assertRaises(YellowbrickValueError): - draw_best_fit(X, y, axe, 'pepper') + with pytest.raises(YellowbrickValueError): + draw_best_fit(X, y, ax, "pepper") def test_ensure_same_length(self): """ Ensure that vectors of different lengths raise """ - fig, axe = plt.subplots() + fig, ax = plt.subplots() X = np.array([1, 2, 3, 5, 8, 10, 2]) y = np.array([1, 3, 6, 2]) - with self.assertRaises(YellowbrickValueError): - draw_best_fit(X, y, axe, 'linear') + with pytest.raises(YellowbrickValueError): + draw_best_fit(X, y, ax, "linear") - with self.assertRaises(YellowbrickValueError): - draw_best_fit(X[:,np.newaxis], y, axe, 'linear') + with pytest.raises(YellowbrickValueError): + draw_best_fit(X[:, np.newaxis], y, ax, "linear") - @pytest.mark.filterwarnings('ignore') - def testdraw_best_fit(self): + @pytest.mark.filterwarnings("ignore") + def test_draw_best_fit(self): """ Test that drawing a best fit line works. """ - fig, axe = plt.subplots() + fig, ax = plt.subplots() X, y = ANSCOMBE[0] - self.assertEqual(axe, draw_best_fit(X, y, axe, 'linear')) - self.assertEqual(axe, draw_best_fit(X, y, axe, 'quadratic')) + assert ax == draw_best_fit(X, y, ax, "linear") + assert ax == draw_best_fit(X, y, ax, "quadratic") ########################################################################## ## Estimator tests ########################################################################## -class EstimatorTests(VisualTestCase): + +class TestEstimator(VisualTestCase): """ Test the estimator functions for best fit lines. 
""" @@ -89,12 +90,11 @@ def test_linear(self): X, y = ANSCOMBE[0] X = np.array(X) y = np.array(y) - X = X[:,np.newaxis] + X = X[:, np.newaxis] model = fit_linear(X, y) - self.assertIsNotNone(model) - self.assertIsInstance(model, LinearRegression) - + assert model is not None + assert isinstance(model, LinearRegression) def test_quadratic(self): """ @@ -103,11 +103,11 @@ def test_quadratic(self): X, y = ANSCOMBE[1] X = np.array(X) y = np.array(y) - X = X[:,np.newaxis] + X = X[:, np.newaxis] model = fit_quadratic(X, y) - self.assertIsNotNone(model) - self.assertIsInstance(model, Pipeline) + assert model is not None + assert isinstance(model, Pipeline) def test_select_best(self): """ @@ -116,17 +116,17 @@ def test_select_best(self): X, y = ANSCOMBE[1] X = np.array(X) y = np.array(y) - X = X[:,np.newaxis] + X = X[:, np.newaxis] model = fit_select_best(X, y) - self.assertIsNotNone(model) - self.assertIsInstance(model, Pipeline) + assert model is not None + assert isinstance(model, Pipeline) X, y = ANSCOMBE[3] X = np.array(X) y = np.array(y) - X = X[:,np.newaxis] + X = X[:, np.newaxis] model = fit_select_best(X, y) - self.assertIsNotNone(model) - self.assertIsInstance(model, LinearRegression) + assert model is not None + assert isinstance(model, LinearRegression) diff --git a/tests/test_classifier/__init__.py b/tests/test_classifier/__init__.py index c82b7f039..c366aa959 100644 --- a/tests/test_classifier/__init__.py +++ b/tests/test_classifier/__init__.py @@ -1,6 +1,12 @@ -#Backend must be set before first use. -# Setting backend here allows us to run tests just in this folder, without running the whole yellowbrick.tests folder -# This command will have no effect if backend has already been set previously. 
-import matplotlib -matplotlib.use('Agg') +# tests.test_classifier +# Tests for the classifier visualizers +# +# ID: __init__.py [5388065] neal@nhumphrey.com $ +""" +Tests for the classifier visualizers +""" + +########################################################################## +## Imports +########################################################################## diff --git a/tests/test_classifier/conftest.py b/tests/test_classifier/conftest.py index 799ac3f11..deea5363f 100644 --- a/tests/test_classifier/conftest.py +++ b/tests/test_classifier/conftest.py @@ -1,10 +1,13 @@ # tests.test_classifier.conftest # Provides fixtures for the classification tests module. # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Fri Mar 23 18:07:00 2018 -0400 # -# ID: conftest.py [] benjamin@bengfort.com $ +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: conftest.py [1e04216] benjamin@bengfort.com $ """ Provides fixtures for the classification tests module. @@ -16,47 +19,97 @@ import pytest -from tests.dataset import Dataset, Split +from tests.fixtures import Dataset, Split +from yellowbrick.exceptions import NotFitted +from sklearn.exceptions import NotFittedError from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split as tts +########################################################################## +## Assertion Helpers +########################################################################## + +ATTRS = ["classes_", "class_count_", "score_"] + + +def assert_not_fitted(estimator, attrs=ATTRS, X_test=None): + """ + Check that the estimator is not fitted by ensuring it does not have + any of the attributes specified in attrs. If X_test is specified, + it is passed to predict, which must also raise a NotFitted exception. 
+ """ + __traceback_hide__ = True + for attr in attrs: + msg = "model is fitted, has {} attribute".format(attr) + assert not hasattr(estimator, attr), msg + + if X_test is not None: + with pytest.raises((NotFitted, NotFittedError)): + estimator.predict(X_test) + + +def assert_fitted(estimator, attrs=ATTRS, X_test=None): + """ + Check that the estimator is fitted by ensuring it does have the attributes + passed in attrs. If X_test is specified, it is passed to predict which + must not raise a NotFitted exception. + """ + __traceback_hide__ = True + for attr in attrs: + msg = "model is not fitted, does not have {} attribute".format(attr) + assert hasattr(estimator, attr), msg + + if X_test is not None: + try: + estimator.predict(X_test) + except (NotFitted, NotFittedError): + pytest.fail("estimator not fitted raised from predict") + + ########################################################################## ## Fixtures ########################################################################## -@pytest.fixture(scope='class') + +@pytest.fixture(scope="class") def binary(request): """ Creates a random binary classification dataset fixture """ X, y = make_classification( - n_samples=500, n_features=20, n_informative=8, n_redundant=2, - n_classes=2, n_clusters_per_class=3, random_state=87 + n_samples=500, + n_features=20, + n_informative=8, + n_redundant=2, + n_classes=2, + n_clusters_per_class=3, + random_state=87, ) - X_train, X_test, y_train, y_test = tts( - X, y, test_size=0.2, random_state=93 - ) + X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=93) dataset = Dataset(Split(X_train, X_test), Split(y_train, y_test)) request.cls.binary = dataset -@pytest.fixture(scope='class') +@pytest.fixture(scope="class") def multiclass(request): """ Creates a random multiclass classification dataset fixture """ X, y = make_classification( - n_samples=500, n_features=20, n_informative=8, n_redundant=2, - n_classes=6, n_clusters_per_class=3, random_state=87 + 
n_samples=500, + n_features=20, + n_informative=8, + n_redundant=2, + n_classes=6, + n_clusters_per_class=3, + random_state=87, ) - X_train, X_test, y_train, y_test = tts( - X, y, test_size=0.2, random_state=93 - ) + X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=93) dataset = Dataset(Split(X_train, X_test), Split(y_train, y_test)) request.cls.multiclass = dataset diff --git a/tests/test_classifier/test_base.py b/tests/test_classifier/test_base.py new file mode 100644 index 000000000..1455f47d2 --- /dev/null +++ b/tests/test_classifier/test_base.py @@ -0,0 +1,249 @@ +# tests.test_classifier.test_base +# Tests for the base classification visualizers +# +# Author: Benjamin Bengfort +# Created: Wed Jul 31 11:21:28 2019 -0400 +# +# Copyright (C) 2019 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: test_base.py [da729da] benjamin@bengfort.com $ + +""" +Tests for the base classification visualizers +""" + +########################################################################## +## Imports +########################################################################## + +import pytest +import numpy as np +import numpy.testing as npt + +from yellowbrick.classifier.base import * +from sklearn.naive_bayes import GaussianNB +from sklearn.preprocessing import LabelEncoder +from sklearn.linear_model import LinearRegression +from .conftest import assert_fitted, assert_not_fitted +from yellowbrick.exceptions import YellowbrickTypeError + + +########################################################################## +## Test Classification Score Visualizer +########################################################################## + + +@pytest.mark.usefixtures("binary", "multiclass") +class TestClassificationScoreVisualizer(object): + """ + Test the ClassificationScoreVisualizer base functionality + """ + + def test_fit_score(self): + """ + Ensure correct fit and score behavior + """ + oz = 
ClassificationScoreVisualizer(GaussianNB()) + assert_not_fitted(oz, X_test=self.binary.X.test) + assert oz.fit(self.binary.X.train, self.binary.y.train) is oz + assert 0.0 <= oz.score(self.binary.X.test, self.binary.y.test) <= 1.0 + assert_fitted(oz, X_test=self.binary.X.test) + + def test_class_counts(self): + """ + Test class and class counts identification + """ + oz = ClassificationScoreVisualizer(GaussianNB()) + oz.fit(self.multiclass.X.train, self.multiclass.y.train) + + unique, counts = np.unique(self.multiclass.y.train, return_counts=True) + npt.assert_array_equal(oz.classes_, unique) + npt.assert_array_equal(oz.class_counts_, counts) + + def test_force_estimator(self): + """ + Test that an estimator can be forced through + """ + with pytest.raises(YellowbrickTypeError): + ClassificationScoreVisualizer(LinearRegression()) + + try: + ClassificationScoreVisualizer(LinearRegression(), force_model=True) + except YellowbrickTypeError as e: + pytest.fail("type error was raised incorrectly: {}".format(e)) + + def test_score_with_fitted_estimator(self): + """ + Assert fitted estimator can be scored without fit but warns + """ + model = GaussianNB().fit(self.binary.X.train, self.binary.y.train) + + # NOTE that the wrapper will pass a call down to `classes_` + oz = ClassificationScoreVisualizer(model) + assert_not_fitted(oz, ["class_counts_", "score_"]) + + msg = "could not determine class_counts_" + with pytest.warns(YellowbrickWarning, match=msg): + oz.score(self.binary.X.test, self.binary.y.test) + assert_fitted(oz, ["classes_", "class_counts_", "score_"]) + + def test_score_without_fitted_estimator(self): + """ + Assert score without fitted estimator raises NotFitted + """ + oz = ClassificationScoreVisualizer(GaussianNB()) + assert_not_fitted(oz) + + with pytest.raises(NotFitted): + oz.score(self.binary.X.test, self.binary.y.test) + assert_not_fitted(oz) + + def test_colors_property(self): + """ + Test that a unique color per class is created after fit + """ + oz 
= ClassificationScoreVisualizer(GaussianNB()) + + with pytest.raises(NotFitted, match="cannot determine colors before fit"): + oz.colors + + oz.fit(self.multiclass.X.train, self.multiclass.y.train) + assert len(oz.colors) == len(oz.classes_) + + def test_decode_labels_warning(self): + """ + Assert warning is issued and encoder is used with multiple decoding params + """ + with pytest.warns( + YellowbrickWarning, match="both classes and encoder specified" + ): + oz = ClassificationScoreVisualizer( + GaussianNB(), + classes=["a", "b", "c"], + encoder={0: "foo", 1: "bar", 2: "zap"}, + ) + encoded = oz._decode_labels([0, 1, 2]) + npt.assert_array_equal(encoded, ["foo", "bar", "zap"]) + + def test_decode_labels_from_numeric(self): + """ + Test that a numeric y can be decoded using classes and encoder + """ + classes = np.array(["a", "b", "c", "d", "e"]) + y = np.random.randint(0, 5, 100) + decoded = classes[y] + + oz = ClassificationScoreVisualizer(GaussianNB, classes=classes) + npt.assert_array_equal(oz._decode_labels(y), decoded) + + encoder = dict(zip(range(len(classes)), classes)) + oz = ClassificationScoreVisualizer(GaussianNB, encoder=encoder) + npt.assert_array_equal(oz._decode_labels(y), decoded) + + encoder = LabelEncoder().fit(decoded) + oz = ClassificationScoreVisualizer(GaussianNB, encoder=encoder) + npt.assert_array_equal(oz._decode_labels(y), decoded) + + def test_decode_labels_from_strings(self): + """ + Test that string y can be decoded using classes and encoder + """ + classes = np.array(["a", "b", "c", "d", "e"]) + decoded = classes[np.random.randint(0, 5, 100)] + y = np.array([v.upper() for v in decoded]) + + oz = ClassificationScoreVisualizer(GaussianNB, classes=classes) + npt.assert_array_equal(oz._decode_labels(y), decoded) + + encoder = {c.upper(): c for c in classes} + oz = ClassificationScoreVisualizer(GaussianNB, encoder=encoder) + npt.assert_array_equal(oz._decode_labels(y), decoded) + + class L2UTransformer(object): + def transform(self, y): 
+ return np.array([yi.upper() for yi in y]) + + def inverse_transform(self, y): + return np.array([yi.lower() for yi in y]) + + oz = ClassificationScoreVisualizer(GaussianNB, encoder=L2UTransformer()) + npt.assert_array_equal(oz._decode_labels(y), decoded) + + def test_decode_labels_unknown_class(self): + """ + Ensure a human-understandable error is raised when decode fails + """ + classes = np.array(["a", "b", "c", "d", "e"]) + y = classes[np.random.randint(0, 5, 100)] + + # Remove class "c" from the known array labels + classes = np.array(["a", "b", "d", "e"]) + + oz = ClassificationScoreVisualizer(GaussianNB, classes=classes) + with pytest.raises(ModelError, match="could not decode"): + npt.assert_array_equal(oz._decode_labels(y), decoded) + + encoder = dict(zip(classes, range(len(classes)))) + oz = ClassificationScoreVisualizer(GaussianNB, encoder=encoder) + with pytest.raises(ModelError, match="cannot decode class 'c' to label"): + npt.assert_array_equal(oz._decode_labels(y), decoded) + + encoder = LabelEncoder().fit(classes[np.random.randint(0, 4, 100)]) + oz = ClassificationScoreVisualizer(GaussianNB, encoder=encoder) + with pytest.raises(ModelError, match="could not decode"): + npt.assert_array_equal(oz._decode_labels(y), decoded) + + def test_labels(self): + """ + Check visualizer can return human labels correctly + """ + classes = np.array(["a", "b", "c", "d", "e"]) + y = classes[np.random.randint(0, 5, 100)] + + oz = ClassificationScoreVisualizer(GaussianNB, classes=classes) + npt.assert_array_equal(oz._labels(), classes) + + encoder = dict(zip(range(len(classes)), classes)) + oz = ClassificationScoreVisualizer(GaussianNB, encoder=encoder) + npt.assert_array_equal(oz._labels(), classes) + + encoder = LabelEncoder().fit(y) + oz = ClassificationScoreVisualizer(GaussianNB, encoder=encoder) + npt.assert_array_equal(oz._labels(), classes) + + def test_labels_warning(self): + """ + Assert warning and encoder is used with multiple decoding params for labels + 
""" + with pytest.warns( + YellowbrickWarning, match="both classes and encoder specified" + ): + oz = ClassificationScoreVisualizer( + GaussianNB(), + classes=["a", "b", "c"], + encoder={0: "foo", 1: "bar", 2: "zap"}, + ) + labels = oz._labels() + npt.assert_array_equal(labels, ["foo", "bar", "zap"]) + + def test_labels_encoder_no_classes(self): + """ + Assert warning and None returned if encoder doesn't have classes + """ + + class L2UTransformer(object): + def transform(self, y): + return np.array([yi.upper() for yi in y]) + + oz = ClassificationScoreVisualizer(GaussianNB(), encoder=L2UTransformer()) + with pytest.warns(YellowbrickWarning, match="could not determine class labels"): + assert oz._labels() is None + + def test_dict_labels_sorted(self): + """ + Ensure dictionary encoder labels are returned sorted + """ + le = {3: "a", 2: "c", 1: "b"} + oz = ClassificationScoreVisualizer(GaussianNB(), encoder=le) + npt.assert_array_equal(oz._labels(), ["b", "c", "a"]) diff --git a/tests/test_classifier/test_class_prediction_error.py b/tests/test_classifier/test_class_prediction_error.py index ae5743aa5..1c08e9899 100644 --- a/tests/test_classifier/test_class_prediction_error.py +++ b/tests/test_classifier/test_class_prediction_error.py @@ -1,12 +1,12 @@ # tests.test_classifier.test_class_prediction_error # Testing for the ClassPredictionError visualizer # -# Author: Benjamin Bengfort -# Author: Rebecca Bilbro +# Author: Benjamin Bengfort +# Author: Rebecca Bilbro # Author: Larry Gray # Created: Tue May 23 13:41:55 2017 -0700 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_rocauc.py [] benjamin@bengfort.com $ @@ -22,74 +22,114 @@ import pytest import matplotlib.pyplot as plt -from tests.dataset import DatasetMixin -from yellowbrick.classifier.class_prediction_error import * from yellowbrick.exceptions import ModelError +from yellowbrick.datasets import load_occupancy +from 
yellowbrick.classifier.class_prediction_error import * from sklearn.svm import LinearSVC from sklearn.ensemble import RandomForestClassifier -from sklearn.datasets import make_multilabel_classification, make_classification +from sklearn.datasets import make_multilabel_classification +from unittest.mock import patch from tests.base import VisualTestCase -########################################################################## -## Data -########################################################################## -X, y = make_classification( - n_classes=4, n_informative=3, n_clusters_per_class=1, random_state=42 -) +try: + import pandas as pd +except ImportError: + pd = None + ########################################################################## ## Tests ########################################################################## -class ClassPredictionErrorTests(VisualTestCase, DatasetMixin): +class TestClassPredictionError(VisualTestCase): + """ + Test ClassPredictionError visualizer + """ + + @pytest.mark.filterwarnings("ignore:could not determine class_counts_") + def test_numpy_integration(self): + """ + Assert no errors during class prediction error integration with NumPy arrays + """ + X, y = load_occupancy(return_dataset=True).to_numpy() + + classes = ["unoccupied", "occupied"] + + model = LinearSVC(random_state=42) + model.fit(X, y) + visualizer = ClassPredictionError(model, classes=classes) + visualizer.score(X, y) + visualizer.finalize() + + # AppVeyor and Linux conda fail due to non-text-based differences + # AppVeyor fails with RMS 13.161 - 13.289 (python - miniconda) + self.assert_images_similar(visualizer, tol=12.5, windows_tol=13.3) - def test_integration_class_prediction_error_(self): + @pytest.mark.filterwarnings("ignore:could not determine class_counts_") + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def test_pandas_integration(self): """ - Assert no errors occur during class prediction error integration + Assert no errors 
during class prediction error integration with Pandas """ - model = LinearSVC() + X, y = load_occupancy(return_dataset=True).to_pandas() + classes = ["unoccupied", "occupied"] + + model = LinearSVC(random_state=42) model.fit(X, y) - visualizer = ClassPredictionError(model, classes=["A", "B", "C", "D"]) + visualizer = ClassPredictionError(model, classes=classes) visualizer.score(X, y) - self.assert_images_similar(visualizer) + visualizer.finalize() + + # AppVeyor and Linux conda fail due to non-text-based differences + # AppVeyor fails with RMS 13.161 - 13.289 (python - miniconda) + self.assert_images_similar(visualizer, tol=12.5, windows_tol=13.3) def test_class_prediction_error_quickmethod(self): """ Test the ClassPreditionError quickmethod """ + X, y = load_occupancy(return_dataset=True).to_numpy() + fig = plt.figure() ax = fig.add_subplot() clf = LinearSVC(random_state=42) - g = class_prediction_error(clf, X, y, ax, random_state=42) + viz = class_prediction_error(clf, X, y, ax=ax, random_state=42) - self.assert_images_similar(ax=g) + # Not sure why the tolerance must be so high for this + # Failing on travis with RMS 9.544 + # AppVeyor and Linux conda fail due to non-text-based differences: RMS 12.961 + self.assert_images_similar(viz, tol=13, windows_tol=13) + @pytest.mark.filterwarnings("ignore:could not determine class_counts_") def test_classes_greater_than_indices(self): """ - Assert error when y and y_pred contain zero values for - one of the specified classess + A model error should be raised when there are more classes in fit than score """ - model = LinearSVC() + X, y = load_occupancy(return_dataset=True).to_numpy() + classes = ["unoccupied", "occupied", "partytime"] + + model = LinearSVC(random_state=42) model.fit(X, y) - with self.assertRaises(ModelError): - visualizer = ClassPredictionError( - model, classes=["A", "B", "C", "D", "E"] - ) + with pytest.raises(ModelError): + visualizer = ClassPredictionError(model, classes=classes) visualizer.score(X, 
y) def test_classes_less_than_indices(self): """ Assert error when there is an attempt to filter classes """ - model = LinearSVC() + X, y = load_occupancy(return_dataset=True).to_numpy() + classes = ["unoccupied"] + + model = LinearSVC(random_state=42) model.fit(X, y) - with self.assertRaises(NotImplementedError): - visualizer = ClassPredictionError(model, classes=["A"]) + with pytest.raises(NotImplementedError): + visualizer = ClassPredictionError(model, classes=classes) visualizer.score(X, y) @pytest.mark.skip(reason="not implemented yet") @@ -106,7 +146,7 @@ def test_class_type(self): X, y = make_multilabel_classification() model = RandomForestClassifier() model.fit(X, y) - with self.assertRaises(YellowbrickValueError): + with pytest.raises(YellowbrickValueError): visualizer = ClassPredictionError(model) visualizer.score(X, y) @@ -114,10 +154,36 @@ def test_score_returns_score(self): """ Test that ClassPredictionError score() returns a score between 0 and 1 """ + X, y = load_occupancy(return_dataset=True).to_numpy() + # Create and fit the visualizer - visualizer = ClassPredictionError(LinearSVC()) + visualizer = ClassPredictionError(LinearSVC(random_state=42)) visualizer.fit(X, y) # Score the visualizer s = visualizer.score(X, y) assert 0 <= s <= 1 + + def test_with_fitted(self): + """ + Test that visualizer properly handles an already-fitted model + """ + X, y = load_occupancy(return_dataset=True).to_numpy() + + model = RandomForestClassifier().fit(X, y) + classes = ["unoccupied", "occupied"] + + with patch.object(model, "fit") as mockfit: + oz = ClassPredictionError(model, classes=classes) + oz.fit(X, y) + mockfit.assert_not_called() + + with patch.object(model, "fit") as mockfit: + oz = ClassPredictionError(model, classes=classes, is_fitted=True) + oz.fit(X, y) + mockfit.assert_not_called() + + with patch.object(model, "fit") as mockfit: + oz = ClassPredictionError(model, classes=classes, is_fitted=False) + oz.fit(X, y) + mockfit.assert_called_once_with(X, y) 
diff --git a/tests/test_classifier/test_classification_report.py b/tests/test_classifier/test_classification_report.py index 1badee928..8f6e3570a 100644 --- a/tests/test_classifier/test_classification_report.py +++ b/tests/test_classifier/test_classification_report.py @@ -1,10 +1,13 @@ # tests.test_classifier.test_classification_report # Tests for the classification report visualizer # -# Author: Rebecca Bilbro -# Author: Benjamin Bengfort +# Author: Rebecca Bilbro +# Author: Benjamin Bengfort # Created: Sun Mar 18 16:57:27 2018 -0400 # +# Copyright (C) 208 The scikit-yb developers +# For license information, see LICENSE.txt +# # ID: test_classification_report.py [] benjamin@bengfort.com $ """ @@ -20,11 +23,12 @@ import yellowbrick as yb import matplotlib.pyplot as plt +from yellowbrick.datasets import load_occupancy from yellowbrick.classifier.classification_report import * from pytest import approx +from unittest.mock import patch from tests.base import VisualTestCase -from tests.dataset import DatasetMixin from sklearn.svm import LinearSVC from sklearn.naive_bayes import GaussianNB @@ -43,8 +47,9 @@ ## Test for Classification Report ########################################################################## + @pytest.mark.usefixtures("binary", "multiclass") -class ClassificationReportTests(VisualTestCase, DatasetMixin): +class TestClassificationReport(VisualTestCase): """ ClassificationReport visualizer tests """ @@ -62,14 +67,12 @@ def test_binary_class_report(self): self.assert_images_similar(viz, tol=40) assert viz.scores_ == { - 'precision': {0: approx(0.7446808), 1: approx(0.8490566)}, - 'recall': {0: approx(0.8139534), 1: approx(0.7894736)}, - 'f1': {0: approx(0.7777777), 1: approx(0.8181818)} - } - - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + "precision": {0: approx(0.7446808), 1: approx(0.8490566)}, + "recall": {0: approx(0.8139534), 1: approx(0.7894736)}, + "f1": {0: approx(0.7777777), 1: approx(0.8181818)}, 
+ } + + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_multiclass_class_report(self): """ Correctly generates report for multi-class with LogisticRegression @@ -83,21 +86,33 @@ def test_multiclass_class_report(self): self.assert_images_similar(viz, tol=11.0) assert viz.scores_ == { - 'precision': { - 0: 0.5333333333333333, 1: 0.5, 2: 0.45, - 3: 0.4, 4: 0.4, 5: 0.5882352941176471 - }, 'recall': { - 0: 0.42105263157894735, 1: 0.5625, 2: 0.6428571428571429, - 3: 0.3157894736842105, 4: 0.375, 5: 0.625 - }, 'f1': { - 0: 0.47058823529411764, 1: 0.5294117647058824, - 2: 0.5294117647058824, 3: 0.35294117647058826, - 4: 0.38709677419354843, 5: 0.6060606060606061 - }} - - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + "precision": { + 0: 0.5333333333333333, + 1: 0.5, + 2: 0.45, + 3: 0.4, + 4: 0.4, + 5: 0.5882352941176471, + }, + "recall": { + 0: 0.42105263157894735, + 1: 0.5625, + 2: 0.6428571428571429, + 3: 0.3157894736842105, + 4: 0.375, + 5: 0.625, + }, + "f1": { + 0: 0.47058823529411764, + 1: 0.5294117647058824, + 2: 0.5294117647058824, + 3: 0.35294117647058826, + 4: 0.38709677419354843, + 5: 0.6060606060606061, + }, + } + + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") @pytest.mark.skipif(pd is None, reason="test requires pandas") def test_pandas_integration(self): """ @@ -106,21 +121,52 @@ def test_pandas_integration(self): _, ax = plt.subplots() # Load the occupancy dataset from fixtures - data = self.load_data('occupancy') - target = 'occupancy' - features = [ - "temperature", "relative_humidity", "light", "C02", "humidity" - ] + data = load_occupancy(return_dataset=True) + X, y = data.to_pandas() + + # Create train/test splits + splits = tts(X, y, test_size=0.2, random_state=4512) + X_train, X_test, y_train, y_test = splits - # Create instances and target - X = pd.DataFrame(data[features]) - y = pd.Series(data[target].astype(int)) + classes = 
["unoccupied", "occupied"] + + # Create classification report + model = GaussianNB() + viz = ClassificationReport(model, ax=ax, classes=classes) + viz.fit(X_train, y_train) + viz.score(X_test, y_test) + + self.assert_images_similar(viz, tol=5.0) + + # Ensure correct classification scores under the hood! + assert viz.scores_ == { + "precision": { + "unoccupied": 0.999347471451876, + "occupied": 0.8825214899713467, + }, + "recall": { + "unoccupied": 0.9613935969868174, + "occupied": 0.9978401727861771, + }, + "f1": {"unoccupied": 0.9800031994880819, "occupied": 0.9366447034972124}, + } + + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") + def test_numpy_integration(self): + """ + Test with NumPy arrays + """ + _, ax = plt.subplots() + + # Load the occupancy dataset from fixtures + data = load_occupancy(return_dataset=True) + X, y = data.to_numpy() # Create train/test splits splits = tts(X, y, test_size=0.2, random_state=4512) X_train, X_test, y_train, y_test = splits - classes = ['unoccupied', 'occupied'] + classes = ["unoccupied", "occupied"] # Create classification report model = GaussianNB() @@ -128,28 +174,33 @@ def test_pandas_integration(self): viz.fit(X_train, y_train) viz.score(X_test, y_test) - self.assert_images_similar(viz, tol=43.0) + self.assert_images_similar(viz, tol=5.0) # Ensure correct classification scores under the hood! 
assert viz.scores_ == { - 'precision': { - 'unoccupied': 0.999347471451876, - 'occupied': 0.8825214899713467 - }, 'recall': { - 'unoccupied': 0.9613935969868174, - 'occupied': 0.9978401727861771 - }, 'f1': { - 'unoccupied': 0.9800031994880819, - 'occupied': 0.9366447034972124 - }} + "precision": { + "unoccupied": 0.999347471451876, + "occupied": 0.8825214899713467, + }, + "recall": { + "unoccupied": 0.9613935969868174, + "occupied": 0.9978401727861771, + }, + "f1": {"unoccupied": 0.9800031994880819, "occupied": 0.9366447034972124}, + } def test_quick_method(self): """ Test the quick method with a random dataset """ X, y = make_classification( - n_samples=400, n_features=20, n_informative=8, n_redundant=8, - n_classes=2, n_clusters_per_class=4, random_state=27 + n_samples=400, + n_features=20, + n_informative=8, + n_redundant=8, + n_classes=2, + n_clusters_per_class=4, + random_state=27, ) _, ax = plt.subplots() @@ -164,8 +215,8 @@ def test_isclassifier(self): """ message = ( - 'This estimator is not a classifier; ' - 'try a regression or clustering score visualizer instead!' + "This estimator is not a classifier; " + "try a regression or clustering score visualizer instead!" 
) with pytest.raises(yb.exceptions.YellowbrickError, match=message): @@ -177,20 +228,18 @@ def test_support_count_class_report(self): """ _, ax = plt.subplots() - viz = ClassificationReport(LinearSVC(random_state=42), ax=ax, - support='count') + viz = ClassificationReport(LinearSVC(random_state=42), ax=ax, support="count") viz.fit(self.binary.X.train, self.binary.y.train) viz.score(self.binary.X.test, self.binary.y.test) self.assert_images_similar(viz, tol=40) assert viz.scores_ == { - 'precision': {0: approx(0.7446808), 1: approx(0.8490566)}, - 'recall': {0: approx(0.8139534), 1: approx(0.7894736)}, - 'f1': {0: approx(0.7777777), 1: approx(0.8181818)}, - 'support': {0: approx(0.42999999999999999), - 1: approx(0.56999999999999995)} - } + "precision": {0: approx(0.7446808), 1: approx(0.8490566)}, + "recall": {0: approx(0.8139534), 1: approx(0.7894736)}, + "f1": {0: approx(0.7777777), 1: approx(0.8181818)}, + "support": {0: approx(0.42999999999999999), 1: approx(0.56999999999999995)}, + } def test_support_percent_class_report(self): """ @@ -198,28 +247,28 @@ def test_support_percent_class_report(self): """ _, ax = plt.subplots() - viz = ClassificationReport(LinearSVC(random_state=42), ax=ax, - support='percent') + viz = ClassificationReport(LinearSVC(random_state=42), ax=ax, support="percent") viz.fit(self.binary.X.train, self.binary.y.train) viz.score(self.binary.X.test, self.binary.y.test) self.assert_images_similar(viz, tol=40) assert viz.scores_ == { - 'precision': {0: approx(0.7446808), 1: approx(0.8490566)}, - 'recall': {0: approx(0.8139534), 1: approx(0.7894736)}, - 'f1': {0: approx(0.7777777), 1: approx(0.8181818)}, - 'support': {0: approx(0.42999999999999999), - 1: approx(0.56999999999999995)} - } + "precision": {0: approx(0.7446808), 1: approx(0.8490566)}, + "recall": {0: approx(0.8139534), 1: approx(0.7894736)}, + "f1": {0: approx(0.7777777), 1: approx(0.8181818)}, + "support": {0: approx(0.42999999999999999), 1: approx(0.56999999999999995)}, + } def 
test_invalid_support(self): """ Ensure that bad support arguments raise exception """ - with pytest.raises(YellowbrickValueError, - match="'foo' is an invalid argument for support, use None, " \ - "True, False, 'percent', or 'count'"): + with pytest.raises( + YellowbrickValueError, + match="'foo' is an invalid argument for support, use None, " + "True, False, 'percent', or 'count'", + ): ClassificationReport(LinearSVC(), support="foo") def test_score_returns_score(self): @@ -232,3 +281,27 @@ def test_score_returns_score(self): s = viz.score(self.binary.X.test, self.binary.y.test) assert 0 <= s <= 1 + + def test_with_fitted(self): + """ + Test that visualizer properly handles an already-fitted model + """ + X, y = load_occupancy(return_dataset=True).to_numpy() + + model = LinearSVC().fit(X, y) + classes = ["unoccupied", "occupied"] + + with patch.object(model, "fit") as mockfit: + oz = ClassificationReport(model, classes=classes) + oz.fit(X, y) + mockfit.assert_not_called() + + with patch.object(model, "fit") as mockfit: + oz = ClassificationReport(model, classes=classes, is_fitted=True) + oz.fit(X, y) + mockfit.assert_not_called() + + with patch.object(model, "fit") as mockfit: + oz = ClassificationReport(model, classes=classes, is_fitted=False) + oz.fit(X, y) + mockfit.assert_called_once_with(X, y) diff --git a/tests/test_classifier/test_confusion_matrix.py b/tests/test_classifier/test_confusion_matrix.py index 5d0d1e5ba..4e96f434f 100644 --- a/tests/test_classifier/test_confusion_matrix.py +++ b/tests/test_classifier/test_confusion_matrix.py @@ -2,9 +2,12 @@ # Tests for the confusion matrix visualizer # # Aithor: Neal Humphrey -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Tue May 03 11:05:11 2017 -0700 # +# Copyright (C) 2017 The scikit-yb developers +# For license information, see LICENSE.txt +# # ID: test_confusion_matrix.py [] benjamin@bengfort.com $ """ @@ -16,38 +19,39 @@ 
########################################################################## import sys -import six import pytest import yellowbrick as yb import numpy.testing as npt import matplotlib.pyplot as plt +from yellowbrick.exceptions import ModelError +from yellowbrick.datasets import load_occupancy from yellowbrick.classifier.confusion_matrix import * -from tests.base import VisualTestCase -from tests.dataset import DatasetMixin, Dataset, Split +from unittest.mock import patch +from tests.fixtures import Dataset, Split +from tests.base import IS_WINDOWS_OR_CONDA, VisualTestCase from sklearn.svm import SVC -from sklearn.datasets import load_digits from sklearn.naive_bayes import GaussianNB from sklearn.preprocessing import LabelEncoder from sklearn.tree import DecisionTreeClassifier -from sklearn.datasets import make_classification from sklearn.linear_model import LogisticRegression from sklearn.linear_model import PassiveAggressiveRegressor from sklearn.model_selection import train_test_split as tts +from sklearn.datasets import load_digits, make_classification try: import pandas as pd except ImportError: pd = None - ########################################################################## ## Fixtures ########################################################################## -@pytest.fixture(scope='class') + +@pytest.fixture(scope="class") def digits(request): """ Creates a fixture of train and test splits for the sklearn digits dataset @@ -59,24 +63,21 @@ def digits(request): ) # Set a class attribute for digits - request.cls.digits = Dataset( - Split(X_train, X_test), Split(y_train, y_test) - ) + request.cls.digits = Dataset(Split(X_train, X_test), Split(y_train, y_test)) ########################################################################## ## Test Cases ########################################################################## + @pytest.mark.usefixtures("digits") -class ConfusionMatrixTests(VisualTestCase, DatasetMixin): +class 
TestConfusionMatrix(VisualTestCase): """ - ConfusionMatrix visualizer tests + Test ConfusionMatrix visualizer """ - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_confusion_matrix(self): """ Integration test on digits dataset with LogisticRegression @@ -84,28 +85,32 @@ def test_confusion_matrix(self): _, ax = plt.subplots() model = LogisticRegression(random_state=93) - cm = ConfusionMatrix(model, ax=ax, classes=[0,1,2,3,4,5,6,7,8,9]) + cm = ConfusionMatrix(model, ax=ax, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) cm.fit(self.digits.X.train, self.digits.y.train) cm.score(self.digits.X.test, self.digits.y.test) self.assert_images_similar(cm, tol=10) # Ensure correct confusion matrix under the hood - npt.assert_array_equal(cm.confusion_matrix_, np.array([ - [38, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [ 0, 35, 0, 0, 0, 0, 0, 0, 2, 0], - [ 0, 0, 39, 0, 0, 0, 0, 0, 0, 0], - [ 0, 0, 0, 38, 0, 1, 0, 0, 2, 0], - [ 0, 0, 0, 0, 40, 0, 0, 1, 0, 0], - [ 0, 0, 0, 0, 0, 27, 0, 0, 0, 0], - [ 0, 0, 0, 0, 0, 1, 29, 0, 0, 0], - [ 0, 0, 0, 0, 0, 0, 0, 35, 0, 1], - [ 0, 2, 0, 0, 0, 0, 0, 0, 32, 0], - [ 0, 0, 0, 0, 0, 0, 0, 1, 1, 35]])) + npt.assert_array_equal( + cm.confusion_matrix_, + np.array( + [ + [38, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 35, 0, 0, 0, 0, 0, 0, 2, 0], + [0, 0, 39, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 38, 0, 1, 0, 0, 2, 0], + [0, 0, 0, 0, 40, 0, 0, 1, 0, 0], + [0, 0, 0, 0, 0, 27, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 29, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 35, 0, 1], + [0, 2, 0, 0, 0, 0, 0, 0, 32, 0], + [0, 0, 0, 0, 0, 0, 0, 1, 1, 35], + ] + ), + ) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_no_classes_provided(self): """ Integration test on digits dataset with GaussianNB, no classes @@ -120,17 +125,23 @@ def 
test_no_classes_provided(self): self.assert_images_similar(cm, tol=10) # Ensure correct confusion matrix under the hood - npt.assert_array_equal(cm.confusion_matrix_, np.array([ - [36, 0, 0, 0, 1, 0, 0, 1, 0, 0], - [ 0, 31, 0, 0, 0, 0, 0, 1, 3, 2], - [ 0, 1, 34, 0, 0, 0, 0, 0, 4, 0], - [ 0, 1, 0, 33, 0, 2, 0, 2, 3, 0], - [ 0, 0, 0, 0, 36, 0, 0, 5, 0, 0], - [ 0, 0, 0, 0, 0, 27, 0, 0, 0, 0], - [ 0, 0, 1, 0, 1, 0, 28, 0, 0, 0], - [ 0, 0, 0, 0, 0, 0, 0, 36, 0, 0], - [ 0, 3, 0, 1, 0, 1, 0, 4, 25, 0], - [ 1, 2, 0, 0, 1, 0, 0, 8, 3, 22]])) + npt.assert_array_equal( + cm.confusion_matrix_, + np.array( + [ + [36, 0, 0, 0, 1, 0, 0, 1, 0, 0], + [0, 31, 0, 0, 0, 0, 0, 1, 3, 2], + [0, 1, 34, 0, 0, 0, 0, 0, 4, 0], + [0, 1, 0, 33, 0, 2, 0, 2, 3, 0], + [0, 0, 0, 0, 36, 0, 0, 5, 0, 0], + [0, 0, 0, 0, 0, 27, 0, 0, 0, 0], + [0, 0, 1, 0, 1, 0, 28, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 36, 0, 0], + [0, 3, 0, 1, 0, 1, 0, 4, 25, 0], + [1, 2, 0, 0, 1, 0, 0, 8, 3, 22], + ] + ), + ) def test_fontsize(self): """ @@ -161,18 +172,25 @@ def test_percent_mode(self): self.assert_images_similar(cm, tol=10) # Ensure correct confusion matrix under the hood - npt.assert_array_equal(cm.confusion_matrix_, np.array([ - [16, 0, 0, 0, 0, 22, 0, 0, 0, 0], - [ 0, 11, 0, 0, 0, 26, 0, 0, 0, 0], - [ 0, 0, 10, 0, 0, 29, 0, 0, 0, 0], - [ 0, 0, 0, 6, 0, 35, 0, 0, 0, 0], - [ 0, 0, 0, 0, 11, 30, 0, 0, 0, 0], - [ 0, 0, 0, 0, 0, 27, 0, 0, 0, 0], - [ 0, 0, 0, 0, 0, 9, 21, 0, 0, 0], - [ 0, 0, 0, 0, 0, 29, 0, 7, 0, 0], - [ 0, 0, 0, 0, 0, 32, 0, 0, 2, 0], - [ 0, 0, 0, 0, 0, 34, 0, 0, 0, 3]])) + npt.assert_array_equal( + cm.confusion_matrix_, + np.array( + [ + [16, 0, 0, 0, 0, 22, 0, 0, 0, 0], + [0, 11, 0, 0, 0, 26, 0, 0, 0, 0], + [0, 0, 10, 0, 0, 29, 0, 0, 0, 0], + [0, 0, 0, 6, 0, 35, 0, 0, 0, 0], + [0, 0, 0, 0, 11, 30, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 27, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 9, 21, 0, 0, 0], + [0, 0, 0, 0, 0, 29, 0, 7, 0, 0], + [0, 0, 0, 0, 0, 32, 0, 0, 2, 0], + [0, 0, 0, 0, 0, 34, 0, 0, 0, 3], + ] + ), + ) + 
@pytest.mark.xfail(reason="class filtering is not currently supported") def test_class_filter_eg_zoom_in(self): """ Test filtering classes zooms in on the confusion matrix. @@ -180,53 +198,26 @@ def test_class_filter_eg_zoom_in(self): _, ax = plt.subplots() model = LogisticRegression(random_state=93) - cm = ConfusionMatrix(model, ax=ax, classes=[0,1,2]) + cm = ConfusionMatrix(model, ax=ax, classes=[0, 1, 2]) cm.fit(self.digits.X.train, self.digits.y.train) cm.score(self.digits.X.test, self.digits.y.test) self.assert_images_similar(cm, tol=10) # Ensure correct confusion matrix under the hood - npt.assert_array_equal(cm.confusion_matrix_, np.array([ - [38, 0, 0], - [ 0, 35, 0], - [ 0, 0, 39]])) + npt.assert_array_equal( + cm.confusion_matrix_, np.array([[38, 0, 0], [0, 35, 0], [0, 0, 39]]) + ) def test_extra_classes(self): """ - Assert that any extra classes are simply ignored - """ - # TODO: raise exception instead - _, ax = plt.subplots() - - model = LogisticRegression(random_state=93) - cm = ConfusionMatrix(model, ax=ax, classes=[0,1,2,11]) - cm.fit(self.digits.X.train, self.digits.y.train) - cm.score(self.digits.X.test, self.digits.y.test) - - npt.assert_array_equal(cm.class_counts_, [38, 37, 39, 0]) - - # Ensure correct confusion matrix under the hood - npt.assert_array_equal(cm.confusion_matrix_, np.array([ - [38, 0, 0, 0], - [ 0, 35, 0, 0], - [ 0, 0, 39, 0], - [ 0, 0, 0, 0]])) - - self.assert_images_similar(cm, tol=10) - - def test_one_class(self): - """ - Test single class confusion matrix with LogisticRegression + Assert that any extra classes raise an exception """ - _, ax = plt.subplots() - model = LogisticRegression(random_state=93) - cm = ConfusionMatrix(model, ax=ax, classes=[0]) - cm.fit(self.digits.X.train, self.digits.y.train) - cm.score(self.digits.X.test, self.digits.y.test) + cm = ConfusionMatrix(model, classes=[0, 1, 2, 11]) - self.assert_images_similar(cm, tol=10) + with pytest.raises(ModelError, match="could not decode"): + 
cm.fit(self.digits.X.train, self.digits.y.train) def test_defined_mapping(self): """ @@ -235,16 +226,43 @@ def test_defined_mapping(self): _, ax = plt.subplots() model = LogisticRegression(random_state=93) - classes = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'] - mapping = {0: 'zero', 1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five', - 6: 'six', 7: 'seven', 8: 'eight', 9: 'nine'} - cm = ConfusionMatrix(model, ax=ax, classes=classes, label_encoder=mapping) + classes = np.array( + [ + "zero", + "one", + "two", + "three", + "four", + "five", + "six", + "seven", + "eight", + "nine", + ] + ) + mapping = { + 0: "zero", + 1: "one", + 2: "two", + 3: "three", + 4: "four", + 5: "five", + 6: "six", + 7: "seven", + 8: "eight", + 9: "nine", + } + cm = ConfusionMatrix(model, ax=ax, encoder=mapping) cm.fit(self.digits.X.train, self.digits.y.train) cm.score(self.digits.X.test, self.digits.y.test) - assert [l.get_text() for l in ax.get_xticklabels()] == classes + xlabels = np.array([l.get_text() for l in ax.get_xticklabels()]) + npt.assert_array_equal(xlabels, classes) + ylabels = [l.get_text() for l in ax.get_yticklabels()] ylabels.reverse() + ylabels = np.asarray(ylabels) + npt.assert_array_equal(ylabels, classes) def test_inverse_mapping(self): """ @@ -254,20 +272,36 @@ def test_inverse_mapping(self): model = LogisticRegression(random_state=93) le = LabelEncoder() - classes = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'] - le.fit(['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']) + le.fit( + [ + "zero", + "one", + "two", + "three", + "four", + "five", + "six", + "seven", + "eight", + "nine", + ] + ) - cm = ConfusionMatrix(model, ax=ax, classes=classes, label_encoder=le) + cm = ConfusionMatrix(model, ax=ax, encoder=le) cm.fit(self.digits.X.train, self.digits.y.train) cm.score(self.digits.X.test, self.digits.y.test) - assert [l.get_text() for l in ax.get_xticklabels()] == classes + 
xlabels = np.array([l.get_text() for l in ax.get_xticklabels()]) + npt.assert_array_equal(xlabels, le.classes_) + ylabels = [l.get_text() for l in ax.get_yticklabels()] ylabels.reverse() - assert ylabels == classes + ylabels = np.asarray(ylabels) + npt.assert_array_equal(ylabels, le.classes_) @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", ) @pytest.mark.skipif(pd is None, reason="test requires pandas") def test_pandas_integration(self): @@ -277,15 +311,7 @@ def test_pandas_integration(self): _, ax = plt.subplots() # Load the occupancy dataset from fixtures - data = self.load_data('occupancy') - target = 'occupancy' - features = [ - "temperature", "relative_humidity", "light", "C02", "humidity" - ] - - # Create instances and target - X = pd.DataFrame(data[features]) - y = pd.Series(data[target].astype(int)) + X, y = load_occupancy(return_dataset=True).to_pandas() # Create train/test splits splits = tts(X, y, test_size=0.2, random_state=8873) @@ -297,29 +323,34 @@ def test_pandas_integration(self): cm.fit(X_train, y_train) cm.score(X_test, y_test) - tol = 0.1 if six.PY3 else 40 - self.assert_images_similar(cm, tol=tol) + self.assert_images_similar(cm, tol=0.1) # Ensure correct confusion matrix under the hood - npt.assert_array_equal(cm.confusion_matrix_, np.array([ - [3012, 114], - [ 1, 985] - ])) + npt.assert_array_equal(cm.confusion_matrix_, np.array([[3012, 114], [1, 985]])) - @pytest.mark.skip(reason="requires random state in quick method") + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", + ) def test_quick_method(self): """ Test the quick method with a random dataset """ X, y = make_classification( - n_samples=400, n_features=20, n_informative=8, n_redundant=8, - n_classes=2, n_clusters_per_class=4, random_state=27 + n_samples=400, + n_features=20, + n_informative=8, + 
n_redundant=8, + n_classes=2, + n_clusters_per_class=4, + random_state=27, ) _, ax = plt.subplots() - confusion_matrix(DecisionTreeClassifier(), X, y, ax=ax) + model = DecisionTreeClassifier(random_state=25) + confusion_matrix(model, X, y, ax=ax, random_state=23) - self.assert_images_similar(ax=ax) + self.assert_images_similar(ax=ax, tol=0.1) def test_isclassifier(self): """ @@ -327,31 +358,21 @@ def test_isclassifier(self): """ model = PassiveAggressiveRegressor() message = ( - 'This estimator is not a classifier; ' - 'try a regression or clustering score visualizer instead!' + "This estimator is not a classifier; " + "try a regression or clustering score visualizer instead!" ) with pytest.raises(yb.exceptions.YellowbrickError, match=message): ConfusionMatrix(model) - @pytest.mark.xfail( - sys.platform == 'win32', reason="Changing the dtype to a subarray type is only supported if the total itemsize is unchanged" - ) def test_score_returns_score(self): """ Test that ConfusionMatrix score() returns a score between 0 and 1 """ - data = self.load_data("occupancy") - X = data[[ - "temperature", "relative_humidity", "light", "C02", "humidity" - ]] - - y = data['occupancy'] - - # Convert X to an ndarray - X = X.copy().view((float, len(X.dtype.names))) - + # Load the occupancy dataset from fixtures + X, y = load_occupancy(return_dataset=True).to_numpy() X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42) + # Create and fit the visualizer visualizer = ConfusionMatrix(LogisticRegression()) visualizer.fit(X_train, y_train) @@ -360,3 +381,27 @@ def test_score_returns_score(self): s = visualizer.score(X_test, y_test) assert 0 <= s <= 1 + + def test_with_fitted(self): + """ + Test that visualizer properly handles an already-fitted model + """ + X, y = load_occupancy(return_dataset=True).to_numpy() + + model = LogisticRegression().fit(X, y) + classes = ["unoccupied", "occupied"] + + with patch.object(model, "fit") as mockfit: + oz = 
ConfusionMatrix(model, classes=classes) + oz.fit(X, y) + mockfit.assert_not_called() + + with patch.object(model, "fit") as mockfit: + oz = ConfusionMatrix(model, classes=classes, is_fitted=True) + oz.fit(X, y) + mockfit.assert_not_called() + + with patch.object(model, "fit") as mockfit: + oz = ConfusionMatrix(model, classes=classes, is_fitted=False) + oz.fit(X, y) + mockfit.assert_called_once_with(X, y) diff --git a/tests/test_classifier/test_prcurve.py b/tests/test_classifier/test_prcurve.py index d4ebbc2da..502578e24 100644 --- a/tests/test_classifier/test_prcurve.py +++ b/tests/test_classifier/test_prcurve.py @@ -1,10 +1,13 @@ # tests.test_classifier.test_prcurve # Tests for the Precision-Recall curves visualizer # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Tue Sep 04 16:48:09 2018 -0400 # -# ID: test_prcurve.py [] benjamin@bengfort.com $ +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: test_prcurve.py [48889c4] benjamin@bengfort.com $ """ Tests for the Precision-Recall curves visualizer @@ -16,29 +19,33 @@ import sys import pytest +import matplotlib from yellowbrick.exceptions import * from yellowbrick.classifier.prcurve import * +from yellowbrick.datasets import load_occupancy -from tests.base import VisualTestCase +from tests.base import IS_WINDOWS_OR_CONDA, VisualTestCase from .test_rocauc import FakeClassifier from sklearn.svm import LinearSVC -from sklearn.datasets import load_iris from sklearn.naive_bayes import GaussianNB from sklearn.datasets import make_regression from sklearn.tree import DecisionTreeClassifier from sklearn.linear_model import RidgeClassifier +from sklearn.model_selection import train_test_split as tts from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier +try: + import pandas as pd +except ImportError: + pd = None ########################################################################## ## Assertion Helpers 
########################################################################## -LEARNED_FIELDS = ( - 'target_type_', 'score_', 'precision_', 'recall_' -) +LEARNED_FIELDS = ("target_type_", "score_", "precision_", "recall_") def assert_not_fitted(oz): @@ -51,11 +58,11 @@ def assert_fitted(oz): assert hasattr(oz, field) - ########################################################################## ## PrecisionRecallCurve Tests ########################################################################## + @pytest.mark.usefixtures("binary", "multiclass") class TestPrecisionRecallCurve(VisualTestCase): """ @@ -75,7 +82,9 @@ def test_ensure_fit(self): """ Requires visualizer to be fit """ - with pytest.raises(NotFitted, match="cannot wrap an already fitted estimator"): + with pytest.raises( + NotFitted, match="this PrecisionRecallCurve instance is not fitted yet" + ): oz = PrecisionRecallCurve(RidgeClassifier()) oz.score(self.binary.X.test, self.binary.y.test) @@ -106,9 +115,15 @@ def test_binary_probability(self): # Compare the images oz.finalize() - tol = 1.5 if sys.platform == 'win32' else 1.0 # fails with RMSE 1.409 on AppVeyor + tol = ( + 1.5 if sys.platform == "win32" else 1.0 + ) # fails with RMSE 1.409 on AppVeyor self.assert_images_similar(oz, tol=tol) + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", + ) def test_binary_probability_decision(self): """ Visual similarity of binary classifier with both predict_proba & decision @@ -136,8 +151,7 @@ def test_binary_probability_decision(self): # Compare the images oz.finalize() - tol = 4.6 if sys.platform == 'win32' else 1.0 # fails with RMSE 4.522 on AppVeyor - self.assert_images_similar(oz, tol=tol) + self.assert_images_similar(oz) def test_binary_decision(self): """ @@ -199,17 +213,27 @@ def test_multiclass_decision(self): # Compare the images oz.finalize() - tol = 1.25 if sys.platform == 'win32' else 1.0 # fails with RMSE 1.118 on AppVeyor + tol = ( + 1.25 if 
sys.platform == "win32" else 1.0 + ) # fails with RMSE 1.118 on AppVeyor self.assert_images_similar(oz, tol=tol) + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", + ) def test_multiclass_probability(self): """ Visual similarity of multiclass classifier with predict_proba function """ # Create and fit the visualizer oz = PrecisionRecallCurve( - GaussianNB(), per_class=True, micro=False, fill_area=False, - iso_f1_curves=True, ap_score=False + GaussianNB(), + per_class=True, + micro=False, + fill_area=False, + iso_f1_curves=True, + ap_score=False, ) assert_not_fitted(oz) @@ -235,30 +259,207 @@ def test_multiclass_probability(self): # Compare the images oz.finalize() - tol = 6.6 if sys.platform == 'win32' else 1.0 # fails with RMSE 6.583 on AppVeyor + self.assert_images_similar(oz) + + def test_multiclass_probability_with_class_labels(self): + """Visual similarity of multiclass classifier with class labels.""" + # Create and fit the visualizer + oz = PrecisionRecallCurve( + GaussianNB(), + per_class=True, + micro=False, + fill_area=False, + iso_f1_curves=True, + ap_score=False, + classes=["a", "b", "c", "d", "e", "f"], + ) + assert_not_fitted(oz) + + # Fit returns self + assert oz.fit(self.multiclass.X.train, self.multiclass.y.train) is oz + + # Score the visualizer + s = oz.score(self.multiclass.X.test, self.multiclass.y.test) + assert_fitted(oz) + + # Score should be between 0 and 1 + assert 0.0 <= s <= 1.0 + + # Check the multiclass classification properties + assert oz.target_type_ == MULTICLASS + assert isinstance(oz.score_, dict) + assert oz.score_[MICRO] == s + assert isinstance(oz.precision_, dict) + assert isinstance(oz.recall_, dict) + assert len(oz.score_) == len(oz.classes_) + 1 + assert len(oz.precision_) == len(oz.classes_) + 1 + assert len(oz.recall_) == len(oz.classes_) + 1 + + # Finalize image + oz.finalize() + + # Compare the label text of the images. 
+ assert oz.ax.get_xlabel() == "Recall" + oz.ax.set_xlabel("") + assert oz.ax.get_ylabel() == "Precision" + oz.ax.set_ylabel("") + assert oz.ax.get_title() == "Precision-Recall Curve for GaussianNB" + oz.ax.set_title("") + + # Compare the Legend text + expected_legend_txt = [ + "PR for class a (area=0.42)", + "PR for class b (area=0.36)", + "PR for class c (area=0.44)", + "PR for class d (area=0.52)", + "PR for class e (area=0.37)", + "PR for class f (area=0.49)", + ] + assert [x.get_text() for x in oz.ax.legend().get_texts()] == expected_legend_txt + oz.ax.get_legend().remove() + + # Text in iso_f1_curves. + # Will not check for these as they appears okay in other test images. + for child in oz.ax.get_children(): + if isinstance(child, matplotlib.text.Annotation): + oz.ax.texts.remove(child) + + # Compare the images + tol = ( + 6.6 if sys.platform == "win32" else 1.0 + ) # fails with RMSE 6.583 on AppVeyor self.assert_images_similar(oz, tol=tol) @pytest.mark.filterwarnings("ignore:From version 0.21") + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", + ) def test_quick_method(self): """ - Test the precision_recall_curve quick method. + Test the precision_recall_curve quick method with numpy arrays. 
""" - data = load_iris() + X, y = load_occupancy(return_dataset=True).to_numpy() + model = DecisionTreeClassifier(random_state=14) oz = precision_recall_curve( - model, data.data, data.target, per_class=True, micro=True, - fill_area=False, iso_f1_curves=True, ap_score=False, - random_state=2) + model, + X, + y, + per_class=True, + micro=True, + fill_area=False, + iso_f1_curves=True, + ap_score=False, + random_state=2, + ) assert isinstance(oz, PrecisionRecallCurve) - tol = 5.8 if sys.platform == 'win32' else 1.0 # fails with RMSE 5.740 on AppVeyor - self.assert_images_similar(oz, tol=tol) + self.assert_images_similar(oz) + + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def test_pandas_integration(self): + """ + Test the precision_recall_curve with Pandas dataframes + """ + X, y = load_occupancy(return_dataset=True).to_pandas() + + model = DecisionTreeClassifier(random_state=14) + + X_train, X_test, y_train, y_test = tts( + X, y, test_size=0.2, shuffle=True, random_state=555 + ) + + oz = PrecisionRecallCurve( + model, + per_class=True, + micro=False, + fill_area=False, + iso_f1_curves=True, + ap_score=False, + classes=["unoccupied", "occupied"], + ) + oz.fit(X_train, y_train) + oz.score(X_test, y_test) + + oz.finalize() + + self.assert_images_similar(oz, tol=5.0) def test_no_scoring_function(self): """ Test get y scores with classifiers that have no scoring method """ oz = PrecisionRecallCurve(FakeClassifier()) - with pytest.raises(ModelError, match="requires .* predict_proba or decision_function"): + with pytest.raises( + ModelError, match="requires .* predict_proba or decision_function" + ): oz._get_y_scores(self.binary.X.train) + + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", + ) + def test_custom_iso_f1_scores(self): + """ + Test using custom ISO F1 Values + """ + X, y = load_occupancy(return_dataset=True).to_numpy() + + vals = (0.1, 0.6, 0.3, 0.9, 0.9) + viz = 
PrecisionRecallCurve( + RandomForestClassifier(random_state=27), + iso_f1_curves=True, + iso_f1_values=vals, + ) + + X_train, X_test, y_train, y_test = tts( + X, y, test_size=0.2, shuffle=True, random_state=555 + ) + + assert viz.fit(X_train, y_train) is viz + viz.score(X_test, y_test) + viz.finalize() + + self.assert_images_similar(viz) + + def test_quick_method_with_test_set(self): + """ + Test quick method when both train and test data is supplied + """ + X, y = load_occupancy(return_dataset=True).to_numpy() + + X_train, X_test, y_train, y_test = tts( + X, y, test_size=0.2, shuffle=True, random_state=555 + ) + + viz = precision_recall_curve( + RandomForestClassifier(random_state=72), + X_train, + y_train, + X_test, + y_test, + random_state=7, + ) + self.assert_images_similar(viz) + + def test_missing_test_data_in_quick_method(self): + """ + Test quick method when test data is missing. + """ + X, y = load_occupancy(return_dataset=True).to_numpy() + + X_train, X_test, y_train, y_test = tts( + X, y, test_size=0.2, shuffle=True, random_state=55555 + ) + + emsg = "both X_test and y_test are required if one is specified" + + with pytest.raises(YellowbrickValueError, match=emsg): + precision_recall_curve( + RandomForestClassifier(), X_train, y_train, y_test=y_test + ) + + with pytest.raises(YellowbrickValueError, match=emsg): + precision_recall_curve(RandomForestClassifier(), X_train, y_train, X_test) diff --git a/tests/test_classifier/test_rocauc.py b/tests/test_classifier/test_rocauc.py index dccf2666a..24822080f 100644 --- a/tests/test_classifier/test_rocauc.py +++ b/tests/test_classifier/test_rocauc.py @@ -1,11 +1,11 @@ # tests.test_classifier.test_rocauc # Testing for the ROCAUC visualizer # -# Author: Benjamin Bengfort -# Author: Rebecca Bilbro +# Author: Benjamin Bengfort +# Author: Rebecca Bilbro # Created: Tue May 23 13:41:55 2017 -0700 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see 
LICENSE.txt # # ID: test_rocauc.py [] benjamin@bengfort.com $ @@ -18,46 +18,70 @@ ## Imports ########################################################################## -import os import pytest import numpy as np import numpy.testing as npt +from unittest.mock import patch from tests.base import VisualTestCase -from tests.dataset import DatasetMixin + from yellowbrick.classifier.rocauc import * +from yellowbrick.datasets import load_occupancy from yellowbrick.exceptions import ModelError, YellowbrickValueError from sklearn.svm import LinearSVC from sklearn.naive_bayes import GaussianNB from sklearn.tree import DecisionTreeClassifier -from sklearn.datasets import load_breast_cancer from sklearn.linear_model import LogisticRegression from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.model_selection import train_test_split as tts from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier +try: + import pandas as pd +except ImportError: + pd = None ########################################################################## ## Fixtures ########################################################################## -# Increased tolerance for AppVeyor tests -TOL = 10 if os.name == 'nt' else 0.1 - class FakeClassifier(BaseEstimator, ClassifierMixin): """ A fake classifier for testing noops on the visualizer. 
""" + pass +def assert_valid_rocauc_scores(visualizer, nscores=4): + """ + Assertion helper to ensure scores are correctly computed + """ + __tracebackhide__ = True + assert len(visualizer.fpr.keys()) == nscores + assert len(visualizer.tpr.keys()) == nscores + assert len(visualizer.roc_auc.keys()) == nscores + + for k in (0, 1, "micro", "macro"): + assert k in visualizer.fpr + assert k in visualizer.tpr + assert k in visualizer.roc_auc + assert len(visualizer.fpr[k]) == len(visualizer.tpr[k]) + assert 0.0 < visualizer.roc_auc[k] < 1.0 + + ########################################################################## ## Tests ########################################################################## + @pytest.mark.usefixtures("binary", "multiclass") -class ROCAUCTests(VisualTestCase, DatasetMixin): +class TestROCAUC(VisualTestCase): + """ + Test ROCAUC visualizer + """ def test_binary_probability(self): """ @@ -74,21 +98,11 @@ def test_binary_probability(self): assert 0 <= s <= 1 # Check the scores - self.assertEqual(len(visualizer.fpr.keys()), 4) - self.assertEqual(len(visualizer.tpr.keys()), 4) - self.assertEqual(len(visualizer.roc_auc.keys()), 4) - - for k in (0, 1, "micro", "macro"): - self.assertIn(k, visualizer.fpr) - self.assertIn(k, visualizer.tpr) - self.assertIn(k, visualizer.roc_auc) - self.assertEqual(len(visualizer.fpr[k]), len(visualizer.tpr[k])) - self.assertGreater(visualizer.roc_auc[k], 0.0) - self.assertLess(visualizer.roc_auc[k], 1.0) + assert_valid_rocauc_scores(visualizer) # Compare the images - visualizer.poof() - self.assert_images_similar(visualizer, tol=TOL) + visualizer.finalize() + self.assert_images_similar(visualizer, tol=0.1, windows_tol=10) def test_binary_probability_decision(self): """ @@ -105,28 +119,20 @@ def test_binary_probability_decision(self): assert 0 <= s <= 1 # Check the scores - self.assertEqual(len(visualizer.fpr.keys()), 4) - self.assertEqual(len(visualizer.tpr.keys()), 4) - self.assertEqual(len(visualizer.roc_auc.keys()), 
4) - - for k in (0, 1, "micro", "macro"): - self.assertIn(k, visualizer.fpr) - self.assertIn(k, visualizer.tpr) - self.assertIn(k, visualizer.roc_auc) - self.assertEqual(len(visualizer.fpr[k]), len(visualizer.tpr[k])) - self.assertGreater(visualizer.roc_auc[k], 0.0) - self.assertLess(visualizer.roc_auc[k], 1.0) + assert_valid_rocauc_scores(visualizer) # Compare the images - visualizer.poof() - self.assert_images_similar(visualizer, tol=TOL) + visualizer.finalize() + self.assert_images_similar(visualizer, tol=0.1, windows_tol=10) def test_binary_decision(self): """ Test ROCAUC with a binary classifier with a decision_function """ # Create and fit the visualizer - visualizer = ROCAUC(LinearSVC(random_state=42), micro=False, macro=False, per_class=False) + visualizer = ROCAUC( + LinearSVC(random_state=42), micro=False, macro=False, per_class=False + ) visualizer.fit(self.binary.X.train, self.binary.y.train) # Score the visualizer @@ -136,13 +142,13 @@ def test_binary_decision(self): assert 0 <= s <= 1 # Check the scores - self.assertEqual(len(visualizer.fpr.keys()), 1) - self.assertEqual(len(visualizer.tpr.keys()), 1) - self.assertEqual(len(visualizer.roc_auc.keys()), 1) + assert len(visualizer.fpr.keys()) == 1 + assert len(visualizer.tpr.keys()) == 1 + assert len(visualizer.roc_auc.keys()) == 1 # Compare the images # NOTE: increased tolerance for both AppVeyor and Travis CI tests - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer, tol=10) def test_binary_micro_error(self): @@ -154,7 +160,7 @@ def test_binary_micro_error(self): visualizer.fit(self.binary.X.train, self.binary.y.train) # Ensure score raises error (micro curves aren't defined for binary decisions) - with self.assertRaises(ModelError): + with pytest.raises(ModelError): visualizer.score(self.binary.X.test, self.binary.y.test) def test_binary_macro_error(self): @@ -166,7 +172,7 @@ def test_binary_macro_error(self): visualizer.fit(self.binary.X.train, self.binary.y.train) # 
Ensure score raises error (macro curves aren't defined for binary decisions) - with self.assertRaises(ModelError): + with pytest.raises(ModelError): visualizer.score(self.binary.X.test, self.binary.y.test) def test_binary_per_class_error(self): @@ -178,7 +184,7 @@ def test_binary_per_class_error(self): visualizer.fit(self.binary.X.train, self.binary.y.train) # Ensure score raises error (per_class curves not defined for binary decisions) - with self.assertRaises(ModelError): + with pytest.raises(ModelError): visualizer.score(self.binary.X.test, self.binary.y.test) def test_multiclass_rocauc(self): @@ -196,31 +202,40 @@ def test_multiclass_rocauc(self): assert 0 <= s <= 1 # Check the scores - self.assertEqual(len(visualizer.fpr.keys()), 8) - self.assertEqual(len(visualizer.tpr.keys()), 8) - self.assertEqual(len(visualizer.roc_auc.keys()), 8) - - for k in (0, 1, "micro", "macro"): - self.assertIn(k, visualizer.fpr) - self.assertIn(k, visualizer.tpr) - self.assertIn(k, visualizer.roc_auc) - self.assertEqual(len(visualizer.fpr[k]), len(visualizer.tpr[k])) - self.assertGreater(visualizer.roc_auc[k], 0.0) - self.assertLess(visualizer.roc_auc[k], 1.0) + assert_valid_rocauc_scores(visualizer, nscores=8) # Compare the images - visualizer.poof() - self.assert_images_similar(visualizer, tol=TOL) + visualizer.finalize() + self.assert_images_similar(visualizer, tol=0.1, windows_tol=10) def test_rocauc_quickmethod(self): """ Test the ROCAUC quick method """ - data = load_breast_cancer() + X, y = load_occupancy(return_dataset=True).to_numpy() model = DecisionTreeClassifier() # TODO: image comparison of the quick method - roc_auc(model, data.data, data.target) + roc_auc(model, X, y) + + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def test_pandas_integration(self): + """ + Test the ROCAUC with Pandas dataframe + """ + X, y = load_occupancy(return_dataset=True).to_pandas() + + # Create train/test splits + splits = tts(X, y, test_size=0.2, random_state=4512) + 
X_train, X_test, y_train, y_test = splits + + visualizer = ROCAUC(GaussianNB()) + visualizer.fit(X_train, y_train) + visualizer.score(X_test, y_test) + + # Compare the images + visualizer.finalize() + self.assert_images_similar(visualizer) def test_rocauc_no_micro(self): """ @@ -232,16 +247,16 @@ def test_rocauc_no_micro(self): # Score the visualizer (should be the macro average) s = visualizer.score(self.binary.X.test, self.binary.y.test) - self.assertAlmostEqual(s, 0.8) + assert s == pytest.approx(0.8661, abs=1e-4) # Assert that there is no micro score - self.assertNotIn("micro", visualizer.fpr) - self.assertNotIn("micro", visualizer.tpr) - self.assertNotIn("micro", visualizer.roc_auc) + assert "micro" not in visualizer.fpr + assert "micro" not in visualizer.tpr + assert "micro" not in visualizer.roc_auc # Compare the images - visualizer.poof() - self.assert_images_similar(visualizer, tol=TOL) + visualizer.finalize() + self.assert_images_similar(visualizer, tol=0.1, windows_tol=10) def test_rocauc_no_macro(self): """ @@ -253,16 +268,16 @@ def test_rocauc_no_macro(self): # Score the visualizer (should be the micro average) s = visualizer.score(self.binary.X.test, self.binary.y.test) - self.assertAlmostEqual(s, 0.8) + assert s == pytest.approx(0.8573, abs=1e-4) # Assert that there is no macro score - self.assertNotIn("macro", visualizer.fpr) - self.assertNotIn("macro", visualizer.tpr) - self.assertNotIn("macro", visualizer.roc_auc) + assert "macro" not in visualizer.fpr + assert "macro" not in visualizer.tpr + assert "macro" not in visualizer.roc_auc # Compare the images - visualizer.poof() - self.assert_images_similar(visualizer, tol=TOL) + visualizer.finalize() + self.assert_images_similar(visualizer, tol=0.1, windows_tol=10) def test_rocauc_no_macro_no_micro(self): """ @@ -274,21 +289,21 @@ def test_rocauc_no_macro_no_micro(self): # Score the visualizer (should be the F1 score) s = visualizer.score(self.binary.X.test, self.binary.y.test) - 
self.assertAlmostEqual(s, 0.8) + assert s == pytest.approx(0.8) # Assert that there is no macro score - self.assertNotIn("macro", visualizer.fpr) - self.assertNotIn("macro", visualizer.tpr) - self.assertNotIn("macro", visualizer.roc_auc) + assert "macro" not in visualizer.fpr + assert "macro" not in visualizer.tpr + assert "macro" not in visualizer.roc_auc # Assert that there is no micro score - self.assertNotIn("micro", visualizer.fpr) - self.assertNotIn("micro", visualizer.tpr) - self.assertNotIn("micro", visualizer.roc_auc) + assert "micro" not in visualizer.fpr + assert "micro" not in visualizer.tpr + assert "micro" not in visualizer.roc_auc # Compare the images - visualizer.poof() - self.assert_images_similar(visualizer, tol=TOL) + visualizer.finalize() + self.assert_images_similar(visualizer, tol=0.1, windows_tol=10) def test_rocauc_no_classes(self): """ @@ -300,24 +315,26 @@ def test_rocauc_no_classes(self): # Score the visualizer (should be the micro average) s = visualizer.score(self.binary.X.test, self.binary.y.test) - self.assertAlmostEqual(s, 0.8) + assert s == pytest.approx(0.8661, abs=1e-4) # Assert that there still are per-class scores for c in (0, 1): - self.assertIn(c, visualizer.fpr) - self.assertIn(c, visualizer.tpr) - self.assertIn(c, visualizer.roc_auc) + assert c in visualizer.fpr + assert c in visualizer.tpr + assert c in visualizer.roc_auc # Compare the images - visualizer.poof() - self.assert_images_similar(visualizer, tol=TOL) + visualizer.finalize() + self.assert_images_similar(visualizer, tol=0.1, windows_tol=10) def test_rocauc_no_curves(self): """ Test ROCAUC with no curves specified at all """ # Create and fit the visualizer - visualizer = ROCAUC(LogisticRegression(), per_class=False, macro=False, micro=False) + visualizer = ROCAUC( + LogisticRegression(), per_class=False, macro=False, micro=False + ) visualizer.fit(self.binary.X.train, self.binary.y.train) # Attempt to score the visualizer @@ -328,7 +345,7 @@ def 
test_rocauc_label_encoded(self): """ Test ROCAUC with a target specifying a list of classes as strings """ - class_labels = ['a', 'b', 'c', 'd', 'e', 'f'] + class_labels = ["a", "b", "c", "d", "e", "f"] # Create and fit the visualizer visualizer = ROCAUC(LogisticRegression(), classes=class_labels) @@ -336,14 +353,14 @@ def test_rocauc_label_encoded(self): # Score the visualizer visualizer.score(self.multiclass.X.test, self.multiclass.y.test) - self.assertEqual(list(visualizer.classes_), class_labels) + assert list(visualizer.classes_) == class_labels def test_rocauc_not_label_encoded(self): """ Test ROCAUC with a target whose classes are unencoded strings before scoring """ # Map numeric targets to strings - classes = {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f'} + classes = {0: "a", 1: "b", 2: "c", 3: "d", 4: "e", 5: "f"} y_train = np.array([classes[yi] for yi in self.multiclass.y.train]) y_test = np.array([classes[yi] for yi in self.multiclass.y.test]) @@ -352,7 +369,7 @@ def test_rocauc_not_label_encoded(self): visualizer.fit(self.multiclass.X.train, y_train) # Confirm that y_train and y_test have the same targets before calling score - self.assertEqual(set(y_train), set(y_test)) + assert set(y_train) == set(y_test) def test_binary_decision_function_rocauc(self): """ @@ -360,7 +377,7 @@ def test_binary_decision_function_rocauc(self): """ # Load the model and assert there is no predict_proba method. 
model = LinearSVC() - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): model.predict_proba # Fit model and visualizer @@ -368,9 +385,9 @@ def test_binary_decision_function_rocauc(self): visualizer.fit(self.binary.X.train, self.binary.y.train) # First 10 expected values in the y_scores - first_ten_expected = np.asarray([ - -0.092, 0.019, -0.751, -0.838, 0.183, -0.344, -1.019, 2.203, 1.415, -0.529 - ]) + first_ten_expected = np.asarray( + [-0.092, 0.019, -0.751, -0.838, 0.183, -0.344, -1.019, 2.203, 1.415, -0.529] + ) # Get the predict_proba scores and evaluate y_scores = visualizer._get_y_scores(self.binary.X.train) @@ -384,7 +401,7 @@ def test_multi_decision_function_rocauc(self): """ # Load the model and assert there is no predict_proba method. model = LinearSVC() - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): model.predict_proba # Fit model and visualizer @@ -393,11 +410,11 @@ def test_multi_decision_function_rocauc(self): # First 5 expected arrays in the y_scores first_five_expected = [ - [-0.370, -0.543, -1.059, -0.466, -0.743, -1.156], - [-0.445, -0.693, -0.362, -1.002, -0.815, -0.878], - [-1.058, -0.808, -0.291, -0.767, -0.651, -0.586], - [-0.446, -1.255, -0.489, -0.961, -0.807, -0.126], - [-1.066, -0.493, -0.639, -0.442, -0.639, -1.017] + [-0.370, -0.543, -1.059, -0.466, -0.743, -1.156], + [-0.445, -0.693, -0.362, -1.002, -0.815, -0.878], + [-1.058, -0.808, -0.291, -0.767, -0.651, -0.586], + [-0.446, -1.255, -0.489, -0.961, -0.807, -0.126], + [-1.066, -0.493, -0.639, -0.442, -0.639, -1.017], ] # Get the predict_proba scores and evaluate @@ -412,7 +429,7 @@ def test_predict_proba_rocauc(self): """ # Load the model and assert there is no decision_function method. 
model = GaussianNB() - with self.assertRaises(AttributeError): + with pytest.raises(AttributeError): model.decision_function # Fit model and visualizer @@ -420,18 +437,20 @@ def test_predict_proba_rocauc(self): visualizer.fit(self.binary.X.train, self.binary.y.train) # First 10 expected arrays in the y_scores - first_ten_expected = np.asarray([ - [0.595, 0.405], - [0.161, 0.839], - [0.990, 0.010], - [0.833, 0.167], - [0.766, 0.234], - [0.996, 0.004], - [0.592, 0.408], - [0.007, 0.993], - [0.035, 0.965], - [0.764, 0.236] - ]) + first_ten_expected = np.asarray( + [ + [0.595, 0.405], + [0.161, 0.839], + [0.990, 0.010], + [0.833, 0.167], + [0.766, 0.234], + [0.996, 0.004], + [0.592, 0.408], + [0.007, 0.993], + [0.035, 0.965], + [0.764, 0.236], + ] + ) # Get the predict_proba scores and evaluate y_scores = visualizer._get_y_scores(self.binary.X.train) @@ -444,5 +463,29 @@ def test_no_scoring_function(self): Test ROCAUC with classifiers that have no scoring method """ visualizer = ROCAUC(FakeClassifier()) - with self.assertRaises(ModelError): + with pytest.raises(ModelError): visualizer._get_y_scores(self.binary.X.train) + + def test_with_fitted(self): + """ + Test that visualizer properly handles an already-fitted model + """ + X, y = load_occupancy(return_dataset=True).to_numpy() + + model = GaussianNB().fit(X, y) + classes = ["unoccupied", "occupied"] + + with patch.object(model, "fit") as mockfit: + oz = ROCAUC(model, classes=classes) + oz.fit(X, y) + mockfit.assert_not_called() + + with patch.object(model, "fit") as mockfit: + oz = ROCAUC(model, classes=classes, is_fitted=True) + oz.fit(X, y) + mockfit.assert_not_called() + + with patch.object(model, "fit") as mockfit: + oz = ROCAUC(model, classes=classes, is_fitted=False) + oz.fit(X, y) + mockfit.assert_called_once_with(X, y) diff --git a/tests/test_classifier/test_threshold.py b/tests/test_classifier/test_threshold.py index f39d8b44e..810d3ea07 100644 --- a/tests/test_classifier/test_threshold.py +++ 
b/tests/test_classifier/test_threshold.py @@ -1,11 +1,11 @@ # tests.test_classifier.test_threshold # Ensure that the discrimination threshold visualizations work. # -# Author: Nathan Danielsen -# Author: Benjamin Bengfort +# Author: Nathan Danielsen +# Author: Benjamin Bengfort # Created: Wed April 26 20:17:29 2017 -0700 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_threshold.py [] nathan.danielsen@gmail.com $ @@ -19,16 +19,16 @@ ########################################################################## import sys -import six import pytest import yellowbrick as yb import matplotlib.pyplot as plt from yellowbrick.classifier.threshold import * +from yellowbrick.datasets import load_occupancy from yellowbrick.utils import is_probabilistic, is_classifier +from unittest.mock import patch from tests.base import VisualTestCase -from tests.dataset import DatasetMixin from numpy.testing.utils import assert_array_equal from sklearn.svm import LinearSVC, NuSVC @@ -44,31 +44,30 @@ except ImportError: pd = None -try: - from unittest.mock import patch -except ImportError: - from mock import patch - ########################################################################## ## DiscriminationThreshold Test Cases ########################################################################## -class TestDiscriminationThreshold(VisualTestCase, DatasetMixin): + +class TestDiscriminationThreshold(VisualTestCase): """ DiscriminationThreshold visualizer tests """ - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_binary_discrimination_threshold(self): """ Correctly generates viz for binary classification with BernoulliNB """ X, y = make_classification( - n_samples=400, n_features=20, n_informative=8, n_redundant=8, - n_classes=2, n_clusters_per_class=4, 
random_state=854 + n_samples=400, + n_features=20, + n_informative=8, + n_redundant=8, + n_classes=2, + n_clusters_per_class=4, + random_state=854, ) _, ax = plt.subplots() @@ -77,7 +76,7 @@ def test_binary_discrimination_threshold(self): visualizer = DiscriminationThreshold(model, ax=ax, random_state=23) visualizer.fit(X, y) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer) @@ -86,8 +85,13 @@ def test_multiclass_discrimination_threshold(self): Assert exception is raised in multiclass case. """ X, y = make_classification( - n_samples=400, n_features=20, n_informative=8, n_redundant=8, - n_classes=3, n_clusters_per_class=4, random_state=854 + n_samples=400, + n_features=20, + n_informative=8, + n_redundant=8, + n_classes=3, + n_clusters_per_class=4, + random_state=854, ) visualizer = DiscriminationThreshold(GaussianNB(), random_state=23) @@ -96,9 +100,7 @@ def test_multiclass_discrimination_threshold(self): with pytest.raises(ValueError, match=msg): visualizer.fit(X, y) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") @pytest.mark.skipif(pd is None, reason="test requires pandas") def test_pandas_integration(self): """ @@ -107,24 +109,39 @@ def test_pandas_integration(self): _, ax = plt.subplots() # Load the occupancy dataset from fixtures - data = self.load_data('occupancy') - target = 'occupancy' - features = [ - "temperature", "relative_humidity", "light", "C02", "humidity" - ] + data = load_occupancy(return_dataset=True) + X, y = data.to_pandas() + + classes = ["unoccupied", "occupied"] + + # Create the visualizer + viz = DiscriminationThreshold( + LogisticRegression(), ax=ax, classes=classes, random_state=193 + ) + viz.fit(X, y) + viz.finalize() - # Create instances and target - X = pd.DataFrame(data[features]) - y = pd.Series(data[target].astype(int)) + self.assert_images_similar(viz, tol=0.1) + + 
@pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") + def test_numpy_integration(self): + """ + Test with NumPy arrays + """ + _, ax = plt.subplots() + + # Load the occupancy dataset from fixtures + data = load_occupancy(return_dataset=True) + X, y = data.to_numpy() - classes = ['unoccupied', 'occupied'] + classes = ["unoccupied", "occupied"] # Create the visualizer viz = DiscriminationThreshold( LogisticRegression(), ax=ax, classes=classes, random_state=193 ) viz.fit(X, y) - viz.poof() + viz.finalize() self.assert_images_similar(viz, tol=0.1) @@ -134,8 +151,13 @@ def test_quick_method(self): """ X, y = make_classification( - n_samples=400, n_features=20, n_informative=8, n_redundant=8, - n_classes=2, n_clusters_per_class=4, random_state=2721 + n_samples=400, + n_features=20, + n_informative=8, + n_redundant=8, + n_classes=2, + n_clusters_per_class=4, + random_state=2721, ) _, ax = plt.subplots() @@ -143,14 +165,19 @@ def test_quick_method(self): discrimination_threshold(BernoulliNB(3), X, y, ax=ax, random_state=5) self.assert_images_similar(ax=ax, tol=10) - @patch.object(DiscriminationThreshold, 'draw', autospec=True) + @patch.object(DiscriminationThreshold, "draw", autospec=True) def test_fit(self, mock_draw): """ Test the fit method generates scores, calls draw, and returns self """ X, y = make_classification( - n_samples=400, n_features=20, n_informative=8, n_redundant=8, - n_classes=2, n_clusters_per_class=4, random_state=1221 + n_samples=400, + n_features=20, + n_informative=8, + n_redundant=8, + n_classes=2, + n_clusters_per_class=4, + random_state=1221, ) visualizer = DiscriminationThreshold(BernoulliNB()) @@ -160,8 +187,7 @@ def test_fit(self, mock_draw): out = visualizer.fit(X, y) assert out is visualizer - if six.PY3: - mock_draw.assert_called_once() + mock_draw.assert_called_once() assert hasattr(visualizer, "thresholds_") assert hasattr(visualizer, "cv_scores_") @@ -170,17 +196,21 @@ def test_fit(self, mock_draw): assert 
"{}_lower".format(metric) in visualizer.cv_scores_ assert "{}_upper".format(metric) in visualizer.cv_scores_ - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_binary_discrimination_threshold_alt_args(self): """ Correctly generates visualization with alternate arguments """ X, y = make_classification( - n_samples=400, n_features=20, n_informative=10, n_redundant=3, - n_classes=2, n_clusters_per_class=4, random_state=1231, - flip_y=0.1, weights=[0.35, 0.65], + n_samples=400, + n_features=20, + n_informative=10, + n_redundant=3, + n_classes=2, + n_clusters_per_class=4, + random_state=1231, + flip_y=0.1, + weights=[0.35, 0.65], ) exclude = ["queue_rate", "fscore"] @@ -190,7 +220,7 @@ def test_binary_discrimination_threshold_alt_args(self): ) visualizer.fit(X, y) - visualizer.poof() + visualizer.finalize() for metric in exclude: assert metric not in visualizer.cv_scores_ diff --git a/tests/test_cluster/__init__.py b/tests/test_cluster/__init__.py index 2ff63587b..f2cf1adb7 100644 --- a/tests/test_cluster/__init__.py +++ b/tests/test_cluster/__init__.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Thu Mar 23 17:37:57 2017 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: __init__.py [241edca] benjamin@bengfort.com $ diff --git a/tests/test_cluster/test_base.py b/tests/test_cluster/test_base.py index 66d91ada8..41db7d652 100644 --- a/tests/test_cluster/test_base.py +++ b/tests/test_cluster/test_base.py @@ -1,10 +1,10 @@ # tests.test_cluster.test_base # Test the cluster base visualizers. 
# -# Author: Rebecca Bilbro +# Author: Rebecca Bilbro # Created: Thu Mar 23 17:38:42 2017 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_base.py [241edca] benjamin@bengfort.com $ @@ -17,41 +17,55 @@ ## Imports ########################################################################## -import unittest +import pytest from yellowbrick.exceptions import YellowbrickTypeError from yellowbrick.cluster.base import ClusteringScoreVisualizer from sklearn.svm import SVC, SVR -from sklearn.linear_model import Ridge, RidgeCV, LinearRegression from sklearn.ensemble import RandomForestClassifier -from sklearn.cluster import KMeans, MiniBatchKMeans, AffinityPropagation from sklearn.cluster import MeanShift, DBSCAN, Birch +from sklearn.linear_model import Ridge, RidgeCV, LinearRegression +from sklearn.cluster import KMeans, MiniBatchKMeans, AffinityPropagation + ########################################################################## ## Clustering Base Test Cases ########################################################################## -class ClusterBaseTests(unittest.TestCase): - def test_clusterer_enforcement(self): +class TestClusterBase(object): + @pytest.mark.parametrize( + "model", [SVC, SVR, Ridge, RidgeCV, LinearRegression, RandomForestClassifier] + ) + def test_clusterer_enforcement_raises(self, model): + """ + Assert that non-cluster models raise a TypeError for cluster visualizers + """ + with pytest.raises(YellowbrickTypeError): + ClusteringScoreVisualizer(model()) + + @pytest.mark.parametrize( + "model", + [KMeans, MiniBatchKMeans, AffinityPropagation, MeanShift, DBSCAN, Birch], + ) + def test_clusterer_enforcement(self, model): """ Assert that only clustering estimators can be passed to cluster viz """ - nomodels = [ - SVC, SVR, Ridge, RidgeCV, LinearRegression, RandomForestClassifier - ] - - for nomodel in nomodels: - with 
self.assertRaises(YellowbrickTypeError): - ClusteringScoreVisualizer(nomodel()) - - models = [ - KMeans, MiniBatchKMeans, AffinityPropagation, MeanShift, DBSCAN, Birch - ] - - for model in models: - try: - ClusteringScoreVisualizer(model()) - except YellowbrickTypeError: - self.fail("could not pass clustering estimator to visualizer") + try: + ClusteringScoreVisualizer(model()) + except YellowbrickTypeError: + self.fail("could not pass clustering estimator to visualizer") + + def test_force_estimator(self): + """ + Test that an estimator can be forced through + """ + with pytest.raises(YellowbrickTypeError): + ClusteringScoreVisualizer(LinearRegression()) + + try: + ClusteringScoreVisualizer(LinearRegression(), force_model=True) + except YellowbrickTypeError as e: + pytest.fail("type error was raised incorrectly: {}".format(e)) diff --git a/tests/test_cluster/test_elbow.py b/tests/test_cluster/test_elbow.py index 3bd16793e..81f3579ca 100644 --- a/tests/test_cluster/test_elbow.py +++ b/tests/test_cluster/test_elbow.py @@ -1,10 +1,10 @@ # tests.test_cluster.test_elbow # Tests for the KElbowVisualizer # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Thu Mar 23 22:30:19 2017 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_elbow.py [5a370c8] benjamin@bengfort.com $ @@ -22,9 +22,6 @@ import numpy as np import matplotlib.pyplot as plt -from ..base import VisualTestCase -from ..dataset import DatasetMixin - from scipy.sparse import csc_matrix, csr_matrix from numpy.testing.utils import assert_array_almost_equal @@ -32,44 +29,62 @@ from sklearn.cluster import KMeans, MiniBatchKMeans from sklearn.feature_extraction.text import TfidfVectorizer +from tests.fixtures import Dataset +from tests.base import VisualTestCase +from yellowbrick.datasets import load_hobbies from yellowbrick.cluster.elbow import distortion_score -from yellowbrick.cluster.elbow 
import KElbowVisualizer -from yellowbrick.exceptions import YellowbrickValueError +from yellowbrick.cluster.elbow import KElbowVisualizer, kelbow_visualizer +from yellowbrick.exceptions import YellowbrickValueError, YellowbrickWarning + +from tests.base import IS_WINDOWS_OR_CONDA try: import pandas as pd except ImportError: pd = None +########################################################################## +## Data +########################################################################## + + +@pytest.fixture(scope="class") +def clusters(request): + # TODO: replace with make_blobs + X = np.array( + [ + [-0.40020753, -4.67055317, -0.27191127, -1.49156318], + [0.37143349, -4.89391622, -1.23893945, 0.48318165], + [8.625142, -1.2372284, 1.39301471, 4.3394457], + [7.65803596, -2.21017215, 1.99175714, 3.71004654], + [0.89319875, -5.37152317, 1.50313598, 1.95284886], + [2.68362166, -5.78810913, -0.41233406, 1.94638989], + [7.63541182, -1.99606076, 0.9241231, 4.53478238], + [9.04699415, -0.74540679, 0.98042851, 5.99569071], + [1.02552122, -5.73874278, -1.74804915, -0.07831216], + [7.18135665, -3.49473178, 1.14300963, 4.46065816], + [0.58812902, -4.66559815, -0.72831685, 1.40171779], + [1.48620862, -5.9963108, 0.19145963, -1.11369256], + [7.6625556, -1.21328083, 2.06361094, 6.2643551], + [9.45050727, -1.36536078, 1.31154384, 3.89103468], + [6.88203724, -1.62040255, 3.89961049, 2.12865388], + [5.60842705, -2.10693356, 1.93328514, 3.90825432], + [2.35150936, -6.62836131, -1.84278374, 0.51540886], + [1.17446451, -5.62506058, -2.18420699, 1.21385128], + ] + ) + + y = np.array([0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0]) + + request.cls.clusters = Dataset(X, y) + ########################################################################## ## K-Elbow Helpers Test Cases ########################################################################## -X = np.array( - [[-0.40020753, -4.67055317, -0.27191127, -1.49156318], - [ 0.37143349, -4.89391622, -1.23893945, 0.48318165], 
- [ 8.625142 , -1.2372284 , 1.39301471, 4.3394457 ], - [ 7.65803596, -2.21017215, 1.99175714, 3.71004654], - [ 0.89319875, -5.37152317, 1.50313598, 1.95284886], - [ 2.68362166, -5.78810913, -0.41233406, 1.94638989], - [ 7.63541182, -1.99606076, 0.9241231 , 4.53478238], - [ 9.04699415, -0.74540679, 0.98042851, 5.99569071], - [ 1.02552122, -5.73874278, -1.74804915, -0.07831216], - [ 7.18135665, -3.49473178, 1.14300963, 4.46065816], - [ 0.58812902, -4.66559815, -0.72831685, 1.40171779], - [ 1.48620862, -5.9963108 , 0.19145963, -1.11369256], - [ 7.6625556 , -1.21328083, 2.06361094, 6.2643551 ], - [ 9.45050727, -1.36536078, 1.31154384, 3.89103468], - [ 6.88203724, -1.62040255, 3.89961049, 2.12865388], - [ 5.60842705, -2.10693356, 1.93328514, 3.90825432], - [ 2.35150936, -6.62836131, -1.84278374, 0.51540886], - [ 1.17446451, -5.62506058, -2.18420699, 1.21385128]] -) - -y = np.array([0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0]) - +@pytest.mark.usefixtures("clusters") class TestKElbowHelper(object): """ Helper functions for K-Elbow Visualizer @@ -79,36 +94,36 @@ def test_distortion_score(self): """ Test the distortion score metric function """ - score = distortion_score(X, y) - assert score == 7.6777850157143783 + score = distortion_score(self.clusters.X, self.clusters.y) + assert score == pytest.approx(69.10006514142941) - @pytest.mark.parametrize("Xs", [ - csc_matrix(X), csr_matrix(X), - ], ids=["csc", "csr"]) - def test_distortion_score_sparse_matrix_input(self, Xs): + @pytest.mark.parametrize("func", [csc_matrix, csr_matrix], ids=["csc", "csr"]) + def test_distortion_score_sparse_matrix_input(self, func): """ Test the distortion score metric on a sparse array """ - score = distortion_score(Xs, y) - assert score == pytest.approx(7.6777850157143783) + score = distortion_score(func(self.clusters.X), self.clusters.y) + assert score == pytest.approx(69.10006514142938) @pytest.mark.skipif(pd is None, reason="pandas is required") def 
test_distortion_score_pandas_input(self): """ Test the distortion score metric on pandas DataFrame and Series """ - df = pd.DataFrame(X) - s = pd.Series(y) + df = pd.DataFrame(self.clusters.X) + s = pd.Series(self.clusters.y) score = distortion_score(df, s) - assert score == pytest.approx(7.6777850157143783) + assert score == pytest.approx(69.10006514142941) ########################################################################## ## KElbowVisualizer Test Cases ########################################################################## -class TestKElbowVisualizer(VisualTestCase, DatasetMixin): + +@pytest.mark.usefixtures("clusters") +class TestKElbowVisualizer(VisualTestCase): """ K-Elbow Visualizer Tests """ @@ -121,9 +136,8 @@ def test_integrated_kmeans_elbow(self): # NOTE #182: cannot use occupancy dataset because of memory usage # Generate a blobs data set - X,y = make_blobs( - n_samples=1000, n_features=12, centers=6, - shuffle=True, random_state=42 + X, y = make_blobs( + n_samples=1000, n_features=12, centers=6, shuffle=True, random_state=42 ) try: @@ -131,7 +145,7 @@ def test_integrated_kmeans_elbow(self): visualizer = KElbowVisualizer(KMeans(random_state=42), k=4, ax=ax) visualizer.fit(X) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer) except Exception as e: @@ -145,18 +159,16 @@ def test_integrated_mini_batch_kmeans_elbow(self): # NOTE #182: cannot use occupancy dataset because of memory usage # Generate a blobs data set - X,y = make_blobs( + X, y = make_blobs( n_samples=1000, n_features=12, centers=6, shuffle=True, random_state=42 ) try: _, ax = plt.subplots() - visualizer = KElbowVisualizer( - MiniBatchKMeans(random_state=42), k=4, ax=ax - ) + visualizer = KElbowVisualizer(MiniBatchKMeans(random_state=42), k=4, ax=ax) visualizer.fit(X) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer) except Exception as e: @@ -167,14 +179,14 @@ def test_topic_modeling_k_means(self): """ Test topic 
modeling k-means on the hobbies corpus """ - corpus = self.load_corpus("hobbies") + corpus = load_hobbies() - tfidf = TfidfVectorizer() - docs = tfidf.fit_transform(corpus.data) + tfidf = TfidfVectorizer() + docs = tfidf.fit_transform(corpus.data) visualizer = KElbowVisualizer(KMeans(), k=(4, 8)) visualizer.fit(docs) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer) @@ -184,20 +196,21 @@ def test_invalid_k(self): """ with pytest.raises(YellowbrickValueError): - KElbowVisualizer(KMeans(), k=(1, 2, 3, 'foo', 5)) + KElbowVisualizer(KMeans(), k=(1, 2, 3, "foo", 5)) with pytest.raises(YellowbrickValueError): KElbowVisualizer(KMeans(), k="foo") def test_valid_k(self): """ - Assert that valid values of K generate correct k_values_: - if k is an int, k_values_ = range(2, k+1) - if k is a tuple of 2 ints, k_values = range(k[0], k[1]) - if k is an iterable, k_values_ = list(k) + Assert that valid values of K generate correct k_values_ """ + # if k is an int, k_values_ = range(2, k+1) + # if k is a tuple of 2 ints, k_values = range(k[0], k[1]) + # if k is an iterable, k_values_ = list(k) + visualizer = KElbowVisualizer(KMeans(), k=8) - assert visualizer.k_values_ == list(np.arange(2, 8+1)) + assert visualizer.k_values_ == list(np.arange(2, 8 + 1)) visualizer = KElbowVisualizer(KMeans(), k=(4, 12)) assert visualizer.k_values_ == list(np.arange(4, 12)) @@ -205,72 +218,117 @@ def test_valid_k(self): visualizer = KElbowVisualizer(KMeans(), k=np.arange(10, 100, 10)) assert visualizer.k_values_ == list(np.arange(10, 100, 10)) - visualizer = KElbowVisualizer(KMeans(), - k=[10, 20, 30, 40, 50, 60, 70, 80, 90]) + visualizer = KElbowVisualizer(KMeans(), k=[10, 20, 30, 40, 50, 60, 70, 80, 90]) assert visualizer.k_values_ == list(np.arange(10, 100, 10)) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def 
test_distortion_metric(self): """ Test the distortion metric of the k-elbow visualizer """ visualizer = KElbowVisualizer( - KMeans(random_state=0), k=5, metric="distortion", timings=False + KMeans(random_state=0), + k=5, + metric="distortion", + timings=False, + locate_elbow=False, ) - visualizer.fit(X) + visualizer.fit(self.clusters.X) - expected = np.array([ 7.677785, 8.364319, 8.893634, 8.013021]) + expected = np.array([69.100065, 54.081571, 43.146921, 34.978487]) assert len(visualizer.k_scores_) == 4 - visualizer.poof() - self.assert_images_similar(visualizer) + visualizer.finalize() + self.assert_images_similar(visualizer, tol=0.03) assert_array_almost_equal(visualizer.k_scores_, expected) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_silhouette_metric(self): """ Test the silhouette metric of the k-elbow visualizer """ visualizer = KElbowVisualizer( - KMeans(random_state=0), k=5, metric="silhouette", timings=False + KMeans(random_state=0), + k=5, + metric="silhouette", + timings=False, + locate_elbow=False, ) - visualizer.fit(X) + visualizer.fit(self.clusters.X) - expected = np.array([ 0.691636, 0.456646, 0.255174, 0.239842]) + expected = np.array([0.691636, 0.456646, 0.255174, 0.239842]) assert len(visualizer.k_scores_) == 4 - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer) assert_array_almost_equal(visualizer.k_scores_, expected) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) - def test_calinski_harabaz_metric(self): + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") + def test_calinski_harabasz_metric(self): """ - Test the calinski-harabaz metric of the k-elbow visualizer + Test the calinski-harabasz metric of the k-elbow visualizer """ visualizer = KElbowVisualizer( - KMeans(random_state=0), k=5, - 
metric="calinski_harabaz", timings=False + KMeans(random_state=0), + k=5, + metric="calinski_harabasz", + timings=False, + locate_elbow=False, ) - visualizer.fit(X) + visualizer.fit(self.clusters.X) assert len(visualizer.k_scores_) == 4 + assert visualizer.elbow_value_ is None - expected = np.array([ - 81.662726256035683, 50.992378259195554, - 40.952179227847012, 35.939494 - ]) - + expected = np.array( + [81.662726256035683, 50.992378259195554, 40.952179227847012, 35.939494] + ) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer) assert_array_almost_equal(visualizer.k_scores_, expected) + def test_locate_elbow(self): + """ + Test the addition of locate_elbow to an image + """ + X, y = make_blobs( + n_samples=1000, n_features=5, centers=3, shuffle=True, random_state=42 + ) + + visualizer = KElbowVisualizer( + KMeans(random_state=0), + k=6, + metric="calinski_harabasz", + timings=False, + locate_elbow=True, + ) + visualizer.fit(X) + assert len(visualizer.k_scores_) == 5 + assert visualizer.elbow_value_ == 3 + expected = np.array( + [4286.479848, 12463.383743, 8766.999551, 6950.08391, 5865.79722] + ) + + visualizer.finalize() + self.assert_images_similar(visualizer, windows_tol=2.2) + assert_array_almost_equal(visualizer.k_scores_, expected) + + def test_no_knee(self): + """ + Assert that a warning is issued if there is no knee detected + """ + X, y = make_blobs(n_samples=1000, centers=3, n_features=12, random_state=12) + message = ( + "No 'knee' or 'elbow point' detected " + "This could be due to bad clustering, no " + "actual clusters being formed etc." 
+ ) + with pytest.warns(YellowbrickWarning, match=message): + visualizer = KElbowVisualizer( + KMeans(random_state=12), k=(4, 12), locate_elbow=True + ) + visualizer.fit(X) + def test_bad_metric(self): """ Assert KElbow raises an exception when a bad metric is supplied @@ -279,16 +337,17 @@ def test_bad_metric(self): KElbowVisualizer(KMeans(), k=5, metric="foo") @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", ) def test_timings(self): """ Test the twinx double axes with k-elbow timings """ visualizer = KElbowVisualizer( - KMeans(random_state=0), k=5, timings=True + KMeans(random_state=0), k=5, timings=True, locate_elbow=False ) - visualizer.fit(X) + visualizer.fit(self.clusters.X) # Check that we kept track of time assert len(visualizer.k_timers_) == 4 @@ -302,13 +361,30 @@ def test_timings(self): # overwrite k_timers_, k_values_ for image similarity Tests visualizer.axes[1].remove() visualizer.k_timers_ = [ - 0.01084589958190918, 0.011144161224365234, - 0.017028093338012695, 0.010634183883666992 + 0.01084589958190918, + 0.011144161224365234, + 0.017028093338012695, + 0.010634183883666992, ] visualizer.k_values_ = [2, 3, 4, 5] # call draw again which is normally called in fit visualizer.draw() - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer) + + @pytest.mark.xfail(reason="images not close due to timing lines") + def test_quick_method(self): + """ + Test the quick method producing a valid visualization + """ + X, y = make_blobs( + n_samples=1000, n_features=12, centers=8, shuffle=False, random_state=2 + ) + + model = MiniBatchKMeans(3, random_state=43) + oz = kelbow_visualizer(model, X, random_state=13, legend=False) + assert isinstance(oz, KElbowVisualizer) + + self.assert_images_similar(oz) diff --git a/tests/test_cluster/test_icdm.py b/tests/test_cluster/test_icdm.py index 9228b66f5..c6ac439da 100644 --- 
a/tests/test_cluster/test_icdm.py +++ b/tests/test_cluster/test_icdm.py @@ -1,10 +1,13 @@ # tests.test_cluster.test_icdm # Tests for the intercluster distance map visualizer. # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Tue Aug 21 11:57:44 2018 -0400 # -# ID: test_icdm.py [] benjamin@bengfort.com $ +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: test_icdm.py [2f23976] benjamin@bengfort.com $ """ Tests for the intercluster distance map visualizer. @@ -14,36 +17,36 @@ ## Imports ########################################################################## -import sys import pytest import matplotlib as mpl from yellowbrick.cluster.icdm import * +from yellowbrick.datasets import load_nfl from yellowbrick.exceptions import YellowbrickValueError -from tests.base import VisualTestCase -from tests.dataset import DatasetMixin, Dataset +from unittest import mock +from tests.fixtures import Dataset +from tests.base import IS_WINDOWS_OR_CONDA, VisualTestCase from sklearn.datasets import make_blobs from sklearn.cluster import Birch, AgglomerativeClustering -from sklearn.cluster import KMeans, AffinityPropagation, MiniBatchKMeans from sklearn.decomposition import LatentDirichletAllocation as LDA +from sklearn.cluster import KMeans, AffinityPropagation, MiniBatchKMeans try: import pandas as pd except ImportError: pd = None - # Determine version of matplotlib MPL_VERS_MAJ = int(mpl.__version__.split(".")[0]) - ########################################################################## ## Fixtures ########################################################################## -@pytest.fixture(scope='class') + +@pytest.fixture(scope="class") def blobs12(request): """ Creates a fixture of 1000 instances in 12 clusters with 16 features. 
@@ -54,7 +57,7 @@ def blobs12(request): request.cls.blobs12 = Dataset(X, y) -@pytest.fixture(scope='class') +@pytest.fixture(scope="class") def blobs4(request): """ Creates a fixture of 400 instances in 4 clusters with 16 features. @@ -66,22 +69,22 @@ def blobs4(request): def assert_fitted(oz): - for param in ('cluster_centers_', 'embedded_centers_', 'scores_', 'fit_time_'): + for param in ("cluster_centers_", "embedded_centers_", "scores_", "fit_time_"): assert hasattr(oz, param) def assert_not_fitted(oz): - for param in ('embedded_centers_', 'scores_', 'fit_time_'): + for param in ("embedded_centers_", "scores_", "fit_time_"): assert not hasattr(oz, param) - ########################################################################## ## InterclusterDistance Test Cases ########################################################################## + @pytest.mark.usefixtures("blobs12", "blobs4") -class TestInterclusterDistance(VisualTestCase, DatasetMixin): +class TestInterclusterDistance(VisualTestCase): """ Test the InterclusterDistance visualizer """ @@ -92,11 +95,11 @@ def test_only_valid_embeddings(self): """ # On init with pytest.raises(YellowbrickValueError, match="unknown embedding 'foo'"): - InterclusterDistance(KMeans(), embedding='foo') + InterclusterDistance(KMeans(), embedding="foo") # After init icdm = InterclusterDistance(KMeans()) - icdm.embedding = 'foo' + icdm.embedding = "foo" with pytest.raises(YellowbrickValueError, match="unknown embedding 'foo'"): icdm.transformer @@ -106,25 +109,29 @@ def test_only_valid_scoring(self): """ # On init with pytest.raises(YellowbrickValueError, match="unknown scoring 'foo'"): - InterclusterDistance(KMeans(), scoring='foo') + InterclusterDistance(KMeans(), scoring="foo") # After init icdm = InterclusterDistance(KMeans()) - icdm.scoring = 'foo' + icdm.scoring = "foo" with pytest.raises(YellowbrickValueError, match="unknown scoring method 'foo'"): icdm._score_clusters(None) + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + 
reason="font rendering different in OS and/or Python; see #892", + ) def test_kmeans_mds(self): """ Visual similarity with KMeans and MDS scaling """ model = KMeans(9, random_state=38) - oz = InterclusterDistance(model, random_state=83, embedding='mds') + oz = InterclusterDistance(model, random_state=83, embedding="mds") # Prefit assertions assert_not_fitted(oz) - assert oz.fit(self.blobs12.X) is oz # Fit returns self + assert oz.fit(self.blobs12.X) is oz # Fit returns self # Postfit assertions assert_fitted(oz) @@ -135,23 +142,26 @@ def test_kmeans_mds(self): # Image similarity oz.finalize() - tol = 4.9 if sys.platform == 'win32' else 1.0 # fails with RMSE 4.740 on AppVeyor - self.assert_images_similar(oz, tol=tol) + self.assert_images_similar(oz) @pytest.mark.filterwarnings("ignore:the matrix subclass is not the recommended way") + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", + ) def test_affinity_tsne_no_legend(self): """ Visual similarity with AffinityPropagation, TSNE scaling, and no legend """ model = AffinityPropagation() oz = InterclusterDistance( - model, random_state=763, embedding='tsne', legend=False + model, random_state=763, embedding="tsne", legend=False ) # Prefit assertions assert_not_fitted(oz) - assert oz.fit(self.blobs4.X) is oz # Fit returns self + assert oz.fit(self.blobs4.X) is oz # Fit returns self # Postfit assertions assert_fitted(oz) @@ -160,9 +170,7 @@ def test_affinity_tsne_no_legend(self): # Image similarity oz.finalize() - tol = 2.75 if sys.platform == 'win32' else 1.0 # fails with RMSE 2.687 on AppVeyor - self.assert_images_similar(oz, tol=tol) - + self.assert_images_similar(oz) @pytest.mark.skip(reason="LDA not implemented yet") def test_lda_mds(self): @@ -170,12 +178,12 @@ def test_lda_mds(self): Visual similarity with LDA and MDS scaling """ model = LDA(9, random_state=6667) - oz = InterclusterDistance(model, random_state=2332, embedding='mds') + oz = 
InterclusterDistance(model, random_state=2332, embedding="mds") # Prefit assertions assert_not_fitted(oz) - assert oz.fit(self.blobs12.X) is oz # Fit returns self + assert oz.fit(self.blobs12.X) is oz # Fit returns self # Postfit assertions assert_fitted(oz) @@ -195,12 +203,12 @@ def test_birch_tsne(self): """ Visual similarity with Birch and MDS scaling """ - oz = InterclusterDistance(Birch(n_clusters=9), random_state=83, embedding='mds') + oz = InterclusterDistance(Birch(n_clusters=9), random_state=83, embedding="mds") # Prefit assertions assert_not_fitted(oz) - assert oz.fit(self.blobs12.X) is oz # Fit returns self + assert oz.fit(self.blobs12.X) is oz # Fit returns self # Postfit assertions assert_fitted(oz) @@ -220,13 +228,13 @@ def test_ward_mds_no_legend(self): """ model = AgglomerativeClustering(n_clusters=9) oz = InterclusterDistance( - model, random_state=83, embedding='tsne', legend=False + model, random_state=83, embedding="tsne", legend=False ) # Prefit assertions assert_not_fitted(oz) - assert oz.fit(self.blobs12.X) is oz # Fit returns self + assert oz.fit(self.blobs12.X) is oz # Fit returns self # Postfit assertions assert_fitted(oz) @@ -239,6 +247,10 @@ def test_ward_mds_no_legend(self): oz.finalize() self.assert_images_similar(oz, tol=1.0) + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", + ) def test_quick_method(self): """ Test the quick method producing a valid visualization @@ -247,31 +259,63 @@ def test_quick_method(self): oz = intercluster_distance(model, self.blobs4.X, random_state=93, legend=False) assert isinstance(oz, InterclusterDistance) - tol = 2.75 if sys.platform == 'win32' else 1.0 # fails with RMSE 2.631 on AppVeyor - self.assert_images_similar(oz, tol=tol) + self.assert_images_similar(oz) - @pytest.mark.skipif(MPL_VERS_MAJ >= 2, reason="test requires mpl earlier than 2.0.2") + @pytest.mark.skipif( + MPL_VERS_MAJ >= 2, reason="test requires mpl earlier than 2.0.2" + ) 
def test_legend_matplotlib_version(self, mock_toolkit): """ ValueError is raised when matplotlib version is incorrect and legend=True """ with pytst.raises(ImportError): from mpl_toolkits.axes_grid1 import inset_locator + assert not inset_locator with pytest.raises(YellowbrickValueError, match="requires matplotlib 2.0.2"): InterclusterDistance(KMeans(), legend=True) - @pytest.mark.skipif(MPL_VERS_MAJ >= 2, reason="test requires mpl earlier than 2.0.2") + @pytest.mark.skipif( + MPL_VERS_MAJ >= 2, reason="test requires mpl earlier than 2.0.2" + ) def test_no_legend_matplotlib_version(self, mock_toolkit): """ No error is raised when matplotlib version is incorrect and legend=False """ with pytst.raises(ImportError): from mpl_toolkits.axes_grid1 import inset_locator + assert not inset_locator try: InterclusterDistance(KMeans(), legend=False) except YellowbrickValueError as e: self.fail(e) + + @pytest.mark.xfail( + reason="""third test fails with AssertionError: Expected fit + to be called once. 
Called 0 times.""" + ) + def test_with_fitted(self): + """ + Test that visualizer properly handles an already-fitted model + """ + X, y = load_nfl(return_dataset=True).to_numpy() + + model = KMeans().fit(X, y) + + with mock.patch.object(model, "fit") as mockfit: + oz = ICDM(model) + oz.fit(X, y) + mockfit.assert_not_called() + + with mock.patch.object(model, "fit") as mockfit: + oz = ICDM(model, is_fitted=True) + oz.fit(X, y) + mockfit.assert_not_called() + + with mock.patch.object(model, "fit") as mockfit: + oz = ICDM(model, is_fitted=False) + oz.fit(X, y) + mockfit.assert_called_once_with(X, y) diff --git a/tests/test_cluster/test_silhouette.py b/tests/test_cluster/test_silhouette.py index 6daa64a29..4df85b7d2 100644 --- a/tests/test_cluster/test_silhouette.py +++ b/tests/test_cluster/test_silhouette.py @@ -1,10 +1,10 @@ # tests.test_cluster.test_silhouette # Tests for the SilhouetteVisualizer # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Mon Mar 27 10:01:37 2017 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_silhouette.py [57b563b] benjamin@bengfort.com $ @@ -19,29 +19,29 @@ import sys import pytest - import matplotlib.pyplot as plt -from ..base import VisualTestCase - from sklearn.datasets import make_blobs from sklearn.cluster import KMeans, MiniBatchKMeans -from yellowbrick.cluster.silhouette import SilhouetteVisualizer +from unittest import mock +from tests.base import VisualTestCase + +from yellowbrick.datasets import load_nfl +from yellowbrick.cluster.silhouette import SilhouetteVisualizer, silhouette_visualizer ########################################################################## ## SilhouetteVisualizer Test Cases ########################################################################## -class SilhouetteVisualizerTests(VisualTestCase): + +class TestSilhouetteVisualizer(VisualTestCase): """ - Silhouette Visualizer + 
Silhouette Visualizer Tests """ - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_integrated_kmeans_silhouette(self): """ Test no exceptions for kmeans silhouette visualizer on blobs dataset @@ -59,15 +59,13 @@ def test_integrated_kmeans_silhouette(self): visualizer = SilhouetteVisualizer(KMeans(random_state=0), ax=ax) visualizer.fit(X) - visualizer.poof() + visualizer.finalize() - self.assert_images_similar(visualizer) + self.assert_images_similar(visualizer, remove_legend=True) except Exception as e: self.fail("error during silhouette: {}".format(e)) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_integrated_mini_batch_kmeans_silhouette(self): """ Test no exceptions for mini-batch kmeans silhouette visualizer @@ -85,17 +83,134 @@ def test_integrated_mini_batch_kmeans_silhouette(self): visualizer = SilhouetteVisualizer(MiniBatchKMeans(random_state=0), ax=ax) visualizer.fit(X) - visualizer.poof() + visualizer.finalize() - self.assert_images_similar(visualizer) + self.assert_images_similar(visualizer, remove_legend=True) except Exception as e: self.fail("error during silhouette: {}".format(e)) - @pytest.mark.skip( - reason="no negative silhouette example available yet" - ) + @pytest.mark.skip(reason="no negative silhouette example available yet") def test_negative_silhouette_score(self): """ Ensure negative silhouette scores are correctly displayed by the visualizer. 
""" - raise NotImplementedError("no negative silhouette example available") \ No newline at end of file + raise NotImplementedError("no negative silhouette example available") + + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") + def test_colormap_silhouette(self): + """ + Test no exceptions for modifying the colormap in a silhouette visualizer + """ + # Generate a blobs data set + X, y = make_blobs( + n_samples=1000, n_features=12, centers=8, shuffle=False, random_state=0 + ) + + try: + fig = plt.figure() + ax = fig.add_subplot() + + visualizer = SilhouetteVisualizer( + MiniBatchKMeans(random_state=0), ax=ax, colormap="gnuplot" + ) + visualizer.fit(X) + visualizer.finalize() + + self.assert_images_similar(visualizer, remove_legend=True) + except Exception as e: + self.fail("error during silhouette: {}".format(e)) + + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") + def test_colors_silhouette(self): + """ + Test no exceptions for modifying the colors in a silhouette visualizer + with a list of color names + """ + # Generate a blobs data set + X, y = make_blobs( + n_samples=1000, n_features=12, centers=8, shuffle=False, random_state=0 + ) + + try: + fig = plt.figure() + ax = fig.add_subplot() + + visualizer = SilhouetteVisualizer( + MiniBatchKMeans(random_state=0), + ax=ax, + colors=["red", "green", "blue", "indigo", "cyan", "lavender"], + ) + visualizer.fit(X) + visualizer.finalize() + + self.assert_images_similar(visualizer, remove_legend=True) + except Exception as e: + self.fail("error during silhouette: {}".format(e)) + + def test_colormap_as_colors_silhouette(self): + """ + Test no exceptions for modifying the colors in a silhouette visualizer + by using a matplotlib colormap as colors + """ + # Generate a blobs data set + X, y = make_blobs( + n_samples=1000, n_features=12, centers=8, shuffle=False, random_state=0 + ) + + try: + fig = plt.figure() + ax = fig.add_subplot() + + visualizer = 
SilhouetteVisualizer( + MiniBatchKMeans(random_state=0), ax=ax, colors="cool" + ) + visualizer.fit(X) + visualizer.finalize() + + tol = ( + 3.2 if sys.platform == "win32" else 0.01 + ) # Fails on AppVeyor with RMS 3.143 + self.assert_images_similar(visualizer, remove_legend=True, tol=tol) + except Exception as e: + self.fail("error during silhouette: {}".format(e)) + + def test_quick_method(self): + """ + Test the quick method producing a valid visualization + """ + X, y = make_blobs( + n_samples=1000, n_features=12, centers=8, shuffle=False, random_state=0 + ) + + model = MiniBatchKMeans(3, random_state=343) + oz = silhouette_visualizer(model, X, random_state=93, legend=False) + assert isinstance(oz, SilhouetteVisualizer) + + self.assert_images_similar(oz) + + @pytest.mark.xfail( + reason="""third test fails with AssertionError: Expected fit + to be called once. Called 0 times.""" + ) + def test_with_fitted(self): + """ + Test that visualizer properly handles an already-fitted model + """ + X, y = load_nfl(return_dataset=True).to_numpy() + + model = MiniBatchKMeans().fit(X, y) + + with mock.patch.object(model, "fit") as mockfit: + oz = SilhouetteVisualizer(model) + oz.fit(X, y) + mockfit.assert_not_called() + + with mock.patch.object(model, "fit") as mockfit: + oz = SilhouetteVisualizer(model, is_fitted=True) + oz.fit(X, y) + mockfit.assert_not_called() + + with mock.patch.object(model, "fit") as mockfit: + oz = SilhouetteVisualizer(model, is_fitted=False) + oz.fit(X, y) + mockfit.assert_called_once_with(X, y) diff --git a/tests/test_contrib/__init__.py b/tests/test_contrib/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_contrib/test_classifier/__init__.py b/tests/test_contrib/test_classifier/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_contrib/test_classifier/test_boundaries.py b/tests/test_contrib/test_classifier/test_boundaries.py index 02ae5e894..f901a1491 100644 --- 
a/tests/test_contrib/test_classifier/test_boundaries.py +++ b/tests/test_contrib/test_classifier/test_boundaries.py @@ -4,10 +4,10 @@ # Author: Author: Nathan Danielsen # Created: Sun Mar 19 13:01:29 2017 -0400 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: test_boundaries.py [] nathan.danielsen@gmail.com $ +# ID: test_boundaries.py [a60bc41] nathan.danielsen@gmail.com $ """ Ensure that the Decision Boundary visualizations work. """ @@ -17,10 +17,10 @@ ########################################################################## import sys -import six import pytest import numpy as np +from unittest import mock from tests.base import VisualTestCase from yellowbrick.contrib.classifier import * @@ -31,11 +31,6 @@ from sklearn import neighbors from sklearn import naive_bayes -try: - from unittest import mock -except ImportError: - import mock - try: import pandas as pd except ImportError: @@ -46,20 +41,22 @@ # Data ########################################################################## -X = np.array([ - [2.318, 2.727, 4.260, 7.212, 4.792, ], - [2.315, 2.726, 4.295, 7.140, 4.783, ], - [2.315, 2.724, 4.260, 7.135, 4.779, ], - [2.110, 3.609, 4.330, 7.985, 5.595, ], - [2.110, 3.626, 4.330, 8.203, 5.621, ], - [2.110, 3.620, 4.470, 8.210, 5.612, ], - [2.318, 2.727, 4.260, 7.212, 4.792, ], - [2.315, 2.726, 4.295, 7.140, 4.783, ], - [2.315, 2.724, 4.260, 7.135, 4.779, ], - [2.110, 3.609, 4.330, 7.985, 5.595, ], - [2.110, 3.626, 4.330, 8.203, 5.621, ], - [2.110, 3.620, 4.470, 8.210, 5.612, ] - ]) +X = np.array( + [ + [2.318, 2.727, 4.260, 7.212, 4.792], + [2.315, 2.726, 4.295, 7.140, 4.783], + [2.315, 2.724, 4.260, 7.135, 4.779], + [2.110, 3.609, 4.330, 7.985, 5.595], + [2.110, 3.626, 4.330, 8.203, 5.621], + [2.110, 3.620, 4.470, 8.210, 5.612], + [2.318, 2.727, 4.260, 7.212, 4.792], + [2.315, 2.726, 4.295, 7.140, 4.783], + [2.315, 2.724, 4.260, 7.135, 4.779], + [2.110, 3.609, 4.330, 
7.985, 5.595], + [2.110, 3.626, 4.330, 8.203, 5.621], + [2.110, 3.620, 4.470, 8.210, 5.612], + ] +) y = np.array([1, 2, 1, 2, 1, 0, 0, 1, 3, 1, 3, 2]) @@ -70,13 +67,13 @@ ########################################################################## -@pytest.mark.filterwarnings('ignore') -class DecisionBoundariesVisualizerTest(VisualTestCase): +@pytest.mark.filterwarnings("ignore") +class TestDecisionBoundariesVisualizer(VisualTestCase): """ - DecisionBoundariesVisualizer + Test DecisionBoundariesVisualizer """ - def test_decision_bounardies(self): + def test_decision_boundaries(self): """ Assert no errors during kNN DecisionBoundariesVisualizer integration """ @@ -85,13 +82,20 @@ def test_decision_bounardies(self): viz.fit_draw_poof(X_two_cols, y=y) def test_deprecated(self): + """ + Assert the DecisionViz class issues deprecation warning + """ with pytest.deprecated_call(): model = neighbors.KNeighborsClassifier(3) DecisionViz(model) - @pytest.mark.skipif(six.PY2, reason="deprecation warnings filtered in PY2") def test_deprecated_message(self): - with pytest.warns(DeprecationWarning, match='Will be moved to yellowbrick.contrib in v0.8'): + """ + Test the deprecation warning message + """ + with pytest.warns( + DeprecationWarning, match="Will be moved to yellowbrick.contrib in v0.8" + ): model = neighbors.KNeighborsClassifier(3) DecisionViz(model) @@ -102,24 +106,23 @@ def test_init(self): model = neighbors.KNeighborsClassifier(3) viz = DecisionBoundariesVisualizer(model) - self.assertEquals(viz.step_size, 0.0025) - self.assertEqual(viz.name, 'KNeighborsClassifier') - self.assertEqual(viz.estimator, model) - - self.assertIsNone(viz.classes_) - self.assertIsNone(viz.features_) - self.assertIsNotNone(viz.markers) - self.assertIsNotNone(viz.scatter_alpha) - self.assertTrue(viz.show_scatter) + assert viz.step_size == 0.0025 + assert viz.name == "KNeighborsClassifier" + assert viz.estimator is model - self.assertIsNone(viz.Z) - self.assertIsNone(viz.xx) - 
self.assertIsNone(viz.yy) - self.assertIsNone(viz.class_labels) - self.assertIsNone(viz.title) - self.assertIsNone(viz.x) - self.assertIsNone(viz.y) + assert not hasattr(viz, "classes_") + assert viz.features_ is None + assert viz.markers is not None + assert viz.scatter_alpha is not None + assert viz.show_scatter is True + assert viz.Z is None + assert viz.xx is None + assert viz.yy is None + assert viz.class_labels is None + assert viz.title is None + assert viz.x is None + assert viz.y is None def test_scatter_xy_and_features_raise_error(self): """ @@ -128,19 +131,16 @@ def test_scatter_xy_and_features_raise_error(self): model = neighbors.KNeighborsClassifier(3) features = ["temperature", "relative_humidity", "light"] - with self.assertRaises(YellowbrickValueError): - DecisionBoundariesVisualizer( - model, features=features, x='one', y='two' - ) + with pytest.raises(YellowbrickValueError): + DecisionBoundariesVisualizer(model, features=features, x="one", y="two") def test_scatter_xy_changes_to_features(self): """ Assert that x,y and features will raise error """ model = neighbors.KNeighborsClassifier(3) - visualizer = DecisionBoundariesVisualizer(model, x='one', y='two') - self.assertEquals(visualizer.features_, ['one', 'two']) - + visualizer = DecisionBoundariesVisualizer(model, x="one", y="two") + assert visualizer.features_ == ["one", "two"] def test_fit(self): """ @@ -154,17 +154,17 @@ def test_fit(self): fitted_viz = viz.fit(X_two_cols, y=y) # assert that classes and labels are established - self.assertEqual(fitted_viz.classes_, {0: '0', 1: '1', 2: '2', 3: '3'}) - self.assertEqual(fitted_viz.features_, ['Feature One', 'Feature Two']) + assert fitted_viz.classes_ == {0: "0", 1: "1", 2: "2", 3: "3"} + assert fitted_viz.features_ == ["Feature One", "Feature Two"] # assert that the fit method is called model.fit.assert_called_once_with(X_two_cols, y) # mock object is called twice in predict and reshape - self.assertEqual(len(model.predict.mock_calls), 2) + 
assert len(model.predict.mock_calls) == 2 # test that attrs are set - self.assertIsNotNone(fitted_viz.ax) - self.assertIsNotNone(fitted_viz.Z_shape) + assert fitted_viz.ax is not None + assert fitted_viz.Z_shape is not None def test_fit_class_labels(self): """ @@ -172,13 +172,15 @@ def test_fit_class_labels(self): """ model = neighbors.KNeighborsClassifier(3) viz = DecisionBoundariesVisualizer( - model, classes=['one', 'two', 'three', 'four']) + model, classes=["one", "two", "three", "four"] + ) fitted_viz = viz.fit(X_two_cols, y=y) - self.assertEquals(fitted_viz.classes_, - {'three': '2', - 'four': '3', - 'two': '1', - 'one': '0'}) + assert fitted_viz.classes_ == { + "three": "2", + "four": "3", + "two": "1", + "one": "0", + } def test_fit_class_labels_class_names_edge_case(self): """ @@ -186,8 +188,11 @@ def test_fit_class_labels_class_names_edge_case(self): """ model = neighbors.KNeighborsClassifier(3) viz = DecisionBoundariesVisualizer( - model, classes=['one', 'two', 'three', 'four', 'five']) - self.assertRaises(YellowbrickTypeError, viz.fit, X_two_cols, y=y) + model, classes=["one", "two", "three", "four", "five"] + ) + + with pytest.raises(YellowbrickTypeError): + viz.fit(X_two_cols, y=y) def test_fit_features_assignment_None(self): """ @@ -195,18 +200,18 @@ def test_fit_features_assignment_None(self): """ model = neighbors.KNeighborsClassifier(3) viz = DecisionBoundariesVisualizer(model) - self.assertIsNone(viz.features_) + assert viz.features_ is None fitted_viz = viz.fit(X_two_cols, y=y) - self.assertEquals(fitted_viz.features_, ['Feature One', 'Feature Two']) + assert fitted_viz.features_ == ["Feature One", "Feature Two"] def test_fit_features_assignment(self): """ Test fit when features are specified """ model = neighbors.KNeighborsClassifier(3) - viz = DecisionBoundariesVisualizer(model, features=['one', 'two']) + viz = DecisionBoundariesVisualizer(model, features=["one", "two"]) fitted_viz = viz.fit(X_two_cols, y=y) - 
self.assertEquals(fitted_viz.features_, ['one', 'two']) + assert fitted_viz.features_ == ["one", "two"] @mock.patch("yellowbrick.contrib.classifier.boundaries.OrderedDict") def test_draw_ordereddict_calls(self, mock_odict): @@ -215,9 +220,12 @@ def test_draw_ordereddict_calls(self, mock_odict): """ mock_odict.return_value = {} model = neighbors.KNeighborsClassifier(3) - viz = DecisionBoundariesVisualizer(model, features=['one', 'two']) - self.assertRaises(KeyError, viz.fit_draw, X_two_cols, y=y) - self.assertEquals(len(mock_odict.mock_calls), 2) + viz = DecisionBoundariesVisualizer(model, features=["one", "two"]) + + with pytest.raises(KeyError): + viz.fit_draw(X_two_cols, y=y) + + assert len(mock_odict.mock_calls) == 2 @mock.patch("yellowbrick.contrib.classifier.boundaries.resolve_colors") def test_draw_ordereddict_calls_one(self, mock_resolve_colors): @@ -226,16 +234,19 @@ def test_draw_ordereddict_calls_one(self, mock_resolve_colors): """ mock_resolve_colors.return_value = [] model = neighbors.KNeighborsClassifier(3) - viz = DecisionBoundariesVisualizer(model, features=['one', 'two']) - self.assertRaises(StopIteration, viz.fit_draw, X_two_cols, y=y) - self.assertEquals(len(mock_resolve_colors.mock_calls), 1) + viz = DecisionBoundariesVisualizer(model, features=["one", "two"]) + + with pytest.raises(StopIteration): + viz.fit_draw(X_two_cols, y=y) + + assert len(mock_resolve_colors.mock_calls) == 1 def test_draw_ax_show_scatter_true(self): """ Test that the matplotlib functions are being called """ model = neighbors.KNeighborsClassifier(3) - viz = DecisionBoundariesVisualizer(model, features=['one', 'two']) + viz = DecisionBoundariesVisualizer(model, features=["one", "two"]) fitted_viz = viz.fit(X_two_cols, y=y) fitted_viz.ax = mock.Mock() fitted_viz.ax.pcolormesh = mock.MagicMock() @@ -243,9 +254,9 @@ def test_draw_ax_show_scatter_true(self): fitted_viz.ax.legend = mock.MagicMock() fitted_viz.draw(X_two_cols, y=y) - 
self.assertEquals(len(fitted_viz.ax.pcolormesh.mock_calls), 1) - self.assertEquals(len(fitted_viz.ax.scatter.mock_calls), 4) - self.assertEquals(len(fitted_viz.ax.legend.mock_calls), 0) + assert len(fitted_viz.ax.pcolormesh.mock_calls) == 1 + assert len(fitted_viz.ax.scatter.mock_calls) == 4 + assert len(fitted_viz.ax.legend.mock_calls) == 0 def test_draw_ax_show_scatter_False(self): """ @@ -253,7 +264,8 @@ def test_draw_ax_show_scatter_False(self): """ model = neighbors.KNeighborsClassifier(3) viz = DecisionBoundariesVisualizer( - model, features=['one', 'two'], show_scatter=False) + model, features=["one", "two"], show_scatter=False + ) fitted_viz = viz.fit(X_two_cols, y=y) fitted_viz.ax = mock.Mock() fitted_viz.ax.pcolormesh = mock.MagicMock() @@ -261,9 +273,9 @@ def test_draw_ax_show_scatter_False(self): fitted_viz.ax.legend = mock.MagicMock() fitted_viz.draw(X_two_cols, y=y) - self.assertEquals(len(fitted_viz.ax.pcolormesh.mock_calls), 1) - self.assertEquals(len(fitted_viz.ax.scatter.mock_calls), 0) - self.assertEquals(len(fitted_viz.ax.legend.mock_calls), 1) + assert len(fitted_viz.ax.pcolormesh.mock_calls) == 1 + assert len(fitted_viz.ax.scatter.mock_calls) == 0 + assert len(fitted_viz.ax.legend.mock_calls) == 1 def test_finalize(self): """ @@ -271,7 +283,8 @@ def test_finalize(self): """ model = neighbors.KNeighborsClassifier(3) viz = DecisionBoundariesVisualizer( - model, features=['one', 'two'], show_scatter=False) + model, features=["one", "two"], show_scatter=False + ) fitted_viz = viz.fit(X_two_cols, y=y) fitted_viz.draw(X_two_cols, y=y) @@ -280,11 +293,11 @@ def test_finalize(self): fitted_viz.ax.set_xlabel = mock.MagicMock() fitted_viz.ax.set_ylabel = mock.MagicMock() - fitted_viz.poof() + fitted_viz.finalize() - fitted_viz.ax.legend.assert_called_once_with(loc='best', frameon=True) - fitted_viz.ax.set_xlabel.assert_called_once_with('one') - fitted_viz.ax.set_ylabel.assert_called_once_with('two') + 
fitted_viz.ax.legend.assert_called_once_with(loc="best", frameon=True) + fitted_viz.ax.set_xlabel.assert_called_once_with("one") + fitted_viz.ax.set_ylabel.assert_called_once_with("two") def test_fit_draw(self): """ @@ -292,7 +305,8 @@ def test_fit_draw(self): """ model = neighbors.KNeighborsClassifier(3) viz = DecisionBoundariesVisualizer( - model, features=['one', 'two'], show_scatter=False) + model, features=["one", "two"], show_scatter=False + ) viz.fit = mock.Mock() viz.draw = mock.Mock() @@ -308,7 +322,8 @@ def test_fit_draw_poof(self): """ model = neighbors.KNeighborsClassifier(3) viz = DecisionBoundariesVisualizer( - model, features=['one', 'two'], show_scatter=False) + model, features=["one", "two"], show_scatter=False + ) viz.fit = mock.Mock() viz.draw = mock.Mock() @@ -320,34 +335,39 @@ def test_fit_draw_poof(self): viz.draw.assert_called_once_with(X_two_cols, y) viz.poof.assert_called_once_with() - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(reason="numpy structured arrays have changed since v1.14") def test_integrated_plot_numpy_named_arrays(self): """ Test integration of visualizer with numpy named arrays """ model = naive_bayes.MultinomialNB() - X = np.array([ - (1.1, 9.52, 1.23, 0.86, 7.89, 0.13), - (3.4, 2.84, 8.65, 0.45, 7.43, 0.16), - (1.2, 3.22, 6.56, 0.24, 3.45, 0.17), - (3.8, 6.18, 2.45, 0.28, 2.53, 0.13), - (5.1, 9.12, 1.06, 0.19, 1.43, 0.13), - (4.4, 8.84, 4.97, 0.98, 1.35, 0.13), - (3.2, 3.22, 5.03, 0.68, 3.53, 0.32), - (7.8, 2.18, 6.87, 0.35, 3.25, 0.38), - ], dtype=[('a',' # Created: Thu Mar 29 12:13:04 2018 -0500 # -# Copyright (C) 2018 District Data Labs +# Copyright (C) 2018 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: test_bar.py [7d3f5e6] nathan.danielsen@gmail.com $ +# ID: test_bar.py [1443e16] ndanielsen@users.noreply.github.com $ """ Tests for the MissingValuesBar visualizations. 
@@ -18,6 +18,9 @@ ########################################################################## import os +import pytest +import numpy as np + from tests.base import VisualTestCase from sklearn.datasets import make_classification from yellowbrick.contrib.missing.bar import * @@ -27,28 +30,35 @@ except ImportError: pd = None + +@pytest.fixture(scope="class") +def missing_bar_tolerance(request): + request.cls.tol = 0.5 if os.name == "nt" else 0.01 + + ########################################################################## ## Feature Importances Tests ########################################################################## + +@pytest.mark.usefixtures("missing_bar_tolerance") class TestMissingBarVisualizer(VisualTestCase): """ FeatureImportances visualizer """ - def setUp(self): - super(TestMissingBarVisualizer, self).setUp() - self.tol = 0.01 - if os.name == 'nt': # Windows - self.tol = 0.5 - def test_missingvaluesbar_pandas(self): """ Integration test of visualizer with pandas """ X, y = make_classification( - n_samples=400, n_features=20, n_informative=8, n_redundant=8, - n_classes=2, n_clusters_per_class=4, random_state=854 + n_samples=400, + n_features=20, + n_informative=8, + n_redundant=8, + n_classes=2, + n_clusters_per_class=4, + random_state=854, ) # add nan values to a range of values in the matrix @@ -58,18 +68,22 @@ def test_missingvaluesbar_pandas(self): features = [str(n) for n in range(20)] viz = MissingValuesBar(features=features) viz.fit(X_) - viz.poof() + viz.finalize() self.assert_images_similar(viz, tol=self.tol) - def test_missingvaluesbar_numpy(self): """ Integration test of visualizer with numpy without target y passed in """ X, y = make_classification( - n_samples=400, n_features=20, n_informative=8, n_redundant=8, - n_classes=2, n_clusters_per_class=4, random_state=856 + n_samples=400, + n_features=20, + n_informative=8, + n_redundant=8, + n_classes=2, + n_clusters_per_class=4, + random_state=856, ) # add nan values to a range of values in 
the matrix @@ -78,7 +92,7 @@ def test_missingvaluesbar_numpy(self): features = [str(n) for n in range(20)] viz = MissingValuesBar(features=features) viz.fit(X) - viz.poof() + viz.finalize() self.assert_images_similar(viz, tol=self.tol) @@ -88,8 +102,13 @@ def test_missingvaluesbar_numpy_with_y_target(self): but no class labels """ X, y = make_classification( - n_samples=400, n_features=20, n_informative=8, n_redundant=8, - n_classes=2, n_clusters_per_class=4, random_state=856 + n_samples=400, + n_features=20, + n_informative=8, + n_redundant=8, + n_classes=2, + n_clusters_per_class=4, + random_state=856, ) # add nan values to a range of values in the matrix @@ -98,7 +117,7 @@ def test_missingvaluesbar_numpy_with_y_target(self): features = [str(n) for n in range(20)] viz = MissingValuesBar(features=features) viz.fit(X, y) - viz.poof() + viz.finalize() self.assert_images_similar(viz, tol=self.tol) @@ -108,16 +127,21 @@ def test_missingvaluesbar_numpy_with_y_target_with_labels(self): but no class labels """ X, y = make_classification( - n_samples=400, n_features=20, n_informative=8, n_redundant=8, - n_classes=2, n_clusters_per_class=4, random_state=856 + n_samples=400, + n_features=20, + n_informative=8, + n_redundant=8, + n_classes=2, + n_clusters_per_class=4, + random_state=856, ) # add nan values to a range of values in the matrix X[X > 1.5] = np.nan features = [str(n) for n in range(20)] - viz = MissingValuesBar(features=features, classes=['class A', 'class B']) + viz = MissingValuesBar(features=features, classes=["class A", "class B"]) viz.fit(X, y) - viz.poof() + viz.finalize() self.assert_images_similar(viz, tol=self.tol) diff --git a/tests/test_contrib/test_missing/test_dispersion.py b/tests/test_contrib/test_missing/test_dispersion.py index 0636f7a70..7db9a525a 100644 --- a/tests/test_contrib/test_missing/test_dispersion.py +++ b/tests/test_contrib/test_missing/test_dispersion.py @@ -4,10 +4,10 @@ # Author: Nathan Danielsen # Created: Thu Mar 29 12:13:04 2018 
-0500 # -# Copyright (C) 2018 District Data Labs +# Copyright (C) 2018 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: test_dispersion.py [7d3f5e6] nathan.danielsen@gmail.com $ +# ID: test_dispersion.py [1443e16] ndanielsen@users.noreply.github.com $ """ Tests for the MissingValuesDispersion visualizations. @@ -16,7 +16,10 @@ ########################################################################## ## Imports ########################################################################## + import os +import pytest + from sklearn.datasets import make_classification from tests.base import VisualTestCase @@ -27,28 +30,35 @@ except ImportError: pd = None + +@pytest.fixture(scope="class") +def missing_dispersion_tolerance(request): + request.cls.tol = 0.5 if os.name == "nt" else 0.01 + + ########################################################################## ## Feature Importances Tests ########################################################################## -class MissingValuesDispersionTestCase(VisualTestCase): + +@pytest.mark.usefixtures("missing_dispersion_tolerance") +class TestMissingValuesDispersion(VisualTestCase): """ MissingValuesDispersion visualizer """ - def setUp(self): - super(MissingValuesDispersionTestCase, self).setUp() - self.tol = 0.01 - if os.name == 'nt': # Windows - self.tol = 5.0 - def test_missingvaluesdispersion_with_pandas(self): """ Integration test of visualizer with pandas """ X, y = make_classification( - n_samples=400, n_features=20, n_informative=8, n_redundant=8, - n_classes=2, n_clusters_per_class=4, random_state=854 + n_samples=400, + n_features=20, + n_informative=8, + n_redundant=8, + n_classes=2, + n_clusters_per_class=4, + random_state=854, ) # add nan values to a range of values in the matrix @@ -58,7 +68,7 @@ def test_missingvaluesdispersion_with_pandas(self): features = [str(n) for n in range(20)] viz = MissingValuesDispersion(features=features) viz.fit(X_) - viz.poof() + viz.finalize() 
self.assert_images_similar(viz, tol=self.tol) @@ -67,8 +77,13 @@ def test_missingvaluesdispersion_with_pandas_with_y_targets(self): Integration test of visualizer with pandas with y targets """ X, y = make_classification( - n_samples=400, n_features=20, n_informative=8, n_redundant=8, - n_classes=2, n_clusters_per_class=4, random_state=854 + n_samples=400, + n_features=20, + n_informative=8, + n_redundant=8, + n_classes=2, + n_clusters_per_class=4, + random_state=854, ) # add nan values to a range of values in the matrix @@ -76,21 +91,25 @@ def test_missingvaluesdispersion_with_pandas_with_y_targets(self): X_ = pd.DataFrame(X) features = [str(n) for n in range(20)] - classes = ['Class A', 'Class B'] + classes = ["Class A", "Class B"] viz = MissingValuesDispersion(features=features, classes=classes) viz.fit(X_, y=y) - viz.poof() + viz.finalize() self.assert_images_similar(viz, tol=self.tol) - def test_missingvaluesdispersion_with_numpy(self): """ Integration test of visualizer with numpy """ X, y = make_classification( - n_samples=400, n_features=20, n_informative=8, n_redundant=8, - n_classes=2, n_clusters_per_class=4, random_state=852 + n_samples=400, + n_features=20, + n_informative=8, + n_redundant=8, + n_classes=2, + n_clusters_per_class=4, + random_state=852, ) # add nan values to a range of values in the matrix @@ -99,7 +118,7 @@ def test_missingvaluesdispersion_with_numpy(self): features = [str(n) for n in range(20)] viz = MissingValuesDispersion(features=features) viz.fit(X) - viz.poof() + viz.finalize() self.assert_images_similar(viz, tol=self.tol) @@ -108,17 +127,22 @@ def test_missingvaluesdispersion_with_numpy_with_y_targets(self): Integration test of visualizer with numpy with y targets """ X, y = make_classification( - n_samples=400, n_features=20, n_informative=8, n_redundant=8, - n_classes=2, n_clusters_per_class=4, random_state=852 + n_samples=400, + n_features=20, + n_informative=8, + n_redundant=8, + n_classes=2, + n_clusters_per_class=4, + 
random_state=852, ) # add nan values to a range of values in the matrix X[X > 1.5] = np.nan features = [str(n) for n in range(20)] - classes = ['Class A', 'Class B'] + classes = ["Class A", "Class B"] viz = MissingValuesDispersion(features=features, classes=classes) viz.fit(X, y=y) - viz.poof() + viz.finalize() self.assert_images_similar(viz, tol=self.tol) diff --git a/tests/test_contrib/test_scatter.py b/tests/test_contrib/test_scatter.py index 082d2e992..28c15b8eb 100644 --- a/tests/test_contrib/test_scatter.py +++ b/tests/test_contrib/test_scatter.py @@ -1,13 +1,13 @@ # tests.test_contrib.test_scatter # Test the ScatterViz feature analysis visualizers # -# Author: Nathan Danielsen +# Author: Nathan Danielsen # Created: Fri Feb 26 19:40:00 2017 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: test_scatter.py [fc94ec4] ndanielsen@users.noreply.github.com $ +# ID: test_scatter.py [a89633e] benjamin@bengfort.com $ """ Test the ScatterViz feature analysis visualizers """ @@ -16,36 +16,32 @@ # Imports ########################################################################## -import sys import pytest import numpy as np -import matplotlib as mptl +from unittest import mock +from tests.base import VisualTestCase from yellowbrick.contrib.scatter import * +from yellowbrick.datasets import load_occupancy from yellowbrick.exceptions import YellowbrickValueError from yellowbrick.style import palettes -from tests.dataset import DatasetMixin -from tests.base import VisualTestCase -from yellowbrick.exceptions import ImageComparisonFailure - try: import pandas as pd except ImportError: pd = None -try: - from unittest import mock -except ImportError: - import mock - ########################################################################## # ScatterViz Base Tests ########################################################################## -@pytest.mark.filterwarnings('ignore') -class 
ScatterVizTests(VisualTestCase, DatasetMixin): + +@pytest.mark.filterwarnings("ignore") +class TestScatterViz(VisualTestCase): + """ + Test ScatterViz + """ # yapf: disable X = np.array([ @@ -55,32 +51,24 @@ class ScatterVizTests(VisualTestCase, DatasetMixin): [2.110, 3.609, 4.330, 7.985, 5.595, ], [2.110, 3.626, 4.330, 8.203, 5.621, ], [2.110, 3.620, 4.470, 8.210, 5.612, ] - ]) + ]) # yapf: enable y = np.array([1, 0, 1, 0, 1, 0]) - def setUp(self): - self.occupancy = self.load_data('occupancy') - super(ScatterVizTests, self).setUp() - - def tearDown(self): - self.occupancy = None - super(ScatterVizTests, self).tearDown() - def test_init_alias(self): """ Test alias for ScatterViz """ - features = ["temperature", "relative_humidity"] - visualizer = ScatterVisualizer(features=features, markers=['*']) - self.assertIsNotNone(visualizer.markers) + features = ["temperature", "relative humidity"] + visualizer = ScatterVisualizer(features=features, markers=["*"]) + assert visualizer.markers is not None def test_scatter(self): """ Assert no errors occur during scatter visualizer integration """ X_two_cols = self.X[:, :2] - features = ["temperature", "relative_humidity"] + features = ["temperature", "relative humidity"] visualizer = ScatterViz(features=features) visualizer.fit_transform(X_two_cols, self.y) @@ -88,9 +76,9 @@ def test_color_builds(self): """ Assert no errors occur during scatter visualizer integration """ - colors = palettes.PALETTES['pastel'] + colors = palettes.PALETTES["pastel"] X_two_cols = self.X[:, :2] - features = ["temperature", "relative_humidity"] + features = ["temperature", "relative humidity"] visualizer = ScatterViz(features=features, color=colors) visualizer.fit_transform(X_two_cols, self.y) @@ -101,61 +89,50 @@ def test_scatter_no_features(self): X_two_cols = self.X[:, :2] visualizer = ScatterViz() visualizer.fit_transform_poof(X_two_cols, self.y) - self.assertEquals(visualizer.features_, ['Feature One', 'Feature Two']) + assert 
visualizer.features_ == ["Feature One", "Feature Two"] def test_scatter_only_two_features_allowed_init(self): """ Assert that only two features are allowed for scatter visualizer init """ - features = ["temperature", "relative_humidity", "light"] + features = ["temperature", "relative humidity", "light"] - with self.assertRaises(YellowbrickValueError): + with pytest.raises(YellowbrickValueError): ScatterViz(features=features) def test_scatter_xy_and_features_raise_error(self): """ Assert that x,y and features will raise scatterviz error """ - features = ["temperature", "relative_humidity", "light"] + features = ["temperature", "relative humidity", "light"] - with self.assertRaises(YellowbrickValueError): - ScatterViz(features=features, x='one', y='two') + with pytest.raises(YellowbrickValueError): + ScatterViz(features=features, x="one", y="two") def test_scatter_xy_changes_to_features(self): """ Assert that x,y with no features will not raise scatterviz error """ - visualizer = ScatterViz(x='one', y='two') - self.assertEquals(visualizer.features_, ['one', 'two']) + visualizer = ScatterViz(x="one", y="two") + assert visualizer.features == ["one", "two"] def test_scatter_requires_two_features_in_numpy_matrix(self): """ Assert only two features allowed for scatter visualizer if not in init """ visualizer = ScatterViz() - with self.assertRaises(YellowbrickValueError) as context: + with pytest.raises(YellowbrickValueError, match="only accepts two features"): visualizer.fit_transform(self.X, self.y) - self.assertTrue( - 'only accepts two features' in str(context.exception)) - @pytest.mark.xfail( - sys.platform == 'win32', reason="Changing the dtype to a subarray type is only supported if the total itemsize is unchanged" - ) def test_integrated_scatter(self): """ Test scatter on the real, occupancy data set """ # Load the data from the fixture - X = self.occupancy[[ - "temperature", "relative_humidity", "light", "C02", "humidity" - ]] - - # Convert to numpy arrays - X = 
X.copy().view((float, len(X.dtype.names))) - y = self.occupancy['occupancy'].astype(int) + X, y = load_occupancy(return_dataset=True).to_numpy() # Test the visualizer - features = ["temperature", "relative_humidity"] + features = ["temperature", "relative humidity"] visualizer = ScatterViz(features=features) visualizer.fit_transform_poof(X[:, :2], y) @@ -178,28 +155,19 @@ def test_alpha_param(self): assert "alpha" in scatter_kwargs assert scatter_kwargs["alpha"] == 0.7 - @pytest.mark.xfail( - sys.platform == 'win32', reason="Changing the dtype to a subarray type is only supported if the total itemsize is unchanged" - ) def test_scatter_quick_method(self): """ Test scatter quick method on the real, occupancy data set """ # Load the data from the fixture - X = self.occupancy[[ - "temperature", "relative_humidity", "light", "C02", "humidity" - ]] - - # Convert to numpy arrays - X = X.copy().view((float, len(X.dtype.names))) - y = self.occupancy['occupancy'].astype(int) + X, y = load_occupancy(return_dataset=True).to_numpy() # Test the visualizer - features = ["temperature", "relative_humidity"] - ax = scatterviz(X[:, :2], y=y, ax=None, features=features) + features = ["temperature", "relative humidity"] + viz = scatterviz(X[:, :2], y=y, ax=None, features=features) # test that is returns a matplotlib obj with axes - self.assertIsInstance(ax, mptl.axes.Axes) + assert isinstance(viz, ScatterVisualizer) @pytest.mark.skipif(pd is None, reason="pandas is required for this test") def test_integrated_scatter_with_pandas(self): @@ -207,44 +175,30 @@ def test_integrated_scatter_with_pandas(self): Test scatterviz on the real, occupancy data set with pandas """ # Load the data from the fixture - X = self.occupancy[[ - "temperature", "relative_humidity", "light", "C02", "humidity" - ]] - y = self.occupancy['occupancy'].astype(int) - - # Convert X to a pandas dataframe - X = pd.DataFrame(X) - X.columns = [ - "temperature", "relative_humidity", "light", "C02", "humidity" - ] + # 
Load the data from the fixture + X, y = load_occupancy(return_dataset=True).to_pandas() # Test the visualizer - features = ["temperature", "relative_humidity"] + features = ["temperature", "relative humidity"] visualizer = ScatterViz(features=features) visualizer.fit_transform_poof(X, y) - @pytest.mark.xfail( - sys.platform == 'win32', reason="Changing the dtype to a subarray type is only supported if the total itemsize is unchanged" - ) + @pytest.mark.xfail(reason="numpy structured arrays have changed since v1.14") def test_integrated_scatter_numpy_named_arrays(self): """ Test scatterviz on numpy named arrays """ - dt = np.dtype({ - 'names': ['one', 'two', 'three', 'four', "five"], - 'formats': [ - np.float64, - np.float64, - np.float64, - np.float64, - np.float64, - ] - }) - - X_named = self.X.astype(dt, casting='unsafe') - visualizer = ScatterViz(features=['one', 'two']) + dt = np.dtype( + { + "names": ["one", "two", "three", "four", "five"], + "formats": [np.float64, np.float64, np.float64, np.float64, np.float64], + } + ) + + X_named = self.X.astype(dt, casting="unsafe") + visualizer = ScatterViz(features=["one", "two"]) visualizer.fit_transform_poof(X_named, self.y) - self.assertEquals(visualizer.features_, ['one', 'two']) + assert visualizer.features_ == ["one", "two"] def test_integrated_scatter_numpy_arrays_no_names(self): """ @@ -252,7 +206,7 @@ def test_integrated_scatter_numpy_arrays_no_names(self): """ visualizer = ScatterViz(features=[1, 2]) visualizer.fit_transform_poof(self.X, self.y) - self.assertEquals(visualizer.features_, [1, 2]) + assert visualizer.features_ == [1, 2] def test_scatter_image(self): """ @@ -261,24 +215,9 @@ def test_scatter_image(self): # self.setUp_ImageTest() X_two_cols = self.X[:, :2] - features = ["temperature", "relative_humidity"] + features = ["temperature", "relative humidity"] visualizer = ScatterViz(features=features) visualizer.fit(X_two_cols, self.y) visualizer.draw(X_two_cols, self.y) 
self.assert_images_similar(visualizer) - - - def test_scatter_image_fail(self): - """ - Assert bad image similarity on scatterviz errors - """ - - X_two_cols = self.X[:, :2] - features = ["temperature", "relative_humidity"] - visualizer = ScatterViz(features=features) - visualizer.fit(X_two_cols, self.y) - visualizer.draw(X_two_cols, self.y) - - with self.assertRaises(ImageComparisonFailure): - self.assert_images_similar(visualizer) diff --git a/tests/test_contrib/test_statsmodels/__init__.py b/tests/test_contrib/test_statsmodels/__init__.py index a4651ffae..9ecc36cb9 100644 --- a/tests/test_contrib/test_statsmodels/__init__.py +++ b/tests/test_contrib/test_statsmodels/__init__.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Wed Apr 04 13:28:13 2018 -0400 # -# ID: __init__.py [] benjamin@bengfort.com $ +# ID: __init__.py [d6ebc39] benjamin@bengfort.com $ """ Tests for the statsmodels contrib package diff --git a/tests/test_contrib/test_statsmodels/test_base.py b/tests/test_contrib/test_statsmodels/test_base.py index a3c7acbec..777098718 100644 --- a/tests/test_contrib/test_statsmodels/test_base.py +++ b/tests/test_contrib/test_statsmodels/test_base.py @@ -4,7 +4,7 @@ # Author: Ian Ozsvald # Created: Wed Jan 10 12:47:00 2018 -0500 # -# ID: test_base.py [] benjamin@bengfort.com $ +# ID: test_base.py [d6ebc39] benjamin@bengfort.com $ """ Tests for the statsmodels estimator wrapper. 
@@ -30,6 +30,7 @@ ## Test Cases ########################################################################## + @pytest.mark.skipif(sm is None, reason="test requires statsmodels") def test_stats_models_wrapper(): """ diff --git a/tests/test_datasets/__init__.py b/tests/test_datasets/__init__.py new file mode 100644 index 000000000..13a92dae8 --- /dev/null +++ b/tests/test_datasets/__init__.py @@ -0,0 +1,15 @@ +# tests.test_datasets +# Tests for the datasets module +# +# Author: Benjamin Bengfort +# Created: Thu Jul 26 14:28:14 2018 -0400 +# +# ID: __init__.py [7082742] benjamin@bengfort.com $ + +""" +Tests for the datasets module +""" + +########################################################################## +## Imports +########################################################################## diff --git a/tests/test_datasets/test_download.py b/tests/test_datasets/test_download.py index a2fc2978a..517b3ce0a 100644 --- a/tests/test_datasets/test_download.py +++ b/tests/test_datasets/test_download.py @@ -1,44 +1,59 @@ -import unittest - -import numpy as np -from sklearn.utils import Bunch - -from yellowbrick.datasets import * - - -class TestDataDownloaders(unittest.TestCase): +# tests.test_datasets.test_download +# Tests the download from S3 to ensure data is accessible. +# +# Author: Benjamin Bengfort +# Created: Tue Jan 01 15:06:05 2019 -0500 +# +# For license information, see LICENSE.txt +# +# ID: test_download.py [57aab02] ndanielsen@users.noreply.github.com $ + +""" +Tests the download from S3 to ensure data is accessible. 
+""" + +########################################################################## +## Imports +########################################################################## + +import pytest + +from yellowbrick.datasets.loaders import * +from yellowbrick.datasets.loaders import DATASETS +from yellowbrick.datasets.path import dataset_exists, dataset_archive + + +@pytest.mark.parametrize( + "loader", + [ + load_bikeshare, + load_concrete, + load_credit, + load_energy, + load_game, + load_mushroom, + load_occupancy, + load_spam, + load_walking, + load_hobbies, + load_nfl, + ], + ids=lambda l: l.__name__, +) +def test_loader_download(tmpdir, loader): """ - Test the dataset loading functions + Test download of dataset when it does not exist (requires Internet connection!) """ + name = loader.__name__[len("load_") :] + data_home = str(tmpdir.mkdir("datasets")) - def test_load_concrete(self): - data = load_concrete() - self.assertIsInstance(data, np.ndarray) - - def test_load_energy(self): - data = load_energy() - self.assertIsInstance(data, np.ndarray) - - def test_load_occupancy(self): - data = load_occupancy() - self.assertIsInstance(data, np.ndarray) - - def test_load_mushroom(self): - data = load_mushroom() - self.assertIsInstance(data, np.ndarray) - - def test_load_hobbies(self): - data = load_hobbies() - self.assertIsInstance(data, Bunch) - - def test_load_game(self): - data = load_game() - self.assertIsInstance(data, np.ndarray) + # The dataset should not exist + assert not dataset_exists(name, data_home=data_home) + assert not dataset_archive(name, DATASETS[name]["signature"], data_home=data_home) - def test_load_bikeshare(self): - data = load_bikeshare() - self.assertIsInstance(data, np.ndarray) + # Load the dataset + loader(data_home=data_home) - def test_load_spam(self): - data = load_spam() - self.assertIsInstance(data, np.ndarray) + # The dataset should have been downloaded + assert dataset_exists(name, data_home=data_home) + assert dataset_archive(name, 
DATASETS[name]["signature"], data_home=data_home) diff --git a/tests/test_datasets/test_loaders.py b/tests/test_datasets/test_loaders.py new file mode 100644 index 000000000..774d3ac16 --- /dev/null +++ b/tests/test_datasets/test_loaders.py @@ -0,0 +1,308 @@ +# tests.test_datasets.test_loaders +# Test the dataset loading utilities +# +# Author: Benjamin Bengfort +# Created: Tue Jul 31 15:34:56 2018 -0400 +# +# ID: test_loaders.py [7082742] benjamin@bengfort.com $ + +""" +Test the dataset loading utilities +""" + +########################################################################## +## Imports +########################################################################## + +import pytest +import numpy as np + +from unittest.mock import patch + +from yellowbrick.datasets.loaders import * +from yellowbrick.datasets.loaders import DATASETS +from yellowbrick.datasets.base import Dataset, Corpus +from yellowbrick.datasets.path import dataset_exists, dataset_archive +from yellowbrick.datasets.path import find_dataset_path +from yellowbrick.exceptions import DatasetsError + +try: + import pandas as pd +except ImportError: + pd = None + + +########################################################################## +## Assertion Helpers +########################################################################## + + +def assert_valid_dataset(data, name): + __tracebackhide__ = True + assert isinstance(data, Dataset), "not a Dataset object" + assert name in DATASETS, "dataset not in manifest" + + assert dataset_exists(name), "dataset directory does not exist" + assert dataset_archive( + name, DATASETS[name]["signature"] + ), "dataset archive does not match signature" + assert ( + find_dataset_path(name, ext=".csv.gz", raises=False) is not None + ), "no .csv.tgz in dataset" + assert ( + find_dataset_path(name, ext=".npz", raises=False) is not None + ), "no .npz in dataset" + + n_files = len(data.contents()) + assert n_files == 4 or n_files == 5, "not enough files in dataset" + 
assert len(data.README) > 0, "readme contains no data" + assert len(data.meta) > 0, "metadata is empty" + + if n_files == 5: + assert len(data.citation) > 0, "citation.bib is empty" + + assert "features" in data.meta, "no features in metadata" + assert "target" in data.meta, "no target in metadata" + + +def assert_valid_corpus(corpus, name): + __tracebackhide__ = True + assert isinstance(corpus, Corpus), "not a Corpus object" + assert name in DATASETS, "corpus not in manifest" + + assert dataset_exists(name), "corpus directory does not exist" + assert dataset_archive( + name, DATASETS[name]["signature"] + ), "corpus archive does not match signature" + + n_contents = len(corpus.contents()) + assert n_contents > 2, "not enough files/directories in corpus" + assert len(corpus.README) > 0, "readme contains no data" + assert corpus.citation is None or len(corpus.citation) > 0, "citation.bib is empty" + + +def assert_valid_pandas(data): + __tracebackhide__ = True + # Get raw data frame + df = data.to_dataframe() + assert isinstance(df, pd.DataFrame), "raw dataframe is wrong type" + + # Get pandas data + X, y = data.to_pandas() + assert isinstance(X, pd.DataFrame), "X is not a DataFrame" + assert isinstance(y, pd.Series), "y is not a Series" + + # Assert pandas is returned from to_data() + X, y = data.to_data() + assert isinstance(X, pd.DataFrame), "to_data does not return pandas" + assert isinstance(y, pd.Series), "to_data does not return pandas" + + +def assert_valid_numpy(data): + __tracebackhide__ = True + X, y = data.to_numpy() + assert isinstance(X, np.ndarray), "X is not a numpy array" + assert isinstance(y, np.ndarray), "y is not a numpy array" + assert X.ndim == 2 and y.ndim == 1, "X and y dimensions are incorrect" + + # Patch pandas and make defaults assertions + X, y = data.to_data() + assert isinstance(X, np.ndarray), "to_data does not return numpy" + assert isinstance(y, np.ndarray), "to_data does not return numpy" + + with pytest.raises(DatasetsError): + 
data.to_pandas(), "exception not raised when pandas unavailable" + + +########################################################################## +## Test Cases +########################################################################## + + +class TestDatasetLoaders(object): + """ + Test the dataset loading functions + + Broadly: test each of the dataset loaders to ensure that they are valid + for their particular type of dataset and that they return X and y by + default. Then test their shape to ensure that the dataset hasn't changed + unexpectedly between versions. See ``test_load_concrete`` for a sketch. + + Final tests with parametrize test all loaders against Base classes. + Make sure you scroll to the bottom and implement your loader in the + correct test batch! + """ + + def test_load_concrete(self): + """ + Test loading the concrete regression dataset + """ + # Load the type-specific dataset wrapper and validate it + data = load_concrete(return_dataset=True) + assert_valid_dataset(data, "concrete") + + # Ensure that the default returns X, y to match documentation + # Check shape to ensure no unexpected dataset changes have occured + # before we push something to PyPI! 
+ X, y = load_concrete() + assert X.shape == (1030, 8) + assert y.shape == (1030,) + + def test_load_energy(self): + """ + Test loading the energy multi regression dataset + """ + data = load_energy(return_dataset=True) + assert_valid_dataset(data, "energy") + + X, y = load_energy() + assert X.shape == (768, 8) + assert y.shape == (768,) + + def test_load_credit(self): + """ + Test loading the credit binary classification dataset + """ + data = load_credit(return_dataset=True) + assert_valid_dataset(data, "credit") + + X, y = load_credit() + assert X.shape == (30000, 23) + assert y.shape == (30000,) + + def test_load_occupancy(self): + """ + Test loading the occupancy binary classification dataset + """ + data = load_occupancy(return_dataset=True) + assert_valid_dataset(data, "occupancy") + + X, y = load_occupancy() + assert X.shape == (20560, 5) + assert y.shape == (20560,) + + def test_load_mushroom(self): + """ + Test loading the mushroom binary classification dataset + """ + data = load_mushroom(return_dataset=True) + assert_valid_dataset(data, "mushroom") + + X, y = load_mushroom() + assert X.shape == (8123, 3) + assert y.shape == (8123,) + + def test_load_hobbies(self): + """ + Test loading the hobbies text corpus dataset + """ + corpus = load_hobbies() + assert_valid_corpus(corpus, "hobbies") + + assert len(corpus.labels) == 5 + assert len(corpus.files) == 448 + assert len(corpus.data) == 448 + assert len(corpus.target) == 448 + + def test_load_game(self): + """ + Test loading the game multiclass classification dataset + """ + data = load_game(return_dataset=True) + assert_valid_dataset(data, "game") + + X, y = load_game() + assert X.shape == (67557, 42) + assert y.shape == (67557,) + + def test_load_bikeshare(self): + """ + Test loading the bikeshare regression dataset + """ + data = load_bikeshare(return_dataset=True) + assert_valid_dataset(data, "bikeshare") + + X, y = load_bikeshare() + assert X.shape == (17379, 12) + assert y.shape == (17379,) + + def 
test_load_spam(self): + """ + Test loading the spam binary classification dataset + """ + data = load_spam(return_dataset=True) + assert_valid_dataset(data, "spam") + + X, y = load_spam() + assert X.shape == (4600, 57) + assert y.shape == (4600,) + + def test_load_walking(self): + """ + Test loading the walking activity clustering dataset + """ + data = load_walking(return_dataset=True) + assert_valid_dataset(data, "walking") + + X, y = load_walking() + assert X.shape == (149332, 4) + assert y.shape == (149332,) + + def test_load_nfl(self): + """ + Test loading the nfl clustering dataset + """ + data = load_nfl(return_dataset=True) + assert_valid_dataset(data, "nfl") + + X, y = load_nfl() + assert X.shape == (494, 23) + assert y.shape == (494,) + + @pytest.mark.skipif(pd is None, reason="pandas is required for this test") + @pytest.mark.parametrize( + "loader", + [ + load_bikeshare, + load_concrete, + load_credit, + load_energy, + load_game, + load_mushroom, + load_occupancy, + load_spam, + load_walking, + load_nfl, + ], + ids=lambda l: l.__name__, + ) + def test_load_pandas(self, loader): + """ + Test loading datasets as pandas objects + """ + data = loader(return_dataset=True) + assert_valid_pandas(data) + + @patch("yellowbrick.datasets.base.pd", None) + @pytest.mark.parametrize( + "loader", + [ + load_bikeshare, + load_concrete, + load_credit, + load_energy, + load_game, + load_mushroom, + load_occupancy, + load_spam, + load_walking, + load_nfl, + ], + ids=lambda l: l.__name__, + ) + def test_load_numpy(self, loader): + """ + Test loading datasets as numpy defaults + """ + data = loader(return_dataset=True) + assert_valid_numpy(data) diff --git a/tests/test_datasets/test_path.py b/tests/test_datasets/test_path.py new file mode 100644 index 000000000..0ed3c0a39 --- /dev/null +++ b/tests/test_datasets/test_path.py @@ -0,0 +1,220 @@ +# tests.test_datasets.test_paths +# Tests for the dataset path utilities +# +# Author: Benjamin Bengfort +# Created: Thu Jul 26 
14:28:14 2018 -0400 +# +# ID: test_path.py [7082742] benjamin@bengfort.com $ + +""" +Tests for the dataset path utilities +""" + +########################################################################## +## Imports +########################################################################## + +import os +import pytest +import contextlib + +from yellowbrick.datasets.path import * +from yellowbrick.exceptions import DatasetsError + + +########################################################################## +## Utilities +########################################################################## + + +@contextlib.contextmanager +def environ(**env): + """ + Temporarily set the environment variables for a test, restoring them when + the test is complete (e.g. when the context manager exits). + + Parameters + ---------- + env : dict + The environment variables that should exist in context + """ + old_env = dict(os.environ) + os.environ.clear() + os.environ.update(env) + + try: + yield + finally: + os.environ.clear() + os.environ.update(old_env) + + +########################################################################## +## Test Cases +########################################################################## + + +def test_get_data_home_fixtures(): + """ + get_data_home should return fixtures by default + """ + assert get_data_home() == FIXTURES + assert os.path.exists(FIXTURES) + + +def test_get_data_home_env(tmpdir): + """ + get_data_home should return the environment variable if set + """ + path = str(tmpdir.mkdir("fixtures").join("foo")) + assert not os.path.exists(path) + + with environ(YELLOWBRICK_DATA=path): + assert get_data_home() == path + assert os.path.exists(path) + + +def test_get_data_home_specified(tmpdir): + """ + get_data_home should return a passed in path + """ + path = str(tmpdir.mkdir("fixtures").join("foo")) + assert not os.path.exists(path) + + assert get_data_home(path) == path + assert os.path.exists(path) + + +def 
test_find_dataset_path(tmpdir): + """ + Test find_dataset_path with a specified data_home + """ + + # Create the dataset + data_home = tmpdir.mkdir("fixtures") + foo = data_home.mkdir("foo") + + # Test the default lookup of foo/foo.csv.gz + fpath = foo.join("foo.csv.gz") + fpath.write("1,2,3") + assert find_dataset_path("foo", data_home=data_home) == fpath + + # Test the extension based lookup of foo/foo.npz + fpath = foo.join("foo.npz") + fpath.write("1234") + assert find_dataset_path("foo", data_home=data_home, ext=".npz") == fpath + + # Test the fname based lookup of foo/data.txt + fpath = foo.join("data.txt") + fpath.write("there is data in this file") + assert find_dataset_path("foo", data_home=data_home, fname="data.txt") == fpath + + +def test_missing_find_dataset_path(tmpdir): + """ + Test find_dataset_path when the dataset does not exist + """ + data_home = tmpdir.mkdir("fixtures") + + # When the data directory doesn't exist + with pytest.raises(DatasetsError): + find_dataset_path("foo", data_home=str(data_home)) + + # When the data directory exists but no file is in the directory + foo = data_home.mkdir("foo") + with pytest.raises(DatasetsError): + find_dataset_path("foo", data_home=str(data_home)) + + # When the specified file doesn't exist + fpath = foo.join("foo.csv") + fpath.write("1,2,3") + with pytest.raises(DatasetsError): + find_dataset_path("foo", data_home=str(data_home), ext=".npz") + + +def test_suppress_exception_find_dataset_path(tmpdir): + """ + Assert that find_dataset_path can suppress exceptions + """ + data_home = str(tmpdir.mkdir("fixtures")) + assert find_dataset_path("foo", data_home=data_home, raises=False) is None + + +def test_dataset_exists(tmpdir): + """ + Test the dataset_exists helper function + """ + data_home = tmpdir.mkdir("fixtures") + assert not os.path.exists(str(data_home.join("foo"))) + + # Test when directory doesn't exist + assert not dataset_exists("foo", str(data_home)) + + # Test when path exists but is file + 
fpath = data_home.join("foo.txt") + fpath.write("foo") + + assert not dataset_exists("foo.txt", str(data_home)) + + # Test correct case + data_home.mkdir("foo") + assert dataset_exists("foo", str(data_home)) + + +def test_dataset_archive(tmpdir): + """ + Test the dataset_archive determines if an archive is up to date + """ + sig = "49b3fc3143d727d7819fabd4365d7e7b29794089dc9fa1e5e452aeb0b33d5eda" + data_home = tmpdir.mkdir("fixtures") + + # When archive does not exist + assert not dataset_archive("foo", sig, data_home=str(data_home)) + + # Create archive + fpath = data_home.join("foo.zip") + fpath.write("this is a data archive") + + # When archive exists + assert dataset_archive("foo", sig, data_home=data_home) + + # When archive does not match signature + assert not dataset_archive("foo", "abcd", data_home=data_home) + + +def test_cleanup_dataset(tmpdir): + """ + Test cleanup_dataset removes both data dir and archive + """ + data_home = tmpdir.mkdir("fixtures") + + # Make dataset and archive + foo = data_home.mkdir("foo") + fdata = foo.join("foo.csv") + fdata.write("testing 1,2,3") + + fzip = data_home.join("foo.zip") + fzip.write("this is the archive file") + + # Make sure the files exist + assert os.path.exists(fzip) + assert os.path.exists(fdata) + + # Cleanup the dataset + cleanup_dataset("foo", data_home=data_home) + + # Files should be gone + assert not os.path.exists(fzip) + assert not os.path.exists(fdata) + + +def test_cleanup_dataset_no_data(tmpdir): + """ + Assert cleanup_dataset fails gracefully if data and archive don't exist. 
+ """ + data_home = tmpdir.mkdir("fixtures") + cleanup_dataset("foo", data_home=str(data_home)) + + # Files should be gone + assert not os.path.exists(str(data_home.join("foo.zip"))) + assert not os.path.exists(str(data_home.join("foo"))) diff --git a/tests/test_datasets/test_signature.py b/tests/test_datasets/test_signature.py new file mode 100644 index 000000000..36fffa746 --- /dev/null +++ b/tests/test_datasets/test_signature.py @@ -0,0 +1,45 @@ +# tests.test_datasets.test_signature +# Test the sha256sum file signature library +# +# Author: Benjamin Bengfort +# Created: Tue Jul 31 14:20:10 2018 -0400 +# +# ID: test_signature.py [7082742] benjamin@bengfort.com $ + +""" +Test the sha256sum file signature library +""" + +########################################################################## +## Imports +########################################################################## + +import json +from yellowbrick.datasets.signature import sha256sum + + +########################################################################## +## Test Case +########################################################################## + +FIXTURE = { + "name": "The Cat in the Hat", + "color": "red and black", + "weather": "rainy", + "chaos_level": "HIGH", + "things": ["1", "2"], + "extra": "angry fish in bowl", +} + + +def test_signature(tmpdir): + """ + Test the SHA 256 signature of a temporary file + """ + + fpath = tmpdir.join("test.json") + json.dump(FIXTURE, fpath, indent=2) + assert ( + sha256sum(str(fpath)) + == "d10b36aa74a59bcf4a88185837f658afaf3646eff2bb16c3928d0e9335e945d2" + ) diff --git a/tests/test_draw.py b/tests/test_draw.py index 675371846..3dcca499c 100644 --- a/tests/test_draw.py +++ b/tests/test_draw.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Sun Aug 19 11:21:04 2018 -0400 # -# ID: test_draw.py [] benjamin@bengfort.com $ +# ID: test_draw.py [dd915ad] benjamin@bengfort.com $ """ Tests for the high-level drawing utility functions @@ -26,18 +26,36 @@ ## 
Simple tests for high-level drawing utilities ########################################################################## + def test_manual_legend_uneven_colors(): """ Raise exception when colors and labels are mismatched in manual_legend """ with pytest.raises(YellowbrickValueError, match="same number of colors as labels"): - manual_legend(None, ('a', 'b', 'c'), ('r', 'g')) + manual_legend(None, ("a", "b", "c"), ("r", "g")) + + +@pytest.fixture(scope="class") +def data(request): + + data = np.array( + [ + [4, 8, 7, 6, 5, 2, 1], + [6, 7, 9, 6, 9, 3, 6], + [5, 1, 6, 8, 4, 7, 8], + [6, 8, 1, 5, 6, 7, 4], + ] + ) + + request.cls.data = data ########################################################################## ## Visual test cases for high-level drawing utilities ########################################################################## + +@pytest.mark.usefixtures("data") class TestDraw(VisualTestCase): """ Visual tests for the high-level drawing utilities @@ -54,16 +72,92 @@ def test_manual_legend(self): Bx, By = random.normal(42, 3, 100), random.normal(44, 1, 100) Cx, Cy = random.normal(20, 10, 100), random.normal(30, 1, 100) - _, ax = plt.subplots() - ax.scatter(Ax, Ay, c='r', alpha=0.35, label='a') - ax.scatter(Bx, By, c='g', alpha=0.35, label='b') - ax.scatter(Cx, Cy, c='b', alpha=0.35, label='c') + ax.scatter(Ax, Ay, c="r", alpha=0.35, label="a") + ax.scatter(Bx, By, c="g", alpha=0.35, label="b") + ax.scatter(Cx, Cy, c="b", alpha=0.35, label="c") # Add the manual legend manual_legend( - ax, ('a', 'b', 'c'), ('r', 'g', 'b'), frameon=True, loc='upper left' + ax, ("a", "b", "c"), ("r", "g", "b"), frameon=True, loc="upper left" ) # Assert image similarity self.assert_images_similar(ax=ax) + + def test_vertical_bar_stack(self): + """ + Test bar_stack for vertical orientation + """ + _, ax = plt.subplots() + + # Plots stacked bar charts + bar_stack(self.data, ax=ax, orientation="v") + + # Assert image similarity + self.assert_images_similar(ax=ax, tol=0.1) + + def 
test_horizontal_bar_stack(self): + """ + Test bar_stack for horizontal orientation + """ + _, ax = plt.subplots() + # Plots stacked bar charts + bar_stack(self.data, ax=ax, orientation="h") + + # Assert image similarity + self.assert_images_similar(ax=ax, tol=0.1) + + def test_single_row_bar_stack(self): + """ + Test bar_stack for single row + """ + data = np.array([[4, 8, 7, 6, 5, 2, 1]]) + + _, ax = plt.subplots() + + # Plots stacked bar charts + bar_stack(data, ax=ax) + + # Assert image similarity + self.assert_images_similar(ax=ax, tol=0.1) + + def test_labels_vertical(self): + """ + Test labels and ticks for vertical barcharts + """ + labels = ["books", "cinema", "cooking", "gaming"] + ticks = ["noun", "verb", "adverb", "pronoun", "preposition", "digit", "other"] + _, ax = plt.subplots() + + # Plots stacked bar charts + bar_stack(self.data, labels=labels, ticks=ticks, colors=["r", "b", "g", "y"]) + + # Extract tick labels from the plot + ticks_ax = [tick.get_text() for tick in ax.xaxis.get_ticklabels()] + # Assert that ticks are set properly + assert ticks_ax == ticks + + # Assert image similarity + self.assert_images_similar(ax=ax, tol=0.05) + + def test_labels_horizontal(self): + """ + Test labels and ticks with horizontal barcharts + """ + labels = ["books", "cinema", "cooking", "gaming"] + ticks = ["noun", "verb", "adverb", "pronoun", "preposition", "digit", "other"] + _, ax = plt.subplots() + + # Plots stacked bar charts + bar_stack( + self.data, labels=labels, ticks=ticks, orientation="h", colormap="cool" + ) + + # Extract tick labels from the plot + ticks_ax = [tick.get_text() for tick in ax.yaxis.get_ticklabels()] + # Assert that ticks are set properly + assert ticks_ax == ticks + + # Assert image similarity + self.assert_images_similar(ax=ax, tol=0.05) diff --git a/tests/test_exceptions.py b/tests/test_exceptions.py new file mode 100644 index 000000000..ca5aeabdb --- /dev/null +++ b/tests/test_exceptions.py @@ -0,0 +1,27 @@ 
+########################################################################## +## Imports +########################################################################## + +import pytest + +from yellowbrick.exceptions import NotFitted + + +########################################################################## +## NotFitted Exception Tests +########################################################################## + + +class TestExceptions(object): + """ + Test exception specific code and utilities + """ + + @pytest.mark.parametrize("method", ["transform", None]) + def test_not_fitted_from_estimator(self, method): + """ + Ensure not fitted can be raised directly from an estimator + """ + msg = "instance is not fitted yet, please call fit" + with pytest.raises(NotFitted, match=msg): + raise NotFitted.from_estimator(self, method) diff --git a/tests/test_features/__init__.py b/tests/test_features/__init__.py index d8573ef26..19d116c6a 100644 --- a/tests/test_features/__init__.py +++ b/tests/test_features/__init__.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Thu Oct 06 11:19:55 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: __init__.py [1d407ab] benjamin@bengfort.com $ diff --git a/tests/test_features/conftest.py b/tests/test_features/conftest.py new file mode 100644 index 000000000..9820ffb7e --- /dev/null +++ b/tests/test_features/conftest.py @@ -0,0 +1,73 @@ +# tests.test_features.conftest +# Provides fixtures for the feature tests module. +# +# Author: Naresh Bachwani +# Created: Thu Aug 15 07:35:53 2019 -0400 +# +# Copyright (C) 2016 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: conftest.py [2c5f0e9] 43993586+naresh-bachwani@users.noreply.github.com $ + +""" +Provides fixtures for the feature tests module. 
+""" + +########################################################################## +## Imports +########################################################################## + +import pytest + +from tests.fixtures import Dataset +from sklearn.datasets import make_classification, make_regression, make_s_curve + + +########################################################################## +## Fixtures +########################################################################## + + +@pytest.fixture(scope="class") +def discrete(request): + """ + Creates a fixture of train and test splits for the sklearn digits dataset + For ease of use returns a Dataset named tuple composed of two Split tuples. + """ + X, y = make_classification( + n_samples=400, + n_features=12, + n_informative=8, + n_redundant=0, + n_classes=5, + n_clusters_per_class=1, + class_sep=1.8, + random_state=854, + scale=[14.2, 2.1, 0.32, 0.001, 32.3, 44.1, 102.3, 2.3, 2.4, 38.2, 0.05, 1.0], + ) + + # Set a class attribute for discrete data. + request.cls.discrete = Dataset(X, y) + + +@pytest.fixture(scope="class") +def continuous(request): + """ + Creates a random regressor fixture. + """ + X, y = make_regression( + n_samples=500, n_features=22, n_informative=8, random_state=2019 + ) + + # Set a class attribute for continuous data + request.cls.continuous = Dataset(X, y) + + +@pytest.fixture(scope="class") +def s_curves(request): + """ + Creates a random regressor fixture. 
+ """ + X, y = make_s_curve(1000, random_state=888) + # Set a class attribute for continuous data + request.cls.s_curves = Dataset(X, y) diff --git a/tests/test_features/test_base.py b/tests/test_features/test_base.py index 75f3023f4..f25abcebb 100644 --- a/tests/test_features/test_base.py +++ b/tests/test_features/test_base.py @@ -1,10 +1,10 @@ # tests.test_features.test_base # Tests for the feature selection and analysis base classes # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Fri Oct 07 13:43:55 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_base.py [2e898a6] benjamin@bengfort.com $ @@ -17,33 +17,451 @@ ## Imports ########################################################################## -import unittest +import pytest +import numpy as np +import numpy.testing as npt -from yellowbrick.base import * +from tests.fixtures import Dataset +from yellowbrick.base import Visualizer from yellowbrick.features.base import * + from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.datasets import make_classification, make_regression + +from unittest.mock import Mock + +try: + import pandas as pd +except ImportError: + pd = None + + +########################################################################## +## Fixtures +########################################################################## + + +@pytest.fixture(scope="class") +def discrete(request): + """ + Create a random classification dataset fixture. + """ + X, y = make_classification( + n_classes=5, + n_samples=400, + n_features=12, + n_informative=10, + n_redundant=0, + random_state=2019, + ) + + # Dataset is accessible on the class so it is only generated once + request.cls.discrete = Dataset(X, y) + + +@pytest.fixture(scope="class") +def continuous(request): + """ + Creates a random regression dataset fixture. 
+ """ + X, y = make_regression( + n_samples=500, n_features=22, n_informative=8, random_state=2019 + ) + + # Dataset is accessible on the class so it is only generated once + request.cls.continuous = Dataset(X, y) ########################################################################## ## FeatureVisualizer Base Tests ########################################################################## -class FeatureVisualizerBaseTests(unittest.TestCase): + +@pytest.mark.usefixtures("discrete") +class TestFeatureVisualizer(object): + """ + Test FeatureVisualizer base class + """ def test_subclass(self): """ - Assert the feature visualizer is in its rightful place + Check the visualizer/transformer class hierarchy """ visualizer = FeatureVisualizer() - self.assertIsInstance(visualizer, TransformerMixin) - self.assertIsInstance(visualizer, BaseEstimator) - self.assertIsInstance(visualizer, Visualizer) - - # def test_interface(self): - # """ - # Test the feature visualizer interface - # """ - # - # visualizer = FeatureVisualizer() - # with self.assertRaises(NotImplementedError): - # visualizer.poof() + assert isinstance(visualizer, TransformerMixin) + assert isinstance(visualizer, BaseEstimator) + assert isinstance(visualizer, Visualizer) + + def test_transform_returns_unmodified(self): + """ + Ensure transformer is just a passthrough + """ + X, y = self.discrete + visualizer = FeatureVisualizer().fit(X, y) + assert visualizer.transform(X, y) is X + + def test_fit_transform_poof(self): + """ + Test the fit/transform/poof quick method + """ + + class MockFeatureVisaulizer(FeatureVisualizer): + pass + + viz = MockFeatureVisaulizer() + viz.fit = Mock(return_value=viz) + viz.transform = Mock(return_value="a") + viz.poof = Mock() + + X, y = self.discrete + assert viz.fit_transform_poof(X, y, outpath="a.png", clear_figure=True) == "a" + assert viz.fit.called_once_with(X, y) + assert viz.transform.called_once_with(X, y) + assert viz.poof.called_once_with(outpath="a.png", 
clear_figure=True) + + +########################################################################## +## MultiFeatureVisualizer Tests +########################################################################## + + +@pytest.mark.usefixtures("discrete") +class TestMultiFeatureVisualizer(object): + """ + Test the MultiFeatureVisualizer base class + """ + + def test_subclass(self): + """ + Check the visualizer/transformer class hierarchy + """ + visualizer = MultiFeatureVisualizer() + assert isinstance(visualizer, FeatureVisualizer) + assert isinstance(visualizer, TransformerMixin) + assert isinstance(visualizer, BaseEstimator) + assert isinstance(visualizer, Visualizer) + + def test_user_supplied_features(self): + """ + Test that a user can supply feature names directly + """ + X, y = self.discrete + features = ["f{}".format(i + 1) for i in range(X.shape[1])] + oz = MultiFeatureVisualizer(features=features) + + assert not hasattr(oz, "features_") + assert oz.fit(X, y) is oz + assert hasattr(oz, "features_") + npt.assert_array_equal(oz.features_, np.asarray(features)) + + def test_numeric_features(self): + """ + Test that the features are column indices for numpy arrays + """ + X, y = self.discrete + oz = MultiFeatureVisualizer() + + assert not hasattr(oz, "features_") + assert oz.fit(X, y) is oz + assert hasattr(oz, "features_") + assert len(oz.features_) == X.shape[1] + + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def test_pandas_string_columns(self): + """ + Ensure that DataFrame column names are uses as features + """ + X, y = self.discrete + features = ["f{}".format(i + 1) for i in range(X.shape[1])] + + X = pd.DataFrame(X, columns=features) + y = pd.Series(y, name="target") + + oz = MultiFeatureVisualizer() + assert not hasattr(oz, "features_") + assert oz.fit(X, y) is oz + assert hasattr(oz, "features_") + npt.assert_array_equal(oz.features_, np.asarray(features)) + + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def 
test_pandas_no_columns(self): + """ + Test when a DataFrame has no column names + """ + """ + Ensure that Pandas column names are uses as features + """ + X, y = self.discrete + + X = pd.DataFrame(X) + y = pd.Series(y, name="target") + + oz = MultiFeatureVisualizer() + assert not hasattr(oz, "features_") + assert oz.fit(X, y) is oz + assert hasattr(oz, "features_") + assert len(oz.features_) == X.shape[1] + + +########################################################################## +## DataVisualizer Tests +########################################################################## + + +@pytest.mark.usefixtures("discrete", "continuous") +class TestDataVisualizer(object): + """ + Test DataVisualizer base class + """ + + FIELDS = ( + "features_", + "classes_", + "range_", + "_colors", + "_target_color_type", + "_label_encoder", + ) + + def assert_not_fitted(self, obj): + __tracebackhide__ = True + for field in self.FIELDS: + if field.endswith("_"): + assert not hasattr(obj, field), "has {} before fit".format(field) + else: + msg = "missing internal var {}".format(field) + assert getattr(obj, field) is None, msg + + def assert_fitted(self, obj, fields=FIELDS): + __tracebackhide__ = True + # Mutually exclusive fields + if obj._target_color_type == TargetType.SINGLE: + assert (not hasattr(obj, "classes_")) and (not hasattr(obj, "range_")) + + elif obj._target_color_type == TargetType.DISCRETE: + assert hasattr(obj, "_label_encoder") + assert hasattr(obj, "classes_") and (not hasattr(obj, "range_")) + + elif obj._target_color_type == TargetType.CONTINUOUS: + assert (not hasattr(obj, "classes_")) and hasattr(obj, "range_") + + else: + raise ValueError( + "cannot test target type {}".format(obj._target_color_type) + ) + + for field in fields: + if field in {"classes_", "range_", "_label_encoder"}: + continue # handled by mutually exclusive + assert hasattr(obj, field) + + def test_single_when_none(self): + """ + Ensure that the target type is "single" when y is None + """ 
+ X, _ = self.discrete + oz = DataVisualizer() + assert oz.target_type == TargetType.AUTO + + # Assert single when y is None + self.assert_not_fitted(oz) + assert oz.fit(X, y=None) is oz + self.assert_fitted(oz) + assert oz._target_color_type == TargetType.SINGLE + + @pytest.mark.parametrize("target_type", ("discrete", "continuous")) + def test_none_overrides_user_specified_target(self, target_type): + """ + Even if a user supplies a target type it should be overriden by y=None + """ + X, _ = getattr(self, target_type) + oz = DataVisualizer(target_type=target_type).fit(X, y=None) + + assert oz._colors == "C0" + assert oz._target_color_type == TargetType.SINGLE + + @pytest.mark.parametrize("dataset", ("discrete", "continuous")) + def test_user_overrides_auto_target(self, dataset): + """ + Ensure user specified target type overrides auto discovery + """ + X, y = getattr(self, dataset) + target_type = ( + TargetType.CONTINUOUS if dataset == "discrete" else TargetType.DISCRETE + ) + + oz = DataVisualizer(target_type=target_type) + assert oz.target_type != TargetType.AUTO + oz.fit(X, y) + assert oz.target_type == target_type + + def test_continuous(self): + """ + Test data visualizer on continuous data + """ + # Check when y is continuous + X, y = self.continuous + oz = DataVisualizer() + assert oz.target_type == TargetType.AUTO + + self.assert_not_fitted(oz) + assert oz.fit(X, y) is oz + self.assert_fitted(oz) + assert oz._target_color_type == TargetType.CONTINUOUS + assert oz.range_ == (y.min(), y.max()) + + def test_discrete(self): + """ + Test data visualizer on discrete data + """ + # Check when y is discrete + X, y = self.discrete + oz = DataVisualizer() + assert oz.target_type == TargetType.AUTO + + self.assert_not_fitted(oz) + assert oz.fit(X, y) is oz + self.assert_fitted(oz) + assert oz._target_color_type == TargetType.DISCRETE + assert len(oz.classes_) == np.unique(y).shape[0] + + def test_bad_target_type(self): + """ + Assert target type is validated on init 
+ """ + msg = "unknown target color type 'foo'" + with pytest.raises(YellowbrickValueError, match=msg): + DataVisualizer(target_type="foo") + + def test_classes_discrete(self): + """ + Ensure classes are assigned correctly for label encoding + """ + X, y = self.discrete + classes = ["a", "b", "c", "d", "e"] + oz = DataVisualizer(classes=classes, target_type="discrete").fit(X, y) + + npt.assert_array_equal(oz.classes_, classes) + assert list(oz._colors.keys()) == classes + + def test_classes_continuous(self): + """ + Ensure classes are ignored in continuous case + """ + X, y = self.continuous + classes = ["a", "b", "c", "d", "e"] + oz = DataVisualizer(classes=classes, target_type="continuous").fit(X, y) + + assert not hasattr(oz, "classes_") + + def test_get_target_color_type(self): + """ + Test the get_target_color_type helper method + """ + oz = DataVisualizer() + + with pytest.raises(NotFitted, match="unknown target color type"): + oz.get_target_color_type() + + oz.fit(*self.continuous) + assert oz.get_target_color_type() == TargetType.CONTINUOUS + + def test_get_colors_not_fitted(self): + """ + Assert get_colors requires an fitted visualizer + """ + oz = DataVisualizer() + with pytest.raises(NotFitted, match="cannot determine colors"): + oz.get_colors(["a", "b", "c"]) + + @pytest.mark.parametrize("color, expected", [(None, "C0"), ("#F3B8AB", "#F3B8AB")]) + def test_get_colors_single(self, color, expected): + """ + Test color assignment for single target type + """ + X, y = self.discrete + oz = DataVisualizer(colors=color).fit(X) + assert oz.get_target_color_type() == TargetType.SINGLE + + # Test default colors + colors = oz.get_colors(y) + assert len(colors) == len(y) + assert np.unique(colors) == expected + + def test_get_colors_discrete(self): + """ + Test discrete colors with no label encoding + """ + X, y = self.discrete + oz = DataVisualizer().fit(X, y) + assert oz.get_target_color_type() == TargetType.DISCRETE + + colors = oz.get_colors(y) + assert 
len(colors) == len(y) + assert set(colors) == set(oz._colors.values()) + + def test_get_colors_discrete_classes(self): + """ + Test discrete colors with label encoding and colors + """ + X, y = self.discrete + oz = DataVisualizer( + classes=["a", "b", "c", "d", "e"], colors=["g", "r", "b", "m", "y"] + ).fit(X, y) + assert oz.get_target_color_type() == TargetType.DISCRETE + + colors = oz.get_colors(y) + assert len(colors) == len(y) + assert set(colors) == set(["g", "r", "b", "m", "y"]) + + def test_get_colors_not_label_encoded(self): + """ + Assert exception is raised on unknown class label for get_colors + """ + X, y = self.discrete + oz = DataVisualizer(classes="abcde").fit(X, y) + + with pytest.raises(YellowbrickKeyError, match="could not determine color"): + oz.get_colors(["foo"]) + + @pytest.mark.parametrize( + "colors, colormap", + [ + (["#3f78de", "#f38b33"], None), + (["b", "g", "r", "m", "y"], None), + (None, "Blues"), + ], + ) + def test_user_get_colors_discrete(self, colors, colormap): + """ + Test the ways that users can specify colors + """ + X, y = self.discrete + oz = DataVisualizer( + colors=colors, colormap=colormap, target_type="discrete" + ).fit(X, y) + + colors = oz.get_colors(y) + assert len(colors) == len(y) + + def test_get_colors_continous(self): + """ + Test continuous colors with no default colormap + """ + X, y = self.continuous + oz = DataVisualizer().fit(X, y) + assert oz.get_target_color_type() == TargetType.CONTINUOUS + + colors = oz.get_colors(y) + assert len(colors) == len(y) + + def test_get_colors_continous_cmap(self): + """ + Test continuous colors with user specified cmap + """ + X, y = self.continuous + oz = DataVisualizer(colormap="jet").fit(X, y) + assert oz.get_target_color_type() == TargetType.CONTINUOUS + + colors = oz.get_colors(y) + assert len(colors) == len(y) diff --git a/tests/test_features/test_jointplot.py b/tests/test_features/test_jointplot.py index 4182a33af..aba077396 100644 --- 
a/tests/test_features/test_jointplot.py +++ b/tests/test_features/test_jointplot.py @@ -1,20 +1,21 @@ # tests.test_features.test_jointplot -# Test the JointPlotVisualizer +# Test the JointPlot Visualizer # # Author: Prema Damodaran Roman # Created: Mon Apr 10 21:00:54 2017 -0400 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers. # For license information, see LICENSE.txt # # ID: test_jointplot.py [9e008b0] pdamodaran@users.noreply.github.com $ """ -Test the JointPlotVisualizer. +Test joint plot visualization methods. These tests work differently depending on what version of matplotlib is -installed. If version 2.0.0 or greater is installed, then most tests will -execute, otherwise most will skip and only the warning will be tested. +installed. If version 2.0.2 or greater is installed, then most tests will +execute, otherwise the histogram tests will skip and only the warning will +be tested. """ ########################################################################## @@ -23,117 +24,471 @@ import sys import pytest -import warnings import numpy as np -import matplotlib as mpl -import matplotlib.pyplot as plt -from tests.dataset import DatasetMixin -from tests.base import VisualTestCase +from functools import partial +from unittest.mock import patch, MagicMock + +from sklearn.datasets import make_classification, make_regression + +from tests.base import IS_WINDOWS_OR_CONDA, VisualTestCase +from yellowbrick.exceptions import YellowbrickValueError from yellowbrick.features.jointplot import * +from ..fixtures import Dataset + +try: + # Only available in Matplotlib >= 2.0.2 + from mpl_toolkits.axes_grid1 import make_axes_locatable +except ImportError: + make_axes_locatable = None + +try: + import pandas as pd +except ImportError: + pd = None + + +########################################################################## +## Fixtures +########################################################################## + +# Random numpy array 
generators +rand1d = partial(np.random.rand, 120) +rand2col = partial(np.random.rand, 120, 2) +rand3col = partial(np.random.rand, 120, 3) + + +@pytest.fixture(scope="class") +def discrete(request): + """ + Creates a simple 2-column dataset with a discrete target. + """ + X, y = make_classification( + n_samples=120, + n_features=2, + n_informative=2, + n_redundant=0, + n_classes=3, + n_clusters_per_class=1, + random_state=2221, + ) + + request.cls.discrete = Dataset(X, y) + + +@pytest.fixture(scope="class") +def continuous(request): + """ + Creates a simple 2-column dataset with a continuous target. + """ + X, y = make_regression(n_samples=120, n_features=2, random_state=1112) + + request.cls.continuous = Dataset(X, y) + ########################################################################## -## JointPlotVisualizer Tests +## JointPlot Tests ########################################################################## -# Determine version of matplotlib -MPL_VERS_MAJ = int(mpl.__version__.split(".")[0]) +@pytest.mark.usefixtures("discrete", "continuous") +class TestJointPlotNoHistogram(VisualTestCase): + """ + Test the JointPlot visualizer without histograms + """ -class JointPlotTests(VisualTestCase, DatasetMixin): + def test_invalid_columns_values(self): + """ + Assert invalid columns arguments raise exception + """ + with pytest.raises(YellowbrickValueError, match="invalid for joint plot"): + JointPlot(columns=["a", "b", "c"], hist=False) + + def test_invalid_correlation_values(self): + """ + Assert invalid correlation arguments raise an exception + """ + with pytest.raises(YellowbrickValueError, match="invalid correlation method"): + JointPlot(correlation="foo", hist=False) - X = np.array([1, 2, 3, 5, 8, 10]) + def test_invalid_kind_values(self): + """ + Assert invalid kind arguments raise exception + """ + for bad_kind in ("foo", None, 123): + with pytest.raises(YellowbrickValueError, match="invalid joint plot kind"): + JointPlot(kind=bad_kind, hist=False) - y = 
np.array([1, 3, 6, 2, 9, 2]) + def test_invalid_hist_values(self): + """ + Assert invalid hist arguments raise exception + """ + for bad_hist in ("foo", 123): + with pytest.raises( + YellowbrickValueError, match="invalid argument for hist" + ): + JointPlot(hist=bad_hist) - def setUp(self): - self.concrete = self.load_data('concrete') + def test_no_haxes(self): + """ + Test that xhax and yhax are not available + """ + oz = JointPlot(hist=False) + with pytest.raises(AttributeError, match="histogram for the X axis"): + oz.xhax - def tearDown(self): - self.concrete = None + with pytest.raises(AttributeError, match="histogram for the Y axis"): + oz.yhax - @pytest.mark.skipif(MPL_VERS_MAJ > 1, reason="requires matplotlib 1.5.3 or less") - def test_warning(self): + @patch("yellowbrick.features.jointplot.plt") + def test_correlation(self, mplt): """ - Ensure that the jointplot warns if mpl version is < 2.0.0 + Test correlation is correctly computed """ - # Note Python 3.2+ has a self.assertWarns ... but we need to be - # Python 2.7 compatible, so we're going to do this. - with warnings.catch_warnings(record=True) as w: - # Cause all warnings to always be triggered. - warnings.simplefilter("always") + x = self.discrete.X[:, 0] + y = self.discrete.X[:, 1] - # Trigger a warning. 
- JointPlotVisualizer() + cases = ( + ("pearson", -0.3847799883805261), + ("spearman", -0.37301201472324463), + ("covariance", -0.5535440619953924), + ("kendalltau", -0.2504201680672269), + ) + + for alg, expected in cases: + oz = JointPlot(hist=False, correlation=alg, columns=None) + oz.ax = MagicMock() + oz.fit(x, y) + + assert hasattr(oz, "corr_") + assert oz.corr_ == pytest.approx( + expected + ), "{} not computed correctly".format(alg) + + def test_columns_none_invalid_x(self): + """ + When self.columns=None validate X and y + """ + bad_kws = ( + {"X": rand1d(), "y": None}, + {"X": rand3col(), "y": None}, + {"X": rand2col(), "y": rand1d()}, + {"X": rand3col(), "y": rand1d()}, + {"X": rand1d(), "y": rand2col()}, + ) + + for kws in bad_kws: + oz = JointPlot(columns=None, hist=False) + with pytest.raises( + YellowbrickValueError, match="when self.columns is None" + ): + oz.fit(**kws) + + def test_columns_none_x_y(self): + """ + When self.columns=None image similarity with valid X and y + """ + oz = JointPlot(hist=False, columns=None) + assert oz.fit(self.discrete.X[:, 0], self.discrete.y) is oz + assert hasattr(oz, "corr_") + + oz.finalize() + # Appveyor and Linux conda fail due to non-text-based differences + self.assert_images_similar(oz, tol=2.5) + + def test_columns_none_x(self): + """ + When self.columns=None image similarity with valid X, no y + """ + oz = JointPlot(hist=False, columns=None) + assert oz.fit(self.discrete.X) is oz + assert hasattr(oz, "corr_") + + oz.finalize() + tol = ( + 4.0 if sys.platform == "win32" else 0.01 + ) # Fails on AppVeyor with RMS 3.941 + self.assert_images_similar(oz, tol=tol) + + def test_columns_single_index_no_y(self): + """ + When self.columns=int or str y must not be None + """ + oz = JointPlot(columns="foo", hist=False) + with pytest.raises(YellowbrickValueError, match="y must be specified"): + oz.fit(rand2col(), y=None) + + def test_columns_single_invalid_index_numpy(self): + """ + When self.columns=int validate the 
index in X + """ + oz = JointPlot(columns=2, hist=False) + with pytest.raises(IndexError, match="could not index column '2' into type"): + oz.fit(self.continuous.X, self.continuous.y) + + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def test_columns_single_invalid_index_pandas(self): + """ + When self.columns=str validate the index in X + """ + oz = JointPlot(columns="foo", hist=False) + X = pd.DataFrame(self.continuous.X, columns=["a", "b"]) + y = pd.Series(self.continuous.y) + + with pytest.raises(IndexError, match="could not index column 'foo' into type"): + oz.fit(X, y) + + def test_columns_single_int_index_numpy(self): + """ + When self.columns=int image similarity on numpy dataset + """ + oz = JointPlot(columns=1, hist=False) + assert oz.fit(self.continuous.X, self.continuous.y) is oz + assert hasattr(oz, "corr_") - # Ensure that a warning occurred - self.assertEqual(len(w), 1) - self.assertEqual( - str(w[-1].message), - "JointPlotVisualizer requires matplotlib major version 2 " - "or greater. Please upgrade." 
- ) + oz.finalize() + # Appveyor and Linux conda failed based on non-text-based differences + self.assert_images_similar(oz, tol=5) + + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def test_columns_single_str_index_pandas(self): + """ + When self.columns=str image similarity on pandas dataset + """ + oz = JointPlot(columns="a", hist=False) + X = pd.DataFrame(self.continuous.X, columns=["a", "b"]) + y = pd.Series(self.continuous.y) + assert oz.fit(X, y) is oz + assert hasattr(oz, "corr_") + + oz.finalize() + # Appveyor and Linux conda failed based on non-text-based differences + self.assert_images_similar(oz, tol=5.5) + + def test_columns_double_int_index_numpy_no_y(self): + """ + When self.columns=[int, int] image similarity on numpy dataset no y + """ + oz = JointPlot(columns=[0, 1], hist=False) + assert oz.fit(self.discrete.X, y=None) is oz + assert hasattr(oz, "corr_") + + oz.finalize() + tol = ( + 4.0 if sys.platform == "win32" else 0.01 + ) # Fails on AppVeyor with RMS 3.941 + self.assert_images_similar(oz, tol=tol) + + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def test_columns_double_str_index_pandas_no_y(self): + """ + When self.columns=[str, str] image similarity on pandas dataset no y + """ + oz = JointPlot(columns=["a", "b"], hist=False) + X = pd.DataFrame(self.continuous.X, columns=["a", "b"]) + assert oz.fit(X, y=None) is oz + assert hasattr(oz, "corr_") + + oz.finalize() + tol = ( + 4.0 if sys.platform == "win32" else 0.01 + ) # Fails on AppVeyor with RMS 3.911 + self.assert_images_similar(oz, tol=tol) + + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def test_columns_double_index_discrete_y(self): + """ + When self.columns=[str, str] on DataFrame with discrete y + """ + oz = JointPlot(columns=["a", "b"], hist=False) + X = pd.DataFrame(self.discrete.X, columns=["a", "b"]) + y = pd.Series(self.discrete.y) + assert oz.fit(X, y) is oz + assert hasattr(oz, "corr_") + + oz.finalize() + tol = ( + 
4.0 if sys.platform == "win32" else 0.01 + ) # Fails on AppVeyor with RMS 3.940 + self.assert_images_similar(oz, tol=tol) + + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def test_columns_double_index_continuous_y(self): + """ + When self.columns=[str, str] on DataFrame with continuous y + """ + oz = JointPlot(columns=["a", "b"], hist=False) + X = pd.DataFrame(self.continuous.X, columns=["a", "b"]) + y = pd.Series(self.continuous.y) + assert oz.fit(X, y) is oz + assert hasattr(oz, "corr_") + + oz.finalize() + tol = ( + 4.0 if sys.platform == "win32" else 0.01 + ) # Fails on AppVeyor with RMS 3.911 + self.assert_images_similar(oz, tol=tol) + + +@pytest.mark.skipif( + make_axes_locatable is not None, reason="requires matplotlib <= 2.0.1" +) +def test_matplotlib_version_error(): + """ + Assert an exception is raised with incompatible matplotlib versions + """ + with pytest.raises(YellowbrickValueError): + JointPlot(hist=True) + + +@patch("yellowbrick.features.jointplot.make_axes_locatable", None) +def test_matplotlib_incompatibility(): + """ + Assert an exception is raised if make_axes_locatable is None + """ + with pytest.raises(YellowbrickValueError): + JointPlot(hist=True) + + +@pytest.mark.usefixtures("discrete", "continuous") +@pytest.mark.skipif(make_axes_locatable is None, reason="requires matplotlib >= 2.0.2") +class TestJointPlotHistogram(VisualTestCase): + """ + Test the JointPlot visualizer with histograms + """ + + def test_haxes_available(self): + """ + Test that xhax and yhax are available + """ + oz = JointPlot(hist=True) + assert oz.xhax is not None + assert oz.yhax is not None @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", ) - @pytest.mark.skipif(MPL_VERS_MAJ < 2, reason="requires matplotlib 2.0.0 or greater") - @pytest.mark.filterwarnings("ignore:internal gelsd driver") - def 
test_jointplot_has_no_errors(self): + def test_columns_none_x_y_hist(self): """ - Assert no errors occur during jointplot visualizer integration + When self.columns=None image similarity with valid X and y """ - fig = plt.figure() - ax = fig.add_subplot() + oz = JointPlot(hist=True, columns=None) + assert oz.fit(self.discrete.X[:, 0], self.discrete.y) is oz + assert hasattr(oz, "corr_") - visualizer = JointPlotVisualizer(ax=ax) - visualizer.fit(self.X, self.y) + oz.finalize() + self.assert_images_similar(oz) + + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", + ) + def test_columns_none_x_hist(self): + """ + When self.columns=None image similarity with valid X, no y + """ + oz = JointPlot(hist=True, columns=None) + assert oz.fit(self.discrete.X) is oz + assert hasattr(oz, "corr_") - self.assert_images_similar(visualizer, tol=10) + oz.finalize() + self.assert_images_similar(oz) @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", ) - @pytest.mark.skipif(MPL_VERS_MAJ < 2, reason="requires matplotlib 2.0.0 or greater") - def test_jointplot_integrated_has_no_errors(self): + def test_columns_single_int_index_numpy_hist(self): """ - Test jointplot on the concrete data set + When self.columns=int image similarity on numpy dataset """ + oz = JointPlot(columns=1, hist=True) + assert oz.fit(self.continuous.X, self.continuous.y) is oz + assert hasattr(oz, "corr_") - fig = plt.figure() - ax = fig.add_subplot() + oz.finalize() + self.assert_images_similar(oz) - # Load the data from the fixture - X = self.concrete['cement'] - y = self.concrete['strength'] - feature = 'cement' - target = 'strength' + @pytest.mark.skipif(pd is None, reason="test requires pandas") + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", + ) + def 
test_columns_single_str_index_pandas_hist(self): + """ + When self.columns=str image similarity on pandas dataset + """ + oz = JointPlot(columns="a", hist=True) + X = pd.DataFrame(self.continuous.X, columns=["a", "b"]) + y = pd.Series(self.continuous.y) + assert oz.fit(X, y) is oz + assert hasattr(oz, "corr_") - # Test the visualizer - visualizer = JointPlotVisualizer( - feature=feature, target=target, joint_plot="hex", ax=ax) - visualizer.fit(X, y) + oz.finalize() + self.assert_images_similar(oz) - self.assert_images_similar(visualizer, tol=15) + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", + ) + def test_columns_double_int_index_numpy_no_y_hist(self): + """ + When self.columns=[int, int] image similarity on numpy dataset no y + """ + oz = JointPlot(columns=[0, 1], hist=True) + assert oz.fit(self.discrete.X, y=None) is oz + assert hasattr(oz, "corr_") + + oz.finalize() + self.assert_images_similar(oz) + + @pytest.mark.skipif(pd is None, reason="test requires pandas") + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", + ) + def test_columns_double_str_index_pandas_no_y_hist(self): + """ + When self.columns=[str, str] image similarity on pandas dataset no y + """ + oz = JointPlot(columns=["a", "b"], hist=True) + X = pd.DataFrame(self.continuous.X, columns=["a", "b"]) + assert oz.fit(X, y=None) is oz + assert hasattr(oz, "corr_") + + oz.finalize() + self.assert_images_similar(oz) + + @pytest.mark.skipif(pd is None, reason="test requires pandas") + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", + ) + def test_columns_double_index_discrete_y_hist(self): + """ + When self.columns=[str, str] on DataFrame with discrete y + """ + oz = JointPlot(columns=["a", "b"], hist=True) + X = pd.DataFrame(self.discrete.X, columns=["a", "b"]) + y = pd.Series(self.discrete.y) + assert oz.fit(X, y) is 
oz + assert hasattr(oz, "corr_") + oz.finalize() + self.assert_images_similar(oz) - @pytest.mark.skipif(MPL_VERS_MAJ < 2, reason="requires matplotlib 2.0.0 or greater") - def test_jointplot_no_matplotlib2_warning(self): + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def test_columns_double_index_continuous_y_hist(self): """ - Assert no UserWarning occurs if matplotlib major version >= 2 + When self.columns=[str, str] on DataFrame with continuous y """ - with warnings.catch_warnings(record=True) as ws: - # Filter on UserWarnings - warnings.filterwarnings("always", category=UserWarning) - visualizer = JointPlotVisualizer() - visualizer.fit(self.X, self.y) - visualizer.finalize() + oz = JointPlot(columns=["a", "b"], hist=True) + X = pd.DataFrame(self.continuous.X, columns=["a", "b"]) + y = pd.Series(self.continuous.y) + assert oz.fit(X, y) is oz + assert hasattr(oz, "corr_") - # Filter out user warnings not related to matplotlib version - ver_warn_msg = "requires matplotlib major version 2 or greater" - mpl_ver_cnt = 0 - for w in ws: - if w and w.message and ver_warn_msg in str(w.message): - mpl_ver_cnt += 1 - self.assertEqual(0, mpl_ver_cnt, ws[-1].message \ - if ws else "No error") + oz.finalize() + # Appveyor and Linux conda failed due to non-text-based differences + self.assert_images_similar(oz, tol=4.0) diff --git a/tests/test_features/test_manifold.py b/tests/test_features/test_manifold.py index f9850c39e..499f44fff 100644 --- a/tests/test_features/test_manifold.py +++ b/tests/test_features/test_manifold.py @@ -4,7 +4,10 @@ # Author: Benjamin Bengfort # Created: Sat May 12 11:24:41 2018 -0400 # -# ID: test_manifold.py [] benjamin@bengfort.com $ +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: test_manifold.py [02f8c27] benjamin@bengfort.com $ """ Tests for the Manifold High Dimensional Visualizations @@ -18,21 +21,18 @@ from yellowbrick.features.manifold import * from 
yellowbrick.utils.types import is_estimator -from yellowbrick.exceptions import YellowbrickValueError +from yellowbrick.features.base import TargetType +from yellowbrick.exceptions import YellowbrickValueError, ModelError, NotFitted from sklearn.pipeline import Pipeline from sklearn.decomposition import PCA +from sklearn.datasets import make_blobs +from sklearn.datasets import make_s_curve from sklearn.manifold import LocallyLinearEmbedding -from sklearn.datasets.samples_generator import make_s_curve -from sklearn.datasets import make_classification, make_regression, make_blobs +from unittest.mock import patch from tests.base import VisualTestCase -try: - from unittest.mock import patch -except ImportError: - from mock import patch - try: import pandas as pd except ImportError: @@ -43,35 +43,55 @@ ## Manifold Visualizer Tests ########################################################################## + +@pytest.mark.usefixtures("s_curves", "discrete", "continuous") class TestManifold(VisualTestCase): """ Test Manifold visualizer """ - def test_manifold_construction(self): + @pytest.mark.parametrize( + "algorithm", + ["lle", "ltsa", "hessian", "modified", "isomap", "mds", "spectral", "tsne"], + ) + def test_manifold_construction(self, algorithm): """ Should be able to construct a manifold estimator from a string """ - # TODO: parametrize this once unittest.TestCase dependency removed. 
- algorithms = [ - "lle", "ltsa", "hessian", "modified", - "isomap", "mds", "spectral", "tsne", - ] - - for algorithm in algorithms: - message = "case failed for {}".format(algorithm) - params = { - "n_neighbors": 18, - "random_state": 53, - } - oz = Manifold(manifold=algorithm, **params) - assert is_estimator(oz.manifold), message - assert oz.manifold.get_params()["n_components"] == 2, message - - manifold_params = oz.manifold.get_params() - for param, value in params.items(): - if param in manifold_params: - assert value == manifold_params[param], message + message = "case failed for {}".format(algorithm) + params = {"n_neighbors": 18, "random_state": 53} + oz = Manifold(manifold=algorithm, **params) + assert is_estimator(oz.manifold), message + assert oz.manifold.get_params()["n_components"] == 2, message + + manifold_params = oz.manifold.get_params() + for param, value in params.items(): + if param in manifold_params: + assert value == manifold_params[param], message + + @pytest.mark.parametrize( + "algorithm", ["lle", "ltsa", "hessian", "modified", "isomap", "spectral"] + ) + def test_manifold_warning(self, algorithm): + """ + Should raise a warning if n_neighbors not specified + """ + message = "case failed for {}".format(algorithm) + n_neighbors = 6 if algorithm == "hessian" else 5 + + with pytest.warns(YellowbrickWarning): + oz = Manifold(manifold=algorithm) + assert oz.n_neighbors == n_neighbors, message + + @pytest.mark.parametrize("algorithm", ["mds", "tsne"]) + def test_manifold_no_warning(self, algorithm): + """ + Should not raise a warning if n_neighbors not specified + """ + message = "case failed for {}".format(algorithm) + + with pytest.warns(None) as record: + assert not record.list, message def test_bad_manifold_exception(self): """ @@ -84,186 +104,213 @@ def test_manifold_instance_construction(self): """ Should allow a sklearn.Estimator object to be set as manifold """ - manifold = Pipeline([ - ('pca', PCA(n_components=50)), - ('lle', 
LocallyLinearEmbedding(n_components=2)), - ]) + manifold = Pipeline( + [ + ("pca", PCA(n_components=50)), + ("lle", LocallyLinearEmbedding(n_components=2)), + ] + ) oz = Manifold(manifold=manifold) assert oz.manifold is manifold - @patch('yellowbrick.features.manifold.Manifold.fit_transform', spec=True) - def test_manifold_fit(self, mock_fit_transform): + @pytest.mark.filterwarnings("ignore:Conversion of the second argument") + @pytest.mark.parametrize( + "algorithm", ["lle", "ltsa", "hessian", "modified", "isomap"] + ) + def test_manifold_algorithm_transform_fit(self, algorithm): """ - Test manifold fit method + Test manifold fit with algorithms having transform implemented """ - X, y = make_s_curve(1000, random_state=888) - manifold = Manifold(target="auto") + X, y = make_s_curve(1000, random_state=94) + with pytest.warns(YellowbrickWarning): + manifold = Manifold(manifold=algorithm, target="auto") assert manifold.fit(X, y) is manifold, "fit did not return self" - mock_fit_transform.assert_called_once() - @patch('yellowbrick.features.manifold.Manifold.draw', spec=True) - def test_manifold_fit_transform(self, mock_draw): + @pytest.mark.filterwarnings("ignore:Conversion of the second argument") + @pytest.mark.parametrize("algorithm", ["mds", "spectral", "tsne"]) + def test_manifold_algorithm_no_transform_fit(self, algorithm): + """ + Test manifold fit with algorithms not having transform implemented + """ + X, y = self.s_curves + msg = "requires data to be simultaneously fit and transformed" + oz = Manifold(manifold=algorithm, n_neighbors=10, random_state=223) + with pytest.raises(ModelError, match=msg): + oz.fit(X) + + @patch("yellowbrick.features.manifold.Manifold.draw", spec=True) + @pytest.mark.parametrize("projection", [2, 3]) + def test_manifold_fit_transform(self, mock_draw, projection): """ Test manifold fit_transform method """ - X, y = make_s_curve(1000, random_state=888) - manifold = Manifold(target="auto") + X, y = self.s_curves + manifold = 
Manifold(target="auto", projection=projection) - assert not hasattr(manifold, 'fit_time_') + assert not hasattr(manifold, "fit_time_") Xp = manifold.fit_transform(X, y) - assert Xp.shape == (X.shape[0], 2) + assert Xp.shape == (X.shape[0], projection) mock_draw.assert_called_once() - assert hasattr(manifold, 'fit_time_') - assert manifold._target_color_type == CONTINUOUS + assert hasattr(manifold, "fit_time_") + assert manifold._target_color_type == TargetType.CONTINUOUS - @pytest.mark.filterwarnings("ignore:Conversion of the second argument") - def test_manifold_classification(self): + @patch("yellowbrick.features.manifold.Manifold.fit_transform", spec=True) + @patch("yellowbrick.features.manifold.Manifold.draw", spec=True) + @pytest.mark.parametrize("projection", [2, 3]) + def test_manifold_transform(self, mock_draw, mock_fit_transform, projection): """ - Image similarity test for classification dataset (discrete y) + Test manifold transform method """ - X, y = make_classification( - n_samples=300, n_features=7, n_informative=4, n_redundant=2, - n_classes=4, n_clusters_per_class=2, random_state=78 + X, y = self.s_curves + manifold = Manifold( + manifold="lle", target="auto", n_neighbors=5, projection=projection ) - oz = Manifold(manifold="spectral", target="discrete", random_state=108) - assert not hasattr(oz, 'classes_') + manifold.fit(X, y) + Xp = manifold.transform(X, y) + assert Xp.shape == (X.shape[0], projection) - oz.fit(X, y) - - assert hasattr(oz, 'classes_') - assert not hasattr(oz, 'range_') - self.assert_images_similar(oz, tol=0.5) + mock_draw.assert_called_once() - def test_manifold_regression(self): + def test_manifold_no_transform(self): """ - Image similarity test for regression dataset (continuous y) + Test the exception when manifold doesn't implement transform. 
""" - X, y = make_regression( - n_samples=300, n_features=7, n_informative=4, random_state=87 - ) - - oz = Manifold(manifold="tsne", target="continuous", random_state=1) - assert not hasattr(oz, 'range_') + X, _ = self.s_curves + manifold = Manifold(manifold="lle", n_neighbors=5, target="auto") - oz.fit(X, y) + msg = "instance is not fitted yet, please call fit" + with pytest.raises(NotFitted, match=msg): + manifold.transform(X) - assert not hasattr(oz, 'classes_') - assert hasattr(oz, 'range_') - self.assert_images_similar(oz, tol=1.5) + @patch("yellowbrick.features.manifold.Manifold.fit", spec=True) + @pytest.mark.parametrize("manifolds", ["mds", "spectral", "tsne"]) + def test_manifold_assert_no_transform(self, mock_fit, manifolds): + """ + Assert that transform raises error when MDS, TSNE or Spectral Embedding algorithms are used. + """ + X, _ = self.s_curves + manifold = Manifold(manifold=manifolds, target="auto", n_neighbors=10) + mock_fit(X) + msg = "requires data to be simultaneously fit and transformed" + with pytest.raises(ModelError, match=msg): + manifold.transform(X) - def test_manifold_single(self): + @pytest.mark.filterwarnings("ignore:Conversion of the second argument") + def test_manifold_classification(self): """ - Image similarity test for simple dataset (no y) + Image similarity test for classification dataset (discrete y) """ - X, _ = make_blobs( - n_samples=300, n_features=7, centers=3, random_state=1112, + X, y = self.discrete + + oz = Manifold( + manifold="spectral", target="discrete", n_neighbors=5, random_state=108 ) + assert not hasattr(oz, "classes_") - oz = Manifold(manifold="mds", random_state=139973) - oz.fit(X) + oz.fit_transform(X, y) - self.assert_images_similar(oz, tol=5.0) + assert hasattr(oz, "classes_") + assert not hasattr(oz, "range_") + self.assert_images_similar(oz, tol=0.5) - @pytest.mark.skipif(pd is None, reason="requires pandas") - def test_manifold_pandas(self): + def test_manifold_classification_3d(self): """ - Test 
manifold on a dataset made up of a pandas DataFrame and Series + Image similarity test for classification dataset (discrete y) """ - X, y = make_s_curve(200, random_state=888) - - X = pd.DataFrame(X) - y = pd.Series(y) + X, y = self.discrete oz = Manifold( - manifold='ltsa', colors='nipy_spectral', - target='continuous', random_state=223 - ).fit(X, y) + manifold="spectral", + target="discrete", + n_neighbors=5, + random_state=108, + projection=3, + ) - # TODO: find a way to decrease this tolerance - self.assert_images_similar(oz, tol=35) + assert not hasattr(oz, "classes_") - @pytest.mark.filterwarnings("ignore:Conversion of the second argument") - def test_manifold_algorithm_fit(self): + oz.fit_transform(X, y) + + assert hasattr(oz, "classes_") + assert not hasattr(oz, "range_") + self.assert_images_similar(oz) + + def test_manifold_regression(self): """ - Test that all algorithms can be fitted correctly + Image similarity test for regression dataset (continuous y) """ - # TODO: parametrize this once unittest.TestCase dependency removed. 
- algorithms = [ - "lle", "ltsa", "hessian", "modified", - "isomap", "mds", "spectral", "tsne", - ] + X, y = self.continuous - X, y = make_s_curve(200, random_state=888) + oz = Manifold(manifold="tsne", target="continuous", random_state=1) + assert not hasattr(oz, "range_") - for algorithm in algorithms: - oz = Manifold(manifold=algorithm, random_state=223) - oz.fit(X, y) + oz.fit_transform(X, y) + oz.finalize() + assert not hasattr(oz, "classes_") + assert hasattr(oz, "range_") + self.assert_images_similar(oz, tol=1.5) - def test_determine_target_color_type(self): + def test_manifold_regression_3d(self): """ - Check that the target type is determined by a value y + Image similarity test for regression dataset (continuous y) """ - manifold = Manifold() - - # Check default is auto - assert manifold.target == AUTO + X, y = self.continuous - # Assert single when y is None - manifold._determine_target_color_type(None) - assert manifold._target_color_type == SINGLE + oz = Manifold( + manifold="tsne", target="continuous", random_state=1, projection=3 + ) + assert not hasattr(oz, "range_") - # Check when y is continuous - y = np.random.rand(100) - manifold._determine_target_color_type(y) - assert manifold._target_color_type == CONTINUOUS + oz.fit_transform(X, y) + oz.finalize() + oz.cbar.set_ticks([]) + assert not hasattr(oz, "classes_") + assert hasattr(oz, "range_") + self.assert_images_similar(oz, tol=15) - # Check when y is discrete - y = np.random.choice(['a', 'b', 'c', 'd'], 100) - manifold._determine_target_color_type(y) - assert manifold._target_color_type == DISCRETE + def test_manifold_single(self): + """ + Image similarity test for simple dataset (no y) + """ + X, _ = make_blobs(n_samples=300, n_features=7, centers=3, random_state=1112) - # Check when default is set to continuous and discrete data passed in - manifold = Manifold(target=CONTINUOUS) - y = np.random.choice(['a', 'b', 'c', 'd'], 100) - manifold._determine_target_color_type(y) - assert 
manifold._target_color_type == CONTINUOUS + oz = Manifold(manifold="mds", random_state=139973) + oz.fit_transform(X) - # Check when default is set to discrete and continuous data passed in - manifold = Manifold(target=DISCRETE) - y = np.random.rand(100) - manifold._determine_target_color_type(y) - assert manifold._target_color_type == DISCRETE + self.assert_images_similar(oz) - # None overrides specified target - manifold = Manifold(target=CONTINUOUS) - manifold._determine_target_color_type(None) - assert manifold._target_color_type == SINGLE + def test_manifold_single_3d(self): + """ + Image similarity test for simple dataset (no y) + """ + X, _ = make_blobs(n_samples=300, n_features=7, centers=3, random_state=1112) - # None overrides specified target - manifold = Manifold(target=DISCRETE) - manifold._determine_target_color_type(None) - assert manifold._target_color_type == SINGLE + oz = Manifold(manifold="mds", random_state=139973, projection=3) + oz.fit_transform(X) - # Bad target raises exception - # None overrides specified target - manifold = Manifold(target="foo") - msg = "could not determine target color type" - with pytest.raises(YellowbrickValueError, match=msg): - manifold._determine_target_color_type([]) + self.assert_images_similar(oz) - def test_manifold_no_transform(self): + @pytest.mark.skipif(pd is None, reason="requires pandas") + def test_manifold_pandas(self): """ - Test the exception when manifold doesn't implement transform. 
+ Test manifold on a dataset made up of a pandas DataFrame and Series """ - X, _ = make_s_curve(1000, random_state=888) - manifold = Manifold(manifold='mds', target="auto") + X, y = self.s_curves - assert not hasattr(manifold._manifold, 'transform') - - with pytest.raises(AttributeError, match="try using fit_transform instead"): - manifold.transform(X) + oz = Manifold( + manifold="ltsa", + colormap="nipy_spectral", + n_neighbors=10, + target="continuous", + random_state=223, + ) + oz.fit_transform(X, y) + oz.finalize() + oz.cbar.set_ticks([]) + # TODO: find a way to decrease this tolerance + self.assert_images_similar(oz, tol=40) diff --git a/tests/test_features/test_pca.py b/tests/test_features/test_pca.py index ed56753f4..ee8242230 100644 --- a/tests/test_features/test_pca.py +++ b/tests/test_features/test_pca.py @@ -2,11 +2,14 @@ # tests.test_features.test_pca # Tests for the PCA based feature visualizer. # -# Author: Carlo Morales <@cjmorale> -# Author: Raúl Peralta Lozada <@RaulPL> -# Author: Benjamin Bengfort <@bbengfort> +# Author: Carlo Morales +# Author: Raúl Peralta Lozada +# Author: Benjamin Bengfort # Created: Tue May 23 18:34:27 2017 -0400 # +# Copyright (C) 2017 The scikit-yb developers. 
+# For license information, see LICENSE.txt +# # ID: test_pca.py [] cmorales@pacificmetrics.com $ """ @@ -20,171 +23,342 @@ import sys import pytest import numpy as np +import numpy.testing as npt + +from unittest import mock +from tests.base import VisualTestCase, IS_WINDOWS_OR_CONDA -from tests.dataset import Dataset -from tests.base import VisualTestCase from yellowbrick.features.pca import * -from yellowbrick.exceptions import YellowbrickError -from sklearn.datasets import make_classification +from yellowbrick.exceptions import YellowbrickError, NotFitted + +# Note: this can be removed when we deprecate mpl in #826 +try: + # Only available in Matplotlib >= 2.0.2 + from mpl_toolkits.axes_grid1 import make_axes_locatable +except ImportError: + make_axes_locatable = None ########################################################################## -## Fixtures +# PCA Tests ########################################################################## -@pytest.fixture(scope='class') -def binary(request): + +@pytest.mark.usefixtures("discrete", "continuous") +class TestPCA(VisualTestCase): """ - Creates a fixture of train and test splits for the sklearn digits dataset - For ease of use returns a Dataset named tuple composed of two Split tuples. + Test the PCA visualizer """ - X, y = make_classification( - n_samples=400, n_features=12, n_informative=8, n_redundant=0, - n_classes=2, n_clusters_per_class=1, class_sep=1.8, random_state=854, - scale=[14.2, 2.1, 0.32, 0.001, 32.3, 44.1, 102.3, 2.3, 2.4, 38.2, 0.05, 1.0], - ) - # Set a class attribute for digits - request.cls.dataset = Dataset(X, y) + def test_single(self): + """ + Test single target. 
+ """ + visualizer = PCA(random_state=1998) + visualizer.fit(self.continuous.X) + visualizer.transform(self.continuous.X) + assert not hasattr(visualizer, "classes_") + assert not hasattr(visualizer, "range_") + self.assert_images_similar(visualizer) + @pytest.mark.xfail(IS_WINDOWS_OR_CONDA, reason="RMS of 10.205 on miniconda") + def test_continuous(self): + """ + Test continuous target + """ + visualizer = PCA(colormap="YlOrRd", random_state=2019) + assert not hasattr(visualizer, "range_") + visualizer.fit(*self.continuous) + visualizer.transform(*self.continuous) + assert hasattr(visualizer, "range_") + assert not hasattr(visualizer, "classes_") + visualizer.finalize() -########################################################################## -##PCA Tests -########################################################################## + visualizer.cax.set_yticklabels([]) -@pytest.mark.usefixtures("binary") -class PCADecompositionTests(VisualTestCase): - """ - Test the PCADecomposition visualizer - """ + # AppVeyor tests fail with RMS 10.085 + self.assert_images_similar(visualizer, windows_tol=10.5) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows (RMSE=4)" - ) + def test_discrete(self): + """ + Test discrete target. + """ + classes = ["a", "b", "c", "d", "e"] + colors = ["r", "b", "g", "m", "c"] + + visualizer = PCA(colors=colors, classes=classes, random_state=83) + assert not hasattr(visualizer, "classes_") + visualizer.fit(*self.discrete) + assert hasattr(visualizer, "classes_") + assert not hasattr(visualizer, "range_") + visualizer.transform(*self.discrete) + + # Make sure that classes are set correctly. + npt.assert_array_equal(visualizer.classes_, classes) + + self.assert_images_similar(visualizer) + + def test_fit(self): + """ + Test that fit returns self. 
+ """ + pca = PCA() + assert pca.fit(*self.discrete) is pca + + @pytest.mark.parametrize("n_components", [2, 3]) + def test_transform(self, n_components): + Xprime = PCA(projection=n_components).fit_transform(*self.continuous) + assert Xprime.shape == (500, n_components) + + def test_transform_without_fit(self): + """ + Test that appropriate error is raised when transform called without fit. + """ + oz = PCA(projection=3) + msg = "instance is not fitted yet, please call fit" + with pytest.raises(NotFitted, match=msg): + oz.transform(*self.continuous) + + @pytest.mark.xfail(IS_WINDOWS_OR_CONDA, reason="RMS of 12.115 on miniconda") def test_pca_decomposition_quick_method(self): """ - Test the quick method PCADecomposition visualizer 2 dimensions scaled. + Test the quick method PCA visualizer 2 dimensions scaled. """ - ax = pca_decomposition( - X=self.dataset.X, proj_dim=2, scale=True, random_state=28 + visualizer = pca_decomposition( + *self.discrete, projection=2, scale=True, random_state=28 ) - self.assert_images_similar(ax=ax) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows (RMSE=?)" - ) + # AppVeyor tests fail with RMS 12.115 + self.assert_images_similar(visualizer, windows_tol=12.5) + def test_scale_true_2d(self): """ - Test the PCADecomposition visualizer 2 dimensions scaled. + Test the PCA visualizer 2 dimensions scaled. 
""" - params = {'scale': True, 'proj_dim': 2, 'random_state': 9932} - visualizer = PCADecomposition(**params).fit(self.dataset.X) - pca_array = visualizer.transform(self.dataset.X) + params = {"scale": True, "projection": 2, "random_state": 9932} + visualizer = PCA(**params).fit(*self.discrete) + pca_array = visualizer.transform(*self.discrete) # Image comparison tests self.assert_images_similar(visualizer) # Assert PCA transformation occurred successfully - assert pca_array.shape == (self.dataset.X.shape[0], 2) + assert pca_array.shape == (self.discrete.X.shape[0], 2) + @pytest.mark.xfail(IS_WINDOWS_OR_CONDA, reason="RMS of 8.828 on miniconda") def test_scale_false_2d(self): """ - Test the PCADecomposition visualizer 2 dimensions non-scaled. + Test the PCA visualizer 2 dimensions non-scaled. """ - params = {'scale': False, 'proj_dim': 2, 'random_state': 1229} - visualizer = PCADecomposition(**params).fit(self.dataset.X) - pca_array = visualizer.transform(self.dataset.X) - + params = {"scale": False, "projection": 2, "random_state": 1229} + visualizer = PCA(**params).fit(*self.continuous) + pca_array = visualizer.transform(*self.continuous) + visualizer.finalize() + visualizer.cax.set_yticklabels([]) # Image comparison tests - self.assert_images_similar(visualizer) + # AppVeyor tests fail with RMS 8.180 + self.assert_images_similar(visualizer, tol=0.03, windows_tol=8.5) # Assert PCA transformation occurred successfully - assert pca_array.shape == (self.dataset.X.shape[0], 2) + assert pca_array.shape == (self.continuous.X.shape[0], 2) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows (RMSE=3)" - ) def test_biplot_2d(self): """ - Test the PCADecomposition 2D biplot (proj_features). + Test the PCA 2D biplot (proj_features). 
""" params = { - 'features': 'ABCDEFGHIKLM', 'random_state': 67, - 'proj_features': True, 'proj_dim': 2, + "features": list("ABCDEFGHIKLM"), + "random_state": 67, + "proj_features": True, + "projection": 2, } - visualizer = PCADecomposition(**params).fit(self.dataset.X) - pca_array = visualizer.transform(self.dataset.X) + visualizer = PCA(**params).fit(self.discrete.X) + pca_array = visualizer.transform(self.discrete.X) # Image comparison tests - self.assert_images_similar(visualizer) + self.assert_images_similar(visualizer, tol=5) # Assert PCA transformation occurred successfully - assert pca_array.shape == (self.dataset.X.shape[0], 2) + assert pca_array.shape == (self.discrete.X.shape[0], 2) def test_scale_true_3d(self): """ - Test the PCADecomposition visualizer 3 dimensions scaled. + Test the PCA visualizer 3 dimensions scaled. """ - params = {'scale': True, 'proj_dim': 3, 'random_state': 7382} - visualizer = PCADecomposition(**params).fit(self.dataset.X) - pca_array = visualizer.transform(self.dataset.X) + params = {"scale": True, "projection": 3, "random_state": 7382} + visualizer = PCA(**params).fit(self.discrete.X) + pca_array = visualizer.transform(self.discrete.X) # Image comparison tests self.assert_images_similar(visualizer) # Assert PCA transformation occurred successfully - assert pca_array.shape == (self.dataset.X.shape[0], 3) + assert pca_array.shape == (self.discrete.X.shape[0], 3) def test_scale_false_3d(self): """ - Test the PCADecomposition visualizer 3 dimensions non-scaled. + Test the PCA visualizer 3 dimensions non-scaled. 
""" - params = {'scale': False, 'proj_dim': 3, 'random_state': 98} - visualizer = PCADecomposition(**params).fit(self.dataset.X) - pca_array = visualizer.transform(self.dataset.X) + params = {"scale": False, "projection": 3, "random_state": 98} + visualizer = PCA(**params).fit(self.discrete.X) + pca_array = visualizer.transform(self.discrete.X) # Image comparison tests self.assert_images_similar(visualizer) # Assert PCA transformation occurred successfully - assert pca_array.shape == (self.dataset.X.shape[0], 3) + assert pca_array.shape == (self.discrete.X.shape[0], 3) @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows (RMSE=3)" + sys.platform == "win32", reason="images not close on windows (RMSE=3)" ) def test_biplot_3d(self): """ - Test the PCADecomposition 3D biplot (proj_features). + Test the PCA 3D biplot (proj_features). """ params = { - 'features': 'ABCDEFGHIKLM', 'random_state': 800, - 'proj_features': True, 'proj_dim': 3, + "features": list("ABCDEFGHIKLM"), + "random_state": 800, + "proj_features": True, + "projection": 3, } - visualizer = PCADecomposition(**params).fit(self.dataset.X) - pca_array = visualizer.transform(self.dataset.X) + visualizer = PCA(**params).fit(*self.discrete) + pca_array = visualizer.transform(*self.discrete) # Image comparison tests - self.assert_images_similar(visualizer) + self.assert_images_similar(visualizer, tol=5) # Assert PCA transformation occurred successfully - assert pca_array.shape == (self.dataset.X.shape[0], 3) + assert pca_array.shape == (self.discrete.X.shape[0], 3) - def test_scale_true_4d_execption(self): + def test_scale_true_4d_exception(self): """ - Test the PCADecomposition visualizer 4 dimensions scaled (catch YellowbrickError). + Test PCA visualizer 4 dimensions scaled (catch YellowbrickError). 
""" - params = {'scale': True, 'proj_dim': 4} - with pytest.raises(YellowbrickError, match="proj_dim object is not 2 or 3"): - PCADecomposition(**params) + params = {"scale": True, "projection": 4} + msg = "Projection dimensions must be either 2 or 3" + with pytest.raises(YellowbrickError, match=msg): + PCA(**params) - def test_scale_true_3d_execption(self): + def test_scale_true_3d_exception(self): """ - Test the PCADecomposition visualizer 3 dims scaled on 2 dim data set (catch ValueError). + Test PCA visualizer 3 dims scaled on 2 dim data set (catch ValueError). """ X = np.random.normal(loc=2, size=(100, 2)) - params = {'scale': True, 'proj_dim': 3} + params = {"scale": True, "projection": 3} - e = r'n_components=3 must be between 0 and min\(n_samples, n_features\)=2' + e = r"n_components=3 must be between 0 and min\(n_samples, n_features\)=2" with pytest.raises(ValueError, match=e): - pca = PCADecomposition(**params) + pca = PCA(**params) pca.fit(X) + + @mock.patch("yellowbrick.features.pca.plt.sca", autospec=True) + def test_alpha_param(self, mock_sca): + """ + Test that the user can supply an alpha param on instantiation + """ + # Instantiate a prediction error plot, provide custom alpha + params = {"alpha": 0.3, "projection": 2, "random_state": 9932} + visualizer = PCA(**params).fit(self.discrete.X) + pca_array = visualizer.transform(self.discrete.X) + assert visualizer.alpha == 0.3 + + visualizer.ax = mock.MagicMock() + visualizer.fit(self.discrete.X) + visualizer.transform(self.discrete.X) + + # Test that alpha was passed to internal matplotlib scatterplot + _, scatter_kwargs = visualizer.ax.scatter.call_args + assert "alpha" in scatter_kwargs + assert scatter_kwargs["alpha"] == 0.3 + assert pca_array.shape == (self.discrete.X.shape[0], 2) + + @pytest.mark.xfail(IS_WINDOWS_OR_CONDA, reason="RMS of 7.332 on miniconda") + def test_colorbar(self): + """ + Test the PCA visualizer's colorbar features. 
+ """ + params = { + "scale": True, + "projection": 2, + "random_state": 7382, + "color": self.discrete.y, + "colorbar": True, + } + visualizer = PCA(**params).fit(*self.continuous) + visualizer.transform(self.continuous.X, self.continuous.y) + visualizer.finalize() + visualizer.cax.set_yticklabels([]) + + # Image comparison tests + # AppVeyor tests fail with RMS of 7.280 + self.assert_images_similar(visualizer, windows_tol=7.5) + + @pytest.mark.xfail(IS_WINDOWS_OR_CONDA, reason="RMS of 14.515 on miniconda") + def test_heatmap(self): + """ + Test the PCA visualizer's heatmap features. + """ + params = { + "scale": True, + "projection": 2, + "random_state": 7382, + "color": self.discrete.y, + "heatmap": True, + } + visualizer = PCA(**params).fit(self.discrete.X, self.discrete.y) + visualizer.transform(self.discrete.X, self.discrete.y) + visualizer.finalize() + # TODO: manually modifying ticks should be removed after #916 is fixed + visualizer.lax.set_xticks([]) + visualizer.lax.set_yticks([]) + visualizer.lax.set_xticks([], minor=True) + visualizer.uax.set_xticklabels([]) + + # Image comparison tests + # AppVeyor tests fail with RMS 14.492 + self.assert_images_similar(visualizer, windows_tol=14.5) + + @pytest.mark.xfail(IS_WINDOWS_OR_CONDA, reason="RMS of 10.987 on miniconda") + def test_colorbar_heatmap(self): + """ + Test the PCA visualizer with both colorbar and heatmap. 
+ """ + params = { + "scale": True, + "projection": 2, + "random_state": 7382, + "color": self.discrete.y, + "colorbar": True, + "heatmap": True, + } + visualizer = PCA(**params).fit(self.continuous.X, self.continuous.y) + visualizer.transform(self.continuous.X, self.continuous.y) + visualizer.finalize() + # TODO: manually modifying ticks should be removed after #916 is fixed + visualizer.lax.set_xticks([]) + visualizer.lax.set_yticks([]) + visualizer.lax.set_xticks([], minor=True) + visualizer.uax.set_xticklabels([]) + visualizer.cax.set_yticklabels([]) + + # Image comparison tests + # AppVeyor tests fail with RMS 10.331 + self.assert_images_similar(visualizer, windows_tol=10.5) + + def test_3d_heatmap_enabled_error(self): + """ + Assert an exception if colorbar and heatmap is enabled with 3-dimensions + """ + with pytest.raises(YellowbrickValueError): + PCA(projection=3, heatmap=True) + + @pytest.mark.skipif( + make_axes_locatable is not None, reason="requires matplotlib <= 2.0.1" + ) + def test_matplotlib_version_error(): + """ + Assert an exception is raised with incompatible matplotlib versions + """ + with pytest.raises(YellowbrickValueError): + PCA(colorbar=True, heatmap=True) diff --git a/tests/test_features/test_pcoords.py b/tests/test_features/test_pcoords.py index e66bf18de..f26824aa4 100644 --- a/tests/test_features/test_pcoords.py +++ b/tests/test_features/test_pcoords.py @@ -1,11 +1,11 @@ # tests.test_features.test_pcoords # Testing for the parallel coordinates feature visualizers # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Author: @thekylesaurus # Created: Thu Oct 06 11:21:27 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2017 The scikit-yb developers. 
# For license information, see LICENSE.txt # # ID: test_pcoords.py [1d407ab] benjamin@bengfort.com $ @@ -21,10 +21,11 @@ import pytest import numpy as np +from yellowbrick.datasets import load_occupancy from yellowbrick.features.pcoords import * from tests.base import VisualTestCase -from tests.dataset import DatasetMixin, Dataset +from ..fixtures import Dataset from sklearn.datasets import make_classification @@ -38,28 +39,36 @@ ## Fixtures ########################################################################## -@pytest.fixture(scope='class') + +@pytest.fixture(scope="class") def dataset(request): """ Creates a random multiclass classification dataset fixture """ X, y = make_classification( - n_samples=200, n_features=5, n_informative=4, n_redundant=0, - n_classes=3, n_clusters_per_class=1, random_state=451, flip_y=0, - class_sep=3, scale=np.array([1.0, 2.0, 100.0, 20.0, 1.0]) + n_samples=200, + n_features=5, + n_informative=4, + n_redundant=0, + n_classes=3, + n_clusters_per_class=1, + random_state=451, + flip_y=0, + class_sep=3, + scale=np.array([1.0, 2.0, 100.0, 20.0, 1.0]), ) dataset = Dataset(X, y) request.cls.dataset = dataset - ########################################################################## ## Parallel Coordinates Tests ########################################################################## -@pytest.mark.usefixtures('dataset') -class TestParallelCoordinates(VisualTestCase, DatasetMixin): + +@pytest.mark.usefixtures("dataset") +class TestParallelCoordinates(VisualTestCase): """ Test the ParallelCoordinates visualizer """ @@ -70,7 +79,7 @@ def test_parallel_coords(self): """ visualizer = ParallelCoordinates() visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer, tol=0.25) def test_parallel_coords_fast(self): @@ -79,7 +88,7 @@ def test_parallel_coords_fast(self): """ visualizer = ParallelCoordinates(fast=True) visualizer.fit_transform(self.dataset.X, 
self.dataset.y) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer, tol=0.25) def test_alpha(self): @@ -88,7 +97,7 @@ def test_alpha(self): """ visualizer = ParallelCoordinates(alpha=1.0) visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer, tol=0.25) def test_alpha_fast(self): @@ -97,7 +106,7 @@ def test_alpha_fast(self): """ visualizer = ParallelCoordinates(alpha=1.0, fast=True) visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer, tol=0.25) def test_labels(self): @@ -105,10 +114,10 @@ def test_labels(self): Test image closeness when class and feature labels are supplied """ visualizer = ParallelCoordinates( - classes=['a', 'b', 'c'], features=['f1', 'f2', 'f3', 'f4', 'f5'] + classes=["a", "b", "c"], features=["f1", "f2", "f3", "f4", "f5"] ) visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer) def test_labels_fast(self): @@ -116,46 +125,46 @@ def test_labels_fast(self): Test image closeness when class and feature labels are supplied in fast mode """ visualizer = ParallelCoordinates( - classes=['a', 'b', 'c'], features=['f1', 'f2', 'f3', 'f4', 'f5'], fast=True + classes=["a", "b", "c"], features=["f1", "f2", "f3", "f4", "f5"], fast=True ) visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer) def test_normalized_l2(self): """ Test image closeness on l2 normalized 3 class dataset """ - visualizer = ParallelCoordinates(normalize='l2') + visualizer = ParallelCoordinates(normalize="l2") visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer, tol=0.25) def test_normalized_l2_fast(self): """ Test image closeness on l2 
normalized 3 class dataset in fast mode """ - visualizer = ParallelCoordinates(normalize='l2', fast=True) + visualizer = ParallelCoordinates(normalize="l2", fast=True) visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer, tol=0.25) def test_normalized_minmax(self): """ Test image closeness on minmax normalized 3 class dataset """ - visualizer = ParallelCoordinates(normalize='minmax') + visualizer = ParallelCoordinates(normalize="minmax") visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer, tol=0.25) def test_normalized_minmax_fast(self): """ Test image closeness on minmax normalized 3 class dataset in fast mode """ - visualizer = ParallelCoordinates(normalize='minmax', fast=True) + visualizer = ParallelCoordinates(normalize="minmax", fast=True) visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer, tol=0.25) @pytest.mark.skipif(pd is None, reason="test requires pandas") @@ -163,24 +172,41 @@ def test_pandas_integration_sampled(self): """ Test on a real dataset with pandas DataFrame and Series sampled for speed """ - df = self.load_pandas("occupancy") - - target = "occupancy" - features = [ - 'temperature', 'relative humidity', 'light', 'C02', 'humidity' + data = load_occupancy(return_dataset=True) + X, y = data.to_pandas() + classes = [ + k for k, _ in sorted(data.meta["labels"].items(), key=lambda i: i[1]) ] - X = df[features] - y = pd.Series([ - 'occupied' if yi == 1 else 'unoccupied' for yi in df[target] - ]) - assert isinstance(X, pd.DataFrame) assert isinstance(y, pd.Series) - oz = ParallelCoordinates(sample=0.05, shuffle=True, random_state=4291) + oz = ParallelCoordinates( + sample=0.05, shuffle=True, random_state=4291, classes=classes + ) + oz.fit_transform(X, y) + oz.finalize() + + 
self.assert_images_similar(oz, tol=0.1) + + def test_numpy_integration_sampled(self): + """ + Ensure visualizer works in default case with numpy arrays and sampling + """ + data = load_occupancy(return_dataset=True) + X, y = data.to_numpy() + classes = [ + k for k, _ in sorted(data.meta["labels"].items(), key=lambda i: i[1]) + ] + + assert isinstance(X, np.ndarray) + assert isinstance(y, np.ndarray) + + oz = ParallelCoordinates( + sample=0.05, shuffle=True, random_state=4291, classes=classes + ) oz.fit_transform(X, y) - oz.poof() + oz.finalize() self.assert_images_similar(oz, tol=0.1) @@ -189,24 +215,37 @@ def test_pandas_integration_fast(self): """ Test on a real dataset with pandas DataFrame and Series in fast mode """ - df = self.load_pandas("occupancy") - - target = "occupancy" - features = [ - 'temperature', 'relative humidity', 'light', 'C02', 'humidity' + data = load_occupancy(return_dataset=True) + X, y = data.to_pandas() + classes = [ + k for k, _ in sorted(data.meta["labels"].items(), key=lambda i: i[1]) ] - X = df[features] - y = pd.Series([ - 'occupied' if yi == 1 else 'unoccupied' for yi in df[target] - ]) - assert isinstance(X, pd.DataFrame) assert isinstance(y, pd.Series) - oz = ParallelCoordinates(fast=True) + oz = ParallelCoordinates(fast=True, classes=classes) + oz.fit_transform(X, y) + oz.finalize() + + self.assert_images_similar(oz, tol=0.1) + + def test_numpy_integration_fast(self): + """ + Ensure visualizer works in default case with numpy arrays and fast mode + """ + data = load_occupancy(return_dataset=True) + X, y = data.to_numpy() + classes = [ + k for k, _ in sorted(data.meta["labels"].items(), key=lambda i: i[1]) + ] + + assert isinstance(X, np.ndarray) + assert isinstance(y, np.ndarray) + + oz = ParallelCoordinates(fast=True, classes=classes) oz.fit_transform(X, y) - oz.poof() + oz.finalize() self.assert_images_similar(oz, tol=0.1) @@ -214,8 +253,8 @@ def test_normalized_invalid_arg(self): """ Invalid argument to 'normalize' should 
raise """ - with self.assertRaises(YellowbrickValueError): - ParallelCoordinates(normalize='foo') + with pytest.raises(YellowbrickValueError): + ParallelCoordinates(normalize="foo") def test_sample_int(self): """ @@ -234,7 +273,9 @@ def test_sample_int_shuffle(self): visualizer = ParallelCoordinates(sample=3, shuffle=True, random_state=444) visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer = ParallelCoordinates(sample=3, shuffle=True, random_state=np.random.RandomState()) + visualizer = ParallelCoordinates( + sample=3, shuffle=True, random_state=np.random.RandomState() + ) visualizer.fit_transform(self.dataset.X, self.dataset.y) def test_sample_int_shuffle_false(self): @@ -247,14 +288,16 @@ def test_sample_int_shuffle_false(self): visualizer = ParallelCoordinates(sample=3, shuffle=False, random_state=444) visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer = ParallelCoordinates(sample=3, shuffle=False, random_state=np.random.RandomState()) + visualizer = ParallelCoordinates( + sample=3, shuffle=False, random_state=np.random.RandomState() + ) visualizer.fit_transform(self.dataset.X, self.dataset.y) def test_sample_int_invalid(self): """ Negative int values should raise exception """ - with self.assertRaises(YellowbrickValueError): + with pytest.raises(YellowbrickValueError): ParallelCoordinates(sample=-1) def test_sample_float(self): @@ -274,7 +317,9 @@ def test_sample_float_shuffle(self): visualizer = ParallelCoordinates(sample=0.5, shuffle=True, random_state=444) visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer = ParallelCoordinates(sample=0.5, shuffle=True, random_state=np.random.RandomState()) + visualizer = ParallelCoordinates( + sample=0.5, shuffle=True, random_state=np.random.RandomState() + ) visualizer.fit_transform(self.dataset.X, self.dataset.y) def test_sample_float_shuffle_false(self): @@ -287,24 +332,27 @@ def test_sample_float_shuffle_false(self): visualizer = ParallelCoordinates(sample=0.5, 
shuffle=False, random_state=444) visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer = ParallelCoordinates(sample=0.5, shuffle=False, random_state=np.random.RandomState()) + visualizer = ParallelCoordinates( + sample=0.5, shuffle=False, random_state=np.random.RandomState() + ) visualizer.fit_transform(self.dataset.X, self.dataset.y) def test_sample_float_invalid(self): """ Float values for 'sample' argument outside [0,1] should raise. """ - with self.assertRaises(YellowbrickValueError): + with pytest.raises(YellowbrickValueError): ParallelCoordinates(sample=-0.2) - with self.assertRaises(YellowbrickValueError): + + with pytest.raises(YellowbrickValueError): ParallelCoordinates(sample=1.1) def test_sample_invalid_type(self): """ Non-numeric values for 'sample' argument should raise. """ - with self.assertRaises(YellowbrickTypeError): - ParallelCoordinates(sample='foo') + with pytest.raises(YellowbrickTypeError): + ParallelCoordinates(sample="foo") @staticmethod def test_static_subsample(): @@ -329,7 +377,9 @@ def test_static_subsample(): assert np.array_equal(yprime, y) sample = 50 - visualizer = ParallelCoordinates(sample=sample, random_state=None, shuffle=False) + visualizer = ParallelCoordinates( + sample=sample, random_state=None, shuffle=False + ) Xprime, yprime = visualizer._subsample(X, y) assert np.array_equal(Xprime, X[:sample, :]) assert np.array_equal(yprime, y[:sample]) @@ -343,8 +393,8 @@ def test_static_subsample(): visualizer = ParallelCoordinates(sample=0.5, random_state=None, shuffle=False) Xprime, yprime = visualizer._subsample(X, y) - assert np.array_equal(Xprime, X[:int(ntotal/2), :]) - assert np.array_equal(yprime, y[:int(ntotal/2)]) + assert np.array_equal(Xprime, X[: int(ntotal / 2), :]) + assert np.array_equal(yprime, y[: int(ntotal / 2)]) sample = 0.5 visualizer = ParallelCoordinates(sample=sample, random_state=None, shuffle=True) @@ -361,7 +411,9 @@ def test_static_subsample(): assert len(yprime) == ntotal * sample sample 
= 0.99 - visualizer = ParallelCoordinates(sample=sample, random_state=np.random.RandomState(), shuffle=True) + visualizer = ParallelCoordinates( + sample=sample, random_state=np.random.RandomState(), shuffle=True + ) Xprime, yprime = visualizer._subsample(X, y) assert np.array_equal(Xprime, X[yprime.flatten(), :]) assert len(Xprime) == ntotal * sample diff --git a/tests/test_features/test_projection.py b/tests/test_features/test_projection.py new file mode 100644 index 000000000..6cd00c0be --- /dev/null +++ b/tests/test_features/test_projection.py @@ -0,0 +1,241 @@ +# tests.test_features.test_projection +# Test the base ProjectionVisualizer drawing functionality +# +# Author: Naresh Bachwani +# Created: Wed Jul 17 09:53:07 2019 -0400 +# +# Copyright (C) 2019 the scikit-yb developers. +# For license information, see LICENSE.txt +# +# ID: test_projection.py [21eb9d2] 43993586+naresh-bachwani@users.noreply.github.com $ + +""" +Test the base ProjectionVisualizer drawing functionality +""" + +########################################################################## +## Imports +########################################################################## + +import pytest +import numpy.testing as npt +import matplotlib.pyplot as plt + +from yellowbrick.features.projection import * +from yellowbrick.exceptions import YellowbrickValueError + +from tests.base import VisualTestCase +from unittest import mock + +from sklearn.decomposition import PCA +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +########################################################################## +## MockVisualizer +########################################################################## + + +class MockVisualizer(ProjectionVisualizer): + """ + The MockVisualizer implements the ProjectionVisualizer interface using + PCA as an internal transformer. This visualizer is used to directly test + how subclasses interact with the ProjectionVisualizer base class. 
+ """ + + def __init__( + self, + ax=None, + features=None, + classes=None, + colors=None, + colormap=None, + target_type="auto", + projection=2, + alpha=0.75, + colorbar=True, + **kwargs + ): + + super(MockVisualizer, self).__init__( + ax=ax, + features=features, + classes=classes, + colors=colors, + colormap=colormap, + target_type=target_type, + projection=projection, + alpha=alpha, + colorbar=colorbar, + **kwargs + ) + + self.pca_transformer = Pipeline( + [ + ("scale", StandardScaler()), + ("pca", PCA(self.projection, random_state=2019)), + ] + ) + + def fit(self, X, y=None): + super(MockVisualizer, self).fit(X, y) + self.pca_transformer.fit(X) + return self + + def transform(self, X, y=None): + try: + Xp = self.pca_transformer.transform(X) + except AttributeError as e: + raise AttributeError(str(e) + " try using fit_transform instead.") + self.draw(Xp, y) + return Xp + + +########################################################################## +## ProjectionVisualizer Tests +########################################################################## + + +@pytest.mark.usefixtures("discrete", "continuous") +class TestProjectionVisualizer(VisualTestCase): + """ + Test the ProjectionVisualizer base class + """ + + def test_discrete_plot(self): + """ + Test the visualizer with discrete target. + """ + X, y = self.discrete + classes = ["a", "b", "c", "d", "e"] + visualizer = MockVisualizer(projection=2, colormap="plasma", classes=classes) + X_prime = visualizer.fit_transform(X, y) + npt.assert_array_equal(visualizer.classes_, classes) + visualizer.finalize() + self.assert_images_similar(visualizer) + assert X_prime.shape == (self.discrete.X.shape[0], 2) + + def test_continuous_plot(self): + """ + Test the visualizer with continuous target. 
+ """ + X, y = self.continuous + visualizer = MockVisualizer(projection="2d") + visualizer.fit_transform(X, y) + visualizer.finalize() + visualizer.cax.set_yticklabels([]) + self.assert_images_similar(visualizer) + + def test_continuous_when_target_discrete(self): + """ + Ensure user can override discrete target_type by specifying continuous + """ + _, ax = plt.subplots() + X, y = self.discrete + visualizer = MockVisualizer( + ax=ax, projection="2D", target_type="continuous", colormap="cool" + ) + visualizer.fit(X, y) + visualizer.transform(X, y) + visualizer.finalize() + visualizer.cax.set_yticklabels([]) + self.assert_images_similar(visualizer) + + def test_single_plot(self): + """ + Assert single color plot when y is not specified + """ + X, y = self.discrete + visualizer = MockVisualizer(projection=2, colormap="plasma") + visualizer.fit_transform(X) + visualizer.finalize() + self.assert_images_similar(visualizer) + + def test_discrete_3d(self): + """ + Test visualizer for 3 dimensional discrete plots + """ + X, y = self.discrete + + classes = ["a", "b", "c", "d", "e"] + colors = ["r", "b", "g", "m", "c"] + visualizer = MockVisualizer(projection=3, colors=colors, classes=classes) + visualizer.fit_transform(X, y) + npt.assert_array_equal(visualizer.classes_, classes) + visualizer.finalize() + self.assert_images_similar(visualizer) + + def test_3d_continuous_plot(self): + """ + Tests visualizer for 3 dimensional continuous plots + """ + X, y = self.continuous + visualizer = MockVisualizer(projection="3D") + visualizer.fit_transform(X, y) + visualizer.finalize() + visualizer.cbar.set_ticks([]) + self.assert_images_similar(visualizer) + + def test_alpha_param(self): + """ + Ensure that the alpha parameter modifies opacity + """ + # Instantiate a prediction error plot, provide custom alpha + X, y = self.discrete + params = {"alpha": 0.3, "projection": 2} + visualizer = MockVisualizer(**params) + visualizer.ax = mock.MagicMock() + visualizer.fit(X, y) + 
visualizer.transform(X, y) + + assert visualizer.alpha == 0.3 + + # Test that alpha was passed to internal matplotlib scatterplot + _, scatter_kwargs = visualizer.ax.scatter.call_args + assert "alpha" in scatter_kwargs + assert scatter_kwargs["alpha"] == 0.3 + + # Check Errors + @pytest.mark.parametrize("projection", ["4D", 1, "100d", 0]) + def test_wrong_projection_dimensions(self, projection): + """ + Validate projection hyperparameter + """ + msg = "Projection dimensions must be either 2 or 3" + with pytest.raises(YellowbrickValueError, match=msg): + MockVisualizer(projection=projection) + + def test_target_not_label_encoded(self): + """ + Assert label encoding mismatch with y raises exception + """ + X, y = self.discrete + # Multiply every element by 10 to make non-label encoded + y = y * 10 + visualizer = MockVisualizer() + msg = "Target needs to be label encoded." + with pytest.raises(YellowbrickValueError, match=msg): + visualizer.fit_transform(X, y) + + @pytest.mark.parametrize("dataset", ("discrete", "continuous")) + def test_y_required_for_discrete_and_continuous(self, dataset): + """ + Assert error is raised when y is not passed to transform + """ + X, y = getattr(self, dataset) + visualizer = MockVisualizer() + visualizer.fit(X, y) + + msg = "y is required for {} target".format(dataset) + with pytest.raises(YellowbrickValueError, match=msg): + visualizer.transform(X) + + def test_colorbar_false(self): + """ + Test that colorbar equals false works correctly + """ + visualizer = MockVisualizer(colorbar=False, colormap="YlOrRd") + visualizer.fit_transform(*self.continuous) + visualizer.finalize() + + self.assert_images_similar(visualizer) diff --git a/tests/test_features/test_radviz.py b/tests/test_features/test_radviz.py index a198f7062..0513ca254 100644 --- a/tests/test_features/test_radviz.py +++ b/tests/test_features/test_radviz.py @@ -1,10 +1,10 @@ # tests.test_features.test_radviz # Test the RadViz feature analysis visualizers # -# Author: Benjamin 
Bengfort +# Author: Benjamin Bengfort # Created: Fri Oct 07 12:19:19 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_radviz.py [01d5996] benjamin@bengfort.com $ @@ -21,31 +21,39 @@ import pytest import numpy.testing as npt -from tests.base import VisualTestCase -from tests.dataset import DatasetMixin, Dataset +from tests.base import IS_WINDOWS_OR_CONDA, VisualTestCase +from ..fixtures import Dataset from sklearn.datasets import make_classification +from yellowbrick.datasets import load_occupancy from yellowbrick.features.radviz import * try: - import pandas + import pandas as pd except ImportError: - pandas = None - + pd = None ########################################################################## ## Fixtures ########################################################################## -@pytest.fixture(scope='class') + +@pytest.fixture(scope="class") def dataset(request): """ Creates a random multiclass classification dataset fixture """ X, y = make_classification( - n_samples=200, n_features=5, n_informative=4, n_redundant=0, - n_classes=3, n_clusters_per_class=1, random_state=451, flip_y=0, - class_sep=3, scale=np.array([1.0, 2.0, 100.0, 20.0, 1.0]) + n_samples=200, + n_features=5, + n_informative=4, + n_redundant=0, + n_classes=3, + n_clusters_per_class=1, + random_state=451, + flip_y=0, + class_sep=3, + scale=np.array([1.0, 2.0, 100.0, 20.0, 1.0]), ) dataset = Dataset(X, y) @@ -56,8 +64,9 @@ def dataset(request): ## RadViz Tests ########################################################################## -@pytest.mark.usefixtures('dataset') -class TestRadViz(VisualTestCase, DatasetMixin): + +@pytest.mark.usefixtures("dataset") +class TestRadViz(VisualTestCase): """ Test the RadViz visualizer """ @@ -68,22 +77,26 @@ def test_normalize_x(self): """ # Original data X = np.array( - [[ 2.318, 2.727, 4.260, 7.212, 4.792], - [ 2.315, 2.726, 4.295, 7.140, 
4.783,], - [ 2.315, 2.724, 4.260, 7.135, 4.779,], - [ 2.110, 3.609, 4.330, 7.985, 5.595,], - [ 2.110, 3.626, 4.330, 8.203, 5.621,], - [ 2.110, 3.620, 4.470, 8.210, 5.612,]] + [ + [2.318, 2.727, 4.260, 7.212, 4.792], + [2.315, 2.726, 4.295, 7.140, 4.783], + [2.315, 2.724, 4.260, 7.135, 4.779], + [2.110, 3.609, 4.330, 7.985, 5.595], + [2.110, 3.626, 4.330, 8.203, 5.621], + [2.110, 3.620, 4.470, 8.210, 5.612], + ] ) # Expected result Xe = np.array( - [[ 1. , 0.00332594, 0. , 0.07162791, 0.01543943], - [ 0.98557692, 0.00221729, 0.16666667, 0.00465116, 0.00475059], - [ 0.98557692, 0. , 0. , 0. , 0. ], - [ 0. , 0.98115299, 0.33333333, 0.79069767, 0.96912114], - [ 0. , 1. , 0.33333333, 0.99348837, 1. ], - [ 0. , 0.99334812, 1. , 1. , 0.98931116]] + [ + [1.0, 0.00332594, 0.0, 0.07162791, 0.01543943], + [0.98557692, 0.00221729, 0.16666667, 0.00465116, 0.00475059], + [0.98557692, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.98115299, 0.33333333, 0.79069767, 0.96912114], + [0.0, 1.0, 0.33333333, 0.99348837, 1.0], + [0.0, 0.99334812, 1.0, 1.0, 0.98931116], + ] ) # Xprime (transformed X) @@ -96,7 +109,7 @@ def test_radviz(self): """ visualizer = RadViz() visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer, tol=0.25) def test_radviz_alpha(self): @@ -105,47 +118,94 @@ def test_radviz_alpha(self): """ visualizer = RadViz(alpha=0.5) visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer, tol=0.25) @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", ) - @pytest.mark.skipif(pandas is None, reason="test requires Pandas") - def test_integrated_radiz_with_pandas(self): + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def test_integrated_radviz_with_pandas(self): """ Test RadViz with Pandas on 
the occupancy dataset """ - occupancy = self.load_pandas("occupancy") + data = load_occupancy(return_dataset=True) + X, y = data.to_pandas() - # Load the data from the fixture - X = occupancy[[ - "temperature", "relative humidity", "light", "C02", "humidity" - ]] - y = occupancy['occupancy'].astype(int) + assert isinstance(X, pd.DataFrame) + assert isinstance(y, pd.Series) # Test the visualizer visualizer = RadViz() visualizer.fit_transform_poof(X, y) - self.assert_images_similar(visualizer) + self.assert_images_similar(visualizer, tol=0.1) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) - @pytest.mark.skipif(pandas is None, reason="test requires Pandas") - def test_integrated_radiz_pandas_classes_features(self): + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") + def test_integrated_radviz_with_numpy(self): + """ + Test RadViz with numpy on the occupancy dataset + """ + data = load_occupancy(return_dataset=True) + X, y = data.to_numpy() + + assert isinstance(X, np.ndarray) + assert isinstance(y, np.ndarray) + + # Test the visualizer + visualizer = RadViz() + visualizer.fit_transform_poof(X, y) + self.assert_images_similar(visualizer, tol=0.1) + + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def test_integrated_radviz_pandas_classes_features(self): """ - Test RadViz with classes and features specified + Test RadViz with classes and features specified using Pandas """ # Load the data from the fixture - occupancy = self.load_pandas("occupancy") + data = load_occupancy(return_dataset=True) + X, y = data.to_pandas() + features = ["temperature", "relative humidity", "light"] - classes = ['unoccupied', 'occupied'] + classes = [ + k for k, _ in sorted(data.meta["labels"].items(), key=lambda i: i[1]) + ] + + assert isinstance(X, pd.DataFrame) + assert isinstance(y, pd.Series) + + # Filter the 
dataset to make sure it's not just class names + X = X[features] + y = y.astype(int) + + # Test the visualizer + visualizer = RadViz(features=features, classes=classes) + visualizer.fit_transform_poof(X, y) + self.assert_images_similar(visualizer, tol=0.1) + + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") + def test_integrated_radviz_numpy_classes_features(self): + """ + Test RadViz with classes and features specified using numpy + """ + # Load the data from the fixture + data = load_occupancy(return_dataset=True) + X, y = data.to_numpy() + + features = data.meta["features"][0:3] + classes = [ + k for k, _ in sorted(data.meta["labels"].items(), key=lambda i: i[1]) + ] + + assert isinstance(X, np.ndarray) + assert isinstance(y, np.ndarray) - X = occupancy[features] - y = occupancy['occupancy'].astype(int) + # Filter the dataset to make sure it's not just class names + X = X[:, :3] + y = y.astype(int) # Test the visualizer visualizer = RadViz(features=features, classes=classes) visualizer.fit_transform_poof(X, y) - self.assert_images_similar(visualizer) + self.assert_images_similar(visualizer, tol=0.1) diff --git a/tests/test_features/test_rankd.py b/tests/test_features/test_rankd.py index 1d880f2af..f3fd4b78e 100644 --- a/tests/test_features/test_rankd.py +++ b/tests/test_features/test_rankd.py @@ -1,13 +1,13 @@ # tests.test_features.test_rankd # Test the rankd feature analysis visualizers # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Fri Oct 07 12:19:19 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: test_rankd.py [01d5996] benjamin@bengfort.com $ +# ID: test_rankd.py [7b4350a] nathan.danielsen@gmail.com $ """ Test the Rankd feature analysis visualizers @@ -17,15 +17,19 @@ ## Imports ########################################################################## -import sys -import six import pytest +import 
numpy as np +import numpy.testing as npt -from tests.base import VisualTestCase -from tests.dataset import DatasetMixin, Dataset +from tests.base import IS_WINDOWS_OR_CONDA, VisualTestCase +from yellowbrick.datasets import load_occupancy from yellowbrick.features.rankd import * -from sklearn.datasets import make_classification +from yellowbrick.features.rankd import kendalltau +from yellowbrick.features.rankd import RankDBase +from sklearn.datasets import make_regression + +from yellowbrick.exceptions import YellowbrickValueError try: import pandas as pd @@ -33,106 +37,742 @@ pd = None -@pytest.fixture(scope='class') +@pytest.fixture(scope="class") def dataset(request): """ - Creates a binary classification dataset for use in RankD tests + Creates a dataset with 6 gaussian features and 2 categorical features + for testing the RankD ranking algorithms. The gaussian features have + different correlations with respect to each other, including strong + positive and negative correlation and no correlation at all. 
""" - X, y = make_classification( - n_samples=700, n_features=10, n_informative=8, n_redundant=2, - n_classes=2, n_clusters_per_class=2, random_state=6483 + X, _ = make_regression( + n_samples=100, + n_features=6, + effective_rank=2, + tail_strength=0, + n_informative=2, + noise=0.45, + random_state=27, ) - request.cls.dataset = Dataset(X, y) + rand = np.random.RandomState(seed=27) + request.cls.dataset = np.concatenate((X, rand.binomial(1, 0.6, (100, 2))), axis=1) + + +########################################################################## +## Kendall-Tau Tests +########################################################################## + + +@pytest.mark.usefixtures("dataset") +class TestKendallTau(object): + """ + Test the Kendall-Tau correlation metric + """ + + def test_kendalltau(self): + """ + Test results returned match expectations + """ + expected = np.array( + [ + [ + 1.0, + -0.68, + -0.57454545, + 0.49858586, + 0.07555556, + -0.05858586, + 0.02387848, + 0.11357219, + ], + [ + -0.68, + 1.0, + 0.58666667, + -0.69090909, + -0.22262626, + -0.17171717, + -0.05059964, + -0.12397575, + ], + [ + -0.57454545, + 0.58666667, + 1.0, + -0.61050505, + 0.18909091, + 0.07515152, + 0.00341121, + -0.0638663, + ], + [ + 0.49858586, + -0.69090909, + -0.61050505, + 1.0, + 0.11070707, + 0.3030303, + 0.03013237, + 0.07542581, + ], + [ + 0.07555556, + -0.22262626, + 0.18909091, + 0.11070707, + 1.0, + 0.4610101, + 0.01648752, + 0.05982047, + ], + [ + -0.05858586, + -0.17171717, + 0.07515152, + 0.3030303, + 0.4610101, + 1.0, + 0.03695479, + -0.02398599, + ], + [ + 0.02387848, + -0.05059964, + 0.00341121, + 0.03013237, + 0.01648752, + 0.03695479, + 1.0, + 0.18298883, + ], + [ + 0.11357219, + -0.12397575, + -0.0638663, + 0.07542581, + 0.05982047, + -0.02398599, + 0.18298883, + 1.0, + ], + ] + ) + npt.assert_almost_equal(expected, kendalltau(self.dataset)) + + def test_kendalltau_shape(self): + """ + Assert that a square correlation matrix is returned + """ + corr = 
kendalltau(self.dataset) + assert corr.shape[0] == corr.shape[1] + + for (i, j), val in np.ndenumerate(corr): + assert corr[j][i] == pytest.approx(val) + + def test_kendalltau_1D(self): + """ + Assert that a 2D matrix is required as input + """ + with pytest.raises(IndexError, match="tuple index out of range"): + X = 0.1 * np.arange(10) + kendalltau(X) + + +########################################################################## +## RankDBase Tests +########################################################################## + + +@pytest.mark.usefixtures("dataset") +class TestRankDBase(VisualTestCase): + """ + Test the RankDBase Visualizer + """ + + def test_rankdbase_unknown_algorithm(self): + """ + Assert that unknown algorithms raise an exception + """ + with pytest.raises( + YellowbrickValueError, match=".* is unrecognized ranking method" + ) as e: + oz = RankDBase(algorithm="unknown") + oz.fit_transform(self.dataset) + assert str(e.value) == "'unknown' is unrecognized ranking method" ########################################################################## ## Rank1D Base Tests ########################################################################## + @pytest.mark.usefixtures("dataset") -class TestRank1D(VisualTestCase, DatasetMixin): +class TestRank1D(VisualTestCase): """ Test the Rank1D visualizer """ - def test_rank1d_random(self): + def test_rank1d_shapiro(self): + """ + Test Rank1D using shapiro metric + """ + oz = Rank1D(algorithm="shapiro") + npt.assert_array_equal(oz.fit_transform(self.dataset), self.dataset) + + # Check Ranking + expected = np.array( + [ + 0.985617, + 0.992236, + 0.982354, + 0.984898, + 0.978514, + 0.990372, + 0.636401, + 0.624511, + ] + ) + + assert hasattr(oz, "ranks_") + assert oz.ranks_.shape == (self.dataset.shape[1],) + npt.assert_array_almost_equal(oz.ranks_, expected) + + # Image similarity comparison + oz.finalize() + self.assert_images_similar(oz) + + def test_rank1d_orientation(self): """ - Test Rank1D on a random binary 
classification dataset + Test Rank1D using vertical orientation """ - visualizer = Rank1D() - visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer.poof() + oz = Rank1D(orient="v") + npt.assert_array_equal(oz.fit_transform(self.dataset), self.dataset) - self.assert_images_similar(visualizer) + # Image similarity comparison + oz.finalize() + self.assert_images_similar(oz) - @pytest.mark.skipif(pd is None, reason="requires pandas") @pytest.mark.filterwarnings("ignore:p-value") - def test_rank1d_integrated(self): + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def test_rank1d_integrated_pandas(self): """ Test Rank1D on occupancy dataset with pandas DataFrame and Series """ - df = self.load_pandas("occupancy") + data = load_occupancy(return_dataset=True) + X, y = data.to_pandas() + features = data.meta["features"] + + assert isinstance(X, pd.DataFrame) + assert isinstance(y, pd.Series) + + # Test the visualizer + oz = Rank1D(features=features, show_feature_names=True) + assert oz.fit(X, y) is oz + assert oz.transform(X) is X + + # Image similarity testing + oz.finalize() + self.assert_images_similar(oz) + + @pytest.mark.filterwarnings("ignore:p-value") + def test_rank1d_integrated_numpy(self): + """ + Test Rank1D on occupancy dataset with default numpy data structures + """ + data = load_occupancy(return_dataset=True) + X, y = data.to_numpy() + features = data.meta["features"] - # Load the data from the fixture - X = df[[ - "temperature", "relative humidity", "light", "C02", "humidity" - ]] - y = df['occupancy'] + assert isinstance(X, np.ndarray) + assert isinstance(y, np.ndarray) # Test the visualizer - visualizer = Rank1D() - visualizer.fit_transform(X, y) - visualizer.poof() + oz = Rank1D(features=features, show_feature_names=True) + assert oz.fit(X, y) is oz + assert oz.transform(X) is X - self.assert_images_similar(visualizer) + # Image similarity testing + oz.finalize() + self.assert_images_similar(oz) 
########################################################################## ## Rank2D Test Cases ########################################################################## + @pytest.mark.usefixtures("dataset") -class TestRank2D(VisualTestCase, DatasetMixin): +class TestRank2D(VisualTestCase): """ Test the Rank2D visualizer """ @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", + ) + def test_rank2d_pearson(self): + """ + Test Rank2D using pearson metric + """ + oz = Rank2D(algorithm="pearson") + npt.assert_array_equal(oz.fit_transform(self.dataset), self.dataset) + + # Check Ranking + expected = np.array( + [ + [ + 1.0, + -0.86937243, + -0.77884764, + 0.71424708, + 0.10836854, + -0.11550965, + 0.04494811, + 0.1725682, + ], + [ + -0.86937243, + 1.0, + 0.80436327, + -0.9086706, + -0.31117192, + -0.26313947, + -0.0711807, + -0.16924862, + ], + [ + -0.77884764, + 0.80436327, + 1.0, + -0.85520468, + 0.30940711, + 0.10634903, + -0.02485686, + -0.10230028, + ], + [ + 0.71424708, + -0.9086706, + -0.85520468, + 1.0, + 0.12537213, + 0.41306822, + 0.04704408, + 0.1031842, + ], + [ + 0.10836854, + -0.31117192, + 0.30940711, + 0.12537213, + 1.0, + 0.671111, + 0.06777278, + 0.09513859, + ], + [ + -0.11550965, + -0.26313947, + 0.10634903, + 0.41306822, + 0.671111, + 1.0, + 0.04684117, + -0.01072631, + ], + [ + 0.04494811, + -0.0711807, + -0.02485686, + 0.04704408, + 0.06777278, + 0.04684117, + 1.0, + 0.18298883, + ], + [ + 0.1725682, + -0.16924862, + -0.10230028, + 0.1031842, + 0.09513859, + -0.01072631, + 0.18298883, + 1.0, + ], + ] + ) + + assert hasattr(oz, "ranks_") + assert oz.ranks_.shape == (self.dataset.shape[1], self.dataset.shape[1]) + npt.assert_array_almost_equal(oz.ranks_, expected) + + # Image similarity comparision + oz.finalize() + self.assert_images_similar(oz, tol=0.1) + + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + reason="font rendering 
different in OS and/or Python; see #892", + ) + def test_rank2d_covariance(self): + """ + Test Rank2D using covariance metric + """ + oz = Rank2D(algorithm="covariance") + npt.assert_array_equal(oz.fit_transform(self.dataset), self.dataset) + + # Check Ranking + expected = np.array( + [ + [ + 4.09266931e-03, + -1.41062431e-03, + -2.26778429e-03, + 3.13507202e-03, + 2.21273274e-04, + -5.05566875e-04, + 1.44499782e-03, + 5.45713163e-03, + ], + [ + -1.41062431e-03, + 6.43286363e-04, + 9.28539346e-04, + -1.58126396e-03, + -2.51898163e-04, + -4.56609749e-04, + -9.07228811e-04, + -2.12191333e-03, + ], + [ + -2.26778429e-03, + 9.28539346e-04, + 2.07153281e-03, + -2.67061756e-03, + 4.49467833e-04, + 3.31158917e-04, + -5.68518509e-04, + -2.30156415e-03, + ], + [ + 3.13507202e-03, + -1.58126396e-03, + -2.67061756e-03, + 4.70751209e-03, + 2.74548546e-04, + 1.93898526e-03, + 1.62200836e-03, + 3.49952628e-03, + ], + [ + 2.21273274e-04, + -2.51898163e-04, + 4.49467833e-04, + 2.74548546e-04, + 1.01869657e-03, + 1.46545939e-03, + 1.08700151e-03, + 1.50099581e-03, + ], + [ + -5.05566875e-04, + -4.56609749e-04, + 3.31158917e-04, + 1.93898526e-03, + 1.46545939e-03, + 4.68073451e-03, + 1.61041253e-03, + -3.62750059e-04, + ], + [ + 1.44499782e-03, + -9.07228811e-04, + -5.68518509e-04, + 1.62200836e-03, + 1.08700151e-03, + 1.61041253e-03, + 2.52525253e-01, + 4.54545455e-02, + ], + [ + 5.45713163e-03, + -2.12191333e-03, + -2.30156415e-03, + 3.49952628e-03, + 1.50099581e-03, + -3.62750059e-04, + 4.54545455e-02, + 2.44343434e-01, + ], + ] + ) + + assert hasattr(oz, "ranks_") + assert oz.ranks_.shape == (self.dataset.shape[1], self.dataset.shape[1]) + npt.assert_array_almost_equal(oz.ranks_, expected) + + # Image similarity comparision + oz.finalize() + self.assert_images_similar(oz, tol=0.1) + + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", ) - def test_rank2d_random(self): + def test_rank2d_spearman(self): """ - Test Rank2D 
on a random binary classification dataset + Test Rank2D using spearman metric """ - visualizer = Rank2D() - visualizer.fit_transform(self.dataset.X, self.dataset.y) - visualizer.poof() + oz = Rank2D(algorithm="spearman") + npt.assert_array_equal(oz.fit_transform(self.dataset), self.dataset) + + # Check Ranking + expected = np.array( + [ + [ + 1.0, + -0.86889889, + -0.77551755, + 0.68520852, + 0.11369937, + -0.09489349, + 0.02909991, + 0.13840665, + ], + [ + -0.86889889, + 1.0, + 0.78232223, + -0.87065107, + -0.33450945, + -0.25244524, + -0.06166409, + -0.15108512, + ], + [ + -0.77551755, + 0.78232223, + 1.0, + -0.81636964, + 0.26846685, + 0.10348635, + 0.00415713, + -0.07783173, + ], + [ + 0.68520852, + -0.87065107, + -0.81636964, + 1.0, + 0.16316832, + 0.45167717, + 0.03672131, + 0.09191892, + ], + [ + 0.11369937, + -0.33450945, + 0.26846685, + 0.16316832, + 1.0, + 0.63986799, + 0.02009279, + 0.07290121, + ], + [ + -0.09489349, + -0.25244524, + 0.10348635, + 0.45167717, + 0.63986799, + 1.0, + 0.04503557, + -0.02923092, + ], + [ + 0.02909991, + -0.06166409, + 0.00415713, + 0.03672131, + 0.02009279, + 0.04503557, + 1.0, + 0.18298883, + ], + [ + 0.13840665, + -0.15108512, + -0.07783173, + 0.09191892, + 0.07290121, + -0.02923092, + 0.18298883, + 1.0, + ], + ] + ) + + assert hasattr(oz, "ranks_") + assert oz.ranks_.shape == (self.dataset.shape[1], self.dataset.shape[1]) + npt.assert_array_almost_equal(oz.ranks_, expected) + + # Image similarity comparision + oz.finalize() + self.assert_images_similar(oz, tol=0.1) + + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", + ) + def test_rank2d_kendalltau(self): + """ + Test Rank2D using kendalltau metric + """ + oz = Rank2D(algorithm="kendalltau") + npt.assert_array_equal(oz.fit_transform(self.dataset), self.dataset) + + # Check Ranking + expected = np.array( + [ + [ + 1.0, + -0.68, + -0.57454545, + 0.49858586, + 0.07555556, + -0.05858586, + 0.02387848, + 
0.11357219, + ], + [ + -0.68, + 1.0, + 0.58666667, + -0.69090909, + -0.22262626, + -0.17171717, + -0.05059964, + -0.12397575, + ], + [ + -0.57454545, + 0.58666667, + 1.0, + -0.61050505, + 0.18909091, + 0.07515152, + 0.00341121, + -0.0638663, + ], + [ + 0.49858586, + -0.69090909, + -0.61050505, + 1.0, + 0.11070707, + 0.3030303, + 0.03013237, + 0.07542581, + ], + [ + 0.07555556, + -0.22262626, + 0.18909091, + 0.11070707, + 1.0, + 0.4610101, + 0.01648752, + 0.05982047, + ], + [ + -0.05858586, + -0.17171717, + 0.07515152, + 0.3030303, + 0.4610101, + 1.0, + 0.03695479, + -0.02398599, + ], + [ + 0.02387848, + -0.05059964, + 0.00341121, + 0.03013237, + 0.01648752, + 0.03695479, + 1.0, + 0.18298883, + ], + [ + 0.11357219, + -0.12397575, + -0.0638663, + 0.07542581, + 0.05982047, + -0.02398599, + 0.18298883, + 1.0, + ], + ] + ) + + assert hasattr(oz, "ranks_") + assert oz.ranks_.shape == (self.dataset.shape[1], self.dataset.shape[1]) + npt.assert_array_almost_equal(oz.ranks_, expected) - tol = 10 if six.PY2 else 0.1 - self.assert_images_similar(visualizer, tol=tol) + # Image similarity comparision + oz.finalize() + self.assert_images_similar(oz, tol=0.1) @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", ) - @pytest.mark.skipif(pd is None, reason="requires pandas") - def test_rank2d_integrated(self): + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def test_rank2d_integrated_pandas(self): """ Test Rank2D on occupancy dataset with pandas DataFrame and Series """ - df = self.load_pandas("occupancy") + data = load_occupancy(return_dataset=True) + X, y = data.to_pandas() + features = data.meta["features"] - # Load the data from the fixture - X = df[[ - "temperature", "relative humidity", "light", "C02", "humidity" - ]] - y = df['occupancy'] + assert isinstance(X, pd.DataFrame) + assert isinstance(y, pd.Series) # Test the visualizer - 
visualizer = Rank2D() - visualizer.fit_transform(X, y) - visualizer.poof() + oz = Rank2D(features=features, show_feature_names=True) + assert oz.fit(X, y) is oz + assert oz.transform(X) is X + oz.finalize() - tol = 10 if six.PY2 else 0.1 - self.assert_images_similar(visualizer, tol=tol) -# + # Image similarity testing + self.assert_images_similar(oz, tol=0.1) + + @pytest.mark.xfail( + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", + ) + def test_rank2d_integrated_numpy(self): + """ + Test Rank2D on occupancy dataset with numpy ndarray + """ + data = load_occupancy(return_dataset=True) + X, y = data.to_numpy() + features = data.meta["features"] + + assert isinstance(X, np.ndarray) + assert isinstance(y, np.ndarray) + + # Test the visualizer + oz = Rank2D(features=features, show_feature_names=True) + assert oz.fit(X, y) is oz + assert oz.transform(X) is X + oz.finalize() + + # Image similarity testing + self.assert_images_similar(oz, tol=0.1) diff --git a/tests/test_meta.py b/tests/test_meta.py index 8318fc945..5f66c3d6c 100644 --- a/tests/test_meta.py +++ b/tests/test_meta.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Sat Apr 07 13:16:53 2018 -0400 # -# ID: test_meta.py [] benjamin@bengfort.com $ +# ID: test_meta.py [0a2d2b4] benjamin@bengfort.com $ """ Meta testing for testing helper functions! @@ -18,17 +18,15 @@ import pytest import inspect +import matplotlib as mpl + from tests.rand import RandomVisualizer +from unittest.mock import MagicMock, patch from tests.base import ACTUAL_IMAGES, BASELINE_IMAGES from tests.base import VisualTestCase, ImageComparison from yellowbrick.exceptions import ImageComparisonFailure -try: - from unittest.mock import MagicMock, patch -except ImportError: - from mock import MagicMock, patch - def assert_path_exists(*parts): # Hide this method from the pytest traceback on test failure. 
@@ -50,6 +48,7 @@ def assert_path_not_exists(*parts): ## Test Cases ########################################################################## + class TestMetaImageComparison(VisualTestCase): """ Meta Test: ImageComparison test cases @@ -59,6 +58,7 @@ def test_image_comparison(self): """ Test the image comparison initialization and properties """ + def inner_assertion_function(ax): stack = inspect.stack() return ImageComparison(stack, ax=ax) @@ -70,12 +70,16 @@ def inner_assertion_function(ax): assert compare.test_module_path == "test_meta" # Must use os.path.join for Windows/POSIX compatibility - assert compare.actual_image_path.endswith(os.path.join( - "tests", "actual_images", "test_meta", "test_image_comparison.png" - )) - assert compare.baseline_image_path.endswith(os.path.join( - "tests", "baseline_images", "test_meta", "test_image_comparison.png" - )) + assert compare.actual_image_path.endswith( + os.path.join( + "tests", "actual_images", "test_meta", "test_image_comparison.png" + ) + ) + assert compare.baseline_image_path.endswith( + os.path.join( + "tests", "baseline_images", "test_meta", "test_image_comparison.png" + ) + ) @patch.object(ImageComparison, "cleanup") @patch.object(ImageComparison, "save") @@ -84,6 +88,7 @@ def test_image_comparison_call(self, mock_cleanup, mock_save, mock_compare): """ Test that image comparison cleans up, saves, and compares """ + def inner_assertion_function(): stack = inspect.stack() return ImageComparison(stack, ax=MagicMock()) @@ -115,7 +120,7 @@ def test_missing_baseline_image(self): Test that a missing basline image raises an exception """ viz = RandomVisualizer(random_state=14).fit() - viz.poof() + viz.finalize() # Assert the baseline image does not exist assert_path_not_exists( @@ -135,7 +140,9 @@ def test_random_visualizer(self): Test that a random visualization is correctly compared to a baseline """ viz = RandomVisualizer(random_state=111).fit() - viz.poof() + viz.finalize() + + assert mpl.get_backend() == "agg" 
compare = self.assert_images_similar(viz, tol=1.0) assert_path_exists(compare.actual_image_path) @@ -145,15 +152,18 @@ def test_random_visualizer_not_close(self): """ Test that not close visualizers raise an assertion error. """ + # Baseline image random_state=225 viz = RandomVisualizer(random_state=224).fit() - viz.poof() + viz.finalize() with pytest.raises(ImageComparisonFailure, match="images not close"): self.assert_images_similar(viz) # Assert there is a diff assert_path_exists( - ACTUAL_IMAGES, "test_meta", "test_random_visualizer_not_close-failed-diff.png" + ACTUAL_IMAGES, + "test_meta", + "test_random_visualizer_not_close-failed-diff.png", ) def test_random_visualizer_increased_tolerance(self): @@ -161,6 +171,6 @@ def test_random_visualizer_increased_tolerance(self): Test that not close visualizers pass with increased tolerance """ viz = RandomVisualizer(random_state=224).fit() - viz.poof() + viz.finalize() self.assert_images_similar(viz, tol=30) diff --git a/tests/test_model_selection/__init__.py b/tests/test_model_selection/__init__.py index 9ec2e913e..c29f98a2d 100644 --- a/tests/test_model_selection/__init__.py +++ b/tests/test_model_selection/__init__.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Fri Mar 30 10:37:18 2018 -0400 # -# ID: __init__.py [] benjamin@bengfort.com $ +# ID: __init__.py [c5355ee] benjamin@bengfort.com $ """ Tests for the model selection visualizer library. diff --git a/tests/test_model_selection/conftest.py b/tests/test_model_selection/conftest.py index d4dae0d27..f9a4fc438 100644 --- a/tests/test_model_selection/conftest.py +++ b/tests/test_model_selection/conftest.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Fri Mar 30 14:35:39 2018 -0400 # -# ID: conftest.py [] benjamin@bengfort.com $ +# ID: conftest.py [c5355ee] benjamin@bengfort.com $ """ Provides fixtures for the classification tests module. 
@@ -16,7 +16,7 @@ import pytest -from tests.dataset import Dataset +from tests.fixtures import Dataset from sklearn.datasets import make_classification, make_regression, make_blobs @@ -24,42 +24,50 @@ ## Fixtures ########################################################################## -@pytest.fixture(scope='class') + +@pytest.fixture(scope="class") def classification(request): """ Creates a random multiclass classification dataset fixture """ X, y = make_classification( - n_samples=500, n_features=20, n_informative=8, n_redundant=2, - n_classes=3, n_clusters_per_class=3, random_state=3902 + n_samples=500, + n_features=20, + n_informative=8, + n_redundant=2, + n_classes=3, + n_clusters_per_class=3, + random_state=3902, ) dataset = Dataset(X, y) request.cls.classification = dataset -@pytest.fixture(scope='class') +@pytest.fixture(scope="class") def regression(request): """ Creates a random regression dataset fixture """ X, y = make_regression( - n_samples=500, n_features=20, n_informative=8, - noise=0.01, bias=1.4, random_state=953, + n_samples=500, + n_features=20, + n_informative=8, + noise=0.01, + bias=1.4, + random_state=953, ) dataset = Dataset(X, y) request.cls.regression = dataset -@pytest.fixture(scope='class') +@pytest.fixture(scope="class") def clusters(request): """ Creates a random regression dataset fixture """ - X, y = make_blobs( - n_samples=500, n_features=20, centers=3, random_state=743, - ) + X, y = make_blobs(n_samples=500, n_features=20, centers=3, random_state=743) dataset = Dataset(X, y) request.cls.clusters = dataset diff --git a/tests/test_model_selection/test_cross_validation.py b/tests/test_model_selection/test_cross_validation.py index 516215178..bc5df7a13 100644 --- a/tests/test_model_selection/test_cross_validation.py +++ b/tests/test_model_selection/test_cross_validation.py @@ -1,32 +1,37 @@ # tests.test_model_selection.test_cross_validation # Tests for the CVScores visualizer # -# Author: Rebecca Bilbro +# Author: Rebecca Bilbro # 
Created: Fri Aug 10 13:45:11 2018 -0400 # -# ID: test_cross_validation.py [] bilbro@gmail.com $ +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: test_cross_validation.py [962c8bb] rebeccabilbro@users.noreply.github.com $ """ Tests for the CVScores visualizer """ ########################################################################## -## Imports +# Imports ########################################################################## import pytest import numpy.testing as npt +from unittest.mock import patch from tests.base import VisualTestCase -from tests.dataset import DatasetMixin from sklearn.svm import SVC from sklearn.naive_bayes import BernoulliNB from sklearn.tree import DecisionTreeRegressor +from sklearn.preprocessing import OneHotEncoder from sklearn.neighbors import KNeighborsClassifier -from sklearn.model_selection import ShuffleSplit, StratifiedKFold from sklearn.linear_model import RidgeCV, LogisticRegressionCV +from sklearn.model_selection import ShuffleSplit, StratifiedKFold +from yellowbrick.datasets import load_mushroom from yellowbrick.model_selection.cross_validation import * @@ -35,30 +40,26 @@ except ImportError: pd = None -try: - from unittest.mock import patch -except ImportError: - from mock import patch - ########################################################################## -## Test Cases +# Test Cases ########################################################################## + @pytest.mark.usefixtures("classification", "regression") -class TestCrossValidation(VisualTestCase, DatasetMixin): +class TestCrossValidation(VisualTestCase): """ Test the CVScores visualizer """ - @patch.object(CVScores, 'draw') + @patch.object(CVScores, "draw") def test_fit(self, mock_draw): """ Assert that fit returns self and creates expected properties """ X, y = self.classification - params = ("cv_scores_", "cv_scores_mean_") + params = ("cv_scores_", "cv_scores_mean_") oz = CVScores(SVC()) @@ -79,12 
+80,10 @@ def test_classifier(self): cv = ShuffleSplit(3, random_state=288) - oz = CVScores( - KNeighborsClassifier(), cv=cv, scoring='f1_weighted', - ) + oz = CVScores(KNeighborsClassifier(), cv=cv, scoring="f1_weighted") oz.fit(X, y) - oz.poof() + oz.finalize() self.assert_images_similar(oz, tol=2.0) @@ -96,16 +95,12 @@ def test_classifier_with_cv(self): cv = ShuffleSplit(3, random_state=288) - oz_external_cv = CVScores( - LogisticRegressionCV(), cv=cv - ) + oz_external_cv = CVScores(LogisticRegressionCV(), cv=cv) - oz_internal_cv = CVScores( - LogisticRegressionCV(cv=cv) - ) + oz_internal_cv = CVScores(LogisticRegressionCV(cv=cv)) - oz_external_cv.fit(X,y) - oz_internal_cv.fit(X,y) + oz_external_cv.fit(X, y) + oz_internal_cv.fit(X, y) npt.assert_array_almost_equal( oz_external_cv.cv_scores_, oz_internal_cv.cv_scores_, decimal=1 @@ -119,12 +114,10 @@ def test_regression(self): cv = ShuffleSplit(3, random_state=938) - oz = CVScores( - DecisionTreeRegressor(random_state=23), cv=cv, scoring='r2', - ) + oz = CVScores(DecisionTreeRegressor(random_state=23), cv=cv, scoring="r2") oz.fit(X, y) - oz.poof() + oz.finalize() self.assert_images_similar(oz, tol=36.0) @@ -136,16 +129,12 @@ def test_regressor_with_cv(self): cv = ShuffleSplit(3, random_state=288) - oz_external_cv = CVScores( - RidgeCV(), cv=cv - ) + oz_external_cv = CVScores(RidgeCV(), cv=cv) - oz_internal_cv = CVScores( - RidgeCV(cv=cv) - ) + oz_internal_cv = CVScores(RidgeCV(cv=cv)) - oz_external_cv.fit(X,y) - oz_internal_cv.fit(X,y) + oz_external_cv.fit(X, y) + oz_internal_cv.fit(X, y) npt.assert_array_almost_equal( oz_external_cv.cv_scores_, oz_internal_cv.cv_scores_ @@ -158,22 +147,19 @@ def test_quick_method(self): X, y = self.classification cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=321) - ax = cv_scores(SVC(), X, y, cv=cv) + viz = cv_scores(SVC(), X, y, cv=cv) - self.assert_images_similar(ax=ax, tol=2.0) + self.assert_images_similar(viz, tol=2.0) @pytest.mark.skipif(pd is None, reason="test 
requires pandas") def test_pandas_integration(self): """ Test on mushroom dataset with pandas DataFrame and Series and NB """ - df = self.load_pandas("mushroom") + data = load_mushroom(return_dataset=True) + X, y = data.to_pandas() - target = "target" - features = [col for col in df.columns if col != target] - - X = pd.get_dummies(df[features]) - y = df[target] + X = pd.get_dummies(X) assert isinstance(X, pd.DataFrame) assert isinstance(y, pd.Series) @@ -182,6 +168,23 @@ def test_pandas_integration(self): oz = CVScores(BernoulliNB(), cv=cv) oz.fit(X, y) - oz.poof() + oz.finalize() + + self.assert_images_similar(oz, tol=2.0) + + def test_numpy_integration(self): + """ + Test on mushroom dataset with NumPy arrays + """ + data = load_mushroom(return_dataset=True) + X, y = data.to_numpy() + + X = OneHotEncoder().fit_transform(X).toarray() + + cv = StratifiedKFold(n_splits=2, random_state=11) + oz = CVScores(BernoulliNB(), cv=cv) + + oz.fit(X, y) + oz.finalize() self.assert_images_similar(oz, tol=2.0) diff --git a/tests/test_features/test_importances.py b/tests/test_model_selection/test_importances.py similarity index 68% rename from tests/test_features/test_importances.py rename to tests/test_model_selection/test_importances.py index f01905d35..4463e0152 100644 --- a/tests/test_features/test_importances.py +++ b/tests/test_model_selection/test_importances.py @@ -1,12 +1,11 @@ -# tests.test_features.test_importances +# tests.test_model_selection.test_importances # Test the feature importance visualizers # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort +# Author: Rebecca Bilbro # Created: Fri Mar 02 15:23:22 2018 -0500 -# Author: Rebecca Bilbro -# Updated: Sun Jun 24 12:10:43 2018 -0500 # -# Copyright (C) 2018 District Data Labs +# Copyright (C) 2018 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_importances.py [] benjamin@bengfort.com $ @@ -19,28 +18,23 @@ ## Imports 
########################################################################## -import sys import pytest import numpy as np import numpy.testing as npt import matplotlib.pyplot as plt from yellowbrick.exceptions import NotFitted -from yellowbrick.features.importances import * +from yellowbrick.model_selection.importances import * +from yellowbrick.datasets import load_occupancy, load_concrete from sklearn.datasets import load_iris -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.linear_model import LogisticRegression, Lasso from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import GradientBoostingClassifier +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.linear_model import LogisticRegression, Lasso +from unittest import mock from tests.base import VisualTestCase -from tests.dataset import DatasetMixin - -try: - from unittest import mock -except ImportError: - import mock try: import pandas as pd @@ -52,28 +46,19 @@ ## Feature Importances Tests ########################################################################## -class TestFeatureImportancesVisualizer(VisualTestCase, DatasetMixin): + +class TestFeatureImportancesVisualizer(VisualTestCase): """ - FeatureImportances visualizer + Test FeatureImportances visualizer """ - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) def test_integration_feature_importances(self): """ Integration test of visualizer with feature importances param """ - occupancy = self.load_data('occupancy') - features = [ - "temperature", "relative_humidity", "light", "C02", "humidity" - ] - - # Extract X and y as numpy arrays - X = occupancy[features].copy() - X = X.view((float, len(X.dtype.names))) - y = occupancy['occupancy'].astype(int) + # Load the test dataset + X, y = load_occupancy(return_dataset=True).to_numpy() fig = plt.figure() ax = fig.add_subplot() @@ -81,62 +66,49 @@ def test_integration_feature_importances(self): clf = 
GradientBoostingClassifier(random_state=42) viz = FeatureImportances(clf, ax=ax) viz.fit(X, y) - viz.poof() + viz.finalize() - self.assert_images_similar(viz) + # Appveyor and Linux conda non-text-based differences + self.assert_images_similar(viz, tol=13.0) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) def test_integration_coef(self): """ Integration test of visualizer with coef param """ - concrete = self.load_data('concrete') - feats = ['cement','slag','ash','water','splast','coarse','fine','age'] - - # Create X and y datasets as numpy arrays - X = concrete[feats].copy() - X = X.view((float, len(X.dtype.names))) - y = concrete['strength'] + # Load the test dataset + dataset = load_concrete(return_dataset=True) + X, y = dataset.to_numpy() + features = dataset.meta["features"] fig = plt.figure() ax = fig.add_subplot() reg = Lasso(random_state=42) - feats = list(map(lambda s: s.title(), feats)) - viz = FeatureImportances(reg, ax=ax, labels=feats, relative=False) + features = list(map(lambda s: s.title(), features)) + viz = FeatureImportances(reg, ax=ax, labels=features, relative=False) viz.fit(X, y) - viz.poof() + viz.finalize() - self.assert_images_similar(viz) + # Appveyor and Linux conda non-text-based differences + self.assert_images_similar(viz, tol=16.2) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) def test_integration_quick_method(self): """ Integration test of quick method """ - occupancy = self.load_data('occupancy') - features = [ - "temperature", "relative_humidity", "light", "C02", "humidity" - ] - - # Create X and y datasets as numpy arrays - X = occupancy[features].copy() - X = X.view((float, len(X.dtype.names))) - y = occupancy['occupancy'].astype(int) + # Load the test dataset + X, y = load_occupancy(return_dataset=True).to_numpy() fig = plt.figure() ax = fig.add_subplot() clf = RandomForestClassifier(random_state=42) - g = feature_importances(clf, X, y, ax) + g 
= feature_importances(clf, X, y, ax=ax) - self.assert_images_similar(ax=g) + # Appveyor and Linux conda non-text-based differences + self.assert_images_similar(g, tol=15.0) def test_fit_no_importances_model(self): """ @@ -155,8 +127,8 @@ def test_fit_sorted_params(self): """ On fit, sorted features_ and feature_importances_ params are created """ - coefs = np.array([0.4, 0.2, 0.08, 0.07, 0.16, .23, 0.38, 0.1, 0.05]) - names = np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']) + coefs = np.array([0.4, 0.2, 0.08, 0.07, 0.16, 0.23, 0.38, 0.1, 0.05]) + names = np.array(["a", "b", "c", "d", "e", "f", "g", "h", "i"]) model = MockEstimator() model.make_importance_param(value=coefs) @@ -164,8 +136,8 @@ def test_fit_sorted_params(self): visualizer = FeatureImportances(model, labels=names) visualizer.fit(np.random.rand(100, len(names)), np.random.rand(100)) - assert hasattr(visualizer, 'features_') - assert hasattr(visualizer, 'feature_importances_') + assert hasattr(visualizer, "features_") + assert hasattr(visualizer, "feature_importances_") # get the expected sort index sort_idx = np.argsort(coefs) @@ -178,7 +150,7 @@ def test_fit_relative(self): """ Test fit computes relative importances """ - coefs = np.array([0.4, 0.2, 0.08, 0.07, 0.16, .23, 0.38, 0.1, 0.05]) + coefs = np.array([0.4, 0.2, 0.08, 0.07, 0.16, 0.23, 0.38, 0.1, 0.05]) model = MockEstimator() model.make_importance_param(value=coefs) @@ -194,7 +166,7 @@ def test_fit_not_relative(self): """ Test fit stores unmodified importances """ - coefs = np.array([0.4, 0.2, 0.08, 0.07, 0.16, .23, 0.38, 0.1, 0.05]) + coefs = np.array([0.4, 0.2, 0.08, 0.07, 0.16, 0.23, 0.38, 0.1, 0.05]) model = MockEstimator() model.make_importance_param(value=coefs) @@ -209,7 +181,7 @@ def test_fit_absolute(self): """ Test fit with absolute values """ - coefs = np.array([0.4, 0.2, -0.08, 0.07, 0.16, .23, -0.38, 0.1, -0.05]) + coefs = np.array([0.4, 0.2, -0.08, 0.07, 0.16, 0.23, -0.38, 0.1, -0.05]) model = MockEstimator() 
model.make_importance_param(value=coefs) @@ -218,63 +190,62 @@ def test_fit_absolute(self): visualizer = FeatureImportances(model, absolute=True, relative=False) visualizer.fit(np.random.rand(100, len(coefs)), np.random.rand(100)) - expected = np.array([0.05, 0.07, 0.08, 0.1, 0.16, 0.2, .23, 0.38, 0.4]) + expected = np.array([0.05, 0.07, 0.08, 0.1, 0.16, 0.2, 0.23, 0.38, 0.4]) npt.assert_array_equal(visualizer.feature_importances_, expected) # Test no absolute value visualizer = FeatureImportances(model, absolute=False, relative=False) visualizer.fit(np.random.rand(100, len(coefs)), np.random.rand(100)) - expected = np.array([-0.38, -0.08, -0.05, 0.07, 0.1, 0.16, 0.2, .23, 0.4]) + expected = np.array([-0.38, -0.08, -0.05, 0.07, 0.1, 0.16, 0.2, 0.23, 0.4]) npt.assert_array_equal(visualizer.feature_importances_, expected) def test_multi_coefs(self): """ - Test fit with multidimensional coefficients + Test fit with multidimensional coefficients and stack warning """ - coefs = np.array([ - [0.4, 0.2, -0.08, 0.07, 0.16, 0.23, -0.38, 0.1, -0.05], - [0.41, 0.12, -0.1, 0.1, 0.14, 0.21, 0.01, 0.31, -0.15], - [0.31, 0.2, -0.01, 0.1, 0.22, 0.23, 0.01, 0.12, -0.15] + coefs = np.array( + [ + [0.4, 0.2, -0.08, 0.07, 0.16, 0.23, -0.38, 0.1, -0.05], + [0.41, 0.12, -0.1, 0.1, 0.14, 0.21, 0.01, 0.31, -0.15], + [0.31, 0.2, -0.01, 0.1, 0.22, 0.23, 0.01, 0.12, -0.15], ] ) model = MockEstimator() model.make_importance_param(value=coefs) - visualizer = FeatureImportances(model) - visualizer.fit( - np.random.rand(100, len(np.mean(coefs, axis=0))), np.random.rand(100) - ) + visualizer = FeatureImportances(model, stack=False) + + with pytest.warns(YellowbrickWarning): + visualizer.fit( + np.random.rand(100, len(np.mean(coefs, axis=0))), np.random.rand(100) + ) npt.assert_equal(visualizer.feature_importances_.ndim, 1) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) def test_multi_coefs_stacked(self): """ Test stack plot with multidimensional 
coefficients """ - X_iris, y_iris = load_iris(True) - X_iris_pd = pd.DataFrame(X_iris, columns=['f1', 'f2', 'f3', 'f4']) + X, y = load_iris(True) - viz = FeatureImportances(LogisticRegression(), stack=True) - viz.fit(X_iris_pd, y_iris) - viz.poof() + viz = FeatureImportances(LogisticRegression(random_state=222), stack=True) + viz.fit(X, y) + viz.finalize() npt.assert_equal(viz.feature_importances_.shape, (3, 4)) - self.assert_images_similar(viz) - + # Appveyor and Linux conda non-text-based differences + self.assert_images_similar(viz, tol=17.5) @pytest.mark.skipif(pd is None, reason="pandas is required for this test") def test_fit_dataframe(self): """ Ensure feature names are extracted from DataFrame columns """ - labels = ['a', 'b', 'c', 'd', 'e', 'f'] + labels = ["a", "b", "c", "d", "e", "f"] df = pd.DataFrame(np.random.rand(100, 6), columns=labels) - s = pd.Series(np.random.rand(100), name='target') + s = pd.Series(np.random.rand(100), name="target") assert df.shape == (100, 6) @@ -284,7 +255,7 @@ def test_fit_dataframe(self): visualizer = FeatureImportances(model) visualizer.fit(df, s) - assert hasattr(visualizer, 'features_') + assert hasattr(visualizer, "features_") npt.assert_array_equal(visualizer.features_, np.array(df.columns)) def test_fit_makes_labels(self): @@ -298,7 +269,7 @@ def test_fit_makes_labels(self): visualizer.fit(np.random.rand(100, 10), np.random.rand(100)) # Don't have to worry about label space since importances are linspace - assert hasattr(visualizer, 'features_') + assert hasattr(visualizer, "features_") npt.assert_array_equal(np.arange(10), visualizer.features_) def test_fit_calls_draw(self): @@ -306,12 +277,12 @@ def test_fit_calls_draw(self): Assert that fit calls draw """ model = MockEstimator() - model.make_importance_param('coef_') + model.make_importance_param("coef_") visualizer = FeatureImportances(model) - with mock.patch.object(visualizer, 'draw') as mdraw: - visualizer.fit(np.random.rand(100,42), np.random.rand(100)) + 
with mock.patch.object(visualizer, "draw") as mdraw: + visualizer.fit(np.random.rand(100, 42), np.random.rand(100)) mdraw.assert_called_once() def test_draw_raises_unfitted(self): @@ -326,35 +297,36 @@ def test_find_importances_param(self): """ Test the expected parameters can be found """ - params = ('feature_importances_', 'coef_') + params = ("feature_importances_", "coef_") for param in params: model = MockEstimator() - model.make_importance_param(param, 'foo') + model.make_importance_param(param, "foo") visualizer = FeatureImportances(model) assert hasattr(model, param), "expected '{}' missing".format(param) for oparam in params: - if oparam == param: continue + if oparam == param: + continue assert not hasattr(model, oparam), "unexpected '{}'".format(oparam) importances = visualizer._find_importances_param() - assert importances == 'foo' + assert importances == "foo" def test_find_importances_param_priority(self): """ With both feature_importances_ and coef_, one has priority """ model = MockEstimator() - model.make_importance_param('feature_importances_', 'foo') - model.make_importance_param('coef_', 'bar') + model.make_importance_param("feature_importances_", "foo") + model.make_importance_param("coef_", "bar") visualizer = FeatureImportances(model) - assert hasattr(model, 'feature_importances_') - assert hasattr(model, 'coef_') + assert hasattr(model, "feature_importances_") + assert hasattr(model, "coef_") importances = visualizer._find_importances_param() - assert importances == 'foo' + assert importances == "foo" def test_find_importances_param_not_found(self): """ @@ -363,8 +335,8 @@ def test_find_importances_param_not_found(self): model = MockEstimator() visualizer = FeatureImportances(model) - assert not hasattr(model, 'feature_importances_') - assert not hasattr(model, 'coef_') + assert not hasattr(model, "feature_importances_") + assert not hasattr(model, "coef_") with pytest.raises(YellowbrickTypeError): visualizer._find_importances_param() @@ 
-376,9 +348,9 @@ def test_find_classes_param_not_found(self): model = MockClassifier() visualizer = FeatureImportances(model) - assert not hasattr(model, 'classes_') + assert not hasattr(model, "classes_") - e = 'could not find classes_ param on {}'.format( + e = "could not find classes_ param on {}".format( visualizer.estimator.__class__.__name__ ) with pytest.raises(YellowbrickTypeError, match=e): @@ -389,7 +361,7 @@ def test_xlabel(self): Check the various xlabels are sensical """ model = MockEstimator() - model.make_importance_param('feature_importances_') + model.make_importance_param("feature_importances_") visualizer = FeatureImportances(model, xlabel="foo", relative=True) # Assert the visualizer uses the user supplied xlabel @@ -405,7 +377,7 @@ def test_xlabel(self): # Check coeficients model = MockEstimator() - model.make_importance_param('coef_') + model.make_importance_param("coef_") visualizer = FeatureImportances(model, xlabel="baz", relative=True) # Assert the visualizer uses the user supplied xlabel @@ -421,7 +393,6 @@ def test_xlabel(self): assert "coefficient" in visualizer._get_xlabel() assert "relative" not in visualizer._get_xlabel() - def test_is_fitted(self): """ Test identification if is fitted @@ -438,17 +409,41 @@ def test_is_fitted(self): del visualizer.features_ assert not visualizer._is_fitted() + def test_with_fitted(self): + """ + Test that visualizer properly handles an already-fitted model + """ + X, y = load_concrete(return_dataset=True).to_numpy() + + model = Lasso().fit(X, y) + + with mock.patch.object(model, "fit") as mockfit: + oz = FeatureImportances(model) + oz.fit(X, y) + mockfit.assert_not_called() + + with mock.patch.object(model, "fit") as mockfit: + oz = FeatureImportances(model, is_fitted=True) + oz.fit(X, y) + mockfit.assert_not_called() + + with mock.patch.object(model, "fit") as mockfit: + oz = FeatureImportances(model, is_fitted=False) + oz.fit(X, y) + mockfit.assert_called_once_with(X, y) + 
########################################################################## ## Mock Estimator ########################################################################## + class MockEstimator(BaseEstimator): """ Creates params when fit is called on demand. """ - def make_importance_param(self, name='feature_importances_', value=None): + def make_importance_param(self, name="feature_importances_", value=None): if value is None: value = np.random.rand(42) setattr(self, name, value) @@ -461,4 +456,5 @@ class MockClassifier(BaseEstimator, ClassifierMixin): """ Creates empty classifier. """ + pass diff --git a/tests/test_model_selection/test_learning_curve.py b/tests/test_model_selection/test_learning_curve.py index 0d887dc67..422db91dd 100644 --- a/tests/test_model_selection/test_learning_curve.py +++ b/tests/test_model_selection/test_learning_curve.py @@ -1,9 +1,12 @@ # tests.test_model_selection.test_learning_curve # Tests for the LearningCurve visualizer # -# Author: Jason Keung +# Author: Jason Keung # Created: Tues May 23 11:45:00 2017 -0400 # +# Copyright (C) 2017 The scikit-yb developers +# For license information, see LICENSE.txt +# # ID: test_learning_curve.py jason.s.keung@gmail.com $ """ @@ -11,24 +14,26 @@ """ ########################################################################## -## Imports +# Imports ########################################################################## import sys import pytest import numpy as np +from unittest.mock import patch from tests.base import VisualTestCase -from tests.dataset import DatasetMixin from sklearn.svm import LinearSVC from sklearn.linear_model import Ridge from sklearn.naive_bayes import GaussianNB from sklearn.cluster import MiniBatchKMeans +from sklearn.preprocessing import OneHotEncoder from sklearn.model_selection import ShuffleSplit from sklearn.model_selection import StratifiedKFold from sklearn.ensemble import RandomForestClassifier +from yellowbrick.datasets import load_mushroom from 
yellowbrick.exceptions import YellowbrickValueError from yellowbrick.model_selection.learning_curve import * @@ -37,23 +42,19 @@ except ImportError: pd = None -try: - from unittest.mock import patch -except ImportError: - from mock import patch - ########################################################################## -## LearningCurve Test Cases +# LearningCurve Test Cases ########################################################################## + @pytest.mark.usefixtures("classification", "regression", "clusters") -class TestLearningCurve(VisualTestCase, DatasetMixin): +class TestLearningCurve(VisualTestCase): """ Test the LearningCurve visualizer """ - @patch.object(LearningCurve, 'draw') + @patch.object(LearningCurve, "draw") def test_fit(self, mock_draw): """ Assert that fit returns self and creates expected properties @@ -61,8 +62,12 @@ def test_fit(self, mock_draw): X, y = self.classification params = ( "train_sizes_", - "train_scores_", "train_scores_mean_", "train_scores_std_", - "test_scores_", "test_scores_mean_", "test_scores_std_" + "train_scores_", + "train_scores_mean_", + "train_scores_std_", + "test_scores_", + "test_scores_mean_", + "test_scores_std_", ) oz = LearningCurve(GaussianNB(), random_state=12) @@ -75,9 +80,7 @@ def test_fit(self, mock_draw): for param in params: assert hasattr(oz, param) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_classifier(self): """ Test image closeness on a classification dataset @@ -87,13 +90,11 @@ def test_classifier(self): oz = LearningCurve( RandomForestClassifier(random_state=21), random_state=12 ).fit(X, y) - oz.poof() + oz.finalize() self.assert_images_similar(oz) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_regressor(self): """ Test 
image closeness on a regression dataset @@ -102,7 +103,7 @@ def test_regressor(self): oz = LearningCurve(Ridge(), random_state=18) oz.fit(X, y) - oz.poof() + oz.finalize() self.assert_images_similar(oz) @@ -112,44 +113,40 @@ def test_clusters(self): """ X, y = self.clusters - oz = LearningCurve( - MiniBatchKMeans(random_state=281), random_state=182 - ).fit(X) - oz.poof() + oz = LearningCurve(MiniBatchKMeans(random_state=281), random_state=182).fit(X) + oz.finalize() self.assert_images_similar(oz, tol=10) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_quick_method(self): """ Test the learning curve quick method acts as expected """ X, y = self.classification - train_sizes = np.linspace(.1, 1.0, 8) - ax = learning_curve(GaussianNB(), X, y, train_sizes=train_sizes, + train_sizes = np.linspace(0.1, 1.0, 8) + viz = learning_curve( + GaussianNB(), + X, + y, + train_sizes=train_sizes, cv=ShuffleSplit(n_splits=10, test_size=0.2, random_state=34), - scoring='f1_macro', random_state=43, + scoring="f1_macro", + random_state=43, ) - self.assert_images_similar(ax=ax) + self.assert_images_similar(viz) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") @pytest.mark.skipif(pd is None, reason="test requires pandas") def test_pandas_integration(self): """ Test on a real dataset with pandas DataFrame and Series """ - df = self.load_pandas("mushroom") - - target = "target" - features = [col for col in df.columns if col != target] + data = load_mushroom(return_dataset=True) + X, y = data.to_pandas() - X = pd.get_dummies(df[features]) - y = df[target] + X = pd.get_dummies(X) assert isinstance(X, pd.DataFrame) assert isinstance(y, pd.Series) @@ -157,11 +154,27 @@ def test_pandas_integration(self): cv = StratifiedKFold(n_splits=4, 
random_state=32) oz = LearningCurve(GaussianNB(), cv=cv, random_state=23) oz.fit(X, y) - oz.poof() + oz.finalize() + + self.assert_images_similar(oz) + + def test_numpy_integration(self): + """ + Test on a real dataset with NumPy arrays + """ + data = load_mushroom(return_dataset=True) + X, y = data.to_numpy() + + X = OneHotEncoder().fit_transform(X).toarray() + + cv = StratifiedKFold(n_splits=4, random_state=32) + oz = LearningCurve(GaussianNB(), cv=cv, random_state=23) + oz.fit(X, y) + oz.finalize() self.assert_images_similar(oz) - @patch.object(LearningCurve, 'draw') + @patch.object(LearningCurve, "draw") def test_reshape_scores(self, mock_draw): """ Test supplying an alternate CV methodology and train_sizes diff --git a/tests/test_features/test_rfecv.py b/tests/test_model_selection/test_rfecv.py similarity index 50% rename from tests/test_features/test_rfecv.py rename to tests/test_model_selection/test_rfecv.py index c1ceae966..8405011c6 100644 --- a/tests/test_features/test_rfecv.py +++ b/tests/test_model_selection/test_rfecv.py @@ -1,10 +1,13 @@ -# tests.test_feautures.test_rfecv +# tests.test_model_selection.test_rfecv # Tests for the RFECV visualizer # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Tue Apr 03 17:35:16 2018 -0400 # -# ID: test_rfecv.py [] benjamin@bengfort.com $ +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: test_rfecv.py [a4599db] rebeccabilbro@users.noreply.github.com $ """ Tests for the RFECV visualizer @@ -16,17 +19,21 @@ import sys import pytest +import numpy as np +import numpy.testing as npt -from tests.base import VisualTestCase -from tests.dataset import DatasetMixin, Dataset +from unittest.mock import patch -from yellowbrick.features.rfecv import * +from tests.fixtures import Dataset +from tests.base import VisualTestCase +from yellowbrick.model_selection.rfecv import * +from yellowbrick.datasets import load_occupancy from yellowbrick.exceptions import 
YellowbrickValueError from sklearn.svm import SVC +from sklearn.datasets import make_classification from sklearn.model_selection import ShuffleSplit from sklearn.model_selection import StratifiedKFold -from sklearn.datasets import make_classification from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier @@ -35,24 +42,26 @@ except ImportError: pd = None -try: - from unittest.mock import patch -except ImportError: - from mock import patch - ########################################################################## ## Fixtures ########################################################################## + @pytest.fixture(scope="class") def dataset(request): """ Creates a multiclass classification dataset fixture for RFECV """ X, y = make_classification( - n_samples=600, n_features=15, n_informative=7, n_redundant=4, - n_repeated=0, n_classes=8, n_clusters_per_class=1, random_state=0 + n_samples=600, + n_features=15, + n_informative=7, + n_redundant=4, + n_repeated=0, + n_classes=8, + n_clusters_per_class=1, + random_state=0, ) dataset = Dataset(X, y) @@ -63,21 +72,26 @@ def dataset(request): ## Test Cases ########################################################################## + @pytest.mark.usefixtures("dataset") -class TestRFECV(VisualTestCase, DatasetMixin): +class TestRFECV(VisualTestCase): """ Test the RFECV visualizer """ - @patch.object(RFECV, 'draw') + @patch.object(RFECV, "draw") def test_fit(self, mock_draw): """ Assert that fit returns self and creates expected properties with NB """ X, y = self.dataset params = ( - "n_features_", "support_", "ranking_", - "cv_scores_", "rfe_estimator_", + "n_features_", + "support_", + "ranking_", + "cv_scores_", + "rfe_estimator_", + "n_feature_subsets_", ) rf = RandomForestClassifier() @@ -98,9 +112,7 @@ def test_fit(self, mock_draw): assert oz._wrapped is not rf assert oz._wrapped is oz.rfe_estimator_ - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not 
close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_rfecv_classification(self): """ Test image closeness on a classification dataset with an SVM @@ -108,14 +120,12 @@ def test_rfecv_classification(self): cv = ShuffleSplit(3, random_state=21) oz = RFECV(SVC(kernel="linear", C=1), cv=cv) oz.fit(self.dataset.X, self.dataset.y) - oz.poof() + oz.finalize() - self.assert_images_similar(oz) + self.assert_images_similar(oz, remove_legend=True) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) - @pytest.mark.filterwarnings('ignore:F-score is ill-defined') + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") + @pytest.mark.filterwarnings("ignore:F-score is ill-defined") def test_quick_method(self): """ Test the recv quick method works with LogisticRegression @@ -124,27 +134,18 @@ def test_quick_method(self): model = LogisticRegression() X, y = self.dataset - ax = rfecv(model, X, y, step=3, cv=cv, scoring='f1_weighted') + viz = rfecv(model, X, y, step=2, cv=cv, scoring="f1_weighted") - self.assert_images_similar(ax=ax) + self.assert_images_similar(viz, remove_legend=True) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") @pytest.mark.skipif(pd is None, reason="test requires pandas") def test_pandas_integration(self): """ Test on a real dataset with pandas DataFrame and Series """ - df = self.load_pandas("occupancy") - - target = "occupancy" - features = [ - 'temperature', 'relative humidity', 'light', 'C02', 'humidity' - ] - - X = df[features] - y = df[target] + data = load_occupancy(return_dataset=True) + X, y = data.to_pandas() assert isinstance(X, pd.DataFrame) assert isinstance(y, pd.Series) @@ -152,15 +153,56 @@ def test_pandas_integration(self): cv = StratifiedKFold(n_splits=4, random_state=32) oz = 
RFECV(RandomForestClassifier(random_state=83), cv=cv) oz.fit(X, y) - oz.poof() + oz.finalize() - self.assert_images_similar(oz) + self.assert_images_similar(oz, remove_legend=True) - def test_valid_step(self): + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") + def test_numpy_integration(self): + """ + Test on a real dataset with numpy ndarray + """ + data = load_occupancy(return_dataset=True) + X, y = data.to_numpy() + + assert isinstance(X, np.ndarray) + assert isinstance(y, np.ndarray) + + cv = StratifiedKFold(n_splits=4, random_state=32) + oz = RFECV(RandomForestClassifier(random_state=83), cv=cv) + oz.fit(X, y) + oz.finalize() + + self.assert_images_similar(oz, remove_legend=True) + + @pytest.mark.parametrize("step", [0, -1, -5]) + def test_invalid_step(self, step): """ Test step hyperparam validation """ - # TODO: parametrize when unittest is removed - with pytest.raises(YellowbrickValueError): - oz = RFECV(SVC(kernel="lnear"), step=-1) + with pytest.raises(YellowbrickValueError, match="step must be >0"): + oz = RFECV(SVC(kernel="linear"), step=step) oz.fit(self.dataset.X, self.dataset.y) + + def test_rfecv_step(self): + """ + Test RFECV step=5 with LogisticRegression + """ + X, y = make_classification( + n_samples=200, + n_features=30, + n_informative=18, + n_redundant=6, + n_repeated=0, + n_classes=8, + n_clusters_per_class=1, + random_state=0, + ) + + oz = RFECV(LogisticRegression(random_state=32), step=5).fit(X, y) + assert hasattr(oz, "n_feature_subsets_") + npt.assert_array_equal(oz.n_feature_subsets_, np.arange(1, 35, 5)) + + oz.finalize() + tol = 1.75 if sys.platform == "win32" else 0.25 + self.assert_images_similar(oz, tol=tol, remove_legend=True) diff --git a/tests/test_model_selection/test_validation_curve.py b/tests/test_model_selection/test_validation_curve.py index 0ec254331..664cc3ea0 100644 --- a/tests/test_model_selection/test_validation_curve.py +++ b/tests/test_model_selection/test_validation_curve.py @@ 
-1,32 +1,37 @@ # tests.test_model_selection.test_validation_curve # Tests for the ValidationCurve visualizer # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Sat Mar 31 06:25:05 2018 -0400 # -# ID: test_validation_curve.py [] benjamin@bengfort.com $ +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: test_validation_curve.py [c5355ee] benjamin@bengfort.com $ """ Tests for the ValidationCurve visualizer """ ########################################################################## -## Imports +# Imports ########################################################################## import sys import pytest import numpy as np +from unittest.mock import patch from tests.base import VisualTestCase -from tests.dataset import DatasetMixin from sklearn.svm import SVC from sklearn.naive_bayes import BernoulliNB from sklearn.tree import DecisionTreeRegressor +from sklearn.preprocessing import OneHotEncoder from sklearn.neighbors import KNeighborsClassifier from sklearn.model_selection import ShuffleSplit, StratifiedKFold +from yellowbrick.datasets import load_mushroom from yellowbrick.exceptions import YellowbrickValueError from yellowbrick.model_selection.validation_curve import * @@ -36,31 +41,31 @@ except ImportError: pd = None -try: - from unittest.mock import patch -except ImportError: - from mock import patch - ########################################################################## -## Test Cases +# Test Cases ########################################################################## + @pytest.mark.usefixtures("classification", "regression", "clusters") -class TestValidationCurve(VisualTestCase, DatasetMixin): +class TestValidationCurve(VisualTestCase): """ Test the ValidationCurve visualizer """ - @patch.object(ValidationCurve, 'draw') + @patch.object(ValidationCurve, "draw") def test_fit(self, mock_draw): """ Assert that fit returns self and creates expected properties """ X, y = 
self.classification params = ( - "train_scores_", "train_scores_mean_", "train_scores_std_", - "test_scores_", "test_scores_mean_", "test_scores_std_" + "train_scores_", + "train_scores_mean_", + "train_scores_std_", + "test_scores_", + "test_scores_mean_", + "test_scores_std_", ) oz = ValidationCurve( @@ -76,9 +81,7 @@ def test_fit(self, mock_draw): for param in params: assert hasattr(oz, param) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_classifier(self): """ Test image closeness on a classification dataset with kNN @@ -89,12 +92,15 @@ def test_classifier(self): param_range = np.arange(3, 10) oz = ValidationCurve( - KNeighborsClassifier(), param_name="n_neighbors", - param_range=param_range, cv=cv, scoring='f1_weighted', + KNeighborsClassifier(), + param_name="n_neighbors", + param_range=param_range, + cv=cv, + scoring="f1_weighted", ) oz.fit(X, y) - oz.poof() + oz.finalize() self.assert_images_similar(oz) @@ -108,18 +114,19 @@ def test_regression(self): param_range = np.arange(3, 10) oz = ValidationCurve( - DecisionTreeRegressor(random_state=23), param_name="max_depth", - param_range=param_range, cv=cv, scoring='r2', + DecisionTreeRegressor(random_state=23), + param_name="max_depth", + param_range=param_range, + cv=cv, + scoring="r2", ) oz.fit(X, y) - oz.poof() + oz.finalize() self.assert_images_similar(oz, tol=12.0) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_quick_method(self): """ Test validation curve quick method with image closeness on SVC @@ -128,42 +135,53 @@ def test_quick_method(self): pr = np.logspace(-6, -1, 3) cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=321) - ax = validation_curve( - SVC(), X, y, logx=True, param_name='gamma', param_range=pr, cv=cv + viz = 
validation_curve( + SVC(), X, y, logx=True, param_name="gamma", param_range=pr, cv=cv ) - self.assert_images_similar(ax=ax) + self.assert_images_similar(viz) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") @pytest.mark.skipif(pd is None, reason="test requires pandas") def test_pandas_integration(self): """ Test on mushroom dataset with pandas DataFrame and Series and NB """ - df = self.load_pandas("mushroom") + data = load_mushroom(return_dataset=True) + X, y = data.to_pandas() - target = "target" - features = [col for col in df.columns if col != target] - - X = pd.get_dummies(df[features]) - y = df[target] + X = pd.get_dummies(X) assert isinstance(X, pd.DataFrame) assert isinstance(y, pd.Series) cv = StratifiedKFold(n_splits=2, random_state=11) pr = np.linspace(0.1, 3.0, 6) - oz = ValidationCurve( - BernoulliNB(), cv=cv, param_range=pr, param_name='alpha' - ) + oz = ValidationCurve(BernoulliNB(), cv=cv, param_range=pr, param_name="alpha") + oz.fit(X, y) + oz.finalize() + + self.assert_images_similar(oz) + + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") + def test_numpy_integration(self): + """ + Test on mushroom dataset with NumPy arrays + """ + data = load_mushroom(return_dataset=True) + X, y = data.to_numpy() + + X = OneHotEncoder().fit_transform(X).toarray() + + cv = StratifiedKFold(n_splits=2, random_state=11) + pr = np.linspace(0.1, 3.0, 6) + oz = ValidationCurve(BernoulliNB(), cv=cv, param_range=pr, param_name="alpha") oz.fit(X, y) - oz.poof() + oz.finalize() self.assert_images_similar(oz) - @patch.object(ValidationCurve, 'draw') + @patch.object(ValidationCurve, "draw") def test_reshape_scores(self, mock_draw): """ Test supplying an alternate CV methodology and train_sizes @@ -172,7 +190,7 @@ def test_reshape_scores(self, mock_draw): pr = np.logspace(-6, -1, 3) cv = ShuffleSplit(n_splits=5, 
test_size=0.2, random_state=14) - oz = ValidationCurve(SVC(), param_name='gamma', param_range=pr, cv=cv) + oz = ValidationCurve(SVC(), param_name="gamma", param_range=pr, cv=cv) oz.fit(X, y) assert oz.train_scores_.shape == (3, 5) @@ -183,4 +201,4 @@ def test_bad_train_sizes(self): Test learning curve with bad input for training size. """ with pytest.raises(YellowbrickValueError): - ValidationCurve(SVC(), param_name='gamma', param_range=100) + ValidationCurve(SVC(), param_name="gamma", param_range=100) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 940dc7e83..b642f795a 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Fri Oct 07 22:10:50 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_pipeline.py [1efae1f] benjamin@bengfort.com $ @@ -18,34 +18,30 @@ ########################################################################## import os -import unittest +import pytest +from unittest import mock from yellowbrick.base import Visualizer from yellowbrick.pipeline import VisualPipeline from sklearn.pipeline import Pipeline from sklearn.base import BaseEstimator, TransformerMixin -try: - from unittest import mock -except ImportError: - import mock - ########################################################################## ## Mock Objects ########################################################################## + class Thing(object): pass class MockEstimator(BaseEstimator): - def fit(self, X, y=None, **kwargs): return self -class MockVisualEstimator(Visualizer): +class MockVisualEstimator(Visualizer): def fit(self, X, y=None, **kwargs): self.draw(**kwargs) return self @@ -55,7 +51,6 @@ def draw(self, **kwargs): class MockTransformer(BaseEstimator, TransformerMixin): - def fit(self, X, y=None, **kwargs): return self @@ -64,7 +59,6 @@ def transform(self, X, **kwargs): class 
MockVisualTransformer(Visualizer, TransformerMixin): - def fit(self, X, y=None, **kwargs): self.draw(**kwargs) return self @@ -80,8 +74,8 @@ def draw(self, **kwargs): ## VisualPipeline Tests ########################################################################## -class VisualPipelineTests(unittest.TestCase): +class TestVisualPipeline(object): def test_validate_steps(self): """ Assert that visual transformers can be added to pipelines @@ -91,52 +85,54 @@ def test_validate_steps(self): # TypeError if the steps don't match transforms --> estimator. # validate a bad intermediate transformer on the Pipeline - with self.assertRaises(TypeError): - Pipeline([ - ('real', MockTransformer()), - ('bad', Thing()), - ('model', MockEstimator()), - ]) + with pytest.raises(TypeError): + Pipeline( + [ + ("real", MockTransformer()), + ("bad", Thing()), + ("model", MockEstimator()), + ] + ) # validate a bad intermediate transformer on the VisualPipeline - with self.assertRaises(TypeError): - VisualPipeline([ - ('real', MockTransformer()), - ('bad', Thing()), - ('model', MockEstimator()), - ]) + with pytest.raises(TypeError): + VisualPipeline( + [ + ("real", MockTransformer()), + ("bad", Thing()), + ("model", MockEstimator()), + ] + ) # validate a bad final estimator on the Pipeline - with self.assertRaises(TypeError): - Pipeline([ - ('real', MockTransformer()), - ('bad', Thing()), - ]) + with pytest.raises(TypeError): + Pipeline([("real", MockTransformer()), ("bad", Thing())]) # validate a bad final estimator on the VisualPipeline - with self.assertRaises(TypeError): - VisualPipeline([ - ('real', MockTransformer()), - ('bad', Thing()), - ]) + with pytest.raises(TypeError): + VisualPipeline([("real", MockTransformer()), ("bad", Thing())]) # validate visual transformers on a Pipeline try: - Pipeline([ - ('real', MockTransformer()), - ('visual', MockVisualTransformer()), - ('model', MockEstimator()), - ]) + Pipeline( + [ + ("real", MockTransformer()), + ("visual", 
MockVisualTransformer()), + ("model", MockEstimator()), + ] + ) except TypeError: self.fail("could not add a visual transformer to a Pipeline!") # validate visual transformers on a VisualPipeline try: - VisualPipeline([ - ('real', MockTransformer()), - ('visual', MockVisualTransformer()), - ('model', MockEstimator()), - ]) + VisualPipeline( + [ + ("real", MockTransformer()), + ("visual", MockVisualTransformer()), + ("model", MockEstimator()), + ] + ) except TypeError: self.fail("could not add a visual transformer to a VisualPipeline!") @@ -145,32 +141,36 @@ def test_visual_steps_property(self): Test the visual steps property to filter visualizers """ - pipeline = VisualPipeline([ - ('a', MockTransformer()), - ('b', MockVisualTransformer()), - ('c', MockTransformer()), - ('d', MockVisualTransformer()), - ('e', MockEstimator()), - ]) - - self.assertNotIn('a', pipeline.visual_steps) - self.assertIn('b', pipeline.visual_steps) - self.assertNotIn('c', pipeline.visual_steps) - self.assertIn('d', pipeline.visual_steps) - self.assertNotIn('e', pipeline.visual_steps) + pipeline = VisualPipeline( + [ + ("a", MockTransformer()), + ("b", MockVisualTransformer()), + ("c", MockTransformer()), + ("d", MockVisualTransformer()), + ("e", MockEstimator()), + ] + ) + + assert "a" not in pipeline.visual_steps + assert "b" in pipeline.visual_steps + assert "c" not in pipeline.visual_steps + assert "d" in pipeline.visual_steps + assert "e" not in pipeline.visual_steps def test_pipeline_poof(self): """ Test the poof call against the VisualPipeline """ - pipeline = VisualPipeline([ - ('a', mock.MagicMock(MockTransformer())), - ('b', mock.MagicMock(MockVisualTransformer())), - ('c', mock.MagicMock(MockTransformer())), - ('d', mock.MagicMock(MockVisualTransformer())), - ('e', mock.MagicMock(MockEstimator()),) - ]) + pipeline = VisualPipeline( + [ + ("a", mock.MagicMock(MockTransformer())), + ("b", mock.MagicMock(MockVisualTransformer())), + ("c", mock.MagicMock(MockTransformer())), + ("d", 
mock.MagicMock(MockVisualTransformer())), + ("e", mock.MagicMock(MockEstimator())), + ] + ) pipeline.poof() pipeline.steps[1][1].poof.assert_called_once_with(outpath=None) @@ -180,41 +180,49 @@ def test_pipeline_savefig_poof(self): """ Test the poof call with an outdir to save all the figures """ - pipeline = VisualPipeline([ - ('a', mock.MagicMock(MockTransformer())), - ('b', mock.MagicMock(MockVisualTransformer())), - ('c', mock.MagicMock(MockTransformer())), - ('d', mock.MagicMock(MockVisualTransformer())), - ('e', mock.MagicMock(MockVisualEstimator()),) - ]) + pipeline = VisualPipeline( + [ + ("a", mock.MagicMock(MockTransformer())), + ("b", mock.MagicMock(MockVisualTransformer())), + ("c", mock.MagicMock(MockTransformer())), + ("d", mock.MagicMock(MockVisualTransformer())), + ("e", mock.MagicMock(MockVisualEstimator())), + ] + ) # Must use path joining for Windows compatibility tmpdir = os.path.join("tmp", "figures") pipeline.poof(outdir=tmpdir) - pipeline.steps[1][1].poof.assert_called_once_with(outpath=os.path.join(tmpdir, "b.pdf")) - pipeline.steps[3][1].poof.assert_called_once_with(outpath=os.path.join(tmpdir, "d.pdf")) - pipeline.steps[4][1].poof.assert_called_once_with(outpath=os.path.join(tmpdir, "e.pdf")) - - @unittest.skip("need to find a way for fit to return self in mocks") + pipeline.steps[1][1].poof.assert_called_once_with( + outpath=os.path.join(tmpdir, "b.pdf") + ) + pipeline.steps[3][1].poof.assert_called_once_with( + outpath=os.path.join(tmpdir, "d.pdf") + ) + pipeline.steps[4][1].poof.assert_called_once_with( + outpath=os.path.join(tmpdir, "e.pdf") + ) + + @pytest.mark.skip(reason="need to find a way for fit to return self in mocks") def test_fit_transform_poof_and_draw_calls(self): """ Test calling fit, transform, and poof on the pipeline """ - pipeline = VisualPipeline([ - ('a', mock.MagicMock(MockTransformer())), - ('b', mock.MagicMock(MockVisualTransformer())), - ('c', mock.MagicMock(MockTransformer())), - ('d', 
mock.MagicMock(MockVisualTransformer())), - ('e', mock.MagicMock(MockEstimator()),) - ]) + pipeline = VisualPipeline( + [ + ("a", mock.MagicMock(MockTransformer())), + ("b", mock.MagicMock(MockVisualTransformer())), + ("c", mock.MagicMock(MockTransformer())), + ("d", mock.MagicMock(MockVisualTransformer())), + ("e", mock.MagicMock(MockEstimator())), + ] + ) - X = [[1, 1, 1, 1, 1], - [2, 2, 2, 2, 2], - [3, 3, 3, 3, 3]] + X = [[1, 1, 1, 1, 1], [2, 2, 2, 2, 2], [3, 3, 3, 3, 3]] - y = [1, 2, 3, 4, 5] + y = [1, 2, 3, 4, 5] pipeline.fit(X, y) for name, step in pipeline.named_steps.items(): @@ -222,10 +230,12 @@ def test_fit_transform_poof_and_draw_calls(self): pipeline.transform(X) for name, step in pipeline.named_steps.items(): - if name == 'e': continue + if name == "e": + continue step.transform.assert_called_once_with(X) pipeline.poof() for name, step in pipeline.named_steps.items(): - if name in {'a', 'c', 'e'}: continue + if name in {"a", "c", "e"}: + continue step.poof.assert_called_once_with(outpath=None) diff --git a/tests/test_regressor/__init__.py b/tests/test_regressor/__init__.py index 891dc2116..39eaa1d77 100644 --- a/tests/test_regressor/__init__.py +++ b/tests/test_regressor/__init__.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Mon Mar 06 22:01:34 2017 -0500 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: __init__.py [7d3f5e6] benjamin@bengfort.com $ diff --git a/tests/test_regressor/test_alphas.py b/tests/test_regressor/test_alphas.py index e7bf34608..6f43feba5 100644 --- a/tests/test_regressor/test_alphas.py +++ b/tests/test_regressor/test_alphas.py @@ -1,10 +1,10 @@ # tests.test_regressor.test_alphas # Tests for the alpha selection visualizations. 
# -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Tue Mar 07 12:13:04 2017 -0500 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_alphas.py [7d3f5e6] benjamin@bengfort.com $ @@ -29,6 +29,8 @@ from yellowbrick.exceptions import YellowbrickValueError from sklearn.svm import SVR, SVC +from sklearn.cluster import KMeans +from sklearn.decomposition import PCA from sklearn.datasets import make_regression from sklearn.linear_model import Ridge, RidgeCV from sklearn.linear_model import Lasso, LassoCV @@ -40,14 +42,13 @@ ## Alpha Selection Tests ########################################################################## + class TestAlphaSelection(VisualTestCase): """ Test the AlphaSelection visualizer """ - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_similar_image(self): """ Integration test with image simiarlity comparison @@ -57,33 +58,35 @@ def test_similar_image(self): X, y = make_regression(random_state=0) visualizer.fit(X, y) - visualizer.poof() + visualizer.finalize() self.assert_images_similar(visualizer) - def test_regressor_cv(self): + @pytest.mark.parametrize("model", [SVR, Ridge, Lasso, LassoLars, ElasticNet]) + def test_regressor_nocv(self, model): """ Ensure only "CV" regressors are allowed """ - # TODO: parametrize with models when unittest dependency removed - for model in (SVR, Ridge, Lasso, LassoLars, ElasticNet): - with pytest.raises(YellowbrickTypeError): - AlphaSelection(model()) + with pytest.raises(YellowbrickTypeError): + AlphaSelection(model()) - # TODO: parametrize with models when unittest dependency removed (new test case) - for model in (RidgeCV, LassoCV, LassoLarsCV, ElasticNetCV): - try: - AlphaSelection(model()) - except YellowbrickTypeError: - pytest.fail("could not instantiate RegressorCV on 
alpha selection") + @pytest.mark.parametrize("model", [RidgeCV, LassoCV, LassoLarsCV, ElasticNetCV]) + def test_regressor_cv(self, model): + """ + Ensure "CV" regressors are allowed + """ + try: + AlphaSelection(model()) + except YellowbrickTypeError: + pytest.fail("could not instantiate RegressorCV on alpha selection") - def test_only_regressors(self): + @pytest.mark.parametrize("model", [SVC, KMeans, PCA]) + def test_only_regressors(self, model): """ Assert AlphaSelection only works with regressors """ - # TODO: parameterize with classifier, clusterer, decomposition with pytest.raises(YellowbrickTypeError): - AlphaSelection(SVC()) + AlphaSelection(model()) def test_store_cv_values(self): """ @@ -99,21 +102,19 @@ def test_store_cv_values(self): model = AlphaSelection(RidgeCV(store_cv_values=False)) assert model.estimator.store_cv_values - def test_get_alphas_param(self): + @pytest.mark.parametrize("model", [RidgeCV, LassoCV, ElasticNetCV]) + def test_get_alphas_param(self, model): """ - Assert that we can get the alphas from ridge, lasso, and elasticnet + Assert that we can get the alphas from original CV models """ alphas = np.logspace(-10, -2, 100) - # Test original CV models - # TODO: parametrize this test with different models - for model in (RidgeCV, LassoCV, ElasticNetCV): - try: - model = AlphaSelection(model(alphas=alphas)) - malphas = model._find_alphas_param() - assert_array_equal(alphas, malphas) - except YellowbrickValueError: - pytest.fail("could not find alphas on {}".format(model.name)) + try: + model = AlphaSelection(model(alphas=alphas)) + malphas = model._find_alphas_param() + assert_array_equal(alphas, malphas) + except YellowbrickValueError: + pytest.fail("could not find alphas on {}".format(model.name)) def test_get_alphas_param_lassolars(self): """ @@ -128,24 +129,21 @@ def test_get_alphas_param_lassolars(self): except YellowbrickValueError: pytest.fail("could not find alphas on {}".format(model.name)) - def test_get_errors_param(self): + 
@pytest.mark.parametrize("model", [RidgeCV, LassoCV, LassoLarsCV, ElasticNetCV]) + def test_get_errors_param(self, model): """ Test known models we can get the cv errors for alpha selection """ + try: + model = AlphaSelection(model()) - # Test original CV models - # TODO: parametrize this test with different models - for model in (RidgeCV, LassoCV, LassoLarsCV, ElasticNetCV): - try: - model = AlphaSelection(model()) - - X, y = make_regression() - model.fit(X, y) + X, y = make_regression() + model.fit(X, y) - errors = model._find_errors_param() - assert len(errors) > 0 - except YellowbrickValueError: - pytest.fail("could not find errors on {}".format(model.name)) + errors = model._find_errors_param() + assert len(errors) > 0 + except YellowbrickValueError: + pytest.fail("could not find errors on {}".format(model.name)) def test_score(self): """ diff --git a/tests/test_regressor/test_influence.py b/tests/test_regressor/test_influence.py new file mode 100644 index 000000000..f07f81054 --- /dev/null +++ b/tests/test_regressor/test_influence.py @@ -0,0 +1,173 @@ +# tests.test_regressor.test_influence +# Test the regressor influence visualizers. +# +# Author: Benjamin Bengfort +# Created: Sun Jun 09 16:03:31 2019 -0400 +# +# Copyright (C) 2019 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: test_influence.py [fe14cfd] benjamin@bengfort.com $ + +""" +Test the regressor influence visualizers. 
+""" + +########################################################################## +## Imports +########################################################################## + +import pytest +import numpy as np +import matplotlib.pyplot as plt + +from tests.base import VisualTestCase +from tests.fixtures import Dataset +from sklearn.datasets import make_regression + +from yellowbrick.regressor.influence import * +from yellowbrick.datasets import load_concrete + +try: + import pandas as pd +except ImportError: + pd = None + + +########################################################################## +## Fixtures +########################################################################## + + +@pytest.fixture(scope="class") +def data(request): + """ + Creates a random regression fixture that has a R2 score below 0.85 and several + outliers that best demonstrate the effectiveness of influence visualizers. + """ + X, y = make_regression( + n_samples=100, + n_features=14, + n_informative=6, + bias=1.2, + noise=49.8, + tail_strength=0.6, + random_state=637, + ) + + request.cls.data = Dataset(X, y) + + +########################################################################## +## Assertion Helpers +########################################################################## + +LEARNED_FIELDS = ( + "distance_", + "p_values_", + "influence_threshold_", + "outlier_percentage_", +) + + +def assert_not_fitted(oz): + for field in LEARNED_FIELDS: + assert not hasattr(oz, field) + + +def assert_fitted(oz): + for field in LEARNED_FIELDS: + assert hasattr(oz, field) + + +########################################################################## +## Test CooksDistance Visualizer +########################################################################## + + +@pytest.mark.usefixtures("data") +class TestCooksDistance(VisualTestCase): + """ + CooksDistance visual test cases + """ + + def test_cooks_distance(self): + """ + Test image similarity of Cook's Distance on a random dataset + """ + 
_, ax = plt.subplots() + viz = CooksDistance(ax=ax) + + assert_not_fitted(viz) + assert viz.fit(self.data.X, self.data.y) is viz + assert_fitted(viz) + + # Test fitted values + assert viz.distance_.shape == (self.data.X.shape[0],) + assert viz.p_values_.shape == viz.distance_.shape + assert 0.0 <= viz.influence_threshold_ <= 4.0 + assert 0.0 <= viz.outlier_percentage_ <= 100.0 + + self.assert_images_similar(viz) + + def test_cooks_distance_quickmethod(self): + """ + Test the cooks_distance quick method on a random dataset + """ + _, ax = plt.subplots() + viz = cooks_distance( + self.data.X, + self.data.y, + ax=ax, + draw_threshold=False, + linefmt="r-", + markerfmt="ro", + ) + + assert_fitted(viz) + self.assert_images_similar(viz) + + @pytest.mark.skipif(pd is None, reason="test requires pandas") + def test_pandas_integration(self): + """ + Test on the concrete dataset with pandas DataFrame and Series + """ + data = load_concrete(return_dataset=True) + X, y = data.to_pandas() + + assert isinstance(X, pd.DataFrame) + assert isinstance(y, pd.Series) + + _, ax = plt.subplots() + viz = CooksDistance(ax=ax).fit(X, y) + assert_fitted(viz) + + assert viz.distance_.sum() == pytest.approx(1.2911900571300652) + assert viz.p_values_.sum() == pytest.approx(1029.9999525376425) + assert viz.influence_threshold_ == pytest.approx(0.003883495145631068) + assert viz.outlier_percentage_ == pytest.approx(7.3786407766990285) + + viz.finalize() + self.assert_images_similar(viz) + + def test_numpy_integration(self): + """ + Test on concrete dataset with numpy arrays + """ + data = load_concrete(return_dataset=True) + X, y = data.to_numpy() + + assert isinstance(X, np.ndarray) + assert isinstance(y, np.ndarray) + + _, ax = plt.subplots() + viz = CooksDistance(ax=ax).fit(X, y) + assert_fitted(viz) + + assert viz.distance_.sum() == pytest.approx(1.2911900571300652) + assert viz.p_values_.sum() == pytest.approx(1029.9999525376425) + assert viz.influence_threshold_ == 
pytest.approx(0.003883495145631068) + assert viz.outlier_percentage_ == pytest.approx(7.3786407766990285) + + viz.finalize() + self.assert_images_similar(viz) diff --git a/tests/test_regressor/test_residuals.py b/tests/test_regressor/test_residuals.py index 4ab36e8c0..5550bd193 100644 --- a/tests/test_regressor/test_residuals.py +++ b/tests/test_regressor/test_residuals.py @@ -1,11 +1,11 @@ # tests.test_regressor.test_residuals # Ensure that the regressor residuals visualizations work. # -# Author: Rebecca Bilbro -# Author: Benjamin Bengfort +# Author: Rebecca Bilbro +# Author: Benjamin Bengfort # Created: Sat Oct 8 16:30:39 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_residuals.py [7d3f5e6] benjamin@bengfort.com $ @@ -23,17 +23,18 @@ import matplotlib as mpl import matplotlib.pyplot as plt +from yellowbrick.datasets import load_energy from yellowbrick.regressor.residuals import * from yellowbrick.exceptions import YellowbrickValueError -from tests.base import VisualTestCase -from tests.dataset import DatasetMixin, Dataset, Split +from unittest import mock +from tests.fixtures import Dataset, Split +from tests.base import IS_WINDOWS_OR_CONDA, VisualTestCase +from sklearn.datasets import make_regression from sklearn.linear_model import Ridge, Lasso -from sklearn.linear_model import LinearRegression from sklearn.neural_network import MLPRegressor - -from sklearn.datasets import make_regression +from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split as tts @@ -42,11 +43,6 @@ except ImportError: pd = None -try: - from unittest import mock -except ImportError: - import mock - # Determine version of matplotlib MPL_VERS_MAJ = int(mpl.__version__.split(".")[0]) @@ -55,25 +51,26 @@ ## Data ########################################################################## -@pytest.fixture(scope='class') + 
+@pytest.fixture(scope="class") def data(request): """ Creates a fixture of train and test splits for the sklearn digits dataset For ease of use returns a Dataset named tuple composed of two Split tuples. """ X, y = make_regression( - n_samples=500, n_features=22, n_informative=8, random_state=42, - noise=0.2, bias=0.2, + n_samples=500, + n_features=22, + n_informative=8, + random_state=42, + noise=0.2, + bias=0.2, ) - X_train, X_test, y_train, y_test = tts( - X, y, test_size=0.2, random_state=11 - ) + X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=11) # Set a class attribute for digits - request.cls.data = Dataset( - Split(X_train, X_test), Split(y_train, y_test) - ) + request.cls.data = Dataset(Split(X_train, X_test), Split(y_train, y_test)) ########################################################################## @@ -82,7 +79,7 @@ def data(request): @pytest.mark.usefixtures("data") -class TestPredictionError(VisualTestCase, DatasetMixin): +class TestPredictionError(VisualTestCase): """ Test the PredictionError visualizer """ @@ -112,17 +109,29 @@ def test_prediction_error_pandas(self): _, ax = plt.subplots() # Load the occupancy dataset from fixtures - data = self.load_data('energy') - target = 'cooling_load' - features = [ - "relative_compactness", "surface_area", "wall_area", "roof_area", - "overall_height", "orientation", "glazing_area", - "glazing_area_distribution" - ] - - # Create instances and target - X = pd.DataFrame(data[features]) - y = pd.Series(data[target].astype(float)) + data = load_energy(return_dataset=True) + X, y = data.to_pandas() + + # Create train/test splits + splits = tts(X, y, test_size=0.2, random_state=8873) + X_train, X_test, y_train, y_test = splits + + visualizer = PredictionError(Ridge(random_state=22), ax=ax) + visualizer.fit(X_train, y_train) + visualizer.score(X_test, y_test) + visualizer.finalize() + + self.assert_images_similar(visualizer, tol=1, remove_legend=True) + + def 
test_prediction_error_numpy(self): + """ + Test NumPy real world dataset with image similarity on Ridge + """ + _, ax = plt.subplots() + + # Load the occupancy dataset from fixtures + data = load_energy(return_dataset=True) + X, y = data.to_numpy() # Create train/test splits splits = tts(X, y, test_size=0.2, random_state=8873) @@ -199,9 +208,7 @@ def test_alpha_param(self): # Instantiate a sklearn regressor model = Lasso(random_state=23, alpha=10) # Instantiate a prediction error plot, provide custom alpha - visualizer = PredictionError( - model, bestfit=False, identity=False, alpha=0.7 - ) + visualizer = PredictionError(model, bestfit=False, identity=False, alpha=0.7) # Test param gets set correctly assert visualizer.alpha == 0.7 @@ -216,19 +223,48 @@ def test_alpha_param(self): assert "alpha" in scatter_kwargs assert scatter_kwargs["alpha"] == 0.7 + @pytest.mark.xfail( + reason="""third test fails with AssertionError: Expected fit + to be called once. Called 0 times.""" + ) + def test_peplot_with_fitted(self): + """ + Test that PredictionError properly handles an already-fitted model + """ + X, y = load_energy(return_dataset=True).to_numpy() + + model = Ridge().fit(X, y) + + with mock.patch.object(model, "fit") as mockfit: + oz = PredictionError(model) + oz.fit(X, y) + mockfit.assert_not_called() + + with mock.patch.object(model, "fit") as mockfit: + oz = PredictionError(model, is_fitted=True) + oz.fit(X, y) + mockfit.assert_not_called() + + with mock.patch.object(model, "fit") as mockfit: + oz = PredictionError(model, is_fitted=False) + oz.fit(X, y) + mockfit.assert_called_once_with(X, y) + ########################################################################## ## Residuals Plot Test Cases ########################################################################## + @pytest.mark.usefixtures("data") -class TestResidualsPlot(VisualTestCase, DatasetMixin): +class TestResidualsPlot(VisualTestCase): """ Test ResidualPlot visualizer """ @pytest.mark.xfail( - 
sys.platform == 'win32', reason="images not close on windows (RMSE=32)" + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", ) def test_residuals_plot(self): """ @@ -242,10 +278,10 @@ def test_residuals_plot(self): visualizer.score(self.data.X.test, self.data.y.test) visualizer.finalize() - self.assert_images_similar(visualizer, tol=1, remove_legend=True) + self.assert_images_similar(visualizer) @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows (RMSE=32)" + sys.platform == "win32", reason="images not close on windows (RMSE=32)" ) @pytest.mark.filterwarnings("ignore:Stochastic Optimizer") def test_residuals_plot_no_histogram(self): @@ -263,25 +299,31 @@ def test_residuals_plot_no_histogram(self): self.assert_images_similar(visualizer, tol=1, remove_legend=True) - @pytest.mark.skipif(MPL_VERS_MAJ >= 2, reason="test requires mpl earlier than 2.0.2") + @pytest.mark.skipif( + MPL_VERS_MAJ >= 2, reason="test requires mpl earlier than 2.0.2" + ) def test_hist_matplotlib_version(self, mock_toolkit): """ ValueError is raised when matplotlib version is incorrect and hist=True """ with pytst.raises(ImportError): from mpl_toolkits.axes_grid1 import make_axes_locatable + assert not make_axes_locatable with pytest.raises(YellowbrickValueError, match="requires matplotlib 2.0.2"): ResidualsPlot(LinearRegression(), hist=True) - @pytest.mark.skipif(MPL_VERS_MAJ >= 2, reason="test requires mpl earlier than 2.0.2") + @pytest.mark.skipif( + MPL_VERS_MAJ >= 2, reason="test requires mpl earlier than 2.0.2" + ) def test_no_hist_matplotlib_version(self, mock_toolkit): """ No error is raised when matplotlib version is incorrect and hist=False """ - with pytst.raises(ImportError): + with pytest.raises(ImportError): from mpl_toolkits.axes_grid1 import make_axes_locatable + assert not make_axes_locatable try: @@ -290,7 +332,8 @@ def test_no_hist_matplotlib_version(self, mock_toolkit): self.fail(e) @pytest.mark.xfail( - 
sys.platform == 'win32', reason="images not close on windows (RMSE=32)" + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", ) def test_residuals_quick_method(self): """ @@ -299,14 +342,15 @@ def test_residuals_quick_method(self): _, ax = plt.subplots() model = Lasso(random_state=19) - ax = residuals_plot( + oz = residuals_plot( model, self.data.X.train, self.data.y.train, ax=ax, random_state=23 ) - self.assert_images_similar(ax=ax, tol=1, remove_legend=True) + self.assert_images_similar(oz) @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows (RMSE=32)" + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", ) @pytest.mark.skipif(pd is None, reason="pandas is required") def test_residuals_plot_pandas(self): @@ -316,17 +360,8 @@ def test_residuals_plot_pandas(self): _, ax = plt.subplots() # Load the occupancy dataset from fixtures - data = self.load_data('energy') - target = 'heating_load' - features = [ - "relative_compactness", "surface_area", "wall_area", "roof_area", - "overall_height", "orientation", "glazing_area", - "glazing_area_distribution" - ] - - # Create instances and target - X = pd.DataFrame(data[features]) - y = pd.Series(data[target].astype(float)) + data = load_energy(return_dataset=True) + X, y = data.to_pandas() # Create train/test splits splits = tts(X, y, test_size=0.2, random_state=231) @@ -337,7 +372,28 @@ def test_residuals_plot_pandas(self): visualizer.score(X_test, y_test) visualizer.finalize() - self.assert_images_similar(visualizer, tol=1, remove_legend=True) + self.assert_images_similar(visualizer) + + def test_residuals_plot_numpy(self): + """ + Test NumPy real world dataset with image similarity on Lasso + """ + _, ax = plt.subplots() + + # Load the occupancy dataset from fixtures + data = load_energy(return_dataset=True) + X, y = data.to_numpy() + + # Create train/test splits + splits = tts(X, y, test_size=0.2, random_state=231) + 
X_train, X_test, y_train, y_test = splits + + visualizer = ResidualsPlot(Lasso(random_state=44), ax=ax) + visualizer.fit(X_train, y_train) + visualizer.score(X_test, y_test) + visualizer.finalize() + + self.assert_images_similar(visualizer, tol=1.5) def test_score(self): """ @@ -352,18 +408,18 @@ def test_score(self): assert visualizer.train_score_ == pytest.approx(0.9999906, rel=1e-4) assert visualizer.test_score_ == score - @mock.patch('yellowbrick.regressor.residuals.plt.sca', autospec=True) + @mock.patch("yellowbrick.regressor.residuals.plt.sca", autospec=True) def test_alpha_param(self, mock_sca): """ Test that the user can supply an alpha param on instantiation """ # Instantiate a prediction error plot, provide custom alpha visualizer = ResidualsPlot( - Ridge(random_state=8893), alpha=0.3, hist=False + Ridge(random_state=8893), train_alpha=0.3, test_alpha=0.75, hist=False ) - + alphas = {"train_point": 0.3, "test_point": 0.75} # Test param gets set correctly - assert visualizer.alpha == 0.3 + assert visualizer.alphas == alphas visualizer.ax = mock.MagicMock() visualizer.fit(self.data.X.train, self.data.y.train) @@ -372,4 +428,31 @@ def test_alpha_param(self, mock_sca): # Test that alpha was passed to internal matplotlib scatterplot _, scatter_kwargs = visualizer.ax.scatter.call_args assert "alpha" in scatter_kwargs - assert scatter_kwargs["alpha"] == 0.3 + assert scatter_kwargs["alpha"] == 0.75 + + @pytest.mark.xfail( + reason="""third test fails with AssertionError: Expected fit + to be called once. 
Called 0 times.""" + ) + def test_residuals_with_fitted(self): + """ + Test that ResidualsPlot properly handles an already-fitted model + """ + X, y = load_energy(return_dataset=True).to_numpy() + + model = Ridge().fit(X, y) + + with mock.patch.object(model, "fit") as mockfit: + oz = ResidualsPlot(model) + oz.fit(X, y) + mockfit.assert_not_called() + + with mock.patch.object(model, "fit") as mockfit: + oz = ResidualsPlot(model, is_fitted=True) + oz.fit(X, y) + mockfit.assert_not_called() + + with mock.patch.object(model, "fit") as mockfit: + oz = ResidualsPlot(model, is_fitted=False) + oz.fit(X, y) + mockfit.assert_called_once_with(X, y) diff --git a/tests/test_style/__init__.py b/tests/test_style/__init__.py index cdf542fd3..8f4e31e4a 100644 --- a/tests/test_style/__init__.py +++ b/tests/test_style/__init__.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Tue Oct 04 16:21:21 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: __init__.py [c6aff34] benjamin@bengfort.com $ diff --git a/tests/test_style/test_colors.py b/tests/test_style/test_colors.py index 36894571e..ea0a3e5c3 100644 --- a/tests/test_style/test_colors.py +++ b/tests/test_style/test_colors.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Thu Oct 06 09:30:49 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_colors.py [c6aff34] benjamin@bengfort.com $ @@ -17,13 +17,18 @@ ## Imports ########################################################################## +import sys import pytest from matplotlib import cm from cycler import Cycler +from sklearn.cluster import KMeans +from sklearn.datasets import make_blobs + from yellowbrick.style.colors import * from yellowbrick.style.palettes import ColorPalette, PALETTES +from yellowbrick.cluster.silhouette import SilhouetteVisualizer from 
tests.base import VisualTestCase @@ -32,6 +37,7 @@ ## Color Tests ########################################################################## + class TestGetColorCycle(VisualTestCase): """ Test get_color_cycle helper function @@ -44,7 +50,7 @@ def test_cycle_depends_on_palette(self): c = get_color_cycle() assert len(c) == 6 - with ColorPalette('paired'): + with ColorPalette("paired"): c = get_color_cycle() assert len(c) == 12 @@ -58,7 +64,7 @@ def test_mpl_ge_150(self): Test get color cycle with matplotlib 1.5 or later """ colors = get_color_cycle() - cycle = mpl.rcParams['axes.prop_cycle'] + cycle = mpl.rcParams["axes.prop_cycle"] # Ensure the cycle is in fact a cycle assert isinstance(cycle, Cycler) @@ -72,8 +78,7 @@ def test_mpl_ge_150(self): # Ensure the colors and cycle match for color, cycle_color in zip(colors, cycle): - assert color == cycle_color['color'] - + assert color == cycle_color["color"] @pytest.mark.filterwarnings() @pytest.mark.skipif(mpl_ge_150, reason="requires matplotlib ealier than 1.5") @@ -81,7 +86,7 @@ def test_mpl_lt_150(self): """ Test get color cycle with matplotlib earlier than 1.5 """ - assert get_color_cycle() == mpl.rcParams['axes.color_cycle'] + assert get_color_cycle() == mpl.rcParams["axes.color_cycle"] class TestResolveColors(VisualTestCase): @@ -115,15 +120,15 @@ def test_warning_on_colormap_and_colors_args(self): Warns when both colormap and colors is used, colors is default """ with pytest.warns(Warning, match="both colormap and colors specified"): - colors = resolve_colors(colormap='RdBu', colors=['r', 'g', 'b']) - assert colors == ['r', 'g', 'b'] + colors = resolve_colors(colormap="RdBu", colors=["r", "g", "b"]) + assert colors == ["r", "g", "b"] def test_colormap_invalid(self): """ Exception raised when invalid colormap is supplied """ with pytest.raises(YellowbrickValueError): - resolve_colors(12, colormap='foo') + resolve_colors(12, colormap="foo") def test_colormap_string(self): """ @@ -131,18 +136,18 @@ def 
test_colormap_string(self): """ cases = ( ( - {'n_colors': 6, 'colormap': 'RdBu'}, + {"n_colors": 6, "colormap": "RdBu"}, [ (0.403921568627451, 0.0, 0.12156862745098039, 1.0), (0.8392156862745098, 0.3764705882352941, 0.30196078431372547, 1.0), (0.9921568627450981, 0.8588235294117647, 0.7803921568627451, 1.0), (0.8196078431372551, 0.8980392156862746, 0.9411764705882353, 1.0), (0.2627450980392157, 0.5764705882352941, 0.7647058823529411, 1.0), - (0.0196078431372549, 0.18823529411764706, 0.3803921568627451, 1.0) + (0.0196078431372549, 0.18823529411764706, 0.3803921568627451, 1.0), ], ), ( - {'n_colors': 18, 'colormap': 'viridis'}, + {"n_colors": 18, "colormap": "viridis"}, [ (0.267004, 0.004874, 0.329415, 1.0), (0.281924, 0.089666, 0.412415, 1.0), @@ -161,11 +166,11 @@ def test_colormap_string(self): (0.535621, 0.835785, 0.281908, 1.0), (0.688944, 0.865448, 0.182725, 1.0), (0.845561, 0.887322, 0.099702, 1.0), - (0.993248, 0.906157, 0.143936, 1.0) + (0.993248, 0.906157, 0.143936, 1.0), ], ), ( - {'n_colors': 9, 'colormap': 'Set1'}, + {"n_colors": 9, "colormap": "Set1"}, [ (0.8941176470588236, 0.10196078431372549, 0.10980392156862745, 1.0), (0.21568627450980393, 0.49411764705882355, 0.7215686274509804, 1.0), @@ -175,7 +180,7 @@ def test_colormap_string(self): (1.0, 1.0, 0.2, 1.0), (0.6509803921568628, 0.33725490196078434, 0.1568627450980392, 1.0), (0.9686274509803922, 0.5058823529411764, 0.7490196078431373, 1.0), - (0.6, 0.6, 0.6, 1.0) + (0.6, 0.6, 0.6, 1.0), ], ), ) @@ -190,33 +195,121 @@ def test_colormap_string_default_length(self): Check colormap when n_colors is not specified """ n_colors = len(get_color_cycle()) - assert len(resolve_colors(colormap='autumn')) == n_colors + assert len(resolve_colors(colormap="autumn")) == n_colors def test_colormap_cmap(self): """ Assert that supplying a maptlotlib.cm as colormap works """ - cmap = cm.get_cmap('nipy_spectral') + cmap = cm.get_cmap("nipy_spectral") colors = resolve_colors(4, colormap=cmap) assert colors == [ (0.0, 
0.0, 0.0, 1.0), (0.0, 0.6444666666666666, 0.7333666666666667, 1.0), (0.7999666666666666, 0.9777666666666667, 0.0, 1.0), - (0.8, 0.8, 0.8, 1.0) + (0.8, 0.8, 0.8, 1.0), + ] + + def test_colormap_palette_mpl(self): + """ + Assert that supplying a maptlotlib palette as colormap works + """ + colormap = cm.get_cmap("nipy_spectral") + colors = resolve_colors(colormap=colormap) + assert colors == [ + (0.0, 0.0, 0.0, 1.0), + (0.0, 0.0, 0.8667, 1.0), + (0.0, 0.6667, 0.5333, 1.0), + (0.0, 1.0, 0.0, 1.0), + (1.0, 0.6, 0.0, 1.0), + (0.8, 0.8, 0.8, 1.0), + ] + + def test_integrated_yb_colormap(self): + """ + Assert silhouette plot colormap can be set with a yellowbrick palette + """ + # Generate a blobs data set + X, y = make_blobs( + n_samples=1000, n_features=12, centers=8, shuffle=False, random_state=0 + ) + visualizer = SilhouetteVisualizer( + KMeans(random_state=0), colormap="neural_paint" + ) + visualizer.fit(X) + visualizer.finalize() + + tol = ( + 3.2 if sys.platform == "win32" else 0.01 + ) # Fails on AppVeyor with RMS 3.143 + self.assert_images_similar(visualizer, remove_legend=True, tol=tol) + + def test_colormap_palette_yb(self): + """ + Assert that supplying a yellowbrick palette as colormap works + """ + colormap = ColorPalette("neural_paint") + assert resolve_colors(colormap=colormap) == [ + (0.08627450980392157, 0.44313725490196076, 0.5725490196078431), + (0.43137254901960786, 0.4588235294117647, 0.2823529411764706), + (0.7725490196078432, 0.6352941176470588, 0.6705882352941176), + (0.0, 0.8, 1.0), + (0.8705882352941177, 0.47058823529411764, 0.6823529411764706), + (1.0, 0.8, 0.6), + (0.23921568627450981, 0.24705882352941178, 0.25882352941176473), + (1.0, 1.0, 0.8), ] + def test_colormap_cmap_with_colors(self): + """ + Assert that colors overrides a mpl colormap if both are provided + """ + colormap = cm.get_cmap("nipy_spectral") + overriding_colors = [ + (0.0, 0.0, 0.0, 1.0), + (0.0, 0.6444666666666666, 0.7333666666666667, 1.0), + (0.7999666666666666, 
0.9777666666666667, 0.0, 1.0), + (0.8, 0.8, 0.8, 1.0), + ] + with pytest.warns(Warning, match="both colormap and colors specified"): + colors = resolve_colors(colormap=colormap, colors=overriding_colors) + assert colors == overriding_colors + + def test_colormap_palette_yb_colors(self): + """ + Assert that colors overrides a yellowbrick colormap if both are provided + """ + colormap = ColorPalette("neural_paint") + overriding_colors = [ + (0.0, 0.0, 0.0, 1.0), + (0.0, 0.6444666666666666, 0.7333666666666667, 1.0), + (0.7999666666666666, 0.9777666666666667, 0.0, 1.0), + (0.8, 0.8, 0.8, 1.0), + ] + with pytest.warns(Warning, match="both colormap and colors specified"): + colors = resolve_colors(colormap=colormap, colors=overriding_colors) + assert colors == overriding_colors + + def test_colormap_invalid_type(self): + """ + Exception raised when invalid colormap type is supplied + """ + with pytest.raises(YellowbrickValueError): + a = lambda x: x + 1 + resolve_colors(colormap=a) + def test_colors(self): """ Test passing in a list of colors """ - c = PALETTES['flatui'] + c = PALETTES["flatui"] assert resolve_colors(colors=c) == c def test_colors_truncate(self): """ Test passing in a list of colors with n_colors truncate """ - c = PALETTES['flatui'] + c = PALETTES["flatui"] assert len(c) > 3 assert len(resolve_colors(n_colors=3, colors=c)) == 3 @@ -225,7 +318,7 @@ def test_colors_multiply(self): """ Test passing in a list of colors with n_colors multiply """ - c = PALETTES['flatui'] + c = PALETTES["flatui"] assert len(c) < 12 assert len(resolve_colors(n_colors=12, colors=c)) == 12 diff --git a/tests/test_style/test_palettes.py b/tests/test_style/test_palettes.py index 6ff78e71f..ece83cd5b 100644 --- a/tests/test_style/test_palettes.py +++ b/tests/test_style/test_palettes.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Tue Oct 04 16:21:58 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license 
information, see LICENSE.txt # # ID: test_palettes.py [c6aff34] benjamin@bengfort.com $ @@ -17,7 +17,7 @@ ## Imports ########################################################################## -import unittest +import pytest import numpy as np import matplotlib as mpl @@ -35,7 +35,8 @@ ## Color Palette Tests ########################################################################## -class ColorPaletteObjectTests(VisualTestCase): + +class TestColorPaletteObject(VisualTestCase): """ Tests the ColorPalette object """ @@ -50,16 +51,16 @@ def test_init_palette_by_name(self): try: palette = ColorPalette(name) except YellowbrickValueError: - self.fail( - "Could not instantiate {} color palette by name".format(name) - ) + self.fail("Could not instantiate {} color palette by name".format(name)) - self.assertEqual(value, palette) + assert value == palette # Try a name not in PALETTES - with self.assertRaises(YellowbrickValueError): - self.assertNotIn('foo', PALETTES, "Cannot test bad name 'foo' it is in PALETTES!") - palette = ColorPalette('foo') + with pytest.raises(YellowbrickValueError): + assert ( + "foo" not in PALETTES + ), "Cannot test bad name 'foo' it is in PALETTES!" 
+ palette = ColorPalette("foo") def test_init_palette_by_list(self): """ @@ -69,54 +70,52 @@ def test_init_palette_by_list(self): # Try all the values in the palettes (HEX) for value in PALETTES.values(): palette = ColorPalette(value) - self.assertEqual(len(value), len(palette)) + assert len(value) == len(palette) # Try all the values converted to RGB for value in PALETTES.values(): palette = ColorPalette(map(mpl.colors.colorConverter.to_rgb, value)) - self.assertEqual(len(value), len(palette)) + assert len(value) == len(palette) def test_color_palette_context(self): """ Test ColorPalette context management """ default = color_palette() - context = color_palette('dark') + context = color_palette("dark") - with ColorPalette('dark') as palette: - self.assertIsInstance(palette, ColorPalette) - self.assertEqual(get_color_cycle(), context) + with ColorPalette("dark") as palette: + assert isinstance(palette, ColorPalette) + assert get_color_cycle() == context - self.assertEqual(get_color_cycle(), default) + assert get_color_cycle() == default def test_as_hex_as_rgb(self): """ Test the conversion of a ColorPalette to hex values and back to rgb """ - palette = color_palette('flatui') - expected = PALETTES['flatui'] + palette = color_palette("flatui") + expected = PALETTES["flatui"] morgified = palette.as_hex() - self.assertIsNot(morgified, palette) - self.assertIsInstance(morgified, ColorPalette) - self.assertEqual(morgified, expected) + assert morgified is not palette + assert isinstance(morgified, ColorPalette) + assert morgified == expected remorgified = morgified.as_rgb() - self.assertIsNot(remorgified, morgified) - self.assertIsNot(remorgified, palette) - self.assertEqual(remorgified, palette) + assert remorgified is not morgified + assert remorgified is not palette + assert remorgified == palette - @unittest.skip("not implemented yet") + @pytest.mark.skip(reason="not implemented yet") def test_plot_color_palette(self): """ Test the plotting of a color palette for 
color visualization """ - raise NotImplementedError( - "Not quite sure how to implement this yet" - ) + raise NotImplementedError("Not quite sure how to implement this yet") -class ColorPaletteFunctionTests(VisualTestCase): +class TestColorPaletteFunction(VisualTestCase): """ Tests the color_palette function. """ @@ -127,7 +126,7 @@ def test_current_palette(self): """ pal = color_palette(["red", "blue", "green"], 3) set_palette(pal, 3) - self.assertEqual(pal, get_color_cycle()) + assert pal == get_color_cycle() # Reset the palette set_aesthetic() @@ -141,9 +140,9 @@ def test_palette_context(self): context_pal = color_palette("muted") with color_palette(context_pal): - self.assertEqual(get_color_cycle(), context_pal) + assert get_color_cycle() == context_pal - self.assertEqual(get_color_cycle(), default_pal) + assert get_color_cycle() == default_pal def test_big_palette_context(self): """ @@ -155,9 +154,9 @@ def test_big_palette_context(self): set_palette(original_pal) with color_palette(context_pal, 10): - self.assertEqual(get_color_cycle(), context_pal) + assert get_color_cycle() == context_pal - self.assertEqual(get_color_cycle(), original_pal) + assert get_color_cycle() == original_pal # Reset default set_aesthetic() @@ -169,17 +168,23 @@ def test_yellowbrick_palettes(self): pals = ["accent", "dark", "pastel", "bold", "muted"] for name in pals: pal_out = color_palette(name) - self.assertEqual(len(pal_out), 6, "{} is not of len 6".format(name)) + assert len(pal_out) == 6, "{} is not of len 6".format(name) def test_seaborn_palettes(self): """ Test the seaborn palettes have length 6 (bgrmyck) """ - pals = ["sns_deep", "sns_muted", "sns_pastel", - "sns_bright", "sns_dark", "sns_colorblind"] + pals = [ + "sns_deep", + "sns_muted", + "sns_pastel", + "sns_bright", + "sns_dark", + "sns_colorblind", + ] for name in pals: pal_out = color_palette(name) - self.assertEqual(len(pal_out), 6) + assert len(pal_out) == 6 def test_other_palettes(self): """ @@ -188,18 +193,18 @@ 
def test_other_palettes(self): pals = ["flatui", "paired", "neural_paint", "set1"] for name in pals: pal_out = color_palette(name) - self.assertTrue(pal_out) - + assert pal_out is not None + assert len(pal_out) > 0 def test_bad_palette_name(self): """ Test that a bad palette name raises an exception """ - with self.assertRaises(ValueError): + with pytest.raises(ValueError): color_palette("IAmNotAPalette") - with self.assertRaises(YellowbrickValueError): + with pytest.raises(YellowbrickValueError): color_palette("IAmNotAPalette") def test_bad_palette_colors(self): @@ -208,10 +213,10 @@ def test_bad_palette_colors(self): """ pal = ["red", "blue", "iamnotacolor"] - with self.assertRaises(ValueError): + with pytest.raises(ValueError): color_palette(pal) - with self.assertRaises(YellowbrickValueError): + with pytest.raises(YellowbrickValueError): color_palette(pal) def test_palette_is_list_of_tuples(self): @@ -222,10 +227,10 @@ def test_palette_is_list_of_tuples(self): pal_in = np.array(["red", "blue", "green"]) pal_out = color_palette(pal_in, 3) - self.assertIsInstance(pal_out, list) - self.assertIsInstance(pal_out[0], tuple) - self.assertIsInstance(pal_out[0][0], float) - self.assertEqual(len(pal_out[0]), 3) + assert isinstance(pal_out, list) + assert isinstance(pal_out[0], tuple) + assert isinstance(pal_out[0][0], float) + assert len(pal_out[0]) == 3 def test_palette_cycles(self): """ @@ -233,20 +238,20 @@ def test_palette_cycles(self): """ accent = color_palette("accent") double_accent = color_palette("accent", 12) - self.assertEqual(double_accent, accent + accent) + assert double_accent == accent + accent - @unittest.skip("Discovered this commented out, don't know why") + @pytest.mark.skip(reason="discovered this commented out, don't know why") def test_cbrewer_qual(self): """ Test colorbrewer qualitative palettes """ pal_short = mpl_palette("Set1", 4) pal_long = mpl_palette("Set1", 6) - self.assertEqual(pal_short, pal_long[:4]) + assert pal_short == pal_long[:4] 
pal_full = palettes.mpl_palette("Set2", 8) pal_long = palettes.mpl_palette("Set2", 10) - self.assertEqual(pal_full, pal_long[:8]) + assert pal_full == pal_long[:8] def test_color_codes(self): """ @@ -257,7 +262,7 @@ def test_color_codes(self): for code, color in zip("bgrmyck", colors): rgb_want = mpl.colors.colorConverter.to_rgb(color) rgb_got = mpl.colors.colorConverter.to_rgb(code) - self.assertEqual(rgb_want, rgb_got) + assert rgb_want == rgb_got set_color_codes("reset") def test_as_hex(self): @@ -266,10 +271,10 @@ def test_as_hex(self): """ pal = color_palette("accent") for rgb, hex in zip(pal, pal.as_hex()): - self.assertEqual(mpl.colors.rgb2hex(rgb), hex) + assert mpl.colors.rgb2hex(rgb) == hex for rgb_e, rgb_v in zip(pal, pal.as_hex().as_rgb()): - self.assertEqual(rgb_e, rgb_v) + assert rgb_e == rgb_v def test_preserved_palette_length(self): """ @@ -277,7 +282,7 @@ def test_preserved_palette_length(self): """ pal_in = color_palette("Set1", 10) pal_out = color_palette(pal_in) - self.assertEqual(pal_in, pal_out) + assert pal_in == pal_out def test_color_sequence(self): """ @@ -286,33 +291,30 @@ def test_color_sequence(self): for name, ncols in SEQUENCES.items(): for n in ncols.keys(): cmap = color_sequence(name, n) - self.assertEqual(name, cmap.name) - self.assertEqual(n, cmap.N) + assert name == cmap.name + assert n == cmap.N def test_color_sequence_default(self): """ Assert the default color sequence is RdBu """ cmap = color_sequence() - self.assertEqual(cmap.name, "RdBu") - self.assertEqual(cmap.N, 11) + assert cmap.name == "RdBu" + assert cmap.N == 11 def test_color_sequence_unrecocognized(self): """ Test value errors for unrecognized sequences """ - with self.assertRaises(YellowbrickValueError): - color_sequence('PepperBucks', 3) + with pytest.raises(YellowbrickValueError): + color_sequence("PepperBucks", 3) def test_color_sequence_bounds(self): """ Test color sequence out of bounds value error """ - with self.assertRaises(YellowbrickValueError): - 
color_sequence('RdBu', 18) - - with self.assertRaises(YellowbrickValueError): - color_sequence('RdBu', 2) + with pytest.raises(YellowbrickValueError): + color_sequence("RdBu", 18) -if __name__ == "__main__": - unittest.main() + with pytest.raises(YellowbrickValueError): + color_sequence("RdBu", 2) diff --git a/tests/test_style/test_rcmod.py b/tests/test_style/test_rcmod.py index d62ca2820..e15d38b98 100644 --- a/tests/test_style/test_rcmod.py +++ b/tests/test_style/test_rcmod.py @@ -5,7 +5,7 @@ # Author: Benjamin Bengfort # Created: Thu Oct 06 08:20:33 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_rcmod.py [c6aff34] benjamin@bengfort.com $ @@ -18,13 +18,12 @@ ## Imports ########################################################################## -import unittest +import pytest import numpy as np import matplotlib as mpl import numpy.testing as npt import yellowbrick.style.rcmod as yb_rcmod -from distutils.version import LooseVersion from tests.base import VisualTestCase @@ -32,6 +31,7 @@ ## Parameter Tests ########################################################################## + class RCParamTester(VisualTestCase): """ Base class for asserting parameters have been correctly changed. 
@@ -39,8 +39,8 @@ class RCParamTester(VisualTestCase): excluded_params = { "backend", # This cannot be changed by manipulating rc - "svg.embed_char_paths", # This param causes test issues and is deprecated anyway - "font.family", # breaks the visualtest case + "svg.embed_char_paths", # This param causes test issues and is deprecated + "font.family", # breaks the visualtest case } def flatten_list(self, orig_list): @@ -57,15 +57,15 @@ def assert_rc_params(self, params): elif isinstance(v, np.ndarray): npt.assert_array_equal(mpl.rcParams[k], v) else: - self.assertEqual((k, mpl.rcParams[k]), (k, v)) + assert (k, mpl.rcParams[k]) == (k, v) ########################################################################## ## Parameter Tests ########################################################################## -class TestAxesStyle(RCParamTester): +class TestAxesStyle(RCParamTester): def test_default_return(self): """ Test that the axes style returns the default params @@ -80,8 +80,8 @@ def test_rc_override(self): rc = {"axes.facecolor": "blue", "foo.notaparam": "bar"} out = yb_rcmod._axes_style("darkgrid", rc) - self.assertEqual(out["axes.facecolor"], "blue") - self.assertNotIn("foo.notaparam", out) + assert out["axes.facecolor"] == "blue" + assert "foo.notaparam" not in out def test_set_style(self): """ @@ -91,7 +91,7 @@ def test_set_style(self): yb_rcmod.set_style() self.assert_rc_params(style_dict) - @unittest.skip("This test doesn't make sense without multiple styles") + @pytest.mark.skip(reason="this test doesn't make sense without multiple styles") def test_style_context_manager(self): yb_rcmod.set_style("darkgrid") @@ -105,6 +105,7 @@ def test_style_context_manager(self): @yb_rcmod._axes_style("whitegrid") def func(): self.assert_rc_params(context_params) + func() self.assert_rc_params(orig_params) @@ -112,25 +113,20 @@ def test_style_context_independence(self): """ Assert context and style independence """ - self.assertTrue(set(yb_rcmod._style_keys) ^ 
set(yb_rcmod._context_keys)) + assert len(set(yb_rcmod._style_keys) ^ set(yb_rcmod._context_keys)) > 0 def test_set_rc(self): """ Test the ability to set the mpl configuration rc dict """ yb_rcmod.set_aesthetic(rc={"lines.linewidth": 4}) - self.assertEqual(mpl.rcParams["lines.linewidth"], 4) + assert mpl.rcParams["lines.linewidth"] == 4 yb_rcmod.set_aesthetic() def test_reset_defaults(self): """ Test the ability to reset to the mpl defaults """ - # Changes to the rc parameters make this test hard to manage - # on older versions of matplotlib, so we'll skip it - if LooseVersion(mpl.__version__) < LooseVersion("1.3"): - raise self.SkipTest - yb_rcmod.reset_defaults() self.assert_rc_params(mpl.rcParamsDefault) yb_rcmod.set_aesthetic() @@ -139,19 +135,12 @@ def test_reset_orig(self): """ Test the ability to reset to the original (respecting custom styles) """ - - # Changes to the rc parameters make this test hard to manage - # on older versions of matplotlib, so we'll skip it - if LooseVersion(mpl.__version__) < LooseVersion("1.3"): - raise self.SkipTest - yb_rcmod.reset_orig() self.assert_rc_params(mpl.rcParamsOrig) yb_rcmod.set_aesthetic() class TestPlottingContext(RCParamTester): - def test_default_return(self): """ Test the context returns the default @@ -167,11 +156,17 @@ def test_font_scale(self): notebook_ref = yb_rcmod._plotting_context("notebook") notebook_big = yb_rcmod._plotting_context("notebook", 2) - font_keys = ["axes.labelsize", "axes.titlesize", "legend.fontsize", - "xtick.labelsize", "ytick.labelsize", "font.size"] + font_keys = [ + "axes.labelsize", + "axes.titlesize", + "legend.fontsize", + "xtick.labelsize", + "ytick.labelsize", + "font.size", + ] for k in font_keys: - self.assertEqual(notebook_ref[k] * 2, notebook_big[k]) + assert notebook_ref[k] * 2 == notebook_big[k] def test_rc_override(self): """ @@ -180,8 +175,8 @@ def test_rc_override(self): key, val = "grid.linewidth", 5 rc = {key: val, "foo": "bar"} out = yb_rcmod._plotting_context("talk", 
rc=rc) - self.assertEqual(out[key], val) - self.assertNotIn("foo", out) + assert out[key] == val + assert "foo" not in out def test__set_context(self): """ @@ -191,7 +186,7 @@ def test__set_context(self): yb_rcmod._set_context() self.assert_rc_params(context_dict) - @unittest.skip("This test doesn't make sense without multiple contexts") + @pytest.mark.skip(reason="this test doesn't make sense without multiple contexts") def test_context_context_manager(self): yb_rcmod._set_context("notebook") @@ -205,9 +200,6 @@ def test_context_context_manager(self): @yb_rcmod._plotting_context("paper") def func(): self.assert_rc_params(context_params) + func() self.assert_rc_params(orig_params) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_target/__init__.py b/tests/test_target/__init__.py index 6a1059b01..7b433d5ba 100644 --- a/tests/test_target/__init__.py +++ b/tests/test_target/__init__.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Thu Jul 19 09:09:07 2018 -0400 # -# ID: __init__.py [] benjamin@bengfort.com $ +# ID: __init__.py [d742c57] benjamin@bengfort.com $ """ Tests for the target module. diff --git a/tests/test_target/test_binning.py b/tests/test_target/test_binning.py index 94d33c5b1..54ebbb517 100644 --- a/tests/test_target/test_binning.py +++ b/tests/test_target/test_binning.py @@ -1,39 +1,63 @@ # tests.test_target.test_binning # Tests for the BalancedBinningReference visualizer # -# Author: Juan L. Kehoe (juanluo2008@gmail.com) -# Author: Prema Damodaran Roman (pdamo24@gmail.com) +# Author: Juan L. 
Kehoe +# Author: Prema Damodaran Roman # Created: Thu Jul 20 10:21:49 2018 -0400 # +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# # ID: test_binning.py -from tests.base import VisualTestCase -from tests.dataset import DatasetMixin +########################################################################## +# Imports +########################################################################## +import pytest + from yellowbrick.target.binning import * +from yellowbrick.datasets import load_occupancy + +from tests.base import VisualTestCase + +try: + import pandas as pd +except ImportError: + pd = None ########################################################################## -## BalancedBinningReference Tests +# BalancedBinningReference Tests ########################################################################## -class TestBalancedBinningReference(VisualTestCase, DatasetMixin): - """ - Test the BalancedBinningReference visualizer - """ - - def test_balancedbinningreference(self): - """ - Test Histogram on a real dataset - """ - # Load the data from the fixture - dataset = self.load_data('occupancy') - - # Get the data - y = dataset["temperature"] - - - visualizer = BalancedBinningReference() - visualizer.fit(y) - visualizer.poof() - self.assert_images_similar(visualizer, tol=0.5) - - \ No newline at end of file + +class TestBalancedBinningReference(VisualTestCase): + """ + Test the BalancedBinningReference visualizer + """ + + def test_numpy_bins(self): + """ + Test Histogram on a NumPy array + """ + # Load the data from the fixture + data = load_occupancy(return_dataset=True) + X, y = data.to_numpy() + + visualizer = BalancedBinningReference() + visualizer.fit(y) + visualizer.finalize() + self.assert_images_similar(visualizer, tol=0.5) + + @pytest.mark.skipif(pd is None, reason="pandas is required") + def test_pandas_bins(self): + """ + Test Histogram on a Pandas Dataframe + """ + # Load the data from the fixture + 
data = load_occupancy(return_dataset=True) + X, y = data.to_pandas() + + visualizer = BalancedBinningReference() + visualizer.fit(y) + visualizer.finalize() + self.assert_images_similar(visualizer, tol=0.5) diff --git a/tests/test_target/test_class_balance.py b/tests/test_target/test_class_balance.py index 43a2e76d2..216cf3fe1 100644 --- a/tests/test_target/test_class_balance.py +++ b/tests/test_target/test_class_balance.py @@ -1,10 +1,13 @@ # tests.test_target.test_class_balance # Tests for the ClassBalance visualizer # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Thu Jul 19 10:21:49 2018 -0400 # -# ID: test_class_balance.py [] benjamin@bengfort.com $ +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: test_class_balance.py [d742c57] benjamin@bengfort.com $ """ Tests for the ClassBalance visualizer @@ -15,13 +18,13 @@ ########################################################################## import pytest -import numpy as np from yellowbrick.target.class_balance import * +from yellowbrick.datasets import load_occupancy from yellowbrick.exceptions import YellowbrickValueError from tests.base import VisualTestCase -from tests.dataset import DatasetMixin, Dataset, Split +from tests.fixtures import Dataset, Split from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split as tts @@ -33,41 +36,45 @@ ########################################################################## -## Data Fixtures +# Data Fixtures ########################################################################## +# TODO: convert to Pytest fixture def make_fixture(binary=False, balanced=False, split=False): """ Make a dataset for testing ClassBalance based on the specified params. 
""" kwargs = { - "n_samples":100, "n_features":20, "n_informative":8, "n_redundant":2, - "n_clusters_per_class":1, "random_state":89092, + "n_samples": 100, + "n_features": 20, + "n_informative": 8, + "n_redundant": 2, + "n_clusters_per_class": 1, + "random_state": 89092, } if binary: - kwargs['n_classes'] = 2 - kwargs['weights'] = None if balanced else [0.3, 0.7] + kwargs["n_classes"] = 2 + kwargs["weights"] = None if balanced else [0.3, 0.7] else: - kwargs['n_classes'] = 5 - kwargs['weights'] = None if balanced else [0.1, 0.2, 0.4, 0.2, .01] + kwargs["n_classes"] = 5 + kwargs["weights"] = None if balanced else [0.1, 0.2, 0.4, 0.2, 0.01] X, y = make_classification(**kwargs) if split: - X_train, X_test, y_train, y_test = tts( - X, y, test_size=0.2, random_state=101 - ) + X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=101) return Dataset(Split(X_train, X_test), Split(y_train, y_test)) return Dataset(X, y) ########################################################################## -## Tests +# Tests ########################################################################## -class ClassBalanceTests(VisualTestCase, DatasetMixin): + +class TestClassBalance(VisualTestCase): """ Test ClassBalance visualizer """ @@ -118,7 +125,7 @@ def test_binary_balance(self): assert oz.fit(dataset.y) is oz assert oz._mode == BALANCE - #oz.finalize() + # oz.finalize() self.assert_images_similar(oz) def test_binary_compare(self): @@ -131,7 +138,7 @@ def test_binary_compare(self): assert oz.fit(dataset.y.train, dataset.y.test) is oz assert oz._mode == COMPARE - #oz.finalize() + # oz.finalize() self.assert_images_similar(oz) def test_multiclass_balance(self): @@ -144,7 +151,7 @@ def test_multiclass_balance(self): assert oz.fit(dataset.y) is oz assert oz._mode == BALANCE - #oz.finalize() + # oz.finalize() self.assert_images_similar(oz) def test_multiclass_compare(self): @@ -157,7 +164,7 @@ def test_multiclass_compare(self): assert oz.fit(dataset.y.train, 
dataset.y.test) is oz assert oz._mode == COMPARE - #oz.finalize() + # oz.finalize() self.assert_images_similar(oz) @pytest.mark.skipif(pd is None, reason="test requires pandas") @@ -165,16 +172,28 @@ def test_pandas_occupancy_balance(self): """ Test pandas data frame with string target in balance mode """ - data = self.load_data("occupancy") - y = pd.Series([ - "occupied" if yi else "unoccupied" for yi in data['occupancy'] - ]) + data = load_occupancy(return_dataset=True) + X, y = data.to_pandas() + + # Create and fit the visualizer + oz = ClassBalance() + assert oz.fit(y) is oz + + # oz.finalize() + self.assert_images_similar(oz) + + def test_numpy_occupancy_balance(self): + """ + Test NumPy arrays with string target in balance mode + """ + data = load_occupancy(return_dataset=True) + X, y = data.to_numpy() # Create and fit the visualizer oz = ClassBalance() assert oz.fit(y) is oz - #oz.finalize() + # oz.finalize() self.assert_images_similar(oz) @pytest.mark.skipif(pd is None, reason="test requires pandas") @@ -182,15 +201,24 @@ def test_pandas_occupancy_compare(self): """ Test pandas data frame with string target in compare mode """ - data = self.load_data("occupancy") - features = [ - "temperature", "relative_humidity", "light", "C02", "humidity" - ] + data = load_occupancy(return_dataset=True) + X, y = data.to_pandas() - X = pd.DataFrame(data[features]) - y = pd.Series([ - "occupied" if yi else "unoccupied" for yi in data['occupancy'] - ]) + _, _, y_train, y_test = tts(X, y, test_size=0.4, random_state=2242) + + # Create and fit the visualizer + oz = ClassBalance() + assert oz.fit(y_train, y_test) is oz + + # oz.finalize() + self.assert_images_similar(oz) + + def test_numpy_occupancy_compare(self): + """ + Test NumPy arrays with string target in compare mode + """ + data = load_occupancy(return_dataset=True) + X, y = data.to_numpy() _, _, y_train, y_test = tts(X, y, test_size=0.4, random_state=2242) @@ -198,7 +226,7 @@ def test_pandas_occupancy_compare(self): 
oz = ClassBalance() assert oz.fit(y_train, y_test) is oz - #oz.finalize() + # oz.finalize() self.assert_images_similar(oz) def test_quick_method(self): @@ -207,5 +235,5 @@ def test_quick_method(self): """ dataset = make_fixture(binary=False, split=False) - ax = class_balance(dataset.y) - self.assert_images_similar(ax=ax, tol=0.5) + viz = class_balance(dataset.y) + self.assert_images_similar(viz, tol=0.5) diff --git a/tests/test_target/test_feature_correlation.py b/tests/test_target/test_feature_correlation.py index b9ba6af7a..5c03cad5f 100644 --- a/tests/test_target/test_feature_correlation.py +++ b/tests/test_target/test_feature_correlation.py @@ -1,13 +1,13 @@ # tests.test_features.test_feature_correlation # Test the feature correlation visualizers # -# Author: Zijie (ZJ) Poh +# Author: Zijie (ZJ) Poh # Created: Tue Jul 31 20:21:32 2018 -0700 # -# Copyright (C) 2018 District Data Labs +# Copyright (C) 2018 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: test_feature_correlation.py [] poh.zijie@gmail.com $ +# ID: test_feature_correlation.py [33aec16] 8103276+zjpoh@users.noreply.github.com $ """ Test the feature correlation to dependent variable visualizer. 
@@ -20,10 +20,6 @@ import sys import pytest import numpy as np -try: - import pandas as pd -except ImportError: - pd = None import numpy.testing as npt import matplotlib.pyplot as plt @@ -31,26 +27,29 @@ from yellowbrick.exceptions import YellowbrickValueError, YellowbrickWarning from sklearn import datasets - from tests.base import VisualTestCase +try: + import pandas as pd +except ImportError: + pd = None + ########################################################################## ## Feature Correlation Tests ########################################################################## + class TestFeatureCorrelationVisualizer(VisualTestCase): """ FeatureCorrelation visualizer """ data = datasets.load_diabetes() - X, y = data['data'], data['target'] - labels = data['feature_names'] + X, y = data["data"], data["target"] + labels = data["feature_names"] - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_feature_correlation_integrated_pearson(self): """ Test FeatureCorrelation visualizer with pearson correlation @@ -58,37 +57,33 @@ def test_feature_correlation_integrated_pearson(self): """ viz = FeatureCorrelation() viz.fit(self.X, self.y) - viz.poof() + viz.finalize() self.assert_images_similar(viz) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_feature_correlation_integrated_mutual_info_regression(self): """ Test FeatureCorrelation visualizer with mutual information regression """ - viz = FeatureCorrelation(method='mutual_info-regression') + viz = FeatureCorrelation(method="mutual_info-regression") viz.fit(self.X, self.y, random_state=23456) - viz.poof() + viz.finalize() self.assert_images_similar(viz) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + 
@pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_feature_correlation_integrated_mutual_info_classification(self): """ Test FeatureCorrelation visualizer with mutual information on wine dataset (classification) """ data = datasets.load_wine() - X, y = data['data'], data['target'] + X, y = data["data"], data["target"] - viz = FeatureCorrelation(method='mutual_info-classification') + viz = FeatureCorrelation(method="mutual_info-classification") viz.fit(X, y, random_state=12345) - viz.poof() + viz.finalize() self.assert_images_similar(viz) @@ -96,8 +91,8 @@ def test_feature_correlation_method_not_implemented(self): """ Test FeatureCorrelation visualizer with unknown method """ - method = 'foo' - e = ('Method foo not implement; choose from *') + method = "foo" + e = "Method foo not implement; choose from *" with pytest.raises(YellowbrickValueError, match=e): FeatureCorrelation(method=method) @@ -135,7 +130,7 @@ def test_feature_correlation_select_feature_by_index_out_of_range(self): """ Test selecting feature by feature index but index is out of range """ - e = 'Feature index is out of range' + e = "Feature index is out of range" with pytest.raises(YellowbrickValueError, match=e): viz = FeatureCorrelation(feature_index=[0, 2, 10]) viz.fit(self.X, self.y) @@ -154,13 +149,16 @@ def test_feature_correlation_select_feature_by_index_and_name(self): Test selecting feature warning when both index and names are provided """ feature_index = [0, 2, 3] - feature_names = ['age'] + feature_names = ["age"] - e = ('Both feature_index and feature_names are specified. ' - 'feature_names is ignored') + e = ( + "Both feature_index and feature_names are specified. 
" + "feature_names is ignored" + ) with pytest.raises(YellowbrickWarning, match=e): - viz = FeatureCorrelation(feature_index=feature_index, - feature_names=feature_names) + viz = FeatureCorrelation( + feature_index=feature_index, feature_names=feature_names + ) viz.fit(self.X, self.y) assert viz.scores_.shape[0] == 3 @@ -168,9 +166,9 @@ def test_feature_correlation_select_feature_by_name_no_labels(self): """ Test selecting feature by feature names with labels is not supplied """ - feature_names = ['age'] + feature_names = ["age"] - e = 'age not in labels' + e = "age not in labels" with pytest.raises(YellowbrickValueError, match=e): viz = FeatureCorrelation(feature_names=feature_names) viz.fit(self.X, self.y) @@ -179,10 +177,9 @@ def test_feature_correlation_select_feature_by_name(self): """ Test selecting feature by feature names """ - feature_names = ['age', 'sex', 'bp', 's5'] + feature_names = ["age", "sex", "bp", "s5"] - viz = FeatureCorrelation(labels=self.labels, - feature_names=feature_names) + viz = FeatureCorrelation(labels=self.labels, feature_names=feature_names) viz.fit(self.X, self.y) npt.assert_array_equal(viz.features_, feature_names) @@ -196,9 +193,7 @@ def test_feature_correlation_sort(self): assert np.all(viz.scores_[:-1] <= viz.scores_[1:]) - @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" - ) + @pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows") def test_feature_correlation_quick_method(self): """ Test sorting of correlation @@ -207,4 +202,4 @@ def test_feature_correlation_quick_method(self): ax = fig.add_subplot() g = feature_correlation.feature_correlation(self.X, self.y, ax) - self.assert_images_similar(ax=g) + self.assert_images_similar(g) diff --git a/tests/test_text/test_base.py b/tests/test_text/test_base.py index aa496cbf7..17e568470 100644 --- a/tests/test_text/test_base.py +++ b/tests/test_text/test_base.py @@ -1,10 +1,10 @@ # tests.test_text.test_base # Tests for the 
text visualization base classes # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Mon Feb 20 06:34:50 2017 -0500 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_base.py [6aa9198] benjamin@bengfort.com $ @@ -17,8 +17,6 @@ ## Imports ########################################################################## -import unittest - from yellowbrick.base import * from yellowbrick.text.base import * from sklearn.base import BaseEstimator, TransformerMixin @@ -28,22 +26,13 @@ ## TextVisualizer Base Tests ########################################################################## -class TextVisualizerBaseTests(unittest.TestCase): +class TestTextVisualizerBase(object): def test_subclass(self): """ - Assert the text visualizer is subclassed correctly + Assert the text visualizer is subclassed correctly """ visualizer = TextVisualizer() - self.assertIsInstance(visualizer, TransformerMixin) - self.assertIsInstance(visualizer, BaseEstimator) - self.assertIsInstance(visualizer, Visualizer) - - # def test_interface(self): - # """ - # Test the feature visualizer interface - # """ - # - # visualizer = TextVisualizer() - # with self.assertRaises(NotImplementedError): - # visualizer.poof() + assert isinstance(visualizer, TransformerMixin) + assert isinstance(visualizer, BaseEstimator) + assert isinstance(visualizer, Visualizer) diff --git a/tests/test_text/test_dispersion.py b/tests/test_text/test_dispersion.py index ec94eceb7..09962b6e7 100644 --- a/tests/test_text/test_dispersion.py +++ b/tests/test_text/test_dispersion.py @@ -2,13 +2,12 @@ # Tests for the dispersion plot visualization # # Author: Larry Gray -# Github: @lwgray # Created: 2018-06-22 15:27 # -# Copyright (C) 2018 +# Copyright (C) 2018 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: test_dispersion.py [] lwgray@gmail.com $ +# ID: test_dispersion.py [25f1b9a] lwgray@gmail.com $ 
""" Tests for the dispersion plot text visualization @@ -19,26 +18,46 @@ ########################################################################## import pytest +import matplotlib.pyplot as plt from yellowbrick.exceptions import YellowbrickValueError +from yellowbrick.datasets import load_hobbies from yellowbrick.text.dispersion import * -from tests.dataset import DatasetMixin from tests.base import VisualTestCase + +########################################################################## +## Data +########################################################################## + +corpus = load_hobbies() + ########################################################################## ## DispersionPlot Tests ########################################################################## -class DispersionPlotTests(VisualTestCase, DatasetMixin): - def test_integrated_dispersionplot(self): +class TestDispersionPlot(VisualTestCase): + def test_quick_method(self): """ - Assert no errors occur during DispersionPlot integration + Assert no errors occur when using the qucik method """ - corpus = self.load_data('hobbies') + _, ax = plt.subplots() text = [doc.split() for doc in corpus.data] - target_words = ['Game', 'player', 'score', 'oil', 'Man'] + target_words = ["Game", "player", "score", "oil", "Man"] + + viz = dispersion(words=target_words, corpus=text, ax=ax) + viz.ax.grid(False) + + self.assert_images_similar(viz, tol=25) + + def test_integrated_dispersion_plot(self): + """ + Assert no errors occur during DispersionPlot integration + """ + text = [doc.split() for doc in corpus.data] + target_words = ["Game", "player", "score", "oil", "Man"] visualizer = DispersionPlot(target_words) visualizer.fit(text) @@ -46,15 +65,13 @@ def test_integrated_dispersionplot(self): self.assert_images_similar(visualizer, tol=25) - def test_dispersionplot_ignore_case(self): + def test_dispersion_plot_ignore_case(self): """ Assert no errors occur during DispersionPlot integration with ignore_case 
parameter turned on """ - corpus = self.load_data('hobbies') - text = [doc.split() for doc in corpus.data] - target_words = ['Game', 'player', 'score', 'oil', 'Man'] + target_words = ["Game", "player", "score", "oil", "Man"] visualizer = DispersionPlot(target_words, ignore_case=True) visualizer.fit(text) @@ -62,15 +79,13 @@ def test_dispersionplot_ignore_case(self): self.assert_images_similar(visualizer, tol=25) - def test_dispersionplot_generator_input(self): + def test_dispersion_plot_generator_input(self): """ Assert no errors occur during dispersionPlot integration when the corpus' text type is a generator """ - corpus = self.load_data('hobbies') - - text = (doc.split() for doc in corpus.data) - target_words = ['Game', 'player', 'score', 'oil', 'Man'] + text = [doc.split() for doc in corpus.data] + target_words = ["Game", "player", "score", "oil", "Man"] visualizer = DispersionPlot(target_words, ignore_case=True) visualizer.fit(text) @@ -78,58 +93,49 @@ def test_dispersionplot_generator_input(self): self.assert_images_similar(visualizer, tol=25) - def test_dispersionplot_annotate_docs(self): + def test_dispersion_plot_annotate_docs(self): """ Assert no errors occur during DispersionPlot integration with annotate_docs parameter turned on """ - corpus = self.load_data('hobbies') - text = [doc.split() for doc in corpus.data] - target_words = ['girl', 'she', 'boy', 'he', 'man'] + target_words = ["girl", "she", "boy", "he", "man"] visualizer = DispersionPlot(target_words, annotate_docs=True) visualizer.fit(text) visualizer.ax.grid(False) - self.assert_images_similar(visualizer, tol=25) + self.assert_images_similar(visualizer, tol=25.5) - def test_dispersionplot_color_words_by_class(self): + def test_dispersion_plot_color_by_class(self): """ Assert no errors occur during DispersionPlot integration when target values are specified """ - corpus = self.load_data('hobbies') - - text = (doc.split() for doc in corpus.data) - target_words = ['girl', 'she', 'boy', 'he', 
'man'] - - target_values = corpus.target + target = corpus.target + text = [doc.split() for doc in corpus.data] + target_words = ["girl", "she", "boy", "he", "man"] visualizer = DispersionPlot(target_words) - visualizer.fit(text, target_values) + visualizer.fit(text, target) visualizer.ax.grid(False) self.assert_images_similar(visualizer, tol=25) - def test_dispersionplot_mismatched_labels(self): + def test_dispersion_plot_mismatched_labels(self): """ Assert exception is raised when number of labels doesn't match """ - corpus = self.load_data('hobbies') - - text = (doc.split() for doc in corpus.data) - target_words = ['girl', 'she', 'boy', 'he', 'man'] - - target_values = corpus.target + target = corpus.target + text = [doc.split() for doc in corpus.data] + target_words = ["girl", "she", "boy", "he", "man"] - visualizer = DispersionPlot(target_words, annotate_docs=True, - labels=['a', 'b']) + visualizer = DispersionPlot(target_words, annotate_docs=True, labels=["a", "b"]) msg = ( - r'number of supplied labels \(\d\) ' - r'does not match the number of classes \(\d\)' + r"number of supplied labels \(\d\) " + r"does not match the number of classes \(\d\)" ) with pytest.raises(YellowbrickValueError, match=msg): - visualizer.fit(text, target_values) + visualizer.fit(text, target) diff --git a/tests/test_text/test_freqdist.py b/tests/test_text/test_freqdist.py index e687d8b28..2679703fe 100644 --- a/tests/test_text/test_freqdist.py +++ b/tests/test_text/test_freqdist.py @@ -2,13 +2,12 @@ # Tests for the frequency distribution visualization # # Author: Rebecca Bilbro -# Github: @rebeccabilbro # Created: 2017-03-22 15:27 # -# Copyright (C) 2018 +# Copyright (C) 2018 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: test_freqdist.py [bd9cbb9] rbilbro@districtdatalabs.com $ +# ID: test_freqdist.py [bd9cbb9] rebecca.bilbro@bytecubed.com $ """ Tests for the frequency distribution text visualization @@ -18,36 +17,41 @@ ## Imports 
########################################################################## -import sys import pytest +from yellowbrick.datasets import load_hobbies from yellowbrick.text.freqdist import * -from tests.dataset import DatasetMixin -from tests.base import VisualTestCase +from tests.base import IS_WINDOWS_OR_CONDA, VisualTestCase + from sklearn.feature_extraction.text import CountVectorizer +########################################################################## +## Data +########################################################################## + +corpus = load_hobbies() ########################################################################## ## FreqDist Tests ########################################################################## -class FreqDistTests(VisualTestCase, DatasetMixin): +class TestFreqDist(VisualTestCase): @pytest.mark.xfail( - sys.platform == 'win32', reason="images not close on windows" + IS_WINDOWS_OR_CONDA, + reason="font rendering different in OS and/or Python; see #892", ) def test_integrated_freqdist(self): """ Assert no errors occur during freqdist integration """ - corpus = self.load_data('hobbies') vectorizer = CountVectorizer() - docs = vectorizer.fit_transform(corpus.data) - features = vectorizer.get_feature_names() + docs = vectorizer.fit_transform(corpus.data) + features = vectorizer.get_feature_names() visualizer = FreqDistVisualizer(features) visualizer.fit(docs) - visualizer.poof() - self.assert_images_similar(visualizer, tol=1) + visualizer.finalize() + self.assert_images_similar(visualizer) diff --git a/tests/test_text/test_postag.py b/tests/test_text/test_postag.py index d465c1b12..16ffd6b90 100644 --- a/tests/test_text/test_postag.py +++ b/tests/test_text/test_postag.py @@ -1,11 +1,10 @@ -# -*- coding: utf8 -*- # tests.test_text.test_postag # Tests for the part-of-speech tagging visualization # -# Author: Rebecca Bilbro -# Created: 2017-03-22 15:46 +# Author: Rebecca Bilbro +# Created: 2019-02-19 21:29 # -# Copyright (C) 2017 
District Data Labs +# Copyright (C) 2019 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_postag.py [bd9cbb9] rebecca.bilbro@bytecubed.com $ @@ -20,36 +19,83 @@ import pytest +from yellowbrick.exceptions import YellowbrickValueError from yellowbrick.text.postag import * - +from tests.base import VisualTestCase +import matplotlib.pyplot as plt try: import nltk - from nltk import pos_tag, word_tokenize + from nltk import pos_tag, sent_tokenize + from nltk import word_tokenize, wordpunct_tokenize except ImportError: nltk = None +try: + import spacy +except ImportError: + spacy = None + ########################################################################## -## Fixtures +## Data ########################################################################## -pie = """ -In a small saucepan, combine sugar and eggs -until well blended. Cook over low heat, stirring -constantly, until mixture reaches 160° and coats -the back of a metal spoon. Remove from the heat. -Stir in chocolate and vanilla until smooth. Cool -to lukewarm (90°), stirring occasionally. In a small -bowl, cream butter until light and fluffy. Add cooled -chocolate mixture; beat on high speed for 5 minutes -or until light and fluffy. In another large bowl, -beat cream until it begins to thicken. Add -confectioners' sugar; beat until stiff peaks form. -Fold into chocolate mixture. Pour into crust. Chill -for at least 6 hours before serving. Garnish with -whipped cream and chocolate curls if desired. -""" +sonnets = [ + """ + FROM fairest creatures we desire increase, + That thereby beauty's rose might never die, + But as the riper should by time decease, + His tender heir might bear his memory: + But thou, contracted to thine own bright eyes, + Feed'st thy light'st flame with self-substantial fuel, + Making a famine where abundance lies, + Thyself thy foe, to thy sweet self too cruel. 
+ Thou that art now the world's fresh ornament + And only herald to the gaudy spring, + Within thine own bud buriest thy content + And, tender churl, makest waste in niggarding. + Pity the world, or else this glutton be, + To eat the world's due, by the grave and thee. + """, + """ + When forty winters shall beseige thy brow, + And dig deep trenches in thy beauty's field, + Thy youth's proud livery, so gazed on now, + Will be a tatter'd weed, of small worth held: + Then being ask'd where all thy beauty lies, + Where all the treasure of thy lusty days, + To say, within thine own deep-sunken eyes, + Were an all-eating shame and thriftless praise. + How much more praise deserved thy beauty's use, + If thou couldst answer 'This fair child of mine + Shall sum my count and make my old excuse,' + Proving his beauty by succession thine! + This were to be new made when thou art old, + And see thy blood warm when thou feel'st it cold. + """, + """ + Look in thy glass, and tell the face thou viewest + Now is the time that face should form another; + Whose fresh repair if now thou not renewest, + Thou dost beguile the world, unbless some mother. + For where is she so fair whose unear'd womb + Disdains the tillage of thy husbandry? + Or who is he so fond will be the tomb + Of his self-love, to stop posterity? + Thou art thy mother's glass, and she in thee + Calls back the lovely April of her prime: + So thou through windows of thine age shall see + Despite of wrinkles this thy golden time. + But if thou live, remember'd not to be, + Die single, and thine image dies with thee. 
+ """, +] + + +########################################################################## +## PosTag Utils +########################################################################## def check_nltk_data(): @@ -57,32 +103,247 @@ def check_nltk_data(): Returns True if NLTK data has been downloaded, False otherwise """ try: - nltk.data.find('corpora/treebank') + nltk.data.find("corpora/treebank") return True except LookupError: pytest.xfail("error occured because nltk postag data is not available") +def check_spacy_data(): + """ + Returns True if SpaCy data has been downloaded, False otherwise + """ + try: + spacy.load("en_core_web_sm") + return True + except OSError: + pytest.xfail("error occured because spacy data model is not available") + + +def get_tagged_docs(X, model="nltk", tagger="word"): + """ + X is a list of strings; each string is a single document. + For each document, perform part-of-speech tagging, and + yield a list of sentences, where each sentence is a list + of (token, tag) tuples + + If model=="nltk", `NLTK` will be used to sentence and word + tokenize the incoming documents. User may select the `NLTK` + tagger to be used; (for now) either the word tokenizer or the + workpunct tokenizer. + + If model=="spacy", `SpaCy` will be used to sentence and word + tokenize the incoming documents. 
+ """ + if model == "spacy": + nlp = spacy.load("en_core_web_sm") + for doc in X: + tagged = nlp(doc) + yield [ + list((token.text, token.pos_) for token in sent) + for sent in tagged.sents + ] + + elif model == "nltk": + if tagger == "wordpunct": + for doc in X: + yield [pos_tag(wordpunct_tokenize(sent)) for sent in sent_tokenize(doc)] + else: + for doc in X: + yield [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(doc)] + + ########################################################################## ## PosTag Tests ########################################################################## -class TestPosTag(object): + +class TestPosTag(VisualTestCase): """ PosTag (Part of Speech Tagging Visualizer) Tests """ + def test_quick_method(self): + """ + Assert no errors occur when using the quick method + """ + # Fail if data hasn't been downloaded + check_nltk_data() + + _, ax = plt.subplots() + tagged_docs = list(get_tagged_docs(sonnets)) + + viz = postag(tagged_docs, ax=ax) + viz.ax.grid(False) + + self.assert_images_similar(viz) + + def test_unknown_tagset(self): + """ + Ensure an exception is raised if the specified tagset is unknown + """ + with pytest.raises(YellowbrickValueError): + PosTagVisualizer(tagset="brill") + + def test_frequency_mode(self): + """ + Assert no errors occur when the visualizer is run on frequency mode + """ + check_nltk_data() + + _, ax = plt.subplots() + tagged_docs = list(get_tagged_docs(sonnets)) + + viz = PosTagVisualizer(ax=ax, frequency=True) + viz.fit(tagged_docs) + viz.finalize() + ax.grid(False) + + # Sorted tags i.e predetermined order + sorted_tags = [ + "noun", + "adjective", + "punctuation", + "verb", + "preposition", + "determiner", + "adverb", + "conjunction", + "pronoun", + "wh- word", + "modal", + "infinitive", + "possessive", + "other", + "symbol", + "existential", + "digit", + "non-English", + "interjection", + "list", + ] + + # Extract tick labels from the plot + ticks_ax = [tick.get_text() for tick in 
ax.xaxis.get_ticklabels()] + + # Assert that ticks are set properly + assert ticks_ax == sorted_tags + + self.assert_images_similar(ax=ax, tol=0.5) + @pytest.mark.skipif(nltk is None, reason="test requires nltk") - def test_integrated_postag(self): + def test_word_tagged(self): """ - Assert no errors occur during postag integration + Assert no errors occur during PosTagVisualizer integration + with word tokenized corpus """ + # Fail if data hasn't been downloaded + check_nltk_data() + + tagged_docs = list(get_tagged_docs(sonnets, model="nltk", tagger="word")) + + visualizer = PosTagVisualizer(tagset="penn_treebank") + + visualizer.fit(tagged_docs) + visualizer.ax.grid(False) + + self.assert_images_similar(visualizer) + @pytest.mark.skipif(nltk is None, reason="test requires nltk") + def test_wordpunct_tagged(self): + """ + Assert no errors occur during PosTagVisualizer integration + with wordpunct tokenized corpus + """ # Fail if data hasn't been downloaded check_nltk_data() - tokens = word_tokenize(pie) - tagged = pos_tag(tokens) + wordpunct_tagged_docs = list( + get_tagged_docs(sonnets, model="nltk", tagger="wordpunct") + ) + + visualizer = PosTagVisualizer(tagset="penn_treebank") + + visualizer.fit(wordpunct_tagged_docs) + visualizer.ax.grid(False) + + self.assert_images_similar(visualizer) + + @pytest.mark.skipif(spacy is None, reason="test requires spacy") + def test_spacy_tagged(self): + """ + Assert no errors occur during PosTagVisualizer integration + with spacy tokenized corpus + """ + # Fail if data hasn't been downloaded + check_spacy_data() + + spacy_tagged_docs = list(get_tagged_docs(sonnets, model="spacy")) + + visualizer = PosTagVisualizer(tagset="universal") + + visualizer.fit(spacy_tagged_docs) + visualizer.ax.grid(False) + + self.assert_images_similar(visualizer) + + def test_stack_mode(self): + """ + Assert no errors occur when the visualizer is run on stack mode + """ + check_nltk_data() + + _, ax = plt.subplots() + tagged_docs = 
list(get_tagged_docs(sonnets)) + + visualizer = PosTagVisualizer(stack=True, ax=ax) + visualizer.fit(tagged_docs, y=["a", "b", "c"]) + visualizer.ax.grid(False) + + self.assert_images_similar(ax=ax) + + def test_stack_frequency_mode(self): + """ + Assert no errors occur when the visualizer is run on both stack and + frequency mode + """ + check_nltk_data() + + _, ax = plt.subplots() + tagged_docs = list(get_tagged_docs(sonnets)) + + visualizer = PosTagVisualizer(stack=True, frequency=True, ax=ax) + visualizer.fit(tagged_docs, y=["a", "b", "c"]) + visualizer.ax.grid(False) + + # Sorted tags i.e predetermined order + sorted_tags = [ + "noun", + "adjective", + "punctuation", + "verb", + "preposition", + "determiner", + "adverb", + "conjunction", + "pronoun", + "wh- word", + "modal", + "infinitive", + "possessive", + "other", + "symbol", + "existential", + "digit", + "non-English", + "interjection", + "list", + ] + + # Extract tick labels from the plot + ticks_ax = [tick.get_text() for tick in ax.xaxis.get_ticklabels()] + + # Assert that ticks are set properly + assert ticks_ax == sorted_tags - visualizer = PosTagVisualizer() - visualizer.transform(tagged) + self.assert_images_similar(ax=ax) diff --git a/tests/test_text/test_tsne.py b/tests/test_text/test_tsne.py index 9cde779e5..2e6f7a864 100644 --- a/tests/test_text/test_tsne.py +++ b/tests/test_text/test_tsne.py @@ -1,10 +1,10 @@ # tests.test_text.test_tsne # Tests for the TSNE visual corpus embedding mechanism. 
# -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Mon Feb 20 07:23:53 2017 -0500 # -# Copyright (C) 2016 Bengfort.com +# Copyright (C) 2018 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_tsne.py [6aa9198] benjamin@bengfort.com $ @@ -17,12 +17,12 @@ ## Imports ########################################################################## -import six import pytest +from unittest import mock from yellowbrick.text.tsne import * from tests.base import VisualTestCase -from tests.dataset import DatasetMixin +from yellowbrick.datasets import load_hobbies from yellowbrick.exceptions import YellowbrickValueError from sklearn.manifold import TSNE @@ -34,16 +34,20 @@ except ImportError: pandas = None -try: - from unittest import mock -except ImportError: - import mock + +########################################################################## +## Data +########################################################################## + +corpus = load_hobbies() + ########################################################################## ## TSNE Tests ########################################################################## -class TestTSNE(VisualTestCase, DatasetMixin): + +class TestTSNE(VisualTestCase): """ TSNEVisualizer tests """ @@ -53,20 +57,20 @@ def test_bad_decomposition(self): Ensure an error is raised when a bad decompose argument is specified """ with pytest.raises(YellowbrickValueError): - TSNEVisualizer(decompose='bob') + TSNEVisualizer(decompose="bob") def test_make_pipeline(self): """ Verify the pipeline creation step for TSNE """ - tsne = TSNEVisualizer() # Should not cause an exception. + tsne = TSNEVisualizer() # Should not cause an exception. 
assert tsne.transformer_ is not None - svdp = tsne.make_transformer('svd', 90) + svdp = tsne.make_transformer("svd", 90) assert len(svdp.steps) == 2 - pcap = tsne.make_transformer('pca') + pcap = tsne.make_transformer("pca") assert len(pcap.steps) == 2 none = tsne.make_transformer(None) @@ -76,17 +80,15 @@ def test_integrated_tsne(self): """ Check tSNE integrated visualization on the hobbies corpus """ - corpus = self.load_data('hobbies') - tfidf = TfidfVectorizer() + tfidf = TfidfVectorizer() - docs = tfidf.fit_transform(corpus.data) + docs = tfidf.fit_transform(corpus.data) labels = corpus.target - tsne = TSNEVisualizer(random_state=8392, colormap='Set1', alpha=1.0) + tsne = TSNEVisualizer(random_state=8392, colormap="Set1", alpha=1.0) tsne.fit_transform(docs, labels) - tol = 50 if six.PY3 else 55 - self.assert_images_similar(tsne, tol=tol) + self.assert_images_similar(tsne, tol=50) def test_sklearn_tsne_size(self): """ @@ -97,7 +99,7 @@ def test_sklearn_tsne_size(self): # like size, are passed through to YB's finalize method. This test should # notify us if TSNE's params change on the sklearn side. 
with pytest.raises(TypeError): - TSNE(size=(100,100)) + TSNE(size=(100, 100)) def test_sklearn_tsne_title(self): """ @@ -126,22 +128,66 @@ def test_custom_size_tsne(self): assert tsne._size == (100, 50) + def test_custom_colors_tsne(self): + """ + Check tSNE accepts and properly handles custom colors from user + """ + ## produce random data + X, y = make_classification( + n_samples=200, + n_features=100, + n_informative=20, + n_redundant=10, + n_classes=5, + random_state=42, + ) + + ## specify a list of custom colors >= n_classes + purple_blues = ["indigo", "orchid", "plum", "navy", "purple", "blue"] + + ## instantiate the visualizer and check that self.colors is correct + purple_tsne = TSNEVisualizer(colors=purple_blues, random_state=87) + assert purple_tsne.colors == purple_blues + + ## fit the visualizer and check that self.color_values is as long as + ## n_classes and is the first n_classes items in self.colors + purple_tsne.fit(X, y) + assert len(purple_tsne.color_values_) == len(purple_tsne.classes_) + assert purple_tsne.color_values_ == purple_blues[: len(purple_tsne.classes_)] + + ## specify a list of custom colors < n_classes + greens = ["green", "lime", "teal"] + + ## instantiate the visualizer and check that self.colors is correct + green_tsne = TSNEVisualizer(colors=greens, random_state=87) + assert green_tsne.colors == greens + + ## fit the visualizer and check that self.color_values is as long as + ## n_classes and the user-supplied color list gets recycled as expected + green_tsne.fit(X, y) + assert len(green_tsne.color_values_) == len(green_tsne.classes_) + assert green_tsne.color_values_ == ["green", "lime", "teal", "green", "lime"] + def test_make_classification_tsne(self): """ Test tSNE integrated visualization on a sklearn classifier dataset """ ## produce random data - X, y = make_classification(n_samples=200, n_features=100, - n_informative=20, n_redundant=10, - n_classes=3, random_state=42) + X, y = make_classification( + n_samples=200, + 
n_features=100, + n_informative=20, + n_redundant=10, + n_classes=3, + random_state=42, + ) ## visualize data with t-SNE tsne = TSNEVisualizer(random_state=87) tsne.fit(X, y) - tol = 0.1 if six.PY3 else 40 - self.assert_images_similar(tsne, tol=tol) + self.assert_images_similar(tsne, tol=0.1) def test_make_classification_tsne_class_labels(self): """ @@ -149,44 +195,58 @@ def test_make_classification_tsne_class_labels(self): """ ## produce random data - X, y = make_classification(n_samples=200, n_features=100, - n_informative=20, n_redundant=10, - n_classes=3, random_state=42) + X, y = make_classification( + n_samples=200, + n_features=100, + n_informative=20, + n_redundant=10, + n_classes=3, + random_state=42, + ) ## visualize data with t-SNE - tsne = TSNEVisualizer(random_state=87, labels=['a', 'b', 'c']) + tsne = TSNEVisualizer(random_state=87, labels=["a", "b", "c"]) tsne.fit(X, y) - tol = 0.1 if six.PY3 else 40 - self.assert_images_similar(tsne, tol=tol) + self.assert_images_similar(tsne, tol=0.1) def test_tsne_mismtached_labels(self): """ Assert exception is raised when number of labels doesn't match """ ## produce random data - X, y = make_classification(n_samples=200, n_features=100, - n_informative=20, n_redundant=10, - n_classes=3, random_state=42) + X, y = make_classification( + n_samples=200, + n_features=100, + n_informative=20, + n_redundant=10, + n_classes=3, + random_state=42, + ) ## fewer labels than classes - tsne = TSNEVisualizer(random_state=87, labels=['a', 'b']) + tsne = TSNEVisualizer(random_state=87, labels=["a", "b"]) with pytest.raises(YellowbrickValueError): - tsne.fit(X,y) + tsne.fit(X, y) ## more labels than classes - tsne = TSNEVisualizer(random_state=87, labels=['a', 'b', 'c', 'd']) + tsne = TSNEVisualizer(random_state=87, labels=["a", "b", "c", "d"]) with pytest.raises(YellowbrickValueError): - tsne.fit(X,y) + tsne.fit(X, y) def test_no_target_tsne(self): """ Test tSNE when no target or classes are specified """ ## produce random data 
- X, y = make_classification(n_samples=200, n_features=100, - n_informative=20, n_redundant=10, - n_classes=3, random_state=6897) + X, y = make_classification( + n_samples=200, + n_features=100, + n_informative=20, + n_redundant=10, + n_classes=3, + random_state=6897, + ) ## visualize data with t-SNE tsne = TSNEVisualizer(random_state=64) @@ -200,8 +260,12 @@ def test_visualizer_with_pandas(self): Test tSNE when passed a pandas DataFrame and series """ X, y = make_classification( - n_samples=200, n_features=100, n_informative=20, n_redundant=10, - n_classes=3, random_state=3020 + n_samples=200, + n_features=100, + n_informative=20, + n_redundant=10, + n_classes=3, + random_state=3020, ) X = pandas.DataFrame(X) @@ -210,17 +274,21 @@ def test_visualizer_with_pandas(self): tsne = TSNEVisualizer(random_state=64) tsne.fit(X, y) - tol = 0.1 if six.PY3 else 40 - self.assert_images_similar(tsne, tol=tol) + self.assert_images_similar(tsne, tol=0.1) def test_alpha_param(self): """ Test that the user can supply an alpha param on instantiation """ ## produce random data - X, y = make_classification(n_samples=200, n_features=100, - n_informative=20, n_redundant=10, - n_classes=3, random_state=42) + X, y = make_classification( + n_samples=200, + n_features=100, + n_informative=20, + n_redundant=10, + n_classes=3, + random_state=42, + ) ## Instantiate a TSNEVisualizer, provide custom alpha tsne = TSNEVisualizer(random_state=64, alpha=0.5) diff --git a/tests/test_text/test_umap.py b/tests/test_text/test_umap.py new file mode 100644 index 000000000..ef24d1e75 --- /dev/null +++ b/tests/test_text/test_umap.py @@ -0,0 +1,323 @@ +# tests.test_text.test_umap +# Tests for the UMAP visual corpus embedding mechanism. +# +# Author: John Healy +# Created: Mon Dec 03, 14:00:00 +# +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: test_umap.py [] jchealy@gmail.com> $ + +""" +Tests for the UMAP visual corpus embedding mechanism. 
+""" + +########################################################################## +## Imports +########################################################################## + +import sys +import pytest +import warnings + +from unittest import mock +from tests.base import VisualTestCase +from yellowbrick.text.umap_vis import * +from yellowbrick.datasets import load_hobbies +from yellowbrick.exceptions import YellowbrickValueError + +from sklearn.datasets import make_classification +from sklearn.feature_extraction.text import TfidfVectorizer + +try: + import pandas +except ImportError: + pandas = None + +try: + from umap import UMAP +except ImportError: + UMAP = None +except (RuntimeError, AttributeError): + UMAP = None + warnings.warn( + "Error Importing UMAP. UMAP does not support python 2.7 on Windows 32 bit." + ) + + +########################################################################## +## Data +########################################################################## + +corpus = load_hobbies() + + +########################################################################## +## UMAP Tests +########################################################################## + + +@mock.patch("yellowbrick.text.umap_vis.UMAP", None) +def test_umap_unavailable(): + """ + Assert an appropriate exception is raised when UMAP is not installed + """ + from yellowbrick.text.umap_vis import UMAP + + assert UMAP is None + + with pytest.raises( + YellowbrickValueError, match="umap package doesn't seem to be installed" + ): + UMAPVisualizer() + + +@pytest.mark.skipif(UMAP is None, reason="tests require the umap library") +@pytest.mark.xfail( + sys.platform == "win32", reason="not supported on windows 32bit with Python 2.7" +) +class TestUMAP(VisualTestCase): + """ + UMAPVisualizer tests + """ + + def test_make_pipeline(self): + """ + Verify the pipeline creation step for UMAP + """ + + umap = UMAPVisualizer() # Should not cause an exception. 
+ assert umap.transformer_ is not None + + assert len(umap.transformer_.steps) == 1 + + def test_integrated_umap(self): + """ + Check UMAP integrated visualization on the hobbies corpus + """ + tfidf = TfidfVectorizer() + + docs = tfidf.fit_transform(corpus.data) + labels = corpus.target + + umap = UMAPVisualizer(random_state=8392, colormap="Set1", alpha=1.0) + umap.fit_transform(docs, labels) + + tol = 55 + self.assert_images_similar(umap, tol=tol) + + def test_sklearn_umap_size(self): + """ + Check to make sure sklearn's UMAP doesn't use the size param + """ + # In UMAPVisualizer, the internal sklearn UMAP transform consumes + # some but not all kwargs passed in by user. Those not in get_params(), + # like size, are passed through to YB's finalize method. This test should + # notify us if UMAP's params change on the sklearn side. + with pytest.raises(TypeError): + UMAP(size=(100, 100)) + + def test_sklearn_umap_title(self): + """ + Check to make sure sklearn's UMAP doesn't use the title param + """ + # In TSNEVisualizer, the internal sklearn UMAP transform consumes + # some but not all kwargs passed in by user. Those not in get_params(), + # like title, are passed through to YB's finalize method. This test should + # notify us if UMAP's params change on the sklearn side. 
+ with pytest.raises(TypeError): + UMAP(title="custom_title") + + def test_custom_title_umap(self): + """ + Check UMAP can accept a custom title (string) from the user + """ + umap = UMAPVisualizer(title="custom_title") + + assert umap.title == "custom_title" + + def test_custom_size_umap(self): + """ + Check UMAP can accept a custom size (tuple of pixels) from the user + """ + umap = UMAPVisualizer(size=(100, 50)) + + assert umap._size == (100, 50) + + def test_custom_colors_umap(self): + """ + Check UMAP accepts and properly handles custom colors from user + """ + ## produce random data + X, y = make_classification( + n_samples=200, + n_features=100, + n_informative=20, + n_redundant=10, + n_classes=5, + random_state=42, + ) + + ## specify a list of custom colors >= n_classes + purple_blues = ["indigo", "orchid", "plum", "navy", "purple", "blue"] + + ## instantiate the visualizer and check that self.colors is correct + purple_umap = UMAPVisualizer(colors=purple_blues, random_state=87) + assert purple_umap.colors == purple_blues + + ## fit the visualizer and check that self.color_values is as long as + ## n_classes and is the first n_classes items in self.colors + purple_umap.fit(X, y) + assert len(purple_umap.color_values_) == len(purple_umap.classes_) + assert purple_umap.color_values_ == purple_blues[: len(purple_umap.classes_)] + + ## specify a list of custom colors < n_classes + greens = ["green", "lime", "teal"] + + ## instantiate the visualizer and check that self.colors is correct + green_umap = UMAPVisualizer(colors=greens, random_state=87) + assert green_umap.colors == greens + + ## fit the visualizer and check that self.color_values is as long as + ## n_classes and the user-supplied color list gets recycled as expected + green_umap.fit(X, y) + assert len(green_umap.color_values_) == len(green_umap.classes_) + assert green_umap.color_values_ == ["green", "lime", "teal", "green", "lime"] + + def test_make_classification_umap(self): + """ + Test UMAP 
integrated visualization on a sklearn classifier dataset + """ + + ## produce random data + X, y = make_classification( + n_samples=200, + n_features=100, + n_informative=20, + n_redundant=10, + n_classes=3, + random_state=42, + ) + + ## visualize data with UMAP + umap = UMAPVisualizer(random_state=87) + umap.fit(X, y) + + self.assert_images_similar(umap, tol=40) + + def test_make_classification_umap_class_labels(self): + """ + Test UMAP integrated visualization with class labels specified + """ + + ## produce random data + X, y = make_classification( + n_samples=200, + n_features=100, + n_informative=20, + n_redundant=10, + n_classes=3, + random_state=42, + ) + + ## visualize data with UMAP + umap = UMAPVisualizer(random_state=87, labels=["a", "b", "c"]) + umap.fit(X, y) + + self.assert_images_similar(umap, tol=40) + + def test_umap_mismtached_labels(self): + """ + Assert exception is raised when number of labels doesn't match + """ + ## produce random data + X, y = make_classification( + n_samples=200, + n_features=100, + n_informative=20, + n_redundant=10, + n_classes=3, + random_state=42, + ) + + ## fewer labels than classes + umap = UMAPVisualizer(random_state=87, labels=["a", "b"]) + with pytest.raises(YellowbrickValueError): + umap.fit(X, y) + + ## more labels than classes + umap = UMAPVisualizer(random_state=87, labels=["a", "b", "c", "d"]) + with pytest.raises(YellowbrickValueError): + umap.fit(X, y) + + def test_no_target_umap(self): + """ + Test UMAP when no target or classes are specified + """ + ## produce random data + X, y = make_classification( + n_samples=200, + n_features=100, + n_informative=20, + n_redundant=10, + n_classes=3, + random_state=6897, + ) + + ## visualize data with UMAP + umap = UMAPVisualizer(random_state=64) + umap.fit(X) + + self.assert_images_similar(umap, tol=40) + + @pytest.mark.skipif(pandas is None, reason="test requires pandas") + def test_visualizer_with_pandas(self): + """ + Test UMAP when passed a pandas DataFrame and 
series + """ + X, y = make_classification( + n_samples=200, + n_features=100, + n_informative=20, + n_redundant=10, + n_classes=3, + random_state=3020, + ) + + X = pandas.DataFrame(X) + y = pandas.Series(y) + + umap = UMAPVisualizer(random_state=64) + umap.fit(X, y) + + self.assert_images_similar(umap, tol=40) + + def test_alpha_param(self): + """ + Test that the user can supply an alpha param on instantiation + """ + ## produce random data + X, y = make_classification( + n_samples=200, + n_features=100, + n_informative=20, + n_redundant=10, + n_classes=3, + random_state=42, + ) + + ## Instantiate a UMAPVisualizer, provide custom alpha + umap = UMAPVisualizer(random_state=64, alpha=0.5) + + # Test param gets set correctly + assert umap.alpha == 0.5 + + # Mock ax and fit the visualizer + umap.ax = mock.MagicMock(autospec=True) + umap.fit(X, y) + + # Test that alpha was passed to internal matplotlib scatterplot + _, scatter_kwargs = umap.ax.scatter.call_args + assert "alpha" in scatter_kwargs + assert scatter_kwargs["alpha"] == 0.5 diff --git a/tests/test_utils/__init__.py b/tests/test_utils/__init__.py index bd1ef00e9..db507a95d 100644 --- a/tests/test_utils/__init__.py +++ b/tests/test_utils/__init__.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Thu May 18 15:15:05 2017 -0400 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # # ID: __init__.py [79cd8cf] benjamin@bengfort.com $ diff --git a/tests/test_utils/test_decorators.py b/tests/test_utils/test_decorators.py index 99e6a477f..322de8713 100644 --- a/tests/test_utils/test_decorators.py +++ b/tests/test_utils/test_decorators.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Thu May 18 15:14:34 2017 -0400 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_decorators.py [79cd8cf] benjamin@bengfort.com $ @@ -17,8 +17,6 @@ 
## Imports ########################################################################## -import unittest - from yellowbrick.utils.decorators import * @@ -26,7 +24,8 @@ ## Decorator Tests ########################################################################## -class DecoratorTests(unittest.TestCase): + +class TestDecorators(object): """ Tests for the decorator utilities. """ @@ -37,16 +36,14 @@ def test_memoization(self): """ class Visualizer(object): - @memoized def foo(self): return "bar" viz = Visualizer() - self.assertFalse(hasattr(viz, "_foo")) - self.assertEqual(viz.foo, "bar") - self.assertEqual(viz._foo, "bar") - + assert not hasattr(viz, "_foo") + assert viz.foo == "bar" + assert viz._foo == "bar" def test_docutil(self): """ @@ -54,14 +51,12 @@ def test_docutil(self): """ class Visualizer(object): - def __init__(self): """ This is the correct docstring. """ pass - def undecorated(*args, **kwargs): """ This is an undecorated function string. @@ -69,34 +64,18 @@ def undecorated(*args, **kwargs): pass # Test the undecorated string to protect from magic - self.assertEqual( - undecorated.__doc__.strip(), "This is an undecorated function string." - ) + assert undecorated.__doc__.strip() == "This is an undecorated function string." # Decorate manually and test the newly decorated return function. decorated = docutil(Visualizer.__init__)(undecorated) - self.assertEqual( - decorated.__doc__.strip(), "This is the correct docstring." - ) + assert decorated.__doc__.strip() == "This is the correct docstring." # Assert that decoration modifies the original function. - self.assertEqual( - undecorated.__doc__.strip(), "This is the correct docstring." - ) + assert undecorated.__doc__.strip() == "This is the correct docstring." @docutil(Visualizer.__init__) def sugar(*args, **kwargs): pass # Assert that syntactic sugar works as expected. - self.assertEqual( - sugar.__doc__.strip(), "This is the correct docstring." 
- ) - - -########################################################################## -## Execute Tests -########################################################################## - -if __name__ == "__main__": - unittest.main() + assert sugar.__doc__.strip() == "This is the correct docstring." diff --git a/tests/test_utils/test_helpers.py b/tests/test_utils/test_helpers.py index 6c4fe9f62..91d761ebe 100644 --- a/tests/test_utils/test_helpers.py +++ b/tests/test_utils/test_helpers.py @@ -2,10 +2,11 @@ # tests.test_utils.test_helpers # Tests for the stand alone helper functions in Yellowbrick utils. # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort +# Author: Rebecca Bilbro # Created: Fri May 19 10:43:43 2017 -0700 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_helpers.py [79cd8cf] benjamin@bengfort.com $ @@ -22,32 +23,41 @@ import numpy as np import numpy.testing as npt -from yellowbrick.utils.helpers import * - +from sklearn.svm import SVR, SVC from sklearn.pipeline import Pipeline from sklearn.decomposition import PCA +from sklearn.naive_bayes import GaussianNB +from sklearn.tree import DecisionTreeClassifier from sklearn.neighbors import KNeighborsClassifier -from sklearn.linear_model import LassoCV -from sklearn.linear_model import LinearRegression from sklearn.ensemble import RandomForestClassifier -from sklearn.cluster import KMeans +from sklearn.cluster import Birch, AgglomerativeClustering +from sklearn.decomposition import LatentDirichletAllocation as LDA +from sklearn.cluster import KMeans, AffinityPropagation, MiniBatchKMeans +from sklearn.datasets import make_classification, make_regression, make_blobs +from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, LinearRegression +from yellowbrick.utils.helpers import * ########################################################################## ## Helper Function Tests 
########################################################################## + class TestHelpers(object): """ Helper functions and utilities """ - @pytest.mark.parametrize("model, name", [ - (LassoCV, 'LassoCV'), - (KNeighborsClassifier, 'KNeighborsClassifier'), - (KMeans, 'KMeans'), - (RandomForestClassifier, 'RandomForestClassifier'), - ], ids=["LassoCV", "KNeighborsClassifier", "KMeans", "RandomForestClassifier"]) + @pytest.mark.parametrize( + "model, name", + [ + (LassoCV, "LassoCV"), + (KNeighborsClassifier, "KNeighborsClassifier"), + (KMeans, "KMeans"), + (RandomForestClassifier, "RandomForestClassifier"), + ], + ids=["LassoCV", "KNeighborsClassifier", "KMeans", "RandomForestClassifier"], + ) def test_real_model(self, model, name): """ Test getting model name for sklearn estimators @@ -58,9 +68,8 @@ def test_pipeline(self): """ Test getting model name for sklearn pipelines """ - pipeline = Pipeline([('reduce_dim', PCA()), - ('linreg', LinearRegression())]) - assert get_model_name(pipeline) == 'LinearRegression' + pipeline = Pipeline([("reduce_dim", PCA()), ("linreg", LinearRegression())]) + assert get_model_name(pipeline) == "LinearRegression" def test_int_input(self): """ @@ -74,7 +83,95 @@ def test_str_input(self): Assert a type error is raised when a str is passed to model name """ with pytest.raises(TypeError): - get_model_name('helloworld') + get_model_name("helloworld") + + @pytest.mark.parametrize( + "regressor", + [SVR, Ridge, Lasso, RidgeCV, LassoCV], + ids=["SVR", "Ridge", "Lasso", "RidgeCV", "LassoCV"], + ) + def test_is_fitted_regressors(self, regressor): + """ + Assert sklearn regressors return False for fitted before fit and True after + """ + assert is_fitted(regressor()) is False + + X, y = make_regression(n_samples=20, n_features=3) + fitted = regressor().fit(X, y) + assert is_fitted(fitted) is True + + @pytest.mark.parametrize( + "classifier", + [ + SVC, + GaussianNB, + KNeighborsClassifier, + RandomForestClassifier, + DecisionTreeClassifier, 
+ ], + ids=[ + "SVC", + "GaussianNB", + "KNeighborsClassifier", + "RandomForestClassifier", + "DecisionTreeClassifier", + ], + ) + def test_is_fitted_classifiers(self, classifier): + """ + Assert sklearn classifiers return False for fitted before fit and True after + """ + assert is_fitted(classifier()) is False + + X, y = make_classification(n_samples=20, n_features=3, n_redundant=0) + fitted = classifier().fit(X, y) + assert is_fitted(fitted) is True + + @pytest.mark.parametrize( + "clusterer", + [ + PCA, # AttributeError: has no attribute 'predict' + LDA, # AttributeError: has no attribute 'predict' + Birch, + KMeans, + MiniBatchKMeans, + AffinityPropagation, + AgglomerativeClustering, # AttributeError: has no attribute 'predict' + ], + ids=[ + "PCA", + "LDA", + "Birch", + "KMeans", + "MiniBatchKMeans", + "AffinityPropagation", + "AgglomerativeClustering", + ], + ) + def test_is_fitted_clusterers(self, clusterer): + """ + Assert sklearn clusterers return False for fitted before fit and True after + """ + assert is_fitted(clusterer()) is False + + X, _ = make_blobs(n_samples=20, center_box=(10.0, 20.0)) + fitted = clusterer().fit(X) + assert is_fitted(fitted) is True + + def test_check_fitted(self): + """ + Verify the user can specify whether or not to check if the model is fitted + """ + model = SVC() + assert check_fitted(model, is_fitted_by="auto") is False + assert check_fitted(model, is_fitted_by=True) is True + assert check_fitted(model, is_fitted_by=False) is False + + X, y = make_classification(n_samples=20, n_features=3, n_redundant=0) + model.fit(X, y) + assert check_fitted(model, is_fitted_by="auto") is True + assert check_fitted(model, is_fitted_by=True) is True + assert check_fitted(model, is_fitted_by=False) is False ########################################################################## @@ -91,33 +188,32 @@ def test_div_1d_by_scalar(self): """ Test divide 1D vector by scalar """ - result = div_safe( [-1, 0, 1], 0 ) + result = div_safe([-1, 0, 
1], 0) assert result.all() == 0 def test_div_1d_by_1d(self): """ Test divide 1D vector by another 1D vector with same length """ - result = div_safe( [-1, 0 , 1], [0,0,0]) + result = div_safe([-1, 0, 1], [0, 0, 0]) assert result.all() == 0 def test_div_2d_by_1d(self): """ Test divide 2D vector by 1D vector with similar shape component """ - numerator = np.array([[-1,0,1,2],[1,-1,0,3]]) - denominator = [0,0,0,0] + numerator = np.array([[-1, 0, 1, 2], [1, -1, 0, 3]]) + denominator = [0, 0, 0, 0] npt.assert_array_equal( - div_safe(numerator, denominator), - np.array([[0,0,0,0], [0,0,0,0]]) + div_safe(numerator, denominator), np.array([[0, 0, 0, 0], [0, 0, 0, 0]]) ) def test_invalid_dimensions(self): """ Assert an error is raised on division with invalid dimensions """ - numerator = np.array([[-1,0,1,2],[1,-1,0,3]]) - denominator = [0,0] + numerator = np.array([[-1, 0, 1, 2], [1, -1, 0, 3]]) + denominator = [0, 0] with pytest.raises(ValueError): div_safe(numerator, denominator) @@ -133,29 +229,98 @@ def test_prop_to_size_list(self): Test prop to size correctly returns scaled values for a list """ # Hieghts (in cm) of U.S. Presidents in order of term until Lincoln - heights = [188, 170, 189, 163, 183, 171, 185, 168, 173, 183, 173, 173, 175, 178, 183, 193] + heights = [ + 188, + 170, + 189, + 163, + 183, + 171, + 185, + 168, + 173, + 183, + 173, + 173, + 175, + 178, + 183, + 193, + ] sizes = prop_to_size(heights, mi=1, ma=10, log=False, power=0.33) - npt.assert_array_almost_equal(sizes, np.array([ - 9.47447296, 6.56768746, 9.58486955, 1. , 8.87285756, - 6.81851544, 9.12441277, 5.98256068, 7.26314542, 8.87285756, - 7.26314542, 7.26314542, 7.65154152, 8.15982835, 8.87285756, - 10. 
- ])) + npt.assert_array_almost_equal( + sizes, + np.array( + [ + 9.47447296, + 6.56768746, + 9.58486955, + 1.0, + 8.87285756, + 6.81851544, + 9.12441277, + 5.98256068, + 7.26314542, + 8.87285756, + 7.26314542, + 7.26314542, + 7.65154152, + 8.15982835, + 8.87285756, + 10.0, + ] + ), + ) def test_prop_to_size_log(self): """ Test prop to size returns natural log scaled values """ - # Hieghts (in cm) of U.S. Presidents in order of term until Lincoln - heights = [188, 170, 189, 163, 183, 171, 185, 168, 173, 183, 173, 173, 175, 178, 183, 193] + # Heights (in cm) of U.S. Presidents in order of term until Lincoln + heights = [ + 188, + 170, + 189, + 163, + 183, + 171, + 185, + 168, + 173, + 183, + 173, + 173, + 175, + 178, + 183, + 193, + ] sizes = prop_to_size(heights, mi=1, ma=10, log=True, power=0.5) - npt.assert_array_almost_equal(sizes, np.array([ - 9.271337, 5.49004 , 9.423692, 1. , 8.449214, 5.792968, - 8.791172, 4.806088, 6.343007, 8.449214, 6.343007, 6.343007, - 6.835994, 7.496806, 8.449214, 10. 
- ])) + npt.assert_array_almost_equal( + sizes, + np.array( + [ + 9.271337, + 5.49004, + 9.423692, + 1.0, + 8.449214, + 5.792968, + 8.791172, + 4.806088, + 6.343007, + 8.449214, + 6.343007, + 6.343007, + 6.835994, + 7.496806, + 8.449214, + 10.0, + ] + ), + ) def test_prop_to_size_default(self): """ @@ -173,15 +338,16 @@ def test_prop_to_size_zero_division(self): """ Ensure that prop to size does not cause division by zero errors """ - vals = [8]*8 + vals = [8] * 8 sizes = prop_to_size(vals) - npt.assert_array_equal(sizes, [0]*8) + npt.assert_array_equal(sizes, [0] * 8) ########################################################################## ## Features/Array Tests ########################################################################## + class TestNarrayIntColumns(object): """ Features and array helper tests @@ -191,7 +357,7 @@ def test_has_ndarray_int_columns_true_int_features(self): """ Ensure ndarray with int features has int columns """ - x = np.random.rand(3,5) + x = np.random.rand(3, 5) features = [0, 1] assert has_ndarray_int_columns(features, x) @@ -199,42 +365,50 @@ def test_has_ndarray_int_columns_true_int_strings(self): """ Ensure ndarray with str(int) features has int columns """ - x = np.random.rand(3,5) - features = ['0', '1'] + x = np.random.rand(3, 5) + features = ["0", "1"] assert has_ndarray_int_columns(features, x) def test_has_ndarray_int_columns_false_not_numeric(self): """ Ensure ndarray with str features does not have int columns """ - x = np.random.rand(3,5) - features = ['a', '1'] + x = np.random.rand(3, 5) + features = ["a", "1"] assert not has_ndarray_int_columns(features, x) def test_has_ndarray_int_columns_false_outside_column_range(self): """ Ensure ndarray with str(int) outside range does not have int columns """ - x = np.random.rand(3,5) - features = ['0', '10'] + x = np.random.rand(3, 5) + features = ["0", "10"] assert not has_ndarray_int_columns(features, x) - @pytest.mark.parametrize("a, increasing", [ - (np.array([0.8]), 
True), - (np.array([9]), False), - (np.array([0.2, 1.3, 1.4, 1.4, 1.4, 1.5, 8.3, 8.5]), True), - (np.array([8, 7, 6, 5, 5, 5, 5, 4, 3, -1, -5]), False), - ], ids=["increasing single", "decreasing single", "increasing", "decreasing"]) + @pytest.mark.parametrize( + "a, increasing", + [ + (np.array([0.8]), True), + (np.array([9]), False), + (np.array([0.2, 1.3, 1.4, 1.4, 1.4, 1.5, 8.3, 8.5]), True), + (np.array([8, 7, 6, 5, 5, 5, 5, 4, 3, -1, -5]), False), + ], + ids=["increasing single", "decreasing single", "increasing", "decreasing"], + ) def test_is_monotonic(self, a, increasing): """ Test if a vector is monotonic """ assert is_monotonic(a, increasing) - @pytest.mark.parametrize("a, increasing", [ - (np.array([0.2, 1.3, 1.3, 0.2, 1.8]), True), - (np.array([8, 7, 7, 8, 9, 6, 5]), False), - ], ids=["increasing", "decreasing"]) + @pytest.mark.parametrize( + "a, increasing", + [ + (np.array([0.2, 1.3, 1.3, 0.2, 1.8]), True), + (np.array([8, 7, 7, 8, 9, 6, 5]), False), + ], + ids=["increasing", "decreasing"], + ) def test_not_is_monotonic(self, a, increasing): """ Test if a vector is not monotonic @@ -246,13 +420,14 @@ def test_multi_dim_is_monotonic(self): Assert monotonicity is not decidable on multi-dimensional array """ with pytest.raises(ValueError): - is_monotonic(np.array([[1,2,3], [4,5,6], [7,8,9]])) + is_monotonic(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])) ########################################################################## ## String Helpers Tests ########################################################################## + class TestStringHelpers(object): """ String helper functions @@ -265,7 +440,7 @@ def test_slugifiy(self): cases = ( ("This is a test ---", "this-is-a-test"), - ("This -- is a ## test ---" , "this-is-a-test"), + ("This -- is a ## test ---", "this-is-a-test"), ) for case, expected in cases: diff --git a/tests/test_utils/test_kneed.py b/tests/test_utils/test_kneed.py new file mode 100644 index 000000000..c15f0c23e --- /dev/null +++ 
b/tests/test_utils/test_kneed.py @@ -0,0 +1,134 @@ +# tests.test_utils.test_kneed +# A port of the tests for knee-point detection package, kneed. +# +# Author: Kevin Arvai +# Author: Pradeep Singh +# Created: Mon Apr 23 01:29:18 2019 -0400 +# +# Copyright (C) 2017 Kevin Arvai +# All rights reserved. +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this list +# of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, this +# list of conditions and the following disclaimer in the documentation and/or other +# materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors may +# be used to endorse or promote products derived from this software without specific +# prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# ID: test_kneed.py [] pswaldia@no-reply.github.com $ + +""" +This package contains a port of the tests for knee-point detection package, kneed, by +Kevin Arvai and hosted at https://github.com/arvkevi/kneed. This port is maintained +with permission by the Yellowbrick contributors. +""" + +import numpy as np +from yellowbrick.utils.kneed import KneeLocator + +x = np.arange(0, 10) +y_convex_inc = np.array([1, 2, 3, 4, 5, 10, 15, 20, 40, 100]) +y_convex_dec = np.array(y_convex_inc[::-1]) +y_concave_dec = np.array(100 - y_convex_inc) +y_concave_inc = np.array(100 - y_convex_dec) + + +def test_concave_increasing(): + """Tests that a correct knee point is detected in + curve having concave and increasing nature.""" + kn = KneeLocator( + x, y_concave_inc, curve_nature="concave", curve_direction="increasing" + ) + assert kn.knee == 2 + + +def test_concave_decreasing(): + """Tests that a correct knee point is detected in + curve having concave and decreasing nature.""" + kn = KneeLocator( + x, y_concave_dec, curve_nature="concave", curve_direction="decreasing" + ) + assert kn.knee == 7 + + +def test_convex_increasing(): + """Tests that a correct knee point is detected in + curve having convex and increasing nature.""" + kn = KneeLocator( + x, y_convex_inc, curve_nature="convex", curve_direction="increasing" + ) + assert kn.knee == 7 + + +def test_convex_decreasing(): + """Tests that a correct knee point is detected in + curve having convex and decreasing nature.""" + kn = KneeLocator( + x, y_convex_dec, curve_nature="convex", curve_direction="decreasing" + ) + assert kn.knee == 2 + + +def test_concave_increasing_truncated(): + """Tests that a correct knee point is detected in + curve having truncated concave increasing nature""" + kn = KneeLocator( + x[:-3] / 10, + y_concave_inc[:-3] / 10, + curve_nature="concave", + curve_direction="increasing", + ) + assert kn.knee == 0.2 + + +def test_concave_decreasing_truncated(): + """Tests that a correct knee point is detected 
in + curve having truncated concave decreasing nature""" + kn = KneeLocator( + x[:-3] / 10, + y_concave_dec[:-3] / 10, + curve_nature="concave", + curve_direction="decreasing", + ) + assert kn.knee == 0.4 + + +def test_convex_increasing_truncated(): + """Tests that a correct knee point is detected in + curve having truncated convex increasing nature""" + kn = KneeLocator( + x[:-3] / 10, + y_convex_inc[:-3] / 10, + curve_nature="convex", + curve_direction="increasing", + ) + assert kn.knee == 0.4 + + +def test_convex_decreasing_truncated(): + """Tests that a correct knee point is detected in + curve having truncated convex decreasing nature""" + kn = KneeLocator( + x[:-3] / 10, + y_convex_dec[:-3] / 10, + curve_nature="convex", + curve_direction="decreasing", + ) + assert kn.knee == 0.2 diff --git a/tests/test_utils/test_nan_warnings.py b/tests/test_utils/test_nan_warnings.py index e47e89dd3..1b6714911 100644 --- a/tests/test_utils/test_nan_warnings.py +++ b/tests/test_utils/test_nan_warnings.py @@ -4,16 +4,17 @@ import pytest from yellowbrick.exceptions import DataWarning -from yellowbrick.utils.nan_warnings import count_nan_elements, \ - count_rows_with_nans, warn_if_nans_exist, filter_missing +from yellowbrick.utils.nan_warnings import ( + count_nan_elements, + count_rows_with_nans, + warn_if_nans_exist, + filter_missing, +) def test_raise_warning_if_nans_exist(): """Test that a warning is raised if any nans are in the data.""" - data = np.array([ - [1, 2, 3], - [1, 2, np.nan], - ]) + data = np.array([[1, 2, 3], [1, 2, np.nan]]) with pytest.warns(DataWarning): warn_if_nans_exist(data) @@ -21,23 +22,13 @@ def test_raise_warning_if_nans_exist(): def test_count_rows_in_2d_arrays_with_nans(): """Test that nan-containinr rows in 2d arrays are counted correctly.""" - data_1_row = np.array([ - [1, 2, 3], - ]) - - data_2_rows = np.array([ - [1, 2, 3], - [1, 2, 3], - [np.nan, 2, 3], - [1, np.nan, 3], - ]) - - data_3_rows = np.array([ - [1, 2, 3], - [np.nan, 2, 3], - [1, 
np.nan, 3], - [np.nan, np.nan, np.nan], - ]) + data_1_row = np.array([[1, 2, 3]]) + + data_2_rows = np.array([[1, 2, 3], [1, 2, 3], [np.nan, 2, 3], [1, np.nan, 3]]) + + data_3_rows = np.array( + [[1, 2, 3], [np.nan, 2, 3], [1, np.nan, 3], [np.nan, np.nan, np.nan]] + ) assert count_rows_with_nans(data_1_row) == 0 assert count_rows_with_nans(data_2_rows) == 2 @@ -57,11 +48,7 @@ def test_count_nan_elements(): def test_clean_data_X_only_no_nans(): """Test that an array with no nulls is returned intact.""" - X = np.array([ - [1, 2, 3], - [4, 5, 6], - [7, 8, 9], - ]) + X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) observed = filter_missing(X) np.testing.assert_array_equal(X, observed) @@ -69,15 +56,9 @@ def test_clean_data_X_only_no_nans(): def test_clean_data_X_only(): """Test that nan-containing X rows are removed without y.""" - X = np.array([ - [1, 2, np.nan], - [4, 5, 6], - [np.nan, np.nan, np.nan], - ]) - - expected = np.array([ - [4, 5, 6] - ]) + X = np.array([[1, 2, np.nan], [4, 5, 6], [np.nan, np.nan, np.nan]]) + + expected = np.array([[4, 5, 6]]) observed = filter_missing(X) np.testing.assert_array_equal(expected, observed) @@ -85,17 +66,10 @@ def test_clean_data_X_only(): def test_clean_data_dirty_X_dirty_y(): """Test that nan-containing X, y rows are removed when both contain nans.""" - X = np.array([ - [1, 2, 3], - [4, 5, 6], - [7, 8, np.nan], - [np.nan, np.nan, np.nan], - ]) + X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, np.nan], [np.nan, np.nan, np.nan]]) y = np.array([33, np.nan, 44, np.nan]) - expected_X = np.array([ - [1, 2, 3], - ]) + expected_X = np.array([[1, 2, 3]]) expected_y = np.array([33]) observed_X, observed_y = filter_missing(X, y) @@ -105,18 +79,10 @@ def test_clean_data_dirty_X_dirty_y(): def test_clean_data_dirty_X_clean_y(): """Test that nan-containing X, y rows are removed when X contains nans.""" - X = np.array([ - [1, 2, 3], - [4, 5, 6], - [7, 8, np.nan], - [np.nan, np.nan, np.nan], - ]) + X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 
np.nan], [np.nan, np.nan, np.nan]]) y = np.array([33, 44, 55, 66]) - expected_X = np.array([ - [1, 2, 3], - [4, 5, 6], - ]) + expected_X = np.array([[1, 2, 3], [4, 5, 6]]) expected_y = np.array([33, 44]) observed_X, observed_y = filter_missing(X, y) @@ -126,18 +92,10 @@ def test_clean_data_dirty_X_clean_y(): def test_clean_data_clean_X_dirty_y(): """Test that nan-containing X, y rows are removed when y contains nans.""" - X = np.array([ - [1, 2, 3], - [4, 5, 6], - [7, 8, 9], - [10, 11, 12] - ]) + X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) y = np.array([np.nan, 44, np.nan, 66]) - expected_X = np.array([ - [4, 5, 6], - [10, 11, 12] - ]) + expected_X = np.array([[4, 5, 6], [10, 11, 12]]) expected_y = np.array([44, 66]) observed_X, observed_y = filter_missing(X, y) diff --git a/tests/test_utils/test_target.py b/tests/test_utils/test_target.py new file mode 100644 index 000000000..cad27f0da --- /dev/null +++ b/tests/test_utils/test_target.py @@ -0,0 +1,135 @@ +# tests.test_utils.test_target +# Tests for the target helper functions module. +# +# Author: Benjamin Bengfort +# Created: Thu Dec 27 20:43:31 2018 -0500 +# +# For license information, see LICENSE.txt +# +# ID: test_target.py [899c88a] benjamin@bengfort.com $ + +""" +Tests for the target helper functions module. 
+""" + +########################################################################## +## Imports +########################################################################## + +import pytest +import numpy as np + +from yellowbrick.utils.target import * +from yellowbrick.exceptions import YellowbrickValueError +from sklearn.datasets import make_regression, make_classification + + +########################################################################## +## Target Color Type Tests +########################################################################## + + +@pytest.mark.parametrize( + "value,expected", + [ + (None, TargetType.SINGLE), + (np.ones(15), TargetType.SINGLE), + (["a", "b", "a", "b", "c"], TargetType.DISCRETE), + ([1, 2, 1, 2, 3], TargetType.DISCRETE), + ([0.23, 0.94, 1.3, -1.02, 0.11], TargetType.CONTINUOUS), + ([1, 2, 0.2, 0.5, 1], TargetType.CONTINUOUS), + (np.array([0.2, 2.2, 1.2, -3.1]), TargetType.CONTINUOUS), + (np.array([[1, 2], [0, 2], [2, 1]]), TargetType.DISCRETE), + (np.array([[[1, 2], [1, 2]], [[1, 2], [1, 2]]]), TargetType.UNKNOWN), + ], + ids=[ + "none", + "ones", + "list str", + "list int", + "list float", + "mixed list", + "float array", + "multioutput", + "cube", + ], +) +def test_target_color_type(value, expected): + """ + Test the target_color_type helper function with a variety of data types + """ + assert target_color_type(value) == expected + + +@pytest.mark.parametrize( + "n_classes,expected", + [ + (2, TargetType.DISCRETE), + (4, TargetType.DISCRETE), + (MAX_DISCRETE_CLASSES, TargetType.DISCRETE), + (MAX_DISCRETE_CLASSES + 3, TargetType.CONTINUOUS), + ], + ids=["binary", "multiclass", "max discrete", "too many discrete"], +) +def test_binary_target_color_type(n_classes, expected): + """ + Test classification target color type + """ + _, y = make_classification(n_classes=n_classes, n_informative=n_classes + 2) + assert target_color_type(y) == expected + + +def test_regression_target_color_type(): + """ + Test regression target 
color type + """ + _, y = make_regression() + assert target_color_type(y) == TargetType.CONTINUOUS + + +@pytest.mark.parametrize( + "val", + [ + "auto", + "single", + "discrete", + "continuous", + "unknown", + TargetType.AUTO, + TargetType.SINGLE, + TargetType.DISCRETE, + TargetType.CONTINUOUS, + TargetType.UNKNOWN, + ], +) +def test_target_type_validate_valid(val): + try: + TargetType.validate(val) + except YellowbrickValueError: + pytest.fail("valid target type raised validation error") + + +@pytest.mark.parametrize( + "val", ["foo", 1, 3.14, "s", "DISCRETE", "CONTINUOUS", ["a", "b", "c"]] +) +def test_target_type_validate_invalid(val): + with pytest.raises(YellowbrickValueError, match="unknown target color type"): + TargetType.validate(val) + + +@pytest.mark.parametrize( + "val,expected", + [ + ("discrete", True), + (TargetType.DISCRETE, True), + ("DISCRETE", True), + (8, False), + ("FOO", False), + (3.14, False), + ("foo", False), + (["discrete"], False), + ({"discrete"}, False), + ], +) +def test_target_type_equals(val, expected): + assert (TargetType.DISCRETE == val) is expected diff --git a/tests/test_utils/test_timer.py b/tests/test_utils/test_timer.py index 31763b84c..eaa2f766c 100644 --- a/tests/test_utils/test_timer.py +++ b/tests/test_utils/test_timer.py @@ -4,7 +4,7 @@ # Author: ZJ Poh # Created: Tue Jul 17 21:11:11 2018 -0700 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt """ Tests for the stand alone timer functions in Yellowbrick utils. 
@@ -15,23 +15,22 @@ ########################################################################## import pytest -try: - from unittest import mock -except ImportError: - import mock +from unittest import mock from yellowbrick.utils.timer import * + ########################################################################## ## Helper Function Tests ########################################################################## + class TestTimer(object): """ Timer functions and utilities """ - @mock.patch('time.time', mock.Mock(side_effect=[1234.2, 1242.8])) + @mock.patch("time.time", mock.Mock(side_effect=[1234.2, 1242.8])) def test_timer(self): with Timer() as timer: pass @@ -39,11 +38,14 @@ def test_timer(self): assert timer.interval == pytest.approx(8.6) -@pytest.mark.parametrize('s,expected', [ - (1.01, '00:00:01.0100'), - (61.01, '00:01:01.0100'), - (3661.01, '01:01:01.0100'), - (360061.01, '100:01:01.0100') -]) +@pytest.mark.parametrize( + "s,expected", + [ + (1.01, "00:00:01.0100"), + (61.01, "00:01:01.0100"), + (3661.01, "01:01:01.0100"), + (360061.01, "100:01:01.0100"), + ], +) def test_human_readable_time(s, expected): assert human_readable_time(s) == expected diff --git a/tests/test_utils/test_types.py b/tests/test_utils/test_types.py index 17d5ff033..8ef0e5c15 100644 --- a/tests/test_utils/test_types.py +++ b/tests/test_utils/test_types.py @@ -41,8 +41,15 @@ from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV REGRESSORS = [ - SVR, DecisionTreeRegressor, MLPRegressor, LinearRegression, - RandomForestRegressor, Ridge, RidgeCV, Lasso, LassoCV, + SVR, + DecisionTreeRegressor, + MLPRegressor, + LinearRegression, + RandomForestRegressor, + Ridge, + RidgeCV, + Lasso, + LassoCV, ] # Import Classifiers @@ -55,8 +62,13 @@ from sklearn.naive_bayes import MultinomialNB, GaussianNB CLASSIFIERS = [ - SVC, DecisionTreeClassifier, MLPClassifier, LogisticRegression, - RandomForestClassifier, GradientBoostingClassifier, MultinomialNB, + SVC, + DecisionTreeClassifier, + 
MLPClassifier, + LogisticRegression, + RandomForestClassifier, + GradientBoostingClassifier, + MultinomialNB, GaussianNB, ] @@ -64,17 +76,13 @@ from sklearn.cluster import KMeans, MiniBatchKMeans from sklearn.cluster import AffinityPropagation, Birch -CLUSTERERS = [ - KMeans, MiniBatchKMeans, AffinityPropagation, Birch, -] +CLUSTERERS = [KMeans, MiniBatchKMeans, AffinityPropagation, Birch] # Import Decompositions from sklearn.decomposition import PCA from sklearn.decomposition import TruncatedSVD -DECOMPOSITIONS = [ - PCA, TruncatedSVD -] +DECOMPOSITIONS = [PCA, TruncatedSVD] # Import Transformers from sklearn.feature_extraction.text import TfidfVectorizer @@ -84,7 +92,10 @@ from sklearn.impute import SimpleImputer TRANSFORMERS = [ - DictVectorizer, QuantileTransformer, StandardScaler, SimpleImputer, + DictVectorizer, + QuantileTransformer, + StandardScaler, + SimpleImputer, TfidfVectorizer, ] @@ -92,16 +103,12 @@ from sklearn.pipeline import Pipeline, FeatureUnion -PIPELINES = [ - Pipeline, FeatureUnion, -] +PIPELINES = [Pipeline, FeatureUnion] # Import GridSearch Utilities from sklearn.model_selection import GridSearchCV, RandomizedSearchCV -SEARCH = [ - GridSearchCV, RandomizedSearchCV, -] +SEARCH = [GridSearchCV, RandomizedSearchCV] # Other Groups @@ -120,6 +127,7 @@ def obj_name(obj): ## Model type checking test cases ########################################################################## + class TestModelTypeChecking(object): """ Test model type checking utilities @@ -146,9 +154,9 @@ def test_is_estimator(self, model): obj = model() assert is_estimator(obj) - @pytest.mark.parametrize("cls", [ - list, dict, tuple, set, str, bool, int, float - ], ids=obj_name) + @pytest.mark.parametrize( + "cls", [list, dict, tuple, set, str, bool, int, float], ids=obj_name + ) def test_not_is_estimator(self, cls): """ Assert Python objects are not estimators @@ -166,10 +174,7 @@ def test_is_estimator_pipeline(self): assert is_estimator(Pipeline) assert 
is_estimator(FeatureUnion) - model = Pipeline([ - ('reduce_dim', PCA()), - ('linreg', LinearRegression()) - ]) + model = Pipeline([("reduce_dim", PCA()), ("linreg", LinearRegression())]) assert is_estimator(model) @@ -180,14 +185,18 @@ def test_is_estimator_search(self): assert is_estimator(GridSearchCV) assert is_estimator(RandomizedSearchCV) - model = GridSearchCV(SVR(), {'kernel': ['linear', 'rbf']}) + model = GridSearchCV(SVR(), {"kernel": ["linear", "rbf"]}) assert is_estimator(model) - @pytest.mark.parametrize("viz,params", [ - (Visualizer, {}), - (ScoreVisualizer, {'model': LinearRegression()}), - (ModelVisualizer, {'model': LogisticRegression()}) - ], ids=["Visualizer", "ScoreVisualizer", "ModelVisualizer"]) + @pytest.mark.parametrize( + "viz,params", + [ + (Visualizer, {}), + (ScoreVisualizer, {"model": LinearRegression()}), + (ModelVisualizer, {"model": LogisticRegression()}), + ], + ids=["Visualizer", "ScoreVisualizer", "ModelVisualizer"], + ) def test_is_estimator_visualizer(self, viz, params): """ Test that is_estimator works for Visualizers @@ -219,9 +228,9 @@ def test_is_regressor(self, model): obj = model() assert is_regressor(obj) - @pytest.mark.parametrize("model", - CLASSIFIERS+CLUSTERERS+TRANSFORMERS+DECOMPOSITIONS, - ids=obj_name) + @pytest.mark.parametrize( + "model", CLASSIFIERS + CLUSTERERS + TRANSFORMERS + DECOMPOSITIONS, ids=obj_name + ) def test_not_is_regressor(self, model): """ Test that is_regressor does not match non-regressor estimators @@ -239,10 +248,7 @@ def test_is_regressor_pipeline(self): assert not is_regressor(Pipeline) assert not is_regressor(FeatureUnion) - model = Pipeline([ - ('reduce_dim', PCA()), - ('linreg', LinearRegression()) - ]) + model = Pipeline([("reduce_dim", PCA()), ("linreg", LinearRegression())]) assert is_regressor(model) @@ -254,13 +260,17 @@ def test_is_regressor_search(self): assert is_regressor(GridSearchCV) assert is_regressor(RandomizedSearchCV) - model = GridSearchCV(SVR(), {'kernel': ['linear', 
'rbf']}) + model = GridSearchCV(SVR(), {"kernel": ["linear", "rbf"]}) assert is_regressor(model) - @pytest.mark.parametrize("viz,params", [ - (ScoreVisualizer, {'model': LinearRegression()}), - (ModelVisualizer, {'model': Ridge()}) - ], ids=["ScoreVisualizer", "ModelVisualizer"]) + @pytest.mark.parametrize( + "viz,params", + [ + (ScoreVisualizer, {"model": LinearRegression()}), + (ModelVisualizer, {"model": Ridge()}), + ], + ids=["ScoreVisualizer", "ModelVisualizer"], + ) def test_is_regressor_visualizer(self, viz, params): """ Test that is_regressor works on visualizers @@ -292,9 +302,9 @@ def test_is_classifier(self, model): obj = model() assert is_classifier(obj) - @pytest.mark.parametrize("model", - REGRESSORS+CLUSTERERS+TRANSFORMERS+DECOMPOSITIONS, - ids=obj_name) + @pytest.mark.parametrize( + "model", REGRESSORS + CLUSTERERS + TRANSFORMERS + DECOMPOSITIONS, ids=obj_name + ) def test_not_is_classifier(self, model): """ Test that is_classifier does not match non-classifier estimators @@ -312,10 +322,7 @@ def test_classifier_pipeline(self): assert not is_classifier(Pipeline) assert not is_classifier(FeatureUnion) - model = Pipeline([ - ('reduce_dim', PCA()), - ('linreg', LogisticRegression()) - ]) + model = Pipeline([("reduce_dim", PCA()), ("linreg", LogisticRegression())]) assert is_classifier(model) @@ -327,13 +334,17 @@ def test_is_classifier_search(self): assert is_classifier(GridSearchCV) assert is_classifier(RandomizedSearchCV) - model = GridSearchCV(SVC(), {'kernel': ['linear', 'rbf']}) + model = GridSearchCV(SVC(), {"kernel": ["linear", "rbf"]}) assert is_classifier(model) - @pytest.mark.parametrize("viz,params", [ - (ScoreVisualizer, {'model': MultinomialNB()}), - (ModelVisualizer, {'model': MLPClassifier()}) - ], ids=["ScoreVisualizer", "ModelVisualizer"]) + @pytest.mark.parametrize( + "viz,params", + [ + (ScoreVisualizer, {"model": MultinomialNB()}), + (ModelVisualizer, {"model": MLPClassifier()}), + ], + ids=["ScoreVisualizer", "ModelVisualizer"], + 
) def test_is_classifier_visualizer(self, viz, params): """ Test that is_classifier works on visualizers @@ -365,9 +376,9 @@ def test_is_clusterer(self, model): obj = model() assert is_clusterer(obj) - @pytest.mark.parametrize("model", - REGRESSORS+CLASSIFIERS+TRANSFORMERS+DECOMPOSITIONS, - ids=obj_name) + @pytest.mark.parametrize( + "model", REGRESSORS + CLASSIFIERS + TRANSFORMERS + DECOMPOSITIONS, ids=obj_name + ) def test_not_is_clusterer(self, model): """ Test that is_clusterer does not match non-clusterer estimators @@ -385,16 +396,13 @@ def test_clusterer_pipeline(self): assert not is_clusterer(Pipeline) assert not is_clusterer(FeatureUnion) - model = Pipeline([ - ('reduce_dim', PCA()), - ('kmeans', KMeans()) - ]) + model = Pipeline([("reduce_dim", PCA()), ("kmeans", KMeans())]) assert is_clusterer(model) - @pytest.mark.parametrize("viz,params", [ - (ModelVisualizer, {'model': KMeans()}) - ], ids=["ModelVisualizer"]) + @pytest.mark.parametrize( + "viz,params", [(ModelVisualizer, {"model": KMeans()})], ids=["ModelVisualizer"] + ) def test_is_clusterer_visualizer(self, viz, params): """ Test that is_clusterer works on visualizers @@ -426,8 +434,9 @@ def test_is_gridsearch(self, model): obj = model(SVC, {"C": [0.5, 1, 10]}) assert is_gridsearch(obj) - @pytest.mark.parametrize("model", - [MLPRegressor, MLPClassifier, SimpleImputer], ids=obj_name) + @pytest.mark.parametrize( + "model", [MLPRegressor, MLPClassifier, SimpleImputer], ids=obj_name + ) def test_not_is_gridsearch(self, model): """ Test that is_gridsearch does not match non grid searches @@ -448,10 +457,19 @@ def test_probabilistic_alias(self): """ assert isprobabilistic is is_probabilistic - @pytest.mark.parametrize("model", [ - MultinomialNB, GaussianNB, LogisticRegression, SVC, - RandomForestClassifier, GradientBoostingClassifier, MLPClassifier, - ], ids=obj_name) + @pytest.mark.parametrize( + "model", + [ + MultinomialNB, + GaussianNB, + LogisticRegression, + SVC, + RandomForestClassifier, + 
GradientBoostingClassifier, + MLPClassifier, + ], + ids=obj_name, + ) def test_is_probabilistic(self, model): """ Test that is_probabilistic works correctly @@ -462,10 +480,11 @@ def test_is_probabilistic(self, model): obj = model() assert is_probabilistic(obj) - @pytest.mark.parametrize("model", [ - MLPRegressor, SimpleImputer, StandardScaler, KMeans, - RandomForestRegressor, - ], ids=obj_name) + @pytest.mark.parametrize( + "model", + [MLPRegressor, SimpleImputer, StandardScaler, KMeans, RandomForestRegressor], + ids=obj_name, + ) def test_not_is_probabilistic(self, model): """ Test that is_probabilistic does not match non probablistic estimators @@ -481,6 +500,7 @@ def test_not_is_probabilistic(self, model): ## Data type checking test cases ########################################################################## + class TestDataTypeChecking(object): """ Test data type checking utilities @@ -501,22 +521,24 @@ def test_is_dataframe(self): """ Test that is_dataframe works correctly """ - df = pd.DataFrame([ - {'a': 1, 'b': 2.3, 'c': 'Hello'}, - {'a': 2, 'b': 3.14, 'c': 'World'}, - ]) + df = pd.DataFrame( + [{"a": 1, "b": 2.3, "c": "Hello"}, {"a": 2, "b": 3.14, "c": "World"}] + ) assert is_dataframe(df) - @pytest.mark.parametrize("obj", [ - np.array([ - (1,2.,'Hello'), (2,3.,"World")], - dtype=[('foo', 'i4'),('bar', 'f4'), ('baz', 'S10')] - ), - np.array([[1,2,3], [1,2,3]]), - [[1,2,3], [1,2,3]], - ], - ids=["structured array", "array", "list"]) + @pytest.mark.parametrize( + "obj", + [ + np.array( + [(1, 2.0, "Hello"), (2, 3.0, "World")], + dtype=[("foo", "i4"), ("bar", "f4"), ("baz", "S10")], + ), + np.array([[1, 2, 3], [1, 2, 3]]), + [[1, 2, 3], [1, 2, 3]], + ], + ids=["structured array", "array", "list"], + ) def test_not_is_dataframe(self, obj): """ Test that is_dataframe does not match non-dataframes @@ -542,15 +564,18 @@ def test_is_series(self): assert is_series(df) - @pytest.mark.parametrize("obj", [ - np.array([ - (1,2.,'Hello'), (2,3.,"World")], - 
dtype=[('foo', 'i4'),('bar', 'f4'), ('baz', 'S10')] - ), - np.array([1,2,3]), - [1, 2, 3], - ], - ids=["structured array", "array", "list"]) + @pytest.mark.parametrize( + "obj", + [ + np.array( + [(1, 2.0, "Hello"), (2, 3.0, "World")], + dtype=[("foo", "i4"), ("bar", "f4"), ("baz", "S10")], + ), + np.array([1, 2, 3]), + [1, 2, 3], + ], + ids=["structured array", "array", "list"], + ) def test_not_is_series(self, obj): """ Test that is_series does not match non-dataframes @@ -571,18 +596,16 @@ def test_is_structured_array(self): """ Test that is_structured_array works correctly """ - x = np.array([ - (1,2.,'Hello'), (2,3.,"World")], - dtype=[('foo', 'i4'),('bar', 'f4'), ('baz', 'S10')] + x = np.array( + [(1, 2.0, "Hello"), (2, 3.0, "World")], + dtype=[("foo", "i4"), ("bar", "f4"), ("baz", "S10")], ) assert is_structured_array(x) - @pytest.mark.parametrize("obj", [ - np.array([[1,2,3], [1,2,3]]), - [[1,2,3], [1,2,3]], - ], - ids=obj_name) + @pytest.mark.parametrize( + "obj", [np.array([[1, 2, 3], [1, 2, 3]]), [[1, 2, 3], [1, 2, 3]]], ids=obj_name + ) def test_not_is_structured_array(self, obj): """ Test that is_structured_array does not match non-structured-arrays diff --git a/tests/test_utils/test_wrapper.py b/tests/test_utils/test_wrapper.py index d2cab8848..83384fc7b 100644 --- a/tests/test_utils/test_wrapper.py +++ b/tests/test_utils/test_wrapper.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Mon May 22 09:25:52 2017 -0700 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # # ID: test_wrapper.py [b2ecd50] benjamin@bengfort.com $ @@ -17,23 +17,20 @@ ## Imports ########################################################################## +from unittest import mock + from yellowbrick.base import Visualizer from yellowbrick.utils.wrapper import * from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import GaussianNB -try: - from unittest import mock -except 
ImportError: - import mock - ########################################################################## ## Fixture ########################################################################## -class MockVisualizer(Visualizer): +class MockVisualizer(Visualizer): def __init__(self, ax=None, **kwargs): self.ax = ax self.fit = mock.MagicMock() @@ -51,7 +48,6 @@ def ax(self, val): class WrappedEstimator(MockVisualizer, Wrapper): - def __init__(self, **kwargs): self.estimator = mock.MagicMock(spec=MultinomialNB()) @@ -62,13 +58,14 @@ def draw(self): return True def foo(self, a, b): - return a+b + return a + b ########################################################################## ## Wrapper Test Case ########################################################################## + class TestWrapper(object): """ Test the object Wrapper mixin utility @@ -82,7 +79,7 @@ def test_wrapper_methods(self): # Assert that all the wrapper methods are called assert obj.draw() - assert obj.foo(2,2) == 4 + assert obj.foo(2, 2) == 4 assert obj.estimator is not None def test_super_methods(self): diff --git a/yellowbrick/__init__.py b/yellowbrick/__init__.py index c2d491371..964afb995 100644 --- a/yellowbrick/__init__.py +++ b/yellowbrick/__init__.py @@ -1,10 +1,11 @@ # yellowbrick # A suite of visual analysis and diagnostic tools for machine learning. 
# -# Author: Rebecca Bilbro +# Author: Rebecca Bilbro +# Author: Benjamin Bengfort # Created: Wed May 18 10:46:33 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: __init__.py [0c5ba04] benjamin@bengfort.com $ @@ -20,6 +21,7 @@ # Capture the original matplotlib rcParams import matplotlib as mpl + _orig_rc_params = mpl.rcParams.copy() # Import the version number at the top level @@ -35,6 +37,7 @@ from .anscombe import anscombe from .datasaurus import datasaurus from .classifier import ROCAUC, ClassBalance, ClassificationScoreVisualizer + # from .classifier import crplot, rocplot # from .regressor import peplot, residuals_plot @@ -43,7 +46,7 @@ ## Set default aesthetics ########################################################################## -set_aesthetic() # NOTE: modifies mpl.rcParams +set_aesthetic() # NOTE: modifies mpl.rcParams ########################################################################## diff --git a/yellowbrick/anscombe.py b/yellowbrick/anscombe.py index 3406aeba7..1d72d5bf4 100644 --- a/yellowbrick/anscombe.py +++ b/yellowbrick/anscombe.py @@ -1,10 +1,10 @@ # yellowbrick.anscombe # Plots Anscombe's Quartet as an illustration of the importance of visualization. 
# -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Wed May 18 11:38:25 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: anscombe.py [0bfa366] benjamin@bengfort.com $ @@ -29,22 +29,30 @@ ########################################################################## ANSCOMBE = [ - np.array([ - [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0], - [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68] - ]), - np.array([ - [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0], - [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74] - ]), - np.array([ - [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0], - [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73] - ]), - np.array([ - [8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 19.0, 8.0, 8.0, 8.0], - [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89] - ]) + np.array( + [ + [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0], + [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68], + ] + ), + np.array( + [ + [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0], + [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74], + ] + ), + np.array( + [ + [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0], + [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73], + ] + ), + np.array( + [ + [8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 19.0, 8.0, 8.0, 8.0], + [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89], + ] + ), ] @@ -52,7 +60,7 @@ def anscombe(): """ Creates 2x2 grid plot of the 4 anscombe datasets for illustration.
""" - _, ((axa, axb), (axc, axd)) = plt.subplots(2, 2, sharex='col', sharey='row') + _, ((axa, axb), (axc, axd)) = plt.subplots(2, 2, sharex="col", sharey="row") colors = get_color_cycle() for arr, ax, color in zip(ANSCOMBE, (axa, axb, axc, axd), colors): @@ -72,6 +80,6 @@ def anscombe(): return (axa, axb, axc, axd) -if __name__ == '__main__': +if __name__ == "__main__": anscombe() plt.show() diff --git a/yellowbrick/base.py b/yellowbrick/base.py index 4652b1d47..c329b86db 100644 --- a/yellowbrick/base.py +++ b/yellowbrick/base.py @@ -1,11 +1,11 @@ # yellowbrick.base # Abstract base classes and interface for Yellowbrick. # -# Author: Rebecca Bilbro -# Author: Benjamin Bengfort +# Author: Rebecca Bilbro +# Author: Benjamin Bengfort # Created: Fri Jun 03 10:20:59 2016 -0700 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2019 The scikit-yb developers # For license information, see LICENSE.txt # # ID: base.py [4a59c49] benjamin@bengfort.com $ @@ -14,19 +14,24 @@ Abstract base classes and interface for Yellowbrick. 
""" -import matplotlib.pyplot as plt import math +import warnings +import matplotlib.pyplot as plt -from .utils.wrapper import Wrapper from sklearn.base import BaseEstimator -from .utils import get_model_name, isestimator -from sklearn.model_selection import cross_val_predict as cvp -from .exceptions import YellowbrickValueError, YellowbrickTypeError + +from yellowbrick.utils import get_model_name +from yellowbrick.utils.wrapper import Wrapper +from yellowbrick.utils.helpers import check_fitted +from yellowbrick.exceptions import YellowbrickWarning +from yellowbrick.exceptions import YellowbrickValueError, YellowbrickTypeError + ########################################################################## ## Base class hierarchy ########################################################################## + class Visualizer(BaseEstimator): """ The root of the visual object hierarchy that defines how yellowbrick @@ -43,6 +48,10 @@ class Visualizer(BaseEstimator): The axis to plot the figure on. If None is passed in the current axes will be used (or generated if required). + fig : matplotlib Figure, default: None + The figure to plot the Visualizer on. If None is passed in the current + plot will be used (or generated if required). + kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. Optional keyword @@ -60,26 +69,28 @@ class Visualizer(BaseEstimator): ----- Visualizers are objects that learn from data (e.g. estimators), therefore they must be ``fit()`` before they can be drawn or used. Visualizers also - maintain a reference to an ``ax`` object, a matplotlib Axes where the - figures are drawn and rendered. + maintain a reference to an ``ax`` object, a Matplotlib Axes where the + figures are drawn and rendered, as well as to a ``fig`` object, a Matplotlib + Figure on which the Visualizer will be plotted. 
""" - def __init__(self, ax=None, **kwargs): + def __init__(self, ax=None, fig=None, **kwargs): self.ax = ax - self.size = kwargs.pop('size', None) - self.color = kwargs.pop('color', None) - self.title = kwargs.pop('title', None) + self.fig = fig + self.size = kwargs.pop("size", None) + self.color = kwargs.pop("color", None) + self.title = kwargs.pop("title", None) - ##//////////////////////////////////////////////////////////////////// + ## //////////////////////////////////////////////////////////////////// ## Primary Visualizer Properties - ##//////////////////////////////////////////////////////////////////// + ## //////////////////////////////////////////////////////////////////// @property def ax(self): """ The matplotlib axes that the visualizer draws upon (can also be a grid - of multiple axes objects). The visualizer automatically creates an - axes for the user if one has not been specified. + of multiple axes objects). The visualizer uses :func:`matplotlib.pyplot.gca` + to create an axes for the user if one has not been specified. """ if not hasattr(self, "_ax") or self._ax is None: self._ax = plt.gca() @@ -89,6 +100,21 @@ def ax(self): def ax(self, ax): self._ax = ax + @property + def fig(self): + """ + The matplotlib fig that the visualizer draws upon. The visualizer uses + the matplotlib method :func:`matplotlib.pyplot.gcf` to create a figure for + the user if one has not been specified. + """ + if not hasattr(self, "_fig") or self._fig is None: + self._fig = plt.gcf() + return self._fig + + @fig.setter + def fig(self, fig): + self._fig = fig + @property def size(self): """ @@ -96,23 +122,21 @@ def size(self): the user provided size if available. 
""" if not hasattr(self, "_size") or self._size is None: - fig = plt.gcf() - self._size = fig.get_size_inches()*fig.dpi + self._size = self.fig.get_size_inches() * self.fig.dpi return self._size @size.setter def size(self, size): self._size = size if self._size is not None: - fig = plt.gcf() width, height = size - width_in_inches = width / fig.get_dpi() - height_in_inches = height / fig.get_dpi() - fig.set_size_inches(width_in_inches, height_in_inches) + width_in_inches = width / self.fig.get_dpi() + height_in_inches = height / self.fig.get_dpi() + self.fig.set_size_inches(width_in_inches, height_in_inches) - ##//////////////////////////////////////////////////////////////////// + ## //////////////////////////////////////////////////////////////////// ## Estimator interface - ##//////////////////////////////////////////////////////////////////// + ## //////////////////////////////////////////////////////////////////// def fit(self, X, y=None, **kwargs): """ @@ -142,9 +166,9 @@ def fit(self, X, y=None, **kwargs): """ return self - ##//////////////////////////////////////////////////////////////////// + ## //////////////////////////////////////////////////////////////////// ## Visualizer interface - ##//////////////////////////////////////////////////////////////////// + ## //////////////////////////////////////////////////////////////////// def draw(self, **kwargs): """ @@ -160,9 +184,7 @@ def draw(self, **kwargs): generic keyword arguments. """ - raise NotImplementedError( - "Visualizers must implement a drawing interface." - ) + raise NotImplementedError("Visualizers must implement a drawing interface.") def finalize(self, **kwargs): """ @@ -206,7 +228,14 @@ def poof(self, outpath=None, clear_figure=False, **kwargs): primarily called by the user to render the visualization. 
""" # Ensure that draw has been called - if self._ax is None: return + if self._ax is None: + warn_message = ( + "{} does not have a reference to a matplotlib.Axes " + "the figure may not render as expected!" + ) + warnings.warn( + warn_message.format(self.__class__.__name__), YellowbrickWarning + ) # Finalize the figure self.finalize() @@ -217,11 +246,14 @@ def poof(self, outpath=None, clear_figure=False, **kwargs): plt.show() if clear_figure: - plt.gcf().clear() + self.fig.clear() + + # Return ax to ensure display in notebooks + return self.ax - ##//////////////////////////////////////////////////////////////////// + ## //////////////////////////////////////////////////////////////////// ## Helper Functions - ##//////////////////////////////////////////////////////////////////// + ## //////////////////////////////////////////////////////////////////// def set_title(self, title=None): """ @@ -241,6 +273,7 @@ def set_title(self, title=None): ## Model Visualizers ########################################################################## + class ModelVisualizer(Visualizer, Wrapper): """ The ModelVisualizer class wraps a Scikit-Learn estimator (usually a @@ -251,14 +284,26 @@ class ModelVisualizer(Visualizer, Wrapper): Parameters ---------- - model : Estimator + model : a Scikit-Learn estimator A Scikit-Learn estimator to wrap functionality for, usually regressor, - classifier, or clusterer predictive model. + classifier, or clusterer predictive model. If the estimator is not fitted, + it is fit when the visualizer is fitted, unless otherwise specified by + ``is_fitted``. ax : matplotlib Axes, default: None The axis to plot the figure on. If None is passed in the current axes will be used (or generated if required). + fig : matplotlib Figure, default: None + The figure to plot the Visualizer on. If None is passed in the current + plot will be used (or generated if required). 
+ + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. + kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined by other Visualizers. @@ -268,21 +313,14 @@ class ModelVisualizer(Visualizer, Wrapper): Model visualizers can wrap either fitted or unfitted models. """ - def __init__(self, model, ax=None, **kwargs): - """ - Parameters - ---------- - ax: matplotlib axes - the axis to plot the figure on. - - kwargs: dict - keyword arguments for Scikit-Learn model - """ + def __init__(self, model, ax=None, fig=None, is_fitted="auto", **kwargs): self.estimator = model + self.is_fitted = is_fitted self.name = get_model_name(self.estimator) + # Initialize base classes independently Wrapper.__init__(self, self.estimator) - Visualizer.__init__(self, ax=ax, **kwargs) + Visualizer.__init__(self, ax=ax, fig=fig, **kwargs) def fit(self, X, y=None, **kwargs): """ @@ -300,7 +338,7 @@ def fit(self, X, y=None, **kwargs): kwargs: dict Keyword arguments passed to the drawing functionality or to the - Scikit-Learn API. See visualizer specific details for how to use + scikit-learn API. See visualizer specific details for how to use the kwargs to modify the visualization or fitting process. Returns @@ -308,7 +346,8 @@ def fit(self, X, y=None, **kwargs): self : visualizer The fit method must always return self to support pipelines. 
""" - self.estimator.fit(X, y) + if not check_fitted(self.estimator, is_fitted_by=self.is_fitted): + self.estimator.fit(X, y, **kwargs) return self @@ -316,6 +355,7 @@ def fit(self, X, y=None, **kwargs): ## Score Visualizers ########################################################################## + class ScoreVisualizer(ModelVisualizer): """ The ScoreVisualizer reports the performance of a Scikit-Learn estimator @@ -326,24 +366,30 @@ class ScoreVisualizer(ModelVisualizer): Parameters ---------- - model : Estimator + model : a Scikit-Learn estimator A Scikit-Learn estimator to wrap functionality for, usually regressor, - classifier, or clusterer predictive model. + classifier, or clusterer predictive model. If the estimator is not fitted, + it is fit when the visualizer is fitted, unless otherwise specified by + ``is_fitted``. ax : matplotlib Axes, default: None The axis to plot the figure on. If None is passed in the current axes will be used (or generated if required). + fig : matplotlib Figure, default: None + The figure to plot the Visualizer on. If None is passed in the current + plot will be used (or generated if required). + + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. + kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizersself. - Returns - ------- - score : float or array-like - Returns the score of the underlying model, which is model-speciifc, - e.g. accuracy for classifiers, R2 for regressors, etc. - Notes ----- Score visualizers can wrap either fitted or unfitted models. 
@@ -353,76 +399,20 @@ def score(self, X, y, **kwargs): """ The primary entry point for score visualizers is the score method, which makes predictions based on X and scores them relative to y. - """ - raise NotImplementedError( - "ScoreVisualizer subclasses should implement score" - ) + Returns + ------- + score : float or array-like + Returns the score of the underlying model, which is model-specific, + e.g. accuracy for classifiers, R2 for regressors, etc. + """ + raise NotImplementedError("ScoreVisualizer subclasses should implement score") ########################################################################## -## Multiple Models and Mixins +## Multiple Models ########################################################################## -class MultiModelMixin(object): - """ - Does predict for each of the models and generates subplots. - """ - - def __init__(self, models, ax=None, **kwargs): - # Ensure models is a collection, if it's a single estimator then we - # wrap it in a list so that the API doesn't break during render. - """ - These parameters can be influenced later on in the visualization - process, but can and should be set as early as possible. - - Parameters - ---------- - models: Scikit-Learn estimator - the Scikit-Learn models being compared with each other. - - kwargs: dict - keyword arguments. - """ - # TODO: How to handle the axes in this mixin? - self.ax = ax - - if all(isestimator, models): - models = [models] - - # Keep track of the models - self.models = models - self.names = kwargs.pop('names', list(map(get_model_name, models))) - - def generate_subplots(self): - """ - Generates the subplots for the number of given models. - """ - _, axes = plt.subplots(len(self.models), sharex=True, sharey=True) - return axes - - def predict(self, X, y): - """ - Returns a generator containing the predictions for each of the - internal models (using cross_val_predict and a CV=12). 
- - Parameters - ---------- - - X : ndarray or DataFrame of shape n x m - A matrix of n instances with m features - - y : ndarray or Series of length n - An array or series of target or class values - - kwargs: dict - keyword arguments passed to Scikit-Learn API. - - """ - for model in self.models: - yield cvp(model, X, y, cv=12) - - class VisualizerGrid(Visualizer): """ @@ -445,12 +435,13 @@ class VisualizerGrid(Visualizer): visualizers specified in the visualizers list. axarr: matplotlib.axarr, default: None. - If you want to put the plot onto an existing axarr, specify it here. Otherwise a new - one will be created. + If you want to put the plot onto an existing axarr, specify it here. Otherwise + a new one will be created. kwargs : additional keyword arguments, default: None - Any additional keyword arguments will be passed on to the fit() method and therefore - passed on to the fit() method of the wrapped estimators, if applicable. Otherwise ignored. + Any additional keyword arguments will be passed on to the fit() method and + therefore passed on to the fit() method of the wrapped estimators, if + applicable. Otherwise ignored. 
Examples -------- @@ -465,30 +456,34 @@ class VisualizerGrid(Visualizer): >>> mv.score(X_test, y_test) >>> mv.poof() """ - def __init__(self, visualizers = [], nrows = None, ncols = None, axarr = None, **kwargs): - #Class static params + + def __init__(self, visualizers=[], nrows=None, ncols=None, axarr=None, **kwargs): + # Class static params self.SUBPLOT_DEFAULT_PIXELS = 400 - #Allocate passed parameters + # Allocate passed parameters self._visualizers = visualizers plotcount = len(visualizers) - if nrows == None and ncols == None: - #TODO: enhancement would be to also allow a 2-d array of visualizers instead of just a 1-d left-to-right + top-to-bottom list + if nrows is None and ncols is None: + # TODO: enhancement would be to also allow a 2-d array of visualizers + # instead of just a 1-d left-to-right + top-to-bottom list self.ncols = 1 self.nrows = plotcount - elif ncols == None: + elif ncols is None: self.nrows = nrows self.ncols = int(math.ceil(plotcount / self.nrows)) - elif nrows == None: + elif nrows is None: self.ncols = ncols self.nrows = int(math.ceil(plotcount / self.ncols)) else: - raise YellowbrickValueError("You can only specify either nrows or ncols, \ - the other will be calculated based on the length of the list of visualizers.") - + raise YellowbrickValueError( + "You can only specify either nrows or ncols, \ + the other will be calculated based on the length of the list of \ + visualizers." 
+ ) - if axarr == None: - fig, axarr = plt.subplots(self.nrows, self.ncols, squeeze = False) + if axarr is None: + fig, axarr = plt.subplots(self.nrows, self.ncols, squeeze=False) self.axarr = axarr @@ -497,10 +492,10 @@ def __init__(self, visualizers = [], nrows = None, ncols = None, axarr = None, * for col in range(self.ncols): try: self.visualizers[idx].ax = self.axarr[row, col] - #If len(visualizers) isn't evenly divisibly by rows/columns, - #we want to create the illusion of empty space by hiding the axis + # If len(visualizers) isn't evenly divisibly by rows/columns, + # we want to create the illusion of empty space by hiding the axis except IndexError: - self.axarr[row,col].axis('off') + self.axarr[row, col].axis("off") idx += 1 @@ -511,49 +506,53 @@ def visualizers(self): return self._visualizers @visualizers.setter - def visualizers(self,value): - raise AttributeError("Visualizers list can only be set during class instantiation.") + def visualizers(self, value): + raise AttributeError( + "Visualizers list can only be set during class instantiation." 
+ ) @property def ax(self): - """ + """ Override Visualizer.ax to return the current axis """ - return plt.gca() + return plt.gca() @ax.setter def ax(self, ax): - raise YellowbrickTypeError("cannot set new axes objects on multiple visualizers") - + raise YellowbrickTypeError( + "cannot set new axes objects on multiple visualizers" + ) - def fit(self,X,y,**kwargs): + def fit(self, X, y, **kwargs): for vz in self.visualizers: - vz.fit(X,y,**kwargs) + vz.fit(X, y, **kwargs) return self - def score(self,X,y): + def score(self, X, y): for idx in range(len(self.visualizers)): - self.visualizers[idx].score(X,y) + self.visualizers[idx].score(X, y) return self def poof(self, outpath=None, clear_figure=False, **kwargs): - if self.axarr is None: return + if self.axarr is None: + return - #Finalize all visualizers + # Finalize all visualizers for idx in range(len(self.visualizers)): self.visualizers[idx].finalize() - #Choose a reasonable default size if the user has not manually specified one + # Choose a reasonable default size if the user has not manually specified one # self.size() uses pixels rather than matplotlib's default of inches if not hasattr(self, "_size") or self._size is None: self._width = self.SUBPLOT_DEFAULT_PIXELS * self.ncols self._height = self.SUBPLOT_DEFAULT_PIXELS * self.nrows - self.size = (self._width,self._height); + self.size = (self._width, self._height) if outpath is not None: plt.savefig(outpath, **kwargs) @@ -562,3 +561,6 @@ def poof(self, outpath=None, clear_figure=False, **kwargs): if clear_figure: plt.gcf().clear() + + # Return Axes array to ensure poof works in notebooks + return self.axarr diff --git a/yellowbrick/bestfit.py b/yellowbrick/bestfit.py index 32b401fa8..bac9e6fbe 100644 --- a/yellowbrick/bestfit.py +++ b/yellowbrick/bestfit.py @@ -1,10 +1,10 @@ # yellowbrick.bestfit # Uses Scikit-Learn to compute a best fit function, then draws it in the plot. 
# -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Sun Jun 26 17:27:08 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: bestfit.py [56236f3] benjamin@bengfort.com $ @@ -35,18 +35,19 @@ ########################################################################## # Names of the various estimator functions -LINEAR = 'linear' -QUADRATIC = 'quadratic' -EXPONENTIAL = 'exponential' -LOG = 'log' -SELECT_BEST = 'select_best' +LINEAR = "linear" +QUADRATIC = "quadratic" +EXPONENTIAL = "exponential" +LOG = "log" +SELECT_BEST = "select_best" ########################################################################## ## Draw Line of Best Fit ########################################################################## -def draw_best_fit(X, y, ax, estimator='linear', **kwargs): + +def draw_best_fit(X, y, ax, estimator="linear", **kwargs): """ Uses Scikit-Learn to fit a model to X and y then uses the resulting model to predict the curve based on the X values. This curve is drawn to the ax @@ -95,11 +96,11 @@ def draw_best_fit(X, y, ax, estimator='linear', **kwargs): # Estimators are the types of best fit lines that can be drawn. estimators = { - LINEAR: fit_linear, # Uses OLS to fit the regression - QUADRATIC: fit_quadratic, # Uses OLS with Polynomial order 2 - EXPONENTIAL: fit_exponential, # Not implemented yet - LOG: fit_log, # Not implemented yet - SELECT_BEST: fit_select_best, # Selects the best fit via MSE + LINEAR: fit_linear, # Uses OLS to fit the regression + QUADRATIC: fit_quadratic, # Uses OLS with Polynomial order 2 + EXPONENTIAL: fit_exponential, # Not implemented yet + LOG: fit_log, # Not implemented yet + SELECT_BEST: fit_select_best, # Selects the best fit via MSE } # Check to make sure that a correct estimator value was passed in.
@@ -115,10 +116,11 @@ def draw_best_fit(X, y, ax, estimator='linear', **kwargs): # Ensure that X and y are the same length if len(X) != len(y): - raise YellowbrickValueError(( - "X and y must have same length:" - " X len {} doesn't match y len {}!" - ).format(len(X), len(y))) + raise YellowbrickValueError( + ( + "X and y must have same length:" " X len {} doesn't match y len {}!" + ).format(len(X), len(y)) + ) # Ensure that X and y are np.arrays X = np.array(X) @@ -127,7 +129,7 @@ def draw_best_fit(X, y, ax, estimator='linear', **kwargs): # Verify that X is a two dimensional array for Scikit-Learn esitmators # and that its dimensions are (n, 1) where n is the number of rows. if X.ndim < 2: - X = X[:,np.newaxis] # Reshape X into the correct dimensions + X = X[:, np.newaxis] # Reshape X into the correct dimensions if X.ndim > 2: raise YellowbrickValueError( @@ -144,8 +146,8 @@ def draw_best_fit(X, y, ax, estimator='linear', **kwargs): model = estimator(X, y) # Set the color if not passed in. - if 'c' not in kwargs and 'color' not in kwargs: - kwargs['color'] = LINE_COLOR + if "c" not in kwargs and "color" not in kwargs: + kwargs["color"] = LINE_COLOR # Get the current working axes ax = ax or plt.gca() @@ -153,7 +155,7 @@ def draw_best_fit(X, y, ax, estimator='linear', **kwargs): # Plot line of best fit onto the axes that were passed in. # TODO: determine if xlim or X.min(), X.max() are better params xr = np.linspace(*ax.get_xlim(), num=100) - ax.plot(xr, model.predict(xr[:,np.newaxis]), **kwargs) + ax.plot(xr, model.predict(xr[:, np.newaxis]), **kwargs) return ax @@ -161,12 +163,13 @@ def draw_best_fit(X, y, ax, estimator='linear', **kwargs): ## Estimator Functions ########################################################################## + def fit_select_best(X, y): """ Selects the best fit of the estimators already implemented by choosing the model with the smallest mean square error metric for the trained values. 
""" - models = [fit(X,y) for fit in [fit_linear, fit_quadratic]] + models = [fit(X, y) for fit in [fit_linear, fit_quadratic]] errors = map(lambda model: mse(y, model.predict(X)), models) return min(zip(models, errors), key=itemgetter(1))[0] @@ -185,9 +188,7 @@ def fit_quadratic(X, y): """ Uses OLS with Polynomial order 2. """ - model = make_pipeline( - PolynomialFeatures(2), linear_model.LinearRegression() - ) + model = make_pipeline(PolynomialFeatures(2), linear_model.LinearRegression()) model.fit(X, y) return model @@ -210,6 +211,7 @@ def fit_log(X, y): ## Draw 45 Degree Line ########################################################################## + def draw_identity_line(ax=None, dynamic=True, **kwargs): """ Draws a 45 degree identity line such that y=x for all points within the @@ -249,15 +251,15 @@ def draw_identity_line(ax=None, dynamic=True, **kwargs): ax = ax or plt.gca() # Define the standard line color - if 'c' not in kwargs and 'color' not in kwargs: - kwargs['color'] = LINE_COLOR + if "c" not in kwargs and "color" not in kwargs: + kwargs["color"] = LINE_COLOR # Define the standard opacity - if 'alpha' not in kwargs: - kwargs['alpha'] = 0.5 + if "alpha" not in kwargs: + kwargs["alpha"] = 0.5 # Draw the identity line - identity, = ax.plot([],[], **kwargs) + identity, = ax.plot([], [], **kwargs) # Define the callback def callback(ax): @@ -266,35 +268,35 @@ def callback(ax): ylim = ax.get_ylim() # Set the bounding range of the line - data = ( - max(xlim[0], ylim[0]), min(xlim[1], ylim[1]) - ) + data = (max(xlim[0], ylim[0]), min(xlim[1], ylim[1])) identity.set_data(data, data) # Register the callback and return callback(ax) if dynamic: - ax.callbacks.connect('xlim_changed', callback) - ax.callbacks.connect('ylim_changed', callback) + ax.callbacks.connect("xlim_changed", callback) + ax.callbacks.connect("ylim_changed", callback) return ax -if __name__ == '__main__': +if __name__ == "__main__": import os import pandas as pd - path = 
os.path.join(os.path.dirname(__file__), "..", "examples", "data", "concrete.xls") + path = os.path.join( + os.path.dirname(__file__), "..", "examples", "data", "concrete.xls" + ) if not os.path.exists(path): raise Exception("Could not find path for testing") - xkey = 'Fine Aggregate (component 7)(kg in a m^3 mixture)' - ykey = 'Coarse Aggregate (component 6)(kg in a m^3 mixture)' + xkey = "Fine Aggregate (component 7)(kg in a m^3 mixture)" + ykey = "Coarse Aggregate (component 6)(kg in a m^3 mixture)" data = pd.read_excel(path) fig, axe = plt.subplots() axe.scatter(data[xkey], data[ykey]) - draw_best_fit(data[xkey], data[ykey], axe, 'select_best') + draw_best_fit(data[xkey], data[ykey], axe, "select_best") plt.show() diff --git a/yellowbrick/classifier/__init__.py b/yellowbrick/classifier/__init__.py index ad7e932ac..d0c5a151c 100644 --- a/yellowbrick/classifier/__init__.py +++ b/yellowbrick/classifier/__init__.py @@ -1,13 +1,13 @@ # yellowbrick.classifier # Visualizations related to evaluating Scikit-Learn classification models # -# Author: Rebecca Bilbro -# Author: Benjamin Bengfort +# Author: Rebecca Bilbro +# Author: Benjamin Bengfort # Author: Neal Humphrey -# Author: Jason Keung +# Author: Jason Keung # Created: Wed May 18 12:39:40 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: __init__.py [5eee25b] benjamin@bengfort.com $ diff --git a/yellowbrick/classifier/base.py b/yellowbrick/classifier/base.py index d581b2f48..508291933 100644 --- a/yellowbrick/classifier/base.py +++ b/yellowbrick/classifier/base.py @@ -1,12 +1,12 @@ # yellowbrick.classifier.base # API for classification visualizer hierarchy. 
# -# Author: Rebecca Bilbro -# Author: Benjamin Bengfort +# Author: Rebecca Bilbro +# Author: Benjamin Bengfort # Author: Neal Humphrey # Created: Wed May 18 12:39:40 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: base.py [5388065] neal@nhumphrey.com $ @@ -19,120 +19,313 @@ ## Imports ########################################################################## +import warnings import numpy as np -from ..utils import isclassifier -from ..base import ScoreVisualizer -from ..style.palettes import color_palette -from ..exceptions import YellowbrickTypeError +from yellowbrick.utils import isclassifier +from yellowbrick.base import ScoreVisualizer +from yellowbrick.style.palettes import color_palette +from yellowbrick.exceptions import NotFitted, YellowbrickWarning +from yellowbrick.exceptions import YellowbrickTypeError, ModelError + +from sklearn.preprocessing import LabelEncoder ########################################################################## ## Base Classification Visualizer ########################################################################## + class ClassificationScoreVisualizer(ScoreVisualizer): + """Base class for classifier model selection. - def __init__(self, model, ax=None, classes=None, **kwargs): - """ - Check to see if model is an instance of a classifer. - Should return an error if it isn't. + The ClassificationScoreVisualizer wraps a classifier to produce a + visualization when the score method is called, usually to allow the user + to effectively compare the performance between models. - .. todo:: document this class. - .. tood:: accept as input classes as all visualizers need this. - """ + The base class provides helper functionality to ensure that classification + visualizers are able to correctly identify and encode classes with human + readable labels and to map colors to the classes if required. 
+ + Parameters + ---------- + model : estimator + A scikit-learn estimator that should be a classifier. If the model is + not a classifier, an exception is raised. If the internal model is not + fitted, it is fit when the visualizer is fitted, unless otherwise specified + by ``is_fitted``. + + ax : matplotlib Axes, default: None + The axes to plot the figure on. If not specified the current axes will be + used (or generated if required). + + fig : matplotlib Figure, default: None + The figure to plot the Visualizer on. If None is passed in the current + plot will be used (or generated if required). + + classes : list of str, default: None + The class labels to use for the legend ordered by the index of the sorted + classes discovered in the ``fit()`` method. Specifying classes in this + manner is used to change the class names to a more specific format or + to label encoded integer classes. Some visualizers may also use this + field to filter the visualization for specific classes. For more advanced + usage specify an encoder rather than class labels. + + encoder : dict or LabelEncoder, default: None + A mapping of classes to human readable labels. Often there is a mismatch + between desired class labels and those contained in the target variable + passed to ``fit()`` or ``score()``. The encoder disambiguates this mismatch + ensuring that classes are labeled correctly in the visualization. + + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. + + force_model : bool, default: False + Do not check to ensure that the underlying estimator is a classifier. This + will prevent an exception when the visualizer is initialized but may result + in unexpected or unintended behavior. 
+ + kwargs : dict + Keyword arguments passed to the visualizer base classes. + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + The class labels observed while fitting. + + class_counts_ : ndarray of shape (n_classes,) + Number of samples encountered for each class during fitting. + + score_ : float + An evaluation metric of the classifier on test data produced when + ``score()`` is called. This metric is between 0 and 1 -- higher scores are + generally better. For classifiers, this score is usually accuracy, but + ensure you check the underlying model for more details about the metric. + """ + + def __init__( + self, + model, + ax=None, + fig=None, + classes=None, + encoder=None, + is_fitted="auto", + force_model=False, + **kwargs, + ): # A bit of type checking - if not isclassifier(model): + if not force_model and not isclassifier(model): raise YellowbrickTypeError( "This estimator is not a classifier; " "try a regression or clustering score visualizer instead!" - ) + ) # Initialize the super method. - super(ClassificationScoreVisualizer, self).__init__(model, ax=ax, **kwargs) - - # Convert to array if necessary to match estimator.classes_ - if classes is not None: - classes = np.array(classes) - - # Set up classifier score visualization properties - if classes is not None: - n_colors = len(classes) - else: - n_colors = None + super(ClassificationScoreVisualizer, self).__init__( + model, ax=ax, fig=fig, is_fitted=is_fitted, **kwargs + ) - self.colors = color_palette(kwargs.pop('colors', None), n_colors) - self.classes_ = classes + self.set_params(classes=classes, encoder=encoder, force_model=force_model) @property - def classes_(self): + def colors(self): """ - Proxy property to smartly access the classes from the estimator or - stored locally on the score visualizer for visualization. + Returns ``_colors`` if it exists, otherwise computes a categorical color + per class based on the matplotlib color cycle. 
If the visualizer is not + fitted, raises a NotFitted exception. + + If subclasses require users to choose colors or have specialized color + handling, they should set ``_colors`` on init or during fit. + + Notes + ----- + Because this is a property, this docstring is for developers only. """ - if self.__classes is None: - try: - return self.estimator.classes_ - except AttributeError: - return None - return self.__classes + if not hasattr(self, "_colors"): + if not hasattr(self, "classes_"): + raise NotFitted("cannot determine colors before fit") - @classes_.setter - def classes_(self, value): - self.__classes = value + # TODO: replace with resolve_colors + self._colors = color_palette(None, len(self.classes_)) + return self._colors def fit(self, X, y=None, **kwargs): """ + Fit the visualizer to the specified data. + Parameters ---------- - X : ndarray or DataFrame of shape n x m A matrix of n instances with m features y : ndarray or Series of length n An array or series of target or class values - kwargs: keyword arguments passed to Scikit-Learn API. 
- Returns ------- self : instance Returns the instance of the classification score visualizer - """ - # Fit the inner estimator - self.estimator.fit(X, y) + # Super fits the wrapped estimator + super(ClassificationScoreVisualizer, self).fit(X, y, **kwargs) + + # Extract the classes and the class counts from the target + self.classes_, self.class_counts_ = np.unique(y, return_counts=True) - # Extract the classes from the estimator - if self.classes_ is None: - self.classes_ = self.estimator.classes_ + # Ensure the classes are aligned with the estimator + # If they are not aligned, ignore class counts and issue a warning + if hasattr(self.estimator, "classes_"): + if not np.array_equal(self.classes_, self.estimator.classes_): + self.classes_ = self.estimator.classes_ + self.class_counts_ = None + + # Decode classes to human readable labels specified by the user + self.classes_ = self._decode_labels(self.classes_) # Always return self from fit return self - - def score(self, X, y, **kwargs): + def score(self, X, y): """ The score function is the hook for visual interaction. Pass in test data and the visualizer will create predictions on the data and evaluate them with respect to the test values. The evaluation will then be passed to draw() and the result of the estimator score will be returned. + Parameters ---------- X : array-like X (also X_test) are the dependent variables of test set to predict + y : array-like y (also y_test) is the independent actual variables to score against + Returns ------- score : float + Returns the score of the underlying model, usually accuracy for + classification models. Refer to the specific model for more details. """ - self.score_ = self.estimator.score(X, y, **kwargs) + # If the estimator has been passed in fitted but the visualizer was not fit + # then we can retrieve the classes from the estimator, unfortunately we cannot + # retrieve the class counts so we simply set them to None and warn the user. 
+ # NOTE: cannot test if hasattr(self, "classes_") because it will be proxied. + if not hasattr(self, "class_counts_"): + if not hasattr(self.estimator, "classes_"): + raise NotFitted( + ( + "could not determine required property classes_; " + "the visualizer must either be fit or instantiated with a " + "fitted classifier before calling score()" + ) + ) + self.class_counts_ = None + self.classes_ = self._decode_labels(self.estimator.classes_) + warnings.warn( + "could not determine class_counts_ from previously fitted classifier", + YellowbrickWarning, + ) + + # This method implements ScoreVisualizer (do not call super). + self.score_ = self.estimator.score(X, y) return self.score_ - #TODO during refactoring this can be used to generalize ClassBalance - def class_counts(self, y): - unique, counts = np.unique(y, return_counts=True) - return dict(zip(unique, counts)) + def _decode_labels(self, y): + """ + An internal helper function that uses either the classes or encoder + properties to correctly decode y as user-readable string labels. + + If both classes and encoder are set, a warning is issued and encoder is + used instead of classes. If neither encoder nor classes is set then the + original array is returned unmodified. 
+ """ + if self.classes is not None and self.encoder is not None: + warnings.warn( + "both classes and encoder specified, using encoder", YellowbrickWarning + ) + + if self.encoder is not None: + # Use the label encoder or other transformer + if hasattr(self.encoder, "inverse_transform"): + try: + return self.encoder.inverse_transform(y) + except ValueError: + y_labels = np.unique(y) + raise ModelError( + "could not decode {} y values to {} labels".format( + y_labels, self._labels() + ) + ) + + # Otherwise, treat as a dictionary + try: + return np.asarray([self.encoder[yi] for yi in y]) + except KeyError as e: + raise ModelError( + ( + "cannot decode class {} to label, " + "key not specified by encoder" + ).format(e) + ) + + if self.classes is not None: + # Determine indices to perform class mappings on + yp = np.asarray(y) + if yp.dtype.kind in {"i", "u"}: + idx = yp + else: + # Use label encoder to get indices by sorted class names + idx = LabelEncoder().fit_transform(yp) + + # Use index mapping for classes + try: + return np.asarray(self.classes)[idx] + except IndexError: + y_labels = np.unique(yp) + raise ModelError( + "could not decode {} y values to {} labels".format( + y_labels, self._labels() + ) + ) + + # could not decode y without encoder or classes, return it as it is, unmodified + return y + + def _labels(self): + """ + Returns the human specified labels in either the classes list or from the + encoder. Returns None if no human labels have been specified, but issues a + warning if a transformer has been passed that does not specify labels. 
+ """ + if self.classes is not None and self.encoder is not None: + warnings.warn( + "both classes and encoder specified, using encoder", YellowbrickWarning + ) + + if self.encoder is not None: + # Use label encoder or other transformer + if hasattr(self.encoder, "transform"): + if hasattr(self.encoder, "classes_"): + return self.encoder.classes_ + + # This is not a label encoder + msg = "could not determine class labels from {}".format( + self.encoder.__class__.__name__ + ) + warnings.warn(msg, YellowbrickWarning) + return None + + # Otherwise, treat as dictionary and ensure sorted by key + keys = sorted(list(self.encoder.keys())) + return np.asarray([self.encoder[key] for key in keys]) + + if self.classes is not None: + return np.asarray(self.classes) + + return None diff --git a/yellowbrick/classifier/class_prediction_error.py b/yellowbrick/classifier/class_prediction_error.py index b9919becb..013e05f47 100644 --- a/yellowbrick/classifier/class_prediction_error.py +++ b/yellowbrick/classifier/class_prediction_error.py @@ -2,8 +2,11 @@ # Shows the balance of classes and their associated predictions. 
# # Author: Larry Gray -# Author: Benjamin Bengfort -# Created: Wed May 18 12:39:40 2016 -0400 +# Author: Benjamin Bengfort +# Created: Fri Jul 20 10:26:25 2018 -0400 +# +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt # # ID: class_prediction_error.py [] lwgray@gmail.com $ @@ -18,61 +21,107 @@ import numpy as np import matplotlib.pyplot as plt -from .base import ClassificationScoreVisualizer - from sklearn.utils.multiclass import unique_labels from sklearn.metrics.classification import _check_targets from sklearn.model_selection import train_test_split as tts -from ..exceptions import ModelError, YellowbrickValueError -from ..style.colors import resolve_colors +from yellowbrick.draw import bar_stack +from yellowbrick.classifier.base import ClassificationScoreVisualizer +from yellowbrick.exceptions import ModelError, YellowbrickValueError, NotFitted ########################################################################## ## Class Prediction Error Chart ########################################################################## + class ClassPredictionError(ClassificationScoreVisualizer): """ Class Prediction Error chart that shows the support for each class in the - fitted classification model displayed as a stacked bar. Each bar is - segmented to show the distribution of predicted classes for each - class. It is initialized with a fitted model and generates a - class prediction error chart on draw. + fitted classification model displayed as a stacked bar. Each bar is segmented + to show the distribution of predicted classes for each class. It is initialized + with a fitted model and generates a class prediction error chart on draw. Parameters ---------- - ax: axes - the axis to plot the figure on. - - model: estimator - Scikit-Learn estimator object. Should be an instance of a classifier, - else ``__init__()`` will raise an exception. - - classes: list - A list of class names for the legend. 
If classes is None and a y value - is passed to fit then the classes are selected from the target vector. - - kwargs: dict - Keyword arguments passed to the super class. Here, used - to colorize the bars in the histogram. + model : estimator + A scikit-learn estimator that should be a classifier. If the model is + not a classifier, an exception is raised. If the internal model is not + fitted, it is fit when the visualizer is fitted, unless otherwise specified + by ``is_fitted``. + + ax : matplotlib Axes, default: None + The axes to plot the figure on. If not specified the current axes will be + used (or generated if required). + + classes : list of str, default: None + The class labels to use for the legend ordered by the index of the sorted + classes discovered in the ``fit()`` method. Specifying classes in this + manner is used to change the class names to a more specific format or + to label encoded integer classes. Some visualizers may also use this + field to filter the visualization for specific classes. For more advanced + usage specify an encoder rather than class labels. + + encoder : dict or LabelEncoder, default: None + A mapping of classes to human readable labels. Often there is a mismatch + between desired class labels and those contained in the target variable + passed to ``fit()`` or ``score()``. The encoder disambiguates this mismatch + ensuring that classes are labeled correctly in the visualization. + + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. + + force_model : bool, default: False + Do not check to ensure that the underlying estimator is a classifier. This + will prevent an exception when the visualizer is initialized but may result + in unexpected or unintended behavior. 
+ + kwargs : dict + Keyword arguments passed to the visualizer base classes. Attributes ---------- + classes_ : ndarray of shape (n_classes,) + The class labels observed while fitting. + + class_counts_ : ndarray of shape (n_classes,) + Number of samples encountered for each class during fitting. + score_ : float - Global accuracy score + An evaluation metric of the classifier on test data produced when + ``score()`` is called. This metric is between 0 and 1 -- higher scores are + generally better. For classifiers, this score is usually accuracy, but + ensure you check the underlying model for more details about the score. predictions_ : ndarray An ndarray of predictions whose rows are the true classes and whose columns are the predicted classes - - Notes - ----- - These parameters can be influenced later on in the visualization - process, but can and should be set as early as possible. """ - def score(self, X, y, **kwargs): + def __init__( + self, + model, + ax=None, + classes=None, + encoder=None, + is_fitted="auto", + force_model=False, + **kwargs + ): + super(ClassPredictionError, self).__init__( + model, + ax=ax, + classes=classes, + encoder=encoder, + is_fitted=is_fitted, + force_model=force_model, + **kwargs + ) + + def score(self, X, y): """ Generates a 2D array where each row is the count of the predicted classes and each column is the true class @@ -90,58 +139,71 @@ def score(self, X, y, **kwargs): score_ : float Global accuracy score """ - + # Must be computed before calling super # We're relying on predict to raise NotFitted y_pred = self.predict(X) - y_type, y_true, y_pred = _check_targets(y, y_pred) - if y_type not in ("binary", "multiclass"): - raise YellowbrickValueError("%s is not supported" % y_type) + raise YellowbrickValueError("{} is not supported".format(y_type)) + # Get the indices of the unique labels indices = unique_labels(y_true, y_pred) - - if len(self.classes_) > len(indices): - raise ModelError("y and y_pred contain zero values " - "for 
one of the specified classes") - elif len(self.classes_) < len(indices): - raise NotImplementedError("filtering classes is " - "currently not supported") + labels = self._labels() + + # Call super to compute self.score_ and verify classes + try: + super(ClassPredictionError, self).score(X, y) + except ModelError as e: + # raise visualizer-specific errors + if labels is not None and len(labels) < len(indices): + raise NotImplementedError( + "filtering classes is currently not supported" + ) + else: + raise e + + # Ensure all labels are used + if labels is not None and len(labels) > len(indices): + raise ModelError( + "y and y_pred contain zero values for one of the specified classes" + ) # Create a table of predictions whose rows are the true classes # and whose columns are the predicted classes; each element # is the count of predictions for that class that match the true # value of that class. - self.predictions_ = np.array([ + self.predictions_ = np.array( [ - (y_pred[y == label_t] == label_p).sum() - for label_p in indices + [(y_pred[y == label_t] == label_p).sum() for label_p in indices] + for label_t in indices ] - for label_t in indices - ]) + ) self.draw() - self.score_ = self.estimator.score(X, y) - return self.score_ def draw(self): """ Renders the class prediction error across the axis. 
+ + Returns + ------- + ax : Matplotlib Axes + The axes on which the figure is plotted """ - indices = np.arange(len(self.classes_)) - prev = np.zeros(len(self.classes_)) + if not hasattr(self, "predictions_") or not hasattr(self, "classes_"): + raise NotFitted.from_estimator(self, "draw") - colors = resolve_colors( + legend_kws = {"bbox_to_anchor": (1.04, 0.5), "loc": "center left"} + bar_stack( + self.predictions_, + self.ax, + labels=list(self.classes_), + ticks=self.classes_, colors=self.colors, - n_colors=len(self.classes_)) - - for idx, row in enumerate(self.predictions_): - self.ax.bar(indices, row, label=self.classes_[idx], - bottom=prev, color=colors[idx]) - prev += row - + legend_kws=legend_kws, + ) return self.ax def finalize(self, **kwargs): @@ -150,15 +212,9 @@ def finalize(self, **kwargs): The user calls poof and poof calls finalize. """ - indices = np.arange(len(self.classes_)) - # Set the title self.set_title("Class Prediction Error for {}".format(self.name)) - # Set the x ticks with the class names - self.ax.set_xticks(indices) - self.ax.set_xticklabels(self.classes_) - # Set the axes labels self.ax.set_xlabel("actual class") self.ax.set_ylabel("number of predicted class") @@ -167,37 +223,42 @@ def finalize(self, **kwargs): cmax = max([sum(predictions) for predictions in self.predictions_]) self.ax.set_ylim(0, cmax + cmax * 0.1) - # Put the legend outside of the graph - plt.legend(bbox_to_anchor=(1.04, 0.5), loc="center left") - plt.tight_layout(rect=[0, 0, 0.85, 1]) + # Ensure the legend fits on the figure + plt.tight_layout(rect=[0, 0, 0.90, 1]) # TODO: Could use self.fig now ########################################################################## ## Quick Method ########################################################################## + def class_prediction_error( model, X, - y=None, + y, ax=None, - classes=None, test_size=0.2, random_state=None, - **kwargs): - """Quick method: - Divides the dataset X and y into train and test splits, fits 
the model on - the train split, then scores the model on the test split. The visualizer - displays the support for each class in the fitted classification model - displayed as a stacked bar plot Each bar is segmented to show the - distribution of predicted classes for each class. - - This helper function is a quick wrapper to utilize the ClassPredictionError - ScoreVisualizer for one-off analysis. + classes=None, + encoder=None, + is_fitted="auto", + force_model=False, + **kwargs +): + """Class Prediction Error + + Divides the dataset X and y into train and test splits, fits the model on the train + split, then scores the model on the test split. The visualizer displays the support + for each class in the fitted classification model displayed as a stacked bar plot. + Each bar is segmented to show the distribution of predicted classes for each class. Parameters ---------- - model : the Scikit-Learn estimator (should be a classifier) + model : estimator + A scikit-learn estimator that should be a classifier. If the model is + not a classifier, an exception is raised. If the internal model is not + fitted, it is fit when the visualizer is fitted, unless otherwise specified + by ``is_fitted``. X : ndarray or DataFrame of shape n x m A matrix of n instances with m features. @@ -205,11 +266,9 @@ def class_prediction_error( y : ndarray or Series of length n An array or series of target or class values. - ax : matplotlib axes - The axes to plot the figure on. - - classes : list of strings - The names of the classes in the target + ax : matplotlib Axes, default: None + The axes to plot the figure on. If not specified the current axes will be + used (or generated if required). test_size : float, default=0.2 The percentage of the data to reserve as test data. @@ -217,13 +276,49 @@ def class_prediction_error( random_state : int or None, default=None The value to seed the random number generator for shuffling data. 
+ classes : list of str, default: None + The class labels to use for the legend ordered by the index of the sorted + classes discovered in the ``fit()`` method. Specifying classes in this + manner is used to change the class names to a more specific format or + to label encoded integer classes. Some visualizers may also use this + field to filter the visualization for specific classes. For more advanced + usage specify an encoder rather than class labels. + + encoder : dict or LabelEncoder, default: None + A mapping of classes to human readable labels. Often there is a mismatch + between desired class labels and those contained in the target variable + passed to ``fit()`` or ``score()``. The encoder disambiguates this mismatch + ensuring that classes are labeled correctly in the visualization. + + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. + + force_model : bool, default: False + Do not check to ensure that the underlying estimator is a classifier. This + will prevent an exception when the visualizer is initialized but may result + in unexpected or unintended behavior. + + kwargs: dict + Keyword arguments passed to the visualizer base classes. + Returns ------- - ax : matplotlib axes - Returns the axes that the class prediction error plot was drawn on. 
+ viz : ClassPredictionError + Returns the fitted, finalized visualizer """ # Instantiate the visualizer - visualizer = ClassPredictionError(model, ax, classes, **kwargs) + visualizer = ClassPredictionError( + model=model, + ax=ax, + classes=classes, + encoder=encoder, + is_fitted=is_fitted, + force_model=force_model, + **kwargs + ) # Create the train and test splits X_train, X_test, y_train, y_test = tts( @@ -233,6 +328,7 @@ def class_prediction_error( # Fit and transform the visualizer (calls draw) visualizer.fit(X_train, y_train, **kwargs) visualizer.score(X_test, y_test) + visualizer.finalize() - # Return the axes object on the visualizer - return visualizer.ax + # Return the visualizer + return visualizer diff --git a/yellowbrick/classifier/classification_report.py b/yellowbrick/classifier/classification_report.py index 8ffe545e9..7eca50741 100644 --- a/yellowbrick/classifier/classification_report.py +++ b/yellowbrick/classifier/classification_report.py @@ -1,14 +1,14 @@ # yellowbrick.classifier.classification_report # Visual classification report for classifier scoring. 
# -# Author: Rebecca Bilbro -# Author: Benjamin Bengfort +# Author: Rebecca Bilbro +# Author: Benjamin Bengfort # Author: Neal Humphrey # Author: Allyssa Riley # Author: Larry Gray -# Created: Wed May 18 12:39:40 2016 -0400 +# Created: Wed May 3 18:15:42 2017 -0400 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # # ID: classification_report.py [5388065] neal@nhumphrey.com $ @@ -21,26 +21,25 @@ ## Imports ########################################################################## -from __future__ import division import numpy as np import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from sklearn.metrics import precision_recall_fscore_support -from ..style import find_text_color -from ..style.palettes import color_sequence -from .base import ClassificationScoreVisualizer -from ..exceptions import YellowbrickValueError +from yellowbrick.style import find_text_color +from yellowbrick.style.palettes import color_sequence +from yellowbrick.exceptions import YellowbrickValueError +from yellowbrick.classifier.base import ClassificationScoreVisualizer ########################################################################## ## Classification Report ########################################################################## -CMAP_UNDERCOLOR = 'w' -CMAP_OVERCOLOR = '#2a7d4f' -SCORES_KEYS = ('precision', 'recall', 'f1', 'support') -PERCENT = 'percent' +PERCENT = "percent" +CMAP_UNDERCOLOR = "w" +CMAP_OVERCOLOR = "#2a7d4f" +SCORES_KEYS = ("precision", "recall", "f1", "support") class ClassificationReport(ClassificationScoreVisualizer): @@ -50,15 +49,23 @@ class ClassificationReport(ClassificationScoreVisualizer): Parameters ---------- - ax : The axis to plot the figure on. - - model : the Scikit-Learn estimator - Should be an instance of a classifier, else the __init__ will - return an error. 
- - classes : a list of class names for the legend - If classes is None and a y value is passed to fit then the classes - are selected from the target vector. + model : estimator + A scikit-learn estimator that should be a classifier. If the model is + not a classifier, an exception is raised. If the internal model is not + fitted, it is fit when the visualizer is fitted, unless otherwise specified + by ``is_fitted``. + + ax : matplotlib Axes, default: None + The axes to plot the figure on. If not specified the current axes will be + used (or generated if required). + + classes : list of str, default: None + The class labels to use for the legend ordered by the index of the sorted + classes discovered in the ``fit()`` method. Specifying classes in this + manner is used to change the class names to a more specific format or + to label encoded integer classes. Some visualizers may also use this + field to filter the visualization for specific classes. For more advanced + usage specify an encoder rather than class labels. cmap : string, default: ``'YlOrRd'`` Specify a colormap to define the heatmap of the predicted class @@ -68,7 +75,25 @@ class ClassificationReport(ClassificationScoreVisualizer): Specify if support will be displayed. It can be further defined by whether support should be reported as a raw count or percentage. - kwargs : keyword arguments passed to the super class. + encoder : dict or LabelEncoder, default: None + A mapping of classes to human readable labels. Often there is a mismatch + between desired class labels and those contained in the target variable + passed to ``fit()`` or ``score()``. The encoder disambiguates this mismatch + ensuring that classes are labeled correctly in the visualization. + + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. 
If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. + + force_model : bool, default: False + Do not check to ensure that the underlying estimator is a classifier. This + will prevent an exception when the visualizer is initialized but may result + in unexpected or unintended behavior. + + kwargs : dict + Keyword arguments passed to the visualizer base classes. Examples -------- @@ -81,35 +106,61 @@ class ClassificationReport(ClassificationScoreVisualizer): Attributes ---------- + classes_ : ndarray of shape (n_classes,) + The class labels observed while fitting. + + class_count_ : ndarray of shape (n_classes,) + Number of samples encountered for each class during fitting. + score_ : float - Global accuracy score + An evaluation metric of the classifier on test data produced when + ``score()`` is called. This metric is between 0 and 1 -- higher scores are + generally better. For classifiers, this score is usually accuracy, but + ensure you check the underlying model for more details about the score. scores_ : dict of dicts Outer dictionary composed of precision, recall, f1, and support scores with inner dictionaries specifiying the values for each class listed. 
""" - def __init__(self, model, ax=None, classes=None, cmap='YlOrRd', - support=None, **kwargs): + + def __init__( + self, + model, + ax=None, + classes=None, + cmap="YlOrRd", + support=None, + encoder=None, + is_fitted="auto", + force_model=False, + **kwargs + ): super(ClassificationReport, self).__init__( - model, ax=ax, classes=classes, **kwargs + model, + ax=ax, + classes=classes, + encoder=encoder, + is_fitted=is_fitted, + force_model=force_model, + **kwargs ) + self.support = support self.cmap = color_sequence(cmap) - self.cmap.set_under(color=CMAP_UNDERCOLOR) self.cmap.set_over(color=CMAP_OVERCOLOR) + self.cmap.set_under(color=CMAP_UNDERCOLOR) self._displayed_scores = [key for key in SCORES_KEYS] - self.support = support if support not in {None, True, False, "percent", "count"}: raise YellowbrickValueError( - "'{}' is an invalid argument for support, use None, True, " \ + "'{}' is an invalid argument for support, use None, True, " "False, 'percent', or 'count'".format(support) ) if not support: self._displayed_scores.remove("support") - def score(self, X, y=None, **kwargs): + def score(self, X, y): """ Generates the Scikit-Learn classification report. 
@@ -127,8 +178,10 @@ def score(self, X, y=None, **kwargs): score_ : float Global accuracy score """ - y_pred = self.predict(X) + # Call super to check if fitted and to compute self.score_ + super(ClassificationReport, self).score(X, y) + y_pred = self.predict(X) scores = precision_recall_fscore_support(y, y_pred) # Calculate the percentage for the support metric @@ -145,13 +198,9 @@ def score(self, X, y=None, **kwargs): # Remove support scores if not required if not self.support: - self.scores_.pop('support') + self.scores_.pop("support") self.draw() - - # Retrieve and store the score attribute from the sklearn classifier - self.score_ = self.estimator.score(X, y) - return self.score_ def draw(self): @@ -161,7 +210,6 @@ def draw(self): # Create display grid cr_display = np.zeros((len(self.classes_), len(self._displayed_scores))) - # For each class row, append columns for precision, recall, f1, and support for idx, cls in enumerate(self.classes_): for jdx, metric in enumerate(self._displayed_scores): @@ -169,7 +217,10 @@ def draw(self): # Set up the dimensions of the pcolormesh # NOTE: pcolormesh accepts grids that are (N+1,M+1) - X, Y = np.arange(len(self.classes_)+1), np.arange(len(self._displayed_scores)+1) + X, Y = ( + np.arange(len(self.classes_) + 1), + np.arange(len(self._displayed_scores) + 1), + ) self.ax.set_ylim(bottom=0, top=cr_display.shape[0]) self.ax.set_xlim(left=0, right=cr_display.shape[1]) @@ -194,21 +245,18 @@ def draw(self): text_color = find_text_color(base_color) # Add the label to the middle of the grid - cx, cy = x+0.5, y+0.5 - self.ax.text( - cy, cx, svalue, va='center', ha='center', color=text_color - ) - + cx, cy = x + 0.5, y + 0.5 + self.ax.text(cy, cx, svalue, va="center", ha="center", color=text_color) # Draw the heatmap with colors bounded by the min and max of the grid # NOTE: I do not understand why this is Y, X instead of X, Y it works # in this order but raises an exception with the other order. 
g = self.ax.pcolormesh( - Y, X, cr_display, vmin=0, vmax=1, cmap=self.cmap, edgecolor='w', + Y, X, cr_display, vmin=0, vmax=1, cmap=self.cmap, edgecolor="w" ) # Add the color bar - plt.colorbar(g, ax=self.ax) + plt.colorbar(g, ax=self.ax) # TODO: Could use self.fig now # Return the axes being drawn on return self.ax @@ -224,63 +272,125 @@ def finalize(self, **kwargs): """ # Set the title of the classifiation report - self.set_title('{} Classification Report'.format(self.name)) + self.set_title("{} Classification Report".format(self.name)) # Set the tick marks appropriately - self.ax.set_xticks(np.arange(len(self._displayed_scores))+0.5) - self.ax.set_yticks(np.arange(len(self.classes_))+0.5) + self.ax.set_xticks(np.arange(len(self._displayed_scores)) + 0.5) + self.ax.set_yticks(np.arange(len(self.classes_)) + 0.5) self.ax.set_xticklabels(self._displayed_scores, rotation=45) self.ax.set_yticklabels(self.classes_) - plt.tight_layout() - - -def classification_report(model, X, y=None, ax=None, classes=None, - random_state=None,**kwargs): - """Quick method: + plt.tight_layout() # TODO: Could use self.fig now + + +def classification_report( + model, + X, + y, + ax=None, + test_size=0.2, + random_state=None, + classes=None, + cmap="YlOrRd", + support=None, + encoder=None, + is_fitted="auto", + force_model=False, + **kwargs +): + """Classification Report Displays precision, recall, F1, and support scores for the model. Integrates numerical scores as well as color-coded heatmap. - This helper function is a quick wrapper to utilize the ClassificationReport - ScoreVisualizer for one-off analysis. - Parameters ---------- + model : estimator + A scikit-learn estimator that should be a classifier. If the model is + not a classifier, an exception is raised. If the internal model is not + fitted, it is fit when the visualizer is fitted, unless otherwise specified + by ``is_fitted``. + X : ndarray or DataFrame of shape n x m A matrix of n instances with m features. 
y : ndarray or Series of length n An array or series of target or class values. - ax : matplotlib axes - The axes to plot the figure on. + ax : matplotlib Axes, default: None + The axes to plot the figure on. If not specified the current axes will be + used (or generated if required). + + test_size : float, default=0.2 + The percentage of the data to reserve as test data. + + random_state : int or None, default=None + The value to seed the random number generator for shuffling data. + + classes : list of str, default: None + The class labels to use for the legend ordered by the index of the sorted + classes discovered in the ``fit()`` method. Specifying classes in this + manner is used to change the class names to a more specific format or + to label encoded integer classes. Some visualizers may also use this + field to filter the visualization for specific classes. For more advanced + usage specify an encoder rather than class labels. + + cmap : string, default: ``'YlOrRd'`` + Specify a colormap to define the heatmap of the predicted class + against the actual class in the classification report. - model : the Scikit-Learn estimator (should be a classifier) + support: {True, False, None, 'percent', 'count'}, default: None + Specify if support will be displayed. It can be further defined by + whether support should be reported as a raw count or percentage. + + encoder : dict or LabelEncoder, default: None + A mapping of classes to human readable labels. Often there is a mismatch + between desired class labels and those contained in the target variable + passed to ``fit()`` or ``score()``. The encoder disambiguates this mismatch + ensuring that classes are labeled correctly in the visualization. - classes : list of strings - The names of the classes in the target + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. 
If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. - random_state: integer - The seed value for a random generator + force_model : bool, default: False + Do not check to ensure that the underlying estimator is a classifier. This + will prevent an exception when the visualizer is initialized but may result + in unexpected or unintended behavior. + + kwargs : dict + Keyword arguments passed to the visualizer base classes. Returns ------- - ax : matplotlib axes - Returns the axes that the classification report was drawn on. + viz : ClassificationReport + Returns the fitted, finalized visualizer """ # Instantiate the visualizer - visualizer = ClassificationReport(model, ax, classes, **kwargs) + visualizer = ClassificationReport( + model=model, + ax=ax, + classes=classes, + cmap=cmap, + support=support, + encoder=encoder, + is_fitted=is_fitted, + force_model=force_model, + **kwargs + ) # Create the train and test splits X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=random_state + X, y, test_size=test_size, random_state=random_state ) # Fit and transform the visualizer (calls draw) visualizer.fit(X_train, y_train, **kwargs) visualizer.score(X_test, y_test) + visualizer.finalize() - # Return the axes object on the visualizer - return visualizer.ax + # Return the visualizer + return visualizer diff --git a/yellowbrick/classifier/confusion_matrix.py b/yellowbrick/classifier/confusion_matrix.py index 708e197eb..adb92cacc 100644 --- a/yellowbrick/classifier/confusion_matrix.py +++ b/yellowbrick/classifier/confusion_matrix.py @@ -4,7 +4,7 @@ # Author: Neal Humphrey # Created: Tue May 03 11:05:11 2017 -0700 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # # ID: confusion_matrix.py 
[5388065] neal@nhumphrey.com $ @@ -19,22 +19,22 @@ import numpy as np -from ..utils import div_safe -from ..style import find_text_color -from ..style.palettes import color_sequence -from .base import ClassificationScoreVisualizer - from sklearn.model_selection import train_test_split from sklearn.metrics import confusion_matrix as confusion_matrix_metric +from yellowbrick.utils import div_safe +from yellowbrick.style import find_text_color +from yellowbrick.style.palettes import color_sequence +from yellowbrick.classifier.base import ClassificationScoreVisualizer + ########################################################################## ## ConfusionMatrix ########################################################################## -CMAP_UNDERCOLOR = 'w' -CMAP_OVERCOLOR = '#2a7d4f' -CMAP_MUTEDCOLOR = '0.75' +CMAP_UNDERCOLOR = "w" +CMAP_MUTEDCOLOR = "0.75" +CMAP_OVERCOLOR = "#2a7d4f" class ConfusionMatrix(ClassificationScoreVisualizer): @@ -53,11 +53,14 @@ class ConfusionMatrix(ClassificationScoreVisualizer): Parameters ---------- model : estimator - Must be a classifier, otherwise raises YellowbrickTypeError + A scikit-learn estimator that should be a classifier. If the model is + not a classifier, an exception is raised. If the internal model is not + fitted, it is fit when the visualizer is fitted, unless otherwise specified + by ``is_fitted``. ax : matplotlib Axes, default: None - The axes to plot the figure on. If None is passed in the current axes - will be used (or generated if required). + The axes to plot the figure on. If not specified the current axes will be + used (or generated if required). sample_weight: array-like of shape = [n_samples], optional Passed to ``confusion_matrix`` to weight the samples. @@ -68,22 +71,19 @@ class ConfusionMatrix(ClassificationScoreVisualizer): classes, percent should be set to False or inaccurate figures will be displayed. - classes : list, default: None - a list of class names to use in the confusion_matrix. 
- This is passed to the ``labels`` parameter of - ``sklearn.metrics.confusion_matrix()``, and follows the behaviour - indicated by that function. It may be used to reorder or select a - subset of labels. If None, classes that appear at least once in - ``y_true`` or ``y_pred`` are used in sorted order. - - label_encoder : dict or LabelEncoder, default: None - When specifying the ``classes`` argument, the input to ``fit()`` - and ``score()`` must match the expected labels. If the ``X`` and ``y`` - datasets have been encoded prior to training and the labels must be - preserved for the visualization, use this argument to provide a - mapping from the encoded class to the correct label. Because typically - a Scikit-Learn ``LabelEncoder`` is used to perform this operation, you - may provide it directly to the class to utilize its fitted encoding. + classes : list of str, default: None + The class labels to use for the legend ordered by the index of the sorted + classes discovered in the ``fit()`` method. Specifying classes in this + manner is used to change the class names to a more specific format or + to label encoded integer classes. Some visualizers may also use this + field to filter the visualization for specific classes. For more advanced + usage specify an encoder rather than class labels. + + encoder : dict or LabelEncoder, default: None + A mapping of classes to human readable labels. Often there is a mismatch + between desired class labels and those contained in the target variable + passed to ``fit()`` or ``score()``. The encoder disambiguates this mismatch + ensuring that classes are labeled correctly in the visualization. cmap : string, default: ``'YlOrRd'`` Specify a colormap to define the heatmap of the predicted class @@ -93,16 +93,36 @@ class ConfusionMatrix(ClassificationScoreVisualizer): Specify the fontsize of the text in the grid and labels to make the matrix a bit easier to read. Uses rcParams font size by default. 
+ is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. + + force_model : bool, default: False + Do not check to ensure that the underlying estimator is a classifier. This + will prevent an exception when the visualizer is initialized but may result + in unexpected or unintended behavior. + + kwargs : dict + Keyword arguments passed to the visualizer base classes. + Attributes ---------- + classes_ : ndarray of shape (n_classes,) + The class labels observed while fitting. + + class_counts_ : ndarray of shape (n_classes,) + Number of samples encountered for each class supporting the confusion matrix. + score_ : float - Global accuracy score + An evaluation metric of the classifier on test data produced when + ``score()`` is called. This metric is between 0 and 1 -- higher scores are + generally better. For classifiers, this score is usually accuracy, but + ensure you check the underlying model for more details about the metric. confusion_matrix_ : array, shape = [n_classes, n_classes] - The numeric scores of the confusion matrix - - class_counts_ : array, shape = [n_classes,] - The total number of each class supporting the confusion matrix + The numeric scores of the confusion matrix. 
Examples -------- @@ -114,24 +134,39 @@ class ConfusionMatrix(ClassificationScoreVisualizer): >>> viz.poof() """ - - def __init__(self, model, ax=None, classes=None, sample_weight=None, - percent=False, label_encoder=None, cmap='YlOrRd', - fontsize=None, **kwargs): + def __init__( + self, + model, + ax=None, + sample_weight=None, + percent=False, + classes=None, + encoder=None, + cmap="YlOrRd", + fontsize=None, + is_fitted="auto", + force_model=False, + **kwargs + ): super(ConfusionMatrix, self).__init__( - model, ax=ax, classes=classes, **kwargs + model, + ax=ax, + classes=classes, + encoder=encoder, + is_fitted=is_fitted, + force_model=force_model, + **kwargs ) # Visual parameters + self.fontsize = fontsize self.cmap = color_sequence(cmap) self.cmap.set_under(color=CMAP_UNDERCOLOR) self.cmap.set_over(color=CMAP_OVERCOLOR) - self.fontsize = fontsize # Estimator parameters - self.label_encoder = label_encoder - self.sample_weight = sample_weight self.percent = percent + self.sample_weight = sample_weight # Used to draw diagonal line for predicted class = true class self._edgecolors = [] @@ -156,30 +191,30 @@ def score(self, X, y): score_ : float Global accuracy score """ + # Call super to check if fitted and to compute self.score_ + super(ConfusionMatrix, self).score(X, y) + # Create predictions from X (will raise not fitted error) y_pred = self.predict(X) - # Encode the target with the supplied label encoder - if self.label_encoder: - try : - y = self.label_encoder.inverse_transform(y) - y_pred = self.label_encoder.inverse_transform(y_pred) - except AttributeError: - # if a mapping is passed to class apply it here. 
- y = np.array([self.label_encoder[x] for x in y]) - y_pred = np.array([self.label_encoder[x] for x in y_pred]) + # Decode the target with the label encoder and get human readable labels + y = self._decode_labels(y) + y_pred = self._decode_labels(y_pred) + labels = self._labels() + if labels is None: + labels = self.classes_ # Compute the confusion matrix and class counts self.confusion_matrix_ = confusion_matrix_metric( - y, y_pred, labels=self.classes_, sample_weight=self.sample_weight + y, y_pred, labels=labels, sample_weight=self.sample_weight ) - self.class_counts_ = self.class_counts(y) + self.class_counts_ = dict(zip(*np.unique(y, return_counts=True))) # Make array of only the classes actually being used. # Needed because sklearn confusion_matrix only returns counts for # selected classes but percent should be calculated on all classes selected_class_counts = [] - for c in self.classes_: + for c in labels: try: selected_class_counts.append(self.class_counts_[c]) except KeyError: @@ -187,10 +222,6 @@ def score(self, X, y): self.class_counts_ = np.array(selected_class_counts) self.draw() - - # Retrieve and store the score attribute from the sklearn classifier - self.score_ = self.estimator.score(X, y) - return self.score_ def draw(self): @@ -203,27 +234,36 @@ def draw(self): # Convert confusion matrix to percent of each row, i.e. the # predicted as a percent of true in each class. - if self.percent == True: + if self.percent is True: # Note: div_safe function returns 0 instead of NAN. 
- cm_display = div_safe(self.confusion_matrix_, self.class_counts_.reshape(-1,1)) - cm_display = np.round(cm_display* 100, decimals=0) + cm_display = div_safe( + self.confusion_matrix_, self.class_counts_.reshape(-1, 1) + ) + cm_display = np.round(cm_display * 100, decimals=0) # Y axis should be sorted top to bottom in pcolormesh - cm_display = cm_display[::-1,::] + cm_display = cm_display[::-1, ::] + + # Get the human readable labels + labels = self._labels() + if labels is None: + labels = self.classes_ # Set up the dimensions of the pcolormesh - n_classes = len(self.classes_) - X, Y = np.arange(n_classes+1), np.arange(n_classes+1) + n_classes = len(labels) + X, Y = np.arange(n_classes + 1), np.arange(n_classes + 1) self.ax.set_ylim(bottom=0, top=cm_display.shape[0]) self.ax.set_xlim(left=0, right=cm_display.shape[1]) # Fetch the grid labels from the classes in correct order; set ticks. - xticklabels = self.classes_ - yticklabels = self.classes_[::-1] + xticklabels = labels + yticklabels = labels[::-1] ticks = np.arange(n_classes) + 0.5 self.ax.set(xticks=ticks, yticks=ticks) - self.ax.set_xticklabels(xticklabels, rotation="vertical", fontsize=self.fontsize) + self.ax.set_xticklabels( + xticklabels, rotation="vertical", fontsize=self.fontsize + ) self.ax.set_yticklabels(yticklabels, fontsize=self.fontsize) # Set data labels in the grid enumerating over all x,y class pairs. 
@@ -233,7 +273,7 @@ def draw(self): for y in Y[:-1]: # Extract the value and the text label - value = cm_display[x,y] + value = cm_display[x, y] svalue = "{:0.0f}".format(value) if self.percent: svalue += "%" @@ -243,37 +283,52 @@ def draw(self): text_color = find_text_color(base_color) # Make zero values more subtle - if cm_display[x,y] == 0: + if cm_display[x, y] == 0: text_color = CMAP_MUTEDCOLOR # Add the label to the middle of the grid - cx, cy = x+0.5, y+0.5 + cx, cy = x + 0.5, y + 0.5 self.ax.text( - cy, cx, svalue, va='center', ha='center', - color=text_color, fontsize=self.fontsize, + cy, + cx, + svalue, + va="center", + ha="center", + color=text_color, + fontsize=self.fontsize, ) # Add a dark line on the grid with the diagonal. Note that the # tick labels have already been reversed. - lc = 'k' if xticklabels[x] == yticklabels[y] else 'w' + lc = "k" if xticklabels[x] == yticklabels[y] else "w" self._edgecolors.append(lc) - # Draw the heatmap with colors bounded by vmin,vmax vmin = 0.00001 - vmax = 99.999 if self.percent == True else cm_display.max() + vmax = 99.999 if self.percent is True else cm_display.max() self.ax.pcolormesh( - X, Y, cm_display, vmin=vmin, vmax=vmax, - edgecolor=self._edgecolors, cmap=self.cmap, linewidth='0.01' + X, + Y, + cm_display, + vmin=vmin, + vmax=vmax, + edgecolor=self._edgecolors, + cmap=self.cmap, + linewidth="0.01", ) # Return the axes being drawn on return self.ax + def poof(self, outpath=None, **kwargs): + if outpath is not None: + kwargs["bbox_inches"] = kwargs.get("bbox_inches", "tight") + return super(ConfusionMatrix, self).poof(outpath, **kwargs) + def finalize(self, **kwargs): - self.set_title('{} Confusion Matrix'.format(self.name)) - self.ax.set_ylabel('True Class') - self.ax.set_xlabel('Predicted Class') + self.set_title("{} Confusion Matrix".format(self.name)) + self.ax.set_ylabel("True Class") + self.ax.set_xlabel("Predicted Class") ########################################################################## @@ 
-281,10 +336,24 @@ def finalize(self, **kwargs): ########################################################################## -def confusion_matrix(model, X, y, ax=None, classes=None, sample_weight=None, - percent=False, label_encoder=None, cmap='YlOrRd', - fontsize=None, **kwargs): - """Quick method: +def confusion_matrix( + model, + X, + y, + ax=None, + test_size=0.2, + random_state=None, + sample_weight=None, + percent=False, + classes=None, + encoder=None, + cmap="YlOrRd", + fontsize=None, + is_fitted="auto", + force_model=False, + **kwargs +): + """Confusion Matrix Creates a heatmap visualization of the sklearn.metrics.confusion_matrix(). A confusion matrix shows each combination of the true and predicted @@ -300,7 +369,10 @@ def confusion_matrix(model, X, y, ax=None, classes=None, sample_weight=None, Parameters ---------- model : estimator - Must be a classifier, otherwise raises YellowbrickTypeError + A scikit-learn estimator that should be a classifier. If the model is + not a classifier, an exception is raised. If the internal model is not + fitted, it is fit when the visualizer is fitted, unless otherwise specified + by ``is_fitted``. X : ndarray or DataFrame of shape n x m A matrix of n instances with m features. @@ -309,8 +381,14 @@ def confusion_matrix(model, X, y, ax=None, classes=None, sample_weight=None, An array or series of target or class values. ax : matplotlib Axes, default: None - The axes to plot the figure on. If None is passed in the current axes - will be used (or generated if required). + The axes to plot the figure on. If not specified the current axes will be + used (or generated if required). + + test_size : float, default=0.2 + The percentage of the data to reserve as test data. + + random_state : int or None, default=None + The value to seed the random number generator for shuffling data. sample_weight: array-like of shape = [n_samples], optional Passed to ``confusion_matrix`` to weight the samples. 
@@ -321,22 +399,19 @@ def confusion_matrix(model, X, y, ax=None, classes=None, sample_weight=None, classes, percent should be set to False or inaccurate figures will be displayed. - classes : list, default: None - a list of class names to use in the confusion_matrix. - This is passed to the ``labels`` parameter of - ``sklearn.metrics.confusion_matrix()``, and follows the behaviour - indicated by that function. It may be used to reorder or select a - subset of labels. If None, classes that appear at least once in - ``y_true`` or ``y_pred`` are used in sorted order. - - label_encoder : dict or LabelEncoder, default: None - When specifying the ``classes`` argument, the input to ``fit()`` - and ``score()`` must match the expected labels. If the ``X`` and ``y`` - datasets have been encoded prior to training and the labels must be - preserved for the visualization, use this argument to provide a - mapping from the encoded class to the correct label. Because typically - a Scikit-Learn ``LabelEncoder`` is used to perform this operation, you - may provide it directly to the class to utilize its fitted encoding. + classes : list of str, default: None + The class labels to use for the legend ordered by the index of the sorted + classes discovered in the ``fit()`` method. Specifying classes in this + manner is used to change the class names to a more specific format or + to label encoded integer classes. Some visualizers may also use this + field to filter the visualization for specific classes. For more advanced + usage specify an encoder rather than class labels. + + encoder : dict or LabelEncoder, default: None + A mapping of classes to human readable labels. Often there is a mismatch + between desired class labels and those contained in the target variable + passed to ``fit()`` or ``score()``. The encoder disambiguates this mismatch + ensuring that classes are labeled correctly in the visualization. 
cmap : string, default: ``'YlOrRd'`` Specify a colormap to define the heatmap of the predicted class @@ -346,23 +421,50 @@ def confusion_matrix(model, X, y, ax=None, classes=None, sample_weight=None, Specify the fontsize of the text in the grid and labels to make the matrix a bit easier to read. Uses rcParams font size by default. + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. + + force_model : bool, default: False + Do not check to ensure that the underlying estimator is a classifier. This + will prevent an exception when the visualizer is initialized but may result + in unexpected or unintended behavior. + + kwargs : dict + Keyword arguments passed to the visualizer base classes. + Returns ------- - ax : matplotlib axes - Returns the axes that the classification report was drawn on. + viz : ConfusionMatrix + Returns the fitted, finalized visualizer """ # Instantiate the visualizer visualizer = ConfusionMatrix( - model, ax, classes, sample_weight, percent, - label_encoder, cmap, fontsize, **kwargs + model=model, + ax=ax, + sample_weight=sample_weight, + percent=percent, + classes=classes, + encoder=encoder, + cmap=cmap, + fontsize=fontsize, + is_fitted=is_fitted, + force_model=force_model, + **kwargs ) # Create the train and test splits - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) + # TODO: determine how to use quick methods that require train and test data. 
+ X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_size, random_state=random_state + ) # Fit and transform the visualizer (calls draw) visualizer.fit(X_train, y_train, **kwargs) visualizer.score(X_test, y_test) + visualizer.finalize() - # Return the axes object on the visualizer - return visualizer.ax + # Return the visualizer + return visualizer diff --git a/yellowbrick/classifier/prcurve.py b/yellowbrick/classifier/prcurve.py index 2199dbe6a..8069f834a 100644 --- a/yellowbrick/classifier/prcurve.py +++ b/yellowbrick/classifier/prcurve.py @@ -1,10 +1,13 @@ # yellowbrick.classifier.prcurve # Implements Precision-Recall curves for classification models. # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Tue Sep 04 16:47:19 2018 -0400 # -# ID: prcurve.py [] benjamin@bengfort.com $ +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: prcurve.py [48889c4] benjamin@bengfort.com $ """ Implements Precision-Recall curves for classification models. 
@@ -16,10 +19,6 @@ import numpy as np -from ..exceptions import ModelError, NotFitted -from ..exceptions import YellowbrickValueError -from .base import ClassificationScoreVisualizer - from sklearn.preprocessing import label_binarize from sklearn.multiclass import OneVsRestClassifier from sklearn.utils.multiclass import type_of_target @@ -27,19 +26,28 @@ from sklearn.model_selection import train_test_split as tts from sklearn.metrics import precision_recall_curve as sk_precision_recall_curve +from yellowbrick.exceptions import ModelError, NotFitted +from yellowbrick.exceptions import YellowbrickValueError +from yellowbrick.classifier.base import ClassificationScoreVisualizer + # Target Type Constants +# TODO: These can now be imported from utils.target BINARY = "binary" MULTICLASS = "multiclass" # Average Metric Constants MICRO = "micro" +# Default Values +DEFAULT_ISO_F1_VALUES = (0.2, 0.4, 0.6, 0.8) + ########################################################################## ## PrecisionRecallCurve Visualizer ########################################################################## + class PrecisionRecallCurve(ClassificationScoreVisualizer): """ Precision-Recall curves are a metric used to evaluate a classifier's quality, @@ -54,54 +62,78 @@ class PrecisionRecallCurve(ClassificationScoreVisualizer): Parameters ---------- - model : the Scikit-Learn estimator - A classification model to score the precision-recall curve on. + model : estimator + A scikit-learn estimator that should be a classifier. If the model is + not a classifier, an exception is raised. If the internal model is not + fitted, it is fit when the visualizer is fitted, unless otherwise specified + by ``is_fitted``. ax : matplotlib Axes, default: None - The axes to plot the figure on. If None is passed in the current axes - will be used (or generated if required). - - classes : list - A list of class names for the legend. 
If classes is None and a y value - is passed to fit then the classes are selected from the target vector. - Note that the curves must be computed based on what is in the target - vector passed to the ``score()`` method. Class names are used for - labeling only and must be in the correct order to prevent confusion. - - fill_area : bool, default=True + The axes to plot the figure on. If not specified the current axes will be + used (or generated if required). + + classes : list of str, default: None + The class labels to use for the legend ordered by the index of the sorted + classes discovered in the ``fit()`` method. Specifying classes in this + manner is used to change the class names to a more specific format or + to label encoded integer classes. Some visualizers may also use this + field to filter the visualization for specific classes. For more advanced + usage specify an encoder rather than class labels. + + encoder : dict or LabelEncoder, default: None + A mapping of classes to human readable labels. Often there is a mismatch + between desired class labels and those contained in the target variable + passed to ``fit()`` or ``score()``. The encoder disambiguates this mismatch + ensuring that classes are labeled correctly in the visualization. + + fill_area : bool, default: True Fill the area under the curve (or curves) with the curve color. - ap_score : bool, default=True + ap_score : bool, default: True Annotate the graph with the average precision score, a summary of the plot that is computed as the weighted mean of precisions at each threshold, with the increase in recall from the previous threshold used as the weight. - micro : bool, default=True + micro : bool, default: True If multi-class classification, draw the precision-recall curve for the micro-average of all classes. In the multi-class case, either micro or per-class must be set to True. Ignored in the binary case. 
- iso_f1_curves : bool, default=False + iso_f1_curves : bool, default: False Draw ISO F1-Curves on the plot to show how close the precision-recall curves are to different F1 scores. - per_class : bool, default=False + iso_f1_values : tuple , default: (0.2, 0.4, 0.6, 0.8) + Values of f1 score for which to draw ISO F1-Curves + + per_class : bool, default: False If multi-class classification, draw the precision-recall curve for each class using a OneVsRestClassifier to compute the recall on a per-class basis. In the multi-class case, either micro or per-class must be set to True. Ignored in the binary case. - fill_opacity : float, default=0.2 + fill_opacity : float, default: 0.2 Specify the alpha or opacity of the fill area (0 being transparent, and 1.0 being completly opaque). - line_opacity : float, default=0.8 + line_opacity : float, default: 0.8 Specify the alpha or opacity of the lines (0 being transparent, and 1.0 being completly opaque). + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. + + force_model : bool, default: False + Do not check to ensure that the underlying estimator is a classifier. This + will prevent an exception when the visualizer is initialized but may result + in unexpected or unintended behavior. + kwargs : dict - Keyword arguments passed to the visualization base class. + Keyword arguments passed to the visualizer base classes. Attributes ---------- @@ -126,9 +158,16 @@ class PrecisionRecallCurve(ClassificationScoreVisualizer): predictions with score >= thresholds[i] and the last element is 0. In the multiclass case, a mapping of class/metric to recall array. + classes_ : ndarray of shape (n_classes,) + The class labels observed while fitting. 
+ + class_count_ : ndarray of shape (n_classes,) + Number of samples encountered for each class during fitting. + + + Examples + -------- - Example - ------- >>> from yellowbrick.classifier import PrecisionRecallCurve >>> from sklearn.model_selection import train_test_split >>> from sklearn.svm import LinearSVC @@ -141,13 +180,36 @@ class PrecisionRecallCurve(ClassificationScoreVisualizer): Notes ----- - .. seealso:: http://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html + .. seealso:: https://bit.ly/2kOIeCC """ - def __init__(self, model, ax=None, classes=None, fill_area=True, ap_score=True, - micro=True, iso_f1_curves=False, per_class=False, fill_opacity=0.2, - line_opacity=0.8, **kwargs): - super(PrecisionRecallCurve, self).__init__(model, ax=ax, classes=classes, **kwargs) + def __init__( + self, + model, + ax=None, + classes=None, + encoder=None, + fill_area=True, + ap_score=True, + micro=True, + iso_f1_curves=False, + iso_f1_values=DEFAULT_ISO_F1_VALUES, + per_class=False, + fill_opacity=0.2, + line_opacity=0.8, + is_fitted="auto", + force_model=False, + **kwargs + ): + super(PrecisionRecallCurve, self).__init__( + model, + ax=ax, + classes=classes, + encoder=encoder, + is_fitted=is_fitted, + force_model=force_model, + **kwargs + ) # Set visual params self.set_params( @@ -155,6 +217,7 @@ def __init__(self, model, ax=None, classes=None, fill_area=True, ap_score=True, ap_score=ap_score, micro=micro, iso_f1_curves=iso_f1_curves, + iso_f1_values=set(iso_f1_values), per_class=per_class, fill_opacity=fill_opacity, line_opacity=line_opacity, @@ -168,29 +231,29 @@ def fit(self, X, y=None): """ # The target determines what kind of estimator is fit ttype = type_of_target(y) + self._target_labels = np.unique(y) if ttype.startswith(MULTICLASS): self.target_type_ = MULTICLASS self.estimator = OneVsRestClassifier(self.estimator) - # Use label_binarize to create multi-label ouptut for OneVsRestClassifier - Y = label_binarize(y, 
classes=np.unique(y)) + # Use label_binarize to create multi-label output for OneVsRestClassifier + Y = label_binarize(y, classes=self._target_labels) elif ttype.startswith(BINARY): - self.target_type_ = BINARY - # Different variable is used here to prevent transformation Y = y + self.target_type_ = BINARY else: - raise YellowbrickValueError(( - "{} does not support target type '{}', " - "please provide a binary or multiclass single-output target" - ).format( - self.__class__.__name__, ttype - )) + raise YellowbrickValueError( + ( + "{} does not support target type '{}', " + "please provide a binary or multiclass single-output target" + ).format(self.__class__.__name__, ttype) + ) # Fit the model and return self return super(PrecisionRecallCurve, self).fit(X, Y) - def score(self, X, y=None): + def score(self, X, y): """ Generates the Precision-Recall curve on the specified test data. @@ -204,11 +267,16 @@ def score(self, X, y=None): # If we don't do this check, then it is possible that OneVsRestClassifier # has not correctly been fitted for multi-class targets. 
if not hasattr(self, "target_type_"): - raise NotFitted(( - "{} cannot wrap an already fitted estimator" - ).format( - self.__class__.__name__ - )) + raise NotFitted.from_estimator(self, "score") + + # Must perform label binarization before calling super + if self.target_type_ == MULTICLASS: + # Use label_binarize to create multi-label output for OneVsRestClassifier + y = label_binarize(y, classes=self._target_labels) + + # Call super to check if fitted and to compute classes_ + # Note that self.score_ computed in super will be overridden below + super(PrecisionRecallCurve, self).score(X, y) # Compute the prediction/threshold scores y_scores = self._get_y_scores(X) @@ -218,21 +286,20 @@ def score(self, X, y=None): self.precision_, self.recall_, _ = sk_precision_recall_curve(y, y_scores) self.score_ = average_precision_score(y, y_scores) else: - # Use label_binarize to create multi-label ouptut for OneVsRestClassifier - Y = label_binarize(y, classes=self.classes_) - self.precision_, self.recall_, self.score_ = {}, {}, {} # Compute PRCurve for all classes for i, class_i in enumerate(self.classes_): - self.precision_[class_i], self.recall_[class_i], _ = sk_precision_recall_curve(Y[:,i], y_scores[:,i]) - self.score_[class_i] = average_precision_score(Y[:,i], y_scores[:,i]) + self.precision_[class_i], self.recall_[ + class_i + ], _ = sk_precision_recall_curve(y[:, i], y_scores[:, i]) + self.score_[class_i] = average_precision_score(y[:, i], y_scores[:, i]) # Compute micro average PR curve self.precision_[MICRO], self.recall_[MICRO], _ = sk_precision_recall_curve( - Y.ravel(), y_scores.ravel() + y.ravel(), y_scores.ravel() ) - self.score_[MICRO] = average_precision_score(Y, y_scores, average=MICRO) + self.score_[MICRO] = average_precision_score(y, y_scores, average=MICRO) # Draw the figure self.draw() @@ -247,11 +314,11 @@ def draw(self): Draws the precision-recall curves computed in score on the axes. 
""" if self.iso_f1_curves: - for f1 in np.linspace(0.2, 0.8, num=4): + for f1 in self.iso_f1_values: x = np.linspace(0.01, 1) y = f1 * x / (2 * x - f1) - self.ax.plot(x[y>=0], y[y>=0], color='#333333', alpha=0.2) - self.ax.annotate('$f_1={:0.1f}$'.format(f1), xy=(0.9, y[45]+0.02)) + self.ax.plot(x[y >= 0], y[y >= 0], color="#333333", alpha=0.2) + self.ax.annotate("$f_1={:0.1f}$".format(f1), xy=(0.9, y[45] + 0.02)) if self.target_type_ == BINARY: return self._draw_binary() @@ -264,7 +331,6 @@ def _draw_binary(self): self._draw_pr_curve(self.recall_, self.precision_, label="binary PR curve") self._draw_ap_score(self.score_) - def _draw_multiclass(self): """ Draw the precision-recall curves in the multiclass case @@ -290,10 +356,12 @@ def _draw_pr_curve(self, recall, precision, label=None): Helper function to draw a precision-recall curve with specified settings """ self.ax.step( - recall, precision, alpha=self.line_opacity, where='post', label=label + recall, precision, alpha=self.line_opacity, where="post", label=label ) if self.fill_area: - self.ax.fill_between(recall, precision, step='post', alpha=self.fill_opacity) + self.ax.fill_between( + recall, precision, step="post", alpha=self.fill_opacity + ) def _draw_ap_score(self, score, label=None): """ @@ -301,16 +369,14 @@ def _draw_ap_score(self, score, label=None): """ label = label or "Avg Precision={:0.2f}".format(score) if self.ap_score: - self.ax.axhline( - y=score, color="r", ls="--", label=label - ) + self.ax.axhline(y=score, color="r", ls="--", label=label) def finalize(self): """ Finalize the figure by adding titles, labels, and limits. 
""" self.set_title("Precision-Recall Curve for {}".format(self.name)) - self.ax.legend(loc='lower left', frameon=True) + self.ax.legend(loc="lower left", frameon=True) self.ax.set_xlim([0.0, 1.0]) self.ax.set_ylim([0.0, 1.0]) @@ -328,10 +394,7 @@ def _get_y_scores(self, X): # TODO refactor shared method with ROCAUC # Resolution order of scoring functions - attrs = ( - 'decision_function', - 'predict_proba', - ) + attrs = ("decision_function", "predict_proba") # Return the first resolved function for attr in attrs: @@ -343,7 +406,7 @@ def _get_y_scores(self, X): # Return only the positive class for binary predict_proba if self.target_type_ == BINARY and y_scores.ndim == 2: - return y_scores[:,1] + return y_scores[:, 1] return y_scores except AttributeError: @@ -353,9 +416,11 @@ def _get_y_scores(self, X): continue # If we've gotten this far, we can't do anything - raise ModelError(( - "{} requires estimators with predict_proba or decision_function methods." - ).format(self.__class__.__name__)) + raise ModelError( + ( + "{} requires an estimator with predict_proba or decision_function." 
+ ).format(self.__class__.__name__) + ) # Alias for PrecisionRecallCurve @@ -366,31 +431,74 @@ def _get_y_scores(self, X): ## Quick Method ########################################################################## -def precision_recall_curve(model, X, y, ax=None, train_size=0.8, - random_state=None, shuffle=True, **kwargs): - """Precision-Recall Curve quick method: + +def precision_recall_curve( + model, + X, + y, + X_test=None, + y_test=None, + train_size=0.8, + random_state=None, + shuffle=True, + ax=None, + classes=None, + encoder=None, + fill_area=True, + ap_score=True, + micro=True, + iso_f1_curves=False, + iso_f1_values=DEFAULT_ISO_F1_VALUES, + per_class=False, + fill_opacity=0.2, + line_opacity=0.8, + is_fitted="auto", + force_model=False, + **kwargs +): + """Precision-Recall Curve + + Precision-Recall curves are a metric used to evaluate a classifier's quality, + particularly when classes are very imbalanced. The precision-recall curve + shows the tradeoff between precision, a measure of result relevancy, and + recall, a measure of how many relevant results are returned. A large area + under the curve represents both high recall and precision, the best case + scenario for a classifier, showing a model that returns accurate results + for the majority of classes it selects. Parameters ---------- - model : the Scikit-Learn estimator - A classification model to score the precision-recall curve on. + model : estimator + A scikit-learn estimator that should be a classifier. If the model is + not a classifier, an exception is raised. If the internal model is not + fitted, it is fit when the visualizer is fitted, unless otherwise specified + by ``is_fitted``. X : ndarray or DataFrame of shape n x m A feature array of n instances with m features the model is trained on. - This array will be split into train and test splits. + This array will be split into train and test splits if X_test is not specified. 
y : ndarray or Series of length n An array or series of target or class values. This vector will be split - into train and test splits. + into train and test splits if y_test is not specified. + + X_test : ndarray or DataFrame of shape n x m + An optional feature array of n instances with m features that the model + is tested on if specified, using X as the training data. Otherwise + X will be split into train and test splits. + + y_test : ndarray or Series of length n + An array or series of target or class values that serve as actual labels for + X_test. If not specified, y will be split into test and train along with X. ax : matplotlib Axes, default: None - The axes to plot the figure on. If None is passed in the current axes - will be used (or generated if required). + The axes to plot the figure on. If not specified the current axes will be + used (or generated if required). train_size : float or int, default=0.8 If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the - absolute number of train samples. + absolute number of train samples. Used if X_test and y_test not specified. random_state : int, RandomState, or None, optional If int, random_state is the seed used by the random number generator; @@ -401,47 +509,68 @@ def precision_recall_curve(model, X, y, ax=None, train_size=0.8, shuffle : bool, default=True Whether or not to shuffle the data before splitting. - classes : list - A list of class names for the legend. If classes is None and a y value - is passed to fit then the classes are selected from the target vector. - Note that the curves must be computed based on what is in the target - vector passed to the ``score()`` method. Class names are used for - labeling only and must be in the correct order to prevent confusion. 
- - fill_area : bool, default=True + classes : list of str, default: None + The class labels to use for the legend ordered by the index of the sorted + classes discovered in the ``fit()`` method. Specifying classes in this + manner is used to change the class names to a more specific format or + to label encoded integer classes. Some visualizers may also use this + field to filter the visualization for specific classes. For more advanced + usage specify an encoder rather than class labels. + + encoder : dict or LabelEncoder, default: None + A mapping of classes to human readable labels. Often there is a mismatch + between desired class labels and those contained in the target variable + passed to ``fit()`` or ``score()``. The encoder disambiguates this mismatch + ensuring that classes are labeled correctly in the visualization. + + fill_area : bool, default: True Fill the area under the curve (or curves) with the curve color. - ap_score : bool, default=True + ap_score : bool, default: True Annotate the graph with the average precision score, a summary of the plot that is computed as the weighted mean of precisions at each threshold, with the increase in recall from the previous threshold used as the weight. - micro : bool, default=True + micro : bool, default: True If multi-class classification, draw the precision-recall curve for the micro-average of all classes. In the multi-class case, either micro or per-class must be set to True. Ignored in the binary case. - iso_f1_curves : bool, default=False + iso_f1_curves : bool, default: False Draw ISO F1-Curves on the plot to show how close the precision-recall curves are to different F1 scores. - per_class : bool, default=False + iso_f1_values : tuple , default: (0.2, 0.4, 0.6, 0.8) + Values of f1 score for which to draw ISO F1-Curves + + per_class : bool, default: False If multi-class classification, draw the precision-recall curve for each class using a OneVsRestClassifier to compute the recall on a per-class basis. 
In the multi-class case, either micro or per-class must be set to True. Ignored in the binary case. - fill_opacity : float, default=0.2 + fill_opacity : float, default: 0.2 Specify the alpha or opacity of the fill area (0 being transparent, and 1.0 being completly opaque). - line_opacity : float, default=0.8 + line_opacity : float, default: 0.8 Specify the alpha or opacity of the lines (0 being transparent, and 1.0 being completly opaque). + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. + + force_model : bool, default: False + Do not check to ensure that the underlying estimator is a classifier. This + will prevent an exception when the visualizer is initialized but may result + in unexpected or unintended behavior. + kwargs : dict - Keyword arguments passed to the visualization base class. + Keyword arguments passed to the visualizer base classes. Returns ------- @@ -455,12 +584,43 @@ def precision_recall_curve(model, X, y, ax=None, train_size=0.8, random_state, and shuffle are specified. Note that splits are not stratified, if required, it is recommended to use the base class. 
""" - # Instantiate the visualizer - viz = PRCurve(model, ax=ax, **kwargs) - # Create train and test splits to validate the model - X_train, X_test, y_train, y_test = tts( - X, y, train_size=train_size, random_state=random_state, shuffle=shuffle + if (X_test is None) and (y_test is None): + # Create train and test splits to validate the model + X_train, X_test, y_train, y_test = tts( + X, y, train_size=train_size, random_state=random_state, shuffle=shuffle + ) + elif any( + [ + ((X_test is not None) and (y_test is None)), + ((X_test is None) and (y_test is not None)), + ] + ): + # exception handling in case of missing X_test or y_test + raise YellowbrickValueError( + "both X_test and y_test are required if one is specified" + ) + + else: + X_train, y_train = X, y + + # Instantiate the visualizer + viz = PRCurve( + model, + ax=ax, + classes=classes, + encoder=encoder, + fill_area=fill_area, + ap_score=ap_score, + micro=micro, + iso_f1_curves=iso_f1_curves, + iso_f1_values=iso_f1_values, + per_class=per_class, + fill_opacity=fill_opacity, + line_opacity=line_opacity, + is_fitted=is_fitted, + force_model=force_model, + **kwargs ) # Fit and transform the visualizer diff --git a/yellowbrick/classifier/rocauc.py b/yellowbrick/classifier/rocauc.py index 45eced42b..5a61e9702 100644 --- a/yellowbrick/classifier/rocauc.py +++ b/yellowbrick/classifier/rocauc.py @@ -1,12 +1,12 @@ # yellowbrick.classifier.rocauc # Implements visual ROC/AUC curves for classification evaluation. 
# -# Author: Rebecca Bilbro -# Author: Benjamin Bengfort +# Author: Rebecca Bilbro +# Author: Benjamin Bengfort # Author: Neal Humphrey -# Created: Wed May 18 12:39:40 2016 -0400 +# Created: Tue May 03 18:15:42 2017 -0400 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: rocauc.py [5388065] neal@nhumphrey.com $ @@ -21,15 +21,15 @@ import numpy as np -from ..exceptions import ModelError -from ..exceptions import YellowbrickValueError -from ..style.palettes import LINE_COLOR -from .base import ClassificationScoreVisualizer - from scipy import interp +from sklearn.metrics import auc, roc_curve from sklearn.preprocessing import label_binarize from sklearn.model_selection import train_test_split -from sklearn.metrics import auc, roc_curve + +from yellowbrick.exceptions import ModelError +from yellowbrick.style.palettes import LINE_COLOR +from yellowbrick.exceptions import YellowbrickValueError +from yellowbrick.classifier.base import ClassificationScoreVisualizer # Dictionary keys for ROCAUC @@ -41,6 +41,7 @@ ## ROCAUC Visualizer ########################################################################## + class ROCAUC(ClassificationScoreVisualizer): """ Receiver Operating Characteristic (ROC) curves are a measure of a @@ -51,7 +52,7 @@ class ROCAUC(ClassificationScoreVisualizer): therefore the top-left corner of the plot: false positives are zero and true positives are one. - This leads to another metric, area under the curve (AUC), a computation + This leads to another metric, area under the curve (AUC), a computation of the relationship between false positives and true positives. The higher the AUC, the better the model generally is. 
However, it is also important to inspect the "steepness" of the curve, as this describes the @@ -62,46 +63,74 @@ class ROCAUC(ClassificationScoreVisualizer): Parameters ---------- model : estimator - Must be a classifier, otherwise raises YellowbrickTypeError + A scikit-learn estimator that should be a classifier. If the model is + not a classifier, an exception is raised. If the internal model is not + fitted, it is fit when the visualizer is fitted, unless otherwise specified + by ``is_fitted``. ax : matplotlib Axes, default: None - The axes to plot the figure on. If None is passed in the current axes - will be used (or generated if required). + The axes to plot the figure on. If not specified the current axes will be + used (or generated if required). - classes : list - A list of class names for the legend. If classes is None and a y value - is passed to fit then the classes are selected from the target vector. - Note that the curves must be computed based on what is in the target - vector passed to the ``score()`` method. Class names are used for - labeling only and must be in the correct order to prevent confusion. - - micro : bool, default = True + micro : bool, default: True Plot the micro-averages ROC curve, computed from the sum of all true positives and false positives across all classes. Micro is not defined for binary classification problems with estimators with only a decision_function method. - macro : bool, default = True + macro : bool, default: True Plot the macro-averages ROC curve, which simply takes the average of curves across all classes. Macro is not defined for binary classification problems with estimators with only a decision_function method. - per_class : bool, default = True + per_class : bool, default: True Plot the ROC curves for each individual class. This should be set to false if only the macro or micro average curves are required. 
Per- class classification is not defined for binary classification problems with estimators with only a decision_function method. - kwargs : keyword arguments passed to the super class. - Currently passing in hard-coded colors for the Receiver Operating - Characteristic curve and the diagonal. - These will be refactored to a default Yellowbrick style. + classes : list of str, default: None + The class labels to use for the legend ordered by the index of the sorted + classes discovered in the ``fit()`` method. Specifying classes in this + manner is used to change the class names to a more specific format or + to label encoded integer classes. Some visualizers may also use this + field to filter the visualization for specific classes. For more advanced + usage specify an encoder rather than class labels. + + encoder : dict or LabelEncoder, default: None + A mapping of classes to human readable labels. Often there is a mismatch + between desired class labels and those contained in the target variable + passed to ``fit()`` or ``score()``. The encoder disambiguates this mismatch + ensuring that classes are labeled correctly in the visualization. + + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. + + force_model : bool, default: False + Do not check to ensure that the underlying estimator is a classifier. This + will prevent an exception when the visualizer is initialized but may result + in unexpected or unintended behavior. + + kwargs : dict + Keyword arguments passed to the visualizer base classes. Attributes ---------- + classes_ : ndarray of shape (n_classes,) + The class labels observed while fitting. 
+ + class_count_ : ndarray of shape (n_classes,) + Number of samples encountered for each class during fitting. + score_ : float - Global accuracy score, unless micro or macro scores are requested + An evaluation metric of the classifier on test data produced when + ``score()`` is called. This metric is between 0 and 1 -- higher scores are + generally better. For classifiers, this score is usually accuracy, but + if micro or macro is specified this returns the corresponding averaged ROC AUC score. Notes ----- @@ -119,7 +148,9 @@ class classification is not defined for binary classification problems ensure the best quality visualization, do not use a LabelEncoder for this and do not pass in class labels. - .. seealso:: http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html + .. seealso:: + http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html + .. todo:: Allow the class list to filter the curves on the visualization. Examples @@ -136,16 +167,33 @@ class classification is not defined for binary classification problems >>> oz.poof() """ - def __init__(self, model, ax=None, classes=None, - micro=True, macro=True, per_class=True, **kwargs): - super(ROCAUC, self).__init__(model, ax=ax, classes=classes, **kwargs) + def __init__( + self, + model, + ax=None, + micro=True, + macro=True, + per_class=True, + classes=None, + encoder=None, + is_fitted="auto", + force_model=False, + **kwargs + ): + super(ROCAUC, self).__init__( + model, + ax=ax, + classes=classes, + encoder=encoder, + is_fitted=is_fitted, + force_model=force_model, + **kwargs + ) # Set the visual parameters for ROCAUC - self.micro = micro - self.macro = macro - self.per_class = per_class + self.set_params(micro=micro, macro=macro, per_class=per_class) - def score(self, X, y=None, **kwargs): + def score(self, X, y=None): """ Generates the predicted target values using the Scikit-Learn estimator. 
@@ -163,6 +211,9 @@ def score(self, X, y=None, **kwargs): score_ : float Global accuracy unless micro or macro scores are requested. """ + # Call super to check if fitted and to compute self.score_ + # NOTE: this sets score to the base score if neither macro nor micro + super(ROCAUC, self).score(X, y) # Compute the predictions for the test data y_pred = self._get_y_scores(X) @@ -205,13 +256,13 @@ def score(self, X, y=None, **kwargs): self.roc_auc = dict() # If the decision is binary, compute the ROC curve and ROC area - if self._binary_decision == True: + if self._binary_decision is True: self.fpr[0], self.tpr[0], _ = roc_curve(y, y_pred) self.roc_auc[0] = auc(self.fpr[0], self.tpr[0]) else: # Otherwise compute the ROC curve and ROC area for each class for i, c in enumerate(classes): - self.fpr[i], self.tpr[i], _ = roc_curve(y, y_pred[:,i], pos_label=c) + self.fpr[i], self.tpr[i], _ = roc_curve(y, y_pred[:, i], pos_label=c) self.roc_auc[i] = auc(self.fpr[i], self.tpr[i]) # Compute micro average @@ -233,9 +284,6 @@ def score(self, X, y=None, **kwargs): if self.macro: self.score_ = self.roc_auc[MACRO] - # Set score to the base score if neither macro nor micro - self.score_ = self.estimator.score(X, y) - return self.score_ def draw(self): @@ -247,50 +295,55 @@ def draw(self): ------- ax : the axis with the plotted figure """ - colors = self.colors[0:len(self.classes_)] + colors = self.colors[0 : len(self.classes_)] n_classes = len(colors) # If it's a binary decision, plot the single ROC curve - if self._binary_decision == True: + if self._binary_decision is True: self.ax.plot( - self.fpr[0], self.tpr[0], - label='ROC for binary decision, AUC = {:0.2f}'.format( - self.roc_auc[0] - ) + self.fpr[0], + self.tpr[0], + label="ROC for binary decision, AUC = {:0.2f}".format(self.roc_auc[0]), ) # If per-class plotting is requested, plot ROC curves for each class if self.per_class: for i, color in zip(range(n_classes), colors): self.ax.plot( - self.fpr[i], self.tpr[i], 
color=color, - label='ROC of class {}, AUC = {:0.2f}'.format( - self.classes_[i], self.roc_auc[i], - ) + self.fpr[i], + self.tpr[i], + color=color, + label="ROC of class {}, AUC = {:0.2f}".format( + self.classes_[i], self.roc_auc[i] + ), ) # If requested, plot the ROC curve for the micro average if self.micro: self.ax.plot( - self.fpr[MICRO], self.tpr[MICRO], linestyle="--", - color= self.colors[len(self.classes_)-1], - label='micro-average ROC curve, AUC = {:0.2f}'.format( - self.roc_auc["micro"], - ) + self.fpr[MICRO], + self.tpr[MICRO], + linestyle="--", + color=self.colors[len(self.classes_) - 1], + label="micro-average ROC curve, AUC = {:0.2f}".format( + self.roc_auc["micro"] + ), ) # If requested, plot the ROC curve for the macro average if self.macro: self.ax.plot( - self.fpr[MACRO], self.tpr[MACRO], linestyle="--", - color= self.colors[len(self.classes_)-1], - label='macro-average ROC curve, AUC = {:0.2f}'.format( - self.roc_auc["macro"], - ) + self.fpr[MACRO], + self.tpr[MACRO], + linestyle="--", + color=self.colors[len(self.classes_) - 1], + label="macro-average ROC curve, AUC = {:0.2f}".format( + self.roc_auc["macro"] + ), ) # Plot the line of no discrimination to compare the curve to. 
- self.ax.plot([0,1], [0,1], linestyle=':', c=LINE_COLOR) + self.ax.plot([0, 1], [0, 1], linestyle=":", c=LINE_COLOR) return self.ax def finalize(self, **kwargs): @@ -304,16 +357,16 @@ def finalize(self, **kwargs): """ # Set the title and add the legend - self.set_title('ROC Curves for {}'.format(self.name)) - self.ax.legend(loc='lower right', frameon=True) + self.set_title("ROC Curves for {}".format(self.name)) + self.ax.legend(loc="lower right", frameon=True) # Set the limits for the ROC/AUC (always between 0 and 1) self.ax.set_xlim([0.0, 1.0]) self.ax.set_ylim([0.0, 1.0]) # Set x and y axis labels - self.ax.set_ylabel('True Postive Rate') - self.ax.set_xlabel('False Positive Rate') + self.ax.set_ylabel("True Postive Rate") + self.ax.set_xlabel("False Positive Rate") def _get_y_scores(self, X): """ @@ -333,10 +386,7 @@ def _get_y_scores(self, X): that is associated with y_true values. """ # The resolution order of scoring functions - attrs = ( - 'predict_proba', - 'decision_function', - ) + attrs = ("predict_proba", "decision_function") # Return the first resolved function for attr in attrs: @@ -365,7 +415,7 @@ def _score_micro_average(self, y, y_pred, classes, n_classes): # Convert y to binarized array for micro and macro scores y = label_binarize(y, classes=classes) if n_classes == 2: - y = np.hstack((1-y, y)) + y = np.hstack((1 - y, y)) # Compute micro-average self.fpr[MICRO], self.tpr[MICRO], _ = roc_curve(y.ravel(), y_pred.ravel()) @@ -396,8 +446,24 @@ def _score_macro_average(self, n_classes): ## Quick method for ROCAUC ########################################################################## -def roc_auc(model, X, y=None, ax=None, **kwargs): - """ROCAUC Quick method: + +def roc_auc( + model, + X, + y, + ax=None, + test_size=0.2, + random_state=None, + micro=True, + macro=True, + per_class=True, + classes=None, + encoder=None, + is_fitted="auto", + force_model=False, + **kwargs +): + """ROCAUC Receiver Operating Characteristic (ROC) curves are a measure of 
a classifier's predictive quality that compares and visualizes the tradeoff @@ -417,9 +483,11 @@ def roc_auc(model, X, y=None, ax=None, **kwargs): Parameters ---------- - model : the Scikit-Learn estimator - Should be an instance of a classifier, else the __init__ will - return an error. + model : estimator + A scikit-learn estimator that should be a classifier. If the model is + not a classifier, an exception is raised. If the internal model is not + fitted, it is fit when the visualizer is fitted, unless otherwise specified + by ``is_fitted``. X : ndarray or DataFrame of shape n x m A matrix of n instances with m features @@ -427,33 +495,62 @@ def roc_auc(model, X, y=None, ax=None, **kwargs): y : ndarray or Series of length n An array or series of target or class values - ax : the axis to plot the figure on. + ax : matplotlib Axes, default: None + The axes to plot the figure on. If not specified the current axes will be + used (or generated if required). - classes : list - A list of class names for the legend. If classes is None and a y value - is passed to fit then the classes are selected from the target vector. - Note that the curves must be computed based on what is in the target - vector passed to the ``score()`` method. Class names are used for - labeling only and must be in the correct order to prevent confusion. + test_size : float, default=0.2 + The percentage of the data to reserve as test data. - micro : bool, default = True + random_state : int or None, default=None + The value to seed the random number generator for shuffling data. + + micro : bool, default: True Plot the micro-averages ROC curve, computed from the sum of all true positives and false positives across all classes. Micro is not defined for binary classification problems with estimators with only a decision_function method. - macro : bool, default = True + macro : bool, default: True Plot the macro-averages ROC curve, which simply takes the average of curves across all classes. 
Macro is not defined for binary classification problems with estimators with only a decision_function method. - per_class : bool, default = True + per_class : bool, default: True Plot the ROC curves for each individual class. This should be set to false if only the macro or micro average curves are required. Per- class classification is not defined for binary classification problems with estimators with only a decision_function method. + classes : list of str, defult: None + The class labels to use for the legend ordered by the index of the sorted + classes discovered in the ``fit()`` method. Specifying classes in this + manner is used to change the class names to a more specific format or + to label encoded integer classes. Some visualizers may also use this + field to filter the visualization for specific classes. For more advanced + usage specify an encoder rather than class labels. + + encoder : dict or LabelEncoder, default: None + A mapping of classes to human readable labels. Often there is a mismatch + between desired class labels and those contained in the target variable + passed to ``fit()`` or ``score()``. The encoder disambiguates this mismatch + ensuring that classes are labeled correctly in the visualization. + + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. + + force_model : bool, default: False + Do not check to ensure that the underlying estimator is a classifier. This + will prevent an exception when the visualizer is initialized but may result + in unexpected or unintended behavior. + + kwargs : dict + Keyword arguments passed to the visualizer base classes. 
+ Notes ----- ROC curves are typically used in binary classification, and in fact the @@ -470,7 +567,7 @@ class classification is not defined for binary classification problems ensure the best quality visualization, do not use a LabelEncoder for this and do not pass in class labels. - .. seealso:: http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html + .. seealso:: https://bit.ly/2IORWO2 .. todo:: Allow the class list to filter the curves on the visualization. Examples @@ -485,19 +582,32 @@ class classification is not defined for binary classification problems Returns ------- - ax : matplotlib axes - Returns the axes that the roc-auc curve was drawn on. + viz : ROCAUC + Returns the fitted, finalized visualizer object """ # Instantiate the visualizer - visualizer = ROCAUC(model, ax, **kwargs) + visualizer = ROCAUC( + model=model, + ax=ax, + micro=micro, + macro=macro, + per_class=per_class, + classes=classes, + encoder=encoder, + is_fitted=is_fitted, + force_model=force_model, + **kwargs + ) # Create the train and test splits - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_size, random_state=random_state + ) # Fit and transform the visualizer (calls draw) visualizer.fit(X_train, y_train, **kwargs) visualizer.score(X_test, y_test) visualizer.finalize() - # Return the axes object on the visualizer - return visualizer.ax + # Return the visualizer + return visualizer diff --git a/yellowbrick/classifier/threshold.py b/yellowbrick/classifier/threshold.py index 0898f8edb..68c70bf69 100644 --- a/yellowbrick/classifier/threshold.py +++ b/yellowbrick/classifier/threshold.py @@ -1,10 +1,13 @@ # yellowbrick.classifier.threshold # DiscriminationThreshold visualizer for probabilistic classifiers. 
# -# Author: Nathan Danielsen -# Author: Benjamin Bengfort +# Author: Nathan Danielsen +# Author: Benjamin Bengfort # Created: Wed April 26 20:17:29 2017 -0700 # +# Copyright (C) 2017 The scikit-yb developers +# For license information, see LICENSE.txt +# # ID: threshold.py [] nathan.danielsen@gmail.com $ """ @@ -15,24 +18,23 @@ ## Imports ########################################################################## -import six import bisect import numpy as np from scipy.stats import mstats from collections import defaultdict -from yellowbrick.base import ModelVisualizer -from yellowbrick.style.colors import resolve_colors -from yellowbrick.utils import is_classifier, is_probabilistic, is_monotonic -from yellowbrick.exceptions import YellowbrickTypeError, YellowbrickValueError - from sklearn.base import clone from sklearn.model_selection import ShuffleSplit from sklearn.metrics import precision_recall_curve from sklearn.utils import indexable, safe_indexing from sklearn.utils.multiclass import type_of_target +from yellowbrick.base import ModelVisualizer +from yellowbrick.style.colors import resolve_colors +from yellowbrick.utils import is_classifier, is_probabilistic, is_monotonic +from yellowbrick.exceptions import YellowbrickTypeError, YellowbrickValueError + # Quantiles for lower bound, curve, and upper bound QUANTILES_MEDIAN_80 = np.array([0.1, 0.5, 0.9]) @@ -45,6 +47,7 @@ # Discrimination Thresholds Visualization ########################################################################## + class DiscriminationThreshold(ModelVisualizer): """ Visualizes how precision, recall, f1 score, and queue rate change as the @@ -70,14 +73,15 @@ class DiscriminationThreshold(ModelVisualizer): Parameters ---------- - model : Classification Estimator - A binary classification estimator that implements ``predict_proba`` or - ``decision_function`` methods. Will raise ``TypeError`` if the model - cannot be used with the visualizer. 
+ model : estimator + A scikit-learn estimator that should be a classifier. If the model is + not a classifier, an exception is raised. If the internal model is not + fitted, it is fit when the visualizer is fitted, unless otherwise specified + by ``is_fitted``. ax : matplotlib Axes, default: None - The axis to plot the figure on. If None is passed in the current axes - will be used (or generated if required). + The axes to plot the figure on. If not specified the current axes will be + used (or generated if required). n_trials : integer, default: 50 Number of times to shuffle and split the dataset to account for noise @@ -128,8 +132,19 @@ class DiscriminationThreshold(ModelVisualizer): Note that if a splitter is provided, it's random state will also be updated with this random state, even if it was previously set. + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. + + force_model : bool, default: False + Do not check to ensure that the underlying estimator is a classifier. This + will prevent an exception when the visualizer is initialized but may result + in unexpected or unintended behavior. + kwargs : dict - Keyword arguments that are passed to the base visualizer class. + Keyword arguments passed to the visualizer base classes. Attributes ---------- @@ -158,16 +173,31 @@ class DiscriminationThreshold(ModelVisualizer): by Insight Data. 
""" - def __init__(self, model, ax=None, n_trials=50, cv=0.1, fbeta=1.0, - argmax='fscore', exclude=None, quantiles=QUANTILES_MEDIAN_80, - random_state=None, **kwargs): + def __init__( + self, + model, + ax=None, + n_trials=50, + cv=0.1, + fbeta=1.0, + argmax="fscore", + exclude=None, + quantiles=QUANTILES_MEDIAN_80, + random_state=None, + is_fitted="auto", + force_model=False, + **kwargs + ): # Perform some quick type checking to help users avoid error. - if not is_classifier(model) or not is_probabilistic(model): + if not force_model and ( + not is_classifier(model) or not is_probabilistic(model) + ): raise YellowbrickTypeError( "{} requires a probabilistic binary classifier".format( - self.__class__.__name__ - )) + self.__class__.__name__ + ) + ) # Check the various inputs self._check_quantiles(quantiles) @@ -175,15 +205,21 @@ def __init__(self, model, ax=None, n_trials=50, cv=0.1, fbeta=1.0, self._check_exclude(exclude) # Initialize the ModelVisualizer - super(DiscriminationThreshold, self).__init__(model, ax=ax, **kwargs) + super(DiscriminationThreshold, self).__init__( + model, ax=ax, is_fitted=is_fitted, **kwargs + ) # Set params self.set_params( - n_trials=n_trials, cv=cv, fbeta=fbeta, argmax=argmax, - exclude=exclude, quantiles=quantiles, random_state=random_state, + n_trials=n_trials, + cv=cv, + fbeta=fbeta, + argmax=argmax, + exclude=exclude, + quantiles=quantiles, + random_state=random_state, ) - def fit(self, X, y, **kwargs): """ Fit is the entry point for the visualizer. Given instances described @@ -213,14 +249,15 @@ def fit(self, X, y, **kwargs): If the target y is not a binary classification target. 
""" # Check target before metrics raise crazy exceptions - if type_of_target(y) != 'binary': + # TODO: Switch to using target type from utils.target + if type_of_target(y) != "binary": raise YellowbrickValueError("multiclass format is not supported") # Make arrays indexable for cross validation X, y = indexable(X, y) # TODO: parallelize trials with joblib (using sklearn utility) - # NOTE: parallelization with matplotlib is tricy at best! + # NOTE: parallelization with matplotlib is tricky at best! trials = [ metric for idx in range(self.n_trials) @@ -228,7 +265,7 @@ def fit(self, X, y, **kwargs): ] # Compute maximum number of uniform thresholds across all trials - n_thresholds = np.array([len(t['thresholds']) for t in trials]).min() + n_thresholds = np.array([len(t["thresholds"]) for t in trials]).min() self.thresholds_ = np.linspace(0.0, 1.0, num=n_thresholds) # Filter metrics and collect values for uniform thresholds @@ -238,7 +275,7 @@ def fit(self, X, y, **kwargs): for trial in trials: rows = defaultdict(list) for t in self.thresholds_: - idx = bisect.bisect_left(trial['thresholds'], t) + idx = bisect.bisect_left(trial["thresholds"], t) for metric in metrics: rows[metric].append(trial[metric][idx]) @@ -247,8 +284,7 @@ def fit(self, X, y, **kwargs): # Convert metrics to metric arrays uniform_metrics = { - metric: np.array(values) - for metric, values in uniform_metrics.items() + metric: np.array(values) for metric, values in uniform_metrics.items() } # Perform aggregation and store cv_scores_ @@ -257,15 +293,17 @@ def fit(self, X, y, **kwargs): for metric, values in uniform_metrics.items(): # Compute the lower, median, and upper plots - lower, median, upper = mstats.mquantiles( - values, prob=quantiles, axis=0 - ) + lower, median, upper = mstats.mquantiles(values, prob=quantiles, axis=0) # Store the aggregates in cv scores self.cv_scores_[metric] = median self.cv_scores_["{}_lower".format(metric)] = lower self.cv_scores_["{}_upper".format(metric)] = upper + # 
TODO: fit the underlying estimator with the best decision threshold + # Call super to ensure the underlying estimator is correctly fitted + super(DiscriminationThreshold, self).fit(X, y) + # Draw and always return self self.draw() return self @@ -298,7 +336,7 @@ def _split_fit_score_trial(self, X, y, idx=0): if hasattr(model, "predict_proba"): # Get the probabilities for the positive class - y_scores = model.predict_proba(X_test)[:,1] + y_scores = model.predict_proba(X_test)[:, 1] else: # Use the decision function to get the scores y_scores = model.decision_function(X_test) @@ -309,26 +347,24 @@ def _split_fit_score_trial(self, X, y, idx=0): # Compute the F1 score from precision and recall # Don't need to warn for F, precision/recall would have warned - with np.errstate(divide='ignore', invalid='ignore'): + with np.errstate(divide="ignore", invalid="ignore"): beta = self.fbeta ** 2 - f_score = ((1 + beta) * precision * recall / - (beta * precision + recall)) + f_score = (1 + beta) * precision * recall / (beta * precision + recall) # Ensure thresholds ends at 1 thresholds = np.append(thresholds, 1) # Compute the queue rate - queue_rate = np.array([ - (y_scores >= threshold).mean() - for threshold in thresholds - ]) + queue_rate = np.array( + [(y_scores >= threshold).mean() for threshold in thresholds] + ) yield { - 'thresholds': thresholds, - 'precision': precision, - 'recall': recall, - 'fscore': f_score, - 'queue_rate': queue_rate + "thresholds": thresholds, + "precision": precision, + "recall": recall, + "fscore": f_score, + "queue_rate": queue_rate, } def draw(self): @@ -357,8 +393,7 @@ def draw(self): # Draw the metric values self.ax.plot( - self.thresholds_, self.cv_scores_[metric], - color=color, label=label + self.thresholds_, self.cv_scores_[metric], color=color, label=label ) # Draw the upper and lower bounds @@ -366,8 +401,7 @@ def draw(self): upper = self.cv_scores_["{}_upper".format(metric)] self.ax.fill_between( - self.thresholds_, upper, lower, - 
alpha=0.35, linewidth=0, color=color + self.thresholds_, upper, lower, alpha=0.35, linewidth=0, color=color ) # Annotate the graph with the maximizing value @@ -375,8 +409,11 @@ def draw(self): argmax = self.cv_scores_[metric].argmax() threshold = self.thresholds_[argmax] self.ax.axvline( - threshold, ls='--', c='k', lw=1, - label="$t_{}={:0.2f}$".format(metric[0], threshold) + threshold, + ls="--", + c="k", + lw=1, + label="$t_{}={:0.2f}$".format(metric[0], threshold), ) return self.ax @@ -395,9 +432,9 @@ def finalize(self, **kwargs): # Set the title of the threshold visualiztion self.set_title("Threshold Plot for {}".format(self.name)) - self.ax.legend(frameon=True, loc='best') - self.ax.set_xlabel('discrimination threshold') - self.ax.set_ylabel('score') + self.ax.legend(frameon=True, loc="best") + self.ax.set_xlabel("discrimination threshold") + self.ax.set_ylabel("score") self.ax.set_xlim(0.0, 1.0) self.ax.set_ylim(0.0, 1.0) @@ -418,21 +455,18 @@ def _check_cv(self, val, random_state=None): validation exception is raised. 
""" # Use default splitter in this case - if val is None: val = 0.1 + if val is None: + val = 0.1 if isinstance(val, float) and val <= 1.0: - return ShuffleSplit( - n_splits=1, test_size=val, random_state=random_state - ) + return ShuffleSplit(n_splits=1, test_size=val, random_state=random_state) if hasattr(val, "split") and hasattr(val, "get_n_splits"): if random_state is not None and hasattr(val, "random_state"): val.random_state = random_state return val - raise YellowbrickValueError( - "'{}' is not a valid cv splitter".format(type(val)) - ) + raise YellowbrickValueError("'{}' is not a valid cv splitter".format(type(val))) def _check_exclude(self, val): """ @@ -440,7 +474,7 @@ def _check_exclude(self, val): """ if val is None: exclude = frozenset() - elif isinstance(val, six.string_types): + elif isinstance(val, str): exclude = frozenset([val.lower()]) else: exclude = frozenset(map(lambda s: s.lower(), val)) @@ -457,11 +491,24 @@ def _check_exclude(self, val): # Quick Methods ########################################################################## -def discrimination_threshold(model, X, y, ax=None, n_trials=50, cv=0.1, - fbeta=1.0, argmax='fscore', exclude=None, - quantiles=QUANTILES_MEDIAN_80, random_state=None, - **kwargs): - """Quick method for DiscriminationThreshold. + +def discrimination_threshold( + model, + X, + y, + ax=None, + n_trials=50, + cv=0.1, + fbeta=1.0, + argmax="fscore", + exclude=None, + quantiles=QUANTILES_MEDIAN_80, + random_state=None, + is_fitted="auto", + force_model=False, + **kwargs +): + """Discrimination Threshold: Visualizes how precision, recall, f1 score, and queue rate change as the discrimination threshold increases. For probabilistic, binary classifiers, @@ -475,10 +522,11 @@ def discrimination_threshold(model, X, y, ax=None, n_trials=50, cv=0.1, Parameters ---------- - model : Classification Estimator - A binary classification estimator that implements ``predict_proba`` or - ``decision_function`` methods. 
Will raise ``TypeError`` if the model - cannot be used with the visualizer. + model : estimator + A scikit-learn estimator that should be a classifier. If the model is + not a classifier, an exception is raised. If the internal model is not + fitted, it is fit when the visualizer is fitted, unless otherwise specified + by ``is_fitted``. X : ndarray or DataFrame of shape n x m A matrix of n instances with m features @@ -488,8 +536,8 @@ def discrimination_threshold(model, X, y, ax=None, n_trials=50, cv=0.1, be a binary classification target. ax : matplotlib Axes, default: None - The axis to plot the figure on. If None is passed in the current axes - will be used (or generated if required). + The axes to plot the figure on. If not specified the current axes will be + used (or generated if required). n_trials : integer, default: 50 Number of times to shuffle and split the dataset to account for noise @@ -540,24 +588,44 @@ def discrimination_threshold(model, X, y, ax=None, n_trials=50, cv=0.1, Note that if a splitter is provided, it's random state will also be updated with this random state, even if it was previously set. + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. + + force_model : bool, default: False + Do not check to ensure that the underlying estimator is a classifier. This + will prevent an exception when the visualizer is initialized but may result + in unexpected or unintended behavior. + kwargs : dict - Keyword arguments that are passed to the base visualizer class. + Keyword arguments passed to the visualizer base classes. Returns ------- - ax : matplotlib axes - Returns the axes that the parallel coordinates were drawn on. 
+ viz : DiscriminationThreshold + Returns the fitted and finalized visualizer object. """ # Instantiate the visualizer visualizer = DiscriminationThreshold( - model, ax=ax, n_trials=n_trials, cv=cv, fbeta=fbeta, argmax=argmax, - exclude=exclude, quantiles=quantiles, random_state=random_state, + model, + ax=ax, + n_trials=n_trials, + cv=cv, + fbeta=fbeta, + argmax=argmax, + exclude=exclude, + quantiles=quantiles, + random_state=random_state, + is_fitted=is_fitted, + force_model=force_model, **kwargs ) # Fit and transform the visualizer (calls draw) visualizer.fit(X, y) - visualizer.poof() + visualizer.finalize() - # Return the axes object on the visualizer - return visualizer.ax + # Return the visualizer + return visualizer diff --git a/yellowbrick/cluster/__init__.py b/yellowbrick/cluster/__init__.py index e783eb6ad..6672c1bc7 100644 --- a/yellowbrick/cluster/__init__.py +++ b/yellowbrick/cluster/__init__.py @@ -1,10 +1,10 @@ # yellowbrick.cluster # Visualizers for Cluster analysis and diagnostics # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Thu Mar 23 17:26:57 2017 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: __init__.py [241edca] benjamin@bengfort.com $ @@ -22,4 +22,4 @@ from .base import * from .elbow import * from .silhouette import * -from .icdm import * +from .icdm import * diff --git a/yellowbrick/cluster/base.py b/yellowbrick/cluster/base.py index 1ae1dce6a..e183eef31 100644 --- a/yellowbrick/cluster/base.py +++ b/yellowbrick/cluster/base.py @@ -1,10 +1,10 @@ # yellowbrick.cluster.base # Base class for cluster visualizers. 
# -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Thu Mar 23 17:28:38 2017 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: base.py [241edca] benjamin@bengfort.com $ @@ -17,21 +17,20 @@ ## Imports ########################################################################## -from ..utils import isclusterer -from ..base import ScoreVisualizer -from ..exceptions import YellowbrickTypeError +from yellowbrick.utils import isclusterer +from yellowbrick.base import ScoreVisualizer +from yellowbrick.exceptions import YellowbrickTypeError ## Packages for export -__all__ = [ - "ClusteringScoreVisualizer", -] +__all__ = ["ClusteringScoreVisualizer"] ########################################################################## ## Clustering Score Visualization Base Object ########################################################################## + class ClusteringScoreVisualizer(ScoreVisualizer): """ Base class for all ScoreVisualizers that evaluate a clustering estimator. @@ -41,11 +40,11 @@ class ClusteringScoreVisualizer(ScoreVisualizer): ``YellowbrickTypeError`` exception is raised. """ - def __init__(self, model, ax=None, **kwargs): - if not isclusterer(model): + def __init__(self, model, ax=None, fig=None, force_model=False, **kwargs): + if not force_model and not isclusterer(model): raise YellowbrickTypeError( "The supplied model is not a clustering estimator; try a " "classifier or regression score visualizer instead!" 
) - - super(ClusteringScoreVisualizer, self).__init__(model, ax=ax, **kwargs) + self.force_model = force_model + super(ClusteringScoreVisualizer, self).__init__(model, ax=ax, fig=fig, **kwargs) diff --git a/yellowbrick/cluster/elbow.py b/yellowbrick/cluster/elbow.py index d08de7c29..bee510b00 100644 --- a/yellowbrick/cluster/elbow.py +++ b/yellowbrick/cluster/elbow.py @@ -1,10 +1,10 @@ # yellowbrick.cluster.elbow # Implements the elbow method for determining the optimal number of clusters. # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Thu Mar 23 22:36:31 2017 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: elbow.py [5a370c8] benjamin@bengfort.com $ @@ -18,31 +18,37 @@ ## Imports ########################################################################## -import collections import time +import warnings import numpy as np import scipy.sparse as sp - -from .base import ClusteringScoreVisualizer -from ..exceptions import YellowbrickValueError +from collections.abc import Iterable from sklearn.metrics import silhouette_score -from sklearn.metrics import calinski_harabaz_score -from sklearn.metrics.pairwise import pairwise_distances from sklearn.preprocessing import LabelEncoder +from sklearn.metrics.pairwise import pairwise_distances + +from yellowbrick.utils import KneeLocator +from yellowbrick.style.palettes import LINE_COLOR +from yellowbrick.cluster.base import ClusteringScoreVisualizer +from yellowbrick.exceptions import YellowbrickValueError, YellowbrickWarning + +try: + from sklearn.metrics import calinski_harabasz_score as chs +except ImportError: + from sklearn.metrics import calinski_harabaz_score as chs ## Packages for export -__all__ = [ - "KElbowVisualizer", "distortion_score" -] +__all__ = ["KElbowVisualizer", "KElbow", "distortion_score", "kelbow_visualizer"] ########################################################################## 
## Metrics ########################################################################## -def distortion_score(X, labels, metric='euclidean'): + +def distortion_score(X, labels, metric="euclidean"): """ Compute the mean distortion of all samples. @@ -65,7 +71,7 @@ def distortion_score(X, labels, metric='euclidean'): The metric to use when calculating distance between instances in a feature array. If metric is a string, it must be one of the options allowed by `sklearn.metrics.pairwise.pairwise_distances - `_ + `_ .. todo:: add sample_size and random_state kwds similar to silhouette_score """ @@ -98,8 +104,8 @@ def distortion_score(X, labels, metric='euclidean'): distances = pairwise_distances(instances, center, metric=metric) distances = distances ** 2 - # Add the mean square distance to the distortion - distortion += distances.mean() + # Add the sum of square distance to the distortion + distortion += distances.sum() return distortion @@ -111,7 +117,7 @@ def distortion_score(X, labels, metric='euclidean'): KELBOW_SCOREMAP = { "distortion": distortion_score, "silhouette": silhouette_score, - "calinski_harabaz": calinski_harabaz_score, + "calinski_harabasz": chs, } @@ -129,12 +135,12 @@ class KElbowVisualizer(ClusteringScoreVisualizer): average score for all clusters. By default, the ``distortion`` score is computed, the sum of square distances from each point to its assigned center. Other metrics can also be used such as the ``silhouette`` score, - the mean silhouette coefficient for all samples or the - ``calinski_harabaz`` score, which computes the ratio of dispersion between + the mean silhouette coefficient for all samples or the + ``calinski_harabasz`` score, which computes the ratio of dispersion between and within clusters. When these overall metrics for each model are plotted, it is possible to - visually determine the best value for K. If the line chart looks like an + visually determine the best value for k. 
If the line chart looks like an arm, then the "elbow" (the point of inflection on the curve) is the best value of k. The "arm" can be either up or down, but if there is a strong inflection point, it is a good indication that the underlying model fits @@ -143,8 +149,8 @@ class KElbowVisualizer(ClusteringScoreVisualizer): Parameters ---------- - model : a Scikit-Learn clusterer - Should be an instance of a clusterer, specifically ``KMeans`` or + model : a scikit-learn clusterer + Should be an instance of an unfitted clusterer, specifically ``KMeans`` or ``MiniBatchKMeans``. If it is not a clusterer, an exception is raised. ax : matplotlib Axes, default: None @@ -164,16 +170,37 @@ class KElbowVisualizer(ClusteringScoreVisualizer): - **distortion**: mean sum of squared distances to centers - **silhouette**: mean ratio of intra-cluster and nearest-cluster distance - - **calinski_harabaz**: ratio of within to between cluster dispersion + - **calinski_harabasz**: ratio of within to between cluster dispersion timings : bool, default: True Display the fitting time per k to evaluate the amount of time required to train the clustering model. + locate_elbow : bool, default: True + Automatically find the "elbow" or "knee" which likely corresponds to the optimal + value of k using the "knee point detection algorithm". The knee point detection + algorithm finds the point of maximum curvature, which in a well-behaved + clustering problem also represents the pivot of the elbow curve. The point is + labeled with a dashed line and annotated with the score and k values. + kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. + Attributes + ---------- + k_scores_ : array of shape (n,) where n is no. of k values + The silhouette score corresponding to each k value. + + k_timers_ : array of shape (n,) where n is no. of k values + The time taken to fit n KMeans model corresponding to each k value. 
+ + elbow_value_ : integer + The optimal value of k. + + elbow_score_ : float + The silhouette score corresponding to the optimal value of k. + Examples -------- @@ -193,60 +220,88 @@ class KElbowVisualizer(ClusteringScoreVisualizer): SSE, also can be used to explore if clustering is a correct choice. For a discussion on the Elbow method, read more at - `Robert Gove's Block `_. - + `Robert Gove's Block website `_. + For more on the knee point detection algorithm see the paper `"Finding a "kneedle" + in a Haystack" `_. + .. seealso:: The scikit-learn documentation for the `silhouette_score - `_ and `calinski_harabaz_score - `_. The default, `distortion_score`, is - implemented in`yellowbrick.cluster.elbow`. + `_ and `calinski_harabasz_score + `_. The default, ``distortion_score``, is + implemented in ``yellowbrick.cluster.elbow``. .. todo:: add parallelization option for performance .. todo:: add different metrics for scores and silhouette .. todo:: add timing information about how long it's taking """ - def __init__(self, model, ax=None, k=10, - metric="distortion", timings=True, **kwargs): + def __init__( + self, + model, + ax=None, + k=10, + metric="distortion", + timings=True, + locate_elbow=True, + **kwargs + ): super(KElbowVisualizer, self).__init__(model, ax=ax, **kwargs) # Get the scoring method if metric not in KELBOW_SCOREMAP: raise YellowbrickValueError( "'{}' is not a defined metric " - "use one of distortion, silhouette, or calinski_harabaz" + "use one of distortion, silhouette, or calinski_harabasz" ) # Store the arguments self.scoring_metric = KELBOW_SCOREMAP[metric] + self.metric = metric self.timings = timings + self.locate_elbow = locate_elbow # Convert K into a tuple argument if an integer if isinstance(k, int): - self.k_values_ = list(range(2, k+1)) - elif isinstance(k, tuple) and len(k) == 2 and \ - all(isinstance(x, (int, np.integer)) for x in k): + self.k_values_ = list(range(2, k + 1)) + elif ( + isinstance(k, tuple) + and len(k) == 2 + and 
all(isinstance(x, (int, np.integer)) for x in k) + ): self.k_values_ = list(range(*k)) - elif isinstance(k, collections.Iterable) and \ - all(isinstance(x, (int, np.integer)) for x in k): + elif isinstance(k, Iterable) and all( + isinstance(x, (int, np.integer)) for x in k + ): self.k_values_ = list(k) else: - raise YellowbrickValueError(( - "Specify an iterable of integers, a range, or maximal K value," - " the value '{}' is not a valid argument for K.".format(k) - )) + raise YellowbrickValueError( + ( + "Specify an iterable of integers, a range, or maximal K value," + " the value '{}' is not a valid argument for K.".format(k) + ) + ) # Holds the values of the silhoutte scores self.k_scores_ = None + # Set Default Elbow Value + self.elbow_value_ = None def fit(self, X, y=None, **kwargs): """ Fits n KMeans models where n is the length of ``self.k_values_``, - storing the silhoutte scores in the ``self.k_scores_`` attribute. + storing the silhouette scores in the ``self.k_scores_`` attribute. + The "elbow" and silhouette score corresponding to it are stored in + ``self.elbow_value`` and ``self.elbow_score`` respectively. This method finishes up by calling draw to create the plot. 
""" self.k_scores_ = [] self.k_timers_ = [] + self.kneedle = None + self.knee_value = None + + if self.locate_elbow: + self.elbow_value_ = None + self.elbow_score_ = None for k in self.k_values_: # Compute the start time for each model @@ -258,9 +313,39 @@ def fit(self, X, y=None, **kwargs): # Append the time and score to our plottable metrics self.k_timers_.append(time.time() - start) - self.k_scores_.append( - self.scoring_metric(X, self.estimator.labels_) + self.k_scores_.append(self.scoring_metric(X, self.estimator.labels_)) + + if self.locate_elbow: + locator_kwargs = { + "distortion": { + "curve_nature": "convex", + "curve_direction": "decreasing", + }, + "silhouette": { + "curve_nature": "concave", + "curve_direction": "increasing", + }, + "calinski_harabasz": { + "curve_nature": "concave", + "curve_direction": "increasing", + }, + }.get(self.metric, {}) + elbow_locator = KneeLocator( + self.k_values_, self.k_scores_, **locator_kwargs ) + if elbow_locator.knee is None: + self.elbow_value_ = None + self.elbow_score_ = 0 + warning_message = ( + "No 'knee' or 'elbow' point detected, " + "pass `locate_elbow=False` to remove the warning" + ) + warnings.warn(warning_message, YellowbrickWarning) + else: + self.elbow_value_ = elbow_locator.knee + self.elbow_score_ = self.k_scores_[ + self.k_values_.index(self.elbow_value_) + ] self.draw() @@ -271,14 +356,26 @@ def draw(self): Draw the elbow curve for the specified scores and values of K. 
""" # Plot the silhouette score against k - self.ax.plot(self.k_values_, self.k_scores_, marker="D", label="score") + self.ax.plot(self.k_values_, self.k_scores_, marker="D") + if self.locate_elbow is True and self.elbow_value_ is not None: + elbow_label = "$elbow at k={}, score={:0.3f}$".format( + self.elbow_value_, self.elbow_score_ + ) + self.ax.axvline( + self.elbow_value_, c=LINE_COLOR, linestyle="--", label=elbow_label + ) # If we're going to plot the timings, create a twinx axis if self.timings: self.axes = [self.ax, self.ax.twinx()] self.axes[1].plot( - self.k_values_, self.k_timers_, label="fit time", - c='g', marker="o", linestyle="--", alpha=0.75, + self.k_values_, + self.k_timers_, + label="fit time", + c="g", + marker="o", + linestyle="--", + alpha=0.75, ) return self.ax @@ -287,21 +384,105 @@ def finalize(self): """ Prepare the figure for rendering by setting the title as well as the X and Y axis labels and adding the legend. + """ # Get the metric name metric = self.scoring_metric.__name__.replace("_", " ").title() # Set the title - self.set_title( - '{} Elbow for {} Clustering'.format(metric, self.name) - ) + self.set_title("{} Elbow for {} Clustering".format(metric, self.name)) # Set the x and y labels - self.ax.set_xlabel('k') + self.ax.set_xlabel("k") self.ax.set_ylabel(metric.lower()) + # set the legend if locate_elbow=True + if self.locate_elbow is True and self.elbow_value_ is not None: + self.ax.legend(loc="best", fontsize="medium") + # Set the second y axis labels if self.timings: self.axes[1].grid(False) - self.axes[1].set_ylabel("fit time (seconds)", color='g') - self.axes[1].tick_params('y', colors='g') + self.axes[1].set_ylabel("fit time (seconds)", color="g") + self.axes[1].tick_params("y", colors="g") + + +# alias +KElbow = KElbowVisualizer + + +########################################################################## +## Quick Method +########################################################################## + + +def 
kelbow_visualizer( + model, + X, + y=None, + k=10, + ax=None, + timings=True, + locate_elbow=True, + metric="distortion", + **kwargs +): + """ + Quick Method: + + model : a Scikit-Learn clusterer + Should be an instance of an unfitted clusterer, specifically ``KMeans`` or + ``MiniBatchKMeans``. If it is not a clusterer, an exception is raised. + + ax : matplotlib Axes, default: None + The axes to plot the figure on. If None is passed in the current axes + will be used (or generated if required). + + k : integer, tuple, or iterable + The k values to compute silhouette scores for. If a single integer + is specified, then will compute the range (2,k). If a tuple of 2 + integers is specified, then k will be in np.arange(k[0], k[1]). + Otherwise, specify an iterable of integers to use as values for k. + + metric : string, default: ``"distortion"`` + Select the scoring metric to evaluate the clusters. The default is the + mean distortion, defined by the sum of squared distances between each + observation and its closest centroid. Other metrics include: + + - **distortion**: mean sum of squared distances to centers + - **silhouette**: mean ratio of intra-cluster and nearest-cluster distance + - **calinski_harabasz**: ratio of within to between cluster dispersion + + timings : bool, default: True + Display the fitting time per k to evaluate the amount of time required + to train the clustering model. + + locate_elbow : bool, default: True + Automatically find the "elbow" or "knee" which likely corresponds to the optimal + value of k using the "knee point detection algorithm". The knee point detection + algorithm finds the point of maximum curvature, which in a well-behaved + clustering problem also represents the pivot of the elbow curve. The point is + labeled with a dashed line and annotated with the score and k values. + + kwargs : dict + Keyword arguments that are passed to the base class and may influence + the visualization as defined in other Visualizers. 
+ + Returns + ------- + viz : KElbowVisualizer + The kelbow visualizer, fitted and finalized. + """ + oz = KElbow( + model, + ax=ax, + k=k, + metric=metric, + timings=timings, + locate_elbow=locate_elbow, + **kwargs + ) + + oz.fit(X, y) + oz.finalize() + return oz diff --git a/yellowbrick/cluster/icdm.py b/yellowbrick/cluster/icdm.py index f24d580ef..7b007845f 100644 --- a/yellowbrick/cluster/icdm.py +++ b/yellowbrick/cluster/icdm.py @@ -1,10 +1,13 @@ # yellowbrick.cluster.icdm # Implements Intercluster Distance Map visualizations. # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Tue Aug 21 11:56:53 2018 -0400 # -# ID: icdm.py [] benjamin@bengfort.com $ +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: icdm.py [2f23976] benjamin@bengfort.com $ """ Implements Intercluster Distance Map visualizations. @@ -20,12 +23,11 @@ from matplotlib.patches import Circle from sklearn.manifold import MDS, TSNE -from .base import ClusteringScoreVisualizer - -from ..utils.timer import Timer -from ..utils.decorators import memoized -from ..utils.helpers import prop_to_size -from ..exceptions import YellowbrickValueError +from yellowbrick.utils.timer import Timer +from yellowbrick.utils.decorators import memoized +from yellowbrick.exceptions import YellowbrickValueError +from yellowbrick.cluster.base import ClusteringScoreVisualizer +from yellowbrick.utils.helpers import prop_to_size, check_fitted try: # Only available in Matplotlib >= 2.0.2 @@ -36,22 +38,26 @@ ## Packages for export __all__ = [ - "InterclusterDistance", "intercluster_distance", - "VALID_EMBEDDING", "VALID_SCORING", + "InterclusterDistance", + "intercluster_distance", + "VALID_EMBEDDING", + "VALID_SCORING", + "ICDM", ] # Valid strings to use for embedding names -VALID_EMBEDDING = {'mds', 'tsne'} +VALID_EMBEDDING = {"mds", "tsne"} # Valid strings to use for scoring names -VALID_SCORING = {'membership',} +VALID_SCORING = {"membership"} 
########################################################################## ## InterclusterDistance Visualizer ########################################################################## + class InterclusterDistance(ClusteringScoreVisualizer): """ Intercluster distance maps display an embedding of the cluster centers in @@ -69,7 +75,9 @@ class InterclusterDistance(ClusteringScoreVisualizer): Should be an instance of a centroidal clustering algorithm (or a hierarchical algorithm with a specified number of clusters). Also accepts some other models like LDA for text clustering. - If it is not a clusterer, an exception is raised. + If it is not a clusterer, an exception is raised. If the estimator + is not fitted, it is fit when the visualizer is fitted, unless + otherwise specified by ``is_fitted``. ax : matplotlib Axes, default: None The axes to plot the figure on. If None is passed in the current axes @@ -116,6 +124,12 @@ class InterclusterDistance(ClusteringScoreVisualizer): random_state : int or RandomState, default: None Fixes the random state for stochastic embedding algorithms. + is_fitted : bool or str, default='auto' + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If 'auto' (default), a helper method will check if the estimator + is fitted before fitting it again. + kwargs : dict Keyword arguments passed to the base class and may influence the feature visualization properties. @@ -136,7 +150,7 @@ class InterclusterDistance(ClusteringScoreVisualizer): Notes ----- - Currently the only two embeddings supportted are MDS and TSNE. Soon to + Currently the only two embeddings supported are MDS and TSNE. Soon to follow will be PCoA and a customized version of PCoA for LDA. The only supported scoring metric is membership, but in the future, silhouette scores and cluster diameter will be added. 
@@ -147,11 +161,21 @@ class InterclusterDistance(ClusteringScoreVisualizer): clusterers that have ``n_components`` and LDA. """ - - def __init__(self, model, ax=None, min_size=400, max_size=25000, - embedding='mds', scoring='membership', - legend=True, legend_loc="lower left", legend_size=1.5, - random_state=None, **kwargs): + def __init__( + self, + model, + ax=None, + min_size=400, + max_size=25000, + embedding="mds", + scoring="membership", + legend=True, + legend_loc="lower left", + legend_size=1.5, + random_state=None, + is_fitted="auto", + **kwargs + ): # Initialize the visualizer bases super(InterclusterDistance, self).__init__(model, ax=ax, **kwargs) @@ -160,14 +184,14 @@ def __init__(self, model, ax=None, min_size=400, max_size=25000, validate_scoring(scoring) # Set decomposition properties - self.embedding = embedding self.scoring = scoring + self.embedding = embedding self.random_state = random_state # Set visual properties + self.legend = legend self.min_size = min_size self.max_size = max_size - self.legend = legend self.legend_loc = legend_loc self.legend_size = legend_size @@ -178,7 +202,7 @@ def __init__(self, model, ax=None, min_size=400, max_size=25000, self.edgecolor = "#2e719399" if self.legend: - self.lax # If legend True, test the version availability + self.lax # If legend True, test the version availability @memoized def lax(self): @@ -188,20 +212,25 @@ def lax(self): is mostly invisible). The legend can then be drawn on this axes. 
""" if inset_locator is None: - raise YellowbrickValueError(( - "intercluster distance map legend requires matplotlib 2.0.2 or greater " - "please upgrade matplotlib or set legend=False on the visualizer" - )) + raise YellowbrickValueError( + ( + "intercluster distance map legend requires matplotlib 2.0.2 or " + "later please upgrade matplotlib or set legend=False " + ) + ) lax = inset_locator.inset_axes( - self.ax, width=self.legend_size, height=self.legend_size, loc=self.legend_loc + self.ax, + width=self.legend_size, + height=self.legend_size, + loc=self.legend_loc, ) lax.set_frame_on(False) lax.set_facecolor("none") lax.grid(False) - lax.set_xlim(-1.4,1.4) - lax.set_ylim(-1.4,1.4) + lax.set_xlim(-1.4, 1.4) + lax.set_ylim(-1.4, 1.4) lax.set_xticks([]) lax.set_yticks([]) @@ -216,12 +245,12 @@ def transformer(self): Creates the internal transformer that maps the cluster center's high dimensional space to its two dimensional space. """ - ttype = self.embedding.lower() # transformer method type + ttype = self.embedding.lower() # transformer method type - if ttype == 'mds': + if ttype == "mds": return MDS(n_components=2, random_state=self.random_state) - if ttype == 'tsne': + if ttype == "tsne": return TSNE(n_components=2, random_state=self.random_state) raise YellowbrickValueError("unknown embedding '{}'".format(ttype)) @@ -235,7 +264,7 @@ def cluster_centers_(self): maintained. """ # TODO: Handle agglomerative clustering and LDA - for attr in ('cluster_centers_',): + for attr in ("cluster_centers_",): try: return getattr(self.estimator, attr) except AttributeError: @@ -243,8 +272,9 @@ def cluster_centers_(self): raise AttributeError( "could not find or make cluster_centers_ for {}".format( - self.estimator.__class__.__name__ - )) + self.estimator.__class__.__name__ + ) + ) def fit(self, X, y=None): """ @@ -252,16 +282,17 @@ def fit(self, X, y=None): into 2D space using the embedding method specified. 
""" with Timer() as self.fit_time_: - # Fit the underlying estimator - self.estimator.fit(X, y) + if not check_fitted(self.estimator, is_fitted_by=self.is_fitted): + # Fit the underlying estimator + self.estimator.fit(X, y) - # Get the centers - # TODO: is this how sklearn stores all centers in the model? - C = self.cluster_centers_ + # Get the centers + # TODO: is this how sklearn stores all centers in the model? + C = self.cluster_centers_ - # Embed the centers in 2D space and get the cluster scores - self.embedded_centers_ = self.transformer.fit_transform(C) - self.scores_ = self._score_clusters(X, y) + # Embed the centers in 2D space and get the cluster scores + self.embedded_centers_ = self.transformer.fit_transform(C) + self.scores_ = self._score_clusters(X, y) # Draw the clusters and fit returns self self.draw() @@ -276,8 +307,12 @@ def draw(self): # Draw the scatter plots with associated sizes on the graph self.ax.scatter( - self.embedded_centers_[:,0], self.embedded_centers_[:,1], - s=sizes, c=self.facecolor, edgecolor=self.edgecolor, linewidth=1, + self.embedded_centers_[:, 0], + self.embedded_centers_[:, 1], + s=sizes, + c=self.facecolor, + edgecolor=self.edgecolor, + linewidth=1, ) # Annotate the clusters with their labels @@ -298,9 +333,11 @@ def finalize(self): sizes if required. """ # Set the default title if a user hasn't supplied one - self.set_title("{} Intercluster Distance Map (via {})".format( - self.estimator.__class__.__name__, self.embedding.upper() - )) + self.set_title( + "{} Intercluster Distance Map (via {})".format( + self.estimator.__class__.__name__, self.embedding.upper() + ) + ) # Create the origin grid and minimalist display self.ax.set_xticks([0]) @@ -315,14 +352,12 @@ def finalize(self): if self.legend: self._make_size_legend() - return self.ax - def _score_clusters(self, X, y=None): """ Determines the "scores" of the cluster, the metric that determines the size of the cluster visualized on the visualization. 
""" - stype = self.scoring.lower() # scoring method name + stype = self.scoring.lower() # scoring method name if stype == "membership": return np.bincount(self.estimator.labels_) @@ -349,48 +384,56 @@ def _make_size_legend(self): # radius of the markers. areas = self._get_cluster_sizes() radii = np.sqrt(areas / np.pi) - scaled = np.interp(radii, (radii.min(), radii.max()), (.1, 1)) + scaled = np.interp(radii, (radii.min(), radii.max()), (0.1, 1)) # Compute the locations of the 25th, 50th, and 75th percentile scores - indices = np.array([ - percentile_index(self.scores_, p) for p in (25, 50, 75) - ]) + indices = np.array([percentile_index(self.scores_, p) for p in (25, 50, 75)]) # Draw size circles annotated with the percentile score as the legend. for idx in indices: # TODO: should the size circle's center be hard coded like this? - center = (-0.30, 1-scaled[idx]) + center = (-0.30, 1 - scaled[idx]) c = Circle( - center, scaled[idx], facecolor="none", edgecolor="#2e7193", - linewidth=1.5, linestyle="--" + center, + scaled[idx], + facecolor="none", + edgecolor="#2e7193", + linewidth=1.5, + linestyle="--", ) self.lax.add_patch(c) # Add annotation to the size circle with the value of the score self.lax.annotate( - self.scores_[idx], (-0.30, 1-(2*scaled[idx])), xytext=(1, 1-(2*scaled[idx])), - arrowprops=dict(arrowstyle="wedge", color="#2e7193"), va='center', ha='center', + self.scores_[idx], + (-0.30, 1 - (2 * scaled[idx])), + xytext=(1, 1 - (2 * scaled[idx])), + arrowprops=dict(arrowstyle="wedge", color="#2e7193"), + va="center", + ha="center", ) # Draw size legend title - self.lax.text(s="membership", x=0, y=1.2, va='center', ha='center') + self.lax.text(s="membership", x=0, y=1.2, va="center", ha="center") # Ensure the current axes is always the main axes after modifying the # inset axes and while drawing. 
plt.sca(self.ax) +# alias +ICDM = InterclusterDistance + ########################################################################## ## Helper Methods ########################################################################## + def percentile_index(a, q): """ Returns the index of the value at the Qth percentile in array a. """ - return np.where( - a==np.percentile(a, q, interpolation='nearest') - )[0][0] + return np.where(a == np.percentile(a, q, interpolation="nearest"))[0][0] def validate_string_param(s, valid, param_name="param"): @@ -400,9 +443,7 @@ def validate_string_param(s, valid, param_name="param"): """ if s.lower() not in valid: raise YellowbrickValueError( - "unknown {} '{}', chose from '{}'".format( - param_name, s, ", ".join(valid) - ) + "unknown {} '{}', chose from '{}'".format(param_name, s, ", ".join(valid)) ) @@ -424,13 +465,24 @@ def validate_scoring(param): ## Quick Method ########################################################################## -def intercluster_distance(model, X, y=None, ax=None, - min_size=400, max_size=25000, - embedding='mds', scoring='membership', - legend=True, legend_loc="lower left", legend_size=1.5, - random_state=None, **kwargs): - """Quick Method: +def intercluster_distance( + model, + X, + y=None, + ax=None, + min_size=400, + max_size=25000, + embedding="mds", + scoring="membership", + legend=True, + legend_loc="lower left", + legend_size=1.5, + random_state=None, + is_fitted="auto", + **kwargs +): + """Quick Method: Intercluster distance maps display an embedding of the cluster centers in 2 dimensions with the distance to other centers preserved. E.g. the closer to centers are in the visualization, the closer they are in the original @@ -446,7 +498,9 @@ def intercluster_distance(model, X, y=None, ax=None, Should be an instance of a centroidal clustering algorithm (or a hierarchical algorithm with a specified number of clusters). Also accepts some other models like LDA for text clustering. 
- If it is not a clusterer, an exception is raised. + If it is not a clusterer, an exception is raised. If the estimator + is not fitted, it is fit when the visualizer is fitted, unless + otherwise specified by ``is_fitted``. X : array-like of shape (n, m) A matrix or data frame with n instances and m features @@ -499,6 +553,12 @@ def intercluster_distance(model, X, y=None, ax=None, random_state : int or RandomState, default: None Fixes the random state for stochastic embedding algorithms. + is_fitted : bool or str, default='auto' + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If 'auto' (default), a helper method will check if the estimator + is fitted before fitting it again. + kwargs : dict Keyword arguments passed to the base class and may influence the feature visualization properties. @@ -509,11 +569,20 @@ def intercluster_distance(model, X, y=None, ax=None, The intercluster distance visualizer, fitted and finalized. """ oz = InterclusterDistance( - model, ax=ax, min_size=min_size, max_size=max_size, embedding=embedding, - scoring=scoring, legend=legend, legend_loc=legend_loc, legend_size=legend_size, - random_state=random_state, **kwargs + model, + ax=ax, + min_size=min_size, + max_size=max_size, + embedding=embedding, + scoring=scoring, + legend=legend, + legend_loc=legend_loc, + legend_size=legend_size, + random_state=random_state, + is_fitted=is_fitted, + **kwargs ) oz.fit(X, y) - oz.poof() + oz.finalize() return oz diff --git a/yellowbrick/cluster/silhouette.py b/yellowbrick/cluster/silhouette.py index b8d7199e7..d37f82a69 100644 --- a/yellowbrick/cluster/silhouette.py +++ b/yellowbrick/cluster/silhouette.py @@ -1,10 +1,11 @@ # yellowbrick.cluster.silhouette # Implements visualizers using the silhouette metric for cluster evaluation. 
# -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort +# Author: Rebecca Bilbro # Created: Mon Mar 27 10:09:24 2017 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # # ID: silhouette.py [57b563b] benjamin@bengfort.com $ @@ -20,22 +21,21 @@ import numpy as np import matplotlib.ticker as ticker -from ..style import color_palette -from .base import ClusteringScoreVisualizer - from sklearn.metrics import silhouette_score, silhouette_samples +from yellowbrick.utils import check_fitted +from yellowbrick.style import resolve_colors +from yellowbrick.cluster.base import ClusteringScoreVisualizer ## Packages for export -__all__ = [ - "SilhouetteVisualizer" -] +__all__ = ["SilhouetteVisualizer", "silhouette_visualizer"] ########################################################################## ## Silhouette Method for K Selection ########################################################################## + class SilhouetteVisualizer(ClusteringScoreVisualizer): """ The Silhouette Visualizer displays the silhouette coefficient for each @@ -60,12 +60,24 @@ class SilhouetteVisualizer(ClusteringScoreVisualizer): ---------- model : a Scikit-Learn clusterer Should be an instance of a centroidal clustering algorithm (``KMeans`` - or ``MiniBatchKMeans``). + or ``MiniBatchKMeans``). If the estimator is not fitted, it is fit when + the visualizer is fitted, unless otherwise specified by ``is_fitted``. ax : matplotlib Axes, default: None The axes to plot the figure on. If None is passed in the current axes will be used (or generated if required). + colors : iterable or string, default: None + A collection of colors to use for each cluster group. If there are + fewer colors than cluster groups, colors will repeat. May also be a + Yellowbrick or matplotlib colormap string. + + is_fitted : bool or str, default='auto' + Specify if the wrapped estimator is already fitted. 
If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If 'auto' (default), a helper method will check if the estimator + is fitted before fitting it again. + kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. @@ -87,6 +99,9 @@ class SilhouetteVisualizer(ClusteringScoreVisualizer): Number of clusters (e.g. n_clusters or k value) passed to internal scikit-learn model. + y_tick_pos_ : array of shape (n_clusters,) + The computed center positions of each cluster on the y-axis + Examples -------- @@ -97,13 +112,18 @@ class SilhouetteVisualizer(ClusteringScoreVisualizer): >>> model.poof() """ - def __init__(self, model, ax=None, **kwargs): + def __init__(self, model, ax=None, colors=None, is_fitted="auto", **kwargs): + + # Initialize the visualizer bases super(SilhouetteVisualizer, self).__init__(model, ax=ax, **kwargs) # Visual Properties - # TODO: Fix the color handling - self.colormap = kwargs.get('colormap', 'set1') - self.color = kwargs.get('color', None) + # Use colors if it is given, otherwise attempt to use colormap which + # which will override colors. If neither is found, default to None. + # The colormap may yet still be found in resolve_colors + self.colors = colors + if "colormap" in kwargs: + self.colors = kwargs["colormap"] def fit(self, X, y=None, **kwargs): """ @@ -113,8 +133,9 @@ def fit(self, X, y=None, **kwargs): # NOTE: Probably this would be better in score, but the standard score # is a little different and I'm not sure how it's used. 
- # Fit the wrapped estimator - self.estimator.fit(X, y, **kwargs) + if not check_fitted(self.estimator, is_fitted_by=self.is_fitted): + # Fit the wrapped estimator + self.estimator.fit(X, y, **kwargs) # Get the properties of the dataset self.n_samples_ = X.shape[0] @@ -145,13 +166,22 @@ def draw(self, labels): """ # Track the positions of the lines being drawn - y_lower = 10 # The bottom of the silhouette + y_lower = 10 # The bottom of the silhouette # Get the colors from the various properties - # TODO: Use resolve_colors instead of this - colors = color_palette(self.colormap, self.n_clusters_) + color_kwargs = {"n_colors": self.n_clusters_} + + if self.colors is None: + color_kwargs["colormap"] = "Set1" + elif isinstance(self.colors, str): + color_kwargs["colormap"] = self.colors + else: + color_kwargs["colors"] = self.colors + + colors = resolve_colors(**color_kwargs) # For each cluster, plot the silhouette scores + self.y_tick_pos_ = [] for idx in range(self.n_clusters_): # Collect silhouette scores for samples in the current cluster . 
@@ -164,19 +194,26 @@ def draw(self, labels): color = colors[idx] self.ax.fill_betweenx( - np.arange(y_lower, y_upper), 0, values, - facecolor=color, edgecolor=color, alpha=0.5 + np.arange(y_lower, y_upper), + 0, + values, + facecolor=color, + edgecolor=color, + alpha=0.5, ) - # Label the silhouette plots with their cluster numbers - self.ax.text(-0.05, y_lower + 0.5 * size, str(idx)) + # Collect the tick position for each cluster + self.y_tick_pos_.append(y_lower + 0.5 * size) # Compute the new y_lower for next plot y_lower = y_upper + 10 # The vertical line for average silhouette score of all the values self.ax.axvline( - x=self.silhouette_score_, color="red", linestyle="--" + x=self.silhouette_score_, + color="red", + linestyle="--", + label="Average Silhouette Score", ) return self.ax @@ -188,18 +225,18 @@ def finalize(self): """ # Set the title - self.set_title(( - "Silhouette Plot of {} Clustering for {} Samples in {} Centers" - ).format( - self.name, self.n_samples_, self.n_clusters_ - )) + self.set_title( + ("Silhouette Plot of {} Clustering for {} Samples in {} Centers").format( + self.name, self.n_samples_, self.n_clusters_ + ) + ) # Set the X and Y limits # The silhouette coefficient can range from -1, 1; # but here we scale the plot according to our visualizations # l_xlim and u_xlim are lower and upper limits of the x-axis, - # set according to our calculated maximum and minimum silhouette score along with necessary padding + # set according to our calculated max and min score with necessary padding l_xlim = max(-1, min(-0.1, round(min(self.silhouette_samples_) - 0.1, 1))) u_xlim = min(1, round(max(self.silhouette_samples_) + 0.1, 1)) self.ax.set_xlim([l_xlim, u_xlim]) @@ -213,5 +250,75 @@ def finalize(self): self.ax.set_ylabel("cluster label") # Set the ticks on the axis object. 
- self.ax.set_yticks([]) # Clear the yaxis labels / ticks - self.ax.xaxis.set_major_locator(ticker.MultipleLocator(0.1)) # Set the ticks at multiples of 0.1 + self.ax.set_yticks(self.y_tick_pos_) + self.ax.set_yticklabels(str(idx) for idx in range(self.n_clusters_)) + # Set the ticks at multiples of 0.1 + self.ax.xaxis.set_major_locator(ticker.MultipleLocator(0.1)) + + # Show legend (Average Silhouette Score axis) + self.ax.legend(loc="best") + + +########################################################################## +## Quick Method +########################################################################## + + +def silhouette_visualizer( + model, X, y=None, ax=None, colors=None, is_fitted="auto", **kwargs +): + """Quick Method: + The Silhouette Visualizer displays the silhouette coefficient for each + sample on a per-cluster basis, visually evaluating the density and + separation between clusters. The score is calculated by averaging the + silhouette coefficient for each sample, computed as the difference + between the average intra-cluster distance and the mean nearest-cluster + distance for each sample, normalized by the maximum value. This produces a + score between -1 and +1, where scores near +1 indicate high separation + and scores near -1 indicate that the samples may have been assigned to + the wrong cluster. + + Parameters + ---------- + model : a Scikit-Learn clusterer + Should be an instance of a centroidal clustering algorithm (``KMeans`` + or ``MiniBatchKMeans``). If the estimator is not fitted, it is fit when + the visualizer is fitted, unless otherwise specified by ``is_fitted``. + + ax : matplotlib Axes, default: None + The axes to plot the figure on. If None is passed in the current axes + will be used (or generated if required). 
+ + X : array-like of shape (n, m) + A matrix or data frame with n instances and m features + + y : array-like of shape (n,), optional + A vector or series representing the target for each instance + + colors : iterable or string, default: None + A collection of colors to use for each cluster group. If there are + fewer colors than cluster groups, colors will repeat. May also be a + Yellowbrick or matplotlib colormap string. + + is_fitted : bool or str, default='auto' + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If 'auto' (default), a helper method will check if the estimator + is fitted before fitting it again. + + kwargs : dict + Keyword arguments that are passed to the base class and may influence + the visualization as defined in other Visualizers. + + Returns + ------- + viz : SilhouetteVisualizer + The silhouette visualizer, fitted and finalized. + """ + oz = SilhouetteVisualizer( + model, ax=ax, colors=colors, is_fitted=is_fitted, **kwargs + ) + + oz.fit(X, y) + oz.finalize() + return oz diff --git a/yellowbrick/contrib/__init__.py b/yellowbrick/contrib/__init__.py index 363954ba8..e9e66dfe4 100644 --- a/yellowbrick/contrib/__init__.py +++ b/yellowbrick/contrib/__init__.py @@ -4,8 +4,7 @@ # core support or still in development. 
# # -# ID: __init__.py [] bilbro@gmail.com $ - +# ID: __init__.py [a60bc41] nathan.danielsen@gmail.com $ from .scatter import ScatterViz, ScatterVisualizer, scatterviz diff --git a/yellowbrick/contrib/classifier/__init__.py b/yellowbrick/contrib/classifier/__init__.py index 3963f69b8..a91152496 100644 --- a/yellowbrick/contrib/classifier/__init__.py +++ b/yellowbrick/contrib/classifier/__init__.py @@ -1,12 +1,12 @@ # yellowbrick.contrib.classifier # Visualizations related to evaluating Scikit-Learn classification models # -# Author: Nathan Danielsen +# Author: Nathan Danielsen # Created: Wed Mar 29 12:39:40 2018 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: __init__.py [5eee25b] nathan.danielsen@gmail.com $ +# ID: __init__.py [a60bc41] nathan.danielsen@gmail.com $ from .boundaries import decisionviz, DecisionBoundariesVisualizer, DecisionViz diff --git a/yellowbrick/contrib/classifier/boundaries.py b/yellowbrick/contrib/classifier/boundaries.py index ece628f52..45047f712 100644 --- a/yellowbrick/contrib/classifier/boundaries.py +++ b/yellowbrick/contrib/classifier/boundaries.py @@ -1,11 +1,13 @@ # yellowbrick.contrib.classifier.boundaries # Decision boundaries classifier visualizer for Yellowbrick. 
# -# Author: Nathan Danielsen +# Author: Nathan Danielsen # Created: Sat Mar 12 14:17:29 2017 -0700 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt +# +# ID: boundaries.py [a60bc41] nathan.danielsen@gmail.com $ import itertools import numpy as np @@ -28,20 +30,28 @@ ########################################################################## # Quick Methods ########################################################################## + + @deprecated("Will be moved to yellowbrick.contrib in v0.8") -def decisionviz(model, - X, - y, - colors=None, - classes=None, - features=None, - show_scatter=True, - step_size=0.0025, - markers=None, - pcolormesh_alpha=0.8, - scatter_alpha=1.0, - title=None, - **kwargs): +def decisionviz( + model, + X, + y, + ax=None, + x_name=None, + y_name=None, + features=None, + classes=None, + show_scatter=True, + step_size=0.0025, + markers=None, + pcolormesh_alpha=0.8, + scatter_alpha=1.0, + encoder=None, + is_fitted="auto", + force_model=False, + **kwargs +): """DecisionBoundariesVisualizer is a bivariate data visualization algorithm that plots the decision boundaries of each class. @@ -50,25 +60,41 @@ def decisionviz(model, Parameters ---------- - model : the Scikit-Learn estimator, required - Should be an instance of a classifier, else the __init__ will - return an error. + model : estimator + A scikit-learn estimator that should be a classifier. If the model is + not a classifier, an exception is raised. If the internal model is not + fitted, it is fit when the visualizer is fitted, unless otherwise specified + by ``is_fitted``. + + X : ndarray or DataFrame of shape n x m + A matrix of n instances with m features + + y : ndarray or Series of length n + An array or series of target or class values + + ax : matplotlib Axes, default: None + The axes to plot the figure on. If not specified the current axes will be + used (or generated if required). 
- x : matrix, required + x_name : string, default: None The feature name that corresponds to a column name or index postion in the matrix that will be plotted against the x-axis - y : array, required + y_name : string, default: None The feature name that corresponds to a column name or index postion in the matrix that will be plotted against the y-axis - classes : a list of class names for the legend, default: None - If classes is None and a y value is passed to fit then the classes - are selected from the target vector. - features : list of strings, default: None The names of the features or columns + classes : list of str, defult: None + The class labels to use for the legend ordered by the index of the sorted + classes discovered in the ``fit()`` method. Specifying classes in this + manner is used to change the class names to a more specific format or + to label encoded integer classes. Some visualizers may also use this + field to filter the visualization for specific classes. For more advanced + usage specify an encoder rather than class labels. + show_scatter : boolean, default: True If boolean is True, then a scatter plot with points will be drawn on top of the decision boundary graph @@ -89,40 +115,63 @@ def decisionviz(model, scatter_alpha : float, default: 1.0 Sets the alpha transparency for the scatter plot points - title : string, default: stringified feature_one and feature_two - Sets the title of the visualization + encoder : dict or LabelEncoder, default: None + A mapping of classes to human readable labels. Often there is a mismatch + between desired class labels and those contained in the target variable + passed to ``fit()`` or ``score()``. The encoder disambiguates this mismatch + ensuring that classes are labeled correctly in the visualization. + + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. 
If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. - kwargs : keyword arguments passed to the super class. + force_model : bool, default: False + Do not check to ensure that the underlying estimator is a classifier. This + will prevent an exception when the visualizer is initialized but may result + in unexpected or unintended behavior. + + kwargs : dict + Keyword arguments passed to the visualizer base classes. Returns ------- - ax : matplotlib axes - Returns the axes that the decision boundaries graph were drawn on. + viz : DecisionBoundariesVisualizer + Returns the fitted and finalized visualizer """ # Instantiate the visualizer - visualizer = DecisionBoundariesVisualizer(model, - X, - y, - colors=colors, - classes=classes, - features=features, - show_scatter=show_scatter, - step_size=step_size, - markers=markers, - pcolormesh_alpha=pcolormesh_alpha, - scatter_alpha=scatter_alpha, - title=title, - **kwargs) + visualizer = DecisionBoundariesVisualizer( + model, + ax=ax, + x=x_name, + y=y_name, + features=features, + classes=classes, + show_scatter=show_scatter, + step_size=step_size, + markers=markers, + pcolormesh_alpha=pcolormesh_alpha, + scatter_alpha=scatter_alpha, + encoder=encoder, + is_fitted=is_fitted, + force_model=force_model, + **kwargs + ) # Fit, draw and poof the visualizer - visualizer.fit_draw_poof(X, y, **kwargs) + visualizer.fit(X, y) + visualizer.finalize() # Return the axes object on the visualizer - return visualizer.ax + return visualizer + ########################################################################## # Static ScatterVisualizer Visualizer ########################################################################## + + @deprecated("Will be moved to yellowbrick.contrib in v0.8") class DecisionBoundariesVisualizer(ClassificationScoreVisualizer): """ @@ -131,10 +180,15 @@ 
class DecisionBoundariesVisualizer(ClassificationScoreVisualizer): Parameters ---------- + model : estimator + A scikit-learn estimator that should be a classifier. If the model is + not a classifier, an exception is raised. If the internal model is not + fitted, it is fit when the visualizer is fitted, unless otherwise specified + by ``is_fitted``. - model : the Scikit-Learn estimator - Should be an instance of a classifier, else the __init__ will - return an error. + ax : matplotlib Axes, default: None + The axes to plot the figure on. If not specified the current axes will be + used (or generated if required). x : string, default: None The feature name that corresponds to a column name or index postion @@ -144,13 +198,17 @@ class DecisionBoundariesVisualizer(ClassificationScoreVisualizer): The feature name that corresponds to a column name or index postion in the matrix that will be plotted against the y-axis - classes : a list of class names for the legend, default: None - If classes is None and a y value is passed to fit then the classes - are selected from the target vector. - features : list of strings, default: None The names of the features or columns + classes : list of str, defult: None + The class labels to use for the legend ordered by the index of the sorted + classes discovered in the ``fit()`` method. Specifying classes in this + manner is used to change the class names to a more specific format or + to label encoded integer classes. Some visualizers may also use this + field to filter the visualization for specific classes. For more advanced + usage specify an encoder rather than class labels. 
+ show_scatter : boolean, default: True If boolean is True, then a scatter plot with points will be drawn on top of the decision boundary graph @@ -171,34 +229,54 @@ class DecisionBoundariesVisualizer(ClassificationScoreVisualizer): scatter_alpha : float, default: 1.0 Sets the alpha transparency for the scatter plot points - title : string, default: stringified feature_one and feature_two - Sets the title of the visualization + encoder : dict or LabelEncoder, default: None + A mapping of classes to human readable labels. Often there is a mismatch + between desired class labels and those contained in the target variable + passed to ``fit()`` or ``score()``. The encoder disambiguates this mismatch + ensuring that classes are labeled correctly in the visualization. + + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. - kwargs : keyword arguments passed to the super class. + force_model : bool, default: False + Do not check to ensure that the underlying estimator is a classifier. This + will prevent an exception when the visualizer is initialized but may result + in unexpected or unintended behavior. - These parameters can be influenced later on in the visualization - process, but can and should be set as early as possible. + kwargs : dict + Keyword arguments passed to the visualizer base classes. """ - def __init__(self, - model, - x=None, - y=None, - features=None, - show_scatter=True, - step_size=0.0025, - markers=None, - pcolormesh_alpha=0.8, - scatter_alpha=1.0, - # title=None, - *args, - **kwargs): - """ - Pass in a unfitted model to generate a decision boundaries - visualization. 
- """ - super(DecisionBoundariesVisualizer, self).__init__(model, *args, **kwargs) + def __init__( + self, + model, + ax=None, + x=None, + y=None, + features=None, + classes=None, + show_scatter=True, + step_size=0.0025, + markers=None, + pcolormesh_alpha=0.8, + scatter_alpha=1.0, + encoder=None, + is_fitted="auto", + force_model=False, + **kwargs + ): + super(DecisionBoundariesVisualizer, self).__init__( + model, + ax=ax, + classes=classes, + encoder=encoder, + is_fitted=is_fitted, + force_model=force_model, + ) self.x = x self.y = y @@ -207,7 +285,8 @@ def __init__(self, self.show_scatter = show_scatter self.step_size = step_size self.markers = itertools.cycle( - kwargs.pop('markers', (',', 'o', 'd', '*', 'v', 'h', '+'))) + kwargs.pop("markers", (",", "o", "d", "*", "v", "h", "+")) + ) self.pcolormesh_alpha = pcolormesh_alpha self.scatter_alpha = scatter_alpha @@ -219,8 +298,7 @@ def __init__(self, self.class_labels = None if self.x is not None and self.y is not None and self.features_ is not None: - raise YellowbrickValueError( - 'Please specify x,y or features, not both.') + raise YellowbrickValueError("Please specify x,y or features, not both.") if self.x is not None and self.y is not None and self.features_ is None: self.features_ = [self.x, self.y] @@ -229,7 +307,8 @@ def __init__(self, if features is not None: if len(features) != 2: raise YellowbrickValueError( - 'DecisionBoundariesVisualizer only accepts two features.') + "DecisionBoundariesVisualizer only accepts two features." 
+ ) def _select_feature_columns(self, X): """ """ @@ -261,10 +340,12 @@ def _select_feature_columns(self, X): X_two_cols = X[:, [int(f_one), int(f_two)]] else: - raise YellowbrickValueError(""" + raise YellowbrickValueError( + """ ScatterVisualizer only accepts two features, please explicitly set these two features in the init kwargs or - pass a matrix/ dataframe in with only two columns.""") + pass a matrix/ dataframe in with only two columns.""" + ) return X_two_cols @@ -291,18 +372,17 @@ def fit(self, X, y=None, **kwargs): Returns the instance of the visualizer """ X = self._select_feature_columns(X) + self.classes_ = self._labels() # Assign each class a unique number for drawing if self.classes_ is None: self.classes_ = { - label: str(kls_num) - for kls_num, label in enumerate(np.unique(y)) + label: str(kls_num) for kls_num, label in enumerate(np.unique(y)) } self.class_labels = None elif len(set(y)) == len(self.classes_): self.classes_ = { - label: str(kls_num) - for kls_num, label in enumerate(self.classes_) + label: str(kls_num) for kls_num, label in enumerate(self.classes_) } self.class_labels = dict(zip(set(y), self.classes_)) else: @@ -316,8 +396,14 @@ def fit(self, X, y=None, **kwargs): # Plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, x_max]x[y_min, y_max]. 
- x_min, x_max = X[:, 0].min() - (X[:, 0].min() * .01), X[:, 0].max() + (X[:, 0].max() * .01) - y_min, y_max = X[:, 1].min() - (X[:, 1].min() * .01), X[:, 1].max() + (X[:, 1].max() * .01) + x_min, x_max = ( + X[:, 0].min() - (X[:, 0].min() * 0.01), + X[:, 0].max() + (X[:, 0].max() * 0.01), + ) + y_min, y_max = ( + X[:, 1].min() - (X[:, 1].min() * 0.01), + X[:, 1].max() + (X[:, 1].max() * 0.01), + ) self.ax.set_xlim([x_min, x_max]) self.ax.set_ylim([y_min, y_max]) @@ -326,7 +412,8 @@ def fit(self, X, y=None, **kwargs): y_step = (y_max - y_min) * self.step_size self.xx, self.yy = np.meshgrid( - np.arange(x_min, x_max, x_step), np.arange(y_min, y_max, y_step)) + np.arange(x_min, x_max, x_step), np.arange(y_min, y_max, y_step) + ) # raise Exception(self.yy.ravel().shape) Z = self.estimator.predict(np.c_[self.xx.ravel(), self.yy.ravel()]) @@ -345,16 +432,17 @@ def draw(self, X, y=None, **kwargs): X = self._select_feature_columns(X) color_cycle = iter( - resolve_colors(colors=self.colors, n_colors=len(self.classes_))) - colors = OrderedDict([(c, next(color_cycle)) - for c in self.classes_.keys()]) + resolve_colors(colors=self.colors, n_colors=len(self.classes_)) + ) + colors = OrderedDict([(c, next(color_cycle)) for c in self.classes_.keys()]) self.ax.pcolormesh( self.xx, self.yy, self.Z_shape, alpha=self.pcolormesh_alpha, - cmap=ListedColormap(colors.values())) + cmap=ListedColormap(colors.values()), + ) # Create a data structure to hold the scatter plot representations to_plot = OrderedDict() @@ -387,13 +475,13 @@ def draw(self, X, y=None, **kwargs): color=colors[kls], alpha=self.scatter_alpha, s=30, - edgecolors='black', + edgecolors="black", label=str(kls), - **kwargs) + **kwargs + ) else: labels = [ - Patch(color=colors[kls], label=kls) - for kls in self.classes_.keys() + Patch(color=colors[kls], label=kls) for kls in self.classes_.keys() ] self.ax.legend(handles=labels) @@ -412,7 +500,7 @@ def finalize(self, **kwargs): self.set_title(self.title) # Add the legend - 
self.ax.legend(loc='best', frameon=True) + self.ax.legend(loc="best", frameon=True) self.ax.set_xlabel(feature_one) self.ax.set_ylabel(feature_two) diff --git a/yellowbrick/contrib/missing/__init__.py b/yellowbrick/contrib/missing/__init__.py index 96ab3aecc..e6a64cd03 100644 --- a/yellowbrick/contrib/missing/__init__.py +++ b/yellowbrick/contrib/missing/__init__.py @@ -1,13 +1,13 @@ # yellowbrick.contrib.missing # Visualizations related to missing values # -# Author: Nathan Danielsen +# Author: Nathan Danielsen # Created: Fri Mar 29 5:17:36 2018 -0500 # -# Copyright (C) 2018 District Data Labs +# Copyright (C) 2018 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: __init__.py [5eee25b] nathan.danielsen@gmail.com $ +# ID: __init__.py [1443e16] ndanielsen@users.noreply.github.com $ from .bar import MissingValuesBar, missing_bar from .dispersion import MissingValuesDispersion, missing_dispersion diff --git a/yellowbrick/contrib/missing/bar.py b/yellowbrick/contrib/missing/bar.py index 7fd1a7d1e..99e0b5c72 100644 --- a/yellowbrick/contrib/missing/bar.py +++ b/yellowbrick/contrib/missing/bar.py @@ -1,13 +1,13 @@ # yellowbrick.contrib.missing.bar # Missing Values Bar Visualizer # -# Author: Nathan Danielsen +# Author: Nathan Danielsen # Created: Fri Mar 29 5:17:36 2018 -0500 # -# Copyright (C) 2018 District Data Labs +# Copyright (C) 2018 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: bar.py [] nathan.danielsen@gmail.com.com $ +# ID: bar.py [1443e16] ndanielsen@users.noreply.github.com $ """ Bar visualizer of missing values by column. @@ -48,7 +48,7 @@ class MissingValuesBar(MissingDataVisualizer): fit. colors : list, default: None - The color pallette for drawing a stack bar chart when the y targets + The color palette for drawing a stack bar chart when the y targets are passed to fit. 
classes : list, default: None @@ -77,7 +77,10 @@ class MissingValuesBar(MissingDataVisualizer): >>> visualizer.poof() """ - def __init__(self, width=0.5, color='black', colors=None, classes=None, **kwargs): + def __init__(self, width=0.5, color=None, colors=None, classes=None, **kwargs): + + if "target_type" not in kwargs: + kwargs["target_type"] = "single" super(MissingValuesBar, self).__init__(**kwargs) self.width = width # the width of the bars self.classes_ = classes @@ -88,17 +91,20 @@ def __init__(self, width=0.5, color='black', colors=None, classes=None, **kwargs self.classes_ = np.array(classes) # Set up classifier score visualization properties + self.color = color if self.classes_ is not None: n_colors = len(self.classes_) else: n_colors = None - self.colors = color_palette(kwargs.pop('colors', None), n_colors) + self.colors = color_palette(kwargs.pop("colors", None), n_colors) def get_nan_col_counts(self, **kwargs): # where matrix contains strings, handle them - if np.issubdtype(self.X.dtype, np.string_) or np.issubdtype(self.X.dtype, np.unicode_): - mask = np.where( self.X == '' ) + if np.issubdtype(self.X.dtype, np.string_) or np.issubdtype( + self.X.dtype, np.unicode_ + ): + mask = np.where(self.X == "") nan_matrix = np.zeros(self.X.shape) nan_matrix[mask] = np.nan @@ -116,7 +122,9 @@ def get_nan_col_counts(self, **kwargs): indices = np.argwhere(self.y == target_value) target_matrix = nan_matrix[indices.flatten()] - nan_col_counts = np.array([np.count_nonzero(np.isnan(col)) for col in target_matrix.T]) + nan_col_counts = np.array( + [np.count_nonzero(np.isnan(col)) for col in target_matrix.T] + ) nan_counts.append((target_value, nan_col_counts)) return nan_counts @@ -134,8 +142,13 @@ def draw(self, X, y, **kwargs): self.ind = np.arange(len(self.features_)) if y is None: - self.ax.barh(self.ind - self.width / 2, nan_col_counts, self.width, - color=self.color, label=None) + self.ax.barh( + self.ind - self.width / 2, + nan_col_counts, + self.width, + 
color=self.color, + label=None, + ) else: self.draw_stacked_bar(nan_col_counts) @@ -156,8 +169,14 @@ def draw_stacked_bar(self, nan_col_counts): color = self.colors[index] - self.ax.barh(self.ind - self.width / 2, nan_col_counts, self.width, - color=color, label=label, left=bottom_chart) + self.ax.barh( + self.ind - self.width / 2, + nan_col_counts, + self.width, + color=color, + label=label, + left=bottom_chart, + ) # keep track of counts to build on stacked bottom_chart = nan_col_counts @@ -173,22 +192,24 @@ def finalize(self, **kwargs): """ # Set the title - self.set_title( - 'Count of Missing Values by Column' - ) - tick_locations = np.arange(len(self.features_)) # the x locations for the groups + self.set_title("Count of Missing Values by Column") + tick_locations = np.arange( + len(self.features_) + ) # the x locations for the groups self.ax.set_yticks(tick_locations) self.ax.set_yticklabels(self.get_feature_names()) # Remove the ticks from the graph - self.ax.set_xlabel('Count') + self.ax.set_xlabel("Count") + + self.ax.legend(loc="best") - self.ax.legend(loc='best') ########################################################################## ## Quick Method ########################################################################## -def missing_bar(X, y=None, ax=None, classes=None, width=0.5, color='black', **kwargs): + +def missing_bar(X, y=None, ax=None, classes=None, width=0.5, color="black", **kwargs): """The MissingValues Bar visualizer creates a bar graph that lists the total count of missing values for each selected feature column. 
diff --git a/yellowbrick/contrib/missing/base.py b/yellowbrick/contrib/missing/base.py index 1c7310b8a..5f6512538 100644 --- a/yellowbrick/contrib/missing/base.py +++ b/yellowbrick/contrib/missing/base.py @@ -1,13 +1,13 @@ # yellowbrick.contrib.missing.base # Base Visualizer for missing values # -# Author: Nathan Danielsen +# Author: Nathan Danielsen # Created: Fri Mar 29 5:17:36 2018 -0500 # -# Copyright (C) 2018 District Data Labs +# Copyright (C) 2018 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: base.py [] nathan.danielsen@gmail.com.com $ +# ID: base.py [1443e16] ndanielsen@users.noreply.github.com $ """ Base classes for missing values visualizers. @@ -16,15 +16,18 @@ ########################################################################## ## Imports ########################################################################## + import numpy as np from yellowbrick.features.base import DataVisualizer from yellowbrick.utils import is_dataframe + ########################################################################## ## Feature Visualizers ########################################################################## + class MissingDataVisualizer(DataVisualizer): """Base class for MissingDataVisualizers. """ @@ -51,6 +54,12 @@ def fit(self, X, y=None, **kwargs): self : instance Returns the instance of the transformer/visualizer """ + # Do not call super here - the data visualizer has been refactored + # to provide increased functionality that is not yet compatible with + # the current implementation. This mimicks the previous functionality. + # TODO: Refactor MissingDataVisualizer to make use of new features. 
+ self.features_ = self.features + if is_dataframe(X): self.X = X.values if self.features_ is None: @@ -60,8 +69,8 @@ def fit(self, X, y=None, **kwargs): self.y = y - super(MissingDataVisualizer, self).fit(X, y, **kwargs) - + self.draw(X, y, **kwargs) + return self def get_feature_names(self): if self.features_ is None: diff --git a/yellowbrick/contrib/missing/dispersion.py b/yellowbrick/contrib/missing/dispersion.py index e51760762..e6f089057 100644 --- a/yellowbrick/contrib/missing/dispersion.py +++ b/yellowbrick/contrib/missing/dispersion.py @@ -1,13 +1,13 @@ # yellowbrick.contrib.missing.dispersion # Missing Values Dispersion Visualizer # -# Author: Nathan Danielsen +# Author: Nathan Danielsen # Created: Fri Mar 29 5:17:36 2018 -0500 # -# Copyright (C) 2018 District Data Labs +# Copyright (C) 2018 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: dispersion.py [] nathan.danielsen@gmail.com.com $ +# ID: dispersion.py [1443e16] ndanielsen@users.noreply.github.com $ """ Dispersion visualizer for locations of missing values by column against index position. 
@@ -27,6 +27,7 @@ ## MissingValues Visualizer ########################################################################## + class MissingValuesDispersion(MissingDataVisualizer): """ The Missing Values Dispersion visualizer shows the locations of missing (nan) @@ -71,6 +72,8 @@ class MissingValuesDispersion(MissingDataVisualizer): def __init__(self, alpha=0.5, marker="|", classes=None, **kwargs): + if "target_type" not in kwargs: + kwargs["target_type"] = "single" super(MissingValuesDispersion, self).__init__(**kwargs) self.alpha = alpha self.marker = marker @@ -87,15 +90,16 @@ def __init__(self, alpha=0.5, marker="|", classes=None, **kwargs): else: n_colors = None - self.colors = color_palette(kwargs.pop('colors', None), n_colors) - + self.colors = color_palette(kwargs.pop("colors", None), n_colors) def get_nan_locs(self, **kwargs): """Gets the locations of nans in feature data and returns the coordinates in the matrix """ - if np.issubdtype(self.X.dtype, np.string_) or np.issubdtype(self.X.dtype, np.unicode_): - mask = np.where( self.X == '' ) + if np.issubdtype(self.X.dtype, np.string_) or np.issubdtype( + self.X.dtype, np.unicode_ + ): + mask = np.where(self.X == "") nan_matrix = np.zeros(self.X.shape) nan_matrix[mask] = np.nan @@ -143,7 +147,9 @@ def draw_multi_dispersion_chart(self, nan_locs): color = self.colors[index] x_, y_ = list(zip(*nan_locations)) - self.ax.scatter(x_, y_, alpha=self.alpha, marker=self.marker, color=color, label=label) + self.ax.scatter( + x_, y_, alpha=self.alpha, marker=self.marker, color=color, label=label + ) def finalize(self, **kwargs): """ @@ -156,24 +162,24 @@ def finalize(self, **kwargs): """ # Set the title - self.set_title( - 'Dispersion of Missing Values by Feature' - ) + self.set_title("Dispersion of Missing Values by Feature") # the x locations for the groups tick_locations = np.arange(len(self.features_)) - self.ax.set_xlabel('Position by index') + self.ax.set_xlabel("Position by index") self.ax.set_yticks(tick_locations) 
self.ax.set_yticklabels(self.get_feature_names()) - self.ax.legend(loc='upper left', prop={'size':5}, bbox_to_anchor=(1,1)) - + self.ax.legend(loc="upper left", prop={"size": 5}, bbox_to_anchor=(1, 1)) ########################################################################## ## Quick Method ########################################################################## -def missing_dispersion(X, y=None, ax=None, classes=None, alpha=0.5, marker="|", **kwargs): + +def missing_dispersion( + X, y=None, ax=None, classes=None, alpha=0.5, marker="|", **kwargs +): """ The Missing Values Dispersion visualizer shows the locations of missing (nan) values in the feature dataset by the order of the index. diff --git a/yellowbrick/contrib/scatter.py b/yellowbrick/contrib/scatter.py index 5206df57b..0d062184c 100644 --- a/yellowbrick/contrib/scatter.py +++ b/yellowbrick/contrib/scatter.py @@ -1,12 +1,13 @@ # yellowbrick.contrib.scatter # Implements a 2d scatter plot for feature analysis. # -# Author: Nathan Danielsen +# Author: Nathan Danielsen # Created: Fri Feb 26 19:40:00 2017 -0400 # +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: scatter.py [fc94ec4] ndanielsen@users.noreply.github.com $ +# ID: scatter.py [a89633e] benjamin@bengfort.com $ """ Implements a 2D scatter plot for feature analysis. """ @@ -29,16 +30,19 @@ # Quick Methods ########################################################################## -def scatterviz(X, - y=None, - ax=None, - features=None, - classes=None, - color=None, - colormap=None, - markers=None, - alpha=1.0, - **kwargs): + +def scatterviz( + X, + y=None, + ax=None, + features=None, + classes=None, + color=None, + colormap=None, + markers=None, + alpha=1.0, + **kwargs +): """Displays a bivariate scatter plot. 
This helper function is a quick wrapper to utilize the ScatterVisualizer @@ -78,26 +82,34 @@ def scatterviz(X, Returns ------- - ax : matplotlib axes - Returns the axes that the parallel coordinates were drawn on. + viz : ScatterVisualizer + Returns the fitted, finalized visualizer """ # Instantiate the visualizer - visualizer = ScatterVisualizer(ax=ax, features=features, classes=classes, - color=color, colormap=colormap, - markers=markers, alpha=alpha, **kwargs) + visualizer = ScatterVisualizer( + ax=ax, + features=features, + classes=classes, + color=color, + colormap=colormap, + markers=markers, + alpha=alpha, + **kwargs + ) # Fit and transform the visualizer (calls draw) visualizer.fit(X, y, **kwargs) visualizer.transform(X) - # Return the axes object on the visualizer - return visualizer.ax + # Return the visualizer object + return visualizer ########################################################################## # Static ScatterVisualizer Visualizer ########################################################################## + class ScatterVisualizer(DataVisualizer): """ ScatterVisualizer is a bivariate feature data visualization algorithm that @@ -149,23 +161,31 @@ class ScatterVisualizer(DataVisualizer): process, but can and should be set as early as possible. """ - def __init__(self, - ax=None, - x=None, - y=None, - features=None, - classes=None, - color=None, - colormap=None, - markers=None, - alpha=1.0, - **kwargs): + def __init__( + self, + ax=None, + x=None, + y=None, + features=None, + classes=None, + color=None, + colormap=None, + markers=None, + alpha=1.0, + **kwargs + ): """ Initialize the base scatter with many of the options required in order to make the visualization work. 
""" - super(ScatterVisualizer, self).__init__(ax, features, classes, color, - colormap, **kwargs) + super(ScatterVisualizer, self).__init__( + ax=ax, + features=features, + classes=classes, + color=color, + colormap=colormap, + **kwargs + ) self.x = x self.y = y @@ -173,23 +193,24 @@ def __init__(self, self.alpha = alpha self.markers = itertools.cycle( - kwargs.pop('markers', (',', '+', 'o', '*', 'v', 'h', 'd'))) + kwargs.pop("markers", (",", "+", "o", "*", "v", "h", "d")) + ) self.color = color self.colormap = colormap - if self.x is not None and self.y is not None and self.features_ is not None: - raise YellowbrickValueError( - 'Please specify x,y or features, not both.') + if self.x is not None and self.y is not None and self.features is not None: + raise YellowbrickValueError("Please specify x,y or features, not both.") - if self.x is not None and self.y is not None and self.features_ is None: - self.features_ = [self.x, self.y] + if self.x is not None and self.y is not None and self.features is None: + self.features = [self.x, self.y] # Ensure with init that features doesn't have more than two features if features is not None: if len(features) != 2: raise YellowbrickValueError( - 'ScatterVisualizer only accepts two features.') + "ScatterVisualizer only accepts two features." + ) def fit(self, X, y=None, **kwargs): """ @@ -215,6 +236,12 @@ def fit(self, X, y=None, **kwargs): """ _, ncols = X.shape + # NOTE: Do not call super for this class, it conflicts with the fit. + # Setting these variables is similar to the old behavior of DataVisualizer. 
+ # TODO: refactor to make use of the new DataVisualizer functionality + self.features_ = self.features + self.classes_ = self.classes + if ncols == 2: X_two_cols = X if self.features_ is None: @@ -227,7 +254,9 @@ def fit(self, X, y=None, **kwargs): # handle numpy named/ structured array elif self.features_ is not None and is_structured_array(X): X_selected = X[self.features_] - X_two_cols = X_selected.copy().view((np.float64, len(X_selected.dtype.names))) + X_two_cols = X_selected.copy().view( + (np.float64, len(X_selected.dtype.names)) + ) # handle features that are numeric columns in ndarray matrix elif self.features_ is not None and has_ndarray_int_columns(self.features_, X): @@ -235,10 +264,12 @@ def fit(self, X, y=None, **kwargs): X_two_cols = X[:, [int(f_one), int(f_two)]] else: - raise YellowbrickValueError(""" + raise YellowbrickValueError( + """ ScatterVisualizer only accepts two features, please explicitly set these two features in the init kwargs or - pass a matrix/ dataframe in with only two columns.""") + pass a matrix/ dataframe in with only two columns.""" + ) # Store the classes for the legend if they're None. if self.classes_ is None: @@ -257,14 +288,12 @@ def draw(self, X, y, **kwargs): is determined by the feature data set. 
""" # Set the axes limits - self.ax.set_xlim([-1,1]) - self.ax.set_ylim([-1,1]) + self.ax.set_xlim([-1, 1]) + self.ax.set_ylim([-1, 1]) # set the colors color_values = resolve_colors( - n_colors=len(self.classes_), - colormap=self.colormap, - colors=self.color + n_colors=len(self.classes_), colormap=self.colormap, colors=self.color ) colors = dict(zip(self.classes_, color_values)) @@ -295,9 +324,10 @@ def draw(self, X, y, **kwargs): color=colors[kls], label=str(kls), alpha=self.alpha, - **kwargs) + **kwargs + ) - self.ax.axis('equal') + self.ax.axis("equal") def finalize(self, **kwargs): """ @@ -313,10 +343,11 @@ def finalize(self, **kwargs): feature_one, feature_two = self.features_ # Set the title - self.set_title('Scatter Plot: {0} vs {1}'.format( - str(feature_one), str(feature_two))) + self.set_title( + "Scatter Plot: {0} vs {1}".format(str(feature_one), str(feature_two)) + ) # Add the legend - self.ax.legend(loc='best') + self.ax.legend(loc="best") self.ax.set_xlabel(str(feature_one)) self.ax.set_ylabel(str(feature_two)) diff --git a/yellowbrick/contrib/statsmodels/__init__.py b/yellowbrick/contrib/statsmodels/__init__.py index 632ec3754..556bad9a9 100644 --- a/yellowbrick/contrib/statsmodels/__init__.py +++ b/yellowbrick/contrib/statsmodels/__init__.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Wed Apr 04 13:13:24 2018 -0400 # -# ID: __init__.py [] benjamin@bengfort.com $ +# ID: __init__.py [d6ebc39] benjamin@bengfort.com $ """ Implements wrappers around hte statsmodels library to use Yellowbrick with. 
diff --git a/yellowbrick/contrib/statsmodels/base.py b/yellowbrick/contrib/statsmodels/base.py index 7860c9a17..3e68edf50 100644 --- a/yellowbrick/contrib/statsmodels/base.py +++ b/yellowbrick/contrib/statsmodels/base.py @@ -4,7 +4,7 @@ # Author: Ian Ozsvald # Created: Wed Jan 10 12:47:00 2018 -0500 # -# ID: base.py [] benjamin@bengfort.com $ +# ID: base.py [d6ebc39] benjamin@bengfort.com $ """ A basic wrapper for statsmodels that emulates a scikit-learn estimator. @@ -22,6 +22,7 @@ ## statsmodels Estimator ########################################################################## + class StatsModelsWrapper(BaseEstimator): """ Wrap a statsmodels GLM as a sklearn (fake) BaseEstimator for YellowBrick. @@ -48,13 +49,13 @@ class StatsModelsWrapper(BaseEstimator): >>> gaussian_model = glm_gaussian_partial(y_train, X_train) - Note - ---- + Notes + ----- .. note:: This wrapper is trivial, options and extra things like weights are not currently handled. """ - def __init__(self, glm_partial, stated_estimator_type="regressor", - scorer=r2_score): + + def __init__(self, glm_partial, stated_estimator_type="regressor", scorer=r2_score): # YellowBrick checks the attribute to see if it is a # regressor/clusterer/classifier diff --git a/yellowbrick/datasaurus.py b/yellowbrick/datasaurus.py index b65d31ccf..68c45a433 100644 --- a/yellowbrick/datasaurus.py +++ b/yellowbrick/datasaurus.py @@ -4,10 +4,10 @@ # Author: Larry Gray # Created: Wed Jun 20 15:17:35 2018 -0400 # -# Copyright (C) 2018 District Data Labs +# Copyright (C) 2018 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: datasaurus.py [] lwgray@gmail.com $ +# ID: datasaurus.py [e49d780] lwgray@gmail.com $ """ Plots a Datasaurus Quartet as an illustration of the importance of visualization. 
@@ -29,222 +29,1175 @@ ########################################################################## DATASAURUS = [ - np.array([[55.3846, 51.5385, 46.1538, 42.8205, 40.7692, 38.7179, 35.641 , - 33.0769, 28.9744, 26.1538, 23.0769, 22.3077, 22.3077, 23.3333, - 25.8974, 29.4872, 32.8205, 35.3846, 40.2564, 44.1026, 46.6667, - 50. , 53.0769, 56.6667, 59.2308, 61.2821, 61.5385, 61.7949, - 57.4359, 54.8718, 52.5641, 48.2051, 49.4872, 51.0256, 45.3846, - 42.8205, 38.7179, 35.1282, 32.5641, 30. , 33.5897, 36.6667, - 38.2051, 29.7436, 29.7436, 30. , 32.0513, 35.8974, 41.0256, - 44.1026, 47.1795, 49.4872, 51.5385, 53.5897, 55.1282, 56.6667, - 59.2308, 62.3077, 64.8718, 67.9487, 70.5128, 71.5385, 71.5385, - 69.4872, 46.9231, 48.2051, 50. , 53.0769, 55.3846, 56.6667, - 56.1538, 53.8462, 51.2821, 50. , 47.9487, 29.7436, 29.7436, - 31.2821, 57.9487, 61.7949, 64.8718, 68.4615, 70.7692, 72.0513, - 73.8462, 75.1282, 76.6667, 77.6923, 79.7436, 81.7949, 83.3333, - 85.1282, 86.4103, 87.9487, 89.4872, 93.3333, 95.3846, 98.2051, - 56.6667, 59.2308, 60.7692, 63.0769, 64.1026, 64.359 , 74.359 , - 71.2821, 67.9487, 65.8974, 63.0769, 61.2821, 58.7179, 55.1282, - 52.3077, 49.7436, 47.4359, 44.8718, 48.7179, 51.2821, 54.1026, - 56.1538, 52.0513, 48.7179, 47.1795, 46.1538, 50.5128, 53.8462, - 57.4359, 60. , 64.1026, 66.9231, 71.2821, 74.359 , 78.2051, - 67.9487, 68.4615, 68.2051, 37.6923, 39.4872, 91.2821, 50. 
, - 47.9487, 44.1026], - [97.1795, 96.0256, 94.4872, 91.4103, 88.3333, 84.8718, 79.8718, - 77.5641, 74.4872, 71.4103, 66.4103, 61.7949, 57.1795, 52.9487, - 51.0256, 51.0256, 51.0256, 51.4103, 51.4103, 52.9487, 54.1026, - 55.2564, 55.641 , 56.0256, 57.9487, 62.1795, 66.4103, 69.1026, - 55.2564, 49.8718, 46.0256, 38.3333, 42.1795, 44.1026, 36.4103, - 32.5641, 31.4103, 30.2564, 32.1795, 36.7949, 41.4103, 45.641 , - 49.1026, 36.0256, 32.1795, 29.1026, 26.7949, 25.2564, 25.2564, - 25.641 , 28.718 , 31.4103, 34.8718, 37.5641, 40.641 , 42.1795, - 44.4872, 46.0256, 46.7949, 47.9487, 53.718 , 60.641 , 64.4872, - 69.4872, 79.8718, 84.1026, 85.2564, 85.2564, 86.0256, 86.0256, - 82.9487, 80.641 , 78.718 , 78.718 , 77.5641, 59.8718, 62.1795, - 62.5641, 99.4872, 99.1026, 97.5641, 94.1026, 91.0256, 86.4103, - 83.3333, 79.1026, 75.2564, 71.4103, 66.7949, 60.2564, 55.2564, - 51.4103, 47.5641, 46.0256, 42.5641, 39.8718, 36.7949, 33.718 , - 40.641 , 38.3333, 33.718 , 29.1026, 25.2564, 24.1026, 22.9487, - 22.9487, 22.1795, 20.2564, 19.1026, 19.1026, 18.3333, 18.3333, - 18.3333, 17.5641, 16.0256, 13.718 , 14.8718, 14.8718, 14.8718, - 14.1026, 12.5641, 11.0256, 9.8718, 6.0256, 9.4872, 10.2564, - 10.2564, 10.641 , 10.641 , 10.641 , 10.641 , 10.641 , 10.641 , - 8.718 , 5.2564, 2.9487, 25.7692, 25.3846, 41.5385, 95.7692, - 95. 
, 92.6923]]), - np.array([[51.20389114, 58.9744699 , 51.87207267, 48.17993079, 41.6832004 , - 37.8904155 , 39.54897369, 39.64957388, 34.75059705, 27.56083529, - 24.63553998, 20.95946481, 20.68914905, 19.28820474, 20.02450057, - 35.469523 , 36.89432765, 39.05554978, 46.95708015, 37.31045274, - 40.009672 , 48.01438668, 53.70377593, 63.06749989, 62.04803251, - 59.83996671, 55.16094182, 61.27978658, 60.83491753, 61.52059065, - 36.91654386, 38.50219967, 48.66437073, 50.2852524 , 42.27633267, - 54.03177562, 37.32935526, 41.38952255, 40.07466666, 35.34968062, - 34.76370042, 37.02662945, 36.45556953, 35.53766421, 20.40894789, - 23.49571047, 29.55754336, 33.00823391, 53.98039918, 52.2343086 , - 59.50307661, 41.16378107, 48.99304012, 59.26928032, 45.469177 , - 62.69126654, 73.42867087, 70.84642611, 71.53901985, 67.62086589, - 72.47095256, 64.81223756, 60.85367987, 67.78949616, 41.60955727, - 53.00302532, 54.71417106, 44.29166872, 49.19172196, 53.10138178, - 51.59984815, 54.37972195, 46.4807681 , 53.17465627, 45.27200294, - 36.03340215, 28.27119417, 25.05480608, 64.758887 , 63.14452748, - 50.42467869, 70.64499626, 63.14904908, 62.82402452, 70.23686951, - 70.04273524, 72.57062345, 75.13071604, 83.29390573, 79.66426228, - 88.43210253, 89.11555901, 89.09219763, 91.72600577, 91.73553876, - 91.50788817, 88.2390019 , 88.5305192 , 55.36516034, 62.56025887, - 58.00666912, 55.06711799, 61.61477596, 68.54314354, 77.70610965, - 68.453046 , 68.25720644, 70.25547467, 65.04432528, 60.09224661, - 52.99202897, 50.14462898, 46.50861419, 43.80703196, 57.81785469, - 50.94049266, 63.49732308, 50.01648295, 58.63676508, 54.73028909, - 65.8755478 , 57.06098271, 46.81990795, 38.35939487, 47.31541578, - 55.05191654, 50.51596026, 49.67741465, 67.28065952, 66.17301826, - 61.08854414, 66.05308577, 72.66998927, 61.5034725 , 68.99502863, - 78.24991617, 36.48198057, 50.96774838, 91.19105361, 55.86376849, - 49.2805948 , 43.36850154], - [83.33977661, 85.49981761, 85.82973763, 85.04511674, 84.0179406 , - 
82.567493 , 80.81260177, 82.66453387, 80.01109099, 72.84782559, - 71.61071483, 66.04149838, 62.72130521, 62.06305936, 61.34262387, - 43.11588495, 47.70655597, 55.54697371, 65.24040739, 45.2587509 , - 60.98658251, 65.71281959, 66.38948204, 64.03500046, 63.84586325, - 64.47676444, 65.23730817, 65.7664025 , 64.60376971, 64.79185504, - 41.09524744, 41.56715562, 30.68066685, 30.33792211, 34.52763612, - 29.67234831, 39.60204231, 37.29605623, 34.6236852 , 47.14107313, - 47.62479992, 44.46229305, 40.79184303, 48.72938687, 32.20303042, - 25.32246815, 21.36477746, 15.98507146, 29.35098671, 29.71167299, - 30.66967394, 34.31575825, 32.03035884, 29.64070177, 33.83119273, - 30.29037383, 48.57785513, 52.28225333, 45.52180616, 38.00655847, - 51.12213482, 62.81091559, 65.49914703, 61.36370155, 83.84868656, - 84.6747986 , 84.04312807, 82.90944121, 85.87622912, 84.54765869, - 84.81982149, 84.24035555, 83.51821167, 84.26056799, 85.23707942, - 53.37168776, 72.84023126, 71.54859792, 82.31522364, 85.23669633, - 85.17474759, 82.43091876, 83.94685535, 84.96618595, 82.17115106, - 80.38502135, 80.97121843, 79.98409314, 70.77843179, 73.93230972, - 64.624247 , 64.00150664, 57.76819305, 52.62335326, 48.97021089, - 53.31265209, 31.47743488, 30.47603101, 30.44585028, 30.44713567, - 30.2537213 , 29.0115352 , 29.99439119, 35.65783217, 20.30426019, - 13.03552859, 12.38463915, 13.25038497, 11.00084148, 11.87211171, - 9.90666848, 12.21154309, 11.20713449, 11.31894489, 10.94514243, - 9.69154713, 11.91406917, 11.93385209, 11.97472107, 11.41288267, - 11.73243636, 9.92056085, 10.49465268, 13.43132262, 12.85345178, - 11.94998862, 9.76559162, 10.38313251, 14.12865153, 12.03791702, - 10.08453441, 13.38022601, 15.23422594, 10.82841448, 13.99431053, - 17.88324091, 15.16276009, 29.67977429, 46.67434284, 85.33648676, - 84.04882283, 84.3321772 ]]), - np.array([[58.21360826, 58.19605369, 58.71823072, 57.27837287, 58.08202049, - 57.48944777, 28.08874132, 28.08546821, 28.08727305, 27.57802522, - 27.77991911, 
28.58899981, 28.7391415 , 27.02460324, 28.8013367 , - 27.18646384, 29.2851466 , 39.4029453 , 28.81132844, 34.30395791, - 29.60276098, 49.11615686, 39.61754583, 43.23308466, 64.89278794, - 62.49014932, 68.98808443, 62.10561863, 32.46184674, 41.32720065, - 44.00714993, 44.07406069, 44.00131524, 45.00630045, 44.44384061, - 42.1787134 , 44.04456562, 41.64045402, 41.93833001, 44.05392751, - 39.20671933, 28.70444923, 31.7086629 , 42.81171147, 43.30061489, - 40.39863291, 40.43569158, 40.93654667, 39.66157367, 40.89925917, - 41.96861683, 40.38340582, 56.53812645, 52.97069128, 54.62095259, - 65.09904439, 63.05599091, 70.96013623, 69.89581924, 70.59589286, - 69.64702143, 77.39298249, 64.40078719, 63.86895983, 56.59442132, - 56.53133729, 59.65215837, 56.6365087 , 58.672288 , 58.22161273, - 57.91466448, 55.31550906, 54.57572859, 54.41309365, 55.0745059 , - 29.43296052, 29.42268607, 29.00561416, 58.46183859, 57.99780474, - 57.54947408, 59.52992846, 58.24939106, 58.02451401, 58.38212449, - 62.56675904, 72.17582431, 79.47276157, 80.35770088, 78.75723614, - 82.54023959, 86.43589719, 79.48868442, 81.53042032, 79.18678857, - 77.89905795, 75.13071421, 76.05801375, 57.61467439, 56.17139753, - 66.2878906 , 67.88171962, 64.0280813 , 77.49665175, 77.63465176, - 77.86372643, 77.33815817, 76.18041653, 77.25265109, 77.41337528, - 76.7318494 , 49.47110541, 42.47653994, 43.59511586, 50.33996967, - 40.74898026, 38.38652558, 38.40401521, 38.76427889, 41.47014233, - 47.15540481, 39.58256675, 41.74024382, 39.31187189, 41.67984769, - 39.08746445, 41.48150286, 77.60608655, 75.98266152, 76.94575724, - 77.54372007, 77.58473984, 76.82230426, 77.34857166, 77.57315269, - 77.97261068, 41.52891976, 43.7225508 , 79.32607818, 56.66397408, - 57.82178923, 58.2431719 ], - [91.88189151, 92.21498865, 90.31053209, 89.90760672, 92.00814501, - 88.08528556, 63.51079443, 63.59019695, 63.12328281, 62.82103866, - 63.51814752, 63.02408057, 62.72086389, 62.90185886, 63.38904039, - 63.55872965, 63.38360583, 51.1508572 , 
61.35785406, 56.54212591, - 60.15734672, 63.66000062, 62.92518796, 63.16521872, 65.81417676, - 74.58428961, 63.2321473 , 75.99087076, 62.88190292, 49.07025127, - 46.44967378, 34.55320389, 33.90420735, 38.29901955, 36.0190833 , - 26.49211948, 35.66223828, 27.09309542, 24.99152298, 33.55639249, - 51.5337157 , 61.7775254 , 58.83775437, 30.02044842, 31.5264262 , - 16.34700838, 20.23267068, 16.91300484, 15.60935558, 20.79852895, - 26.4970726 , 21.39122552, 32.44424547, 29.04019669, 30.34452445, - 27.24155756, 29.70909567, 41.25950129, 43.45375927, 41.96474387, - 44.04444502, 63.37145906, 67.44871845, 70.21373883, 86.92700622, - 87.49981107, 87.80946159, 85.63749556, 90.07716031, 90.41101877, - 89.95380277, 80.25186069, 77.53628847, 78.22908659, 79.81754642, - 60.80177654, 63.06846482, 63.39075133, 90.26532639, 92.15990861, - 90.74890656, 88.32727415, 92.12968148, 91.69442117, 90.55347607, - 77.74393476, 63.12892942, 63.40868612, 63.29543754, 53.33262001, - 56.54105229, 59.79276181, 53.65167426, 56.02536457, 53.23479185, - 51.82245833, 23.37244197, 16.38374969, 33.82244765, 32.11798877, - 26.11710975, 24.23601841, 27.67268551, 14.94852356, 14.46185393, - 14.61067765, 15.89005466, 15.91257375, 15.15151702, 15.22192798, - 16.21684614, 25.06301931, 18.33847356, 19.99420098, 26.47139661, - 16.18214166, 14.58021515, 14.45194845, 14.36559047, 17.27803344, - 22.37793253, 17.64845284, 17.82932431, 15.64071697, 17.74591901, - 15.12230394, 18.04743744, 15.16287254, 16.30692238, 15.85847833, - 15.25394915, 15.83003939, 15.59516532, 15.77452924, 14.78064583, - 14.95569875, 24.91642519, 19.0773278 , 52.90039129, 87.94012501, - 90.69316655, 92.10432787]]), - np.array([[51.14791671, 50.51712581, 50.2074802 , 50.06948192, 50.56284634, - 50.2885278 , 25.58347508, 25.48358339, 25.4435257 , 25.56511342, - 25.92884427, 27.55147826, 27.53046637, 27.09557036, 27.43924961, - 27.87826426, 27.33886892, 27.67840297, 52.63565768, 52.02521411, - 52.88116479, 52.95260731, 52.52055249, 52.34282206, 
51.92759021, - 52.71377449, 50.44380279, 50.21669503, 52.18418011, 52.79209735, - 52.58971986, 52.02884867, 52.72924658, 52.88431329, 52.50930089, - 50.86268433, 50.89149225, 25.8551276 , 26.02564455, 27.89317272, - 27.63996794, 27.8926589 , 52.79773294, 27.58063881, 26.49139853, - 25.98531782, 26.20141928, 25.85756947, 50.70468436, 50.81197535, - 50.56484556, 50.93930391, 50.45885484, 52.90136407, 52.68495344, - 52.50008894, 51.83563726, 76.9954121 , 77.31060048, 77.92604434, - 77.25438834, 76.2431578 , 77.08448437, 75.2280532 , 50.65835477, - 50.20336581, 50.9295477 , 50.17867185, 50.42269806, 50.46422483, - 50.44927033, 49.92838028, 50.48801364, 49.96490538, 50.75210826, - 27.42242921, 27.6740834 , 27.53739532, 52.26334738, 51.73728166, - 75.87096369, 75.24432621, 75.19829529, 75.70104153, 75.47933966, - 75.19456687, 74.82025396, 75.16434049, 75.26335555, 77.75641893, - 77.95443505, 77.08333777, 76.06355025, 77.68201632, 76.87808198, - 76.94850272, 77.86405471, 75.77145009, 52.33156913, 52.59281837, - 50.47704772, 75.29647509, 75.57395413, 75.40052716, 75.87099084, - 75.60588476, 75.89557705, 75.7465632 , 75.14234148, 50.66177956, - 50.69985064, 50.91894087, 50.72525854, 51.26387123, 51.25091965, - 50.78515721, 50.50139658, 50.73367454, 50.71137854, 50.8127449 , - 51.01423295, 50.35352141, 50.43552957, 50.63098196, 51.0668072 , - 50.79235473, 50.55127806, 50.55975806, 75.32597855, 75.04472578, - 75.28708772, 75.23996998, 75.1524592 , 75.96184009, 75.44806251, - 75.75938382, 50.3782623 , 50.53363501, 77.50090732, 50.69112419, - 49.99039495, 50.12718203], - [90.86741233, 89.10239459, 85.4600474 , 83.05766953, 82.93782178, - 82.97525357, 82.91489113, 82.92908498, 82.8742005 , 82.92409777, - 82.82118411, 51.48738653, 51.41484656, 52.07679944, 51.71207905, - 50.70890793, 51.65304675, 51.18198917, 51.41855226, 52.12301105, - 50.62155476, 50.07473901, 51.5024421 , 51.86195209, 52.25779061, - 51.19794432, 82.94182882, 83.75234297, 51.97525067, 51.07339565, - 51.3380902 
, 52.1768375 , 51.20176505, 50.44143545, 51.41620515, - 17.14563109, 17.14132373, 17.08190869, 16.92501353, 50.66196341, - 51.39909748, 50.79528152, 50.68603709, 51.52476126, 17.40539097, - 17.20372213, 17.09382391, 17.11384266, 17.02374454, 17.11492526, - 17.07777732, 16.98102188, 17.03857897, 50.69056272, 51.29446922, - 51.59435617, 52.33576553, 52.04552865, 51.74673004, 50.31866042, - 51.46182482, 52.12368985, 51.9671367 , 82.98566202, 83.11447934, - 82.98265686, 82.84604113, 83.18462233, 82.90990147, 82.93532841, - 83.96992038, 82.99366549, 83.09951912, 83.7083177 , 82.9019501 , - 51.43887623, 51.30411215, 51.59365408, 94.24932783, 92.97911753, - 88.38644174, 83.90349738, 83.46230334, 82.91945886, 82.88405139, - 82.93211578, 82.96238879, 83.03499717, 82.9452793 , 51.15177033, - 50.47557897, 52.15779927, 52.10465206, 51.16563781, 51.8675623 , - 51.90751654, 49.66254553, 17.11125121, 51.87886035, 51.39159152, - 17.04828941, 17.01565319, 17.06219214, 17.04110689, 17.13489391, - 17.06772306, 17.16994971, 17.10571651, 16.75492389, 17.07814052, - 17.08518438, 17.14760476, 16.90746981, 17.16234971, 17.24045586, - 17.18019648, 17.10577072, 16.99296341, 17.08831585, 16.57271805, - 17.22109553, 17.06474308, 17.0651685 , 17.07652235, 17.20885971, - 17.20421434, 17.08465518, 17.09388377, 15.77189199, 17.00426226, - 16.17493491, 17.03184749, 17.0049424 , 16.69484223, 17.04514941, - 16.94292965, 16.94627981, 17.01958137, 50.16698595, 87.51396042, - 83.99735692, 82.99075 ]])] + np.array( + [ + [ + 55.3846, + 51.5385, + 46.1538, + 42.8205, + 40.7692, + 38.7179, + 35.641, + 33.0769, + 28.9744, + 26.1538, + 23.0769, + 22.3077, + 22.3077, + 23.3333, + 25.8974, + 29.4872, + 32.8205, + 35.3846, + 40.2564, + 44.1026, + 46.6667, + 50.0, + 53.0769, + 56.6667, + 59.2308, + 61.2821, + 61.5385, + 61.7949, + 57.4359, + 54.8718, + 52.5641, + 48.2051, + 49.4872, + 51.0256, + 45.3846, + 42.8205, + 38.7179, + 35.1282, + 32.5641, + 30.0, + 33.5897, + 36.6667, + 38.2051, + 29.7436, + 29.7436, + 
30.0, + 32.0513, + 35.8974, + 41.0256, + 44.1026, + 47.1795, + 49.4872, + 51.5385, + 53.5897, + 55.1282, + 56.6667, + 59.2308, + 62.3077, + 64.8718, + 67.9487, + 70.5128, + 71.5385, + 71.5385, + 69.4872, + 46.9231, + 48.2051, + 50.0, + 53.0769, + 55.3846, + 56.6667, + 56.1538, + 53.8462, + 51.2821, + 50.0, + 47.9487, + 29.7436, + 29.7436, + 31.2821, + 57.9487, + 61.7949, + 64.8718, + 68.4615, + 70.7692, + 72.0513, + 73.8462, + 75.1282, + 76.6667, + 77.6923, + 79.7436, + 81.7949, + 83.3333, + 85.1282, + 86.4103, + 87.9487, + 89.4872, + 93.3333, + 95.3846, + 98.2051, + 56.6667, + 59.2308, + 60.7692, + 63.0769, + 64.1026, + 64.359, + 74.359, + 71.2821, + 67.9487, + 65.8974, + 63.0769, + 61.2821, + 58.7179, + 55.1282, + 52.3077, + 49.7436, + 47.4359, + 44.8718, + 48.7179, + 51.2821, + 54.1026, + 56.1538, + 52.0513, + 48.7179, + 47.1795, + 46.1538, + 50.5128, + 53.8462, + 57.4359, + 60.0, + 64.1026, + 66.9231, + 71.2821, + 74.359, + 78.2051, + 67.9487, + 68.4615, + 68.2051, + 37.6923, + 39.4872, + 91.2821, + 50.0, + 47.9487, + 44.1026, + ], + [ + 97.1795, + 96.0256, + 94.4872, + 91.4103, + 88.3333, + 84.8718, + 79.8718, + 77.5641, + 74.4872, + 71.4103, + 66.4103, + 61.7949, + 57.1795, + 52.9487, + 51.0256, + 51.0256, + 51.0256, + 51.4103, + 51.4103, + 52.9487, + 54.1026, + 55.2564, + 55.641, + 56.0256, + 57.9487, + 62.1795, + 66.4103, + 69.1026, + 55.2564, + 49.8718, + 46.0256, + 38.3333, + 42.1795, + 44.1026, + 36.4103, + 32.5641, + 31.4103, + 30.2564, + 32.1795, + 36.7949, + 41.4103, + 45.641, + 49.1026, + 36.0256, + 32.1795, + 29.1026, + 26.7949, + 25.2564, + 25.2564, + 25.641, + 28.718, + 31.4103, + 34.8718, + 37.5641, + 40.641, + 42.1795, + 44.4872, + 46.0256, + 46.7949, + 47.9487, + 53.718, + 60.641, + 64.4872, + 69.4872, + 79.8718, + 84.1026, + 85.2564, + 85.2564, + 86.0256, + 86.0256, + 82.9487, + 80.641, + 78.718, + 78.718, + 77.5641, + 59.8718, + 62.1795, + 62.5641, + 99.4872, + 99.1026, + 97.5641, + 94.1026, + 91.0256, + 86.4103, + 83.3333, + 79.1026, + 
75.2564, + 71.4103, + 66.7949, + 60.2564, + 55.2564, + 51.4103, + 47.5641, + 46.0256, + 42.5641, + 39.8718, + 36.7949, + 33.718, + 40.641, + 38.3333, + 33.718, + 29.1026, + 25.2564, + 24.1026, + 22.9487, + 22.9487, + 22.1795, + 20.2564, + 19.1026, + 19.1026, + 18.3333, + 18.3333, + 18.3333, + 17.5641, + 16.0256, + 13.718, + 14.8718, + 14.8718, + 14.8718, + 14.1026, + 12.5641, + 11.0256, + 9.8718, + 6.0256, + 9.4872, + 10.2564, + 10.2564, + 10.641, + 10.641, + 10.641, + 10.641, + 10.641, + 10.641, + 8.718, + 5.2564, + 2.9487, + 25.7692, + 25.3846, + 41.5385, + 95.7692, + 95.0, + 92.6923, + ], + ] + ), + np.array( + [ + [ + 51.20389114, + 58.9744699, + 51.87207267, + 48.17993079, + 41.6832004, + 37.8904155, + 39.54897369, + 39.64957388, + 34.75059705, + 27.56083529, + 24.63553998, + 20.95946481, + 20.68914905, + 19.28820474, + 20.02450057, + 35.469523, + 36.89432765, + 39.05554978, + 46.95708015, + 37.31045274, + 40.009672, + 48.01438668, + 53.70377593, + 63.06749989, + 62.04803251, + 59.83996671, + 55.16094182, + 61.27978658, + 60.83491753, + 61.52059065, + 36.91654386, + 38.50219967, + 48.66437073, + 50.2852524, + 42.27633267, + 54.03177562, + 37.32935526, + 41.38952255, + 40.07466666, + 35.34968062, + 34.76370042, + 37.02662945, + 36.45556953, + 35.53766421, + 20.40894789, + 23.49571047, + 29.55754336, + 33.00823391, + 53.98039918, + 52.2343086, + 59.50307661, + 41.16378107, + 48.99304012, + 59.26928032, + 45.469177, + 62.69126654, + 73.42867087, + 70.84642611, + 71.53901985, + 67.62086589, + 72.47095256, + 64.81223756, + 60.85367987, + 67.78949616, + 41.60955727, + 53.00302532, + 54.71417106, + 44.29166872, + 49.19172196, + 53.10138178, + 51.59984815, + 54.37972195, + 46.4807681, + 53.17465627, + 45.27200294, + 36.03340215, + 28.27119417, + 25.05480608, + 64.758887, + 63.14452748, + 50.42467869, + 70.64499626, + 63.14904908, + 62.82402452, + 70.23686951, + 70.04273524, + 72.57062345, + 75.13071604, + 83.29390573, + 79.66426228, + 88.43210253, + 89.11555901, + 
89.09219763, + 91.72600577, + 91.73553876, + 91.50788817, + 88.2390019, + 88.5305192, + 55.36516034, + 62.56025887, + 58.00666912, + 55.06711799, + 61.61477596, + 68.54314354, + 77.70610965, + 68.453046, + 68.25720644, + 70.25547467, + 65.04432528, + 60.09224661, + 52.99202897, + 50.14462898, + 46.50861419, + 43.80703196, + 57.81785469, + 50.94049266, + 63.49732308, + 50.01648295, + 58.63676508, + 54.73028909, + 65.8755478, + 57.06098271, + 46.81990795, + 38.35939487, + 47.31541578, + 55.05191654, + 50.51596026, + 49.67741465, + 67.28065952, + 66.17301826, + 61.08854414, + 66.05308577, + 72.66998927, + 61.5034725, + 68.99502863, + 78.24991617, + 36.48198057, + 50.96774838, + 91.19105361, + 55.86376849, + 49.2805948, + 43.36850154, + ], + [ + 83.33977661, + 85.49981761, + 85.82973763, + 85.04511674, + 84.0179406, + 82.567493, + 80.81260177, + 82.66453387, + 80.01109099, + 72.84782559, + 71.61071483, + 66.04149838, + 62.72130521, + 62.06305936, + 61.34262387, + 43.11588495, + 47.70655597, + 55.54697371, + 65.24040739, + 45.2587509, + 60.98658251, + 65.71281959, + 66.38948204, + 64.03500046, + 63.84586325, + 64.47676444, + 65.23730817, + 65.7664025, + 64.60376971, + 64.79185504, + 41.09524744, + 41.56715562, + 30.68066685, + 30.33792211, + 34.52763612, + 29.67234831, + 39.60204231, + 37.29605623, + 34.6236852, + 47.14107313, + 47.62479992, + 44.46229305, + 40.79184303, + 48.72938687, + 32.20303042, + 25.32246815, + 21.36477746, + 15.98507146, + 29.35098671, + 29.71167299, + 30.66967394, + 34.31575825, + 32.03035884, + 29.64070177, + 33.83119273, + 30.29037383, + 48.57785513, + 52.28225333, + 45.52180616, + 38.00655847, + 51.12213482, + 62.81091559, + 65.49914703, + 61.36370155, + 83.84868656, + 84.6747986, + 84.04312807, + 82.90944121, + 85.87622912, + 84.54765869, + 84.81982149, + 84.24035555, + 83.51821167, + 84.26056799, + 85.23707942, + 53.37168776, + 72.84023126, + 71.54859792, + 82.31522364, + 85.23669633, + 85.17474759, + 82.43091876, + 83.94685535, + 
84.96618595, + 82.17115106, + 80.38502135, + 80.97121843, + 79.98409314, + 70.77843179, + 73.93230972, + 64.624247, + 64.00150664, + 57.76819305, + 52.62335326, + 48.97021089, + 53.31265209, + 31.47743488, + 30.47603101, + 30.44585028, + 30.44713567, + 30.2537213, + 29.0115352, + 29.99439119, + 35.65783217, + 20.30426019, + 13.03552859, + 12.38463915, + 13.25038497, + 11.00084148, + 11.87211171, + 9.90666848, + 12.21154309, + 11.20713449, + 11.31894489, + 10.94514243, + 9.69154713, + 11.91406917, + 11.93385209, + 11.97472107, + 11.41288267, + 11.73243636, + 9.92056085, + 10.49465268, + 13.43132262, + 12.85345178, + 11.94998862, + 9.76559162, + 10.38313251, + 14.12865153, + 12.03791702, + 10.08453441, + 13.38022601, + 15.23422594, + 10.82841448, + 13.99431053, + 17.88324091, + 15.16276009, + 29.67977429, + 46.67434284, + 85.33648676, + 84.04882283, + 84.3321772, + ], + ] + ), + np.array( + [ + [ + 58.21360826, + 58.19605369, + 58.71823072, + 57.27837287, + 58.08202049, + 57.48944777, + 28.08874132, + 28.08546821, + 28.08727305, + 27.57802522, + 27.77991911, + 28.58899981, + 28.7391415, + 27.02460324, + 28.8013367, + 27.18646384, + 29.2851466, + 39.4029453, + 28.81132844, + 34.30395791, + 29.60276098, + 49.11615686, + 39.61754583, + 43.23308466, + 64.89278794, + 62.49014932, + 68.98808443, + 62.10561863, + 32.46184674, + 41.32720065, + 44.00714993, + 44.07406069, + 44.00131524, + 45.00630045, + 44.44384061, + 42.1787134, + 44.04456562, + 41.64045402, + 41.93833001, + 44.05392751, + 39.20671933, + 28.70444923, + 31.7086629, + 42.81171147, + 43.30061489, + 40.39863291, + 40.43569158, + 40.93654667, + 39.66157367, + 40.89925917, + 41.96861683, + 40.38340582, + 56.53812645, + 52.97069128, + 54.62095259, + 65.09904439, + 63.05599091, + 70.96013623, + 69.89581924, + 70.59589286, + 69.64702143, + 77.39298249, + 64.40078719, + 63.86895983, + 56.59442132, + 56.53133729, + 59.65215837, + 56.6365087, + 58.672288, + 58.22161273, + 57.91466448, + 55.31550906, + 54.57572859, + 
54.41309365, + 55.0745059, + 29.43296052, + 29.42268607, + 29.00561416, + 58.46183859, + 57.99780474, + 57.54947408, + 59.52992846, + 58.24939106, + 58.02451401, + 58.38212449, + 62.56675904, + 72.17582431, + 79.47276157, + 80.35770088, + 78.75723614, + 82.54023959, + 86.43589719, + 79.48868442, + 81.53042032, + 79.18678857, + 77.89905795, + 75.13071421, + 76.05801375, + 57.61467439, + 56.17139753, + 66.2878906, + 67.88171962, + 64.0280813, + 77.49665175, + 77.63465176, + 77.86372643, + 77.33815817, + 76.18041653, + 77.25265109, + 77.41337528, + 76.7318494, + 49.47110541, + 42.47653994, + 43.59511586, + 50.33996967, + 40.74898026, + 38.38652558, + 38.40401521, + 38.76427889, + 41.47014233, + 47.15540481, + 39.58256675, + 41.74024382, + 39.31187189, + 41.67984769, + 39.08746445, + 41.48150286, + 77.60608655, + 75.98266152, + 76.94575724, + 77.54372007, + 77.58473984, + 76.82230426, + 77.34857166, + 77.57315269, + 77.97261068, + 41.52891976, + 43.7225508, + 79.32607818, + 56.66397408, + 57.82178923, + 58.2431719, + ], + [ + 91.88189151, + 92.21498865, + 90.31053209, + 89.90760672, + 92.00814501, + 88.08528556, + 63.51079443, + 63.59019695, + 63.12328281, + 62.82103866, + 63.51814752, + 63.02408057, + 62.72086389, + 62.90185886, + 63.38904039, + 63.55872965, + 63.38360583, + 51.1508572, + 61.35785406, + 56.54212591, + 60.15734672, + 63.66000062, + 62.92518796, + 63.16521872, + 65.81417676, + 74.58428961, + 63.2321473, + 75.99087076, + 62.88190292, + 49.07025127, + 46.44967378, + 34.55320389, + 33.90420735, + 38.29901955, + 36.0190833, + 26.49211948, + 35.66223828, + 27.09309542, + 24.99152298, + 33.55639249, + 51.5337157, + 61.7775254, + 58.83775437, + 30.02044842, + 31.5264262, + 16.34700838, + 20.23267068, + 16.91300484, + 15.60935558, + 20.79852895, + 26.4970726, + 21.39122552, + 32.44424547, + 29.04019669, + 30.34452445, + 27.24155756, + 29.70909567, + 41.25950129, + 43.45375927, + 41.96474387, + 44.04444502, + 63.37145906, + 67.44871845, + 70.21373883, + 
86.92700622, + 87.49981107, + 87.80946159, + 85.63749556, + 90.07716031, + 90.41101877, + 89.95380277, + 80.25186069, + 77.53628847, + 78.22908659, + 79.81754642, + 60.80177654, + 63.06846482, + 63.39075133, + 90.26532639, + 92.15990861, + 90.74890656, + 88.32727415, + 92.12968148, + 91.69442117, + 90.55347607, + 77.74393476, + 63.12892942, + 63.40868612, + 63.29543754, + 53.33262001, + 56.54105229, + 59.79276181, + 53.65167426, + 56.02536457, + 53.23479185, + 51.82245833, + 23.37244197, + 16.38374969, + 33.82244765, + 32.11798877, + 26.11710975, + 24.23601841, + 27.67268551, + 14.94852356, + 14.46185393, + 14.61067765, + 15.89005466, + 15.91257375, + 15.15151702, + 15.22192798, + 16.21684614, + 25.06301931, + 18.33847356, + 19.99420098, + 26.47139661, + 16.18214166, + 14.58021515, + 14.45194845, + 14.36559047, + 17.27803344, + 22.37793253, + 17.64845284, + 17.82932431, + 15.64071697, + 17.74591901, + 15.12230394, + 18.04743744, + 15.16287254, + 16.30692238, + 15.85847833, + 15.25394915, + 15.83003939, + 15.59516532, + 15.77452924, + 14.78064583, + 14.95569875, + 24.91642519, + 19.0773278, + 52.90039129, + 87.94012501, + 90.69316655, + 92.10432787, + ], + ] + ), + np.array( + [ + [ + 51.14791671, + 50.51712581, + 50.2074802, + 50.06948192, + 50.56284634, + 50.2885278, + 25.58347508, + 25.48358339, + 25.4435257, + 25.56511342, + 25.92884427, + 27.55147826, + 27.53046637, + 27.09557036, + 27.43924961, + 27.87826426, + 27.33886892, + 27.67840297, + 52.63565768, + 52.02521411, + 52.88116479, + 52.95260731, + 52.52055249, + 52.34282206, + 51.92759021, + 52.71377449, + 50.44380279, + 50.21669503, + 52.18418011, + 52.79209735, + 52.58971986, + 52.02884867, + 52.72924658, + 52.88431329, + 52.50930089, + 50.86268433, + 50.89149225, + 25.8551276, + 26.02564455, + 27.89317272, + 27.63996794, + 27.8926589, + 52.79773294, + 27.58063881, + 26.49139853, + 25.98531782, + 26.20141928, + 25.85756947, + 50.70468436, + 50.81197535, + 50.56484556, + 50.93930391, + 50.45885484, + 
52.90136407, + 52.68495344, + 52.50008894, + 51.83563726, + 76.9954121, + 77.31060048, + 77.92604434, + 77.25438834, + 76.2431578, + 77.08448437, + 75.2280532, + 50.65835477, + 50.20336581, + 50.9295477, + 50.17867185, + 50.42269806, + 50.46422483, + 50.44927033, + 49.92838028, + 50.48801364, + 49.96490538, + 50.75210826, + 27.42242921, + 27.6740834, + 27.53739532, + 52.26334738, + 51.73728166, + 75.87096369, + 75.24432621, + 75.19829529, + 75.70104153, + 75.47933966, + 75.19456687, + 74.82025396, + 75.16434049, + 75.26335555, + 77.75641893, + 77.95443505, + 77.08333777, + 76.06355025, + 77.68201632, + 76.87808198, + 76.94850272, + 77.86405471, + 75.77145009, + 52.33156913, + 52.59281837, + 50.47704772, + 75.29647509, + 75.57395413, + 75.40052716, + 75.87099084, + 75.60588476, + 75.89557705, + 75.7465632, + 75.14234148, + 50.66177956, + 50.69985064, + 50.91894087, + 50.72525854, + 51.26387123, + 51.25091965, + 50.78515721, + 50.50139658, + 50.73367454, + 50.71137854, + 50.8127449, + 51.01423295, + 50.35352141, + 50.43552957, + 50.63098196, + 51.0668072, + 50.79235473, + 50.55127806, + 50.55975806, + 75.32597855, + 75.04472578, + 75.28708772, + 75.23996998, + 75.1524592, + 75.96184009, + 75.44806251, + 75.75938382, + 50.3782623, + 50.53363501, + 77.50090732, + 50.69112419, + 49.99039495, + 50.12718203, + ], + [ + 90.86741233, + 89.10239459, + 85.4600474, + 83.05766953, + 82.93782178, + 82.97525357, + 82.91489113, + 82.92908498, + 82.8742005, + 82.92409777, + 82.82118411, + 51.48738653, + 51.41484656, + 52.07679944, + 51.71207905, + 50.70890793, + 51.65304675, + 51.18198917, + 51.41855226, + 52.12301105, + 50.62155476, + 50.07473901, + 51.5024421, + 51.86195209, + 52.25779061, + 51.19794432, + 82.94182882, + 83.75234297, + 51.97525067, + 51.07339565, + 51.3380902, + 52.1768375, + 51.20176505, + 50.44143545, + 51.41620515, + 17.14563109, + 17.14132373, + 17.08190869, + 16.92501353, + 50.66196341, + 51.39909748, + 50.79528152, + 50.68603709, + 51.52476126, + 
17.40539097, + 17.20372213, + 17.09382391, + 17.11384266, + 17.02374454, + 17.11492526, + 17.07777732, + 16.98102188, + 17.03857897, + 50.69056272, + 51.29446922, + 51.59435617, + 52.33576553, + 52.04552865, + 51.74673004, + 50.31866042, + 51.46182482, + 52.12368985, + 51.9671367, + 82.98566202, + 83.11447934, + 82.98265686, + 82.84604113, + 83.18462233, + 82.90990147, + 82.93532841, + 83.96992038, + 82.99366549, + 83.09951912, + 83.7083177, + 82.9019501, + 51.43887623, + 51.30411215, + 51.59365408, + 94.24932783, + 92.97911753, + 88.38644174, + 83.90349738, + 83.46230334, + 82.91945886, + 82.88405139, + 82.93211578, + 82.96238879, + 83.03499717, + 82.9452793, + 51.15177033, + 50.47557897, + 52.15779927, + 52.10465206, + 51.16563781, + 51.8675623, + 51.90751654, + 49.66254553, + 17.11125121, + 51.87886035, + 51.39159152, + 17.04828941, + 17.01565319, + 17.06219214, + 17.04110689, + 17.13489391, + 17.06772306, + 17.16994971, + 17.10571651, + 16.75492389, + 17.07814052, + 17.08518438, + 17.14760476, + 16.90746981, + 17.16234971, + 17.24045586, + 17.18019648, + 17.10577072, + 16.99296341, + 17.08831585, + 16.57271805, + 17.22109553, + 17.06474308, + 17.0651685, + 17.07652235, + 17.20885971, + 17.20421434, + 17.08465518, + 17.09388377, + 15.77189199, + 17.00426226, + 16.17493491, + 17.03184749, + 17.0049424, + 16.69484223, + 17.04514941, + 16.94292965, + 16.94627981, + 17.01958137, + 50.16698595, + 87.51396042, + 83.99735692, + 82.99075, + ], + ] + ), +] def datasaurus(): @@ -258,7 +1211,7 @@ def datasaurus(): CHI 2017 Conference proceedings: ACM SIGCHI Conference on Human Factors in Computing Systems """ - _, ((axa, axb), (axc, axd)) = plt.subplots(2, 2, sharex='col', sharey='row') + _, ((axa, axb), (axc, axd)) = plt.subplots(2, 2, sharex="col", sharey="row") colors = get_color_cycle() for arr, ax, color in zip(DATASAURUS, (axa, axb, axc, axd), colors): x = arr[0] @@ -277,6 +1230,6 @@ def datasaurus(): return (axa, axb, axc, axd) -if __name__ == '__main__': +if 
__name__ == "__main__": datasaurus() plt.show() diff --git a/yellowbrick/datasets/__init__.py b/yellowbrick/datasets/__init__.py index 0535e081a..f5c53c6b1 100644 --- a/yellowbrick/datasets/__init__.py +++ b/yellowbrick/datasets/__init__.py @@ -1,9 +1,24 @@ -from .download import load_concrete -from .download import load_energy -from .download import load_credit -from .download import load_occupancy -from .download import load_mushroom -from .download import load_hobbies -from .download import load_game -from .download import load_bikeshare -from .download import load_spam +# yellowbrick.datasets +# Management utilities for Yellowbrick example datasets. +# +# Author: Raul Peralta +# Author: Rebecca Bilbro +# Author: Benjamin Bengfort +# Author: Nathan Danielsen +# Created: Tue May 15 11:54:45 2018 -0400 +# +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: __init__.py [] raulpl25@gmail.com $ + +""" +Management utilities for Yellowbrick example datasets. +""" + +########################################################################## +## Imports +########################################################################## + +from .loaders import * +from .path import get_data_home diff --git a/yellowbrick/datasets/base.py b/yellowbrick/datasets/base.py new file mode 100644 index 000000000..308c4803b --- /dev/null +++ b/yellowbrick/datasets/base.py @@ -0,0 +1,334 @@ +# yellowbrick.datasets.base +# Loading utilities for the yellowbrick datasets. +# +# Author: Rebecca Bilbro +# Author: Benjamin Bengfort +# Author: Raul Peralta +# Created: Thu Jul 26 13:53:01 2018 -0400 +# +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: base.py [] benjamin@bengfort.com $ + +""" +Loading utilities for the yellowbrick datasets. 
class BaseDataset(object):
    """
    Base functionality for Dataset and Corpus objects.

    Parameters
    ----------
    name : str
        The name of the dataset, used for all path lookups.

    url : str, optional
        The web location the dataset archive is downloaded from.

    signature : str, optional
        The expected signature of the archive, passed to ``download_data``.

    data_home : str, optional
        The path on disk where data is stored; forwarded unchanged to the
        path lookup and download helpers.
    """

    def __init__(self, name, url=None, signature=None, data_home=None):
        self.url = url
        self.name = name
        self.data_home = data_home
        self.signature = signature

        # Check if the dataset exists, and if not - download it!
        if not dataset_exists(self.name, data_home=data_home):
            self.download()

    def download(self, replace=False):
        """
        Download the dataset from the hosted Yellowbrick data store and save
        it to the location specified by ``get_data_home``. The downloader
        verifies the download completed successfully and safely by comparing
        the expected signature with the SHA 256 signature of the downloaded
        archive file.

        Parameters
        ----------
        replace : bool, default: False
            If the data archive already exists, replace the dataset. If this is
            False and the dataset exists, an exception is raised.
        """
        download_data(
            self.url,
            self.signature,
            data_home=self.data_home,
            replace=replace,
            extract=True,
        )

    def contents(self):
        """
        Contents returns a list of the files in the data directory.
        """
        data = find_dataset_path(self.name, data_home=self.data_home, ext=None)
        return os.listdir(data)

    @memoized
    def README(self):
        """
        Returns the contents of the README.md file that describes the dataset
        in detail and contains attribution information.
        """
        path = find_dataset_path(self.name, data_home=self.data_home, fname="README.md")
        # Explicit encoding, consistent with Corpus.data, so that reading does
        # not depend on the platform's locale default.
        with open(path, "r", encoding="utf-8") as f:
            return f.read()

    @memoized
    def meta(self):
        """
        Returns the contents of the meta.json file that describes important
        attributes about the dataset and modifies the behavior of the loader.
        Returns None if no meta.json path is found.
        """
        path = find_dataset_path(
            self.name, data_home=self.data_home, fname="meta.json", raises=False
        )
        if path is None:
            return None

        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    @memoized
    def citation(self):
        """
        Returns the contents of the citation.bib file that describes the source
        and provenance of the dataset or to cite for academic work. Returns
        None if no citation.bib path is found.
        """
        # Look up citation.bib (not meta.json) so the returned text is the
        # citation this method documents.
        path = find_dataset_path(
            self.name, data_home=self.data_home, fname="citation.bib", raises=False
        )
        if path is None:
            return None

        with open(path, "r", encoding="utf-8") as f:
            return f.read()


class Dataset(BaseDataset):
    """
    Datasets contain a reference to data on disk and provide utilities for
    quickly loading files and objects into a variety of formats. The most
    common use of the Dataset object is to load example datasets provided by
    Yellowbrick to run the examples in the documentation.

    The dataset by default will return the data as a numpy array, however if
    Pandas is installed, it is possible to access the data as a DataFrame and
    Series object. In either case, the data is represented by a features table,
    X and a target vector, y.

    Parameters
    ----------
    name : str
        The name of the dataset; should either be a folder in data home or
        specified in the yellowbrick.datasets.DATASETS variable. This name is
        used to perform all lookups and identify the dataset externally.

    data_home : str, optional
        The path on disk where data is stored. If not passed in, it is looked
        up from YELLOWBRICK_DATA or the default returned by ``get_data_home``.

    url : str, optional
        The web location where the archive file of the dataset can be
        downloaded from.

    signature : str, optional
        The signature of the data archive file, used to verify that the latest
        version of the data has been downloaded and that the download hasn't
        been corrupted or modified in any way.
    """

    def to_data(self):
        """
        Returns the data contained in the dataset as X and y where X is the
        features matrix and y is the target vector. If pandas is installed,
        the data will be returned as DataFrame and Series objects. Otherwise,
        the data will be returned as two numpy arrays.

        Returns
        -------
        X : array-like with shape (n_instances, n_features)
            A pandas DataFrame or numpy array describing the instance features.

        y : array-like with shape (n_instances,)
            A pandas Series or numpy array describing the target vector.
        """
        if pd is not None:
            return self.to_pandas()
        return self.to_numpy()

    def to_numpy(self):
        """
        Returns the dataset as two numpy arrays: X and y.

        Returns
        -------
        X : array-like with shape (n_instances, n_features)
            A numpy array describing the instance features.

        y : array-like with shape (n_instances,)
            A numpy array describing the target vector.

        Raises
        ------
        DatasetsError
            If the ``.npz`` archive does not contain both "X" and "y".
        """
        path = find_dataset_path(self.name, ext=".npz", data_home=self.data_home)
        with np.load(path, allow_pickle=False) as npf:
            if "X" not in npf or "y" not in npf:
                raise DatasetsError(
                    (
                        "the downloaded dataset was improperly packaged without numpy "
                        "arrays - please report this bug to the Yellowbrick maintainers!"
                    )
                )

            # TODO: How to handle the case where y is None?
            return npf["X"], npf["y"]

    def to_pandas(self):
        """
        Returns the dataset as two pandas objects: X and y.

        Returns
        -------
        X : DataFrame with shape (n_instances, n_features)
            A pandas DataFrame containing feature data and named columns.

        y : Series with shape (n_instances,)
            A pandas Series containing target data and an index that matches
            the feature DataFrame index.

        Raises
        ------
        DatasetsError
            If meta.json is missing or lacks the "features" or "target" keys.
        """
        # Ensure the metadata is valid before continuing
        if self.meta is None:
            raise DatasetsError(
                (
                    "the downloaded dataset was improperly packaged without meta.json "
                    "- please report this bug to the Yellowbrick maintainers!"
                )
            )

        if "features" not in self.meta or "target" not in self.meta:
            raise DatasetsError(
                (
                    "the downloaded dataset was improperly packaged without features "
                    "or target - please report this bug to the Yellowbrick maintainers!"
                )
            )

        # Load data frame and return features and target
        # TODO: Return y as None if there is no self.meta["target"]
        df = self.to_dataframe()
        return df[self.meta["features"]], df[self.meta["target"]]

    def to_dataframe(self):
        """
        Returns the entire dataset as a single pandas DataFrame.

        Returns
        -------
        df : DataFrame with shape (n_instances, n_columns)
            A pandas DataFrame containing the complete original data table
            including all targets (specified by the meta data) and all
            features (including those that might have been filtered out).

        Raises
        ------
        DatasetsError
            If pandas is not installed.
        """
        if pd is None:
            raise DatasetsError(
                "pandas is required to load DataFrame, it can be installed with pip"
            )

        path = find_dataset_path(self.name, ext=".csv.gz", data_home=self.data_home)
        return pd.read_csv(path, compression="gzip")


class Corpus(BaseDataset):
    """
    Corpus datasets contain a reference to documents on disk and provide
    utilities for quickly loading text data for use in machine learning
    workflows. The most common use of the corpus is to load the text analysis
    examples from the Yellowbrick documentation.

    Parameters
    ----------
    name : str
        The name of the corpus; should either be a folder in data home or
        specified in the yellowbrick.datasets.DATASETS variable. This name is
        used to perform all lookups and identify the corpus externally.

    data_home : str, optional
        The path on disk where data is stored. If not passed in, it is looked
        up from YELLOWBRICK_DATA or the default returned by ``get_data_home``.

    url : str, optional
        The web location where the archive file of the corpus can be
        downloaded from.

    signature : str, optional
        The signature of the data archive file, used to verify that the latest
        version of the data has been downloaded and that the download hasn't
        been corrupted or modified in any way.
    """

    @memoized
    def root(self):
        """
        Discovers and caches the root directory of the corpus.
        """
        return find_dataset_path(self.name, data_home=self.data_home, ext=None)

    @memoized
    def labels(self):
        """
        Return the unique labels assigned to the documents. Each label is the
        name of a sub-directory of the corpus root.
        """
        return [
            name
            for name in os.listdir(self.root)
            if os.path.isdir(os.path.join(self.root, name))
        ]

    @property
    def files(self):
        """
        Returns the list of file names for all documents.
        """
        return [
            os.path.join(self.root, label, name)
            for label in self.labels
            for name in os.listdir(os.path.join(self.root, label))
        ]

    @property
    def data(self):
        """
        Read all of the documents from disk into an in-memory list.
        """

        def read(path):
            with open(path, "r", encoding="UTF-8") as f:
                return f.read()

        return [read(f) for f in self.files]

    @property
    def target(self):
        """
        Returns the label associated with each item in data.
        """
        # The label is the name of the directory that holds the document.
        return [os.path.basename(os.path.dirname(f)) for f in self.files]
def download_data(url, signature, data_home=None, replace=False, extract=True):
    """
    Downloads the zipped data set specified at the given URL, saving it to
    the data directory specified by ``get_data_home``. This function verifies
    the download with the given signature and extracts the archive.

    Parameters
    ----------
    url : str
        The URL of the dataset on the Internet to GET

    signature : str
        The SHA 256 hash of the dataset archive being downloaded to verify
        that the dataset has been correctly downloaded

    data_home : str, optional
        The path on disk where data is stored. If not passed in, it is looked
        up from YELLOWBRICK_DATA or the default returned by ``get_data_home``.

    replace : bool, default: False
        If the data archive already exists, replace the dataset. If this is
        False and the dataset exists, an exception is raised.

    extract : bool, default: True
        Extract the archive file after downloading it

    Raises
    ------
    DatasetsError
        If the archive already exists on disk and ``replace`` is False.

    ValueError
        If the SHA 256 signature of the downloaded archive does not match
        ``signature``.
    """
    data_home = get_data_home(data_home)

    # Get the name of the file from the URL
    basename = os.path.basename(url)
    name, _ = os.path.splitext(basename)

    # Get the archive and data directory paths
    archive = os.path.join(data_home, basename)
    datadir = os.path.join(data_home, name)

    # If the archive exists cleanup or raise override exception
    if os.path.exists(archive):
        if not replace:
            # The caller must opt in with replace=True to overwrite.
            raise DatasetsError(
                ("dataset already exists at {}, set replace=True to overwrite").format(
                    archive
                )
            )

        cleanup_dataset(name, data_home=data_home)

    # Create the output directory if it does not exist
    if not os.path.exists(datadir):
        os.mkdir(datadir)

    # Fetch the response in a streaming fashion and write it to disk. Both the
    # HTTP response and the output file are closed even if the transfer fails.
    with urlopen(url) as response, open(archive, "wb") as f:
        while True:
            chunk = response.read(CHUNK)
            if not chunk:
                break
            f.write(chunk)

    # Compare the signature of the archive to the expected one
    if sha256sum(archive) != signature:
        raise ValueError("Download signature does not match hardcoded signature!")

    # If extract, extract the zipfile (closing the archive handle afterwards).
    if extract:
        with zipfile.ZipFile(archive) as zf:
            zf.extractall(path=data_home)
def _load_dataset(name, data_home=None, return_dataset=False):
    """
    Load a dataset by name and return specified format.

    Parameters
    ----------
    name : str
        Key of the dataset in ``DATASETS``; the associated entry is passed
        to ``Dataset`` as keyword arguments.

    data_home : str, optional
        Forwarded to ``Dataset`` as the location of the data on disk.

    return_dataset : bool, default=False
        If True return the ``Dataset`` object itself, otherwise return the
        result of its ``to_data()`` method.
    """
    info = DATASETS[name]
    data = Dataset(name, data_home=data_home, **info)
    if return_dataset:
        return data
    return data.to_data()


def _load_corpus(name, data_home=None):
    """
    Load a corpus object by name.

    Parameters
    ----------
    name : str
        Key of the corpus in ``DATASETS``; the associated entry is passed
        to ``Corpus`` as keyword arguments.

    data_home : str, optional
        Forwarded to ``Corpus`` as the location of the data on disk.
    """
    info = DATASETS[name]
    return Corpus(name, data_home=data_home, **info)


def load_concrete(data_home=None, return_dataset=False):
    """
    Loads the concrete multivariate dataset that is well suited to regression
    tasks. The dataset contains 1030 instances and 8 real valued attributes
    with a continuous target.

    The Yellowbrick datasets are hosted online and when requested, the dataset
    is downloaded to your local computer for use. Note that if the dataset
    hasn't been downloaded before, an Internet connection is required. However,
    if the data is cached locally, no data will be downloaded. Yellowbrick
    checks the known signature of the dataset with the data downloaded to
    ensure the download completes successfully.

    Datasets are stored alongside the code, but the location can be specified
    with the ``data_home`` parameter or the ``$YELLOWBRICK_DATA`` envvar.

    Parameters
    ----------
    data_home : str, optional
        The path on disk where data is stored. If not passed in, it is looked
        up from ``$YELLOWBRICK_DATA`` or the default returned by ``get_data_home``.

    return_dataset : bool, default=False
        Return the raw dataset object instead of X and y numpy arrays to
        get access to alternative targets, extra features, content and meta.

    Returns
    -------
    X : array-like with shape (n_instances, n_features) if return_dataset=False
        A pandas DataFrame or numpy array describing the instance features.

    y : array-like with shape (n_instances,) if return_dataset=False
        A pandas Series or numpy array describing the target vector.

    dataset : Dataset instance if return_dataset=True
        The Yellowbrick Dataset object provides an interface to accessing the
        data in a variety of formats as well as associated metadata and content.
    """
    return _load_dataset("concrete", data_home, return_dataset)


def load_energy(data_home=None, return_dataset=False):
    """
    Loads the energy multivariate dataset that is well suited to multi-output
    regression and classification tasks. The dataset contains 768 instances and
    8 real valued attributes with two continuous targets.

    The Yellowbrick datasets are hosted online and when requested, the dataset
    is downloaded to your local computer for use. Note that if the dataset
    hasn't been downloaded before, an Internet connection is required. However,
    if the data is cached locally, no data will be downloaded. Yellowbrick
    checks the known signature of the dataset with the data downloaded to
    ensure the download completes successfully.

    Datasets are stored alongside the code, but the location can be specified
    with the ``data_home`` parameter or the ``$YELLOWBRICK_DATA`` envvar.

    Parameters
    ----------
    data_home : str, optional
        The path on disk where data is stored. If not passed in, it is looked
        up from ``$YELLOWBRICK_DATA`` or the default returned by ``get_data_home``.

    return_dataset : bool, default=False
        Return the raw dataset object instead of X and y numpy arrays to
        get access to alternative targets, extra features, content and meta.

    Returns
    -------
    X : array-like with shape (n_instances, n_features) if return_dataset=False
        A pandas DataFrame or numpy array describing the instance features.

    y : array-like with shape (n_instances,) if return_dataset=False
        A pandas Series or numpy array describing the target vector.

    dataset : Dataset instance if return_dataset=True
        The Yellowbrick Dataset object provides an interface to accessing the
        data in a variety of formats as well as associated metadata and content.
    """
    return _load_dataset("energy", data_home, return_dataset)
+ + Parameters + ---------- + data_home : str, optional + The path on disk where data is stored. If not passed in, it is looked + up from ``$YELLOWBRICK_DATA`` or the default returned by ``get_data_home``. + + return_dataset : bool, default=False + Return the raw dataset object instead of X and y numpy arrays to + get access to alternative targets, extra features, content and meta. + + Returns + ------- + X : array-like with shape (n_instances, n_features) if return_dataset=False + A pandas DataFrame or numpy array describing the instance features. + + y : array-like with shape (n_instances,) if return_dataset=False + A pandas Series or numpy array describing the target vector. + + dataset : Dataset instance if return_dataset=True + The Yellowbrick Dataset object provides an interface to accessing the + data in a variety of formats as well as associated metadata and content. + """ + return _load_dataset("credit", data_home, return_dataset) + + +def load_occupancy(data_home=None, return_dataset=False): + """ + Loads the occupancy multivariate, time-series dataset that is well suited + to binary classification tasks. The dataset contains 20560 instances with + 5 real valued attributes and a discrete target. + + The Yellowbrick datasets are hosted online and when requested, the dataset + is downloaded to your local computer for use. Note that if the dataset + hasn't been downloaded before, an Internet connection is required. However, + if the data is cached locally, no data will be downloaded. Yellowbrick + checks the known signature of the dataset with the data downloaded to + ensure the download completes successfully. + + Datasets are stored alongside the code, but the location can be specified + with the ``data_home`` parameter or the ``$YELLOWBRICK_DATA`` envvar. + + Parameters + ---------- + data_home : str, optional + The path on disk where data is stored. 
If not passed in, it is looked + up from ``$YELLOWBRICK_DATA`` or the default returned by ``get_data_home``. + + return_dataset : bool, default=False + Return the raw dataset object instead of X and y numpy arrays to + get access to alternative targets, extra features, content and meta. + + Returns + ------- + X : array-like with shape (n_instances, n_features) if return_dataset=False + A pandas DataFrame or numpy array describing the instance features. + + y : array-like with shape (n_instances,) if return_dataset=False + A pandas Series or numpy array describing the target vector. + + dataset : Dataset instance if return_dataset=True + The Yellowbrick Dataset object provides an interface to accessing the + data in a variety of formats as well as associated metadata and content. + """ + return _load_dataset("occupancy", data_home, return_dataset) + + +def load_mushroom(data_home=None, return_dataset=False): + """ + Loads the mushroom multivariate dataset that is well suited to binary + classification tasks. The dataset contains 8123 instances with 3 + categorical attributes and a discrete target. + + The Yellowbrick datasets are hosted online and when requested, the dataset + is downloaded to your local computer for use. Note that if the dataset + hasn't been downloaded before, an Internet connection is required. However, + if the data is cached locally, no data will be downloaded. Yellowbrick + checks the known signature of the dataset with the data downloaded to + ensure the download completes successfully. + + Datasets are stored alongside the code, but the location can be specified + with the ``data_home`` parameter or the ``$YELLOWBRICK_DATA`` envvar. + + Parameters + ---------- + data_home : str, optional + The path on disk where data is stored. If not passed in, it is looked + up from ``$YELLOWBRICK_DATA`` or the default returned by ``get_data_home``. 
+ + return_dataset : bool, default=False + Return the raw dataset object instead of X and y numpy arrays to + get access to alternative targets, extra features, content and meta. + + Returns + ------- + X : array-like with shape (n_instances, n_features) if return_dataset=False + A pandas DataFrame or numpy array describing the instance features. + + y : array-like with shape (n_instances,) if return_dataset=False + A pandas Series or numpy array describing the target vector. + + dataset : Dataset instance if return_dataset=True + The Yellowbrick Dataset object provides an interface to accessing the + data in a variety of formats as well as associated metadata and content. + """ + return _load_dataset("mushroom", data_home, return_dataset) + + +def load_hobbies(data_home=None): + """ + Loads the hobbies text corpus that is well suited to classification, + clustering, and text analysis tasks. The dataset contains 448 documents in + 5 categories with 7420 paragraphs, 14251 sentences, 288520 words, and a + vocabulary of 23738. + + The Yellowbrick datasets are hosted online and when requested, the dataset + is downloaded to your local computer for use. Note that if the dataset + hasn't been downloaded before, an Internet connection is required. However, + if the data is cached locally, no data will be downloaded. Yellowbrick + checks the known signature of the dataset with the data downloaded to + ensure the download completes successfully. + + Datasets are stored alongside the code, but the location can be specified + with the ``data_home`` parameter or the ``$YELLOWBRICK_DATA`` envvar. + + Parameters + ---------- + data_home : str, optional + The path on disk where data is stored. If not passed in, it is looked + up from ``$YELLOWBRICK_DATA`` or the default returned by ``get_data_home``. + + Returns + ------- + dataset : Corpus + The Yellowbrick Corpus object provides an interface to accessing the + text documents and metadata associated with the corpus. 
+ """ + return _load_corpus("hobbies", data_home) + + +def load_game(data_home=None, return_dataset=False): + """ + Load the Connect-4 game multivariate and spatial dataset that is well + suited to multiclass classification tasks. The dataset contains 67557 + instances with 42 categorical attributes and a discrete target. + + Note that the game data is stored with categorical features that need to + be numerically encoded before use with scikit-learn estimators. We + recommend the use of the ``sklearn.preprocessing.OneHotEncoder`` for this + task and to develop a ``Pipeline`` using this dataset. + + The Yellowbrick datasets are hosted online and when requested, the dataset + is downloaded to your local computer for use. Note that if the dataset + hasn't been downloaded before, an Internet connection is required. However, + if the data is cached locally, no data will be downloaded. Yellowbrick + checks the known signature of the dataset with the data downloaded to + ensure the download completes successfully. + + Datasets are stored alongside the code, but the location can be specified + with the ``data_home`` parameter or the ``$YELLOWBRICK_DATA`` envvar. + + Parameters + ---------- + data_home : str, optional + The path on disk where data is stored. If not passed in, it is looked + up from ``$YELLOWBRICK_DATA`` or the default returned by ``get_data_home``. + + return_dataset : bool, default=False + Return the raw dataset object instead of X and y numpy arrays to + get access to alternative targets, extra features, content and meta. + + Returns + ------- + X : array-like with shape (n_instances, n_features) if return_dataset=False + A pandas DataFrame or numpy array describing the instance features. + + y : array-like with shape (n_instances,) if return_dataset=False + A pandas Series or numpy array describing the target vector. 
+ + dataset : Dataset instance if return_dataset=True + The Yellowbrick Dataset object provides an interface to accessing the + data in a variety of formats as well as associated metadata and content. + """ + return _load_dataset("game", data_home, return_dataset) + + +def load_bikeshare(data_home=None, return_dataset=False): + """ + Loads the bike sharing univariate dataset that is well suited to regression + tasks. The dataset contains 17379 instances with 12 integer and real valued + attributes and a continuous target. + + The Yellowbrick datasets are hosted online and when requested, the dataset + is downloaded to your local computer for use. Note that if the dataset + hasn't been downloaded before, an Internet connection is required. However, + if the data is cached locally, no data will be downloaded. Yellowbrick + checks the known signature of the dataset with the data downloaded to + ensure the download completes successfully. + + Datasets are stored alongside the code, but the location can be specified + with the ``data_home`` parameter or the ``$YELLOWBRICK_DATA`` envvar. + + Parameters + ---------- + data_home : str, optional + The path on disk where data is stored. If not passed in, it is looked + up from ``$YELLOWBRICK_DATA`` or the default returned by ``get_data_home``. + + return_dataset : bool, default=False + Return the raw dataset object instead of X and y numpy arrays to + get access to alternative targets, extra features, content and meta. + + Returns + ------- + X : array-like with shape (n_instances, n_features) if return_dataset=False + A pandas DataFrame or numpy array describing the instance features. + + y : array-like with shape (n_instances,) if return_dataset=False + A pandas Series or numpy array describing the target vector. + + dataset : Dataset instance if return_dataset=True + The Yellowbrick Dataset object provides an interface to accessing the + data in a variety of formats as well as associated metadata and content. 
+ """ + return _load_dataset("bikeshare", data_home, return_dataset) + + +def load_spam(data_home=None, return_dataset=False): + """ + Loads the email spam dataset that is well suited to binary classification + and threshold tasks. The dataset contains 4600 instances with 57 integer and + real valued attributes and a discrete target. + + The Yellowbrick datasets are hosted online and when requested, the dataset + is downloaded to your local computer for use. Note that if the dataset + hasn't been downloaded before, an Internet connection is required. However, + if the data is cached locally, no data will be downloaded. Yellowbrick + checks the known signature of the dataset with the data downloaded to + ensure the download completes successfully. + + Datasets are stored alongside the code, but the location can be specified + with the ``data_home`` parameter or the ``$YELLOWBRICK_DATA`` envvar. + + Parameters + ---------- + data_home : str, optional + The path on disk where data is stored. If not passed in, it is looked + up from ``$YELLOWBRICK_DATA`` or the default returned by ``get_data_home``. + + return_dataset : bool, default=False + Return the raw dataset object instead of X and y numpy arrays to + get access to alternative targets, extra features, content and meta. + + Returns + ------- + X : array-like with shape (n_instances, n_features) if return_dataset=False + A pandas DataFrame or numpy array describing the instance features. + + y : array-like with shape (n_instances,) if return_dataset=False + A pandas Series or numpy array describing the target vector. + + dataset : Dataset instance if return_dataset=True + The Yellowbrick Dataset object provides an interface to accessing the + data in a variety of formats as well as associated metadata and content.
+ """ + return _load_dataset("spam", data_home, return_dataset) + + +def load_walking(data_home=None, return_dataset=False): + """ + Loads the walking activity dataset that is well suited to clustering and + multi-label classification tasks. The dataset contains multi-variate time + series data with 149,332 real valued measurements across 22 unique walkers. + + The Yellowbrick datasets are hosted online and when requested, the dataset + is downloaded to your local computer for use. Note that if the dataset + hasn't been downloaded before, an Internet connection is required. However, + if the data is cached locally, no data will be downloaded. Yellowbrick + checks the known signature of the dataset with the data downloaded to + ensure the download completes successfully. + + Datasets are stored alongside the code, but the location can be specified + with the ``data_home`` parameter or the ``$YELLOWBRICK_DATA`` envvar. + + Parameters + ---------- + data_home : str, optional + The path on disk where data is stored. If not passed in, it is looked + up from ``$YELLOWBRICK_DATA`` or the default returned by ``get_data_home``. + + return_dataset : bool, default=False + Return the raw dataset object instead of X and y numpy arrays to + get access to alternative targets, extra features, content and meta. + + Returns + ------- + X : array-like with shape (n_instances, n_features) if return_dataset=False + A pandas DataFrame or numpy array describing the instance features. + + y : array-like with shape (n_instances,) if return_dataset=False + A pandas Series or numpy array describing the target vector. + + dataset : Dataset instance if return_dataset=True + The Yellowbrick Dataset object provides an interface to accessing the + data in a variety of formats as well as associated metadata and content.
+ """ + return _load_dataset("walking", data_home, return_dataset) + + +def load_nfl(data_home=None, return_dataset=False): + """ + Loads the football receivers dataset that is well suited to clustering + tasks. The dataset contains 494 instances with 28 integer, real valued, and + categorical attributes and a discrete target. + + The Yellowbrick datasets are hosted online and when requested, the dataset + is downloaded to your local computer for use. Note that if the dataset + hasn't been downloaded before, an Internet connection is required. However, + if the data is cached locally, no data will be downloaded. Yellowbrick + checks the known signature of the dataset with the data downloaded to + ensure the download completes successfully. + + Datasets are stored alongside the code, but the location can be specified + with the ``data_home`` parameter or the ``$YELLOWBRICK_DATA`` envvar. + + Parameters + ---------- + data_home : str, optional + The path on disk where data is stored. If not passed in, it is looked + up from ``$YELLOWBRICK_DATA`` or the default returned by ``get_data_home``. + + return_dataset : bool, default=False + Return the raw dataset object instead of X and y numpy arrays to + get access to alternative targets, extra features, content and meta. + + Returns + ------- + X : array-like with shape (n_instances, n_features) if return_dataset=False + A pandas DataFrame or numpy array describing the instance features. + + y : array-like with shape (n_instances,) if return_dataset=False + A pandas Series or numpy array describing the target vector. + + dataset : Dataset instance if return_dataset=True + The Yellowbrick Dataset object provides an interface to accessing the + data in a variety of formats as well as associated metadata and content. 
+ """ + return _load_dataset("nfl", data_home, return_dataset) diff --git a/yellowbrick/datasets/manifest.json b/yellowbrick/datasets/manifest.json new file mode 100644 index 000000000..ea071ccf0 --- /dev/null +++ b/yellowbrick/datasets/manifest.json @@ -0,0 +1,46 @@ +{ + "bikeshare": { + "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/bikeshare.zip", + "signature": "4ed07a929ccbe0171309129e6adda1c4390190385dd6001ba9eecc795a21eef2" + }, + "hobbies": { + "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/hobbies.zip", + "signature": "6114e32f46baddf049a18fb05bad3efa98f4e6a0fe87066c94071541cb1e906f" + }, + "concrete": { + "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/concrete.zip", + "signature": "5807af2f04e14e407f61e66a4f3daf910361a99bb5052809096b47d3cccdfc0a" + }, + "credit": { + "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/credit.zip", + "signature": "2c6f5821c4039d70e901cc079d1404f6f49c3d6815871231c40348a69ae26573" + }, + "energy": { + "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/energy.zip", + "signature": "174eca3cd81e888fc416c006de77dbe5f89d643b20319902a0362e2f1972a34e" + }, + "game": { + "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/game.zip", + "signature": "ce799d1c55fcf1985a02def4d85672ac86c022f8f7afefbe42b20364fba47d7a" + }, + "mushroom": { + "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/mushroom.zip", + "signature": "f79fdbc33b012dabd06a8f3cb3007d244b6aab22d41358b9aeda74417c91f300" + }, + "occupancy": { + "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/occupancy.zip", + "signature": "0b390387584586a05f45c7da610fdaaf8922c5954834f323ae349137394e6253" + }, + "spam": { + "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/spam.zip", + "signature": "000309ac2b61090a3001de3e262a5f5319708bb42791c62d15a08a2f9f7cb30a" + }, + "walking": { + "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/walking.zip", 
+ "signature": "7a36615978bc3bb74a2e9d5de216815621bd37f6a42c65d3fc28b242b4d6e040" + }, + "nfl": { + "url": "https://s3.amazonaws.com/ddl-data-lake/yellowbrick/v1.0/nfl.zip", + "signature": "4989c66818ea18217ee0fe3a59932b963bd65869928c14075a5c50366cb81e1f" + } +} diff --git a/yellowbrick/datasets/path.py b/yellowbrick/datasets/path.py new file mode 100644 index 000000000..6b40deda3 --- /dev/null +++ b/yellowbrick/datasets/path.py @@ -0,0 +1,240 @@ +# yellowbrick.datasets.path +# Helper functions for looking up dataset paths. +# +# Author: Benjamin Bengfort +# Created: Thu Jul 26 14:10:51 2018 -0400 +# +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: path.py [7082742] benjamin@bengfort.com $ + +""" +Helper functions for looking up dataset paths. +""" + +########################################################################## +## Imports +########################################################################## + +import os +import shutil + +from .signature import sha256sum +from yellowbrick.exceptions import DatasetsError + + +########################################################################## +## Fixtures +########################################################################## + +FIXTURES = os.path.join(os.path.dirname(__file__), "fixtures") + + +########################################################################## +## Dataset path utilities +########################################################################## + + +def get_data_home(path=None): + """ + Return the path of the Yellowbrick data directory. This folder is used by + dataset loaders to avoid downloading data several times. + + By default, this folder is colocated with the code in the install directory + so that data shipped with the package can be easily located. Alternatively + it can be set by the ``$YELLOWBRICK_DATA`` environment variable, or + programmatically by giving a folder path. 
Note that the ``'~'`` symbol is + expanded to the user home directory, and environment variables are also + expanded when resolving the path. + """ + if path is None: + path = os.environ.get("YELLOWBRICK_DATA", FIXTURES) + + path = os.path.expanduser(path) + path = os.path.expandvars(path) + + if not os.path.exists(path): + os.makedirs(path) + + return path + + +def find_dataset_path(dataset, data_home=None, fname=None, ext=".csv.gz", raises=True): + """ + Looks up the path to the dataset specified in the data home directory, + which is found using the ``get_data_home`` function. By default data home + is colocated with the code, but can be modified with the YELLOWBRICK_DATA + environment variable, or passing in a different directory. + + The file returned will be by default, the name of the dataset in compressed + CSV format. Other files and extensions can be passed in to locate other data + types or auxiliary files. + + If the dataset is not found a ``DatasetsError`` is raised by default. + + Parameters + ---------- + dataset : str + The name of the dataset; should either be a folder in data home or + specified in the yellowbrick.datasets.DATASETS variable. + + data_home : str, optional + The path on disk where data is stored. If not passed in, it is looked + up from YELLOWBRICK_DATA or the default returned by ``get_data_home``. + + fname : str, optional + The filename to look up in the dataset path, by default it will be the + name of the dataset. The fname must include an extension. + + ext : str, default: ".csv.gz" + The extension of the data to look up in the dataset path, if the fname + is specified then the ext parameter is ignored. If ext is None then + the directory of the dataset will be returned. + + raises : bool, default: True + If the path does not exist, raises a DatasetsError unless this flag is set + to False, at which point None is returned (e.g. for checking if the + path exists or not).
+ + Returns + ------- + path : str or None + A path to the requested file, guaranteed to exist if an exception is + not raised during processing of the request (unless None is returned). + + raises : DatasetsError + If raises is True and the path does not exist, raises a DatasetsError. + """ + # Figure out the root directory of the datasets + data_home = get_data_home(data_home) + + # Figure out the relative path to the dataset + if fname is None: + if ext is None: + path = os.path.join(data_home, dataset) + else: + path = os.path.join(data_home, dataset, "{}{}".format(dataset, ext)) + else: + path = os.path.join(data_home, dataset, fname) + + # Determine if the path exists + if not os.path.exists(path): + + # Suppress exceptions if required + if not raises: + return None + + raise DatasetsError( + ("could not find dataset at {} - does it need to be downloaded?").format( + path + ) + ) + + return path + + +def dataset_exists(dataset, data_home=None): + """ + Checks to see if a directory with the name of the specified dataset exists + in the data home directory, found with ``get_data_home``. + + Parameters + ---------- + dataset : str + The name of the dataset; should either be a folder in data home or + specified in the yellowbrick.datasets.DATASETS variable. + + data_home : str, optional + The path on disk where data is stored. If not passed in, it is looked + up from YELLOWBRICK_DATA or the default returned by ``get_data_home``. + + Returns + ------- + exists : bool + If a folder with the dataset name is in the data home directory. + """ + data_home = get_data_home(data_home) + path = os.path.join(data_home, dataset) + + return os.path.exists(path) and os.path.isdir(path) + + +def dataset_archive(dataset, signature, data_home=None, ext=".zip"): + """ + Checks to see if the dataset archive file exists in the data home directory, + found with ``get_data_home``.
By specifying the signature, this function + also checks to see if the archive is the latest version by comparing the + sha256sum of the local archive with the specified signature. + + Parameters + ---------- + dataset : str + The name of the dataset; should either be a folder in data home or + specified in the yellowbrick.datasets.DATASETS variable. + + signature : str + The SHA 256 signature of the dataset, used to determine if the archive + is the latest version of the dataset or not. + + data_home : str, optional + The path on disk where data is stored. If not passed in, it is looked + up from YELLOWBRICK_DATA or the default returned by ``get_data_home``. + + ext : str, default: ".zip" + The extension of the archive file. + + Returns + ------- + exists : bool + True if the dataset archive exists and is the latest version. + """ + data_home = get_data_home(data_home) + path = os.path.join(data_home, dataset + ext) + + if os.path.exists(path) and os.path.isfile(path): + return sha256sum(path) == signature + + return False + + +def cleanup_dataset(dataset, data_home=None, ext=".zip"): + """ + Removes the dataset directory and archive file from the data home directory. + + Parameters + ---------- + dataset : str + The name of the dataset; should either be a folder in data home or + specified in the yellowbrick.datasets.DATASETS variable. + + data_home : str, optional + The path on disk where data is stored. If not passed in, it is looked + up from YELLOWBRICK_DATA or the default returned by ``get_data_home``. + + ext : str, default: ".zip" + The extension of the archive file. + + Returns + ------- + removed : int + The number of objects removed from data_home. 
+ """ + removed = 0 + data_home = get_data_home(data_home) + + # Paths to remove + datadir = os.path.join(data_home, dataset) + archive = os.path.join(data_home, dataset + ext) + + # Remove directory and contents + if os.path.exists(datadir): + shutil.rmtree(datadir) + removed += 1 + + # Remove the archive file + if os.path.exists(archive): + os.remove(archive) + removed += 1 + + return removed diff --git a/yellowbrick/datasets/signature.py b/yellowbrick/datasets/signature.py new file mode 100644 index 000000000..3b73a07bd --- /dev/null +++ b/yellowbrick/datasets/signature.py @@ -0,0 +1,38 @@ +# yellowbrick.datasets.signature +# Performs SHA 256 hashing of a file for dataset archive verification. +# +# Author: Benjamin Bengfort +# Created: Tue Jul 31 14:18:11 2018 -0400 +# +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: signature.py [7082742] benjamin@bengfort.com $ + +""" +Performs SHA 256 hashing of a file for dataset archive verification. +""" + +########################################################################## +## Imports +########################################################################## + +import hashlib + +########################################################################## +## Signature checking utility +########################################################################## + + +def sha256sum(path, blocksize=65536): + """ + Computes the SHA256 signature of a file to verify that the file has not + been modified in transit and that it is the correct version of the data. 
+ """ + sig = hashlib.sha256() + with open(path, "rb") as f: + buf = f.read(blocksize) + while len(buf) > 0: + sig.update(buf) + buf = f.read(blocksize) + return sig.hexdigest() diff --git a/yellowbrick/datasets/utils.py b/yellowbrick/datasets/utils.py deleted file mode 100644 index 5abd0002d..000000000 --- a/yellowbrick/datasets/utils.py +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/env python -""" -Utils for downloading datasets for running the examples. -""" - -########################################################################## -## Imports -########################################################################## - -import os -import six -import hashlib -import zipfile -import numpy as np - -from sklearn.datasets.base import Bunch - -if six.PY2: - # backport for encoding in open for python2 - from io import open - -try: - from urllib.request import urlopen -except ImportError: - # python 2 - from urllib2 import urlopen - -try: - import pandas as pd -except ImportError: - pd = None - -########################################################################## -## Links and MD5 hash of datasets -########################################################################## - -DATASETS = { - 'concrete': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/concrete.zip', - 'signature': 'b9ea5f26a7bb272a040e2f1a993b26babbf8dc4a04ab8198bb315ca66d71f10d', - 'type': 'numpy', - }, - 'energy': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/energy.zip', - 'signature': '19fb86f3bcdde208eed46944172cb643ef6a7d58da103fb568fae43205ed89d3', - 'type': 'numpy', - }, - 'credit': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/credit.zip', - 'signature': '4a91339c69f55e18f3f48004328fbcb7868070b618208fed099920427b084e5e', - 'type': 'numpy', - }, - 'occupancy': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/occupancy.zip', - 'signature': '429cfe376dc9929a1fa528da89f0e1626e34e19695f3f555d8954025bbc522b8', - 'type': 'numpy', - }, 
- 'mushroom': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/mushroom.zip', - 'signature': '884c43cb70db35d211c67b1cf6a3683b2b4569393d2789d5c07840da4dc85ba8', - 'type': 'numpy', - }, - 'game': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/game.zip', - 'signature': 'b1bd85789a014a898daa34cb5f89ceab6d2cd6488a2e572187e34aa4ec21a43b', - 'type': 'numpy', - }, - 'bikeshare': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/bikeshare.zip', - 'signature': 'a9b440f65549746dff680c92ff8bdca3c7265f09db1cf09e708e6e26fc8aba44', - 'type': 'numpy', - }, - 'spam': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/spam.zip', - 'signature': '65be21196ba3d8448847409b70a67d761f873f30719c807600eb516d7aef1de1', - 'type': 'numpy', - }, - 'hobbies': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/hobbies.zip', - 'signature': '415c8f68df1486d5d84a1d1757a5aa3035aef5ad63ede5013c261d622fbd29d8', - 'type': 'corpus', - }, -} - - -########################################################################## -## Download functions -########################################################################## - -def sha256sum(path, blocksize=65536): - """ - Computes the SHA256 signature of a file to verify that the file has not - been modified in transit and that it is the correct version of the data. - """ - sig = hashlib.sha256() - with open(path, 'rb') as f: - buf = f.read(blocksize) - while len(buf) > 0: - sig.update(buf) - buf = f.read(blocksize) - return sig.hexdigest() - - -def download_data(name, data_dir=None, signature=None, extract=True): - """ - Downloads the zipped data set specified at the given URL, saving it to - the output path specified. This function verifies the download with the - given signature (if supplied) and extracts the zip file if requested. 
- """ - - # Create the fixture directory - if not os.path.exists(data_dir): - os.mkdir(data_dir) - - dataset = DATASETS[name] - url = dataset['url'] - - # Get the name of the file from the URL - filename = os.path.basename(url) - dlpath = os.path.join(data_dir, filename) - dataset_path = os.path.join(data_dir, name) - - #Create the output directory if it does not exist - if not os.path.exists(dataset_path): - os.mkdir(dataset_path) - - # Fetch the response in a streaming fashion and write it to disk. - response = urlopen(url) - CHUNK = 16 * 1024 - with open(dlpath, 'wb') as f: - - while True: - chunk = response.read(CHUNK) - if not chunk: - break - f.write(chunk) - - # If verify, compare the signature - if signature is not None: - dlsignature = sha256sum(dlpath) - if signature != dlsignature: - raise ValueError( - "Download signature does not match hardcoded signature!" - ) - - # If extract, extract the zipfile. - if extract: - zf = zipfile.ZipFile(dlpath) - zf.extractall(path=data_dir) - -def load_numpy(name, data_path=None, **kwargs): - """ - Loads the numpy matrix from the specified data set, downloads it if - it hasn't already been downloaded. - """ - - path = _lookup_path(name, data_path=data_path) - return np.genfromtxt(path, dtype=float, delimiter=',', names=True, **kwargs) - - -def load_corpus(name, data_path=None): - """ - Loads a sklearn Bunch with the corpus and downloads it if it hasn't - already been downloaded. Used to test text visualizers. - """ - path = _lookup_path(name, data_path=data_path, ext=None) - - # Read the directories in the directory as the categories. 
- categories = [ - cat for cat in os.listdir(path) - if os.path.isdir(os.path.join(path, cat)) - ] - - files = [] # holds the file names relative to the root - data = [] # holds the text read from the file - target = [] # holds the string of the category - - # Load the data from the files in the corpus - for cat in categories: - for name in os.listdir(os.path.join(path, cat)): - files.append(os.path.join(path, cat, name)) - target.append(cat) - - with open(os.path.join(path, cat, name), 'r', encoding='UTF-8') as f: - data.append(f.read()) - - # Return the data bunch for use similar to the newsgroups example - return Bunch( - categories=categories, - files=files, - data=data, - target=target, - ) - -def _lookup_path(name, data_path=None, ext=".csv"): - """ - Looks up the path to the dataset, downloading it if necessary - """ - if ext is None: - path = os.path.join(data_path, name) - else: - path = os.path.join(data_path, name, "{}{}".format(name, ext)) - - if not os.path.exists(path): - download_data(name, signature=None, extract=True, data_dir=data_path) - - return path diff --git a/yellowbrick/download.py b/yellowbrick/download.py index bd73e4752..a45ba958c 100644 --- a/yellowbrick/download.py +++ b/yellowbrick/download.py @@ -1,12 +1,11 @@ -#!/usr/bin/env python # yellowbrick.download # Downloads the example datasets for running the examples. 
# -# Author: Rebecca Bilbro -# Author: Benjamin Bengfort +# Author: Rebecca Bilbro +# Author: Benjamin Bengfort # Created: Wed May 18 11:54:45 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The sckit-yb developers # For license information, see LICENSE.txt # # ID: download.py [1f73d2b] benjamin@bengfort.com $ @@ -20,130 +19,86 @@ ########################################################################## import os -import sys -import hashlib -import zipfile +import argparse -try: - import requests -except ImportError: - print(( - "The requests module is required to download data --\n" - "please install it with pip install requests." - )) - sys.exit(1) +from yellowbrick.datasets import get_data_home +from yellowbrick.datasets.loaders import DATASETS +from yellowbrick.datasets.download import download_data +from yellowbrick.datasets.path import cleanup_dataset ########################################################################## -## Links and MD5 hash of datasets +## Functions ########################################################################## -DATASETS = { - 'concrete': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/concrete.zip', - 'signature': 'b9ea5f26a7bb272a040e2f1a993b26babbf8dc4a04ab8198bb315ca66d71f10d', - }, - 'energy': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/energy.zip', - 'signature': '19fb86f3bcdde208eed46944172cb643ef6a7d58da103fb568fae43205ed89d3', - }, - 'credit': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/credit.zip', - 'signature': '4a91339c69f55e18f3f48004328fbcb7868070b618208fed099920427b084e5e', - }, - 'occupancy': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/occupancy.zip', - 'signature': '429cfe376dc9929a1fa528da89f0e1626e34e19695f3f555d8954025bbc522b8', - }, - 'mushroom': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/mushroom.zip', - 'signature': 
'884c43cb70db35d211c67b1cf6a3683b2b4569393d2789d5c07840da4dc85ba8', - }, - 'hobbies': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/hobbies.zip', - 'signature': '415c8f68df1486d5d84a1d1757a5aa3035aef5ad63ede5013c261d622fbd29d8', - }, - 'game': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/game.zip', - 'signature': 'b1bd85789a014a898daa34cb5f89ceab6d2cd6488a2e572187e34aa4ec21a43b', - }, - 'bikeshare': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/bikeshare.zip', - 'signature': 'a9b440f65549746dff680c92ff8bdca3c7265f09db1cf09e708e6e26fc8aba44', - }, - 'spam': { - 'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/spam.zip', - 'signature': '65be21196ba3d8448847409b70a67d761f873f30719c807600eb516d7aef1de1', - }, -} - -########################################################################## -## Download functions -########################################################################## - -def sha256sum(path, blocksize=65536): +def download_all(data_home=None, replace=False): """ - Computes the SHA256 signature of a file to verify that the file has not - been modified in transit and that it is the correct version of the data. + Downloads all the example datasets to the data directory specified by + ``get_data_home``. This function ensures that all datasets are available + for use with the examples. """ - sig = hashlib.sha256() - with open(path, 'rb') as f: - buf = f.read(blocksize) - while len(buf) > 0: - sig.update(buf) - buf = f.read(blocksize) - return sig.hexdigest() + for _, meta in DATASETS.items(): + download_data( + meta["url"], meta["signature"], data_home=data_home, replace=replace + ) + print( + "Downloaded {} datasets to {}".format(len(DATASETS), get_data_home(data_home)) + ) -def download_data(url, path='data', signature=None, extract=True): - """ - Downloads the zipped data set specified at the given URL, saving it to - the output path specified. 
This function verifies the download with the - given signature (if supplied) and extracts the zip file if requested. - """ - # Create the output directory if it does not exist - if not os.path.exists(path): - os.mkdir(path) - - # Get the name of the file from the URL - name = os.path.basename(url) - dlpath = os.path.join(path, name) - - # Fetch the response in a streaming fashion and write it to disk. - response = requests.get(url, stream=True) - with open(dlpath, 'wb') as f: - for chunk in response.iter_content(65536): - f.write(chunk) - - # If verify, compare the signature - if signature is not None: - dlsignature = sha256sum(dlpath) - if signature != dlsignature: - raise ValueError( - "Download signature does not match hardcoded signature!" - ) - - # If extract, extract the zipfile. - if extract: - zf = zipfile.ZipFile(dlpath) - zf.extractall(path) - - -def download_all(path='data', verify=True, extract=True): + +def cleanup_all(data_home=None): """ - Downloads all the example datasets. If verify is True then compare the - download signature with the hardcoded signature. If extract is True then - extract the contents of the zipfile to the given path. + Cleans up all the example datasets in the data directory specified by + ``get_data_home`` either to clear up disk space or start from fresh. 
""" + removed = 0 for name, meta in DATASETS.items(): - url = meta['url'] - signature = meta['signature'] if verify else None - - download_data(url, path=path, signature=signature, extract=extract) - - -if __name__ == '__main__': - path='data' - download_all(path) - print("Downloaded datasets to {}".format(os.path.abspath(path))) + _, ext = os.path.splitext(meta["url"]) + removed += cleanup_dataset(name, data_home=data_home, ext=ext) + + print( + "Removed {} fixture objects from {}".format(removed, get_data_home(data_home)) + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Yellowbrick data downloader", + epilog="for troubleshooting please visit our GitHub issues", + ) + parser.add_argument( + "-c", + "--cleanup", + action="store_true", + default=False, + help="cleanup any existing datasets before download", + ) + parser.add_argument( + "--no-download", + action="store_true", + default=False, + help="prevent new data from being downloaded", + ) + parser.add_argument( + "-f", + "--overwrite", + action="store_true", + default=False, + help="overwrite any existing data with new download", + ) + parser.add_argument( + "data_home", + default=None, + nargs="?", + help="specify the data download location or set $YELLOWBRICK_DATA", + ) + + args = parser.parse_args() + + if args.cleanup: + cleanup_all(data_home=args.data_home) + + if not args.no_download: + download_all(data_home=args.data_home, replace=args.overwrite) diff --git a/yellowbrick/draw.py b/yellowbrick/draw.py index e7b4f80c2..a3fcca4a7 100644 --- a/yellowbrick/draw.py +++ b/yellowbrick/draw.py @@ -1,10 +1,13 @@ # yellowbrick.draw # Utilities for common matplotlib drawing procedures. 
# -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Sun Aug 19 10:35:50 2018 -0400 # -# ID: draw.py [] benjamin@bengfort.com $ +# Copyright (C) 2018 The sckit-yb developers +# For license information, see LICENSE.txt +# +# ID: draw.py [dd915ad] benjamin@bengfort.com $ """ Utilities for common matplotlib drawing procedures. @@ -16,16 +19,18 @@ from .base import Visualizer from .exceptions import YellowbrickValueError +from .style.colors import resolve_colors from matplotlib import patches import matplotlib.pyplot as plt - +import numpy as np ########################################################################## ## Legend Drawing Utilities ########################################################################## + def manual_legend(g, labels, colors, **legend_kwargs): """ Adds a manual legend for a scatter plot to the visualizer where the labels @@ -82,9 +87,108 @@ def manual_legend(g, labels, colors, **legend_kwargs): # Create the legend handles with the associated colors and labels handles = [ - patches.Patch(color=color, label=label) - for color, label in zip(colors, labels) + patches.Patch(color=color, label=label) for color, label in zip(colors, labels) ] # Return the Legend artist return g.legend(handles=handles, **legend_kwargs) + + +def bar_stack( + data, + ax=None, + labels=None, + ticks=None, + colors=None, + colormap=None, + orientation="vertical", + legend=True, + legend_kws=None, + **kwargs +): + """ + An advanced bar chart plotting utility that can draw bar and stacked bar charts from + data, wrapping calls to the specified matplotlib.Axes object. + + Parameters + ---------- + data : 2D array-like + The data passed to the Visualizer. Rows represent each stack in the bar chart and columns + represent each bar. Therefore, a single bar chart is created by passing a 2D array + containing a single row, while the data to create a bar chart with 3 stacks would have a + shape of (3, b). 
+ + ax : matplotlib.Axes, default: None + The axes object to draw the barplot on, uses plt.gca() if not specified. + + labels : list of str, default: None + The labels for each row in the bar stack, used to create a legend. + + ticks : list of str, default: None + The labels for each bar, added to the x-axis for a vertical plot, or the y-axis + for a horizontal plot. + + colors : array-like, default: None + Specify the colors of each bar, each row in the stack, or every segment. + + colormap : string or matplotlib cmap + Specify a colormap for each bar, each row in the stack, or every segment. + + orientation:‘vertical’ or ‘horizontal’ + Specifies a horizontal or vertical bar chart. + + legend : boolean, default: True + If True, the function add a legend with the plot + + legend_kws : dict, default: None + Additional keyword arguments for the legend components. + + kwargs : dict + Additional keyword arguments to pass to ``ax.bar``. + """ + if ax is None: + ax = plt.gca() + + colors = resolve_colors(n_colors=data.shape[0], colormap=colormap, colors=colors) + + idx = np.arange(data.shape[1]) + zeros = np.zeros(data.shape[1]) + # Stores stacks for both side of plotting axes + stack_arr = np.zeros((data.shape[1], 2)) + orientation = orientation.lower() + + if orientation.startswith("h"): + + for rdx in range(len(data)): + stack = [stack_arr[j, int(data[rdx][j] > 0)] for j in range(len(data[rdx]))] + ax.barh(idx, data[rdx], left=stack, color=colors[rdx]) + # Updates the stack for negative side of y-axis + stack_arr[:, 0] += np.minimum(data[rdx], zeros) + # Updates stack for positive side of y-axis + stack_arr[:, 1] += np.maximum(data[rdx], zeros) + ax.set_yticks(idx) + if ticks is not None: + ax.set_yticklabels(ticks) + + elif orientation.startswith("v"): + for rdx in range(len(data)): + stack = [stack_arr[j, int(data[rdx][j] > 0)] for j in range(len(data[rdx]))] + ax.bar(idx, data[rdx], bottom=stack, color=colors[rdx]) + # Updates the stack for negative side of x-axis 
+ stack_arr[:, 0] += np.minimum(data[rdx], zeros) + # Updates the stack for negative side of x-axis + stack_arr[:, 1] += np.maximum(data[rdx], zeros) + ax.set_xticks(idx) + if ticks is not None: + ax.set_xticklabels(ticks, rotation=90) + + else: + raise YellowbrickValueError("unknown orientation '{}'".format(orientation)) + + # Generates default labels is labels are not specified. + labels = labels or np.arange(data.shape[0]) + + if legend: + legend_kws = legend_kws or {} + manual_legend(ax, labels=labels, colors=colors, **legend_kws) + return ax diff --git a/yellowbrick/exceptions.py b/yellowbrick/exceptions.py index 772b4741a..2bf03247e 100644 --- a/yellowbrick/exceptions.py +++ b/yellowbrick/exceptions.py @@ -1,10 +1,10 @@ # yellowbrick.exceptions # Exceptions hierarchy for the yellowbrick library # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Fri Jun 03 10:39:41 2016 -0700 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The sckit-yb developers # For license information, see LICENSE.txt # # ID: exceptions.py [cb75e0e] benjamin@bengfort.com $ @@ -22,6 +22,7 @@ class YellowbrickError(Exception): """ The root exception for all yellowbrick related errors. """ + pass @@ -29,6 +30,7 @@ class VisualError(YellowbrickError): """ A problem when interacting with matplotlib or the display framework. """ + pass @@ -36,6 +38,7 @@ class ModelError(YellowbrickError): """ A problem when interacting with sklearn or the ML framework. """ + pass @@ -43,6 +46,22 @@ class NotFitted(ModelError): """ An action was called that requires a fitted model. 
""" + + @classmethod + def from_estimator(klass, estimator, method=None): + method = method or "this method" + message = ( + "this {} instance is not fitted yet, please call fit " + "with the appropriate arguments before using {}" + ).format(estimator.__class__.__name__, method) + return klass(message) + + +class DatasetsError(YellowbrickError): + """ + A problem occured when interacting with data sets. + """ + pass @@ -50,6 +69,7 @@ class YellowbrickTypeError(YellowbrickError, TypeError): """ There was an unexpected type or none for a property or input. """ + pass @@ -57,6 +77,7 @@ class YellowbrickValueError(YellowbrickError, ValueError): """ A bad value was passed into a function. """ + pass @@ -64,32 +85,47 @@ class YellowbrickKeyError(YellowbrickError, KeyError): """ An invalid key was used in a hash (dict or set). """ + pass -class YellowbrickWarning(UserWarning): +########################################################################## +## Assertions +########################################################################## + + +class YellowbrickAssertionError(YellowbrickError, AssertionError): """ - Warning class used to notify users of Yellowbrick-specific issues. + Used to indicate test failures. """ + pass -class DataWarning(YellowbrickWarning): +class ImageComparisonFailure(YellowbrickAssertionError): """ - The supplied data has an issue that may produce unexpected visualizations. + Provides a cleaner error when image comparison assertions fail. """ + pass -class YellowbrickAssertionError(YellowbrickError, AssertionError): +########################################################################## +## Warnings +########################################################################## + + +class YellowbrickWarning(UserWarning): """ - Used to indicate test failures. + Warning class used to notify users of Yellowbrick-specific issues. 
""" + pass -class ImageComparisonFailure(YellowbrickAssertionError): +class DataWarning(YellowbrickWarning): """ - Provides a cleaner error when image comparison assertions fail. + The supplied data has an issue that may produce unexpected visualizations. """ + pass diff --git a/yellowbrick/features/__init__.py b/yellowbrick/features/__init__.py index c42917e8e..7b7fc0fc2 100644 --- a/yellowbrick/features/__init__.py +++ b/yellowbrick/features/__init__.py @@ -1,10 +1,10 @@ # yellowbrick.features # Visualizers for feature analysis and diagnostics. # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Mon Oct 03 21:30:18 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: __init__.py [0f4b236] benjamin@bengfort.com $ @@ -21,8 +21,14 @@ from .pcoords import ParallelCoordinates, parallel_coordinates from .radviz import RadialVisualizer, RadViz, radviz from .rankd import Rank1D, rank1d, Rank2D, rank2d -from .jointplot import JointPlotVisualizer -from .pca import PCADecomposition, pca_decomposition -from .importances import FeatureImportances, feature_importances -from .rfecv import RFECV, rfecv +from .jointplot import JointPlot, JointPlotVisualizer, joint_plot +from .pca import PCA, PCADecomposition, pca_decomposition from .manifold import Manifold, manifold_embedding + +# Alias the TargetType defined in yellowbrick.utils.target +from yellowbrick.utils.target import TargetType + +# RFECV and Feature Importances moved to model selection module as of YB v1.0 +from yellowbrick.model_selection.rfecv import RFECV, rfecv +from yellowbrick.model_selection.importances import FeatureImportances +from yellowbrick.model_selection.importances import feature_importances diff --git a/yellowbrick/features/base.py b/yellowbrick/features/base.py index 5623480eb..e65344277 100644 --- a/yellowbrick/features/base.py +++ b/yellowbrick/features/base.py @@ -1,11 +1,11 @@ # 
yellowbrick.features.base # Base classes for feature visualizers and feature selection tools. # -# Author: Rebecca Bilbro -# Author: Benjamin Bengfort +# Author: Rebecca Bilbro +# Author: Benjamin Bengfort # Created: Fri Oct 07 13:41:24 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: base.py [2e898a6] benjamin@bengfort.com $ @@ -19,9 +19,17 @@ ########################################################################## import numpy as np +import matplotlib as mpl from yellowbrick.base import Visualizer from yellowbrick.utils import is_dataframe +from yellowbrick.style import resolve_colors +from yellowbrick.exceptions import NotFitted +from yellowbrick.utils.target import target_color_type, TargetType +from yellowbrick.exceptions import YellowbrickKeyError, YellowbrickValueError +from yellowbrick.style import palettes + +from matplotlib.colors import Normalize from sklearn.base import TransformerMixin @@ -31,171 +39,311 @@ class FeatureVisualizer(Visualizer, TransformerMixin): - """ - Base class for feature visualization to investigate features - individually or together. + """Base class for feature visualization. + + Feature engineering is primarily conceptualized as a transformation or + extraction operation, e.g. some raw data is passed through a series of + transformers and mappings to result in some final dataset which can be + directly fitted to a model. Therefore feature visualizers are + transformers and support the sklearn transformer interface by implementing + a transform method. + + Subclasses of the FeatureVisualizer may call draw either from fit or from + transform but must implement both so that they can be supported in pipeline + objects. By default, the transform method of the visualizer is just a data + pass through that ensures the visualizer can be placed into a feature + extraction workflow. 
+ + Parameters + ---------- + ax : matplotlib.Axes, default: None + The axis to plot the figure on. If None is passed in the current axes + will be used (or generated if required). - FeatureVisualizer is itself a transformer so that it can be used in - a Scikit-Learn Pipeline to perform automatic visual analysis during build. + fig : matplotlib Figure, default: None + The figure to plot the Visualizer on. If None is passed in the current + plot will be used (or generated if required). - Accepts as input a DataFrame or Numpy array. + kwargs : dict + Any additional keyword arguments to pass to the base Visualizer. """ - def __init__(self, ax=None, **kwargs): - super(FeatureVisualizer, self).__init__(ax=ax, **kwargs) + def __init__(self, ax=None, fig=None, **kwargs): + super(FeatureVisualizer, self).__init__(ax=ax, fig=fig, **kwargs) - def transform(self, X): + def transform(self, X, y=None): """ - Primarily a pass-through to ensure that the feature visualizer will - work in a pipeline setting. This method can also call drawing methods - in order to ensure that the visualization is constructed. + A pass-through to ensure that feature visualizers work in Pipelines. + Subclasses may override this method to actually transform data or to + call drawing methods. The transformer may also take an optional y + argument if it is required for either transformation or drawing. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Feature dataset to be transformed. - This method must return a numpy array with the same shape as X. + y : array-like, shape (n_samples,) + Dependent target data associated with X, unused. + + Returns + ------- + X : array-like, shape (n_samples, n_features) + Returns the original dataset, unmodified. """ return X def fit_transform_poof(self, X, y=None, **kwargs): - """ - Fit to data, transform it, then visualize it. + """Fit, transform, then visualize data in one step. 
- Fits the visualizer to X and y with opetional parameters by passing in - all of kwargs, then calls poof with the same kwargs. This method must - return the result of the transform method. + A helper method similar to ``fit_transform`` that allows you to fit, + transform, and create a visualization in one simple step. Returns a + transformed dataset similar to ``fit_transform``. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Feature dataset for both training and transformation. + + y : array-like, shape (n_samples,) + Dependent target dataset optionally used for training. + + kwargs : dict, optional + Keyword arguments to pass to the ``poof()`` method. + + Returns + ------- + Xp : array-like, shape (m_samples, m_features) + The transformed dataset X prime. """ - Xp = self.fit_transform(X, y, **kwargs) + Xp = self.fit_transform(X, y) self.poof(**kwargs) return Xp class MultiFeatureVisualizer(FeatureVisualizer): - """ - MultiFeatureVisualiers are a subclass of FeatureVisualizer that visualize - several features at once. This class provides base functionality for - getting the names of features for use in plot annotation. + """Direct visualization of a feature set. + + MultiFeatureVisualiers visualize several features at once, usually in order + to compare the effectiveness of a subset of features to the superset. This + type of visualizer provides base functionality for identifying the names of + the features either directly from the data or from user supplied values. It + also provides other functionality for feature selection, e.g. ensuring that + a subset of features is used if specified by the user. Parameters ---------- - - ax: matplotlib Axes, default: None + ax : matplotlib.Axes, default: None The axis to plot the figure on. If None is passed in the current axes will be used (or generated if required). 
- features: list, default: None - a list of feature names to use - If a DataFrame is passed to fit and features is None, feature - names are selected as the columns of the DataFrame. + fig : matplotlib Figure, default: None + The figure to plot the Visualizer on. If None is passed in the current + plot will be used (or generated if required). - kwargs : dict + features : list, default: None + The names of the features specified by the columns of the input dataset. + This length of this list must match the number of columns in X, otherwise + an exception will be raised on ``fit()``. + + kwargs : dict, optional Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. + Attributes + ---------- + features_ : ndarray, shape (n_features,) + The names of the features discovered or used in the visualizer that + can be used as an index to access or modify data in X. If a user passes + feature names in, those features are used. Otherwise the columns of a + DataFrame are used or just simply the indices of the data array. """ - def __init__(self, ax=None, features=None, **kwargs): - super(MultiFeatureVisualizer, self).__init__(ax=ax, **kwargs) + def __init__(self, ax=None, fig=None, features=None, **kwargs): + super(MultiFeatureVisualizer, self).__init__(ax=ax, fig=fig, **kwargs) # Data Parameters - self.features_ = features + self.features = features - def fit(self, X, y=None, **fit_params): + def fit(self, X, y=None): """ This method performs preliminary computations in order to set up the figure or perform other analyses. It can also call drawing methods in order to set up various non-instance related figure elements. - This method must return self. - """ + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Feature dataset to be transformed. - # Handle the feature names if they're None. 
- if self.features_ is None: + y : array-like, shape (n_samples,) + Optional dependent target data associated with X. - # If X is a data frame, get the columns off it. + Returns + ------- + self : MultiFeatureVisualizer + Returns the visualizer/transformer for use in Pipelines and chaining. + """ + n_columns = X.shape[1] + + if self.features is not None: + # Use the user-specified features with some checking + # TODO: allow the user specified features to filter the dataset + if len(self.features) != n_columns: + raise YellowbrickValueError( + ( + "number of supplied feature names does not match the number " + "of columns in the training data." + ) + ) + + self.features_ = np.array(self.features) + + else: + # Attempt to determine the feature names from the input data if is_dataframe(X): self.features_ = np.array(X.columns) # Otherwise create numeric labels for each column. else: - _, ncols = X.shape - self.features_ = np.arange(0, ncols) + self.features_ = np.arange(0, n_columns) + # Ensure super is called and fit is returned + super(MultiFeatureVisualizer, self).fit(X, y) return self + ########################################################################## ## Data Visualizers ########################################################################## + class DataVisualizer(MultiFeatureVisualizer): - """ - Data Visualizers are a subclass of Feature Visualizers which plot the - instances in feature space (also called data space, hence the name of the - visualizer). Feature space is a multi-dimensional space defined by the - columns of the instance dependent vector input, X which is passed to - ``fit()`` and ``transform()``. Instances can also be labeled by the target - independent vector input, y which is only passed to ``fit()``. For that - reason most Data Visualizers perform their drawing in ``fit()``. + """Visualizations of instances in feature space. + + Data Visualizers plot instances in feature space (sometimes also referred + to as data space). 
Feature space is a multi-dimensional space defined by + the columns of the dataset ``X`` when passed to ``fit()`` and ``transform``. + These instances and their features are directly plotted in a representation + of the higher dimensional space. + + Instances can also be labeled by an target vectory, ``y``. The target is + visualized in data space by color. For example a discrete target for + classification problems will use categorical colors and a legend. A + continuous target for regression problems will use sequential colors with + a colormap. This class provides helper functionality related to target identification: whether or not the target is sequential or categorical, and mapping a - color sequence or color set to the targets as appropriate. It also uses - the fit method to call the drawing utilities. + color sequence or color set to the targets as appropriate. It also + determines the scope of the target, e.g. the unique classes or the range + of the dataset for use in specific visualizations. Parameters ---------- - - ax: matplotlib Axes, default: None + ax : matplotlib.Axes, default: None The axis to plot the figure on. If None is passed in the current axes will be used (or generated if required). - features: list, default: None - a list of feature names to use - If a DataFrame is passed to fit and features is None, feature - names are selected as the columns of the DataFrame. - - classes: list, default: None - a list of class names for the legend - If classes is None and a y value is passed to fit then the classes - are selected from the target vector. - - color: list or tuple, default: None - optional list or tuple of colors to colorize lines - Use either color to colorize the lines on a per class basis or - colormap to color them on a continuous scale. 
- - colormap: string or cmap, default: None - optional string or matplotlib cmap to colorize lines - Use either color to colorize the lines on a per class basis or - colormap to color them on a continuous scale. + fig : matplotlib Figure, default: None + The figure to plot the Visualizer on. If None is passed in the current + plot will be used (or generated if required). + + features : list, default: None + The names of the features specified by the columns of the input dataset. + This length of this list must match the number of columns in X, otherwise + an exception will be raised on ``fit()``. + + classes : list, default: None + The class labels for each class in y, ordered by sorted class index. These + names act as a label encoder for the legend, identifying integer classes + or renaming string labels. If omitted, the class labels will be taken from + the unique values in y. + + Note that the length of this list must match the number of unique values in + y, otherwise an exception is raised. This parameter is only used in the + discrete target type case and is ignored otherwise. + + colors : list or tuple, default: None + A single color to plot all instances as or a list of colors to color each + instance according to its class in the discrete case or as an ordered + colormap in the sequential case. If not enough colors per class are + specified then the colors are treated as a cycle. + + colormap : string or cmap, default: None + The colormap used to create the individual colors. In the discrete case + it is used to compute the number of colors needed for each class and + in the continuous case it is used to create a sequential color map based + on the range of the target. + + target_type : str, default: "auto" + Specify the type of target as either "discrete" (classes) or "continuous" + (real numbers, usually for regression). If "auto", then it will + attempt to determine the type by counting the number of unique values. 
+ + If the target is discrete, the colors are returned as a dict with classes + being the keys. If continuous the colors will be list having value of + color for each point. In either case, if no target is specified, then + color will be specified as the first color in the color cycle. kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. - Notes - ----- - These parameters can be influenced later on in the visualization - process, but can and should be set as early as possible. + Attributes + ---------- + features_ : ndarray, shape (n_features,) + The names of the features discovered or used in the visualizer that + can be used as an index to access or modify data in X. If a user passes + feature names in, those features are used. Otherwise the columns of a + DataFrame are used or just simply the indices of the data array. + + classes_ : ndarray, shape (n_classes,) + The class labels that define the discrete values in the target. Only + available if the target type is discrete. This is guaranteed to be + strings even if the classes are a different type. + + range_ : (min y, max y) + A tuple that describes the minimum and maximum values in the target. + Only available if the target type is continuous. """ - def __init__(self, ax=None, features=None, classes=None, color=None, - colormap=None, **kwargs): - """ - Initialize the data visualization with many of the options required - in order to make most visualizations work. 
- """ - super(DataVisualizer, self).__init__(ax=ax, features=features, **kwargs) + def __init__( + self, + ax=None, + fig=None, + features=None, + classes=None, + colors=None, + colormap=None, + target_type="auto", + **kwargs + ): + super(DataVisualizer, self).__init__( + ax=ax, fig=fig, features=features, **kwargs + ) + + # Validate raises YellowbrickValueError if invalid + TargetType.validate(target_type) # Data Parameters - self.classes_ = classes + self.classes = classes + self.target_type = target_type # Visual Parameters - self.color = color + self.colors = colors self.colormap = colormap - def fit(self, X, y=None, **kwargs): + # Internal attributes + self._colors = None + self._target_color_type = None + self._label_encoder = None + + def fit(self, X, y=None): """ - The fit method is the primary drawing input for the - visualization since it has both the X and y data required for the - viz and the transform method does not. + Fits the visualizer to the training data set by determining the + target type, colors, classes, and range of the data to ensure that + the visualizer can accurately portray the instances in data space. 
Parameters ---------- @@ -205,23 +353,146 @@ def fit(self, X, y=None, **kwargs): y : ndarray or Series of length n An array or series of target or class values - kwargs : dict - Pass generic arguments to the drawing method - Returns ------- - self : instance + self : DataVisualizer Returns the instance of the transformer/visualizer """ - super(DataVisualizer, self).fit(X, y, **kwargs) + # Compute the features from the data + super(DataVisualizer, self).fit(X, y) + + # Determine the target color type + self._determine_target_color_type(y) + + # Handle the single target color type + if self._target_color_type == TargetType.SINGLE: + # use the user supplied color or the first color in the color cycle + self._colors = self.colors or "C0" + + # Compute classes and colors if target type is discrete + elif self._target_color_type == TargetType.DISCRETE: + # Unique labels are used both for validation and color mapping + labels = np.unique(y) + + # Handle user supplied classes + if self.classes is not None: + self.classes_ = np.asarray([str(c) for c in self.classes]) + + # Validate user supplied class labels + if len(self.classes_) != len(labels): + raise YellowbrickValueError( + ( + "number of specified classes does not match " + "number of unique values in target" + ) + ) + + # Get the string labels from the unique values in y + else: + self.classes_ = np.asarray([str(c) for c in labels]) + + # Create a map of class labels to colors + color_values = resolve_colors( + n_colors=len(self.classes_), colormap=self.colormap, colors=self.colors + ) + self._colors = dict(zip(self.classes_, color_values)) + self._label_encoder = dict(zip(labels, self.classes_)) + + # Compute target range if colors are continuous + elif self._target_color_type == TargetType.CONTINUOUS: + y = np.asarray(y) + self.range_ = (y.min(), y.max()) + if self.colormap is None: + self.colormap = palettes.DEFAULT_SEQUENCE + # TODO: allow for Yellowbrick palettes here as well + self._colors = 
mpl.cm.get_cmap(self.colormap) + + # If this exception is raised a developer error has occurred because + # unknown types should have errored when the type was determined. + else: + raise YellowbrickValueError( + "unknown target color type '{}'".format(self._target_color_type) + ) + + # NOTE: cannot call draw in fit to support data transformers + return self - # Store the classes for the legend if they're None. - if self.classes_ is None: - # TODO: Is this the most efficient method? - self.classes_ = [str(label) for label in np.unique(y)] + def _determine_target_color_type(self, y): + """ + Determines the target color type from the vector y as follows: - # Draw the instances - self.draw(X, y, **kwargs) + - if y is None: only a single color is used + - if target is auto: determine if y is continuous or discrete + - otherwise specify supplied target type - # Fit always returns self. - return self + This property will be used to compute the colors for each point. + """ + if y is None: + self._target_color_type = TargetType.SINGLE + elif self.target_type == TargetType.AUTO: + self._target_color_type = target_color_type(y) + else: + self._target_color_type = TargetType(self.target_type) + + # Ensures that target is either SINGLE, DISCRETE or CONTINUOUS before continuing + if ( + self._target_color_type == TargetType.AUTO + or self._target_color_type == TargetType.UNKNOWN + ): + raise YellowbrickValueError( + ( + "could not determine target color type " "from target='{}' to '{}'" + ).format(self.target_type, self._target_color_type) + ) + + def get_target_color_type(self): + """ + Returns the computed target color type if fitted or specified by the user. + """ + if self._target_color_type is None: + raise NotFitted("unknown target color type on unfitted visualizer") + return self._target_color_type + + def get_colors(self, y): + """ + Returns the color for the specified value(s) of y based on the learned + colors property for any specified target type. 
+ + Parameters + ---------- + y : array-like + The values of y to get the associated colors for. + + Returns + ------- + colors : list + Returns a list of colors for each value in y. + """ + if self._colors is None: + raise NotFitted("cannot determine colors on unfitted visualizer") + + if self._target_color_type == TargetType.SINGLE: + return [self._colors] * len(y) + + if self._target_color_type == TargetType.DISCRETE: + try: + # Use the label encoder to get the class name (or use the value + # if the label is not mapped in the encoder) then use the class + # name to get the color from the color map. + return [self._colors[self._label_encoder.get(yi, yi)] for yi in y] + except KeyError: + unknown = set(y) - set(self._label_encoder.keys()) + unknown = ", ".join(["'{}'".format(uk) for uk in unknown]) + raise YellowbrickKeyError( + "could not determine color for classes {}".format(unknown) + ) + + if self._target_color_type == TargetType.CONTINUOUS: + # Normalize values into target range and compute colors from colormap + norm = Normalize(*self.range_) + return self._colors(norm(y)) + + # This is a developer error, we should never get here! 
+ raise YellowbrickValueError( + "unknown target color type '{}'".format(self._target_color_type) + ) diff --git a/yellowbrick/features/decomposition.py b/yellowbrick/features/decomposition.py index c679f1d5f..c8cbb4448 100644 --- a/yellowbrick/features/decomposition.py +++ b/yellowbrick/features/decomposition.py @@ -1,9 +1,19 @@ +# yellowbrick.features.decomposition +# +# Author: George Richardson +# Created: Fri Mar 2 16:16:00 2018 +0000 +# +# Copyright (C) 2016 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: decomposition.py [0ed6e8a] g.raymond.richardson@gmail.com $ + ########################################################################## ## Imports ########################################################################## -from .base import FeatureVisualizer from yellowbrick.style import palettes +from yellowbrick.features.base import FeatureVisualizer from sklearn.pipeline import Pipeline from sklearn.decomposition import PCA @@ -13,11 +23,18 @@ ## Quick Methods ########################################################################## -def explained_variance_visualizer(X, y=None, ax=None, scale=True, - center=True, colormap=palettes.DEFAULT_SEQUENCE, - **kwargs): - """Produce a plot of the explained variance produced by a dimensionality - reduction algorithm using n=1 to n=n_components dimensions. This is a single + +def explained_variance_visualizer( + X, + y=None, + ax=None, + scale=True, + center=True, + colormap=palettes.DEFAULT_SEQUENCE, + **kwargs +): + """Produce a plot of the explained variance produced by a dimensionality + reduction algorithm using n=1 to n=n_components dimensions. This is a single plot to help identify the best trade off between number of dimensions and amount of information retained within the data. 
@@ -28,7 +45,7 @@ def explained_variance_visualizer(X, y=None, ax=None, scale=True, y : ndarray or Series of length n An array or Series of target or class values - + ax : matplotlib Axes, default: None The aces to plot the figure on @@ -43,38 +60,58 @@ def explained_variance_visualizer(X, y=None, ax=None, scale=True, kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. - + + Returns + ------- + viz : ExplainedVariance + Returns the fitted, finalized visualizer + Examples -------- >>> from sklearn import datasets >>> bc = datasets.load_breast_cancer() >>> X = bc = bc.data >>> explained_variance_visualizer(X, scale=True, center=True, colormap='RdBu_r') - + """ - # Instantiate the visualizer - visualizer = ExplainedVariance(X=X) + # Instantiate the visualizer + visualizer = ExplainedVariance(X=X) - # Fit and transform the visualizer (calls draw) - visualizer.fit(X, y, **kwargs) - visualizer.transform(X) + # Fit and transform the visualizer (calls draw) + visualizer.fit(X, y, **kwargs) + visualizer.transform(X) + visualizer.finalize() - # Return the axes object on the visualizer - return visualizer.poof() + # Return the visualizer object + return visualizer ########################################################################## ## Explained Variance Feature Visualizer ########################################################################## + class ExplainedVariance(FeatureVisualizer): """ Parameters ---------- - - + ax : matplotlib Axes, default: None + The aces to plot the figure on + + scale : bool, default: True + Boolean that indicates if the values of X should be scaled. + + colormap : string or cmap, default: None + optional string or matplotlib cmap to colorize lines + Use either color to colorize the lines on a per class basis or + colormap to color them on a continuous scale. 
+ + kwargs : dict + Keyword arguments that are passed to the base class and may influence + the visualization as defined in other Visualizers. + Examples -------- @@ -84,13 +121,17 @@ class ExplainedVariance(FeatureVisualizer): >>> visualizer.transform(X) >>> visualizer.poof() - Notes - ----- - """ - def __init__(self, n_components=None, ax=None, scale=True, center=True, - colormap=palettes.DEFAULT_SEQUENCE, **kwargs): + def __init__( + self, + ax=None, + scale=True, + center=True, + n_components=None, + colormap=palettes.DEFAULT_SEQUENCE, + **kwargs + ): super(ExplainedVariance, self).__init__(ax=ax, **kwargs) @@ -98,9 +139,12 @@ def __init__(self, n_components=None, ax=None, scale=True, center=True, self.n_components = n_components self.center = center self.scale = scale - self.pipeline = Pipeline([('scale', StandardScaler(with_mean=self.center, - with_std=self.scale)), - ('pca', PCA(n_components=self.n_components))]) + self.pipeline = Pipeline( + [ + ("scale", StandardScaler(with_mean=self.center, with_std=self.scale)), + ("pca", PCA(n_components=self.n_components)), + ] + ) self.pca_features = None @property @@ -123,9 +167,8 @@ def draw(self): def finalize(self, **kwargs): # Set the title - self.set_title('Explained Variance Plot') + self.set_title("Explained Variance Plot") # Set the axes labels - self.ax.set_ylabel('Explained Variance') - self.ax.set_xlabel('Number of Components') - + self.ax.set_ylabel("Explained Variance") + self.ax.set_xlabel("Number of Components") diff --git a/yellowbrick/features/jointplot.py b/yellowbrick/features/jointplot.py index 4403745c2..ab4f4732c 100644 --- a/yellowbrick/features/jointplot.py +++ b/yellowbrick/features/jointplot.py @@ -1,11 +1,10 @@ - # yellowbrick.features.jointplot -# Implementations of joint plots for univariate and bivariate analysis. +# Implementation of joint plots for univariate and bivariate analysis. 
# # Author: Prema Damodaran Roman # Created: Mon Apr 10 21:00:54 2017 -0400 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers. # For license information, see LICENSE.txt # # ID: jointplot.py [7f47800] pdamodaran@users.noreply.github.com $ @@ -14,306 +13,439 @@ ## Imports ########################################################################## -import warnings import numpy as np -import matplotlib as mpl import matplotlib.pyplot as plt -from yellowbrick.features.base import FeatureVisualizer -from yellowbrick.exceptions import YellowbrickValueError -from yellowbrick.bestfit import draw_best_fit -from yellowbrick.utils import is_dataframe +try: + # Only available in Matplotlib >= 2.0.2 + from mpl_toolkits.axes_grid1 import make_axes_locatable +except ImportError: + make_axes_locatable = None + +from .base import FeatureVisualizer + +# from ..bestfit import draw_best_fit # TODO: return in #728 +from ..utils.types import is_dataframe +from ..exceptions import YellowbrickValueError +from scipy.stats import pearsonr, spearmanr, kendalltau + + +# Default Colors +# TODO: should we reuse these colors? +FACECOLOR = "#FAFAFA" +HISTCOLOR = "#6897bb" + + +# Objects for export +__all__ = ["JointPlot", "JointPlotVisualizer", "joint_plot"] + ########################################################################## ## Joint Plot Visualizer ########################################################################## -class JointPlotVisualizer(FeatureVisualizer): +class JointPlot(FeatureVisualizer): """ - JointPlotVisualizer allows for a simultaneous visualization of the relationship - between two variables and the distrbution of each individual variable. The - relationship is plotted along the joint axis and univariate distributions - are plotted on top of the x axis and to the right of the y axis. 
+ Joint plots are useful for machine learning on multi-dimensional data, allowing for + the visualization of complex interactions between different data dimensions, their + varying distributions, and even their relationships to the target variable for + prediction. + + The Yellowbrick ``JointPlot`` can be used both for pairwise feature analysis and + feature-to-target plots. For pairwise feature analysis, the ``columns`` argument can + be used to specify the index of the two desired columns in ``X``. If ``y`` is also + specified, the plot can be colored with a heatmap or by class. For feature-to-target + plots, the user can provide either ``X`` and ``y`` as 1D vectors, or a ``columns`` + argument with an index to a single feature in ``X`` to be plotted against ``y``. + + Histograms can be included by setting the ``hist`` argument to ``True`` for a + frequency distribution, or to ``"density"`` for a probability density function. Note + that histograms requires matplotlib 2.0.2 or greater. Parameters ---------- - ax: matplotlib Axes, default: None - This is inherited from FeatureVisualizer but is defined within - JointPlotVisualizer since there are three axes objects. - - feature: string, default: None - The name of the X variable - If a DataFrame is passed to fit and feature is None, feature - is selected as the column of the DataFrame. There must be only - one column in the DataFrame. - - target: string, default: None - The name of the Y variable - If target is None and a y value is passed to fit then the target - is selected from the target vector. - - joint_plot: one of {'scatter', 'hex'}, default: 'scatter' - The type of plot to render in the joint axis - Currently, the choices are scatter and hex. 
- Use scatter for small datasets and hex for large datasets - - joint_args: dict, default: None - Keyword arguments used for customizing the joint plot: - - ============= ================================================================== - Property Description - ------------- ------------------------------------------------------------------ - alpha transparency - facecolor background color of the joint axis - aspect aspect ratio - fit used if scatter is selected for joint_plot to draw a - best fit line - values can be True or False. - Uses ``Yellowbrick.bestfit`` - estimator used if scatter is selected for joint_plot to determine - the type of best fit line to use. Refer to - Yellowbrick.bestfit for types of estimators that can be used. - x_bins used if hex is selected to set the number of bins for the x value - y_bins used if hex is selected to set the number of bins for the y value - cmap string or matplotlib cmap to colorize lines - Use either color to colorize the lines on a per class basis or - colormap to color them on a continuous scale. 
- ============= ================================================================== - - xy_plot: one of {'hist'}, default: 'hist' - The type of plot to render along the x and y axes - Currently, the choice is hist - - xy_args: dict, default: None - Keyword arguments used for customizing the x and y plots: - - ============== ===================================================== - Property Description - -------------- ----------------------------------------------------- - alpha transparency - facecolor_x background color of the x axis - facecolor_y background color of the y axis - bins used to set up the number of bins for the hist plot - histcolor_x used to set the color for the histogram on the x axis - histcolor_y used to set the color for the histogram on the y axis - ============== ===================================================== - - size: float, default: 600 - Size of each side of the figure in pixels - - ratio: float, default: 5 - Ratio of joint axis size to the x and y axes height - - space: float, default: 0.2 - Space between the joint axis and the x and y axes + ax : matplotlib Axes, default: None + The axes to plot the figure on. If None is passed in the current axes will be + used (or generated if required). This is considered the base axes where the + the primary joint plot is drawn. It will be shifted and two additional axes + added above (xhax) and to the right (yhax) if hist=True. + + columns : int, str, [int, int], [str, str], default: None + Determines what data is plotted in the joint plot and acts as a selection index + into the data passed to ``fit(X, y)``. This data therefore must be indexable by + the column type (e.g. an int for a numpy array or a string for a DataFrame). + + If None is specified then either both X and y must be 1D vectors and they will + be plotted against each other or X must be a 2D array with only 2 columns. 
If a + single index is specified then the data is indexed as ``X[columns]`` and plotted + jointly with the target variable, y. If two indices are specified then they are + both selected from X, additionally in this case, if y is specified, then it is + used to plot the color of points. + + Note that these names are also used as the x and y axes labels if they aren't + specified in the joint_kws argument. + + correlation : str, default: 'pearson' + The algorithm used to compute the relationship between the variables in the + joint plot, one of: 'pearson', 'covariance', 'spearman', 'kendalltau'. + + kind : str in {'scatter', 'hex'}, default: 'scatter' + The type of plot to render in the joint axes. Note that when kind='hex' the + target cannot be plotted by color. + + hist : {True, False, None, 'density', 'frequency'}, default: True + Draw histograms showing the distribution of the variables plotted jointly. + If set to 'density', the probability density function will be plotted. + If set to True or 'frequency' then the frequency will be plotted. + Requires Matplotlib >= 2.0.2. + + alpha : float, default: 0.65 + Specify a transparency where 1 is completely opaque and 0 is completely + transparent. This property makes densely clustered points more visible. + + {joint, hist}_kws : dict, default: None + Additional keyword arguments for the plot components. kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. + Attributes + ---------- + corr_ : float + The correlation or relationship of the data in the joint plot, specified by the + correlation algorithm. + Examples -------- - >>> visualizer = JointPlotVisualizer() - >>> visualizer.fit(X,y) - >>> visualizer.poof() - - Notes - ----- - These parameters can be influenced later on in the visualization - process, but can and should be set as early as possible. 
+ >>> viz = JointPlot(columns=["temp", "humidity"]) + >>> viz.fit(X, y) + >>> viz.poof() """ - def __init__(self, ax=None, feature=None, target=None, - joint_plot='scatter', joint_args=None, - xy_plot='hist', xy_args=None, - size=600, ratio=5, space=.2, **kwargs): - - # Check matplotlib version - needs to be version 2.0.0 or greater. - mpl_vers_maj = int(mpl.__version__.split(".")[0]) - if mpl_vers_maj < 2: - warnings.warn(( - "{} requires matplotlib major version 2 or greater. " - "Please upgrade." - ).format(self.__class__.__name__)) - - super(JointPlotVisualizer, self).__init__(ax, **kwargs) - - self.feature = feature - self.target = target - self.joint_plot = joint_plot - self.joint_args = joint_args - self.xy_plot = xy_plot - self.xy_args = xy_args - self.size = (size, size) - self.ratio = ratio - self.space = space - - def fit(self, X, y, **kwargs): - """ - Sets up the X and y variables for the jointplot - and checks to ensure that X and y are of the - correct data type + # TODO: should we couple more closely with Rank2D? 
+ correlation_methods = { + "pearson": lambda x, y: pearsonr(x, y)[0], + "spearman": lambda x, y: spearmanr(x, y)[0], + "covariance": lambda x, y: np.cov(x, y)[0, 1], + "kendalltau": lambda x, y: kendalltau(x, y)[0], + } + + def __init__( + self, + ax=None, + columns=None, + correlation="pearson", + kind="scatter", + hist=True, + alpha=0.65, + joint_kws=None, + hist_kws=None, + **kwargs + ): + # Initialize the visualizer + super(JointPlot, self).__init__(ax=ax, **kwargs) + self._xhax, self._yhax = None, None + + # Set and validate the columns + self.columns = columns + if self.columns is not None and not isinstance(self.columns, (int, str)): + self.columns = tuple(self.columns) + if len(self.columns) > 2: + raise YellowbrickValueError( + ( + "'{}' contains too many indices or is invalid for joint plot - " + "specify either a single int or str index or two columns as a list" + ).format(columns) + ) + + # Seet and validate the correlation + self.correlation = correlation + if self.correlation not in self.correlation_methods: + raise YellowbrickValueError( + "'{}' is an invalid correlation method, use one of {}".format( + self.correlation, ", ".join(self.correlation_methods.keys()) + ) + ) - Fit calls draw + # Set and validate the kind of plot + self.kind = kind + if self.kind not in {"scatter", "hex", "hexbin"}: + raise YellowbrickValueError( + ("'{}' is invalid joint plot kind, use 'scatter' or 'hex'").format( + self.kind + ) + ) - Parameters - ---------- + # Set and validate the histogram if specified + self.hist = hist + if self.hist not in {True, "density", "frequency", None, False}: + raise YellowbrickValueError( + ( + "'{}' is an invalid argument for hist, use None, True, " + "False, 'density', or 'frequency'" + ).format(hist) + ) - X : ndarray or DataFrame of shape n x 1 - A matrix of n instances with 1 feature + # If hist is True, test the version availability + if self.hist in {True, "density", "frequency"}: + self._layout() - y : ndarray or Series of length 
n - An array or series of the target value + # Set the additional visual parameters + self.alpha = alpha + self.joint_kws = joint_kws + self.hist_kws = hist_kws - kwargs: dict - keyword arguments passed to Scikit-Learn API. + @property + def xhax(self): """ + The axes of the histogram for the top of the JointPlot (X-axis) + """ + if self._xhax is None: + raise AttributeError( + "this visualizer does not have a histogram for the X axis" + ) + return self._xhax - #throw an error if X has more than 1 column - if is_dataframe(X): - nrows, ncols = X.shape - - if ncols > 1: - raise YellowbrickValueError(( - "X needs to be an ndarray or DataFrame with one feature, " - "please select one feature from the DataFrame" - )) + @property + def yhax(self): + """ + The axes of the histogram for the right of the JointPlot (Y-axis) + """ + if self._yhax is None: + raise AttributeError( + "this visualizer does not have a histogram for the Y axis" + ) + return self._yhax - #throw an error is y is None - if y is None: - raise YellowbrickValueError(( - "Joint plots are useful for classification and regression " - "problems, which require a target variable" - )) + def _layout(self): + """ + Creates the grid layout for the joint plot, adding new axes for the histograms + if necessary and modifying the aspect ratio. Does not modify the axes or the + layout if self.hist is False or None. + """ + # Ensure the axes are created if not hist, then return. 
+ if not self.hist: + self.ax + return + + # Ensure matplotlib version compatibility + if make_axes_locatable is None: + raise YellowbrickValueError( + ( + "joint plot histograms requires matplotlib 2.0.2 or greater " + "please upgrade matplotlib or set hist=False on the visualizer" + ) + ) + # Create the new axes for the histograms + divider = make_axes_locatable(self.ax) + self._xhax = divider.append_axes("top", size=1, pad=0.1, sharex=self.ax) + self._yhax = divider.append_axes("right", size=1, pad=0.1, sharey=self.ax) - # Handle the feature name if it is None. - if self.feature is None: + # Modify the display of the axes + self._xhax.xaxis.tick_top() + self._yhax.yaxis.tick_right() + self._xhax.grid(False, axis="y") + self._yhax.grid(False, axis="x") - # If X is a data frame, get the columns off it. - if is_dataframe(X): - self.feature = X.columns + def fit(self, X, y=None): + """ + Fits the JointPlot, creating a correlative visualization between the columns + specified during initialization and the data and target passed into fit: - else: - self.feature = ['x'] + - If self.columns is None then X and y must both be specified as 1D arrays + or X must be a 2D array with only 2 columns. + - If self.columns is a single int or str, that column is selected to be + visualized against the target y. + - If self.columns is two ints or strs, those columns are visualized against + each other. If y is specified then it is used to color the points. - # Handle the target name if it is None. - if self.target is None: - self.target = ['y'] + This is the main entry point into the joint plot visualization. - self.draw(X, y, **kwargs) - return self + Parameters + ---------- + X : array-like + An array-like object of either 1 or 2 dimensions depending on self.columns. + Usually this is a 2D table with shape (n, m) - def draw(self, X, y, **kwargs): + y : array-like, default: None + An vector or 1D array that has the same length as X. 
May be used to either + directly plot data or to color data points. """ - Sets up the layout for the joint plot draw calls ``draw_joint`` and - ``draw_xy`` to render the visualizations. - """ - fig = plt.gcf() - gs = plt.GridSpec(self.ratio + 1, self.ratio + 1) + # Convert python objects to numpy arrays + if isinstance(X, (list, tuple)): + X = np.array(X) + + if y is not None and isinstance(y, (list, tuple)): + y = np.array(y) + + # Case where no columns are specified + if self.columns is None: + if (y is None and (X.ndim != 2 or X.shape[1] != 2)) or ( + y is not None and (X.ndim != 1 or y.ndim != 1) + ): + raise YellowbrickValueError( + ( + "when self.columns is None specify either X and y as 1D arrays " + "or X as a matrix with 2 columns" + ) + ) + + if y is None: + # Draw the first column as x and the second column as y + self.draw(X[:, 0], X[:, 1], xlabel="0", ylabel="1") + return self + + # Draw x against y + self.draw(X, y, xlabel="x", ylabel="y") + return self + + # Case where a single string or int index is specified + if isinstance(self.columns, (int, str)): + if y is None: + raise YellowbrickValueError( + "when self.columns is a single index, y must be specified" + ) + + # fetch the index from X -- raising index error if not possible + x = self._index_into(self.columns, X) + self.draw(x, y, xlabel=str(self.columns), ylabel="target") + return self + + # Case where there is a double index for both columns + columns = tuple(self.columns) + if len(columns) != 2: + raise YellowbrickValueError( + ("'{}' contains too many indices or is invalid for joint plot").format( + columns + ) + ) - #Set up the 3 axes objects - joint_ax = fig.add_subplot(gs[1:, :-1]) - x_ax = fig.add_subplot(gs[0, :-1], sharex=joint_ax) - y_ax = fig.add_subplot(gs[1:, -1], sharey=joint_ax) + # TODO: color the points based on the target if it is given + x = self._index_into(columns[0], X) + y = self._index_into(columns[1], X) + self.draw(x, y, xlabel=str(columns[0]), ylabel=str(columns[1])) 
+ return self - fig.tight_layout() - fig.subplots_adjust(hspace=self.space, wspace=self.space) + def draw(self, x, y, xlabel=None, ylabel=None): + """ + Draw the joint plot for the data in x and y. - self.fig = fig - self.joint_ax = joint_ax - self.x_ax = x_ax - self.y_ax = y_ax + Parameters + ---------- + x, y : 1D array-like + The data to plot for the x axis and the y axis - self.draw_joint(X, y, **kwargs) - self.draw_xy(X, y, **kwargs) + xlabel, ylabel : str + The labels for the x and y axes. + """ + # This is a little weird to be here, but it is the best place to perform + # this computation given how fit calls draw and returns. + self.corr_ = self.correlation_methods[self.correlation](x, y) + + # First draw the joint plot + joint_kws = self.joint_kws or {} + joint_kws.setdefault("alpha", self.alpha) + joint_kws.setdefault("label", "{}={:0.3f}".format(self.correlation, self.corr_)) + + # Draw scatter joint plot + if self.kind == "scatter": + self.ax.scatter(x, y, **joint_kws) + + # TODO: Draw best fit line (or should this be kind='reg'?) 
+ + # Draw hexbin joint plot + elif self.kind in ("hex", "hexbin"): + joint_kws.setdefault("mincnt", 1) + joint_kws.setdefault("gridsize", 50) + joint_kws.setdefault("cmap", "Blues") + self.ax.hexbin(x, y, **joint_kws) + + # Something bad happened + else: + raise ValueError("unknown joint plot kind '{}'".format(self.kind)) + + # Set the X and Y axis labels on the plot + self.ax.set_xlabel(xlabel) + self.ax.set_ylabel(ylabel) + + # If we're not going to draw histograms, stop here + if not self.hist: + # Ensure the current axes is always the main joint plot axes + plt.sca(self.ax) + return self.ax + + # Draw the histograms + hist_kws = self.hist_kws or {} + hist_kws.setdefault("bins", 50) + if self.hist == "density": + hist_kws.setdefault("density", True) + + self.xhax.hist(x, **hist_kws) + self.yhax.hist(y, orientation="horizontal", **hist_kws) + + # Ensure the current axes is always the main joint plot axes + plt.sca(self.ax) + return self.ax - def draw_joint(self, X, y, **kwargs): + def finalize(self, **kwargs): """ - Draws the visualization for the joint axis. + Finalize executes any remaining image modifications making it ready to show. 
""" - - if self.joint_args is None: - self.joint_args = {} - - self.joint_args.setdefault("alpha", 0.4) - facecolor = self.joint_args.pop("facecolor", "#dddddd") - self.joint_ax.set_facecolor(facecolor) - - if self.joint_plot == "scatter": - aspect = self.joint_args.pop("aspect", "auto") - self.joint_ax.set_aspect(aspect) - self.joint_ax.scatter(X, y, **self.joint_args) - - fit = self.joint_args.pop("fit", True) - if fit: - estimator = self.joint_args.pop("estimator", "linear") - draw_best_fit(X, y, self.joint_ax, estimator) - - elif self.joint_plot == "hex": - x_bins = self.joint_args.pop("x_bins", 50) - y_bins = self.joint_args.pop("y_bins", 50) - colormap = self.joint_args.pop("cmap", 'Blues') - gridsize = int(np.mean([x_bins, y_bins])) - - xmin = X.min() - xmax = X.max() - ymin = y.min() - ymax = y.max() - - self.joint_ax.hexbin(X, y, - gridsize=gridsize, cmap=colormap, mincnt=1, **self.joint_args - ) - self.joint_ax.axis([xmin, xmax, ymin, ymax]) - - def draw_xy(self, X, y, **kwargs): + # Set the aspect ratio to make the visualization square + # TODO: still unable to make plot square using make_axes_locatable + # x0,x1 = self.ax.get_xlim() + # y0,y1 = self.ax.get_ylim() + # self.ax.set_aspect(abs(x1-x0)/abs(y1-y0)) + + # Add the title to the plot if the user has set one. + self.set_title("") + + # TODO: use manual legend so legend works with both scatter and hexbin + # Set the legend with full opacity patches using manual legend. + # Or Add the colorbar if this is a continuous plot. 
+ if self.kind == "scatter": + self.ax.legend(loc="best", frameon=True) + + # Finalize the histograms + if self.hist: + plt.setp(self.xhax.get_xticklabels(), visible=False) + plt.setp(self.yhax.get_yticklabels(), visible=False) + plt.sca(self.ax) + + # Call tight layout to maximize readability + plt.tight_layout() + + def _index_into(self, idx, data): """ - Draws the visualization for the x and y axes + Attempts to get the column from the data using the specified index, raises an + exception if this is not possible from this point in the stack. """ + try: + if is_dataframe(data): + # Assume column indexing + return data[idx] + # Otherwise assume numpy array-like indexing + return data[:, idx] + except Exception as e: + raise IndexError( + "could not index column '{}' into type {}: {}".format( + self.columns, data.__class__.__name__, e + ) + ) - if self.xy_args is None: - self.xy_args = {} - facecolor_x = self.xy_args.pop("facecolor_x", "#dddddd") - self.x_ax.set_facecolor(facecolor_x) - facecolor_y = self.xy_args.pop("facecolor_y", "#dddddd") - self.y_ax.set_facecolor(facecolor_y) +# Alias for JointPlot +JointPlotVisualizer = JointPlot - if self.xy_plot == "hist": - hist_bins = self.xy_args.pop("bins", 50) - self.xy_args.setdefault("alpha", 0.4) - histcolor_x = self.xy_args.pop("histcolor_x", "#6897bb") - self.x_ax.set_facecolor(facecolor_x) - histcolor_y = self.xy_args.pop("histcolor_y", "#6897bb") - self.y_ax.set_facecolor(facecolor_y) - self.x_ax.hist(X, bins=hist_bins, color=histcolor_x, **self.xy_args) - self.y_ax.hist(y, bins=hist_bins, color=histcolor_y, - orientation='horizontal', **self.xy_args) +########################################################################## +## Quick Method for JointPlot visualizations +########################################################################## - def finalize(self, **kwargs): - """ - Finalize executes any subclass-specific axes finalization steps. - The user calls poof and poof calls finalize. 
- Parameters - ---------- - kwargs: generic keyword arguments. - """ - self.joint_ax.set_xlabel(self.feature) - self.joint_ax.set_ylabel(self.target) - - plt.setp(self.x_ax.get_xticklabels(), visible=False) - plt.setp(self.y_ax.get_yticklabels(), visible=False) - - plt.setp(self.x_ax.yaxis.get_majorticklines(), visible=False) - plt.setp(self.x_ax.yaxis.get_minorticklines(), visible=False) - plt.setp(self.y_ax.xaxis.get_majorticklines(), visible=False) - plt.setp(self.y_ax.xaxis.get_minorticklines(), visible=False) - plt.setp(self.x_ax.get_yticklabels(), visible=False) - plt.setp(self.y_ax.get_xticklabels(), visible=False) - self.x_ax.yaxis.grid(False) - self.y_ax.xaxis.grid(False) - self.fig.suptitle("Joint Plot of {} vs {}" - .format(self.feature, self.target), y=1.05) +def joint_plot(): + raise NotImplementedError("quick method still needs to be implemented") diff --git a/yellowbrick/features/manifold.py b/yellowbrick/features/manifold.py index 8a6577040..80aadcab1 100644 --- a/yellowbrick/features/manifold.py +++ b/yellowbrick/features/manifold.py @@ -1,10 +1,13 @@ # yellowbrick.features.manifold # Use manifold algorithms for high dimensional visualization. # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Sat May 12 11:25:24 2018 -0400 # -# ID: manifold.py [] benjamin@bengfort.com $ +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: manifold.py [02f8c27] benjamin@bengfort.com $ """ Use manifold algorithms for high dimensional visualization. 
@@ -14,19 +17,16 @@ ## Imports ########################################################################## -import numpy as np -import matplotlib.pyplot as plt - -from six import string_types +import warnings from yellowbrick.utils.timer import Timer -from yellowbrick.draw import manual_legend from yellowbrick.utils.types import is_estimator -from yellowbrick.style import palettes, resolve_colors -from yellowbrick.features.base import FeatureVisualizer -from yellowbrick.exceptions import YellowbrickValueError, NotFitted +from yellowbrick.exceptions import ModelError, NotFitted +from yellowbrick.features.projection import ProjectionVisualizer +from yellowbrick.exceptions import YellowbrickValueError, YellowbrickWarning from sklearn.base import clone +from sklearn.exceptions import NotFittedError from sklearn.manifold import LocallyLinearEmbedding from sklearn.manifold import Isomap, MDS, TSNE, SpectralEmbedding @@ -36,14 +36,14 @@ ########################################################################## MANIFOLD_ALGORITHMS = { - "lle": LocallyLinearEmbedding(method="standard", eigen_solver='auto'), - "ltsa":LocallyLinearEmbedding(method="ltsa", eigen_solver='auto'), - "hessian": LocallyLinearEmbedding(method="hessian", eigen_solver='auto'), - "modified": LocallyLinearEmbedding(method="modified", eigen_solver='auto'), + "lle": LocallyLinearEmbedding(method="standard", eigen_solver="auto"), + "ltsa": LocallyLinearEmbedding(method="ltsa", eigen_solver="auto"), + "hessian": LocallyLinearEmbedding(method="hessian", eigen_solver="auto"), + "modified": LocallyLinearEmbedding(method="modified", eigen_solver="auto"), "isomap": Isomap(), "mds": MDS(), "spectral": SpectralEmbedding(), - "tsne": TSNE(init='pca'), + "tsne": TSNE(init="pca"), } MANIFOLD_NAMES = { @@ -57,18 +57,13 @@ "tsne": "t-SNE", } -# Target type constants -AUTO = "auto" -SINGLE = "single" -DISCRETE = "discrete" -CONTINUOUS = "continuous" - 
########################################################################## ## Manifold Embeddings ########################################################################## -class Manifold(FeatureVisualizer): + +class Manifold(ProjectionVisualizer): """ The Manifold visualizer provides high dimensional visualization for feature analysis by embedding data into 2 dimensions using the sklearn.manifold @@ -109,42 +104,72 @@ class Manifold(FeatureVisualizer): The axes to plot the figure on. If None, the current axes will be used or generated if required. - manifold : str or Transformer, default: "lle" + manifold : str or Transformer, default: "mds" Specify the manifold algorithm to perform the embedding. Either one of the strings listed in the table above, or an actual scikit-learn transformer. The constructed manifold is accessible with the manifold property, so as to modify hyperparameters before fit. - n_neighbors : int, default: 10 + n_neighbors : int, default: None Many manifold algorithms are nearest neighbors based, for those that are, this parameter specfies the number of neighbors to use in the - embedding. If the manifold algorithm doesn't use nearest neighbors, - then this parameter is ignored. - - colors : str or list of colors, default: None - Specify the colors used, though note that the specification depends - very much on whether the target is continuous or discrete. If - continuous, colors must be the name of a colormap. If discrete, then - colors can be the name of a palette or a list of colors to use for each - class in the target. - - target : str, default: "auto" + embedding. If n_neighbors is not specified for those embeddings, it is + set to 5 and a warning is issued. If the manifold algorithm doesn't use + nearest neighbors, then this parameter is ignored. + + features : list, default: None + The names of the features specified by the columns of the input dataset. 
+ This length of this list must match the number of columns in X, otherwise + an exception will be raised on ``fit()``. + + classes : list, default: None + The class labels for each class in y, ordered by sorted class index. These + names act as a label encoder for the legend, identifying integer classes + or renaming string labels. If omitted, the class labels will be taken from + the unique values in y. + + Note that the length of this list must match the number of unique values in + y, otherwise an exception is raised. This parameter is only used in the + discrete target type case and is ignored otherwise. + + colors : list or tuple, default: None + A single color to plot all instances as or a list of colors to color each + instance according to its class in the discrete case or as an ordered + colormap in the sequential case. If not enough colors per class are + specified then the colors are treated as a cycle. + + colormap : string or cmap, default: None + The colormap used to create the individual colors. In the discrete case + it is used to compute the number of colors needed for each class and + in the continuous case it is used to create a sequential color map based + on the range of the target. + + target_type : str, default: "auto" Specify the type of target as either "discrete" (classes) or "continuous" - (real numbers, usually for regression). If "auto", the Manifold will + (real numbers, usually for regression). If "auto", then it will attempt to determine the type by counting the number of unique values. - If the target is discrete, points will be colored by the target class - and a legend will be displayed. If continuous, points will be displayed - with a colormap and a color bar will be displayed. In either case, if - no target is specified, only a single color will be drawn. + If the target is discrete, the colors are returned as a dict with classes + being the keys. If continuous the colors will be list having value of + color for each point. 
In either case, if no target is specified, then + color will be specified as the first color in the color cycle. + + projection : int or string, default: 2 + The number of axes to project into, either 2d or 3d. To plot 3d plots + with matplotlib, please ensure a 3d axes is passed to the visualizer, + otherwise one will be created using the current figure. - alpha : float, default: 0.7 + alpha : float, default: 0.75 Specify a transparency where 1 is completely opaque and 0 is completely transparent. This property makes densely clustered points more visible. random_state : int or RandomState, default: None Fixes the random state for stochastic manifold algorithms. + colorbar : bool, default: True + If the target_type is "continous" draw a colorbar to the right of the + scatter plot. The colobar axes is accessible using the cax property. + kwargs : dict Keyword arguments passed to the base class and may influence the feature visualization properties. @@ -154,11 +179,20 @@ class in the target. fit_time_ : yellowbrick.utils.timer.Timer The amount of time in seconds it took to fit the Manifold. - classes_ : np.ndarray, optional - If discrete, the classes identified in the target y. + classes_ : ndarray, shape (n_classes,) + The class labels that define the discrete values in the target. Only + available if the target type is discrete. This is guaranteed to be + strings even if the classes are a different type. - range_ : tuple of floats, optional - If continuous, the maximum and minimum values in the target y. + features_ : ndarray, shape (n_features,) + The names of the features discovered or used in the visualizer that + can be used as an index to access or modify data in X. If a user passes + feature names in, those features are used. Otherwise the columns of a + DataFrame are used or just simply the indices of the data array. + + range_ : (min y, max y) + A tuple that describes the minimum and maximum values in the target. 
+ Only available if the target type is continuous. Examples -------- @@ -201,25 +235,38 @@ class in the target. def __init__( self, ax=None, - manifold="lle", - n_neighbors=10, + manifold="mds", + n_neighbors=None, + features=None, + classes=None, colors=None, - target=AUTO, - alpha=0.7, + colormap=None, + target_type="auto", + projection=2, + alpha=0.75, random_state=None, + colorbar=True, **kwargs ): - super(Manifold, self).__init__(ax, **kwargs) + + super(Manifold, self).__init__( + ax, + features, + classes, + colors, + colormap, + target_type, + projection, + alpha, + colorbar, + **kwargs + ) self._name = None self._manifold = None - self._target_color_type = None self.n_neighbors = n_neighbors - self.colors = colors - self.target = target - self.alpha = alpha self.random_state = random_state - self.manifold = manifold # must be set last + self.manifold = manifold # must be set last @property def manifold(self): @@ -239,14 +286,40 @@ def manifold(self, transformer): if not is_estimator(transformer): if transformer not in self.ALGORITHMS: raise YellowbrickValueError( - "could not create manifold for '%s'".format(str(transformer)) + "could not create manifold for '{}'".format(str(transformer)) ) + # 2 components is required for 2D plots + n_components = self.projection + requires_default_neighbors = { + "lle", + "ltsa", + "isomap", + "hessian", + "spectral", + "modified", + } + + # Check if the n_neighbors attribute needs to be set. + if self.n_neighbors is None and transformer in requires_default_neighbors: + if transformer == "hessian": + self.n_neighbors = int( + 1 + (n_components * (1 + (n_components + 1) / 2)) + ) + else: + self.n_neighbors = 5 + + # Issue a warning that the n_neighbors was set to a default. 
+ warnmsg = ( + "using n_neighbors={}; please explicitly specify for the '{}' manifold" + ).format(self.n_neighbors, str(transformer)) + warnings.warn(warnmsg, YellowbrickWarning) + # Create a new transformer with the specified params self._name = MANIFOLD_NAMES[transformer] transformer = clone(self.ALGORITHMS[transformer]) params = { - "n_components": 2, + "n_components": n_components, "n_neighbors": self.n_neighbors, "random_state": self.random_state, } @@ -261,15 +334,42 @@ def manifold(self, transformer): if self._name is None: self._name = self._manifold.__class__.__name__ - def fit(self, X, y=None): + def fit(self, X, y=None, **kwargs): """ Fits the manifold on X and transforms the data to plot it on the axes. See fit_transform() for more details. + + Parameters + ---------- + X : array-like of shape (n, m) + A matrix or data frame with n instances and m features + + y : array-like of shape (n,), optional + A vector or series with target values for each instance in X. This + vector is used to determine the color of the points in X. + + Returns + ------- + self : Manifold + Returns the visualizer object. + """ - self.fit_transform(X, y) + if not hasattr(self.manifold, "transform"): + name = self.manifold.__class__.__name__ + raise ModelError( + ( + "{} requires data to be simultaneously fit and transformed, " + "use fit_transform instead" + ).format(name) + ) + + # Call super to compute features, classes, colors, etc. + super(Manifold, self).fit(X, y) + with Timer() as self.fit_time_: + self.manifold.fit(X) return self - def fit_transform(self, X, y=None): + def fit_transform(self, X, y=None, **kwargs): """ Fits the manifold on X and transforms the data to plot it on the axes. The optional y specified can be used to declare discrete colors. If @@ -282,7 +382,7 @@ def fit_transform(self, X, y=None): Parameters ---------- X : array-like of shape (n, m) - A matrix or data frame with n instances and m features where m > 2. 
+ A matrix or data frame with n instances and m features y : array-like of shape (n,), optional A vector or series with target values for each instance in X. This @@ -290,37 +390,21 @@ def fit_transform(self, X, y=None): Returns ------- - self : Manifold - Returns the visualizer object. - """ - # Determine target type - self._determine_target_color_type(y) - - # Compute classes and colors if target type is discrete - if self._target_color_type == DISCRETE: - self.classes_ = np.unique(y) - - color_kwargs = {'n_colors': len(self.classes_)} - - if isinstance(self.colors, string_types): - color_kwargs['colormap'] = self.colors - else: - color_kwargs['colors'] = self.colors - - self._colors = resolve_colors(**color_kwargs) + Xprime : array-like of shape (n, 2) + Returns the 2-dimensional embedding of the instances. - # Compute target range if colors are continuous - elif self._target_color_type == CONTINUOUS: - y = np.asarray(y) - self.range_ = (y.min(), y.max()) + """ + # Because some manifolds do not have transform, we cannot call individual + # fit and transform methods, but must do it manually here. + # Call super fit to compute features, classes, colors, etc. + super(Manifold, self).fit(X, y) with Timer() as self.fit_time_: Xp = self.manifold.fit_transform(X) - self.draw(Xp, y) return Xp - def transform(self, X): + def transform(self, X, y=None, **kwargs): """ Returns the transformed data points from the manifold embedding. @@ -329,63 +413,40 @@ def transform(self, X): X : array-like of shape (n, m) A matrix or data frame with n instances and m features - Returns - ------- - Xprime : array-like of shape (n, 2) - Returns the 2-dimensional embedding of the instances. - """ - try: - return self.manifold.transform(X) - except AttributeError as e: - raise AttributeError(str(e) + " try using fit_transform instead.") - - def draw(self, X, y=None): - """ - Draws the points described by X and colored by the points in y. 
Can be - called multiple times before finalize to add more scatter plots to the - axes, however ``fit()`` must be called before use. - - Parameters - ---------- - X : array-like of shape (n, 2) - The matrix produced by the ``transform()`` method. - y : array-like of shape (n,), optional The target, used to specify the colors of the points. Returns ------- - self.ax : matplotlib Axes object - Returns the axes that the scatter plot was drawn on. - """ - scatter_kwargs = {"alpha": self.alpha} - - # Determine the colors - if self._target_color_type == SINGLE: - scatter_kwargs["c"] = "b" - - elif self._target_color_type == DISCRETE: - if y is None: - raise YellowbrickValueError("y is required for discrete target") - - scatter_kwargs["c"] = [ - self._colors[np.searchsorted(self.classes_, (yi))] for yi in y - ] - - elif self._target_color_type == CONTINUOUS: - if y is None: - raise YellowbrickValueError("y is required for continuous target") + Xprime : array-like of shape (n, 2) + Returns the 2-dimensional embedding of the instances. - # TODO manually make colorbar so we can draw it in finalize - scatter_kwargs["c"] = y - scatter_kwargs["cmap"] = self.colors or palettes.DEFAULT_SEQUENCE + Note + ---- + This method does not work with MDS, TSNE and SpectralEmbedding because + it is yet to be implemented in sklearn. 
+ """ + # Because some manifolds do not have transform we cannot call super + try: + Xp = self.manifold.transform(X) + self.draw(Xp, y) + return Xp + except NotFittedError: + raise NotFitted.from_estimator(self, "transform") + except AttributeError: + name = self.manifold.__class__.__name__ + raise ModelError( + ( + "{} requires data to be simultaneously fit and transformed, " + "use fit_transform instead" + ).format(name) + ) - else: - # Technically this should never be raised - raise NotFitted("could not determine target color type") + return Xp - # Draw the scatter plot with the associated colors and alpha - self._scatter = self.ax.scatter(X[:,0], X[:,1], **scatter_kwargs) + def draw(self, Xp, y=None): + # Calls draw method from super class which draws scatter plot. + super(Manifold, self).draw(Xp, y) return self.ax def finalize(self): @@ -393,64 +454,37 @@ def finalize(self): Add title and modify axes to make the image ready for display. """ self.set_title( - '{} Manifold (fit in {:0.2f} seconds)'.format( + "{} Manifold (fit in {:0.2f} seconds)".format( self._name, self.fit_time_.interval ) ) - self.ax.set_xticklabels([]) - self.ax.set_yticklabels([]) - - if self._target_color_type == DISCRETE: - # Add the legend - manual_legend(self, self.classes_, self._colors) - - elif self._target_color_type == CONTINUOUS: - # Add the color bar - plt.colorbar(self._scatter, ax=self.ax) - - def _determine_target_color_type(self, y): - """ - Determines the target color type from the vector y as follows: - - - if y is None: only a single color is used - - if target is auto: determine if y is continuous or discrete - - otherwise specify supplied target type - - This property will be used to compute the colors for each point. 
- """ - if y is None: - self._target_color_type = SINGLE - elif self.target == "auto": - # NOTE: See #73 for a generalization to use when implemented - if len(np.unique(y)) < 10: - self._target_color_type = DISCRETE - else: - self._target_color_type = CONTINUOUS - else: - self._target_color_type = self.target - - if self._target_color_type not in {SINGLE, DISCRETE, CONTINUOUS}: - raise YellowbrickValueError(( - "could not determine target color type " - "from target='{}' to '{}'" - ).format(self.target, self._target_color_type)) + self.ax.set_xlabel("Using {} features".format(len(self.features_))) + # Draws legend for discrete target and colorbar for continuous. + super(Manifold, self).finalize() ########################################################################## ## Quick Method ########################################################################## + def manifold_embedding( X, y=None, ax=None, - manifold="lle", - n_neighbors=10, + manifold="mds", + n_neighbors=None, + features=None, + classes=None, colors=None, - target=AUTO, - alpha=0.7, + colormap=None, + target_type="auto", + projection=2, + alpha=0.75, random_state=None, - **kwargs): + colorbar=True, + **kwargs +): """Quick method for Manifold visualizer. The Manifold visualizer provides high dimensional visualization for feature @@ -470,9 +504,9 @@ def manifold_embedding( A vector or series with target values for each instance in X. This vector is used to determine the color of the points in X. - ax : matplotlib Axes, default: None - The axes to plot the figure on. If None, the current axes will be used - or generated if required. + ax : matplotlib.Axes, default: None + The axis to plot the figure on. If None is passed in the current axes + will be used (or generated if required). manifold : str or Transformer, default: "lle" Specify the manifold algorithm to perform the embedding. 
Either one of @@ -493,54 +527,116 @@ def manifold_embedding( ``"tsne"`` `t-SNE `_ ============== ========================== - n_neighbors : int, default: 10 + n_neighbors : int, default: None Many manifold algorithms are nearest neighbors based, for those that are, this parameter specfies the number of neighbors to use in the - embedding. If the manifold algorithm doesn't use nearest neighbors, - then this parameter is ignored. - - colors : str or list of colors, default: None - Specify the colors used, though note that the specification depends - very much on whether the target is continuous or discrete. If - continuous, colors must be the name of a colormap. If discrete, then - colors can be the name of a palette or a list of colors to use for each - class in the target. - - target : str, default: "auto" + embedding. If n_neighbors is not specified for those embeddings, it is + set to 5 and a warning is issued. If the manifold algorithm doesn't use + nearest neighbors, then this parameter is ignored. + + features : list, default: None + The names of the features specified by the columns of the input dataset. + This length of this list must match the number of columns in X, otherwise + an exception will be raised on ``fit()``. + + classes : list, default: None + The class labels for each class in y, ordered by sorted class index. These + names act as a label encoder for the legend, identifying integer classes + or renaming string labels. If omitted, the class labels will be taken from + the unique values in y. + + Note that the length of this list must match the number of unique values in + y, otherwise an exception is raised. This parameter is only used in the + discrete target type case and is ignored otherwise. + + colors : list or tuple, default: None + A single color to plot all instances as or a list of colors to color each + instance according to its class in the discrete case or as an ordered + colormap in the sequential case. 
If not enough colors per class are + specified then the colors are treated as a cycle. + + colormap : string or cmap, default: None + The colormap used to create the individual colors. In the discrete case + it is used to compute the number of colors needed for each class and + in the continuous case it is used to create a sequential color map based + on the range of the target. + + target_type : str, default: "auto" Specify the type of target as either "discrete" (classes) or "continuous" - (real numbers, usually for regression). If "auto", the Manifold will + (real numbers, usually for regression). If "auto", then it will attempt to determine the type by counting the number of unique values. - If the target is discrete, points will be colored by the target class - and a legend will be displayed. If continuous, points will be displayed - with a colormap and a color bar will be displayed. In either case, if - no target is specified, only a single color will be drawn. + If the target is discrete, the colors are returned as a dict with classes + being the keys. If continuous the colors will be list having value of + color for each point. In either case, if no target is specified, then + color will be specified as the first color in the color cycle. + + projection : int or string, default: 2 + The number of axes to project into, either 2d or 3d. To plot 3d plots + with matplotlib, please ensure a 3d axes is passed to the visualizer, + otherwise one will be created using the current figure. - alpha : float, default: 0.7 + alpha : float, default: 0.75 Specify a transparency where 1 is completely opaque and 0 is completely transparent. This property makes densely clustered points more visible. random_state : int or RandomState, default: None Fixes the random state for stochastic manifold algorithms. + colorbar : bool, default: True + If the target_type is "continous" draw a colorbar to the right of the + scatter plot. 
The colobar axes is accessible using the cax property. + kwargs : dict Keyword arguments passed to the base class and may influence the feature visualization properties. + Attributes + ---------- + fit_time_ : yellowbrick.utils.timer.Timer + The amount of time in seconds it took to fit the Manifold. + + classes_ : ndarray, shape (n_classes,) + The class labels that define the discrete values in the target. Only + available if the target type is discrete. This is guaranteed to be + strings even if the classes are a different type. + + features_ : ndarray, shape (n_features,) + The names of the features discovered or used in the visualizer that + can be used as an index to access or modify data in X. If a user passes + feature names in, those features are used. Otherwise the columns of a + DataFrame are used or just simply the indices of the data array. + + range_ : (min y, max y) + A tuple that describes the minimum and maximum values in the target. + Only available if the target type is continuous. + + Returns ------- - ax : matplotlib axes - Returns the axes that the embedded scatter plot was drawn on. 
+ viz : Manifold + Returns the fitted, finalized visualizer """ # Instantiate the visualizer viz = Manifold( - ax=ax, manifold=manifold, n_neighbors=n_neighbors, colors=colors, - target=target, alpha = alpha, random_state=random_state, **kwargs + ax=ax, + manifold=manifold, + n_neighbors=n_neighbors, + features=features, + classes=classes, + colors=colors, + colormap=colormap, + target_type=target_type, + projection=projection, + alpha=alpha, + random_state=random_state, + colorbar=colorbar, + **kwargs ) - # Fit and poof (calls draw) - viz.fit(X, y) - viz.poof() + # Fit and finalize (calls draw) + viz.fit_transform(X, y) + viz.finalize() - # Return the axes object - return viz.ax + # Return the visualizer object + return viz diff --git a/yellowbrick/features/pca.py b/yellowbrick/features/pca.py index 981104039..59b6ee5b6 100644 --- a/yellowbrick/features/pca.py +++ b/yellowbrick/features/pca.py @@ -2,11 +2,14 @@ # yellowbrick.features.pca # Decomposition based feature visualization with PCA. 
# -# Author: Carlo Morales <@cjmorale> -# Author: Raúl Peralta Lozada <@RaulPL> -# Author: Benjamin Bengfort <@bbengfort> +# Author: Carlo Morales +# Author: Raúl Peralta Lozada +# Author: Benjamin Bengfort # Created: Tue May 23 18:34:27 2017 -0400 # +# Copyright (C) 2017 The scikit-yb developers +# For license information, see LICENSE.txt +# # ID: pca.py [] cmorales@pacificmetrics.com $ """ @@ -18,110 +21,245 @@ ########################################################################## # NOTE: must import mplot3d to load the 3D projection -import mpl_toolkits.mplot3d # noqa +import numpy as np import matplotlib.pyplot as plt +from mpl_toolkits.axes_grid1 import make_axes_locatable -from yellowbrick.features.base import MultiFeatureVisualizer from yellowbrick.style import palettes -from yellowbrick.exceptions import YellowbrickValueError +from yellowbrick.features.projection import ProjectionVisualizer +from yellowbrick.exceptions import YellowbrickValueError, NotFitted from sklearn.pipeline import Pipeline -from sklearn.decomposition import PCA +from sklearn.decomposition import PCA as PCATransformer from sklearn.preprocessing import StandardScaler +from sklearn.exceptions import NotFittedError ########################################################################## -##2D and #3D PCA Visualizer +# 2D and 3D PCA Visualizer ########################################################################## -class PCADecomposition(MultiFeatureVisualizer): + +class PCA(ProjectionVisualizer): """ Produce a two or three dimensional principal component plot of a data array - projected onto it's largest sequential principal components. It is common + projected onto its largest sequential principal components. It is common practice to scale the data array ``X`` before applying a PC decomposition. Variable scaling can be controlled using the ``scale`` argument. Parameters ---------- ax : matplotlib Axes, default: None - The axes to plot the figure on. 
If None is passed in the current axes. + The axes to plot the figure on. If None is passed in, the current axes will be used (or generated if required). - features: list, default: None - a list of feature names to use - If a DataFrame is passed to fit and features is None, feature - names are selected as the columns of the DataFrame. + features : list, default: None + The names of the features specified by the columns of the input dataset. + This length of this list must match the number of columns in X, otherwise + an exception will be raised on ``fit()``. + + classes : list, default: None + The class labels for each class in y, ordered by sorted class index. These + names act as a label encoder for the legend, identifying integer classes + or renaming string labels. If omitted, the class labels will be taken from + the unique values in y. + + Note that the length of this list must match the number of unique values in + y, otherwise an exception is raised. This parameter is only used in the + discrete target type case and is ignored otherwise. scale : bool, default: True Boolean that indicates if user wants to scale data. - proj_dim : int, default: 2 - Dimension of the PCA visualizer. + projection : int or string, default: 2 + The number of axes to project into, either 2d or 3d. To plot 3d plots + with matplotlib, please ensure a 3d axes is passed to the visualizer, + otherwise one will be created using the current figure. proj_features : bool, default: False Boolean that indicates if the user wants to project the features in the projected space. If True the plot will be similar to a biplot. - color : list or tuple of colors, default: None - Specify the colors for each individual class. + colors : list or tuple, default: None + A single color to plot all instances as or a list of colors to color each + instance according to its class in the discrete case or as an ordered + colormap in the sequential case. 
If not enough colors per class are + specified then the colors are treated as a cycle. colormap : string or cmap, default: None - Optional string or matplotlib cmap to colorize lines. - Use either color to colorize the lines on a per class basis or - colormap to color them on a continuous scale. + The colormap used to create the individual colors. In the discrete case + it is used to compute the number of colors needed for each class and + in the continuous case it is used to create a sequential color map based + on the range of the target. + + alpha : float, default: 0.75 + Specify a transparency where 1 is completely opaque and 0 is completely + transparent. This property makes densely clustered points more visible. random_state : int, RandomState instance or None, optional (default None) - If input data is larger than 500x500 and the number of components to - extract is lower than 80% of the smallest dimension of the data, then - the more efficient `randomized` solver is enabled, this parameter sets - the random state on this solver. + This parameter sets the random state on this solver. If the input X is + larger than 500x500 and the number of components to extract is lower + than 80% of the smallest dimension of the data, then the more efficient + `randomized` solver is enabled. + + colorbar : bool, default: True + If the target_type is "continous" draw a colorbar to the right of the + scatter plot. The colobar axes is accessible using the cax property. + + heatmap : bool, default: False + Add a heatmap showing contribution of each feature in the principal components. + Also draws a colorbar for readability purpose. The heatmap is accessible + using lax property and colorbar using uax property. kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. 
+ Attributes + ---------- + pca_components_ : ndarray, shape (n_features, n_components) + This tells about the magnitude of each feature in the pricipal components. + This is primarily used to draw the biplots. + + classes_ : ndarray, shape (n_classes,) + The class labels that define the discrete values in the target. Only + available if the target type is discrete. This is guaranteed to be + strings even if the classes are a different type. + + features_ : ndarray, shape (n_features,) + The names of the features discovered or used in the visualizer that + can be used as an index to access or modify data in X. If a user passes + feature names in, those features are used. Otherwise the columns of a + DataFrame are used or just simply the indices of the data array. + + range_ : (min y, max y) + A tuple that describes the minimum and maximum values in the target. + Only available if the target type is continuous. + Examples -------- >>> from sklearn import datasets >>> iris = datasets.load_iris() >>> X = iris.data >>> y = iris.target - >>> visualizer = PCADecomposition() - >>> visualizer.fit_transform(X) + >>> visualizer = PCA() + >>> visualizer.fit_transform(X, y) >>> visualizer.poof() """ - def __init__(self, - ax=None, - features=None, - scale=True, - proj_dim=2, - proj_features=False, - color=None, - colormap=palettes.DEFAULT_SEQUENCE, - random_state=None, - **kwargs): - super(PCADecomposition, self).__init__(ax=ax, - features=features, - **kwargs) - - if proj_dim not in (2, 3): - raise YellowbrickValueError("proj_dim object is not 2 or 3.") + + def __init__( + self, + ax=None, + features=None, + classes=None, + scale=True, + projection=2, + proj_features=False, + colors=None, + colormap=None, + alpha=0.75, + random_state=None, + colorbar=True, + heatmap=False, + **kwargs + ): + super(PCA, self).__init__( + ax=ax, + features=features, + classes=classes, + colors=colors, + colormap=colormap, + projection=projection, + alpha=alpha, + colorbar=colorbar, + **kwargs + 
) # Data Parameters self.scale = scale - self.proj_dim = proj_dim self.proj_features = proj_features # Create the PCA transformer self.pca_transformer = Pipeline( - [('scale', StandardScaler(with_std=self.scale)), - ('pca', PCA(self.proj_dim, random_state=random_state))] + [ + ("scale", StandardScaler(with_std=self.scale)), + ("pca", PCATransformer(self.projection, random_state=random_state)), + ] ) + self.alpha = alpha # Visual Parameters - self.color = color - self.colormap = colormap + self.heatmap = heatmap + + self._uax, self._lax = None, None + + # No heatmap can be drawn with 3d plots as they do not have permit axes + # division. + if self.projection == 3 and self.heatmap: + raise YellowbrickValueError( + "heatmap and colorbar are not compatible with 3d projections" + ) + + @property + def uax(self): + """ + The axes of the colorbar, bottom of scatter plot. This is the colorbar + for heatmap and not for the scatter plot. + """ + if self._uax is None: + raise AttributeError("This visualizer does not have an axes for colorbar") + + return self._uax + + @property + def lax(self): + """ + The axes of the heatmap below scatter plot. + """ + if self._lax is None: + raise AttributeError("This visualizer does not have an axes for heatmap") + + return self._lax + + def layout(self, divider=None): + """ + Creates the layout for colorbar and heatmap, adding new axes for the heatmap + if necessary and modifying the aspect ratio. Does not modify the axes or the + layout if ``self.heatmap`` is ``False`` or ``None``. + + Parameters + ---------- + divider: AxesDivider + An AxesDivider to be passed among all layout calls. 
+ """ + + # Ensure matplotlib version compatibility + if make_axes_locatable is None: + raise YellowbrickValueError( + ( + "heatmap requires matplotlib 2.0.2 or greater " + "please upgrade matplotlib or set heatmap=False on the visualizer" + ) + ) + + # Create the new axes for the colorbar and heatmap + if divider is None: + divider = make_axes_locatable(self.ax) + + # Call to super class ensures that a colorbar is drawn when target is + # continuous. + super(PCA, self).layout(divider) + + if self.heatmap: + + # Axes for colorbar(for heatmap). + if self._uax is None: + self._uax = divider.append_axes("bottom", size="10%", pad=0.7) + + # Axes for heatmap + if self._lax is None: + self._lax = divider.append_axes("bottom", size="15%", pad=0.5) def fit(self, X, y=None, **kwargs): """ @@ -139,88 +277,205 @@ def fit(self, X, y=None, **kwargs): Returns ------- self : visualizer - Returns self for use in Pipelines + Returns self for use in Pipelines. + + """ - super(PCADecomposition, self).fit(X=X, y=y, **kwargs) + # Call super fit to compute features, classes, colors, etc. 
+ super(PCA, self).fit(X=X, y=y, **kwargs) self.pca_transformer.fit(X) - self.pca_components_ = self.pca_transformer.named_steps['pca'].components_ + self.pca_components_ = self.pca_transformer.named_steps["pca"].components_ return self def transform(self, X, y=None, **kwargs): - self.pca_features_ = self.pca_transformer.transform(X) - self.draw() - return self.pca_features_ - - def draw(self, **kwargs): - X = self.pca_features_ - if self.proj_dim == 2: - self.ax.scatter(X[:, 0], X[:, 1], c=self.color, cmap=self.colormap) - if self.proj_features: - x_vector = self.pca_components_[0] - y_vector = self.pca_components_[1] - max_x = max(X[:, 0]) - max_y = max(X[:, 1]) - for i in range(self.pca_components_.shape[1]): - self.ax.arrow( - x=0, y=0, - dx=x_vector[i] * max_x, - dy=y_vector[i] * max_y, - color='r', head_width=0.05, - width=0.005, - ) - self.ax.text( - x_vector[i] * max_x * 1.05, - y_vector[i] * max_y * 1.05, - self.features_[i], color='r' - ) - if self.proj_dim == 3: - self.fig = plt.figure() - self.ax = self.fig.add_subplot(111, projection='3d') - self.ax.scatter(X[:, 0], X[:, 1], X[:, 2], - c=self.color, cmap=self.colormap) - if self.proj_features: - x_vector = self.pca_components_[0] - y_vector = self.pca_components_[1] - z_vector = self.pca_components_[2] - max_x = max(X[:, 0]) - max_y = max(X[:, 1]) - max_z = max(X[:, 1]) - for i in range(self.pca_components_.shape[1]): - self.ax.plot( - [0, x_vector[i] * max_x], - [0, y_vector[i] * max_y], - [0, z_vector[i] * max_z], - color='r' - ) - self.ax.text( - x_vector[i] * max_x * 1.05, - y_vector[i] * max_y * 1.05, - z_vector[i] * max_z * 1.05, - self.features_[i], color='r' - ) + """ + Calls the internal `transform` method of the scikit-learn PCA transformer, which + performs a dimensionality reduction on the input features ``X``. Next calls the + ``draw`` method of the Yellowbrick visualizer, finally returning a new array of + transformed features of shape ``(len(X), proj_dim)``. 
+ + Parameters + ---------- + X : ndarray or DataFrame of shape n x m + A matrix of n instances with m features. + + y : ndarray or Series of length n + An array or series of target or class values. + + Returns + ------- + Xp : ndarray or DataFrame of shape n x m + Returns a new array-like object of transformed features of shape + ``(len(X), proj_dim)``. + """ + try: + Xp = self.pca_transformer.transform(X) + self.draw(Xp, y) + return Xp + except NotFittedError: + raise NotFitted.from_estimator(self, "transform") + + def draw(self, Xp, y): + """ + Plots a scatterplot of points that represented the decomposition, + `pca_features_`, of the original features, `X`, projected into either 2 or + 3 dimensions. + + If 2 dimensions are selected, a colorbar and heatmap can also be optionally + included to show the magnitude of each feature value to the component. + + Parameters + ---------- + Xp : array-like of shape (n, 2) or (n, 3) + The matrix produced by the ``transform()`` method. + + y : array-like of shape (n,), optional + The target, used to specify the colors of the points. + + + Returns + ------- + self.ax : matplotlib Axes object + Returns the axes that the scatter plot was drawn on. + """ + # Call to super draw which draws the scatter plot. + super(PCA, self).draw(Xp, y) + if self.proj_features: + # Draws projection features in transformed space. + self._draw_projection_features(Xp, y) + if self.projection == 2: + if self.heatmap: + if not self.colormap: + self.colormap = palettes.DEFAULT_SEQUENCE + # TODO: change to pcolormesh instead of imshow per #615 spec + im = self.lax.imshow( + self.pca_components_, + interpolation="none", + cmap=self.colormap, + aspect="auto", + ) + plt.colorbar( + im, + cax=self.uax, + orientation="horizontal", + ticks=[self.pca_components_.min(), 0, self.pca_components_.max()], + ) + return self.ax + + def _draw_projection_features(self, Xp, y): + """ + Draw the projection of features in the transformed space. 
+ Parameters + ---------- + Xp : array-like of shape (n, 2) or (n, 3) + The matrix produced by the ``transform()`` method. + + y : array-like of shape (n,), optional + The target, used to specify the colors of the points. + + Returns + ------- + self.ax : matplotlib Axes object + Returns the axes that the scatter plot was drawn on. + + """ + + x_vector = self.pca_components_[0] + y_vector = self.pca_components_[1] + max_x = max(Xp[:, 0]) + max_y = max(Xp[:, 1]) + if self.projection == 2: + for i in range(self.pca_components_.shape[1]): + self.ax.arrow( + x=0, + y=0, + dx=x_vector[i] * max_x, + dy=y_vector[i] * max_y, + color="r", + head_width=0.05, + width=0.005, + ) + self.ax.text( + x_vector[i] * max_x * 1.05, + y_vector[i] * max_y * 1.05, + self.features_[i], + color="r", + ) + elif self.projection == 3: + z_vector = self.pca_components_[2] + max_z = max(Xp[:, 1]) + for i in range(self.pca_components_.shape[1]): + self.ax.plot( + [0, x_vector[i] * max_x], + [0, y_vector[i] * max_y], + [0, z_vector[i] * max_z], + color="r", + ) + self.ax.text( + x_vector[i] * max_x * 1.05, + y_vector[i] * max_y * 1.05, + z_vector[i] * max_z * 1.05, + self.features_[i], + color="r", + ) + else: + raise YellowbrickValueError("Projection dimensions must be either 2 or 3") + return self.ax def finalize(self, **kwargs): - # Set the title - self.ax.set_title('Principal Component Plot') - self.ax.set_xlabel('Principal Component 1') - self.ax.set_ylabel('Principal Component 2') - if self.proj_dim == 3: - self.ax.set_zlabel('Principal Component 3') + """ + Draws the title, labels, legends, heatmap, and colorbar as specified by the + keyword arguments. 
+ """ + super(PCA, self).finalize() + + self.ax.set_title("Principal Component Plot") + self.ax.set_xlabel("$PC_1$") + self.ax.set_ylabel("$PC_2$") + if self.projection == 3: + self.ax.set_zlabel("$PC_3$") + if self.heatmap == True: + self.lax.set_xticks(np.arange(-0.5, len(self.features_))) + self.lax.set_xticklabels([]) + # Makes the labels centered. + self.lax.set_xticks(np.arange(0, len(self.features_)), minor=True) + self.lax.set_xticklabels( + self.features_, rotation=90, fontsize=12, minor=True + ) + self.lax.set_yticks(np.arange(0.5, 2)) + self.lax.set_yticklabels(["$PC_1$", "$PC_2$"], va="bottom", fontsize=10) + self.fig.tight_layout() ########################################################################## ## Quick Method ########################################################################## -def pca_decomposition(X, y=None, ax=None, features=None, scale=True, - proj_dim=2, proj_features=False, color=None, - colormap=palettes.DEFAULT_SEQUENCE, - random_state=None, **kwargs): - """Produce a two or three dimensional principal component plot of the data array ``X`` - projected onto it's largest sequential principal components. It is common practice to scale the - data array ``X`` before applying a PC decomposition. Variable scaling can be controlled using - the ``scale`` argument. + +def pca_decomposition( + X, + y=None, + ax=None, + features=None, + classes=None, + scale=True, + projection=2, + proj_features=False, + colors=None, + colormap=None, + alpha=0.75, + random_state=None, + colorbar=True, + heatmap=False, + **kwargs +): + + """ + Produce a two or three dimensional principal component plot of the data array ``X`` + projected onto its largest sequential principal components. It is common practice + to scale the data array ``X`` before applying a PC decomposition. Variable scaling + can be controlled using the ``scale`` argument. 
Parameters ---------- @@ -231,62 +486,125 @@ def pca_decomposition(X, y=None, ax=None, features=None, scale=True, An array or series of target or class values. ax : matplotlib Axes, default: None - The axes to plot the figure on. If None is passed in the current axes. + The axes to plot the figure on. If None is passed in, the current axes will be used (or generated if required). - features: list, default: None - a list of feature names to use - If a DataFrame is passed to fit and features is None, feature - names are selected as the columns of the DataFrame. + features : list, default: None + The names of the features specified by the columns of the input dataset. + This length of this list must match the number of columns in X, otherwise + an exception will be raised on ``fit()``. + + classes : list, default: None + The class labels for each class in y, ordered by sorted class index. These + names act as a label encoder for the legend, identifying integer classes + or renaming string labels. If omitted, the class labels will be taken from + the unique values in y. + + Note that the length of this list must match the number of unique values in + y, otherwise an exception is raised. This parameter is only used in the + discrete target type case and is ignored otherwise. scale : bool, default: True Boolean that indicates if user wants to scale data. - proj_dim : int, default: 2 - Dimension of the PCA visualizer. + projection : int or string, default: 2 + The number of axes to project into, either 2d or 3d. To plot 3d plots + with matplotlib, please ensure a 3d axes is passed to the visualizer, + otherwise one will be created using the current figure. proj_features : bool, default: False Boolean that indicates if the user wants to project the features in the projected space. If True the plot will be similar to a biplot. - color : list or tuple of colors, default: None - Specify the colors for each individual class. 
+ colors : list or tuple, default: None + A single color to plot all instances as or a list of colors to color each + instance according to its class in the discrete case or as an ordered + colormap in the sequential case. If not enough colors per class are + specified then the colors are treated as a cycle. colormap : string or cmap, default: None - Optional string or matplotlib cmap to colorize lines. - Use either color to colorize the lines on a per class basis or - colormap to color them on a continuous scale. + The colormap used to create the individual colors. In the discrete case + it is used to compute the number of colors needed for each class and + in the continuous case it is used to create a sequential color map based + on the range of the target. + + alpha : float, default: 0.75 + Specify a transparency where 1 is completely opaque and 0 is completely + transparent. This property makes densely clustered points more visible. random_state : int, RandomState instance or None, optional (default None) - If input data is larger than 500x500 and the number of components to - extract is lower than 80% of the smallest dimension of the data, then - the more efficient `randomized` solver is enabled, this parameter sets - the random state on this solver. + This parameter sets the random state on this solver. If the input X is + larger than 500x500 and the number of components to extract is lower + than 80% of the smallest dimension of the data, then the more efficient + `randomized` solver is enabled. + + colorbar : bool, default: True + If the target_type is "continous" draw a colorbar to the right of the + scatter plot. The colobar axes is accessible using the cax property. + + heatmap : bool, default: False + Add a heatmap showing contribution of each feature in the principal components. + Also draws a colorbar for readability purpose. The heatmap is accessible + using lax property and colorbar using uax property. 
kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. + Attributes + ---------- + pca_components_ : ndarray, shape (n_features, n_components) + This tells about the magnitude of each feature in the pricipal components. + This is primarily used to draw the biplots. + + classes_ : ndarray, shape (n_classes,) + The class labels that define the discrete values in the target. Only + available if the target type is discrete. This is guaranteed to be + strings even if the classes are a different type. + + features_ : ndarray, shape (n_features,) + The names of the features discovered or used in the visualizer that + can be used as an index to access or modify data in X. If a user passes + feature names in, those features are used. Otherwise the columns of a + DataFrame are used or just simply the indices of the data array. + + range_ : (min y, max y) + A tuple that describes the minimum and maximum values in the target. + Only available if the target type is continuous. 
+ Examples -------- >>> from sklearn import datasets >>> iris = datasets.load_iris() >>> X = iris.data >>> y = iris.target - >>> pca_decomposition(X, color=y, proj_dim=3, colormap='RdBu_r') + >>> pca_decomposition(X, y, colors=['r', 'g', 'b'], projection=3) """ # Instantiate the visualizer - visualizer = PCADecomposition( - ax=ax, features=features, scale=scale, proj_dim=proj_dim, - proj_features=proj_features, color=color, colormap=colormap, - random_state=random_state, **kwargs + visualizer = PCA( + ax=ax, + features=features, + scale=scale, + projection=projection, + proj_features=proj_features, + colors=colors, + colormap=colormap, + alpha=alpha, + random_state=random_state, + colorbar=colorbar, + heatmap=heatmap, + **kwargs ) # Fit and transform the visualizer (calls draw) visualizer.fit(X, y) - visualizer.transform(X) - visualizer.poof() + visualizer.transform(X, y) + visualizer.finalize() + + # Returns the visualizer object. + return visualizer + - # Return the axes object on the visualizer - return visualizer.ax +# Alias for PCA +PCADecomposition = PCA diff --git a/yellowbrick/features/pcoords.py b/yellowbrick/features/pcoords.py index f5c742ed4..8876e00b3 100644 --- a/yellowbrick/features/pcoords.py +++ b/yellowbrick/features/pcoords.py @@ -1,11 +1,11 @@ # yellowbrick.features.pcoords # Implementations of parallel coordinates for feature analysis. 
# -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Author: @thekylesaurus # Created: Mon Oct 03 21:46:06 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: pcoords.py [0f4b236] benjamin@bengfort.com $ @@ -20,26 +20,37 @@ import numpy as np -from six import string_types from numpy.random import RandomState from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler from sklearn.preprocessing import Normalizer, StandardScaler from yellowbrick.draw import manual_legend -from yellowbrick.utils import is_dataframe, is_series from yellowbrick.features.base import DataVisualizer +from yellowbrick.utils import is_dataframe, is_series from yellowbrick.exceptions import YellowbrickTypeError, YellowbrickValueError -from yellowbrick.style.colors import resolve_colors ########################################################################## ## Quick Methods ########################################################################## -def parallel_coordinates(X, y, ax=None, features=None, classes=None, - normalize=None, sample=1.0, color=None, colormap=None, - alpha=None, fast=False, vlines=True, vlines_kwds=None, - **kwargs): + +def parallel_coordinates( + X, + y, + ax=None, + features=None, + classes=None, + normalize=None, + sample=1.0, + colors=None, + colormap=None, + alpha=None, + fast=False, + vlines=True, + vlines_kwds=None, + **kwargs +): """Displays each feature as a vertical axis and each instance as a line. This helper function is a quick wrapper to utilize the ParallelCoordinates @@ -78,7 +89,7 @@ def parallel_coordinates(X, y, ax=None, features=None, classes=None, If int, specifies the maximum number of samples to display. If float, specifies a fraction between 0 and 1 to display. 
- color : list or tuple, default: None + colors : list or tuple, default: None optional list or tuple of colors to colorize lines Use either color to colorize the lines on a per class basis or colormap to color them on a continuous scale. @@ -111,27 +122,38 @@ def parallel_coordinates(X, y, ax=None, features=None, classes=None, Returns ------- - ax : matplotlib axes - Returns the axes that the parallel coordinates were drawn on. + viz : ParallelCoordinates + Returns the fitted, finalized visualizer """ # Instantiate the visualizer visualizer = ParallelCoordinates( - ax, features, classes, normalize, sample, color, colormap, alpha, - fast, vlines, vlines_kwds, **kwargs + ax=ax, + features=features, + classes=classes, + normalize=normalize, + sample=sample, + colors=colors, + colormap=colormap, + alpha=alpha, + fast=fast, + vlines=vlines, + vlines_kwds=vlines_kwds, + **kwargs ) # Fit and transform the visualizer (calls draw) visualizer.fit(X, y, **kwargs) visualizer.transform(X) - # Return the axes object on the visualizer - return visualizer.ax + # Return the visualizer object + return visualizer ########################################################################## ## Static Parallel Coordinates Visualizer ########################################################################## + class ParallelCoordinates(DataVisualizer): """ Parallel coordinates displays each feature as a vertical axis spaced @@ -141,7 +163,6 @@ class ParallelCoordinates(DataVisualizer): Parameters ---------- - ax : matplotlib Axes, default: None The axis to plot the figure on. If None is passed in the current axes will be used (or generated if required). @@ -153,8 +174,13 @@ class ParallelCoordinates(DataVisualizer): classes : list, default: None a list of class names for the legend - If classes is None and a y value is passed to fit then the classes - are selected from the target vector. + The class labels for each class in y, ordered by sorted class index. 
These + names act as a label encoder for the legend, identifying integer classes + or renaming string labels. If omitted, the class labels will be taken from + the unique values in y. + + Note that the length of this list must match the number of unique values in + y, otherwise an exception is raised. normalize : string or None, default: None specifies which normalization method to use, if any @@ -175,15 +201,14 @@ class ParallelCoordinates(DataVisualizer): shuffle : boolean, default: True specifies whether sample is drawn randomly - color : list or tuple, default: None - optional list or tuple of colors to colorize lines - Use either color to colorize the lines on a per class basis or - colormap to color them on a continuous scale. + colors : list or tuple, default: None + A single color to plot all instances as or a list of colors to color each + instance according to its class. If not enough colors per class are + specified then the colors are treated as a cycle. colormap : string or cmap, default: None - optional string or matplotlib cmap to colorize lines - Use either color to colorize the lines on a per class basis or - colormap to color them on a continuous scale. + The colormap used to create the individual colors. If classes are + specified the colormap is used to evenly space colors across each class. alpha : float, default: None Specify a transparency where 1 is completely opaque and 0 is completely @@ -211,6 +236,17 @@ class ParallelCoordinates(DataVisualizer): n_samples_ : int number of samples included in the visualization object + features_ : ndarray, shape (n_features,) + The names of the features discovered or used in the visualizer that + can be used as an index to access or modify data in X. If a user passes + feature names in, those features are used. Otherwise the columns of a + DataFrame are used or just simply the indices of the data array. 
+ + classes_ : ndarray, shape (n_classes,) + The class labels that define the discrete values in the target. Only + available if the target type is discrete. This is guaranteed to be + strings even if the classes are a different type. + Examples -------- @@ -227,31 +263,39 @@ class ParallelCoordinates(DataVisualizer): """ NORMALIZERS = { - 'minmax': MinMaxScaler(), - 'maxabs': MaxAbsScaler(), - 'standard': StandardScaler(), - 'l1': Normalizer('l1'), - 'l2': Normalizer('l2'), + "minmax": MinMaxScaler(), + "maxabs": MaxAbsScaler(), + "standard": StandardScaler(), + "l1": Normalizer("l1"), + "l2": Normalizer("l2"), } - def __init__(self, - ax=None, - features=None, - classes=None, - normalize=None, - sample=1.0, - random_state=None, - shuffle=False, - color=None, - colormap=None, - alpha=None, - fast=False, - vlines=True, - vlines_kwds=None, - **kwargs): - + def __init__( + self, + ax=None, + features=None, + classes=None, + normalize=None, + sample=1.0, + random_state=None, + shuffle=False, + colors=None, + colormap=None, + alpha=None, + fast=False, + vlines=True, + vlines_kwds=None, + **kwargs + ): + if "target_type" not in kwargs: + kwargs["target_type"] = "discrete" super(ParallelCoordinates, self).__init__( - ax, features, classes, color, colormap, **kwargs + ax=ax, + features=features, + classes=classes, + colors=colors, + colormap=colormap, + **kwargs ) # Validate 'normalize' argument @@ -259,8 +303,7 @@ def __init__(self, self.normalize = normalize else: raise YellowbrickValueError( - "'{}' is an unrecognized normalization method" - .format(normalize) + "'{}' is an unrecognized normalization method".format(normalize) ) # Validate 'sample' argument @@ -275,18 +318,14 @@ def __init__(self, "`sample` parameter of type `float` must be between 0 and 1" ) else: - raise YellowbrickTypeError( - "`sample` parameter must be int or float" - ) + raise YellowbrickTypeError("`sample` parameter must be int or float") self.sample = sample # Set sample parameters if 
isinstance(shuffle, bool): self.shuffle = shuffle else: - raise YellowbrickTypeError( - "`shuffle` parameter must be boolean" - ) + raise YellowbrickTypeError("`shuffle` parameter must be boolean") if self.shuffle: if (random_state is None) or isinstance(random_state, int): self._rng = RandomState(random_state) @@ -294,7 +333,7 @@ def __init__(self, self._rng = random_state else: raise YellowbrickTypeError( - "`random_state` parameter must be None, int, or np.random.RandomState" + "`random_state` must be None, int, or np.random.RandomState" ) else: self._rng = None @@ -303,9 +342,7 @@ def __init__(self, self.fast = fast self.alpha = alpha self.show_vlines = vlines - self.vlines_kwds = vlines_kwds or { - 'linewidth': 1, 'color': 'black' - } + self.vlines_kwds = vlines_kwds or {"linewidth": 1, "color": "black"} # Internal properties self._increments = None @@ -333,34 +370,15 @@ def fit(self, X, y=None, **kwargs): self : instance Returns the instance of the transformer/visualizer """ + # Determine the features, classes, and colors + super(ParallelCoordinates, self).fit(X, y) # Convert from pandas data types if is_dataframe(X): - # Get column names before reverting to an np.ndarray - if self.features_ is None: - self.features_ = np.array(X.columns) - X = X.values if is_series(y): y = y.values - # Assign integer labels to the feature columns from the input - if self.features_ is None: - self.features_ = np.arange(0, X.shape[1]) - - # Ensure that all classes are represented in the color mapping (before sample) - # NOTE: np.unique also specifies the ordering of the classes - if self.classes_ is None: - self.classes_ = [str(label) for label in np.unique(y)] - - # Create the color mapping for each class - # TODO: Allow both colormap, listed colors, and palette definition - # TODO: Make this an independent function or property for override! 
- color_values = resolve_colors( - n_colors=len(self.classes_), colormap=self.colormap, colors=self.color - ) - self._colors = dict(zip(self.classes_, color_values)) - # Ticks for each feature specified self._increments = np.arange(len(self.features_)) @@ -371,8 +389,8 @@ def fit(self, X, y=None, **kwargs): if self.normalize is not None: X = self.NORMALIZERS[self.normalize].fit_transform(X) - # the super method calls draw and returns self - return super(ParallelCoordinates, self).fit(X, y, **kwargs) + self.draw(X, y, **kwargs) + return self def draw(self, X, y, **kwargs): """ @@ -424,18 +442,9 @@ def draw_instances(self, X, y, **kwargs): for idx in range(len(X)): Xi = X[idx] yi = y[idx] + color = self.get_colors([yi])[0] - # TODO: generalize this duplicated code into a single function - if isinstance(yi, string_types): - label = yi - else: - # TODO: what happens if yi is not in classes?! - label = self.classes_[yi] - - self.ax.plot( - self._increments, Xi, - color=self._colors[label], alpha=alpha, **kwargs - ) + self.ax.plot(self._increments, Xi, color=color, alpha=alpha, **kwargs) return self.ax @@ -470,18 +479,18 @@ def draw_classes(self, X, y, **kwargs): # Plot each class as a single line plot for yi in y_values: - if isinstance(yi, string_types): - label = yi - else: - # TODO: what happens if yi is not in classes?! 
- label = self.classes_[yi] + color = self.get_colors([yi])[0] X_in_class = X_separated[y == yi, :] increments_in_class = increments_separated * len(X_in_class) if len(X_in_class) > 0: self.ax.plot( - increments_in_class, X_in_class.flatten(), linewidth=1, - color=self._colors[label], alpha=alpha, **kwargs + increments_in_class, + X_in_class.flatten(), + linewidth=1, + color=color, + alpha=alpha, + **kwargs ) return self.ax @@ -498,7 +507,7 @@ def finalize(self, **kwargs): """ # Set the title self.set_title( - 'Parallel Coordinates for {} Features'.format(len(self.features_)) + "Parallel Coordinates for {} Features".format(len(self.features_)) ) # Add the vertical lines @@ -515,7 +524,7 @@ def finalize(self, **kwargs): # Add the legend sorting classes by name labels = sorted(list(self._colors.keys())) colors = [self._colors[lbl] for lbl in labels] - manual_legend(self, labels, colors, loc='best', frameon=True) + manual_legend(self, labels, colors, loc="best", frameon=True) # Add the grid view self.ax.grid() diff --git a/yellowbrick/features/projection.py b/yellowbrick/features/projection.py new file mode 100644 index 000000000..6f3120679 --- /dev/null +++ b/yellowbrick/features/projection.py @@ -0,0 +1,335 @@ +# yellowbrick.features.projection +# Base class for all projection (decomposition) high dimensional data visualizers. +# +# Author: Naresh Bachwani +# Created: Wed Jul 17 08:59:33 2019 -0400 +# +# Copyright (C) 2019, the scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: projection.py [21eb9d2] 43993586+naresh-bachwani@users.noreply.github.com $ + +""" +Base class for all projection (decomposition) high dimensional data visualizers. 
+""" + +########################################################################## +## Imports +########################################################################## + +import warnings +import matplotlib as mpl +import matplotlib.pyplot as plt +from mpl_toolkits.axes_grid1 import make_axes_locatable +import mpl_toolkits.mplot3d # noqa + +from yellowbrick.draw import manual_legend +from yellowbrick.features.base import DataVisualizer, TargetType +from yellowbrick.exceptions import YellowbrickValueError, YellowbrickWarning, NotFitted + + +########################################################################## +## Projection Visualizers +########################################################################## + + +class ProjectionVisualizer(DataVisualizer): + """ + The ProjectionVisualizer provides functionality for projecting a multi-dimensional + dataset into either 2 or 3 components so they can be plotted as a scatter plot on + 2d or 3d axes. The visualizer acts as a transformer, and draws the transformed data + on behalf of the user. Because it is a DataVisualizer, the ProjectionVisualizer + can plot continuous scatter plots with a colormap or discrete scatter plots with + a legend. + + This visualizer is a base class and is not intended to be uses directly. + Subclasses should implement a ``transform()`` method that calls ``draw()`` using + the transformed data and the optional target as input. + + Parameters + ---------- + ax : matplotlib Axes, default: None + The axes to plot the figure on. If None is passed in the current axes. + will be used (or generated if required). + + features : list, default: None + The names of the features specified by the columns of the input dataset. + This length of this list must match the number of columns in X, otherwise + an exception will be raised on ``fit()``. + + classes : list, default: None + The class labels for each class in y, ordered by sorted class index. 
These + names act as a label encoder for the legend, identifying integer classes + or renaming string labels. If omitted, the class labels will be taken from + the unique values in y. + + Note that the length of this list must match the number of unique values in + y, otherwise an exception is raised. This parameter is only used in the + discrete target type case and is ignored otherwise. + + colors : list or tuple, default: None + A single color to plot all instances as or a list of colors to color each + instance according to its class in the discrete case or as an ordered + colormap in the sequential case. If not enough colors per class are + specified then the colors are treated as a cycle. + + colormap : string or cmap, default: None + The colormap used to create the individual colors. In the discrete case + it is used to compute the number of colors needed for each class and + in the continuous case it is used to create a sequential color map based + on the range of the target. + + target_type : str, default: "auto" + Specify the type of target as either "discrete" (classes) or "continuous" + (real numbers, usually for regression). If "auto", then it will + attempt to determine the type by counting the number of unique values. + + If the target is discrete, the colors are returned as a dict with classes + being the keys. If continuous the colors will be list having value of + color for each point. In either case, if no target is specified, then + color will be specified as the first color in the color cycle. + + projection : int or string, default: 2 + The number of axes to project into, either 2d or 3d. To plot 3d plots + with matplotlib, please ensure a 3d axes is passed to the visualizer, + otherwise one will be created using the current figure. + + alpha : float, default: 0.75 + Specify a transparency where 1 is completely opaque and 0 is completely + transparent. This property makes densely clustered points more visible. 
+ + colorbar : bool, default: True + If the target_type is "continuous" draw a colorbar to the right of the + scatter plot. The colorbar axes is accessible using the cax property. + + kwargs : dict + Keyword arguments that are passed to the base class and may influence + the visualization as defined in other Visualizers. + """ + + def __init__( + self, + ax=None, + features=None, + classes=None, + colors=None, + colormap=None, + target_type="auto", + projection=2, + alpha=0.75, + colorbar=True, + **kwargs + ): + + super(ProjectionVisualizer, self).__init__( + ax=ax, + features=features, + classes=classes, + colors=colors, + colormap=colormap, + target_type=target_type, + **kwargs + ) + + # Convert string to integer + if isinstance(projection, str): + if projection in {"2D", "2d"}: + projection = 2 + if projection in {"3D", "3d"}: + projection = 3 + if projection not in {2, 3}: + raise YellowbrickValueError("Projection dimensions must be either 2 or 3") + self.projection = projection + + if self.ax.name != "3d" and self.projection == 3: + warnings.warn( + "data projection to 3 dimensions requires a 3d axes to draw on.", + YellowbrickWarning, + ) + + self.alpha = alpha + self.colorbar = colorbar + self._cax = None + + @property + def cax(self): + """ + The axes of the colorbar, right of the scatterplot. + """ + if self._cax is None: + raise AttributeError("This visualizer does not have an axes for colorbar") + + return self._cax + + @property + def ax(self): + """ + Overloads the axes property from base class. If no axes is specified then + creates an axes for users. A 3d axes is created for 3 dimensional plots. + """ + if not hasattr(self, "_ax") or self._ax is None: + if self.projection == 3: + fig = plt.gcf() + self._ax = fig.add_subplot(111, projection="3d") + else: + self._ax = plt.gca() + return self._ax + + @ax.setter + def ax(self, ax): + self._ax = ax + + def layout(self, divider=None): + """ + Creates the layout for colorbar when target type is continuous. 
+ The colorbar is added to the right of the scatterplot. + + Subclasses can override this method to add other axes or layouts. + + Parameters + ---------- + divider: AxesDivider + An AxesDivider to be passed among all layout calls. + """ + if ( + self._target_color_type == TargetType.CONTINUOUS + and self.projection == 2 + and self.colorbar + and self._cax is None + ): + # Ensure matplotlib version compatibility + if make_axes_locatable is None: + raise YellowbrickValueError( + ( + "Colorbar requires matplotlib 2.0.2 or greater " + "please upgrade matplotlib" + ) + ) + + # Create the new axes for the colorbar + if divider is None: + divider = make_axes_locatable(self.ax) + + self._cax = divider.append_axes("right", size="5%", pad=0.3) + self._cax.set_yticks([]) + self._cax.set_xticks([]) + + def fit_transform(self, X, y=None): + """ + Fits the visualizer on the input data, and returns transformed X. + + Parameters + ---------- + X : ndarray or DataFrame of shape n x m + A matrix or data frame of n instances with m features where m>2. + + y : array-like of shape (n,), optional + A vector or series with target values for each instance in X. This + vector is used to determine the color of the points in X. + + Returns + ------- + Xprime : array-like of shape (n, 2) + Returns the 2-dimensional embedding of the instances. + """ + return self.fit(X, y).transform(X, y) + + def draw(self, Xp, y=None): + """ + Draws the points described by Xp and colored by the points in y. Can be + called multiple times before finalize to add more scatter plots to the + axes, however ``fit()`` must be called before use. + + Parameters + ---------- + Xp : array-like of shape (n, 2) or (n, 3) + The matrix produced by the ``transform()`` method. + + y : array-like of shape (n,), optional + The target, used to specify the colors of the points. + + Returns + ------- + self.ax : matplotlib Axes object + Returns the axes that the scatter plot was drawn on. 
+ """ + scatter_kwargs = self._determine_scatter_kwargs(y) + + # Draws the layout of the visualizer. It draws the axes for colorbars, + # heatmap, etc. + self.layout() + + if self.projection == 2: + # Adds colorbar axis for continuous target type. + self.ax.scatter(Xp[:, 0], Xp[:, 1], **scatter_kwargs) + + if self.projection == 3: + self.ax.scatter(Xp[:, 0], Xp[:, 1], Xp[:, 2], **scatter_kwargs) + + return self.ax + + def finalize(self): + """ + Draws legends and colorbar for scatter plots. + """ + self.ax.set_xticklabels([]) + self.ax.set_yticklabels([]) + if self.projection == 3: + self.ax.set_zticklabels([]) + + if self._target_color_type == TargetType.DISCRETE: + # Add the legend + manual_legend( + self, self.classes_, list(self._colors.values()), frameon=True + ) + elif self._target_color_type == TargetType.CONTINUOUS: + if self.colorbar: + if self.projection == 3: + sm = plt.cm.ScalarMappable(cmap=self._colors, norm=self._norm) + self.cbar = plt.colorbar(sm, ax=self.ax) + + else: + # Manually draw the colorbar. + self.cbar = mpl.colorbar.ColorbarBase( + self.cax, cmap=self._colors, norm=self._norm + ) + + def _determine_scatter_kwargs(self, y=None): + """ + Determines scatter arguments to pass into ``plt.scatter()``. If y is + discrete or single then determine colors. If continuous then determine + colors and colormap. Also normalize to range + + Parameters + ---------- + y : array-like of shape (n,), optional + The target, used to specify the colors of the points for continuous + target. 
+ """ + + scatter_kwargs = {"alpha": self.alpha} + # Determine the colors + if self._target_color_type == TargetType.SINGLE: + scatter_kwargs["c"] = self._colors + + elif self._target_color_type == TargetType.DISCRETE: + if y is None: + raise YellowbrickValueError("y is required for discrete target") + + try: + scatter_kwargs["c"] = [self._colors[self.classes_[yi]] for yi in y] + except IndexError: + raise YellowbrickValueError("Target needs to be label encoded.") + + elif self._target_color_type == TargetType.CONTINUOUS: + if y is None: + raise YellowbrickValueError("y is required for continuous target") + + scatter_kwargs["c"] = y + scatter_kwargs["cmap"] = self._colors + self._norm = mpl.colors.Normalize(vmin=self.range_[0], vmax=self.range_[1]) + + else: + # Technically this should never be raised + raise NotFitted("could not determine target color type") + return scatter_kwargs diff --git a/yellowbrick/features/radviz.py b/yellowbrick/features/radviz.py index 714beed63..ddb871a47 100644 --- a/yellowbrick/features/radviz.py +++ b/yellowbrick/features/radviz.py @@ -1,10 +1,10 @@ # yellowbrick.features.radviz # Implements radviz for feature analysis. 
# -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Fri Oct 07 13:18:00 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: radviz.py [0f4b236] benjamin@bengfort.com $ @@ -24,15 +24,24 @@ from yellowbrick.utils import is_dataframe from yellowbrick.utils import nan_warnings from yellowbrick.features.base import DataVisualizer -from yellowbrick.style.colors import resolve_colors ########################################################################## ## Quick Methods ########################################################################## -def radviz(X, y=None, ax=None, features=None, classes=None, - color=None, colormap=None, alpha=1.0, **kwargs): + +def radviz( + X, + y=None, + ax=None, + features=None, + classes=None, + colors=None, + colormap=None, + alpha=1.0, + **kwargs +): """ Displays each feature as an axis around a circle surrounding a scatter plot whose points are each individual instance. @@ -58,7 +67,7 @@ def radviz(X, y=None, ax=None, features=None, classes=None, classes : list of strings, default: None The names of the classes in the target - color : list or tuple of colors, default: None + colors : list or tuple of colors, default: None Specify the colors for each individual class colormap : string or matplotlib cmap, default: None @@ -70,26 +79,33 @@ def radviz(X, y=None, ax=None, features=None, classes=None, Returns ------- - ax : matplotlib axes - Returns the axes that the parallel coordinates were drawn on. 
+ viz : RadViz + Returns the fitted, finalized visualizer """ # Instantiate the visualizer visualizer = RadialVisualizer( - ax, features, classes, color, colormap, alpha, **kwargs + ax=ax, + features=features, + classes=classes, + colors=colors, + colormap=colormap, + alpha=alpha, + **kwargs ) # Fit and transform the visualizer (calls draw) visualizer.fit(X, y, **kwargs) visualizer.transform(X) - # Return the axes object on the visualizer - return visualizer.ax + # Return the visualizer object + return visualizer ########################################################################## ## Static RadViz Visualizer ########################################################################## + class RadialVisualizer(DataVisualizer): """ RadViz is a multivariate data visualization algorithm that plots each @@ -99,30 +115,37 @@ class RadialVisualizer(DataVisualizer): Parameters ---------- - ax : matplotlib Axes, default: None The axis to plot the figure on. If None is passed in the current axes will be used (or generated if required). features : list, default: None a list of feature names to use - If a DataFrame is passed to fit and features is None, feature - names are selected as the columns of the DataFrame. + The names of the features specified by the columns of the input dataset. + This length of this list must match the number of columns in X, otherwise + an exception will be raised on ``fit()``. classes : list, default: None a list of class names for the legend - If classes is None and a y value is passed to fit then the classes - are selected from the target vector. + The class labels for each class in y, ordered by sorted class index. These + names act as a label encoder for the legend, identifying integer classes + or renaming string labels. If omitted, the class labels will be taken from + the unique values in y. 
- color : list or tuple, default: None + Note that the length of this list must match the number of unique values in + y, otherwise an exception is raised. This parameter is only used in the + discrete target type case and is ignored otherwise. + + colors : list or tuple, default: None optional list or tuple of colors to colorize lines - Use either color to colorize the lines on a per class basis or - colormap to color them on a continuous scale. + A single color to plot all instances as or a list of colors to color each + instance according to its class. If not enough colors per class are + specified then the colors are treated as a cycle. colormap : string or cmap, default: None optional string or matplotlib cmap to colorize lines - Use either color to colorize the lines on a per class basis or - colormap to color them on a continuous scale. + The colormap used to create the individual colors. If classes are + specified the colormap is used to evenly space colors across each class. alpha : float, default: 1.0 Specify a transparency where 1 is completely opaque and 0 is completely @@ -140,16 +163,39 @@ class RadialVisualizer(DataVisualizer): >>> visualizer.transform(X) >>> visualizer.poof() - Notes - ----- - These parameters can be influenced later on in the visualization - process, but can and should be set as early as possible. + Attributes + ---------- + features_ : ndarray, shape (n_features,) + The names of the features discovered or used in the visualizer that + can be used as an index to access or modify data in X. If a user passes + feature names in, those features are used. Otherwise the columns of a + DataFrame are used or just simply the indices of the data array. + + classes_ : ndarray, shape (n_classes,) + The class labels that define the discrete values in the target. Only + available if the target type is discrete. This is guaranteed to be + strings even if the classes are a different type. 
""" - def __init__(self, ax=None, features=None, classes=None, color=None, - colormap=None, alpha=1.0, **kwargs): + def __init__( + self, + ax=None, + features=None, + classes=None, + colors=None, + colormap=None, + alpha=1.0, + **kwargs + ): + if "target_type" not in kwargs: + kwargs["target_type"] = "discrete" super(RadialVisualizer, self).__init__( - ax, features, classes, color, colormap, **kwargs + ax=ax, + features=features, + classes=classes, + colors=colors, + colormap=colormap, + **kwargs ) self.alpha = alpha @@ -162,6 +208,32 @@ def normalize(X): b = X.max(axis=0) return (X - a[np.newaxis, :]) / ((b - a)[np.newaxis, :]) + def fit(self, X, y=None, **kwargs): + """ + The fit method is the primary drawing input for the + visualization since it has both the X and y data required for the + viz and the transform method does not. + + Parameters + ---------- + X : ndarray or DataFrame of shape n x m + A matrix of n instances with m features + + y : ndarray or Series of length n + An array or series of target or class values + + kwargs : dict + Pass generic arguments to the drawing method + + Returns + ------- + self : instance + Returns the instance of the transformer/visualizer + """ + super(RadialVisualizer, self).fit(X, y) + self.draw(X, y, **kwargs) + return self + def draw(self, X, y, **kwargs): """ Called from the fit method, this method creates the radviz canvas and @@ -180,73 +252,102 @@ def draw(self, X, y, **kwargs): nrows, ncols = X.shape # Set the axes limits - self.ax.set_xlim([-1,1]) - self.ax.set_ylim([-1,1]) - - # Create the colors - # TODO: Allow both colormap, listed colors, and palette definition - # TODO: Make this an independent function or property for override! 
- color_values = resolve_colors( - n_colors=len(self.classes_), colormap=self.colormap, colors=self.color - ) - self._colors = dict(zip(self.classes_, color_values)) + self.ax.set_xlim([-1, 1]) + self.ax.set_ylim([-1, 1]) # Create a data structure to hold scatter plot representations - to_plot = {} - for kls in self.classes_: - to_plot[kls] = [[], []] + to_plot = {label: [[], []] for label in self.classes_} # Compute the arcs around the circumference for each feature axis # TODO: make this an independent function for override - s = np.array([ + s = np.array( + [ (np.cos(t), np.sin(t)) - for t in [ - 2.0 * np.pi * (i / float(ncols)) - for i in range(ncols) - ] - ]) + for t in [2.0 * np.pi * (i / float(ncols)) for i in range(ncols)] + ] + ) # Compute the locations of the scatter plot for each class # Normalize the data first to plot along the 0, 1 axis for i, row in enumerate(self.normalize(X)): row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1) - xy = (s * row_).sum(axis=0) / row.sum() - kls = self.classes_[y[i]] + xy = (s * row_).sum(axis=0) / row.sum() + label = self._label_encoder[y[i]] - to_plot[kls][0].append(xy[0]) - to_plot[kls][1].append(xy[1]) + to_plot[label][0].append(xy[0]) + to_plot[label][1].append(xy[1]) # Add the scatter plots from the to_plot function # TODO: store these plots to add more instances to later # TODO: make this a separate function - for i, kls in enumerate(self.classes_): + for label in self.classes_: + color = self.get_colors([label])[0] self.ax.scatter( - to_plot[kls][0], to_plot[kls][1], color=self._colors[kls], - label=str(kls), alpha=self.alpha, **kwargs + to_plot[label][0], + to_plot[label][1], + color=color, + label=label, + alpha=self.alpha, + **kwargs ) # Add the circular axis path # TODO: Make this a seperate function (along with labeling) - self.ax.add_patch(patches.Circle( - (0.0, 0.0), radius=1.0, facecolor='none', edgecolor='grey', linewidth=.5 - )) + self.ax.add_patch( + patches.Circle( + (0.0, 0.0), + radius=1.0, 
+ facecolor="none", + edgecolor="grey", + linewidth=0.5, + ) + ) # Add the feature names for xy, name in zip(s, self.features_): # Add the patch indicating the location of the axis - self.ax.add_patch(patches.Circle(xy, radius=0.025, facecolor='#777777')) + self.ax.add_patch(patches.Circle(xy, radius=0.025, facecolor="#777777")) # Add the feature names offset around the axis marker if xy[0] < 0.0 and xy[1] < 0.0: - self.ax.text(xy[0] - 0.025, xy[1] - 0.025, name, ha='right', va='top', size='small') + self.ax.text( + xy[0] - 0.025, + xy[1] - 0.025, + name, + ha="right", + va="top", + size="small", + ) elif xy[0] < 0.0 and xy[1] >= 0.0: - self.ax.text(xy[0] - 0.025, xy[1] + 0.025, name, ha='right', va='bottom', size='small') + self.ax.text( + xy[0] - 0.025, + xy[1] + 0.025, + name, + ha="right", + va="bottom", + size="small", + ) elif xy[0] >= 0.0 and xy[1] < 0.0: - self.ax.text(xy[0] + 0.025, xy[1] - 0.025, name, ha='left', va='top', size='small') + self.ax.text( + xy[0] + 0.025, + xy[1] - 0.025, + name, + ha="left", + va="top", + size="small", + ) elif xy[0] >= 0.0 and xy[1] >= 0.0: - self.ax.text(xy[0] + 0.025, xy[1] + 0.025, name, ha='left', va='bottom', size='small') - - self.ax.axis('equal') + self.ax.text( + xy[0] + 0.025, + xy[1] + 0.025, + name, + ha="left", + va="bottom", + size="small", + ) + + self.ax.axis("equal") + return self.ax def finalize(self, **kwargs): """ @@ -259,17 +360,15 @@ def finalize(self, **kwargs): """ # Set the title - self.set_title( - 'RadViz for {} Features'.format(len(self.features_)) - ) + self.set_title("RadViz for {} Features".format(len(self.features_))) # Remove the ticks from the graph self.ax.set_yticks([]) self.ax.set_xticks([]) # Add the legend - colors = [self._colors[c] for c in self.classes_] - manual_legend(self, self.classes_, colors, loc='best') + colors = self.get_colors(self.classes_) + manual_legend(self, self.classes_, colors, loc="best") # Alias for RadViz diff --git a/yellowbrick/features/rankd.py 
b/yellowbrick/features/rankd.py index b6ef1b2f1..229135632 100644 --- a/yellowbrick/features/rankd.py +++ b/yellowbrick/features/rankd.py @@ -1,10 +1,10 @@ # yellowbrick.features.rankd # Implements 1D (histograms) and 2D (joint plot) feature rankings. # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Fri Oct 07 15:14:01 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: rankd.py [ee754dc] benjamin@bengfort.com $ @@ -17,130 +17,50 @@ ## Imports ########################################################################## +import warnings import numpy as np +import matplotlib as mpl + from scipy.stats import shapiro from scipy.stats import spearmanr +from scipy.stats import kendalltau as sp_kendalltau from yellowbrick.utils import is_dataframe from yellowbrick.features.base import MultiFeatureVisualizer -from yellowbrick.exceptions import YellowbrickValueError +from yellowbrick.exceptions import YellowbrickValueError, YellowbrickWarning __all__ = ["rank1d", "rank2d", "Rank1D", "Rank2D"] ########################################################################## -## Quick Methods +## Metrics ########################################################################## -def rank1d(X, y=None, ax=None, algorithm='shapiro', features=None, - orient='h', show_feature_names=True, **kwargs): - """Scores each feature with the algorithm and ranks them in a bar plot. - - This helper function is a quick wrapper to utilize the Rank1D Visualizer - (Transformer) for one-off analysis. - - Parameters - ---------- - X : ndarray or DataFrame of shape n x m - A matrix of n instances with m features - - y : ndarray or Series of length n - An array or series of target or class values - - ax : matplotlib axes - the axis to plot the figure on. - - algorithm : one of {'shapiro', }, default: 'shapiro' - The ranking algorithm to use, default is 'Shapiro-Wilk. 
- - features : list - A list of feature names to use. - If a DataFrame is passed to fit and features is None, feature - names are selected as the columns of the DataFrame. - - orient : 'h' or 'v' - Specifies a horizontal or vertical bar chart. - - show_feature_names : boolean, default: True - If True, the feature names are used to label the axis ticks in the - plot. - - Returns - ------- - ax : matplotlib axes - Returns the axes that the parallel coordinates were drawn on. +def kendalltau(X): """ - # Instantiate the visualizer - visualizer = Rank1D(ax, algorithm, features, orient, show_feature_names, - **kwargs) - - # Fit and transform the visualizer (calls draw) - visualizer.fit(X, y, **kwargs) - visualizer.transform(X) - - # Return the axes object on the visualizer - return visualizer.ax - -def rank2d(X, y=None, ax=None, algorithm='pearson', features=None, - show_feature_names=True, colormap='RdBu_r', **kwargs): - """Displays pairwise comparisons of features with the algorithm and ranks - them in a lower-left triangle heatmap plot. - - This helper function is a quick wrapper to utilize the Rank2D Visualizer - (Transformer) for one-off analysis. + Accepts a matrix X and returns a correlation matrix so that each column + is the variable and each row is the observations. Parameters ---------- X : ndarray or DataFrame of shape n x m A matrix of n instances with m features - y : ndarray or Series of length n - An array or series of target or class values - - ax : matplotlib axes - the axis to plot the figure on. - - algorithm : one of {pearson, covariance, spearman} - the ranking algorithm to use, default is Pearson correlation. - - features : list - A list of feature names to use. - If a DataFrame is passed to fit and features is None, feature - names are selected as the columns of the DataFrame. - - show_feature_names : boolean, default: True - If True, the feature names are used to label the axis ticks in the - plot. 
- - colormap : string or cmap - optional string or matplotlib cmap to colorize lines - Use either color to colorize the lines on a per class basis or - colormap to color them on a continuous scale. - - Returns - ------- - ax : matplotlib axes - Returns the axes that the parallel coordinates were drawn on. - """ - # Instantiate the visualizer - visualizer = Rank2D(ax, algorithm, features, colormap, show_feature_names, - **kwargs) - - # Fit and transform the visualizer (calls draw) - visualizer.fit(X, y, **kwargs) - visualizer.transform(X) - - # Return the axes object on the visualizer - return visualizer.ax + corrs = np.zeros((X.shape[1], X.shape[1])) + for idx, cola in enumerate(X.T): + for jdx, colb in enumerate(X.T): + corrs[idx, jdx] = sp_kendalltau(cola, colb)[0] + return corrs ########################################################################## ## Base Feature Visualizer ########################################################################## + class RankDBase(MultiFeatureVisualizer): """ Base visualizer for Rank1D and Rank2D @@ -151,6 +71,10 @@ class RankDBase(MultiFeatureVisualizer): The axis to plot the figure on. If None is passed in the current axes will be used (or generated if required). + fig : matplotlib Figure, default: None + The figure to plot the Visualizer on. If None is passed in the current + plot will be used (or generated if required). + algorithm : string The ranking algorithm to use; options and defaults vary by subclass @@ -190,13 +114,20 @@ class RankDBase(MultiFeatureVisualizer): ranking_methods = {} - def __init__(self, ax=None, algorithm=None, features=None, - show_feature_names=True, **kwargs): + def __init__( + self, + ax=None, + fig=None, + algorithm=None, + features=None, + show_feature_names=True, + **kwargs + ): """ Initialize the class with the options required to rank and order features as well as visualize the result. 
""" - super(RankDBase, self).__init__(ax=ax, features=features, **kwargs) + super(RankDBase, self).__init__(ax=ax, fig=fig, features=features, **kwargs) # Data Parameters self.ranking_ = algorithm @@ -259,7 +190,7 @@ def rank(self, X, algorithm=None): # Extract matrix from dataframe if necessary if is_dataframe(X): - X = X.as_matrix() + X = X.values return self.ranking_methods[algorithm](X) @@ -274,7 +205,16 @@ def finalize(self, **kwargs): generic keyword arguments """ - # Set the title + # There is a known bug in matplotlib 3.1.1 that affects RankD plots + # See #912 and #914 for details. + if mpl.__version__ == "3.1.1": + msg = ( + "RankD plots may be clipped when using matplotlib v3.1.1, " + "upgrade to matplotlib v3.1.2 or later to fix the plots." + ) + warnings.warn(msg, YellowbrickWarning) + + # Set the title for all RankD visualizations. self.set_title( "{} Ranking of {} Features".format( self.ranking_.title(), len(self.features_) @@ -286,6 +226,7 @@ def finalize(self, **kwargs): ## Rank 1D Feature Visualizer ########################################################################## + class Rank1D(RankDBase): """ Rank1D computes a score for each feature in the data set with a specific @@ -306,13 +247,16 @@ class Rank1D(RankDBase): If a DataFrame is passed to fit and features is None, feature names are selected as the columns of the DataFrame. - orient : 'h' or 'v' + orient : 'h' or 'v', default='h' Specifies a horizontal or vertical bar chart. show_feature_names : boolean, default: True If True, the feature names are used to label the x and y ticks in the plot. + color: string + Specify color for barchart + kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. 
@@ -331,29 +275,39 @@ class Rank1D(RankDBase): >>> visualizer.poof() """ - ranking_methods = { - 'shapiro': lambda X: np.array([shapiro(x)[0] for x in X.T]), - } - - def __init__(self, ax=None, algorithm='shapiro', features=None, - orient='h', show_feature_names=True, **kwargs): + ranking_methods = {"shapiro": lambda X: np.array([shapiro(x)[0] for x in X.T])} + + def __init__( + self, + ax=None, + algorithm="shapiro", + features=None, + orient="h", + show_feature_names=True, + color=None, + **kwargs + ): """ Initialize the class with the options required to rank and order features as well as visualize the result. """ super(Rank1D, self).__init__( - ax=ax, algorithm=algorithm, features=features, - show_feature_names=show_feature_names, **kwargs + ax=ax, + algorithm=algorithm, + features=features, + show_feature_names=show_feature_names, + **kwargs ) + self.color = color self.orientation_ = orient def draw(self, **kwargs): """ Draws the bar plot of the ranking array of features. """ - if self.orientation_ == 'h': + if self.orientation_ == "h": # Make the plot - self.ax.barh(np.arange(len(self.ranks_)), self.ranks_, color='b') + self.ax.barh(np.arange(len(self.ranks_)), self.ranks_, color=self.color) # Add ticks and tick labels self.ax.set_yticks(np.arange(len(self.ranks_))) @@ -368,9 +322,9 @@ def draw(self, **kwargs): # Turn off y grid lines self.ax.yaxis.grid(False) - elif self.orientation_ == 'v': + elif self.orientation_ == "v": # Make the plot - self.ax.bar(np.arange(len(self.ranks_)), self.ranks_, color='b') + self.ax.bar(np.arange(len(self.ranks_)), self.ranks_, color=self.color) # Add ticks and tick labels self.ax.set_xticks(np.arange(len(self.ranks_))) @@ -383,15 +337,14 @@ def draw(self, **kwargs): self.ax.xaxis.grid(False) else: - raise YellowbrickValueError( - "Orientation must be 'h' or 'v'" - ) + raise YellowbrickValueError("Orientation must be 'h' or 'v'") ########################################################################## ## Rank 2D Feature 
Visualizer ########################################################################## + class Rank2D(RankDBase): """ Rank2D performs pairwise comparisons of each feature in the data set with @@ -404,8 +357,9 @@ class Rank2D(RankDBase): The axis to plot the figure on. If None is passed in the current axes will be used (or generated if required). - algorithm : one of {'pearson', 'covariance', 'spearman'}, default: 'pearson' - The ranking algorithm to use, default is Pearson correlation. + algorithm : str, default: 'pearson' + The ranking algorithm to use, one of: + 'pearson', 'covariance', 'spearman', 'kendalltau'. features : list A list of feature names to use. @@ -446,22 +400,33 @@ class Rank2D(RankDBase): """ ranking_methods = { - 'pearson': lambda X: np.corrcoef(X.transpose()), - 'covariance': lambda X: np.cov(X.transpose()), - 'spearman': lambda X: spearmanr(X)[0], + "pearson": lambda X: np.corrcoef(X.transpose()), + "covariance": lambda X: np.cov(X.transpose()), + "spearman": lambda X: spearmanr(X, axis=0)[0], + "kendalltau": lambda X: kendalltau(X), } - def __init__(self, ax=None, algorithm='pearson', features=None, - colormap='RdBu_r', show_feature_names=True, **kwargs): + def __init__( + self, + ax=None, + algorithm="pearson", + features=None, + colormap="RdBu_r", + show_feature_names=True, + **kwargs + ): """ Initialize the class with the options required to rank and order features as well as visualize the result. 
""" super(Rank2D, self).__init__( - ax=ax, algorithm=algorithm, features=features, - show_feature_names=show_feature_names, **kwargs + ax=ax, + algorithm=algorithm, + features=features, + show_feature_names=show_feature_names, + **kwargs ) - self.colormap=colormap + self.colormap = colormap def draw(self, **kwargs): """ @@ -480,9 +445,7 @@ def draw(self, **kwargs): mesh = self.ax.pcolormesh(data, cmap=self.colormap, vmin=-1, vmax=1) # Set the Axis limits - self.ax.set( - xlim=(0, data.shape[1]), ylim=(0, data.shape[0]) - ) + self.ax.set(xlim=(0, data.shape[1]), ylim=(0, data.shape[0])) # Add the colorbar cb = self.ax.figure.colorbar(mesh, None, self.ax) @@ -500,3 +463,146 @@ def draw(self, **kwargs): else: self.ax.set_xticklabels([]) self.ax.set_yticklabels([]) + + +########################################################################## +## Quick Methods +########################################################################## + + +def rank1d( + X, + y=None, + ax=None, + algorithm="shapiro", + features=None, + orient="h", + show_feature_names=True, + color=None, + **kwargs +): + """Scores each feature with the algorithm and ranks them in a bar plot. + + This helper function is a quick wrapper to utilize the Rank1D Visualizer + (Transformer) for one-off analysis. + + Parameters + ---------- + X : ndarray or DataFrame of shape n x m + A matrix of n instances with m features + + y : ndarray or Series of length n + An array or series of target or class values + + ax : matplotlib axes + the axis to plot the figure on. + + algorithm : one of {'shapiro', }, default: 'shapiro' + The ranking algorithm to use, default is 'Shapiro-Wilk. + + features : list + A list of feature names to use. + If a DataFrame is passed to fit and features is None, feature + names are selected as the columns of the DataFrame. + + orient : 'h' or 'v' + Specifies a horizontal or vertical bar chart. 
+ + show_feature_names : boolean, default: True + If True, the feature names are used to label the axis ticks in the + plot. + + color: string + Specify color for barchart + + Returns + ------- + viz : Rank1D + Returns the fitted, finalized visualizer. + + """ + # Instantiate the visualizer + visualizer = Rank1D( + ax=ax, + algorithm=algorithm, + features=features, + orient=orient, + show_feature_names=show_feature_names, + color=color, + **kwargs + ) + + # Fit and transform the visualizer (calls draw) + visualizer.fit(X, y, **kwargs) + visualizer.transform(X) + + # Return the visualizer object + return visualizer + + +def rank2d( + X, + y=None, + ax=None, + algorithm="pearson", + features=None, + show_feature_names=True, + colormap="RdBu_r", + **kwargs +): + """Displays pairwise comparisons of features with the algorithm and ranks + them in a lower-left triangle heatmap plot. + + This helper function is a quick wrapper to utilize the Rank2D Visualizer + (Transformer) for one-off analysis. + + Parameters + ---------- + X : ndarray or DataFrame of shape n x m + A matrix of n instances with m features + + y : ndarray or Series of length n + An array or series of target or class values + + ax : matplotlib axes + the axis to plot the figure on. + + algorithm : one of {pearson, covariance, spearman, kendalltau} + the ranking algorithm to use, default is Pearson correlation. + + features : list + A list of feature names to use. + If a DataFrame is passed to fit and features is None, feature + names are selected as the columns of the DataFrame. + + show_feature_names : boolean, default: True + If True, the feature names are used to label the axis ticks in the + plot. + + colormap : string or cmap + optional string or matplotlib cmap to colorize lines + Use either color to colorize the lines on a per class basis or + colormap to color them on a continuous scale. 
+ + Returns + ------- + viz : Rank2D + Returns the fitted, finalized visualizer + + """ + # Instantiate the visualizer + visualizer = Rank2D( + ax=ax, + algorithm=algorithm, + features=features, + colormap=colormap, + show_feature_names=show_feature_names, + **kwargs + ) + + # Fit and transform the visualizer (calls draw) + visualizer.fit(X, y, **kwargs) + visualizer.transform(X) + + # Return the visualizer object + return visualizer diff --git a/yellowbrick/gridsearch/__init__.py b/yellowbrick/gridsearch/__init__.py index 76b9e73cc..c9e9237fa 100644 --- a/yellowbrick/gridsearch/__init__.py +++ b/yellowbrick/gridsearch/__init__.py @@ -1,3 +1,14 @@ +# yellowbrick.gridsearch +# Visualizers for the results of GridSearchCV. +# +# Author: Phillip Schafer +# Created: Sat Feb 3 10:18:33 2018 -0500 +# +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: __init__.py [03724ed] pbs929@users.noreply.github.com $ + """ Visualizers for the results of GridSearchCV. 
""" diff --git a/yellowbrick/gridsearch/base.py b/yellowbrick/gridsearch/base.py index 7514b6ea5..c6abce57b 100644 --- a/yellowbrick/gridsearch/base.py +++ b/yellowbrick/gridsearch/base.py @@ -1,3 +1,14 @@ +# yellowbrick.gridsearch.base +# Base class for grid search visualizers +# +# Author: Phillip Schafer +# Created: Sat Feb 3 10:18:33 2018 -0500 +# +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: base.py [03724ed] pbs929@users.noreply.github.com $ + """ Base class for grid search visualizers """ @@ -9,16 +20,19 @@ import numpy as np from ..utils import is_gridsearch from ..base import ModelVisualizer -from ..exceptions import (YellowbrickTypeError, - YellowbrickKeyError, - YellowbrickValueError) +from ..exceptions import ( + YellowbrickTypeError, + YellowbrickKeyError, + YellowbrickValueError, +) ########################################################################## ## Dimension reduction utility ########################################################################## -def param_projection(cv_results, x_param, y_param, metric='mean_test_score'): + +def param_projection(cv_results, x_param, y_param, metric="mean_test_score"): """ Projects the grid search results onto 2 dimensions. @@ -55,19 +69,24 @@ def param_projection(cv_results, x_param, y_param, metric='mean_test_score'): # These are masked arrays where the cases where each parameter is # non-applicable are masked. 
try: - x_vals = cv_results['param_' + x_param] + x_vals = cv_results["param_" + x_param] except KeyError: - raise YellowbrickKeyError("Parameter '{}' does not exist in the grid " - "search results".format(x_param)) + raise YellowbrickKeyError( + "Parameter '{}' does not exist in the grid " + "search results".format(x_param) + ) try: - y_vals = cv_results['param_' + y_param] + y_vals = cv_results["param_" + y_param] except KeyError: - raise YellowbrickKeyError("Parameter '{}' does not exist in the grid " - "search results".format(y_param)) + raise YellowbrickKeyError( + "Parameter '{}' does not exist in the grid " + "search results".format(y_param) + ) if metric not in cv_results: - raise YellowbrickKeyError("Metric '{}' does not exist in the grid " - "search results".format(metric)) + raise YellowbrickKeyError( + "Metric '{}' does not exist in the grid " "search results".format(metric) + ) # Get unique, unmasked values of the two display parameters unique_x_vals = sorted(list(set(x_vals.compressed()))) @@ -115,8 +134,8 @@ def param_projection(cv_results, x_param, y_param, metric='mean_test_score'): ## Base Grid Search Visualizer ########################################################################## -class GridSearchVisualizer(ModelVisualizer): +class GridSearchVisualizer(ModelVisualizer): def __init__(self, model, ax=None, **kwargs): """ Check to see if model is an instance of GridSearchCV. @@ -124,9 +143,7 @@ def __init__(self, model, ax=None, **kwargs): """ # A bit of type checking if not is_gridsearch(model): - raise YellowbrickTypeError( - "This estimator is not a GridSearchCV instance" - ) + raise YellowbrickTypeError("This estimator is not a GridSearchCV instance") # Initialize the super method. 
super(GridSearchVisualizer, self).__init__(model, ax=ax, **kwargs) diff --git a/yellowbrick/gridsearch/pcolor.py b/yellowbrick/gridsearch/pcolor.py index 6585403d1..39efe7177 100644 --- a/yellowbrick/gridsearch/pcolor.py +++ b/yellowbrick/gridsearch/pcolor.py @@ -1,3 +1,14 @@ +# yellowbrick.gridsearch.pcolor +# Colorplot visualizer for gridsearch results. +# +# Author: Phillip Schafer +# Created: Sat Feb 3 10:18:33 2018 -0500 +# +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: pcolor.py [03724ed] pbs929@users.noreply.github.com $ + """ Colorplot visualizer for gridsearch results. """ @@ -8,18 +19,15 @@ ## Packages for export -__all__ = [ - "GridSearchColorPlot", - "gridsearch_color_plot" -] +__all__ = ["GridSearchColorPlot", "gridsearch_color_plot"] ########################################################################## ## Quick method ########################################################################## -def gridsearch_color_plot(model, x_param, y_param, X=None, y=None, ax=None, - **kwargs): + +def gridsearch_color_plot(model, x_param, y_param, X=None, y=None, ax=None, **kwargs): """Quick method: Create a color plot showing the best grid search scores across two parameters. 
@@ -120,8 +128,16 @@ class GridSearchColorPlot(GridSearchVisualizer): >>> model.poof() """ - def __init__(self, model, x_param, y_param, metric='mean_test_score', - colormap='RdBu_r', ax=None, **kwargs): + def __init__( + self, + model, + x_param, + y_param, + metric="mean_test_score", + colormap="RdBu_r", + ax=None, + **kwargs + ): super(GridSearchColorPlot, self).__init__(model, ax=ax, **kwargs) self.x_param = x_param self.y_param = y_param @@ -138,9 +154,10 @@ def draw(self): data = np.ma.masked_invalid(best_scores) # Plot and fill in hatch for nans - mesh = self.ax.pcolor(data, cmap=self.colormap, - vmin=np.nanmin(data), vmax=np.nanmax(data)) - self.ax.patch.set(hatch='x', edgecolor='black') + mesh = self.ax.pcolor( + data, cmap=self.colormap, vmin=np.nanmin(data), vmax=np.nanmax(data) + ) + self.ax.patch.set(hatch="x", edgecolor="black") # Ticks and tick labels self.ax.set_xticks(np.arange(len(x_vals)) + 0.5) diff --git a/yellowbrick/model_selection/__init__.py b/yellowbrick/model_selection/__init__.py index 78c9c245a..06892aa4f 100644 --- a/yellowbrick/model_selection/__init__.py +++ b/yellowbrick/model_selection/__init__.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Fri Mar 30 10:36:12 2018 -0400 # -# ID: __init__.py [] benjamin@bengfort.com $ +# ID: __init__.py [c5355ee] benjamin@bengfort.com $ """ Visualizers that wrap the model selection libraries of Scikit-Learn @@ -17,3 +17,7 @@ from .learning_curve import LearningCurve, learning_curve from .validation_curve import ValidationCurve, validation_curve from .cross_validation import CVScores, cv_scores + +# RFECV and Feature Importances moved here as of YB v1.0 +from .importances import FeatureImportances, feature_importances +from .rfecv import RFECV, rfecv diff --git a/yellowbrick/model_selection/cross_validation.py b/yellowbrick/model_selection/cross_validation.py index dca1b5c02..09d1b3a0d 100644 --- a/yellowbrick/model_selection/cross_validation.py +++ 
b/yellowbrick/model_selection/cross_validation.py @@ -3,9 +3,12 @@ # # Author: Prema Damodaran Roman # Created: Wed June 6 2018 13:32:00 -0500 -# Author: Rebecca Bilbro +# Author: Rebecca Bilbro # Updated: Fri Aug 10 13:15:43 2018 -0500 # +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# # ID: cross_validation.py [7f47800] pdamo24@gmail.com $ """ @@ -13,7 +16,7 @@ """ ########################################################################## -## Imports +# Imports ########################################################################## import numpy as np @@ -24,9 +27,10 @@ ########################################################################## -## CVScores Visualizer +# CVScores Visualizer ########################################################################## + class CVScores(ModelVisualizer): """ CVScores displays cross-validated scores as a bar chart, with the @@ -63,10 +67,22 @@ class CVScores(ModelVisualizer): See scikit-learn `cross-validation guide `_ for more information on the possible metrics that can be used. + color: string + Specify color for barchart + kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. + Attributes + ------- + cv_scores_ : ndarray shape (n_splits, ) + The cross-validated scores from each subsection of the data + + cv_scores_mean_ : float + Average cross-validated score across all subsections of the data + + Examples -------- @@ -91,10 +107,11 @@ class CVScores(ModelVisualizer): """ - def __init__(self, model, ax=None, cv=None, scoring=None, **kwargs): + def __init__(self, model, ax=None, cv=None, scoring=None, color=None, **kwargs): super(CVScores, self).__init__(model, ax=ax, **kwargs) self.cv = cv + self.color = color self.scoring = scoring def fit(self, X, y, **kwargs): @@ -134,16 +151,17 @@ def draw(self, **kwargs): average value of the scores. 
""" - color = kwargs.pop("color", "b") width = kwargs.pop("width", 0.3) linewidth = kwargs.pop("linewidth", 1) xvals = np.arange(1, len(self.cv_scores_) + 1, 1) - self.ax.bar(xvals, self.cv_scores_, width=width) + self.ax.bar(xvals, self.cv_scores_, width=width, color=self.color) self.ax.axhline( - self.cv_scores_mean_, color=color, + self.cv_scores_mean_, + color=self.color, label="Mean score = {:0.3f}".format(self.cv_scores_mean_), - linestyle='--', linewidth=linewidth + linestyle="--", + linewidth=linewidth, ) return self.ax @@ -154,7 +172,7 @@ def finalize(self, **kwargs): """ # Set the title of the figure - self.set_title('Cross Validation Scores for {}'.format(self.name)) + self.set_title("Cross Validation Scores for {}".format(self.name)) # Add the legend loc = kwargs.pop("loc", "best") @@ -165,15 +183,16 @@ def finalize(self, **kwargs): self.ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) # Set the axis labels - self.ax.set_xlabel('Training Instances') - self.ax.set_ylabel('Score') + self.ax.set_xlabel("Training Instances") + self.ax.set_ylabel("Score") ########################################################################## -## Quick Method +# Quick Method ########################################################################## -def cv_scores(model, X, y, ax=None, cv=None, scoring=None, **kwargs): + +def cv_scores(model, X, y, ax=None, cv=None, scoring=None, color=None, **kwargs): """ Displays cross validation scores as a bar chart and the average of the scores as a horizontal line @@ -221,21 +240,25 @@ def cv_scores(model, X, y, ax=None, cv=None, scoring=None, **kwargs): See scikit-learn `cross-validation guide `_ for more information on the possible metrics that can be used. + color: string + Specify color for barchart + kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. 
Returns ------- - ax : matplotlib.Axes - The axes object that the validation curves were drawn on. + visualizer : CVScores + The fitted visualizer. """ # Initialize the visualizer - visualizer = CVScores(model, ax=ax, cv=cv, scoring=scoring) + visualizer = CVScores(model, ax=ax, cv=cv, scoring=scoring, color=None) # Fit and poof the visualizer visualizer.fit(X, y) visualizer.poof(**kwargs) - return visualizer.ax + + return visualizer diff --git a/yellowbrick/features/importances.py b/yellowbrick/model_selection/importances.py similarity index 71% rename from yellowbrick/features/importances.py rename to yellowbrick/model_selection/importances.py index a82671748..f0b8908d0 100644 --- a/yellowbrick/features/importances.py +++ b/yellowbrick/model_selection/importances.py @@ -1,12 +1,11 @@ -# yellowbrick.features.importances +# yellowbrick.model_selection.importances # Feature importance visualizer # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort +# Author: Rebecca Bilbro # Created: Fri Mar 02 15:21:36 2018 -0500 -# Author: Rebecca Bilbro -# Updated: Sun Jun 24 10:53:36 2018 -0500 # -# Copyright (C) 2018 District Data Labs +# Copyright (C) 2018 The scikit-yb developers # For license information, see LICENSE.txt # # ID: importances.py [] benjamin@bengfort.com $ @@ -21,19 +20,21 @@ ## Imports ########################################################################## +import warnings import numpy as np import matplotlib.pyplot as plt -from yellowbrick.utils import is_dataframe, is_classifier +from yellowbrick.draw import bar_stack from yellowbrick.base import ModelVisualizer -from yellowbrick.exceptions import YellowbrickTypeError, NotFitted -from ..style.palettes import color_palette - +from yellowbrick.style.colors import resolve_colors +from yellowbrick.utils import is_dataframe, is_classifier +from yellowbrick.exceptions import YellowbrickTypeError, NotFitted, YellowbrickWarning ########################################################################## ## 
Feature Visualizer ########################################################################## + class FeatureImportances(ModelVisualizer): """ Displays the most informative features in a model by showing a bar chart @@ -50,7 +51,9 @@ class FeatureImportances(ModelVisualizer): ---------- model : Estimator A Scikit-Learn estimator that learns feature importances. Must support - either ``coef_`` or ``feature_importances_`` parameters. + either ``coef_`` or ``feature_importances_`` parameters. If the estimator + is not fitted, it is fit when the visualizer is fitted, unless otherwise + specified by ``is_fitted``. ax : matplotlib Axes, default: None The axis to plot the figure on. If None is passed in the current axes @@ -67,7 +70,7 @@ class FeatureImportances(ModelVisualizer): absolute : bool, default: False Make all coeficients absolute to more easily compare negative - coeficients with positive ones. + coefficients with positive ones. xlabel : str, default: None The label for the X-axis. If None is automatically determined by the @@ -78,6 +81,18 @@ class FeatureImportances(ModelVisualizer): then a stacked bar plot is plotted; otherwise the mean of the feature importance across classes are plotted. + colors: list of strings + Specify colors for each bar in the chart if ``stack==False``. + + colormap : string or matplotlib cmap + Specify a colormap to color the classes if ``stack==True``. + + is_fitted : bool or str, default='auto' + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If 'auto' (default), a helper method will check if the estimator + is fitted before fitting it again. + kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. 
@@ -91,7 +106,7 @@ class FeatureImportances(ModelVisualizer): The numeric value of the feature importance computed by the model classes_ : np.array - The classees labeled. Is not None only for classifier. + The classes labeled. Is not None only for classifier. Examples -------- @@ -102,14 +117,34 @@ class FeatureImportances(ModelVisualizer): >>> visualizer.poof() """ - def __init__(self, model, ax=None, labels=None, relative=True, - absolute=False, xlabel=None, stack=False, **kwargs): - super(FeatureImportances, self).__init__(model, ax, **kwargs) + def __init__( + self, + model, + ax=None, + labels=None, + relative=True, + absolute=False, + xlabel=None, + stack=False, + colors=None, + colormap=None, + is_fitted="auto", + **kwargs + ): + # Initialize the visualizer bases + super(FeatureImportances, self).__init__( + model, ax=ax, is_fitted=is_fitted, **kwargs + ) # Data Parameters self.set_params( - labels=labels, relative=relative, absolute=absolute, - xlabel=xlabel, stack=stack + labels=labels, + relative=relative, + absolute=absolute, + xlabel=xlabel, + stack=stack, + colors=colors, + colormap=colormap, ) def fit(self, X, y=None, **kwargs): @@ -133,6 +168,7 @@ def fit(self, X, y=None, **kwargs): self : visualizer The fit method must always return self to support pipelines. """ + # Super call fits the underlying estimator if it's not already fitted super(FeatureImportances, self).fit(X, y, **kwargs) # Get the feature importances from the model @@ -150,8 +186,14 @@ def fit(self, X, y=None, **kwargs): # therefore we flatten by taking the average by # column to get shape (n_features,) (see LogisticRegression) if not self.stack and self.feature_importances_.ndim > 1: - self.feature_importances_ = np.mean(self.feature_importances_, - axis=0) + self.feature_importances_ = np.mean(self.feature_importances_, axis=0) + warnings.warn( + ( + "detected multi-dimensional feature importances but stack=False, " + "using mean to aggregate them." 
+ ), + YellowbrickWarning, + ) # Apply absolute value filter before normalization if self.absolute: @@ -196,7 +238,7 @@ def draw(self, **kwargs): Draws the feature importances as a bar chart; called from fit. """ # Quick validation - for param in ('feature_importances_', 'features_'): + for param in ("feature_importances_", "features_"): if not hasattr(self, param): raise NotFitted("missing required param '{}'".format(param)) @@ -205,30 +247,26 @@ def draw(self, **kwargs): # Plot the bar chart if self.stack: - colors = color_palette(kwargs.pop('colors', None), - len(self.classes_)) - zeros = np.zeros(self.feature_importances_.shape[1]) - left_arr = np.zeros((self.feature_importances_.shape[1], 2)) - - for idx in range(len(self.feature_importances_)): - left = [ - left_arr[j, int(self.feature_importances_[idx][j] > 0)] - for j in range(len(self.feature_importances_[idx])) - ] - - self.ax.barh(pos, self.feature_importances_[idx], left=left, - color=colors[idx], label=self.classes_[idx]) - - left_arr[:, 0] += np.minimum(self.feature_importances_[idx], - zeros) - left_arr[:, 1] += np.maximum(self.feature_importances_[idx], - zeros) + colors = resolve_colors(len(self.classes_), colormap=self.colormap) + legend_kws = {"bbox_to_anchor": (1.04, 0.5), "loc": "center left"} + bar_stack( + self.feature_importances_, + ax=self.ax, + labels=list(self.classes_), + ticks=self.features_, + orientation="h", + colors=colors, + legend_kws=legend_kws, + ) else: - self.ax.barh(pos, self.feature_importances_, align='center') + colors = resolve_colors( + len(self.features_), colormap=self.colormap, colors=self.colors + ) + self.ax.barh(pos, self.feature_importances_, color=colors, align="center") - # Set the labels for the bars - self.ax.set_yticks(pos) - self.ax.set_yticklabels(self.features_) + # Set the labels for the bars + self.ax.set_yticks(pos) + self.ax.set_yticklabels(self.features_) return self.ax @@ -237,17 +275,18 @@ def finalize(self, **kwargs): Finalize the drawing setting 
labels and title. """ # Set the title - self.set_title('Feature Importances of {} Features using {}'.format( - len(self.features_), self.name)) + self.set_title( + "Feature Importances of {} Features using {}".format( + len(self.features_), self.name + ) + ) # Set the xlabel self.ax.set_xlabel(self._get_xlabel()) # Remove the ygrid - self.ax.grid(False, axis='y') + self.ax.grid(False, axis="y") - if self.stack: - plt.legend(bbox_to_anchor=(1.04, 0.5), loc="center left") # Ensure we have a tight fit plt.tight_layout() @@ -306,17 +345,30 @@ def _is_fitted(self): """ Returns true if the visualizer has been fit. """ - return hasattr(self, 'feature_importances_') and hasattr(self, 'features_') + return hasattr(self, "feature_importances_") and hasattr(self, "features_") ########################################################################## ## Quick Method ########################################################################## -def feature_importances(model, X, y=None, ax=None, labels=None, - relative=True, absolute=False, xlabel=None, - stack=False, **kwargs): - """ + +def feature_importances( + model, + X, + y=None, + ax=None, + labels=None, + relative=True, + absolute=False, + xlabel=None, + stack=False, + colors=None, + colormap=None, + is_fitted="auto", + **kwargs +): + """Quick Method: Displays the most informative features in a model by showing a bar chart of features ranked by their importances. Although primarily a feature engineering mechanism, this visualizer requires a model that has either a @@ -326,7 +378,9 @@ def feature_importances(model, X, y=None, ax=None, labels=None, ---------- model : Estimator A Scikit-Learn estimator that learns feature importances. Must support - either ``coef_`` or ``feature_importances_`` parameters. + either ``coef_`` or ``feature_importances_`` parameters. If the estimator + is not fitted, it is fit when the visualizer is fitted, unless otherwise + specified by ``is_fitted``. 
X : ndarray or DataFrame of shape n x m A matrix of n instances with m features @@ -360,22 +414,45 @@ def feature_importances(model, X, y=None, ax=None, labels=None, then a stacked bar plot is plotted; otherwise the mean of the feature importance across classes are plotted. + colors: list of strings + Specify colors for each bar in the chart if ``stack==False``. + + colormap : string or matplotlib cmap + Specify a colormap to color the classes if ``stack==True``. + + is_fitted : bool or str, default='auto' + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If 'auto' (default), a helper method will check if the estimator + is fitted before fitting it again. + kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. Returns ------- - ax : matplotlib axes - Returns the axes that the parallel coordinates were drawn on. + viz : FeatureImportances + The feature importances visualizer, fitted and finalized. 
""" # Instantiate the visualizer visualizer = FeatureImportances( - model, ax, labels, relative, absolute, xlabel, stack, **kwargs) + model, + ax=ax, + labels=labels, + relative=relative, + absolute=absolute, + xlabel=xlabel, + stack=stack, + colors=colors, + colormap=colormap, + is_fitted=is_fitted, + **kwargs + ) # Fit and transform the visualizer (calls draw) visualizer.fit(X, y) visualizer.finalize() - # Return the axes object on the visualizer - return visualizer.ax + # Return the visualizer + return visualizer diff --git a/yellowbrick/model_selection/learning_curve.py b/yellowbrick/model_selection/learning_curve.py index ff367af51..07ca55162 100644 --- a/yellowbrick/model_selection/learning_curve.py +++ b/yellowbrick/model_selection/learning_curve.py @@ -1,10 +1,13 @@ # yellowbrick.model_selection.learning_curve # Implements a learning curve visualization for model selection. # -# Author: Jason Keung +# Author: Jason Keung # Created: Mon May 22 09:22:00 2017 -0500 # -# ID: learning_curve.py [] jason.s.keung@gmail.com $ +# Copyright (C) 2017 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: learning_curve.py [c5355ee] benjamin@bengfort.com $ """ Implements a learning curve visualization for model selection. @@ -28,9 +31,10 @@ ########################################################################## -## LearningCurve Visualizer +# LearningCurve Visualizer ########################################################################## + class LearningCurve(ModelVisualizer): """ Visualizes the learning curve for both test and training data for @@ -84,7 +88,7 @@ class LearningCurve(ModelVisualizer): - An iterable yielding train/test splits. see the scikit-learn - `cross-validation guide `_ + `cross-validation guide `_ for more information on the possible strategies that can be used here. 
scoring : string, callable or None, optional, default: None @@ -156,18 +160,29 @@ class LearningCurve(ModelVisualizer): ----- This visualizer is essentially a wrapper for the ``sklearn.model_selection.learning_curve utility``, discussed in the - `validation curves `_ + `validation curves `__ documentation. .. seealso:: The documentation for the - `learning_curve `_ + `learning_curve `__ function, which this visualizer wraps. """ - def __init__(self, model, ax=None, groups=None, - train_sizes=DEFAULT_TRAIN_SIZES, cv=None, scoring=None, - exploit_incremental_learning=False, n_jobs=1, - pre_dispatch="all", shuffle=False, random_state=None, - **kwargs): + + def __init__( + self, + model, + ax=None, + groups=None, + train_sizes=DEFAULT_TRAIN_SIZES, + cv=None, + scoring=None, + exploit_incremental_learning=False, + n_jobs=1, + pre_dispatch="all", + shuffle=False, + random_state=None, + **kwargs + ): # Initialize the model visualizer super(LearningCurve, self).__init__(model, ax=ax, **kwargs) @@ -177,14 +192,20 @@ def __init__(self, model, ax=None, groups=None, if train_sizes.ndim != 1: raise YellowbrickValueError( "must specify array of train sizes, '{}' is not valid".format( - repr(train_sizes) - )) + repr(train_sizes) + ) + ) # Set the metric parameters to be used later self.set_params( - groups=groups, train_sizes=train_sizes, cv=cv, scoring=scoring, + groups=groups, + train_sizes=train_sizes, + cv=cv, + scoring=scoring, exploit_incremental_learning=exploit_incremental_learning, - n_jobs=n_jobs, pre_dispatch=pre_dispatch, shuffle=shuffle, + n_jobs=n_jobs, + pre_dispatch=pre_dispatch, + shuffle=shuffle, random_state=random_state, ) @@ -214,9 +235,15 @@ def fit(self, X, y=None): sklc_kwargs = { key: self.get_params()[key] for key in ( - 'groups', 'train_sizes', 'cv', 'scoring', - 'exploit_incremental_learning', 'n_jobs', - 'pre_dispatch', 'shuffle', 'random_state', + "groups", + "train_sizes", + "cv", + "scoring", + "exploit_incremental_learning", + "n_jobs", + 
"pre_dispatch", + "shuffle", + "random_state", ) } @@ -254,15 +281,13 @@ def draw(self, **kwargs): for idx, (mean, std) in enumerate(curves): # Plot one standard deviation above and below the mean self.ax.fill_between( - self.train_sizes_, mean - std, mean+std, alpha=0.25, - color=colors[idx], + self.train_sizes_, mean - std, mean + std, alpha=0.25, color=colors[idx] ) # Plot the mean curves so they are in front of the variance fill for idx, (mean, _) in enumerate(curves): self.ax.plot( - self.train_sizes_, mean, 'o-', color=colors[idx], - label=labels[idx], + self.train_sizes_, mean, "o-", color=colors[idx], label=labels[idx] ) return self.ax @@ -272,25 +297,37 @@ def finalize(self, **kwargs): Add the title, legend, and other visual final touches to the plot. """ # Set the title of the figure - self.set_title('Learning Curve for {}'.format(self.name)) + self.set_title("Learning Curve for {}".format(self.name)) # Add the legend - self.ax.legend(frameon=True, loc='best') + self.ax.legend(frameon=True, loc="best") # Set the axis labels - self.ax.set_xlabel('Training Instances') - self.ax.set_ylabel('Score') + self.ax.set_xlabel("Training Instances") + self.ax.set_ylabel("Score") ########################################################################## -## Quick Methods +# Quick Methods ########################################################################## -def learning_curve(model, X, y, ax=None, groups=None, - train_sizes=DEFAULT_TRAIN_SIZES, cv=None, scoring=None, - exploit_incremental_learning=False, n_jobs=1, - pre_dispatch="all", shuffle=False, random_state=None, - **kwargs): + +def learning_curve( + model, + X, + y, + ax=None, + groups=None, + train_sizes=DEFAULT_TRAIN_SIZES, + cv=None, + scoring=None, + exploit_incremental_learning=False, + n_jobs=1, + pre_dispatch="all", + shuffle=False, + random_state=None, + **kwargs +): """ Displays a learning curve based on number of samples vs training and cross validation scores. 
The learning curve aims to show how a model @@ -341,7 +378,7 @@ def learning_curve(model, X, y, ax=None, groups=None, - An iterable yielding train/test splits. see the scikit-learn - `cross-validation guide `_ + `cross-validation guide `_ for more information on the possible strategies that can be used here. scoring : string, callable or None, optional, default: None @@ -379,18 +416,25 @@ def learning_curve(model, X, y, ax=None, groups=None, Returns ------- - ax : matplotlib axes - Returns the axes that the learning curve were drawn on. + visualizer : LearningCurve + Returns the fitted visualizer. """ # Initialize the visualizer oz = LearningCurve( - model, ax=ax, groups=groups, train_sizes=train_sizes, cv=cv, - scoring=scoring, n_jobs=n_jobs, pre_dispatch=pre_dispatch, - shuffle=shuffle, random_state=random_state, + model, + ax=ax, + groups=groups, + train_sizes=train_sizes, + cv=cv, + scoring=scoring, + n_jobs=n_jobs, + pre_dispatch=pre_dispatch, + shuffle=shuffle, + random_state=random_state, exploit_incremental_learning=exploit_incremental_learning, ) # Fit and poof the visualizer oz.fit(X, y) oz.poof(**kwargs) - return oz.ax + return oz diff --git a/yellowbrick/features/rfecv.py b/yellowbrick/model_selection/rfecv.py similarity index 88% rename from yellowbrick/features/rfecv.py rename to yellowbrick/model_selection/rfecv.py index 0b5a65a14..3d75ff951 100644 --- a/yellowbrick/features/rfecv.py +++ b/yellowbrick/model_selection/rfecv.py @@ -1,10 +1,13 @@ -# yellowbrick.features.rfecv +# yellowbrick.model_selection.rfecv # Visualize the number of features selected with recursive feature elimination # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Tue Apr 03 17:31:37 2018 -0400 # -# ID: rfecv.py [] benjamin@bengfort.com $ +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: rfecv.py [a4599db] rebeccabilbro@users.noreply.github.com $ """ Visualize the number of features selected using recursive 
feature elimination @@ -28,6 +31,7 @@ ## Recursive Feature Elimination ########################################################################## + class RFECV(ModelVisualizer): """ Recursive Feature Elimination, Cross-Validated (RFECV) feature selection. @@ -112,6 +116,10 @@ class RFECV(ModelVisualizer): functions such as ``predict()`` and ``score()`` are passed through to this estimator (it rewraps the original model). + n_feature_subsets_ : array of shape [n_subsets_of_features] + The number of features removed on each iteration of RFE, computed by the + number of features in the dataset and the step parameter. + Notes ----- This model wraps ``sklearn.feature_selection.RFE`` and not @@ -129,8 +137,9 @@ class RFECV(ModelVisualizer): or ``feature_importances_`` attribute when fitted. """ - def __init__(self, model, ax=None, step=1, groups=None, cv=None, - scoring=None, **kwargs): + def __init__( + self, model, ax=None, step=1, groups=None, cv=None, scoring=None, **kwargs + ): # Initialize the model visualizer super(RFECV, self).__init__(model, ax=ax, **kwargs) @@ -167,23 +176,20 @@ def fit(self, X, y=None): else: step = int(self.step) - if step < 0: + if step <= 0: raise YellowbrickValueError("step must be >0") # Create the RFE model rfe = RFE(self.estimator, step=step) - n_feature_subsets = np.arange(1, n_features+1) + self.n_feature_subsets_ = np.arange(1, n_features + step, step) # Create the cross validation params # TODO: handle random state - cv_params = { - key: self.get_params()[key] - for key in ('groups', 'cv', 'scoring') - } + cv_params = {key: self.get_params()[key] for key in ("groups", "cv", "scoring")} # Perform cross-validation for each feature subset scores = [] - for n_features_to_select in n_feature_subsets: + for n_features_to_select in self.n_feature_subsets_: rfe.set_params(n_features_to_select=n_features_to_select) scores.append(cross_val_score(rfe, X, y, **cv_params)) @@ -192,7 +198,7 @@ def fit(self, X, y=None): # Find the best RFE model 
bestidx = self.cv_scores_.mean(axis=1).argmax() - self.n_features_ = n_feature_subsets[bestidx] + self.n_features_ = self.n_feature_subsets_[bestidx] # Fit the final RFE model for the number of features self.rfe_estimator_ = rfe @@ -214,23 +220,24 @@ def draw(self, **kwargs): Renders the rfecv curve. """ # Compute the curves - x = np.arange(1, len(self.cv_scores_)+1) + x = self.n_feature_subsets_ means = self.cv_scores_.mean(axis=1) sigmas = self.cv_scores_.std(axis=1) - # Plot one standard deviation above and below the mean - self.ax.fill_between(x, means - sigmas, means+sigmas, alpha=0.25) + self.ax.fill_between(x, means - sigmas, means + sigmas, alpha=0.25) # Plot the curve - self.ax.plot(x, means, 'o-') + self.ax.plot(x, means, "o-") # Plot the maximum number of features self.ax.axvline( - self.n_features_, c='k', ls='--', + self.n_features_, + c="k", + ls="--", label="n_features = {}\nscore = {:0.3f}".format( self.n_features_, self.cv_scores_.mean(axis=1).max() - ) + ), ) return self.ax @@ -240,22 +247,22 @@ def finalize(self, **kwargs): Add the title, legend, and other visual final touches to the plot. 
""" # Set the title of the figure - self.set_title('RFECV for {}'.format(self.name)) + self.set_title("RFECV for {}".format(self.name)) # Add the legend - self.ax.legend(frameon=True, loc='best') + self.ax.legend(frameon=True, loc="best") # Set the axis labels - self.ax.set_xlabel('Number of Features Selected') - self.ax.set_ylabel('Score') + self.ax.set_xlabel("Number of Features Selected") + self.ax.set_ylabel("Score") ########################################################################## ## Quick Methods ########################################################################## -def rfecv(model, X, y, ax=None, step=1, groups=None, cv=None, - scoring=None, **kwargs): + +def rfecv(model, X, y, ax=None, step=1, groups=None, cv=None, scoring=None, **kwargs): """ Performs recursive feature elimination with cross-validation to determine an optimal number of features for a model. Visualizes the feature subsets @@ -319,8 +326,8 @@ def rfecv(model, X, y, ax=None, step=1, groups=None, cv=None, Returns ------- - ax : matplotlib axes - Returns the axes that the rfecv were drawn on. + viz : RFECV + Returns the fitted, finalized visualizer. """ # Initialize the visualizer oz = RFECV(model, ax=ax, step=step, groups=groups, cv=cv, scoring=scoring) @@ -328,4 +335,5 @@ def rfecv(model, X, y, ax=None, step=1, groups=None, cv=None, # Fit and poof the visualizer oz.fit(X, y) oz.poof(**kwargs) - return oz.ax + + return oz diff --git a/yellowbrick/model_selection/validation_curve.py b/yellowbrick/model_selection/validation_curve.py index 3dd9ca356..f7b012f23 100644 --- a/yellowbrick/model_selection/validation_curve.py +++ b/yellowbrick/model_selection/validation_curve.py @@ -1,17 +1,20 @@ # yellowbrick.model_selection.validation_curve # Implements a visual validation curve for a hyperparameter. 
# -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Sat Mar 31 06:27:28 2018 -0400 # -# ID: validation_curve.py [] benjamin@bengfort.com $ +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: validation_curve.py [c5355ee] benjamin@bengfort.com $ """ Implements a visual validation curve for a hyperparameter. """ ########################################################################## -## Imports +# Imports ########################################################################## import numpy as np @@ -24,9 +27,10 @@ ########################################################################## -## ValidationCurve visualizer +# ValidationCurve visualizer ########################################################################## + class ValidationCurve(ModelVisualizer): """ Visualizes the validation curve for both test and training data for a @@ -83,7 +87,7 @@ class ValidationCurve(ModelVisualizer): - An iterable yielding train/test splits. see the scikit-learn - `cross-validation guide `_ + `cross-validation guide `_ for more information on the possible strategies that can be used here. scoring : string, callable or None, optional, default: None @@ -137,18 +141,29 @@ class ValidationCurve(ModelVisualizer): Notes ----- This visualizer is essentially a wrapper for the - ``sklearn.model_selection.validation_curve utility``, discussed in the - `validation curves `_ + ``sklearn.model_selection.validation_curve utility``, discussed in the + `validation curves `__ documentation. .. seealso:: The documentation for the - `validation_curve `_ + `validation_curve `__ function, which this visualizer wraps. 
""" - def __init__(self, model, param_name, param_range, ax=None, logx=False, - groups=None, cv=None, scoring=None, n_jobs=1, - pre_dispatch="all", **kwargs): + def __init__( + self, + model, + param_name, + param_range, + ax=None, + logx=False, + groups=None, + cv=None, + scoring=None, + n_jobs=1, + pre_dispatch="all", + **kwargs + ): # Initialize the model visualizer super(ValidationCurve, self).__init__(model, ax=ax, **kwargs) @@ -159,16 +174,21 @@ def __init__(self, model, param_name, param_range, ax=None, logx=False, raise YellowbrickValueError( "must specify array of param values, '{}' is not valid".format( repr(param_range) - )) + ) + ) # Set the visual and validation curve parameters on the estimator self.set_params( - param_name=param_name, param_range=param_range, logx=logx, - groups=groups, cv=cv, scoring=scoring, n_jobs=n_jobs, + param_name=param_name, + param_range=param_range, + logx=logx, + groups=groups, + cv=cv, + scoring=scoring, + n_jobs=n_jobs, pre_dispatch=pre_dispatch, ) - def fit(self, X, y=None): """ Fits the validation curve with the wrapped estimator and parameter @@ -195,8 +215,13 @@ def fit(self, X, y=None): skvc_kwargs = { key: self.get_params()[key] for key in ( - 'param_name', 'param_range', 'groups', 'cv', 'scoring', - 'n_jobs', 'pre_dispatch', + "param_name", + "param_range", + "groups", + "cv", + "scoring", + "n_jobs", + "pre_dispatch", ) } @@ -234,19 +259,17 @@ def draw(self, **kwargs): for idx, (mean, std) in enumerate(curves): # Plot one standard deviation above and below the mean self.ax.fill_between( - self.param_range, mean - std, mean+std, alpha=0.25, - color=colors[idx], + self.param_range, mean - std, mean + std, alpha=0.25, color=colors[idx] ) # Plot the mean curves so they are in front of the variance fill for idx, (mean, _) in enumerate(curves): self.ax.plot( - self.param_range, mean, 'd-', color=colors[idx], - label=labels[idx], + self.param_range, mean, "d-", color=colors[idx], label=labels[idx] ) if self.logx: - 
self.ax.set_xscale('log') + self.ax.set_xscale("log") return self.ax @@ -255,23 +278,36 @@ def finalize(self, **kwargs): Add the title, legend, and other visual final touches to the plot. """ # Set the title of the figure - self.set_title('Validation Curve for {}'.format(self.name)) + self.set_title("Validation Curve for {}".format(self.name)) # Add the legend - self.ax.legend(frameon=True, loc='best') + self.ax.legend(frameon=True, loc="best") # Set the axis labels self.ax.set_xlabel(self.param_name) - self.ax.set_ylabel('score') + self.ax.set_ylabel("score") ########################################################################## -## Quick Method +# Quick Method ########################################################################## -def validation_curve(model, X, y, param_name, param_range, ax=None, logx=False, - groups=None, cv=None, scoring=None, n_jobs=1, - pre_dispatch="all", **kwargs): + +def validation_curve( + model, + X, + y, + param_name, + param_range, + ax=None, + logx=False, + groups=None, + cv=None, + scoring=None, + n_jobs=1, + pre_dispatch="all", + **kwargs +): """ Displays a validation curve for the specified param and values, plotting both the train and cross-validated test scores. The validation curve is a @@ -324,7 +360,7 @@ def validation_curve(model, X, y, param_name, param_range, ax=None, logx=False, - An iterable yielding train/test splits. see the scikit-learn - `cross-validation guide `_ + `cross-validation guide `_ for more information on the possible strategies that can be used here. scoring : string, callable or None, optional, default: None @@ -348,17 +384,25 @@ def validation_curve(model, X, y, param_name, param_range, ax=None, logx=False, Returns ------- - ax : matplotlib.Axes - The axes object that the validation curves were drawn on. 
+ visualizer : ValidationCurve + The fitted visualizer """ # Initialize the visualizer oz = ValidationCurve( - model, param_name, param_range, ax=ax, logx=logx, groups=groups, - cv=cv, scoring=scoring, n_jobs=n_jobs, pre_dispatch=pre_dispatch + model, + param_name, + param_range, + ax=ax, + logx=logx, + groups=groups, + cv=cv, + scoring=scoring, + n_jobs=n_jobs, + pre_dispatch=pre_dispatch, ) # Fit and poof the visualizer oz.fit(X, y) oz.poof(**kwargs) - return oz.ax + return oz diff --git a/yellowbrick/pipeline.py b/yellowbrick/pipeline.py index 8541b3ea4..481cc8845 100644 --- a/yellowbrick/pipeline.py +++ b/yellowbrick/pipeline.py @@ -1,10 +1,10 @@ # yellowbrick.pipeline # Implements a visual pipeline that subclasses Scikit-Learn pipelines. # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Fri Oct 07 21:41:06 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: pipeline.py [1efae1f] benjamin@bengfort.com $ @@ -27,6 +27,7 @@ ## Visual Pipeline ########################################################################## + class VisualPipeline(Pipeline): """Pipeline of transforms and visualizers with a final estimator. @@ -68,10 +69,7 @@ class VisualPipeline(Pipeline): @property def visual_steps(self): - return dict( - step for step in self.steps - if isinstance(step[1], Visualizer) - ) + return dict(step for step in self.steps if isinstance(step[1], Visualizer)) def poof(self, outdir=None, ext=".pdf", **kwargs): """ @@ -92,13 +90,18 @@ def poof(self, outdir=None, ext=".pdf", **kwargs): kwargs : dict Keyword arguments to pass to the ``poof()`` method of all steps. 
""" + axes = [] for name, step in self.visual_steps.items(): if outdir is not None: outpath = path.join(outdir, slugify(name) + ext) else: outpath = None - step.poof(outpath=outpath, **kwargs) + ax = step.poof(outpath=outpath, **kwargs) + axes.append(ax) + + # Return axes array to ensure figures are shown in notebook + return axes def fit_transform_poof(self, X, y=None, outpath=None, **kwargs): """ diff --git a/yellowbrick/regressor/__init__.py b/yellowbrick/regressor/__init__.py index 9270f3405..b908e7616 100644 --- a/yellowbrick/regressor/__init__.py +++ b/yellowbrick/regressor/__init__.py @@ -1,10 +1,10 @@ # yellowbrick.regressor # Visualizers for Regression analysis and diagnostics # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Mon Mar 06 12:23:55 2017 -0500 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: __init__.py [7d3f5e6] benjamin@bengfort.com $ @@ -22,3 +22,4 @@ from .base import * from .residuals import * from .alphas import * +from .influence import * diff --git a/yellowbrick/regressor/alphas.py b/yellowbrick/regressor/alphas.py index e299d6e27..77a2f7cf0 100644 --- a/yellowbrick/regressor/alphas.py +++ b/yellowbrick/regressor/alphas.py @@ -1,10 +1,11 @@ # yellowbrick.regressor.alphas # Implements alpha selection visualizers for regularization # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort +# Author: Rebecca Bilbro # Created: Mon Mar 06 19:22:07 2017 -0500 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: alphas.py [7d3f5e6] benjamin@bengfort.com $ @@ -21,22 +22,21 @@ from functools import partial -from .base import RegressionScoreVisualizer -from ..exceptions import YellowbrickTypeError -from ..exceptions import YellowbrickValueError +from yellowbrick.exceptions import YellowbrickTypeError +from yellowbrick.exceptions import 
YellowbrickValueError +from yellowbrick.regressor.base import RegressionScoreVisualizer from sklearn.model_selection import cross_val_score ## Packages for export -__all__ = [ - "AlphaSelection", "ManualAlphaSelection" -] +__all__ = ["AlphaSelection", "ManualAlphaSelection"] ########################################################################## ## AlphaSelection Visualizer ########################################################################## + class AlphaSelection(RegressionScoreVisualizer): """ The Alpha Selection Visualizer demonstrates how different values of alpha @@ -68,13 +68,20 @@ class AlphaSelection(RegressionScoreVisualizer): model : a Scikit-Learn regressor Should be an instance of a regressor, and specifically one whose name ends with "CV" otherwise a will raise a YellowbrickTypeError exception - on instantiation. To use non-CV regressors see: - ``ManualAlphaSelection``. + on instantiation. To use non-CV regressors see: ``ManualAlphaSelection``. + If the estimator is not fitted, it is fit when the visualizer is fitted, + unless otherwise specified by ``is_fitted``. ax : matplotlib Axes, default: None The axes to plot the figure on. If None is passed in the current axes will be used (or generated if required). + is_fitted : bool or str, default='auto' + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If 'auto' (default), a helper method will check if the estimator + is fitted before fitting it again. + kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. @@ -108,18 +115,20 @@ class AlphaSelection(RegressionScoreVisualizer): For RidgeCV, make sure ``store_cv_values=True``. 
""" - def __init__(self, model, ax=None, **kwargs): + def __init__(self, model, ax=None, is_fitted="auto", **kwargs): # Check to make sure this is a "RegressorCV" name = model.__class__.__name__ if not name.endswith("CV"): - raise YellowbrickTypeError(( - "'{}' is not a CV regularization model;" - " try ManualAlphaSelection instead." - ).format(name)) + raise YellowbrickTypeError( + ( + "'{}' is not a CV regularization model;" + " try ManualAlphaSelection instead." + ).format(name) + ) # Set the store_cv_values parameter on RidgeCV - if 'store_cv_values' in model.get_params().keys(): + if "store_cv_values" in model.get_params().keys(): model.set_params(store_cv_values=True) # Call super to initialize the class @@ -130,16 +139,13 @@ def fit(self, X, y, **kwargs): A simple pass-through method; calls fit on the estimator and then draws the alpha-error plot. """ - self.estimator.fit(X, y, **kwargs) + # Fit the underlying model + super(AlphaSelection, self).fit(X, y, **kwargs) + + # Draw the alpha to error curve self.draw() return self - def score(self, X, y, **kwargs): - """ - Simply returns the score of the underlying CV model - """ - return self.estimator.score(X, y, **kwargs) - def draw(self): """ Draws the alpha plot based on the values on the estimator. 
@@ -148,16 +154,15 @@ def draw(self): alphas = self._find_alphas_param() errors = self._find_errors_param() - - alpha = self.estimator.alpha_ # Get decision from the estimator - name = self.name[:-2].lower() # Remove the CV from the label + alpha = self.estimator.alpha_ # Get decision from the estimator + name = self.name[:-2].lower() # Remove the CV from the label # Plot the alpha against the error self.ax.plot(alphas, errors, label=name) # Draw a dashed vline at the alpha label = "$\\alpha={:0.3f}$".format(alpha) - self.ax.axvline(alpha, color='k', linestyle='dashed', label=label) + self.ax.axvline(alpha, color="k", linestyle="dashed", label=label) return self.ax @@ -167,16 +172,14 @@ def finalize(self): X and Y axis labels and adding the legend. """ # Set the title - self.set_title( - '{} Alpha Error'.format(self.name) - ) + self.set_title("{} Alpha Error".format(self.name)) # Set the x and y labels self.ax.set_xlabel("alpha") self.ax.set_ylabel("error (or score)") # Set the legend - self.ax.legend(loc='best', frameon=True) + self.ax.legend(loc="best", frameon=True) def _find_alphas_param(self): """ @@ -186,7 +189,7 @@ def _find_alphas_param(self): """ # NOTE: The order of the search is very important! - for attr in ("cv_alphas_", "alphas_", "alphas",): + for attr in ("cv_alphas_", "alphas_", "alphas"): try: return getattr(self.estimator, attr) except AttributeError: @@ -206,10 +209,10 @@ def _find_errors_param(self): """ # NOTE: The order of the search is very important! 
- if hasattr(self.estimator, 'mse_path_'): + if hasattr(self.estimator, "mse_path_"): return self.estimator.mse_path_.mean(1) - if hasattr(self.estimator, 'cv_values_'): + if hasattr(self.estimator, "cv_values_"): return self.estimator.cv_values_.mean(0) raise YellowbrickValueError( @@ -218,10 +221,12 @@ def _find_errors_param(self): ) ) + ########################################################################## ## ManualAlphaSelection Visualizer ########################################################################## + class ManualAlphaSelection(AlphaSelection): """ The ``AlphaSelection`` visualizer requires a "RegressorCV", that is a @@ -235,9 +240,9 @@ class ManualAlphaSelection(AlphaSelection): Parameters ---------- - model : a Scikit-Learn regressor - Should be an instance of a regressor, and specifically one whose name - doesn't end with "CV". The regressor must support a call to + model : an unfitted Scikit-Learn regressor + Should be an instance of an unfitted regressor, and specifically one + whose name doesn't end with "CV". The regressor must support a call to ``set_params(alpha=alpha)`` and be fit multiple times. If the regressor name ends with "CV" a ``YellowbrickValueError`` is raised. @@ -294,16 +299,16 @@ class ManualAlphaSelection(AlphaSelection): "RegressorCV" estimators. """ - def __init__(self, model, ax=None, alphas=None, - cv=None, scoring=None, **kwargs): + def __init__(self, model, ax=None, alphas=None, cv=None, scoring=None, **kwargs): # Check to make sure this is not a "RegressorCV" name = model.__class__.__name__ if name.endswith("CV"): - raise YellowbrickTypeError(( - "'{}' is a CV regularization model;" - " try AlphaSelection instead." - ).format(name)) + raise YellowbrickTypeError( + ( + "'{}' is a CV regularization model;" " try AlphaSelection instead." 
+ ).format(name) + ) # Call super to initialize the class super(ManualAlphaSelection, self).__init__(model, ax=ax, **kwargs) @@ -334,12 +339,6 @@ def fit(self, X, y, **args): # Always make sure to return self from fit return self - def score(self, X, y, **kwargs): - """ - Simply returns the score of the underlying CV model - """ - return self.estimator.score(X, y, **kwargs) - def draw(self): """ Draws the alphas values against their associated error in a similar @@ -351,11 +350,69 @@ def draw(self): # Draw a dashed vline at the alpha with maximal error alpha = self.alphas[np.where(self.errors == self.errors.max())][0] label = "$\\alpha_{{max}}={:0.3f}$".format(alpha) - self.ax.axvline(alpha, color='k', linestyle='dashed', label=label) + self.ax.axvline(alpha, color="k", linestyle="dashed", label=label) # Draw a dashed vline at the alpha with minimal error alpha = self.alphas[np.where(self.errors == self.errors.min())][0] label = "$\\alpha_{{min}}={:0.3f}$".format(alpha) - self.ax.axvline(alpha, color='k', linestyle='dashed', label=label) + self.ax.axvline(alpha, color="k", linestyle="dashed", label=label) return self.ax + + +########################################################################## +## Quick Method +########################################################################## + + +def alphas(model, X, y=None, ax=None, is_fitted="auto", **kwargs): + """Quick Method: + The Alpha Selection Visualizer demonstrates how different values of alpha + influence model selection during the regularization of linear models. + Generally speaking, alpha increases the effect of regularization, e.g. if + alpha is zero there is no regularization and the higher the alpha, the + more the regularization parameter influences the final model. + + Parameters + ---------- + + model : a Scikit-Learn regressor + Should be an instance of a regressor, and specifically one whose name + ends with "CV" otherwise it will raise a YellowbrickTypeError exception + on instantiation. 
To use non-CV regressors see: ``ManualAlphaSelection``. + If the estimator is not fitted, it is fit when the visualizer is fitted, + unless otherwise specified by ``is_fitted``. + + X : ndarray or DataFrame of shape n x m + A matrix of n instances with m features. + + y : ndarray or Series of length n + An array or series of target values. + + ax : matplotlib Axes, default: None + The axes to plot the figure on. If None is passed in the current axes + will be used (or generated if required). + + is_fitted : bool or str, default='auto' + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If 'auto' (default), a helper method will check if the estimator + is fitted before fitting it again. + + kwargs : dict + Keyword arguments that are passed to the base class and may influence + the visualization as defined in other Visualizers. + + Returns + ------- + visualizer : AlphaSelection + Returns the alpha selection visualizer + """ + # Instantiate the visualizer + visualizer = AlphaSelection(model, ax, is_fitted=is_fitted, **kwargs) + + visualizer.score(X, y) + visualizer.finalize() + + # Return the visualizer + return visualizer diff --git a/yellowbrick/regressor/base.py b/yellowbrick/regressor/base.py index 6fcbc0a21..77782ccc4 100644 --- a/yellowbrick/regressor/base.py +++ b/yellowbrick/regressor/base.py @@ -1,11 +1,11 @@ # yellowbrick.regressor.base # Base classes for regressor Visualizers. 
# -# Author: Rebecca Bilbro -# Author: Benjamin Bengfort +# Author: Rebecca Bilbro +# Author: Benjamin Bengfort # Created: Fri Jun 03 10:30:36 2016 -0700 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: base.py [7d3f5e6] benjamin@bengfort.com $ @@ -24,43 +24,86 @@ ## Packages for export -__all__ = [ - "RegressionScoreVisualizer", -] +__all__ = ["RegressionScoreVisualizer"] ########################################################################## ## Regression Visualization Base Object ########################################################################## + class RegressionScoreVisualizer(ScoreVisualizer): - """ - Base class for all ScoreVisualizers that evaluate a regression estimator. + """Base class for regressor model selection. + + The RegressionScoreVisualizer wraps a regression model to produce a + visualization when the score method is called, usually to allow the user + to effectively compare the performance between models. + + The base class provides helper functionality to ensure that regression + visualizers consistently store the trained score for access post visualization + and that a correct regressor is passed to the visualizer. + + Parameters + ---------- + model : estimator + A scikit-learn estimator that should be a regressor. If the model is + not a regressor, an exception is raised. + + ax : matplotlib Axes, default: None + The axis to plot the figure on. If None is passed in the current axes + will be used (or generated if required). - The primary functionality of this class is to perform a check to ensure - the passed in estimator is a regressor, otherwise it raises a - ``YellowbrickTypeError``. + fig : matplotlib Figure, default: None + The figure to plot the Visualizer on. If None is passed in the current + plot will be used (or generated if required). 
+ + force_model : bool, default: False + Do not check to ensure that the underlying estimator is a regressor. This + will prevent an exception when the visualizer is initialized but may result + in unexpected or unintended behavior. + + kwargs: dict + Keyword arguments passed to the super class. + + Attributes + ---------- + score_ : float + An evaluation metric of the regressor on test data produced when + ``score()`` is called. This metric is between 0 and 1 -- higher scores are + generally better. For regressors, this score is usually the r2_score, but + ensure you check the underlying model for more details about the metric. """ - def __init__(self, model, ax=None, **kwargs): - if not isregressor(model): + def __init__(self, model, ax=None, fig=None, force_model=False, **kwargs): + if not force_model and not isregressor(model): raise YellowbrickTypeError( "This estimator is not a regressor; try a classifier or " "clustering score visualizer instead!" - ) + ) - super(RegressionScoreVisualizer, self).__init__(model, ax=ax, **kwargs) + self.force_model = force_model + super(RegressionScoreVisualizer, self).__init__(model, ax=ax, fig=fig, **kwargs) def score(self, X, y, **kwargs): """ - The score method is the primary entry point for drawing. + The score function is the hook for visual interaction. Pass in test + data and the visualizer will create predictions on the data and + evaluate them with respect to the test values. The evaluation will + then be passed to draw() and the result of the estimator score will + be returned. 
+ + Parameters + ---------- + X : array-like + X (also X_test) are the independent variables of test set to predict + + y : array-like + y (also y_test) is the dependent actual variables to score against Returns ------- score : float The R^2 score of the underlying regressor """ - raise NotImplementedError( - "Subclasses of RegressionScoreVisualizer must implement score " - " and return an R^2 score of the underlying estimator" - ) + self.score_ = self.estimator.score(X, y) + return self.score_ diff --git a/yellowbrick/regressor/influence.py b/yellowbrick/regressor/influence.py new file mode 100644 index 000000000..0be2da23b --- /dev/null +++ b/yellowbrick/regressor/influence.py @@ -0,0 +1,282 @@ +# yellowbrick.regressor.influence +# Visualize the influence and leverage of individual instances on a regression model. +# +# Author: Benjamin Bengfort +# Created: Sun Jun 09 15:21:17 2019 -0400 +# +# Copyright (C) 2019 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: influence.py [fe14cfd] benjamin@bengfort.com $ + +""" +Visualize the influence and leverage of individual instances on a regression model. +""" + +########################################################################## +## Imports +########################################################################## + +import numpy as np +import scipy as sp + +from yellowbrick.base import Visualizer +from sklearn.linear_model import LinearRegression + + +########################################################################## +## Cook's Distance +########################################################################## + + +class CooksDistance(Visualizer): + """ + Cook's Distance is a measure of how influential an instance is to the computation of + a regression, e.g. if the instance is removed would the estimated coefficients of the + underlying model be substantially changed? Because of this, Cook's Distance is + generally used to detect outliers in standard, OLS regression. 
In fact, a general + rule of thumb is that D(i) > 4/n is a good threshold for determining highly + influential points as outliers and this visualizer can report the percentage of data + that is above that threshold. + + This implementation of Cook's Distance assumes Ordinary Least Squares regression, + and therefore embeds a ``sklearn.linear_model.LinearRegression`` under the hood. + Distance is computed via the non-whitened leverage of the projection matrix, + computed inside of ``fit()``. The results of this visualizer are therefore similar + to, but not as advanced, as a similar computation using statsmodels. Computing the + influence for other regression models requires leave one out validation and can be + expensive to compute. + + .. seealso:: + For a longer discussion on detecting outliers in regression and computing + leverage and influence, see `linear regression in python, outliers/leverage + detect `_ by Huiming Song. + + Parameters + ---------- + ax : matplotlib Axes, default: None + The axes to plot the figure on. If None is passed in the current axes + will be used (or generated if required). + + draw_threshold : bool, default: True + Draw a horizontal line at D(i) == 4/n to easily identify the most influential + points on the final regression. This will also draw a legend that specifies the + percentage of data points that are above the threshold. + + linefmt : str, default: 'C0-' + A string defining the properties of the vertical lines of the stem plot, usually + this will be a color or a color and a line style. The default is simply a solid + line with the first color of the color cycle. + + markerfmt : str, default: ',' + A string defining the properties of the markers at the stem plot heads. The + default is "pixel", e.g. basically no marker head at the top of the stem plot. + + kwargs : dict + Keyword arguments that are passed to the base class and may influence the final + visualization (e.g. size or title parameters). 
+ + Attributes + ---------- + distance_ : array, 1D + The Cook's distance value for each instance specified in ``X``, e.g. a 1D array + with shape ``(X.shape[0],)``. + + p_values_ : array, 1D + The p values associated with the F-test of Cook's distance distribution. A 1D + array whose shape matches ``distance_``. + + influence_threshold_ : float + A rule of thumb influence threshold to determine outliers in the regression + model, defined as It=4/n. + + outlier_percentage_ : float + The percentage of instances whose Cook's distance is greater than the influence + threshold, the percentage is 0.0 <= p <= 100.0. + + Notes + ----- + Cook's Distance is very similar to DFFITS, another diagnostic that is meant to show + how influential a point is in a statistical regression. Although the computed values + of Cook's and DFFITS are different, they are conceptually identical and there even + exists a closed-form formula to convert one value to another. Because of this, we + have chosen to implement Cook's distance rather than or in addition to DFFITS. + """ + + def __init__( + self, ax=None, draw_threshold=True, linefmt="C0-", markerfmt=",", **kwargs + ): + # Initialize the visualizer + super(CooksDistance, self).__init__(ax=ax, **kwargs) + + # Set "hyperparameters" + self.set_params( + draw_threshold=draw_threshold, linefmt=linefmt, markerfmt=markerfmt + ) + + # An internal LinearRegression used to compute the residuals and MSE + # This implementation doesn't support any regressor, it is OLS-specific + self._model = LinearRegression() + + def fit(self, X, y): + """ + Computes the leverage of X and uses the residuals of a + ``sklearn.linear_model.LinearRegression`` to compute the Cook's Distance of each + observation in X, their p-values and the number of outliers defined by the + number of observations supplied. + + Parameters + ---------- + X : array-like, 2D + The exogenous design matrix, e.g. training data. + + y : array-like, 1D + The endogenous response variable, e.g. 
target data. + + Returns + ------- + self : CooksDistance + Fit returns the visualizer instance. + """ + # Fit a linear model to X and y to compute MSE + self._model.fit(X, y) + + # Leverage is computed as the diagonal of the projection matrix of X + # TODO: whiten X before computing leverage + leverage = (X * np.linalg.pinv(X).T).sum(1) + + # Compute the rank and the degrees of freedom of the OLS model + rank = np.linalg.matrix_rank(X) + df = X.shape[0] - rank + + # Compute the MSE from the residuals + residuals = y - self._model.predict(X) + mse = np.dot(residuals, residuals) / df + + # Compute Cook's distance + residuals_studentized = residuals / np.sqrt(mse) / np.sqrt(1 - leverage) + self.distance_ = residuals_studentized ** 2 / X.shape[1] + self.distance_ *= leverage / (1 - leverage) + + # Compute the p-values of Cook's Distance + # TODO: honestly this was done because it was only in the statsmodels + # implementation... I have no idea what this is or why it's important. + self.p_values_ = sp.stats.f.sf(self.distance_, X.shape[1], df) + + # Compute the influence threshold rule of thumb + self.influence_threshold_ = 4 / X.shape[0] + self.outlier_percentage_ = ( + sum(self.distance_ > self.influence_threshold_) / X.shape[0] + ) + self.outlier_percentage_ *= 100.0 + + self.draw() + return self + + def draw(self): + """ + Draws a stem plot where each stem is the Cook's Distance of the instance at the + index specified by the x axis. Optionally draws a threshold line. 
+ """ + # Draw a stem plot with the influence for each instance + _, _, baseline = self.ax.stem( + self.distance_, linefmt=self.linefmt, markerfmt=self.markerfmt + ) + + # No padding on either side of the instance index + self.ax.set_xlim(0, len(self.distance_)) + + # Draw the threshold for most influential points + if self.draw_threshold: + label = r"{:0.2f}% > $I_t$ ($I_t=\frac {{4}} {{n}}$)".format( + self.outlier_percentage_ + ) + self.ax.axhline( + self.influence_threshold_, + ls="--", + label=label, + c=baseline.get_color(), + lw=baseline.get_linewidth(), + ) + + return self.ax + + def finalize(self): + """ + Prepares the visualization for presentation and reporting. + """ + # Set the title and axis labels + self.set_title("Cook's Distance Outlier Detection") + self.ax.set_xlabel("instance index") + self.ax.set_ylabel("influence (I)") + + # Only add the legend if the influence threshold has been plotted + if self.draw_threshold: + self.ax.legend(loc="best", frameon=True) + + +def cooks_distance( + X, y, ax=None, draw_threshold=True, linefmt="C0-", markerfmt=",", **kwargs +): + """ + Cook's Distance is a measure of how influential an instance is to the computation of + a regression, e.g. if the instance is removed would the estimated coefficients of the + underlying model be substantially changed? Because of this, Cook's Distance is + generally used to detect outliers in standard, OLS regression. In fact, a general + rule of thumb is that D(i) > 4/n is a good threshold for determining highly + influential points as outliers and this visualizer can report the percentage of data + that is above that threshold. + + This implementation of Cook's Distance assumes Ordinary Least Squares regression, + and therefore embeds a ``sklearn.linear_model.LinearRegression`` under the hood. + Distance is computed via the non-whitened leverage of the projection matrix, + computed inside of ``fit()``. 
The results of this visualizer are therefore similar + to, but not as advanced, as a similar computation using statsmodels. Computing the + influence for other regression models requires leave one out validation and can be + expensive to compute. + + .. seealso:: + For a longer discussion on detecting outliers in regression and computing + leverage and influence, see `linear regression in python, outliers/leverage + detect `_ by Huiming Song. + + Parameters + ---------- + X : array-like, 2D + The exogenous design matrix, e.g. training data. + + y : array-like, 1D + The endogenous response variable, e.g. target data. + + ax : matplotlib Axes, default: None + The axes to plot the figure on. If None is passed in the current axes + will be used (or generated if required). + + draw_threshold : bool, default: True + Draw a horizontal line at D(i) == 4/n to easily identify the most influential + points on the final regression. This will also draw a legend that specifies the + percentage of data points that are above the threshold. + + linefmt : str, default: 'C0-' + A string defining the properties of the vertical lines of the stem plot, usually + this will be a color or a color and a line style. The default is simply a solid + line with the first color of the color cycle. + + markerfmt: str, default: ',' + A string defining the properties of the markers at the stem plot heads. The + default is "pixel", e.g. basically no marker head at the top of the stem plot. + + kwargs : dict + Keyword arguments that are passed to the base class and may influence the final + visualization (e.g. size or title parameters). 
+ """ + viz = CooksDistance( + ax=ax, + draw_threshold=draw_threshold, + linefmt=linefmt, + markerfmt=markerfmt, + **kwargs + ) + viz.fit(X, y) + viz.finalize() + return viz diff --git a/yellowbrick/regressor/residuals.py b/yellowbrick/regressor/residuals.py index 4fa85c4c4..beb0de665 100644 --- a/yellowbrick/regressor/residuals.py +++ b/yellowbrick/regressor/residuals.py @@ -1,11 +1,11 @@ # yellowbrick.regressor.residuals # Regressor visualizers that score residuals: prediction vs. actual data. # -# Author: Rebecca Bilbro -# Author: Benjamin Bengfort +# Author: Rebecca Bilbro +# Author: Benjamin Bengfort # Created: Fri Jun 03 10:30:36 2016 -0700 # -# Copyright (C) 2016 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: residuals.py [7d3f5e6] benjamin@bengfort.com $ @@ -29,30 +29,27 @@ from sklearn.model_selection import train_test_split -from .base import RegressionScoreVisualizer -from ..draw import manual_legend -from ..style.palettes import LINE_COLOR -from ..utils.decorators import memoized -from ..exceptions import YellowbrickValueError -from ..bestfit import draw_best_fit, draw_identity_line - +from yellowbrick.draw import manual_legend +from yellowbrick.utils.decorators import memoized +from yellowbrick.style.palettes import LINE_COLOR +from yellowbrick.exceptions import YellowbrickValueError +from yellowbrick.regressor.base import RegressionScoreVisualizer +from yellowbrick.bestfit import draw_best_fit, draw_identity_line ## Packages for export -__all__ = [ - "PredictionError", "prediction_error", - "ResidualsPlot", "residuals_plot" -] +__all__ = ["PredictionError", "prediction_error", "ResidualsPlot", "residuals_plot"] ########################################################################## ## Prediction Error Plots ########################################################################## + class PredictionError(RegressionScoreVisualizer): """ The prediction error visualizer plots 
the actual targets from the dataset against the predicted values generated by our model(s). This visualizer is - used to dectect noise or heteroscedasticity along a range of the target + used to detect noise or heteroscedasticity along a range of the target domain. Parameters @@ -61,6 +58,8 @@ class PredictionError(RegressionScoreVisualizer): model : a Scikit-Learn regressor Should be an instance of a regressor, otherwise will raise a YellowbrickTypeError exception on instantiation. + If the estimator is not fitted, it is fit when the visualizer is fitted, + unless otherwise specified by ``is_fitted``. ax : matplotlib Axes, default: None The axes to plot the figure on. If None is passed in the current axes @@ -79,7 +78,7 @@ class PredictionError(RegressionScoreVisualizer): predicted and measured value of the target variable. The color of the bestfit line is determined by the ``line_color`` argument. - identity: bool, default: True + identity : bool, default: True Draw the 45 degree identity line, y=x in order to better show the relationship or pattern of the residuals. E.g. to estimate if the model is over- or under- estimating the given values. The color of the @@ -95,10 +94,23 @@ class PredictionError(RegressionScoreVisualizer): Specify a transparency where 1 is completely opaque and 0 is completely transparent. This property makes densely clustered points more visible. + is_fitted : bool or str, default='auto' + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If 'auto' (default), a helper method will check if the estimator + is fitted before fitting it again. + kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. + Attributes + ---------- + + score_ : float + The R^2 score that specifies the goodness of fit of the underlying + regression model to the test data. 
+ Examples -------- @@ -116,15 +128,27 @@ class PredictionError(RegressionScoreVisualizer): its primary entry point is the `score()` method. """ - def __init__(self, model, ax=None, shared_limits=True, - bestfit=True, identity=True, alpha=0.75, **kwargs): + def __init__( + self, + model, + ax=None, + shared_limits=True, + bestfit=True, + identity=True, + alpha=0.75, + is_fitted="auto", + **kwargs + ): + # Whether or not to check if the model is already fitted + self.is_fitted = is_fitted + # Initialize the visualizer super(PredictionError, self).__init__(model, ax=ax, **kwargs) # Visual arguments self.colors = { - 'point': kwargs.pop('point_color', None), - 'line': kwargs.pop('line_color', LINE_COLOR), + "point": kwargs.pop("point_color", None), + "line": kwargs.pop("line_color", LINE_COLOR), } # Drawing arguments @@ -153,7 +177,8 @@ def score(self, X, y=None, **kwargs): ------- score : float """ - self.score_ = self.estimator.score(X, y, **kwargs) + # super will set score_ on the visualizer + super(PredictionError, self).score(X, y, **kwargs) y_pred = self.predict(X) self.draw(y, y_pred) @@ -171,30 +196,34 @@ def draw(self, y, y_pred): An array or series of predicted target values Returns - ------ - ax : the axis with the plotted figure + ------- + ax : matplotlib Axes + The axis with the plotted figure """ label = "$R^2 = {:0.3f}$".format(self.score_) self.ax.scatter( - y, - y_pred, - c=self.colors['point'], - alpha=self.alpha, - label=label) + y, y_pred, c=self.colors["point"], alpha=self.alpha, label=label + ) - # TODO If score is happening inside a loop, draw would get called multiple times. + # TODO If score happens inside a loop, draw gets called multiple times. 
# Ideally we'd want the best fit line to be drawn only once if self.bestfit: draw_best_fit( - y, y_pred, self.ax, 'linear', ls='--', lw=2, - c=self.colors['line'], label='best fit' + y, + y_pred, + self.ax, + "linear", + ls="--", + lw=2, + c=self.colors["line"], + label="best fit", ) # Set the axes limits based on the range of X and Y data # NOTE: shared_limits will be accounted for in finalize() # TODO: do better than add one for really small residuals - self.ax.set_xlim(y.min()-1, y.max()+1) - self.ax.set_ylim(y_pred.min()-1, y_pred.max()+1) + self.ax.set_xlim(y.min() - 1, y.max() + 1) + self.ax.set_ylim(y_pred.min() - 1, y_pred.max() + 1) return self.ax @@ -208,9 +237,7 @@ def finalize(self, **kwargs): kwargs: generic keyword arguments. """ # Set the title on the plot - self.set_title( - 'Prediction Error for {}'.format(self.name) - ) + self.set_title("Prediction Error for {}".format(self.name)) # Square the axes to ensure a 45 degree line if self.shared_limits: @@ -219,28 +246,29 @@ def finalize(self, **kwargs): xlim = self.ax.get_xlim() # Find the range that captures all data - bounds = ( - min(ylim[0], xlim[0]), - max(ylim[1], xlim[1]), - ) + bounds = (min(ylim[0], xlim[0]), max(ylim[1], xlim[1])) # Reset the limits self.ax.set_xlim(bounds) self.ax.set_ylim(bounds) # Ensure the aspect ratio is square - self.ax.set_aspect('equal', adjustable='box') + self.ax.set_aspect("equal", adjustable="box") # Draw the 45 degree line if self.identity: draw_identity_line( - ax=self.ax, ls='--', lw=2, c=self.colors['line'], - alpha=0.5, label="identity" + ax=self.ax, + ls="--", + lw=2, + c=self.colors["line"], + alpha=0.5, + label="identity", ) # Set the axes labels - self.ax.set_ylabel(r'$\hat{y}$') - self.ax.set_xlabel(r'$y$') + self.ax.set_ylabel(r"$\hat{y}$") + self.ax.set_xlabel(r"$y$") # Set the legend # Note: it would be nice to be able to use the manual_legend utility @@ -249,11 +277,11 @@ def finalize(self, **kwargs): # bit tricky because adding a manual legend here 
would override the # best fit and 45 degree line legend components. In particular, the # best fit is plotted in draw because it depends on y and y_pred. - self.ax.legend(loc='best', frameon=True) + self.ax.legend(loc="best", frameon=True) -def prediction_error(model, X, y=None, ax=None, alpha=0.75, **kwargs): - """ - Quick method: + +def prediction_error(model, X, y=None, ax=None, alpha=0.75, is_fitted="auto", **kwargs): + """Quickly plot a prediction error visualizer Plot the actual targets from the dataset against the predicted values generated by our model(s). @@ -264,6 +292,10 @@ def prediction_error(model, X, y=None, ax=None, alpha=0.75, **kwargs): Parameters ---------- model : the Scikit-Learn estimator (should be a regressor) + Should be an instance of a regressor, otherwise will raise a + YellowbrickTypeError exception on instantiation. + If the estimator is not fitted, it is fit when the visualizer is fitted, + unless otherwise specified by ``is_fitted``. X : ndarray or DataFrame of shape n x m A matrix of n instances with m features. @@ -282,7 +314,7 @@ def prediction_error(model, X, y=None, ax=None, alpha=0.75, **kwargs): shared_limits to False, but note that this will distort the figure and should be accounted for during analysis. - besfit : bool, default: True + bestfit : bool, default: True Draw a linear best fit line to estimate the correlation between the predicted and measured value of the target variable. The color of the bestfit line is determined by the ``line_color`` argument. @@ -303,6 +335,12 @@ def prediction_error(model, X, y=None, ax=None, alpha=0.75, **kwargs): Specify a transparency where 1 is completely opaque and 0 is completely transparent. This property makes densely clustered points more visible. + is_fitted : bool or str, default='auto' + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. 
If 'auto' (default), a helper method will check if the estimator + is fitted before fitting it again. + kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. @@ -313,7 +351,7 @@ def prediction_error(model, X, y=None, ax=None, alpha=0.75, **kwargs): Returns the axes that the prediction error plot was drawn on. """ # Instantiate the visualizer - visualizer = PredictionError(model, ax, alpha=alpha, **kwargs) + visualizer = PredictionError(model, ax, alpha=alpha, is_fitted=is_fitted, **kwargs) # Create the train and test splits X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) @@ -324,13 +362,14 @@ def prediction_error(model, X, y=None, ax=None, alpha=0.75, **kwargs): visualizer.finalize() # Return the axes object on the visualizer - return visualizer.ax + return visualizer ########################################################################## ## Residuals Plots ########################################################################## + class ResidualsPlot(RegressionScoreVisualizer): """ A residual plot shows the residuals on the vertical axis and the @@ -345,6 +384,8 @@ class ResidualsPlot(RegressionScoreVisualizer): model : a Scikit-Learn regressor Should be an instance of a regressor, otherwise will raise a YellowbrickTypeError exception on instantiation. + If the estimator is not fitted, it is fit when the visualizer is fitted, + unless otherwise specified by ``is_fitted``. ax : matplotlib Axes, default: None The axes to plot the figure on. If None is passed in the current axes @@ -370,14 +411,37 @@ class ResidualsPlot(RegressionScoreVisualizer): line_color : color, default: dark grey Defines the color of the zero error line, can be any matplotlib color. - alpha : float, default: 0.75 - Specify a transparency where 1 is completely opaque and 0 is completely - transparent. This property makes densely clustered points more visible. 
+ train_alpha : float, default: 0.75 + Specify a transparency for training data, where 1 is completely opaque + and 0 is completely transparent. This property makes densely clustered + points more visible. + + test_alpha : float, default: 0.75 + Specify a transparency for test data, where 1 is completely opaque + and 0 is completely transparent. This property makes densely clustered + points more visible. + + is_fitted : bool or str, default='auto' + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If 'auto' (default), a helper method will check if the estimator + is fitted before fitting it again. kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. + Attributes + ---------- + + train_score_ : float + The R^2 score that specifies the goodness of fit of the underlying + regression model to the training data. + + test_score_ : float + The R^2 score that specifies the goodness of fit of the underlying + regression model to the test data. + Examples -------- @@ -395,34 +459,48 @@ class ResidualsPlot(RegressionScoreVisualizer): The residuals histogram feature requires matplotlib 2.0.2 or greater. 
""" - def __init__(self, model, ax=None, hist=True, train_color='b', - test_color='g', line_color=LINE_COLOR, alpha=0.75, - **kwargs): + def __init__( + self, + model, + ax=None, + hist=True, + train_color="b", + test_color="g", + line_color=LINE_COLOR, + train_alpha=0.75, + test_alpha=0.75, + is_fitted="auto", + **kwargs + ): + # Whether or not to check if the model is already fitted + self.is_fitted = is_fitted + + # Initialize the visualizer base super(ResidualsPlot, self).__init__(model, ax=ax, **kwargs) # TODO: allow more scatter plot arguments for train and test points # See #475 (RE: ScatterPlotMixin) self.colors = { - 'train_point': train_color, - 'test_point': test_color, - 'line': line_color, + "train_point": train_color, + "test_point": test_color, + "line": line_color, } self.hist = hist - if self.hist not in {True, 'density', 'frequency', None, False}: - raise YellowbrickValueError( - "'{}' is an invalid argument for hist, use None, True, " \ - "False, 'density', or 'frequency'".format(hist) - ) + if self.hist not in {True, "density", "frequency", None, False}: + raise YellowbrickValueError( + "'{}' is an invalid argument for hist, use None, True, " + "False, 'density', or 'frequency'".format(hist) + ) - if self.hist in {True, 'density', 'frequency'}: - self.hax # If hist is True, test the version availability + if self.hist in {True, "density", "frequency"}: + self.hax # If hist is True, test the version availability # Store labels and colors for the legend ordered by call self._labels, self._colors = [], [] - self.alpha = alpha + self.alphas = {"train_point": train_alpha, "test_point": test_alpha} @memoized def hax(self): @@ -430,16 +508,18 @@ def hax(self): Returns the histogram axes, creating it only on demand. 
""" if make_axes_locatable is None: - raise YellowbrickValueError(( - "residuals histogram requires matplotlib 2.0.2 or greater " - "please upgrade matplotlib or set hist=False on the visualizer" - )) + raise YellowbrickValueError( + ( + "residuals histogram requires matplotlib 2.0.2 or greater " + "please upgrade matplotlib or set hist=False on the visualizer" + ) + ) divider = make_axes_locatable(self.ax) hax = divider.append_axes("right", size=1, pad=0.1, sharey=self.ax) hax.yaxis.tick_right() - hax.grid(False, axis='x') + hax.grid(False, axis="x") return hax @@ -457,8 +537,10 @@ def fit(self, X, y, **kwargs): Returns ------- - self : visualizer instance + self : ResidualsPlot + The visualizer instance """ + # fit the underlying model to the data super(ResidualsPlot, self).fit(X, y, **kwargs) self.score(X, y, train=True) return self @@ -482,11 +564,12 @@ def score(self, X, y=None, train=False, **kwargs): are the train data. Returns - ------ + ------- score : float The score of the underlying estimator, usually the R-squared score for regression estimators. """ + # Do not call super in order to differentiate train and test scores. score = self.estimator.score(X, y, **kwargs) if train: self.train_score_ = score @@ -494,8 +577,8 @@ def score(self, X, y=None, train=False, **kwargs): self.test_score_ = score y_pred = self.predict(X) - scores = y_pred - y - self.draw(y_pred, scores, train=train) + residuals = y_pred - y + self.draw(y_pred, residuals, train=train) return score @@ -521,30 +604,31 @@ def draw(self, y_pred, residuals, train=False, **kwargs): are the train data. 
Returns - ------ - ax : the axis with the plotted figure + ------- + ax : matplotlib Axes + The axis with the plotted figure """ if train: - color = self.colors['train_point'] + color = self.colors["train_point"] label = "Train $R^2 = {:0.3f}$".format(self.train_score_) + alpha = self.alphas["train_point"] else: - color = self.colors['test_point'] + color = self.colors["test_point"] label = "Test $R^2 = {:0.3f}$".format(self.test_score_) + alpha = self.alphas["test_point"] # Update the legend information self._labels.append(label) self._colors.append(color) # Draw the residuals scatter plot - self.ax.scatter( - y_pred, residuals, c=color, alpha=self.alpha, label=label - ) + self.ax.scatter(y_pred, residuals, c=color, alpha=alpha, label=label) # Add residuals histogram - if self.hist in {True, 'frequency'}: + if self.hist in {True, "frequency"}: self.hax.hist(residuals, bins=50, orientation="horizontal", color=color) - elif self.hist == 'density': + elif self.hist == "density": self.hax.hist( residuals, bins=50, orientation="horizontal", density=True, color=color ) @@ -563,38 +647,45 @@ def finalize(self, **kwargs): kwargs: generic keyword arguments. """ # Add the title to the plot - self.set_title('Residuals for {} Model'.format(self.name)) + self.set_title("Residuals for {} Model".format(self.name)) # Set the legend with full opacity patches using manual legend - manual_legend( - self, self._labels, self._colors, loc='best', frameon=True - ) + manual_legend(self, self._labels, self._colors, loc="best", frameon=True) # Create a full line across the figure at zero error. 
- self.ax.axhline(y=0, c=self.colors['line']) + self.ax.axhline(y=0, c=self.colors["line"]) # Set the axes labels - self.ax.set_ylabel('Residuals') + self.ax.set_ylabel("Residuals") self.ax.set_xlabel("Predicted Value") # Finalize the histogram axes if self.hist: - self.hax.axhline(y=0, c=self.colors['line']) + self.hax.axhline(y=0, c=self.colors["line"]) self.hax.set_xlabel("Distribution") -def residuals_plot(model, - X, - y, - ax=None, - hist=True, - test_size=0.25, - train_color='b', - test_color='g', - line_color=LINE_COLOR, - random_state=None, - alpha=0.75, - **kwargs): +########################################################################## +## Quick Method +########################################################################## + + +def residuals_plot( + model, + X, + y, + ax=None, + hist=True, + test_size=0.25, + train_color="b", + test_color="g", + line_color=LINE_COLOR, + random_state=None, + train_alpha=0.75, + test_alpha=0.75, + is_fitted="auto", + **kwargs +): """Quick method: Divides the dataset X, y into a train and test split (the size of the @@ -609,6 +700,8 @@ def residuals_plot(model, model : a Scikit-Learn regressor Should be an instance of a regressor, otherwise will raise a YellowbrickTypeError exception on instantiation. + If the estimator is not fitted, it is fit when the visualizer is fitted, + unless otherwise specified by ``is_fitted``. X : ndarray or DataFrame of shape n x m A matrix of n instances with m features. @@ -648,9 +741,21 @@ def residuals_plot(model, random_state : int, RandomState instance or None, optional Passed to the train_test_split function. - alpha : float, default: 0.75 - Specify a transparency where 1 is completely opaque and 0 is completely - transparent. This property makes densely clustered points more visible. + train_alpha : float, default: 0.75 + Specify a transparency for training data, where 1 is completely opaque + and 0 is completely transparent. 
This property makes densely clustered + points more visible. + + test_alpha : float, default: 0.75 + Specify a transparency for test data, where 1 is completely opaque and + 0 is completely transparent. This property makes densely clustered + points more visible. + + is_fitted : bool or str, default="auto" + Specify if the wrapped estimator is already fitted. If False, the estimator + will be fit when the visualizer is fit, otherwise, the estimator will not be + modified. If "auto" (default), a helper method will check if the estimator + is fitted before fitting it again. kwargs : dict Keyword arguments that are passed to the base class and may influence @@ -658,13 +763,21 @@ def residuals_plot(model, Returns ------- - ax : matplotlib axes - Returns the axes that the residuals plot was drawn on. + visualizer : ResidualsPlot + Returns the residuals plot visualizer """ # Instantiate the visualizer + visualizer = ResidualsPlot( - model=model, ax=ax, hist=hist, train_color=train_color, - test_color=test_color, line_color=line_color, alpha=alpha, + model=model, + ax=ax, + hist=hist, + train_color=train_color, + test_color=test_color, + line_color=line_color, + train_alpha=train_alpha, + test_alpha=test_alpha, + is_fitted=is_fitted, **kwargs ) @@ -678,5 +791,5 @@ def residuals_plot(model, visualizer.score(X_test, y_test) visualizer.finalize() - # Return the axes object on the visualizer - return visualizer.ax + # Return the visualizer + return visualizer diff --git a/yellowbrick/style/__init__.py b/yellowbrick/style/__init__.py index 0b2045e71..e50917689 100644 --- a/yellowbrick/style/__init__.py +++ b/yellowbrick/style/__init__.py @@ -1,10 +1,10 @@ # yellowbrick.style # Manage the style and aesthetic of the yellowbrick library. 
# -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Tue Oct 04 15:09:48 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: __init__.py [c6aff34] benjamin@bengfort.com $ diff --git a/yellowbrick/style/colors.py b/yellowbrick/style/colors.py index d495947fe..a28590972 100644 --- a/yellowbrick/style/colors.py +++ b/yellowbrick/style/colors.py @@ -1,10 +1,10 @@ # yellowbrick.colors # Colors and color helpers brought in from a different library. # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Fri Jun 24 17:02:53 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: colors.py [c6aff34] benjamin@bengfort.com $ @@ -25,12 +25,13 @@ import matplotlib.cm as cm from copy import copy -from six import string_types + from yellowbrick.exceptions import YellowbrickValueError # Check to see if matplotlib is at least sorta up to date from distutils.version import LooseVersion + mpl_ge_150 = LooseVersion(mpl.__version__) >= "1.5.0" @@ -38,20 +39,21 @@ ## Color Utilities ########################################################################## + def get_color_cycle(): """ Returns the current color cycle from matplotlib. """ if mpl_ge_150: - cyl = mpl.rcParams['axes.prop_cycle'] + cyl = mpl.rcParams["axes.prop_cycle"] # matplotlib 1.5 verifies that axes.prop_cycle *is* a cycler # but no garuantee that there's a `color` key. # so users could have a custom rcParams w/ no color... 
try: - return [x['color'] for x in cyl] + return [x["color"] for x in cyl] except KeyError: pass # just return axes.color style below - return mpl.rcParams['axes.color_cycle'] + return mpl.rcParams["axes.color_cycle"] def resolve_colors(n_colors=None, colormap=None, colors=None): @@ -68,11 +70,12 @@ def resolve_colors(n_colors=None, colormap=None, colors=None): truncate or multiple the colors available. If None the length of the colors will not be modified. - colormap : str, default: None + colormap : str, yellowbrick.style.palettes.ColorPalette, matplotlib.cm, default: None The name of the matplotlib color map with which to generate colors. colors : iterable, default: None - A collection of colors to use specifically with the plot. + A collection of colors to use specifically with the plot. Overrides + colormap if both are specified. Returns ------- @@ -87,26 +90,58 @@ def resolve_colors(n_colors=None, colormap=None, colors=None): # Work with the colormap if specified and colors is not if colormap is not None and colors is None: - if isinstance(colormap, string_types): + # Must import here to avoid recursive import + from .palettes import PALETTES, ColorPalette + + if isinstance(colormap, str): try: - colormap = cm.get_cmap(colormap) + + # try to get colormap from PALETTES first + _colormap = PALETTES.get(colormap, None) + + if _colormap is None: + + colormap = cm.get_cmap(colormap) + n_colors = n_colors or len(get_color_cycle()) + _colors = list(map(colormap, np.linspace(0, 1, num=n_colors))) + + else: + + _colors = ColorPalette(_colormap).as_rgb() + n_colors = n_colors or len(_colors) + except ValueError as e: + raise YellowbrickValueError(e) + # if yellowbrick color palette is provided as colormap + elif isinstance(colormap, ColorPalette): + + _colors = colormap.as_rgb() + n_colors = n_colors or len(_colors) - n_colors = n_colors or len(get_color_cycle()) - _colors = list(map(colormap, np.linspace(0, 1, num=n_colors))) + # if matplotlib color palette is 
provided as colormap + elif isinstance(colormap, mpl.colors.Colormap): + n_colors = n_colors or len(get_color_cycle()) + _colors = list(map(colormap, np.linspace(0, 1, num=n_colors))) + else: + raise YellowbrickValueError( + "Colormap type {} is not recognized. Possible types are: {}".format( + type(colormap), + ", ".join( + ["yellowbrick.style.ColorPalette,", "matplotlib.cm,", "str"] + ), + ) + ) # Work with the color list elif colors is not None: # Warn if both colormap and colors is specified. if colormap is not None: - warnings.warn( - "both colormap and colors specified; using colors" - ) + warnings.warn("both colormap and colors specified; using colors") - _colors = list(colors) # Ensure colors is a list + _colors = list(colors) # Ensure colors is a list # Get the default colors else: @@ -114,9 +149,7 @@ def resolve_colors(n_colors=None, colormap=None, colors=None): # Truncate or multiple the color list according to the number of colors if n_colors is not None and len(_colors) != n_colors: - _colors = [ - _colors[idx % len(_colors)] for idx in np.arange(n_colors) - ] + _colors = [_colors[idx % len(_colors)] for idx in np.arange(n_colors)] return _colors @@ -126,7 +159,7 @@ class ColorMap(object): A helper for mapping categorical values to colors on demand. """ - def __init__(self, colors='flatui', shuffle=False): + def __init__(self, colors="flatui", shuffle=False): """ Specify either a list of colors or one of the color names. If shuffle is True then the colors will be shuffled randomly. @@ -146,7 +179,7 @@ def colors(self, value): """ Converts color strings into a color listing. 
""" - if isinstance(value, string_types): + if isinstance(value, str): # Must import here to avoid recursive import from .palettes import PALETTES diff --git a/yellowbrick/style/palettes.py b/yellowbrick/style/palettes.py index d4b4c11a9..a1d5a6b0e 100644 --- a/yellowbrick/style/palettes.py +++ b/yellowbrick/style/palettes.py @@ -1,13 +1,13 @@ # yellowbrick.style.palettes # Implements the variety of colors that yellowbrick allows access to by name. # -# Author: Patrick O'Melveny +# Author: Patrick O'Melveny +# Author: Benjamin Bengfort # -# Copyright (C) 2016 District Data Lab +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: palettes.py [] pvomelveny@gmail.com +# ID: palettes.py [c6aff34] benjamin@bengfort.com $ """ Implements the variety of colors that yellowbrick allows access to by name. @@ -20,16 +20,12 @@ ## Imports ######################################################################### -from __future__ import division - import numpy as np import matplotlib as mpl import matplotlib.pyplot as plt import matplotlib.colors as mplcol from itertools import cycle -from six import string_types -from six.moves import range from .colors import get_color_cycle from yellowbrick.exceptions import YellowbrickValueError @@ -45,8 +41,8 @@ ## Special, Named Colors ########################################################################## -YB_KEY = '#111111' # The yellowbrick key (black) color is very dark grey -LINE_COLOR = YB_KEY # Colors for best fit lines, diagonals, etc. +YB_KEY = "#111111" # The yellowbrick key (black) color is very dark grey +LINE_COLOR = YB_KEY # Colors for best fit lines, diagonals, etc. 
########################################################################## @@ -58,310 +54,1294 @@ PALETTES = { # "name": ['blue', 'green', 'red', 'maroon', 'yellow', 'cyan'] # The yellowbrick default palette - "yellowbrick": ['#0272a2', '#9fc377', '#ca0b03', '#a50258', '#d7c703', '#88cada'], - + "yellowbrick": ["#0272a2", "#9fc377", "#ca0b03", "#a50258", "#d7c703", "#88cada"], # The following are from ColorBrewer - "accent": ['#386cb0', '#7fc97f', '#f0027f', '#beaed4', '#ffff99', '#fdc086'], - "dark": ['#7570b3', '#66a61e', '#d95f02', '#e7298a', '#e6ab02', '#1b9e77'], - "pastel": ['#cbd5e8', '#b3e2cd', '#fdcdac', '#f4cae4', '#fff2ae', '#e6f5c9'], - "bold": ['#377eb8', '#4daf4a', '#e41a1c', '#984ea3', '#ffff33', '#ff7f00'], - "muted": ['#80b1d3', '#8dd3c7', '#fb8072', '#bebada', '#ffffb3', '#fdb462'], - + "accent": ["#386cb0", "#7fc97f", "#f0027f", "#beaed4", "#ffff99", "#fdc086"], + "dark": ["#7570b3", "#66a61e", "#d95f02", "#e7298a", "#e6ab02", "#1b9e77"], + "pastel": ["#cbd5e8", "#b3e2cd", "#fdcdac", "#f4cae4", "#fff2ae", "#e6f5c9"], + "bold": ["#377eb8", "#4daf4a", "#e41a1c", "#984ea3", "#ffff33", "#ff7f00"], + "muted": ["#80b1d3", "#8dd3c7", "#fb8072", "#bebada", "#ffffb3", "#fdb462"], # The reset colors back to the original mpl color codes - "reset": ['#0000ff', '#008000', '#ff0000', '#bf00bf', '#bfbf00', '#00bfbf', '#000000'], - + "reset": [ + "#0000ff", + "#008000", + "#ff0000", + "#bf00bf", + "#bfbf00", + "#00bfbf", + "#000000", + ], # Colorblind colors - "colorblind": ["#0072B2", "#009E73", "#D55E00", "#CC79A7", "#F0E442", "#56B4E9"], - "sns_colorblind": ["#0072B2", "#009E73", "#D55E00", "#CC79A7", "#F0E442", "#56B4E9"], - + "colorblind": ["#0072B2", "#009E73", "#D55E00", "#CC79A7", "#F0E442", "#56B4E9"], + "sns_colorblind": [ + "#0072B2", + "#009E73", + "#D55E00", + "#CC79A7", + "#F0E442", + "#56B4E9", + ], # The following are Seaborn colors - "sns_deep": ["#4C72B0", "#55A868", "#C44E52", "#8172B2", "#CCB974", "#64B5CD"], - "sns_muted": ["#4878CF", 
"#6ACC65", "#D65F5F", "#B47CC7", "#C4AD66", "#77BEDB"], + "sns_deep": ["#4C72B0", "#55A868", "#C44E52", "#8172B2", "#CCB974", "#64B5CD"], + "sns_muted": ["#4878CF", "#6ACC65", "#D65F5F", "#B47CC7", "#C4AD66", "#77BEDB"], "sns_pastel": ["#92C6FF", "#97F0AA", "#FF9F9A", "#D0BBFF", "#FFFEA3", "#B0E0E6"], "sns_bright": ["#003FFF", "#03ED3A", "#E8000B", "#8A2BE2", "#FFC400", "#00D7FF"], - "sns_dark": ["#001C7F", "#017517", "#8C0900", "#7600A1", "#B8860B", "#006374"], - + "sns_dark": ["#001C7F", "#017517", "#8C0900", "#7600A1", "#B8860B", "#006374"], # Other palettes - "flatui": ["#34495e", "#2ecc71", "#e74c3c", "#9b59b6", "#f4d03f", "#3498db"], - - "paired": ["#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", - "#cab2d6", "#6a3d9a", "#ffff99", "#b15928", "#fdbf6f", "#ff7f00"], - - "set1": ["#377eb8", "#4daf4a", "#e41a1c", "#984ea3", "#ffff33", "#ff7f00", - "#a65628", "#f781bf", "#999999"], - + "flatui": ["#34495e", "#2ecc71", "#e74c3c", "#9b59b6", "#f4d03f", "#3498db"], + "paired": [ + "#a6cee3", + "#1f78b4", + "#b2df8a", + "#33a02c", + "#fb9a99", + "#e31a1c", + "#cab2d6", + "#6a3d9a", + "#ffff99", + "#b15928", + "#fdbf6f", + "#ff7f00", + ], + "set1": [ + "#377eb8", + "#4daf4a", + "#e41a1c", + "#984ea3", + "#ffff33", + "#ff7f00", + "#a65628", + "#f781bf", + "#999999", + ], # colors extracted from this blog post during pycon2017: # http://lewisandquark.tumblr.com/ - "neural_paint": ["#167192", "#6e7548", "#c5a2ab", "#00ccff", "#de78ae", "#ffcc99", - "#3d3f42", "#ffffcc"], + "neural_paint": [ + "#167192", + "#6e7548", + "#c5a2ab", + "#00ccff", + "#de78ae", + "#ffcc99", + "#3d3f42", + "#ffffcc", + ], } SEQUENCES = { "ddl_heat": { - 12: ['#DBDBDB', '#DCD5CC', '#DCCEBE', '#DDC8AF', '#DEC2A0', '#DEBB91', '#DFB583', '#DFAE74', '#E0A865', '#E1A256', '#E19B48', '#E29539'], + 12: [ + "#DBDBDB", + "#DCD5CC", + "#DCCEBE", + "#DDC8AF", + "#DEC2A0", + "#DEBB91", + "#DFB583", + "#DFAE74", + "#E0A865", + "#E1A256", + "#E19B48", + "#E29539", + ] }, "YlGn": { 3: 
["#f7fcb9", "#addd8e", "#31a354"], 4: ["#ffffcc", "#c2e699", "#78c679", "#238443"], 5: ["#ffffcc", "#c2e699", "#78c679", "#31a354", "#006837"], 6: ["#ffffcc", "#d9f0a3", "#addd8e", "#78c679", "#31a354", "#006837"], - 7: ["#ffffcc", "#d9f0a3", "#addd8e", "#78c679", "#41ab5d", "#238443", "#005a32"], - - 8: ["#ffffe5", "#f7fcb9", "#d9f0a3", "#addd8e", "#78c679", "#41ab5d", "#238443", "#005a32"], - 9: ["#ffffe5", "#f7fcb9", "#d9f0a3", "#addd8e", "#78c679", "#41ab5d", "#238443", "#006837", "#004529"], + 7: [ + "#ffffcc", + "#d9f0a3", + "#addd8e", + "#78c679", + "#41ab5d", + "#238443", + "#005a32", + ], + 8: [ + "#ffffe5", + "#f7fcb9", + "#d9f0a3", + "#addd8e", + "#78c679", + "#41ab5d", + "#238443", + "#005a32", + ], + 9: [ + "#ffffe5", + "#f7fcb9", + "#d9f0a3", + "#addd8e", + "#78c679", + "#41ab5d", + "#238443", + "#006837", + "#004529", + ], }, "YlGnBu": { 3: ["#edf8b1", "#7fcdbb", "#2c7fb8"], 4: ["#ffffcc", "#a1dab4", "#41b6c4", "#225ea8"], 5: ["#ffffcc", "#a1dab4", "#41b6c4", "#2c7fb8", "#253494"], 6: ["#ffffcc", "#c7e9b4", "#7fcdbb", "#41b6c4", "#2c7fb8", "#253494"], - 7: ["#ffffcc", "#c7e9b4", "#7fcdbb", "#41b6c4", "#1d91c0", "#225ea8", "#0c2c84"], - 8: ["#ffffd9", "#edf8b1", "#c7e9b4", "#7fcdbb", "#41b6c4", "#1d91c0", "#225ea8", "#0c2c84"], - 9: ["#ffffd9", "#edf8b1", "#c7e9b4", "#7fcdbb", "#41b6c4", "#1d91c0", "#225ea8", "#253494", "#081d58"], + 7: [ + "#ffffcc", + "#c7e9b4", + "#7fcdbb", + "#41b6c4", + "#1d91c0", + "#225ea8", + "#0c2c84", + ], + 8: [ + "#ffffd9", + "#edf8b1", + "#c7e9b4", + "#7fcdbb", + "#41b6c4", + "#1d91c0", + "#225ea8", + "#0c2c84", + ], + 9: [ + "#ffffd9", + "#edf8b1", + "#c7e9b4", + "#7fcdbb", + "#41b6c4", + "#1d91c0", + "#225ea8", + "#253494", + "#081d58", + ], }, "GnBu": { 3: ["#e0f3db", "#a8ddb5", "#43a2ca"], 4: ["#f0f9e8", "#bae4bc", "#7bccc4", "#2b8cbe"], 5: ["#f0f9e8", "#bae4bc", "#7bccc4", "#43a2ca", "#0868ac"], 6: ["#f0f9e8", "#ccebc5", "#a8ddb5", "#7bccc4", "#43a2ca", "#0868ac"], - 7: ["#f0f9e8", "#ccebc5", "#a8ddb5", "#7bccc4", 
"#4eb3d3", "#2b8cbe", "#08589e"], - 8: ["#f7fcf0", "#e0f3db", "#ccebc5", "#a8ddb5", "#7bccc4", "#4eb3d3", "#2b8cbe", "#08589e"], - 9: ["#f7fcf0", "#e0f3db", "#ccebc5", "#a8ddb5", "#7bccc4", "#4eb3d3", "#2b8cbe", "#0868ac", "#084081"], + 7: [ + "#f0f9e8", + "#ccebc5", + "#a8ddb5", + "#7bccc4", + "#4eb3d3", + "#2b8cbe", + "#08589e", + ], + 8: [ + "#f7fcf0", + "#e0f3db", + "#ccebc5", + "#a8ddb5", + "#7bccc4", + "#4eb3d3", + "#2b8cbe", + "#08589e", + ], + 9: [ + "#f7fcf0", + "#e0f3db", + "#ccebc5", + "#a8ddb5", + "#7bccc4", + "#4eb3d3", + "#2b8cbe", + "#0868ac", + "#084081", + ], }, "BuGn": { 3: ["#e5f5f9", "#99d8c9", "#2ca25f"], 4: ["#edf8fb", "#b2e2e2", "#66c2a4", "#238b45"], 5: ["#edf8fb", "#b2e2e2", "#66c2a4", "#2ca25f", "#006d2c"], 6: ["#edf8fb", "#ccece6", "#99d8c9", "#66c2a4", "#2ca25f", "#006d2c"], - 7: ["#edf8fb", "#ccece6", "#99d8c9", "#66c2a4", "#41ae76", "#238b45", "#005824"], - 8: ["#f7fcfd", "#e5f5f9", "#ccece6", "#99d8c9", "#66c2a4", "#41ae76", "#238b45", "#005824"], - 9: ["#f7fcfd", "#e5f5f9", "#ccece6", "#99d8c9", "#66c2a4", "#41ae76", "#238b45", "#006d2c", "#00441b"], + 7: [ + "#edf8fb", + "#ccece6", + "#99d8c9", + "#66c2a4", + "#41ae76", + "#238b45", + "#005824", + ], + 8: [ + "#f7fcfd", + "#e5f5f9", + "#ccece6", + "#99d8c9", + "#66c2a4", + "#41ae76", + "#238b45", + "#005824", + ], + 9: [ + "#f7fcfd", + "#e5f5f9", + "#ccece6", + "#99d8c9", + "#66c2a4", + "#41ae76", + "#238b45", + "#006d2c", + "#00441b", + ], }, "PuBuGn": { 3: ["#ece2f0", "#a6bddb", "#1c9099"], 4: ["#f6eff7", "#bdc9e1", "#67a9cf", "#02818a"], 5: ["#f6eff7", "#bdc9e1", "#67a9cf", "#1c9099", "#016c59"], 6: ["#f6eff7", "#d0d1e6", "#a6bddb", "#67a9cf", "#1c9099", "#016c59"], - 7: ["#f6eff7", "#d0d1e6", "#a6bddb", "#67a9cf", "#3690c0", "#02818a", "#016450"], - 8: ["#fff7fb", "#ece2f0", "#d0d1e6", "#a6bddb", "#67a9cf", "#3690c0", "#02818a", "#016450"], - 9: ["#fff7fb", "#ece2f0", "#d0d1e6", "#a6bddb", "#67a9cf", "#3690c0", "#02818a", "#016c59", "#014636"], + 7: [ + "#f6eff7", + "#d0d1e6", + 
"#a6bddb", + "#67a9cf", + "#3690c0", + "#02818a", + "#016450", + ], + 8: [ + "#fff7fb", + "#ece2f0", + "#d0d1e6", + "#a6bddb", + "#67a9cf", + "#3690c0", + "#02818a", + "#016450", + ], + 9: [ + "#fff7fb", + "#ece2f0", + "#d0d1e6", + "#a6bddb", + "#67a9cf", + "#3690c0", + "#02818a", + "#016c59", + "#014636", + ], }, "PuBu": { 3: ["#ece7f2", "#a6bddb", "#2b8cbe"], 4: ["#f1eef6", "#bdc9e1", "#74a9cf", "#0570b0"], 5: ["#f1eef6", "#bdc9e1", "#74a9cf", "#2b8cbe", "#045a8d"], 6: ["#f1eef6", "#d0d1e6", "#a6bddb", "#74a9cf", "#2b8cbe", "#045a8d"], - 7: ["#f1eef6", "#d0d1e6", "#a6bddb", "#74a9cf", "#3690c0", "#0570b0", "#034e7b"], - 8: ["#fff7fb", "#ece7f2", "#d0d1e6", "#a6bddb", "#74a9cf", "#3690c0", "#0570b0", "#034e7b"], - 9: ["#fff7fb", "#ece7f2", "#d0d1e6", "#a6bddb", "#74a9cf", "#3690c0", "#0570b0", "#045a8d", "#023858"], + 7: [ + "#f1eef6", + "#d0d1e6", + "#a6bddb", + "#74a9cf", + "#3690c0", + "#0570b0", + "#034e7b", + ], + 8: [ + "#fff7fb", + "#ece7f2", + "#d0d1e6", + "#a6bddb", + "#74a9cf", + "#3690c0", + "#0570b0", + "#034e7b", + ], + 9: [ + "#fff7fb", + "#ece7f2", + "#d0d1e6", + "#a6bddb", + "#74a9cf", + "#3690c0", + "#0570b0", + "#045a8d", + "#023858", + ], }, "BuPu": { 3: ["#e0ecf4", "#9ebcda", "#8856a7"], 4: ["#edf8fb", "#b3cde3", "#8c96c6", "#88419d"], 5: ["#edf8fb", "#b3cde3", "#8c96c6", "#8856a7", "#810f7c"], 6: ["#edf8fb", "#bfd3e6", "#9ebcda", "#8c96c6", "#8856a7", "#810f7c"], - 7: ["#edf8fb", "#bfd3e6", "#9ebcda", "#8c96c6", "#8c6bb1", "#88419d", "#6e016b"], - 8: ["#f7fcfd", "#e0ecf4", "#bfd3e6", "#9ebcda", "#8c96c6", "#8c6bb1", "#88419d", "#6e016b"], - 9: ["#f7fcfd", "#e0ecf4", "#bfd3e6", "#9ebcda", "#8c96c6", "#8c6bb1", "#88419d", "#810f7c", "#4d004b"], + 7: [ + "#edf8fb", + "#bfd3e6", + "#9ebcda", + "#8c96c6", + "#8c6bb1", + "#88419d", + "#6e016b", + ], + 8: [ + "#f7fcfd", + "#e0ecf4", + "#bfd3e6", + "#9ebcda", + "#8c96c6", + "#8c6bb1", + "#88419d", + "#6e016b", + ], + 9: [ + "#f7fcfd", + "#e0ecf4", + "#bfd3e6", + "#9ebcda", + "#8c96c6", + "#8c6bb1", + 
"#88419d", + "#810f7c", + "#4d004b", + ], }, "RdPu": { 3: ["#fde0dd", "#fa9fb5", "#c51b8a"], 4: ["#feebe2", "#fbb4b9", "#f768a1", "#ae017e"], 5: ["#feebe2", "#fbb4b9", "#f768a1", "#c51b8a", "#7a0177"], 6: ["#feebe2", "#fcc5c0", "#fa9fb5", "#f768a1", "#c51b8a", "#7a0177"], - 7: ["#feebe2", "#fcc5c0", "#fa9fb5", "#f768a1", "#dd3497", "#ae017e", "#7a0177"], - 8: ["#fff7f3", "#fde0dd", "#fcc5c0", "#fa9fb5", "#f768a1", "#dd3497", "#ae017e", "#7a0177"], - 9: ["#fff7f3", "#fde0dd", "#fcc5c0", "#fa9fb5", "#f768a1", "#dd3497", "#ae017e", "#7a0177", "#49006a"], + 7: [ + "#feebe2", + "#fcc5c0", + "#fa9fb5", + "#f768a1", + "#dd3497", + "#ae017e", + "#7a0177", + ], + 8: [ + "#fff7f3", + "#fde0dd", + "#fcc5c0", + "#fa9fb5", + "#f768a1", + "#dd3497", + "#ae017e", + "#7a0177", + ], + 9: [ + "#fff7f3", + "#fde0dd", + "#fcc5c0", + "#fa9fb5", + "#f768a1", + "#dd3497", + "#ae017e", + "#7a0177", + "#49006a", + ], }, "PuRd": { 3: ["#e7e1ef", "#c994c7", "#dd1c77"], 4: ["#f1eef6", "#d7b5d8", "#df65b0", "#ce1256"], 5: ["#f1eef6", "#d7b5d8", "#df65b0", "#dd1c77", "#980043"], 6: ["#f1eef6", "#d4b9da", "#c994c7", "#df65b0", "#dd1c77", "#980043"], - 7: ["#f1eef6", "#d4b9da", "#c994c7", "#df65b0", "#e7298a", "#ce1256", "#91003f"], - 8: ["#f7f4f9", "#e7e1ef", "#d4b9da", "#c994c7", "#df65b0", "#e7298a", "#ce1256", "#91003f"], - 9: ["#f7f4f9", "#e7e1ef", "#d4b9da", "#c994c7", "#df65b0", "#e7298a", "#ce1256", "#980043", "#67001f"], + 7: [ + "#f1eef6", + "#d4b9da", + "#c994c7", + "#df65b0", + "#e7298a", + "#ce1256", + "#91003f", + ], + 8: [ + "#f7f4f9", + "#e7e1ef", + "#d4b9da", + "#c994c7", + "#df65b0", + "#e7298a", + "#ce1256", + "#91003f", + ], + 9: [ + "#f7f4f9", + "#e7e1ef", + "#d4b9da", + "#c994c7", + "#df65b0", + "#e7298a", + "#ce1256", + "#980043", + "#67001f", + ], }, "OrRd": { 3: ["#fee8c8", "#fdbb84", "#e34a33"], 4: ["#fef0d9", "#fdcc8a", "#fc8d59", "#d7301f"], 5: ["#fef0d9", "#fdcc8a", "#fc8d59", "#e34a33", "#b30000"], 6: ["#fef0d9", "#fdd49e", "#fdbb84", "#fc8d59", "#e34a33", 
"#b30000"], - 7: ["#fef0d9", "#fdd49e", "#fdbb84", "#fc8d59", "#ef6548", "#d7301f", "#990000"], - 8: ["#fff7ec", "#fee8c8", "#fdd49e", "#fdbb84", "#fc8d59", "#ef6548", "#d7301f", "#990000"], - 9: ["#fff7ec", "#fee8c8", "#fdd49e", "#fdbb84", "#fc8d59", "#ef6548", "#d7301f", "#b30000", "#7f0000"], + 7: [ + "#fef0d9", + "#fdd49e", + "#fdbb84", + "#fc8d59", + "#ef6548", + "#d7301f", + "#990000", + ], + 8: [ + "#fff7ec", + "#fee8c8", + "#fdd49e", + "#fdbb84", + "#fc8d59", + "#ef6548", + "#d7301f", + "#990000", + ], + 9: [ + "#fff7ec", + "#fee8c8", + "#fdd49e", + "#fdbb84", + "#fc8d59", + "#ef6548", + "#d7301f", + "#b30000", + "#7f0000", + ], }, "YlOrRd": { 3: ["#ffeda0", "#feb24c", "#f03b20"], 4: ["#ffffb2", "#fecc5c", "#fd8d3c", "#e31a1c"], 5: ["#ffffb2", "#fecc5c", "#fd8d3c", "#f03b20", "#bd0026"], 6: ["#ffffb2", "#fed976", "#feb24c", "#fd8d3c", "#f03b20", "#bd0026"], - 7: ["#ffffb2", "#fed976", "#feb24c", "#fd8d3c", "#fc4e2a", "#e31a1c", "#b10026"], - 8: ["#ffffcc", "#ffeda0", "#fed976", "#feb24c", "#fd8d3c", "#fc4e2a", "#e31a1c", "#b10026"], - 9: ["#ffffcc", "#ffeda0", "#fed976", "#feb24c", "#fd8d3c", "#fc4e2a", "#e31a1c", "#bd0026", "#800026"], + 7: [ + "#ffffb2", + "#fed976", + "#feb24c", + "#fd8d3c", + "#fc4e2a", + "#e31a1c", + "#b10026", + ], + 8: [ + "#ffffcc", + "#ffeda0", + "#fed976", + "#feb24c", + "#fd8d3c", + "#fc4e2a", + "#e31a1c", + "#b10026", + ], + 9: [ + "#ffffcc", + "#ffeda0", + "#fed976", + "#feb24c", + "#fd8d3c", + "#fc4e2a", + "#e31a1c", + "#bd0026", + "#800026", + ], }, "YlOrBr": { 3: ["#fff7bc", "#fec44f", "#d95f0e"], 4: ["#ffffd4", "#fed98e", "#fe9929", "#cc4c02"], 5: ["#ffffd4", "#fed98e", "#fe9929", "#d95f0e", "#993404"], 6: ["#ffffd4", "#fee391", "#fec44f", "#fe9929", "#d95f0e", "#993404"], - 7: ["#ffffd4", "#fee391", "#fec44f", "#fe9929", "#ec7014", "#cc4c02", "#8c2d04"], - 8: ["#ffffe5", "#fff7bc", "#fee391", "#fec44f", "#fe9929", "#ec7014", "#cc4c02", "#8c2d04"], - 9: ["#ffffe5", "#fff7bc", "#fee391", "#fec44f", "#fe9929", "#ec7014", 
"#cc4c02", "#993404", "#662506"], + 7: [ + "#ffffd4", + "#fee391", + "#fec44f", + "#fe9929", + "#ec7014", + "#cc4c02", + "#8c2d04", + ], + 8: [ + "#ffffe5", + "#fff7bc", + "#fee391", + "#fec44f", + "#fe9929", + "#ec7014", + "#cc4c02", + "#8c2d04", + ], + 9: [ + "#ffffe5", + "#fff7bc", + "#fee391", + "#fec44f", + "#fe9929", + "#ec7014", + "#cc4c02", + "#993404", + "#662506", + ], }, "Purples": { 3: ["#efedf5", "#bcbddc", "#756bb1"], 4: ["#f2f0f7", "#cbc9e2", "#9e9ac8", "#6a51a3"], 5: ["#f2f0f7", "#cbc9e2", "#9e9ac8", "#756bb1", "#54278f"], 6: ["#f2f0f7", "#dadaeb", "#bcbddc", "#9e9ac8", "#756bb1", "#54278f"], - 7: ["#f2f0f7", "#dadaeb", "#bcbddc", "#9e9ac8", "#807dba", "#6a51a3", "#4a1486"], - 8: ["#fcfbfd", "#efedf5", "#dadaeb", "#bcbddc", "#9e9ac8", "#807dba", "#6a51a3", "#4a1486"], - 9: ["#fcfbfd", "#efedf5", "#dadaeb", "#bcbddc", "#9e9ac8", "#807dba", "#6a51a3", "#54278f", "#3f007d"], + 7: [ + "#f2f0f7", + "#dadaeb", + "#bcbddc", + "#9e9ac8", + "#807dba", + "#6a51a3", + "#4a1486", + ], + 8: [ + "#fcfbfd", + "#efedf5", + "#dadaeb", + "#bcbddc", + "#9e9ac8", + "#807dba", + "#6a51a3", + "#4a1486", + ], + 9: [ + "#fcfbfd", + "#efedf5", + "#dadaeb", + "#bcbddc", + "#9e9ac8", + "#807dba", + "#6a51a3", + "#54278f", + "#3f007d", + ], }, "Blues": { 3: ["#deebf7", "#9ecae1", "#3182bd"], 4: ["#eff3ff", "#bdd7e7", "#6baed6", "#2171b5"], 5: ["#eff3ff", "#bdd7e7", "#6baed6", "#3182bd", "#08519c"], 6: ["#eff3ff", "#c6dbef", "#9ecae1", "#6baed6", "#3182bd", "#08519c"], - 7: ["#eff3ff", "#c6dbef", "#9ecae1", "#6baed6", "#4292c6", "#2171b5", "#084594"], - 8: ["#f7fbff", "#deebf7", "#c6dbef", "#9ecae1", "#6baed6", "#4292c6", "#2171b5", "#084594"], - 9: ["#f7fbff", "#deebf7", "#c6dbef", "#9ecae1", "#6baed6", "#4292c6", "#2171b5", "#08519c", "#08306b"], + 7: [ + "#eff3ff", + "#c6dbef", + "#9ecae1", + "#6baed6", + "#4292c6", + "#2171b5", + "#084594", + ], + 8: [ + "#f7fbff", + "#deebf7", + "#c6dbef", + "#9ecae1", + "#6baed6", + "#4292c6", + "#2171b5", + "#084594", + ], + 9: [ + 
"#f7fbff", + "#deebf7", + "#c6dbef", + "#9ecae1", + "#6baed6", + "#4292c6", + "#2171b5", + "#08519c", + "#08306b", + ], }, "Greens": { 3: ["#e5f5e0", "#a1d99b", "#31a354"], 4: ["#edf8e9", "#bae4b3", "#74c476", "#238b45"], 5: ["#edf8e9", "#bae4b3", "#74c476", "#31a354", "#006d2c"], 6: ["#edf8e9", "#c7e9c0", "#a1d99b", "#74c476", "#31a354", "#006d2c"], - 7: ["#edf8e9", "#c7e9c0", "#a1d99b", "#74c476", "#41ab5d", "#238b45", "#005a32"], - 8: ["#f7fcf5", "#e5f5e0", "#c7e9c0", "#a1d99b", "#74c476", "#41ab5d", "#238b45", "#005a32"], - 9: ["#f7fcf5", "#e5f5e0", "#c7e9c0", "#a1d99b", "#74c476", "#41ab5d", "#238b45", "#006d2c", "#00441b"], + 7: [ + "#edf8e9", + "#c7e9c0", + "#a1d99b", + "#74c476", + "#41ab5d", + "#238b45", + "#005a32", + ], + 8: [ + "#f7fcf5", + "#e5f5e0", + "#c7e9c0", + "#a1d99b", + "#74c476", + "#41ab5d", + "#238b45", + "#005a32", + ], + 9: [ + "#f7fcf5", + "#e5f5e0", + "#c7e9c0", + "#a1d99b", + "#74c476", + "#41ab5d", + "#238b45", + "#006d2c", + "#00441b", + ], }, "Oranges": { 3: ["#fee6ce", "#fdae6b", "#e6550d"], 4: ["#feedde", "#fdbe85", "#fd8d3c", "#d94701"], 5: ["#feedde", "#fdbe85", "#fd8d3c", "#e6550d", "#a63603"], 6: ["#feedde", "#fdd0a2", "#fdae6b", "#fd8d3c", "#e6550d", "#a63603"], - 7: ["#feedde", "#fdd0a2", "#fdae6b", "#fd8d3c", "#f16913", "#d94801", "#8c2d04"], - 8: ["#fff5eb", "#fee6ce", "#fdd0a2", "#fdae6b", "#fd8d3c", "#f16913", "#d94801", "#8c2d04"], - 9: ["#fff5eb", "#fee6ce", "#fdd0a2", "#fdae6b", "#fd8d3c", "#f16913", "#d94801", "#a63603", "#7f2704"], + 7: [ + "#feedde", + "#fdd0a2", + "#fdae6b", + "#fd8d3c", + "#f16913", + "#d94801", + "#8c2d04", + ], + 8: [ + "#fff5eb", + "#fee6ce", + "#fdd0a2", + "#fdae6b", + "#fd8d3c", + "#f16913", + "#d94801", + "#8c2d04", + ], + 9: [ + "#fff5eb", + "#fee6ce", + "#fdd0a2", + "#fdae6b", + "#fd8d3c", + "#f16913", + "#d94801", + "#a63603", + "#7f2704", + ], }, "Reds": { 3: ["#fee0d2", "#fc9272", "#de2d26"], 4: ["#fee5d9", "#fcae91", "#fb6a4a", "#cb181d"], 5: ["#fee5d9", "#fcae91", "#fb6a4a", 
"#de2d26", "#a50f15"], 6: ["#fee5d9", "#fcbba1", "#fc9272", "#fb6a4a", "#de2d26", "#a50f15"], - 7: ["#fee5d9", "#fcbba1", "#fc9272", "#fb6a4a", "#ef3b2c", "#cb181d", "#99000d"], - 8: ["#fff5f0", "#fee0d2", "#fcbba1", "#fc9272", "#fb6a4a", "#ef3b2c", "#cb181d", "#99000d"], - 9: ["#fff5f0", "#fee0d2", "#fcbba1", "#fc9272", "#fb6a4a", "#ef3b2c", "#cb181d", "#a50f15", "#67000d"], + 7: [ + "#fee5d9", + "#fcbba1", + "#fc9272", + "#fb6a4a", + "#ef3b2c", + "#cb181d", + "#99000d", + ], + 8: [ + "#fff5f0", + "#fee0d2", + "#fcbba1", + "#fc9272", + "#fb6a4a", + "#ef3b2c", + "#cb181d", + "#99000d", + ], + 9: [ + "#fff5f0", + "#fee0d2", + "#fcbba1", + "#fc9272", + "#fb6a4a", + "#ef3b2c", + "#cb181d", + "#a50f15", + "#67000d", + ], }, "Greys": { 3: ["#f0f0f0", "#bdbdbd", "#636363"], 4: ["#f7f7f7", "#cccccc", "#969696", "#525252"], 5: ["#f7f7f7", "#cccccc", "#969696", "#636363", "#252525"], 6: ["#f7f7f7", "#d9d9d9", "#bdbdbd", "#969696", "#636363", "#252525"], - 7: ["#f7f7f7", "#d9d9d9", "#bdbdbd", "#969696", "#737373", "#525252", "#252525"], - 8: ["#ffffff", "#f0f0f0", "#d9d9d9", "#bdbdbd", "#969696", "#737373", "#525252", "#252525"], - 9: ["#ffffff", "#f0f0f0", "#d9d9d9", "#bdbdbd", "#969696", "#737373", "#525252", "#252525", "#000000"], + 7: [ + "#f7f7f7", + "#d9d9d9", + "#bdbdbd", + "#969696", + "#737373", + "#525252", + "#252525", + ], + 8: [ + "#ffffff", + "#f0f0f0", + "#d9d9d9", + "#bdbdbd", + "#969696", + "#737373", + "#525252", + "#252525", + ], + 9: [ + "#ffffff", + "#f0f0f0", + "#d9d9d9", + "#bdbdbd", + "#969696", + "#737373", + "#525252", + "#252525", + "#000000", + ], }, "PuOr": { 3: ["#f1a340", "#f7f7f7", "#998ec3"], 4: ["#e66101", "#fdb863", "#b2abd2", "#5e3c99"], 5: ["#e66101", "#fdb863", "#f7f7f7", "#b2abd2", "#5e3c99"], 6: ["#b35806", "#f1a340", "#fee0b6", "#d8daeb", "#998ec3", "#542788"], - 7: ["#b35806", "#f1a340", "#fee0b6", "#f7f7f7", "#d8daeb", "#998ec3", "#542788"], - 8: ["#b35806", "#e08214", "#fdb863", "#fee0b6", "#d8daeb", "#b2abd2", "#8073ac", 
"#542788"], - 9: ["#b35806", "#e08214", "#fdb863", "#fee0b6", "#f7f7f7", "#d8daeb", "#b2abd2", "#8073ac", "#542788"], - 10: ["#7f3b08", "#b35806", "#e08214", "#fdb863", "#fee0b6", "#d8daeb", "#b2abd2", "#8073ac", "#542788", "#2d004b"], - 11: ["#7f3b08", "#b35806", "#e08214", "#fdb863", "#fee0b6", "#f7f7f7", "#d8daeb", "#b2abd2", "#8073ac", "#542788", "#2d004b"], + 7: [ + "#b35806", + "#f1a340", + "#fee0b6", + "#f7f7f7", + "#d8daeb", + "#998ec3", + "#542788", + ], + 8: [ + "#b35806", + "#e08214", + "#fdb863", + "#fee0b6", + "#d8daeb", + "#b2abd2", + "#8073ac", + "#542788", + ], + 9: [ + "#b35806", + "#e08214", + "#fdb863", + "#fee0b6", + "#f7f7f7", + "#d8daeb", + "#b2abd2", + "#8073ac", + "#542788", + ], + 10: [ + "#7f3b08", + "#b35806", + "#e08214", + "#fdb863", + "#fee0b6", + "#d8daeb", + "#b2abd2", + "#8073ac", + "#542788", + "#2d004b", + ], + 11: [ + "#7f3b08", + "#b35806", + "#e08214", + "#fdb863", + "#fee0b6", + "#f7f7f7", + "#d8daeb", + "#b2abd2", + "#8073ac", + "#542788", + "#2d004b", + ], }, "BrBG": { 3: ["#d8b365", "#f5f5f5", "#5ab4ac"], 4: ["#a6611a", "#dfc27d", "#80cdc1", "#018571"], 5: ["#a6611a", "#dfc27d", "#f5f5f5", "#80cdc1", "#018571"], 6: ["#8c510a", "#d8b365", "#f6e8c3", "#c7eae5", "#5ab4ac", "#01665e"], - 7: ["#8c510a", "#d8b365", "#f6e8c3", "#f5f5f5", "#c7eae5", "#5ab4ac", "#01665e"], - 8: ["#8c510a", "#bf812d", "#dfc27d", "#f6e8c3", "#c7eae5", "#80cdc1", "#35978f", "#01665e"], - 9: ["#8c510a", "#bf812d", "#dfc27d", "#f6e8c3", "#f5f5f5", "#c7eae5", "#80cdc1", "#35978f", "#01665e"], - 10: ["#543005", "#8c510a", "#bf812d", "#dfc27d", "#f6e8c3", "#c7eae5", "#80cdc1", "#35978f", "#01665e", "#003c30"], - 11: ["#543005", "#8c510a", "#bf812d", "#dfc27d", "#f6e8c3", "#f5f5f5", "#c7eae5", "#80cdc1", "#35978f", "#01665e", "#003c30"], + 7: [ + "#8c510a", + "#d8b365", + "#f6e8c3", + "#f5f5f5", + "#c7eae5", + "#5ab4ac", + "#01665e", + ], + 8: [ + "#8c510a", + "#bf812d", + "#dfc27d", + "#f6e8c3", + "#c7eae5", + "#80cdc1", + "#35978f", + "#01665e", + ], + 9: 
[ + "#8c510a", + "#bf812d", + "#dfc27d", + "#f6e8c3", + "#f5f5f5", + "#c7eae5", + "#80cdc1", + "#35978f", + "#01665e", + ], + 10: [ + "#543005", + "#8c510a", + "#bf812d", + "#dfc27d", + "#f6e8c3", + "#c7eae5", + "#80cdc1", + "#35978f", + "#01665e", + "#003c30", + ], + 11: [ + "#543005", + "#8c510a", + "#bf812d", + "#dfc27d", + "#f6e8c3", + "#f5f5f5", + "#c7eae5", + "#80cdc1", + "#35978f", + "#01665e", + "#003c30", + ], }, "PRGn": { 3: ["#af8dc3", "#f7f7f7", "#7fbf7b"], 4: ["#7b3294", "#c2a5cf", "#a6dba0", "#008837"], 5: ["#7b3294", "#c2a5cf", "#f7f7f7", "#a6dba0", "#008837"], 6: ["#762a83", "#af8dc3", "#e7d4e8", "#d9f0d3", "#7fbf7b", "#1b7837"], - 7: ["#762a83", "#af8dc3", "#e7d4e8", "#f7f7f7", "#d9f0d3", "#7fbf7b", "#1b7837"], - 8: ["#762a83", "#9970ab", "#c2a5cf", "#e7d4e8", "#d9f0d3", "#a6dba0", "#5aae61", "#1b7837"], - 9: ["#762a83", "#9970ab", "#c2a5cf", "#e7d4e8", "#f7f7f7", "#d9f0d3", "#a6dba0", "#5aae61", "#1b7837"], - 10: ["#40004b", "#762a83", "#9970ab", "#c2a5cf", "#e7d4e8", "#d9f0d3", "#a6dba0", "#5aae61", "#1b7837", "#00441b"], - 11: ["#40004b", "#762a83", "#9970ab", "#c2a5cf", "#e7d4e8", "#f7f7f7", "#d9f0d3", "#a6dba0", "#5aae61", "#1b7837", "#00441b"], + 7: [ + "#762a83", + "#af8dc3", + "#e7d4e8", + "#f7f7f7", + "#d9f0d3", + "#7fbf7b", + "#1b7837", + ], + 8: [ + "#762a83", + "#9970ab", + "#c2a5cf", + "#e7d4e8", + "#d9f0d3", + "#a6dba0", + "#5aae61", + "#1b7837", + ], + 9: [ + "#762a83", + "#9970ab", + "#c2a5cf", + "#e7d4e8", + "#f7f7f7", + "#d9f0d3", + "#a6dba0", + "#5aae61", + "#1b7837", + ], + 10: [ + "#40004b", + "#762a83", + "#9970ab", + "#c2a5cf", + "#e7d4e8", + "#d9f0d3", + "#a6dba0", + "#5aae61", + "#1b7837", + "#00441b", + ], + 11: [ + "#40004b", + "#762a83", + "#9970ab", + "#c2a5cf", + "#e7d4e8", + "#f7f7f7", + "#d9f0d3", + "#a6dba0", + "#5aae61", + "#1b7837", + "#00441b", + ], }, "PiYG": { 3: ["#e9a3c9", "#f7f7f7", "#a1d76a"], 4: ["#d01c8b", "#f1b6da", "#b8e186", "#4dac26"], 5: ["#d01c8b", "#f1b6da", "#f7f7f7", "#b8e186", "#4dac26"], 6: 
["#c51b7d", "#e9a3c9", "#fde0ef", "#e6f5d0", "#a1d76a", "#4d9221"], - 7: ["#c51b7d", "#e9a3c9", "#fde0ef", "#f7f7f7", "#e6f5d0", "#a1d76a", "#4d9221"], - 8: ["#c51b7d", "#de77ae", "#f1b6da", "#fde0ef", "#e6f5d0", "#b8e186", "#7fbc41", "#4d9221"], - 9: ["#c51b7d", "#de77ae", "#f1b6da", "#fde0ef", "#f7f7f7", "#e6f5d0", "#b8e186", "#7fbc41", "#4d9221"], - 10: ["#8e0152", "#c51b7d", "#de77ae", "#f1b6da", "#fde0ef", "#e6f5d0", "#b8e186", "#7fbc41", "#4d9221", "#276419"], - 11: ["#8e0152", "#c51b7d", "#de77ae", "#f1b6da", "#fde0ef", "#f7f7f7", "#e6f5d0", "#b8e186", "#7fbc41", "#4d9221", "#276419"], + 7: [ + "#c51b7d", + "#e9a3c9", + "#fde0ef", + "#f7f7f7", + "#e6f5d0", + "#a1d76a", + "#4d9221", + ], + 8: [ + "#c51b7d", + "#de77ae", + "#f1b6da", + "#fde0ef", + "#e6f5d0", + "#b8e186", + "#7fbc41", + "#4d9221", + ], + 9: [ + "#c51b7d", + "#de77ae", + "#f1b6da", + "#fde0ef", + "#f7f7f7", + "#e6f5d0", + "#b8e186", + "#7fbc41", + "#4d9221", + ], + 10: [ + "#8e0152", + "#c51b7d", + "#de77ae", + "#f1b6da", + "#fde0ef", + "#e6f5d0", + "#b8e186", + "#7fbc41", + "#4d9221", + "#276419", + ], + 11: [ + "#8e0152", + "#c51b7d", + "#de77ae", + "#f1b6da", + "#fde0ef", + "#f7f7f7", + "#e6f5d0", + "#b8e186", + "#7fbc41", + "#4d9221", + "#276419", + ], }, "RdBu": { 3: ["#ef8a62", "#f7f7f7", "#67a9cf"], 4: ["#ca0020", "#f4a582", "#92c5de", "#0571b0"], 5: ["#ca0020", "#f4a582", "#f7f7f7", "#92c5de", "#0571b0"], 6: ["#b2182b", "#ef8a62", "#fddbc7", "#d1e5f0", "#67a9cf", "#2166ac"], - 7: ["#b2182b", "#ef8a62", "#fddbc7", "#f7f7f7", "#d1e5f0", "#67a9cf", "#2166ac"], - 8: ["#b2182b", "#d6604d", "#f4a582", "#fddbc7", "#d1e5f0", "#92c5de", "#4393c3", "#2166ac"], - 9: ["#b2182b", "#d6604d", "#f4a582", "#fddbc7", "#f7f7f7", "#d1e5f0", "#92c5de", "#4393c3", "#2166ac"], - 10: ["#67001f", "#b2182b", "#d6604d", "#f4a582", "#fddbc7", "#d1e5f0", "#92c5de", "#4393c3", "#2166ac", "#053061"], - 11: ["#67001f", "#b2182b", "#d6604d", "#f4a582", "#fddbc7", "#f7f7f7", "#d1e5f0", "#92c5de", "#4393c3", "#2166ac", 
"#053061"], + 7: [ + "#b2182b", + "#ef8a62", + "#fddbc7", + "#f7f7f7", + "#d1e5f0", + "#67a9cf", + "#2166ac", + ], + 8: [ + "#b2182b", + "#d6604d", + "#f4a582", + "#fddbc7", + "#d1e5f0", + "#92c5de", + "#4393c3", + "#2166ac", + ], + 9: [ + "#b2182b", + "#d6604d", + "#f4a582", + "#fddbc7", + "#f7f7f7", + "#d1e5f0", + "#92c5de", + "#4393c3", + "#2166ac", + ], + 10: [ + "#67001f", + "#b2182b", + "#d6604d", + "#f4a582", + "#fddbc7", + "#d1e5f0", + "#92c5de", + "#4393c3", + "#2166ac", + "#053061", + ], + 11: [ + "#67001f", + "#b2182b", + "#d6604d", + "#f4a582", + "#fddbc7", + "#f7f7f7", + "#d1e5f0", + "#92c5de", + "#4393c3", + "#2166ac", + "#053061", + ], }, "RdGy": { 3: ["#ef8a62", "#ffffff", "#999999"], 4: ["#ca0020", "#f4a582", "#bababa", "#404040"], 5: ["#ca0020", "#f4a582", "#ffffff", "#bababa", "#404040"], 6: ["#b2182b", "#ef8a62", "#fddbc7", "#e0e0e0", "#999999", "#4d4d4d"], - 7: ["#b2182b", "#ef8a62", "#fddbc7", "#ffffff", "#e0e0e0", "#999999", "#4d4d4d"], - 8: ["#b2182b", "#d6604d", "#f4a582", "#fddbc7", "#e0e0e0", "#bababa", "#878787", "#4d4d4d"], - 9: ["#b2182b", "#d6604d", "#f4a582", "#fddbc7", "#ffffff", "#e0e0e0", "#bababa", "#878787", "#4d4d4d"], - 10: ["#67001f", "#b2182b", "#d6604d", "#f4a582", "#fddbc7", "#e0e0e0", "#bababa", "#878787", "#4d4d4d", "#1a1a1a"], - 11: ["#67001f", "#b2182b", "#d6604d", "#f4a582", "#fddbc7", "#ffffff", "#e0e0e0", "#bababa", "#878787", "#4d4d4d", "#1a1a1a"], + 7: [ + "#b2182b", + "#ef8a62", + "#fddbc7", + "#ffffff", + "#e0e0e0", + "#999999", + "#4d4d4d", + ], + 8: [ + "#b2182b", + "#d6604d", + "#f4a582", + "#fddbc7", + "#e0e0e0", + "#bababa", + "#878787", + "#4d4d4d", + ], + 9: [ + "#b2182b", + "#d6604d", + "#f4a582", + "#fddbc7", + "#ffffff", + "#e0e0e0", + "#bababa", + "#878787", + "#4d4d4d", + ], + 10: [ + "#67001f", + "#b2182b", + "#d6604d", + "#f4a582", + "#fddbc7", + "#e0e0e0", + "#bababa", + "#878787", + "#4d4d4d", + "#1a1a1a", + ], + 11: [ + "#67001f", + "#b2182b", + "#d6604d", + "#f4a582", + "#fddbc7", + "#ffffff", 
+ "#e0e0e0", + "#bababa", + "#878787", + "#4d4d4d", + "#1a1a1a", + ], }, "RdYlBu": { 3: ["#fc8d59", "#ffffbf", "#91bfdb"], 4: ["#d7191c", "#fdae61", "#abd9e9", "#2c7bb6"], 5: ["#d7191c", "#fdae61", "#ffffbf", "#abd9e9", "#2c7bb6"], 6: ["#d73027", "#fc8d59", "#fee090", "#e0f3f8", "#91bfdb", "#4575b4"], - 7: ["#d73027", "#fc8d59", "#fee090", "#ffffbf", "#e0f3f8", "#91bfdb", "#4575b4"], - 8: ["#d73027", "#f46d43", "#fdae61", "#fee090", "#e0f3f8", "#abd9e9", "#74add1", "#4575b4"], - 9: ["#d73027", "#f46d43", "#fdae61", "#fee090", "#ffffbf", "#e0f3f8", "#abd9e9", "#74add1", "#4575b4"], - 10: ["#a50026", "#d73027", "#f46d43", "#fdae61", "#fee090", "#e0f3f8", "#abd9e9", "#74add1", "#4575b4", "#313695"], - 11: ["#a50026", "#d73027", "#f46d43", "#fdae61", "#fee090", "#ffffbf", "#e0f3f8", "#abd9e9", "#74add1", "#4575b4", "#313695"], + 7: [ + "#d73027", + "#fc8d59", + "#fee090", + "#ffffbf", + "#e0f3f8", + "#91bfdb", + "#4575b4", + ], + 8: [ + "#d73027", + "#f46d43", + "#fdae61", + "#fee090", + "#e0f3f8", + "#abd9e9", + "#74add1", + "#4575b4", + ], + 9: [ + "#d73027", + "#f46d43", + "#fdae61", + "#fee090", + "#ffffbf", + "#e0f3f8", + "#abd9e9", + "#74add1", + "#4575b4", + ], + 10: [ + "#a50026", + "#d73027", + "#f46d43", + "#fdae61", + "#fee090", + "#e0f3f8", + "#abd9e9", + "#74add1", + "#4575b4", + "#313695", + ], + 11: [ + "#a50026", + "#d73027", + "#f46d43", + "#fdae61", + "#fee090", + "#ffffbf", + "#e0f3f8", + "#abd9e9", + "#74add1", + "#4575b4", + "#313695", + ], }, "Spectral": { 3: ["#fc8d59", "#ffffbf", "#99d594"], 4: ["#d7191c", "#fdae61", "#abdda4", "#2b83ba"], 5: ["#d7191c", "#fdae61", "#ffffbf", "#abdda4", "#2b83ba"], 6: ["#d53e4f", "#fc8d59", "#fee08b", "#e6f598", "#99d594", "#3288bd"], - 7: ["#d53e4f", "#fc8d59", "#fee08b", "#ffffbf", "#e6f598", "#99d594", "#3288bd"], - 8: ["#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#66c2a5", "#3288bd"], - 9: ["#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#ffffbf", "#e6f598", "#abdda4", "#66c2a5", 
"#3288bd"], - 10: ["#9e0142", "#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#66c2a5", "#3288bd", "#5e4fa2"], - 11: ["#9e0142", "#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#ffffbf", "#e6f598", "#abdda4", "#66c2a5", "#3288bd", "#5e4fa2"], + 7: [ + "#d53e4f", + "#fc8d59", + "#fee08b", + "#ffffbf", + "#e6f598", + "#99d594", + "#3288bd", + ], + 8: [ + "#d53e4f", + "#f46d43", + "#fdae61", + "#fee08b", + "#e6f598", + "#abdda4", + "#66c2a5", + "#3288bd", + ], + 9: [ + "#d53e4f", + "#f46d43", + "#fdae61", + "#fee08b", + "#ffffbf", + "#e6f598", + "#abdda4", + "#66c2a5", + "#3288bd", + ], + 10: [ + "#9e0142", + "#d53e4f", + "#f46d43", + "#fdae61", + "#fee08b", + "#e6f598", + "#abdda4", + "#66c2a5", + "#3288bd", + "#5e4fa2", + ], + 11: [ + "#9e0142", + "#d53e4f", + "#f46d43", + "#fdae61", + "#fee08b", + "#ffffbf", + "#e6f598", + "#abdda4", + "#66c2a5", + "#3288bd", + "#5e4fa2", + ], }, "RdYlGn": { 3: ["#fc8d59", "#ffffbf", "#91cf60"], 4: ["#d7191c", "#fdae61", "#a6d96a", "#1a9641"], 5: ["#d7191c", "#fdae61", "#ffffbf", "#a6d96a", "#1a9641"], 6: ["#d73027", "#fc8d59", "#fee08b", "#d9ef8b", "#91cf60", "#1a9850"], - 7: ["#d73027", "#fc8d59", "#fee08b", "#ffffbf", "#d9ef8b", "#91cf60", "#1a9850"], - 8: ["#d73027", "#f46d43", "#fdae61", "#fee08b", "#d9ef8b", "#a6d96a", "#66bd63", "#1a9850"], - 9: ["#d73027", "#f46d43", "#fdae61", "#fee08b", "#ffffbf", "#d9ef8b", "#a6d96a", "#66bd63", "#1a9850"], - 10: ["#a50026", "#d73027", "#f46d43", "#fdae61", "#fee08b", "#d9ef8b", "#a6d96a", "#66bd63", "#1a9850", "#006837"], - 11: ["#a50026", "#d73027", "#f46d43", "#fdae61", "#fee08b", "#ffffbf", "#d9ef8b", "#a6d96a", "#66bd63", "#1a9850", "#006837"], + 7: [ + "#d73027", + "#fc8d59", + "#fee08b", + "#ffffbf", + "#d9ef8b", + "#91cf60", + "#1a9850", + ], + 8: [ + "#d73027", + "#f46d43", + "#fdae61", + "#fee08b", + "#d9ef8b", + "#a6d96a", + "#66bd63", + "#1a9850", + ], + 9: [ + "#d73027", + "#f46d43", + "#fdae61", + "#fee08b", + "#ffffbf", + "#d9ef8b", + "#a6d96a", + 
"#66bd63", + "#1a9850", + ], + 10: [ + "#a50026", + "#d73027", + "#f46d43", + "#fdae61", + "#fee08b", + "#d9ef8b", + "#a6d96a", + "#66bd63", + "#1a9850", + "#006837", + ], + 11: [ + "#a50026", + "#d73027", + "#f46d43", + "#fdae61", + "#fee08b", + "#ffffbf", + "#d9ef8b", + "#a6d96a", + "#66bd63", + "#1a9850", + "#006837", + ], }, } @@ -376,6 +1356,7 @@ ## Palette Object ########################################################################## + class ColorPalette(list): """ A wrapper for functionality surrounding a list of colors, including a @@ -393,7 +1374,7 @@ def __init__(self, name_or_list): specify a palette name or a list of RGB or Hex values """ - if isinstance(name_or_list, string_types): + if isinstance(name_or_list, str): if name_or_list not in PALETTES: raise YellowbrickValueError( "'{}' is not a recognized palette!".format(name_or_list) @@ -408,6 +1389,7 @@ def __enter__(self): Open the context and assign the pallete to the mpl.rcParams """ from .rcmod import set_palette + self._orig_palette = color_palette() set_palette(self) return self @@ -417,6 +1399,7 @@ def __exit__(self, *args): Close the context and restore the original palette """ from .rcmod import set_palette + set_palette(self._orig_palette) def as_hex(self): @@ -446,11 +1429,14 @@ def plot(self, size=1): """ n = len(self) fig, ax = plt.subplots(1, 1, figsize=(n * size, size)) - ax.imshow(np.arange(n).reshape(1,n), - cmap=mpl.colors.ListedColormap(list(self)), - interpolation="nearest", aspect="auto") - ax.set_xticks(np.arange(n) - .5) - ax.set_yticks([-.5, .5]) + ax.imshow( + np.arange(n).reshape(1, n), + cmap=mpl.colors.ListedColormap(list(self)), + interpolation="nearest", + aspect="auto", + ) + ax.set_xticks(np.arange(n) - 0.5) + ax.set_yticks([-0.5, 0.5]) ax.set_xticklabels([]) ax.set_yticklabels([]) @@ -459,6 +1445,7 @@ def plot(self, size=1): ## Palette Functions ########################################################################## + def color_palette(palette=None, 
n_colors=None): """ Return a color palette object with color definition and handling. @@ -527,7 +1514,7 @@ def color_palette(palette=None, n_colors=None): if n_colors is None: n_colors = len(palette) - elif not isinstance(palette, string_types): + elif not isinstance(palette, str): if n_colors is None: n_colors = len(palette) @@ -576,9 +1563,7 @@ def set_color_codes(palette="accent"): """ if palette not in PALETTES: - raise YellowbrickValueError( - "'{}' is not a recognized palette!".format(palette) - ) + raise YellowbrickValueError("'{}' is not a recognized palette!".format(palette)) # Fetch the colors and adapt the length colors = PALETTES[palette] @@ -602,6 +1587,7 @@ def set_color_codes(palette="accent"): ## Sequence Functions ########################################################################## + def color_sequence(palette=None, n_colors=None): """ Return a `ListedColormap` object from a named sequence palette. Useful @@ -670,15 +1656,13 @@ def color_sequence(palette=None, n_colors=None): palette = palette or DEFAULT_SEQUENCE # Create a listed color map from the sequence - if not isinstance(palette, string_types): + if not isinstance(palette, str): return mplcol.ListedColormap(palette) # Otherwise perform a case-insensitive lookup sequences = {key.lower(): key for key in SEQUENCES.keys()} if palette.lower() not in sequences: - raise YellowbrickValueError( - "'{}' is not a recognized palette!".format(palette) - ) + raise YellowbrickValueError("'{}' is not a recognized palette!".format(palette)) # Collect the palette into the dictionary of lists. n_palettes = SEQUENCES[sequences[palette.lower()]] diff --git a/yellowbrick/style/rcmod.py b/yellowbrick/style/rcmod.py index 051e561e1..a41948d55 100644 --- a/yellowbrick/style/rcmod.py +++ b/yellowbrick/style/rcmod.py @@ -1,10 +1,10 @@ # yellowbrick.style.rcmod # Modifies the matplotlib rcParams in order to make yellowbrick appealing. 
# -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Thu Oct 06 08:45:38 2016 -0400 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: rcmod.py [c6aff34] benjamin@bengfort.com $ @@ -27,7 +27,8 @@ # Check to see if we have a slightly modern version of mpl from distutils.version import LooseVersion -mpl_ge_150 = LooseVersion(mpl.__version__) >= '1.5.0' + +mpl_ge_150 = LooseVersion(mpl.__version__) >= "1.5.0" from .. import _orig_rc_params @@ -38,10 +39,7 @@ ## Exports ########################################################################## -__all__ = [ - "set_aesthetic", "set_style", "set_palette", - "reset_defaults", "reset_orig", -] +__all__ = ["set_aesthetic", "set_style", "set_palette", "reset_defaults", "reset_orig"] ########################################################################## @@ -49,21 +47,16 @@ ########################################################################## _style_keys = ( - "axes.facecolor", "axes.edgecolor", "axes.grid", "axes.axisbelow", "axes.linewidth", "axes.labelcolor", - "figure.facecolor", - "grid.color", "grid.linestyle", - "text.color", - "xtick.color", "ytick.color", "xtick.direction", @@ -72,13 +65,10 @@ "ytick.major.size", "xtick.minor.size", "ytick.minor.size", - "legend.frameon", "legend.numpoints", "legend.scatterpoints", - "lines.solid_capstyle", - "image.cmap", "font.family", "font.sans-serif", @@ -86,27 +76,23 @@ _context_keys = ( "figure.figsize", - "font.size", "axes.labelsize", "axes.titlesize", "xtick.labelsize", "ytick.labelsize", "legend.fontsize", - "grid.linewidth", "lines.linewidth", "patch.linewidth", "lines.markersize", "lines.markeredgewidth", - "xtick.major.width", "ytick.major.width", "xtick.minor.width", "ytick.minor.width", - "xtick.major.pad", - "ytick.major.pad" + "ytick.major.pad", ) @@ -114,8 +100,10 @@ ## rcParams Keys 
########################################################################## -def set_aesthetic(palette="yellowbrick", font="sans-serif", font_scale=1, - color_codes=True, rc=None): + +def set_aesthetic( + palette="yellowbrick", font="sans-serif", font_scale=1, color_codes=True, rc=None +): """ Set aesthetic parameters in one step. @@ -162,6 +150,7 @@ def reset_orig(): ## Axes Styles ########################################################################## + def _axes_style(style=None, rc=None): """ Return a parameter dict for the aesthetic style of the plots. @@ -210,8 +199,12 @@ def _axes_style(style=None, rc=None): "axes.axisbelow": True, "image.cmap": "Greys", "font.family": ["sans-serif"], - "font.sans-serif": ["Arial", "Liberation Sans", - "Bitstream Vera Sans", "sans-serif"], + "font.sans-serif": [ + "Arial", + "Liberation Sans", + "Bitstream Vera Sans", + "sans-serif", + ], "grid.linestyle": "-", "axes.grid": True, "lines.solid_capstyle": "round", @@ -260,6 +253,7 @@ def set_style(style=None, rc=None): ## Context ########################################################################## + def _plotting_context(context=None, font_scale=1, rc=None): """ Return a parameter dict to scale elements of the figure. 
@@ -296,7 +290,6 @@ def _plotting_context(context=None, font_scale=1, rc=None): else: # Set up dictionary of default parameters base_context = { - "figure.figsize": np.array([8, 5.5]), "font.size": 12, "axes.labelsize": 11, @@ -304,29 +297,32 @@ def _plotting_context(context=None, font_scale=1, rc=None): "xtick.labelsize": 10, "ytick.labelsize": 10, "legend.fontsize": 10, - "grid.linewidth": 1, "lines.linewidth": 1.75, - "patch.linewidth": .3, + "patch.linewidth": 0.3, "lines.markersize": 7, "lines.markeredgewidth": 0, - "xtick.major.width": 1, "ytick.major.width": 1, - "xtick.minor.width": .5, - "ytick.minor.width": .5, - + "xtick.minor.width": 0.5, + "ytick.minor.width": 0.5, "xtick.major.pad": 7, "ytick.major.pad": 7, } # Scale all the parameters by the same factor depending on the context - scaling = dict(paper=.8, notebook=1, talk=1.3, poster=1.6)['notebook'] + scaling = dict(paper=0.8, notebook=1, talk=1.3, poster=1.6)["notebook"] context_dict = {k: v * scaling for k, v in base_context.items()} # Now independently scale the fonts - font_keys = ["axes.labelsize", "axes.titlesize", "legend.fontsize", - "xtick.labelsize", "ytick.labelsize", "font.size"] + font_keys = [ + "axes.labelsize", + "axes.titlesize", + "legend.fontsize", + "xtick.labelsize", + "ytick.labelsize", + "font.size", + ] font_dict = {k: context_dict[k] * font_scale for k in font_keys} context_dict.update(font_dict) @@ -393,17 +389,20 @@ def __call__(self, func): def wrapper(*args, **kwargs): with self: return func(*args, **kwargs) + return wrapper class _AxesStyle(_RCAesthetics): """Light wrapper on a dict to set style temporarily.""" + _keys = _style_keys _set = staticmethod(set_style) class _PlottingContext(_RCAesthetics): """Light wrapper on a dict to set context temporarily.""" + _keys = _context_keys _set = staticmethod(_set_context) @@ -412,6 +411,7 @@ class _PlottingContext(_RCAesthetics): ## Colors/Palettes ########################################################################## + def 
set_palette(palette, n_colors=None, color_codes=False): """ Set the matplotlib color cycle using a seaborn palette. @@ -432,8 +432,9 @@ def set_palette(palette, n_colors=None, color_codes=False): colors = color_palette(palette, n_colors) if mpl_ge_150: from cycler import cycler - cyl = cycler('color', colors) - mpl.rcParams['axes.prop_cycle'] = cyl + + cyl = cycler("color", colors) + mpl.rcParams["axes.prop_cycle"] = cyl else: mpl.rcParams["axes.color_cycle"] = list(colors) mpl.rcParams["patch.facecolor"] = colors[0] diff --git a/yellowbrick/style/utils.py b/yellowbrick/style/utils.py index 96c7fd82b..719fba0ec 100644 --- a/yellowbrick/style/utils.py +++ b/yellowbrick/style/utils.py @@ -1,3 +1,14 @@ +# yellowbrick.style.utils +# Utility functions for styles +# +# Author: Neal Humphrey +# Created: Wed Mar 22 12:39:35 2017 -0400 +# +# Copyright (C) 2017 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: utils.py [45268fc] humphrey.neal@gmail.com $ + """ Utility functions for styles """ @@ -30,19 +41,20 @@ def find_text_color(base_color, dark_color="black", light_color="white", coef_ch a list, user can enter 0 or 1 as list index. 0 is default. 
""" - #Coefficients: - # option 0: http://www.nbdtech.com/Blog/archive/2008/04/27/Calculating-the-Perceived-Brightness-of-a-Color.aspx - # option 1: http://stackoverflow.com/questions/596216/formula-to-determine-brightness-of-rgb-color - coef_options = [np.array((.241, .691, .068, 0)), - np.array((.299, .587, .114, 0)) - ] + # Coefficients: + # option 0: http://www.nbdtech.com/Blog/archive/2008/04/27/Calculating-the-Perceived-Brightness-of-a-Color.aspx + # option 1: http://stackoverflow.com/questions/596216/formula-to-determine-brightness-of-rgb-color + coef_options = [ + np.array((0.241, 0.691, 0.068, 0)), + np.array((0.299, 0.587, 0.114, 0)), + ] - coefs= coef_options[coef_choice] + coefs = coef_options[coef_choice] rgb = np.array(base_color) * 255 - brightness = np.sqrt(np.dot(coefs, rgb**2)) + brightness = np.sqrt(np.dot(coefs, rgb ** 2)) - #Threshold from option 0 link; determined by trial and error. - #base is light + # Threshold from option 0 link; determined by trial and error. + # base is light if brightness > 130: return dark_color return light_color diff --git a/yellowbrick/target/__init__.py b/yellowbrick/target/__init__.py index 9051abf6a..6d3193e1d 100644 --- a/yellowbrick/target/__init__.py +++ b/yellowbrick/target/__init__.py @@ -4,7 +4,7 @@ # Author: Benjamin Bengfort # Created: Thu Jul 19 08:57:05 2018 -0400 # -# ID: __init__.py [] benjamin@bengfort.com $ +# ID: __init__.py [d742c57] benjamin@bengfort.com $ """ Implements visualizers related to the dependent (target) variable, y. 
For @@ -22,3 +22,6 @@ from .class_balance import ClassBalance, class_balance from .binning import BalancedBinningReference, balanced_binning_reference from .feature_correlation import FeatureCorrelation + +# Alias the TargetType defined in yellowbrick.utils.target +from yellowbrick.utils.target import TargetType diff --git a/yellowbrick/target/base.py b/yellowbrick/target/base.py index 0ee0f6c7b..3a83ff120 100644 --- a/yellowbrick/target/base.py +++ b/yellowbrick/target/base.py @@ -1,31 +1,48 @@ # yellowbrick.target.base # Base classes for target visualizers # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Thu Jul 19 09:25:53 2018 -0400 # -# ID: base.py [] benjamin@bengfort.com $ +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: base.py [d742c57] benjamin@bengfort.com $ """ Base classes for target visualizers """ ########################################################################## -## Imports +# Imports ########################################################################## -from ..base import Visualizer +from yellowbrick.base import Visualizer ########################################################################## -## TargetVisualizer Base Class +# TargetVisualizer Base Class ########################################################################## + class TargetVisualizer(Visualizer): """ The base class for target visualizers, generic enough to support any computation on a single vector, y. This Visualizer is based on the LabelEncoder in sklearn.preprocessing, which only accepts a target y. + + Parameters + ---------- + ax : matplotlib Axes, default: None + The axis to plot the figure on. If None is passed in the current axes + will be used (or generated if required). + + fig : matplotlib Figure, default: None + The figure to plot the Visualizer on. If None is passed in the current + plot will be used (or generated if required). 
+ + kwargs : dict + Keyword arguments that are passed to the base class """ def fit(self, y): @@ -35,6 +52,4 @@ def fit(self, y): of pipelines, but must be used separately; similar to how the LabelEncoder is used. """ - raise NotImplementedError( - "target visualizers must implement a fit method" - ) + raise NotImplementedError("target visualizers must implement a fit method") diff --git a/yellowbrick/target/binning.py b/yellowbrick/target/binning.py index 05327bef4..3dcd60d05 100644 --- a/yellowbrick/target/binning.py +++ b/yellowbrick/target/binning.py @@ -1,13 +1,12 @@ - # yellowbrick.target.binning # Implementations of histogram with vertical lines to help with balanced binning. # -# Author: Juan L. Kehoe (juanluo2008@gmail.com) -# Author: Prema Damodaran Roman (pdamo24@gmail.com) +# Author: Juan L. Kehoe +# Author: Prema Damodaran Roman # Created: Tue Mar 13 19:50:54 2018 -0400 # -# Copyright (C) 2018 District Data Labs +# Copyright (C) 2018 The scikit-yb developers # For license information, see LICENSE.txt # # ID: binning.py @@ -17,18 +16,18 @@ """ ########################################################################## -## Imports +# Imports ########################################################################## -import matplotlib.pyplot as plt import numpy as np -from .base import TargetVisualizer +from yellowbrick.target.base import TargetVisualizer from yellowbrick.exceptions import YellowbrickValueError ########################################################################## -## Balanced Binning Reference +# Balanced Binning Reference ########################################################################## + class BalancedBinningReference(TargetVisualizer): """ BalancedBinningReference generates a histogram with vertical lines @@ -41,7 +40,7 @@ class BalancedBinningReference(TargetVisualizer): This is inherited from FeatureVisualizer and is defined within ``BalancedBinningReference``. 
- target : string, default: "Frequency" + target : string, default: "y" The name of the ``y`` variable bins : number of bins to generate the histogram, default: 4 @@ -49,10 +48,10 @@ class BalancedBinningReference(TargetVisualizer): kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. - + Attributes ---------- - bin_edges : binning reference values + bin_edges_ : binning reference values Examples -------- @@ -89,8 +88,10 @@ def draw(self, y, **kwargs): self.bin_edges_ = bin_edges self.ax.hist(y, bins=self.bins, color=kwargs.pop("color", "#6897bb"), **kwargs) - # add vetical line with binning reference values - plt.vlines(bin_edges,0,max(hist),colors=kwargs.pop("colors", "r")) + # add vertical line with binning reference values + self.ax.vlines(bin_edges, 0, max(hist), colors=kwargs.pop("colors", "r")) + + return self.ax def fit(self, y, **kwargs): """ @@ -107,26 +108,19 @@ def fit(self, y, **kwargs): """ - #throw an error if y has more than 1 column + # throw an error if y has more than 1 column if y.ndim > 1: - raise YellowbrickValueError("y needs to be an array or Series with one dimension") + raise YellowbrickValueError( + "y needs to be an array or Series with one dimension" + ) # Handle the target name if it is None. if self.target is None: - self.target = 'Frequency' + self.target = "y" self.draw(y) return self - - def poof(self, **kwargs): - """ - Creates the labels for the feature and target variables. - """ - - self.ax.set_xlabel(self.target) - self.finalize(**kwargs) - def finalize(self, **kwargs): """ Finalize executes any subclass-specific axes finalization steps. @@ -137,20 +131,21 @@ def finalize(self, **kwargs): kwargs: generic keyword arguments. 
""" - + self.ax.set_xlabel(self.target) for tk in self.ax.get_xticklabels(): tk.set_visible(True) - + for tk in self.ax.get_yticklabels(): tk.set_visible(True) - - + + ########################################################################## -## Quick Method +# Quick Method ########################################################################## - -def balanced_binning_reference(y, ax=None, target='Frequency', bins=4, **kwargs): - + + +def balanced_binning_reference(y, ax=None, target="y", bins=4, **kwargs): + """ BalancedBinningReference generates a histogram with vertical lines showing the recommended value point to bin your data so they can be evenly @@ -159,12 +154,12 @@ def balanced_binning_reference(y, ax=None, target='Frequency', bins=4, **kwargs) Parameters ---------- y : an array of one dimension or a pandas Series - + ax : matplotlib Axes, default: None This is inherited from FeatureVisualizer and is defined within ``BalancedBinningReference``. - target : string, default: "Frequency" + target : string, default: "y" The name of the ``y`` variable bins : number of bins to generate the histogram, default: 4 @@ -173,18 +168,17 @@ def balanced_binning_reference(y, ax=None, target='Frequency', bins=4, **kwargs) Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. + Returns + ------- + visualizer : BalancedBinningReference + Returns fitted visualizer """ # Initialize the visualizer visualizer = BalancedBinningReference(ax=ax, bins=bins, target=target, **kwargs) - + # Fit and poof the visualizer visualizer.fit(y) visualizer.poof() - - - - - - + return visualizer diff --git a/yellowbrick/target/class_balance.py b/yellowbrick/target/class_balance.py index 8da018634..4927bd3eb 100644 --- a/yellowbrick/target/class_balance.py +++ b/yellowbrick/target/class_balance.py @@ -1,11 +1,14 @@ # yellowbrick.classifier.class_balance # Class balance visualizer for showing per-class support. 
# -# Author: Rebecca Bilbro -# Author: Benjamin Bengfort +# Author: Rebecca Bilbro +# Author: Benjamin Bengfort # Author: Neal Humphrey # Created: Wed May 18 12:39:40 2016 -0400 # +# Copyright (C) 2016 The scikit-yb developers +# For license information, see LICENSE.txt +# # ID: class_balance.py [5388065] neal@nhumphrey.com $ """ @@ -13,14 +16,14 @@ """ ########################################################################## -## Imports +# Imports ########################################################################## import numpy as np -from .base import TargetVisualizer -from ..style.colors import resolve_colors -from ..exceptions import YellowbrickValueError +from yellowbrick.style.colors import resolve_colors +from yellowbrick.target.base import TargetVisualizer +from yellowbrick.exceptions import YellowbrickValueError from sklearn.utils.multiclass import unique_labels, type_of_target @@ -31,9 +34,10 @@ ########################################################################## -## Class Balance Chart +# Class Balance Chart ########################################################################## + class ClassBalance(TargetVisualizer): """ One of the biggest challenges for classification models is an imbalance of @@ -61,6 +65,12 @@ class ClassBalance(TargetVisualizer): LabelEncoder.classes\_ as this parameter. If not specified, the labels in the data will be used. + colors: list of strings + Specify colors for the barchart (will override colormap if both are provided). + + colormap : string or matplotlib cmap + Specify a colormap to color the classes. + kwargs: dict, optional Keyword arguments passed to the super class. Here, used to colorize the bars in the histogram. @@ -74,8 +84,8 @@ class ClassBalance(TargetVisualizer): A table representing the support of each class in the target. It is a vector when in balance mode, or a table with two rows in compare mode. 
- Example - ------- + Examples + -------- To simply observe the balance of classes in the target: >>> viz = ClassBalance().fit(y) @@ -89,8 +99,11 @@ class ClassBalance(TargetVisualizer): >>> viz.poof() """ - def __init__(self, ax=None, labels=None, **kwargs): + def __init__(self, ax=None, labels=None, colors=None, colormap=None, **kwargs): self.labels = labels + self.colors = colors + self.colormap = colormap + super(ClassBalance, self).__init__(ax, **kwargs) def fit(self, y_train, y_test=None): @@ -108,7 +121,7 @@ def fit(self, y_train, y_test=None): Parameters ---------- y_train : array-like - Array or list of shape (n,) that containes discrete data. + Array or list of shape (n,) that contains discrete data. y_test : array-like, optional Array or list of shape (m,) that contains discrete data. If @@ -117,10 +130,12 @@ def fit(self, y_train, y_test=None): # check to make sure that y_train is not a 2D array, e.g. X if y_train.ndim == 2: - raise YellowbrickValueError(( - "fit has changed to only require a 1D array, y " - "since version 0.9; please see the docs for more info" - )) + raise YellowbrickValueError( + ( + "fit has changed to only require a 1D array, y " + "since version 0.9; please see the docs for more info" + ) + ) # Check the target types for the y variables self._validate_target(y_train) @@ -133,27 +148,24 @@ def fit(self, y_train, y_test=None): # Validate the classes with the class names if self.labels is not None: if len(self.labels) != len(self.classes_): - raise YellowbrickValueError(( - "discovered {} classes in the data, does not match " - "the {} labels specified." - ).format(len(self.classes_), len(self.labels))) + raise YellowbrickValueError( + ( + "discovered {} classes in the data, does not match " + "the {} labels specified." 
+ ).format(len(self.classes_), len(self.labels)) + ) # Determine if we're in compare or balance mode self._mode = BALANCE if y_test is None else COMPARE # Compute the support values if self._mode == BALANCE: - self.support_ = np.array([ - (y_train == idx).sum() for idx in self.classes_ - ]) + self.support_ = np.array([(y_train == idx).sum() for idx in self.classes_]) else: - self.support_ = np.array([ - [ - (y == idx).sum() for idx in self.classes_ - ] - for y in targets - ]) + self.support_ = np.array( + [[(y == idx).sum() for idx in self.classes_] for y in targets] + ) # Draw the bar chart self.draw() @@ -166,12 +178,17 @@ def draw(self): Renders the class balance chart on the specified axes from support. """ # Number of colors is either number of classes or 2 - colors = resolve_colors(len(self.support_)) + colors = resolve_colors( + len(self.support_), colormap=self.colormap, colors=self.colors + ) if self._mode == BALANCE: self.ax.bar( - np.arange(len(self.support_)), self.support_, - color=colors, align='center', width=0.5 + np.arange(len(self.support_)), + self.support_, + color=colors, + align="center", + width=0.5, ) # Compare mode @@ -185,8 +202,7 @@ def draw(self): index = index + bar_width self.ax.bar( - index, support, bar_width, - color=colors[idx], label=labels[idx] + index, support, bar_width, color=colors[idx], label=labels[idx] ) return self.ax @@ -202,22 +218,20 @@ def finalize(self, **kwargs): """ # Set the title - self.set_title( - 'Class Balance for {:,} Instances'.format(self.support_.sum()) - ) + self.set_title("Class Balance for {:,} Instances".format(self.support_.sum())) # Set the x ticks with the class names or labels if specified labels = self.labels if self.labels is not None else self.classes_ xticks = np.arange(len(labels)) if self._mode == COMPARE: - xticks = xticks + (0.35/2) + xticks = xticks + (0.35 / 2) self.ax.set_xticks(xticks) self.ax.set_xticklabels(labels) # Compute the ceiling for the y limit cmax = self.support_.max() - 
self.ax.set_ylim(0, cmax + cmax* 0.1) + self.ax.set_ylim(0, cmax + cmax * 0.1) self.ax.set_ylabel("support") # Remove the vertical grid @@ -237,16 +251,21 @@ def _validate_target(self, y): y_type = type_of_target(y) if y_type not in ("binary", "multiclass"): - raise YellowbrickValueError(( - "'{}' target type not supported, only binary and multiclass" - ).format(y_type)) + raise YellowbrickValueError( + ("'{}' target type not supported, only binary and multiclass").format( + y_type + ) + ) ########################################################################## -## Quick Method +# Quick Method ########################################################################## -def class_balance(y_train, y_test=None, ax=None, labels=None, **kwargs): + +def class_balance( + y_train, y_test=None, ax=None, labels=None, color=None, colormap=None, **kwargs +): """Quick method: One of the biggest challenges for classification models is an imbalance of @@ -282,21 +301,27 @@ def class_balance(y_train, y_test=None, ax=None, labels=None, **kwargs): LabelEncoder.classes\_ as this parameter. If not specified, the labels in the data will be used. + colors: list of strings + Specify colors for the barchart (will override colormap if both are provided). + + colormap : string or matplotlib cmap + Specify a colormap to color the classes. + kwargs: dict, optional Keyword arguments passed to the super class. Here, used to colorize the bars in the histogram. Returns ------- - ax : matplotlib axes - Returns the axes that the class balance plot was drawn on. 
+ visualizer : ClassBalance + Returns the fitted visualizer """ # Instantiate the visualizer - visualizer = ClassBalance(ax=ax, labels=labels, **kwargs) + visualizer = ClassBalance(ax=ax, labels=labels, color=None, colormap=None, **kwargs) # Fit and transform the visualizer (calls draw) visualizer.fit(y_train, y_test) visualizer.finalize() - # Return the axes object on the visualizer - return visualizer.ax + # Return the visualizer + return visualizer diff --git a/yellowbrick/target/feature_correlation.py b/yellowbrick/target/feature_correlation.py index ffc5579d0..65a0b6cda 100644 --- a/yellowbrick/target/feature_correlation.py +++ b/yellowbrick/target/feature_correlation.py @@ -1,23 +1,26 @@ # yellowbrick.classifier.feature_correlation # Feature correlation to dependent variable visualizer. # -# Author Zijie (ZJ) Poh +# Author Zijie (ZJ) Poh # Created: Wed Jul 29 15:30:40 2018 -0700 # -# ID: feature_correlation.py [] poh.zijie@gmail.com $ +# Copyright (C) 2018 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: feature_correlation.py [33aec16] 8103276+zjpoh@users.noreply.github.com $ """ Feature Correlation to Dependent Variable Visualizer. 
""" ########################################################################## -## Imports +# Imports ########################################################################## import numpy as np -from yellowbrick.target.base import TargetVisualizer from yellowbrick.utils import is_dataframe +from yellowbrick.target.base import TargetVisualizer from yellowbrick.exceptions import YellowbrickValueError, YellowbrickWarning from sklearn.feature_selection import mutual_info_classif @@ -26,24 +29,25 @@ from scipy.stats import pearsonr ########################################################################## -## Supported Correlation Computations +# Supported Correlation Computations ########################################################################## CORRELATION_LABELS = { - 'pearson': 'Pearson Correlation', - 'mutual_info-regression': 'Mutual Information', - 'mutual_info-classification': 'Mutual Information' + "pearson": "Pearson Correlation", + "mutual_info-regression": "Mutual Information", + "mutual_info-classification": "Mutual Information", } CORRELATION_METHODS = { - 'mutual_info-regression': mutual_info_regression, - 'mutual_info-classification': mutual_info_classif + "mutual_info-regression": mutual_info_regression, + "mutual_info-classification": mutual_info_classif, } ########################################################################## -## Class Feature Correlation +# Class Feature Correlation ########################################################################## + class FeatureCorrelation(TargetVisualizer): """ Displays the correlation between features and dependent variables. @@ -85,6 +89,9 @@ class FeatureCorrelation(TargetVisualizer): Must have labels or the fitted data is a DataFrame with column names. If feature_index is provided, feature_names will be ignored. 
+ color: string + Specify color for barchart + kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. @@ -105,9 +112,17 @@ class FeatureCorrelation(TargetVisualizer): >>> viz.poof() """ - def __init__(self, ax=None, method='pearson', - labels=None, sort=False, feature_index=None, - feature_names=None, **kwargs): + def __init__( + self, + ax=None, + method="pearson", + labels=None, + sort=False, + feature_index=None, + feature_names=None, + color=None, + **kwargs + ): super(FeatureCorrelation, self).__init__(ax=None, **kwargs) self.correlation_labels = CORRELATION_LABELS @@ -115,18 +130,19 @@ def __init__(self, ax=None, method='pearson', if method not in self.correlation_labels: raise YellowbrickValueError( - 'Method {} not implement; choose from {}'.format( + "Method {} not implement; choose from {}".format( method, ", ".join(self.correlation_labels) ) ) # Parameters self.set_params( + sort=sort, + color=color, method=method, labels=labels, - sort=sort, feature_index=feature_index, - feature_names=feature_names + feature_names=feature_names, ) def fit(self, X, y, **kwargs): @@ -162,7 +178,7 @@ def fit(self, X, y, **kwargs): else: self.scores_ = np.array( self.correlation_methods[self.method](X, y, **kwargs) - ) + ) # If feature indices are given, plot only the given features if self.feature_index: @@ -184,7 +200,7 @@ def draw(self): """ pos = np.arange(self.scores_.shape[0]) + 0.5 - self.ax.barh(pos, self.scores_) + self.ax.barh(pos, self.scores_, color=self.color) # Set the labels for the bars self.ax.set_yticks(pos) @@ -196,11 +212,11 @@ def finalize(self): """ Finalize the drawing setting labels and title. 
""" - self.set_title('Features correlation with dependent variable') + self.set_title("Features correlation with dependent variable") self.ax.set_xlabel(self.correlation_labels[self.method]) - self.ax.grid(False, axis='y') + self.ax.grid(False, axis="y") def _create_labels_for_features(self, X): """ @@ -230,33 +246,38 @@ def _select_features_to_plot(self, X): if self.feature_index: if self.feature_names: raise YellowbrickWarning( - 'Both feature_index and feature_names ' - 'are specified. feature_names is ignored' + "Both feature_index and feature_names " + "are specified. feature_names is ignored" ) - if (min(self.feature_index) < 0 - or max(self.feature_index) >= X.shape[1]): - raise YellowbrickValueError('Feature index is out of range') + if min(self.feature_index) < 0 or max(self.feature_index) >= X.shape[1]: + raise YellowbrickValueError("Feature index is out of range") elif self.feature_names: self.feature_index = [] features_list = self.features_.tolist() for feature_name in self.feature_names: try: - self.feature_index.append( - features_list.index(feature_name) - ) + self.feature_index.append(features_list.index(feature_name)) except ValueError: - raise YellowbrickValueError( - '{} not in labels'.format(feature_name) - ) + raise YellowbrickValueError("{} not in labels".format(feature_name)) ########################################################################## -## Quick Method +# Quick Method ########################################################################## -def feature_correlation(X, y, ax=None, method='pearson', - labels=None, sort=False, feature_index=None, - feature_names=None, **kwargs): + +def feature_correlation( + X, + y, + ax=None, + method="pearson", + labels=None, + sort=False, + feature_index=None, + feature_names=None, + color=None, + **kwargs +): """ Displays the correlation between features and dependent variables. 
@@ -304,23 +325,34 @@ def feature_correlation(X, y, ax=None, method='pearson', Must have labels or the fitted data is a DataFrame with column names. If feature_index is provided, feature_names will be ignored. + color: string + Specify color for barchart + kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. Returns ------- - ax : matplotlib axes - Returns the axes that the parallel coordinates were drawn on. + visualizer : FeatureCorrelation + Returns the fitted visualizer. """ # Instantiate the visualizer - viz = FeatureCorrelation(ax, method, labels, sort, - feature_index, feature_names, **kwargs) + viz = FeatureCorrelation( + ax=ax, + method=method, + labels=labels, + sort=sort, + color=color, + feature_index=feature_index, + feature_names=feature_names, + **kwargs + ) # Fit and transform the visualizer (calls draw) viz.fit(X, y, **kwargs) viz.finalize() - # Return the axes object on the visualizer - return viz.ax + # Return the visualizer + return viz diff --git a/yellowbrick/text/__init__.py b/yellowbrick/text/__init__.py index 3936eba84..3a69f3f40 100644 --- a/yellowbrick/text/__init__.py +++ b/yellowbrick/text/__init__.py @@ -1,10 +1,10 @@ # yellowbrick.text # Visualizers for text feature analysis and diagnostics. 
# -# Author: Rebecca Bilbro +# Author: Rebecca Bilbro # Created: 2017-01-20 14:42 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # # ID: __init__.py [75d9b20] rebecca.bilbro@bytecubed.com $ @@ -18,6 +18,7 @@ ########################################################################## from .tsne import TSNEVisualizer, tsne +from .umap_vis import UMAPVisualizer, umap from .freqdist import FreqDistVisualizer, freqdist from .postag import PosTagVisualizer from .dispersion import DispersionPlot, dispersion diff --git a/yellowbrick/text/base.py b/yellowbrick/text/base.py index 7a82c01d5..8306b8085 100644 --- a/yellowbrick/text/base.py +++ b/yellowbrick/text/base.py @@ -1,10 +1,10 @@ # yellowbrick.text.base # Base classes for text feature visualizers and feature selection tools. # -# Author: Rebecca Bilbro -# Created: 2017-01-20 14:44 +# Author: Rebecca Bilbro +# Created: Sat Jan 21 09:37:01 2017 -0500 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # # ID: base.py [75d9b20] rebecca.bilbro@bytecubed.com $ @@ -25,6 +25,7 @@ ## Text Visualizers ########################################################################## + class TextVisualizer(Visualizer, TransformerMixin): """ Base class for text feature visualization to investigate documents @@ -39,7 +40,7 @@ class TextVisualizer(Visualizer, TransformerMixin): Accepts as input a DataFrame or Numpy array. """ - def __init__(self, ax=None, **kwargs): + def __init__(self, ax=None, fig=None, **kwargs): """ These parameters can be influenced later on in the visualization process, but can and should be set as early as possible. @@ -49,11 +50,15 @@ def __init__(self, ax=None, **kwargs): ax : axes the axis to plot the figure on + fig : matplotlib Figure, default: None + The figure to plot the Visualizer on. 
If None is passed in the current + plot will be used (or generated if required). + kwargs : dict Pass generic arguments to the drawing method """ - super(TextVisualizer, self).__init__(ax=ax, **kwargs) + super(TextVisualizer, self).__init__(ax=ax, fig=fig, **kwargs) def fit(self, X, y=None, **fit_params): """ diff --git a/yellowbrick/text/dispersion.py b/yellowbrick/text/dispersion.py index 55d945636..f6eef357b 100644 --- a/yellowbrick/text/dispersion.py +++ b/yellowbrick/text/dispersion.py @@ -2,12 +2,12 @@ # Implementations of lexical dispersions for text visualization. # # Author: Larry Gray -# Created: 2018-06-21 10:06 +# Created: Fri Jun 22 15:40:49 2018 -0400 # -# Copyright (C) 2018 District Data Labs +# Copyright (C) 2018 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: dispersion.py [] lwgray@gmail.com $ +# ID: dispersion.py [3822dd6] lwgray@gmail.com $ """ Implementation of lexical dispersion for text visualization @@ -31,6 +31,7 @@ ## Dispersion Plot Visualizer ########################################################################## + class DispersionPlot(TextVisualizer): """ DispersionPlotVisualizer allows for visualization of the lexical dispersion @@ -42,7 +43,7 @@ class DispersionPlot(TextVisualizer): ---------- target_words : list A list of target words whose dispersion across a corpus passed at fit - will be visualized. + will be visualized. ax : matplotlib axes, default: None The axes to plot the figure on. @@ -58,7 +59,7 @@ class DispersionPlot(TextVisualizer): Qualitative colormap for discrete target ignore_case : boolean, default: False - Specify whether input will be case-sensitive. + Specify whether input will be case-sensitive. annotate_docs : boolean, default: False Specify whether document boundaries will be displayed. 
Vertical lines @@ -74,8 +75,17 @@ class DispersionPlot(TextVisualizer): # NOTE: cannot be np.nan NULL_CLASS = None - def __init__(self, target_words, ax=None, colors=None, ignore_case=False, - annotate_docs=False, labels=None, colormap=None, **kwargs): + def __init__( + self, + target_words, + ax=None, + colors=None, + ignore_case=False, + annotate_docs=False, + labels=None, + colormap=None, + **kwargs + ): super(DispersionPlot, self).__init__(ax=ax, **kwargs) self.labels = labels @@ -90,7 +100,6 @@ def _compute_dispersion(self, text, y): self.boundaries_ = [] offset = 0 - if y is None: y = itertools.repeat(None) @@ -112,13 +121,14 @@ def _compute_dispersion(self, text, y): def _check_missing_words(self, points): for index in range(len(self.indexed_words_)): - if index in points[:,1]: + if index in points[:, 1]: pass else: - raise YellowbrickValueError(( - "The indexed word '{}' is not found in " - "this corpus" - ).format(self.indexed_words_[index])) + raise YellowbrickValueError( + ("The indexed word '{}' is not found in " "this corpus").format( + self.indexed_words_[index] + ) + ) def fit(self, X, y=None, **kwargs): """ @@ -129,7 +139,7 @@ def fit(self, X, y=None, **kwargs): ---------- X : list or generator Should be provided as a list of documents or a generator - that yields a list of documents that contain a list of + that yields a list of documents that contain a list of words in the order they appear in the document. 
y : ndarray or Series of length n @@ -162,13 +172,12 @@ def fit(self, X, y=None, **kwargs): try: points_target = np.stack(self._compute_dispersion(X, y)) except ValueError: - raise YellowbrickValueError(( - "No indexed words were found in the corpus" - )) - points = np.stack(zip(points_target[:,0].astype(int), - points_target[:,1].astype(int))) + raise YellowbrickValueError(("No indexed words were found in the corpus")) + points = np.stack( + zip(points_target[:, 0].astype(int), points_target[:, 1].astype(int)) + ) - self.target = points_target[:,2] + self.target = points_target[:, 2] self._check_missing_words(points) @@ -187,14 +196,17 @@ def draw(self, points, target=None, **kwargs): # Resolve the labels with the classes labels = self.labels if self.labels is not None else self.classes_ if len(labels) != len(self.classes_): - raise YellowbrickValueError(( - "number of supplied labels ({}) does not " - "match the number of classes ({})" - ).format(len(labels), len(self.classes_))) + raise YellowbrickValueError( + ( + "number of supplied labels ({}) does not " + "match the number of classes ({})" + ).format(len(labels), len(self.classes_)) + ) # Create the color mapping for the labels. 
color_values = resolve_colors( - n_colors=len(labels), colormap=self.colormap, colors=self.color) + n_colors=len(labels), colormap=self.colormap, colors=self.color + ) colors = dict(zip(labels, color_values)) # Transform labels into a map of class to label @@ -203,28 +215,36 @@ def draw(self, points, target=None, **kwargs): # Define boundaries with a vertical line if self.annotate_docs: for xcoords in self.boundaries_: - self.ax.axvline(x=xcoords, color='lightgray', linestyle='dashed') + self.ax.axvline(x=xcoords, color="lightgray", linestyle="dashed") - series = defaultdict(lambda: {'x':[], 'y':[]}) + series = defaultdict(lambda: {"x": [], "y": []}) if target is not None: for point, t in zip(points, target): label = labels[t] - series[label]['x'].append(point[0]) - series[label]['y'].append(point[1]) + series[label]["x"].append(point[0]) + series[label]["y"].append(point[1]) else: label = self.classes_[0] for x, y in points: - series[label]['x'].append(x) - series[label]['y'].append(y) + series[label]["x"].append(x) + series[label]["y"].append(y) for label, points in series.items(): - self.ax.scatter(points['x'], points['y'], marker='|', - c=colors[label], zorder=100, label=label) + self.ax.scatter( + points["x"], + points["y"], + marker="|", + c=colors[label], + zorder=100, + label=label, + ) self.ax.set_yticks(list(range(len(self.indexed_words_)))) self.ax.set_yticklabels(self.indexed_words_) + return self.ax + def finalize(self, **kwargs): """ The finalize method executes any subclass-specific axes @@ -243,17 +263,29 @@ def finalize(self, **kwargs): if not all(self.classes_ == np.array([self.NULL_CLASS])): box = self.ax.get_position() self.ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) - self.ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) + self.ax.legend(loc="center left", bbox_to_anchor=(1, 0.5)) + ########################################################################## ## Quick Method 
########################################################################## -def dispersion(words, corpus, y=None, ax=None, colors=None, colormap=None, - labels=None, annotate_docs=False, ignore_case=False, **kwargs): + +def dispersion( + words, + corpus, + y=None, + ax=None, + colors=None, + colormap=None, + labels=None, + annotate_docs=False, + ignore_case=False, + **kwargs +): """ Displays lexical dispersion plot for words in a corpus - This helper function is a quick wrapper to utilize the DisperstionPlot + This helper function is a quick wrapper to utilize the DispersionPlot Visualizer for one-off analysis Parameters @@ -289,26 +321,32 @@ def dispersion(words, corpus, y=None, ax=None, colors=None, colormap=None, are positioned at the end of each document. ignore_case : boolean, default: False - Specify whether input will be case-sensitive. + Specify whether input will be case-sensitive. kwargs : dict Pass any additional keyword arguments to the super class. Returns ------- - ax: matplotlib axes - Returns the axes that the plot was drawn on + viz: DispersionPlot + Returns the fitted, finalized visualizer """ # Instantiate the visualizer visualizer = DispersionPlot( - words, ax=ax, colors=colors, colormap=colormap, - ignore_case=ignore_case, labels=labels, - annotate_docs=annotate_docs, **kwargs + words, + ax=ax, + colors=colors, + colormap=colormap, + ignore_case=ignore_case, + labels=labels, + annotate_docs=annotate_docs, + **kwargs ) # Fit and transform the visualizer (calls draw) visualizer.fit(corpus, y, **kwargs) + visualizer.finalize() - # Return the axes object on the visualizer - return visualizer.ax + # Return the visualizer object + return visualizer diff --git a/yellowbrick/text/freqdist.py b/yellowbrick/text/freqdist.py index d1aa633da..186ea497b 100644 --- a/yellowbrick/text/freqdist.py +++ b/yellowbrick/text/freqdist.py @@ -1,10 +1,10 @@ # yellowbrick.text.freqdist # Implementations of frequency distributions for text visualization. 
# -# Author: Rebecca Bilbro -# Created: 2017-02-08 10:06 +# Author: Rebecca Bilbro +# Created: Mon Feb 20 12:38:20 2017 -0500 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # # ID: freqdist.py [67b2740] rebecca.bilbro@bytecubed.com $ @@ -29,7 +29,8 @@ ## Quick Method ########################################################################## -def freqdist(X, y=None, ax=None, color=None, N=50, **kwargs): + +def freqdist(X, y=None, ax=None, n=50, orient="h", color=None, **kwargs): """Displays frequency distribution plot for text. This helper function is a quick wrapper to utilize the FreqDist @@ -45,34 +46,36 @@ def freqdist(X, y=None, ax=None, color=None, N=50, **kwargs): y: ndarray or Series of length n An array or series of target or class values - ax: matplotlib axes + ax : matplotlib axes, default: None The axes to plot the figure on. - color: string - Specify color for barchart - - N: integer + n: integer, default: 50 Top N tokens to be plotted. + orient : 'h' or 'v', default: 'h' + Specifies a horizontal or vertical bar chart. + + color : string + Specify color for bars + kwargs: dict Keyword arguments passed to the super class. Returns ------- - ax: matplotlib axes - Returns the axes that the plot was drawn on. 
+ visualizer: FreqDistVisualizer + Returns the fitted, finalized visualizer """ # Instantiate the visualizer - visualizer = FreqDistVisualizer( - ax, X, color, **kwargs - ) + visualizer = FreqDistVisualizer(ax=ax, n=n, orient=orient, color=color, **kwargs) # Fit and transform the visualizer (calls draw) visualizer.fit(X, y, **kwargs) visualizer.transform(X) + visualizer.finalize() - # Return the axes object on the visualizer - return visualizer.ax + # Return the visualizer object + return visualizer class FrequencyVisualizer(TextVisualizer): @@ -101,7 +104,7 @@ class FrequencyVisualizer(TextVisualizer): orient : 'h' or 'v', default: 'h' Specifies a horizontal or vertical bar chart. - color : list or tuple of colors + color : string Specify color for bars kwargs : dict @@ -111,15 +114,13 @@ class FrequencyVisualizer(TextVisualizer): process, but can and should be set as early as possible. """ - def __init__(self, features, ax=None, n=50, orient='h', color=None, **kwargs): + def __init__(self, features, ax=None, n=50, orient="h", color=None, **kwargs): super(FreqDistVisualizer, self).__init__(ax=ax, **kwargs) # Check that the orient is correct orient = orient.lower().strip() - if orient not in {'h', 'v'}: - raise YellowbrickValueError( - "Orientation must be 'h' or 'v'" - ) + if orient not in {"h", "v"}: + raise YellowbrickValueError("Orientation must be 'h' or 'v'") # Visualizer parameters self.N = n @@ -188,7 +189,7 @@ def fit(self, X, y=None): # Frequency distribution of entire corpus. 
self.freqdist_ = self.count(X) - self.sorted_ = self.freqdist_.argsort()[::-1] # Descending order + self.sorted_ = self.freqdist_.argsort()[::-1] # Descending order # Compute the number of words, vocab, and hapaxes self.vocab_ = self.freqdist_.shape[0] @@ -210,26 +211,24 @@ def draw(self, **kwargs): """ # Prepare the data - bins = np.arange(self.N) - words = [self.features[i] for i in self.sorted_[:self.N]] + bins = np.arange(self.N) + words = [self.features[i] for i in self.sorted_[: self.N]] freqs = {} # Set up the bar plots if self.conditional_freqdist_: - for label, values in sorted(self.conditional_freqdist_.items(), key=itemgetter(0)): - freqs[label] = [ - values[i] for i in self.sorted_[:self.N] - ] + for label, values in sorted( + self.conditional_freqdist_.items(), key=itemgetter(0) + ): + freqs[label] = [values[i] for i in self.sorted_[: self.N]] else: - freqs['corpus'] = [ - self.freqdist_[i] for i in self.sorted_[:self.N] - ] + freqs["corpus"] = [self.freqdist_[i] for i in self.sorted_[: self.N]] # Draw a horizontal barplot - if self.orient == 'h': + if self.orient == "h": # Add the barchart, stacking if necessary for label, freq in freqs.items(): - self.ax.barh(bins, freq, label=label, align='center') + self.ax.barh(bins, freq, label=label, color=self.color, align="center") # Set the y ticks to the words self.ax.set_yticks(bins) @@ -243,10 +242,10 @@ def draw(self, **kwargs): self.ax.xaxis.grid(True) # Draw a vertical barplot - elif self.orient == 'v': + elif self.orient == "v": # Add the barchart, stacking if necessary for label, freq in freqs.items(): - self.ax.bar(bins, freq, label=label, align='edge') + self.ax.bar(bins, freq, label=label, color=self.color, align="edge") # Set the y ticks to the words self.ax.set_xticks(bins) @@ -258,9 +257,9 @@ def draw(self, **kwargs): # Unknown state else: - raise YellowbrickValueError( - "Orientation must be 'h' or 'v'" - ) + raise YellowbrickValueError("Orientation must be 'h' or 'v'") + + return self.ax def 
finalize(self, **kwargs): """ @@ -273,21 +272,25 @@ def finalize(self, **kwargs): """ # Set the title - self.set_title( - 'Frequency Distribution of Top {} tokens'.format(self.N) - ) + self.set_title("Frequency Distribution of Top {} tokens".format(self.N)) # Create the vocab, count, and hapaxes labels infolabel = "vocab: {:,}\nwords: {:,}\nhapax: {:,}".format( self.vocab_, self.words_, self.hapaxes_ ) - self.ax.text(0.68, 0.97, infolabel, transform=self.ax.transAxes, - fontsize=9, verticalalignment='top', - bbox={'boxstyle':'round', 'facecolor':'white', 'alpha':.8}) + self.ax.text( + 0.68, + 0.97, + infolabel, + transform=self.ax.transAxes, + fontsize=9, + verticalalignment="top", + bbox={"boxstyle": "round", "facecolor": "white", "alpha": 0.8}, + ) # Set the legend and the grid - self.ax.legend(loc='upper right', frameon=True) + self.ax.legend(loc="upper right", frameon=True) # Backwards compatibility alias diff --git a/yellowbrick/text/postag.py b/yellowbrick/text/postag.py index 034eea138..b1119a068 100644 --- a/yellowbrick/text/postag.py +++ b/yellowbrick/text/postag.py @@ -1,10 +1,10 @@ # yellowbrick.text.postag # Implementation of part-of-speech visualization for text. 
# -# Author: Rebecca Bilbro -# Created: 2017-03-05 14:44 +# Author: Rebecca Bilbro +# Created: Sun Mar 5 18:07:06 2017 -0500 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The scikit-yb developers # For license information, see LICENSE.txt # # ID: postag.py [849f5a8] rebecca.bilbro@bytecubed.com $ @@ -19,126 +19,482 @@ # Imports ########################################################################## +import numpy as np + +from yellowbrick.draw import bar_stack from yellowbrick.text.base import TextVisualizer +from yellowbrick.style.colors import resolve_colors +from yellowbrick.exceptions import YellowbrickValueError + +########################################################################## +# Part-of-speech tag punctuation and labels +########################################################################## + +# NOTE: Penn Treebank converts all sentence closers (!,?,;) to periods +PUNCT_TAGS = [".", ":", ",", "``", "''", "(", ")", "#", "$"] + +TAGSET_NAMES = {"penn_treebank": "Penn Treebank", "universal": "Universal Dependencies"} + +PENN_TAGS = [ + "noun", + "verb", + "adjective", + "adverb", + "preposition", + "determiner", + "pronoun", + "conjunction", + "infinitive", + "wh- word", + "modal", + "possessive", + "existential", + "punctuation", + "digit", + "non-English", + "interjection", + "list", + "symbol", + "other", +] + +UNIVERSAL_TAGS = [ + "noun", + "verb", + "adjective", + "adverb", + "adposition", + "determiner", + "pronoun", + "conjunction", + "infinitive", + "punctuation", + "number", + "interjection", + "symbol", + "other", +] + ########################################################################## # PosTagVisualizer ########################################################################## + class PosTagVisualizer(TextVisualizer): """ - A part-of-speech tag visualizer colorizes text to enable - the user to visualize the proportions of nouns, verbs, etc. 
- and to use this information to make decisions about - part-of-speech tagging, text normalization (e.g. stemming - vs lemmatization) and vectorization. + Parts of speech (e.g. verbs, nouns, prepositions, adjectives) + indicate how a word is functioning within the context of a sentence. + In English as in many other languages, a single word can function in + multiple ways. Part-of-speech tagging lets us encode information not + only about a word’s definition, but also its use in context (for + example the words “ship” and “shop” can be either a verb or a noun, + depending on the context). + + The PosTagVisualizer creates a bar chart to visualize the relative + proportions of different parts-of-speech in a corpus. + + Note that the PosTagVisualizer requires documents to already be + part-of-speech tagged; the visualizer expects the corpus to come in + the form of a list of (document) lists of (sentence) lists of + (token, tag) tuples. Parameters ---------- + ax : matplotlib axes + The axes to plot the figure on. + + tagset: string + The tagset that was used to perform part-of-speech tagging. + Either "penn_treebank" or "universal", defaults to "penn_treebank". + Use "universal" if corpus has been tagged using SpaCy. + + colors : list or tuple of strings + Specify the colors for each individual part-of-speech. Will override + colormap if both are provided. + + colormap : string or matplotlib cmap + Specify a colormap to color the parts-of-speech. + + frequency: bool {True, False}, default: False + If set to True, part-of-speech tags will be plotted according to frequency, + from most to least frequent. + + stack : bool {True, False}, default : False + Plot the PosTag frequency chart as a per-class stacked bar chart. + Note that fit() requires y for this visualization. + kwargs : dict - Pass any additional keyword arguments to the super class. - cmap : dict - ANSII colormap + Pass any additional keyword arguments to the PosTagVisualizer. 
- These parameters can be influenced later on in the visualization - process, but can and should be set as early as possible. + Attributes + ---------- + pos_tag_counts_: dict + Mapping of each label to a dict of part-of-speech tag counts. + + Examples + -------- + >>> viz = PosTagVisualizer() + >>> viz.fit(X) + >>> viz.poof() """ - def __init__(self, ax=None, **kwargs): + + def __init__( + self, + ax=None, + tagset="penn_treebank", + colormap=None, + colors=None, + frequency=False, + stack=False, + **kwargs + ): + super(PosTagVisualizer, self).__init__(ax=ax, **kwargs) + + self.tagset_names = TAGSET_NAMES + + if tagset not in self.tagset_names: + raise YellowbrickValueError( + ("'{}' is an invalid tagset. Please choose one of {}.").format( + tagset, ", ".join(self.tagset_names.keys()) + ) + ) + else: + self.tagset = tagset + + self.punct_tags = frozenset(PUNCT_TAGS) + self.frequency = frequency + self.colormap = colormap + self.colors = colors + self.stack = stack + + def fit(self, X, y=None, **kwargs): """ - Initializes the base frequency distributions with many - of the options required in order to make this - visualization work. + Fits the corpus to the appropriate tag map. + Text documents must be tokenized & tagged before passing to fit. + + Parameters + ---------- + X : list or generator + Should be provided as a list of documents or a generator + that yields a list of documents that contain a list of + sentences that contain (token, tag) tuples. + + y : ndarray or Series of length n + An optional array of per-document class labels; required when + stack=True and unused otherwise. + + kwargs : dict + Pass generic arguments to the drawing method + + Returns + ------- + self : instance + Returns the instance of the transformer/visualizer """ - super(PosTagVisualizer, self).__init__(ax=ax, **kwargs) + self.labels_ = ["documents"] + if self.stack: + if y is None: + raise YellowbrickValueError("Specify y for stack=True") + self.labels_ = np.unique(y) - # TODO: hard-coding in the ANSII colormap for now. 
- # Can we let the user reset the colors here? - self.COLORS = { - 'white' : "\033[0;37m{}\033[0m", - 'yellow' : "\033[0;33m{}\033[0m", - 'green' : "\033[0;32m{}\033[0m", - 'blue' : "\033[0;34m{}\033[0m", - 'cyan' : "\033[0;36m{}\033[0m", - 'red' : "\033[0;31m{}\033[0m", - 'magenta' : "\033[0;35m{}\033[0m", - 'black' : "\033[0;30m{}\033[0m", - 'darkwhite' : "\033[1;37m{}\033[0m", - 'darkyellow' : "\033[1;33m{}\033[0m", - 'darkgreen' : "\033[1;32m{}\033[0m", - 'darkblue' : "\033[1;34m{}\033[0m", - 'darkcyan' : "\033[1;36m{}\033[0m", - 'darkred' : "\033[1;31m{}\033[0m", - 'darkmagenta': "\033[1;35m{}\033[0m", - 'darkblack' : "\033[1;30m{}\033[0m", - None : "\033[0;0m{}\033[0m" - } + if self.tagset == "penn_treebank": + self.pos_tag_counts_ = self._penn_tag_map() + self._handle_treebank(X, y) - self.TAGS = { - 'NN' : 'green', - 'NNS' : 'green', - 'NNP' : 'green', - 'NNPS' : 'green', - 'VB' : 'blue', - 'VBD' : 'blue', - 'VBG' : 'blue', - 'VBN' : 'blue', - 'VBP' : 'blue', - 'VBZ' : 'blue', - 'JJ' : 'red', - 'JJR' : 'red', - 'JJS' : 'red', - 'RB' : 'cyan', - 'RBR' : 'cyan', - 'RBS' : 'cyan', - 'IN' : 'darkwhite', - 'POS' : 'darkyellow', - 'PRP$' : 'magenta', - 'DT' : 'black', - 'CC' : 'black', - 'CD' : 'black', - 'WDT' : 'black', - 'WP' : 'black', - 'WP$' : 'black', - 'WRB' : 'black', - 'EX' : 'yellow', - 'FW' : 'yellow', - 'LS' : 'yellow', - 'MD' : 'yellow', - 'PDT' : 'yellow', - 'RP' : 'yellow', - 'SYM' : 'yellow', - 'TO' : 'yellow', - 'None' : 'off' - } + elif self.tagset == "universal": + self.pos_tag_counts_ = self._uni_tag_map() + self._handle_universal(X, y) + + self.draw() + + return self + + def _penn_tag_map(self): + """ + Returns a Penn Treebank part-of-speech tag map. + """ + self._pos_tags = PENN_TAGS + return self._make_tag_map(PENN_TAGS) + + def _uni_tag_map(self): + """ + Returns a Universal Dependencies part-of-speech tag map. 
+ """ + self._pos_tags = UNIVERSAL_TAGS + return self._make_tag_map(UNIVERSAL_TAGS) + + def _make_tag_map(self, tagset): + """ + Returns a map of each label in self.labels_ to a map of the tagset to + zeroed counters; the only label is "documents" unless stack=True. + """ + # ensures the dict contains a zero counter per tag + zeros = [0] * len(tagset) + return {label: dict(zip(tagset, zeros)) for label in self.labels_} + return dict(zip(tagset, zeros)) - def colorize(self, token, color): + def _handle_universal(self, X, y=None): """ - Colorize text + Scan through the corpus to compute counts of each Universal + Dependencies part-of-speech. Parameters ---------- - token : str - A str representation of + X : list or generator + Should be provided as a list of documents or a generator + that yields a list of documents that contain a list of + sentences that contain (token, tag) tuples. + """ + jump = { + # combine proper and regular nouns + "NOUN": "noun", + "PROPN": "noun", + "ADJ": "adjective", + "VERB": "verb", + # include particles with adverbs + "ADV": "adverb", + "PART": "adverb", + "ADP": "adposition", + "PRON": "pronoun", + "CCONJ": "conjunction", + "PUNCT": "punctuation", + "DET": "determiner", + "NUM": "number", + "INTJ": "interjection", + "SYM": "symbol", + } + + for idx, tagged_doc in enumerate(X): + for tagged_sent in tagged_doc: + for _, tag in tagged_sent: + if tag == "SPACE": + continue + if self.stack: + counter = self.pos_tag_counts_[y[idx]] + else: + counter = self.pos_tag_counts_["documents"] + + counter[jump.get(tag, "other")] += 1 + + def _handle_treebank(self, X, y=None): + """ + Create a part-of-speech tag mapping using the Penn Treebank tags + Parameters + ---------- + X : list or generator + Should be provided as a list of documents or a generator + that yields a list of documents that contain a list of + sentences that contain (token, tag) tuples. 
""" - return self.COLORS[color].format(token) + for idx, tagged_doc in enumerate(X): + for tagged_sent in tagged_doc: + for _, tag in tagged_sent: + if self.stack: + counter = self.pos_tag_counts_[y[idx]] + else: + counter = self.pos_tag_counts_["documents"] - def transform(self, tagged_tuples): + if tag.startswith("N"): + counter["noun"] += 1 + elif tag.startswith("J"): + counter["adjective"] += 1 + elif tag.startswith("V"): + counter["verb"] += 1 + # include particles with adverbs + elif tag.startswith("RB") or tag == "RP": + counter["adverb"] += 1 + elif tag.startswith("PR"): + counter["pronoun"] += 1 + elif tag.startswith("W"): + counter["wh- word"] += 1 + elif tag == "CC": + counter["conjunction"] += 1 + elif tag == "CD": + counter["digit"] += 1 + # combine predeterminer and determiner + elif tag in ["DT" or "PDT"]: + counter["determiner"] += 1 + elif tag == "EX": + counter["existential"] += 1 + elif tag == "FW": + counter["non-English"] += 1 + elif tag == "IN": + counter["preposition"] += 1 + elif tag == "POS": + counter["possessive"] += 1 + elif tag == "LS": + counter["list"] += 1 + elif tag == "MD": + counter["modal"] += 1 + elif tag in self.punct_tags: + counter["punctuation"] += 1 + elif tag == "TO": + counter["infinitive"] += 1 + elif tag == "UH": + counter["interjection"] += 1 + elif tag == "SYM": + counter["symbol"] += 1 + else: + counter["other"] += 1 + + def draw(self, **kwargs): """ - The transform method transforms the raw text input for the - part-of-speech tagging visualization. It requires that - documents be in the form of (tag, token) tuples. + Called from the fit method, this method creates the canvas and + draws the part-of-speech tag mapping as a bar chart. Parameters ---------- - tagged_token_tuples : list of tuples - A list of (tag, token) tuples + kwargs: dict + generic keyword arguments. 
- Text documents must be tokenized and tagged before passing to fit() + Returns + ------- + ax : matplotlib axes + Axes on which the PosTagVisualizer was drawn. """ - self.tagged = [ - (self.TAGS.get(tag),tok) for tok, tag in tagged_tuples - ] - # - # print(' '.join((colorize(token, color) for color, token in self.tagged))) - # print('\n') + # Converts nested dict to nested list + pos_tag_counts = np.array( + [list(i.values()) for i in self.pos_tag_counts_.values()] + ) + # stores sum of nested list column wise + pos_tag_sum = np.sum(pos_tag_counts, axis=0) + + if self.frequency: + # sorts the count and tags by sum for frequency true + idx = (pos_tag_sum).argsort()[::-1] + self._pos_tags = np.array(self._pos_tags)[idx] + pos_tag_counts = pos_tag_counts[:, idx] + + if self.stack: + bar_stack( + pos_tag_counts, + ax=self.ax, + labels=list(self.labels_), + ticks=self._pos_tags, + colors=self.colors, + colormap=self.colormap, + ) + else: + xidx = np.arange(len(self._pos_tags)) + colors = resolve_colors( + n_colors=len(self._pos_tags), colormap=self.colormap, colors=self.colors + ) + self.ax.bar(xidx, pos_tag_counts[0], color=colors) + + return self.ax + + def finalize(self, **kwargs): + """ + Finalize the plot with ticks, labels, and title + + Parameters + ---------- + kwargs: dict + generic keyword arguments. 
+ """ + # NOTE: not deduping here, so this is total, not unique + self.ax.set_ylabel("Count") + + if self.frequency: + self.ax.set_xlabel( + "{} part-of-speech tags, sorted by frequency".format( + self.tagset_names[self.tagset] + ) + ) + else: + self.ax.set_xlabel( + "{} part-of-speech tags".format(self.tagset_names[self.tagset]) + ) + + # bar stack(helper) sets the ticks if stack is true + if not self.stack: + self.ax.set_xticks(range(len(self._pos_tags))) + self.ax.set_xticklabels(self._pos_tags, rotation=90) + + self.set_title( + "PosTag plot for {}-token corpus".format( + (sum([sum(i.values()) for i in self.pos_tag_counts_.values()])) + ) + ) + + def poof(self, outpath=None, **kwargs): + if outpath is not None: + kwargs["bbox_inches"] = kwargs.get("bbox_inches", "tight") + return super(PosTagVisualizer, self).poof(outpath, **kwargs) + + +########################################################################## +## Quick Method +########################################################################## + + +def postag( + X, + y=None, + ax=None, + tagset="penn_treebank", + colormap=None, + colors=None, + frequency=False, + stack=False, + **kwargs +): + + """ + Display a barchart with the counts of different parts of speech + in X, which consists of a part-of-speech-tagged corpus, which the + visualizer expects to be a list of lists of lists of (token, tag) + tuples. + + Parameters + ---------- + X : list or generator + Should be provided as a list of documents or a generator + that yields a list of documents that contain a list of + sentences that contain (token, tag) tuples. + + ax : matplotlib axes + The axes to plot the figure on. + + tagset: string + The tagset that was used to perform part-of-speech tagging. + Either "penn_treebank" or "universal", defaults to "penn_treebank". + Use "universal" if corpus has been tagged using SpaCy. + + colors : list or tuple of colors + Specify the colors for each individual part-of-speech. 
+ + colormap : string or matplotlib cmap + Specify a colormap to color the parts-of-speech. + + frequency: bool {True, False}, default: False + If set to True, part-of-speech tags will be plotted according to frequency, + from most to least frequent. + + kwargs : dict + Pass any additional keyword arguments to the PosTagVisualizer. + + Returns + ------- + visualizer: PosTagVisualizer + Returns the fitted, finalized visualizer + """ + # Instantiate the visualizer + visualizer = PosTagVisualizer( + ax=ax, + tagset=tagset, + colors=colors, + colormap=colormap, + frequency=frequency, + stack=stack, + **kwargs + ) + + # Fit and transform the visualizer (calls draw) + visualizer.fit(X, y=y, **kwargs) + visualizer.finalize() + + # Return the visualizer object + return visualizer diff --git a/yellowbrick/text/tsne.py b/yellowbrick/text/tsne.py index d60f7a24b..50ccaf435 100644 --- a/yellowbrick/text/tsne.py +++ b/yellowbrick/text/tsne.py @@ -1,11 +1,11 @@ # yellowbrick.text.tsne # Implements TSNE visualizations of documents in 2D space. 
# -# Author: Benjamin Bengfort -# Author: Rebecca Bilbro +# Author: Benjamin Bengfort +# Author: Rebecca Bilbro # Created: Mon Feb 20 06:33:29 2017 -0500 # -# Copyright (C) 2016 Bengfort.com +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: tsne.py [6aa9198] benjamin@bengfort.com $ @@ -35,8 +35,19 @@ ## Quick Methods ########################################################################## -def tsne(X, y=None, ax=None, decompose='svd', decompose_by=50, classes=None, - colors=None, colormap=None, alpha=0.7, **kwargs): + +def tsne( + X, + y=None, + ax=None, + decompose="svd", + decompose_by=50, + classes=None, + colors=None, + colormap=None, + alpha=0.7, + **kwargs +): """ Display a projection of a vectorized corpus in two dimensions using TSNE, a nonlinear dimensionality reduction method that is particularly well @@ -88,26 +99,35 @@ def tsne(X, y=None, ax=None, decompose='svd', decompose_by=50, classes=None, Returns ------- - ax : matplotlib axes - Returns the axes that the parallel coordinates were drawn on. 
+ visualizer: TSNEVisualizer + Returns the fitted, finalized visualizer """ # Instantiate the visualizer visualizer = TSNEVisualizer( - ax, decompose, decompose_by, classes, colors, colormap, alpha, **kwargs + ax=ax, + decompose=decompose, + decompose_by=decompose_by, + classes=classes, + colors=colors, + colormap=colormap, + alpha=alpha, + **kwargs ) # Fit and transform the visualizer (calls draw) visualizer.fit(X, y, **kwargs) visualizer.transform(X) + visualizer.finalize() - # Return the axes object on the visualizer - return visualizer.ax + # Return the visualizer object + return visualizer ########################################################################## ## TSNEVisualizer ########################################################################## + class TSNEVisualizer(TextVisualizer): """ Display a projection of a vectorized corpus in two dimensions using TSNE, @@ -175,9 +195,19 @@ class TSNEVisualizer(TextVisualizer): # NOTE: cannot be np.nan NULL_CLASS = None - def __init__(self, ax=None, decompose='svd', decompose_by=50, - labels=None, classes=None, colors=None, colormap=None, - random_state=None, alpha=0.7, **kwargs): + def __init__( + self, + ax=None, + decompose="svd", + decompose_by=50, + labels=None, + classes=None, + colors=None, + colormap=None, + random_state=None, + alpha=0.7, + **kwargs + ): # Visual Parameters self.alpha = alpha @@ -188,16 +218,14 @@ def __init__(self, ax=None, decompose='svd', decompose_by=50, # Fetch TSNE kwargs from kwargs by popping only keys belonging to TSNE params tsne_kwargs = { - key: kwargs.pop(key) - for key in TSNE().get_params() - if key in kwargs + key: kwargs.pop(key) for key in TSNE().get_params() if key in kwargs } self.transformer_ = self.make_transformer(decompose, decompose_by, tsne_kwargs) # Call super at the end so that size and title are set correctly super(TSNEVisualizer, self).__init__(ax=ax, **kwargs) - def make_transformer(self, decompose='svd', decompose_by=50, tsne_kwargs={}): + def 
make_transformer(self, decompose="svd", decompose_by=50, tsne_kwargs={}): """ Creates an internal transformer pipeline to project the data set into 2D space using TSNE, applying an pre-decomposition technique ahead of @@ -226,10 +254,7 @@ def make_transformer(self, decompose='svd', decompose_by=50, tsne_kwargs={}): # TODO: detect decompose by inferring from sparse matrix or dense or # If number of features > 50 etc. - decompositions = { - 'svd': TruncatedSVD, - 'pca': PCA, - } + decompositions = {"svd": TruncatedSVD, "pca": PCA} if decompose and decompose.lower() not in decompositions: raise YellowbrickValueError( @@ -244,12 +269,20 @@ def make_transformer(self, decompose='svd', decompose_by=50, tsne_kwargs={}): # Add the pre-decomposition if decompose: klass = decompositions[decompose] - steps.append((decompose, klass( - n_components=decompose_by, random_state=self.random_state))) + steps.append( + ( + decompose, + klass(n_components=decompose_by, random_state=self.random_state), + ) + ) # Add the TSNE manifold - steps.append(('tsne', TSNE( - n_components=2, random_state=self.random_state, **tsne_kwargs))) + steps.append( + ( + "tsne", + TSNE(n_components=2, random_state=self.random_state, **tsne_kwargs), + ) + ) # return the pipeline return Pipeline(steps) @@ -312,15 +345,17 @@ def draw(self, points, target=None, **kwargs): # Resolve the labels with the classes labels = self.labels if self.labels is not None else self.classes_ if len(labels) != len(self.classes_): - raise YellowbrickValueError(( - "number of supplied labels ({}) does not " - "match the number of classes ({})" - ).format(len(labels), len(self.classes_))) - + raise YellowbrickValueError( + ( + "number of supplied labels ({}) does not " + "match the number of classes ({})" + ).format(len(labels), len(self.classes_)) + ) # Create the color mapping for the labels. 
self.color_values_ = resolve_colors( - n_colors=len(labels), colormap=self.colormap, colors=self.color) + n_colors=len(labels), colormap=self.colormap, colors=self.colors + ) colors = dict(zip(labels, self.color_values_)) # Transform labels into a map of class to label @@ -329,34 +364,33 @@ def draw(self, points, target=None, **kwargs): # Expand the points into vectors of x and y for scatter plotting, # assigning them to their label if the label has been passed in. # Additionally, filter classes not specified directly by the user. - series = defaultdict(lambda: {'x':[], 'y':[]}) + series = defaultdict(lambda: {"x": [], "y": []}) if target is not None: for t, point in zip(target, points): label = labels[t] - series[label]['x'].append(point[0]) - series[label]['y'].append(point[1]) + series[label]["x"].append(point[0]) + series[label]["y"].append(point[1]) else: label = self.classes_[0] - for x,y in points: - series[label]['x'].append(x) - series[label]['y'].append(y) + for x, y in points: + series[label]["x"].append(x) + series[label]["y"].append(y) # Plot the points for label, points in series.items(): self.ax.scatter( - points['x'], points['y'], c=colors[label], - alpha=self.alpha, label=label + points["x"], points["y"], c=colors[label], alpha=self.alpha, label=label ) + return self.ax + def finalize(self, **kwargs): """ Finalize the drawing by adding a title and legend, and removing the axes objects that do not convey information about TSNE.
""" - self.set_title( - "TSNE Projection of {} Documents".format(self.n_instances_) - ) + self.set_title("TSNE Projection of {} Documents".format(self.n_instances_)) # Remove the ticks self.ax.set_yticks([]) @@ -367,6 +401,9 @@ def finalize(self, **kwargs): box = self.ax.get_position() self.ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) manual_legend( - self, self.classes_, self.color_values_, - loc='center left', bbox_to_anchor=(1, 0.5) + self, + self.classes_, + self.color_values_, + loc="center left", + bbox_to_anchor=(1, 0.5), ) diff --git a/yellowbrick/text/umap_vis.py b/yellowbrick/text/umap_vis.py new file mode 100644 index 000000000..e3545e3ac --- /dev/null +++ b/yellowbrick/text/umap_vis.py @@ -0,0 +1,367 @@ +# yellowbrick.text.umap_vis +# Implements UMAP visualizations of documents in 2D space. +# +# Author: John Healy +# Created: Mon Dec 03 14:00:00 2018 -0500 +# +# Copyright (C) 2019 The sckit-yb developers +# For license information, see LICENSE.txt +# +# ID: umap_vis.py [73a44e5] jchealy@gmail.com $ + +""" +Implements UMAP visualizations of documents in 2D space. +""" + +########################################################################## +## Imports +########################################################################## + +import warnings +import numpy as np + +from collections import defaultdict + +from yellowbrick.draw import manual_legend +from yellowbrick.text.base import TextVisualizer +from yellowbrick.style.colors import resolve_colors +from yellowbrick.exceptions import YellowbrickValueError + +from sklearn.pipeline import Pipeline + +try: + from umap import UMAP +except ImportError: + UMAP = None +except (RuntimeError, AttributeError): + UMAP = None + warnings.warn( + "Error Importing UMAP. UMAP does not support python 2.7 on Windows 32 bit." 
+ ) + +########################################################################## +## Quick Methods +########################################################################## + + +def umap( + X, y=None, ax=None, classes=None, colors=None, colormap=None, alpha=0.7, **kwargs +): + """ + Display a projection of a vectorized corpus in two dimensions using UMAP (Uniform + Manifold Approximation and Projection), a nonlinear dimensionality reduction method + that is particularly well suited to embedding in two or three dimensions for + visualization as a scatter plot. UMAP is a relatively new technique but is often + used to visualize clusters or groups of data points and their relative proximities. + It typically is fast, scalable, and can be applied directly to sparse matrices + eliminating the need to run a ``TruncatedSVD`` as a pre-processing step. + + The current default for UMAP is Euclidean distance. Hellinger distance would be a + more appropriate distance function to use with CountVectorize data. That will be + released in a forthcoming version of UMAP. In the meantime cosine distance is likely + a better text default than Euclidean and can be set using the keyword argument + ``metric='cosine'``. + + Parameters + ---------- + + X : ndarray or DataFrame of shape n x m + A matrix of n instances with m features representing the corpus of + vectorized documents to visualize with umap. + + y : ndarray or Series of length n + An optional array or series of target or class values for instances. + If this is specified, then the points will be colored according to + their class. Often cluster labels are passed in to color the documents + in cluster space, so this method is used both for classification and + clustering methods. + + ax : matplotlib axes + The axes to plot the figure on. + + classes : list of strings + The names of the classes in the target, used to create a legend.
+ + colors : list or tuple of colors + Specify the colors for each individual class + + colormap : string or matplotlib cmap + Sequential colormap for continuous target + + alpha : float, default: 0.7 + Specify a transparency where 1 is completely opaque and 0 is completely + transparent. This property makes densely clustered points more visible. + + kwargs : dict + Pass any additional keyword arguments to the UMAP transformer. + + ------- + visualizer: UMAPVisualizer + Returns the fitted, finalized visualizer + """ + # Instantiate the visualizer + visualizer = UMAPVisualizer(ax, classes, colors, colormap, alpha, **kwargs) + + # Fit and transform the visualizer (calls draw) + visualizer.fit_transform(X, y, **kwargs) + visualizer.finalize() + + # Return the visualizer object + return visualizer + + +########################################################################## +## UMAPVisualizer +########################################################################## + + +class UMAPVisualizer(TextVisualizer): + """ + Display a projection of a vectorized corpus in two dimensions using UMAP (Uniform + Manifold Approximation and Projection), a nonlinear dimensionality reduction method + that is particularly well suited to embedding in two or three dimensions for + visualization as a scatter plot. UMAP is a relatively new technique but is often + used to visualize clusters or groups of data points and their relative proximities. + It typically is fast, scalable, and can be applied directly to sparse matrices + eliminating the need to run a ``TruncatedSVD`` as a pre-processing step. + + The current default for UMAP is Euclidean distance. Hellinger distance would be a + more appropriate distance function to use with CountVectorize data. That will be + released in a forthcoming version of UMAP. In the meantime cosine distance is likely + a better text default than Euclidean and can be set using the keyword argument + ``metric='cosine'``.
+ + For more, see https://github.com/lmcinnes/umap + + Parameters + ---------- + + ax : matplotlib axes + The axes to plot the figure on. + + labels : list of strings + The names of the classes in the target, used to create a legend. + Labels must match names of classes in sorted order. + + colors : list or tuple of colors + Specify the colors for each individual class + + colormap : string or matplotlib cmap + Sequential colormap for continuous target + + random_state : int, RandomState instance or None, optional, default: None + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by np.random. The random state is applied to the preliminary + decomposition as well as UMAP. + + alpha : float, default: 0.7 + Specify a transparency where 1 is completely opaque and 0 is completely + transparent. This property makes densely clustered points more visible. + + kwargs : dict + Pass any additional keyword arguments to the UMAP transformer. + + Examples + -------- + + >>> model = MyVisualizer(metric='cosine') + >>> model.fit(X) + >>> model.poof() + + """ + + # NOTE: cannot be np.nan + NULL_CLASS = None + + def __init__( + self, + ax=None, + labels=None, + classes=None, + colors=None, + colormap=None, + random_state=None, + alpha=0.7, + **kwargs + ): + + if UMAP is None: + raise YellowbrickValueError( + ( + "umap package doesn't seem to be installed." 
+ "Please install UMAP via: pip install umap-learn" + ) + ) + + # Visual Parameters + self.alpha = alpha + self.labels = labels + self.colors = colors + self.colormap = colormap + self.random_state = random_state + + # Fetch UMAP kwargs from kwargs by popping only keys belonging to UMAP params + umap_kwargs = { + key: kwargs.pop(key) for key in UMAP().get_params() if key in kwargs + } + + # UMAP doesn't require any pre-processing before embedding and thus doesn't + # require a pipeline. + self.transformer_ = self.make_transformer(umap_kwargs) + + # Call super at the end so that size and title are set correctly + super(UMAPVisualizer, self).__init__(ax=ax, **kwargs) + + def make_transformer(self, umap_kwargs={}): + """ + Creates an internal transformer pipeline to project the data set into + 2D space using UMAP. This method will reset the transformer on the + class. + + Parameters + ---------- + umap_kwargs : dict + Keyword arguments for the internal UMAP transformer + + Returns + ------- + transformer : Pipeline + Pipelined transformer for UMAP projections + """ + + # Create the pipeline steps + steps = [] + + # Add the UMAP manifold + steps.append( + ( + "umap", + UMAP(n_components=2, random_state=self.random_state, **umap_kwargs), + ) + ) + + # return the pipeline + return Pipeline(steps) + + def fit(self, X, y=None, **kwargs): + """ + The fit method is the primary drawing input for the UMAP projection + since the visualization requires both X and an optional y value. The + fit method expects an array of numeric vectors, so text documents must + be vectorized before passing them to this method. + + Parameters + ---------- + X : ndarray or DataFrame of shape n x m + A matrix of n instances with m features representing the corpus of + vectorized documents to visualize with UMAP. + + y : ndarray or Series of length n + An optional array or series of target or class values for + instances. 
If this is specified, then the points will be colored + according to their class. Often cluster labels are passed in to + color the documents in cluster space, so this method is used both + for classification and clustering methods. + + kwargs : dict + Pass generic arguments to the drawing method + + Returns + ------- + self : instance + Returns the instance of the transformer/visualizer + """ + + # Store the classes we observed in y + if y is not None: + self.classes_ = np.unique(y) + elif y is None and self.labels is not None: + self.classes_ = np.array([self.labels[0]]) + else: + self.classes_ = np.array([self.NULL_CLASS]) + + # Fit our internal transformer and transform the data. + vecs = self.transformer_.fit_transform(X) + self.n_instances_ = vecs.shape[0] + + # Draw the vectors + self.draw(vecs, y, **kwargs) + + # Fit always returns self. + return self + + def draw(self, points, target=None, **kwargs): + """ + Called from the fit method, this method draws the UMAP scatter plot, + from a set of decomposed points in 2 dimensions. This method also + accepts a third dimension, target, which is used to specify the colors + of each of the points. If the target is not specified, then the points + are plotted as a single cloud to show similar documents. + """ + # Resolve the labels with the classes + labels = self.labels if self.labels is not None else self.classes_ + if len(labels) != len(self.classes_): + raise YellowbrickValueError( + ( + "number of supplied labels ({}) does not " + "match the number of classes ({})" + ).format(len(labels), len(self.classes_)) + ) + + # Create the color mapping for the labels. 
+ self.color_values_ = resolve_colors( + n_colors=len(labels), colormap=self.colormap, colors=self.colors + ) + colors = dict(zip(labels, self.color_values_)) + + # Transform labels into a map of class to label + labels = dict(zip(self.classes_, labels)) + + # Expand the points into vectors of x and y for scatter plotting, + # assigning them to their label if the label has been passed in. + # Additionally, filter classes not specified directly by the user. + series = defaultdict(lambda: {"x": [], "y": []}) + + if target is not None: + for t, point in zip(target, points): + label = labels[t] + series[label]["x"].append(point[0]) + series[label]["y"].append(point[1]) + else: + label = self.classes_[0] + for x, y in points: + series[label]["x"].append(x) + series[label]["y"].append(y) + + # Plot the points + for label, points in series.items(): + self.ax.scatter( + points["x"], points["y"], c=colors[label], alpha=self.alpha, label=label + ) + + return self.ax + + def finalize(self, **kwargs): + """ + Finalize the drawing by adding a title and legend, and removing the + axes objects that do not convey information about UMAP. + """ + self.set_title("UMAP Projection of {} Documents".format(self.n_instances_)) + + # Remove the ticks + self.ax.set_yticks([]) + self.ax.set_xticks([]) + + # Add the legend outside of the figure box. + if not all(self.classes_ == np.array([self.NULL_CLASS])): + box = self.ax.get_position() + self.ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) + manual_legend( + self, + self.classes_, + self.color_values_, + loc="center left", + bbox_to_anchor=(1, 0.5), + ) diff --git a/yellowbrick/utils/__init__.py b/yellowbrick/utils/__init__.py index 864802bf3..b3dc53916 100644 --- a/yellowbrick/utils/__init__.py +++ b/yellowbrick/utils/__init__.py @@ -1,13 +1,13 @@ # yellowbrick.utils # Utility functions and helpers for the Yellowbrick library. 
# -# Author: Jason Keung -# Author: Patrick O'Melveny -# Author: Benjamin Bengfort -# Author: Rebecca Bilbro +# Author: Jason Keung +# Author: Patrick O'Melveny +# Author: Benjamin Bengfort +# Author: Rebecca Bilbro # Created: Thu Jun 02 15:33:18 2016 -0500 # -# Copyright (C) 2016 District Data LAbs +# Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # # ID: __init__.py [79cd8cf] benjamin@bengfort.com $ @@ -22,3 +22,4 @@ from .helpers import * from .types import * +from .kneed import * diff --git a/yellowbrick/utils/decorators.py b/yellowbrick/utils/decorators.py index 81315fed9..73bb32c0d 100644 --- a/yellowbrick/utils/decorators.py +++ b/yellowbrick/utils/decorators.py @@ -1,10 +1,10 @@ # yellowbrick.utils.decorators # Decorators and descriptors for annotating yellowbrick library functions. # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Thu May 18 15:13:33 2017 -0400 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The sckit-yb developers # For license information, see LICENSE.txt # # ID: decorators.py [79cd8cf] benjamin@bengfort.com $ @@ -24,6 +24,7 @@ ## Decorators ########################################################################## + def memoized(fget): """ Return a property attribute for new-style classes that only calls its @@ -40,7 +41,7 @@ def memoized(fget): python-memoized-property `python-memoized-property `_ """ - attr_name = '_{0}'.format(fget.__name__) + attr_name = "_{0}".format(fget.__name__) @wraps(fget) def fget_memoized(self): diff --git a/yellowbrick/utils/helpers.py b/yellowbrick/utils/helpers.py index 8882cbc85..01294ab47 100644 --- a/yellowbrick/utils/helpers.py +++ b/yellowbrick/utils/helpers.py @@ -1,10 +1,11 @@ # yellowbrick.utils.helpers # Helper functions and generic utilities for use in Yellowbrick code. 
# -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort +# Author: Rebecca Bilbro # Created: Fri May 19 10:39:30 2017 -0700 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2019 The scikit-yb developers # For license information, see LICENSE.txt # # ID: helpers.py [79cd8cf] benjamin@bengfort.com $ @@ -17,21 +18,103 @@ ## Imports ########################################################################## -from __future__ import division - import re +import sklearn import numpy as np from sklearn.pipeline import Pipeline +from sklearn.utils.validation import check_is_fitted -from .types import is_estimator +from yellowbrick.utils.types import is_estimator from yellowbrick.exceptions import YellowbrickTypeError - ########################################################################## ## Model and Feature Information ########################################################################## + +def is_fitted(estimator): + """ + In order to ensure that we don't call ``fit`` on an already-fitted model, + this utility function calls ``predict`` on the estimator, returning ``False`` + if it raises a ``sklearn.exceptions.NotFittedError`` and ``True`` otherwise. + + NOTE: This is the solution proposed to scikit-yb: https://bit.ly/2LWQxZO (see + also: https://stackoverflow.com/a/39900933/6552250), though it remains unclear + how it will perform with sklearn-style Estimators and Transformers from other + 3rd party libraries like Keras, XGBoost, etc. 
+ """ + try: + estimator.predict(np.zeros((7, 3))) + except sklearn.exceptions.NotFittedError: + return False + except AttributeError: + # Some clustering models (LDA, PCA, Agglomerative) don't implement ``predict`` + try: + check_is_fitted( + estimator, + [ + "coef_", + "estimator_", + "labels_", + "n_clusters_", + "children_", + "components_", + "n_components_", + "n_iter_", + "n_batch_iter_", + "explained_variance_", + "singular_values_", + "mean_", + ], + all_or_any=any, + ) + return True + except sklearn.exceptions.NotFittedError: + return False + except Exception: + # Assume it's fitted, since ``NotFittedError`` wasn't raised + return True + + return True + + +def check_fitted(estimator, is_fitted_by="auto", **kwargs): + """ + Determines whether or not to check if the model has been fitted, and will return + ``True`` if so. The ``is_fitted_by`` argument is set to ``'auto'`` by default, + such that the check leaves it to the ``is_fitted`` helper method to determine if + a ``NotFitted`` error is raised. However, if the user prefers to override this + automatic functionality (e.g. if a 3rd party sklearn-like estimator has been used + that doesn't precisely implement the sklearn API), and ``is_fitted_by`` has been + set to either ``True`` or ``False``, we assume the user has supplied the necessary + information about whether or not the model is fit using the Visualizer's + ``is_fitted`` parameter. + + .. todo:: add other measures for checking if an estimator is fitted e.g. by coefs + + Parameters + ----------- + estimator : sklearn.Estimator + The model to check fittedness + + is_fitted_by : bool or str, default: 'auto' + If bool, that value is returned, otherwise ``is_fitted`` is used to check + for an exception + + kwargs : dict + Other optional parameters specific to the ``is_fitted_by`` mechanism. 
+ + Returns + -------- + is_fitted : bool + Whether or not the model is already fitted + """ + if isinstance(is_fitted_by, str) and is_fitted_by.lower() == "auto": + return is_fitted(estimator) + return bool(is_fitted_by) + + def get_model_name(model): """ Detects the model name for a Scikit-Learn model or pipeline. @@ -50,9 +133,7 @@ def get_model_name(model): """ if not is_estimator(model): raise YellowbrickTypeError( - "Cannot detect the model name for non estimator: '{}'".format( - type(model) - ) + "Cannot detect the model name for non estimator: '{}'".format(type(model)) ) else: @@ -65,12 +146,15 @@ def get_model_name(model): def has_ndarray_int_columns(features, X): """ Checks if numeric feature columns exist in ndarray """ _, ncols = X.shape - if not all(d.isdigit() for d in features if isinstance(d, str)) or not isinstance(X, np.ndarray): + if not all(d.isdigit() for d in features if isinstance(d, str)) or not isinstance( + X, np.ndarray + ): return False ndarray_columns = np.arange(0, ncols) feature_cols = np.unique([int(d) for d in features]) return all(np.in1d(feature_cols, ndarray_columns)) + # Alias for closer name to isinstance and issubclass hasndarrayintcolumns = has_ndarray_int_columns @@ -88,7 +172,7 @@ def is_monotonic(a, increasing=True): Test if the array is monotonically increasing, otherwise test if the array is monotonically decreasing.
""" - a = np.asarray(a) # ensure a is array-like + a = np.asarray(a) # ensure a is array-like if a.ndim > 1: raise ValueError("not supported for multi-dimensonal arrays") @@ -105,8 +189,8 @@ def is_monotonic(a, increasing=True): ## Numeric Computations ########################################################################## -#From here: http://stackoverflow.com/questions/26248654/numpy-return-0-with-divide-by-zero -def div_safe( numerator, denominator ): +# From here: https://bit.ly/2xR64lI +def div_safe(numerator, denominator): """ Ufunc-extension that returns 0 instead of nan when dividing numpy arrays @@ -120,15 +204,15 @@ def div_safe( numerator, denominator ): example: div_safe( [-1, 0, 1], 0 ) == [0, 0, 0] """ - #First handle scalars + # First handle scalars if np.isscalar(numerator): raise ValueError("div_safe should only be used with an array-like numerator") - #Then numpy arrays + # Then numpy arrays try: - with np.errstate(divide='ignore', invalid='ignore'): - result = np.true_divide( numerator, denominator ) - result[ ~ np.isfinite( result )] = 0 # -inf inf NaN + with np.errstate(divide="ignore", invalid="ignore"): + result = np.true_divide(numerator, denominator) + result[~np.isfinite(result)] = 0 # -inf inf NaN return result except ValueError as e: raise e @@ -178,14 +262,14 @@ def prop_to_size(vals, mi=0.0, ma=5.0, power=0.5, log=False): if delta == 0.0: delta = 1.0 - return mi + (ma-mi) * ((vals -vals.min()) / delta) ** power - + return mi + (ma - mi) * ((vals - vals.min()) / delta) ** power ########################################################################## -## String Computations +# String Computations ########################################################################## + def slugify(text): """ Returns a slug of given text, normalizing unicode data for file-safe @@ -201,8 +285,8 @@ def slugify(text): slug : string A normalized slug representation of the text - .. 
seealso:: http://yashchandra.com/2014/05/08/how-to-generate-clean-url-or-a-slug-in-python/ + .. seealso:: https://bit.ly/2NW7s1j """ - slug = re.sub(r'[^\w]+', ' ', text) + slug = re.sub(r"[^\w]+", " ", text) slug = "-".join(slug.lower().strip().split()) return slug diff --git a/yellowbrick/utils/kneed.py b/yellowbrick/utils/kneed.py new file mode 100644 index 000000000..f14192c3a --- /dev/null +++ b/yellowbrick/utils/kneed.py @@ -0,0 +1,279 @@ +# yellowbrick.utils.kneed +# A port of the knee-point detection package, kneed. +# +# Author: Kevin Arvai +# Author: Pradeep Singh +# Created: Mon Apr 15 09:43:18 2019 -0400 +# +# Copyright (C) 2017 Kevin Arvai +# All rights reserved. +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this list +# of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, this +# list of conditions and the following disclaimer in the documentation and/or other +# materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors may +# be used to endorse or promote products derived from this software without specific +# prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# ID: kneed.py [] pswaldia@no-reply.github.com $ + +""" +This package contains a port of the knee-point detection package, kneed, by +Kevin Arvai and hosted at https://github.com/arvkevi/kneed. This port is maintained +with permission by the Yellowbrick contributors. +""" +import numpy as np +from scipy import interpolate +from scipy.signal import argrelextrema +import warnings + +from yellowbrick.exceptions import YellowbrickWarning + + +class KneeLocator(object): + """ + Finds the "elbow" or "knee" which is a value corresponding to the point of maximum curvature + in an elbow curve, using knee point detection algorithm. This point is accessible via the + `knee` attribute. + + Parameters + ---------- + x : list + A list of k values representing the no. of clusters in KMeans Clustering algorithm. + + y : list + A list of silhouette score corresponding to each value of k. + + S : float, default: 1.0 + Sensitivity parameter that allows us to adjust how aggressive we want KneeLocator to + be when detecting "knees" or "elbows". + + curve_nature : string, default: 'concave' + A string that determines the nature of the elbow curve in which "knee" or "elbow" is + to be found. + + curve_direction : string, default: 'increasing' + A string that determines tha increasing or decreasing nature of the elbow curve in + which "knee" or "elbow" is to be found. 
+ + Notes + ----- + The KneeLocator is implemented using the "knee point detection algorithm" which can be read at + `` + """ + + def __init__( + self, x, y, S=1.0, curve_nature="concave", curve_direction="increasing" + ): + + # Raw Input + self.x = x + self.y = y + self.curve_nature = curve_nature + self.curve_direction = curve_direction + self.N = len(self.x) + self.S = S + self.all_knees = set() + self.all_norm_knees = set() + + # Step 1: fit a smooth line + uspline = interpolate.interp1d(self.x, self.y) + self.x = np.array(x) + self.Ds_y = uspline(self.x) + + # Step 2: normalize values + self.x_normalized = self.__normalize(self.x) + self.y_normalized = self.__normalize(self.Ds_y) + + # Step 3: Calculate the Difference curve + self.x_normalized, self.y_normalized = self.transform_xy( + self.x_normalized, + self.y_normalized, + self.curve_direction, + self.curve_nature, + ) + # normalized difference curve + self.y_distance = self.y_normalized - self.x_normalized + self.x_distance = self.x_normalized.copy() + + # Step 4: Identify local maxima/minima + # local maxima + self.maxima_inidices = argrelextrema(self.y_distance, np.greater)[0] + self.x_distance_maxima = self.x_distance[self.maxima_inidices] + self.y_distance_maxima = self.y_distance[self.maxima_inidices] + + # local minima + self.minima_indices = argrelextrema(self.y_distance, np.less)[0] + self.x_distance_minima = self.x_distance[self.minima_indices] + self.y_distance_minima = self.y_distance[self.minima_indices] + + # Step 5: Calculate thresholds + self.Tmx = self.y_distance_maxima - ( + self.S * np.abs(np.diff(self.x_normalized).mean()) + ) + + # Step 6: find knee + self.find_knee() + if (self.all_knees or self.all_norm_knees) == set(): + warning_message = ( + "No 'knee' or 'elbow point' detected " + "This could be due to bad clustering, no " + "actual clusters being formed etc." 
+ ) + warnings.warn(warning_message, YellowbrickWarning) + self.knee = None + self.norm_knee = None + else: + self.knee, self.norm_knee = min(self.all_knees), min(self.all_norm_knees) + + @staticmethod + def __normalize(a): + """ + Normalizes an array. + Parameters + ----------- + a : list + The array to normalize + """ + return (a - min(a)) / (max(a) - min(a)) + + @staticmethod + def transform_xy(x, y, direction, curve): + """transform x and y to concave, increasing based on curve_direction and curve_nature""" + # convert elbows to knees + if curve == "convex": + x = x.max() - x + y = y.max() - y + # flip decreasing functions to increasing + if direction == "decreasing": + y = np.flip(y) + + if curve == "convex": + x = np.flip(x) + y = np.flip(y) + + return x, y + + def find_knee(self,): + """This function finds and sets the knee value and the normalized knee value. """ + if not self.maxima_inidices.size: + warning_message = ( + 'No "knee" or "elbow point" detected ' + "This could be due to bad clustering, no " + "actual clusters being formed etc." + ) + warnings.warn(warning_message, YellowbrickWarning) + return None, None + + # artificially place a local max at the last item in the x_distance array + self.maxima_inidices = np.append(self.maxima_inidices, len(self.x_distance) - 1) + self.minima_indices = np.append(self.minima_indices, len(self.x_distance) - 1) + + # placeholder for which threshold region i is located in. 
+ maxima_threshold_index = 0 + minima_threshold_index = 0 + # traverse the distance curve + for idx, i in enumerate(self.x_distance): + # reached the end of the curve + if i == 1.0: + break + # values in distance curve are at or after a local maximum + if idx >= self.maxima_inidices[maxima_threshold_index]: + threshold = self.Tmx[maxima_threshold_index] + threshold_index = idx + maxima_threshold_index += 1 + # values in distance curve are at or after a local minimum + if idx >= self.minima_indices[minima_threshold_index]: + threshold = 0.0 + minima_threshold_index += 1 + # Do not evaluate values in the distance curve before the first local maximum. + if idx < self.maxima_inidices[0]: + continue + + # evaluate the threshold + if self.y_distance[idx] < threshold: + if self.curve_nature == "convex": + if self.curve_direction == "decreasing": + knee = self.x[threshold_index] + self.all_knees.add(knee) + norm_knee = self.x_normalized[threshold_index] + self.all_norm_knees.add(norm_knee) + else: + knee = self.x[-(threshold_index + 1)] + self.all_knees.add(knee) + norm_knee = self.x_normalized[-(threshold_index + 1)] + self.all_norm_knees.add(norm_knee) + + elif self.curve_nature == "concave": + if self.curve_direction == "decreasing": + knee = self.x[-(threshold_index + 1)] + self.all_knees.add(knee) + norm_knee = self.x_normalized[-(threshold_index + 1)] + self.all_norm_knees.add(norm_knee) + else: + knee = self.x[threshold_index] + self.all_knees.add(knee) + norm_knee = self.x_normalized[threshold_index] + self.all_norm_knees.add(norm_knee) + + def plot_knee_normalized(self,): + """ + Plots the normalized curve, the distance curve (x_distance, y_normalized) and the + knee, if it exists. 
+ """ + import matplotlib.pyplot as plt + + plt.figure(figsize=(8, 8)) + plt.plot(self.x_normalized, self.y_normalized) + plt.plot(self.x_distance, self.y_distance, "r") + plt.xticks( + np.arange(self.x_normalized.min(), self.x_normalized.max() + 0.1, 0.1) + ) + plt.yticks(np.arange(self.y_distance.min(), self.y_normalized.max() + 0.1, 0.1)) + + plt.vlines(self.norm_knee, plt.ylim()[0], plt.ylim()[1]) + + def plot_knee(self,): + """ + Plot the curve and the knee, if it exists + + """ + import matplotlib.pyplot as plt + + plt.figure(figsize=(8, 8)) + plt.plot(self.x, self.y) + plt.vlines(self.knee, plt.ylim()[0], plt.ylim()[1]) + + # Niceties for users working with elbows rather than knees + @property + def elbow(self): + return self.knee + + @property + def norm_elbow(self): + return self.norm_knee + + @property + def all_elbows(self): + return self.all_knees + + @property + def all_norm_elbows(self): + return self.all_norm_knees diff --git a/yellowbrick/utils/nan_warnings.py b/yellowbrick/utils/nan_warnings.py index 70a0cada1..eaa671028 100644 --- a/yellowbrick/utils/nan_warnings.py +++ b/yellowbrick/utils/nan_warnings.py @@ -1,3 +1,14 @@ +# yellowbrick.utils.nan_warnings +# Small helpers that help find and filter missing data. +# +# Author: Aylr +# Created: Thu Dec 28 11:37:42 2017 -0700 +# +# Copyright (C) 2018 The sckit-yb developers +# For license information, see LICENSE.txt +# +# ID: nan_warnings.py [d2276d6] Aylr@users.noreply.github.com $ +# """ Small helpers that help find and filter missing data. """ @@ -60,9 +71,10 @@ def warn_if_nans_exist(X): percent = 100 * null_count / total if null_count > 0: - warning_message = \ - 'Warning! Found {} rows of {} ({:0.2f}%) with nan values. Only ' \ - 'complete rows will be plotted.'.format(null_count, total, percent) + warning_message = ( + "Warning! Found {} rows of {} ({:0.2f}%) with nan values. 
Only " + "complete rows will be plotted.".format(null_count, total, percent) + ) warnings.warn(warning_message, DataWarning) diff --git a/yellowbrick/utils/target.py b/yellowbrick/utils/target.py new file mode 100644 index 000000000..2bd3a070d --- /dev/null +++ b/yellowbrick/utils/target.py @@ -0,0 +1,106 @@ +# yellowbrick.utils.target +# Helper functions related to the target variable. +# +# Author: Benjamin Bengfort +# Created: Thu Dec 27 20:16:18 2018 -0500 +# +# Copyright (C) 2018 The sckit-yb developers +# For license information, see LICENSE.txt +# +# ID: target.py [899c88a] benjamin@bengfort.com $ + +""" +Helper functions related to the target variable. +""" + +########################################################################## +## Imports and Module Variables +########################################################################## + +import numpy as np + +from enum import Enum +from sklearn.utils.multiclass import type_of_target +from yellowbrick.exceptions import YellowbrickValueError + + +__all__ = ["MAX_DISCRETE_CLASSES", "TargetType", "target_color_type"] + +MAX_DISCRETE_CLASSES = 12 + + +class TargetType(Enum): + """Constants for defining target colors by input type""" + + AUTO = "auto" + SINGLE = "single" + DISCRETE = "discrete" + CONTINUOUS = "continuous" + UNKNOWN = "unknown" + + @classmethod + def validate(klass, val): + if isinstance(val, klass): + return + + try: + klass(val) + except ValueError: + raise YellowbrickValueError("unknown target color type '{}'".format(val)) + + def __eq__(self, other): + if isinstance(other, str): + try: + return TargetType(other.lower()) == self + except ValueError: + return False + return super(TargetType, self).__eq__(other) + + +########################################################################## +## Helper Functions +########################################################################## + + +def target_color_type(y): + """ + Determines the type of color space that will best represent the 
target + variable y, e.g. either a discrete (categorical) color space or a + continuous color space that requires a colormap. This function can handle + both 1D or column vectors as well as multi-output targets. + + Parameters + ---------- + y : array-like + Must be a valid array-like data structure that can be passed to a + scikit-learn supervised estimator. + + Returns + ------- + color_type : string + One of: + + * 'discrete': `y` is either a binary target or a multiclass target + with <= 12 discrete classes. + * 'continuous': `y` is an array-like of floats that are not all + integers or a multiclass target with > 12 discrete classes. + * 'unknown': `y` is array-like but none of the above. For example + a multilabel-indicator or a 3D array. No exception is raised. + """ + if y is None or len(np.unique(y)) == 1: + return TargetType.SINGLE + + ttype = type_of_target(y) + + if ttype.startswith("continuous"): + return TargetType.CONTINUOUS + + if ttype.startswith("binary"): + return TargetType.DISCRETE + + if ttype.startswith("multiclass"): + if len(np.unique(y)) > MAX_DISCRETE_CLASSES: + return TargetType.CONTINUOUS + return TargetType.DISCRETE + + return TargetType.UNKNOWN diff --git a/yellowbrick/utils/timer.py b/yellowbrick/utils/timer.py index 6b63816d2..141911982 100644 --- a/yellowbrick/utils/timer.py +++ b/yellowbrick/utils/timer.py @@ -1,11 +1,13 @@ # yellowbrick.utils.timer # Timer utilities # -# Author: ZJ Poh +# Author: ZJ Poh # Created: Mon Jul 16 10:51:13 2017 -0700 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The sckit-yb developers # For license information, see LICENSE.txt +# +# ID: timer.py [75b0f6a] 8103276+zjpoh@users.noreply.github.com $ """ Timer utilities """ @@ -32,8 +34,9 @@ class Timer: A context object timer. Usage: >>> with Timer() as timer: ... 
do_something() - >>> print timer.interval + >>> print(timer.interval) """ + def __init__(self): self.time = time.time diff --git a/yellowbrick/utils/types.py b/yellowbrick/utils/types.py index ee4ff8e7b..04bb67cec 100644 --- a/yellowbrick/utils/types.py +++ b/yellowbrick/utils/types.py @@ -1,10 +1,10 @@ # yellowbrick.utils.types # Detection utilities for Scikit-Learn and Numpy types for flexibility # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Fri May 19 10:51:13 2017 -0700 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The sckit-yb developers # For license information, see LICENSE.txt # # ID: types.py [79cd8cf] benjamin@bengfort.com $ @@ -27,6 +27,7 @@ ## Model Type checking utilities ########################################################################## + def is_estimator(model): """ Determines if a model is an estimator using issubclass and isinstance. @@ -42,6 +43,7 @@ def is_estimator(model): return isinstance(model, BaseEstimator) + # Alias for closer name to isinstance and issubclass isestimator = is_estimator @@ -65,6 +67,7 @@ def is_classifier(estimator): # Test the _estimator_type property return getattr(estimator, "_estimator_type", None) == "classifier" + # Alias for closer name to isinstance and issubclass isclassifier = is_classifier @@ -88,6 +91,7 @@ def is_regressor(estimator): # Test the _estimator_type property return getattr(estimator, "_estimator_type", None) == "regressor" + # Alias for closer name to isinstance and issubclass isregressor = is_regressor @@ -106,6 +110,7 @@ def is_clusterer(estimator): # Test the _estimator_type property return getattr(estimator, "_estimator_type", None) == "clusterer" + # Alias for closer name to isinstance and issubclass isclusterer = is_clusterer @@ -144,10 +149,10 @@ def is_probabilistic(estimator): The object to test if is probabilistic, especially a Scikit-Learn estimator or Yellowbrick visualizer. 
""" - return any([ - hasattr(estimator, 'predict_proba'), - hasattr(estimator, 'decision_function'), - ]) + return any( + [hasattr(estimator, "predict_proba"), hasattr(estimator, "decision_function")] + ) + # Alias for closer name to isinstance and issubclass isprobabilistic = is_probabilistic @@ -157,6 +162,7 @@ def is_probabilistic(estimator): ## Data Type checking utilities ########################################################################## + def is_dataframe(obj): """ Returns True if the given object is a Pandas Data Frame. @@ -169,11 +175,13 @@ def is_dataframe(obj): try: # This is the best method of type checking from pandas import DataFrame + return isinstance(obj, DataFrame) except ImportError: # Pandas is not a dependency, so this is scary return obj.__class__.__name__ == "DataFrame" + # Alias for closer name to isinstance and issubclass isdataframe = is_dataframe @@ -190,11 +198,13 @@ def is_series(obj): try: # This is the best method of type checking from pandas import Series + return isinstance(obj, Series) except ImportError: # Pandas is not a dependency, so this is scary return obj.__class__.__name__ == "Series" + # Alias for closer name to isinstance and issubclass isseries = is_series @@ -208,7 +218,7 @@ def is_structured_array(obj): obj: instance The object to test whether or not is a Numpy Structured Array. """ - if isinstance(obj, np.ndarray) and hasattr(obj, 'dtype'): + if isinstance(obj, np.ndarray) and hasattr(obj, "dtype"): if obj.dtype.names is not None: return True return False diff --git a/yellowbrick/utils/wrapper.py b/yellowbrick/utils/wrapper.py index 1c8c77106..f5a586c0a 100644 --- a/yellowbrick/utils/wrapper.py +++ b/yellowbrick/utils/wrapper.py @@ -1,10 +1,10 @@ # yellowbrick.utils.wrapper # Utility package that provides a wrapper for new style classes. 
# -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Sun May 21 20:27:32 2017 -0700 # -# Copyright (C) 2017 District Data Labs +# Copyright (C) 2017 The sckit-yb developers # For license information, see LICENSE.txt # # ID: wrapper.py [b2ecd50] benjamin@bengfort.com $ diff --git a/yellowbrick/version.py b/yellowbrick/version.py index 5c2b96e3a..74c0811de 100644 --- a/yellowbrick/version.py +++ b/yellowbrick/version.py @@ -1,10 +1,10 @@ # yellowbrick.version # Maintains version and package information for deployment. # -# Author: Benjamin Bengfort +# Author: Benjamin Bengfort # Created: Mon Jan 25 14:22:52 2016 -0500 # -# Copyright (C) 2016 District Data Labs +# Copyright (C) 2016 The sckit-yb developers # For license information, see LICENSE.txt # # ID: version.py [0c5ba04] benjamin@bengfort.com $ @@ -18,26 +18,28 @@ ########################################################################## __version_info__ = { - 'major': 0, - 'minor': 9, - 'micro': 1, - 'releaselevel': 'final', - 'serial': 13, + "major": 1, + "minor": 0, + "micro": 0, + "releaselevel": "final", + "serial": 14, } ########################################################################## ## Helper Functions ########################################################################## + def get_version(short=False): """ Prints the version. 
""" - assert __version_info__['releaselevel'] in ('alpha', 'beta', 'final') - vers = ["%(major)i.%(minor)i" % __version_info__, ] - if __version_info__['micro']: + assert __version_info__["releaselevel"] in ("alpha", "beta", "final") + vers = ["%(major)i.%(minor)i" % __version_info__] + if __version_info__["micro"]: vers.append(".%(micro)i" % __version_info__) - if __version_info__['releaselevel'] != 'final' and not short: - vers.append('%s%i' % (__version_info__['releaselevel'][0], - __version_info__['serial'])) - return ''.join(vers) + if __version_info__["releaselevel"] != "final" and not short: + vers.append( + "%s%i" % (__version_info__["releaselevel"][0], __version_info__["serial"]) + ) + return "".join(vers)