[ENH] diagnostic plots part 1 - cross-plots #46

Merged
merged 6 commits on Aug 27, 2023
51 changes: 0 additions & 51 deletions examples/utils.py

This file was deleted.

236 changes: 236 additions & 0 deletions skpro/utils/plotting.py
@@ -0,0 +1,236 @@
# -*- coding: utf-8 -*-
"""Utility functions for plotting."""
import numpy as np
import pandas as pd

from skpro.utils.validation._dependencies import _check_soft_dependencies

__authors__ = ["fkiraly", "frthjf"]


def plot_crossplot_interval(y_true, y_pred, coverage=None, ax=None):
"""Probabilistic cross-plot for regression, truth vs prediction interval.

Plots:

* x-axis: ground truth value
* y-axis: median predictive value, with error bars being
the prediction interval at symmetric coverage ``coverage``

Parameters
----------
y_true : array-like, [n_samples, n_targets]
Ground truth values
    y_pred : skpro distribution, or predict_interval return, [n_samples, n_targets]
        Predicted values.
        If a distribution, symmetric prediction intervals at the given ``coverage``
        are obtained from ``y_pred``.
    coverage : float, optional, default=0.9
        Coverage of the prediction interval.
        Used only if ``y_pred`` is a distribution.
ax : matplotlib axes, optional
Axes to plot on, if None, a new figure is created and returned

Returns
-------
ax : matplotlib axes
Axes containing the plot
If ax was None, a new figure is created and returned
If ax was not None, the same ax is returned with plot added

    Examples
    --------
>>> from skpro.utils.plotting import plot_crossplot_interval # doctest: +SKIP
>>> from skpro.regression.residual import ResidualDouble # doctest: +SKIP
>>> from sklearn.ensemble import RandomForestRegressor # doctest: +SKIP
>>> from sklearn.linear_model import LinearRegression # doctest: +SKIP
>>> from sklearn.datasets import load_diabetes # doctest: +SKIP
>>>
>>> X, y = load_diabetes(return_X_y=True, as_frame=True) # doctest: +SKIP
>>> reg_mean = LinearRegression() # doctest: +SKIP
>>> reg_resid = RandomForestRegressor() # doctest: +SKIP
>>> reg_proba = ResidualDouble(reg_mean, reg_resid) # doctest: +SKIP
>>>
>>> reg_proba.fit(X, y) # doctest: +SKIP
ResidualDouble(...)
>>> y_pred = reg_proba.predict_proba(X) # doctest: +SKIP
>>> plot_crossplot_interval(y, y_pred) # doctest: +SKIP
"""
_check_soft_dependencies("matplotlib")

from matplotlib import pyplot

if hasattr(y_pred, "quantile"):
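        # distribution input: take median and symmetric quantiles at the given coverage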
if coverage is None:
coverage = 0.9
quantile_pts = [0.5 - coverage / 2, 0.5, 0.5 + coverage / 2]
y_quantiles = y_pred.quantile(quantile_pts)
y_mid = y_quantiles.iloc[:, 1]
y_quantiles = y_quantiles.iloc[:, [0, 2]]
else:
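        # predict_interval input: columns are the lower and upper interval bounds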
y_quantiles = y_pred
y_mid = y_quantiles.mean(axis=1)

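    # errorbar expects bar lengths as offsets from the midpoint, so take absolute distances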
y_mid_two = pd.DataFrame([y_mid, y_mid]).values
y_quantiles_np = y_quantiles.values.T
y_bars = np.abs(y_mid_two - y_quantiles_np)

if ax is None:
_, ax = pyplot.subplots()

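    # green reference dots: points where the predicted median would equal the ground truth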
ax.plot(y_true, y_true, "g.", label="Optimum")
ax.errorbar(
y_true.values,
y_mid,
yerr=y_bars,
label="Predictions",
fmt="b.",
ecolor="r",
linewidth=0.5,
)
ax.set_ylabel(r"Prediction interval $\widehat{y}_i$")
ax.set_xlabel(r"Correct label $y_i$")
ax.legend(loc="best")

return ax


def plot_crossplot_std(y_true, y_pred, ax=None):
r"""Probabilistic cross-plot for regression, error vs predictive standard deviation.

Plots:

* x-axis: absolute error samples $|y_i - \widehat{y}_i.\mu|$
* y-axis: predictive standard deviation $\widehat{y}_i.\sigma$,
of the prediction $\widehat{y}_i$ corresponding to $y_i$

Parameters
----------
y_true : array-like, [n_samples, n_targets]
Ground truth values
y_pred : skpro distribution, or predict_var return, [n_samples, n_targets]
Predicted values
ax : matplotlib axes, optional
Axes to plot on, if None, a new figure is created and returned

Returns
-------
ax : matplotlib axes
Axes containing the plot
If ax was None, a new figure is created and returned
If ax was not None, the same ax is returned with plot added

    Examples
    --------
>>> from skpro.utils.plotting import plot_crossplot_std # doctest: +SKIP
>>> from skpro.regression.residual import ResidualDouble # doctest: +SKIP
>>> from sklearn.ensemble import RandomForestRegressor # doctest: +SKIP
>>> from sklearn.linear_model import LinearRegression # doctest: +SKIP
>>> from sklearn.datasets import load_diabetes # doctest: +SKIP
>>>
>>> X, y = load_diabetes(return_X_y=True, as_frame=True) # doctest: +SKIP
>>> reg_mean = LinearRegression() # doctest: +SKIP
>>> reg_resid = RandomForestRegressor() # doctest: +SKIP
>>> reg_proba = ResidualDouble(reg_mean, reg_resid) # doctest: +SKIP
>>>
>>> reg_proba.fit(X, y) # doctest: +SKIP
ResidualDouble(...)
>>> y_pred = reg_proba.predict_proba(X) # doctest: +SKIP
>>> plot_crossplot_std(y, y_pred) # doctest: +SKIP
"""
_check_soft_dependencies("matplotlib")

from matplotlib import pyplot

if hasattr(y_pred, "_tags"):
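        # y_pred is a skpro distribution: take its predictive variance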
y_var = y_pred.var()

y_std = np.sqrt(y_var)

if ax is None:
_, ax = pyplot.subplots()

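    # x-axis: absolute error of the predictive mean; y-axis: predictive standard deviation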
ax.plot(
np.abs(y_pred.mean().values.flatten() - y_true.values.flatten()),
y_std.values.flatten(),
"b.",
)
ax.set_ylabel(r"Predictive variance of $\widehat{y}_i$")
ax.set_xlabel(r"Absolute errors $|y_i - \widehat{y}_i|$")
# ax.legend(loc="best")

return ax


def plot_crossplot_loss(y_true, y_pred, metric, ax=None):
r"""Cross-loss-plot for probabilistic regression.

Plots:

* x-axis: ground truth values $y_i$
* y-axis: loss of the prediction $\widehat{y}_i$ corresponding to $y_i$,
as calculated by ``metric.evaluate_by_index``

Parameters
----------
y_true : array-like, [n_samples, n_targets]
Ground truth values
    y_pred : skpro distribution, or other probabilistic prediction, [n_samples, n_targets]
        Predicted values, of a type accepted by ``metric.evaluate_by_index``
metric : skpro metric
Metric to calculate the loss
ax : matplotlib axes, optional
Axes to plot on, if None, a new figure is created and returned

Returns
-------
ax : matplotlib axes
Axes containing the plot
If ax was None, a new figure is created and returned
If ax was not None, the same ax is returned with plot added

    Examples
    --------
>>> from skpro.utils.plotting import plot_crossplot_loss # doctest: +SKIP
>>> from skpro.metrics import CRPS # doctest: +SKIP
>>> from skpro.regression.residual import ResidualDouble # doctest: +SKIP
>>> from sklearn.ensemble import RandomForestRegressor # doctest: +SKIP
>>> from sklearn.linear_model import LinearRegression # doctest: +SKIP
>>> from sklearn.datasets import load_diabetes # doctest: +SKIP
>>>
>>> X, y = load_diabetes(return_X_y=True, as_frame=True) # doctest: +SKIP
>>> reg_mean = LinearRegression() # doctest: +SKIP
>>> reg_resid = RandomForestRegressor() # doctest: +SKIP
>>> reg_proba = ResidualDouble(reg_mean, reg_resid) # doctest: +SKIP
>>>
>>> reg_proba.fit(X, y) # doctest: +SKIP
ResidualDouble(...)
>>> y_pred = reg_proba.predict_proba(X) # doctest: +SKIP
>>> crps_metric = CRPS() # doctest: +SKIP
>>> plot_crossplot_loss(y, y_pred, crps_metric) # doctest: +SKIP
"""
_check_soft_dependencies("matplotlib")

from matplotlib import pyplot

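    # per-sample losses; mean and standard error of the mean are shown in the title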
losses = metric.evaluate_by_index(y_true, y_pred)
loss_vals = losses.values.flatten()
total_loss = np.mean(loss_vals).round(2)
total_loss_std = np.std(loss_vals) / np.sqrt(len(loss_vals))
total_loss_std = total_loss_std.round(2)

overall = f"{total_loss} +/- {total_loss_std} sterr of mean"

if ax is None:
_, ax = pyplot.subplots()

ax.plot(y_true, losses, "y_")

ax.set_title(f"mean {metric.name}: {overall}")

ax.set_xlabel(r"Correct label $y_i$")
ax.set_ylabel(metric.name + r"($y_i$, $\widehat{y}_i$)")

ax.tick_params(colors="y")

return ax
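
A minimal usage sketch (not part of the diff), assuming the fitted ``reg_proba``, the data ``X``, ``y``, and ``CRPS`` from the docstring examples above: all three functions accept an ``ax`` argument, so they can share a single figure.

from matplotlib import pyplot as plt

from skpro.metrics import CRPS
from skpro.utils.plotting import (
    plot_crossplot_interval,
    plot_crossplot_loss,
    plot_crossplot_std,
)

# one row of three axes, one per diagnostic plot
fig, (ax_int, ax_std, ax_loss) = plt.subplots(1, 3, figsize=(15, 4))

y_pred = reg_proba.predict_proba(X)  # fitted ResidualDouble from the examples above
plot_crossplot_interval(y, y_pred, coverage=0.9, ax=ax_int)
plot_crossplot_std(y, y_pred, ax=ax_std)
plot_crossplot_loss(y, y_pred, CRPS(), ax=ax_loss)

fig.tight_layout()
plt.show()

Because each function returns its axes, the same pattern also allows adjusting titles or limits after plotting.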
2 changes: 2 additions & 0 deletions skpro/utils/tests/__init__.py
@@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-
"""Tests for utilities."""
87 changes: 87 additions & 0 deletions skpro/utils/tests/test_plots.py
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
"""Test functionality of time series plotting functions."""
# copyright: skpro developers, BSD-3-Clause License (see LICENSE file)

import pytest

from skpro.utils.validation._dependencies import _check_soft_dependencies


@pytest.mark.skipif(
not _check_soft_dependencies("matplotlib", severity="none"),
reason="skip test if required soft dependency for matplotlib not available",
)
def test_plot_crossplot_interval():
"""Test that plot_crossplot_interval runs without error."""
_check_soft_dependencies("matplotlib")

from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from skpro.regression.residual import ResidualDouble
from skpro.utils.plotting import plot_crossplot_interval

X, y = load_diabetes(return_X_y=True, as_frame=True)
reg_mean = LinearRegression()
reg_resid = RandomForestRegressor()
reg_proba = ResidualDouble(reg_mean, reg_resid)

reg_proba.fit(X, y)
y_pred = reg_proba.predict_proba(X)

plot_crossplot_interval(y, y_pred)


@pytest.mark.skipif(
not _check_soft_dependencies("matplotlib", severity="none"),
reason="skip test if required soft dependency for matplotlib not available",
)
def test_plot_crossplot_std():
"""Test that plot_crossplot_std runs without error."""
_check_soft_dependencies("matplotlib")

from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from skpro.regression.residual import ResidualDouble
from skpro.utils.plotting import plot_crossplot_std

X, y = load_diabetes(return_X_y=True, as_frame=True)
reg_mean = LinearRegression()
reg_resid = RandomForestRegressor()
reg_proba = ResidualDouble(reg_mean, reg_resid)

reg_proba.fit(X, y)
y_pred = reg_proba.predict_proba(X)

plot_crossplot_std(y, y_pred)


@pytest.mark.skipif(
not _check_soft_dependencies("matplotlib", severity="none"),
reason="skip test if required soft dependency for matplotlib not available",
)
def test_plot_crossplot_loss():
"""Test that plot_crossplot_loss runs without error."""
_check_soft_dependencies("matplotlib")

from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from skpro.metrics import CRPS
from skpro.regression.residual import ResidualDouble
from skpro.utils.plotting import plot_crossplot_loss

X, y = load_diabetes(return_X_y=True, as_frame=True)
reg_mean = LinearRegression()
reg_resid = RandomForestRegressor()
reg_proba = ResidualDouble(reg_mean, reg_resid)

reg_proba.fit(X, y)
y_pred = reg_proba.predict_proba(X)

crps_metric = CRPS()
plot_crossplot_loss(y, y_pred, crps_metric)