Skip to content

Commit

Permalink
[ENH] diagnostic plots part 1 - cross-plots (#46)
Browse files Browse the repository at this point in the history
Moves the diagnostic plots to the new interface.

Part 1 - cross-plots.
  • Loading branch information
fkiraly authored Aug 27, 2023
1 parent 7220551 commit 7fa2aa6
Show file tree
Hide file tree
Showing 4 changed files with 325 additions and 51 deletions.
51 changes: 0 additions & 51 deletions examples/utils.py

This file was deleted.

236 changes: 236 additions & 0 deletions skpro/utils/plotting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
# -*- coding: utf-8 -*-
"""Utility functions for plotting."""
import numpy as np
import pandas as pd

from skpro.utils.validation._dependencies import _check_soft_dependencies

__authors__ = ["fkiraly", "frthjf"]


def plot_crossplot_interval(y_true, y_pred, coverage=None, ax=None):
    """Probabilistic cross-plot for regression, truth vs prediction interval.

    Plots:

    * x-axis: ground truth value
    * y-axis: median predictive value, with error bars being
      the prediction interval at symmetric coverage ``coverage``

    Parameters
    ----------
    y_true : array-like, [n_samples, n_targets]
        Ground truth values
    y_pred : skpro distribution, or predict_interval return, [n_samples, n_targets]
        Predicted values; if a distribution, symmetric prediction intervals
        are obtained from ``y_pred`` via the ``coverage`` parameter
    coverage : float, optional, default=0.9
        Coverage of the prediction interval.
        Used only if ``y_pred`` is a distribution
    ax : matplotlib axes, optional
        Axes to plot on, if None, a new figure is created and returned

    Returns
    -------
    ax : matplotlib axes
        Axes containing the plot.
        If ax was None, a new figure is created and returned.
        If ax was not None, the same ax is returned with plot added.

    Example
    -------
    >>> from skpro.utils.plotting import plot_crossplot_interval  # doctest: +SKIP
    >>> from skpro.regression.residual import ResidualDouble  # doctest: +SKIP
    >>> from sklearn.ensemble import RandomForestRegressor  # doctest: +SKIP
    >>> from sklearn.linear_model import LinearRegression  # doctest: +SKIP
    >>> from sklearn.datasets import load_diabetes  # doctest: +SKIP
    >>>
    >>> X, y = load_diabetes(return_X_y=True, as_frame=True)  # doctest: +SKIP
    >>> reg_mean = LinearRegression()  # doctest: +SKIP
    >>> reg_resid = RandomForestRegressor()  # doctest: +SKIP
    >>> reg_proba = ResidualDouble(reg_mean, reg_resid)  # doctest: +SKIP
    >>>
    >>> reg_proba.fit(X, y)  # doctest: +SKIP
    ResidualDouble(...)
    >>> y_pred = reg_proba.predict_proba(X)  # doctest: +SKIP
    >>> plot_crossplot_interval(y, y_pred)  # doctest: +SKIP
    """
    _check_soft_dependencies("matplotlib")

    from matplotlib import pyplot

    if hasattr(y_pred, "quantile"):
        # distribution input: median plus the two symmetric interval bounds
        if coverage is None:
            coverage = 0.9
        quantile_pts = [0.5 - coverage / 2, 0.5, 0.5 + coverage / 2]
        y_quantiles = y_pred.quantile(quantile_pts)
        y_mid = y_quantiles.iloc[:, 1]
        y_quantiles = y_quantiles.iloc[:, [0, 2]]
    else:
        # predict_interval return: columns are lower/upper bound,
        # midpoint is taken as their mean
        y_quantiles = y_pred
        y_mid = y_quantiles.mean(axis=1)

    # errorbar expects absolute offsets from the midpoint, shape (2, n_samples)
    y_mid_two = pd.DataFrame([y_mid, y_mid]).values
    y_quantiles_np = y_quantiles.values.T
    y_bars = np.abs(y_mid_two - y_quantiles_np)

    if ax is None:
        _, ax = pyplot.subplots()

    # diagonal reference: a perfect prediction lies on y = x
    ax.plot(y_true, y_true, "g.", label="Optimum")
    ax.errorbar(
        y_true.values,
        y_mid,
        yerr=y_bars,
        label="Predictions",
        fmt="b.",
        ecolor="r",
        linewidth=0.5,
    )
    ax.set_ylabel(r"Prediction interval $\widehat{y}_i$")
    ax.set_xlabel(r"Correct label $y_i$")
    ax.legend(loc="best")

    return ax


def plot_crossplot_std(y_true, y_pred, ax=None):
    r"""Probabilistic cross-plot for regression, error vs predictive standard deviation.

    Plots:

    * x-axis: absolute error samples $|y_i - \widehat{y}_i.\mu|$
    * y-axis: predictive standard deviation $\widehat{y}_i.\sigma$,
      of the prediction $\widehat{y}_i$ corresponding to $y_i$

    Parameters
    ----------
    y_true : array-like, [n_samples, n_targets]
        Ground truth values
    y_pred : skpro distribution, or predict_var return, [n_samples, n_targets]
        Predicted values
    ax : matplotlib axes, optional
        Axes to plot on, if None, a new figure is created and returned

    Returns
    -------
    ax : matplotlib axes
        Axes containing the plot.
        If ax was None, a new figure is created and returned.
        If ax was not None, the same ax is returned with plot added.

    Example
    -------
    >>> from skpro.utils.plotting import plot_crossplot_std  # doctest: +SKIP
    >>> from skpro.regression.residual import ResidualDouble  # doctest: +SKIP
    >>> from sklearn.ensemble import RandomForestRegressor  # doctest: +SKIP
    >>> from sklearn.linear_model import LinearRegression  # doctest: +SKIP
    >>> from sklearn.datasets import load_diabetes  # doctest: +SKIP
    >>>
    >>> X, y = load_diabetes(return_X_y=True, as_frame=True)  # doctest: +SKIP
    >>> reg_mean = LinearRegression()  # doctest: +SKIP
    >>> reg_resid = RandomForestRegressor()  # doctest: +SKIP
    >>> reg_proba = ResidualDouble(reg_mean, reg_resid)  # doctest: +SKIP
    >>>
    >>> reg_proba.fit(X, y)  # doctest: +SKIP
    ResidualDouble(...)
    >>> y_pred = reg_proba.predict_proba(X)  # doctest: +SKIP
    >>> plot_crossplot_std(y, y_pred)  # doctest: +SKIP
    """
    _check_soft_dependencies("matplotlib")

    from matplotlib import pyplot

    if hasattr(y_pred, "_tags"):
        # skpro distribution object: predictive variance via its var method
        y_var = y_pred.var()
    else:
        # assume a predict_var return, i.e., already a frame of variances;
        # previously this path left y_var unbound, raising NameError
        y_var = y_pred

    y_std = np.sqrt(y_var)

    if ax is None:
        _, ax = pyplot.subplots()

    # NOTE(review): for predict_var input, y_pred.mean() is the column mean
    # of the variances, not a point prediction — the x-axis is only
    # meaningful for distribution input. TODO: confirm intended handling.
    ax.plot(
        np.abs(y_pred.mean().values.flatten() - y_true.values.flatten()),
        y_std.values.flatten(),
        "b.",
    )
    # y_std = sqrt(var) is plotted, i.e., the standard deviation;
    # label previously said "variance", contradicting the docstring
    ax.set_ylabel(r"Predictive standard deviation of $\widehat{y}_i$")
    ax.set_xlabel(r"Absolute errors $|y_i - \widehat{y}_i|$")

    return ax


def plot_crossplot_loss(y_true, y_pred, metric, ax=None):
    r"""Cross-loss-plot for probabilistic regression.

    Plots:

    * x-axis: ground truth values $y_i$
    * y-axis: loss of the prediction $\widehat{y}_i$ corresponding to $y_i$,
      as calculated by ``metric.evaluate_by_index``

    Parameters
    ----------
    y_true : array-like, [n_samples, n_targets]
        Ground truth values
    y_pred : skpro distribution, or predict_var return, [n_samples, n_targets]
        Predicted values
    metric : skpro metric
        Metric to calculate the loss
    ax : matplotlib axes, optional
        Axes to plot on, if None, a new figure is created and returned

    Returns
    -------
    ax : matplotlib axes
        Axes containing the plot.
        If ax was None, a new figure is created and returned.
        If ax was not None, the same ax is returned with plot added.

    Example
    -------
    >>> from skpro.utils.plotting import plot_crossplot_loss  # doctest: +SKIP
    >>> from skpro.metrics import CRPS  # doctest: +SKIP
    >>> from skpro.regression.residual import ResidualDouble  # doctest: +SKIP
    >>> from sklearn.ensemble import RandomForestRegressor  # doctest: +SKIP
    >>> from sklearn.linear_model import LinearRegression  # doctest: +SKIP
    >>> from sklearn.datasets import load_diabetes  # doctest: +SKIP
    >>>
    >>> X, y = load_diabetes(return_X_y=True, as_frame=True)  # doctest: +SKIP
    >>> reg_mean = LinearRegression()  # doctest: +SKIP
    >>> reg_resid = RandomForestRegressor()  # doctest: +SKIP
    >>> reg_proba = ResidualDouble(reg_mean, reg_resid)  # doctest: +SKIP
    >>>
    >>> reg_proba.fit(X, y)  # doctest: +SKIP
    ResidualDouble(...)
    >>> y_pred = reg_proba.predict_proba(X)  # doctest: +SKIP
    >>> crps_metric = CRPS()  # doctest: +SKIP
    >>> plot_crossplot_loss(y, y_pred, crps_metric)  # doctest: +SKIP
    """
    _check_soft_dependencies("matplotlib")

    from matplotlib import pyplot

    # per-sample losses as computed by the metric
    losses = metric.evaluate_by_index(y_true, y_pred)
    flat_losses = losses.values.flatten()

    # summary statistics: mean loss, and standard error of the mean
    mean_loss = np.mean(flat_losses).round(2)
    sem_loss = (np.std(flat_losses) / np.sqrt(len(flat_losses))).round(2)
    overall = f"{mean_loss} +/- {sem_loss} sterr of mean"

    if ax is None:
        _, ax = pyplot.subplots()

    ax.plot(y_true, losses, "y_")

    ax.set_title(f"mean {metric.name}: {overall}")
    ax.set_xlabel(r"Correct label $y_i$")
    ax.set_ylabel(metric.name + r"($y_i$, $\widehat{y}_i$)")
    ax.tick_params(colors="y")

    return ax
2 changes: 2 additions & 0 deletions skpro/utils/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-
"""Tests for utilities."""
87 changes: 87 additions & 0 deletions skpro/utils/tests/test_plots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
"""Test functionality of time series plotting functions."""
# copyright: skpro developers, BSD-3-Clause License (see LICENSE file)

import pytest

from skpro.utils.validation._dependencies import _check_soft_dependencies


@pytest.mark.skipif(
    not _check_soft_dependencies("matplotlib", severity="none"),
    reason="skip test if required soft dependency for matplotlib not available",
)
def test_plot_crossplot_interval():
    """Test that plot_crossplot_interval runs without error."""
    _check_soft_dependencies("matplotlib")

    from sklearn.datasets import load_diabetes
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression

    from skpro.regression.residual import ResidualDouble
    from skpro.utils.plotting import plot_crossplot_interval

    # fit a probabilistic regressor on the diabetes data
    X, y = load_diabetes(return_X_y=True, as_frame=True)
    proba_reg = ResidualDouble(LinearRegression(), RandomForestRegressor())
    proba_reg.fit(X, y)

    # smoke test: the plot call should not raise
    proba_pred = proba_reg.predict_proba(X)
    plot_crossplot_interval(y, proba_pred)


@pytest.mark.skipif(
    not _check_soft_dependencies("matplotlib", severity="none"),
    reason="skip test if required soft dependency for matplotlib not available",
)
def test_plot_crossplot_std():
    """Test that plot_crossplot_std runs without error."""
    _check_soft_dependencies("matplotlib")

    from sklearn.datasets import load_diabetes
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression

    from skpro.regression.residual import ResidualDouble
    from skpro.utils.plotting import plot_crossplot_std

    # fit a probabilistic regressor on the diabetes data
    X, y = load_diabetes(return_X_y=True, as_frame=True)
    proba_reg = ResidualDouble(LinearRegression(), RandomForestRegressor())
    proba_reg.fit(X, y)

    # smoke test: the plot call should not raise
    proba_pred = proba_reg.predict_proba(X)
    plot_crossplot_std(y, proba_pred)


@pytest.mark.skipif(
    not _check_soft_dependencies("matplotlib", severity="none"),
    reason="skip test if required soft dependency for matplotlib not available",
)
def test_plot_crossplot_loss():
    """Test that plot_crossplot_loss runs without error."""
    _check_soft_dependencies("matplotlib")

    from sklearn.datasets import load_diabetes
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression

    from skpro.metrics import CRPS
    from skpro.regression.residual import ResidualDouble
    from skpro.utils.plotting import plot_crossplot_loss

    # fit a probabilistic regressor on the diabetes data
    X, y = load_diabetes(return_X_y=True, as_frame=True)
    proba_reg = ResidualDouble(LinearRegression(), RandomForestRegressor())
    proba_reg.fit(X, y)

    # smoke test: the plot call with a CRPS metric should not raise
    proba_pred = proba_reg.predict_proba(X)
    plot_crossplot_loss(y, proba_pred, CRPS())

0 comments on commit 7fa2aa6

Please sign in to comment.