From 2b011bc4746422ce0e391d8892bb53d8d79a826c Mon Sep 17 00:00:00 2001 From: Sebastian Schmidl Date: Mon, 2 Sep 2024 11:24:13 +0200 Subject: [PATCH] [ENH] Add performance metrics for anomaly detection (from TimeEval) (#1938) * feat: add TSAD metrics and tests from TimeEval * fix: prts soft dependency specification * feat: update CODEOWNERS * feat: add to docs * fix: prts version specification until https://github.com/CompML/PRTS/pull/77 is merged and add skiptests conditions if not installed * feat: handle str-object-names in check_soft_dependencies correctly * fix: check_soft_dependencies test case for str-object reference * fix: address PR review change requests --- CODEOWNERS | 1 + .../anomaly_detection/__init__.py | 37 ++ .../anomaly_detection/_binary.py | 209 ++++++++++ .../anomaly_detection/_continuous.py | 296 ++++++++++++++ .../anomaly_detection/_util.py | 119 ++++++ .../anomaly_detection/_vus_metrics.py | 382 ++++++++++++++++++ .../anomaly_detection/tests/__init__.py | 1 + .../tests/test_ad_metrics.py | 304 ++++++++++++++ .../tests/test_thresholding.py | 37 ++ .../anomaly_detection/thresholding.py | 147 +++++++ aeon/utils/validation/_dependencies.py | 2 + .../validation/tests/test_dependencies.py | 2 +- docs/api_reference/performance_metrics.rst | 39 +- pyproject.toml | 1 + 14 files changed, 1575 insertions(+), 2 deletions(-) create mode 100644 aeon/performance_metrics/anomaly_detection/__init__.py create mode 100644 aeon/performance_metrics/anomaly_detection/_binary.py create mode 100644 aeon/performance_metrics/anomaly_detection/_continuous.py create mode 100644 aeon/performance_metrics/anomaly_detection/_util.py create mode 100644 aeon/performance_metrics/anomaly_detection/_vus_metrics.py create mode 100644 aeon/performance_metrics/anomaly_detection/tests/__init__.py create mode 100644 aeon/performance_metrics/anomaly_detection/tests/test_ad_metrics.py create mode 100644 aeon/performance_metrics/anomaly_detection/tests/test_thresholding.py create mode 100644 aeon/performance_metrics/anomaly_detection/thresholding.py diff --git a/CODEOWNERS b/CODEOWNERS index b003844696..7c51713485 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -20,6 +20,7 @@ aeon/forecasting/ @aiwalter @guzalbulatova @ltsaprounis aeon/networks/ @hadifawaz1999 aeon/performance_metrics/forecasting/ @aiwalter +aeon/performance_metrics/anomaly_detection/ @codelionx @MatthewMiddlehurst aeon/pipeline/ @aiwalter diff --git a/aeon/performance_metrics/anomaly_detection/__init__.py b/aeon/performance_metrics/anomaly_detection/__init__.py new file mode 100644 index 0000000000..20b52bd503 --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/__init__.py @@ -0,0 +1,37 @@ +"""Metrics for anomaly detection.""" + +from aeon.performance_metrics.anomaly_detection._binary import ( + range_f_score, + range_precision, + range_recall, +) +from aeon.performance_metrics.anomaly_detection._continuous import ( + f_score_at_k_points, + f_score_at_k_ranges, + pr_auc_score, + roc_auc_score, + rp_rr_auc_score, +) +from aeon.performance_metrics.anomaly_detection._vus_metrics import ( + range_pr_auc_score, + range_pr_roc_auc_support, + range_pr_vus_score, + range_roc_auc_score, + range_roc_vus_score, +) + +__all__ = [ + "range_precision", + "range_recall", + "range_f_score", + "roc_auc_score", + "pr_auc_score", + "rp_rr_auc_score", + "f_score_at_k_points", + "f_score_at_k_ranges", + "range_pr_roc_auc_support", + "range_roc_auc_score", + "range_pr_auc_score", + "range_pr_vus_score", + "range_roc_vus_score", +] diff --git 
a/aeon/performance_metrics/anomaly_detection/_binary.py b/aeon/performance_metrics/anomaly_detection/_binary.py new file mode 100644 index 0000000000..6c6c812ef9 --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/_binary.py @@ -0,0 +1,209 @@ +"""Metrics on binary predictions for anomaly detection.""" + +__maintainer__ = ["CodeLionX"] +__all__ = ["range_precision", "range_recall", "range_f_score"] + +import warnings + +import numpy as np + +from aeon.performance_metrics.anomaly_detection._util import check_y +from aeon.utils.validation._dependencies import _check_soft_dependencies + + +def range_precision( + y_true: np.ndarray, + y_pred: np.ndarray, + alpha: float = 0, + cardinality: str = "reciprocal", + bias: str = "flat", +) -> float: + """Compute the range-based precision metric. + + Range-based metrics were introduced by Tatbul et al. at NeurIPS 2018 [1]_. This + implementation uses the community package `prts <https://github.com/CompML/PRTS>`_ + as a soft-dependency. + + Range precision is the average precision of each predicted anomaly range. For each + predicted continuous anomaly range the overlap size, position, and cardinality are + considered. For more details, please refer to the paper [1]_. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_pred : np.ndarray + Binary anomaly predictions for each point of the time series of shape + (n_instances,). + alpha : float + Weight of the existence reward. Because precision by definition emphasizes + prediction quality, there is no need for an existence reward and this value + should always be set to 0. + cardinality : {'reciprocal', 'one', 'udf_gamma'} + Cardinality type. + bias : {'flat', 'front', 'middle', 'back'} + Positional bias type. + + Returns + ------- + float + Range-based precision + + References + ---------- + .. [1] Tatbul, Nesime, Tae Jun Lee, Stan Zdonik, Mejbah Alam, and Justin + Gottschlich. "Precision and Recall for Time Series." In Proceedings of the + International Conference on Neural Information Processing Systems (NeurIPS), + 1920–30. 2018. + http://papers.nips.cc/paper/7462-precision-and-recall-for-time-series.pdf. + """ + _check_soft_dependencies("prts", obj="range_precision", suppress_import_stdout=True) + + from prts import ts_precision + + y_true, y_pred = check_y(y_true, y_pred, force_y_pred_continuous=False) + if np.unique(y_pred).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_pred, returning 0.0!", + stacklevel=2, + ) + return 0.0 + return ts_precision(y_true, y_pred, alpha=alpha, cardinality=cardinality, bias=bias) + + +def range_recall( + y_true: np.ndarray, + y_pred: np.ndarray, + alpha: float = 0, + cardinality: str = "reciprocal", + bias: str = "flat", +) -> float: + """Compute the range-based recall metric. + + Range-based metrics were introduced by Tatbul et al. at NeurIPS 2018 [1]_. This + implementation uses the community package `prts <https://github.com/CompML/PRTS>`_ + as a soft-dependency. + + Range recall is the average recall of each real anomaly range. For each real + anomaly range the overlap size, position, and cardinality with predicted anomaly + ranges are considered. In addition, an existence reward can be given that boosts + the recall even if just a single point of the real anomaly is in the predicted + ranges. For more details, please refer to the paper [1]_. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_pred : np.ndarray + Binary anomaly predictions for each point of the time series of shape + (n_instances,).
+ alpha : float + Weight of the existence reward. If 0: no existence reward, if 1: only existence + reward. The existence reward is given if the real anomaly range has overlap + with even a single point of the predicted anomaly range. + cardinality : {'reciprocal', 'one', 'udf_gamma'} + Cardinality type. + bias : {'flat', 'front', 'middle', 'back'} + Positional bias type. + + Returns + ------- + float + Range-based recall + + References + ---------- + .. [1] Tatbul, Nesime, Tae Jun Lee, Stan Zdonik, Mejbah Alam, and Justin + Gottschlich. "Precision and Recall for Time Series." In Proceedings of the + International Conference on Neural Information Processing Systems (NeurIPS), + 1920–30. 2018. + http://papers.nips.cc/paper/7462-precision-and-recall-for-time-series.pdf. + """ + _check_soft_dependencies("prts", obj="range_recall", suppress_import_stdout=True) + + from prts import ts_recall + + y_true, y_pred = check_y(y_true, y_pred, force_y_pred_continuous=False) + if np.unique(y_pred).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_pred, returning 0.0!", + stacklevel=2, + ) + return 0.0 + return ts_recall(y_true, y_pred, alpha=alpha, cardinality=cardinality, bias=bias) + + +def range_f_score( + y_true: np.ndarray, + y_pred: np.ndarray, + beta: float = 1, + p_alpha: float = 0, + r_alpha: float = 0.5, + cardinality: str = "reciprocal", + p_bias: str = "flat", + r_bias: str = "flat", +) -> float: + """Compute the F-score using the range-based recall and precision metrics. + + Range-based metrics were introduced by Tatbul et al. at NeurIPS 2018 [1]_. This + implementation uses the community package `prts <https://github.com/CompML/PRTS>`_ + as a soft-dependency. + + The F-beta score is the weighted harmonic mean of precision and recall, reaching + its optimal value at 1 and its worst value at 0. This implementation uses the + range-based precision and range-based recall as its basis. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_pred : np.ndarray + Binary anomaly predictions for each point of the time series of shape + (n_instances,). + beta : float + F-score beta determines the weight of recall in the combined score. + beta < 1 lends more weight to precision, while beta > 1 favors recall. + p_alpha : float + Weight of the existence reward for the range-based precision. For most - if + not all - cases, `p_alpha` should be set to 0. + r_alpha : float + Weight of the existence reward. If 0: no existence reward, if 1: only + existence reward. + cardinality : {'reciprocal', 'one', 'udf_gamma'} + Cardinality type. + p_bias : {'flat', 'front', 'middle', 'back'} + Positional bias type for the range-based precision. + r_bias : {'flat', 'front', 'middle', 'back'} + Positional bias type for the range-based recall. + + Returns + ------- + float + Range-based F-score + + References + ---------- + .. [1] Tatbul, Nesime, Tae Jun Lee, Stan Zdonik, Mejbah Alam, and Justin + Gottschlich. "Precision and Recall for Time Series." In Proceedings of the + International Conference on Neural Information Processing Systems (NeurIPS), + 1920–30. 2018. + http://papers.nips.cc/paper/7462-precision-and-recall-for-time-series.pdf.
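+ + Examples + -------- + A sketch mirroring the values asserted in the unit tests; requires the soft + dependency ``prts``, hence the skip directive: + + >>> import numpy as np + >>> y_true = np.array([0, 1, 0, 0]) + >>> y_pred = np.array([0, 1, 1, 0]) + >>> range_f_score(y_true, y_pred)  # doctest: +SKIP + 0.66666...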
+ """ + _check_soft_dependencies("prts", obj="range_recall", suppress_import_stdout=True) + + from prts import ts_fscore + + y_true, y_pred = check_y(y_true, y_pred, force_y_pred_continuous=False) + if np.unique(y_pred).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + return ts_fscore( + y_true, + y_pred, + beta=beta, + p_alpha=p_alpha, + r_alpha=r_alpha, + cardinality=cardinality, + p_bias=p_bias, + r_bias=r_bias, + ) diff --git a/aeon/performance_metrics/anomaly_detection/_continuous.py b/aeon/performance_metrics/anomaly_detection/_continuous.py new file mode 100644 index 0000000000..1d5925fb25 --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/_continuous.py @@ -0,0 +1,296 @@ +"""Metrics on anomaly scores for anomaly detection.""" + +from __future__ import annotations + +__maintainer__ = ["CodeLionX"] +__all__ = [ + "roc_auc_score", + "pr_auc_score", + "f_score_at_k_points", + "f_score_at_k_ranges", + "rp_rr_auc_score", +] + +import warnings + +import numpy as np +from sklearn.metrics import auc, f1_score, precision_recall_curve +from sklearn.metrics import roc_auc_score as _roc_auc_score + +from aeon.performance_metrics.anomaly_detection._util import check_y +from aeon.performance_metrics.anomaly_detection.thresholding import ( + top_k_points_threshold, + top_k_ranges_threshold, +) +from aeon.utils.validation._dependencies import _check_soft_dependencies + + +def roc_auc_score(y_true: np.ndarray, y_score: np.ndarray) -> float: + """Compute the ROC AUC score. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + + Returns + ------- + float + ROC AUC score. + + See Also + -------- + sklearn.metrics.roc_auc_score + Is used internally. + """ + y_true, y_pred = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + return _roc_auc_score(y_true, y_score) + + +def pr_auc_score(y_true: np.ndarray, y_score: np.ndarray) -> float: + """Compute the precision-recall AUC score. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + + Returns + ------- + float + Precision-recall AUC score. + + See Also + -------- + sklearn.metrics.precision_recall_curve + Function used under the hood. + """ + y_true, y_pred = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + + x, y, _ = precision_recall_curve(y_true, np.array(y_score)) + area = auc(y, x) + return area + + +def f_score_at_k_points( + y_true: np.ndarray, y_score: np.ndarray, k: int | None = None +) -> float: + """Compute the F-score at k based on single points. + + This metric only considers the top-k predicted anomalous points within the scoring + by finding a threshold on the scoring that produces at least k anomalous points. If + `k` is not specified, the number of anomalies within the ground truth is used as + `k`. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). 
+ y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + k : int, optional + Number of top anomalies used to determine the threshold. If `k` is not + specified (`None`), the number of true anomalies (based on the ground truth + values) is used. + + Returns + ------- + float + F1 score at k. + + See Also + -------- + aeon.performance_metrics.anomaly_detection.thresholding.top_k_points_threshold + Function used to find the threshold. + """ + y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + + threshold = top_k_points_threshold(y_true, y_score, k) + y_pred = y_score >= threshold + return f1_score(y_true, y_pred) + + +def f_score_at_k_ranges( + y_true: np.ndarray, y_score: np.ndarray, k: int | None = None +) -> float: + """Compute the range-based F-score at k based on anomaly ranges. + + This metric only considers the top-k predicted anomaly ranges within the scoring by + finding a threshold on the scoring that produces at least k anomalous ranges. If `k` + is not specified, the number of anomalies within the ground truth is used as `k`. + + This implementation uses the community package + `prts <https://github.com/CompML/PRTS>`_ as a soft-dependency to compute the + range-based F-score. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + k : int, optional + Number of top anomalies used to determine the threshold. If `k` is not + specified (`None`), the number of true anomalies (based on the ground truth + values) is used. + + Returns + ------- + float + F1 score at k. + + See Also + -------- + aeon.performance_metrics.anomaly_detection.thresholding.top_k_ranges_threshold + Function used to find the threshold. + """ + _check_soft_dependencies( + "prts", obj="f_score_at_k_ranges", suppress_import_stdout=True + ) + + from prts import ts_fscore + + y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + + threshold = top_k_ranges_threshold(y_true, y_score, k) + y_pred = y_score >= threshold + return ts_fscore(y_true, y_pred, p_alpha=1, r_alpha=1, cardinality="reciprocal") + + +def rp_rr_auc_score( + y_true: np.ndarray, + y_score: np.ndarray, + max_samples: int = 50, + r_alpha: float = 0.5, + p_alpha: float = 0, + cardinality: str = "reciprocal", + bias: str = "flat", +) -> float: + """Compute the AUC-score of the range-based precision-recall curve. + + Computes the area under the precision-recall curve when using the range-based + precision and range-based recall metric introduced by Tatbul et al. at NeurIPS 2018 + [1]_. This implementation uses the community package + `prts <https://github.com/CompML/PRTS>`_ as a soft-dependency. + + Because the range-based metrics are expensive to compute, the precision-recall + curve is built from a sample of at most ``max_samples`` candidate thresholds drawn + from the anomaly scoring. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,).
+ max_samples : int + The implementation of the range-based precision and recall metrics is quite + slow because it relies on the non-optimized ``prts``-package. To prevent long + runtimes caused by scorings with high precision (many distinct thresholds), + only a limited number of candidate thresholds is sampled. This parameter + controls the maximum number of thresholds; however, too low numbers degrade + the metrics' quality. + r_alpha : float + Weight of the existence reward for the range-based recall. + p_alpha : float + Weight of the existence reward for the range-based precision. For most - if + not all - cases, `p_alpha` should be set to 0. + cardinality : {'reciprocal', 'one', 'udf_gamma'} + Cardinality type. + bias : {'flat', 'front', 'middle', 'back'} + Positional bias type. + + Returns + ------- + float + Area under the range-based precision-recall curve. + + References + ---------- + .. [1] Tatbul, Nesime, Tae Jun Lee, Stan Zdonik, Mejbah Alam, and Justin + Gottschlich. "Precision and Recall for Time Series." In Proceedings of the + International Conference on Neural Information Processing Systems (NeurIPS), + 1920–30. 2018. + http://papers.nips.cc/paper/7462-precision-and-recall-for-time-series.pdf. + """ + _check_soft_dependencies( + "prts", obj="rp_rr_auc_score", suppress_import_stdout=True + ) + + from prts import ts_precision, ts_recall + + y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + + thresholds = np.unique(y_score) + thresholds.sort() + # The first precision and recall values are precision=class balance and recall=1.0, + # which corresponds to a classifier that always predicts the positive class, + # independently of the threshold. This means that we can skip the first threshold!
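+ # (e.g. with 2 anomalous points out of 10, the all-positive classifier sits + # at recall=1.0 and precision=0.2, i.e. the positive class balance)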
+ p0 = y_true.sum() / len(y_true) + r0 = 1.0 + thresholds = thresholds[1:] + + # sample thresholds + n_thresholds = thresholds.shape[0] + if n_thresholds > max_samples: + every_nth = n_thresholds // (max_samples - 1) + sampled_thresholds = thresholds[::every_nth] + if thresholds[-1] == sampled_thresholds[-1]: + thresholds = sampled_thresholds + else: + thresholds = np.r_[sampled_thresholds, thresholds[-1]] + + recalls = np.zeros_like(thresholds) + precisions = np.zeros_like(thresholds) + for i, threshold in enumerate(thresholds): + y_pred = (y_score >= threshold).astype(np.int64) + recalls[i] = ts_recall( + y_true, y_pred, alpha=r_alpha, cardinality=cardinality, bias=bias + ) + precisions[i] = ts_precision( + y_true, y_pred, alpha=p_alpha, cardinality=cardinality, bias=bias + ) + # first sort by recall, then by precision to break ties + # (important for noisy scorings) + sorted_idx = np.lexsort((precisions * (-1), recalls))[::-1] + + # add first and last points to the curve + recalls = np.r_[r0, recalls[sorted_idx], 0] + precisions = np.r_[p0, precisions[sorted_idx], 1] + + # calculate area under the curve + return auc(recalls, precisions) diff --git a/aeon/performance_metrics/anomaly_detection/_util.py b/aeon/performance_metrics/anomaly_detection/_util.py new file mode 100644 index 0000000000..8e48e19a68 --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/_util.py @@ -0,0 +1,119 @@ +"""Utility functions for anomaly detection performance metrics.""" + +__maintainer__ = ["CodeLionX"] +__all__ = ["check_y"] + +import warnings +from typing import Tuple + +import numpy as np +from sklearn.utils import assert_all_finite, check_consistent_length, column_or_1d + + +def check_y( + y_true: np.ndarray, + y_pred: np.ndarray, + force_y_pred_continuous: bool = False, + inf_is_1: bool = True, + neginf_is_0: bool = True, + nan_is_0: bool = True, +) -> Tuple[np.ndarray, np.ndarray]: + """Check the input arrays for the performance metrics. + + This function checks the input arrays for the performance metrics. You can control + the interpretation of infinity, negative infinity, and NaN values in the anomaly + scoring with the parameters. If the respective parameter is set to ``False``, the + corresponding values are considered as wrong predictions (either as FP or FN + depending on the label). + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_pred : np.ndarray + Either the anomaly scores for each point of the time series or the binary + anomaly prediction of shape (n_instances,). If ``force_y_pred_continuous`` is + set, ``y_pred`` must be a float array, otherwise it must be an integer array. + force_y_pred_continuous : bool, default False + If ``True``, the function checks whether ``y_pred`` is a continuous float + array. Otherwise, it is assumed to contain binary predictions as an integer + array. If the user accidentally swapped the input parameters, this function + swaps them back. + inf_is_1 : bool, default True + Whether to treat infinite values in the anomaly scores as 1. + neginf_is_0 : bool, default True + Whether to treat negative infinite values in the anomaly scores as 0. + nan_is_0 : bool, default True + Whether to treat NaN values in the anomaly scores as 0. + + Returns + ------- + Tuple[np.ndarray, np.ndarray] + Tuple of the cleaned ``y_true`` and ``y_pred`` arrays.
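+ + Examples + -------- + A sketch of the default NaN/Inf substitution (array formatting abbreviated): + + >>> import numpy as np + >>> y_true = np.array([0, 1, 0]) + >>> y_score = np.array([np.nan, 0.8, np.inf]) + >>> check_y(y_true, y_score, force_y_pred_continuous=True)  # doctest: +SKIP + (array([0, 1, 0]), array([0. , 0.8, 1. ]))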
+ """ + y_true = np.array(y_true).copy() + y_pred = np.array(y_pred).copy() + + # check labels + if ( + force_y_pred_continuous + and y_true.dtype == np.float_ + and y_pred.dtype == np.int_ + ): + warnings.warn( + "Assuming that y_true and y_score where permuted, because their" + "dtypes indicate so. y_true should be an integer array and" + "y_score a float array!", + stacklevel=2, + ) + return check_y( + y_pred, + y_true, + force_y_pred_continuous=force_y_pred_continuous, + inf_is_1=inf_is_1, + neginf_is_0=neginf_is_0, + nan_is_0=nan_is_0, + ) + + y_true = column_or_1d(y_true) + assert_all_finite(y_true) + + # check scores + y_pred = column_or_1d(y_pred) + + check_consistent_length([y_true, y_pred]) + if not force_y_pred_continuous and y_pred.dtype not in [np.int_, np.bool_]: + raise ValueError( + "When using metrics other than AUC/VUS-metrics that need binary " + "(0 or 1) scores (like Precision, Recall or F1-Score), the scores must " + "be integers and should only contain the values {0, 1}. Please " + "consider applying a threshold to the scores!" + ) + elif force_y_pred_continuous and y_pred.dtype != np.float_: + raise ValueError( + "When using continuous scoring metrics, the scores must be floats!" + ) + + # substitute NaNs and Infs + nan_mask = np.isnan(y_pred) + inf_mask = np.isinf(y_pred) + neginf_mask = np.isneginf(y_pred) + penalize_mask = np.full_like(y_pred, dtype=bool, fill_value=False) + if inf_is_1: + y_pred[inf_mask] = 1 + else: + penalize_mask = penalize_mask | inf_mask + if neginf_is_0: + y_pred[neginf_mask] = 0 + else: + penalize_mask = penalize_mask | neginf_mask + if nan_is_0: + y_pred[nan_mask] = 0 + else: + penalize_mask = penalize_mask | nan_mask + y_pred[penalize_mask] = (~np.array(y_true[penalize_mask], dtype=bool)).astype( + np.int_ + ) + + assert_all_finite(y_pred) + return y_true, y_pred diff --git a/aeon/performance_metrics/anomaly_detection/_vus_metrics.py b/aeon/performance_metrics/anomaly_detection/_vus_metrics.py new file mode 100644 index 0000000000..e1e4b53ce4 --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/_vus_metrics.py @@ -0,0 +1,382 @@ +"""VUS and AUC metrics for anomaly detection on time series.""" + +from __future__ import annotations + +__maintainer__ = ["CodeLionX"] +__all__ = [ + "range_pr_auc_score", + "range_roc_auc_score", + "range_pr_vus_score", + "range_roc_vus_score", + "range_pr_roc_auc_support", +] + +import warnings + +import numpy as np + +from aeon.performance_metrics.anomaly_detection._util import check_y + + +def _anomaly_bounds(y_true: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + """Corresponds to range_convers_new.""" + # convert to boolean/binary + labels = y_true > 0 + # deal with start and end of time series + labels = np.diff(np.r_[0, labels, 0]) + # extract begin and end of anomalous regions + index = np.arange(0, labels.shape[0]) + starts = index[labels == 1] + ends = index[labels == -1] + return starts, ends + + +def _extend_anomaly_labels( + y_true: np.ndarray, buffer_size: int | None = None +) -> tuple[np.ndarray, np.ndarray]: + """Extend the anomaly labels with slopes on both ends. + + Also makes the labels continuous instead of binary. 
+ """ + starts, ends = _anomaly_bounds(y_true) + + if buffer_size is None: + # per default: set buffer size as median anomaly length: + buffer_size = int(np.median(ends - starts)) + + if buffer_size <= 1: + anomalies = np.array(list(zip(starts, ends - 1))) + return y_true.astype(np.float_), anomalies + + y_true_cont = y_true.astype(np.float_) + slope_length = buffer_size // 2 + length = y_true_cont.shape[0] + for s, e in zip(starts, ends): + e -= 1 + x1 = np.arange(e, min(e + slope_length, length)) + y_true_cont[x1] += np.sqrt(1 - (x1 - e) / buffer_size) + x2 = np.arange(max(s - slope_length, 0), s) + y_true_cont[x2] += np.sqrt(1 - (s - x2) / buffer_size) + y_true_cont = np.clip(y_true_cont, 0, 1) + starts, ends = _anomaly_bounds(y_true_cont) + anomalies = np.array(list(zip(starts, ends - 1))) + + return y_true_cont, anomalies + + +def _uniform_threshold_sampling(y_score: np.ndarray) -> np.ndarray: + """Create the threshold via uniform sampling.""" + # magic number from original implementation + n_samples = 250 + thresholds = np.sort(y_score)[::-1] + thresholds = thresholds[ + np.linspace(0, thresholds.shape[0] - 1, n_samples, dtype=np.int_) + ] + return thresholds + + +def range_pr_roc_auc_support( + y_true: np.ndarray, + y_score: np.ndarray, + buffer_size: int | None = None, + skip_check: bool = False, +) -> tuple[float, float]: + """Compute the range-based PR and ROC AUC. + + Computes the area under the precision-recall-curve and the area under the + receiver operating characteristic using the range-based precision and range-based + recall definition from Paparrizos et al. published at VLDB 2022 [1]_. + + We first extend the anomaly labels by two slopes of ``buffer_size//2`` length on + both sides of each anomaly, uniformly sample thresholds from the anomaly score, and + then compute the confusion matrix for all thresholds. Using the resulting precision + and recall values, we can plot a curve and compute its area. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + buffer_size : int, optional + Size of the buffer region around an anomaly. We add an increasing slope of size + ``buffer_size//2`` to the beginning of anomalies and a decreasing slope of size + ``buffer_size//2`` to the end of anomalies. Per default + (when ``buffer_size==None``), ``buffer_size`` is the median length of the + anomalies within the time series. However, you can also set it to the period + size of the dominant frequency or any other desired value. + skip_check : bool, default False + Whether to skip the input checks. + + Returns + ------- + Tuple[float, float] + Range-based PR AUC and range-based ROC AUC. + + References + ---------- + .. [1] John Paparrizos, Paul Boniol, Themis Palpanas, Ruey S. Tsay, + Aaron Elmore, and Michael J. Franklin. Volume Under the Surface: A New Accuracy + Evaluation Measure for Time-Series Anomaly Detection. PVLDB, 15(11): + 2774 - 2787, 2022. 
doi:`10.14778/3551793.3551830 <https://doi.org/10.14778/3551793.3551830>`_ + """ + if not skip_check: + y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True) + y_true_cont, anomalies = _extend_anomaly_labels(y_true, buffer_size) + thresholds = _uniform_threshold_sampling(y_score) + p = np.average([np.sum(y_true), np.sum(y_true_cont)]) + + recalls = np.zeros(thresholds.shape[0] + 2)  # tprs + fprs = np.zeros(thresholds.shape[0] + 2) + precisions = np.ones(thresholds.shape[0] + 1) + + for i, t in enumerate(thresholds): + y_pred = y_score >= t + product = y_true_cont * y_pred + tp = np.sum(product) + # fp = np.dot((np.ones_like(y_pred) - y_true_cont).T, y_pred) + fp = np.sum(y_pred) - tp + n = len(y_pred) - p + + existence_reward = [np.sum(product[s : e + 1]) > 0 for s, e in anomalies] + existence_reward = np.sum(existence_reward) / anomalies.shape[0] + + recall = min(tp / p, 1) * existence_reward  # = tpr + fpr = min(fp / n, 1) + precision = tp / np.sum(y_pred) + + recalls[i + 1] = recall + fprs[i + 1] = fpr + precisions[i + 1] = precision + + recalls[-1] = 1 + fprs[-1] = 1 + + range_pr_auc = np.sum( + (recalls[1:-1] - recalls[:-2]) * (precisions[1:] + precisions[:-1]) / 2 + ) + range_roc_auc = np.sum((fprs[1:] - fprs[:-1]) * (recalls[1:] + recalls[:-1]) / 2) + return range_pr_auc, range_roc_auc + + +def range_roc_auc_score( + y_true: np.ndarray, y_score: np.ndarray, buffer_size: int | None = None +) -> float: + """Compute the range-based area under the ROC curve. + + Computes the area under the receiver-operating-characteristic-curve using the + range-based TPR and range-based FPR definition from Paparrizos et al. + published at VLDB 2022 [1]_. + + We first extend the anomaly labels by two slopes of ``buffer_size//2`` length on + both sides of each anomaly, uniformly sample thresholds from the anomaly score, and + then compute the confusion matrix for all thresholds. Using the resulting TPR and + FPR values, we can plot a curve and compute its area. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + buffer_size : int, optional + Size of the buffer region around an anomaly. We add an increasing slope of size + ``buffer_size//2`` to the beginning of anomalies and a decreasing slope of size + ``buffer_size//2`` to the end of anomalies. Per default + (when ``buffer_size==None``), ``buffer_size`` is the median length of the + anomalies within the time series. However, you can also set it to the period + size of the dominant frequency or any other desired value. + + Returns + ------- + float + Range-based ROC AUC score. + + References + ---------- + .. [1] John Paparrizos, Paul Boniol, Themis Palpanas, Ruey S. Tsay, + Aaron Elmore, and Michael J. Franklin. Volume Under the Surface: A New Accuracy + Evaluation Measure for Time-Series Anomaly Detection. PVLDB, 15(11): + 2774 - 2787, 2022. + doi:`10.14778/3551793.3551830 <https://doi.org/10.14778/3551793.3551830>`_ + """ + y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + + _, range_auc_roc = range_pr_roc_auc_support( + y_true, y_score, buffer_size, skip_check=True + ) + return range_auc_roc + + +def range_pr_auc_score( + y_true: np.ndarray, y_score: np.ndarray, buffer_size: int | None = None +) -> float: + """Compute the area under the range-based PR curve.
+ + Computes the area under the precision-recall-curve using the range-based precision + and range-based recall definition from Paparrizos et al. published at VLDB 2022 + [1]_. + + We first extend the anomaly labels by two slopes of ``buffer_size//2`` length on + both sides of each anomaly, uniformly sample thresholds from the anomaly score, and + then compute the confusion matrix for all thresholds. Using the resulting precision + and recall values, we can plot a curve and compute its area. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + buffer_size : int, optional + Size of the buffer region around an anomaly. We add an increasing slope of size + ``buffer_size//2`` to the beginning of anomalies and a decreasing slope of size + ``buffer_size//2`` to the end of anomalies. Per default + (when ``buffer_size==None``), ``buffer_size`` is the median length of the + anomalies within the time series. However, you can also set it to the period + size of the dominant frequency or any other desired value. + + Returns + ------- + float + Range-based PR AUC score. + + References + ---------- + .. [1] John Paparrizos, Paul Boniol, Themis Palpanas, Ruey S. Tsay, + Aaron Elmore, and Michael J. Franklin. Volume Under the Surface: A New Accuracy + Evaluation Measure for Time-Series Anomaly Detection. PVLDB, 15(11): + 2774 - 2787, 2022. + doi:`10.14778/3551793.3551830 <https://doi.org/10.14778/3551793.3551830>`_ + """ + y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + + range_pr_auc, _ = range_pr_roc_auc_support( + y_true, y_score, buffer_size, skip_check=True + ) + return range_pr_auc + + +def range_pr_vus_score( + y_true: np.ndarray, y_score: np.ndarray, max_buffer_size: int = 500 +) -> float: + """Compute the range-based PR VUS score. + + Computes the volume under the precision-recall-buffer_size-surface using the + range-based precision and range-based recall definition from Paparrizos et al. + published at VLDB 2022 [1]_. + + For all buffer sizes from 0 to ``max_buffer_size``, we first extend the anomaly + labels by two slopes of ``buffer_size//2`` length on both sides of each anomaly, + uniformly sample thresholds from the anomaly score, and then compute the confusion + matrix for all thresholds. Using the resulting precision and recall values, we can + plot a curve and compute its area. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + max_buffer_size : int, default=500 + Maximum size of the buffer region around an anomaly. We iterate over all buffer + sizes from 0 to ``max_buffer_size`` to create the surface. + + Returns + ------- + float + Range-based PR VUS score. + + References + ---------- + .. [1] John Paparrizos, Paul Boniol, Themis Palpanas, Ruey S. Tsay, + Aaron Elmore, and Michael J. Franklin. Volume Under the Surface: A New Accuracy + Evaluation Measure for Time-Series Anomaly Detection. PVLDB, 15(11): + 2774 - 2787, 2022.
doi:`10.14778/3551793.3551830 <https://doi.org/10.14778/3551793.3551830>`_ + """ + y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + + prs = np.zeros(max_buffer_size + 1) + for buffer_size in np.arange(0, max_buffer_size + 1): + pr_auc, _ = range_pr_roc_auc_support( + y_true, y_score, buffer_size=buffer_size, skip_check=True + ) + prs[buffer_size] = pr_auc + range_pr_volume = np.sum(prs) / (max_buffer_size + 1) + return range_pr_volume + + +def range_roc_vus_score( + y_true: np.ndarray, y_score: np.ndarray, max_buffer_size: int = 500 +) -> float: + """Compute the range-based ROC VUS score. + + Computes the volume under the receiver-operating-characteristic-buffer_size-surface + using the range-based TPR and range-based FPR definition from Paparrizos et al. + published at VLDB 2022 [1]_. + + For all buffer sizes from 0 to ``max_buffer_size``, we first extend the anomaly + labels by two slopes of ``buffer_size//2`` length on both sides of each anomaly, + uniformly sample thresholds from the anomaly score, and then compute the confusion + matrix for all thresholds. Using the resulting true positive rates (TPR) and false + positive rates (FPR), we can plot a curve and compute its area. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + max_buffer_size : int, default=500 + Maximum size of the buffer region around an anomaly. We iterate over all buffer + sizes from 0 to ``max_buffer_size`` to create the surface. + + Returns + ------- + float + Range-based ROC VUS score. + + References + ---------- + .. [1] John Paparrizos, Paul Boniol, Themis Palpanas, Ruey S. Tsay, + Aaron Elmore, and Michael J. Franklin. Volume Under the Surface: A New Accuracy + Evaluation Measure for Time-Series Anomaly Detection. PVLDB, 15(11): + 2774 - 2787, 2022.
doi:`10.14778/3551793.3551830 <https://doi.org/10.14778/3551793.3551830>`_ + """ + y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + + rocs = np.zeros(max_buffer_size + 1) + for buffer_size in np.arange(0, max_buffer_size + 1): + _, roc_auc = range_pr_roc_auc_support( + y_true, y_score, buffer_size=buffer_size, skip_check=True + ) + rocs[buffer_size] = roc_auc + range_roc_volume = np.sum(rocs) / (max_buffer_size + 1) + return range_roc_volume diff --git a/aeon/performance_metrics/anomaly_detection/tests/__init__.py b/aeon/performance_metrics/anomaly_detection/tests/__init__.py new file mode 100644 index 0000000000..85491bb5d7 --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for metrics in anomaly_detection module.""" diff --git a/aeon/performance_metrics/anomaly_detection/tests/test_ad_metrics.py b/aeon/performance_metrics/anomaly_detection/tests/test_ad_metrics.py new file mode 100644 index 0000000000..617e55027a --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/tests/test_ad_metrics.py @@ -0,0 +1,304 @@ +"""Tests for metrics in anomaly_detection module.""" + +import numpy as np +import pytest + +from aeon.performance_metrics.anomaly_detection import ( + f_score_at_k_points, + f_score_at_k_ranges, + pr_auc_score, + range_f_score, + range_pr_auc_score, + range_pr_vus_score, + range_precision, + range_recall, + range_roc_auc_score, + range_roc_vus_score, + roc_auc_score, + rp_rr_auc_score, +) +from aeon.testing.data_generation import make_example_1d_numpy +from aeon.utils.validation._dependencies import _check_soft_dependencies + +pr_metrics = [pr_auc_score] +range_metrics = [ + range_roc_auc_score, + range_pr_auc_score, + range_roc_vus_score, + range_pr_vus_score, +] +other_metrics = [ + roc_auc_score, + f_score_at_k_points, +] +continuous_metrics = [*pr_metrics, *other_metrics, *range_metrics] +binary_metrics = [] + +if _check_soft_dependencies("prts", severity="none"): + pr_metrics.append(rp_rr_auc_score) + range_metrics.extend( + [ + range_recall, + range_precision, + range_f_score, + ] + ) + other_metrics.extend( + [ + f_score_at_k_ranges, + rp_rr_auc_score, + ] + ) + continuous_metrics.extend( + [ + rp_rr_auc_score, + f_score_at_k_ranges, + ] + ) + binary_metrics = [range_recall, range_precision, range_f_score] + +metrics = [*pr_metrics, *range_metrics, *other_metrics] + + +@pytest.mark.parametrize("metric", metrics, ids=[m.__name__ for m in metrics]) +def test_metric_output(metric): + """Test output has correct format.""" + y_score = make_example_1d_numpy(n_timepoints=20, random_state=0) + y_true = make_example_1d_numpy(n_timepoints=20, random_state=1) + y_true = (y_true > 0.5).astype(int) + + if metric in continuous_metrics: + res = metric( + y_true=y_true, + y_score=y_score, + ) + else: + y_pred = (y_score > 0.5).astype(int) + res = metric( + y_true=y_true, + y_pred=y_pred, + ) + + assert isinstance(res, float) + assert 0.0 <= res <= 1.0 + + +@pytest.mark.parametrize( + "metric", continuous_metrics, ids=[m.__name__ for m in continuous_metrics] +) +def test_continuous_metric_requires_scores(metric): + """Test that continuous metrics require float scores.""" + y_scores = np.array([1, 0, 1, 0, 0]) + y_true = np.array([0, 0, 1, 0, 0]) + + with pytest.raises(ValueError) as ex: + metric(y_true, y_scores) + assert "scores must be floats" in str(ex.value) + + y_scores =
y_scores.astype(np.float_) + metric(y_true, y_scores) + assert True, "Metric should accept float, so no error should be raised" + + +@pytest.mark.parametrize( + "metric", binary_metrics, ids=[m.__name__ for m in binary_metrics] +) +def test_binary_metric_requires_predictions(metric): + """Test that binary metrics require integer or boolean predictions.""" + y_scores = np.array([0.8, 0.1, 0.9, 0.3, 0.3], dtype=np.float_) + y_true = np.array([0, 0, 1, 0, 0], dtype=np.bool_) + + with pytest.raises(ValueError) as ex: + metric(y_true, y_scores) + assert "scores must be integers" in str(ex.value) + + y_pred = np.array(y_scores > 0.5, dtype=np.int_) + metric(y_true, y_pred) + + y_pred = y_pred.astype(np.bool_) + metric(y_true, y_pred) + assert True, "Metric should accept ints and bools, so no error should be raised" + + +@pytest.mark.parametrize("metric", metrics, ids=[m.__name__ for m in metrics]) +def test_edge_cases(metric): + """Test edge cases for all metrics.""" + y_true = np.zeros(10, dtype=np.int_) + y_true[2:4] = 1 + y_true[6:8] = 1 + y_zeros = np.zeros_like(y_true, dtype=np.float_) + y_flat = np.full_like(y_true, fill_value=0.5, dtype=np.float_) + y_ones = np.ones_like(y_true, dtype=np.float_) + + for y_pred in [y_zeros, y_flat, y_ones]: + if metric in binary_metrics: + score = metric(y_true, y_pred.astype(np.int_)) + else: + score = metric(y_true, y_pred) + np.testing.assert_almost_equal(score, 0) + + +@pytest.mark.parametrize("metric", pr_metrics, ids=[m.__name__ for m in pr_metrics]) +def test_edge_cases_pr_metrics(metric): + """Test edge cases for PR metrics.""" + y_true = np.zeros(10, dtype=np.int_) + y_true[2:4] = 1 + y_true[6:8] = 1 + y_inverted = (y_true * -1 + 1).astype(np.float_) + + score = metric(y_true, y_inverted) + assert score <= 0.2, f"{metric.__name__}(y_true, y_inverted)={score} is not <= 0.2" + + +@pytest.mark.skipif( + not _check_soft_dependencies("prts", severity="none"), + reason="required soft dependency prts not available", +) +def test_range_based_f1(): + """Test range-based F1 score.""" + y_pred = np.array([0, 1, 1, 0]) + y_true = np.array([0, 1, 0, 0]) + result = range_f_score(y_true, y_pred) + np.testing.assert_almost_equal(result, 0.66666, decimal=4) + + +@pytest.mark.skipif( + not _check_soft_dependencies("prts", severity="none"), + reason="required soft dependency prts not available", +) +def test_range_based_precision(): + """Test range-based precision.""" + y_pred = np.array([0, 1, 1, 0]) + y_true = np.array([0, 1, 0, 0]) + result = range_precision(y_true, y_pred) + assert result == 0.5 + + +@pytest.mark.skipif( + not _check_soft_dependencies("prts", severity="none"), + reason="required soft dependency prts not available", +) +def test_range_based_recall(): + """Test range-based recall.""" + y_pred = np.array([0, 1, 1, 0]) + y_true = np.array([0, 1, 0, 0]) + result = range_recall(y_true, y_pred) + assert result == 1 + + +@pytest.mark.skipif( + not _check_soft_dependencies("prts", severity="none"), + reason="required soft dependency prts not available", +) +def test_rf1_value_error(): + """Test range-based F1 score raises ValueError on binary predictions.""" + y_pred = np.array([0, 0.2, 0.7, 0]) + y_true = np.array([0, 1, 0, 0]) + with pytest.raises(ValueError): + range_f_score(y_true, y_pred) + + +def test_pr_curve_auc(): + """Test PR curve AUC.""" + y_pred = np.array([0, 0.1, 1.0, 0.5, 0, 0]) + y_true = np.array([0, 0, 1, 1, 0, 0]) + result = pr_auc_score(y_true, y_pred) + np.testing.assert_almost_equal(result, 1.0000, decimal=4) + + +# def 
test_average_precision(): +# y_pred = np.array([0, 0.1, 1., .5, 0, 0]) +# y_true = np.array([0, 1, 1, 0, 0, 0]) +# result = average_precision_score(y_true, y_pred) +# np.testing.assert_almost_equal(result, 0.8333, decimal=4) + + +@pytest.mark.skipif( + not _check_soft_dependencies("prts", severity="none"), + reason="required soft dependency prts not available", +) +def test_range_based_p_range_based_r_curve_auc(): + """Test range-based precision-recall curve AUC.""" + y_pred = np.array([0, 0.1, 1.0, 0.5, 0.1, 0]) + y_true = np.array([0, 1, 1, 1, 0, 0]) + result = rp_rr_auc_score(y_true, y_pred) + np.testing.assert_almost_equal(result, 0.9792, decimal=4) + + +@pytest.mark.skipif( + not _check_soft_dependencies("prts", severity="none"), + reason="required soft dependency prts not available", +) +def test_range_based_p_range_based_r_auc_perfect_hit(): + """Test range-based precision-recall curve AUC with perfect hit.""" + y_pred = np.array([0, 0, 0.5, 0.5, 0, 0]) + y_true = np.array([0, 0, 1, 1, 0, 0]) + result = rp_rr_auc_score(y_true, y_pred) + np.testing.assert_almost_equal(result, 1.0000, decimal=4) + + +@pytest.mark.skipif( + not _check_soft_dependencies("prts", severity="none"), + reason="required soft dependency prts not available", +) +def test_f_score_at_k_ranges(): + """Test range-based F1 score at k ranges.""" + y_pred = np.array([0.4, 0.1, 1.0, 0.5, 0.1, 0, 0.4, 0.5]) + y_true = np.array([0, 1, 1, 1, 0, 0, 1, 0]) + result = f_score_at_k_ranges(y_true, y_pred) + np.testing.assert_almost_equal(result, 0.5000, decimal=4) + result = f_score_at_k_ranges(y_true, y_pred, k=3) + np.testing.assert_almost_equal(result, 0.8000, decimal=4) + + +def test_fscore_at_k_points(): + """Test F1 score at k points.""" + y_pred = np.array([0.4, 0.6, 1.0, 0.5, 0.1, 0, 0.5, 0.4]) + y_true = np.array([0, 1, 1, 1, 0, 0, 1, 0]) + result = f_score_at_k_points(y_true, y_pred) + np.testing.assert_almost_equal(result, 1.0000, decimal=4) + result = f_score_at_k_points(y_true, y_pred, k=1) + np.testing.assert_almost_equal(result, 0.4000, decimal=4) + + +# VUS fixtures +y_true = np.zeros(200) +y_true[10:20] = 1 +y_true[28:33] = 1 +y_true[110:120] = 1 +y_score = np.random.default_rng(41).random(200) * 0.5 +y_score[16:22] = 1 +y_score[33:38] = 1 +y_score[160:170] = 1 + + +def test_range_pr_auc_compat(): + """Test range-based PR AUC.""" + result = range_pr_auc_score(y_true, y_score) + np.testing.assert_almost_equal(result, 0.3737854660, decimal=10) + + +def test_range_roc_auc_compat(): + """Test range-based ROC AUC.""" + result = range_roc_auc_score(y_true, y_score) + np.testing.assert_almost_equal(result, 0.7108527197, decimal=10) + + +def test_edge_case_existence_reward_compat(): + """Test edge case for existence reward in range-based PR/ROC AUC.""" + result = range_pr_auc_score(y_true, y_score, buffer_size=4) + np.testing.assert_almost_equal(result, 0.2506464391, decimal=10) + result = range_roc_auc_score(y_true, y_score, buffer_size=4) + np.testing.assert_almost_equal(result, 0.6143220816, decimal=10) + + +def test_range_pr_volume_compat(): + """Test range-based PR volume under the curve.""" + result = range_pr_vus_score(y_true, y_score, max_buffer_size=200) + np.testing.assert_almost_equal(result, 0.7493254559, decimal=10) + + +def test_range_roc_volume_compat(): + """Test range-based ROC volume under the curve.""" + result = range_roc_vus_score(y_true, y_score, max_buffer_size=200) + np.testing.assert_almost_equal(result, 0.8763382130, decimal=10) diff --git 
a/aeon/performance_metrics/anomaly_detection/tests/test_thresholding.py b/aeon/performance_metrics/anomaly_detection/tests/test_thresholding.py new file mode 100644 index 0000000000..42994da406 --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/tests/test_thresholding.py @@ -0,0 +1,37 @@ +"""Test anomaly detection thresholding methods.""" + +import numpy as np + +from aeon.performance_metrics.anomaly_detection.thresholding import ( + percentile_threshold, + sigma_threshold, + top_k_points_threshold, + top_k_ranges_threshold, +) + +y_true = np.array([0, 0, 1, 0, 0, 0, 0, 0, 0]) +y_score = np.array([np.nan, 0.1, 0.9, 0.5, 0.6, 0.55, 0.2, 0.1, np.nan]) + + +def test_percentile_thresholding(): + """Test percentile thresholding.""" + threshold = percentile_threshold(y_score, percentile=90) + np.testing.assert_almost_equal(threshold, 0.72, decimal=2) + + +def test_top_k_points_thresholding(): + """Test top k points thresholding.""" + threshold = top_k_points_threshold(y_true, y_score, k=2) + np.testing.assert_almost_equal(threshold, 0.58, decimal=2) + + +def test_top_k_ranges_thresholding(): + """Test top k ranges thresholding.""" + threshold = top_k_ranges_threshold(y_true, y_score, k=2) + np.testing.assert_almost_equal(threshold, 0.60, decimal=2) + + +def test_sigma_thresholding(): + """Test sigma thresholding.""" + threshold = sigma_threshold(y_score, factor=1) + np.testing.assert_almost_equal(threshold, 0.70, decimal=2) diff --git a/aeon/performance_metrics/anomaly_detection/thresholding.py b/aeon/performance_metrics/anomaly_detection/thresholding.py new file mode 100644 index 0000000000..a882a7887f --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/thresholding.py @@ -0,0 +1,147 @@ +"""Functions to compute thresholds to convert anomaly scores to binary predictions.""" + +from __future__ import annotations + +__maintainer__ = ["CodeLionX"] +__all__ = [ + "percentile_threshold", + "sigma_threshold", + "top_k_points_threshold", + "top_k_ranges_threshold", +] + +import numpy as np + + +def percentile_threshold(y_score: np.ndarray, percentile: int) -> float: + """Calculate a threshold based on a percentile of the anomaly scores. + + Uses the ``percentile``-th percentile of the anomaly scoring as the threshold, + ignoring NaNs and using linear interpolation. + + Parameters + ---------- + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + percentile : int + Percentile to use as threshold between 0 and 100. + + Returns + ------- + float + Threshold based on the percentile. + """ + return np.nanpercentile(y_score, percentile) + + +def sigma_threshold(y_score: np.ndarray, factor: float = 2) -> float: + r"""Calculate a threshold based on the standard deviation of the anomaly scores. + + Computes a threshold :math:`\theta` based on the anomaly scoring's mean + :math:`\mu_s` and the standard deviation :math:`\sigma_s`, ignoring NaNs: + + .. math:: + \theta = \mu_{s} + x \cdot \sigma_{s} + + Parameters + ---------- + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + factor : float + Number of standard deviations to use as threshold (:math:`x`). + + Returns + ------- + float + Threshold based on the standard deviation. + """ + return np.nanmean(y_score) + factor * np.nanstd(y_score) + + +def top_k_points_threshold( + y_true: np.ndarray, y_score: np.ndarray, k: int | None = None +) -> float: + """Calculate a threshold such that at least `k` anomalous points are found.
+ + The anomalies are single-point anomalies. + + Computes a threshold based on the number of expected anomalies (number of + anomalies). The threshold is computed as the ``(1 - k/n)``-th percentile of the + anomaly scores (ignoring NaNs), where ``n`` is the length of the time series. If + `k` is `None`, the ground truth data is used to calculate the real number of + anomalies. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + k : int, optional + Number of expected anomalies. If `k` is `None`, the ground truth data is used + to calculate the real number of anomalies. + + Returns + ------- + float + Threshold such that there are at least `k` anomalous points. + """ + if k is None: + return np.nanpercentile(y_score, (1 - y_true.sum() / y_true.shape[0]) * 100) + else: + return np.nanpercentile(y_score, (1 - k / y_true.shape[0]) * 100) + + +def top_k_ranges_threshold( + y_true: np.ndarray, y_score: np.ndarray, k: int | None = None +) -> float: + """Calculate a threshold such that at least `k` anomalies are found. + + The anomalies are either single-point anomalies or continuous anomalous ranges. + + Computes a threshold based on the number of expected anomalous subsequences / + ranges (number of anomalies). This method iterates over all possible thresholds + from high to low to find the first threshold that yields `k` or more continuous + anomalous ranges. If `k` is `None`, the ground truth data is used to calculate the + real number of anomalies (anomalous ranges). + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + k : int, optional + Number of expected anomalies. If `k` is `None`, the ground truth data is used + to calculate the real number of anomalies. + + Returns + ------- + float + Threshold such that there are at least `k` anomalous ranges. + """ + if k is None: + k = _count_anomaly_ranges(y_true) + thresholds = np.unique(y_score)[::-1] + + # exclude minimum from thresholds, because all points are >= minimum! + for t in thresholds[:-1]: + y_pred = np.array(y_score >= t, dtype=np.int_) + detected_n = _count_anomaly_ranges(y_pred) + if detected_n >= k: + return t + # fall back to the lowest threshold if no threshold yields k or more ranges + return thresholds[-1] + + +def _count_anomaly_ranges(y: np.ndarray) -> int: + """Count the number of continuous anomalous ranges in a binary sequence. + + Parameters + ---------- + y : np.ndarray + Binary sequence of shape (n_instances,). + + Returns + ------- + int + Number of continuous anomalous ranges.
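+ + Examples + -------- + Two separated ranges in a short binary sequence: + + >>> import numpy as np + >>> _count_anomaly_ranges(np.array([0, 1, 1, 0, 1, 0])) + 2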
+ """ + return int(np.sum(np.diff(np.r_[0, y, 0]) == 1)) diff --git a/aeon/utils/validation/_dependencies.py b/aeon/utils/validation/_dependencies.py index 594e3b2e0c..ee31e71087 100644 --- a/aeon/utils/validation/_dependencies.py +++ b/aeon/utils/validation/_dependencies.py @@ -78,6 +78,8 @@ def _check_soft_dependencies( if obj is None: class_name = "This functionality" + elif isinstance(obj, str): + class_name = obj elif not isclass(obj): class_name = type(obj).__name__ elif isclass(obj): diff --git a/aeon/utils/validation/tests/test_dependencies.py b/aeon/utils/validation/tests/test_dependencies.py index d05c6b5040..89cbd20383 100644 --- a/aeon/utils/validation/tests/test_dependencies.py +++ b/aeon/utils/validation/tests/test_dependencies.py @@ -20,7 +20,7 @@ def test__check_soft_dependencies(): _check_soft_dependencies("FOOBAR") with pytest.raises(ModuleNotFoundError, match="No module named 'FOOBAR'"): _check_soft_dependencies("FOOBAR", suppress_import_stdout=True) - with pytest.raises(ModuleNotFoundError, match="str requires package 'FOOBAR'"): + with pytest.raises(ModuleNotFoundError, match="FOOBAR requires package 'FOOBAR'"): _check_soft_dependencies("FOOBAR", obj="FOOBAR") with pytest.raises(RuntimeError, match="severity argument must be "): _check_soft_dependencies("FOOBAR", severity="FOOBAR") diff --git a/docs/api_reference/performance_metrics.rst b/docs/api_reference/performance_metrics.rst index 18aeecf05e..8e0754ad89 100644 --- a/docs/api_reference/performance_metrics.rst +++ b/docs/api_reference/performance_metrics.rst @@ -44,7 +44,7 @@ Forecasting Segmentation -~~~~~~~~~~~~ +------------ .. currentmodule:: aeon.performance_metrics.segmentation.metrics @@ -55,3 +55,40 @@ Segmentation count_error hausdorff_error prediction_ratio + +Anomaly Detection +----------------- + +.. currentmodule:: aeon.performance_metrics.anomaly_detection + +.. autosummary:: + :toctree: auto_generated/ + :template: function.rst + + range_precision + range_recall + range_f_score + roc_auc_score + pr_auc_score + rp_rr_auc_score + f_score_at_k_points + f_score_at_k_ranges + range_pr_roc_auc_support + range_roc_auc_score + range_pr_auc_score + range_pr_vus_score + range_roc_vus_score + +AD Thresholding +~~~~~~~~~~~~~~~ + +.. currentmodule:: aeon.performance_metrics.anomaly_detection.thresholding + +.. autosummary:: + :toctree: auto_generated/ + :template: function.rst + + percentile_threshold + sigma_threshold + top_k_points_threshold + top_k_ranges_threshold diff --git a/pyproject.toml b/pyproject.toml index 377d5c736a..cbea69021e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,7 @@ all_extras = [ "tslearn>=0.5.2", "xarray", "pyod>=1.1.3", + "prts>=1.0.0.0", # dependency of tensorflow, see issue #1724 "keras<3.4.0",