diff --git a/CODEOWNERS b/CODEOWNERS
index b003844696..7c51713485 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -20,6 +20,7 @@ aeon/forecasting/ @aiwalter @guzalbulatova @ltsaprounis
aeon/networks/ @hadifawaz1999
aeon/performance_metrics/forecasting/ @aiwalter
+aeon/performance_metrics/anomaly_detection/ @codelionx @MatthewMiddlehurst
aeon/pipeline/ @aiwalter
diff --git a/aeon/performance_metrics/anomaly_detection/__init__.py b/aeon/performance_metrics/anomaly_detection/__init__.py
new file mode 100644
index 0000000000..20b52bd503
--- /dev/null
+++ b/aeon/performance_metrics/anomaly_detection/__init__.py
@@ -0,0 +1,37 @@
+"""Metrics for anomaly detection."""
+
+from aeon.performance_metrics.anomaly_detection._binary import (
+ range_f_score,
+ range_precision,
+ range_recall,
+)
+from aeon.performance_metrics.anomaly_detection._continuous import (
+ f_score_at_k_points,
+ f_score_at_k_ranges,
+ pr_auc_score,
+ roc_auc_score,
+ rp_rr_auc_score,
+)
+from aeon.performance_metrics.anomaly_detection._vus_metrics import (
+ range_pr_auc_score,
+ range_pr_roc_auc_support,
+ range_pr_vus_score,
+ range_roc_auc_score,
+ range_roc_vus_score,
+)
+
+__all__ = [
+ "range_precision",
+ "range_recall",
+ "range_f_score",
+ "roc_auc_score",
+ "pr_auc_score",
+ "rp_rr_auc_score",
+ "f_score_at_k_points",
+ "f_score_at_k_ranges",
+ "range_pr_roc_auc_support",
+ "range_roc_auc_score",
+ "range_pr_auc_score",
+ "range_pr_vus_score",
+ "range_roc_vus_score",
+]
diff --git a/aeon/performance_metrics/anomaly_detection/_binary.py b/aeon/performance_metrics/anomaly_detection/_binary.py
new file mode 100644
index 0000000000..6c6c812ef9
--- /dev/null
+++ b/aeon/performance_metrics/anomaly_detection/_binary.py
@@ -0,0 +1,209 @@
+"""Metrics on binary predictions for anomaly detection."""
+
+__maintainer__ = ["CodeLionX"]
+__all__ = ["range_precision", "range_recall", "range_f_score"]
+
+import warnings
+
+import numpy as np
+
+from aeon.performance_metrics.anomaly_detection._util import check_y
+from aeon.utils.validation._dependencies import _check_soft_dependencies
+
+
+def range_precision(
+ y_true: np.ndarray,
+ y_pred: np.ndarray,
+ alpha: float = 0,
+ cardinality: str = "reciprocal",
+ bias: str = "flat",
+) -> float:
+ """Compute the range-based precision metric.
+
+ Range-based metrics were introduced by Tatbul et al. at NeurIPS 2018 [1]_. This
+    implementation uses the community package `prts <https://pypi.org/project/prts/>`_
+ as a soft-dependency.
+
+ Range precision is the average precision of each predicted anomaly range. For each
+ predicted continuous anomaly range the overlap size, position, and cardinality is
+ considered. For more details, please refer to the paper [1]_.
+
+ Parameters
+ ----------
+ y_true : np.ndarray
+ True binary labels of shape (n_instances,).
+ y_pred : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+ alpha : float
+        Weight of the existence reward. Because precision by definition emphasizes
+        prediction quality, there is no need for an existence reward and this value
+        should always be set to 0.
+ cardinality : {'reciprocal', 'one', 'udf_gamma'}
+ Cardinality type.
+ bias : {'flat', 'front', 'middle', 'back'}
+ Positional bias type.
+
+ Returns
+ -------
+ float
+ Range-based precision
+
+ References
+ ----------
+ .. [1] Tatbul, Nesime, Tae Jun Lee, Stan Zdonik, Mejbah Alam, and Justin
+ Gottschlich. "Precision and Recall for Time Series." In Proceedings of the
+ International Conference on Neural Information Processing Systems (NeurIPS),
+ 1920–30. 2018.
+ http://papers.nips.cc/paper/7462-precision-and-recall-for-time-series.pdf.
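+
+    Examples
+    --------
+    A minimal usage sketch with illustrative data; the call requires the optional
+    ``prts`` package, so the doctest is skipped:
+
+    >>> import numpy as np
+    >>> from aeon.performance_metrics.anomaly_detection import range_precision
+    >>> y_true = np.array([0, 0, 1, 1, 1, 0, 0, 0])
+    >>> y_pred = np.array([0, 0, 0, 1, 1, 1, 0, 0])
+    >>> range_precision(y_true, y_pred)  # doctest: +SKIP
+    0.666...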
+ """
+    _check_soft_dependencies(
+        "prts", obj="range_precision", suppress_import_stdout=True
+    )
+
+ from prts import ts_precision
+
+ y_true, y_pred = check_y(y_true, y_pred, force_y_pred_continuous=False)
+ if np.unique(y_pred).shape[0] == 1:
+ warnings.warn(
+ "Cannot compute metric for a constant value in y_score, returning 0.0!",
+ stacklevel=2,
+ )
+ return 0.0
+ return ts_precision(y_true, y_pred, alpha=alpha, cardinality=cardinality, bias=bias)
+
+
+def range_recall(
+ y_true: np.ndarray,
+ y_pred: np.ndarray,
+ alpha: float = 0,
+ cardinality: str = "reciprocal",
+ bias: str = "flat",
+) -> float:
+ """Compute the range-based recall metric.
+
+ Range-based metrics were introduced by Tatbul et al. at NeurIPS 2018 [1]_. This
+    implementation uses the community package `prts <https://pypi.org/project/prts/>`_
+ as a soft-dependency.
+
+ Range recall is the average recall of each real anomaly range. For each real
+ anomaly range the overlap size, position, and cardinality with predicted anomaly
+ ranges are considered. In addition, an existence reward can be given that boosts
+ the recall even if just a single point of the real anomaly is in the predicted
+ ranges. For more details, please refer to the paper [1]_.
+
+ Parameters
+ ----------
+ y_true : np.ndarray
+ True binary labels of shape (n_instances,).
+ y_pred : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+ alpha : float
+ Weight of the existence reward. If 0: no existence reward, if 1: only existence
+ reward. The existence reward is given if the real anomaly range has overlap
+ with even a single point of the predicted anomaly range.
+ cardinality : {'reciprocal', 'one', 'udf_gamma'}
+ Cardinality type.
+ bias : {'flat', 'front', 'middle', 'back'}
+ Positional bias type.
+
+ Returns
+ -------
+ float
+ Range-based recall
+
+ References
+ ----------
+ .. [1] Tatbul, Nesime, Tae Jun Lee, Stan Zdonik, Mejbah Alam, and Justin
+ Gottschlich. "Precision and Recall for Time Series." In Proceedings of the
+ International Conference on Neural Information Processing Systems (NeurIPS),
+ 1920–30. 2018.
+ http://papers.nips.cc/paper/7462-precision-and-recall-for-time-series.pdf.
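+
+    Examples
+    --------
+    A minimal usage sketch with illustrative data, giving half the weight to the
+    existence reward via ``alpha=0.5``; requires the optional ``prts`` package, so
+    the doctest is skipped:
+
+    >>> import numpy as np
+    >>> from aeon.performance_metrics.anomaly_detection import range_recall
+    >>> y_true = np.array([0, 0, 1, 1, 1, 0, 0, 0])
+    >>> y_pred = np.array([0, 0, 0, 1, 1, 1, 0, 0])
+    >>> range_recall(y_true, y_pred, alpha=0.5)  # doctest: +SKIP
+    0.833...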
+ """
+ _check_soft_dependencies("prts", obj="range_recall", suppress_import_stdout=True)
+
+ from prts import ts_recall
+
+ y_true, y_pred = check_y(y_true, y_pred, force_y_pred_continuous=False)
+ if np.unique(y_pred).shape[0] == 1:
+ warnings.warn(
+ "Cannot compute metric for a constant value in y_score, returning 0.0!",
+ stacklevel=2,
+ )
+ return 0.0
+ return ts_recall(y_true, y_pred, alpha=alpha, cardinality=cardinality, bias=bias)
+
+
+def range_f_score(
+ y_true: np.ndarray,
+ y_pred: np.ndarray,
+ beta: float = 1,
+ p_alpha: float = 0,
+ r_alpha: float = 0.5,
+ cardinality: str = "reciprocal",
+ p_bias: str = "flat",
+ r_bias: str = "flat",
+) -> float:
+ """Compute the F-score using the range-based recall and precision metrics.
+
+ Range-based metrics were introduced by Tatbul et al. at NeurIPS 2018 [1]_. This
+    implementation uses the community package `prts <https://pypi.org/project/prts/>`_
+ as a soft-dependency.
+
+ The F-beta score is the weighted harmonic mean of precision and recall, reaching
+ its optimal value at 1 and its worst value at 0. This implementation uses the
+ range-based precision and range-based recall as basis.
+
+ Parameters
+ ----------
+ y_true : np.ndarray
+ True binary labels of shape (n_instances,).
+ y_pred : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+ beta : float
+ F-score beta determines the weight of recall in the combined score.
+ beta < 1 lends more weight to precision, while beta > 1 favors recall.
+ p_alpha : float
+        Weight of the existence reward for the range-based precision. For most - if
+        not all - cases, `p_alpha` should be set to 0.
+ r_alpha : float
+ Weight of the existence reward. If 0: no existence reward, if 1: only
+ existence reward.
+ cardinality : {'reciprocal', 'one', 'udf_gamma'}
+ Cardinality type.
+ p_bias : {'flat', 'front', 'middle', 'back'}
+ Positional bias type.
+ r_bias : {'flat', 'front', 'middle', 'back'}
+ Positional bias type.
+
+ Returns
+ -------
+ float
+ Range-based F-score
+
+ References
+ ----------
+ .. [1] Tatbul, Nesime, Tae Jun Lee, Stan Zdonik, Mejbah Alam, and Justin
+ Gottschlich. "Precision and Recall for Time Series." In Proceedings of the
+ International Conference on Neural Information Processing Systems (NeurIPS),
+ 1920–30. 2018.
+ http://papers.nips.cc/paper/7462-precision-and-recall-for-time-series.pdf.
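+
+    Examples
+    --------
+    A minimal usage sketch with illustrative data; the call requires the optional
+    ``prts`` package, so the doctest is skipped:
+
+    >>> import numpy as np
+    >>> from aeon.performance_metrics.anomaly_detection import range_f_score
+    >>> y_true = np.array([0, 1, 0, 0])
+    >>> y_pred = np.array([0, 1, 1, 0])
+    >>> range_f_score(y_true, y_pred)  # doctest: +SKIP
+    0.666...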
+ """
+    _check_soft_dependencies("prts", obj="range_f_score", suppress_import_stdout=True)
+
+ from prts import ts_fscore
+
+ y_true, y_pred = check_y(y_true, y_pred, force_y_pred_continuous=False)
+ if np.unique(y_pred).shape[0] == 1:
+ warnings.warn(
+ "Cannot compute metric for a constant value in y_score, returning 0.0!",
+ stacklevel=2,
+ )
+ return 0.0
+ return ts_fscore(
+ y_true,
+ y_pred,
+ beta=beta,
+ p_alpha=p_alpha,
+ r_alpha=r_alpha,
+ cardinality=cardinality,
+ p_bias=p_bias,
+ r_bias=r_bias,
+ )
diff --git a/aeon/performance_metrics/anomaly_detection/_continuous.py b/aeon/performance_metrics/anomaly_detection/_continuous.py
new file mode 100644
index 0000000000..1d5925fb25
--- /dev/null
+++ b/aeon/performance_metrics/anomaly_detection/_continuous.py
@@ -0,0 +1,296 @@
+"""Metrics on anomaly scores for anomaly detection."""
+
+from __future__ import annotations
+
+__maintainer__ = ["CodeLionX"]
+__all__ = [
+ "roc_auc_score",
+ "pr_auc_score",
+ "f_score_at_k_points",
+ "f_score_at_k_ranges",
+ "rp_rr_auc_score",
+]
+
+import warnings
+
+import numpy as np
+from sklearn.metrics import auc, f1_score, precision_recall_curve
+from sklearn.metrics import roc_auc_score as _roc_auc_score
+
+from aeon.performance_metrics.anomaly_detection._util import check_y
+from aeon.performance_metrics.anomaly_detection.thresholding import (
+ top_k_points_threshold,
+ top_k_ranges_threshold,
+)
+from aeon.utils.validation._dependencies import _check_soft_dependencies
+
+
+def roc_auc_score(y_true: np.ndarray, y_score: np.ndarray) -> float:
+ """Compute the ROC AUC score.
+
+ Parameters
+ ----------
+ y_true : np.ndarray
+ True binary labels of shape (n_instances,).
+ y_score : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+
+ Returns
+ -------
+ float
+ ROC AUC score.
+
+ See Also
+ --------
+ sklearn.metrics.roc_auc_score
+ Is used internally.
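+
+    Examples
+    --------
+    A minimal usage sketch with illustrative data:
+
+    >>> import numpy as np
+    >>> from aeon.performance_metrics.anomaly_detection import roc_auc_score
+    >>> y_true = np.array([0, 0, 1, 1, 0])
+    >>> y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.2])
+    >>> roc_auc_score(y_true, y_score)  # doctest: +SKIP
+    0.833...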
+ """
+    y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True)
+ if np.unique(y_score).shape[0] == 1:
+ warnings.warn(
+ "Cannot compute metric for a constant value in y_score, returning 0.0!",
+ stacklevel=2,
+ )
+ return 0.0
+ return _roc_auc_score(y_true, y_score)
+
+
+def pr_auc_score(y_true: np.ndarray, y_score: np.ndarray) -> float:
+ """Compute the precision-recall AUC score.
+
+ Parameters
+ ----------
+ y_true : np.ndarray
+ True binary labels of shape (n_instances,).
+ y_score : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+
+ Returns
+ -------
+ float
+ Precision-recall AUC score.
+
+ See Also
+ --------
+ sklearn.metrics.precision_recall_curve
+ Function used under the hood.
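+
+    Examples
+    --------
+    A minimal usage sketch with illustrative data:
+
+    >>> import numpy as np
+    >>> from aeon.performance_metrics.anomaly_detection import pr_auc_score
+    >>> y_true = np.array([0, 0, 1, 1, 0, 0])
+    >>> y_score = np.array([0.0, 0.1, 1.0, 0.5, 0.0, 0.0])
+    >>> pr_auc_score(y_true, y_score)  # doctest: +SKIP
+    1.0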
+ """
+    y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True)
+ if np.unique(y_score).shape[0] == 1:
+ warnings.warn(
+ "Cannot compute metric for a constant value in y_score, returning 0.0!",
+ stacklevel=2,
+ )
+ return 0.0
+
+    precision, recall, _ = precision_recall_curve(y_true, y_score)
+    return auc(recall, precision)
+
+
+def f_score_at_k_points(
+ y_true: np.ndarray, y_score: np.ndarray, k: int | None = None
+) -> float:
+ """Compute the F-score at k based on single points.
+
+ This metric only considers the top-k predicted anomalous points within the scoring
+ by finding a threshold on the scoring that produces at least k anomalous points. If
+ `k` is not specified, the number of anomalies within the ground truth is used as
+ `k`.
+
+ Parameters
+ ----------
+ y_true : np.ndarray
+ True binary labels of shape (n_instances,).
+ y_score : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+    k : int, optional
+ Number of top anomalies used to calculate precision. If `k` is not specified
+ (`None`) the number of true anomalies (based on the ground truth values) is
+ used.
+
+ Returns
+ -------
+ float
+ F1 score at k.
+
+ See Also
+ --------
+ aeon.performance_metrics.anomaly_detection.thresholding.top_k_points_threshold
+ Function used to find the threshold.
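+
+    Examples
+    --------
+    A minimal usage sketch with illustrative data; with the default ``k=None``,
+    the threshold flags as many points as there are true anomalies:
+
+    >>> import numpy as np
+    >>> from aeon.performance_metrics.anomaly_detection import f_score_at_k_points
+    >>> y_true = np.array([0, 1, 1, 1, 0, 0, 1, 0])
+    >>> y_score = np.array([0.4, 0.6, 1.0, 0.5, 0.1, 0.0, 0.5, 0.4])
+    >>> f_score_at_k_points(y_true, y_score)  # doctest: +SKIP
+    1.0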
+ """
+    y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True)
+ if np.unique(y_score).shape[0] == 1:
+ warnings.warn(
+ "Cannot compute metric for a constant value in y_score, returning 0.0!",
+ stacklevel=2,
+ )
+ return 0.0
+
+ threshold = top_k_points_threshold(y_true, y_score, k)
+ y_pred = y_score >= threshold
+ return f1_score(y_true, y_pred)
+
+
+def f_score_at_k_ranges(
+ y_true: np.ndarray, y_score: np.ndarray, k: int | None = None
+) -> float:
+ """Compute the range-based F-score at k based on anomaly ranges.
+
+ This metric only considers the top-k predicted anomaly ranges within the scoring by
+ finding a threshold on the scoring that produces at least k anomalous ranges. If `k`
+ is not specified, the number of anomalies within the ground truth is used as `k`.
+
+ This implementation uses the community package
+    `prts <https://pypi.org/project/prts/>`_ as a soft-dependency to compute the
+ range-based F-score.
+
+ Parameters
+ ----------
+ y_true : np.ndarray
+ True binary labels of shape (n_instances,).
+ y_score : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+    k : int, optional
+ Number of top anomalies used to calculate precision. If `k` is not specified
+ (`None`) the number of true anomalies (based on the ground truth values) is
+ used.
+
+ Returns
+ -------
+ float
+ F1 score at k.
+
+ See Also
+ --------
+ aeon.performance_metrics.anomaly_detection.thresholding.top_k_ranges_threshold
+ Function used to find the threshold.
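+
+    Examples
+    --------
+    A minimal usage sketch with illustrative data; the call requires the optional
+    ``prts`` package, so the doctest is skipped:
+
+    >>> import numpy as np
+    >>> from aeon.performance_metrics.anomaly_detection import f_score_at_k_ranges
+    >>> y_true = np.array([0, 1, 1, 1, 0, 0, 1, 0])
+    >>> y_score = np.array([0.4, 0.1, 1.0, 0.5, 0.1, 0.0, 0.4, 0.5])
+    >>> f_score_at_k_ranges(y_true, y_score, k=3)  # doctest: +SKIP
+    0.8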
+ """
+ _check_soft_dependencies(
+ "prts", obj="f_score_at_k_ranges", suppress_import_stdout=True
+ )
+
+ from prts import ts_fscore
+
+    y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True)
+ if np.unique(y_score).shape[0] == 1:
+ warnings.warn(
+ "Cannot compute metric for a constant value in y_score, returning 0.0!",
+ stacklevel=2,
+ )
+ return 0.0
+
+ threshold = top_k_ranges_threshold(y_true, y_score, k)
+ y_pred = y_score >= threshold
+ return ts_fscore(y_true, y_pred, p_alpha=1, r_alpha=1, cardinality="reciprocal")
+
+
+def rp_rr_auc_score(
+ y_true: np.ndarray,
+ y_score: np.ndarray,
+ max_samples: int = 50,
+ r_alpha: float = 0.5,
+ p_alpha: float = 0,
+ cardinality: str = "reciprocal",
+ bias: str = "flat",
+) -> float:
+ """Compute the AUC-score of the range-based precision-recall curve.
+
+ Computes the area under the precision recall curve when using the range-based
+ precision and range-based recall metric introduced by Tatbul et al. at NeurIPS 2018
+ [1]_. This implementation uses the community package
+    `prts <https://pypi.org/project/prts/>`_ as a soft-dependency.
+
+    The precision-recall curve is approximated by evaluating the range-based
+    precision and recall at up to ``max_samples`` thresholds sampled from the
+    anomaly scoring.
+
+ Parameters
+ ----------
+ y_true : np.ndarray
+ True binary labels of shape (n_instances,).
+ y_score : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+    max_samples : int
+        The implementation of the range-based precision and recall metrics is quite
+        slow because it relies on the non-optimized ``prts`` package. To prevent long
+        runtimes caused by scorings with many unique values (many thresholds), only a
+        limited number of thresholds is sampled. This parameter controls the maximum
+        number of thresholds; however, too low a number degrades the metric's
+        quality.
+ r_alpha : float
+ Weight of the existence reward for the range-based recall.
+ p_alpha : float
+        Weight of the existence reward for the range-based precision. For most - if
+        not all - cases, `p_alpha` should be set to 0.
+ cardinality : {'reciprocal', 'one', 'udf_gamma'}
+ Cardinality type.
+ bias : {'flat', 'front', 'middle', 'back'}
+ Positional bias type.
+
+ Returns
+ -------
+ float
+ Area under the range-based precision-recall curve.
+
+ References
+ ----------
+ .. [1] Tatbul, Nesime, Tae Jun Lee, Stan Zdonik, Mejbah Alam, and Justin
+ Gottschlich. "Precision and Recall for Time Series." In Proceedings of the
+ International Conference on Neural Information Processing Systems (NeurIPS),
+ 1920–30. 2018.
+ http://papers.nips.cc/paper/7462-precision-and-recall-for-time-series.pdf.
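+
+    Examples
+    --------
+    A minimal usage sketch with illustrative data; a prediction that exactly hits
+    the single true range yields an area of 1. Requires the optional ``prts``
+    package, so the doctest is skipped:
+
+    >>> import numpy as np
+    >>> from aeon.performance_metrics.anomaly_detection import rp_rr_auc_score
+    >>> y_true = np.array([0, 0, 1, 1, 0, 0])
+    >>> y_score = np.array([0.0, 0.0, 0.5, 0.5, 0.0, 0.0])
+    >>> rp_rr_auc_score(y_true, y_score)  # doctest: +SKIP
+    1.0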
+ """
+ _check_soft_dependencies(
+ "prts", obj="f_score_at_k_ranges", suppress_import_stdout=True
+ )
+
+ from prts import ts_precision, ts_recall
+
+    y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True)
+ if np.unique(y_score).shape[0] == 1:
+ warnings.warn(
+ "Cannot compute metric for a constant value in y_score, returning 0.0!",
+ stacklevel=2,
+ )
+ return 0.0
+
+ thresholds = np.unique(y_score)
+ thresholds.sort()
+ # The first precision and recall values are precision=class balance and recall=1.0,
+ # which corresponds to a classifier that always predicts the positive class,
+ # independently of the threshold. This means that we can skip the first threshold!
+ p0 = y_true.sum() / len(y_true)
+ r0 = 1.0
+ thresholds = thresholds[1:]
+
+ # sample thresholds
+ n_thresholds = thresholds.shape[0]
+ if n_thresholds > max_samples:
+ every_nth = n_thresholds // (max_samples - 1)
+ sampled_thresholds = thresholds[::every_nth]
+ if thresholds[-1] == sampled_thresholds[-1]:
+ thresholds = sampled_thresholds
+ else:
+ thresholds = np.r_[sampled_thresholds, thresholds[-1]]
+
+ recalls = np.zeros_like(thresholds)
+ precisions = np.zeros_like(thresholds)
+ for i, threshold in enumerate(thresholds):
+ y_pred = (y_score >= threshold).astype(np.int64)
+ recalls[i] = ts_recall(
+ y_true, y_pred, alpha=r_alpha, cardinality=cardinality, bias=bias
+ )
+ precisions[i] = ts_precision(
+ y_true, y_pred, alpha=p_alpha, cardinality=cardinality, bias=bias
+ )
+ # first sort by recall, then by precision to break ties
+ # (important for noisy scorings)
+ sorted_idx = np.lexsort((precisions * (-1), recalls))[::-1]
+
+ # add first and last points to the curve
+ recalls = np.r_[r0, recalls[sorted_idx], 0]
+ precisions = np.r_[p0, precisions[sorted_idx], 1]
+
+ # calculate area under the curve
+ return auc(recalls, precisions)
diff --git a/aeon/performance_metrics/anomaly_detection/_util.py b/aeon/performance_metrics/anomaly_detection/_util.py
new file mode 100644
index 0000000000..8e48e19a68
--- /dev/null
+++ b/aeon/performance_metrics/anomaly_detection/_util.py
@@ -0,0 +1,119 @@
+"""Utility functions for anomaly detection performance metrics."""
+
+__maintainer__ = ["CodeLionX"]
+__all__ = ["check_y"]
+
+import warnings
+from typing import Tuple
+
+import numpy as np
+from sklearn.utils import assert_all_finite, check_consistent_length, column_or_1d
+
+
+def check_y(
+ y_true: np.ndarray,
+ y_pred: np.ndarray,
+ force_y_pred_continuous: bool = False,
+ inf_is_1: bool = True,
+ neginf_is_0: bool = True,
+ nan_is_0: bool = True,
+) -> Tuple[np.ndarray, np.ndarray]:
+ """Check the input arrays for the performance metrics.
+
+    You can control the interpretation of infinity, negative infinity, and NaN
+    values in the anomaly scoring with the parameters. If the respective parameter
+    is set to ``False``, the corresponding values are considered wrong predictions
+    (counted as FP or FN depending on the label).
+
+ Parameters
+ ----------
+ y_true : np.ndarray
+ True binary labels of shape (n_instances,).
+ y_pred : np.ndarray
+        Either the anomaly scores for each point of the time series or the binary
+        anomaly prediction of shape (n_instances,). If ``force_y_pred_continuous``
+        is set, ``y_pred`` must be a float array, otherwise it must be an integer
+        array.
+    force_y_pred_continuous : bool, default False
+        If ``True``, the function checks whether ``y_pred`` is a continuous float
+        array. Otherwise, it is assumed to contain binary predictions as an integer
+        array. If the user accidentally swapped the input parameters, this function
+        swaps them back.
+ inf_is_1 : bool, default True
+ Whether to treat infinite values in the anomaly scores as 1.
+ neginf_is_0 : bool, default True
+ Whether to treat negative infinite values in the anomaly scores as 0.
+ nan_is_0 : bool, default True
+ Whether to treat NaN values in the anomaly scores as 0.
+
+ Returns
+ -------
+ Tuple[np.ndarray, np.ndarray]
+ Tuple of the cleaned ``y_true`` and ``y_pred`` arrays.
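+
+    Examples
+    --------
+    A minimal sketch of the NaN/Inf handling with illustrative data; with the
+    default flags, NaN and -Inf become 0 and +Inf becomes 1:
+
+    >>> import numpy as np
+    >>> from aeon.performance_metrics.anomaly_detection._util import check_y
+    >>> y_true = np.array([0, 1, 1, 0])
+    >>> y_score = np.array([np.nan, 0.8, np.inf, -np.inf])
+    >>> check_y(y_true, y_score, force_y_pred_continuous=True)  # doctest: +SKIP
+    (array([0, 1, 1, 0]), array([0. , 0.8, 1. , 0. ]))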
+ """
+ y_true = np.array(y_true).copy()
+ y_pred = np.array(y_pred).copy()
+
+ # check labels
+ if (
+ force_y_pred_continuous
+ and y_true.dtype == np.float_
+ and y_pred.dtype == np.int_
+ ):
+ warnings.warn(
+ "Assuming that y_true and y_score where permuted, because their"
+ "dtypes indicate so. y_true should be an integer array and"
+ "y_score a float array!",
+ stacklevel=2,
+ )
+ return check_y(
+ y_pred,
+ y_true,
+ force_y_pred_continuous=force_y_pred_continuous,
+ inf_is_1=inf_is_1,
+ neginf_is_0=neginf_is_0,
+ nan_is_0=nan_is_0,
+ )
+
+ y_true = column_or_1d(y_true)
+ assert_all_finite(y_true)
+
+ # check scores
+ y_pred = column_or_1d(y_pred)
+
+ check_consistent_length([y_true, y_pred])
+ if not force_y_pred_continuous and y_pred.dtype not in [np.int_, np.bool_]:
+ raise ValueError(
+ "When using metrics other than AUC/VUS-metrics that need binary "
+ "(0 or 1) scores (like Precision, Recall or F1-Score), the scores must "
+ "be integers and should only contain the values {0, 1}. Please "
+ "consider applying a threshold to the scores!"
+ )
+ elif force_y_pred_continuous and y_pred.dtype != np.float_:
+ raise ValueError(
+ "When using continuous scoring metrics, the scores must be floats!"
+ )
+
+ # substitute NaNs and Infs
+ nan_mask = np.isnan(y_pred)
+ inf_mask = np.isinf(y_pred)
+ neginf_mask = np.isneginf(y_pred)
+ penalize_mask = np.full_like(y_pred, dtype=bool, fill_value=False)
+ if inf_is_1:
+ y_pred[inf_mask] = 1
+ else:
+ penalize_mask = penalize_mask | inf_mask
+ if neginf_is_0:
+ y_pred[neginf_mask] = 0
+ else:
+ penalize_mask = penalize_mask | neginf_mask
+ if nan_is_0:
+ y_pred[nan_mask] = 0
+ else:
+ penalize_mask = penalize_mask | nan_mask
+ y_pred[penalize_mask] = (~np.array(y_true[penalize_mask], dtype=bool)).astype(
+ np.int_
+ )
+
+ assert_all_finite(y_pred)
+ return y_true, y_pred
diff --git a/aeon/performance_metrics/anomaly_detection/_vus_metrics.py b/aeon/performance_metrics/anomaly_detection/_vus_metrics.py
new file mode 100644
index 0000000000..e1e4b53ce4
--- /dev/null
+++ b/aeon/performance_metrics/anomaly_detection/_vus_metrics.py
@@ -0,0 +1,382 @@
+"""VUS and AUC metrics for anomaly detection on time series."""
+
+from __future__ import annotations
+
+__maintainer__ = ["CodeLionX"]
+__all__ = [
+ "range_pr_auc_score",
+ "range_roc_auc_score",
+ "range_pr_vus_score",
+ "range_roc_vus_score",
+ "range_pr_roc_auc_support",
+]
+
+import warnings
+
+import numpy as np
+
+from aeon.performance_metrics.anomaly_detection._util import check_y
+
+
+def _anomaly_bounds(y_true: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+ """Corresponds to range_convers_new."""
+ # convert to boolean/binary
+ labels = y_true > 0
+ # deal with start and end of time series
+ labels = np.diff(np.r_[0, labels, 0])
+ # extract begin and end of anomalous regions
+ index = np.arange(0, labels.shape[0])
+ starts = index[labels == 1]
+ ends = index[labels == -1]
+ return starts, ends
+
+
+def _extend_anomaly_labels(
+ y_true: np.ndarray, buffer_size: int | None = None
+) -> tuple[np.ndarray, np.ndarray]:
+ """Extend the anomaly labels with slopes on both ends.
+
+ Also makes the labels continuous instead of binary.
+ """
+ starts, ends = _anomaly_bounds(y_true)
+
+ if buffer_size is None:
+ # per default: set buffer size as median anomaly length:
+ buffer_size = int(np.median(ends - starts))
+
+ if buffer_size <= 1:
+ anomalies = np.array(list(zip(starts, ends - 1)))
+ return y_true.astype(np.float_), anomalies
+
+ y_true_cont = y_true.astype(np.float_)
+ slope_length = buffer_size // 2
+ length = y_true_cont.shape[0]
+ for s, e in zip(starts, ends):
+ e -= 1
+ x1 = np.arange(e, min(e + slope_length, length))
+ y_true_cont[x1] += np.sqrt(1 - (x1 - e) / buffer_size)
+ x2 = np.arange(max(s - slope_length, 0), s)
+ y_true_cont[x2] += np.sqrt(1 - (s - x2) / buffer_size)
+ y_true_cont = np.clip(y_true_cont, 0, 1)
+ starts, ends = _anomaly_bounds(y_true_cont)
+ anomalies = np.array(list(zip(starts, ends - 1)))
+
+ return y_true_cont, anomalies
+
+
+def _uniform_threshold_sampling(y_score: np.ndarray) -> np.ndarray:
+ """Create the threshold via uniform sampling."""
+ # magic number from original implementation
+ n_samples = 250
+ thresholds = np.sort(y_score)[::-1]
+ thresholds = thresholds[
+ np.linspace(0, thresholds.shape[0] - 1, n_samples, dtype=np.int_)
+ ]
+ return thresholds
+
+
+def range_pr_roc_auc_support(
+ y_true: np.ndarray,
+ y_score: np.ndarray,
+ buffer_size: int | None = None,
+ skip_check: bool = False,
+) -> tuple[float, float]:
+ """Compute the range-based PR and ROC AUC.
+
+ Computes the area under the precision-recall-curve and the area under the
+ receiver operating characteristic using the range-based precision and range-based
+ recall definition from Paparrizos et al. published at VLDB 2022 [1]_.
+
+ We first extend the anomaly labels by two slopes of ``buffer_size//2`` length on
+ both sides of each anomaly, uniformly sample thresholds from the anomaly score, and
+    then compute the confusion matrix for all thresholds. Using the resulting
+    precision, recall, and false positive rates, we can plot both curves and
+    compute their areas.
+
+ Parameters
+ ----------
+ y_true : np.ndarray
+ True binary labels of shape (n_instances,).
+ y_score : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+ buffer_size : int, optional
+ Size of the buffer region around an anomaly. We add an increasing slope of size
+ ``buffer_size//2`` to the beginning of anomalies and a decreasing slope of size
+ ``buffer_size//2`` to the end of anomalies. Per default
+ (when ``buffer_size==None``), ``buffer_size`` is the median length of the
+ anomalies within the time series. However, you can also set it to the period
+ size of the dominant frequency or any other desired value.
+ skip_check : bool, default False
+ Whether to skip the input checks.
+
+ Returns
+ -------
+ Tuple[float, float]
+ Range-based PR AUC and range-based ROC AUC.
+
+ References
+ ----------
+ .. [1] John Paparrizos, Paul Boniol, Themis Palpanas, Ruey S. Tsay,
+ Aaron Elmore, and Michael J. Franklin. Volume Under the Surface: A New Accuracy
+ Evaluation Measure for Time-Series Anomaly Detection. PVLDB, 15(11):
+ 2774 - 2787, 2022.
+       doi:`10.14778/3551793.3551830 <https://doi.org/10.14778/3551793.3551830>`_
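+
+    Examples
+    --------
+    A minimal usage sketch with illustrative data; both AUC values are computed
+    in a single pass (doctest skipped because the exact values depend on the
+    sampled thresholds):
+
+    >>> import numpy as np
+    >>> from aeon.performance_metrics.anomaly_detection import (
+    ...     range_pr_roc_auc_support,
+    ... )
+    >>> rng = np.random.default_rng(0)
+    >>> y_true = np.zeros(100, dtype=int)
+    >>> y_true[30:40] = 1
+    >>> y_score = rng.random(100) * 0.3
+    >>> y_score[32:38] = 0.9
+    >>> pr_auc, roc_auc = range_pr_roc_auc_support(y_true, y_score)  # doctest: +SKIP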
+ """
+ if not skip_check:
+        y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True)
+ y_true_cont, anomalies = _extend_anomaly_labels(y_true, buffer_size)
+ thresholds = _uniform_threshold_sampling(y_score)
+ p = np.average([np.sum(y_true), np.sum(y_true_cont)])
+
+ recalls = np.zeros(thresholds.shape[0] + 2) # tprs
+ fprs = np.zeros(thresholds.shape[0] + 2)
+ precisions = np.ones(thresholds.shape[0] + 1)
+
+ for i, t in enumerate(thresholds):
+ y_pred = y_score >= t
+ product = y_true_cont * y_pred
+ tp = np.sum(product)
+ # fp = np.dot((np.ones_like(y_pred) - y_true_cont).T, y_pred)
+ fp = np.sum(y_pred) - tp
+ n = len(y_pred) - p
+
+ existence_reward = [np.sum(product[s : e + 1]) > 0 for s, e in anomalies]
+ existence_reward = np.sum(existence_reward) / anomalies.shape[0]
+
+ recall = min(tp / p, 1) * existence_reward # = tpr
+ fpr = min(fp / n, 1)
+ precision = tp / np.sum(y_pred)
+
+ recalls[i + 1] = recall
+ fprs[i + 1] = fpr
+ precisions[i + 1] = precision
+
+ recalls[-1] = 1
+ fprs[-1] = 1
+
+ range_pr_auc = np.sum(
+ (recalls[1:-1] - recalls[:-2]) * (precisions[1:] + precisions[:-1]) / 2
+ )
+ range_roc_auc = np.sum((fprs[1:] - fprs[:-1]) * (recalls[1:] + recalls[:-1]) / 2)
+ return range_pr_auc, range_roc_auc
+
+
+def range_roc_auc_score(
+ y_true: np.ndarray, y_score: np.ndarray, buffer_size: int | None = None
+) -> float:
+ """Compute the range-based area under the ROC curve.
+
+ Computes the area under the receiver-operating-characteristic-curve using the
+ range-based TPR and range-based FPR definition from Paparrizos et al.
+ published at VLDB 2022 [1]_.
+
+ We first extend the anomaly labels by two slopes of ``buffer_size//2`` length on
+ both sides of each anomaly, uniformly sample thresholds from the anomaly score, and
+    then compute the confusion matrix for all thresholds. Using the resulting true
+    positive rates (TPR) and false positive rates (FPR), we can plot a curve and
+    compute its area.
+
+ Parameters
+ ----------
+ y_true : np.ndarray
+ True binary labels of shape (n_instances,).
+ y_score : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+ buffer_size : int, optional
+ Size of the buffer region around an anomaly. We add an increasing slope of size
+ ``buffer_size//2`` to the beginning of anomalies and a decreasing slope of size
+ ``buffer_size//2`` to the end of anomalies. Per default
+ (when ``buffer_size==None``), ``buffer_size`` is the median length of the
+ anomalies within the time series. However, you can also set it to the period
+ size of the dominant frequency or any other desired value.
+
+ Returns
+ -------
+    float
+        Range-based ROC AUC score.
+
+ References
+ ----------
+ .. [1] John Paparrizos, Paul Boniol, Themis Palpanas, Ruey S. Tsay,
+ Aaron Elmore, and Michael J. Franklin. Volume Under the Surface: A New Accuracy
+ Evaluation Measure for Time-Series Anomaly Detection. PVLDB, 15(11):
+ 2774 - 2787, 2022.
+       doi:`10.14778/3551793.3551830 <https://doi.org/10.14778/3551793.3551830>`_
+ """
+    y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True)
+ if np.unique(y_score).shape[0] == 1:
+ warnings.warn(
+ "Cannot compute metric for a constant value in y_score, returning 0.0!",
+ stacklevel=2,
+ )
+ return 0.0
+
+ _, range_auc_roc = range_pr_roc_auc_support(
+ y_true, y_score, buffer_size, skip_check=True
+ )
+ return range_auc_roc
+
+
+def range_pr_auc_score(
+ y_true: np.ndarray, y_score: np.ndarray, buffer_size: int | None = None
+) -> float:
+ """Compute the area under the range-based PR curve.
+
+ Computes the area under the precision-recall-curve using the range-based precision
+ and range-based recall definition from Paparrizos et al. published at VLDB 2022
+ [1]_.
+
+ We first extend the anomaly labels by two slopes of ``buffer_size//2`` length on
+ both sides of each anomaly, uniformly sample thresholds from the anomaly score, and
+ then compute the confusion matrix for all thresholds. Using the resulting precision
+ and recall values, we can plot a curve and compute its area.
+
+ Parameters
+ ----------
+ y_true : np.ndarray
+ True binary labels of shape (n_instances,).
+ y_score : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+ buffer_size : int, optional
+ Size of the buffer region around an anomaly. We add an increasing slope of size
+ ``buffer_size//2`` to the beginning of anomalies and a decreasing slope of size
+ ``buffer_size//2`` to the end of anomalies. Per default
+ (when ``buffer_size==None``), ``buffer_size`` is the median length of the
+ anomalies within the time series. However, you can also set it to the period
+ size of the dominant frequency or any other desired value.
+
+ Returns
+ -------
+    float
+        Range-based PR AUC score.
+
+ References
+ ----------
+ .. [1] John Paparrizos, Paul Boniol, Themis Palpanas, Ruey S. Tsay,
+ Aaron Elmore, and Michael J. Franklin. Volume Under the Surface: A New Accuracy
+ Evaluation Measure for Time-Series Anomaly Detection. PVLDB, 15(11):
+ 2774 - 2787, 2022.
+       doi:`10.14778/3551793.3551830 <https://doi.org/10.14778/3551793.3551830>`_
+ """
+    y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True)
+ if np.unique(y_score).shape[0] == 1:
+ warnings.warn(
+ "Cannot compute metric for a constant value in y_score, returning 0.0!",
+ stacklevel=2,
+ )
+ return 0.0
+
+ range_pr_auc, _ = range_pr_roc_auc_support(
+ y_true, y_score, buffer_size, skip_check=True
+ )
+ return range_pr_auc
+
+
+def range_pr_vus_score(
+ y_true: np.ndarray, y_score: np.ndarray, max_buffer_size: int = 500
+) -> float:
+ """Compute the range-based PR VUS score.
+
+ Computes the volume under the precision-recall-buffer_size-surface using the
+ range-based precision and range-based recall definition from Paparrizos et al.
+ published at VLDB 2022 [1]_.
+
+ For all buffer sizes from 0 to ``max_buffer_size``, we first extend the anomaly
+ labels by two slopes of ``buffer_size//2`` length on both sides of each anomaly,
+ uniformly sample thresholds from the anomaly score, and then compute the confusion
+ matrix for all thresholds. Using the resulting precision and recall values, we can
+ plot a curve and compute its area.
+
+ Parameters
+ ----------
+ y_true : np.ndarray
+ True binary labels of shape (n_instances,).
+ y_score : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+ max_buffer_size : int, default=500
+ Maximum size of the buffer region around an anomaly. We iterate over all buffer
+        sizes from 0 to ``max_buffer_size`` to create the surface.
+
+ Returns
+ -------
+    float
+        Range-based PR VUS score.
+
+ References
+ ----------
+ .. [1] John Paparrizos, Paul Boniol, Themis Palpanas, Ruey S. Tsay,
+ Aaron Elmore, and Michael J. Franklin. Volume Under the Surface: A New Accuracy
+ Evaluation Measure for Time-Series Anomaly Detection. PVLDB, 15(11):
+ 2774 - 2787, 2022.
+       doi:`10.14778/3551793.3551830 <https://doi.org/10.14778/3551793.3551830>`_
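+
+    Examples
+    --------
+    A minimal usage sketch with illustrative data; a small ``max_buffer_size``
+    keeps the runtime low (doctest skipped):
+
+    >>> import numpy as np
+    >>> from aeon.performance_metrics.anomaly_detection import range_pr_vus_score
+    >>> rng = np.random.default_rng(42)
+    >>> y_true = np.zeros(100, dtype=int)
+    >>> y_true[30:40] = 1
+    >>> y_score = rng.random(100) * 0.3
+    >>> y_score[32:38] = 0.9
+    >>> range_pr_vus_score(y_true, y_score, max_buffer_size=20)  # doctest: +SKIP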
+ """
+    y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True)
+ if np.unique(y_score).shape[0] == 1:
+ warnings.warn(
+ "Cannot compute metric for a constant value in y_score, returning 0.0!",
+ stacklevel=2,
+ )
+ return 0.0
+
+ prs = np.zeros(max_buffer_size + 1)
+ for buffer_size in np.arange(0, max_buffer_size + 1):
+ pr_auc, _ = range_pr_roc_auc_support(
+ y_true, y_score, buffer_size=buffer_size, skip_check=True
+ )
+ prs[buffer_size] = pr_auc
+ range_pr_volume = np.sum(prs) / (max_buffer_size + 1)
+ return range_pr_volume
+
+
+def range_roc_vus_score(
+ y_true: np.ndarray, y_score: np.ndarray, max_buffer_size: int = 500
+) -> float:
+ """Compute the range-based ROC VUS score.
+
+ Computes the volume under the receiver-operating-characteristic-buffer_size-surface
+ using the range-based TPR and range-based FPR definition from Paparrizos et al.
+ published at VLDB 2022 [1]_.
+
+ For all buffer sizes from 0 to ``max_buffer_size``, we first extend the anomaly
+ labels by two slopes of ``buffer_size//2`` length on both sides of each anomaly,
+ uniformly sample thresholds from the anomaly score, and then compute the confusion
+    matrix for all thresholds. Using the resulting true positive rates (TPR) and
+    false positive rates (FPR), we can plot a curve and compute its area.
+
+ Parameters
+ ----------
+ y_true : np.ndarray
+ True binary labels of shape (n_instances,).
+ y_score : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+ max_buffer_size : int, default=500
+ Maximum size of the buffer region around an anomaly. We iterate over all buffer
+        sizes from 0 to ``max_buffer_size`` to create the surface.
+
+ Returns
+ -------
+    float
+        Range-based ROC VUS score.
+
+ References
+ ----------
+ .. [1] John Paparrizos, Paul Boniol, Themis Palpanas, Ruey S. Tsay,
+ Aaron Elmore, and Michael J. Franklin. Volume Under the Surface: A New Accuracy
+ Evaluation Measure for Time-Series Anomaly Detection. PVLDB, 15(11):
+ 2774 - 2787, 2022.
+       doi:`10.14778/3551793.3551830 <https://doi.org/10.14778/3551793.3551830>`_
+ """
+    y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True)
+ if np.unique(y_score).shape[0] == 1:
+ warnings.warn(
+ "Cannot compute metric for a constant value in y_score, returning 0.0!",
+ stacklevel=2,
+ )
+ return 0.0
+
+ rocs = np.zeros(max_buffer_size + 1)
+ for buffer_size in np.arange(0, max_buffer_size + 1):
+ _, roc_auc = range_pr_roc_auc_support(
+ y_true, y_score, buffer_size=buffer_size, skip_check=True
+ )
+ rocs[buffer_size] = roc_auc
+ range_roc_volume = np.sum(rocs) / (max_buffer_size + 1)
+ return range_roc_volume
diff --git a/aeon/performance_metrics/anomaly_detection/tests/__init__.py b/aeon/performance_metrics/anomaly_detection/tests/__init__.py
new file mode 100644
index 0000000000..85491bb5d7
--- /dev/null
+++ b/aeon/performance_metrics/anomaly_detection/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for metrics in anomaly_detection module."""
diff --git a/aeon/performance_metrics/anomaly_detection/tests/test_ad_metrics.py b/aeon/performance_metrics/anomaly_detection/tests/test_ad_metrics.py
new file mode 100644
index 0000000000..617e55027a
--- /dev/null
+++ b/aeon/performance_metrics/anomaly_detection/tests/test_ad_metrics.py
@@ -0,0 +1,304 @@
+"""Tests for metrics in anomaly_detection module."""
+
+import numpy as np
+import pytest
+
+from aeon.performance_metrics.anomaly_detection import (
+ f_score_at_k_points,
+ f_score_at_k_ranges,
+ pr_auc_score,
+ range_f_score,
+ range_pr_auc_score,
+ range_pr_vus_score,
+ range_precision,
+ range_recall,
+ range_roc_auc_score,
+ range_roc_vus_score,
+ roc_auc_score,
+ rp_rr_auc_score,
+)
+from aeon.testing.data_generation import make_example_1d_numpy
+from aeon.utils.validation._dependencies import _check_soft_dependencies
+
+pr_metrics = [pr_auc_score]
+range_metrics = [
+ range_roc_auc_score,
+ range_pr_auc_score,
+ range_roc_vus_score,
+ range_pr_vus_score,
+]
+other_metrics = [
+ roc_auc_score,
+ f_score_at_k_points,
+]
+continuous_metrics = [*pr_metrics, *other_metrics, *range_metrics]
+binary_metrics = []
+
+if _check_soft_dependencies("prts", severity="none"):
+ pr_metrics.append(rp_rr_auc_score)
+ range_metrics.extend(
+ [
+ range_recall,
+ range_precision,
+ range_f_score,
+ ]
+ )
+ other_metrics.extend(
+ [
+ f_score_at_k_ranges,
+ rp_rr_auc_score,
+ ]
+ )
+ continuous_metrics.extend(
+ [
+ rp_rr_auc_score,
+ f_score_at_k_ranges,
+ ]
+ )
+ binary_metrics = [range_recall, range_precision, range_f_score]
+
+metrics = [*pr_metrics, *range_metrics, *other_metrics]
+
+
+@pytest.mark.parametrize("metric", metrics, ids=[m.__name__ for m in metrics])
+def test_metric_output(metric):
+ """Test output has correct format."""
+ y_score = make_example_1d_numpy(n_timepoints=20, random_state=0)
+ y_true = make_example_1d_numpy(n_timepoints=20, random_state=1)
+ y_true = (y_true > 0.5).astype(int)
+
+ if metric in continuous_metrics:
+ res = metric(
+ y_true=y_true,
+ y_score=y_score,
+ )
+ else:
+ y_pred = (y_score > 0.5).astype(int)
+ res = metric(
+ y_true=y_true,
+ y_pred=y_pred,
+ )
+
+ assert isinstance(res, float)
+ assert 0.0 <= res <= 1.0
+
+
+@pytest.mark.parametrize(
+ "metric", continuous_metrics, ids=[m.__name__ for m in continuous_metrics]
+)
+def test_continuous_metric_requires_scores(metric):
+ """Test that continuous metrics require float scores."""
+ y_scores = np.array([1, 0, 1, 0, 0])
+ y_true = np.array([0, 0, 1, 0, 0])
+
+ with pytest.raises(ValueError) as ex:
+ metric(y_true, y_scores)
+ assert "scores must be floats" in str(ex.value)
+
+ y_scores = y_scores.astype(np.float_)
+ metric(y_true, y_scores)
+ assert True, "Metric should accept float, so no error should be raised"
+
+
+@pytest.mark.parametrize(
+ "metric", binary_metrics, ids=[m.__name__ for m in binary_metrics]
+)
+def test_binary_metric_requires_predictions(metric):
+ """Test that binary metrics require integer or boolean predictions."""
+ y_scores = np.array([0.8, 0.1, 0.9, 0.3, 0.3], dtype=np.float_)
+ y_true = np.array([0, 0, 1, 0, 0], dtype=np.bool_)
+
+ with pytest.raises(ValueError) as ex:
+ metric(y_true, y_scores)
+ assert "scores must be integers" in str(ex.value)
+
+ y_pred = np.array(y_scores > 0.5, dtype=np.int_)
+ metric(y_true, y_pred)
+
+ y_pred = y_pred.astype(np.bool_)
+ metric(y_true, y_pred)
+ assert True, "Metric should accept ints and bools, so no error should be raised"
+
+
+@pytest.mark.parametrize("metric", metrics, ids=[m.__name__ for m in metrics])
+def test_edge_cases(metric):
+ """Test edge cases for all metrics."""
+ y_true = np.zeros(10, dtype=np.int_)
+ y_true[2:4] = 1
+ y_true[6:8] = 1
+ y_zeros = np.zeros_like(y_true, dtype=np.float_)
+ y_flat = np.full_like(y_true, fill_value=0.5, dtype=np.float_)
+ y_ones = np.ones_like(y_true, dtype=np.float_)
+
+ for y_pred in [y_zeros, y_flat, y_ones]:
+ if metric in binary_metrics:
+ score = metric(y_true, y_pred.astype(np.int_))
+ else:
+ score = metric(y_true, y_pred)
+ np.testing.assert_almost_equal(score, 0)
+
+
+@pytest.mark.parametrize("metric", pr_metrics, ids=[m.__name__ for m in pr_metrics])
+def test_edge_cases_pr_metrics(metric):
+ """Test edge cases for PR metrics."""
+ y_true = np.zeros(10, dtype=np.int_)
+ y_true[2:4] = 1
+ y_true[6:8] = 1
+ y_inverted = (y_true * -1 + 1).astype(np.float_)
+
+ score = metric(y_true, y_inverted)
+ assert score <= 0.2, f"{metric.__name__}(y_true, y_inverted)={score} is not <= 0.2"
+
+
+@pytest.mark.skipif(
+ not _check_soft_dependencies("prts", severity="none"),
+ reason="required soft dependency prts not available",
+)
+def test_range_based_f1():
+ """Test range-based F1 score."""
+ y_pred = np.array([0, 1, 1, 0])
+ y_true = np.array([0, 1, 0, 0])
+ result = range_f_score(y_true, y_pred)
+ np.testing.assert_almost_equal(result, 0.66666, decimal=4)
+
+
+@pytest.mark.skipif(
+ not _check_soft_dependencies("prts", severity="none"),
+ reason="required soft dependency prts not available",
+)
+def test_range_based_precision():
+ """Test range-based precision."""
+ y_pred = np.array([0, 1, 1, 0])
+ y_true = np.array([0, 1, 0, 0])
+ result = range_precision(y_true, y_pred)
+ assert result == 0.5
+
+
+@pytest.mark.skipif(
+ not _check_soft_dependencies("prts", severity="none"),
+ reason="required soft dependency prts not available",
+)
+def test_range_based_recall():
+ """Test range-based recall."""
+ y_pred = np.array([0, 1, 1, 0])
+ y_true = np.array([0, 1, 0, 0])
+ result = range_recall(y_true, y_pred)
+ assert result == 1
+
+
+@pytest.mark.skipif(
+ not _check_soft_dependencies("prts", severity="none"),
+ reason="required soft dependency prts not available",
+)
+def test_rf1_value_error():
+ """Test range-based F1 score raises ValueError on binary predictions."""
+ y_pred = np.array([0, 0.2, 0.7, 0])
+ y_true = np.array([0, 1, 0, 0])
+ with pytest.raises(ValueError):
+ range_f_score(y_true, y_pred)
+
+
+def test_pr_curve_auc():
+ """Test PR curve AUC."""
+ y_pred = np.array([0, 0.1, 1.0, 0.5, 0, 0])
+ y_true = np.array([0, 0, 1, 1, 0, 0])
+ result = pr_auc_score(y_true, y_pred)
+ np.testing.assert_almost_equal(result, 1.0000, decimal=4)
+
+
+@pytest.mark.skipif(
+ not _check_soft_dependencies("prts", severity="none"),
+ reason="required soft dependency prts not available",
+)
+def test_range_based_p_range_based_r_curve_auc():
+ """Test range-based precision-recall curve AUC."""
+ y_pred = np.array([0, 0.1, 1.0, 0.5, 0.1, 0])
+ y_true = np.array([0, 1, 1, 1, 0, 0])
+ result = rp_rr_auc_score(y_true, y_pred)
+ np.testing.assert_almost_equal(result, 0.9792, decimal=4)
+
+
+@pytest.mark.skipif(
+ not _check_soft_dependencies("prts", severity="none"),
+ reason="required soft dependency prts not available",
+)
+def test_range_based_p_range_based_r_auc_perfect_hit():
+ """Test range-based precision-recall curve AUC with perfect hit."""
+ y_pred = np.array([0, 0, 0.5, 0.5, 0, 0])
+ y_true = np.array([0, 0, 1, 1, 0, 0])
+ result = rp_rr_auc_score(y_true, y_pred)
+ np.testing.assert_almost_equal(result, 1.0000, decimal=4)
+
+
+@pytest.mark.skipif(
+ not _check_soft_dependencies("prts", severity="none"),
+ reason="required soft dependency prts not available",
+)
+def test_f_score_at_k_ranges():
+ """Test range-based F1 score at k ranges."""
+ y_pred = np.array([0.4, 0.1, 1.0, 0.5, 0.1, 0, 0.4, 0.5])
+ y_true = np.array([0, 1, 1, 1, 0, 0, 1, 0])
+ result = f_score_at_k_ranges(y_true, y_pred)
+ np.testing.assert_almost_equal(result, 0.5000, decimal=4)
+ result = f_score_at_k_ranges(y_true, y_pred, k=3)
+ np.testing.assert_almost_equal(result, 0.8000, decimal=4)
+
+
+def test_fscore_at_k_points():
+ """Test F1 score at k points."""
+ y_pred = np.array([0.4, 0.6, 1.0, 0.5, 0.1, 0, 0.5, 0.4])
+ y_true = np.array([0, 1, 1, 1, 0, 0, 1, 0])
+ result = f_score_at_k_points(y_true, y_pred)
+ np.testing.assert_almost_equal(result, 1.0000, decimal=4)
+ result = f_score_at_k_points(y_true, y_pred, k=1)
+ np.testing.assert_almost_equal(result, 0.4000, decimal=4)
+
+
+# VUS fixtures
+y_true = np.zeros(200)
+y_true[10:20] = 1
+y_true[28:33] = 1
+y_true[110:120] = 1
+y_score = np.random.default_rng(41).random(200) * 0.5
+y_score[16:22] = 1
+y_score[33:38] = 1
+y_score[160:170] = 1
+
+
+def test_range_pr_auc_compat():
+ """Test range-based PR AUC."""
+ result = range_pr_auc_score(y_true, y_score)
+ np.testing.assert_almost_equal(result, 0.3737854660, decimal=10)
+
+
+def test_range_roc_auc_compat():
+ """Test range-based ROC AUC."""
+ result = range_roc_auc_score(y_true, y_score)
+ np.testing.assert_almost_equal(result, 0.7108527197, decimal=10)
+
+
+def test_edge_case_existence_reward_compat():
+ """Test edge case for existence reward in range-based PR/ROC AUC."""
+ result = range_pr_auc_score(y_true, y_score, buffer_size=4)
+ np.testing.assert_almost_equal(result, 0.2506464391, decimal=10)
+ result = range_roc_auc_score(y_true, y_score, buffer_size=4)
+ np.testing.assert_almost_equal(result, 0.6143220816, decimal=10)
+
+
+def test_range_pr_volume_compat():
+ """Test range-based PR volume under the curve."""
+ result = range_pr_vus_score(y_true, y_score, max_buffer_size=200)
+ np.testing.assert_almost_equal(result, 0.7493254559, decimal=10)
+
+
+def test_range_roc_volume_compat():
+ """Test range-based ROC volume under the curve."""
+ result = range_roc_vus_score(y_true, y_score, max_buffer_size=200)
+ np.testing.assert_almost_equal(result, 0.8763382130, decimal=10)
diff --git a/aeon/performance_metrics/anomaly_detection/tests/test_thresholding.py b/aeon/performance_metrics/anomaly_detection/tests/test_thresholding.py
new file mode 100644
index 0000000000..42994da406
--- /dev/null
+++ b/aeon/performance_metrics/anomaly_detection/tests/test_thresholding.py
@@ -0,0 +1,37 @@
+"""Test anomaly detection thresholding methods."""
+
+import numpy as np
+
+from aeon.performance_metrics.anomaly_detection.thresholding import (
+ percentile_threshold,
+ sigma_threshold,
+ top_k_points_threshold,
+ top_k_ranges_threshold,
+)
+
+y_true = np.array([0, 0, 1, 0, 0, 0, 0, 0, 0])
+y_score = np.array([np.nan, 0.1, 0.9, 0.5, 0.6, 0.55, 0.2, 0.1, np.nan])
+
+
+def test_percentile_thresholding():
+ """Test percentile thresholding."""
+ threshold = percentile_threshold(y_score, percentile=90)
+ np.testing.assert_almost_equal(threshold, 0.72, decimal=2)
+
+
+def test_top_k_points_thresholding():
+ """Test top k points thresholding."""
+ threshold = top_k_points_threshold(y_true, y_score, k=2)
+ np.testing.assert_almost_equal(threshold, 0.58, decimal=2)
+
+
+def test_top_k_ranges_thresholding():
+ """Test top k ranges thresholding."""
+ threshold = top_k_ranges_threshold(y_true, y_score, k=2)
+ np.testing.assert_almost_equal(threshold, 0.60, decimal=2)
+
+
+def test_sigma_thresholding():
+ """Test sigma thresholding."""
+ threshold = sigma_threshold(y_score, factor=1)
+ np.testing.assert_almost_equal(threshold, 0.70, decimal=2)
diff --git a/aeon/performance_metrics/anomaly_detection/thresholding.py b/aeon/performance_metrics/anomaly_detection/thresholding.py
new file mode 100644
index 0000000000..a882a7887f
--- /dev/null
+++ b/aeon/performance_metrics/anomaly_detection/thresholding.py
@@ -0,0 +1,147 @@
+"""Functions to compute thresholds to convert anomaly scores to binary predictions."""
+
+from __future__ import annotations
+
+__maintainer__ = ["CodeLionX"]
+__all__ = [
+ "percentile_threshold",
+ "sigma_threshold",
+ "top_k_points_threshold",
+ "top_k_ranges_threshold",
+]
+
+import numpy as np
+
+
+def percentile_threshold(y_score: np.ndarray, percentile: int) -> float:
+ """Calculate a threshold based on a percentile of the anomaly scores.
+
+    Uses the ``percentile``-th percentile of the anomaly scoring as threshold,
+    ignoring NaNs and using linear interpolation.
+
+ Parameters
+ ----------
+ y_score : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+ percentile : int
+ Percentile to use as threshold between 0 and 100.
+
+ Returns
+ -------
+ float
+ Threshold based on the percentile.
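+
+    Examples
+    --------
+    A minimal usage sketch with illustrative data; NaNs are ignored:
+
+    >>> import numpy as np
+    >>> from aeon.performance_metrics.anomaly_detection.thresholding import (
+    ...     percentile_threshold,
+    ... )
+    >>> y_score = np.array([0.1, 0.5, 0.9, 0.4, np.nan])
+    >>> percentile_threshold(y_score, percentile=90)  # doctest: +SKIP
+    0.78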
+ """
+ return np.nanpercentile(y_score, percentile)
+
+
+def sigma_threshold(y_score: np.ndarray, factor: float = 2) -> float:
+ r"""Calculate a threshold based on the standard deviation of the anomaly scores.
+
+ Computes a threshold :math:`\theta` based on the anomaly scoring's mean
+ :math:`\mu_s` and the standard deviation :math:`\sigma_s`, ignoring NaNs:
+
+ .. math::
+ \theta = \mu_{s} + x \cdot \sigma_{s}
+
+ Parameters
+ ----------
+ y_score : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+ factor : float
+ Number of standard deviations to use as threshold (:math:`x`).
+
+ Returns
+ -------
+ float
+ Threshold based on the standard deviation.
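+
+    Examples
+    --------
+    A minimal usage sketch with illustrative data:
+
+    >>> import numpy as np
+    >>> from aeon.performance_metrics.anomaly_detection.thresholding import (
+    ...     sigma_threshold,
+    ... )
+    >>> y_score = np.array([0.1, 0.5, 0.9, 0.4])
+    >>> sigma_threshold(y_score, factor=1)  # doctest: +SKIP
+    0.761...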
+ """
+ return np.nanmean(y_score) + factor * np.nanstd(y_score)
+
+
+def top_k_points_threshold(
+ y_true: np.ndarray, y_score: np.ndarray, k: int | None = None
+) -> float:
+ """Calculate a threshold such that at least `k` anomalous points are found.
+
+ The anomalies are single-point anomalies.
+
+    Computes a threshold based on the number of expected anomalous points. The
+    threshold is computed as the ``100 * (1 - k/n)``-th percentile of the anomaly
+    scores, such that (approximately) the `k` highest-scoring of the `n` points are
+    marked anomalous. If `k` is `None`, the ground truth data is used to calculate
+    the real number of anomalies.
+
+ Parameters
+ ----------
+ y_true : np.ndarray
+ True binary labels of shape (n_instances,).
+ y_score : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+    k : int, optional
+ Number of expected anomalies. If `k` is `None`, the ground truth data is used
+ to calculate the real number of anomalies.
+
+ Returns
+ -------
+ float
+ Threshold such that there are at least `k` anomalous points.
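+
+    Examples
+    --------
+    A minimal usage sketch with illustrative data; NaNs in the scoring are
+    ignored:
+
+    >>> import numpy as np
+    >>> from aeon.performance_metrics.anomaly_detection.thresholding import (
+    ...     top_k_points_threshold,
+    ... )
+    >>> y_true = np.array([0, 0, 1, 0, 0, 0, 0, 0, 0])
+    >>> y_score = np.array([np.nan, 0.1, 0.9, 0.5, 0.6, 0.55, 0.2, 0.1, np.nan])
+    >>> top_k_points_threshold(y_true, y_score, k=2)  # doctest: +SKIP
+    0.583...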
+ """
+ if k is None:
+ return np.nanpercentile(y_score, (1 - y_true.sum() / y_true.shape[0]) * 100)
+ else:
+ return np.nanpercentile(y_score, (1 - k / y_true.shape[0]) * 100)
+
+
+def top_k_ranges_threshold(
+ y_true: np.ndarray, y_score: np.ndarray, k: int | None = None
+) -> float:
+ """Calculate a threshold such that at least `k` anomalies are found.
+
+    The anomalies are either single-point anomalies or continuous anomalous ranges.
+
+    Computes a threshold based on the number of expected anomalous subsequences or
+    ranges. This method iterates over all possible thresholds
+ from high to low to find the first threshold that yields `k` or more continuous
+ anomalous ranges. If `k` is `None`, the ground truth data is used to calculate the
+ real number of anomalies (anomalous ranges).
+
+ Parameters
+ ----------
+ y_true : np.ndarray
+ True binary labels of shape (n_instances,).
+ y_score : np.ndarray
+ Anomaly scores for each point of the time series of shape (n_instances,).
+    k : int, optional
+ Number of expected anomalies. If `k` is `None`, the ground truth data is used
+ to calculate the real number of anomalies.
+
+ Returns
+ -------
+ float
+ Threshold such that there are at least `k` anomalous ranges.
+ """
+ if k is None:
+ k = _count_anomaly_ranges(y_true)
+ thresholds = np.unique(y_score)[::-1]
+
+ # exclude minimum from thresholds, because all points are >= minimum!
+ for t in thresholds[:-1]:
+ y_pred = np.array(y_score >= t, dtype=np.int_)
+ detected_n = _count_anomaly_ranges(y_pred)
+ if detected_n >= k:
+ return t
+
+
+def _count_anomaly_ranges(y: np.ndarray) -> int:
+ """Count the number of continuous anomalous ranges in a binary sequence.
+
+ Parameters
+ ----------
+ y : np.ndarray
+ Binary sequence of shape (n_instances,).
+
+ Returns
+ -------
+ int
+ Number of continuous anomalous ranges.
+ """
+ return int(np.sum(np.diff(np.r_[0, y, 0]) == 1))
diff --git a/aeon/utils/validation/_dependencies.py b/aeon/utils/validation/_dependencies.py
index 594e3b2e0c..ee31e71087 100644
--- a/aeon/utils/validation/_dependencies.py
+++ b/aeon/utils/validation/_dependencies.py
@@ -78,6 +78,8 @@ def _check_soft_dependencies(
if obj is None:
class_name = "This functionality"
+ elif isinstance(obj, str):
+ class_name = obj
elif not isclass(obj):
class_name = type(obj).__name__
elif isclass(obj):
diff --git a/aeon/utils/validation/tests/test_dependencies.py b/aeon/utils/validation/tests/test_dependencies.py
index d05c6b5040..89cbd20383 100644
--- a/aeon/utils/validation/tests/test_dependencies.py
+++ b/aeon/utils/validation/tests/test_dependencies.py
@@ -20,7 +20,7 @@ def test__check_soft_dependencies():
_check_soft_dependencies("FOOBAR")
with pytest.raises(ModuleNotFoundError, match="No module named 'FOOBAR'"):
_check_soft_dependencies("FOOBAR", suppress_import_stdout=True)
- with pytest.raises(ModuleNotFoundError, match="str requires package 'FOOBAR'"):
+ with pytest.raises(ModuleNotFoundError, match="FOOBAR requires package 'FOOBAR'"):
_check_soft_dependencies("FOOBAR", obj="FOOBAR")
with pytest.raises(RuntimeError, match="severity argument must be "):
_check_soft_dependencies("FOOBAR", severity="FOOBAR")
diff --git a/docs/api_reference/performance_metrics.rst b/docs/api_reference/performance_metrics.rst
index 18aeecf05e..8e0754ad89 100644
--- a/docs/api_reference/performance_metrics.rst
+++ b/docs/api_reference/performance_metrics.rst
@@ -44,7 +44,7 @@ Forecasting
Segmentation
-~~~~~~~~~~~~
+------------
.. currentmodule:: aeon.performance_metrics.segmentation.metrics
@@ -55,3 +55,40 @@ Segmentation
count_error
hausdorff_error
prediction_ratio
+
+Anomaly Detection
+-----------------
+
+.. currentmodule:: aeon.performance_metrics.anomaly_detection
+
+.. autosummary::
+ :toctree: auto_generated/
+ :template: function.rst
+
+ range_precision
+ range_recall
+ range_f_score
+ roc_auc_score
+ pr_auc_score
+ rp_rr_auc_score
+ f_score_at_k_points
+ f_score_at_k_ranges
+ range_pr_roc_auc_support
+ range_roc_auc_score
+ range_pr_auc_score
+ range_pr_vus_score
+ range_roc_vus_score
+
+AD Thresholding
+~~~~~~~~~~~~~~~
+
+.. currentmodule:: aeon.performance_metrics.anomaly_detection.thresholding
+
+.. autosummary::
+ :toctree: auto_generated/
+ :template: function.rst
+
+ percentile_threshold
+ sigma_threshold
+ top_k_points_threshold
+ top_k_ranges_threshold
diff --git a/pyproject.toml b/pyproject.toml
index 377d5c736a..cbea69021e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -81,6 +81,7 @@ all_extras = [
"tslearn>=0.5.2",
"xarray",
"pyod>=1.1.3",
+ "prts>=1.0.0.0",
# dependency of tensorflow, see issue #1724
"keras<3.4.0",