From 2b011bc4746422ce0e391d8892bb53d8d79a826c Mon Sep 17 00:00:00 2001 From: Sebastian Schmidl Date: Mon, 2 Sep 2024 11:24:13 +0200 Subject: [PATCH] [ENH] Add performance metrics for anomaly detection (from TimeEval) (#1938) * feat: add TSAD metrics and tests from TimeEval * fix: prts soft dependency specification * feat: update CODEOWNERS * feat: add to docs * fix: prts version specification until https://github.com/CompML/PRTS/pull/77 is merged and add skiptests conditions if not installed * feat: handle str-object-names in check_soft_dependencies correctly * fix: check_soft_dependencies test case for str-object reference * fix: address PR review change requests --- CODEOWNERS | 1 + .../anomaly_detection/__init__.py | 37 ++ .../anomaly_detection/_binary.py | 209 ++++++++++ .../anomaly_detection/_continuous.py | 296 ++++++++++++++ .../anomaly_detection/_util.py | 119 ++++++ .../anomaly_detection/_vus_metrics.py | 382 ++++++++++++++++++ .../anomaly_detection/tests/__init__.py | 1 + .../tests/test_ad_metrics.py | 304 ++++++++++++++ .../tests/test_thresholding.py | 37 ++ .../anomaly_detection/thresholding.py | 147 +++++++ aeon/utils/validation/_dependencies.py | 2 + .../validation/tests/test_dependencies.py | 2 +- docs/api_reference/performance_metrics.rst | 39 +- pyproject.toml | 1 + 14 files changed, 1575 insertions(+), 2 deletions(-) create mode 100644 aeon/performance_metrics/anomaly_detection/__init__.py create mode 100644 aeon/performance_metrics/anomaly_detection/_binary.py create mode 100644 aeon/performance_metrics/anomaly_detection/_continuous.py create mode 100644 aeon/performance_metrics/anomaly_detection/_util.py create mode 100644 aeon/performance_metrics/anomaly_detection/_vus_metrics.py create mode 100644 aeon/performance_metrics/anomaly_detection/tests/__init__.py create mode 100644 aeon/performance_metrics/anomaly_detection/tests/test_ad_metrics.py create mode 100644 aeon/performance_metrics/anomaly_detection/tests/test_thresholding.py create mode 100644 aeon/performance_metrics/anomaly_detection/thresholding.py diff --git a/CODEOWNERS b/CODEOWNERS index b003844696..7c51713485 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -20,6 +20,7 @@ aeon/forecasting/ @aiwalter @guzalbulatova @ltsaprounis aeon/networks/ @hadifawaz1999 aeon/performance_metrics/forecasting/ @aiwalter +aeon/performance_metrics/anomaly_detection/ @codelionx @MatthewMiddlehurst aeon/pipeline/ @aiwalter diff --git a/aeon/performance_metrics/anomaly_detection/__init__.py b/aeon/performance_metrics/anomaly_detection/__init__.py new file mode 100644 index 0000000000..20b52bd503 --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/__init__.py @@ -0,0 +1,37 @@ +"""Metrics for anomaly detection.""" + +from aeon.performance_metrics.anomaly_detection._binary import ( + range_f_score, + range_precision, + range_recall, +) +from aeon.performance_metrics.anomaly_detection._continuous import ( + f_score_at_k_points, + f_score_at_k_ranges, + pr_auc_score, + roc_auc_score, + rp_rr_auc_score, +) +from aeon.performance_metrics.anomaly_detection._vus_metrics import ( + range_pr_auc_score, + range_pr_roc_auc_support, + range_pr_vus_score, + range_roc_auc_score, + range_roc_vus_score, +) + +__all__ = [ + "range_precision", + "range_recall", + "range_f_score", + "roc_auc_score", + "pr_auc_score", + "rp_rr_auc_score", + "f_score_at_k_points", + "f_score_at_k_ranges", + "range_pr_roc_auc_support", + "range_roc_auc_score", + "range_pr_auc_score", + "range_pr_vus_score", + "range_roc_vus_score", +] diff --git 
a/aeon/performance_metrics/anomaly_detection/_binary.py b/aeon/performance_metrics/anomaly_detection/_binary.py new file mode 100644 index 0000000000..6c6c812ef9 --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/_binary.py @@ -0,0 +1,209 @@ +"""Metrics on binary predictions for anomaly detection.""" + +__maintainer__ = ["CodeLionX"] +__all__ = ["range_precision", "range_recall", "range_f_score"] + +import warnings + +import numpy as np + +from aeon.performance_metrics.anomaly_detection._util import check_y +from aeon.utils.validation._dependencies import _check_soft_dependencies + + +def range_precision( + y_true: np.ndarray, + y_pred: np.ndarray, + alpha: float = 0, + cardinality: str = "reciprocal", + bias: str = "flat", +) -> float: + """Compute the range-based precision metric. + + Range-based metrics were introduced by Tatbul et al. at NeurIPS 2018 [1]_. This + implementation uses the community package `prts <https://github.com/CompML/PRTS>`_ + as a soft-dependency. + + Range precision is the average precision of each predicted anomaly range. For each + predicted continuous anomaly range the overlap size, position, and cardinality are + considered. For more details, please refer to the paper [1]_. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_pred : np.ndarray + Binary anomaly predictions for each point of the time series of shape + (n_instances,). + alpha : float + Weight of the existence reward. Because precision by definition emphasizes + prediction quality, there is no need for an existence reward and this value + should always be set to 0. + cardinality : {'reciprocal', 'one', 'udf_gamma'} + Cardinality type. + bias : {'flat', 'front', 'middle', 'back'} + Positional bias type. + + Returns + ------- + float + Range-based precision + + References + ---------- + .. [1] Tatbul, Nesime, Tae Jun Lee, Stan Zdonik, Mejbah Alam, and Justin + Gottschlich. "Precision and Recall for Time Series." In Proceedings of the + International Conference on Neural Information Processing Systems (NeurIPS), + 1920–30. 2018. + http://papers.nips.cc/paper/7462-precision-and-recall-for-time-series.pdf. + """ + _check_soft_dependencies("prts", obj="range_precision", suppress_import_stdout=True) + + from prts import ts_precision + + y_true, y_pred = check_y(y_true, y_pred, force_y_pred_continuous=False) + if np.unique(y_pred).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_pred, returning 0.0!", + stacklevel=2, + ) + return 0.0 + return ts_precision(y_true, y_pred, alpha=alpha, cardinality=cardinality, bias=bias) + + +def range_recall( + y_true: np.ndarray, + y_pred: np.ndarray, + alpha: float = 0, + cardinality: str = "reciprocal", + bias: str = "flat", +) -> float: + """Compute the range-based recall metric. + + Range-based metrics were introduced by Tatbul et al. at NeurIPS 2018 [1]_. This + implementation uses the community package `prts <https://github.com/CompML/PRTS>`_ + as a soft-dependency. + + Range recall is the average recall of each real anomaly range. For each real + anomaly range the overlap size, position, and cardinality with predicted anomaly + ranges are considered. In addition, an existence reward can be given that boosts + the recall even if just a single point of the real anomaly is in the predicted + ranges. For more details, please refer to the paper [1]_. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_pred : np.ndarray + Binary anomaly predictions for each point of the time series of shape + (n_instances,).
+ alpha : float + Weight of the existence reward. If 0: no existence reward, if 1: only existence + reward. The existence reward is given if the real anomaly range has overlap + with even a single point of the predicted anomaly range. + cardinality : {'reciprocal', 'one', 'udf_gamma'} + Cardinality type. + bias : {'flat', 'front', 'middle', 'back'} + Positional bias type. + + Returns + ------- + float + Range-based recall + + References + ---------- + .. [1] Tatbul, Nesime, Tae Jun Lee, Stan Zdonik, Mejbah Alam, and Justin + Gottschlich. "Precision and Recall for Time Series." In Proceedings of the + International Conference on Neural Information Processing Systems (NeurIPS), + 1920–30. 2018. + http://papers.nips.cc/paper/7462-precision-and-recall-for-time-series.pdf. + """ + _check_soft_dependencies("prts", obj="range_recall", suppress_import_stdout=True) + + from prts import ts_recall + + y_true, y_pred = check_y(y_true, y_pred, force_y_pred_continuous=False) + if np.unique(y_pred).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_pred, returning 0.0!", + stacklevel=2, + ) + return 0.0 + return ts_recall(y_true, y_pred, alpha=alpha, cardinality=cardinality, bias=bias) + + +def range_f_score( + y_true: np.ndarray, + y_pred: np.ndarray, + beta: float = 1, + p_alpha: float = 0, + r_alpha: float = 0.5, + cardinality: str = "reciprocal", + p_bias: str = "flat", + r_bias: str = "flat", +) -> float: + """Compute the F-score using the range-based recall and precision metrics. + + Range-based metrics were introduced by Tatbul et al. at NeurIPS 2018 [1]_. This + implementation uses the community package `prts <https://github.com/CompML/PRTS>`_ + as a soft-dependency. + + The F-beta score is the weighted harmonic mean of precision and recall, reaching + its optimal value at 1 and its worst value at 0. This implementation uses the + range-based precision and range-based recall as its basis. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_pred : np.ndarray + Binary anomaly predictions for each point of the time series of shape + (n_instances,). + beta : float + F-score beta determines the weight of recall in the combined score. + beta < 1 lends more weight to precision, while beta > 1 favors recall. + p_alpha : float + Weight of the existence reward for the range-based precision. For most - if + not all - cases, `p_alpha` should be set to 0. + r_alpha : float + Weight of the existence reward. If 0: no existence reward, if 1: only + existence reward. + cardinality : {'reciprocal', 'one', 'udf_gamma'} + Cardinality type. + p_bias : {'flat', 'front', 'middle', 'back'} + Positional bias type for the range-based precision. + r_bias : {'flat', 'front', 'middle', 'back'} + Positional bias type for the range-based recall. + + Returns + ------- + float + Range-based F-score + + References + ---------- + .. [1] Tatbul, Nesime, Tae Jun Lee, Stan Zdonik, Mejbah Alam, and Justin + Gottschlich. "Precision and Recall for Time Series." In Proceedings of the + International Conference on Neural Information Processing Systems (NeurIPS), + 1920–30. 2018. + http://papers.nips.cc/paper/7462-precision-and-recall-for-time-series.pdf.
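+ + Examples + -------- + A sketch mirroring the values asserted in the unit tests; requires the soft + dependency ``prts``, hence the skip directive: + + >>> import numpy as np + >>> y_true = np.array([0, 1, 0, 0]) + >>> y_pred = np.array([0, 1, 1, 0]) + >>> range_f_score(y_true, y_pred)  # doctest: +SKIP + 0.66666...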
+ """ + _check_soft_dependencies("prts", obj="range_recall", suppress_import_stdout=True) + + from prts import ts_fscore + + y_true, y_pred = check_y(y_true, y_pred, force_y_pred_continuous=False) + if np.unique(y_pred).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + return ts_fscore( + y_true, + y_pred, + beta=beta, + p_alpha=p_alpha, + r_alpha=r_alpha, + cardinality=cardinality, + p_bias=p_bias, + r_bias=r_bias, + ) diff --git a/aeon/performance_metrics/anomaly_detection/_continuous.py b/aeon/performance_metrics/anomaly_detection/_continuous.py new file mode 100644 index 0000000000..1d5925fb25 --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/_continuous.py @@ -0,0 +1,296 @@ +"""Metrics on anomaly scores for anomaly detection.""" + +from __future__ import annotations + +__maintainer__ = ["CodeLionX"] +__all__ = [ + "roc_auc_score", + "pr_auc_score", + "f_score_at_k_points", + "f_score_at_k_ranges", + "rp_rr_auc_score", +] + +import warnings + +import numpy as np +from sklearn.metrics import auc, f1_score, precision_recall_curve +from sklearn.metrics import roc_auc_score as _roc_auc_score + +from aeon.performance_metrics.anomaly_detection._util import check_y +from aeon.performance_metrics.anomaly_detection.thresholding import ( + top_k_points_threshold, + top_k_ranges_threshold, +) +from aeon.utils.validation._dependencies import _check_soft_dependencies + + +def roc_auc_score(y_true: np.ndarray, y_score: np.ndarray) -> float: + """Compute the ROC AUC score. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + + Returns + ------- + float + ROC AUC score. + + See Also + -------- + sklearn.metrics.roc_auc_score + Is used internally. + """ + y_true, y_pred = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + return _roc_auc_score(y_true, y_score) + + +def pr_auc_score(y_true: np.ndarray, y_score: np.ndarray) -> float: + """Compute the precision-recall AUC score. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + + Returns + ------- + float + Precision-recall AUC score. + + See Also + -------- + sklearn.metrics.precision_recall_curve + Function used under the hood. + """ + y_true, y_pred = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + + x, y, _ = precision_recall_curve(y_true, np.array(y_score)) + area = auc(y, x) + return area + + +def f_score_at_k_points( + y_true: np.ndarray, y_score: np.ndarray, k: int | None = None +) -> float: + """Compute the F-score at k based on single points. + + This metric only considers the top-k predicted anomalous points within the scoring + by finding a threshold on the scoring that produces at least k anomalous points. If + `k` is not specified, the number of anomalies within the ground truth is used as + `k`. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). 
+ y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + k : int, optional + Number of top anomalies used to determine the threshold. If `k` is not + specified (`None`), the number of true anomalies (based on the ground truth + values) is used. + + Returns + ------- + float + F1 score at k. + + See Also + -------- + aeon.performance_metrics.anomaly_detection.thresholding.top_k_points_threshold + Function used to find the threshold. + """ + y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + + threshold = top_k_points_threshold(y_true, y_score, k) + y_pred = y_score >= threshold + return f1_score(y_true, y_pred) + + +def f_score_at_k_ranges( + y_true: np.ndarray, y_score: np.ndarray, k: int | None = None +) -> float: + """Compute the range-based F-score at k based on anomaly ranges. + + This metric only considers the top-k predicted anomaly ranges within the scoring by + finding a threshold on the scoring that produces at least k anomalous ranges. If `k` + is not specified, the number of anomalies within the ground truth is used as `k`. + + This implementation uses the community package + `prts <https://github.com/CompML/PRTS>`_ as a soft-dependency to compute the + range-based F-score. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + k : int, optional + Number of top anomalies used to determine the threshold. If `k` is not + specified (`None`), the number of true anomalies (based on the ground truth + values) is used. + + Returns + ------- + float + F1 score at k. + + See Also + -------- + aeon.performance_metrics.anomaly_detection.thresholding.top_k_ranges_threshold + Function used to find the threshold. + """ + _check_soft_dependencies( + "prts", obj="f_score_at_k_ranges", suppress_import_stdout=True + ) + + from prts import ts_fscore + + y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + + threshold = top_k_ranges_threshold(y_true, y_score, k) + y_pred = y_score >= threshold + return ts_fscore(y_true, y_pred, p_alpha=1, r_alpha=1, cardinality="reciprocal") + + +def rp_rr_auc_score( + y_true: np.ndarray, + y_score: np.ndarray, + max_samples: int = 50, + r_alpha: float = 0.5, + p_alpha: float = 0, + cardinality: str = "reciprocal", + bias: str = "flat", +) -> float: + """Compute the AUC-score of the range-based precision-recall curve. + + Computes the area under the precision-recall curve when using the range-based + precision and range-based recall metric introduced by Tatbul et al. at NeurIPS 2018 + [1]_. This implementation uses the community package + `prts <https://github.com/CompML/PRTS>`_ as a soft-dependency. + + Because the range-based metrics are expensive to compute, the precision-recall + curve is built from a sample of at most ``max_samples`` candidate thresholds drawn + from the anomaly scoring. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,).
+ max_samples : int + The implementation of the range-based precision and recall metrics is quite + slow because it relies on the non-optimized ``prts``-package. To prevent long + runtimes caused by scorings with high precision (many distinct thresholds), + only a limited number of candidate thresholds is sampled. This parameter + controls the maximum number of thresholds; however, too low numbers degrade + the metrics' quality. + r_alpha : float + Weight of the existence reward for the range-based recall. + p_alpha : float + Weight of the existence reward for the range-based precision. For most - if + not all - cases, `p_alpha` should be set to 0. + cardinality : {'reciprocal', 'one', 'udf_gamma'} + Cardinality type. + bias : {'flat', 'front', 'middle', 'back'} + Positional bias type. + + Returns + ------- + float + Area under the range-based precision-recall curve. + + References + ---------- + .. [1] Tatbul, Nesime, Tae Jun Lee, Stan Zdonik, Mejbah Alam, and Justin + Gottschlich. "Precision and Recall for Time Series." In Proceedings of the + International Conference on Neural Information Processing Systems (NeurIPS), + 1920–30. 2018. + http://papers.nips.cc/paper/7462-precision-and-recall-for-time-series.pdf. + """ + _check_soft_dependencies( + "prts", obj="rp_rr_auc_score", suppress_import_stdout=True + ) + + from prts import ts_precision, ts_recall + + y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + + thresholds = np.unique(y_score) + thresholds.sort() + # The first precision and recall values are precision=class balance and recall=1.0, + # which corresponds to a classifier that always predicts the positive class, + # independently of the threshold. This means that we can skip the first threshold!
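+ # (e.g. with 2 anomalous points out of 10, the all-positive classifier sits + # at recall=1.0 and precision=0.2, i.e. the positive class balance)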
+ p0 = y_true.sum() / len(y_true) + r0 = 1.0 + thresholds = thresholds[1:] + + # sample thresholds + n_thresholds = thresholds.shape[0] + if n_thresholds > max_samples: + every_nth = n_thresholds // (max_samples - 1) + sampled_thresholds = thresholds[::every_nth] + if thresholds[-1] == sampled_thresholds[-1]: + thresholds = sampled_thresholds + else: + thresholds = np.r_[sampled_thresholds, thresholds[-1]] + + recalls = np.zeros_like(thresholds) + precisions = np.zeros_like(thresholds) + for i, threshold in enumerate(thresholds): + y_pred = (y_score >= threshold).astype(np.int64) + recalls[i] = ts_recall( + y_true, y_pred, alpha=r_alpha, cardinality=cardinality, bias=bias + ) + precisions[i] = ts_precision( + y_true, y_pred, alpha=p_alpha, cardinality=cardinality, bias=bias + ) + # first sort by recall, then by precision to break ties + # (important for noisy scorings) + sorted_idx = np.lexsort((precisions * (-1), recalls))[::-1] + + # add first and last points to the curve + recalls = np.r_[r0, recalls[sorted_idx], 0] + precisions = np.r_[p0, precisions[sorted_idx], 1] + + # calculate area under the curve + return auc(recalls, precisions) diff --git a/aeon/performance_metrics/anomaly_detection/_util.py b/aeon/performance_metrics/anomaly_detection/_util.py new file mode 100644 index 0000000000..8e48e19a68 --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/_util.py @@ -0,0 +1,119 @@ +"""Utility functions for anomaly detection performance metrics.""" + +__maintainer__ = ["CodeLionX"] +__all__ = ["check_y"] + +import warnings +from typing import Tuple + +import numpy as np +from sklearn.utils import assert_all_finite, check_consistent_length, column_or_1d + + +def check_y( + y_true: np.ndarray, + y_pred: np.ndarray, + force_y_pred_continuous: bool = False, + inf_is_1: bool = True, + neginf_is_0: bool = True, + nan_is_0: bool = True, +) -> Tuple[np.ndarray, np.ndarray]: + """Check the input arrays for the performance metrics. + + This function checks the input arrays for the performance metrics. You can control + the interpretation of infinity, negative infinity, and NaN values in the anomaly + scoring with the parameters. If the respective parameter is set to ``False``, the + corresponding values are considered as wrong predictions (either as FP or FN + depending on the label). + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_pred : np.ndarray + Either the anomaly scores for each point of the time series or the binary + anomaly prediction of shape (n_instances,). If ``force_y_pred_continuous`` is + set, ``y_pred`` must be a float array, otherwise it must be an integer array. + force_y_pred_continuous : bool, default False + If ``True``, the function checks whether ``y_pred`` is a continuous float + array. Otherwise, it is assumed to contain binary predictions as an integer + array. If the user accidentally swapped the input parameters, this function + swaps them back. + inf_is_1 : bool, default True + Whether to treat infinite values in the anomaly scores as 1. + neginf_is_0 : bool, default True + Whether to treat negative infinite values in the anomaly scores as 0. + nan_is_0 : bool, default True + Whether to treat NaN values in the anomaly scores as 0. + + Returns + ------- + Tuple[np.ndarray, np.ndarray] + Tuple of the cleaned ``y_true`` and ``y_pred`` arrays.
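+ + Examples + -------- + A sketch of the default NaN/Inf substitution (array formatting abbreviated): + + >>> import numpy as np + >>> y_true = np.array([0, 1, 0]) + >>> y_score = np.array([np.nan, 0.8, np.inf]) + >>> check_y(y_true, y_score, force_y_pred_continuous=True)  # doctest: +SKIP + (array([0, 1, 0]), array([0. , 0.8, 1. ]))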
+ """ + y_true = np.array(y_true).copy() + y_pred = np.array(y_pred).copy() + + # check labels + if ( + force_y_pred_continuous + and y_true.dtype == np.float_ + and y_pred.dtype == np.int_ + ): + warnings.warn( + "Assuming that y_true and y_score where permuted, because their" + "dtypes indicate so. y_true should be an integer array and" + "y_score a float array!", + stacklevel=2, + ) + return check_y( + y_pred, + y_true, + force_y_pred_continuous=force_y_pred_continuous, + inf_is_1=inf_is_1, + neginf_is_0=neginf_is_0, + nan_is_0=nan_is_0, + ) + + y_true = column_or_1d(y_true) + assert_all_finite(y_true) + + # check scores + y_pred = column_or_1d(y_pred) + + check_consistent_length([y_true, y_pred]) + if not force_y_pred_continuous and y_pred.dtype not in [np.int_, np.bool_]: + raise ValueError( + "When using metrics other than AUC/VUS-metrics that need binary " + "(0 or 1) scores (like Precision, Recall or F1-Score), the scores must " + "be integers and should only contain the values {0, 1}. Please " + "consider applying a threshold to the scores!" + ) + elif force_y_pred_continuous and y_pred.dtype != np.float_: + raise ValueError( + "When using continuous scoring metrics, the scores must be floats!" + ) + + # substitute NaNs and Infs + nan_mask = np.isnan(y_pred) + inf_mask = np.isinf(y_pred) + neginf_mask = np.isneginf(y_pred) + penalize_mask = np.full_like(y_pred, dtype=bool, fill_value=False) + if inf_is_1: + y_pred[inf_mask] = 1 + else: + penalize_mask = penalize_mask | inf_mask + if neginf_is_0: + y_pred[neginf_mask] = 0 + else: + penalize_mask = penalize_mask | neginf_mask + if nan_is_0: + y_pred[nan_mask] = 0 + else: + penalize_mask = penalize_mask | nan_mask + y_pred[penalize_mask] = (~np.array(y_true[penalize_mask], dtype=bool)).astype( + np.int_ + ) + + assert_all_finite(y_pred) + return y_true, y_pred diff --git a/aeon/performance_metrics/anomaly_detection/_vus_metrics.py b/aeon/performance_metrics/anomaly_detection/_vus_metrics.py new file mode 100644 index 0000000000..e1e4b53ce4 --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/_vus_metrics.py @@ -0,0 +1,382 @@ +"""VUS and AUC metrics for anomaly detection on time series.""" + +from __future__ import annotations + +__maintainer__ = ["CodeLionX"] +__all__ = [ + "range_pr_auc_score", + "range_roc_auc_score", + "range_pr_vus_score", + "range_roc_vus_score", + "range_pr_roc_auc_support", +] + +import warnings + +import numpy as np + +from aeon.performance_metrics.anomaly_detection._util import check_y + + +def _anomaly_bounds(y_true: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + """Corresponds to range_convers_new.""" + # convert to boolean/binary + labels = y_true > 0 + # deal with start and end of time series + labels = np.diff(np.r_[0, labels, 0]) + # extract begin and end of anomalous regions + index = np.arange(0, labels.shape[0]) + starts = index[labels == 1] + ends = index[labels == -1] + return starts, ends + + +def _extend_anomaly_labels( + y_true: np.ndarray, buffer_size: int | None = None +) -> tuple[np.ndarray, np.ndarray]: + """Extend the anomaly labels with slopes on both ends. + + Also makes the labels continuous instead of binary. 
+ """ + starts, ends = _anomaly_bounds(y_true) + + if buffer_size is None: + # per default: set buffer size as median anomaly length: + buffer_size = int(np.median(ends - starts)) + + if buffer_size <= 1: + anomalies = np.array(list(zip(starts, ends - 1))) + return y_true.astype(np.float_), anomalies + + y_true_cont = y_true.astype(np.float_) + slope_length = buffer_size // 2 + length = y_true_cont.shape[0] + for s, e in zip(starts, ends): + e -= 1 + x1 = np.arange(e, min(e + slope_length, length)) + y_true_cont[x1] += np.sqrt(1 - (x1 - e) / buffer_size) + x2 = np.arange(max(s - slope_length, 0), s) + y_true_cont[x2] += np.sqrt(1 - (s - x2) / buffer_size) + y_true_cont = np.clip(y_true_cont, 0, 1) + starts, ends = _anomaly_bounds(y_true_cont) + anomalies = np.array(list(zip(starts, ends - 1))) + + return y_true_cont, anomalies + + +def _uniform_threshold_sampling(y_score: np.ndarray) -> np.ndarray: + """Create the threshold via uniform sampling.""" + # magic number from original implementation + n_samples = 250 + thresholds = np.sort(y_score)[::-1] + thresholds = thresholds[ + np.linspace(0, thresholds.shape[0] - 1, n_samples, dtype=np.int_) + ] + return thresholds + + +def range_pr_roc_auc_support( + y_true: np.ndarray, + y_score: np.ndarray, + buffer_size: int | None = None, + skip_check: bool = False, +) -> tuple[float, float]: + """Compute the range-based PR and ROC AUC. + + Computes the area under the precision-recall-curve and the area under the + receiver operating characteristic using the range-based precision and range-based + recall definition from Paparrizos et al. published at VLDB 2022 [1]_. + + We first extend the anomaly labels by two slopes of ``buffer_size//2`` length on + both sides of each anomaly, uniformly sample thresholds from the anomaly score, and + then compute the confusion matrix for all thresholds. Using the resulting precision + and recall values, we can plot a curve and compute its area. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + buffer_size : int, optional + Size of the buffer region around an anomaly. We add an increasing slope of size + ``buffer_size//2`` to the beginning of anomalies and a decreasing slope of size + ``buffer_size//2`` to the end of anomalies. Per default + (when ``buffer_size==None``), ``buffer_size`` is the median length of the + anomalies within the time series. However, you can also set it to the period + size of the dominant frequency or any other desired value. + skip_check : bool, default False + Whether to skip the input checks. + + Returns + ------- + Tuple[float, float] + Range-based PR AUC and range-based ROC AUC. + + References + ---------- + .. [1] John Paparrizos, Paul Boniol, Themis Palpanas, Ruey S. Tsay, + Aaron Elmore, and Michael J. Franklin. Volume Under the Surface: A New Accuracy + Evaluation Measure for Time-Series Anomaly Detection. PVLDB, 15(11): + 2774 - 2787, 2022. 
doi:`10.14778/3551793.3551830 <https://doi.org/10.14778/3551793.3551830>`_ + """ + if not skip_check: + y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True) + y_true_cont, anomalies = _extend_anomaly_labels(y_true, buffer_size) + thresholds = _uniform_threshold_sampling(y_score) + p = np.average([np.sum(y_true), np.sum(y_true_cont)]) + + recalls = np.zeros(thresholds.shape[0] + 2)  # tprs + fprs = np.zeros(thresholds.shape[0] + 2) + precisions = np.ones(thresholds.shape[0] + 1) + + for i, t in enumerate(thresholds): + y_pred = y_score >= t + product = y_true_cont * y_pred + tp = np.sum(product) + # fp = np.dot((np.ones_like(y_pred) - y_true_cont).T, y_pred) + fp = np.sum(y_pred) - tp + n = len(y_pred) - p + + existence_reward = [np.sum(product[s : e + 1]) > 0 for s, e in anomalies] + existence_reward = np.sum(existence_reward) / anomalies.shape[0] + + recall = min(tp / p, 1) * existence_reward  # = tpr + fpr = min(fp / n, 1) + precision = tp / np.sum(y_pred) + + recalls[i + 1] = recall + fprs[i + 1] = fpr + precisions[i + 1] = precision + + recalls[-1] = 1 + fprs[-1] = 1 + + range_pr_auc = np.sum( + (recalls[1:-1] - recalls[:-2]) * (precisions[1:] + precisions[:-1]) / 2 + ) + range_roc_auc = np.sum((fprs[1:] - fprs[:-1]) * (recalls[1:] + recalls[:-1]) / 2) + return range_pr_auc, range_roc_auc + + +def range_roc_auc_score( + y_true: np.ndarray, y_score: np.ndarray, buffer_size: int | None = None +) -> float: + """Compute the range-based area under the ROC curve. + + Computes the area under the receiver-operating-characteristic-curve using the + range-based TPR and range-based FPR definition from Paparrizos et al. + published at VLDB 2022 [1]_. + + We first extend the anomaly labels by two slopes of ``buffer_size//2`` length on + both sides of each anomaly, uniformly sample thresholds from the anomaly score, and + then compute the confusion matrix for all thresholds. Using the resulting TPR and + FPR values, we can plot a curve and compute its area. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + buffer_size : int, optional + Size of the buffer region around an anomaly. We add an increasing slope of size + ``buffer_size//2`` to the beginning of anomalies and a decreasing slope of size + ``buffer_size//2`` to the end of anomalies. Per default + (when ``buffer_size==None``), ``buffer_size`` is the median length of the + anomalies within the time series. However, you can also set it to the period + size of the dominant frequency or any other desired value. + + Returns + ------- + float + Range-based ROC AUC score. + + References + ---------- + .. [1] John Paparrizos, Paul Boniol, Themis Palpanas, Ruey S. Tsay, + Aaron Elmore, and Michael J. Franklin. Volume Under the Surface: A New Accuracy + Evaluation Measure for Time-Series Anomaly Detection. PVLDB, 15(11): + 2774 - 2787, 2022. + doi:`10.14778/3551793.3551830 <https://doi.org/10.14778/3551793.3551830>`_ + """ + y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + + _, range_auc_roc = range_pr_roc_auc_support( + y_true, y_score, buffer_size, skip_check=True + ) + return range_auc_roc + + +def range_pr_auc_score( + y_true: np.ndarray, y_score: np.ndarray, buffer_size: int | None = None +) -> float: + """Compute the area under the range-based PR curve.
+ + Computes the area under the precision-recall-curve using the range-based precision + and range-based recall definition from Paparrizos et al. published at VLDB 2022 + [1]_. + + We first extend the anomaly labels by two slopes of ``buffer_size//2`` length on + both sides of each anomaly, uniformly sample thresholds from the anomaly score, and + then compute the confusion matrix for all thresholds. Using the resulting precision + and recall values, we can plot a curve and compute its area. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + buffer_size : int, optional + Size of the buffer region around an anomaly. We add an increasing slope of size + ``buffer_size//2`` to the beginning of anomalies and a decreasing slope of size + ``buffer_size//2`` to the end of anomalies. Per default + (when ``buffer_size==None``), ``buffer_size`` is the median length of the + anomalies within the time series. However, you can also set it to the period + size of the dominant frequency or any other desired value. + + Returns + ------- + float + Range-based PR AUC score. + + References + ---------- + .. [1] John Paparrizos, Paul Boniol, Themis Palpanas, Ruey S. Tsay, + Aaron Elmore, and Michael J. Franklin. Volume Under the Surface: A New Accuracy + Evaluation Measure for Time-Series Anomaly Detection. PVLDB, 15(11): + 2774 - 2787, 2022. + doi:`10.14778/3551793.3551830 <https://doi.org/10.14778/3551793.3551830>`_ + """ + y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + + range_pr_auc, _ = range_pr_roc_auc_support( + y_true, y_score, buffer_size, skip_check=True + ) + return range_pr_auc + + +def range_pr_vus_score( + y_true: np.ndarray, y_score: np.ndarray, max_buffer_size: int = 500 +) -> float: + """Compute the range-based PR VUS score. + + Computes the volume under the precision-recall-buffer_size-surface using the + range-based precision and range-based recall definition from Paparrizos et al. + published at VLDB 2022 [1]_. + + For all buffer sizes from 0 to ``max_buffer_size``, we first extend the anomaly + labels by two slopes of ``buffer_size//2`` length on both sides of each anomaly, + uniformly sample thresholds from the anomaly score, and then compute the confusion + matrix for all thresholds. Using the resulting precision and recall values, we can + plot a curve and compute its area. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + max_buffer_size : int, default=500 + Maximum size of the buffer region around an anomaly. We iterate over all buffer + sizes from 0 to ``max_buffer_size`` to create the surface. + + Returns + ------- + float + Range-based PR VUS score. + + References + ---------- + .. [1] John Paparrizos, Paul Boniol, Themis Palpanas, Ruey S. Tsay, + Aaron Elmore, and Michael J. Franklin. Volume Under the Surface: A New Accuracy + Evaluation Measure for Time-Series Anomaly Detection. PVLDB, 15(11): + 2774 - 2787, 2022.
doi:`10.14778/3551793.3551830 <https://doi.org/10.14778/3551793.3551830>`_ + """ + y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + + prs = np.zeros(max_buffer_size + 1) + for buffer_size in np.arange(0, max_buffer_size + 1): + pr_auc, _ = range_pr_roc_auc_support( + y_true, y_score, buffer_size=buffer_size, skip_check=True + ) + prs[buffer_size] = pr_auc + range_pr_volume = np.sum(prs) / (max_buffer_size + 1) + return range_pr_volume + + +def range_roc_vus_score( + y_true: np.ndarray, y_score: np.ndarray, max_buffer_size: int = 500 +) -> float: + """Compute the range-based ROC VUS score. + + Computes the volume under the receiver-operating-characteristic-buffer_size-surface + using the range-based TPR and range-based FPR definition from Paparrizos et al. + published at VLDB 2022 [1]_. + + For all buffer sizes from 0 to ``max_buffer_size``, we first extend the anomaly + labels by two slopes of ``buffer_size//2`` length on both sides of each anomaly, + uniformly sample thresholds from the anomaly score, and then compute the confusion + matrix for all thresholds. Using the resulting true positive rates (TPR) and false + positive rates (FPR), we can plot a curve and compute its area. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + max_buffer_size : int, default=500 + Maximum size of the buffer region around an anomaly. We iterate over all buffer + sizes from 0 to ``max_buffer_size`` to create the surface. + + Returns + ------- + float + Range-based ROC VUS score. + + References + ---------- + .. [1] John Paparrizos, Paul Boniol, Themis Palpanas, Ruey S. Tsay, + Aaron Elmore, and Michael J. Franklin. Volume Under the Surface: A New Accuracy + Evaluation Measure for Time-Series Anomaly Detection. PVLDB, 15(11): + 2774 - 2787, 2022.
doi:`10.14778/3551793.3551830 <https://doi.org/10.14778/3551793.3551830>`_ + """ + y_true, y_score = check_y(y_true, y_score, force_y_pred_continuous=True) + if np.unique(y_score).shape[0] == 1: + warnings.warn( + "Cannot compute metric for a constant value in y_score, returning 0.0!", + stacklevel=2, + ) + return 0.0 + + rocs = np.zeros(max_buffer_size + 1) + for buffer_size in np.arange(0, max_buffer_size + 1): + _, roc_auc = range_pr_roc_auc_support( + y_true, y_score, buffer_size=buffer_size, skip_check=True + ) + rocs[buffer_size] = roc_auc + range_roc_volume = np.sum(rocs) / (max_buffer_size + 1) + return range_roc_volume diff --git a/aeon/performance_metrics/anomaly_detection/tests/__init__.py b/aeon/performance_metrics/anomaly_detection/tests/__init__.py new file mode 100644 index 0000000000..85491bb5d7 --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for metrics in anomaly_detection module.""" diff --git a/aeon/performance_metrics/anomaly_detection/tests/test_ad_metrics.py b/aeon/performance_metrics/anomaly_detection/tests/test_ad_metrics.py new file mode 100644 index 0000000000..617e55027a --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/tests/test_ad_metrics.py @@ -0,0 +1,304 @@ +"""Tests for metrics in anomaly_detection module.""" + +import numpy as np +import pytest + +from aeon.performance_metrics.anomaly_detection import ( + f_score_at_k_points, + f_score_at_k_ranges, + pr_auc_score, + range_f_score, + range_pr_auc_score, + range_pr_vus_score, + range_precision, + range_recall, + range_roc_auc_score, + range_roc_vus_score, + roc_auc_score, + rp_rr_auc_score, +) +from aeon.testing.data_generation import make_example_1d_numpy +from aeon.utils.validation._dependencies import _check_soft_dependencies + +pr_metrics = [pr_auc_score] +range_metrics = [ + range_roc_auc_score, + range_pr_auc_score, + range_roc_vus_score, + range_pr_vus_score, +] +other_metrics = [ + roc_auc_score, + f_score_at_k_points, +] +continuous_metrics = [*pr_metrics, *other_metrics, *range_metrics] +binary_metrics = [] + +if _check_soft_dependencies("prts", severity="none"): + pr_metrics.append(rp_rr_auc_score) + range_metrics.extend( + [ + range_recall, + range_precision, + range_f_score, + ] + ) + other_metrics.extend( + [ + f_score_at_k_ranges, + rp_rr_auc_score, + ] + ) + continuous_metrics.extend( + [ + rp_rr_auc_score, + f_score_at_k_ranges, + ] + ) + binary_metrics = [range_recall, range_precision, range_f_score] + +metrics = [*pr_metrics, *range_metrics, *other_metrics] + + +@pytest.mark.parametrize("metric", metrics, ids=[m.__name__ for m in metrics]) +def test_metric_output(metric): + """Test output has correct format.""" + y_score = make_example_1d_numpy(n_timepoints=20, random_state=0) + y_true = make_example_1d_numpy(n_timepoints=20, random_state=1) + y_true = (y_true > 0.5).astype(int) + + if metric in continuous_metrics: + res = metric( + y_true=y_true, + y_score=y_score, + ) + else: + y_pred = (y_score > 0.5).astype(int) + res = metric( + y_true=y_true, + y_pred=y_pred, + ) + + assert isinstance(res, float) + assert 0.0 <= res <= 1.0 + + +@pytest.mark.parametrize( + "metric", continuous_metrics, ids=[m.__name__ for m in continuous_metrics] +) +def test_continuous_metric_requires_scores(metric): + """Test that continuous metrics require float scores.""" + y_scores = np.array([1, 0, 1, 0, 0]) + y_true = np.array([0, 0, 1, 0, 0]) + + with pytest.raises(ValueError) as ex: + metric(y_true, y_scores) + assert "scores must be floats" in str(ex.value) + + y_scores =
y_scores.astype(np.float_) + metric(y_true, y_scores) + assert True, "Metric should accept float, so no error should be raised" + + +@pytest.mark.parametrize( + "metric", binary_metrics, ids=[m.__name__ for m in binary_metrics] +) +def test_binary_metric_requires_predictions(metric): + """Test that binary metrics require integer or boolean predictions.""" + y_scores = np.array([0.8, 0.1, 0.9, 0.3, 0.3], dtype=np.float_) + y_true = np.array([0, 0, 1, 0, 0], dtype=np.bool_) + + with pytest.raises(ValueError) as ex: + metric(y_true, y_scores) + assert "scores must be integers" in str(ex.value) + + y_pred = np.array(y_scores > 0.5, dtype=np.int_) + metric(y_true, y_pred) + + y_pred = y_pred.astype(np.bool_) + metric(y_true, y_pred) + assert True, "Metric should accept ints and bools, so no error should be raised" + + +@pytest.mark.parametrize("metric", metrics, ids=[m.__name__ for m in metrics]) +def test_edge_cases(metric): + """Test edge cases for all metrics.""" + y_true = np.zeros(10, dtype=np.int_) + y_true[2:4] = 1 + y_true[6:8] = 1 + y_zeros = np.zeros_like(y_true, dtype=np.float_) + y_flat = np.full_like(y_true, fill_value=0.5, dtype=np.float_) + y_ones = np.ones_like(y_true, dtype=np.float_) + + for y_pred in [y_zeros, y_flat, y_ones]: + if metric in binary_metrics: + score = metric(y_true, y_pred.astype(np.int_)) + else: + score = metric(y_true, y_pred) + np.testing.assert_almost_equal(score, 0) + + +@pytest.mark.parametrize("metric", pr_metrics, ids=[m.__name__ for m in pr_metrics]) +def test_edge_cases_pr_metrics(metric): + """Test edge cases for PR metrics.""" + y_true = np.zeros(10, dtype=np.int_) + y_true[2:4] = 1 + y_true[6:8] = 1 + y_inverted = (y_true * -1 + 1).astype(np.float_) + + score = metric(y_true, y_inverted) + assert score <= 0.2, f"{metric.__name__}(y_true, y_inverted)={score} is not <= 0.2" + + +@pytest.mark.skipif( + not _check_soft_dependencies("prts", severity="none"), + reason="required soft dependency prts not available", +) +def test_range_based_f1(): + """Test range-based F1 score.""" + y_pred = np.array([0, 1, 1, 0]) + y_true = np.array([0, 1, 0, 0]) + result = range_f_score(y_true, y_pred) + np.testing.assert_almost_equal(result, 0.66666, decimal=4) + + +@pytest.mark.skipif( + not _check_soft_dependencies("prts", severity="none"), + reason="required soft dependency prts not available", +) +def test_range_based_precision(): + """Test range-based precision.""" + y_pred = np.array([0, 1, 1, 0]) + y_true = np.array([0, 1, 0, 0]) + result = range_precision(y_true, y_pred) + assert result == 0.5 + + +@pytest.mark.skipif( + not _check_soft_dependencies("prts", severity="none"), + reason="required soft dependency prts not available", +) +def test_range_based_recall(): + """Test range-based recall.""" + y_pred = np.array([0, 1, 1, 0]) + y_true = np.array([0, 1, 0, 0]) + result = range_recall(y_true, y_pred) + assert result == 1 + + +@pytest.mark.skipif( + not _check_soft_dependencies("prts", severity="none"), + reason="required soft dependency prts not available", +) +def test_rf1_value_error(): + """Test range-based F1 score raises ValueError on binary predictions.""" + y_pred = np.array([0, 0.2, 0.7, 0]) + y_true = np.array([0, 1, 0, 0]) + with pytest.raises(ValueError): + range_f_score(y_true, y_pred) + + +def test_pr_curve_auc(): + """Test PR curve AUC.""" + y_pred = np.array([0, 0.1, 1.0, 0.5, 0, 0]) + y_true = np.array([0, 0, 1, 1, 0, 0]) + result = pr_auc_score(y_true, y_pred) + np.testing.assert_almost_equal(result, 1.0000, decimal=4) + + +# def 
test_average_precision(): +# y_pred = np.array([0, 0.1, 1., .5, 0, 0]) +# y_true = np.array([0, 1, 1, 0, 0, 0]) +# result = average_precision_score(y_true, y_pred) +# np.testing.assert_almost_equal(result, 0.8333, decimal=4) + + +@pytest.mark.skipif( + not _check_soft_dependencies("prts", severity="none"), + reason="required soft dependency prts not available", +) +def test_range_based_p_range_based_r_curve_auc(): + """Test range-based precision-recall curve AUC.""" + y_pred = np.array([0, 0.1, 1.0, 0.5, 0.1, 0]) + y_true = np.array([0, 1, 1, 1, 0, 0]) + result = rp_rr_auc_score(y_true, y_pred) + np.testing.assert_almost_equal(result, 0.9792, decimal=4) + + +@pytest.mark.skipif( + not _check_soft_dependencies("prts", severity="none"), + reason="required soft dependency prts not available", +) +def test_range_based_p_range_based_r_auc_perfect_hit(): + """Test range-based precision-recall curve AUC with perfect hit.""" + y_pred = np.array([0, 0, 0.5, 0.5, 0, 0]) + y_true = np.array([0, 0, 1, 1, 0, 0]) + result = rp_rr_auc_score(y_true, y_pred) + np.testing.assert_almost_equal(result, 1.0000, decimal=4) + + +@pytest.mark.skipif( + not _check_soft_dependencies("prts", severity="none"), + reason="required soft dependency prts not available", +) +def test_f_score_at_k_ranges(): + """Test range-based F1 score at k ranges.""" + y_pred = np.array([0.4, 0.1, 1.0, 0.5, 0.1, 0, 0.4, 0.5]) + y_true = np.array([0, 1, 1, 1, 0, 0, 1, 0]) + result = f_score_at_k_ranges(y_true, y_pred) + np.testing.assert_almost_equal(result, 0.5000, decimal=4) + result = f_score_at_k_ranges(y_true, y_pred, k=3) + np.testing.assert_almost_equal(result, 0.8000, decimal=4) + + +def test_fscore_at_k_points(): + """Test F1 score at k points.""" + y_pred = np.array([0.4, 0.6, 1.0, 0.5, 0.1, 0, 0.5, 0.4]) + y_true = np.array([0, 1, 1, 1, 0, 0, 1, 0]) + result = f_score_at_k_points(y_true, y_pred) + np.testing.assert_almost_equal(result, 1.0000, decimal=4) + result = f_score_at_k_points(y_true, y_pred, k=1) + np.testing.assert_almost_equal(result, 0.4000, decimal=4) + + +# VUS fixtures +y_true = np.zeros(200) +y_true[10:20] = 1 +y_true[28:33] = 1 +y_true[110:120] = 1 +y_score = np.random.default_rng(41).random(200) * 0.5 +y_score[16:22] = 1 +y_score[33:38] = 1 +y_score[160:170] = 1 + + +def test_range_pr_auc_compat(): + """Test range-based PR AUC.""" + result = range_pr_auc_score(y_true, y_score) + np.testing.assert_almost_equal(result, 0.3737854660, decimal=10) + + +def test_range_roc_auc_compat(): + """Test range-based ROC AUC.""" + result = range_roc_auc_score(y_true, y_score) + np.testing.assert_almost_equal(result, 0.7108527197, decimal=10) + + +def test_edge_case_existence_reward_compat(): + """Test edge case for existence reward in range-based PR/ROC AUC.""" + result = range_pr_auc_score(y_true, y_score, buffer_size=4) + np.testing.assert_almost_equal(result, 0.2506464391, decimal=10) + result = range_roc_auc_score(y_true, y_score, buffer_size=4) + np.testing.assert_almost_equal(result, 0.6143220816, decimal=10) + + +def test_range_pr_volume_compat(): + """Test range-based PR volume under the curve.""" + result = range_pr_vus_score(y_true, y_score, max_buffer_size=200) + np.testing.assert_almost_equal(result, 0.7493254559, decimal=10) + + +def test_range_roc_volume_compat(): + """Test range-based ROC volume under the curve.""" + result = range_roc_vus_score(y_true, y_score, max_buffer_size=200) + np.testing.assert_almost_equal(result, 0.8763382130, decimal=10) diff --git 
a/aeon/performance_metrics/anomaly_detection/tests/test_thresholding.py b/aeon/performance_metrics/anomaly_detection/tests/test_thresholding.py new file mode 100644 index 0000000000..42994da406 --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/tests/test_thresholding.py @@ -0,0 +1,37 @@ +"""Test anomaly detection thresholding methods.""" + +import numpy as np + +from aeon.performance_metrics.anomaly_detection.thresholding import ( + percentile_threshold, + sigma_threshold, + top_k_points_threshold, + top_k_ranges_threshold, +) + +y_true = np.array([0, 0, 1, 0, 0, 0, 0, 0, 0]) +y_score = np.array([np.nan, 0.1, 0.9, 0.5, 0.6, 0.55, 0.2, 0.1, np.nan]) + + +def test_percentile_thresholding(): + """Test percentile thresholding.""" + threshold = percentile_threshold(y_score, percentile=90) + np.testing.assert_almost_equal(threshold, 0.72, decimal=2) + + +def test_top_k_points_thresholding(): + """Test top k points thresholding.""" + threshold = top_k_points_threshold(y_true, y_score, k=2) + np.testing.assert_almost_equal(threshold, 0.58, decimal=2) + + +def test_top_k_ranges_thresholding(): + """Test top k ranges thresholding.""" + threshold = top_k_ranges_threshold(y_true, y_score, k=2) + np.testing.assert_almost_equal(threshold, 0.60, decimal=2) + + +def test_sigma_thresholding(): + """Test sigma thresholding.""" + threshold = sigma_threshold(y_score, factor=1) + np.testing.assert_almost_equal(threshold, 0.70, decimal=2) diff --git a/aeon/performance_metrics/anomaly_detection/thresholding.py b/aeon/performance_metrics/anomaly_detection/thresholding.py new file mode 100644 index 0000000000..a882a7887f --- /dev/null +++ b/aeon/performance_metrics/anomaly_detection/thresholding.py @@ -0,0 +1,147 @@ +"""Functions to compute thresholds to convert anomaly scores to binary predictions.""" + +from __future__ import annotations + +__maintainer__ = ["CodeLionX"] +__all__ = [ + "percentile_threshold", + "sigma_threshold", + "top_k_points_threshold", + "top_k_ranges_threshold", +] + +import numpy as np + + +def percentile_threshold(y_score: np.ndarray, percentile: int) -> float: + """Calculate a threshold based on a percentile of the anomaly scores. + + Uses the ``percentile``-th percentile of the anomaly scoring as the threshold, + ignoring NaNs and using linear interpolation. + + Parameters + ---------- + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + percentile : int + Percentile to use as threshold between 0 and 100. + + Returns + ------- + float + Threshold based on the percentile. + """ + return np.nanpercentile(y_score, percentile) + + +def sigma_threshold(y_score: np.ndarray, factor: float = 2) -> float: + r"""Calculate a threshold based on the standard deviation of the anomaly scores. + + Computes a threshold :math:`\theta` based on the anomaly scoring's mean + :math:`\mu_s` and the standard deviation :math:`\sigma_s`, ignoring NaNs: + + .. math:: + \theta = \mu_{s} + x \cdot \sigma_{s} + + Parameters + ---------- + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + factor : float + Number of standard deviations to use as threshold (:math:`x`). + + Returns + ------- + float + Threshold based on the standard deviation. + """ + return np.nanmean(y_score) + factor * np.nanstd(y_score) + + +def top_k_points_threshold( + y_true: np.ndarray, y_score: np.ndarray, k: int | None = None +) -> float: + """Calculate a threshold such that at least `k` anomalous points are found.
+ + The anomalies are single-point anomalies. + + Computes a threshold based on the number of expected anomalies (number of + anomalies). The threshold is computed as the ``(1 - k/n)``-th percentile of the + anomaly scores (ignoring NaNs), where ``n`` is the length of the time series. If + `k` is `None`, the ground truth data is used to calculate the real number of + anomalies. + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + k : int, optional + Number of expected anomalies. If `k` is `None`, the ground truth data is used + to calculate the real number of anomalies. + + Returns + ------- + float + Threshold such that there are at least `k` anomalous points. + """ + if k is None: + return np.nanpercentile(y_score, (1 - y_true.sum() / y_true.shape[0]) * 100) + else: + return np.nanpercentile(y_score, (1 - k / y_true.shape[0]) * 100) + + +def top_k_ranges_threshold( + y_true: np.ndarray, y_score: np.ndarray, k: int | None = None +) -> float: + """Calculate a threshold such that at least `k` anomalies are found. + + The anomalies are either single-point anomalies or continuous anomalous ranges. + + Computes a threshold based on the number of expected anomalous subsequences / + ranges (number of anomalies). This method iterates over all possible thresholds + from high to low to find the first threshold that yields `k` or more continuous + anomalous ranges. If `k` is `None`, the ground truth data is used to calculate the + real number of anomalies (anomalous ranges). + + Parameters + ---------- + y_true : np.ndarray + True binary labels of shape (n_instances,). + y_score : np.ndarray + Anomaly scores for each point of the time series of shape (n_instances,). + k : int, optional + Number of expected anomalies. If `k` is `None`, the ground truth data is used + to calculate the real number of anomalies. + + Returns + ------- + float + Threshold such that there are at least `k` anomalous ranges. + """ + if k is None: + k = _count_anomaly_ranges(y_true) + thresholds = np.unique(y_score)[::-1] + + # exclude minimum from thresholds, because all points are >= minimum! + for t in thresholds[:-1]: + y_pred = np.array(y_score >= t, dtype=np.int_) + detected_n = _count_anomaly_ranges(y_pred) + if detected_n >= k: + return t + # fall back to the lowest threshold if no threshold yields k or more ranges + return thresholds[-1] + + +def _count_anomaly_ranges(y: np.ndarray) -> int: + """Count the number of continuous anomalous ranges in a binary sequence. + + Parameters + ---------- + y : np.ndarray + Binary sequence of shape (n_instances,). + + Returns + ------- + int + Number of continuous anomalous ranges.
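+ + Examples + -------- + Two separated ranges in a short binary sequence: + + >>> import numpy as np + >>> _count_anomaly_ranges(np.array([0, 1, 1, 0, 1, 0])) + 2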
+ """ + return int(np.sum(np.diff(np.r_[0, y, 0]) == 1)) diff --git a/aeon/utils/validation/_dependencies.py b/aeon/utils/validation/_dependencies.py index 594e3b2e0c..ee31e71087 100644 --- a/aeon/utils/validation/_dependencies.py +++ b/aeon/utils/validation/_dependencies.py @@ -78,6 +78,8 @@ def _check_soft_dependencies( if obj is None: class_name = "This functionality" + elif isinstance(obj, str): + class_name = obj elif not isclass(obj): class_name = type(obj).__name__ elif isclass(obj): diff --git a/aeon/utils/validation/tests/test_dependencies.py b/aeon/utils/validation/tests/test_dependencies.py index d05c6b5040..89cbd20383 100644 --- a/aeon/utils/validation/tests/test_dependencies.py +++ b/aeon/utils/validation/tests/test_dependencies.py @@ -20,7 +20,7 @@ def test__check_soft_dependencies(): _check_soft_dependencies("FOOBAR") with pytest.raises(ModuleNotFoundError, match="No module named 'FOOBAR'"): _check_soft_dependencies("FOOBAR", suppress_import_stdout=True) - with pytest.raises(ModuleNotFoundError, match="str requires package 'FOOBAR'"): + with pytest.raises(ModuleNotFoundError, match="FOOBAR requires package 'FOOBAR'"): _check_soft_dependencies("FOOBAR", obj="FOOBAR") with pytest.raises(RuntimeError, match="severity argument must be "): _check_soft_dependencies("FOOBAR", severity="FOOBAR") diff --git a/docs/api_reference/performance_metrics.rst b/docs/api_reference/performance_metrics.rst index 18aeecf05e..8e0754ad89 100644 --- a/docs/api_reference/performance_metrics.rst +++ b/docs/api_reference/performance_metrics.rst @@ -44,7 +44,7 @@ Forecasting Segmentation -~~~~~~~~~~~~ +------------ .. currentmodule:: aeon.performance_metrics.segmentation.metrics @@ -55,3 +55,40 @@ Segmentation count_error hausdorff_error prediction_ratio + +Anomaly Detection +----------------- + +.. currentmodule:: aeon.performance_metrics.anomaly_detection + +.. autosummary:: + :toctree: auto_generated/ + :template: function.rst + + range_precision + range_recall + range_f_score + roc_auc_score + pr_auc_score + rp_rr_auc_score + f_score_at_k_points + f_score_at_k_ranges + range_pr_roc_auc_support + range_roc_auc_score + range_pr_auc_score + range_pr_vus_score + range_roc_vus_score + +AD Thresholding +~~~~~~~~~~~~~~~ + +.. currentmodule:: aeon.performance_metrics.anomaly_detection.thresholding + +.. autosummary:: + :toctree: auto_generated/ + :template: function.rst + + percentile_threshold + sigma_threshold + top_k_points_threshold + top_k_ranges_threshold diff --git a/pyproject.toml b/pyproject.toml index 377d5c736a..cbea69021e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,7 @@ all_extras = [ "tslearn>=0.5.2", "xarray", "pyod>=1.1.3", + "prts>=1.0.0.0", # dependency of tensorflow, see issue #1724 "keras<3.4.0",