Average precision and AUROC update (#374)

* add realized perf AP metric

* add CBPE BC AP implementation

* update CBPE metrics _common_cleaning and estimate_auroc/accuracy

* ap sampling error update

* update docs

---------

Co-authored-by: Niels <[email protected]>
Co-authored-by: Niels Nuyttens <[email protected]>
3 people authored Mar 8, 2024
1 parent 0ec1fc8 commit 70aecce
Showing 17 changed files with 1,252 additions and 1,613 deletions.
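For orientation, here is a minimal sketch of how the newly supported `average_precision` metric can be requested alongside the existing realized-performance metrics. It mirrors the docstring example updated in this commit; the dataset loader and the column names (`repaid`, `y_pred_proba`, `y_pred`, `timestamp`) are assumptions based on NannyML's synthetic car-loan example, not part of this diff.

```python
import nannyml as nml

# Assumed helper dataset; any reference/analysis split with predictions
# and predicted probabilities works the same way.
reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()

calc = nml.PerformanceCalculator(
    y_pred_proba='y_pred_proba',
    y_pred='y_pred',
    y_true='repaid',
    timestamp_column_name='timestamp',
    problem_type='classification_binary',
    metrics=['roc_auc', 'f1', 'precision', 'recall', 'specificity',
             'accuracy', 'average_precision'],  # 'average_precision' is new in this commit
    chunk_size=5000,
)

calc.fit(reference_df)                  # fit on labelled reference data
results = calc.calculate(analysis_df)   # realized performance per chunk
```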

@@ -9,6 +9,7 @@ We currently support the following **standard** metrics for binary classification
* **roc_auc**
* **f1**
* **precision**
* **average_precision**
* **recall**
* **specificity**
* **accuracy**
@@ -9,6 +9,7 @@ We currently support the following **standard** metrics for binary classification
* **roc_auc**
* **f1**
* **precision**
* **average_precision**
* **recall**
* **specificity**
* **accuracy**
2 changes: 1 addition & 1 deletion nannyml/drift/__init__.py
@@ -23,7 +23,7 @@
- Domain Classifier: detects drift by looking at how performant a domain classifier is at distinguishing
between the reference and the chunk datasets.
"""
from .multivariate.domain_classifier import DomainClassifierCalculator
from .multivariate.data_reconstruction import DataReconstructionDriftCalculator
from .multivariate.domain_classifier import DomainClassifierCalculator
from .ranker import AlertCountRanker, CorrelationRanker
from .univariate import FeatureType, Method, MethodFactory, UnivariateDriftCalculator
1 change: 1 addition & 0 deletions nannyml/performance_calculation/__init__.py
@@ -19,6 +19,7 @@
'accuracy',
'confusion_matrix',
'business_value',
'average_precision',
]

SUPPORTED_REGRESSION_METRIC_VALUES = [
6 changes: 4 additions & 2 deletions nannyml/performance_calculation/calculator.py
@@ -28,7 +28,7 @@
... y_true='repaid',
... timestamp_column_name='timestamp',
... problem_type='classification_binary',
... metrics=['roc_auc', 'f1', 'precision', 'recall', 'specificity', 'accuracy'],
... metrics=['roc_auc', 'f1', 'precision', 'recall', 'specificity', 'accuracy', 'average_precision'],
... chunk_size=5000)
>>> calc.fit(reference_df)
>>> results = calc.calculate(analysis_df)
@@ -62,6 +62,7 @@
'roc_auc': StandardDeviationThreshold(),
'f1': StandardDeviationThreshold(),
'precision': StandardDeviationThreshold(),
'average_precision': StandardDeviationThreshold(),
'recall': StandardDeviationThreshold(),
'specificity': StandardDeviationThreshold(),
'accuracy': StandardDeviationThreshold(),
@@ -128,6 +129,7 @@ def __init__(
'roc_auc': StandardDeviationThreshold(),
'f1': StandardDeviationThreshold(),
'precision': StandardDeviationThreshold(),
'average_precision': StandardDeviationThreshold(),
'recall': StandardDeviationThreshold(),
'specificity': StandardDeviationThreshold(),
'accuracy': StandardDeviationThreshold(),
@@ -187,7 +189,7 @@ def __init__(
... y_true='repaid',
... timestamp_column_name='timestamp',
... problem_type='classification_binary',
... metrics=['roc_auc', 'f1', 'precision', 'recall', 'specificity', 'accuracy'],
... metrics=['roc_auc', 'f1', 'precision', 'recall', 'specificity', 'accuracy', 'average_precision'],
... chunk_size=5000)
>>> calc.fit(reference_df)
>>> results = calc.calculate(analysis_df)
99 changes: 97 additions & 2 deletions nannyml/performance_calculation/metrics/binary_classification.py
@@ -1,12 +1,20 @@
# Author: Niels Nuyttens <[email protected]>
#
# License: Apache Software License 2.0
"""Module containing implemenations for binary classification metrics and utilities."""
import warnings
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import (
average_precision_score,
confusion_matrix,
f1_score,
precision_score,
recall_score,
roc_auc_score,
)

from nannyml._typing import ProblemType
from nannyml.base import _list_missing, _remove_nans
@@ -16,6 +24,8 @@
from nannyml.sampling_error.binary_classification import (
accuracy_sampling_error,
accuracy_sampling_error_components,
ap_sampling_error,
ap_sampling_error_components,
auroc_sampling_error,
auroc_sampling_error_components,
business_value_sampling_error,
@@ -64,7 +74,7 @@ def __init__(
The Threshold instance that determines how the lower and upper threshold values will be calculated.
y_pred_proba: Optional[str], default=None
Name(s) of the column(s) containing your model output. For binary classification, pass a single string
refering to the model output column.
referring to the model output column.
"""
super().__init__(
name='roc_auc',
@@ -81,9 +91,11 @@ def __init__(
self._sampling_error_components: Tuple = ()

def __str__(self):
"""Metric string."""
return "roc_auc"

def _fit(self, reference_data: pd.DataFrame):
"""Metric _fit implementation on reference data."""
_list_missing([self.y_true, self.y_pred_proba], list(reference_data.columns))
self._sampling_error_components = auroc_sampling_error_components(
y_true_reference=reference_data[self.y_true],
@@ -111,6 +123,88 @@ def _sampling_error(self, data: pd.DataFrame) -> float:
return auroc_sampling_error(self._sampling_error_components, data)


@MetricFactory.register(metric='average_precision', use_case=ProblemType.CLASSIFICATION_BINARY)
class BinaryClassificationAP(Metric):
"""Average Precision metric.
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html
"""

def __init__(
self,
y_true: str,
y_pred: str,
threshold: Threshold,
y_pred_proba: Optional[str] = None,
**kwargs,
):
"""Creates a new AP instance.
Parameters
----------
y_true: str
The name of the column containing target values.
y_pred: str
The name of the column containing your model predictions.
threshold: Threshold
The Threshold instance that determines how the lower and upper threshold values will be calculated.
y_pred_proba: Optional[str], default=None
Name(s) of the column(s) containing your model output. For binary classification, pass a single string
referring to the model output column.
"""
super().__init__(
name='average_precision',
y_true=y_true,
y_pred=y_pred,
threshold=threshold,
y_pred_proba=y_pred_proba,
lower_threshold_limit=0,
upper_threshold_limit=1,
components=[('Average Precision', 'average_precision')],
)

# sampling error
self._sampling_error_components: Tuple = ()

def __str__(self):
"""Metric string."""
return "average_precision"

def _fit(self, reference_data: pd.DataFrame):
"""Metric _fit implementation on reference data."""
_list_missing([self.y_true, self.y_pred_proba], list(reference_data.columns))
# we don't want to count missing rows for sampling error
reference_data = _remove_nans(reference_data, (self.y_true, self.y_pred))

if 1 not in reference_data[self.y_true].unique():
self._sampling_error_components = np.NaN, 0
else:
self._sampling_error_components = ap_sampling_error_components(
y_true_reference=reference_data[self.y_true],
y_pred_proba_reference=reference_data[self.y_pred_proba],
)

def _calculate(self, data: pd.DataFrame):
"""Redefine to handle NaNs and edge cases."""
_list_missing([self.y_true, self.y_pred_proba], list(data.columns))
data = _remove_nans(data, (self.y_true, self.y_pred))

y_true = data[self.y_true]
y_pred_proba = data[self.y_pred_proba]

if 1 not in y_true.unique():
warnings.warn(
f"'{self.y_true}' does not contain positive class for chunk, cannot calculate {self.display_name}. "
f"Returning NaN."
)
return np.NaN
else:
return average_precision_score(y_true, y_pred_proba)

def _sampling_error(self, data: pd.DataFrame) -> float:
return ap_sampling_error(self._sampling_error_components, data)


@MetricFactory.register(metric='f1', use_case=ProblemType.CLASSIFICATION_BINARY)
class BinaryClassificationF1(Metric):
"""F1 score metric."""
@@ -156,6 +250,7 @@ def __str__(self):

def _fit(self, reference_data: pd.DataFrame):
_list_missing([self.y_true, self.y_pred], list(reference_data.columns))
# TODO: maybe handle data quality issues here and pass clean data to sampling error calculation?
self._sampling_error_components = f1_sampling_error_components(
y_true_reference=reference_data[self.y_true],
y_pred_reference=reference_data[self.y_pred],
@@ -24,6 +24,7 @@
SUPPORTED_METRIC_VALUES = [
'roc_auc',
'f1',
'average_precision',
'precision',
'recall',
'specificity',
1 change: 1 addition & 0 deletions nannyml/performance_estimation/confidence_based/cbpe.py
@@ -49,6 +49,7 @@
'accuracy': StandardDeviationThreshold(),
'confusion_matrix': StandardDeviationThreshold(),
'business_value': StandardDeviationThreshold(),
'average_precision': StandardDeviationThreshold(),
}
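The commit also wires `average_precision` into CBPE (the default thresholds above, plus the "CBPE BC AP implementation" mentioned in the commit message). A rough sketch of estimating it without analysis targets follows; the `nml.CBPE` entry point, dataset loader, and column names are assumptions based on typical CBPE usage rather than lines from this diff.

```python
import nannyml as nml

# Assumed dataset loader and column names; substitute your own data.
reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()

estimator = nml.CBPE(
    y_pred_proba='y_pred_proba',
    y_pred='y_pred',
    y_true='repaid',
    timestamp_column_name='timestamp',
    problem_type='classification_binary',
    metrics=['roc_auc', 'average_precision'],  # AP estimation added by this commit
    chunk_size=5000,
)

estimator.fit(reference_df)                       # calibrates on labelled reference data
estimated_results = estimator.estimate(analysis_df)  # no targets required for the analysis period
```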


