Merge pull request #426 from BastienZim/develop
Implementation of Data-OOB
mdbenito authored Sep 12, 2023
2 parents c55e77d + 8c86480 commit ca9591e
Showing 6 changed files with 642 additions and 0 deletions.
16 changes: 16 additions & 0 deletions docs/assets/pydvl.bib
@@ -312,3 +312,19 @@ @inproceedings{yan_if_2021
langid = {english},
keywords = {notion}
}

@inproceedings{kwon_data_2023,
title = {Data-{OOB}: Out-of-bag Estimate as a Simple and Efficient Data Value},
author = {Kwon, Yongchan and Zou, James},
booktitle = {Proceedings of the 40th International Conference on Machine Learning},
pages = {18135--18152},
year = {2023},
editor = {Krause, Andreas and Brunskill, Emma and Cho, Kyunghyun and Engelhardt, Barbara and Sabato, Sivan and Scarlett, Jonathan},
volume = {202},
series = {Proceedings of Machine Learning Research},
month = {23--29 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v202/kwon23e/kwon23e.pdf},
url = {https://proceedings.mlr.press/v202/kwon23e.html},
abstract = {Data valuation is a powerful framework for providing statistical insights into which data are beneficial or detrimental to model training. Many Shapley-based data valuation methods have shown promising results in various downstream tasks, however, they are well known to be computationally challenging as it requires training a large number of models. As a result, it has been recognized as infeasible to apply to large datasets. To address this issue, we propose Data-OOB, a new data valuation method for a bagging model that utilizes the out-of-bag estimate. The proposed method is computationally efficient and can scale to millions of data by reusing trained weak learners. Specifically, Data-OOB takes less than $2.25$ hours on a single CPU processor when there are $10^6$ samples to evaluate and the input dimension is $100$. Furthermore, Data-OOB has solid theoretical interpretations in that it identifies the same important data point as the infinitesimal jackknife influence function when two different points are compared. We conduct comprehensive experiments using 12 classification datasets, each with thousands of sample sizes. We demonstrate that the proposed method significantly outperforms existing state-of-the-art data valuation methods in identifying mislabeled data and finding a set of helpful (or harmful) data points, highlighting the potential for applying data values in real-world applications.}
}
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -199,6 +199,7 @@ nav:
- KNN Shapley: examples/shapley_knn_flowers.ipynb
- Data utility learning: examples/shapley_utility_learning.ipynb
- Least Core: examples/least_core_basic.ipynb
- Data OOB: examples/data_oob.ipynb
- The Influence Function:
- Introduction: influence/index.md
- Examples:
483 changes: 483 additions & 0 deletions notebooks/data_oob.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/pydvl/value/__init__.py
@@ -10,6 +10,7 @@
from ..utils import Dataset, Scorer, Utility
from .least_core import *
from .loo import *
from .oob import *
from .sampler import *
from .semivalues import *
from .shapley import *
1 change: 1 addition & 0 deletions src/pydvl/value/oob/__init__.py
@@ -0,0 +1 @@
from .oob import *
140 changes: 140 additions & 0 deletions src/pydvl/value/oob/oob.py
@@ -0,0 +1,140 @@
"""
## References
[^1]: <a name="kwon_data_2023"></a>Kwon, Yongchan, and Zou, James.
[Data-OOB: Out-of-bag Estimate as a Simple and Efficient Data Value](https://proceedings.mlr.press/v202/kwon23e.html).
In: Proceedings of the 40th International Conference on Machine Learning (ICML 2023), PMLR 202, pp. 18135-18152.
"""

from __future__ import annotations

from collections.abc import Callable
from typing import TypeVar

import numpy as np
from numpy.typing import NDArray
from sklearn.base import is_classifier, is_regressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor

from pydvl.utils import Utility, maybe_progress
from pydvl.value.result import ValuationResult

__all__ = ["compute_data_oob"]

T = TypeVar("T", bound=np.number)


def compute_data_oob(
    u: Utility,
    n_est: int = 10,
    max_samples: float = 0.8,
    n_jobs: int | None = None,
    loss: Callable | None = None,
    *,
    progress: bool = False,
) -> ValuationResult:
r"""Computes Data out of bag values
This implements the method described in (Kwon and Zou, 2023) <sup><a href="kwon_data_2023">1</a></sup>.
It fits several base estimators provided through u.model through a bagging process. The point value corresponds to the average loss of estimators which were not fit on it.
$w_{bj}\in Z$ is the number of times the j-th datum $(x_j, y_j)$ is selected in the b-th bootstrap dataset.
$$\psi((x_i,y_i),\Theta_B):=\frac{\sum_{b=1}^{B}\mathbb{1}(w_{bi}=0)T(y_i, \hat{f}_b(x_i))}{\sum_{b=1}^{B}
\mathbb{1}
(w_{bi}=0)}$$
With:
$$
T: Y \times Y
\rightarrow \mathbb{R}
$$
T is a score function that represents the goodness of a weak learner $\hat{f}_b$ at the i-th datum $(x_i, y_i)$.
There is a need to tune n_est and max_samples jointly to ensure all samples are at least 1 time oob, otherwise the result could include a nan value for that datum.
Args:
u: Utility object with model, data, and scoring function.
n_est: Number of estimator used in the bagging procedure.
max_samples: The fraction of samples to draw to train each base estimator.
n_jobs: The number of jobs to run in parallel used in the bagging
procedure for both fit and predict.
loss: A function taking as parameters model prediction and corresponding
data labels(preds, y) and returning an array of point-wise errors.
progress: If True, display a progress bar.
Returns:
Object with the data values.
"""

    result: ValuationResult[np.int_, np.float_] = ValuationResult.empty(
        algorithm="data_oob", indices=u.data.indices, data_names=u.data.data_names
    )

    if is_classifier(u.model):
        bag = BaggingClassifier(
            u.model, n_estimators=n_est, max_samples=max_samples, n_jobs=n_jobs
        )
        if loss is None:
            loss = point_wise_accuracy
    elif is_regressor(u.model):
        bag = BaggingRegressor(
            u.model, n_estimators=n_est, max_samples=max_samples, n_jobs=n_jobs
        )
        if loss is None:
            loss = neg_l2_distance
    else:
        raise Exception(
            "Model has to be a classifier or a regressor in sklearn format."
        )

    bag.fit(u.data.x_train, u.data.y_train)

    # The bottleneck is fitting the bag, not this loop, so the progress bar is of
    # limited use here.
    for est, samples in maybe_progress(
        zip(bag.estimators_, bag.estimators_samples_), progress, total=n_est
    ):
        # Out-of-bag indices: the points never drawn into this estimator's bootstrap.
        oob_idx = np.setxor1d(u.data.indices, np.unique(samples))
        array_loss = loss(
            preds=est.predict(u.data.x_train[oob_idx]), y=u.data.y_train[oob_idx]
        )
        result += ValuationResult(
            algorithm="data_oob",
            indices=oob_idx,
            values=array_loss,
            counts=np.ones_like(array_loss, dtype=u.data.indices.dtype),
        )
    return result


def point_wise_accuracy(preds: NDArray, y: NDArray) -> NDArray:
    r"""Computes point-wise accuracy: 1 where a prediction matches its label, 0 otherwise.

    Args:
        preds: Model predictions.
        y: Data labels corresponding to the model predictions.

    Returns:
        Array of point-wise accuracies.
    """
    return np.array(preds == y, dtype=np.int_)


def neg_l2_distance(preds: NDArray[T], y: NDArray[T]) -> NDArray[T]:
    r"""Computes the point-wise negative squared difference between labels and model predictions.

    Args:
        preds: Model predictions.
        y: Data labels corresponding to the model predictions.

    Returns:
        Array of point-wise negative squared differences between labels and predictions.
    """
    return -np.square(np.array(preds - y, dtype=np.float64))
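For orientation, a minimal usage sketch of the new compute_data_oob function follows. It assumes the standard pydvl objects (Dataset.from_sklearn, Scorer, Utility); the dataset, model, and the n_est/max_samples settings are illustrative only, not prescriptions from this PR.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Scorer, Utility
from pydvl.value.oob import compute_data_oob

# Illustrative setup: iris with an 80/20 split and a logistic regression model.
data = Dataset.from_sklearn(load_iris(), train_size=0.8)
utility = Utility(LogisticRegression(), data, Scorer("accuracy"))

# More estimators make it likelier that every point is out-of-bag at least once
# (otherwise its value is NaN).
values = compute_data_oob(utility, n_est=50, max_samples=0.8, progress=True)
print(values.values)  # one Data-OOB value per training point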
