aai-institute · janosg · Jun 13, 2024 · Jun 10, 2024 · Jun 11, 2024 · Jun 12, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,14 @@
 - Refactoring of least-core data valuation methods with more supported sampling methods
   and consistent interface.
   [PR #580](https://github.com/aai-institute/pyDVL/pull/580)
+- Refactoring of Owen Shapley valuation with the new sampler architecture
+  [PR #597](https://github.com/aai-institute/pyDVL/pull/597)
+
+### Fixed
+
+- Fix a bug in `pydvl.utils.numeric.random_subset` where `1 - q` was used instead
+  of `q` as the probability of an element being sampled
+  [PR #597](https://github.com/aai-institute/pyDVL/pull/597)
 
 ## 0.9.0 - 🆕 New methods, better docs and bugfixes 📚🐞
 

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -22,6 +22,7 @@ pytest-xdist>=3.3.1
 pytest-cases~=3.8
 pytest-split
 pytest-sugar
+pytest-rerunfailures
 nbmake
 wheel
 twine==4.0.2
diff --git a/src/pydvl/utils/numeric.py b/src/pydvl/utils/numeric.py
@@ -96,7 +96,7 @@ def random_subset(
         The subset
     """
     rng = np.random.default_rng(seed)
-    selection = rng.uniform(size=len(s)) > q
+    selection = rng.uniform(size=len(s)) < q
     return s[selection]
 
 

diff --git a/src/pydvl/valuation/games.py b/src/pydvl/valuation/games.py
@@ -27,6 +27,7 @@
 from pydvl.valuation.dataset import Dataset
 from pydvl.valuation.methods._solve_least_core_problems import LeastCoreProblem
 from pydvl.valuation.result import ValuationResult
+from pydvl.valuation.types import SampleT
 from pydvl.valuation.utility.base import UtilityBase
 
 __all__ = [
@@ -91,13 +92,16 @@ def __call__(self, sample: SampleT | None) -> float:
         if sample is None or len(sample.subset) == 0:
             return 0
 
+        if self.training_data is None:
+            raise ValueError("Utility object has no training data.")
+
         idxs = np.array(sample.subset, dtype=np.int32)
         try:
-            score = self.score(self.training_data.x[idxs])
+            score: float = self.score(self.training_data.x[idxs])
         except (KeyboardInterrupt, SystemExit):
             raise
         except Exception:
-            score = 0
+            score = 0.0
         return score
 
     def with_dataset(self, dataset: Dataset):
@@ -209,7 +213,7 @@ def _score(self, X: NDArray) -> float:
     @lru_cache
     def shapley_values(self) -> ValuationResult:
         exact_values = np.ones(self.n_players) / self.n_players
-        result: ValuationResult[np.int_, np.int_] = ValuationResult(
+        result = ValuationResult(
             algorithm="exact_shapley",
             status=Status.Converged,
             indices=self.data.indices,
@@ -334,7 +338,7 @@ def _score(self, X: NDArray) -> float:
 
     @lru_cache
     def shapley_values(self) -> ValuationResult:
-        result: ValuationResult[np.int_, np.int_] = ValuationResult(
+        result = ValuationResult(
             algorithm="exact_shapley",
             status=Status.Converged,
             indices=self.data.indices,
@@ -405,7 +409,7 @@ def shapley_values(self) -> ValuationResult:
             value_left = precomputed_values[self.left, self.right]
             value_right = precomputed_values[self.right, self.left]
         exact_values = np.array([value_left] * self.left + [value_right] * self.right)
-        result: ValuationResult[np.int_, np.int_] = ValuationResult(
+        result = ValuationResult(
             algorithm="exact_shapley",
             status=Status.Converged,
             indices=self.data.indices,
@@ -427,7 +431,7 @@ def least_core_values(self) -> ValuationResult:
             subsidy = 0.0
             exact_values = np.array([0.0] * self.left + [1.0] * self.right)
 
-        result: ValuationResult[np.int_, np.int_] = ValuationResult(
+        result = ValuationResult(
             algorithm="exact_least_core",
             status=Status.Converged,
             indices=self.data.indices,
@@ -517,7 +521,7 @@ def _score(self, X: NDArray) -> float:
 
     @lru_cache
     def shapley_values(self) -> ValuationResult:
-        result: ValuationResult[np.int_, np.int_] = ValuationResult(
+        result = ValuationResult(
             algorithm="exact_shapley",
             status=Status.Converged,
             indices=self.data.indices,
@@ -592,7 +596,7 @@ def _score(self, X: NDArray) -> float:
     @lru_cache
     def shapley_values(self) -> ValuationResult:
         exact_values = 2 * np.ones_like(self.data.x)
-        result: ValuationResult[np.int_, np.int_] = ValuationResult(
+        result = ValuationResult(
             algorithm="exact_shapley",
             status=Status.Converged,
             indices=self.data.indices,
@@ -657,7 +661,7 @@ def least_core_values(self) -> ValuationResult:
             )
             subsidy = (self.n_players - 1) / (2 * self.n_players)
 
-        result: ValuationResult[np.int_, np.int_] = ValuationResult(
+        result = ValuationResult(
             algorithm="exact_least_core",
             status=Status.Converged,
             indices=self.data.indices,

diff --git a/src/pydvl/valuation/methods/least_core.py b/src/pydvl/valuation/methods/least_core.py
@@ -10,6 +10,7 @@
 from numpy.typing import NDArray
 from tqdm.auto import tqdm
 
+from pydvl.utils.types import Seed
 from pydvl.valuation.base import Valuation
 from pydvl.valuation.dataset import Dataset
 from pydvl.valuation.methods._solve_least_core_problems import (
@@ -221,10 +222,11 @@ def __init__(
         non_negative_subsidy: bool = False,
         solver_options: dict | None = None,
         progress: bool = True,
+        seed: Seed | None = None,
     ):
         super().__init__(
             utility=utility,
-            sampler=UniformSampler(index_iteration=NoIndexIteration),
+            sampler=UniformSampler(index_iteration=NoIndexIteration, seed=seed),
             n_samples=n_samples,
             non_negative_subsidy=non_negative_subsidy,
             solver_options=solver_options,

diff --git a/src/pydvl/valuation/methods/owen_shapley.py b/src/pydvl/valuation/methods/owen_shapley.py
@@ -1,7 +1,81 @@
+from __future__ import annotations
+
+from typing import Any
+
+from typing_extensions import Self
+
+from pydvl.utils import Status
+from pydvl.valuation.dataset import Dataset
+from pydvl.valuation.methods.semivalue import SemivalueValuation
+from pydvl.valuation.result import ValuationResult
+from pydvl.valuation.samplers.powerset import OwenSampler
+from pydvl.valuation.stopping import NoStopping
+from pydvl.valuation.utility.base import UtilityBase
+
 """
 ## References
 
 [^1]: <a name="okhrati_multilinear_2021"></a>Okhrati, R., Lipani, A., 2021.
     [A Multilinear Sampling Algorithm to Estimate Shapley Values](https://ieeexplore.ieee.org/abstract/document/9412511).
     In: 2020 25th International Conference on Pattern Recognition (ICPR), pp. 7992–7999. IEEE.
 """
+
+
+class OwenShapleyValuation(SemivalueValuation):
+    """Umbrella class to calculate Shapley values with Owen sampling schemes.
+
+    Owen Shapley values converge to true Shapley values as the number of samples
+    increases but have been shown to need fewer samples than other sampling schemes.
+
+    The number of samples is governed by the sampler object. There are no convergence
+    criteria for Owen shapley values as they will just run for a fixed number of
+    samples.
+
+    Args:
+        utility: Utility object with model and scoring function.
+        sampler: Owen sampling scheme to use. Can be OwenSampler or
+            AntitheticOwenSampler.
+        progress: Whether to show a progress bar.
+
+    """
+
+    def __init__(
+        self,
+        utility: UtilityBase,
+        sampler: OwenSampler,
+        progress: dict[str, Any] | bool = False,
+    ):
+        super().__init__(
+            utility=utility,
+            sampler=sampler,
+            is_done=NoStopping(),
+            progress=progress,
+        )
+
+    def fit(self, dataset: Dataset) -> Self:
+        """Calculate the Owen Shapley values for a given dataset.
+
+        This method has to be called before calling `values()`.
+
+        Calculating the Owen Shapley values is a computationally expensive task that
+        can be parallelized. To do so, call the `fit()` method inside a
+        `joblib.parallel_config` context manager as follows:
+
+        ```python
+        from joblib import parallel_config
+
+        with parallel_config(n_jobs=4):
+            valuation.fit(data)
+        ```
+
+        """
+        # since we bypassed the convergence checks we need to set the status to
+        # converged manually
+        super().fit(dataset)
+        # make the type checker happy
+        if self.result is not None:
+            self.result._status = Status.Converged
+        return self
+
+    def coefficient(self, n: int, k: int) -> float:
+        return 1
diff --git a/src/pydvl/valuation/samplers/powerset.py b/src/pydvl/valuation/samplers/powerset.py
@@ -61,6 +61,8 @@
     "PowersetSampler",
     "TruncatedUniformStratifiedSampler",
     "UniformSampler",
+    "OwenSampler",
+    "AntitheticOwenSampler",
     "UniformStratifiedSampler",
     "VarianceReducedStratifiedSampler",
     "IndexIteration",
@@ -210,10 +212,7 @@ def process(
     ) -> list[ValueUpdate]:
         r = []
         for sample in batch:
-            assert sample.idx is not None
-            u_i = self.utility(
-                Sample(sample.idx, np.array(list({sample.idx}.union(sample.subset))))
-            )
+            u_i = self.utility(sample.with_idx_in_subset())
             u = self.utility(sample)
             marginal = (u_i - u) * self.coefficient(self.n_indices, len(sample.subset))
             r.append(ValueUpdate(sample.idx, marginal))
@@ -369,6 +368,114 @@ def _generate(self, indices: IndexSetT) -> SampleGenerator:
                 yield Sample(idx, subset)
 
 
+class OwenSampler(StochasticSamplerMixin, PowersetSampler):
+    """A sampler for Owen shapley values.
+
+    For each index $i$ the Owen sampler loops over a deterministic grid of probabilities
+    (containing `n_samples_outer` entries between 0 and 1) and then draws
+    `n_samples_inner` subsets of the complement of the current index where each element
+    is sampled with the given probability.
+
+    Each index therefore receives `n_samples_outer * n_samples_inner` samples.
+
+    Args:
+        n_samples_outer: The number of entries in the probability grid used for
+            the outer loop in Owen sampling.
+        n_samples_inner: The number of samples drawn for each probability. In the
+            original paper this was fixed to 2 for all experiments which is why we
+            give it a default value of 2.
+        batch_size: The batch size of the sampler.
+        seed: The seed for the random number generator.
+
+    """
+
+    def __init__(
+        self,
+        n_samples_outer: int,
+        n_samples_inner: int = 2,
+        batch_size: int = 1,
+        seed: Seed | None = None,
+    ):
+        super().__init__(
+            batch_size=batch_size, index_iteration=SequentialIndexIteration, seed=seed
+        )
+        self._n_samples_inner = n_samples_inner
+        self._n_samples_outer = n_samples_outer
+        self._q_stop = 1.0
+
+    def _generate(self, indices: IndexSetT) -> SampleGenerator:
+        probabilities = np.linspace(
+            start=0, stop=self._q_stop, num=self._n_samples_outer
+        )
+        for idx in self.index_iterator(indices):
+            _complement = complement(indices, [idx] if idx is not None else [])
+            for prob in probabilities:
+                for _ in range(self._n_samples_inner):
+                    subset = random_subset(_complement, q=prob, seed=self._rng)
+                    yield Sample(idx, subset)
+
+    @staticmethod
+    def weight(n: int, subset_len: int) -> float:
+        return 1.0
+
+    def sample_limit(self, indices: IndexSetT) -> int:
+        return len(indices) * self._n_samples_outer * self._n_samples_inner
+
+
+class AntitheticOwenSampler(OwenSampler):
+    """A sampler for antithetic Owen shapley values.
+
+    For each index $i$, the antithetic Owen sampler loops over a deterministic grid of
+    probabilities (containing `n_samples_outer` entries between 0 and 0.5) and then
+    draws `n_samples_inner` subsets of the complement of the current index where each
+    element is sampled with the given probability. For each sample obtained that way,
+    a second sample is generated by taking the complement of the first sample.
+
+    The number of samples drawn per index is therefore
+    `2 * n_samples_outer * n_samples_inner`.
+
+    For the same total number of samples, the antithetic Owen sampler usually yields
+    more precise estimates of Shapley values than the regular Owen sampler.
+
+    Args:
+        n_samples_outer: The number of entries in the probability grid used for
+            the outer loop in Owen sampling.
+        n_samples_inner: The number of samples drawn for each probability. In the
+            original paper this was fixed to 2 for all experiments which is why we
+            give it a default value of 2.
+        batch_size: The batch size of the sampler.
+        seed: The seed for the random number generator.
+
+    """
+
+    def __init__(
+        self,
+        n_samples_outer: int,
+        n_samples_inner: int = 2,
+        batch_size: int = 1,
+        seed: Seed | None = None,
+    ):
+        super().__init__(
+            n_samples_outer=n_samples_outer,
+            n_samples_inner=n_samples_inner,
+            batch_size=batch_size,
+            seed=seed,
+        )
+        self._q_stop = 0.5
+
+    def _generate(self, indices: IndexSetT) -> SampleGenerator:
+        for sample in super()._generate(indices):
+            idx, subset = sample
+            _exclude = [idx] if idx is not None else []
+            _exclude += subset.tolist()
+            _antithetic_subset = complement(indices, _exclude)
+            yield sample
+            yield Sample(idx, _antithetic_subset)
+
+    def sample_limit(self, indices: IndexSetT) -> int:
+        return 2 * super().sample_limit(indices)
+
+
 class AntitheticSampler(StochasticSamplerMixin, PowersetSampler):
     """An iterator to perform uniform random sampling of subsets, and their
     complements.