From 1d7febb5430e74985e4585d014008681753262d3 Mon Sep 17 00:00:00 2001 From: Janos Gabler Date: Mon, 10 Jun 2024 17:37:36 +0200 Subject: [PATCH 1/8] Remove tolerate fixture and use pytest-rerunfailures instead. --- requirements-dev.txt | 1 + tests/conftest.py | 51 +----- tests/parallel/test_parallel.py | 2 +- tests/test_plugin.py | 56 ------ tests/tolerate.py | 173 ------------------ .../test_montecarlo_shapley_valuations.py | 33 ++-- tests/value/shapley/test_montecarlo.py | 15 +- 7 files changed, 24 insertions(+), 307 deletions(-) delete mode 100644 tests/test_plugin.py delete mode 100644 tests/tolerate.py diff --git a/requirements-dev.txt b/requirements-dev.txt index e4b4611ef..3d7bc9f2b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -22,6 +22,7 @@ pytest-xdist>=3.3.1 pytest-cases~=3.8 pytest-split pytest-sugar +pytest-rerunfailures nbmake wheel twine==4.0.2 diff --git a/tests/conftest.py b/tests/conftest.py index b08f09377..49d2ef2da 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,11 +13,6 @@ from pydvl.parallel import available_cpus from pydvl.utils import Dataset, MemcachedClientConfig from tests.cache import CloudPickleCache -from tests.tolerate import ( - TolerateErrorFixture, - TolerateErrorsSession, - wrap_pytest_function, -) if TYPE_CHECKING: from _pytest.terminal import TerminalReporter @@ -35,19 +30,6 @@ def pytest_addoption(parser): action="store_true", help="Run tests marked as slow using the @slow marker", ) - group = parser.getgroup("tolerate") - group.addoption( - "--tolerate-verbose", - action="store_true", - default=False, - help="Dump diagnostic and progress information.", - ) - group.addoption( - "--tolerate-quiet", - action="store_true", - default=False, - help="Disable reporting. Verbose mode takes precedence.", - ) @pytest.fixture @@ -202,16 +184,10 @@ def pytest_xdist_auto_num_workers(config) -> Optional[int]: ################################################################################ -# Tolerate Errors and CloudPickleCache Plugins +# CloudPickleCache Plugins def pytest_configure(config: "Config"): - config.addinivalue_line( - "markers", - "tolerate: mark a test to swallow errors up to a certain threshold. 
" - "Use to test (ε,δ)-approximations.", - ) - config._tolerate_session = TolerateErrorsSession(config) config.cloud_pickle_cache = CloudPickleCache.for_config(config, _ispytest=True) config.addinivalue_line( @@ -233,28 +209,3 @@ def pytest_runtest_setup(item: pytest.Item): if marker: if not item.config.getoption("--slow-tests"): pytest.skip("slow test") - - -@pytest.fixture(scope="function") -def tolerate(request: pytest.FixtureRequest): - fixture = TolerateErrorFixture(request.node) - return fixture - - -@pytest.hookimpl(hookwrapper=True) -def pytest_runtest_call(item: pytest.Function): - marker = item.get_closest_marker("tolerate") - has_fixture = hasattr(item, "funcargs") and isinstance( - item.funcargs.get("tolerate"), TolerateErrorFixture - ) - if marker: - if not has_fixture: - wrap_pytest_function(item) - yield - - -def pytest_terminal_summary( - terminalreporter: "TerminalReporter", exitstatus: int, config: "Config" -): - tolerate_session = terminalreporter.config._tolerate_session - tolerate_session.display(terminalreporter) diff --git a/tests/parallel/test_parallel.py b/tests/parallel/test_parallel.py index c71c8c743..d79a97d52 100644 --- a/tests/parallel/test_parallel.py +++ b/tests/parallel/test_parallel.py @@ -219,7 +219,7 @@ def func(_): @pytest.mark.timeout(30) -@pytest.mark.tolerate(max_failures=1) +@pytest.mark.flaky(reruns=1) def test_future_cancellation(parallel_backend): if not isinstance(parallel_backend, RayParallelBackend): pytest.skip("Currently this test only works with Ray") diff --git a/tests/test_plugin.py b/tests/test_plugin.py deleted file mode 100644 index efbceeb2a..000000000 --- a/tests/test_plugin.py +++ /dev/null @@ -1,56 +0,0 @@ -import pytest - - -@pytest.mark.tolerate(max_failures=1) -@pytest.mark.parametrize("i", range(1)) -def test_marker_only(i): - assert False - - -@pytest.fixture(scope="function", params=[0, pytest.param(1, marks=pytest.mark.xfail)]) -def data(request): - yield request.param - - -@pytest.mark.tolerate(max_failures=1) -def test_marker_only_with_data_fixture(data): - assert False - - -@pytest.mark.parametrize("i", [1, pytest.param(2, marks=pytest.mark.xfail)]) -def test_fixture_only(tolerate, i): - for _ in range(i): - with tolerate(max_failures=1): - assert False - - -@pytest.mark.xfail( - reason="This should fail because we should pass arguments when calling the tolerate fixture" -) -def test_fixture_call_no_arguments(tolerate): - for _ in range(1): - with tolerate(): - assert False - - -@pytest.mark.tolerate(max_failures=1) -@pytest.mark.parametrize("i", [1, pytest.param(2, marks=pytest.mark.xfail)]) -def test_marker_and_fixture(tolerate, i): - for _ in range(i): - with tolerate: - assert False - - -@pytest.mark.xfail( - reason="This should fail because the tolerate marker expects arguments", - raises=ValueError, -) -@pytest.mark.tolerate() -def test_failure(): - pass - - -@pytest.mark.tolerate(max_failures=0, exceptions_to_ignore=TypeError) -@pytest.mark.parametrize("i", range(5)) -def test_marker_ignore_exception(i): - raise TypeError diff --git a/tests/tolerate.py b/tests/tolerate.py deleted file mode 100644 index 4fc758d95..000000000 --- a/tests/tolerate.py +++ /dev/null @@ -1,173 +0,0 @@ -import functools -from collections import defaultdict -from typing import TYPE_CHECKING, Optional, Sequence, Type - -import pytest - -if TYPE_CHECKING: - from _pytest.config import Config - from _pytest.terminal import TerminalReporter - -__all__ = ["TolerateErrorFixture", "TolerateErrorsSession", "wrap_pytest_function"] - -EXCEPTIONS_TYPE 
= Optional[Sequence[Type[BaseException]]] - - -class TolerateErrorsSession: - def __init__(self, config: "Config") -> None: - self.verbose = config.getoption("tolerate_verbose") - self.quiet = False if self.verbose else config.getoption("tolerate_quiet") - self.columns = ["passed", "failed", "skipped", "max_failures"] - self.labels = { - "name": "Name", - "passed": "Passed", - "failed": "Failed", - "skipped": "Skipped", - "max_failures": "Maximum Allowed # Failures", - } - self._tests = defaultdict(TolerateErrorsTestItem) - - def get_max_failures(self, key: str) -> int: - return self._tests[key].max_failures - - def set_max_failures(self, key: str, value: int) -> None: - self._tests[key].max_failures = value - - def get_num_passed(self, key: str) -> int: - return self._tests[key].passed - - def increment_num_passed(self, key: str) -> None: - self._tests[key].passed += 1 - - def get_num_failures(self, key: str) -> int: - return self._tests[key].failed - - def increment_num_failures(self, key: str) -> None: - self._tests[key].failed += 1 - - def get_num_skipped(self, key: str) -> int: - return self._tests[key].skipped - - def increment_num_skipped(self, key: str) -> None: - self._tests[key].skipped += 1 - - def set_exceptions_to_ignore(self, key: str, value: EXCEPTIONS_TYPE) -> None: - if value is None: - self._tests[key].exceptions_to_ignore = tuple() - elif isinstance(value, Sequence): - self._tests[key].exceptions_to_ignore = value - else: - self._tests[key].exceptions_to_ignore = (value,) - - def get_exceptions_to_ignore(self, key: str) -> EXCEPTIONS_TYPE: - return self._tests[key].exceptions_to_ignore - - def has_exceeded_max_failures(self, key: str) -> bool: - return self._tests[key].failed > self._tests[key].max_failures - - def display(self, terminalreporter: "TerminalReporter"): - if self.quiet: - return - if len(self._tests) == 0: - return - terminalreporter.ensure_newline() - terminalreporter.write_line("") - widths = { - "name": 3 - + max(len(self.labels["name"]), max(len(name) for name in self._tests)) - } - for key in self.columns: - widths[key] = 5 + len(self.labels[key]) - - labels_line = self.labels["name"].ljust(widths["name"]) + "".join( - self.labels[prop].rjust(widths[prop]) for prop in self.columns - ) - terminalreporter.write_line( - " tolerate: {count} tests ".format(count=len(self._tests)).center( - len(labels_line), "-" - ), - yellow=True, - ) - terminalreporter.write_line(labels_line) - terminalreporter.write_line("-" * len(labels_line), yellow=True) - for name in self._tests: - has_error = self.has_exceeded_max_failures(name) - terminalreporter.write( - name.ljust(widths["name"]), - red=has_error, - green=not has_error, - bold=True, - ) - for prop in self.columns: - terminalreporter.write( - "{0:>{1}}".format(self._tests[name][prop], widths[prop]) - ) - terminalreporter.write("\n") - terminalreporter.write_line("-" * len(labels_line), yellow=True) - terminalreporter.write_line("") - - -class TolerateErrorsTestItem: - def __init__(self): - self.max_failures = 0 - self.failed = 0 - self.passed = 0 - self.skipped = 0 - self.exceptions_to_ignore = tuple() - - def __getitem__(self, item: str): - return getattr(self, item) - - -class TolerateErrorFixture: - def __init__(self, node: pytest.Item): - if hasattr(node, "originalname"): - self.name = node.originalname - else: - self.name = node.name - self.session: TolerateErrorsSession = node.config._tolerate_session - marker = node.get_closest_marker("tolerate") - if marker: - max_failures = 
marker.kwargs.get("max_failures", 0) - exceptions_to_ignore = marker.kwargs.get("exceptions_to_ignore") - self.session.set_max_failures(self.name, max_failures) - self.session.set_exceptions_to_ignore(self.name, exceptions_to_ignore) - - def __call__( - self, max_failures: int, *, exceptions_to_ignore: EXCEPTIONS_TYPE = None - ): - self.session.set_max_failures(self.name, max_failures) - self.session.set_exceptions_to_ignore(self.name, exceptions_to_ignore) - return self - - def __enter__(self): - if self.session.has_exceeded_max_failures(self.name): - self.session.increment_num_skipped(self.name) - pytest.skip( - f"Maximum number of allowed failures, {self.session.get_max_failures(self.name)}, was already exceeded" - ) - - def __exit__(self, exc_type, exc_val, exc_tb) -> bool: - if exc_type is None: - self.session.increment_num_passed(self.name) - else: - exceptions_to_ignore = self.session.get_exceptions_to_ignore(self.name) - if not any(exc_type is x for x in exceptions_to_ignore): - self.session.increment_num_failures(self.name) - if self.session.has_exceeded_max_failures(self.name): - pytest.fail( - f"Maximum number of allowed failures, {self.session.get_max_failures(self.name)}, was exceeded" - ) - return True - - -def wrap_pytest_function(pyfuncitem: pytest.Function): - testfunction = pyfuncitem.obj - tolerate_obj = TolerateErrorFixture(pyfuncitem) - - @functools.wraps(testfunction) - def wrapper(*args, **kwargs): - with tolerate_obj: - testfunction(*args, **kwargs) - - pyfuncitem.obj = wrapper diff --git a/tests/valuation/methods/test_montecarlo_shapley_valuations.py b/tests/valuation/methods/test_montecarlo_shapley_valuations.py index 0b0975936..4cf63bf73 100644 --- a/tests/valuation/methods/test_montecarlo_shapley_valuations.py +++ b/tests/valuation/methods/test_montecarlo_shapley_valuations.py @@ -137,10 +137,6 @@ def test_seed( np.testing.assert_equal(values_1.values, values_3.values) -# @pytest.mark.skip( -# "This test is brittle and the bound isn't sharp. " -# "We should at least document the bound in the documentation." 
-# ) @pytest.mark.slow @pytest.mark.parametrize("num_samples, delta, eps", [(6, 0.1, 0.1)]) @pytest.mark.parametrize( @@ -150,10 +146,10 @@ def test_seed( UniformSampler, ], ) +@pytest.mark.flaky(reruns=1) def test_hoeffding_bound_montecarlo( analytic_shapley, dummy_train_data, - tolerate, n_jobs, sampler_class, delta, @@ -164,20 +160,19 @@ def test_hoeffding_bound_montecarlo( n_samples = num_samples_permutation_hoeffding(delta=delta, eps=eps, u_range=1) for _ in range(10): - with tolerate(max_failures=int(10 * delta)): - sampler = sampler_class() - valuation = DataShapleyValuation( - utility=u, - sampler=sampler, - progress=False, - is_done=MaxChecks(n_samples), - ) - with parallel_config(n_jobs=n_jobs): - valuation.fit(dummy_train_data) - values = valuation.values() - - check_total_value(u, values, atol=len(dummy_train_data) * eps) - check_rank_correlation(values, exact_values, threshold=0.8) + sampler = sampler_class() + valuation = DataShapleyValuation( + utility=u, + sampler=sampler, + progress=False, + is_done=MaxChecks(n_samples), + ) + with parallel_config(n_jobs=n_jobs): + valuation.fit(dummy_train_data) + values = valuation.values() + + check_total_value(u, values, atol=len(dummy_train_data) * eps) + check_rank_correlation(values, exact_values, threshold=0.8) @pytest.mark.slow diff --git a/tests/value/shapley/test_montecarlo.py b/tests/value/shapley/test_montecarlo.py index d59937bec..0821b29cf 100644 --- a/tests/value/shapley/test_montecarlo.py +++ b/tests/value/shapley/test_montecarlo.py @@ -144,10 +144,10 @@ def test_seed( ShapleyMode.CombinatorialMontecarlo, ], ) +@pytest.mark.flaky(reruns=1) def test_hoeffding_bound_montecarlo( num_samples, analytic_shapley, - tolerate, n_jobs, fun: ShapleyMode, delta: float, @@ -158,13 +158,12 @@ def test_hoeffding_bound_montecarlo( n_samples = num_samples_permutation_hoeffding(delta=delta, eps=eps, u_range=1) for _ in range(10): - with tolerate(max_failures=int(10 * delta)): - values = compute_shapley_values( - u=u, mode=fun, done=MaxChecks(n_samples), n_jobs=n_jobs - ) - # Trivial bound on total error using triangle inequality - check_total_value(u, values, atol=len(u.data) * eps) - check_rank_correlation(values, exact_values, threshold=0.8) + values = compute_shapley_values( + u=u, mode=fun, done=MaxChecks(n_samples), n_jobs=n_jobs + ) + # Trivial bound on total error using triangle inequality + check_total_value(u, values, atol=len(u.data) * eps) + check_rank_correlation(values, exact_values, threshold=0.8) @pytest.mark.slow From 53b139069cf28c104bb2c50d3fb3ce9f9ff8f7f5 Mon Sep 17 00:00:00 2001 From: Janos Gabler Date: Tue, 11 Jun 2024 17:32:14 +0200 Subject: [PATCH 2/8] Add first draft of owen valuations. 
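This commit introduces the Owen sampling scheme: for every index, a deterministic grid of
probabilities is traversed and, for each probability q, a small number of subsets of the
complement is drawn by keeping each element independently with probability q. The fix to
`random_subset` in `pydvl.utils.numeric` (comparing with `< q` instead of `> q`) is what makes
the per-element probability actually equal q. A minimal, self-contained sketch of the scheme
follows; `random_subset_sketch` and the loop bounds are illustrative stand-ins, not the pyDVL API:

```python
import numpy as np


def random_subset_sketch(s: np.ndarray, q: float, rng: np.random.Generator) -> np.ndarray:
    # Keep each element of s independently with probability q.
    # (The bug fixed in this patch: the old code used `> q`, i.e. probability 1 - q.)
    return s[rng.uniform(size=len(s)) < q]


rng = np.random.default_rng(42)
indices = np.arange(10)

# Outer loop: deterministic probability grid (n_samples_outer points).
for q in np.linspace(0.0, 1.0, num=4):
    # Inner loop: a fixed number of draws per probability (n_samples_inner).
    for _ in range(2):
        print(q, random_subset_sketch(indices, q, rng))
```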
--- src/pydvl/utils/numeric.py | 2 +- src/pydvl/valuation/methods/least_core.py | 4 +- src/pydvl/valuation/methods/owen_shapley.py | 25 ++++++ src/pydvl/valuation/samplers/powerset.py | 64 ++++++++++++++ src/pydvl/valuation/types.py | 7 +- tests/utils/test_numeric.py | 11 +++ .../test_montecarlo_shapley_valuations.py | 2 +- tests/valuation/samplers/test_sampler.py | 86 ++++++++++++++++++- 8 files changed, 196 insertions(+), 5 deletions(-) diff --git a/src/pydvl/utils/numeric.py b/src/pydvl/utils/numeric.py index 679573a82..d79cbefec 100644 --- a/src/pydvl/utils/numeric.py +++ b/src/pydvl/utils/numeric.py @@ -96,7 +96,7 @@ def random_subset( The subset """ rng = np.random.default_rng(seed) - selection = rng.uniform(size=len(s)) > q + selection = rng.uniform(size=len(s)) < q return s[selection] diff --git a/src/pydvl/valuation/methods/least_core.py b/src/pydvl/valuation/methods/least_core.py index 4a6c528fc..776ae276f 100644 --- a/src/pydvl/valuation/methods/least_core.py +++ b/src/pydvl/valuation/methods/least_core.py @@ -10,6 +10,7 @@ from numpy.typing import NDArray from tqdm.auto import tqdm +from pydvl.utils.types import Seed from pydvl.valuation.base import Valuation from pydvl.valuation.dataset import Dataset from pydvl.valuation.methods._solve_least_core_problems import ( @@ -221,10 +222,11 @@ def __init__( non_negative_subsidy: bool = False, solver_options: dict | None = None, progress: bool = True, + seed: Seed | None = None, ): super().__init__( utility=utility, - sampler=UniformSampler(index_iteration=NoIndexIteration), + sampler=UniformSampler(index_iteration=NoIndexIteration, seed=seed), n_samples=n_samples, non_negative_subsidy=non_negative_subsidy, solver_options=solver_options, diff --git a/src/pydvl/valuation/methods/owen_shapley.py b/src/pydvl/valuation/methods/owen_shapley.py index 835a9e4dd..dc8f3e4d5 100644 --- a/src/pydvl/valuation/methods/owen_shapley.py +++ b/src/pydvl/valuation/methods/owen_shapley.py @@ -1,3 +1,10 @@ +from __future__ import annotations + +from pydvl.valuation.methods.semivalue import SemivalueValuation +from pydvl.valuation.samplers.powerset import OwenSampler +from pydvl.valuation.stopping import NoStopping +from pydvl.valuation.utility.base import UtilityBase + """ ## References @@ -5,3 +12,21 @@ [A Multilinear Sampling Algorithm to Estimate Shapley Values](https://ieeexplore.ieee.org/abstract/document/9412511). In: 2020 25th International Conference on Pattern Recognition (ICPR), pp. 7992–7999. IEEE. 
""" + + +class OwenShapleyValuation(SemivalueValuation): + def __init__( + self, + utility: UtilityBase, + sampler: OwenSampler, + progress: dict[str, Any] | bool = False, + ): + super().__init__( + utility=utility, + sampler=sampler, + is_done=NoStopping(), + progress=progress, + ) + + def coefficient(self, n: int, k: int) -> float: + return 1 diff --git a/src/pydvl/valuation/samplers/powerset.py b/src/pydvl/valuation/samplers/powerset.py index 27acf0d40..22130094b 100644 --- a/src/pydvl/valuation/samplers/powerset.py +++ b/src/pydvl/valuation/samplers/powerset.py @@ -61,6 +61,8 @@ "PowersetSampler", "TruncatedUniformStratifiedSampler", "UniformSampler", + "OwenSampler", + "AntitheticOwenSampler", "UniformStratifiedSampler", "VarianceReducedStratifiedSampler", "IndexIteration", @@ -369,6 +371,68 @@ def _generate(self, indices: IndexSetT) -> SampleGenerator: yield Sample(idx, subset) +class OwenSampler(StochasticSamplerMixin, PowersetSampler): + def __init__( + self, + n_samples_outer: int, + n_samples_inner: int = 2, + batch_size: int = 1, + seed: Seed | None = None, + ): + super().__init__( + batch_size=batch_size, index_iteration=SequentialIndexIteration, seed=seed + ) + self._n_samples_inner = n_samples_inner + self._n_samples_outer = n_samples_outer + self._q_stop = 1.0 + + def _generate(self, indices: IndexSetT) -> SampleGenerator: + probabilities = np.linspace( + start=0, stop=self._q_stop, num=self._n_samples_outer + ) + for idx in self.index_iterator(indices): + _complement = complement(indices, [idx] if idx is not None else []) + for prob in probabilities: + for _ in range(self._n_samples_inner): + subset = random_subset(_complement, q=prob, seed=self._rng) + yield Sample(idx, subset) + + def weight(n: int, subset_len: int) -> float: + return 1.0 + + def sample_limit(self, indices: IndexSetT) -> int: + return len(indices) * self._n_samples_outer * self._n_samples_inner + + +class AntitheticOwenSampler(OwenSampler): + def __init__( + self, + n_samples_outer: int, + n_samples_inner: int = 2, + batch_size: int = 1, + seed: Seed | None = None, + ): + super().__init__( + n_samples_outer=n_samples_outer, + n_samples_inner=n_samples_inner, + batch_size=batch_size, + seed=seed, + ) + self._q_stop = 0.5 + + def _generate(self, indices: IndexSetT) -> SampleGenerator: + for sample in super()._generate(indices): + idx, subset = sample + _exclude = [idx] if idx is not None else [] + _exclude += subset.tolist() + _antithetic_subset = complement(indices, _exclude) + yield sample + yield Sample(idx, _antithetic_subset) + + def sample_limit(self, indices: IndexSetT) -> int: + return 2 * super().sample_limit(indices) + + class AntitheticSampler(StochasticSamplerMixin, PowersetSampler): """An iterator to perform uniform random sampling of subsets, and their complements. 
diff --git a/src/pydvl/valuation/types.py b/src/pydvl/valuation/types.py index dba732168..f97cbae4a 100644 --- a/src/pydvl/valuation/types.py +++ b/src/pydvl/valuation/types.py @@ -1,11 +1,12 @@ from __future__ import annotations import hashlib -from dataclasses import dataclass +from dataclasses import dataclass, replace from typing import Callable, Generator, Iterable, Protocol, TypeVar, Union import numpy as np from numpy.typing import NDArray +from typing_extensions import Self __all__ = [ "BatchGenerator", @@ -52,6 +53,10 @@ def __hash__(self): sha256_hash = hashlib.sha256(self.subset.tobytes()).hexdigest() return int(sha256_hash, base=16) + def with_idx_in_subset(self) -> Self: + new_subset = np.concatenate([self.subset, [self.idx]]) + return replace(self, subset=new_subset) + SampleT = TypeVar("SampleT", bound=Sample) diff --git a/tests/utils/test_numeric.py b/tests/utils/test_numeric.py index b722c24f8..e31e5e1dd 100644 --- a/tests/utils/test_numeric.py +++ b/tests/utils/test_numeric.py @@ -6,6 +6,7 @@ random_matrix_with_condition_number, random_powerset, random_powerset_label_min, + random_subset, random_subset_of_size, running_moments, ) @@ -273,3 +274,13 @@ def test_random_powerset_label_min( if idx == check_num_samples: break + + +@pytest.mark.flaky(reruns=1) +def test_size_of_random_subset(): + """This test discovered an actual bug where (1 - q) was used instead of q.""" + subset = random_subset(np.arange(10), q=0) + assert len(subset) == 0 + + subset = random_subset(np.arange(10), q=1) + assert len(subset) == 10 diff --git a/tests/valuation/methods/test_montecarlo_shapley_valuations.py b/tests/valuation/methods/test_montecarlo_shapley_valuations.py index 4cf63bf73..842b07c9c 100644 --- a/tests/valuation/methods/test_montecarlo_shapley_valuations.py +++ b/tests/valuation/methods/test_montecarlo_shapley_valuations.py @@ -105,7 +105,7 @@ def test_games( [ # TODO Add Permutation Montecarlo once issue #416 is closed. 
(PermutationSampler, dict(is_done=MaxChecks(50))), - (UniformSampler, dict(is_done=MaxChecks(4))), + (UniformSampler, dict(is_done=MaxChecks(50))), # (ShapleyMode.Owen, dict(n_samples=4, max_q=200)), # (ShapleyMode.OwenAntithetic, dict(n_samples=4, max_q=200)), # (ShapleyMode.GroupTesting, dict(n_samples=21, epsilon=0.2, delta=0.01)), diff --git a/tests/valuation/samplers/test_sampler.py b/tests/valuation/samplers/test_sampler.py index fed3d12e6..023e3ee32 100644 --- a/tests/valuation/samplers/test_sampler.py +++ b/tests/valuation/samplers/test_sampler.py @@ -1,4 +1,4 @@ -from itertools import takewhile +from itertools import islice, takewhile from typing import Iterator, List, Type, Union import numpy as np @@ -13,11 +13,13 @@ PermutationSampler, ) from pydvl.valuation.samplers.powerset import ( + AntitheticOwenSampler, AntitheticSampler, DeterministicUniformSampler, IndexIteration, LOOSampler, NoIndexIteration, + OwenSampler, PowersetSampler, RandomIndexIteration, SequentialIndexIteration, @@ -155,6 +157,8 @@ def _check_subsets(batches, expected): AntitheticSampler(index_iteration=RandomIndexIteration), TruncatedUniformStratifiedSampler(lower_bound=1, upper_bound=2), VarianceReducedStratifiedSampler(samples_per_setsize=lambda _: 2), + OwenSampler(n_samples_outer=4), + AntitheticOwenSampler(n_samples_outer=2), ], ) @pytest.mark.parametrize("indices", [np.array([]), np.array([0, 1, 2])]) @@ -187,6 +191,8 @@ def test_proper(sampler, indices): AntitheticSampler(index_iteration=RandomIndexIteration), TruncatedUniformStratifiedSampler(lower_bound=1, upper_bound=2), VarianceReducedStratifiedSampler(samples_per_setsize=lambda _: 2), + OwenSampler(n_samples_outer=4), + AntitheticOwenSampler(n_samples_outer=2), ], ) def test_sample_counter(sampler): @@ -214,6 +220,8 @@ def test_sample_counter(sampler): (DeterministicUniformSampler(index_iteration=NoIndexIteration), 8), (DeterministicPermutationSampler(), 6), (LOOSampler(), 3), + (OwenSampler(n_samples_outer=4, n_samples_inner=2), 4 * 2 * 3), + (AntitheticOwenSampler(n_samples_outer=3, n_samples_inner=2), 2 * 3 * 2 * 3), ], ) def test_length_for_finite_samplers(sampler, expected_length): @@ -315,3 +323,79 @@ def _create_seeded_sample_iter( lambda _: sampler.n_samples < max_iterations, sampler.generate_batches(indices) ) return sample_stream + + +@pytest.mark.flaky(reruns=1) +def test_owen_sampler(): + n_outer = 5 + n_inner = 100 + sampler = OwenSampler( + n_samples_outer=n_outer, n_samples_inner=n_inner, batch_size=1 + ) + indices = np.arange(5000) + + # extract samples for the first and second index + n_samples = n_outer * n_inner + samples = [b[0] for b in islice(sampler.generate_batches(indices), n_samples * 2)] + samples_0 = samples[:n_samples] + samples_1 = samples[n_samples:] + + # check that indices are correct + assert all(sample.idx == 0 for sample in samples_0) + assert all(sample.idx == 1 for sample in samples_1) + + # check that the sample_sizes are close to expected sizes + for samples in [samples_0, samples_1]: + _check_sample_sizes( + samples, + n_samples_outer=n_outer, + n_indices=len(indices), + probs=np.array([0.0, 0.25, 0.5, 0.75, 1]), + ) + + +@pytest.mark.flaky(reruns=1) +def test_antithetic_owen_sampler(): + n_outer = 3 + n_inner = 100 + sampler = AntitheticOwenSampler( + n_samples_outer=n_outer, n_samples_inner=n_inner, batch_size=1 + ) + indices = np.arange(5000) + + # extract samples for the first and second index + n_samples = n_outer * n_inner + samples = [b[0] for b in islice(sampler.generate_batches(indices), n_samples 
* 4)] + samples_0 = samples[: n_samples * 2 : 2] + samples_0_complement = samples[1 : 2 * n_samples : 2] + samples_1 = samples[2 * n_samples :: 2] + samples_1_complement = samples[2 * n_samples + 1 :: 2] + + # check that indices are correct + assert all(sample.idx == 0 for sample in samples_0) + assert all(sample.idx == 0 for sample in samples_0_complement) + assert all(sample.idx == 1 for sample in samples_1) + assert all(sample.idx == 1 for sample in samples_1_complement) + + # check that the sample_sizes are close to expected sizes + for samples in [samples_0, samples_1]: + _check_sample_sizes( + samples, + n_samples_outer=n_outer, + n_indices=len(indices), + probs=np.array([0.0, 0.25, 0.5]), + ) + for samples in [samples_0_complement, samples_1_complement]: + _check_sample_sizes( + samples, + n_samples_outer=n_outer, + n_indices=len(indices), + probs=np.array([1.0, 0.75, 0.5]), + ) + + +def _check_sample_sizes(samples, n_samples_outer, n_indices, probs): + sizes = np.array([len(sample.subset) for sample in samples]) + avg_sizes = sizes.reshape(n_samples_outer, -1).mean(axis=1) + expected_sizes = probs * n_indices + assert np.allclose(avg_sizes, expected_sizes, rtol=0.01) From 7c3197efdfc0a399718a4e62a7f14892f9c44915 Mon Sep 17 00:00:00 2001 From: Janos Gabler Date: Wed, 12 Jun 2024 09:56:23 +0200 Subject: [PATCH 3/8] Refactor montecarlo shapley tests. --- tests/conftest.py | 3 - .../test_montecarlo_shapley_valuations.py | 63 ++++++++++++------- 2 files changed, 41 insertions(+), 25 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 49d2ef2da..037775c30 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,9 +14,6 @@ from pydvl.utils import Dataset, MemcachedClientConfig from tests.cache import CloudPickleCache -if TYPE_CHECKING: - from _pytest.terminal import TerminalReporter - def pytest_addoption(parser): parser.addoption( diff --git a/tests/valuation/methods/test_montecarlo_shapley_valuations.py b/tests/valuation/methods/test_montecarlo_shapley_valuations.py index 842b07c9c..95ac8024b 100644 --- a/tests/valuation/methods/test_montecarlo_shapley_valuations.py +++ b/tests/valuation/methods/test_montecarlo_shapley_valuations.py @@ -33,10 +33,24 @@ indirect=["test_game"], ) @pytest.mark.parametrize( - "sampler_class, rtol, atol, kwargs", + "sampler_class, sampler_kwargs, valuation_class, valuation_kwargs, rtol, atol", [ - (PermutationSampler, 0.2, 1e-4, dict(is_done=MaxUpdates(500))), - (UniformSampler, 0.2, 1e-4, dict(is_done=MaxUpdates(2**10))), + ( + PermutationSampler, + {}, + DataShapleyValuation, + {"is_done": MaxUpdates(500)}, + 0.2, + 1e-4, + ), + ( + UniformSampler, + {}, + DataShapleyValuation, + {"is_done": MaxUpdates(2**10)}, + 0.2, + 1e-4, + ), # (ShapleyMode.Owen, 0.2, 1e-4, dict(n_samples=5, max_q=200)), # (ShapleyMode.OwenAntithetic, 0.1, 1e-4, dict(n_samples=5, max_q=200)), # Because of the inaccuracy of GroupTesting, a high atol is required for the @@ -53,12 +67,14 @@ def test_games( test_game, n_jobs, sampler_class, + sampler_kwargs, + valuation_class, + valuation_kwargs, rtol, atol, - kwargs, seed, ): - """Tests values for all methods using a toy games. + """Tests shapley values for all methods using toy games. For permutation, the rtol for each scorer is chosen so that the number of samples selected is just above the (ε,δ) bound for ε = @@ -67,7 +83,6 @@ def test_games( TODO: - Uncomment the other methods once they are implemented - - Find out why parallelization seems to affect the results FIXME: - We don't have a bound for Owen. 
@@ -76,12 +91,12 @@ def test_games( samples """ - sampler = sampler_class(seed=seed) - valuation = DataShapleyValuation( + sampler = sampler_class(seed=seed, **sampler_kwargs) + valuation = valuation_class( utility=test_game.u, sampler=sampler, progress=False, - **kwargs, + **valuation_kwargs, ) with parallel_config(n_jobs=n_jobs): valuation.fit(test_game.data) @@ -101,11 +116,11 @@ def test_games( indirect=["test_game"], ) @pytest.mark.parametrize( - "sampler_class, kwargs", + "sampler_class, sampler_kwargs, valuation_class, valuation_kwargs", [ # TODO Add Permutation Montecarlo once issue #416 is closed. - (PermutationSampler, dict(is_done=MaxChecks(50))), - (UniformSampler, dict(is_done=MaxChecks(50))), + (PermutationSampler, {}, DataShapleyValuation, {"is_done": MaxChecks(50)}), + (UniformSampler, {}, DataShapleyValuation, {"is_done": MaxChecks(50)}), # (ShapleyMode.Owen, dict(n_samples=4, max_q=200)), # (ShapleyMode.OwenAntithetic, dict(n_samples=4, max_q=200)), # (ShapleyMode.GroupTesting, dict(n_samples=21, epsilon=0.2, delta=0.01)), @@ -114,18 +129,20 @@ def test_games( def test_seed( test_game, sampler_class, - kwargs, + sampler_kwargs, + valuation_class, + valuation_kwargs, seed, seed_alt, ): values = [] for s in [seed, seed, seed_alt]: - valuation = DataShapleyValuation( + valuation = valuation_class( utility=test_game.u, - sampler=sampler_class(seed=s), + sampler=sampler_class(seed=s, **sampler_kwargs), progress=False, # TODO: Why is a deepcopy necessary here? - **deepcopy(kwargs), + **deepcopy(valuation_kwargs), ) valuation.fit(test_game.data) values.append(valuation.values()) @@ -180,9 +197,9 @@ def test_hoeffding_bound_montecarlo( "a, b, num_points", [(2, 0, 21)] # training set will have 0.3 * 21 ~= 6 samples ) @pytest.mark.parametrize( - "sampler_class, kwargs", + "sampler_class, sampler_kwargs, valuation_class, valuation_kwargs", [ - (PermutationSampler, {"is_done": MaxUpdates(500)}), + (PermutationSampler, {}, DataShapleyValuation, {"is_done": MaxUpdates(500)}), # (ShapleyMode.Owen, dict(n_samples=4, max_q=400)), # (ShapleyMode.OwenAntithetic, dict(n_samples=4, max_q=400)), # ( @@ -195,7 +212,9 @@ def test_linear_montecarlo_with_outlier( linear_dataset, n_jobs, sampler_class, - kwargs: dict, + sampler_kwargs, + valuation_class, + valuation_kwargs, cache_backend, ): """Tests whether valuation methods are able to detect an obvious outlier. @@ -225,11 +244,11 @@ def test_linear_montecarlo_with_outlier( cache_backend=cache_backend, ) - valuation = DataShapleyValuation( + valuation = valuation_class( utility=utility, - sampler=sampler_class(), + sampler=sampler_class(**sampler_kwargs), progress=False, - **kwargs, + **valuation_kwargs, ) with parallel_config(n_jobs=n_jobs): valuation.fit(data_train) From 291d3ecf07bef66055a7af7f9989d2148c9b5128 Mon Sep 17 00:00:00 2001 From: Janos Gabler Date: Wed, 12 Jun 2024 10:28:37 +0200 Subject: [PATCH 4/8] Write tests for owen shapley. 
--- src/pydvl/valuation/methods/owen_shapley.py | 8 +++ src/pydvl/valuation/samplers/powerset.py | 1 + .../test_montecarlo_shapley_valuations.py | 50 ++++++++++++++++--- 3 files changed, 52 insertions(+), 7 deletions(-) diff --git a/src/pydvl/valuation/methods/owen_shapley.py b/src/pydvl/valuation/methods/owen_shapley.py index dc8f3e4d5..c5f5cb39d 100644 --- a/src/pydvl/valuation/methods/owen_shapley.py +++ b/src/pydvl/valuation/methods/owen_shapley.py @@ -1,5 +1,6 @@ from __future__ import annotations +from pydvl.utils import Status from pydvl.valuation.methods.semivalue import SemivalueValuation from pydvl.valuation.samplers.powerset import OwenSampler from pydvl.valuation.stopping import NoStopping @@ -28,5 +29,12 @@ def __init__( progress=progress, ) + def fit(self, dataset: Dataset) -> ValuationResult: + # since we bypassed the convergence checks we need to set the status to + # converged manually + super().fit(dataset) + self.result._status = Status.Converged + return self + def coefficient(self, n: int, k: int) -> float: return 1 diff --git a/src/pydvl/valuation/samplers/powerset.py b/src/pydvl/valuation/samplers/powerset.py index 22130094b..b873f914e 100644 --- a/src/pydvl/valuation/samplers/powerset.py +++ b/src/pydvl/valuation/samplers/powerset.py @@ -397,6 +397,7 @@ def _generate(self, indices: IndexSetT) -> SampleGenerator: subset = random_subset(_complement, q=prob, seed=self._rng) yield Sample(idx, subset) + @staticmethod def weight(n: int, subset_len: int) -> float: return 1.0 diff --git a/tests/valuation/methods/test_montecarlo_shapley_valuations.py b/tests/valuation/methods/test_montecarlo_shapley_valuations.py index 95ac8024b..b2149712c 100644 --- a/tests/valuation/methods/test_montecarlo_shapley_valuations.py +++ b/tests/valuation/methods/test_montecarlo_shapley_valuations.py @@ -9,9 +9,11 @@ from pydvl.utils.numeric import num_samples_permutation_hoeffding from pydvl.utils.status import Status from pydvl.valuation.dataset import GroupedDataset -from pydvl.valuation.methods import DataShapleyValuation +from pydvl.valuation.methods import DataShapleyValuation, OwenShapleyValuation from pydvl.valuation.samplers import ( + AntitheticOwenSampler, DeterministicUniformSampler, + OwenSampler, PermutationSampler, UniformSampler, ) @@ -51,8 +53,22 @@ 0.2, 1e-4, ), - # (ShapleyMode.Owen, 0.2, 1e-4, dict(n_samples=5, max_q=200)), - # (ShapleyMode.OwenAntithetic, 0.1, 1e-4, dict(n_samples=5, max_q=200)), + ( + OwenSampler, + {"n_samples_outer": 200, "n_samples_inner": 5}, + OwenShapleyValuation, + {}, + 0.2, + 1e-4, + ), + ( + AntitheticOwenSampler, + {"n_samples_outer": 200, "n_samples_inner": 5}, + OwenShapleyValuation, + {}, + 0.1, + 1e-4, + ), # Because of the inaccuracy of GroupTesting, a high atol is required for the # value 0, for which the rtol has no effect. # ( @@ -121,8 +137,18 @@ def test_games( # TODO Add Permutation Montecarlo once issue #416 is closed. 
(PermutationSampler, {}, DataShapleyValuation, {"is_done": MaxChecks(50)}), (UniformSampler, {}, DataShapleyValuation, {"is_done": MaxChecks(50)}), - # (ShapleyMode.Owen, dict(n_samples=4, max_q=200)), - # (ShapleyMode.OwenAntithetic, dict(n_samples=4, max_q=200)), + ( + OwenSampler, + {"n_samples_outer": 20, "n_samples_inner": 4}, + OwenShapleyValuation, + {}, + ), + ( + AntitheticOwenSampler, + {"n_samples_outer": 20, "n_samples_inner": 4}, + OwenShapleyValuation, + {}, + ), # (ShapleyMode.GroupTesting, dict(n_samples=21, epsilon=0.2, delta=0.01)), ], ) @@ -200,8 +226,18 @@ def test_hoeffding_bound_montecarlo( "sampler_class, sampler_kwargs, valuation_class, valuation_kwargs", [ (PermutationSampler, {}, DataShapleyValuation, {"is_done": MaxUpdates(500)}), - # (ShapleyMode.Owen, dict(n_samples=4, max_q=400)), - # (ShapleyMode.OwenAntithetic, dict(n_samples=4, max_q=400)), + ( + OwenSampler, + {"n_samples_outer": 400, "n_samples_inner": 4}, + OwenShapleyValuation, + {}, + ), + ( + AntitheticOwenSampler, + {"n_samples_outer": 400, "n_samples_inner": 4}, + OwenShapleyValuation, + {}, + ), # ( # ShapleyMode.GroupTesting, # dict(n_samples=int(5e4), epsilon=0.25, delta=0.1), From d42b0301043ec4b3ba476d414c2dedf626676714 Mon Sep 17 00:00:00 2001 From: Janos Gabler Date: Wed, 12 Jun 2024 10:44:38 +0200 Subject: [PATCH 5/8] Write docstrings. --- src/pydvl/valuation/methods/owen_shapley.py | 33 +++++++++++++++++++++ src/pydvl/valuation/samplers/powerset.py | 26 ++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/src/pydvl/valuation/methods/owen_shapley.py b/src/pydvl/valuation/methods/owen_shapley.py index c5f5cb39d..3128eb55e 100644 --- a/src/pydvl/valuation/methods/owen_shapley.py +++ b/src/pydvl/valuation/methods/owen_shapley.py @@ -16,6 +16,23 @@ class OwenShapleyValuation(SemivalueValuation): + """Umbrella class to calculate least-core values with Owen sampling schemes. + + Owen shapley values converge to true Shapley values as the number of samples + increases but have been shown to need fewer samples than other sampling schemes. + + The number of samples is governed by the sampler object. There are no convergence + criteria for Owen shapley values as they will just run for a fixed number of + samples. + + Args: + utility: Utility object with model and scoring function. + sampler: Owen sampling scheme to use. Can be OwenSampler or + AntitheticOwenSampler. + progress: Whether to show a progress bar. + + """ + def __init__( self, utility: UtilityBase, @@ -30,6 +47,22 @@ def __init__( ) def fit(self, dataset: Dataset) -> ValuationResult: + """Calculate the Owen shapley values for a given dataset. + + This method has to be called before calling `values()`. + + Calculating the least core valuation is a computationally expensive task that + can be parallelized. 
To do so, call the `fit()` method inside a + `joblib.parallel_config` context manager as follows: + + ```python + from joblib import parallel_config + + with parallel_config(n_jobs=4): + valuation.fit(data) + ``` + + """ # since we bypassed the convergence checks we need to set the status to # converged manually super().fit(dataset) diff --git a/src/pydvl/valuation/samplers/powerset.py b/src/pydvl/valuation/samplers/powerset.py index b873f914e..6da27b1a5 100644 --- a/src/pydvl/valuation/samplers/powerset.py +++ b/src/pydvl/valuation/samplers/powerset.py @@ -372,6 +372,19 @@ def _generate(self, indices: IndexSetT) -> SampleGenerator: class OwenSampler(StochasticSamplerMixin, PowersetSampler): + """A sampler for Owen shapley values. + + Args: + n_samples_outer: The number of entries in the probability grid used for + the outer loop in Owen sampling. + n_samples_inner: The number of samples drawn for each probability. In the + original paper this was fixed to 2 for all experiments which is why we + give it a default value of 2. + batch_size: The batch size of the sampler. + seed: The seed for the random number generator. + + """ + def __init__( self, n_samples_outer: int, @@ -406,6 +419,19 @@ def sample_limit(self, indices: IndexSetT) -> int: class AntitheticOwenSampler(OwenSampler): + """A sampler for antithetic Owen shapley values. + + Args: + n_samples_outer: The number of entries in the probability grid used for + the outer loop in Owen sampling. + n_samples_inner: The number of samples drawn for each probability. In the + original paper this was fixed to 2 for all experiments which is why we + give it a default value of 2. + batch_size: The batch size of the sampler. + seed: The seed for the random number generator. + + """ + def __init__( self, n_samples_outer: int, From b6c4ba904c3be4a1385911b60430265e0de1d654 Mon Sep 17 00:00:00 2001 From: Janos Gabler Date: Wed, 12 Jun 2024 11:08:58 +0200 Subject: [PATCH 6/8] Update changelog. --- CHANGELOG.md | 7 +++++++ src/pydvl/valuation/samplers/powerset.py | 5 +---- src/pydvl/valuation/types.py | 8 +++++++- tests/valuation/test_types.py | 11 +++++++++++ 4 files changed, 26 insertions(+), 5 deletions(-) create mode 100644 tests/valuation/test_types.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b090b49b..f4fde4f38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,13 @@ - Refactoring of least-core data valuation methods with more supported sampling methods and consistent interface. 
[PR #580](https://github.com/aai-institute/pyDVL/pull/580) +- Refactoring of owen shapley valuation with new sampler architecture + [PR #597](https://github.com/aai-institute/pyDVL/pull/597) + +### Fixed + +- Fix a bug in pydvl.utils.numeric.random_subset + [PR #597](https://github.com/aai-institute/pyDVL/pull/597) ## 0.9.0 - 🆕 New methods, better docs and bugfixes 📚🐞 diff --git a/src/pydvl/valuation/samplers/powerset.py b/src/pydvl/valuation/samplers/powerset.py index 6da27b1a5..ec9a14f3d 100644 --- a/src/pydvl/valuation/samplers/powerset.py +++ b/src/pydvl/valuation/samplers/powerset.py @@ -212,10 +212,7 @@ def process( ) -> list[ValueUpdate]: r = [] for sample in batch: - assert sample.idx is not None - u_i = self.utility( - Sample(sample.idx, np.array(list({sample.idx}.union(sample.subset)))) - ) + u_i = self.utility(sample.with_idx_in_subset()) u = self.utility(sample) marginal = (u_i - u) * self.coefficient(self.n_indices, len(sample.subset)) r.append(ValueUpdate(sample.idx, marginal)) diff --git a/src/pydvl/valuation/types.py b/src/pydvl/valuation/types.py index f97cbae4a..e7b47544b 100644 --- a/src/pydvl/valuation/types.py +++ b/src/pydvl/valuation/types.py @@ -54,7 +54,13 @@ def __hash__(self): return int(sha256_hash, base=16) def with_idx_in_subset(self) -> Self: - new_subset = np.concatenate([self.subset, [self.idx]]) + if self.idx in self.subset: + return self + + if self.idx is None: + raise ValueError("Cannot add idx to subset if idx is None.") + + new_subset = np.array(self.subset.tolist() + [self.idx]) return replace(self, subset=new_subset) diff --git a/tests/valuation/test_types.py b/tests/valuation/test_types.py new file mode 100644 index 000000000..5cfe379cd --- /dev/null +++ b/tests/valuation/test_types.py @@ -0,0 +1,11 @@ +import numpy as np +from numpy.testing import assert_array_equal + +from pydvl.valuation.types import Sample + + +def test_sample(): + sample = Sample(idx=1, subset=np.array([2, 3, 4])) + new_sample = sample.with_idx_in_subset() + assert new_sample.idx == 1 + assert_array_equal(new_sample.subset, np.array([2, 3, 4, 1])) From 98d82f2b19b908d773c739f84bdf13862d1f6f01 Mon Sep 17 00:00:00 2001 From: Janos Gabler Date: Wed, 12 Jun 2024 16:25:15 +0200 Subject: [PATCH 7/8] Incorporate review comments. --- CHANGELOG.md | 3 ++- src/pydvl/valuation/samplers/powerset.py | 19 +++++++++++++++++++ src/pydvl/valuation/types.py | 10 ++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f4fde4f38..760b492dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,8 @@ ### Fixed -- Fix a bug in pydvl.utils.numeric.random_subset +- Fix a bug in pydvl.utils.numeric.random_subset where 1 - q was used instead of q + as the probability of an element being sampled [PR #597](https://github.com/aai-institute/pyDVL/pull/597) ## 0.9.0 - 🆕 New methods, better docs and bugfixes 📚🐞 diff --git a/src/pydvl/valuation/samplers/powerset.py b/src/pydvl/valuation/samplers/powerset.py index ec9a14f3d..fe2e4f8a5 100644 --- a/src/pydvl/valuation/samplers/powerset.py +++ b/src/pydvl/valuation/samplers/powerset.py @@ -371,6 +371,13 @@ def _generate(self, indices: IndexSetT) -> SampleGenerator: class OwenSampler(StochasticSamplerMixin, PowersetSampler): """A sampler for Owen shapley values. 
+ For each index $i$ the Owen sampler loops over a deterministic grid of probabilities + (containing `n_samples_outer` entries between 0 and 1) and then draws + `n_samples_inner` subsets of the complement of the current index where each element + is sampled with the given probability. + + The total number of samples drawn is therefore `n_samples_outer * n_samples_inner`. + Args: n_samples_outer: The number of entries in the probability grid used for the outer loop in Owen sampling. @@ -418,6 +425,18 @@ def sample_limit(self, indices: IndexSetT) -> int: class AntitheticOwenSampler(OwenSampler): """A sampler for antithetic Owen shapley values. + For each index $i$, the antithetic Owen sampler loops over a deterministic grid of + probabilities (containing `n_samples_outer` entries between 0 and 0.5) and then + draws `n_samples_inner` subsets of the complement of the current index where each + element is sampled with the given probability. For each sample obtained that way, + a second sample is generated by taking the complement of the first sample. + + The total number of samples drawn is therefore + `2 * n_samples_outer * n_samples_inner`. + + For the same number of total samples, the antithetic Owen sampler yields usually + more precise estimates of shapley values than the regular Owen sampler. + Args: n_samples_outer: The number of entries in the probability grid used for the outer loop in Owen sampling. diff --git a/src/pydvl/valuation/types.py b/src/pydvl/valuation/types.py index e7b47544b..2dbbea534 100644 --- a/src/pydvl/valuation/types.py +++ b/src/pydvl/valuation/types.py @@ -54,6 +54,16 @@ def __hash__(self): return int(sha256_hash, base=16) def with_idx_in_subset(self) -> Self: + """Return a copy of sample with idx added to the subset. + + Returns the original sample if idx was already part of the subset. + + Returns: + Sample: A copy of the sample with idx added to the subset. + + Raises: + ValueError: If idx is None. + """ if self.idx in self.subset: return self From 56129836bad5d9b28d1efda914b6a6797b429426 Mon Sep 17 00:00:00 2001 From: Janos Gabler Date: Wed, 12 Jun 2024 16:42:40 +0200 Subject: [PATCH 8/8] Make type checker a bit happier. 
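This commit also annotates `fit()` with `typing_extensions.Self`, so that chaining
`valuation.fit(data).values()` keeps the concrete valuation type for the type checker. A minimal,
generic illustration of the pattern (class names here are illustrative, not the pyDVL classes):

```python
from typing_extensions import Self


class Valuation:
    def fit(self, data) -> Self:
        # Returning Self instead of "Valuation" lets subclasses chain calls
        # without the static type degrading to the base class.
        return self


class OwenShapley(Valuation):
    pass


fitted = OwenShapley().fit([1, 2, 3])  # a type checker infers OwenShapley here
```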
--- src/pydvl/valuation/games.py | 22 ++++++++++++--------- src/pydvl/valuation/methods/owen_shapley.py | 12 +++++++++-- src/pydvl/valuation/types.py | 2 +- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/pydvl/valuation/games.py b/src/pydvl/valuation/games.py index 4b347c812..29891a98d 100644 --- a/src/pydvl/valuation/games.py +++ b/src/pydvl/valuation/games.py @@ -27,6 +27,7 @@ from pydvl.valuation.dataset import Dataset from pydvl.valuation.methods._solve_least_core_problems import LeastCoreProblem from pydvl.valuation.result import ValuationResult +from pydvl.valuation.types import SampleT from pydvl.valuation.utility.base import UtilityBase __all__ = [ @@ -91,13 +92,16 @@ def __call__(self, sample: SampleT | None) -> float: if sample is None or len(sample.subset) == 0: return 0 + if self.training_data is None: + raise ValueError("Utility object has no training data.") + idxs = np.array(sample.subset, dtype=np.int32) try: - score = self.score(self.training_data.x[idxs]) + score: float = self.score(self.training_data.x[idxs]) except (KeyboardInterrupt, SystemExit): raise except Exception: - score = 0 + score = 0.0 return score def with_dataset(self, dataset: Dataset): @@ -209,7 +213,7 @@ def _score(self, X: NDArray) -> float: @lru_cache def shapley_values(self) -> ValuationResult: exact_values = np.ones(self.n_players) / self.n_players - result: ValuationResult[np.int_, np.int_] = ValuationResult( + result = ValuationResult( algorithm="exact_shapley", status=Status.Converged, indices=self.data.indices, @@ -334,7 +338,7 @@ def _score(self, X: NDArray) -> float: @lru_cache def shapley_values(self) -> ValuationResult: - result: ValuationResult[np.int_, np.int_] = ValuationResult( + result = ValuationResult( algorithm="exact_shapley", status=Status.Converged, indices=self.data.indices, @@ -405,7 +409,7 @@ def shapley_values(self) -> ValuationResult: value_left = precomputed_values[self.left, self.right] value_right = precomputed_values[self.right, self.left] exact_values = np.array([value_left] * self.left + [value_right] * self.right) - result: ValuationResult[np.int_, np.int_] = ValuationResult( + result = ValuationResult( algorithm="exact_shapley", status=Status.Converged, indices=self.data.indices, @@ -427,7 +431,7 @@ def least_core_values(self) -> ValuationResult: subsidy = 0.0 exact_values = np.array([0.0] * self.left + [1.0] * self.right) - result: ValuationResult[np.int_, np.int_] = ValuationResult( + result = ValuationResult( algorithm="exact_least_core", status=Status.Converged, indices=self.data.indices, @@ -517,7 +521,7 @@ def _score(self, X: NDArray) -> float: @lru_cache def shapley_values(self) -> ValuationResult: - result: ValuationResult[np.int_, np.int_] = ValuationResult( + result = ValuationResult( algorithm="exact_shapley", status=Status.Converged, indices=self.data.indices, @@ -592,7 +596,7 @@ def _score(self, X: NDArray) -> float: @lru_cache def shapley_values(self) -> ValuationResult: exact_values = 2 * np.ones_like(self.data.x) - result: ValuationResult[np.int_, np.int_] = ValuationResult( + result = ValuationResult( algorithm="exact_shapley", status=Status.Converged, indices=self.data.indices, @@ -657,7 +661,7 @@ def least_core_values(self) -> ValuationResult: ) subsidy = (self.n_players - 1) / (2 * self.n_players) - result: ValuationResult[np.int_, np.int_] = ValuationResult( + result = ValuationResult( algorithm="exact_least_core", status=Status.Converged, indices=self.data.indices, diff --git a/src/pydvl/valuation/methods/owen_shapley.py 
b/src/pydvl/valuation/methods/owen_shapley.py index 3128eb55e..1cbd6e824 100644 --- a/src/pydvl/valuation/methods/owen_shapley.py +++ b/src/pydvl/valuation/methods/owen_shapley.py @@ -1,7 +1,13 @@ from __future__ import annotations +from typing import Any + +from typing_extensions import Self + from pydvl.utils import Status +from pydvl.valuation.dataset import Dataset from pydvl.valuation.methods.semivalue import SemivalueValuation +from pydvl.valuation.result import ValuationResult from pydvl.valuation.samplers.powerset import OwenSampler from pydvl.valuation.stopping import NoStopping from pydvl.valuation.utility.base import UtilityBase @@ -46,7 +52,7 @@ def __init__( progress=progress, ) - def fit(self, dataset: Dataset) -> ValuationResult: + def fit(self, dataset: Dataset) -> Self: """Calculate the Owen shapley values for a given dataset. This method has to be called before calling `values()`. @@ -66,7 +72,9 @@ def fit(self, dataset: Dataset) -> ValuationResult: # since we bypassed the convergence checks we need to set the status to # converged manually super().fit(dataset) - self.result._status = Status.Converged + # make the type checker happy + if self.result is not None: + self.result._status = Status.Converged return self def coefficient(self, n: int, k: int) -> float: diff --git a/src/pydvl/valuation/types.py b/src/pydvl/valuation/types.py index 2dbbea534..a5563e2cf 100644 --- a/src/pydvl/valuation/types.py +++ b/src/pydvl/valuation/types.py @@ -31,7 +31,7 @@ @dataclass(frozen=True) class ValueUpdate: - idx: int | IndexT + idx: int | IndexT | None update: float
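As a closing usage note, the `Sample.with_idx_in_subset` helper added in this series behaves as
exercised by the new `tests/valuation/test_types.py`; a short sketch:

```python
import numpy as np

from pydvl.valuation.types import Sample

sample = Sample(idx=1, subset=np.array([2, 3, 4]))
extended = sample.with_idx_in_subset()

assert extended.idx == 1
assert np.array_equal(extended.subset, np.array([2, 3, 4, 1]))
# Calling it again returns the same object because idx is already in the subset;
# a Sample whose idx is None raises ValueError.
assert extended.with_idx_in_subset() is extended
```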