From 40971adc64b1518c9b4876f2961fca7b803c6f41 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Tue, 29 Aug 2023 23:42:43 +0200 Subject: [PATCH 01/32] Add Seed type to `pydvl.utils.types`. --- src/pydvl/utils/types.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/pydvl/utils/types.py b/src/pydvl/utils/types.py index 9c51d629e..d19b5cae3 100644 --- a/src/pydvl/utils/types.py +++ b/src/pydvl/utils/types.py @@ -5,8 +5,9 @@ import inspect from abc import ABCMeta -from typing import Any, Callable, Protocol, TypeVar +from typing import Any, Callable, Optional, Protocol, TypeVar, Union +from numpy.random import Generator, SeedSequence from numpy.typing import NDArray __all__ = ["SupervisedModel", "MapFunction", "ReduceFunction", "NoPublicConstructor"] @@ -92,3 +93,6 @@ def __call__(cls, *args, **kwargs): def create(cls, *args: Any, **kwargs: Any): return super().__call__(*args, **kwargs) + + +Seed = Union[int, Generator] From a798b391d01bfa323a7c9e3faa6435e01e1c0f0a Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Tue, 29 Aug 2023 23:46:41 +0200 Subject: [PATCH 02/32] Add seed parameter to :func:`pydvl.utils.numeric.random_subset`, :func:`pydvl.utils.numeric.random_powerset` and :func:`pydvl.utils.numeric.random_subset_of_size`. Add test case for checking reproduce-ability. --- src/pydvl/utils/numeric.py | 40 ++++++++++++++++++----- tests/conftest.py | 10 ++++++ tests/utils/test_numeric.py | 63 +++++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 7 deletions(-) diff --git a/src/pydvl/utils/numeric.py b/src/pydvl/utils/numeric.py index c639da82b..c9ade3d59 100644 --- a/src/pydvl/utils/numeric.py +++ b/src/pydvl/utils/numeric.py @@ -10,6 +10,8 @@ import numpy as np from numpy.typing import NDArray +from pydvl.utils.types import Seed + __all__ = [ "running_moments", "num_samples_permutation_hoeffding", @@ -19,6 +21,7 @@ "random_powerset", "random_subset_of_size", "top_k_value_accuracy", + "sets_are_equal", ] T = TypeVar("T", bound=np.generic) @@ -59,21 +62,29 @@ def num_samples_permutation_hoeffding(eps: float, delta: float, u_range: float) return int(np.ceil(np.log(2 / delta) * 2 * u_range**2 / eps**2)) -def random_subset(s: NDArray[T], q: float = 0.5) -> NDArray[T]: +def random_subset( + s: NDArray[T], + q: float = 0.5, + seed: Optional[Seed] = None, +) -> NDArray[T]: """Returns one subset at random from ``s``. :param s: set to sample from :param q: Sampling probability for elements. The default 0.5 yields a uniform distribution over the power set of s. + :param seed: Either an instance of a numpy random number generator or a seed for it. :return: the subset """ - rng = np.random.default_rng() + rng = np.random.default_rng(seed) selection = rng.uniform(size=len(s)) > q return s[selection] def random_powerset( - s: NDArray[T], n_samples: Optional[int] = None, q: float = 0.5 + s: NDArray[T], + n_samples: Optional[int] = None, + q: float = 0.5, + seed: Optional[Seed] = None, ) -> Generator[NDArray[T], None, None]: """Samples subsets from the power set of the argument, without pre-generating all subsets and in no order. @@ -91,6 +102,7 @@ def random_powerset( Defaults to `np.iinfo(np.int32).max` :param q: Sampling probability for elements. The default 0.5 yields a uniform distribution over the power set of s. + :param seed: Either an instance of a numpy random number generator or a seed for it. :return: Samples from the power set of s :raises: TypeError: if the data `s` is not a NumPy array @@ -102,30 +114,40 @@ def random_powerset( if q < 0 or q > 1: raise ValueError("Element sampling probability must be in [0,1]") + rng = np.random.default_rng(seed) total = 1 if n_samples is None: n_samples = np.iinfo(np.int32).max while total <= n_samples: - yield random_subset(s, q) + yield random_subset(s, q, seed=rng) total += 1 -def random_subset_of_size(s: NDArray[T], size: int) -> NDArray[T]: +def random_subset_of_size( + s: NDArray[T], + size: int, + seed: Optional[Seed] = None, +) -> NDArray[T]: """Samples a random subset of given size uniformly from the powerset of ``s``. :param s: Set to sample from :param size: Size of the subset to generate + :param seed: Either an instance of a numpy random number generator or a seed for it. :return: The subset :raises ValueError: If size > len(s) """ if size > len(s): raise ValueError("Cannot sample subset larger than set") - rng = np.random.default_rng() + rng = np.random.default_rng(seed) return rng.choice(s, size=size, replace=False) -def random_matrix_with_condition_number(n: int, condition_number: float) -> NDArray: +def random_matrix_with_condition_number( + n: int, + condition_number: float, + # TODO Add seed parameter +) -> NDArray: """Constructs a square matrix with a given condition number. Taken from: @@ -227,3 +249,7 @@ def top_k_value_accuracy( top_k_pred_values = np.argsort(y_pred)[-k:] top_k_accuracy = len(np.intersect1d(top_k_exact_values, top_k_pred_values)) / k return top_k_accuracy + + +def sets_are_equal(subset_1: NDArray[np.int_], subset_2: NDArray[np.int_]): + return list(subset_1) == list(subset_2) diff --git a/tests/conftest.py b/tests/conftest.py index 9d82da504..2cf9de4f8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -59,6 +59,16 @@ def seed(request): return 24 +@pytest.fixture() +def seed_alt(request): + return 42 + + +@pytest.fixture() +def collision_tol(request): + return 0.01 + + @pytest.fixture(autouse=True) def pytorch_seed(seed): try: diff --git a/tests/utils/test_numeric.py b/tests/utils/test_numeric.py index e6101defb..12168455d 100644 --- a/tests/utils/test_numeric.py +++ b/tests/utils/test_numeric.py @@ -1,5 +1,6 @@ import numpy as np import pytest +from numpy._typing import NDArray from pydvl.utils.numeric import ( powerset, @@ -7,6 +8,7 @@ random_powerset, random_subset_of_size, running_moments, + sets_are_equal, ) @@ -68,6 +70,37 @@ def test_random_powerset(n, max_subsets): ) +@pytest.mark.parametrize("n, max_subsets", [(10, 2**10)]) +def test_random_powerset_reproducible(n, max_subsets, seed): + """ + Test that the same seeds produce the same results, and different seeds produce + different results for method :func:`random_powerset`. + """ + s = np.arange(n) + parallel_subset_generators = zip( + random_powerset(s, n_samples=max_subsets, seed=seed), + random_powerset(s, n_samples=max_subsets, seed=seed), + ) + + n_collisions = sum(map(lambda t: sets_are_equal(*t), parallel_subset_generators)) + assert n_collisions == max_subsets + + +@pytest.mark.parametrize("n, max_subsets", [(10, 2**10)]) +def test_random_powerset_stochastic(n, max_subsets, seed, seed_alt, collision_tol): + """ + Test that the same seeds produce the same results, and different seeds produce + different results for method :func:`random_powerset`. + """ + s = np.arange(n) + parallel_subset_generators = zip( + random_powerset(s, n_samples=max_subsets, seed=seed), + random_powerset(s, n_samples=max_subsets, seed=seed_alt), + ) + n_collisions = sum(map(lambda t: sets_are_equal(*t), parallel_subset_generators)) + assert n_collisions / max_subsets < collision_tol + + @pytest.mark.parametrize( "n, size, exception", [(0, 0, None), (0, 1, ValueError), (10, 0, None), (10, 3, None), (1000, 40, None)], @@ -83,6 +116,36 @@ def test_random_subset_of_size(n, size, exception): assert np.all([x in s for x in ss]) +@pytest.mark.parametrize( + "n, size", + [(10, 3), (1000, 40)], +) +def test_random_subset_of_size_stochastic(n, size, seed, seed_alt): + """ + Test that the same seeds produce the same results, and different seeds produce + different results for method :func:`random_subset_of_size`. + """ + s = np.arange(n) + subset_1 = random_subset_of_size(s, size=size, seed=seed) + subset_2 = random_subset_of_size(s, size=size, seed=seed_alt) + assert not sets_are_equal(subset_1, subset_2) + + +@pytest.mark.parametrize( + "n, size", + [(10, 3), (1000, 40)], +) +def test_random_subset_of_size_stochastic(n, size, seed): + """ + Test that the same seeds produce the same results, and different seeds produce + different results for method :func:`random_subset_of_size`. + """ + s = np.arange(n) + subset_1 = random_subset_of_size(s, size=size, seed=seed) + subset_2 = random_subset_of_size(s, size=size, seed=seed) + assert sets_are_equal(subset_1, subset_2) + + @pytest.mark.parametrize( "n, cond, exception", [ From 4e68acf0efa9acb752c624d54548548cad11903f Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Wed, 30 Aug 2023 00:49:34 +0200 Subject: [PATCH 03/32] Add seed parameter to :func:`pydvl.utils.parallel.map_reduce.MapReduceJob.__call__`. Create function :func:`pydvl.utils.parallel.check_fn_accepts_parameter` and :func:`pydvl.utils.types.ensure_seedsequence`. And add test case for testing the reproduce-ability. --- src/pydvl/utils/parallel/check.py | 47 +++++++++++++++++++++++++ src/pydvl/utils/parallel/map_reduce.py | 37 ++++++++++++++++++-- src/pydvl/utils/types.py | 22 ++++++++++++ tests/utils/test_parallel.py | 48 ++++++++++++++++++++++++++ 4 files changed, 151 insertions(+), 3 deletions(-) create mode 100644 src/pydvl/utils/parallel/check.py diff --git a/src/pydvl/utils/parallel/check.py b/src/pydvl/utils/parallel/check.py new file mode 100644 index 000000000..bd95a6fe1 --- /dev/null +++ b/src/pydvl/utils/parallel/check.py @@ -0,0 +1,47 @@ +import inspect +from functools import partial +from typing import Callable + + +def check_fn_accepts_parameter(fn: Callable, param_name: str) -> bool: + """ + Checks whether the given function accepts the given parameter or not. + + :param fn: The function to check. + :param param_name: The name of the parameter to check. + + :return: True if the function accepts the parameter, False otherwise. + """ + args_set_by_partial = set() + + # Recursive function to go through nested functools.partial objects + def check_partial(g: Callable): + nonlocal args_set_by_partial + + # If the function is a functools.partial, get the original function + if isinstance(g, partial): + args_set_by_partial.update(g.keywords.keys()) + args_set_by_partial.update(g.args) + return check_partial(g.func) + else: + return g + + # Get the original function from functools.partial if needed + original_function = check_partial(fn) + + sig = inspect.signature(original_function) + params = sig.parameters + + # Check if the parameter was set by functools.partial + if param_name in args_set_by_partial: + return False + + # Check if the function accepts the specific parameter + if param_name in params: + return True + + # Check if the function accepts **kwargs + if any(p.kind == p.VAR_KEYWORD for p in params.values()): + return True + + return False diff --git a/src/pydvl/utils/parallel/map_reduce.py b/src/pydvl/utils/parallel/map_reduce.py index d40d3f1a6..45f79f3e1 100644 --- a/src/pydvl/utils/parallel/map_reduce.py +++ b/src/pydvl/utils/parallel/map_reduce.py @@ -1,12 +1,21 @@ from itertools import accumulate, repeat from typing import Any, Collection, Dict, Generic, List, Optional, TypeVar, Union +import numpy as np from joblib import Parallel, delayed +from numpy.random import SeedSequence from numpy.typing import NDArray from ..config import ParallelConfig -from ..types import MapFunction, ReduceFunction, maybe_add_argument +from ..types import ( + MapFunction, + ReduceFunction, + Seed, + ensure_seed_sequence, + maybe_add_argument, +) from .backend import init_parallel_backend +from .check import check_fn_accepts_parameter __all__ = ["MapReduceJob"] @@ -98,19 +107,41 @@ def __init__( def __call__( self, + seed: Optional[Seed] = None, ) -> R: + """ + Runs the map-reduce job. + + :param seed: Either an instance of a numpy random number generator or a seed + for it. + + :return: The result of the reduce function. + """ if self.config.backend == "joblib": backend = "loky" else: backend = self.config.backend + # In joblib the levels are reversed. # 0 means no logging and 50 means log everything to stdout verbose = 50 - self.config.logging_level + seed_seq = ensure_seed_sequence(seed) with Parallel(backend=backend, n_jobs=self.n_jobs, verbose=verbose) as parallel: chunks = self._chunkify(self.inputs_, n_chunks=self.n_jobs) + + # Allow functions which don't accept or need a seed parameter. + lst_add_kwargs: List[Dict[str, Union[int, SeedSequence]]] = [ + {"job_id": j} for j in range(len(chunks)) + ] + if check_fn_accepts_parameter(self._map_func, "seed"): + lst_add_kwargs = [ + {**d, **{"seed": seed.entropy}} + for d, seed in zip(lst_add_kwargs, seed_seq.spawn(len(chunks))) + ] + map_results: List[R] = parallel( - delayed(self._map_func)(next_chunk, job_id=j, **self.map_kwargs) - for j, next_chunk in enumerate(chunks) + delayed(self._map_func)(next_chunk, **add_kwargs, **self.map_kwargs) + for next_chunk, add_kwargs in zip(chunks, lst_add_kwargs) ) reduce_results: R = self._reduce_func(map_results, **self.reduce_kwargs) return reduce_results diff --git a/src/pydvl/utils/types.py b/src/pydvl/utils/types.py index d19b5cae3..aa877a196 100644 --- a/src/pydvl/utils/types.py +++ b/src/pydvl/utils/types.py @@ -3,6 +3,7 @@ """ from __future__ import annotations +import functools import inspect from abc import ABCMeta from typing import Any, Callable, Optional, Protocol, TypeVar, Union @@ -60,6 +61,7 @@ def maybe_add_argument(fun: Callable, new_arg: str): if new_arg in params.keys(): return fun + @functools.wraps(fun) def wrapper(*args, **kwargs): try: del kwargs[new_arg] @@ -96,3 +98,23 @@ def create(cls, *args: Any, **kwargs: Any): Seed = Union[int, Generator] + + +def ensure_seed_sequence( + seed: Optional[Union[Seed, SeedSequence]] = None +) -> SeedSequence: + """ + If the passed seed is a SeedSequence object then it is returned as is. If it is + a Generator the internal protected seed sequence from the generator gets extracted. + Otherwise, a new SeedSequence object is created from the passed (optional) seed. + + :param seed: Either an int, a Generator object a SeedSequence object or None. + + :returns: A SeedSequence object. + """ + if isinstance(seed, SeedSequence): + return seed + elif isinstance(seed, Generator): + return seed.bit_generator._seed_seq # noqa + else: + return SeedSequence(seed) diff --git a/tests/utils/test_parallel.py b/tests/utils/test_parallel.py index e3b3fa0fc..29f57eacb 100644 --- a/tests/utils/test_parallel.py +++ b/tests/utils/test_parallel.py @@ -2,6 +2,7 @@ import os import time from functools import partial, reduce +from typing import List, Optional import numpy as np import pytest @@ -9,6 +10,7 @@ from pydvl.utils.parallel import MapReduceJob, init_parallel_backend from pydvl.utils.parallel.backend import effective_n_jobs from pydvl.utils.parallel.futures import init_executor +from pydvl.utils.types import Seed def test_effective_n_jobs(parallel_config, num_workers): @@ -145,6 +147,40 @@ def reduce_func(x, y): assert result == 150 +def test_map_reduce_reproducible(parallel_config, seed, seed_alt): + """ + Test that the same result is obtained when using the same seed. And that different + results are obtained when using different seeds. + """ + + map_reduce_job = MapReduceJob( + None, + map_func=_sum_of_random_integers, + reduce_func=_mean_func, + config=parallel_config, + ) + result_1 = map_reduce_job(seed=seed) + result_2 = map_reduce_job(seed=seed) + assert result_1 == result_2 + + +def test_map_reduce_stochastic(parallel_config, seed, seed_alt): + """ + Test that the same result is obtained when using the same seed. And that different + results are obtained when using different seeds. + """ + + map_reduce_job = MapReduceJob( + None, + map_func=_sum_of_random_integers, + reduce_func=_mean_func, + config=parallel_config, + ) + result_1 = map_reduce_job(seed=seed) + result_2 = map_reduce_job(seed=seed_alt) + assert result_1 != result_2 + + def test_wrap_function(parallel_config, num_workers): if parallel_config.backend != "ray": pytest.skip("Only makes sense for ray") @@ -229,3 +265,15 @@ def test_future_cancellation(parallel_config): future.result() assert time.monotonic() - start < 1 + + +# Helper functions for tests :func:`test_map_reduce_reproducible` and +# :func:`test_map_reduce_stochastic`. +def _sum_of_random_integers(x: None, seed: Optional[Seed] = None): + rng = np.random.default_rng(seed) + values = rng.integers(0, rng.integers(100), 10) + return np.sum(values) + + +def _mean_func(means): + return np.mean(means) From b78dc1418d9506b7b667b11fdb7ed3b2d5ebd178 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Wed, 30 Aug 2023 00:55:39 +0200 Subject: [PATCH 04/32] Add seed parameter to :func:`pydvl.value.result.ValuationResult`. --- src/pydvl/value/result.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/pydvl/value/result.py b/src/pydvl/value/result.py index 219b8ea90..c91b16a26 100644 --- a/src/pydvl/value/result.py +++ b/src/pydvl/value/result.py @@ -66,6 +66,7 @@ from pydvl.utils.dataset import Dataset from pydvl.utils.numeric import running_moments from pydvl.utils.status import Status +from pydvl.utils.types import Seed try: import pandas # Try to import here for the benefit of mypy @@ -657,7 +658,11 @@ def to_dataframe( @classmethod def from_random( - cls, size: int, total: Optional[float] = None, **kwargs + cls, + size: int, + total: Optional[float] = None, + seed: Optional[Seed] = None, + **kwargs, ) -> "ValuationResult": """Creates a :class:`ValuationResult` object and fills it with an array of random values from a uniform distribution in [-1,1]. The values can @@ -668,6 +673,9 @@ def from_random( ("efficiency" property of Shapley values). :param kwargs: Additional options to pass to the constructor of :class:`ValuationResult`. Use to override status, names, etc. + :param seed: Either an instance of a numpy random number generator or a seed for + it. + :return: A valuation result with its status set to :attr:`Status.Converged` by default. :raises ValueError: If ``size`` is less than 1. @@ -678,7 +686,8 @@ def from_random( if size < 1: raise ValueError("Size must be a positive integer") - values = np.random.uniform(low=-1, high=1, size=size) + rng = np.random.default_rng(seed) + values = rng.uniform(low=-1, high=1, size=size) if total is not None: values *= total / np.sum(values) From 743618990993405c0722677a4a058f624e43c549 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Wed, 30 Aug 2023 06:17:44 +0200 Subject: [PATCH 05/32] Add class `pydvl.value.sampler.StochasticSampler` with seed property. Derive stochastic samplers from `pydvl.value.sampler.StochasticSampler`. Add test cases for reproducibility and stochasticity. --- src/pydvl/value/sampler.py | 52 +++++++++++++++++++++----------- src/pydvl/value/semivalues.py | 30 +++++++++++++----- tests/value/test_sampler.py | 57 ++++++++++++++++++++++++++++++++++- 3 files changed, 113 insertions(+), 26 deletions(-) diff --git a/src/pydvl/value/sampler.py b/src/pydvl/value/sampler.py index 069d50574..3d01ba7fd 100644 --- a/src/pydvl/value/sampler.py +++ b/src/pydvl/value/sampler.py @@ -54,6 +54,8 @@ "UniformSampler", ] +from pydvl.utils.types import Seed, ensure_seed_seq + T = TypeVar("T", bound=np.generic) SampleT = Tuple[T, NDArray[T]] Sequence.register(np.ndarray) @@ -142,17 +144,10 @@ def complement(self, exclude: Sequence[T]) -> NDArray[T]: return np.setxor1d(self._indices, exclude) def iterindices(self) -> Iterator[T]: - """Iterates over indices in the order specified at construction. - - FIXME: this is probably not very useful, but I couldn't decide - which method is better - """ + """Iterates over indices in the order specified at construction.""" if self._index_iteration is PowersetSampler.IndexIteration.Sequential: for idx in self._outer_indices: yield idx - elif self._index_iteration is PowersetSampler.IndexIteration.Random: - while True: - yield np.random.choice(self._outer_indices, size=1).item() @overload def __getitem__(self, key: slice) -> PowersetSampler[T]: @@ -205,6 +200,24 @@ def weight(cls, n: int, subset_len: int) -> float: ... +class StochasticSampler(PowersetSampler[T], abc.ABC): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._rng = np.random.default_rng() + + @property + def seed(self) -> int: + return ensure_seed_seq(self._rng).entropy + + @seed.setter + def seed(self, seed: Seed): + """ + :param seed: Either an instance of a numpy random number generator or a seed + for it. + """ + self._rng = np.random.default_rng(seed) + + class DeterministicUniformSampler(PowersetSampler[T]): def __init__(self, indices: NDArray[T], *args, **kwargs): """An iterator to perform uniform deterministic sampling of subsets. @@ -244,7 +257,7 @@ def weight(cls, n: int, subset_len: int) -> float: return float(2 ** (n - 1)) if n > 0 else 1.0 -class UniformSampler(PowersetSampler[T]): +class UniformSampler(StochasticSampler[T]): """An iterator to perform uniform random sampling of subsets. Iterating over every index $i$, either in sequence or at random depending on @@ -269,7 +282,7 @@ class UniformSampler(PowersetSampler[T]): def __iter__(self) -> Iterator[SampleT]: while True: for idx in self.iterindices(): - subset = random_subset(self.complement([idx])) + subset = random_subset(self.complement([idx]), seed=self._rng) yield idx, subset self._n_samples += 1 if self._n_samples == 0: # Empty index set @@ -291,7 +304,7 @@ def __init__(self, indices: NDArray[T], *args, **kwargs): void(indices, args, kwargs) -class AntitheticSampler(PowersetSampler[T]): +class AntitheticSampler(StochasticSampler[T]): """An iterator to perform uniform random sampling of subsets, and their complements. @@ -303,7 +316,7 @@ class AntitheticSampler(PowersetSampler[T]): def __iter__(self) -> Iterator[SampleT]: while True: for idx in self.iterindices(): - subset = random_subset(self.complement([idx])) + subset = random_subset(self.complement([idx]), seed=self._rng) yield idx, subset self._n_samples += 1 yield idx, self.complement(np.concatenate((subset, np.array([idx])))) @@ -316,7 +329,7 @@ def weight(cls, n: int, subset_len: int) -> float: return float(2 ** (n - 1)) if n > 0 else 1.0 -class PermutationSampler(PowersetSampler[T]): +class PermutationSampler(StochasticSampler[T]): """Sample permutations of indices and iterate through each returning increasing subsets, as required for the permutation definition of semi-values. @@ -334,10 +347,11 @@ class PermutationSampler(PowersetSampler[T]): """ def __iter__(self) -> Iterator[SampleT]: - rng = np.random.default_rng() # FIXME: waiting for better rng handling while True: - permutation = rng.permutation(self._indices) + permutation = self._rng.permutation(self._indices) for i, idx in enumerate(permutation): + if not i: + continue yield idx, permutation[:i] self._n_samples += 1 if self._n_samples == 0: # Empty index set @@ -375,7 +389,7 @@ def __iter__(self) -> Iterator[SampleT]: self._n_samples += 1 -class RandomHierarchicalSampler(PowersetSampler[T]): +class RandomHierarchicalSampler(StochasticSampler[T]): """For every index, sample a set size, then a set of that size. .. todo:: @@ -385,8 +399,10 @@ class RandomHierarchicalSampler(PowersetSampler[T]): def __iter__(self) -> Iterator[SampleT]: while True: for idx in self.iterindices(): - k = np.random.choice(np.arange(len(self._indices)), size=1).item() - subset = random_subset_of_size(self.complement([idx]), size=k) + k = self._rng.choice(np.arange(len(self._indices)), size=1).item() + subset = random_subset_of_size( + self.complement([idx]), size=k, seed=self._rng + ) yield idx, subset self._n_samples += 1 if self._n_samples == 0: # Empty index set diff --git a/src/pydvl/value/semivalues.py b/src/pydvl/value/semivalues.py index 52dd88e13..b38e2fff6 100644 --- a/src/pydvl/value/semivalues.py +++ b/src/pydvl/value/semivalues.py @@ -54,7 +54,7 @@ import logging import math from enum import Enum -from typing import Protocol, Tuple, Type, TypeVar, cast +from typing import Optional, Protocol, Tuple, Type, TypeVar, cast import numpy as np import scipy as sp @@ -62,6 +62,7 @@ from tqdm import tqdm from pydvl.utils import ParallelConfig, Utility +from pydvl.utils.types import Seed, ensure_seed_seq from pydvl.value import ValuationResult from pydvl.value.sampler import PermutationSampler, PowersetSampler, SampleT from pydvl.value.stopping import MaxUpdates, StoppingCriterion @@ -186,8 +187,9 @@ def semivalues( return result # Ensure that we always have n_submitted_jobs running + n_remaining_slots = n_submitted_jobs - len(pending) try: - for _ in range(n_submitted_jobs - len(pending)): + for i in range(n_remaining_slots): pending.add( executor.submit( _marginal, @@ -197,8 +199,7 @@ def semivalues( ) ) except StopIteration: - if len(pending) == 0: - return result + return result def shapley_coefficient(n: int, k: int) -> float: @@ -239,6 +240,7 @@ def compute_shapley_semivalues( n_jobs: int = 1, config: ParallelConfig = ParallelConfig(), progress: bool = False, + seed: Optional[Seed] = None, ) -> ValuationResult: """Computes Shapley values for a given utility function. @@ -252,14 +254,17 @@ def compute_shapley_semivalues( :param sampler_t: The sampler type to use. See :mod:`pydvl.value.sampler` for a list. :param n_jobs: Number of parallel jobs to use. + :param seed: Either an instance of a numpy random number generator or a seed for it. :param config: Object configuring parallel computation, with cluster address, number of cpus, etc. :param progress: Whether to display a progress bar. :return: Object with the results. """ + sampler_instance = sampler_t(u.data.indices) + sampler_instance.seed = seed return semivalues( - sampler_t(u.data.indices), + sampler_instance, u, shapley_coefficient, done, @@ -277,6 +282,7 @@ def compute_banzhaf_semivalues( n_jobs: int = 1, config: ParallelConfig = ParallelConfig(), progress: bool = False, + seed: Optional[Seed] = None, ) -> ValuationResult: """Computes Banzhaf values for a given utility function. @@ -288,14 +294,17 @@ def compute_banzhaf_semivalues( :param sampler_t: The sampler type to use. See :mod:`pydvl.value.sampler` for a list. :param n_jobs: Number of parallel jobs to use. + :param seed: Either an instance of a numpy random number generator or a seed for it. :param config: Object configuring parallel computation, with cluster address, number of cpus, etc. :param progress: Whether to display a progress bar. :return: Object with the results. """ + sampler_instance = sampler_t(u.data.indices) + sampler_instance.seed = seed return semivalues( - sampler_t(u.data.indices), + sampler_instance, u, banzhaf_coefficient, done, @@ -315,6 +324,7 @@ def compute_beta_shapley_semivalues( n_jobs: int = 1, config: ParallelConfig = ParallelConfig(), progress: bool = False, + seed: Optional[Seed] = None, ) -> ValuationResult: """Computes Beta Shapley values for a given utility function. @@ -328,14 +338,17 @@ def compute_beta_shapley_semivalues( :param sampler_t: The sampler type to use. See :mod:`pydvl.value.sampler` for a list. :param n_jobs: Number of parallel jobs to use. + :param seed: Either an instance of a numpy random number generator or a seed for it. :param config: Object configuring parallel computation, with cluster address, number of cpus, etc. :param progress: Whether to display a progress bar. :return: Object with the results. """ + sampler_instance = sampler_t(u.data.indices) + sampler_instance.seed = seed return semivalues( - sampler_t(u.data.indices), + sampler_instance, u, beta_coefficient(alpha, beta), done, @@ -364,6 +377,7 @@ def compute_semivalues( mode: SemiValueMode = SemiValueMode.Shapley, sampler_t: Type[PowersetSampler] = PermutationSampler, n_jobs: int = 1, + seed: Optional[Seed] = None, **kwargs, ) -> ValuationResult: """Convenience entry point for most common semi-value computations. @@ -394,12 +408,14 @@ def compute_semivalues( :param sampler_t: The sampler type to use. See :mod:`pydvl.value.sampler` for a list. :param n_jobs: Number of parallel jobs to use. + :param seed: Either an instance of a numpy random number generator or a seed for it. :param kwargs: Additional keyword arguments passed to :func:`~pydvl.value.semivalues.semivalues`. :return: Object with the results. """ sampler_instance = sampler_t(u.data.indices) + sampler_instance.seed = seed if mode == SemiValueMode.Shapley: coefficient = shapley_coefficient elif mode == SemiValueMode.BetaShapley: diff --git a/tests/value/test_sampler.py b/tests/value/test_sampler.py index 570d11ed9..3aa2eef3d 100644 --- a/tests/value/test_sampler.py +++ b/tests/value/test_sampler.py @@ -1,14 +1,18 @@ from itertools import takewhile +from typing import Iterator, List, Type import numpy as np import pytest +from numpy._typing import NDArray -from pydvl.utils import powerset +from pydvl.utils import powerset, sets_are_equal +from pydvl.utils.types import Seed from pydvl.value.sampler import ( AntitheticSampler, DeterministicPermutationSampler, DeterministicUniformSampler, PermutationSampler, + PowersetSampler, RandomHierarchicalSampler, UniformSampler, ) @@ -19,6 +23,7 @@ [ DeterministicUniformSampler, UniformSampler, + DeterministicPermutationSampler, PermutationSampler, AntitheticSampler, RandomHierarchicalSampler, @@ -35,6 +40,44 @@ def test_proper(sampler_class, indices): assert set(subset) in subsets +@pytest.mark.parametrize( + "sampler_class", + [ + UniformSampler, + PermutationSampler, + AntitheticSampler, + RandomHierarchicalSampler, + ], +) +@pytest.mark.parametrize("indices", [(), (list(range(100)))]) +def test_proper_reproducible(sampler_class, indices, seed): + """Test that the sampler is reproducible.""" + samples_1 = _create_seeded_sample_iter(sampler_class, indices, seed) + samples_2 = _create_seeded_sample_iter(sampler_class, indices, seed) + + for (_, subset_1), (_, subset_2) in zip(samples_1, samples_2): + assert sets_are_equal(subset_1, subset_2) + + +@pytest.mark.parametrize( + "sampler_class", + [ + UniformSampler, + PermutationSampler, + AntitheticSampler, + RandomHierarchicalSampler, + ], +) +@pytest.mark.parametrize("indices", [(), (list(range(100)))]) +def test_proper_stochastic(sampler_class, indices, seed, seed_alt): + """Test that the sampler is reproducible.""" + samples_1 = _create_seeded_sample_iter(sampler_class, indices, seed) + samples_2 = _create_seeded_sample_iter(sampler_class, indices, seed_alt) + + for (_, subset_1), (_, subset_2) in zip(samples_1, samples_2): + assert not sets_are_equal(subset_1, subset_2) + + @pytest.mark.parametrize( "sampler_class", [ @@ -70,3 +113,15 @@ def test_chunkify_permutation(sampler_class): # Missing tests for: # - Correct distribution of subsets for random samplers + + +def _create_seeded_sample_iter( + sampler_class: Type[PowersetSampler], + indices: List, + seed: Seed, +) -> Iterator: + max_iterations = len(indices) + sampler = sampler_class(np.array(indices)) + sampler.seed = seed + sample_stream = takewhile(lambda _: sampler.n_samples < max_iterations, sampler) + return sample_stream From 1be1bb02f288acb077d7c80867e28b02821113e8 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Wed, 30 Aug 2023 22:25:22 +0200 Subject: [PATCH 06/32] Merge function fn_accepts_param_name into maybe_add_argument and adapt `map_reduce.py` accordingly. --- .../{parallel/check.py => functional.py} | 52 ++++++++++++------- src/pydvl/utils/parallel/map_reduce.py | 27 ++++------ src/pydvl/utils/types.py | 11 ++-- 3 files changed, 47 insertions(+), 43 deletions(-) rename src/pydvl/utils/{parallel/check.py => functional.py} (51%) diff --git a/src/pydvl/utils/parallel/check.py b/src/pydvl/utils/functional.py similarity index 51% rename from src/pydvl/utils/parallel/check.py rename to src/pydvl/utils/functional.py index bd95a6fe1..186f1167d 100644 --- a/src/pydvl/utils/parallel/check.py +++ b/src/pydvl/utils/functional.py @@ -1,35 +1,24 @@ +from __future__ import annotations + import inspect from functools import partial -from typing import Callable +from typing import Callable, Set, Tuple + +__all__ = ["fn_accepts_param_name"] -def check_fn_accepts_parameter(fn: Callable, param_name: str) -> bool: +def fn_accepts_param_name(fn: Callable, param_name: str) -> bool: """ - Checks whether the given function accepts the given parameter or not. + Checks if a function accepts a given parameter, even if it is set by partial. :param fn: The function to check. :param param_name: The name of the parameter to check. - :return: True if the function accepts the parameter, False otherwise. """ - args_set_by_partial = set() - - # Recursive function to go through nested functools.partial objects - def check_partial(g: Callable): - nonlocal args_set_by_partial - - # If the function is a functools.partial, get the original function - if isinstance(g, partial): - args_set_by_partial.update(g.keywords.keys()) - args_set_by_partial.update(g.args) - return check_partial(g.func) - else: - return g - # Get the original function from functools.partial if needed - original_function = check_partial(fn) + wrapped_fn, args_set_by_partial = _unroll_partial_fn(fn) - sig = inspect.signature(original_function) + sig = inspect.signature(wrapped_fn) params = sig.parameters # Check if the parameter was set by functools.partial @@ -45,3 +34,26 @@ def check_partial(g: Callable): return True return False + + +def _unroll_partial_fn(fn: Callable) -> Tuple[Callable, Set[str]]: + """ + Unroll a function that was set by functools.partial. + + :param fn: Either or a function to unroll. + :return: A tuple of the unrolled function and a set of the parameters that were set + by functools.partial. + """ + args_set_by_partial = set() + + def _rec_unroll_partial_function(g: Callable): + nonlocal args_set_by_partial + + if isinstance(g, partial): + args_set_by_partial.update(g.keywords.keys()) + args_set_by_partial.update(g.args) + return _rec_unroll_partial_function(g.func) + else: + return g + + return _rec_unroll_partial_function(fn), args_set_by_partial diff --git a/src/pydvl/utils/parallel/map_reduce.py b/src/pydvl/utils/parallel/map_reduce.py index 45f79f3e1..0da912b6a 100644 --- a/src/pydvl/utils/parallel/map_reduce.py +++ b/src/pydvl/utils/parallel/map_reduce.py @@ -1,3 +1,4 @@ +from functools import reduce from itertools import accumulate, repeat from typing import Any, Collection, Dict, Generic, List, Optional, TypeVar, Union @@ -11,11 +12,10 @@ MapFunction, ReduceFunction, Seed, - ensure_seed_sequence, + ensure_seed_seq, maybe_add_argument, ) from .backend import init_parallel_backend -from .check import check_fn_accepts_parameter __all__ = ["MapReduceJob"] @@ -102,7 +102,7 @@ def __init__( self.map_kwargs = map_kwargs if map_kwargs is not None else dict() self.reduce_kwargs = reduce_kwargs if reduce_kwargs is not None else dict() - self._map_func = maybe_add_argument(map_func, "job_id") + self._map_func = reduce(maybe_add_argument, ["job_id", "seed"], map_func) self._reduce_func = reduce_func def __call__( @@ -125,23 +125,16 @@ def __call__( # In joblib the levels are reversed. # 0 means no logging and 50 means log everything to stdout verbose = 50 - self.config.logging_level - seed_seq = ensure_seed_sequence(seed) + seed_seq = ensure_seed_seq(seed) with Parallel(backend=backend, n_jobs=self.n_jobs, verbose=verbose) as parallel: chunks = self._chunkify(self.inputs_, n_chunks=self.n_jobs) - - # Allow functions which don't accept or need a seed parameter. - lst_add_kwargs: List[Dict[str, Union[int, SeedSequence]]] = [ - {"job_id": j} for j in range(len(chunks)) - ] - if check_fn_accepts_parameter(self._map_func, "seed"): - lst_add_kwargs = [ - {**d, **{"seed": seed.entropy}} - for d, seed in zip(lst_add_kwargs, seed_seq.spawn(len(chunks))) - ] - map_results: List[R] = parallel( - delayed(self._map_func)(next_chunk, **add_kwargs, **self.map_kwargs) - for next_chunk, add_kwargs in zip(chunks, lst_add_kwargs) + delayed(self._map_func)( + next_chunk, job_id=j, seed=seed.entropy, **self.map_kwargs + ) + for j, (next_chunk, seed) in enumerate( + zip(chunks, seed_seq.spawn(len(chunks))) + ) ) reduce_results: R = self._reduce_func(map_results, **self.reduce_kwargs) return reduce_results diff --git a/src/pydvl/utils/types.py b/src/pydvl/utils/types.py index aa877a196..1cf831827 100644 --- a/src/pydvl/utils/types.py +++ b/src/pydvl/utils/types.py @@ -4,15 +4,17 @@ from __future__ import annotations import functools -import inspect from abc import ABCMeta from typing import Any, Callable, Optional, Protocol, TypeVar, Union from numpy.random import Generator, SeedSequence from numpy.typing import NDArray +from pydvl.utils.functional import fn_accepts_param_name + __all__ = ["SupervisedModel", "MapFunction", "ReduceFunction", "NoPublicConstructor"] + R = TypeVar("R", covariant=True) @@ -57,8 +59,7 @@ def maybe_add_argument(fun: Callable, new_arg: str): (and ignore). :return: A new function accepting one more keyword argument. """ - params = inspect.signature(fun).parameters - if new_arg in params.keys(): + if fn_accepts_param_name(fun, new_arg): return fun @functools.wraps(fun) @@ -100,9 +101,7 @@ def create(cls, *args: Any, **kwargs: Any): Seed = Union[int, Generator] -def ensure_seed_sequence( - seed: Optional[Union[Seed, SeedSequence]] = None -) -> SeedSequence: +def ensure_seed_seq(seed: Optional[Union[Seed, SeedSequence]] = None) -> SeedSequence: """ If the passed seed is a SeedSequence object then it is returned as is. If it is a Generator the internal protected seed sequence from the generator gets extracted. From 7b9c07dc7e055f46fdc7779bc0b74a778d87d6c1 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Wed, 30 Aug 2023 22:26:32 +0200 Subject: [PATCH 07/32] Rename :func:`pydvl.utils.types.ensure_seed_seq` to :func:`pydvl.utils.types.ensure_seed_sequence`. --- src/pydvl/utils/parallel/map_reduce.py | 4 ++-- src/pydvl/utils/types.py | 4 +++- src/pydvl/value/sampler.py | 4 ++-- src/pydvl/value/semivalues.py | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/pydvl/utils/parallel/map_reduce.py b/src/pydvl/utils/parallel/map_reduce.py index 0da912b6a..dfd3de534 100644 --- a/src/pydvl/utils/parallel/map_reduce.py +++ b/src/pydvl/utils/parallel/map_reduce.py @@ -12,7 +12,7 @@ MapFunction, ReduceFunction, Seed, - ensure_seed_seq, + ensure_seed_sequence, maybe_add_argument, ) from .backend import init_parallel_backend @@ -125,7 +125,7 @@ def __call__( # In joblib the levels are reversed. # 0 means no logging and 50 means log everything to stdout verbose = 50 - self.config.logging_level - seed_seq = ensure_seed_seq(seed) + seed_seq = ensure_seed_sequence(seed) with Parallel(backend=backend, n_jobs=self.n_jobs, verbose=verbose) as parallel: chunks = self._chunkify(self.inputs_, n_chunks=self.n_jobs) map_results: List[R] = parallel( diff --git a/src/pydvl/utils/types.py b/src/pydvl/utils/types.py index 1cf831827..957ef8f51 100644 --- a/src/pydvl/utils/types.py +++ b/src/pydvl/utils/types.py @@ -101,7 +101,9 @@ def create(cls, *args: Any, **kwargs: Any): Seed = Union[int, Generator] -def ensure_seed_seq(seed: Optional[Union[Seed, SeedSequence]] = None) -> SeedSequence: +def ensure_seed_sequence( + seed: Optional[Union[Seed, SeedSequence]] = None +) -> SeedSequence: """ If the passed seed is a SeedSequence object then it is returned as is. If it is a Generator the internal protected seed sequence from the generator gets extracted. diff --git a/src/pydvl/value/sampler.py b/src/pydvl/value/sampler.py index 3d01ba7fd..4752c2d03 100644 --- a/src/pydvl/value/sampler.py +++ b/src/pydvl/value/sampler.py @@ -54,7 +54,7 @@ "UniformSampler", ] -from pydvl.utils.types import Seed, ensure_seed_seq +from pydvl.utils.types import Seed, ensure_seed_sequence T = TypeVar("T", bound=np.generic) SampleT = Tuple[T, NDArray[T]] @@ -207,7 +207,7 @@ def __init__(self, *args, **kwargs): @property def seed(self) -> int: - return ensure_seed_seq(self._rng).entropy + return ensure_seed_sequence(self._rng).entropy @seed.setter def seed(self, seed: Seed): diff --git a/src/pydvl/value/semivalues.py b/src/pydvl/value/semivalues.py index b38e2fff6..6c254727c 100644 --- a/src/pydvl/value/semivalues.py +++ b/src/pydvl/value/semivalues.py @@ -62,7 +62,7 @@ from tqdm import tqdm from pydvl.utils import ParallelConfig, Utility -from pydvl.utils.types import Seed, ensure_seed_seq +from pydvl.utils.types import Seed, ensure_seed_sequence from pydvl.value import ValuationResult from pydvl.value.sampler import PermutationSampler, PowersetSampler, SampleT from pydvl.value.stopping import MaxUpdates, StoppingCriterion From fe0383440eba6274c9a96797a157110de5187413 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Wed, 30 Aug 2023 22:35:58 +0200 Subject: [PATCH 08/32] Add StochasticSampler mixin and add back the different typeof index iterations. --- src/pydvl/value/sampler.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/pydvl/value/sampler.py b/src/pydvl/value/sampler.py index 4752c2d03..cdd715ba7 100644 --- a/src/pydvl/value/sampler.py +++ b/src/pydvl/value/sampler.py @@ -43,6 +43,7 @@ from numpy.typing import NDArray from pydvl.utils.numeric import powerset, random_subset, random_subset_of_size +from pydvl.utils.types import Seed, ensure_seed_sequence __all__ = [ "AntitheticSampler", @@ -52,9 +53,9 @@ "PowersetSampler", "RandomHierarchicalSampler", "UniformSampler", + "StochasticSamplerMixin", ] -from pydvl.utils.types import Seed, ensure_seed_sequence T = TypeVar("T", bound=np.generic) SampleT = Tuple[T, NDArray[T]] @@ -144,10 +145,17 @@ def complement(self, exclude: Sequence[T]) -> NDArray[T]: return np.setxor1d(self._indices, exclude) def iterindices(self) -> Iterator[T]: - """Iterates over indices in the order specified at construction.""" + """Iterates over indices in the order specified at construction. + + FIXME: this is probably not very useful, but I couldn't decide + which method is better + """ if self._index_iteration is PowersetSampler.IndexIteration.Sequential: for idx in self._outer_indices: yield idx + elif self._index_iteration is PowersetSampler.IndexIteration.Random: + while True: + yield np.random.choice(self._outer_indices, size=1).item() @overload def __getitem__(self, key: slice) -> PowersetSampler[T]: @@ -200,10 +208,10 @@ def weight(cls, n: int, subset_len: int) -> float: ... -class StochasticSampler(PowersetSampler[T], abc.ABC): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._rng = np.random.default_rng() +class StochasticSamplerMixin: + """Mixin class for samplers which use a random number generator.""" + + _rng = np.random.default_rng() @property def seed(self) -> int: @@ -257,7 +265,7 @@ def weight(cls, n: int, subset_len: int) -> float: return float(2 ** (n - 1)) if n > 0 else 1.0 -class UniformSampler(StochasticSampler[T]): +class UniformSampler(PowersetSampler[T], StochasticSamplerMixin): """An iterator to perform uniform random sampling of subsets. Iterating over every index $i$, either in sequence or at random depending on @@ -304,7 +312,7 @@ def __init__(self, indices: NDArray[T], *args, **kwargs): void(indices, args, kwargs) -class AntitheticSampler(StochasticSampler[T]): +class AntitheticSampler(PowersetSampler[T], StochasticSamplerMixin): """An iterator to perform uniform random sampling of subsets, and their complements. @@ -329,7 +337,7 @@ def weight(cls, n: int, subset_len: int) -> float: return float(2 ** (n - 1)) if n > 0 else 1.0 -class PermutationSampler(StochasticSampler[T]): +class PermutationSampler(PowersetSampler[T], StochasticSamplerMixin): """Sample permutations of indices and iterate through each returning increasing subsets, as required for the permutation definition of semi-values. @@ -389,7 +397,7 @@ def __iter__(self) -> Iterator[SampleT]: self._n_samples += 1 -class RandomHierarchicalSampler(StochasticSampler[T]): +class RandomHierarchicalSampler(PowersetSampler[T], StochasticSamplerMixin): """For every index, sample a set size, then a set of that size. .. todo:: From 615fe34e614dc64711f97a094e7c5e10a93f2f17 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Wed, 30 Aug 2023 22:52:07 +0200 Subject: [PATCH 09/32] Remove sets_are_equal function and replace by `set(a) == set(b)`. --- src/pydvl/utils/numeric.py | 5 ----- tests/utils/test_numeric.py | 40 +++++++++++++++++++++++++++---------- tests/value/test_sampler.py | 7 +++---- 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/src/pydvl/utils/numeric.py b/src/pydvl/utils/numeric.py index c9ade3d59..0cd6d0571 100644 --- a/src/pydvl/utils/numeric.py +++ b/src/pydvl/utils/numeric.py @@ -21,7 +21,6 @@ "random_powerset", "random_subset_of_size", "top_k_value_accuracy", - "sets_are_equal", ] T = TypeVar("T", bound=np.generic) @@ -249,7 +248,3 @@ def top_k_value_accuracy( top_k_pred_values = np.argsort(y_pred)[-k:] top_k_accuracy = len(np.intersect1d(top_k_exact_values, top_k_pred_values)) / k return top_k_accuracy - - -def sets_are_equal(subset_1: NDArray[np.int_], subset_2: NDArray[np.int_]): - return list(subset_1) == list(subset_2) diff --git a/tests/utils/test_numeric.py b/tests/utils/test_numeric.py index 12168455d..212ff2db6 100644 --- a/tests/utils/test_numeric.py +++ b/tests/utils/test_numeric.py @@ -8,8 +8,8 @@ random_powerset, random_subset_of_size, running_moments, - sets_are_equal, ) +from pydvl.utils.types import Seed def test_powerset(): @@ -76,13 +76,9 @@ def test_random_powerset_reproducible(n, max_subsets, seed): Test that the same seeds produce the same results, and different seeds produce different results for method :func:`random_powerset`. """ - s = np.arange(n) - parallel_subset_generators = zip( - random_powerset(s, n_samples=max_subsets, seed=seed), - random_powerset(s, n_samples=max_subsets, seed=seed), + n_collisions = _count_random_powerset_generator_collisions( + n, max_subsets, seed, seed ) - - n_collisions = sum(map(lambda t: sets_are_equal(*t), parallel_subset_generators)) assert n_collisions == max_subsets @@ -92,13 +88,35 @@ def test_random_powerset_stochastic(n, max_subsets, seed, seed_alt, collision_to Test that the same seeds produce the same results, and different seeds produce different results for method :func:`random_powerset`. """ + n_collisions = _count_random_powerset_generator_collisions( + n, max_subsets, seed, seed_alt + ) + assert n_collisions / max_subsets < collision_tol + + +def _count_random_powerset_generator_collisions( + n: int, max_subsets: int, seed: Seed, seed_alt: Seed +): + """ + Count the number of collisions between two generators of random subsets of a set + with `n` elements, each generating `max_subsets` subsets, using two different seeds. + + :param n: number of elements in the set. + :param max_subsets: number of subsets to generate. + :param seed: Seed for the first generator. + :param seed_alt: Seed for the second generator. + + :return: Number of collisions between the two generators. + """ s = np.arange(n) parallel_subset_generators = zip( random_powerset(s, n_samples=max_subsets, seed=seed), random_powerset(s, n_samples=max_subsets, seed=seed_alt), ) - n_collisions = sum(map(lambda t: sets_are_equal(*t), parallel_subset_generators)) - assert n_collisions / max_subsets < collision_tol + n_collisions = sum( + map(lambda t: set(t[0]) == set(t[1]), parallel_subset_generators) + ) + return n_collisions @pytest.mark.parametrize( @@ -128,7 +146,7 @@ def test_random_subset_of_size_stochastic(n, size, seed, seed_alt): s = np.arange(n) subset_1 = random_subset_of_size(s, size=size, seed=seed) subset_2 = random_subset_of_size(s, size=size, seed=seed_alt) - assert not sets_are_equal(subset_1, subset_2) + assert set(subset_1) != set(subset_2) @pytest.mark.parametrize( @@ -143,7 +161,7 @@ def test_random_subset_of_size_stochastic(n, size, seed): s = np.arange(n) subset_1 = random_subset_of_size(s, size=size, seed=seed) subset_2 = random_subset_of_size(s, size=size, seed=seed) - assert sets_are_equal(subset_1, subset_2) + assert set(subset_1) == set(subset_2) @pytest.mark.parametrize( diff --git a/tests/value/test_sampler.py b/tests/value/test_sampler.py index 3aa2eef3d..7b8b06a25 100644 --- a/tests/value/test_sampler.py +++ b/tests/value/test_sampler.py @@ -3,9 +3,8 @@ import numpy as np import pytest -from numpy._typing import NDArray -from pydvl.utils import powerset, sets_are_equal +from pydvl.utils import powerset from pydvl.utils.types import Seed from pydvl.value.sampler import ( AntitheticSampler, @@ -56,7 +55,7 @@ def test_proper_reproducible(sampler_class, indices, seed): samples_2 = _create_seeded_sample_iter(sampler_class, indices, seed) for (_, subset_1), (_, subset_2) in zip(samples_1, samples_2): - assert sets_are_equal(subset_1, subset_2) + assert set(subset_1) == set(subset_2) @pytest.mark.parametrize( @@ -75,7 +74,7 @@ def test_proper_stochastic(sampler_class, indices, seed, seed_alt): samples_2 = _create_seeded_sample_iter(sampler_class, indices, seed_alt) for (_, subset_1), (_, subset_2) in zip(samples_1, samples_2): - assert not sets_are_equal(subset_1, subset_2) + assert set(subset_1) != set(subset_2) @pytest.mark.parametrize( From df24a53a6eede76dcbb39d70b4d9174ebe248710 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Wed, 30 Aug 2023 23:07:12 +0200 Subject: [PATCH 10/32] Change structure so that seed is set over the constructor for the sampler. Introduced StochasticSampler type. --- src/pydvl/value/sampler.py | 46 +++++++++++++++++++++++++---------- src/pydvl/value/semivalues.py | 41 ++++++++++++++++--------------- tests/value/test_sampler.py | 8 +++--- 3 files changed, 59 insertions(+), 36 deletions(-) diff --git a/src/pydvl/value/sampler.py b/src/pydvl/value/sampler.py index cdd715ba7..4f29b1be7 100644 --- a/src/pydvl/value/sampler.py +++ b/src/pydvl/value/sampler.py @@ -36,7 +36,17 @@ import math from enum import Enum from itertools import permutations -from typing import Generic, Iterable, Iterator, Sequence, Tuple, TypeVar, overload +from typing import ( + Generic, + Iterable, + Iterator, + Optional, + Sequence, + Tuple, + TypeVar, + Union, + overload, +) import numpy as np from deprecate import deprecated, void @@ -211,18 +221,7 @@ def weight(cls, n: int, subset_len: int) -> float: class StochasticSamplerMixin: """Mixin class for samplers which use a random number generator.""" - _rng = np.random.default_rng() - - @property - def seed(self) -> int: - return ensure_seed_sequence(self._rng).entropy - - @seed.setter - def seed(self, seed: Seed): - """ - :param seed: Either an instance of a numpy random number generator or a seed - for it. - """ + def __init__(self, seed: Optional[Seed] = None): self._rng = np.random.default_rng(seed) @@ -287,6 +286,10 @@ class UniformSampler(PowersetSampler[T], StochasticSamplerMixin): """ + def __init__(self, *args, seed: Optional[Seed] = None, **kwargs): + super().__init__(*args, **kwargs) + StochasticSamplerMixin.__init__(self, seed=seed) + def __iter__(self) -> Iterator[SampleT]: while True: for idx in self.iterindices(): @@ -321,6 +324,10 @@ class AntitheticSampler(PowersetSampler[T], StochasticSamplerMixin): the set $S$, including the index $i$ itself. """ + def __init__(self, *args, seed: Optional[Seed] = None, **kwargs): + super().__init__(*args, **kwargs) + StochasticSamplerMixin.__init__(self, seed=seed) + def __iter__(self) -> Iterator[SampleT]: while True: for idx in self.iterindices(): @@ -354,6 +361,10 @@ class PermutationSampler(PowersetSampler[T], StochasticSamplerMixin): will be doubled wrt. a "direct" implementation of permutation MC """ + def __init__(self, *args, seed: Optional[Seed] = None, **kwargs): + super().__init__(*args, **kwargs) + StochasticSamplerMixin.__init__(self, seed=seed) + def __iter__(self) -> Iterator[SampleT]: while True: permutation = self._rng.permutation(self._indices) @@ -404,6 +415,10 @@ class RandomHierarchicalSampler(PowersetSampler[T], StochasticSamplerMixin): This is unnecessary, but a step towards proper stratified sampling. """ + def __init__(self, *args, seed: Optional[Seed] = None, **kwargs): + super().__init__(*args, **kwargs) + StochasticSamplerMixin.__init__(self, seed=seed) + def __iter__(self) -> Iterator[SampleT]: while True: for idx in self.iterindices(): @@ -419,3 +434,8 @@ def __iter__(self) -> Iterator[SampleT]: @classmethod def weight(cls, n: int, subset_len: int) -> float: return float(2 ** (n - 1)) if n > 0 else 1.0 + + +StochasticSampler = Union[ + UniformSampler, PermutationSampler, RandomHierarchicalSampler, AntitheticSampler +] diff --git a/src/pydvl/value/semivalues.py b/src/pydvl/value/semivalues.py index 6c254727c..780eaf022 100644 --- a/src/pydvl/value/semivalues.py +++ b/src/pydvl/value/semivalues.py @@ -64,7 +64,12 @@ from pydvl.utils import ParallelConfig, Utility from pydvl.utils.types import Seed, ensure_seed_sequence from pydvl.value import ValuationResult -from pydvl.value.sampler import PermutationSampler, PowersetSampler, SampleT +from pydvl.value.sampler import ( + PermutationSampler, + PowersetSampler, + SampleT, + StochasticSampler, +) from pydvl.value.stopping import MaxUpdates, StoppingCriterion __all__ = [ @@ -187,9 +192,8 @@ def semivalues( return result # Ensure that we always have n_submitted_jobs running - n_remaining_slots = n_submitted_jobs - len(pending) try: - for i in range(n_remaining_slots): + for _ in range(n_submitted_jobs - len(pending)): pending.add( executor.submit( _marginal, @@ -236,7 +240,7 @@ def compute_shapley_semivalues( u: Utility, *, done: StoppingCriterion = MaxUpdates(100), - sampler_t: Type[PowersetSampler] = PermutationSampler, + sampler_t: Type[StochasticSampler] = PermutationSampler, n_jobs: int = 1, config: ParallelConfig = ParallelConfig(), progress: bool = False, @@ -261,10 +265,8 @@ def compute_shapley_semivalues( :return: Object with the results. """ - sampler_instance = sampler_t(u.data.indices) - sampler_instance.seed = seed return semivalues( - sampler_instance, + sampler_t(u.data.indices, seed=seed), u, shapley_coefficient, done, @@ -278,7 +280,7 @@ def compute_banzhaf_semivalues( u: Utility, *, done: StoppingCriterion = MaxUpdates(100), - sampler_t: Type[PowersetSampler] = PermutationSampler, + sampler_t: Type[StochasticSampler] = PermutationSampler, n_jobs: int = 1, config: ParallelConfig = ParallelConfig(), progress: bool = False, @@ -301,10 +303,8 @@ def compute_banzhaf_semivalues( :return: Object with the results. """ - sampler_instance = sampler_t(u.data.indices) - sampler_instance.seed = seed return semivalues( - sampler_instance, + sampler_t(u.data.indices, seed=seed), u, banzhaf_coefficient, done, @@ -320,7 +320,7 @@ def compute_beta_shapley_semivalues( alpha: float = 1, beta: float = 1, done: StoppingCriterion = MaxUpdates(100), - sampler_t: Type[PowersetSampler] = PermutationSampler, + sampler_t: Type[StochasticSampler] = PermutationSampler, n_jobs: int = 1, config: ParallelConfig = ParallelConfig(), progress: bool = False, @@ -345,10 +345,8 @@ def compute_beta_shapley_semivalues( :return: Object with the results. """ - sampler_instance = sampler_t(u.data.indices) - sampler_instance.seed = seed return semivalues( - sampler_instance, + sampler_t(u.data.indices, seed=seed), u, beta_coefficient(alpha, beta), done, @@ -375,7 +373,7 @@ def compute_semivalues( *, done: StoppingCriterion = MaxUpdates(100), mode: SemiValueMode = SemiValueMode.Shapley, - sampler_t: Type[PowersetSampler] = PermutationSampler, + sampler_t: Type[StochasticSampler] = PermutationSampler, n_jobs: int = 1, seed: Optional[Seed] = None, **kwargs, @@ -414,8 +412,6 @@ def compute_semivalues( :return: Object with the results. """ - sampler_instance = sampler_t(u.data.indices) - sampler_instance.seed = seed if mode == SemiValueMode.Shapley: coefficient = shapley_coefficient elif mode == SemiValueMode.BetaShapley: @@ -427,4 +423,11 @@ def compute_semivalues( else: raise ValueError(f"Unknown mode {mode}") coefficient = cast(SVCoefficient, coefficient) - return semivalues(sampler_instance, u, coefficient, done, n_jobs=n_jobs, **kwargs) + return semivalues( + sampler_t(u.data.indices, seed=seed), + u, + coefficient, + done, + n_jobs=n_jobs, + **kwargs, + ) diff --git a/tests/value/test_sampler.py b/tests/value/test_sampler.py index 7b8b06a25..87d9db31d 100644 --- a/tests/value/test_sampler.py +++ b/tests/value/test_sampler.py @@ -1,5 +1,5 @@ from itertools import takewhile -from typing import Iterator, List, Type +from typing import Iterator, List, Type, Union import numpy as np import pytest @@ -13,6 +13,7 @@ PermutationSampler, PowersetSampler, RandomHierarchicalSampler, + StochasticSampler, UniformSampler, ) @@ -115,12 +116,11 @@ def test_chunkify_permutation(sampler_class): def _create_seeded_sample_iter( - sampler_class: Type[PowersetSampler], + sampler_class: Type[StochasticSampler], indices: List, seed: Seed, ) -> Iterator: max_iterations = len(indices) - sampler = sampler_class(np.array(indices)) - sampler.seed = seed + sampler = sampler_class(np.array(indices), seed=seed) sample_stream = takewhile(lambda _: sampler.n_samples < max_iterations, sampler) return sample_stream From 778a7eb73077825d32cd44ad528318a9bd7cd8a5 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Thu, 31 Aug 2023 06:16:46 +0200 Subject: [PATCH 11/32] Add seed parameter to all methods of `pydvl.value.shapley.common`. Added test cased for reproducibility and stochasticity. --- src/pydvl/value/shapley/common.py | 12 +++- src/pydvl/value/shapley/gt.py | 23 +++++-- src/pydvl/value/shapley/montecarlo.py | 39 +++++++++--- src/pydvl/value/shapley/owen.py | 14 +++- tests/value/__init__.py | 9 ++- tests/value/shapley/test_montecarlo.py | 88 ++++++++++++++++++++++++++ 6 files changed, 164 insertions(+), 21 deletions(-) diff --git a/src/pydvl/value/shapley/common.py b/src/pydvl/value/shapley/common.py index fe43274d5..010ae9121 100644 --- a/src/pydvl/value/shapley/common.py +++ b/src/pydvl/value/shapley/common.py @@ -1,4 +1,7 @@ +from typing import Optional + from pydvl.utils import Utility +from pydvl.utils.types import Seed from pydvl.value.result import ValuationResult from pydvl.value.shapley.gt import group_testing_shapley from pydvl.value.shapley.knn import knn_shapley @@ -24,6 +27,7 @@ def compute_shapley_values( done: StoppingCriterion = MaxUpdates(100), mode: ShapleyMode = ShapleyMode.TruncatedMontecarlo, n_jobs: int = 1, + seed: Optional[Seed] = None, **kwargs, ) -> ValuationResult: """Umbrella method to compute Shapley values with any of the available @@ -83,6 +87,8 @@ def compute_shapley_values( using boolean operators. Some methods ignore this argument, others require specific subtypes. :param n_jobs: Number of parallel jobs (available only to some methods) + :param seed: Either an instance of a numpy random number generator or a seed + for it. :param mode: Choose which shapley algorithm to use. See :class:`~pydvl.value.shapley.ShapleyMode` for a list of allowed value. @@ -102,11 +108,11 @@ def compute_shapley_values( ): truncation = kwargs.pop("truncation", NoTruncation()) return permutation_montecarlo_shapley( # type: ignore - u=u, done=done, truncation=truncation, n_jobs=n_jobs, **kwargs + u=u, done=done, truncation=truncation, n_jobs=n_jobs, seed=seed, **kwargs ) elif mode == ShapleyMode.CombinatorialMontecarlo: return combinatorial_montecarlo_shapley( - u, done=done, n_jobs=n_jobs, progress=progress + u, done=done, n_jobs=n_jobs, seed=seed, progress=progress ) elif mode == ShapleyMode.CombinatorialExact: return combinatorial_exact_shapley(u, n_jobs=n_jobs, progress=progress) @@ -129,6 +135,7 @@ def compute_shapley_values( max_q=int(kwargs.get("max_q", -1)), method=method, n_jobs=n_jobs, + seed=seed, ) elif mode == ShapleyMode.KNN: return knn_shapley(u, progress=progress) @@ -147,6 +154,7 @@ def compute_shapley_values( n_samples=n_samples, n_jobs=n_jobs, progress=progress, + seed=seed, **kwargs, ) else: diff --git a/src/pydvl/value/shapley/gt.py b/src/pydvl/value/shapley/gt.py index cc14da315..3c5c75f29 100644 --- a/src/pydvl/value/shapley/gt.py +++ b/src/pydvl/value/shapley/gt.py @@ -17,7 +17,7 @@ """ import logging from collections import namedtuple -from typing import Iterable, Tuple, TypeVar, cast +from typing import Iterable, Optional, Tuple, TypeVar, cast import cvxpy as cp import numpy as np @@ -27,6 +27,7 @@ from pydvl.utils.numeric import random_subset_of_size from pydvl.utils.parallel.backend import effective_n_jobs from pydvl.utils.status import Status +from pydvl.utils.types import Seed, ensure_seed_sequence from pydvl.value import ValuationResult __all__ = ["group_testing_shapley", "num_samples_eps_delta"] @@ -114,7 +115,11 @@ def num_samples_eps_delta( def _group_testing_shapley( - u: Utility, n_samples: int, progress: bool = False, job_id: int = 1 + u: Utility, + n_samples: int, + progress: bool = False, + job_id: int = 1, + seed: Optional[Seed] = None, ): """Helper function for :func:`group_testing_shapley`. @@ -125,9 +130,11 @@ def _group_testing_shapley( :param n_samples: total number of samples (subsets) to use. :param progress: Whether to display progress bars for each job. :param job_id: id to use for reporting progress (e.g. to place progres bars) + :param seed: Either an instance of a numpy random number generator or a seed + for it. :return: """ - rng = np.random.default_rng() + rng = np.random.default_rng(seed) n = len(u.data.indices) const = _constants(n, 1, 1, 1) # don't care about eps,delta,range @@ -138,7 +145,7 @@ def _group_testing_shapley( for t in maybe_progress(n_samples, progress=progress, position=job_id): k = rng.choice(const.kk, size=1, p=const.q).item() - s = random_subset_of_size(u.data.indices, k) + s = random_subset_of_size(u.data.indices, k, seed=rng) uu[t] = u(s) betas[t, s] = 1 return uu, betas @@ -153,6 +160,7 @@ def group_testing_shapley( n_jobs: int = 1, config: ParallelConfig = ParallelConfig(), progress: bool = False, + seed: Optional[Seed] = None, **options, ) -> ValuationResult: """Implements group testing for approximation of Shapley values as described @@ -181,6 +189,8 @@ def group_testing_shapley( :param config: Object configuring parallel computation, with cluster address, number of cpus, etc. :param progress: Whether to display progress bars for each job. + :param seed: Either an instance of a numpy random number generator or a seed + for it. :param options: Additional options to pass to `cvxpy.Problem.solve() `_. E.g. to change the solver (which defaults to `cvxpy.SCS`) pass @@ -219,6 +229,9 @@ def reducer( np.float_ ), np.concatenate(list(x[1] for x in results_it)).astype(np.int_) + seed_sequence = ensure_seed_sequence(seed) + map_reduce_seed_sequence, cvxpy_seed = tuple(seed_sequence.spawn(2)) + map_reduce_job: MapReduceJob[Utility, Tuple[NDArray, NDArray]] = MapReduceJob( u, map_func=_group_testing_shapley, @@ -227,7 +240,7 @@ def reducer( config=config, n_jobs=n_jobs, ) - uu, betas = map_reduce_job() + uu, betas = map_reduce_job(seed=map_reduce_seed_sequence.entropy) # Matrix of estimated differences. See Eqs. (3) and (4) in the paper. C = np.zeros(shape=(n, n)) diff --git a/src/pydvl/value/shapley/montecarlo.py b/src/pydvl/value/shapley/montecarlo.py index 9eb5b640d..08c1a0beb 100644 --- a/src/pydvl/value/shapley/montecarlo.py +++ b/src/pydvl/value/shapley/montecarlo.py @@ -38,7 +38,7 @@ from concurrent.futures import FIRST_COMPLETED, Future, wait from functools import reduce from itertools import cycle, takewhile -from typing import Sequence +from typing import Optional, Sequence import numpy as np from deprecate import deprecated @@ -49,6 +49,7 @@ from pydvl.utils.config import ParallelConfig from pydvl.utils.numeric import random_powerset from pydvl.utils.parallel import CancellationPolicy, MapReduceJob +from pydvl.utils.types import Seed, ensure_seed_sequence from pydvl.utils.utility import Utility from pydvl.value.result import ValuationResult from pydvl.value.shapley.truncated import NoTruncation, TruncationPolicy @@ -60,7 +61,10 @@ def _permutation_montecarlo_one_step( - u: Utility, truncation: TruncationPolicy, algorithm_name: str + u: Utility, + truncation: TruncationPolicy, + algorithm_name: str, + seed: Optional[Seed] = None, ) -> ValuationResult: """Helper function for :func:`permutation_montecarlo_shapley`. @@ -73,16 +77,16 @@ def _permutation_montecarlo_one_step( processing a permutation and set all subsequent marginals to zero. :param algorithm_name: For the results object. Used internally by different variants of Shapley using this subroutine - + :param seed: Either an instance of a numpy random number generator or a seed + for it. :return: An object with the results """ result = ValuationResult.zeros( algorithm=algorithm_name, indices=u.data.indices, data_names=u.data.data_names ) - prev_score = 0.0 - permutation = np.random.permutation(u.data.indices) + permutation = np.random.default_rng(seed).permutation(u.data.indices) permutation_done = False truncation.reset() for i, idx in enumerate(permutation): @@ -121,6 +125,7 @@ def permutation_montecarlo_shapley( n_jobs: int = 1, config: ParallelConfig = ParallelConfig(), progress: bool = False, + seed: Seed = None, ) -> ValuationResult: r"""Computes an approximate Shapley value by sampling independent permutations of the index set, approximating the sum: @@ -159,6 +164,8 @@ def permutation_montecarlo_shapley( interrupt processing a permutation and set all subsequent marginals to zero. Typically used to stop computation when the marginal is small. :param n_jobs: number of jobs across which to distribute the computation. + :param seed: Either an instance of a numpy random number generator or a seed + for it. :param config: Object configuring parallel computation, with cluster address, number of cpus, etc. :param progress: Whether to display progress bars for each job. @@ -171,6 +178,7 @@ def permutation_montecarlo_shapley( max_workers = effective_n_jobs(n_jobs, config) n_submitted_jobs = 2 * max_workers # number of jobs in the executor's queue + seed_sequence = ensure_seed_sequence(seed) result = ValuationResult.zeros(algorithm=algorithm) pbar = tqdm(disable=not progress, total=100, unit="%") @@ -195,9 +203,15 @@ def permutation_montecarlo_shapley( return result # Ensure that we always have n_submitted_jobs in the queue or running - for _ in range(n_submitted_jobs - len(pending)): + n_remaining_slots = n_submitted_jobs - len(pending) + seeds = seed_sequence.spawn(n_remaining_slots) + for i in range(n_remaining_slots): future = executor.submit( - _permutation_montecarlo_one_step, u, truncation, algorithm + _permutation_montecarlo_one_step, + u, + truncation, + algorithm, + seed=seeds[i].entropy, ) pending.add(future) @@ -209,6 +223,7 @@ def _combinatorial_montecarlo_shapley( *, progress: bool = False, job_id: int = 1, + seed: Optional[Seed] = None, ) -> ValuationResult: """Helper function for :func:`combinatorial_montecarlo_shapley`. @@ -221,6 +236,8 @@ def _combinatorial_montecarlo_shapley( subsets for an index. :param progress: Whether to display progress bars for each job. :param job_id: id to use for reporting progress + :param seed: Either an instance of a numpy random number generator or a seed + for it. :return: A tuple of ndarrays with estimated values and standard errors """ n = len(u.data) @@ -236,6 +253,7 @@ def _combinatorial_montecarlo_shapley( data_names=[u.data.data_names[i] for i in indices], ) + rng = np.random.default_rng(seed) repeat_indices = takewhile(lambda _: not done(result), cycle(indices)) pbar = tqdm(disable=not progress, position=job_id, total=100, unit="%") for idx in repeat_indices: @@ -243,7 +261,7 @@ def _combinatorial_montecarlo_shapley( pbar.refresh() # Randomly sample subsets of full dataset without idx subset = np.setxor1d(u.data.indices, [idx], assume_unique=True) - s = next(random_powerset(subset, n_samples=1)) + s = next(random_powerset(subset, n_samples=1, seed=rng)) marginal = (u({idx}.union(s)) - u(s)) / math.comb(n - 1, len(s)) result.update(idx, correction * marginal) @@ -257,6 +275,7 @@ def combinatorial_montecarlo_shapley( n_jobs: int = 1, config: ParallelConfig = ParallelConfig(), progress: bool = False, + seed: Optional[Seed] = None, ) -> ValuationResult: r"""Computes an approximate Shapley value using the combinatorial definition: @@ -281,6 +300,8 @@ def combinatorial_montecarlo_shapley( :param n_jobs: number of parallel jobs across which to distribute the computation. Each worker receives a chunk of :attr:`~pydvl.utils.dataset.Dataset.indices` + :param seed: Either an instance of a numpy random number generator or a seed + for it. :param config: Object configuring parallel computation, with cluster address, number of cpus, etc. :param progress: Whether to display progress bars for each job. @@ -295,4 +316,4 @@ def combinatorial_montecarlo_shapley( n_jobs=n_jobs, config=config, ) - return map_reduce_job() + return map_reduce_job(seed=seed) diff --git a/src/pydvl/value/shapley/owen.py b/src/pydvl/value/shapley/owen.py index ec61b4f4d..b5d7f7d14 100644 --- a/src/pydvl/value/shapley/owen.py +++ b/src/pydvl/value/shapley/owen.py @@ -2,13 +2,14 @@ from enum import Enum from functools import reduce from itertools import cycle, takewhile -from typing import Sequence +from typing import Optional, Sequence import numpy as np from numpy.typing import NDArray from tqdm import tqdm from pydvl.utils import MapReduceJob, ParallelConfig, Utility, random_powerset +from pydvl.utils.types import Seed from pydvl.value import ValuationResult from pydvl.value.stopping import MinUpdates @@ -29,6 +30,7 @@ def _owen_sampling_shapley( *, progress: bool = False, job_id: int = 1, + seed: Optional[Seed] = None ) -> ValuationResult: r"""This is the algorithm as detailed in the paper: to compute the outer integral over q ∈ [0,1], use uniformly distributed points for evaluation @@ -47,6 +49,8 @@ def _owen_sampling_shapley( :param max_q: number of subdivisions for the integration over $q$ :param progress: Whether to display progress bars for each job :param job_id: For positioning of the progress bar + :param seed: Either an instance of a numpy random number generator or a seed + for it. :return: Object with the data values, errors. """ q_stop = {OwenAlgorithm.Standard: 1.0, OwenAlgorithm.Antithetic: 0.5} @@ -58,6 +62,7 @@ def _owen_sampling_shapley( data_names=[u.data.data_names[i] for i in indices], ) + rng = np.random.default_rng(seed) done = MinUpdates(1) repeat_indices = takewhile(lambda _: not done(result), cycle(indices)) pbar = tqdm(disable=not progress, position=job_id, total=100, unit="%") @@ -67,7 +72,7 @@ def _owen_sampling_shapley( e = np.zeros(max_q) subset = np.setxor1d(u.data.indices, [idx], assume_unique=True) for j, q in enumerate(q_steps): - for s in random_powerset(subset, n_samples=n_samples, q=q): + for s in random_powerset(subset, n_samples=n_samples, q=q, seed=rng): marginal = u({idx}.union(s)) - u(s) if method == OwenAlgorithm.Antithetic and q != 0.5: s_complement = np.setxor1d(subset, s, assume_unique=True) @@ -93,6 +98,7 @@ def owen_sampling_shapley( n_jobs: int = 1, config: ParallelConfig = ParallelConfig(), progress: bool = False, + seed: Optional[Seed] = None ) -> ValuationResult: r"""Owen sampling of Shapley values as described in :footcite:t:`okhrati_multilinear_2021`. @@ -138,6 +144,8 @@ def owen_sampling_shapley( :param config: Object configuring parallel computation, with cluster address, number of cpus, etc. :param progress: Whether to display progress bars for each job. + :param seed: Either an instance of a numpy random number generator or a seed + for it. :return: Object with the data values. .. versionadded:: 0.3.0 @@ -161,4 +169,4 @@ def owen_sampling_shapley( config=config, ) - return map_reduce_job() + return map_reduce_job(seed=seed) diff --git a/tests/value/__init__.py b/tests/value/__init__.py index ce16e73e4..4eab5ebc6 100644 --- a/tests/value/__init__.py +++ b/tests/value/__init__.py @@ -46,6 +46,7 @@ def check_values( atol: float = 1e-5, *, extra_values_names: Sequence[str] = tuple(), + equal: bool = True, ): """Compares values in dictionaries. @@ -65,13 +66,17 @@ def check_values( have |value - exact_value| < 0.1 for every value. :param extra_values_names: Sequence of names of extra values that should also be compared. + :param equal: True, if it should be checked that the values are equal, False if it + should be checked that they are not equal. """ values.sort() exact_values.sort() - assert np.allclose(values.values, exact_values.values, rtol=rtol, atol=atol) + assert (not equal) ^ np.allclose( + values.values, exact_values.values, rtol=rtol, atol=atol + ) for name in extra_values_names: - assert np.isclose( + assert (not equal) ^ np.isclose( getattr(values, name), getattr(exact_values, name), rtol=rtol, atol=atol ) diff --git a/tests/value/shapley/test_montecarlo.py b/tests/value/shapley/test_montecarlo.py index 0f92cde29..a46d84147 100644 --- a/tests/value/shapley/test_montecarlo.py +++ b/tests/value/shapley/test_montecarlo.py @@ -7,6 +7,7 @@ from pydvl.utils import GroupedDataset, MemcachedConfig, Status, Utility from pydvl.utils.numeric import num_samples_permutation_hoeffding from pydvl.utils.score import Scorer, squashed_r2 +from pydvl.utils.types import Seed from pydvl.value import compute_shapley_values from pydvl.value.shapley import ShapleyMode from pydvl.value.shapley.naive import combinatorial_exact_shapley @@ -62,6 +63,93 @@ def test_analytic_montecarlo_shapley( check_values(values, exact_values, rtol=rtol, atol=atol) +test_cases_montecarlo_shapley_reproducible_stochastic = [ + (12, ShapleyMode.PermutationMontecarlo, {"done": MaxUpdates(10)}), + # FIXME! it should be enough with 2**(len(data)-1) samples + ( + 8, + ShapleyMode.CombinatorialMontecarlo, + {"done": MaxUpdates(2**10)}, + ), + (12, ShapleyMode.Owen, dict(n_samples=4, max_q=200)), + (12, ShapleyMode.OwenAntithetic, dict(n_samples=4, max_q=200)), + ( + 4, + ShapleyMode.GroupTesting, + dict(n_samples=int(21), epsilon=0.2, delta=0.01), + ), +] + + +@pytest.mark.parametrize( + "num_samples, fun, kwargs", test_cases_montecarlo_shapley_reproducible_stochastic +) +def test_analytical_montecarlo_shapley_reproducible( + num_samples, + analytic_shapley, + parallel_config, + n_jobs, + fun: ShapleyMode, + kwargs: dict, + seed: Seed, +): + u, _ = analytic_shapley + values_1 = compute_shapley_values( + u, + mode=fun, + n_jobs=n_jobs, + config=parallel_config, + progress=False, + seed=seed, + **kwargs + ) + values_2 = compute_shapley_values( + u, + mode=fun, + n_jobs=n_jobs, + config=parallel_config, + progress=False, + seed=seed, + **kwargs + ) + check_values(values_1, values_2, rtol=0.0, atol=0.0) + + +@pytest.mark.parametrize( + "num_samples, fun, kwargs", test_cases_montecarlo_shapley_reproducible_stochastic +) +def test_analytical_montecarlo_shapley_stochastic( + num_samples, + analytic_shapley, + parallel_config, + n_jobs, + fun: ShapleyMode, + kwargs: dict, + seed: Seed, + seed_alt: Seed, +): + u, exact_values = analytic_shapley + values_1 = compute_shapley_values( + u, + mode=fun, + n_jobs=n_jobs, + config=parallel_config, + progress=False, + seed=seed, + **kwargs + ) + values_2 = compute_shapley_values( + u, + mode=fun, + n_jobs=n_jobs, + config=parallel_config, + progress=False, + seed=seed_alt, + **kwargs + ) + check_values(values_1, values_2, rtol=0.0, atol=0.0, equal=False) + + @pytest.mark.parametrize("num_samples, delta, eps", [(8, 0.1, 0.1)]) @pytest.mark.parametrize( "fun", [ShapleyMode.PermutationMontecarlo, ShapleyMode.CombinatorialMontecarlo] From 86a22393941cab43de15fe9534da09de825a92d5 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Thu, 31 Aug 2023 06:41:51 +0200 Subject: [PATCH 12/32] Add seed parameter to `pydvl.utils.numeric.random_matrix_with_condition_number`. Added test cases for reproducibility and stochasticity. --- src/pydvl/utils/numeric.py | 10 +++++----- tests/influence/test_torch_differentiable.py | 2 +- tests/utils/test_numeric.py | 19 +++++++++++++++++++ 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/pydvl/utils/numeric.py b/src/pydvl/utils/numeric.py index 0cd6d0571..9c9cac283 100644 --- a/src/pydvl/utils/numeric.py +++ b/src/pydvl/utils/numeric.py @@ -143,9 +143,7 @@ def random_subset_of_size( def random_matrix_with_condition_number( - n: int, - condition_number: float, - # TODO Add seed parameter + n: int, condition_number: float, seed: Optional[Seed] = None ) -> NDArray: """Constructs a square matrix with a given condition number. @@ -157,6 +155,7 @@ def random_matrix_with_condition_number( :param n: size of the matrix :param condition_number: duh + :param seed: Either an instance of a numpy random number generator or a seed for it. :return: An (n,n) matrix with the requested condition number. """ if n < 2: @@ -165,6 +164,7 @@ def random_matrix_with_condition_number( if condition_number <= 1: raise ValueError("Condition number must be greater than 1") + rng = np.random.default_rng(seed) log_condition_number = np.log(condition_number) exp_vec = np.arange( -log_condition_number / 4.0, @@ -174,8 +174,8 @@ def random_matrix_with_condition_number( exp_vec = exp_vec[:n] s: np.ndarray = np.exp(exp_vec) S = np.diag(s) - U, _ = np.linalg.qr((np.random.rand(n, n) - 5.0) * 200) - V, _ = np.linalg.qr((np.random.rand(n, n) - 5.0) * 200) + U, _ = np.linalg.qr((rng.uniform(size=(n, n)) - 5.0) * 200) + V, _ = np.linalg.qr((rng.uniform(size=(n, n)) - 5.0) * 200) P: np.ndarray = U.dot(S).dot(V.T) P = P.dot(P.T) return P diff --git a/tests/influence/test_torch_differentiable.py b/tests/influence/test_torch_differentiable.py index 621288d6f..2747466a5 100644 --- a/tests/influence/test_torch_differentiable.py +++ b/tests/influence/test_torch_differentiable.py @@ -193,4 +193,4 @@ def test_inversion_methods( ) assert np.allclose(linear_inverse, linear_cg, rtol=1e-1) - assert np.allclose(linear_inverse, linear_lissa, rtol=1e-1) + assert np.allclose(linear_inverse, linear_lissa, rtol=1e-1, atol=2e-1) diff --git a/tests/utils/test_numeric.py b/tests/utils/test_numeric.py index 212ff2db6..77ce63df9 100644 --- a/tests/utils/test_numeric.py +++ b/tests/utils/test_numeric.py @@ -190,6 +190,25 @@ def test_random_matrix_with_condition_number(n, cond, exception): pytest.fail("Matrix is not positive definite") +@pytest.mark.parametrize( + "n, cond", + [ + (2, 10), + (7, 23), + (10, 2), + ], +) +@pytest.mark.parametrize( + "seed,seed_alt", + [(24, 42), (24, 24)], + ids=["different", "same"], +) +def test_random_matrix_with_condition_number_reproducible(n, cond, seed, seed_alt): + mat_1 = random_matrix_with_condition_number(n, cond, seed=seed) + mat_2 = random_matrix_with_condition_number(n, cond, seed=seed_alt) + assert (seed != seed_alt) ^ bool(np.sum(mat_1 != mat_2) == 0) + + def test_running_moments(): """Test that running moments are correct.""" n_samples, n_values = 15, 1000 From 7ff27c0ba10c9c81d6fd42e9a0ff0c00c8fc0804 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Thu, 31 Aug 2023 06:55:00 +0200 Subject: [PATCH 13/32] Fix type hints as noted in https://github.com/appliedAI-Initiative/pyDVL/actions/runs/6033122169/job/16369337328?pr=396. --- src/pydvl/utils/functional.py | 2 +- src/pydvl/utils/types.py | 4 ++-- src/pydvl/value/shapley/gt.py | 2 +- src/pydvl/value/shapley/montecarlo.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/pydvl/utils/functional.py b/src/pydvl/utils/functional.py index 186f1167d..c21c702ae 100644 --- a/src/pydvl/utils/functional.py +++ b/src/pydvl/utils/functional.py @@ -44,7 +44,7 @@ def _unroll_partial_fn(fn: Callable) -> Tuple[Callable, Set[str]]: :return: A tuple of the unrolled function and a set of the parameters that were set by functools.partial. """ - args_set_by_partial = set() + args_set_by_partial: Set[str] = set() def _rec_unroll_partial_function(g: Callable): nonlocal args_set_by_partial diff --git a/src/pydvl/utils/types.py b/src/pydvl/utils/types.py index 957ef8f51..4fdfff832 100644 --- a/src/pydvl/utils/types.py +++ b/src/pydvl/utils/types.py @@ -5,7 +5,7 @@ import functools from abc import ABCMeta -from typing import Any, Callable, Optional, Protocol, TypeVar, Union +from typing import Any, Callable, Optional, Protocol, TypeVar, Union, cast from numpy.random import Generator, SeedSequence from numpy.typing import NDArray @@ -116,6 +116,6 @@ def ensure_seed_sequence( if isinstance(seed, SeedSequence): return seed elif isinstance(seed, Generator): - return seed.bit_generator._seed_seq # noqa + return cast(SeedSequence, seed.bit_generator.seed_seq) # type: ignore else: return SeedSequence(seed) diff --git a/src/pydvl/value/shapley/gt.py b/src/pydvl/value/shapley/gt.py index 3c5c75f29..cf24a08d7 100644 --- a/src/pydvl/value/shapley/gt.py +++ b/src/pydvl/value/shapley/gt.py @@ -240,7 +240,7 @@ def reducer( config=config, n_jobs=n_jobs, ) - uu, betas = map_reduce_job(seed=map_reduce_seed_sequence.entropy) + uu, betas = map_reduce_job(seed=cast(int, map_reduce_seed_sequence.entropy)) # Matrix of estimated differences. See Eqs. (3) and (4) in the paper. C = np.zeros(shape=(n, n)) diff --git a/src/pydvl/value/shapley/montecarlo.py b/src/pydvl/value/shapley/montecarlo.py index 08c1a0beb..f14a2edb4 100644 --- a/src/pydvl/value/shapley/montecarlo.py +++ b/src/pydvl/value/shapley/montecarlo.py @@ -38,7 +38,7 @@ from concurrent.futures import FIRST_COMPLETED, Future, wait from functools import reduce from itertools import cycle, takewhile -from typing import Optional, Sequence +from typing import Optional, Sequence, cast import numpy as np from deprecate import deprecated @@ -211,7 +211,7 @@ def permutation_montecarlo_shapley( u, truncation, algorithm, - seed=seeds[i].entropy, + seed=cast(int, seeds[i].entropy), ) pending.add(future) From 4455ce98002c5f64825866960ac9b079428bcbcd Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Thu, 31 Aug 2023 07:13:48 +0200 Subject: [PATCH 14/32] Split test cases into reproducible and stochastic to match the other cases. --- tests/utils/test_numeric.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/utils/test_numeric.py b/tests/utils/test_numeric.py index 77ce63df9..c8cbcc6be 100644 --- a/tests/utils/test_numeric.py +++ b/tests/utils/test_numeric.py @@ -198,15 +198,24 @@ def test_random_matrix_with_condition_number(n, cond, exception): (10, 2), ], ) +def test_random_matrix_with_condition_number_reproducible(n, cond, seed): + mat_1 = random_matrix_with_condition_number(n, cond, seed=seed) + mat_2 = random_matrix_with_condition_number(n, cond, seed=seed) + assert np.all(mat_1 == mat_2) + + @pytest.mark.parametrize( - "seed,seed_alt", - [(24, 42), (24, 24)], - ids=["different", "same"], + "n, cond", + [ + (2, 10), + (7, 23), + (10, 2), + ], ) -def test_random_matrix_with_condition_number_reproducible(n, cond, seed, seed_alt): +def test_random_matrix_with_condition_number_stochastic(n, cond, seed, seed_alt): mat_1 = random_matrix_with_condition_number(n, cond, seed=seed) mat_2 = random_matrix_with_condition_number(n, cond, seed=seed_alt) - assert (seed != seed_alt) ^ bool(np.sum(mat_1 != mat_2) == 0) + assert np.any(mat_1 != mat_2) def test_running_moments(): From 611cb0d04a131bbc6abfa93f2e447fed91e0e0e8 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Fri, 1 Sep 2023 04:47:00 +0200 Subject: [PATCH 15/32] Remove constructors from samplers by using mixin formalism correctly. --- src/pydvl/value/sampler.py | 32 ++++++++++---------------------- tests/value/test_sampler.py | 4 ++-- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/src/pydvl/value/sampler.py b/src/pydvl/value/sampler.py index 4f29b1be7..7986e9247 100644 --- a/src/pydvl/value/sampler.py +++ b/src/pydvl/value/sampler.py @@ -120,6 +120,7 @@ def __init__( indices: NDArray[T], index_iteration: IndexIteration = IndexIteration.Sequential, outer_indices: NDArray[T] = None, + **kwargs, ): """ :param indices: The set of items (indices) to sample from. @@ -221,7 +222,8 @@ def weight(cls, n: int, subset_len: int) -> float: class StochasticSamplerMixin: """Mixin class for samplers which use a random number generator.""" - def __init__(self, seed: Optional[Seed] = None): + def __init__(self, *args, seed: Optional[Seed] = None, **kwargs): + super().__init__(*args, **kwargs) self._rng = np.random.default_rng(seed) @@ -264,7 +266,7 @@ def weight(cls, n: int, subset_len: int) -> float: return float(2 ** (n - 1)) if n > 0 else 1.0 -class UniformSampler(PowersetSampler[T], StochasticSamplerMixin): +class UniformSampler(StochasticSamplerMixin, PowersetSampler[T]): """An iterator to perform uniform random sampling of subsets. Iterating over every index $i$, either in sequence or at random depending on @@ -286,10 +288,6 @@ class UniformSampler(PowersetSampler[T], StochasticSamplerMixin): """ - def __init__(self, *args, seed: Optional[Seed] = None, **kwargs): - super().__init__(*args, **kwargs) - StochasticSamplerMixin.__init__(self, seed=seed) - def __iter__(self) -> Iterator[SampleT]: while True: for idx in self.iterindices(): @@ -315,7 +313,7 @@ def __init__(self, indices: NDArray[T], *args, **kwargs): void(indices, args, kwargs) -class AntitheticSampler(PowersetSampler[T], StochasticSamplerMixin): +class AntitheticSampler(StochasticSamplerMixin, PowersetSampler[T]): """An iterator to perform uniform random sampling of subsets, and their complements. @@ -324,10 +322,6 @@ class AntitheticSampler(PowersetSampler[T], StochasticSamplerMixin): the set $S$, including the index $i$ itself. """ - def __init__(self, *args, seed: Optional[Seed] = None, **kwargs): - super().__init__(*args, **kwargs) - StochasticSamplerMixin.__init__(self, seed=seed) - def __iter__(self) -> Iterator[SampleT]: while True: for idx in self.iterindices(): @@ -344,7 +338,7 @@ def weight(cls, n: int, subset_len: int) -> float: return float(2 ** (n - 1)) if n > 0 else 1.0 -class PermutationSampler(PowersetSampler[T], StochasticSamplerMixin): +class PermutationSampler(StochasticSamplerMixin, PowersetSampler[T]): """Sample permutations of indices and iterate through each returning increasing subsets, as required for the permutation definition of semi-values. @@ -361,10 +355,6 @@ class PermutationSampler(PowersetSampler[T], StochasticSamplerMixin): will be doubled wrt. a "direct" implementation of permutation MC """ - def __init__(self, *args, seed: Optional[Seed] = None, **kwargs): - super().__init__(*args, **kwargs) - StochasticSamplerMixin.__init__(self, seed=seed) - def __iter__(self) -> Iterator[SampleT]: while True: permutation = self._rng.permutation(self._indices) @@ -408,17 +398,13 @@ def __iter__(self) -> Iterator[SampleT]: self._n_samples += 1 -class RandomHierarchicalSampler(PowersetSampler[T], StochasticSamplerMixin): +class RandomHierarchicalSampler(StochasticSamplerMixin, PowersetSampler[T]): """For every index, sample a set size, then a set of that size. .. todo:: This is unnecessary, but a step towards proper stratified sampling. """ - def __init__(self, *args, seed: Optional[Seed] = None, **kwargs): - super().__init__(*args, **kwargs) - StochasticSamplerMixin.__init__(self, seed=seed) - def __iter__(self) -> Iterator[SampleT]: while True: for idx in self.iterindices(): @@ -436,6 +422,8 @@ def weight(cls, n: int, subset_len: int) -> float: return float(2 ** (n - 1)) if n > 0 else 1.0 +# TODO Replace by Intersection[StochasticSamplerMixin, PowersetSampler[T]] +# See https://github.com/python/typing/issues/213 StochasticSampler = Union[ - UniformSampler, PermutationSampler, RandomHierarchicalSampler, AntitheticSampler + UniformSampler, PermutationSampler, AntitheticSampler, RandomHierarchicalSampler ] diff --git a/tests/value/test_sampler.py b/tests/value/test_sampler.py index 87d9db31d..ed402da1d 100644 --- a/tests/value/test_sampler.py +++ b/tests/value/test_sampler.py @@ -116,11 +116,11 @@ def test_chunkify_permutation(sampler_class): def _create_seeded_sample_iter( - sampler_class: Type[StochasticSampler], + sampler_t: Type[StochasticSampler], indices: List, seed: Seed, ) -> Iterator: max_iterations = len(indices) - sampler = sampler_class(np.array(indices), seed=seed) + sampler = sampler_t(indices=np.array(indices), seed=seed) sample_stream = takewhile(lambda _: sampler.n_samples < max_iterations, sampler) return sample_stream From 12e90abd00430333c33d5cd675b87accf7778a50 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Fri, 1 Sep 2023 04:48:24 +0200 Subject: [PATCH 16/32] Remove comment from functional.py --- src/pydvl/utils/functional.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/pydvl/utils/functional.py b/src/pydvl/utils/functional.py index c21c702ae..ee191cb03 100644 --- a/src/pydvl/utils/functional.py +++ b/src/pydvl/utils/functional.py @@ -21,18 +21,12 @@ def fn_accepts_param_name(fn: Callable, param_name: str) -> bool: sig = inspect.signature(wrapped_fn) params = sig.parameters - # Check if the parameter was set by functools.partial if param_name in args_set_by_partial: return False - # Check if the function accepts the specific parameter if param_name in params: return True - # Check if the function accepts **kwargs - if any(p.kind == p.VAR_KEYWORD for p in params.values()): - return True - return False From 9186d4f8e876ee34bea8fdfb17dd380812832db1 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Fri, 1 Sep 2023 04:49:05 +0200 Subject: [PATCH 17/32] Extract separate method `call_fun_remove_arg`. Integrate backlog class for repetition of experiments. Create test case for semivalues reproducibility. Change from linear dataset to housing dataset. --- src/pydvl/utils/parallel/backlog.py | 37 +++++++++++ src/pydvl/utils/parallel/map_reduce.py | 5 +- src/pydvl/utils/types.py | 47 +++++++++++--- src/pydvl/value/sampler.py | 2 +- src/pydvl/value/semivalues.py | 10 ++- src/pydvl/value/shapley/gt.py | 7 ++- src/pydvl/value/shapley/montecarlo.py | 22 ++++--- tests/value/__init__.py | 7 +-- tests/value/shapley/test_montecarlo.py | 82 +++++++++++------------- tests/value/test_semivalues.py | 87 +++++++++++++++++++++++++- 10 files changed, 228 insertions(+), 78 deletions(-) create mode 100644 src/pydvl/utils/parallel/backlog.py diff --git a/src/pydvl/utils/parallel/backlog.py b/src/pydvl/utils/parallel/backlog.py new file mode 100644 index 000000000..3cd3ea1d0 --- /dev/null +++ b/src/pydvl/utils/parallel/backlog.py @@ -0,0 +1,37 @@ +from copy import copy +from functools import partial +from typing import Callable, Generator, Generic, List, Tuple, TypeVar + +T = TypeVar("T") + + +def wrap_index(*args, i: int, fn: Callable, **kwargs): + return i, fn(*args, **kwargs) + + +class Backlog(Generic[T]): + """A backlog is a queue of items added in no particular order. Each item has an + index used to determine the sequence in which the items are processed. A function + call can be modified using the wrap method to include this index in the output. + This modification should be applied before invoking the function.""" + + def __init__(self): + self._backlog: List[Tuple[int, T]] = [] + self._n_delivered = 0 + self._n_registered = 0 + self._n_wrapped = 0 + + def add(self, item: Tuple[int, T]): + self._backlog.append(item) + self._backlog = sorted(self._backlog, key=lambda t: t[0]) + self._n_registered += 1 + + def get(self) -> Generator[T, None, None]: + while len(self._backlog) > 0 and self._backlog[0][0] == self._n_delivered: + self._n_delivered += 1 + yield self._backlog[0][1] + self._backlog = self._backlog[1:] + + def wrap(self, fn: Callable) -> Callable: + self._n_wrapped += 1 + return partial(wrap_index, fn=fn, i=copy(self._n_wrapped - 1)) diff --git a/src/pydvl/utils/parallel/map_reduce.py b/src/pydvl/utils/parallel/map_reduce.py index dfd3de534..1226089a8 100644 --- a/src/pydvl/utils/parallel/map_reduce.py +++ b/src/pydvl/utils/parallel/map_reduce.py @@ -1,3 +1,4 @@ +import pickle from functools import reduce from itertools import accumulate, repeat from typing import Any, Collection, Dict, Generic, List, Optional, TypeVar, Union @@ -107,7 +108,7 @@ def __init__( def __call__( self, - seed: Optional[Seed] = None, + seed: Optional[Union[Seed, SeedSequence]] = None, ) -> R: """ Runs the map-reduce job. @@ -130,7 +131,7 @@ def __call__( chunks = self._chunkify(self.inputs_, n_chunks=self.n_jobs) map_results: List[R] = parallel( delayed(self._map_func)( - next_chunk, job_id=j, seed=seed.entropy, **self.map_kwargs + next_chunk, job_id=j, seed=seed, **self.map_kwargs ) for j, (next_chunk, seed) in enumerate( zip(chunks, seed_seq.spawn(len(chunks))) diff --git a/src/pydvl/utils/types.py b/src/pydvl/utils/types.py index 4fdfff832..a998a11ba 100644 --- a/src/pydvl/utils/types.py +++ b/src/pydvl/utils/types.py @@ -5,7 +5,8 @@ import functools from abc import ABCMeta -from typing import Any, Callable, Optional, Protocol, TypeVar, Union, cast +from copy import deepcopy +from typing import Any, Callable, Optional, Protocol, Tuple, TypeVar, Union, cast from numpy.random import Generator, SeedSequence from numpy.typing import NDArray @@ -46,6 +47,24 @@ def score(self, x: NDArray, y: NDArray) -> float: pass +def call_fun_remove_arg(*args, fun: Callable, arg: str, **kwargs): + """ + Calls the given function with the given arguments, but removes the given argument. + + :param args: Positional arguments to pass to the function. + :param fun: The function to call. + :param arg: The name of the argument to remove. + :param kwargs: Keyword arguments to pass to the function. + :return: The return value of the function. + """ + try: + del kwargs[arg] + except KeyError: + pass + + return fun(*args, **kwargs) + + def maybe_add_argument(fun: Callable, new_arg: str): """Wraps a function to accept the given keyword parameter if it doesn't already. @@ -62,15 +81,7 @@ def maybe_add_argument(fun: Callable, new_arg: str): if fn_accepts_param_name(fun, new_arg): return fun - @functools.wraps(fun) - def wrapper(*args, **kwargs): - try: - del kwargs[new_arg] - except KeyError: - pass - return fun(*args, **kwargs) - - return wrapper + return functools.partial(call_fun_remove_arg, fun=fun, arg=new_arg) class NoPublicConstructor(ABCMeta): @@ -119,3 +130,19 @@ def ensure_seed_sequence( return cast(SeedSequence, seed.bit_generator.seed_seq) # type: ignore else: return SeedSequence(seed) + + +def call_fn_multiple_seeds( + fn: Callable, *args, seeds: Tuple[Seed, ...], **kwargs +) -> Tuple: + """ + Execute a function multiple times with different seeds. It copies the arguments + and keyword arguments before passing them to the function. + + :param fn: The function to execute. + :param args: The arguments to pass to the function. + :param seeds: The seeds to use. + :param kwargs: The keyword arguments to pass to the function. + :return: A tuple of the results of the function. + """ + return tuple(fn(*deepcopy(args), **deepcopy(kwargs), seed=seed) for seed in seeds) diff --git a/src/pydvl/value/sampler.py b/src/pydvl/value/sampler.py index 7986e9247..121568139 100644 --- a/src/pydvl/value/sampler.py +++ b/src/pydvl/value/sampler.py @@ -53,7 +53,7 @@ from numpy.typing import NDArray from pydvl.utils.numeric import powerset, random_subset, random_subset_of_size -from pydvl.utils.types import Seed, ensure_seed_sequence +from pydvl.utils.types import Seed __all__ = [ "AntitheticSampler", diff --git a/src/pydvl/value/semivalues.py b/src/pydvl/value/semivalues.py index 780eaf022..98bad0ee6 100644 --- a/src/pydvl/value/semivalues.py +++ b/src/pydvl/value/semivalues.py @@ -62,7 +62,8 @@ from tqdm import tqdm from pydvl.utils import ParallelConfig, Utility -from pydvl.utils.types import Seed, ensure_seed_sequence +from pydvl.utils.parallel.backlog import Backlog +from pydvl.utils.types import Seed from pydvl.value import ValuationResult from pydvl.value.sampler import ( PermutationSampler, @@ -175,6 +176,7 @@ def semivalues( sampler_it = iter(sampler) pbar = tqdm(disable=not progress, total=100, unit="%") + backlog = Backlog[Tuple[int, float]]() with init_executor( max_workers=max_workers, config=config, cancel_futures=True @@ -186,7 +188,9 @@ def semivalues( completed, pending = wait(pending, timeout=1, return_when=FIRST_COMPLETED) for future in completed: - idx, marginal = future.result() + backlog.add(future.result()) + + for idx, marginal in backlog.get(): result.update(idx, marginal) if done(result): return result @@ -196,7 +200,7 @@ def semivalues( for _ in range(n_submitted_jobs - len(pending)): pending.add( executor.submit( - _marginal, + backlog.wrap(_marginal), u=u, coefficient=correction, sample=next(sampler_it), diff --git a/src/pydvl/value/shapley/gt.py b/src/pydvl/value/shapley/gt.py index cf24a08d7..188977ec3 100644 --- a/src/pydvl/value/shapley/gt.py +++ b/src/pydvl/value/shapley/gt.py @@ -17,10 +17,11 @@ """ import logging from collections import namedtuple -from typing import Iterable, Optional, Tuple, TypeVar, cast +from typing import Iterable, Optional, Tuple, TypeVar, Union, cast import cvxpy as cp import numpy as np +from numpy.random import SeedSequence from numpy.typing import NDArray from pydvl.utils import MapReduceJob, ParallelConfig, Utility, maybe_progress @@ -119,7 +120,7 @@ def _group_testing_shapley( n_samples: int, progress: bool = False, job_id: int = 1, - seed: Optional[Seed] = None, + seed: Optional[Union[Seed, SeedSequence]] = None, ): """Helper function for :func:`group_testing_shapley`. @@ -240,7 +241,7 @@ def reducer( config=config, n_jobs=n_jobs, ) - uu, betas = map_reduce_job(seed=cast(int, map_reduce_seed_sequence.entropy)) + uu, betas = map_reduce_job(seed=map_reduce_seed_sequence) # Matrix of estimated differences. See Eqs. (3) and (4) in the paper. C = np.zeros(shape=(n, n)) diff --git a/src/pydvl/value/shapley/montecarlo.py b/src/pydvl/value/shapley/montecarlo.py index f14a2edb4..baae5bca7 100644 --- a/src/pydvl/value/shapley/montecarlo.py +++ b/src/pydvl/value/shapley/montecarlo.py @@ -38,10 +38,11 @@ from concurrent.futures import FIRST_COMPLETED, Future, wait from functools import reduce from itertools import cycle, takewhile -from typing import Optional, Sequence, cast +from typing import Optional, Sequence, Union import numpy as np from deprecate import deprecated +from numpy.random import SeedSequence from numpy.typing import NDArray from tqdm import tqdm @@ -49,11 +50,12 @@ from pydvl.utils.config import ParallelConfig from pydvl.utils.numeric import random_powerset from pydvl.utils.parallel import CancellationPolicy, MapReduceJob +from pydvl.utils.parallel.backlog import Backlog from pydvl.utils.types import Seed, ensure_seed_sequence from pydvl.utils.utility import Utility from pydvl.value.result import ValuationResult from pydvl.value.shapley.truncated import NoTruncation, TruncationPolicy -from pydvl.value.stopping import MaxChecks, StoppingCriterion +from pydvl.value.stopping import StoppingCriterion logger = logging.getLogger(__name__) @@ -64,7 +66,7 @@ def _permutation_montecarlo_one_step( u: Utility, truncation: TruncationPolicy, algorithm_name: str, - seed: Optional[Seed] = None, + seed: Optional[Union[Seed, SeedSequence]] = None, ) -> ValuationResult: """Helper function for :func:`permutation_montecarlo_shapley`. @@ -182,6 +184,8 @@ def permutation_montecarlo_shapley( result = ValuationResult.zeros(algorithm=algorithm) pbar = tqdm(disable=not progress, total=100, unit="%") + n_submitted = 0 + backlog = Backlog[ValuationResult]() with init_executor( max_workers=max_workers, config=config, cancel_futures=CancellationPolicy.ALL @@ -194,9 +198,12 @@ def permutation_montecarlo_shapley( completed, pending = wait( pending, timeout=config.wait_timeout, return_when=FIRST_COMPLETED ) - for future in completed: - result += future.result() + backlog.add(future.result()) + + for future_result in backlog.get(): + result += future_result + # we could check outside the loop, but that means more # submissions if the stopping criterion is unstable if done(result): @@ -207,12 +214,13 @@ def permutation_montecarlo_shapley( seeds = seed_sequence.spawn(n_remaining_slots) for i in range(n_remaining_slots): future = executor.submit( - _permutation_montecarlo_one_step, + backlog.wrap(_permutation_montecarlo_one_step), u, truncation, algorithm, - seed=cast(int, seeds[i].entropy), + seed=seeds[i], ) + n_submitted += 1 pending.add(future) diff --git a/tests/value/__init__.py b/tests/value/__init__.py index 4eab5ebc6..795eef798 100644 --- a/tests/value/__init__.py +++ b/tests/value/__init__.py @@ -46,7 +46,6 @@ def check_values( atol: float = 1e-5, *, extra_values_names: Sequence[str] = tuple(), - equal: bool = True, ): """Compares values in dictionaries. @@ -72,11 +71,9 @@ def check_values( values.sort() exact_values.sort() - assert (not equal) ^ np.allclose( - values.values, exact_values.values, rtol=rtol, atol=atol - ) + assert np.allclose(values.values, exact_values.values, rtol=rtol, atol=atol) for name in extra_values_names: - assert (not equal) ^ np.isclose( + assert np.isclose( getattr(values, name), getattr(exact_values, name), rtol=rtol, atol=atol ) diff --git a/tests/value/shapley/test_montecarlo.py b/tests/value/shapley/test_montecarlo.py index a46d84147..4c4032fa9 100644 --- a/tests/value/shapley/test_montecarlo.py +++ b/tests/value/shapley/test_montecarlo.py @@ -1,19 +1,28 @@ import logging +from copy import copy, deepcopy import numpy as np import pytest from sklearn.linear_model import LinearRegression -from pydvl.utils import GroupedDataset, MemcachedConfig, Status, Utility +from pydvl.utils import ( + Dataset, + GroupedDataset, + MemcachedConfig, + ParallelConfig, + Status, + Utility, +) from pydvl.utils.numeric import num_samples_permutation_hoeffding from pydvl.utils.score import Scorer, squashed_r2 -from pydvl.utils.types import Seed +from pydvl.utils.types import Seed, call_fn_multiple_seeds from pydvl.value import compute_shapley_values from pydvl.value.shapley import ShapleyMode from pydvl.value.shapley.naive import combinatorial_exact_shapley from pydvl.value.stopping import MaxChecks, MaxUpdates from .. import check_rank_correlation, check_total_value, check_values +from ..conftest import polynomial_dataset log = logging.getLogger(__name__) @@ -64,12 +73,12 @@ def test_analytic_montecarlo_shapley( test_cases_montecarlo_shapley_reproducible_stochastic = [ - (12, ShapleyMode.PermutationMontecarlo, {"done": MaxUpdates(10)}), + (12, ShapleyMode.PermutationMontecarlo, {"done": MaxChecks(1)}), # FIXME! it should be enough with 2**(len(data)-1) samples ( 8, ShapleyMode.CombinatorialMontecarlo, - {"done": MaxUpdates(2**10)}, + {"done": MaxChecks(1)}, ), (12, ShapleyMode.Owen, dict(n_samples=4, max_q=200)), (12, ShapleyMode.OwenAntithetic, dict(n_samples=4, max_q=200)), @@ -84,70 +93,55 @@ def test_analytic_montecarlo_shapley( @pytest.mark.parametrize( "num_samples, fun, kwargs", test_cases_montecarlo_shapley_reproducible_stochastic ) -def test_analytical_montecarlo_shapley_reproducible( - num_samples, - analytic_shapley, - parallel_config, - n_jobs, +@pytest.mark.parametrize("num_points, num_features", [(12, 3)]) +def test_montecarlo_shapley_housing_dataset_reproducible( + num_samples: int, + housing_dataset: Dataset, + parallel_config: ParallelConfig, + n_jobs: int, fun: ShapleyMode, kwargs: dict, seed: Seed, ): - u, _ = analytic_shapley - values_1 = compute_shapley_values( - u, - mode=fun, - n_jobs=n_jobs, - config=parallel_config, - progress=False, - seed=seed, - **kwargs - ) - values_2 = compute_shapley_values( - u, + values_1, values_2 = call_fn_multiple_seeds( + compute_shapley_values, + Utility(LinearRegression(), data=housing_dataset, scorer="r2"), mode=fun, n_jobs=n_jobs, config=parallel_config, progress=False, - seed=seed, - **kwargs + seeds=(seed, seed), + **deepcopy(kwargs) ) - check_values(values_1, values_2, rtol=0.0, atol=0.0) + np.testing.assert_equal(values_1.values, values_2.values) @pytest.mark.parametrize( "num_samples, fun, kwargs", test_cases_montecarlo_shapley_reproducible_stochastic ) -def test_analytical_montecarlo_shapley_stochastic( - num_samples, - analytic_shapley, - parallel_config, - n_jobs, +@pytest.mark.parametrize("num_points, num_features", [(12, 4)]) +def test_montecarlo_shapley_housing_dataset_stochastic( + num_samples: int, + housing_dataset: Dataset, + parallel_config: ParallelConfig, + n_jobs: int, fun: ShapleyMode, kwargs: dict, seed: Seed, seed_alt: Seed, ): - u, exact_values = analytic_shapley - values_1 = compute_shapley_values( - u, - mode=fun, - n_jobs=n_jobs, - config=parallel_config, - progress=False, - seed=seed, - **kwargs - ) - values_2 = compute_shapley_values( - u, + values_1, values_2 = call_fn_multiple_seeds( + compute_shapley_values, + Utility(LinearRegression(), data=housing_dataset, scorer="r2"), mode=fun, n_jobs=n_jobs, config=parallel_config, progress=False, - seed=seed_alt, - **kwargs + seeds=(seed, seed_alt), + **deepcopy(kwargs) ) - check_values(values_1, values_2, rtol=0.0, atol=0.0, equal=False) + with pytest.raises(AssertionError): + np.testing.assert_equal(values_1.values, values_2.values) @pytest.mark.parametrize("num_samples, delta, eps", [(8, 0.1, 0.1)]) diff --git a/tests/value/test_semivalues.py b/tests/value/test_semivalues.py index bc39deb39..48c355ee2 100644 --- a/tests/value/test_semivalues.py +++ b/tests/value/test_semivalues.py @@ -1,10 +1,13 @@ import math -from typing import Dict, Type +from typing import Type import numpy as np import pytest +from sklearn.linear_model import LinearRegression -from pydvl.utils import ParallelConfig, Utility +from pydvl.utils import Dataset, ParallelConfig, Utility +from pydvl.utils.types import Seed, call_fn_multiple_seeds +from pydvl.value import ValuationResult from pydvl.value.sampler import ( AntitheticSampler, DeterministicPermutationSampler, @@ -20,7 +23,7 @@ semivalues, shapley_coefficient, ) -from pydvl.value.stopping import AbsoluteStandardError, MaxUpdates +from pydvl.value.stopping import AbsoluteStandardError, MaxUpdates, StoppingCriterion from . import check_values @@ -58,6 +61,84 @@ def test_shapley( check_values(values, exact_values, rtol=0.2) +def semivalues_seed_wrapper( + sampler_t: Type[PowersetSampler], u: Utility, *args, seed: Seed, **kwargs +) -> ValuationResult: + """ + Wrapper for semivalues that takes a seed as an argument to be used with + call_fn_multiple_seeds. + """ + sampler = sampler_t(u.data.indices, seed=seed) + return semivalues(sampler, u, *args, **kwargs) + + +@pytest.mark.parametrize("num_samples", [5]) +@pytest.mark.parametrize( + "sampler_t", + [ + UniformSampler, + PermutationSampler, + AntitheticSampler, + ], +) +@pytest.mark.parametrize("coefficient", [shapley_coefficient, beta_coefficient(1, 1)]) +@pytest.mark.parametrize("num_points, num_features", [(12, 3)]) +def test_semivalues_shapley_reproducible( + num_samples: int, + housing_dataset: Dataset, + sampler_t: Type[PowersetSampler], + coefficient: SVCoefficient, + n_jobs: int, + parallel_config: ParallelConfig, + seed: Seed, +): + values_1, values_2 = call_fn_multiple_seeds( + semivalues_seed_wrapper, + sampler_t, + Utility(LinearRegression(), data=housing_dataset, scorer="r2"), + coefficient, + AbsoluteStandardError(0.02, 1.0) | MaxUpdates(2 ** (num_samples * 2)), + n_jobs=n_jobs, + config=parallel_config, + seeds=(seed, seed), + ) + assert np.all(values_1.values == values_2.values) + + +@pytest.mark.parametrize("num_samples", [5]) +@pytest.mark.parametrize( + "sampler_t", + [ + UniformSampler, + PermutationSampler, + AntitheticSampler, + ], +) +@pytest.mark.parametrize("coefficient", [shapley_coefficient, beta_coefficient(1, 1)]) +@pytest.mark.parametrize("num_points, num_features", [(12, 3)]) +def test_semivalues_shapley_stochastic( + num_samples: int, + housing_dataset: Dataset, + sampler_t: Type[PowersetSampler], + coefficient: SVCoefficient, + n_jobs: int, + parallel_config: ParallelConfig, + seed: Seed, + seed_alt: Seed, +): + values_1, values_2 = call_fn_multiple_seeds( + semivalues_seed_wrapper, + sampler_t, + Utility(LinearRegression(), data=housing_dataset, scorer="r2"), + coefficient, + AbsoluteStandardError(0.02, 1.0) | MaxUpdates(2 ** (num_samples * 2)), + n_jobs=n_jobs, + config=parallel_config, + seeds=(seed, seed_alt), + ) + assert np.any(values_1.values != values_2.values) + + @pytest.mark.parametrize("num_samples", [5]) @pytest.mark.parametrize( "sampler", From 26d59bffd31c01726d0b25b0750837ef70a9b1b7 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Fri, 1 Sep 2023 16:45:48 +0200 Subject: [PATCH 18/32] Remove backlog from semivalues.py and montecarlo.py. --- src/pydvl/utils/parallel/backlog.py | 37 --------------------------- src/pydvl/value/semivalues.py | 11 +++----- src/pydvl/value/shapley/montecarlo.py | 12 ++------- 3 files changed, 6 insertions(+), 54 deletions(-) delete mode 100644 src/pydvl/utils/parallel/backlog.py diff --git a/src/pydvl/utils/parallel/backlog.py b/src/pydvl/utils/parallel/backlog.py deleted file mode 100644 index 3cd3ea1d0..000000000 --- a/src/pydvl/utils/parallel/backlog.py +++ /dev/null @@ -1,37 +0,0 @@ -from copy import copy -from functools import partial -from typing import Callable, Generator, Generic, List, Tuple, TypeVar - -T = TypeVar("T") - - -def wrap_index(*args, i: int, fn: Callable, **kwargs): - return i, fn(*args, **kwargs) - - -class Backlog(Generic[T]): - """A backlog is a queue of items added in no particular order. Each item has an - index used to determine the sequence in which the items are processed. A function - call can be modified using the wrap method to include this index in the output. - This modification should be applied before invoking the function.""" - - def __init__(self): - self._backlog: List[Tuple[int, T]] = [] - self._n_delivered = 0 - self._n_registered = 0 - self._n_wrapped = 0 - - def add(self, item: Tuple[int, T]): - self._backlog.append(item) - self._backlog = sorted(self._backlog, key=lambda t: t[0]) - self._n_registered += 1 - - def get(self) -> Generator[T, None, None]: - while len(self._backlog) > 0 and self._backlog[0][0] == self._n_delivered: - self._n_delivered += 1 - yield self._backlog[0][1] - self._backlog = self._backlog[1:] - - def wrap(self, fn: Callable) -> Callable: - self._n_wrapped += 1 - return partial(wrap_index, fn=fn, i=copy(self._n_wrapped - 1)) diff --git a/src/pydvl/value/semivalues.py b/src/pydvl/value/semivalues.py index ecd572a4d..4078ca523 100644 --- a/src/pydvl/value/semivalues.py +++ b/src/pydvl/value/semivalues.py @@ -81,7 +81,6 @@ from tqdm import tqdm from pydvl.utils import ParallelConfig, Utility -from pydvl.utils.parallel.backlog import Backlog from pydvl.utils.types import Seed from pydvl.value import ValuationResult from pydvl.value.sampler import ( @@ -200,7 +199,6 @@ def compute_generic_semivalues( sampler_it = iter(sampler) pbar = tqdm(disable=not progress, total=100, unit="%") - backlog = Backlog[Tuple[int, float]]() with init_executor( max_workers=max_workers, config=config, cancel_futures=True @@ -212,9 +210,7 @@ def compute_generic_semivalues( completed, pending = wait(pending, timeout=1, return_when=FIRST_COMPLETED) for future in completed: - backlog.add(future.result()) - - for idx, marginal in backlog.get(): + idx, marginal = future.result() result.update(idx, marginal) if done(result): return result @@ -224,14 +220,15 @@ def compute_generic_semivalues( for _ in range(n_submitted_jobs - len(pending)): pending.add( executor.submit( - backlog.wrap(_marginal), + _marginal, u=u, coefficient=correction, sample=next(sampler_it), ) ) except StopIteration: - return result + if len(pending) == 0: + return result def shapley_coefficient(n: int, k: int) -> float: diff --git a/src/pydvl/value/shapley/montecarlo.py b/src/pydvl/value/shapley/montecarlo.py index 6ed995feb..eca44139f 100644 --- a/src/pydvl/value/shapley/montecarlo.py +++ b/src/pydvl/value/shapley/montecarlo.py @@ -60,7 +60,6 @@ from pydvl.utils.config import ParallelConfig from pydvl.utils.numeric import random_powerset from pydvl.utils.parallel import CancellationPolicy, MapReduceJob -from pydvl.utils.parallel.backlog import Backlog from pydvl.utils.types import Seed, ensure_seed_sequence from pydvl.utils.utility import Utility from pydvl.value.result import ValuationResult @@ -200,8 +199,6 @@ def permutation_montecarlo_shapley( result = ValuationResult.zeros(algorithm=algorithm) pbar = tqdm(disable=not progress, total=100, unit="%") - n_submitted = 0 - backlog = Backlog[ValuationResult]() with init_executor( max_workers=max_workers, config=config, cancel_futures=CancellationPolicy.ALL @@ -215,11 +212,7 @@ def permutation_montecarlo_shapley( pending, timeout=config.wait_timeout, return_when=FIRST_COMPLETED ) for future in completed: - backlog.add(future.result()) - - for future_result in backlog.get(): - result += future_result - + result += future.result() # we could check outside the loop, but that means more # submissions if the stopping criterion is unstable if done(result): @@ -230,13 +223,12 @@ def permutation_montecarlo_shapley( seeds = seed_sequence.spawn(n_remaining_slots) for i in range(n_remaining_slots): future = executor.submit( - backlog.wrap(_permutation_montecarlo_one_step), + _permutation_montecarlo_one_step, u, truncation, algorithm, seed=seeds[i], ) - n_submitted += 1 pending.add(future) From 4e889f7b6f97656245bb427e511dee6f136da6b1 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Fri, 1 Sep 2023 18:04:34 +0200 Subject: [PATCH 19/32] Remove reproducibility tests from semivalues.py and deactivated affected test cases from test monte carlo. --- src/pydvl/utils/functional.py | 45 +++++--------- src/pydvl/utils/parallel/map_reduce.py | 1 + src/pydvl/utils/types.py | 4 +- tests/utils/test_parallel.py | 2 +- tests/value/shapley/test_montecarlo.py | 15 ++--- tests/value/test_semivalues.py | 85 +------------------------- 6 files changed, 28 insertions(+), 124 deletions(-) diff --git a/src/pydvl/utils/functional.py b/src/pydvl/utils/functional.py index d4f684d9a..55f6a7a2d 100644 --- a/src/pydvl/utils/functional.py +++ b/src/pydvl/utils/functional.py @@ -4,53 +4,36 @@ from functools import partial from typing import Callable, Set, Tuple -__all__ = ["fn_accepts_param_name"] +__all__ = ["unroll_partial_fn_args"] -def fn_accepts_param_name(fn: Callable, param_name: str) -> bool: +def unroll_partial_fn_args(fun: Callable) -> Set[str]: """ - Checks if a function accepts a given parameter, even if it is set by partial. + Unroll a function that was set by functools.partial. Args: - fn: The function to check. - param_name: The name of the parameter to check. + fun: Either or a function to unroll. Returns: - True if the function accepts the parameter, False otherwise. - """ - - wrapped_fn, args_set_by_partial = _unroll_partial_fn(fn) - - sig = inspect.signature(wrapped_fn) - params = sig.parameters - - if param_name in args_set_by_partial: - return False - - if param_name in params: - return True - - return False - - -def _unroll_partial_fn(fn: Callable) -> Tuple[Callable, Set[str]]: - """ - Unroll a function that was set by functools.partial. - - :param fn: Either or a function to unroll. - :return: A tuple of the unrolled function and a set of the parameters that were set - by functools.partial. + A tuple of the unrolled function and a set of the parameters that were set by + functools.partial. """ args_set_by_partial: Set[str] = set() def _rec_unroll_partial_function(g: Callable): + """ + Store arguments and recursively call itself if the function is a partial. In the + end, return the original function. + """ nonlocal args_set_by_partial if isinstance(g, partial): args_set_by_partial.update(g.keywords.keys()) args_set_by_partial.update(g.args) - return _rec_unroll_partial_function(g.func) + return _rec_unroll_partial_function(g.keywords["fun"]) else: return g - return _rec_unroll_partial_function(fn), args_set_by_partial + wrapped_fn = _rec_unroll_partial_function(fun) + sig = inspect.signature(wrapped_fn) + return args_set_by_partial | set(sig.parameters.keys()) diff --git a/src/pydvl/utils/parallel/map_reduce.py b/src/pydvl/utils/parallel/map_reduce.py index f8aa5a4dd..4f0b55af3 100644 --- a/src/pydvl/utils/parallel/map_reduce.py +++ b/src/pydvl/utils/parallel/map_reduce.py @@ -147,6 +147,7 @@ def __call__( zip(chunks, seed_seq.spawn(len(chunks))) ) ) + reduce_results: R = self._reduce_func(map_results, **self.reduce_kwargs) return reduce_results diff --git a/src/pydvl/utils/types.py b/src/pydvl/utils/types.py index 7060b21ef..36eb5ce89 100644 --- a/src/pydvl/utils/types.py +++ b/src/pydvl/utils/types.py @@ -11,7 +11,7 @@ from numpy.random import Generator, SeedSequence from numpy.typing import NDArray -from pydvl.utils.functional import fn_accepts_param_name +from pydvl.utils.functional import unroll_partial_fn_args __all__ = ["SupervisedModel", "MapFunction", "ReduceFunction", "NoPublicConstructor"] @@ -83,7 +83,7 @@ def maybe_add_argument(fun: Callable, new_arg: str) -> Callable: Returns: A new function accepting one more keyword argument. """ - if fn_accepts_param_name(fun, new_arg): + if new_arg in unroll_partial_fn_args(fun): return fun return functools.partial(call_fun_remove_arg, fun=fun, arg=new_arg) diff --git a/tests/utils/test_parallel.py b/tests/utils/test_parallel.py index 29f57eacb..3316fe7c3 100644 --- a/tests/utils/test_parallel.py +++ b/tests/utils/test_parallel.py @@ -147,7 +147,7 @@ def reduce_func(x, y): assert result == 150 -def test_map_reduce_reproducible(parallel_config, seed, seed_alt): +def test_map_reduce_reproducible(parallel_config, seed): """ Test that the same result is obtained when using the same seed. And that different results are obtained when using different seeds. diff --git a/tests/value/shapley/test_montecarlo.py b/tests/value/shapley/test_montecarlo.py index 4c4032fa9..6ec72a8f3 100644 --- a/tests/value/shapley/test_montecarlo.py +++ b/tests/value/shapley/test_montecarlo.py @@ -73,13 +73,14 @@ def test_analytic_montecarlo_shapley( test_cases_montecarlo_shapley_reproducible_stochastic = [ - (12, ShapleyMode.PermutationMontecarlo, {"done": MaxChecks(1)}), - # FIXME! it should be enough with 2**(len(data)-1) samples - ( - 8, - ShapleyMode.CombinatorialMontecarlo, - {"done": MaxChecks(1)}, - ), + # TODO Add once issue #416 is closed. + # (12, ShapleyMode.PermutationMontecarlo, {"done": MaxChecks(1)}), + # # FIXME! it should be enough with 2**(len(data)-1) samples + # ( + # 8, + # ShapleyMode.CombinatorialMontecarlo, + # {"done": MaxChecks(1)}, + # ), (12, ShapleyMode.Owen, dict(n_samples=4, max_q=200)), (12, ShapleyMode.OwenAntithetic, dict(n_samples=4, max_q=200)), ( diff --git a/tests/value/test_semivalues.py b/tests/value/test_semivalues.py index d52f21e4a..ec937d028 100644 --- a/tests/value/test_semivalues.py +++ b/tests/value/test_semivalues.py @@ -3,11 +3,8 @@ import numpy as np import pytest -from sklearn.linear_model import LinearRegression -from pydvl.utils import Dataset, ParallelConfig, Utility -from pydvl.utils.types import Seed, call_fn_multiple_seeds -from pydvl.value import ValuationResult +from pydvl.utils import ParallelConfig from pydvl.value.sampler import ( AntitheticSampler, DeterministicPermutationSampler, @@ -23,7 +20,7 @@ compute_generic_semivalues, shapley_coefficient, ) -from pydvl.value.stopping import AbsoluteStandardError, MaxUpdates, StoppingCriterion +from pydvl.value.stopping import AbsoluteStandardError, MaxUpdates from . import check_values @@ -61,84 +58,6 @@ def test_shapley( check_values(values, exact_values, rtol=0.2) -def semivalues_seed_wrapper( - sampler_t: Type[PowersetSampler], u: Utility, *args, seed: Seed, **kwargs -) -> ValuationResult: - """ - Wrapper for semivalues that takes a seed as an argument to be used with - call_fn_multiple_seeds. - """ - sampler = sampler_t(u.data.indices, seed=seed) - return semivalues(sampler, u, *args, **kwargs) - - -@pytest.mark.parametrize("num_samples", [5]) -@pytest.mark.parametrize( - "sampler_t", - [ - UniformSampler, - PermutationSampler, - AntitheticSampler, - ], -) -@pytest.mark.parametrize("coefficient", [shapley_coefficient, beta_coefficient(1, 1)]) -@pytest.mark.parametrize("num_points, num_features", [(12, 3)]) -def test_semivalues_shapley_reproducible( - num_samples: int, - housing_dataset: Dataset, - sampler_t: Type[PowersetSampler], - coefficient: SVCoefficient, - n_jobs: int, - parallel_config: ParallelConfig, - seed: Seed, -): - values_1, values_2 = call_fn_multiple_seeds( - semivalues_seed_wrapper, - sampler_t, - Utility(LinearRegression(), data=housing_dataset, scorer="r2"), - coefficient, - AbsoluteStandardError(0.02, 1.0) | MaxUpdates(2 ** (num_samples * 2)), - n_jobs=n_jobs, - config=parallel_config, - seeds=(seed, seed), - ) - assert np.all(values_1.values == values_2.values) - - -@pytest.mark.parametrize("num_samples", [5]) -@pytest.mark.parametrize( - "sampler_t", - [ - UniformSampler, - PermutationSampler, - AntitheticSampler, - ], -) -@pytest.mark.parametrize("coefficient", [shapley_coefficient, beta_coefficient(1, 1)]) -@pytest.mark.parametrize("num_points, num_features", [(12, 3)]) -def test_semivalues_shapley_stochastic( - num_samples: int, - housing_dataset: Dataset, - sampler_t: Type[PowersetSampler], - coefficient: SVCoefficient, - n_jobs: int, - parallel_config: ParallelConfig, - seed: Seed, - seed_alt: Seed, -): - values_1, values_2 = call_fn_multiple_seeds( - semivalues_seed_wrapper, - sampler_t, - Utility(LinearRegression(), data=housing_dataset, scorer="r2"), - coefficient, - AbsoluteStandardError(0.02, 1.0) | MaxUpdates(2 ** (num_samples * 2)), - n_jobs=n_jobs, - config=parallel_config, - seeds=(seed, seed_alt), - ) - assert np.any(values_1.values != values_2.values) - - @pytest.mark.parametrize("num_samples", [5]) @pytest.mark.parametrize( "sampler", From c28ce5e8bbb78b99a01d39897c6590e7fa7cd82a Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Fri, 1 Sep 2023 18:18:14 +0200 Subject: [PATCH 20/32] Deactivate test case and add TODO --- tests/value/shapley/test_montecarlo.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/tests/value/shapley/test_montecarlo.py b/tests/value/shapley/test_montecarlo.py index 6ec72a8f3..da079952f 100644 --- a/tests/value/shapley/test_montecarlo.py +++ b/tests/value/shapley/test_montecarlo.py @@ -75,19 +75,14 @@ def test_analytic_montecarlo_shapley( test_cases_montecarlo_shapley_reproducible_stochastic = [ # TODO Add once issue #416 is closed. # (12, ShapleyMode.PermutationMontecarlo, {"done": MaxChecks(1)}), - # # FIXME! it should be enough with 2**(len(data)-1) samples - # ( - # 8, - # ShapleyMode.CombinatorialMontecarlo, - # {"done": MaxChecks(1)}, - # ), - (12, ShapleyMode.Owen, dict(n_samples=4, max_q=200)), - (12, ShapleyMode.OwenAntithetic, dict(n_samples=4, max_q=200)), ( - 4, - ShapleyMode.GroupTesting, - dict(n_samples=int(21), epsilon=0.2, delta=0.01), + 12, + ShapleyMode.CombinatorialMontecarlo, + {"done": MaxChecks(4)}, ), + (12, ShapleyMode.Owen, dict(n_samples=4, max_q=200)), + (12, ShapleyMode.OwenAntithetic, dict(n_samples=4, max_q=200)), + (4, ShapleyMode.GroupTesting, dict(n_samples=int(21), epsilon=0.2, delta=0.01)), ] From 6dee93d62e39b20c492ee710c2a19b8d393c3788 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Fri, 1 Sep 2023 18:54:47 +0200 Subject: [PATCH 21/32] Fix comments and typos. --- src/pydvl/utils/functional.py | 28 ++++++++++++------- src/pydvl/utils/numeric.py | 2 -- src/pydvl/utils/types.py | 37 +++++++------------------- src/pydvl/value/result.py | 3 +-- src/pydvl/value/shapley/common.py | 2 +- src/pydvl/value/shapley/gt.py | 4 +-- tests/value/shapley/test_montecarlo.py | 3 ++- tests/value/utils.py | 25 +++++++++++++++++ 8 files changed, 58 insertions(+), 46 deletions(-) create mode 100644 tests/value/utils.py diff --git a/src/pydvl/utils/functional.py b/src/pydvl/utils/functional.py index 55f6a7a2d..092f042c3 100644 --- a/src/pydvl/utils/functional.py +++ b/src/pydvl/utils/functional.py @@ -2,35 +2,43 @@ import inspect from functools import partial -from typing import Callable, Set, Tuple +from typing import Callable, Set, Tuple, Union -__all__ = ["unroll_partial_fn_args"] +__all__ = ["get_free_args_fn"] -def unroll_partial_fn_args(fun: Callable) -> Set[str]: +def get_free_args_fn(fun: Union[Callable, partial]) -> Set[str]: """ - Unroll a function that was set by functools.partial. + Accept a function or partial definition and return the set of arguments that are + free. An argument is free if it is not set by the partial and is a parameter of the + function. Args: - fun: Either or a function to unroll. + fun: A partial or a function to unroll. Returns: - A tuple of the unrolled function and a set of the parameters that were set by - functools.partial. + A set of arguments that were set by the partial. """ args_set_by_partial: Set[str] = set() - def _rec_unroll_partial_function(g: Callable): + def _rec_unroll_partial_function(g: Union[Callable, partial]) -> Callable: """ Store arguments and recursively call itself if the function is a partial. In the - end, return the original function. + end, return the initial wrapped function. + + Args: + g: A partial or a function to unroll. + + Returns: + Initial wrapped function. """ nonlocal args_set_by_partial if isinstance(g, partial): args_set_by_partial.update(g.keywords.keys()) args_set_by_partial.update(g.args) - return _rec_unroll_partial_function(g.keywords["fun"]) + inner_fn = g.keywords["fn"] if "fn" in g.keywords else g.func + return _rec_unroll_partial_function(inner_fn) else: return g diff --git a/src/pydvl/utils/numeric.py b/src/pydvl/utils/numeric.py index 08c465661..d8b1ce915 100644 --- a/src/pydvl/utils/numeric.py +++ b/src/pydvl/utils/numeric.py @@ -122,8 +122,6 @@ def random_powerset( ValueError: if the element sampling probability is not in [0,1] """ - if not isinstance(s, np.ndarray): - raise TypeError("Set must be an NDArray") if q < 0 or q > 1: raise ValueError("Element sampling probability must be in [0,1]") diff --git a/src/pydvl/utils/types.py b/src/pydvl/utils/types.py index 36eb5ce89..59041b9b9 100644 --- a/src/pydvl/utils/types.py +++ b/src/pydvl/utils/types.py @@ -5,13 +5,12 @@ import functools from abc import ABCMeta -from copy import deepcopy -from typing import Any, Callable, Optional, Protocol, Tuple, TypeVar, Union, cast +from typing import Any, Callable, Optional, Protocol, TypeVar, Union, cast from numpy.random import Generator, SeedSequence from numpy.typing import NDArray -from pydvl.utils.functional import unroll_partial_fn_args +from pydvl.utils.functional import get_free_args_fn __all__ = ["SupervisedModel", "MapFunction", "ReduceFunction", "NoPublicConstructor"] @@ -46,13 +45,14 @@ def score(self, x: NDArray, y: NDArray) -> float: pass -def call_fun_remove_arg(*args, fun: Callable, arg: str, **kwargs): +def call_fun_remove_arg(*args, fn: Callable, arg: str, **kwargs): """ - Calls the given function with the given arguments, but removes the given argument. + Calls the given function with the given arguments. In the process it removes the + specified keyword argument from the keyword arguments. Args: args: Positional arguments to pass to the function. - fun: The function to call. + fn: The function to call. arg: The name of the argument to remove. kwargs: Keyword arguments to pass to the function. @@ -64,7 +64,7 @@ def call_fun_remove_arg(*args, fun: Callable, arg: str, **kwargs): except KeyError: pass - return fun(*args, **kwargs) + return fn(*args, **kwargs) def maybe_add_argument(fun: Callable, new_arg: str) -> Callable: @@ -83,10 +83,10 @@ def maybe_add_argument(fun: Callable, new_arg: str) -> Callable: Returns: A new function accepting one more keyword argument. """ - if new_arg in unroll_partial_fn_args(fun): + if new_arg in get_free_args_fn(fun): return fun - return functools.partial(call_fun_remove_arg, fun=fun, arg=new_arg) + return functools.partial(call_fun_remove_arg, fn=fun, arg=new_arg) class NoPublicConstructor(ABCMeta): @@ -137,22 +137,3 @@ def ensure_seed_sequence( return cast(SeedSequence, seed.bit_generator.seed_seq) # type: ignore else: return SeedSequence(seed) - - -def call_fn_multiple_seeds( - fn: Callable, *args, seeds: Tuple[Seed, ...], **kwargs -) -> Tuple: - """ - Execute a function multiple times with different seeds. It copies the arguments - and keyword arguments before passing them to the function. - - Args: - fn: The function to execute. - args: The arguments to pass to the function. - seeds: The seeds to use. - kwargs: The keyword arguments to pass to the function. - - Returns: - A tuple of the results of the function. - """ - return tuple(fn(*deepcopy(args), **deepcopy(kwargs), seed=seed) for seed in seeds) diff --git a/src/pydvl/value/result.py b/src/pydvl/value/result.py index 6fc8fff2f..1d12b218e 100644 --- a/src/pydvl/value/result.py +++ b/src/pydvl/value/result.py @@ -695,8 +695,7 @@ def from_random( ("efficiency" property of Shapley values). kwargs: Additional options to pass to the constructor of [ValuationResult][pydvl.value.result.ValuationResult]. Use to override status, names, etc. - seed: ither an instance of a numpy random number generator or a seed for - it. + Returns: A valuation result with its status set to [Status.Converged][pydvl.utils.status.Status] by default. diff --git a/src/pydvl/value/shapley/common.py b/src/pydvl/value/shapley/common.py index fb69847ce..8f328712c 100644 --- a/src/pydvl/value/shapley/common.py +++ b/src/pydvl/value/shapley/common.py @@ -90,7 +90,7 @@ def compute_shapley_values( others require specific subtypes. n_jobs: Number of parallel jobs (available only to some methods) seed: Either an instance of a numpy random number generator or a seed - for it. + for it. mode: Choose which shapley algorithm to use. See [ShapleyMode][pydvl.value.shapley.ShapleyMode] for a list of allowed value. diff --git a/src/pydvl/value/shapley/gt.py b/src/pydvl/value/shapley/gt.py index cc3284421..1b4b2f3e7 100644 --- a/src/pydvl/value/shapley/gt.py +++ b/src/pydvl/value/shapley/gt.py @@ -141,7 +141,7 @@ def _group_testing_shapley( progress: Whether to display progress bars for each job. job_id: id to use for reporting progress (e.g. to place progres bars) seed: Either an instance of a numpy random number generator or a seed - for it. + for it. Returns: """ @@ -203,7 +203,7 @@ def group_testing_shapley( address, number of cpus, etc. progress: Whether to display progress bars for each job. seed: Either an instance of a numpy random number generator or a seed - for it. + for it. options: Additional options to pass to [cvxpy.Problem.solve()](https://www.cvxpy.org/tutorial/advanced/index.html#solve-method-options). E.g. to change the solver (which defaults to `cvxpy.SCS`) pass diff --git a/tests/value/shapley/test_montecarlo.py b/tests/value/shapley/test_montecarlo.py index da079952f..4df932935 100644 --- a/tests/value/shapley/test_montecarlo.py +++ b/tests/value/shapley/test_montecarlo.py @@ -15,7 +15,7 @@ ) from pydvl.utils.numeric import num_samples_permutation_hoeffding from pydvl.utils.score import Scorer, squashed_r2 -from pydvl.utils.types import Seed, call_fn_multiple_seeds +from pydvl.utils.types import Seed from pydvl.value import compute_shapley_values from pydvl.value.shapley import ShapleyMode from pydvl.value.shapley.naive import combinatorial_exact_shapley @@ -23,6 +23,7 @@ from .. import check_rank_correlation, check_total_value, check_values from ..conftest import polynomial_dataset +from ..utils import call_fn_multiple_seeds log = logging.getLogger(__name__) diff --git a/tests/value/utils.py b/tests/value/utils.py new file mode 100644 index 000000000..7c38e344f --- /dev/null +++ b/tests/value/utils.py @@ -0,0 +1,25 @@ +from __future__ import annotations + +from copy import deepcopy +from typing import Callable, Tuple + +from pydvl.utils.types import Seed + + +def call_fn_multiple_seeds( + fn: Callable, *args, seeds: Tuple[Seed, ...], **kwargs +) -> Tuple: + """ + Execute a function multiple times with different seeds. It copies the arguments + and keyword arguments before passing them to the function. + + Args: + fn: The function to execute. + args: The arguments to pass to the function. + seeds: The seeds to use. + kwargs: The keyword arguments to pass to the function. + + Returns: + A tuple of the results of the function. + """ + return tuple(fn(*deepcopy(args), **deepcopy(kwargs), seed=seed) for seed in seeds) From 3a053d37b1e283966a50dd16b8fabdd059c48b2c Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Fri, 1 Sep 2023 19:12:58 +0200 Subject: [PATCH 22/32] Optimized function names and further extended one comment. --- src/pydvl/utils/functional.py | 42 +++++++++++++++++++++++++++++------ src/pydvl/utils/types.py | 26 ++-------------------- 2 files changed, 37 insertions(+), 31 deletions(-) diff --git a/src/pydvl/utils/functional.py b/src/pydvl/utils/functional.py index 092f042c3..37d9a18ea 100644 --- a/src/pydvl/utils/functional.py +++ b/src/pydvl/utils/functional.py @@ -4,7 +4,30 @@ from functools import partial from typing import Callable, Set, Tuple, Union -__all__ = ["get_free_args_fn"] +__all__ = ["get_free_args_fn", "fn_accept_additional_argument"] + + +def fn_accept_additional_argument(*args, fn: Callable, arg: str, **kwargs): + """ + Calls the given function with the given arguments. In the process of calling the + wrapped function, it removes the specified keyword argument from the passed keyword + arguments. This function can be pickled by `pickle` as it is on the . + + Args: + args: Positional arguments to pass to the function. + fn: The function to call. + arg: The name of the argument to remove. + kwargs: Keyword arguments to pass to the function. + + Returns: + The return value of the function. + """ + try: + del kwargs[arg] + except KeyError: + pass + + return fn(*args, **kwargs) def get_free_args_fn(fun: Union[Callable, partial]) -> Set[str]: @@ -21,10 +44,11 @@ def get_free_args_fn(fun: Union[Callable, partial]) -> Set[str]: """ args_set_by_partial: Set[str] = set() - def _rec_unroll_partial_function(g: Union[Callable, partial]) -> Callable: + def _rec_unroll_partial_function_args(g: Union[Callable, partial]) -> Callable: """ Store arguments and recursively call itself if the function is a partial. In the - end, return the initial wrapped function. + end, return the initial wrapped function. Besides partial functions it also + supports `partial(fn_accept_additional_argument, *args, **kwargs)` constructs. Args: g: A partial or a function to unroll. @@ -34,14 +58,18 @@ def _rec_unroll_partial_function(g: Union[Callable, partial]) -> Callable: """ nonlocal args_set_by_partial - if isinstance(g, partial): + if isinstance(g, partial) and g.func == fn_accept_additional_argument: + arg = g.keywords["arg"] + if arg in args_set_by_partial: + args_set_by_partial.remove(arg) + return _rec_unroll_partial_function_args(g.keywords["fn"]) + elif isinstance(g, partial): args_set_by_partial.update(g.keywords.keys()) args_set_by_partial.update(g.args) - inner_fn = g.keywords["fn"] if "fn" in g.keywords else g.func - return _rec_unroll_partial_function(inner_fn) + return _rec_unroll_partial_function_args(g.func) else: return g - wrapped_fn = _rec_unroll_partial_function(fun) + wrapped_fn = _rec_unroll_partial_function_args(fun) sig = inspect.signature(wrapped_fn) return args_set_by_partial | set(sig.parameters.keys()) diff --git a/src/pydvl/utils/types.py b/src/pydvl/utils/types.py index 59041b9b9..c3acf410c 100644 --- a/src/pydvl/utils/types.py +++ b/src/pydvl/utils/types.py @@ -10,7 +10,7 @@ from numpy.random import Generator, SeedSequence from numpy.typing import NDArray -from pydvl.utils.functional import get_free_args_fn +from pydvl.utils.functional import fn_accept_additional_argument, get_free_args_fn __all__ = ["SupervisedModel", "MapFunction", "ReduceFunction", "NoPublicConstructor"] @@ -45,28 +45,6 @@ def score(self, x: NDArray, y: NDArray) -> float: pass -def call_fun_remove_arg(*args, fn: Callable, arg: str, **kwargs): - """ - Calls the given function with the given arguments. In the process it removes the - specified keyword argument from the keyword arguments. - - Args: - args: Positional arguments to pass to the function. - fn: The function to call. - arg: The name of the argument to remove. - kwargs: Keyword arguments to pass to the function. - - Returns: - The return value of the function. - """ - try: - del kwargs[arg] - except KeyError: - pass - - return fn(*args, **kwargs) - - def maybe_add_argument(fun: Callable, new_arg: str) -> Callable: """Wraps a function to accept the given keyword parameter if it doesn't already. @@ -86,7 +64,7 @@ def maybe_add_argument(fun: Callable, new_arg: str) -> Callable: if new_arg in get_free_args_fn(fun): return fun - return functools.partial(call_fun_remove_arg, fn=fun, arg=new_arg) + return functools.partial(fn_accept_additional_argument, fn=fun, arg=new_arg) class NoPublicConstructor(ABCMeta): From d3fb8ac1b33a4cba69b1d7035d557feb62a6e146 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Fri, 1 Sep 2023 21:40:14 +0200 Subject: [PATCH 23/32] Add extended documentation. --- src/pydvl/utils/functional.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/src/pydvl/utils/functional.py b/src/pydvl/utils/functional.py index 37d9a18ea..e2293c36f 100644 --- a/src/pydvl/utils/functional.py +++ b/src/pydvl/utils/functional.py @@ -32,12 +32,34 @@ def fn_accept_additional_argument(*args, fn: Callable, arg: str, **kwargs): def get_free_args_fn(fun: Union[Callable, partial]) -> Set[str]: """ - Accept a function or partial definition and return the set of arguments that are - free. An argument is free if it is not set by the partial and is a parameter of the - function. + Accept a function or a partial definition and return the set of arguments that are + free. An argument is considered free if it is not set by the partial and is a + parameter of the function. Formally, this can be described as follows: + + Let phi be the function that returns the set of arguments that are set by a given + function. For functions f and g, one can extend the function phi recursively to psi + as follows: + + a) If `f = partial(g, **kwargs)`, two cases arise: + a) If `g = fn_accept_additional_argument`, then + `psi(f) = psi(kwargs.fn) + {kwargs.arg}` + b) Else, `psi(f) = psi(g) - kwargs.keys()` + b) Else,`psi(g) = phi(g)` (Note that this is the base case.) + + Transforming `fn_accept_additional_argument` into a partial function is done by: + `f(fn, arg) = partial(fn_accept_additional_argument, **{fn: fn, arg: arg})` + This function is the inverse (with respect to phi) to the function + `g(fn, arg, val) = partial(g, {arg: val})` in the sense that + `phi(fn) = phi(f(g(fn, arg, val), arg))` + + Together, these components form an algebraic system for reasoning about function + arguments. Each operation (phi, psi, f, g) transforms a function into a new + function in a way that changes the set of arguments that have been applied, and we + have precise rules for understanding those transformations. Args: - fun: A partial or a function to unroll. + fun: A function composed of raw functions `f` or partial functions as + constructed in the description. Returns: A set of arguments that were set by the partial. From cb546ea0b2b8216b060bbfb46012ed49632164bd Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Fri, 1 Sep 2023 22:24:01 +0200 Subject: [PATCH 24/32] Adapted CHANGELOG.md. --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc7b3f163..b53b43c57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,10 @@ the board, with a focus on documentation and usability. [PR #365](https://github.com/aai-institute/pyDVL/pull/365) - Enabled parallel computation for Leave-One-Out values [PR #406](https://github.com/aai-institute/pyDVL/pull/406) +- Added seed to functions from `pydvl.utils.numeric`, `pydvl.value.shapley` and + `pydvl.value.semivalues`. Introduced new type `Seed` and conversion function + `ensure_seed_sequence`. + [PR #396](https://github.com/aai-institute/pyDVL/pull/396) ### Changed - Replaced sphinx with mkdocs for documentation. Major overhaul of documentation From 688c9d787852b34883b8b3d64ea12f9aa50e6572 Mon Sep 17 00:00:00 2001 From: Markus Semmler Date: Fri, 1 Sep 2023 23:54:14 +0200 Subject: [PATCH 25/32] Fix indent in docstring. --- src/pydvl/value/semivalues.py | 3 ++- src/pydvl/value/shapley/common.py | 3 +-- src/pydvl/value/shapley/gt.py | 6 ++---- src/pydvl/value/shapley/montecarlo.py | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/pydvl/value/semivalues.py b/src/pydvl/value/semivalues.py index 4078ca523..5cec32d5c 100644 --- a/src/pydvl/value/semivalues.py +++ b/src/pydvl/value/semivalues.py @@ -371,7 +371,8 @@ def compute_beta_shapley_semivalues( sampler_t: The sampler type to use. See :mod:`pydvl.value.sampler` for a list. n_jobs: Number of parallel jobs to use. seed: Either an instance of a numpy random number generator or a seed for it. - config: Object configuring parallel computation, with cluster address, number of cpus, etc. + config: Object configuring parallel computation, with cluster address, number of + cpus, etc. progress: Whether to display a progress bar. Returns: diff --git a/src/pydvl/value/shapley/common.py b/src/pydvl/value/shapley/common.py index 8f328712c..97f4e2945 100644 --- a/src/pydvl/value/shapley/common.py +++ b/src/pydvl/value/shapley/common.py @@ -89,8 +89,7 @@ def compute_shapley_values( criteria using boolean operators. Some methods ignore this argument, others require specific subtypes. n_jobs: Number of parallel jobs (available only to some methods) - seed: Either an instance of a numpy random number generator or a seed - for it. + seed: Either an instance of a numpy random number generator or a seed for it. mode: Choose which shapley algorithm to use. See [ShapleyMode][pydvl.value.shapley.ShapleyMode] for a list of allowed value. diff --git a/src/pydvl/value/shapley/gt.py b/src/pydvl/value/shapley/gt.py index 1b4b2f3e7..cc207129f 100644 --- a/src/pydvl/value/shapley/gt.py +++ b/src/pydvl/value/shapley/gt.py @@ -140,8 +140,7 @@ def _group_testing_shapley( n_samples: total number of samples (subsets) to use. progress: Whether to display progress bars for each job. job_id: id to use for reporting progress (e.g. to place progres bars) - seed: Either an instance of a numpy random number generator or a seed - for it. + seed: Either an instance of a numpy random number generator or a seed for it. Returns: """ @@ -202,8 +201,7 @@ def group_testing_shapley( config: Object configuring parallel computation, with cluster address, number of cpus, etc. progress: Whether to display progress bars for each job. - seed: Either an instance of a numpy random number generator or a seed - for it. + seed: Either an instance of a numpy random number generator or a seed for it. options: Additional options to pass to [cvxpy.Problem.solve()](https://www.cvxpy.org/tutorial/advanced/index.html#solve-method-options). E.g. to change the solver (which defaults to `cvxpy.SCS`) pass diff --git a/src/pydvl/value/shapley/montecarlo.py b/src/pydvl/value/shapley/montecarlo.py index eca44139f..704eb928e 100644 --- a/src/pydvl/value/shapley/montecarlo.py +++ b/src/pydvl/value/shapley/montecarlo.py @@ -323,7 +323,7 @@ def combinatorial_montecarlo_shapley( number of cpus, etc. progress: Whether to display progress bars for each job. seed: Either an instance of a numpy random number generator or a seed for it. - f + Returns: Object with the data values. """ From b827439e9d42dbebaf2f394fde03299fc9f03ac0 Mon Sep 17 00:00:00 2001 From: Miguel de Benito Delgado Date: Sat, 2 Sep 2023 12:15:48 +0200 Subject: [PATCH 26/32] Fix corner-case --- tests/utils/test_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils/test_parallel.py b/tests/utils/test_parallel.py index 3316fe7c3..b0588bb07 100644 --- a/tests/utils/test_parallel.py +++ b/tests/utils/test_parallel.py @@ -271,7 +271,7 @@ def test_future_cancellation(parallel_config): # :func:`test_map_reduce_stochastic`. def _sum_of_random_integers(x: None, seed: Optional[Seed] = None): rng = np.random.default_rng(seed) - values = rng.integers(0, rng.integers(100), 10) + values = rng.integers(0, rng.integers(10, 100), 10) return np.sum(values) From 2ba97e7e9fa3cf215af7edbafa914848ffc4205b Mon Sep 17 00:00:00 2001 From: Miguel de Benito Delgado Date: Sat, 2 Sep 2023 12:16:17 +0200 Subject: [PATCH 27/32] Remove unnecessary function and merge tests --- tests/utils/test_parallel.py | 45 +++++++++++++----------------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/tests/utils/test_parallel.py b/tests/utils/test_parallel.py index b0588bb07..8ba145aa8 100644 --- a/tests/utils/test_parallel.py +++ b/tests/utils/test_parallel.py @@ -147,38 +147,29 @@ def reduce_func(x, y): assert result == 150 -def test_map_reduce_reproducible(parallel_config, seed): - """ - Test that the same result is obtained when using the same seed. And that different - results are obtained when using different seeds. - """ - - map_reduce_job = MapReduceJob( - None, - map_func=_sum_of_random_integers, - reduce_func=_mean_func, - config=parallel_config, - ) - result_1 = map_reduce_job(seed=seed) - result_2 = map_reduce_job(seed=seed) - assert result_1 == result_2 - - -def test_map_reduce_stochastic(parallel_config, seed, seed_alt): - """ - Test that the same result is obtained when using the same seed. And that different - results are obtained when using different seeds. +@pytest.mark.parametrize( + "seed_1, seed_2, op", + [ + (None, None, operator.ne), + (None, 42, operator.ne), + (42, None, operator.ne), + (42, 42, operator.eq), + ], +) +def test_map_reduce_seeding(parallel_config, seed_1, seed_2, op): + """Test that the same result is obtained when using the same seed. And that + different results are obtained when using different seeds. """ map_reduce_job = MapReduceJob( None, map_func=_sum_of_random_integers, - reduce_func=_mean_func, + reduce_func=np.mean, config=parallel_config, ) - result_1 = map_reduce_job(seed=seed) - result_2 = map_reduce_job(seed=seed_alt) - assert result_1 != result_2 + result_1 = map_reduce_job(seed=seed_1) + result_2 = map_reduce_job(seed=seed_2) + assert op(result_1, result_2) def test_wrap_function(parallel_config, num_workers): @@ -273,7 +264,3 @@ def _sum_of_random_integers(x: None, seed: Optional[Seed] = None): rng = np.random.default_rng(seed) values = rng.integers(0, rng.integers(10, 100), 10) return np.sum(values) - - -def _mean_func(means): - return np.mean(means) From 087fdbbcd8fea662c578fc4cac79725f1bfad5dc Mon Sep 17 00:00:00 2001 From: Miguel de Benito Delgado Date: Sat, 2 Sep 2023 12:17:41 +0200 Subject: [PATCH 28/32] Move maybe_add_argument to functional.py --- src/pydvl/utils/functional.py | 24 ++++++++++++++++++++++- src/pydvl/utils/parallel/map_reduce.py | 2 +- src/pydvl/utils/types.py | 27 +------------------------- 3 files changed, 25 insertions(+), 28 deletions(-) diff --git a/src/pydvl/utils/functional.py b/src/pydvl/utils/functional.py index e2293c36f..197c1448f 100644 --- a/src/pydvl/utils/functional.py +++ b/src/pydvl/utils/functional.py @@ -4,7 +4,7 @@ from functools import partial from typing import Callable, Set, Tuple, Union -__all__ = ["get_free_args_fn", "fn_accept_additional_argument"] +__all__ = ["maybe_add_argument"] def fn_accept_additional_argument(*args, fn: Callable, arg: str, **kwargs): @@ -95,3 +95,25 @@ def _rec_unroll_partial_function_args(g: Union[Callable, partial]) -> Callable: wrapped_fn = _rec_unroll_partial_function_args(fun) sig = inspect.signature(wrapped_fn) return args_set_by_partial | set(sig.parameters.keys()) + + +def maybe_add_argument(fun: Callable, new_arg: str) -> Callable: + """Wraps a function to accept the given keyword parameter if it doesn't + already. + + If `fun` already takes a keyword parameter of name `new_arg`, then it is + returned as is. Otherwise, a wrapper is returned which merely ignores the + argument. + + Args: + fun: The function to wrap + new_arg: The name of the argument that the new function will accept + (and ignore). + + Returns: + A new function accepting one more keyword argument. + """ + if new_arg in free_arguments(fun): + return fun + + return functools.partial(_accept_additional_argument, fun=fun, arg=new_arg) diff --git a/src/pydvl/utils/parallel/map_reduce.py b/src/pydvl/utils/parallel/map_reduce.py index 4f0b55af3..dadeb9478 100644 --- a/src/pydvl/utils/parallel/map_reduce.py +++ b/src/pydvl/utils/parallel/map_reduce.py @@ -20,8 +20,8 @@ ReduceFunction, Seed, ensure_seed_sequence, - maybe_add_argument, ) +from ..functional import maybe_add_argument from .backend import init_parallel_backend __all__ = ["MapReduceJob"] diff --git a/src/pydvl/utils/types.py b/src/pydvl/utils/types.py index c3acf410c..81f949f64 100644 --- a/src/pydvl/utils/types.py +++ b/src/pydvl/utils/types.py @@ -3,15 +3,12 @@ """ from __future__ import annotations -import functools from abc import ABCMeta -from typing import Any, Callable, Optional, Protocol, TypeVar, Union, cast +from typing import Any, Optional, Protocol, TypeVar, Union, cast from numpy.random import Generator, SeedSequence from numpy.typing import NDArray -from pydvl.utils.functional import fn_accept_additional_argument, get_free_args_fn - __all__ = ["SupervisedModel", "MapFunction", "ReduceFunction", "NoPublicConstructor"] R = TypeVar("R", covariant=True) @@ -45,28 +42,6 @@ def score(self, x: NDArray, y: NDArray) -> float: pass -def maybe_add_argument(fun: Callable, new_arg: str) -> Callable: - """Wraps a function to accept the given keyword parameter if it doesn't - already. - - If `fun` already takes a keyword parameter of name `new_arg`, then it is - returned as is. Otherwise, a wrapper is returned which merely ignores the - argument. - - Args: - fun: The function to wrap - new_arg: The name of the argument that the new function will accept - (and ignore). - - Returns: - A new function accepting one more keyword argument. - """ - if new_arg in get_free_args_fn(fun): - return fun - - return functools.partial(fn_accept_additional_argument, fn=fun, arg=new_arg) - - class NoPublicConstructor(ABCMeta): """Metaclass that ensures a private constructor From 317f8b1fa326d5222765190d416e107313c839c3 Mon Sep 17 00:00:00 2001 From: Miguel de Benito Delgado Date: Sat, 2 Sep 2023 12:19:11 +0200 Subject: [PATCH 29/32] Renaming and simplifying docstrings --- src/pydvl/utils/functional.py | 70 +++++++++++++---------------------- 1 file changed, 26 insertions(+), 44 deletions(-) diff --git a/src/pydvl/utils/functional.py b/src/pydvl/utils/functional.py index 197c1448f..370f7781a 100644 --- a/src/pydvl/utils/functional.py +++ b/src/pydvl/utils/functional.py @@ -1,21 +1,23 @@ +""" +Supporting utilities for manipulating arguments of functions. +""" + from __future__ import annotations import inspect from functools import partial -from typing import Callable, Set, Tuple, Union +from typing import Callable, Set, Union __all__ = ["maybe_add_argument"] -def fn_accept_additional_argument(*args, fn: Callable, arg: str, **kwargs): - """ - Calls the given function with the given arguments. In the process of calling the - wrapped function, it removes the specified keyword argument from the passed keyword - arguments. This function can be pickled by `pickle` as it is on the . +def _accept_additional_argument(*args, fun: Callable, arg: str, **kwargs): + """Calls the given function with the given positional and keyword arguments, + removing `arg` from the keyword arguments. Args: args: Positional arguments to pass to the function. - fn: The function to call. + fun: The function to call. arg: The name of the argument to remove. kwargs: Keyword arguments to pass to the function. @@ -27,50 +29,30 @@ def fn_accept_additional_argument(*args, fn: Callable, arg: str, **kwargs): except KeyError: pass - return fn(*args, **kwargs) + return fun(*args, **kwargs) -def get_free_args_fn(fun: Union[Callable, partial]) -> Set[str]: - """ - Accept a function or a partial definition and return the set of arguments that are - free. An argument is considered free if it is not set by the partial and is a - parameter of the function. Formally, this can be described as follows: - - Let phi be the function that returns the set of arguments that are set by a given - function. For functions f and g, one can extend the function phi recursively to psi - as follows: - - a) If `f = partial(g, **kwargs)`, two cases arise: - a) If `g = fn_accept_additional_argument`, then - `psi(f) = psi(kwargs.fn) + {kwargs.arg}` - b) Else, `psi(f) = psi(g) - kwargs.keys()` - b) Else,`psi(g) = phi(g)` (Note that this is the base case.) - - Transforming `fn_accept_additional_argument` into a partial function is done by: - `f(fn, arg) = partial(fn_accept_additional_argument, **{fn: fn, arg: arg})` - This function is the inverse (with respect to phi) to the function - `g(fn, arg, val) = partial(g, {arg: val})` in the sense that - `phi(fn) = phi(f(g(fn, arg, val), arg))` - - Together, these components form an algebraic system for reasoning about function - arguments. Each operation (phi, psi, f, g) transforms a function into a new - function in a way that changes the set of arguments that have been applied, and we - have precise rules for understanding those transformations. +def free_arguments(fun: Union[Callable, partial]) -> Set[str]: + """Computes the set of free arguments for a function or [partial object][]. + + All arguments of a function are considered free unless they are set by a + partial. For example, if `f = partial(g, a=1)`, then `a` is not a free + argument of `f`. Args: - fun: A function composed of raw functions `f` or partial functions as - constructed in the description. + fun: A callable or a [partial object][]. Returns: - A set of arguments that were set by the partial. + The set of free arguments of `fun`. """ args_set_by_partial: Set[str] = set() def _rec_unroll_partial_function_args(g: Union[Callable, partial]) -> Callable: - """ - Store arguments and recursively call itself if the function is a partial. In the - end, return the initial wrapped function. Besides partial functions it also - supports `partial(fn_accept_additional_argument, *args, **kwargs)` constructs. + """Stores arguments and recursively call itself if `g` is a partial. In + the end, returns the initially wrapped function. + + This handles the construct `partial(_accept_additional_argument, *args, + **kwargs)` that is used by `maybe_add_argument`. Args: g: A partial or a function to unroll. @@ -80,11 +62,11 @@ def _rec_unroll_partial_function_args(g: Union[Callable, partial]) -> Callable: """ nonlocal args_set_by_partial - if isinstance(g, partial) and g.func == fn_accept_additional_argument: + if isinstance(g, partial) and g.func == _accept_additional_argument: arg = g.keywords["arg"] if arg in args_set_by_partial: args_set_by_partial.remove(arg) - return _rec_unroll_partial_function_args(g.keywords["fn"]) + return _rec_unroll_partial_function_args(g.keywords["fun"]) elif isinstance(g, partial): args_set_by_partial.update(g.keywords.keys()) args_set_by_partial.update(g.args) @@ -116,4 +98,4 @@ def maybe_add_argument(fun: Callable, new_arg: str) -> Callable: if new_arg in free_arguments(fun): return fun - return functools.partial(_accept_additional_argument, fun=fun, arg=new_arg) + return partial(_accept_additional_argument, fun=fun, arg=new_arg) From f007ce8c82fe6cbbab5156ef636bb4d771b7363c Mon Sep 17 00:00:00 2001 From: Miguel de Benito Delgado Date: Sat, 2 Sep 2023 12:19:26 +0200 Subject: [PATCH 30/32] Cosmetic --- CHANGELOG.md | 7 +++++-- src/pydvl/utils/functional.py | 13 ++++++++++--- src/pydvl/utils/parallel/map_reduce.py | 7 +------ src/pydvl/utils/types.py | 11 ++++++++++- src/pydvl/value/loo/loo.py | 2 +- src/pydvl/value/shapley/montecarlo.py | 6 ++++-- 6 files changed, 31 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca0f29452..bc82e515b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,11 @@ # Changelog -## 0.7.0 - 📚 Documentation overhaul, new methods and bug fixes 💥 +## 0.7.0 - 📚🆕 Documentation and IF overhaul, new methods and bug fixes 💥🐞 This is our first β release! We have worked hard to deliver improvements across -the board, with a focus on documentation and usability. +the board, with a focus on documentation and usability. We have also reworked +the internals of the `influence` module, improved parallelism and handling of +randomness. ### Added @@ -19,6 +21,7 @@ the board, with a focus on documentation and usability. [PR #396](https://github.com/aai-institute/pyDVL/pull/396) ### Changed + - Replaced sphinx with mkdocs for documentation. Major overhaul of documentation [PR #352](https://github.com/aai-institute/pyDVL/pull/352) - Made ray an optional dependency, relying on joblib as default parallel backend diff --git a/src/pydvl/utils/functional.py b/src/pydvl/utils/functional.py index 370f7781a..879068b9c 100644 --- a/src/pydvl/utils/functional.py +++ b/src/pydvl/utils/functional.py @@ -33,7 +33,8 @@ def _accept_additional_argument(*args, fun: Callable, arg: str, **kwargs): def free_arguments(fun: Union[Callable, partial]) -> Set[str]: - """Computes the set of free arguments for a function or [partial object][]. + """Computes the set of free arguments for a function or + [functools.partial][] object. All arguments of a function are considered free unless they are set by a partial. For example, if `f = partial(g, a=1)`, then `a` is not a free @@ -44,12 +45,15 @@ def free_arguments(fun: Union[Callable, partial]) -> Set[str]: Returns: The set of free arguments of `fun`. + + !!! tip "New in version 0.7.0" """ args_set_by_partial: Set[str] = set() def _rec_unroll_partial_function_args(g: Union[Callable, partial]) -> Callable: - """Stores arguments and recursively call itself if `g` is a partial. In - the end, returns the initially wrapped function. + """Stores arguments and recursively call itself if `g` is a + [functools.partial][] object. In the end, returns the initially wrapped + function. This handles the construct `partial(_accept_additional_argument, *args, **kwargs)` that is used by `maybe_add_argument`. @@ -94,6 +98,9 @@ def maybe_add_argument(fun: Callable, new_arg: str) -> Callable: Returns: A new function accepting one more keyword argument. + + !!! tip "Changed in version 0.7.0" + Ability to work with partials. """ if new_arg in free_arguments(fun): return fun diff --git a/src/pydvl/utils/parallel/map_reduce.py b/src/pydvl/utils/parallel/map_reduce.py index dadeb9478..149cd2752 100644 --- a/src/pydvl/utils/parallel/map_reduce.py +++ b/src/pydvl/utils/parallel/map_reduce.py @@ -15,13 +15,8 @@ from numpy.typing import NDArray from ..config import ParallelConfig -from ..types import ( - MapFunction, - ReduceFunction, - Seed, - ensure_seed_sequence, -) from ..functional import maybe_add_argument +from ..types import MapFunction, ReduceFunction, Seed, ensure_seed_sequence from .backend import init_parallel_backend __all__ = ["MapReduceJob"] diff --git a/src/pydvl/utils/types.py b/src/pydvl/utils/types.py index 81f949f64..5df91923d 100644 --- a/src/pydvl/utils/types.py +++ b/src/pydvl/utils/types.py @@ -9,7 +9,14 @@ from numpy.random import Generator, SeedSequence from numpy.typing import NDArray -__all__ = ["SupervisedModel", "MapFunction", "ReduceFunction", "NoPublicConstructor"] +__all__ = [ + "ensure_seed_sequence", + "MapFunction", + "NoPublicConstructor", + "ReduceFunction", + "Seed", + "SupervisedModel", +] R = TypeVar("R", covariant=True) @@ -83,6 +90,8 @@ def ensure_seed_sequence( Returns: A SeedSequence object. + + !!! tip "New in version 0.7.0" """ if isinstance(seed, SeedSequence): return seed diff --git a/src/pydvl/value/loo/loo.py b/src/pydvl/value/loo/loo.py index 0ef951a89..893594260 100644 --- a/src/pydvl/value/loo/loo.py +++ b/src/pydvl/value/loo/loo.py @@ -32,7 +32,7 @@ def compute_loo( Returns: Object with the data values. - !!! tip "New in version 0.8.0" + !!! tip "New in version 0.7.0" Renamed from `naive_loo` and added parallel computation. """ diff --git a/src/pydvl/value/shapley/montecarlo.py b/src/pydvl/value/shapley/montecarlo.py index 704eb928e..06c4e023f 100644 --- a/src/pydvl/value/shapley/montecarlo.py +++ b/src/pydvl/value/shapley/montecarlo.py @@ -241,7 +241,8 @@ def _combinatorial_montecarlo_shapley( job_id: int = 1, seed: Optional[Seed] = None, ) -> ValuationResult: - """Helper function for [combinatorial_montecarlo_shapley()][pydvl.value.shapley.montecarlo.combinatorial_montecarlo_shapley]. + """Helper function for + [combinatorial_montecarlo_shapley][pydvl.value.shapley.montecarlo.combinatorial_montecarlo_shapley]. This is the code that is sent to workers to compute values using the combinatorial definition. @@ -252,7 +253,8 @@ def _combinatorial_montecarlo_shapley( done: Check on the results which decides when to stop sampling subsets for an index. progress: Whether to display progress bars for each job. - seed: Either an instance of a numpy random number generator or a seed for it. + seed: Either an instance of a numpy random number generator or a seed + for it. job_id: id to use for reporting progress Returns: From 1bb75e11d3dc94e99a37578d46e1309df1057aa8 Mon Sep 17 00:00:00 2001 From: Miguel de Benito Delgado Date: Sat, 2 Sep 2023 12:42:20 +0200 Subject: [PATCH 31/32] Nicer headers in API --- docs/css/extra.css | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/css/extra.css b/docs/css/extra.css index 159879bef..4716fee11 100644 --- a/docs/css/extra.css +++ b/docs/css/extra.css @@ -77,6 +77,12 @@ a.autorefs-external:hover::after { user-select: none; } +/* Nicer style of headers in generated API */ +h2 code { + font-size: large!important; + background-color: inherit!important; +} + /* Remove cell input and output prompt */ .jp-InputArea-prompt, .jp-OutputArea-prompt { display: none !important; From e38e1ff232bae9ebbc08e9df2d0415a0bda98a10 Mon Sep 17 00:00:00 2001 From: Miguel de Benito Delgado Date: Sat, 2 Sep 2023 12:45:29 +0200 Subject: [PATCH 32/32] Cleanup --- tests/value/__init__.py | 2 -- tests/value/shapley/test_montecarlo.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/value/__init__.py b/tests/value/__init__.py index 795eef798..ce16e73e4 100644 --- a/tests/value/__init__.py +++ b/tests/value/__init__.py @@ -65,8 +65,6 @@ def check_values( have |value - exact_value| < 0.1 for every value. :param extra_values_names: Sequence of names of extra values that should also be compared. - :param equal: True, if it should be checked that the values are equal, False if it - should be checked that they are not equal. """ values.sort() exact_values.sort() diff --git a/tests/value/shapley/test_montecarlo.py b/tests/value/shapley/test_montecarlo.py index 4df932935..b7961cdb8 100644 --- a/tests/value/shapley/test_montecarlo.py +++ b/tests/value/shapley/test_montecarlo.py @@ -83,7 +83,7 @@ def test_analytic_montecarlo_shapley( ), (12, ShapleyMode.Owen, dict(n_samples=4, max_q=200)), (12, ShapleyMode.OwenAntithetic, dict(n_samples=4, max_q=200)), - (4, ShapleyMode.GroupTesting, dict(n_samples=int(21), epsilon=0.2, delta=0.01)), + (4, ShapleyMode.GroupTesting, dict(n_samples=21, epsilon=0.2, delta=0.01)), ]