diff --git a/CHANGELOG.md b/CHANGELOG.md
index 42a845cf5..bc82e515b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,9 +1,11 @@
 # Changelog
 
-## 0.7.0 - 📚 Documentation overhaul, new methods and bug fixes 💥
+## 0.7.0 - 📚🆕 Documentation and IF overhaul, new methods and bug fixes 💥🐞
 
 This is our first β release! We have worked hard to deliver improvements across
-the board, with a focus on documentation and usability.
+the board, with a focus on documentation and usability. We have also reworked
+the internals of the `influence` module, improved parallelism and handling of
+randomness.
 
 ### Added
 
@@ -13,8 +15,13 @@ the board, with a focus on documentation and usability.
   [PR #406](https://github.com/aai-institute/pyDVL/pull/406)
 - Added more abbreviations to documentation
   [PR #415](https://github.com/aai-institute/pyDVL/pull/415)
+- Added seed to functions from `pydvl.utils.numeric`, `pydvl.value.shapley` and
+  `pydvl.value.semivalues`. Introduced new type `Seed` and conversion function
+  `ensure_seed_sequence`.
+  [PR #396](https://github.com/aai-institute/pyDVL/pull/396)
 
 ### Changed
+
 - Replaced sphinx with mkdocs for documentation. Major overhaul of documentation
   [PR #352](https://github.com/aai-institute/pyDVL/pull/352)
 - Made ray an optional dependency, relying on joblib as default parallel backend
diff --git a/docs/css/extra.css b/docs/css/extra.css
index 159879bef..4716fee11 100644
--- a/docs/css/extra.css
+++ b/docs/css/extra.css
@@ -77,6 +77,12 @@ a.autorefs-external:hover::after {
   user-select: none;
 }
 
+/* Nicer style of headers in generated API */
+h2 code {
+  font-size: large!important;
+  background-color: inherit!important;
+}
+
 /* Remove cell input and output prompt */
 .jp-InputArea-prompt, .jp-OutputArea-prompt {
   display: none !important;
diff --git a/src/pydvl/utils/functional.py b/src/pydvl/utils/functional.py
new file mode 100644
index 000000000..879068b9c
--- /dev/null
+++ b/src/pydvl/utils/functional.py
@@ -0,0 +1,108 @@
+"""
+Supporting utilities for manipulating arguments of functions.
+"""
+
+from __future__ import annotations
+
+import inspect
+from functools import partial
+from typing import Callable, Set, Union
+
+__all__ = ["maybe_add_argument"]
+
+
+def _accept_additional_argument(*args, fun: Callable, arg: str, **kwargs):
+    """Calls the given function with the given positional and keyword arguments,
+    removing `arg` from the keyword arguments.
+
+    Args:
+        args: Positional arguments to pass to the function.
+        fun: The function to call.
+        arg: The name of the argument to remove.
+        kwargs: Keyword arguments to pass to the function.
+
+    Returns:
+        The return value of the function.
+    """
+    try:
+        del kwargs[arg]
+    except KeyError:
+        pass
+
+    return fun(*args, **kwargs)
+
+
+def free_arguments(fun: Union[Callable, partial]) -> Set[str]:
+    """Computes the set of free arguments for a function or
+    [functools.partial][] object.
+
+    All arguments of a function are considered free unless they are set by a
+    partial. For example, if `f = partial(g, a=1)`, then `a` is not a free
+    argument of `f`.
+
+    Args:
+        fun: A callable or a [functools.partial][] object.
+
+    Returns:
+        The set of free arguments of `fun`.
+
+    !!! tip "New in version 0.7.0"
+    """
+    args_set_by_partial: Set[str] = set()
+
+    def _rec_unroll_partial_function_args(g: Union[Callable, partial]) -> Callable:
+        """Stores arguments and recursively calls itself if `g` is a
+        [functools.partial][] object. In the end, returns the initially wrapped
+        function.
+ + This handles the construct `partial(_accept_additional_argument, *args, + **kwargs)` that is used by `maybe_add_argument`. + + Args: + g: A partial or a function to unroll. + + Returns: + Initial wrapped function. + """ + nonlocal args_set_by_partial + + if isinstance(g, partial) and g.func == _accept_additional_argument: + arg = g.keywords["arg"] + if arg in args_set_by_partial: + args_set_by_partial.remove(arg) + return _rec_unroll_partial_function_args(g.keywords["fun"]) + elif isinstance(g, partial): + args_set_by_partial.update(g.keywords.keys()) + args_set_by_partial.update(g.args) + return _rec_unroll_partial_function_args(g.func) + else: + return g + + wrapped_fn = _rec_unroll_partial_function_args(fun) + sig = inspect.signature(wrapped_fn) + return args_set_by_partial | set(sig.parameters.keys()) + + +def maybe_add_argument(fun: Callable, new_arg: str) -> Callable: + """Wraps a function to accept the given keyword parameter if it doesn't + already. + + If `fun` already takes a keyword parameter of name `new_arg`, then it is + returned as is. Otherwise, a wrapper is returned which merely ignores the + argument. + + Args: + fun: The function to wrap + new_arg: The name of the argument that the new function will accept + (and ignore). + + Returns: + A new function accepting one more keyword argument. + + !!! tip "Changed in version 0.7.0" + Ability to work with partials. + """ + if new_arg in free_arguments(fun): + return fun + + return partial(_accept_additional_argument, fun=fun, arg=new_arg) diff --git a/src/pydvl/utils/numeric.py b/src/pydvl/utils/numeric.py index d78f8d5be..d8b1ce915 100644 --- a/src/pydvl/utils/numeric.py +++ b/src/pydvl/utils/numeric.py @@ -10,6 +10,8 @@ import numpy as np from numpy.typing import NDArray +from pydvl.utils.types import Seed + __all__ = [ "running_moments", "num_samples_permutation_hoeffding", @@ -68,24 +70,32 @@ def num_samples_permutation_hoeffding(eps: float, delta: float, u_range: float) return int(np.ceil(np.log(2 / delta) * 2 * u_range**2 / eps**2)) -def random_subset(s: NDArray[T], q: float = 0.5) -> NDArray[T]: - """Returns one subset at random from `s`. +def random_subset( + s: NDArray[T], + q: float = 0.5, + seed: Optional[Seed] = None, +) -> NDArray[T]: + """Returns one subset at random from ``s``. Args: s: set to sample from q: Sampling probability for elements. The default 0.5 yields a uniform distribution over the power set of s. + seed: Either an instance of a numpy random number generator or a seed for it. Returns: The subset """ - rng = np.random.default_rng() + rng = np.random.default_rng(seed) selection = rng.uniform(size=len(s)) > q return s[selection] def random_powerset( - s: NDArray[T], n_samples: Optional[int] = None, q: float = 0.5 + s: NDArray[T], + n_samples: Optional[int] = None, + q: float = 0.5, + seed: Optional[Seed] = None, ) -> Generator[NDArray[T], None, None]: """Samples subsets from the power set of the argument, without pre-generating all subsets and in no order. @@ -103,6 +113,7 @@ def random_powerset( Defaults to `np.iinfo(np.int32).max` q: Sampling probability for elements. The default 0.5 yields a uniform distribution over the power set of s. + seed: Either an instance of a numpy random number generator or a seed for it. Returns: Samples from the power set of `s`. 
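A minimal usage sketch of the new `functional` module above (only `maybe_add_argument` and `free_arguments` come from this diff; the `utility` callables are invented for illustration):

```python
from pydvl.utils.functional import free_arguments, maybe_add_argument


def utility(data):  # hypothetical callable that knows nothing about seeds
    return sum(data)


# Wrapping lets callers pass `seed` uniformly; the wrapper discards it.
wrapped = maybe_add_argument(utility, "seed")
assert wrapped([1, 2, 3], seed=42) == 6


def seeded_utility(data, seed=None):  # already accepts the argument
    return sum(data)


# Functions that already take `seed` are returned unchanged...
assert maybe_add_argument(seeded_utility, "seed") is seeded_utility
# ...because `seed` is among their free arguments.
assert "seed" in free_arguments(seeded_utility)
```

This is what lets `MapReduceJob` (further down in this diff) pass `job_id` and `seed` to any map function without requiring every user function to declare them.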
@@ -114,21 +125,27 @@ def random_powerset( if q < 0 or q > 1: raise ValueError("Element sampling probability must be in [0,1]") + rng = np.random.default_rng(seed) total = 1 if n_samples is None: n_samples = np.iinfo(np.int32).max while total <= n_samples: - yield random_subset(s, q) + yield random_subset(s, q, seed=rng) total += 1 -def random_subset_of_size(s: NDArray[T], size: int) -> NDArray[T]: +def random_subset_of_size( + s: NDArray[T], + size: int, + seed: Optional[Seed] = None, +) -> NDArray[T]: """Samples a random subset of given size uniformly from the powerset of `s`. Args: s: Set to sample from size: Size of the subset to generate + seed: Either an instance of a numpy random number generator or a seed for it. Returns: The subset @@ -138,11 +155,13 @@ def random_subset_of_size(s: NDArray[T], size: int) -> NDArray[T]: """ if size > len(s): raise ValueError("Cannot sample subset larger than set") - rng = np.random.default_rng() + rng = np.random.default_rng(seed) return rng.choice(s, size=size, replace=False) -def random_matrix_with_condition_number(n: int, condition_number: float) -> NDArray: +def random_matrix_with_condition_number( + n: int, condition_number: float, seed: Optional[Seed] = None +) -> NDArray: """Constructs a square matrix with a given condition number. Taken from: @@ -156,6 +175,7 @@ def random_matrix_with_condition_number(n: int, condition_number: float) -> NDAr Args: n: size of the matrix condition_number: duh + seed: Either an instance of a numpy random number generator or a seed for it. Returns: An (n,n) matrix with the requested condition number. @@ -166,6 +186,7 @@ def random_matrix_with_condition_number(n: int, condition_number: float) -> NDAr if condition_number <= 1: raise ValueError("Condition number must be greater than 1") + rng = np.random.default_rng(seed) log_condition_number = np.log(condition_number) exp_vec = np.arange( -log_condition_number / 4.0, @@ -175,8 +196,8 @@ def random_matrix_with_condition_number(n: int, condition_number: float) -> NDAr exp_vec = exp_vec[:n] s: np.ndarray = np.exp(exp_vec) S = np.diag(s) - U, _ = np.linalg.qr((np.random.rand(n, n) - 5.0) * 200) - V, _ = np.linalg.qr((np.random.rand(n, n) - 5.0) * 200) + U, _ = np.linalg.qr((rng.uniform(size=(n, n)) - 5.0) * 200) + V, _ = np.linalg.qr((rng.uniform(size=(n, n)) - 5.0) * 200) P: np.ndarray = U.dot(S).dot(V.T) P = P.dot(P.T) return P diff --git a/src/pydvl/utils/parallel/map_reduce.py b/src/pydvl/utils/parallel/map_reduce.py index f7dec6179..149cd2752 100644 --- a/src/pydvl/utils/parallel/map_reduce.py +++ b/src/pydvl/utils/parallel/map_reduce.py @@ -6,14 +6,17 @@ This interface might be deprecated or changed in a future release before 1.0 """ +from functools import reduce from itertools import accumulate, repeat from typing import Any, Collection, Dict, Generic, List, Optional, TypeVar, Union from joblib import Parallel, delayed +from numpy.random import SeedSequence from numpy.typing import NDArray from ..config import ParallelConfig -from ..types import MapFunction, ReduceFunction, maybe_add_argument +from ..functional import maybe_add_argument +from ..types import MapFunction, ReduceFunction, Seed, ensure_seed_sequence from .backend import init_parallel_backend __all__ = ["MapReduceJob"] @@ -104,12 +107,23 @@ def __init__( self.map_kwargs = map_kwargs if map_kwargs is not None else dict() self.reduce_kwargs = reduce_kwargs if reduce_kwargs is not None else dict() - self._map_func = maybe_add_argument(map_func, "job_id") + self._map_func = reduce(maybe_add_argument, 
["job_id", "seed"], map_func) self._reduce_func = reduce_func def __call__( self, + seed: Optional[Union[Seed, SeedSequence]] = None, ) -> R: + """ + Runs the map-reduce job. + + Args: + seed: Either an instance of a numpy random number generator or a seed for + it. + + Returns: + The result of the reduce function. + """ if self.config.backend == "joblib": backend = "loky" else: @@ -117,12 +131,18 @@ def __call__( # In joblib the levels are reversed. # 0 means no logging and 50 means log everything to stdout verbose = 50 - self.config.logging_level + seed_seq = ensure_seed_sequence(seed) with Parallel(backend=backend, n_jobs=self.n_jobs, verbose=verbose) as parallel: chunks = self._chunkify(self.inputs_, n_chunks=self.n_jobs) map_results: List[R] = parallel( - delayed(self._map_func)(next_chunk, job_id=j, **self.map_kwargs) - for j, next_chunk in enumerate(chunks) + delayed(self._map_func)( + next_chunk, job_id=j, seed=seed, **self.map_kwargs + ) + for j, (next_chunk, seed) in enumerate( + zip(chunks, seed_seq.spawn(len(chunks))) + ) ) + reduce_results: R = self._reduce_func(map_results, **self.reduce_kwargs) return reduce_results diff --git a/src/pydvl/utils/types.py b/src/pydvl/utils/types.py index 0de94cf73..5df91923d 100644 --- a/src/pydvl/utils/types.py +++ b/src/pydvl/utils/types.py @@ -3,13 +3,20 @@ """ from __future__ import annotations -import inspect from abc import ABCMeta -from typing import Any, Callable, Protocol, TypeVar +from typing import Any, Optional, Protocol, TypeVar, Union, cast +from numpy.random import Generator, SeedSequence from numpy.typing import NDArray -__all__ = ["SupervisedModel", "MapFunction", "ReduceFunction", "NoPublicConstructor"] +__all__ = [ + "ensure_seed_sequence", + "MapFunction", + "NoPublicConstructor", + "ReduceFunction", + "Seed", + "SupervisedModel", +] R = TypeVar("R", covariant=True) @@ -42,36 +49,6 @@ def score(self, x: NDArray, y: NDArray) -> float: pass -def maybe_add_argument(fun: Callable, new_arg: str) -> Callable: - """Wraps a function to accept the given keyword parameter if it doesn't - already. - - If `fun` already takes a keyword parameter of name `new_arg`, then it is - returned as is. Otherwise, a wrapper is returned which merely ignores the - argument. - - Args: - fun: The function to wrap - new_arg: The name of the argument that the new function will accept - (and ignore). - - Returns: - A new function accepting one more keyword argument. - """ - params = inspect.signature(fun).parameters - if new_arg in params.keys(): - return fun - - def wrapper(*args, **kwargs): - try: - del kwargs[new_arg] - except KeyError: - pass - return fun(*args, **kwargs) - - return wrapper - - class NoPublicConstructor(ABCMeta): """Metaclass that ensures a private constructor @@ -95,3 +72,30 @@ def __call__(cls, *args, **kwargs): def create(cls, *args: Any, **kwargs: Any): return super().__call__(*args, **kwargs) + + +Seed = Union[int, Generator] + + +def ensure_seed_sequence( + seed: Optional[Union[Seed, SeedSequence]] = None +) -> SeedSequence: + """ + If the passed seed is a SeedSequence object then it is returned as is. If it is + a Generator the internal protected seed sequence from the generator gets extracted. + Otherwise, a new SeedSequence object is created from the passed (optional) seed. + + Args: + seed: Either an int, a Generator object a SeedSequence object or None. + + Returns: + A SeedSequence object. + + !!! 
tip "New in version 0.7.0" + """ + if isinstance(seed, SeedSequence): + return seed + elif isinstance(seed, Generator): + return cast(SeedSequence, seed.bit_generator.seed_seq) # type: ignore + else: + return SeedSequence(seed) diff --git a/src/pydvl/value/loo/loo.py b/src/pydvl/value/loo/loo.py index 0ef951a89..893594260 100644 --- a/src/pydvl/value/loo/loo.py +++ b/src/pydvl/value/loo/loo.py @@ -32,7 +32,7 @@ def compute_loo( Returns: Object with the data values. - !!! tip "New in version 0.8.0" + !!! tip "New in version 0.7.0" Renamed from `naive_loo` and added parallel computation. """ diff --git a/src/pydvl/value/result.py b/src/pydvl/value/result.py index e5cc9ec23..1d12b218e 100644 --- a/src/pydvl/value/result.py +++ b/src/pydvl/value/result.py @@ -71,6 +71,7 @@ from pydvl.utils.dataset import Dataset from pydvl.utils.numeric import running_moments from pydvl.utils.status import Status +from pydvl.utils.types import Seed try: import pandas # Try to import here for the benefit of mypy @@ -678,7 +679,11 @@ def to_dataframe( @classmethod def from_random( - cls, size: int, total: Optional[float] = None, **kwargs + cls, + size: int, + total: Optional[float] = None, + seed: Optional[Seed] = None, + **kwargs, ) -> "ValuationResult": """Creates a [ValuationResult][pydvl.value.result.ValuationResult] object and fills it with an array of random values from a uniform distribution in [-1,1]. The values can @@ -703,7 +708,8 @@ def from_random( if size < 1: raise ValueError("Size must be a positive integer") - values = np.random.uniform(low=-1, high=1, size=size) + rng = np.random.default_rng(seed) + values = rng.uniform(low=-1, high=1, size=size) if total is not None: values *= total / np.sum(values) diff --git a/src/pydvl/value/sampler.py b/src/pydvl/value/sampler.py index 341532e9b..0e3e479e9 100644 --- a/src/pydvl/value/sampler.py +++ b/src/pydvl/value/sampler.py @@ -44,13 +44,24 @@ import math from enum import Enum from itertools import permutations -from typing import Generic, Iterable, Iterator, Sequence, Tuple, TypeVar, overload +from typing import ( + Generic, + Iterable, + Iterator, + Optional, + Sequence, + Tuple, + TypeVar, + Union, + overload, +) import numpy as np from deprecate import deprecated, void from numpy.typing import NDArray from pydvl.utils.numeric import powerset, random_subset, random_subset_of_size +from pydvl.utils.types import Seed __all__ = [ "AntitheticSampler", @@ -60,8 +71,10 @@ "PowersetSampler", "RandomHierarchicalSampler", "UniformSampler", + "StochasticSamplerMixin", ] + T = TypeVar("T", bound=np.generic) SampleT = Tuple[T, NDArray[T]] Sequence.register(np.ndarray) @@ -210,6 +223,14 @@ def weight(cls, n: int, subset_len: int) -> float: ... +class StochasticSamplerMixin: + """Mixin class for samplers which use a random number generator.""" + + def __init__(self, *args, seed: Optional[Seed] = None, **kwargs): + super().__init__(*args, **kwargs) + self._rng = np.random.default_rng(seed) + + class DeterministicUniformSampler(PowersetSampler[T]): def __init__(self, indices: NDArray[T], *args, **kwargs): """An iterator to perform uniform deterministic sampling of subsets. @@ -247,7 +268,7 @@ def weight(cls, n: int, subset_len: int) -> float: return float(2 ** (n - 1)) if n > 0 else 1.0 -class UniformSampler(PowersetSampler[T]): +class UniformSampler(StochasticSamplerMixin, PowersetSampler[T]): """An iterator to perform uniform random sampling of subsets. 
Iterating over every index $i$, either in sequence or at random depending on @@ -271,7 +292,7 @@ class UniformSampler(PowersetSampler[T]): def __iter__(self) -> Iterator[SampleT]: while True: for idx in self.iterindices(): - subset = random_subset(self.complement([idx])) + subset = random_subset(self.complement([idx]), seed=self._rng) yield idx, subset self._n_samples += 1 if self._n_samples == 0: # Empty index set @@ -293,7 +314,7 @@ def __init__(self, indices: NDArray[T], *args, **kwargs): void(indices, args, kwargs) -class AntitheticSampler(PowersetSampler[T]): +class AntitheticSampler(StochasticSamplerMixin, PowersetSampler[T]): """An iterator to perform uniform random sampling of subsets, and their complements. @@ -305,7 +326,7 @@ class AntitheticSampler(PowersetSampler[T]): def __iter__(self) -> Iterator[SampleT]: while True: for idx in self.iterindices(): - subset = random_subset(self.complement([idx])) + subset = random_subset(self.complement([idx]), seed=self._rng) yield idx, subset self._n_samples += 1 yield idx, self.complement(np.concatenate((subset, np.array([idx])))) @@ -318,7 +339,7 @@ def weight(cls, n: int, subset_len: int) -> float: return float(2 ** (n - 1)) if n > 0 else 1.0 -class PermutationSampler(PowersetSampler[T]): +class PermutationSampler(StochasticSamplerMixin, PowersetSampler[T]): """Sample permutations of indices and iterate through each returning increasing subsets, as required for the permutation definition of semi-values. @@ -336,9 +357,8 @@ class PermutationSampler(PowersetSampler[T]): """ def __iter__(self) -> Iterator[SampleT]: - rng = np.random.default_rng() # FIXME: waiting for better rng handling while True: - permutation = rng.permutation(self._indices) + permutation = self._rng.permutation(self._indices) for i, idx in enumerate(permutation): yield idx, permutation[:i] self._n_samples += 1 @@ -377,7 +397,7 @@ def __iter__(self) -> Iterator[SampleT]: self._n_samples += 1 -class RandomHierarchicalSampler(PowersetSampler[T]): +class RandomHierarchicalSampler(StochasticSamplerMixin, PowersetSampler[T]): """For every index, sample a set size, then a set of that size. !!! 
Todo @@ -387,8 +407,10 @@ class RandomHierarchicalSampler(PowersetSampler[T]): def __iter__(self) -> Iterator[SampleT]: while True: for idx in self.iterindices(): - k = np.random.choice(np.arange(len(self._indices)), size=1).item() - subset = random_subset_of_size(self.complement([idx]), size=k) + k = self._rng.choice(np.arange(len(self._indices)), size=1).item() + subset = random_subset_of_size( + self.complement([idx]), size=k, seed=self._rng + ) yield idx, subset self._n_samples += 1 if self._n_samples == 0: # Empty index set @@ -397,3 +419,10 @@ def __iter__(self) -> Iterator[SampleT]: @classmethod def weight(cls, n: int, subset_len: int) -> float: return float(2 ** (n - 1)) if n > 0 else 1.0 + + +# TODO Replace by Intersection[StochasticSamplerMixin, PowersetSampler[T]] +# See https://github.com/python/typing/issues/213 +StochasticSampler = Union[ + UniformSampler, PermutationSampler, AntitheticSampler, RandomHierarchicalSampler +] diff --git a/src/pydvl/value/semivalues.py b/src/pydvl/value/semivalues.py index 62ec04c3a..5cec32d5c 100644 --- a/src/pydvl/value/semivalues.py +++ b/src/pydvl/value/semivalues.py @@ -73,7 +73,7 @@ import logging import math from enum import Enum -from typing import Protocol, Tuple, Type, TypeVar, cast +from typing import Optional, Protocol, Tuple, Type, TypeVar, cast import numpy as np import scipy as sp @@ -81,8 +81,14 @@ from tqdm import tqdm from pydvl.utils import ParallelConfig, Utility +from pydvl.utils.types import Seed from pydvl.value import ValuationResult -from pydvl.value.sampler import PermutationSampler, PowersetSampler, SampleT +from pydvl.value.sampler import ( + PermutationSampler, + PowersetSampler, + SampleT, + StochasticSampler, +) from pydvl.value.stopping import MaxUpdates, StoppingCriterion __all__ = [ @@ -259,10 +265,11 @@ def compute_shapley_semivalues( u: Utility, *, done: StoppingCriterion = MaxUpdates(100), - sampler_t: Type[PowersetSampler] = PermutationSampler, + sampler_t: Type[StochasticSampler] = PermutationSampler, n_jobs: int = 1, config: ParallelConfig = ParallelConfig(), progress: bool = False, + seed: Optional[Seed] = None, ) -> ValuationResult: """Computes Shapley values for a given utility function. @@ -280,13 +287,14 @@ def compute_shapley_semivalues( n_jobs: Number of parallel jobs to use. config: Object configuring parallel computation, with cluster address, number of cpus, etc. + seed: Either an instance of a numpy random number generator or a seed for it. progress: Whether to display a progress bar. Returns: Object with the results. """ return compute_generic_semivalues( - sampler_t(u.data.indices), + sampler_t(u.data.indices, seed=seed), u, shapley_coefficient, done, @@ -300,10 +308,11 @@ def compute_banzhaf_semivalues( u: Utility, *, done: StoppingCriterion = MaxUpdates(100), - sampler_t: Type[PowersetSampler] = PermutationSampler, + sampler_t: Type[StochasticSampler] = PermutationSampler, n_jobs: int = 1, config: ParallelConfig = ParallelConfig(), progress: bool = False, + seed: Optional[Seed] = None, ) -> ValuationResult: """Computes Banzhaf values for a given utility function. @@ -317,6 +326,7 @@ def compute_banzhaf_semivalues( sampler_t: The sampler type to use. See :mod:`pydvl.value.sampler` for a list. n_jobs: Number of parallel jobs to use. + seed: Either an instance of a numpy random number generator or a seed for it. config: Object configuring parallel computation, with cluster address, number of cpus, etc. progress: Whether to display a progress bar. 
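A sketch of the reproducibility guarantee that `StochasticSamplerMixin` provides to the seeded entry points above. The index set and number of draws are arbitrary; each sampler yields `(index, subset)` pairs, as per the iterators in this diff:

```python
import numpy as np

from pydvl.value.sampler import UniformSampler

indices = np.arange(8)

# Two samplers constructed with the same seed produce identical streams of
# (index, subset) samples. This is what makes seeded runs such as
# compute_shapley_semivalues(u, seed=42) reproducible.
sampler_a = UniformSampler(indices, seed=42)
sampler_b = UniformSampler(indices, seed=42)

for _, (idx_a, subset_a), (idx_b, subset_b) in zip(range(16), sampler_a, sampler_b):
    assert idx_a == idx_b
    assert set(subset_a) == set(subset_b)
```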
@@ -325,7 +335,7 @@ def compute_banzhaf_semivalues( Object with the results. """ return compute_generic_semivalues( - sampler_t(u.data.indices), + sampler_t(u.data.indices, seed=seed), u, banzhaf_coefficient, done, @@ -341,10 +351,11 @@ def compute_beta_shapley_semivalues( alpha: float = 1, beta: float = 1, done: StoppingCriterion = MaxUpdates(100), - sampler_t: Type[PowersetSampler] = PermutationSampler, + sampler_t: Type[StochasticSampler] = PermutationSampler, n_jobs: int = 1, config: ParallelConfig = ParallelConfig(), progress: bool = False, + seed: Optional[Seed] = None, ) -> ValuationResult: """Computes Beta Shapley values for a given utility function. @@ -359,14 +370,16 @@ def compute_beta_shapley_semivalues( done: Stopping criterion. sampler_t: The sampler type to use. See :mod:`pydvl.value.sampler` for a list. n_jobs: Number of parallel jobs to use. - config: Object configuring parallel computation, with cluster address, number of cpus, etc. + seed: Either an instance of a numpy random number generator or a seed for it. + config: Object configuring parallel computation, with cluster address, number of + cpus, etc. progress: Whether to display a progress bar. Returns: Object with the results. """ return compute_generic_semivalues( - sampler_t(u.data.indices), + sampler_t(u.data.indices, seed=seed), u, beta_coefficient(alpha, beta), done, @@ -400,8 +413,9 @@ def compute_semivalues( *, done: StoppingCriterion = MaxUpdates(100), mode: SemiValueMode = SemiValueMode.Shapley, - sampler_t: Type[PowersetSampler] = PermutationSampler, + sampler_t: Type[StochasticSampler] = PermutationSampler, n_jobs: int = 1, + seed: Optional[Seed] = None, **kwargs, ) -> ValuationResult: """Convenience entry point for most common semi-value computations. @@ -444,13 +458,13 @@ def compute_semivalues( sampler_t: The sampler type to use. See [sampler][pydvl.value.sampler] for a list. n_jobs: Number of parallel jobs to use. + seed: Either an instance of a numpy random number generator or a seed for it. kwargs: Additional keyword arguments passed to [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues]. Returns: Object with the results. """ - sampler_instance = sampler_t(u.data.indices) if mode == SemiValueMode.Shapley: coefficient = shapley_coefficient elif mode == SemiValueMode.BetaShapley: @@ -463,5 +477,10 @@ def compute_semivalues( raise ValueError(f"Unknown mode {mode}") coefficient = cast(SVCoefficient, coefficient) return compute_generic_semivalues( - sampler_instance, u, coefficient, done, n_jobs=n_jobs, **kwargs + sampler_t(u.data.indices, seed=seed), + u, + coefficient, + done, + n_jobs=n_jobs, + **kwargs, ) diff --git a/src/pydvl/value/shapley/common.py b/src/pydvl/value/shapley/common.py index b55e018c8..97f4e2945 100644 --- a/src/pydvl/value/shapley/common.py +++ b/src/pydvl/value/shapley/common.py @@ -1,4 +1,7 @@ +from typing import Optional + from pydvl.utils import Utility +from pydvl.utils.types import Seed from pydvl.value.result import ValuationResult from pydvl.value.shapley.gt import group_testing_shapley from pydvl.value.shapley.knn import knn_shapley @@ -24,6 +27,7 @@ def compute_shapley_values( done: StoppingCriterion = MaxUpdates(100), mode: ShapleyMode = ShapleyMode.TruncatedMontecarlo, n_jobs: int = 1, + seed: Optional[Seed] = None, **kwargs, ) -> ValuationResult: """Umbrella method to compute Shapley values with any of the available @@ -85,6 +89,7 @@ def compute_shapley_values( criteria using boolean operators. 
Some methods ignore this argument, others require specific subtypes. n_jobs: Number of parallel jobs (available only to some methods) + seed: Either an instance of a numpy random number generator or a seed for it. mode: Choose which shapley algorithm to use. See [ShapleyMode][pydvl.value.shapley.ShapleyMode] for a list of allowed value. @@ -104,11 +109,11 @@ def compute_shapley_values( ): truncation = kwargs.pop("truncation", NoTruncation()) return permutation_montecarlo_shapley( # type: ignore - u=u, done=done, truncation=truncation, n_jobs=n_jobs, **kwargs + u=u, done=done, truncation=truncation, n_jobs=n_jobs, seed=seed, **kwargs ) elif mode == ShapleyMode.CombinatorialMontecarlo: return combinatorial_montecarlo_shapley( - u, done=done, n_jobs=n_jobs, progress=progress + u, done=done, n_jobs=n_jobs, seed=seed, progress=progress ) elif mode == ShapleyMode.CombinatorialExact: return combinatorial_exact_shapley(u, n_jobs=n_jobs, progress=progress) @@ -131,6 +136,7 @@ def compute_shapley_values( max_q=int(kwargs.get("max_q", -1)), method=method, n_jobs=n_jobs, + seed=seed, ) elif mode == ShapleyMode.KNN: return knn_shapley(u, progress=progress) @@ -149,6 +155,7 @@ def compute_shapley_values( n_samples=int(n_samples), n_jobs=n_jobs, progress=progress, + seed=seed, **kwargs, ) else: diff --git a/src/pydvl/value/shapley/gt.py b/src/pydvl/value/shapley/gt.py index 1ab20a07c..cc207129f 100644 --- a/src/pydvl/value/shapley/gt.py +++ b/src/pydvl/value/shapley/gt.py @@ -22,16 +22,18 @@ """ import logging from collections import namedtuple -from typing import Iterable, Tuple, TypeVar, cast +from typing import Iterable, Optional, Tuple, TypeVar, Union, cast import cvxpy as cp import numpy as np +from numpy.random import SeedSequence from numpy.typing import NDArray from pydvl.utils import MapReduceJob, ParallelConfig, Utility, maybe_progress from pydvl.utils.numeric import random_subset_of_size from pydvl.utils.parallel.backend import effective_n_jobs from pydvl.utils.status import Status +from pydvl.utils.types import Seed, ensure_seed_sequence from pydvl.value import ValuationResult __all__ = ["group_testing_shapley", "num_samples_eps_delta"] @@ -121,7 +123,11 @@ def num_samples_eps_delta( def _group_testing_shapley( - u: Utility, n_samples: int, progress: bool = False, job_id: int = 1 + u: Utility, + n_samples: int, + progress: bool = False, + job_id: int = 1, + seed: Optional[Union[Seed, SeedSequence]] = None, ): """Helper function for [group_testing_shapley()][pydvl.value.shapley.gt.group_testing_shapley]. @@ -134,11 +140,11 @@ def _group_testing_shapley( n_samples: total number of samples (subsets) to use. progress: Whether to display progress bars for each job. job_id: id to use for reporting progress (e.g. to place progres bars) - + seed: Either an instance of a numpy random number generator or a seed for it. 
Returns: """ - rng = np.random.default_rng() + rng = np.random.default_rng(seed) n = len(u.data.indices) const = _constants(n, 1, 1, 1) # don't care about eps,delta,range @@ -149,7 +155,7 @@ def _group_testing_shapley( for t in maybe_progress(n_samples, progress=progress, position=job_id): k = rng.choice(const.kk, size=1, p=const.q).item() - s = random_subset_of_size(u.data.indices, k) + s = random_subset_of_size(u.data.indices, k, seed=rng) uu[t] = u(s) betas[t, s] = 1 return uu, betas @@ -164,6 +170,7 @@ def group_testing_shapley( n_jobs: int = 1, config: ParallelConfig = ParallelConfig(), progress: bool = False, + seed: Optional[Seed] = None, **options, ) -> ValuationResult: """Implements group testing for approximation of Shapley values as described @@ -194,6 +201,7 @@ def group_testing_shapley( config: Object configuring parallel computation, with cluster address, number of cpus, etc. progress: Whether to display progress bars for each job. + seed: Either an instance of a numpy random number generator or a seed for it. options: Additional options to pass to [cvxpy.Problem.solve()](https://www.cvxpy.org/tutorial/advanced/index.html#solve-method-options). E.g. to change the solver (which defaults to `cvxpy.SCS`) pass @@ -233,6 +241,9 @@ def reducer( np.float_ ), np.concatenate(list(x[1] for x in results_it)).astype(np.int_) + seed_sequence = ensure_seed_sequence(seed) + map_reduce_seed_sequence, cvxpy_seed = tuple(seed_sequence.spawn(2)) + map_reduce_job: MapReduceJob[Utility, Tuple[NDArray, NDArray]] = MapReduceJob( u, map_func=_group_testing_shapley, @@ -241,7 +252,7 @@ def reducer( config=config, n_jobs=n_jobs, ) - uu, betas = map_reduce_job() + uu, betas = map_reduce_job(seed=map_reduce_seed_sequence) # Matrix of estimated differences. See Eqs. (3) and (4) in the paper. C = np.zeros(shape=(n, n)) diff --git a/src/pydvl/value/shapley/montecarlo.py b/src/pydvl/value/shapley/montecarlo.py index e9d59445a..06c4e023f 100644 --- a/src/pydvl/value/shapley/montecarlo.py +++ b/src/pydvl/value/shapley/montecarlo.py @@ -48,10 +48,11 @@ from concurrent.futures import FIRST_COMPLETED, Future, wait from functools import reduce from itertools import cycle, takewhile -from typing import Sequence +from typing import Optional, Sequence, Union import numpy as np from deprecate import deprecated +from numpy.random import SeedSequence from numpy.typing import NDArray from tqdm import tqdm @@ -59,10 +60,11 @@ from pydvl.utils.config import ParallelConfig from pydvl.utils.numeric import random_powerset from pydvl.utils.parallel import CancellationPolicy, MapReduceJob +from pydvl.utils.types import Seed, ensure_seed_sequence from pydvl.utils.utility import Utility from pydvl.value.result import ValuationResult from pydvl.value.shapley.truncated import NoTruncation, TruncationPolicy -from pydvl.value.stopping import MaxChecks, StoppingCriterion +from pydvl.value.stopping import StoppingCriterion logger = logging.getLogger(__name__) @@ -70,7 +72,10 @@ def _permutation_montecarlo_one_step( - u: Utility, truncation: TruncationPolicy, algorithm_name: str + u: Utility, + truncation: TruncationPolicy, + algorithm_name: str, + seed: Optional[Union[Seed, SeedSequence]] = None, ) -> ValuationResult: """Helper function for [permutation_montecarlo_shapley()][pydvl.value.shapley.montecarlo.permutation_montecarlo_shapley]. @@ -82,6 +87,7 @@ def _permutation_montecarlo_one_step( processing a permutation and set all subsequent marginals to zero. algorithm_name: For the results object. 
Used internally by different variants of Shapley using this subroutine
+        seed: Either an instance of a numpy random number generator or a seed for it.
 
     Returns:
         An object with the results
@@ -90,9 +96,8 @@
     result = ValuationResult.zeros(
         algorithm=algorithm_name, indices=u.data.indices, data_names=u.data.data_names
     )
-    prev_score = 0.0
 
-    permutation = np.random.permutation(u.data.indices)
+    permutation = np.random.default_rng(seed).permutation(u.data.indices)
     permutation_done = False
     truncation.reset()
     for i, idx in enumerate(permutation):
@@ -131,6 +136,7 @@ def permutation_montecarlo_shapley(
     n_jobs: int = 1,
     config: ParallelConfig = ParallelConfig(),
     progress: bool = False,
+    seed: Optional[Seed] = None,
 ) -> ValuationResult:
     r"""Computes an approximate Shapley value by sampling independent
     permutations of the index set, approximating the sum:
@@ -177,6 +183,7 @@
         config: Object configuring parallel computation, with cluster address,
             number of cpus, etc.
         progress: Whether to display a progress bar.
+        seed: Either an instance of a numpy random number generator or a seed for it.
 
     Returns:
         Object with the data values.
@@ -188,6 +195,7 @@
     max_workers = effective_n_jobs(n_jobs, config)
     n_submitted_jobs = 2 * max_workers  # number of jobs in the executor's queue
 
+    seed_sequence = ensure_seed_sequence(seed)
     result = ValuationResult.zeros(algorithm=algorithm)
 
     pbar = tqdm(disable=not progress, total=100, unit="%")
@@ -203,7 +211,6 @@
             completed, pending = wait(
                 pending, timeout=config.wait_timeout, return_when=FIRST_COMPLETED
             )
-
             for future in completed:
                 result += future.result()
                 # we could check outside the loop, but that means more
@@ -212,9 +219,15 @@
                     return result
 
             # Ensure that we always have n_submitted_jobs in the queue or running
-            for _ in range(n_submitted_jobs - len(pending)):
+            n_remaining_slots = n_submitted_jobs - len(pending)
+            seeds = seed_sequence.spawn(n_remaining_slots)
+            for i in range(n_remaining_slots):
                 future = executor.submit(
-                    _permutation_montecarlo_one_step, u, truncation, algorithm
+                    _permutation_montecarlo_one_step,
+                    u,
+                    truncation,
+                    algorithm,
+                    seed=seeds[i],
                 )
                 pending.add(future)
 
@@ -226,8 +239,10 @@ def _combinatorial_montecarlo_shapley(
     *,
     progress: bool = False,
     job_id: int = 1,
+    seed: Optional[Seed] = None,
 ) -> ValuationResult:
-    """Helper function for [combinatorial_montecarlo_shapley()][pydvl.value.shapley.montecarlo.combinatorial_montecarlo_shapley].
+    """Helper function for
+    [combinatorial_montecarlo_shapley][pydvl.value.shapley.montecarlo.combinatorial_montecarlo_shapley].
 
     This is the code that is sent to workers to compute values using the
     combinatorial definition.
@@ -238,6 +253,8 @@
         done: Check on the results which decides when to stop sampling
             subsets for an index.
         progress: Whether to display progress bars for each job.
+        seed: Either an instance of a numpy random number generator or a seed
+            for it.
job_id: id to use for reporting progress Returns: @@ -256,6 +273,7 @@ def _combinatorial_montecarlo_shapley( data_names=[u.data.data_names[i] for i in indices], ) + rng = np.random.default_rng(seed) repeat_indices = takewhile(lambda _: not done(result), cycle(indices)) pbar = tqdm(disable=not progress, position=job_id, total=100, unit="%") for idx in repeat_indices: @@ -263,7 +281,7 @@ def _combinatorial_montecarlo_shapley( pbar.refresh() # Randomly sample subsets of full dataset without idx subset = np.setxor1d(u.data.indices, [idx], assume_unique=True) - s = next(random_powerset(subset, n_samples=1)) + s = next(random_powerset(subset, n_samples=1, seed=rng)) marginal = (u({idx}.union(s)) - u(s)) / math.comb(n - 1, len(s)) result.update(idx, correction * marginal) @@ -277,6 +295,7 @@ def combinatorial_montecarlo_shapley( n_jobs: int = 1, config: ParallelConfig = ParallelConfig(), progress: bool = False, + seed: Optional[Seed] = None, ) -> ValuationResult: r"""Computes an approximate Shapley value using the combinatorial definition: @@ -305,6 +324,7 @@ def combinatorial_montecarlo_shapley( config: Object configuring parallel computation, with cluster address, number of cpus, etc. progress: Whether to display progress bars for each job. + seed: Either an instance of a numpy random number generator or a seed for it. Returns: Object with the data values. @@ -318,4 +338,4 @@ def combinatorial_montecarlo_shapley( n_jobs=n_jobs, config=config, ) - return map_reduce_job() + return map_reduce_job(seed=seed) diff --git a/src/pydvl/value/shapley/owen.py b/src/pydvl/value/shapley/owen.py index 86cfcf47f..a19682ef3 100644 --- a/src/pydvl/value/shapley/owen.py +++ b/src/pydvl/value/shapley/owen.py @@ -10,13 +10,14 @@ from enum import Enum from functools import reduce from itertools import cycle, takewhile -from typing import Sequence +from typing import Optional, Sequence import numpy as np from numpy.typing import NDArray from tqdm import tqdm from pydvl.utils import MapReduceJob, ParallelConfig, Utility, random_powerset +from pydvl.utils.types import Seed from pydvl.value import ValuationResult from pydvl.value.stopping import MinUpdates @@ -37,6 +38,7 @@ def _owen_sampling_shapley( *, progress: bool = False, job_id: int = 1, + seed: Optional[Seed] = None ) -> ValuationResult: r"""This is the algorithm as detailed in the paper: to compute the outer integral over q ∈ [0,1], use uniformly distributed points for evaluation @@ -57,6 +59,7 @@ def _owen_sampling_shapley( max_q: number of subdivisions for the integration over $q$ progress: Whether to display progress bars for each job job_id: For positioning of the progress bar + seed: Either an instance of a numpy random number generator or a seed for it. Returns: Object with the data values, errors. 
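The seed handling above (in `MapReduceJob.__call__`, `group_testing_shapley` and `permutation_montecarlo_shapley`) fans one user-facing seed out to workers via NumPy's `SeedSequence.spawn`. A standalone sketch of that contract, using plain NumPy only:

```python
import numpy as np
from numpy.random import SeedSequence, default_rng

n_workers = 4

# Spawning yields statistically independent child sequences, one per worker,
# so no generator state needs to be shared across processes.
streams = [
    default_rng(child).integers(0, 100, size=3)
    for child in SeedSequence(42).spawn(n_workers)
]

# Re-spawning from the same seed reproduces every per-worker stream.
streams_again = [
    default_rng(child).integers(0, 100, size=3)
    for child in SeedSequence(42).spawn(n_workers)
]
assert all(np.array_equal(a, b) for a, b in zip(streams, streams_again))
```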
@@ -70,6 +73,7 @@ def _owen_sampling_shapley(
         data_names=[u.data.data_names[i] for i in indices],
     )
 
+    rng = np.random.default_rng(seed)
     done = MinUpdates(1)
     repeat_indices = takewhile(lambda _: not done(result), cycle(indices))
     pbar = tqdm(disable=not progress, position=job_id, total=100, unit="%")
@@ -79,7 +83,7 @@
         e = np.zeros(max_q)
         subset = np.setxor1d(u.data.indices, [idx], assume_unique=True)
         for j, q in enumerate(q_steps):
-            for s in random_powerset(subset, n_samples=n_samples, q=q):
+            for s in random_powerset(subset, n_samples=n_samples, q=q, seed=rng):
                 marginal = u({idx}.union(s)) - u(s)
                 if method == OwenAlgorithm.Antithetic and q != 0.5:
                     s_complement = np.setxor1d(subset, s, assume_unique=True)
@@ -105,6 +109,7 @@ def owen_sampling_shapley(
     n_jobs: int = 1,
     config: ParallelConfig = ParallelConfig(),
     progress: bool = False,
+    seed: Optional[Seed] = None,
 ) -> ValuationResult:
     r"""Owen sampling of Shapley values as described in
     (Okhrati and Lipani, 2021)[^1].
@@ -153,6 +158,7 @@
         config: Object configuring parallel computation, with cluster address,
             number of cpus, etc.
         progress: Whether to display progress bars for each job.
+        seed: Either an instance of a numpy random number generator or a seed for it.
 
     Returns:
         Object with the data values.
@@ -178,4 +184,4 @@
         config=config,
     )
 
-    return map_reduce_job()
+    return map_reduce_job(seed=seed)
diff --git a/tests/conftest.py b/tests/conftest.py
index 9d82da504..2cf9de4f8 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -59,6 +59,16 @@ def seed(request):
     return 24
 
 
+@pytest.fixture()
+def seed_alt(request):
+    return 42
+
+
+@pytest.fixture()
+def collision_tol(request):
+    return 0.01
+
+
 @pytest.fixture(autouse=True)
 def pytorch_seed(seed):
     try:
diff --git a/tests/influence/test_torch_differentiable.py b/tests/influence/test_torch_differentiable.py
index 621288d6f..2747466a5 100644
--- a/tests/influence/test_torch_differentiable.py
+++ b/tests/influence/test_torch_differentiable.py
@@ -193,4 +193,4 @@ def test_inversion_methods(
     )
 
     assert np.allclose(linear_inverse, linear_cg, rtol=1e-1)
-    assert np.allclose(linear_inverse, linear_lissa, rtol=1e-1)
+    assert np.allclose(linear_inverse, linear_lissa, rtol=1e-1, atol=2e-1)
diff --git a/tests/utils/test_numeric.py b/tests/utils/test_numeric.py
index e6101defb..632290cdb 100644
--- a/tests/utils/test_numeric.py
+++ b/tests/utils/test_numeric.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pytest
+from numpy.typing import NDArray
 
 from pydvl.utils.numeric import (
     powerset,
@@ -8,6 +9,7 @@
     random_subset_of_size,
     running_moments,
 )
+from pydvl.utils.types import Seed
 
 
 def test_powerset():
@@ -68,6 +70,57 @@ def test_random_powerset(n, max_subsets):
     )
 
 
+@pytest.mark.parametrize("n, max_subsets", [(10, 2**10)])
+def test_random_powerset_reproducible(n, max_subsets, seed):
+    """
+    Test that the same seed produces the same results for
+    :func:`random_powerset`.
+    """
+    n_collisions = _count_random_powerset_generator_collisions(
+        n, max_subsets, seed, seed
+    )
+    assert n_collisions == max_subsets
+
+
+@pytest.mark.parametrize("n, max_subsets", [(10, 2**10)])
+def test_random_powerset_stochastic(n, max_subsets, seed, seed_alt, collision_tol):
+    """
+    Test that different seeds produce different results for
+    :func:`random_powerset`.
+    """
+    n_collisions = _count_random_powerset_generator_collisions(
+        n, max_subsets, seed, seed_alt
+    )
+    assert n_collisions / max_subsets < collision_tol
+
+
+def _count_random_powerset_generator_collisions(
+    n: int, max_subsets: int, seed: Seed, seed_alt: Seed
+):
+    """
+    Count the number of collisions between two generators of random subsets of a set
+    with `n` elements, each generating `max_subsets` subsets, using two different
+    seeds.
+
+    Args:
+        n: number of elements in the set.
+        max_subsets: number of subsets to generate.
+        seed: Seed for the first generator.
+        seed_alt: Seed for the second generator.
+
+    Returns:
+        Number of collisions between the two generators.
+    """
+    s = np.arange(n)
+    parallel_subset_generators = zip(
+        random_powerset(s, n_samples=max_subsets, seed=seed),
+        random_powerset(s, n_samples=max_subsets, seed=seed_alt),
+    )
+    n_collisions = sum(
+        map(lambda t: set(t[0]) == set(t[1]), parallel_subset_generators)
+    )
+    return n_collisions
+
+
 @pytest.mark.parametrize(
     "n, size, exception",
     [(0, 0, None), (0, 1, ValueError), (10, 0, None), (10, 3, None), (1000, 40, None)],
@@ -83,6 +136,36 @@ def test_random_subset_of_size(n, size, exception):
     assert np.all([x in s for x in ss])
 
 
+@pytest.mark.parametrize(
+    "n, size",
+    [(10, 3), (1000, 40)],
+)
+def test_random_subset_of_size_stochastic(n, size, seed, seed_alt):
+    """
+    Test that different seeds produce different results for
+    :func:`random_subset_of_size`.
+    """
+    s = np.arange(n)
+    subset_1 = random_subset_of_size(s, size=size, seed=seed)
+    subset_2 = random_subset_of_size(s, size=size, seed=seed_alt)
+    assert set(subset_1) != set(subset_2)
+
+
+@pytest.mark.parametrize(
+    "n, size",
+    [(10, 3), (1000, 40)],
+)
+def test_random_subset_of_size_reproducible(n, size, seed):
+    """
+    Test that the same seed produces the same results for
+    :func:`random_subset_of_size`.
+    """
+    s = np.arange(n)
+    subset_1 = random_subset_of_size(s, size=size, seed=seed)
+    subset_2 = random_subset_of_size(s, size=size, seed=seed)
+    assert set(subset_1) == set(subset_2)
+
+
 @pytest.mark.parametrize(
     "n, cond, exception",
     [
@@ -109,6 +192,34 @@ def test_random_matrix_with_condition_number(n, cond, exception):
         pytest.fail("Matrix is not positive definite")
 
 
+@pytest.mark.parametrize(
+    "n, cond",
+    [
+        (2, 10),
+        (7, 23),
+        (10, 2),
+    ],
+)
+def test_random_matrix_with_condition_number_reproducible(n, cond, seed):
+    mat_1 = random_matrix_with_condition_number(n, cond, seed=seed)
+    mat_2 = random_matrix_with_condition_number(n, cond, seed=seed)
+    assert np.all(mat_1 == mat_2)
+
+
+@pytest.mark.parametrize(
+    "n, cond",
+    [
+        (2, 10),
+        (7, 23),
+        (10, 2),
+    ],
+)
+def test_random_matrix_with_condition_number_stochastic(n, cond, seed, seed_alt):
+    mat_1 = random_matrix_with_condition_number(n, cond, seed=seed)
+    mat_2 = random_matrix_with_condition_number(n, cond, seed=seed_alt)
+    assert np.any(mat_1 != mat_2)
+
+
 def test_running_moments():
     """Test that running moments are correct."""
     n_samples, n_values = 15, 1000
diff --git a/tests/utils/test_parallel.py b/tests/utils/test_parallel.py
index e3b3fa0fc..8ba145aa8 100644
--- a/tests/utils/test_parallel.py
+++ b/tests/utils/test_parallel.py
@@ -2,6 +2,7 @@
 import os
 import time
 from functools import partial, reduce
+from typing import List, Optional
 
 import numpy as np
 import pytest
@@ -9,6 +10,7 @@
 from pydvl.utils.parallel import MapReduceJob, init_parallel_backend
 from pydvl.utils.parallel.backend import effective_n_jobs
 from pydvl.utils.parallel.futures import init_executor
+from pydvl.utils.types import Seed
 
 
 def test_effective_n_jobs(parallel_config, num_workers):
@@ -145,6 +147,31 @@ def reduce_func(x, y):
     assert result == 150
 
 
+@pytest.mark.parametrize(
+    "seed_1, seed_2, op",
+    [
+        (None, None, operator.ne),
+        (None, 42, operator.ne),
+        (42, None, operator.ne),
+        (42, 42, operator.eq),
+    ],
+)
+def test_map_reduce_seeding(parallel_config, seed_1, seed_2, op):
+    """Test that the same seed produces the same result, and that different
+    seeds produce different results.
+    """
+
+    map_reduce_job = MapReduceJob(
+        None,
+        map_func=_sum_of_random_integers,
+        reduce_func=np.mean,
+        config=parallel_config,
+    )
+    result_1 = map_reduce_job(seed=seed_1)
+    result_2 = map_reduce_job(seed=seed_2)
+    assert op(result_1, result_2)
+
+
 def test_wrap_function(parallel_config, num_workers):
     if parallel_config.backend != "ray":
         pytest.skip("Only makes sense for ray")
@@ -229,3 +256,11 @@ def test_future_cancellation(parallel_config):
             future.result()
 
     assert time.monotonic() - start < 1
+
+
+# Helper function for test :func:`test_map_reduce_seeding`.
+def _sum_of_random_integers(x: None, seed: Optional[Seed] = None): + rng = np.random.default_rng(seed) + values = rng.integers(0, rng.integers(10, 100), 10) + return np.sum(values) diff --git a/tests/value/shapley/test_montecarlo.py b/tests/value/shapley/test_montecarlo.py index 0f92cde29..b7961cdb8 100644 --- a/tests/value/shapley/test_montecarlo.py +++ b/tests/value/shapley/test_montecarlo.py @@ -1,18 +1,29 @@ import logging +from copy import copy, deepcopy import numpy as np import pytest from sklearn.linear_model import LinearRegression -from pydvl.utils import GroupedDataset, MemcachedConfig, Status, Utility +from pydvl.utils import ( + Dataset, + GroupedDataset, + MemcachedConfig, + ParallelConfig, + Status, + Utility, +) from pydvl.utils.numeric import num_samples_permutation_hoeffding from pydvl.utils.score import Scorer, squashed_r2 +from pydvl.utils.types import Seed from pydvl.value import compute_shapley_values from pydvl.value.shapley import ShapleyMode from pydvl.value.shapley.naive import combinatorial_exact_shapley from pydvl.value.stopping import MaxChecks, MaxUpdates from .. import check_rank_correlation, check_total_value, check_values +from ..conftest import polynomial_dataset +from ..utils import call_fn_multiple_seeds log = logging.getLogger(__name__) @@ -62,6 +73,74 @@ def test_analytic_montecarlo_shapley( check_values(values, exact_values, rtol=rtol, atol=atol) +test_cases_montecarlo_shapley_reproducible_stochastic = [ + # TODO Add once issue #416 is closed. + # (12, ShapleyMode.PermutationMontecarlo, {"done": MaxChecks(1)}), + ( + 12, + ShapleyMode.CombinatorialMontecarlo, + {"done": MaxChecks(4)}, + ), + (12, ShapleyMode.Owen, dict(n_samples=4, max_q=200)), + (12, ShapleyMode.OwenAntithetic, dict(n_samples=4, max_q=200)), + (4, ShapleyMode.GroupTesting, dict(n_samples=21, epsilon=0.2, delta=0.01)), +] + + +@pytest.mark.parametrize( + "num_samples, fun, kwargs", test_cases_montecarlo_shapley_reproducible_stochastic +) +@pytest.mark.parametrize("num_points, num_features", [(12, 3)]) +def test_montecarlo_shapley_housing_dataset_reproducible( + num_samples: int, + housing_dataset: Dataset, + parallel_config: ParallelConfig, + n_jobs: int, + fun: ShapleyMode, + kwargs: dict, + seed: Seed, +): + values_1, values_2 = call_fn_multiple_seeds( + compute_shapley_values, + Utility(LinearRegression(), data=housing_dataset, scorer="r2"), + mode=fun, + n_jobs=n_jobs, + config=parallel_config, + progress=False, + seeds=(seed, seed), + **deepcopy(kwargs) + ) + np.testing.assert_equal(values_1.values, values_2.values) + + +@pytest.mark.parametrize( + "num_samples, fun, kwargs", test_cases_montecarlo_shapley_reproducible_stochastic +) +@pytest.mark.parametrize("num_points, num_features", [(12, 4)]) +def test_montecarlo_shapley_housing_dataset_stochastic( + num_samples: int, + housing_dataset: Dataset, + parallel_config: ParallelConfig, + n_jobs: int, + fun: ShapleyMode, + kwargs: dict, + seed: Seed, + seed_alt: Seed, +): + values_1, values_2 = call_fn_multiple_seeds( + compute_shapley_values, + Utility(LinearRegression(), data=housing_dataset, scorer="r2"), + mode=fun, + n_jobs=n_jobs, + config=parallel_config, + progress=False, + seeds=(seed, seed_alt), + **deepcopy(kwargs) + ) + with pytest.raises(AssertionError): + np.testing.assert_equal(values_1.values, values_2.values) + + @pytest.mark.parametrize("num_samples, delta, eps", [(8, 0.1, 0.1)]) @pytest.mark.parametrize( "fun", [ShapleyMode.PermutationMontecarlo, ShapleyMode.CombinatorialMontecarlo] diff --git 
a/tests/value/test_sampler.py b/tests/value/test_sampler.py
index 570d11ed9..5fbf8f5c0 100644
--- a/tests/value/test_sampler.py
+++ b/tests/value/test_sampler.py
@@ -1,15 +1,18 @@
 from itertools import takewhile
+from typing import Iterator, List, Type
 
 import numpy as np
 import pytest
 
 from pydvl.utils import powerset
+from pydvl.utils.types import Seed
 from pydvl.value.sampler import (
     AntitheticSampler,
     DeterministicPermutationSampler,
     DeterministicUniformSampler,
     PermutationSampler,
     RandomHierarchicalSampler,
+    StochasticSampler,
     UniformSampler,
 )
 
@@ -19,6 +22,7 @@
     [
         DeterministicUniformSampler,
         UniformSampler,
+        DeterministicPermutationSampler,
         PermutationSampler,
         AntitheticSampler,
         RandomHierarchicalSampler,
@@ -35,6 +39,44 @@ def test_proper(sampler_class, indices):
         assert set(subset) in subsets
 
 
+@pytest.mark.parametrize(
+    "sampler_class",
+    [
+        UniformSampler,
+        PermutationSampler,
+        AntitheticSampler,
+        RandomHierarchicalSampler,
+    ],
+)
+@pytest.mark.parametrize("indices", [(), (list(range(100)))])
+def test_proper_reproducible(sampler_class, indices, seed):
+    """Test that the sampler is reproducible."""
+    samples_1 = _create_seeded_sample_iter(sampler_class, indices, seed)
+    samples_2 = _create_seeded_sample_iter(sampler_class, indices, seed)
+
+    for (_, subset_1), (_, subset_2) in zip(samples_1, samples_2):
+        assert set(subset_1) == set(subset_2)
+
+
+@pytest.mark.parametrize(
+    "sampler_class",
+    [
+        UniformSampler,
+        PermutationSampler,
+        AntitheticSampler,
+        RandomHierarchicalSampler,
+    ],
+)
+@pytest.mark.parametrize("indices", [(), (list(range(100)))])
+def test_proper_stochastic(sampler_class, indices, seed, seed_alt):
+    """Test that different seeds produce different samples."""
+    samples_1 = _create_seeded_sample_iter(sampler_class, indices, seed)
+    samples_2 = _create_seeded_sample_iter(sampler_class, indices, seed_alt)
+
+    for (_, subset_1), (_, subset_2) in zip(samples_1, samples_2):
+        assert len(subset_1) == 0 or set(subset_1) != set(subset_2)
+
+
 @pytest.mark.parametrize(
     "sampler_class",
     [
@@ -70,3 +112,14 @@ def test_chunkify_permutation(sampler_class):
 
 # Missing tests for:
 # - Correct distribution of subsets for random samplers
+
+
+def _create_seeded_sample_iter(
+    sampler_t: Type[StochasticSampler],
+    indices: List,
+    seed: Seed,
+) -> Iterator:
+    max_iterations = len(indices)
+    sampler = sampler_t(indices=np.array(indices), seed=seed)
+    sample_stream = takewhile(lambda _: sampler.n_samples < max_iterations, sampler)
+    return sample_stream
diff --git a/tests/value/test_semivalues.py b/tests/value/test_semivalues.py
index 23fca06eb..ec937d028 100644
--- a/tests/value/test_semivalues.py
+++ b/tests/value/test_semivalues.py
@@ -1,10 +1,10 @@
 import math
-from typing import Dict, Type
+from typing import Type
 
 import numpy as np
 import pytest
 
-from pydvl.utils import ParallelConfig, Utility
+from pydvl.utils import ParallelConfig
 from pydvl.value.sampler import (
     AntitheticSampler,
     DeterministicPermutationSampler,
diff --git a/tests/value/utils.py b/tests/value/utils.py
new file mode 100644
index 000000000..7c38e344f
--- /dev/null
+++ b/tests/value/utils.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from copy import deepcopy
+from typing import Callable, Tuple
+
+from pydvl.utils.types import Seed
+
+
+def call_fn_multiple_seeds(
+    fn: Callable, *args, seeds: Tuple[Seed, ...], **kwargs
+) -> Tuple:
+    """
+    Execute a function multiple times with different seeds. It copies the arguments
+    and keyword arguments before passing them to the function.
+ + Args: + fn: The function to execute. + args: The arguments to pass to the function. + seeds: The seeds to use. + kwargs: The keyword arguments to pass to the function. + + Returns: + A tuple of the results of the function. + """ + return tuple(fn(*deepcopy(args), **deepcopy(kwargs), seed=seed) for seed in seeds)
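A sketch of how this helper is intended to be driven (assuming it is importable from the repository's test tree, e.g. under pytest; `sample_mean` is a toy stochastic function made up for the example):

```python
import numpy as np

from tests.value.utils import call_fn_multiple_seeds


def sample_mean(n: int, seed=None) -> float:
    """Toy stochastic function: mean of `n` uniform draws."""
    rng = np.random.default_rng(seed)
    return float(rng.uniform(size=n).mean())


# Same seed twice -> identical results.
r1, r2 = call_fn_multiple_seeds(sample_mean, 100, seeds=(7, 7))
assert r1 == r2

# Different seeds -> different results (with overwhelming probability).
r3, r4 = call_fn_multiple_seeds(sample_mean, 100, seeds=(7, 8))
assert r3 != r4
```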