Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor owen #597

Merged
merged 8 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@
- Refactoring of least-core data valuation methods with more supported sampling methods
and consistent interface.
[PR #580](https://github.com/aai-institute/pyDVL/pull/580)
- Refactoring of Owen Shapley valuation with new sampler architecture
[PR #597](https://github.com/aai-institute/pyDVL/pull/597)

### Fixed

- Fix a bug in `pydvl.utils.numeric.random_subset` where `1 - q` was used instead
of `q` as the probability of an element being sampled
[PR #597](https://github.com/aai-institute/pyDVL/pull/597)

## 0.9.0 - 🆕 New methods, better docs and bugfixes 📚🐞

Expand Down
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ pytest-xdist>=3.3.1
pytest-cases~=3.8
pytest-split
pytest-sugar
pytest-rerunfailures
nbmake
wheel
twine==4.0.2
2 changes: 1 addition & 1 deletion src/pydvl/utils/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def random_subset(
The subset
"""
rng = np.random.default_rng(seed)
selection = rng.uniform(size=len(s)) > q
janosg marked this conversation as resolved.
Show resolved Hide resolved
selection = rng.uniform(size=len(s)) < q
return s[selection]


Expand Down
22 changes: 13 additions & 9 deletions src/pydvl/valuation/games.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from pydvl.valuation.dataset import Dataset
from pydvl.valuation.methods._solve_least_core_problems import LeastCoreProblem
from pydvl.valuation.result import ValuationResult
from pydvl.valuation.types import SampleT
from pydvl.valuation.utility.base import UtilityBase

__all__ = [
Expand Down Expand Up @@ -91,13 +92,16 @@ def __call__(self, sample: SampleT | None) -> float:
if sample is None or len(sample.subset) == 0:
return 0

if self.training_data is None:
raise ValueError("Utility object has no training data.")

idxs = np.array(sample.subset, dtype=np.int32)
try:
score = self.score(self.training_data.x[idxs])
score: float = self.score(self.training_data.x[idxs])
except (KeyboardInterrupt, SystemExit):
raise
except Exception:
score = 0
score = 0.0
return score

def with_dataset(self, dataset: Dataset):
Expand Down Expand Up @@ -209,7 +213,7 @@ def _score(self, X: NDArray) -> float:
@lru_cache
def shapley_values(self) -> ValuationResult:
exact_values = np.ones(self.n_players) / self.n_players
result: ValuationResult[np.int_, np.int_] = ValuationResult(
result = ValuationResult(
algorithm="exact_shapley",
status=Status.Converged,
indices=self.data.indices,
Expand Down Expand Up @@ -334,7 +338,7 @@ def _score(self, X: NDArray) -> float:

@lru_cache
def shapley_values(self) -> ValuationResult:
result: ValuationResult[np.int_, np.int_] = ValuationResult(
result = ValuationResult(
algorithm="exact_shapley",
status=Status.Converged,
indices=self.data.indices,
Expand Down Expand Up @@ -405,7 +409,7 @@ def shapley_values(self) -> ValuationResult:
value_left = precomputed_values[self.left, self.right]
value_right = precomputed_values[self.right, self.left]
exact_values = np.array([value_left] * self.left + [value_right] * self.right)
result: ValuationResult[np.int_, np.int_] = ValuationResult(
result = ValuationResult(
algorithm="exact_shapley",
status=Status.Converged,
indices=self.data.indices,
Expand All @@ -427,7 +431,7 @@ def least_core_values(self) -> ValuationResult:
subsidy = 0.0
exact_values = np.array([0.0] * self.left + [1.0] * self.right)

result: ValuationResult[np.int_, np.int_] = ValuationResult(
result = ValuationResult(
algorithm="exact_least_core",
status=Status.Converged,
indices=self.data.indices,
Expand Down Expand Up @@ -517,7 +521,7 @@ def _score(self, X: NDArray) -> float:

@lru_cache
def shapley_values(self) -> ValuationResult:
result: ValuationResult[np.int_, np.int_] = ValuationResult(
result = ValuationResult(
algorithm="exact_shapley",
status=Status.Converged,
indices=self.data.indices,
Expand Down Expand Up @@ -592,7 +596,7 @@ def _score(self, X: NDArray) -> float:
@lru_cache
def shapley_values(self) -> ValuationResult:
exact_values = 2 * np.ones_like(self.data.x)
result: ValuationResult[np.int_, np.int_] = ValuationResult(
result = ValuationResult(
algorithm="exact_shapley",
status=Status.Converged,
indices=self.data.indices,
Expand Down Expand Up @@ -657,7 +661,7 @@ def least_core_values(self) -> ValuationResult:
)
subsidy = (self.n_players - 1) / (2 * self.n_players)

result: ValuationResult[np.int_, np.int_] = ValuationResult(
result = ValuationResult(
algorithm="exact_least_core",
status=Status.Converged,
indices=self.data.indices,
Expand Down
4 changes: 3 additions & 1 deletion src/pydvl/valuation/methods/least_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from numpy.typing import NDArray
from tqdm.auto import tqdm

from pydvl.utils.types import Seed
from pydvl.valuation.base import Valuation
from pydvl.valuation.dataset import Dataset
from pydvl.valuation.methods._solve_least_core_problems import (
Expand Down Expand Up @@ -221,10 +222,11 @@ def __init__(
non_negative_subsidy: bool = False,
solver_options: dict | None = None,
progress: bool = True,
seed: Seed | None = None,
):
super().__init__(
utility=utility,
sampler=UniformSampler(index_iteration=NoIndexIteration),
sampler=UniformSampler(index_iteration=NoIndexIteration, seed=seed),
n_samples=n_samples,
non_negative_subsidy=non_negative_subsidy,
solver_options=solver_options,
Expand Down
74 changes: 74 additions & 0 deletions src/pydvl/valuation/methods/owen_shapley.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,81 @@
from __future__ import annotations

from typing import Any

from typing_extensions import Self

from pydvl.utils import Status
from pydvl.valuation.dataset import Dataset
from pydvl.valuation.methods.semivalue import SemivalueValuation
from pydvl.valuation.result import ValuationResult
from pydvl.valuation.samplers.powerset import OwenSampler
from pydvl.valuation.stopping import NoStopping
from pydvl.valuation.utility.base import UtilityBase

# NOTE(review): this string literal comes after the import statements above, so
# Python does not bind it as the module docstring (`__doc__`). TODO confirm
# whether the documentation tooling still picks up these references; if not,
# it should become the first statement of the module.
"""
## References

[^1]: <a name="okhrati_multilinear_2021"></a>Okhrati, R., Lipani, A., 2021.
[A Multilinear Sampling Algorithm to Estimate Shapley Values](https://ieeexplore.ieee.org/abstract/document/9412511).
In: 2020 25th International Conference on Pattern Recognition (ICPR), pp. 7992–7999. IEEE.
"""


class OwenShapleyValuation(SemivalueValuation):
    """Umbrella class to calculate Shapley values with Owen sampling schemes.

    Owen Shapley values converge to true Shapley values as the number of samples
    increases but have been shown to need fewer samples than other sampling schemes.

    The number of samples is governed by the sampler object. There are no convergence
    criteria for Owen Shapley values as they will just run for a fixed number of
    samples.

    Args:
        utility: Utility object with model and scoring function.
        sampler: Owen sampling scheme to use. Can be OwenSampler or
            AntitheticOwenSampler.
        progress: Whether to show a progress bar.

    """

    def __init__(
        self,
        utility: UtilityBase,
        sampler: OwenSampler,
        progress: dict[str, Any] | bool = False,
    ):
        # `NoStopping` disables convergence checks: the run length is determined
        # by the sampler alone.
        super().__init__(
            utility=utility,
            sampler=sampler,
            is_done=NoStopping(),
            progress=progress,
        )

    def fit(self, dataset: Dataset) -> Self:
        """Calculate the Owen Shapley values for a given dataset.

        This method has to be called before calling `values()`.

        Calculating the Owen Shapley valuation is a computationally expensive task
        that can be parallelized. To do so, call the `fit()` method inside a
        `joblib.parallel_config` context manager as follows:

        ```python
        from joblib import parallel_config

        with parallel_config(n_jobs=4):
            valuation.fit(data)
        ```

        Args:
            dataset: The dataset to compute values for.

        Returns:
            The fitted valuation object itself.

        """
        super().fit(dataset)
        # Since we bypassed the convergence checks (`NoStopping`), we need to set
        # the status to converged manually. The `None` check only serves to make
        # the type checker happy.
        if self.result is not None:
            self.result._status = Status.Converged
        return self

    def coefficient(self, n: int, k: int) -> float:
        """Return the coefficient applied to marginal utilities.

        The value is constant: both `n` and `k` are ignored.
        """
        return 1.0
115 changes: 111 additions & 4 deletions src/pydvl/valuation/samplers/powerset.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@
"PowersetSampler",
"TruncatedUniformStratifiedSampler",
"UniformSampler",
"OwenSampler",
"AntitheticOwenSampler",
"UniformStratifiedSampler",
"VarianceReducedStratifiedSampler",
"IndexIteration",
Expand Down Expand Up @@ -210,10 +212,7 @@ def process(
) -> list[ValueUpdate]:
r = []
for sample in batch:
assert sample.idx is not None
u_i = self.utility(
Sample(sample.idx, np.array(list({sample.idx}.union(sample.subset))))
)
u_i = self.utility(sample.with_idx_in_subset())
u = self.utility(sample)
marginal = (u_i - u) * self.coefficient(self.n_indices, len(sample.subset))
r.append(ValueUpdate(sample.idx, marginal))
Expand Down Expand Up @@ -369,6 +368,114 @@ def _generate(self, indices: IndexSetT) -> SampleGenerator:
yield Sample(idx, subset)


class OwenSampler(StochasticSamplerMixin, PowersetSampler):
    """A sampler for Owen Shapley values.

    Indices are visited sequentially. For each index $i$ the sampler walks over a
    deterministic, evenly spaced grid of `n_samples_outer` probabilities between
    0 and 1 and, for every grid point, draws `n_samples_inner` random subsets of
    the complement of $i$, including each element with that probability.

    Per index this yields `n_samples_outer * n_samples_inner` samples; over all
    indices the total is that number times the number of indices.

    Args:
        n_samples_outer: The number of entries in the probability grid used for
            the outer loop in Owen sampling.
        n_samples_inner: The number of samples drawn for each probability. In the
            original paper this was fixed to 2 for all experiments which is why we
            give it a default value of 2.
        batch_size: The batch size of the sampler.
        seed: The seed for the random number generator.

    """

    def __init__(
        self,
        n_samples_outer: int,
        n_samples_inner: int = 2,
        batch_size: int = 1,
        seed: Seed | None = None,
    ):
        super().__init__(
            batch_size=batch_size, index_iteration=SequentialIndexIteration, seed=seed
        )
        # Upper end of the probability grid; subclasses may overwrite it after
        # calling this constructor.
        self._q_stop = 1.0
        self._n_samples_outer = n_samples_outer
        self._n_samples_inner = n_samples_inner

    def _generate(self, indices: IndexSetT) -> SampleGenerator:
        """Yield, for every index, the random subsets drawn along the grid."""
        grid = np.linspace(start=0, stop=self._q_stop, num=self._n_samples_outer)
        for idx in self.index_iterator(indices):
            excluded = [] if idx is None else [idx]
            others = complement(indices, excluded)
            # Each grid probability is used `n_samples_inner` times in a row.
            repeated = (q for q in grid for _ in range(self._n_samples_inner))
            for q in repeated:
                yield Sample(idx, random_subset(others, q=q, seed=self._rng))

    @staticmethod
    def weight(n: int, subset_len: int) -> float:
        """Return the sampling weight, which is constant for this sampler."""
        return 1.0

    def sample_limit(self, indices: IndexSetT) -> int:
        """Return the total number of samples generated for `indices`."""
        n_indices = len(indices)
        return n_indices * self._n_samples_outer * self._n_samples_inner


class AntitheticOwenSampler(OwenSampler):
    """A sampler for antithetic Owen Shapley values.

    For each index $i$, the antithetic Owen sampler loops over a deterministic grid
    of probabilities (containing `n_samples_outer` entries between 0 and 0.5) and
    then draws `n_samples_inner` subsets of the complement of the current index,
    where each element is sampled with the given probability. Every sample obtained
    that way is immediately followed by a second one: its complement with respect
    to the remaining indices (the current index is excluded from both).

    Per index this yields `2 * n_samples_outer * n_samples_inner` samples.

    For the same number of total samples, the antithetic Owen sampler usually
    yields more precise estimates of Shapley values than the regular Owen sampler.

    Args:
        n_samples_outer: The number of entries in the probability grid used for
            the outer loop in Owen sampling.
        n_samples_inner: The number of samples drawn for each probability. In the
            original paper this was fixed to 2 for all experiments which is why we
            give it a default value of 2.
        batch_size: The batch size of the sampler.
        seed: The seed for the random number generator.

    """

    def __init__(
        self,
        n_samples_outer: int,
        n_samples_inner: int = 2,
        batch_size: int = 1,
        seed: Seed | None = None,
    ):
        super().__init__(
            n_samples_outer=n_samples_outer,
            n_samples_inner=n_samples_inner,
            batch_size=batch_size,
            seed=seed,
        )
        # Only the lower half of the probability range is covered by the grid.
        self._q_stop = 0.5

    def _generate(self, indices: IndexSetT) -> SampleGenerator:
        """Yield each Owen sample followed by its antithetic counterpart."""
        for base in super()._generate(indices):
            idx, subset = base
            taken = subset.tolist()
            if idx is not None:
                taken = [idx] + taken
            # Everything that is neither the current index nor in the subset.
            mirrored = complement(indices, taken)
            yield base
            yield Sample(idx, mirrored)

    def sample_limit(self, indices: IndexSetT) -> int:
        """Return twice the parent's limit: every sample comes with a mirror."""
        return super().sample_limit(indices) * 2


class AntitheticSampler(StochasticSamplerMixin, PowersetSampler):
"""An iterator to perform uniform random sampling of subsets, and their
complements.
Expand Down
Loading