From 92153150269d6bf30be174c0412485274d490abc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 20 Jun 2024 11:16:48 +0000 Subject: [PATCH] Deployed d13af356 to devel with MkDocs 1.5.3 and mike 2.0.0 --- devel/CHANGELOG/index.html | 74 ++++--- devel/api/pydvl/index.html | 4 +- devel/api/pydvl/influence/array/index.html | 4 +- .../base_influence_function_model/index.html | 4 +- devel/api/pydvl/influence/index.html | 4 +- .../influence/influence_calculator/index.html | 4 +- .../api/pydvl/influence/torch/base/index.html | 4 +- .../torch/batch_operation/index.html | 4 +- .../influence/torch/functional/index.html | 4 +- devel/api/pydvl/influence/torch/index.html | 4 +- .../torch/influence_function_model/index.html | 4 +- .../pydvl/influence/torch/operator/index.html | 4 +- .../torch/pre_conditioner/index.html | 4 +- .../api/pydvl/influence/torch/util/index.html | 4 +- devel/api/pydvl/influence/types/index.html | 4 +- devel/api/pydvl/parallel/backend/index.html | 4 +- devel/api/pydvl/parallel/backends/index.html | 4 +- .../pydvl/parallel/backends/joblib/index.html | 4 +- .../pydvl/parallel/backends/ray/index.html | 4 +- devel/api/pydvl/parallel/config/index.html | 4 +- devel/api/pydvl/parallel/futures/index.html | 4 +- .../api/pydvl/parallel/futures/ray/index.html | 4 +- devel/api/pydvl/parallel/index.html | 4 +- .../api/pydvl/parallel/map_reduce/index.html | 4 +- devel/api/pydvl/reporting/index.html | 4 +- devel/api/pydvl/reporting/plots/index.html | 22 +- devel/api/pydvl/reporting/scores/index.html | 10 +- devel/api/pydvl/utils/caching/base/index.html | 4 +- .../api/pydvl/utils/caching/config/index.html | 4 +- devel/api/pydvl/utils/caching/disk/index.html | 4 +- devel/api/pydvl/utils/caching/index.html | 4 +- .../pydvl/utils/caching/memcached/index.html | 4 +- .../api/pydvl/utils/caching/memory/index.html | 4 +- devel/api/pydvl/utils/config/index.html | 4 +- devel/api/pydvl/utils/dataset/index.html | 4 +- devel/api/pydvl/utils/exceptions/index.html | 4 +- devel/api/pydvl/utils/functional/index.html | 4 +- devel/api/pydvl/utils/index.html | 4 +- devel/api/pydvl/utils/numeric/index.html | 36 ++-- devel/api/pydvl/utils/progress/index.html | 4 +- devel/api/pydvl/utils/score/index.html | 4 +- devel/api/pydvl/utils/status/index.html | 4 +- devel/api/pydvl/utils/types/index.html | 4 +- devel/api/pydvl/utils/utility/index.html | 4 +- devel/api/pydvl/value/games/index.html | 4 +- devel/api/pydvl/value/index.html | 4 +- .../pydvl/value/least_core/common/index.html | 6 +- devel/api/pydvl/value/least_core/index.html | 4 +- .../value/least_core/montecarlo/index.html | 4 +- .../pydvl/value/least_core/naive/index.html | 4 +- devel/api/pydvl/value/loo/index.html | 4 +- devel/api/pydvl/value/loo/loo/index.html | 4 +- devel/api/pydvl/value/oob/index.html | 4 +- devel/api/pydvl/value/oob/oob/index.html | 4 +- devel/api/pydvl/value/result/index.html | 22 +- devel/api/pydvl/value/sampler/index.html | 4 +- devel/api/pydvl/value/semivalues/index.html | 4 +- .../pydvl/value/shapley/classwise/index.html | 10 +- .../api/pydvl/value/shapley/common/index.html | 4 +- devel/api/pydvl/value/shapley/gt/index.html | 6 +- devel/api/pydvl/value/shapley/index.html | 4 +- devel/api/pydvl/value/shapley/knn/index.html | 6 +- .../pydvl/value/shapley/montecarlo/index.html | 4 +- .../api/pydvl/value/shapley/naive/index.html | 4 +- devel/api/pydvl/value/shapley/owen/index.html | 4 +- .../pydvl/value/shapley/truncated/index.html | 4 +- 
.../api/pydvl/value/shapley/types/index.html | 4 +- devel/api/pydvl/value/stopping/index.html | 6 +- devel/search/search_index.json | 2 +- devel/sitemap.xml | 192 +++++++++--------- devel/sitemap.xml.gz | Bin 901 -> 900 bytes 71 files changed, 325 insertions(+), 299 deletions(-) diff --git a/devel/CHANGELOG/index.html b/devel/CHANGELOG/index.html index 65148d902..8d02edb9a 100644 --- a/devel/CHANGELOG/index.html +++ b/devel/CHANGELOG/index.html @@ -3274,6 +3274,15 @@ + + +
+ Replace np.float_ with np.float64 and np.alltrue with np.all,
+ as the old aliases are removed in NumPy 2.0
+ PR #604
  Fixed logging issue in decorator log_duration PR #567
  FutureWarning for ParallelConfig constantly raised without actually
  instantiating the object
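For readers unfamiliar with the rename, the following minimal, self-contained sketch shows the NumPy 2.0-compatible spellings this patch switches to (np.float_ and np.alltrue were removed in NumPy 2.0):

    import numpy as np

    # dtype=np.float_ no longer exists in NumPy 2.0; np.float64 is the replacement
    values = np.zeros(3, dtype=np.float64)

    # np.alltrue was removed as well; np.all is the replacement
    print(np.all(values == 0.0))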
@@ -4242,7 +4266,7 @@
  Bug in LissaInfluence, when not using CPU device PR #495
  Bug in using DaskInfluenceCalcualator with TorchnumpyConverter
  for single dimensional arrays PR #485
@@ -4327,7 +4351,7 @@
  Fix initialization of data_names in ValuationResult.zeros() PR #443

plot_influence_distribution(
- influences: NDArray[float_], index: int, title_extra: str = ""
+ influences: NDArray[float64], index: int, title_extra: str = ""
) -> Axes
NDArray[float_]
+ NDArray[float64]
@@ -5008,7 +5008,7 @@ def plot_influence_distribution(
- influences: NDArray[np.float_], index: int, title_extra: str = ""
+ influences: NDArray[np.float64], index: int, title_extra: str = ""
) -> plt.Axes:
"""Plots the histogram of the influence that all samples in the training set
have over a single sample index.
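A hedged usage sketch of this helper, assuming it is importable from pydvl.reporting.plots (the module whose page is being patched); the influence matrix here is random and purely illustrative:

    import numpy as np
    from pydvl.reporting.plots import plot_influence_distribution

    rng = np.random.default_rng(16)
    # Illustrative only: influences of 100 training points over 10 test points
    influences = rng.normal(size=(100, 10)).astype(np.float64)

    # Histogram of the influences that all training samples have over test point 0
    ax = plot_influence_distribution(influences, index=0, title_extra="toy data")
    ax.figure.savefig("influences_test_point_0.png")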
@@ -5042,8 +5042,10 @@ ¶
NDArray[float_]
+ NDArray[float64]
@@ -5132,7 +5134,7 @@ def plot_influence_distribution_by_label(
- influences: NDArray[np.float_], labels: NDArray[np.float_], title_extra: str = ""
+ influences: NDArray[np.float64], labels: NDArray[np.float64], title_extra: str = ""
):
"""Plots the histogram of the influence that all samples in the training set
have over a single sample index, separated by labels.
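Similarly, a sketch for the label-separated variant, again with purely illustrative random data and the same assumed import path:

    import numpy as np
    from pydvl.reporting.plots import plot_influence_distribution_by_label

    rng = np.random.default_rng(16)
    # Illustrative only: influences of 100 training points on a single test point,
    # plus one (binary) label per training point
    influences = rng.normal(size=100).astype(np.float64)
    labels = rng.integers(0, 2, size=100).astype(np.float64)

    plot_influence_distribution_by_label(influences, labels, title_extra="toy data")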
@@ -5186,7 +5188,7 @@
- 2024-06-14
+ 2024-06-20
@@ -5196,7 +5198,7 @@
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/reporting/scores/index.html b/devel/api/pydvl/reporting/scores/index.html
index 8b07378a3..3fa3628b6 100644
--- a/devel/api/pydvl/reporting/scores/index.html
+++ b/devel/api/pydvl/reporting/scores/index.html
@@ -3392,7 +3392,7 @@
compute_removal_score(
u: Utility,
values: ValuationResult,
- percentages: Union[NDArray[float_], Iterable[float]],
+ percentages: Union[NDArray[float64], Iterable[float]],
*,
remove_best: bool = False,
progress: bool = False
@@ -3451,7 +3451,7 @@
TYPE:
- Union[NDArray[float_], Iterable[float]]
+ Union[NDArray[float64], Iterable[float]]
def compute_removal_score(
u: Utility,
values: ValuationResult,
- percentages: Union[NDArray[np.float_], Iterable[float]],
+ percentages: Union[NDArray[np.float64], Iterable[float]],
*,
remove_best: bool = False,
progress: bool = False,
@@ -3638,7 +3638,7 @@
2024-06-14
+ 2024-06-20
-
@@ -3648,7 +3648,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/caching/base/index.html b/devel/api/pydvl/utils/caching/base/index.html
index 428725d14..1b53538c0 100644
--- a/devel/api/pydvl/utils/caching/base/index.html
+++ b/devel/api/pydvl/utils/caching/base/index.html
@@ -4557,7 +4557,7 @@
2024-06-14
+ 2024-06-20
@@ -4567,7 +4567,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/caching/config/index.html b/devel/api/pydvl/utils/caching/config/index.html
index 35e920dda..c08f3c8cc 100644
--- a/devel/api/pydvl/utils/caching/config/index.html
+++ b/devel/api/pydvl/utils/caching/config/index.html
@@ -3598,7 +3598,7 @@
2024-06-14
+ 2024-06-20
@@ -3608,7 +3608,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/caching/disk/index.html b/devel/api/pydvl/utils/caching/disk/index.html
index e0abf35ad..bc0ae5d3f 100644
--- a/devel/api/pydvl/utils/caching/disk/index.html
+++ b/devel/api/pydvl/utils/caching/disk/index.html
@@ -4106,7 +4106,7 @@
- 2024-06-14
+ 2024-06-20
@@ -4116,7 +4116,7 @@
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/caching/index.html b/devel/api/pydvl/utils/caching/index.html
index e46d5bfa0..d5518affb 100644
--- a/devel/api/pydvl/utils/caching/index.html
+++ b/devel/api/pydvl/utils/caching/index.html
@@ -3471,7 +3471,7 @@ Unexpected cache misses
- 2024-06-14
+ 2024-06-20
@@ -3481,7 +3481,7 @@ Unexpected cache misses
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/caching/memcached/index.html b/devel/api/pydvl/utils/caching/memcached/index.html
index 194fbaecb..f61652ac4 100644
--- a/devel/api/pydvl/utils/caching/memcached/index.html
+++ b/devel/api/pydvl/utils/caching/memcached/index.html
@@ -4408,7 +4408,7 @@
- 2024-06-14
+ 2024-06-20
@@ -4418,7 +4418,7 @@
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/caching/memory/index.html b/devel/api/pydvl/utils/caching/memory/index.html
index 47edaa36e..b677a4b33 100644
--- a/devel/api/pydvl/utils/caching/memory/index.html
+++ b/devel/api/pydvl/utils/caching/memory/index.html
@@ -4016,7 +4016,7 @@
- 2024-06-14
+ 2024-06-20
@@ -4026,7 +4026,7 @@
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/config/index.html b/devel/api/pydvl/utils/config/index.html
index e8b47b2ef..854236428 100644
--- a/devel/api/pydvl/utils/config/index.html
+++ b/devel/api/pydvl/utils/config/index.html
@@ -3767,7 +3767,7 @@
2024-06-14
+ 2024-06-20
@@ -3777,7 +3777,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/dataset/index.html b/devel/api/pydvl/utils/dataset/index.html
index 1c324126f..1faabac13 100644
--- a/devel/api/pydvl/utils/dataset/index.html
+++ b/devel/api/pydvl/utils/dataset/index.html
@@ -6794,7 +6794,7 @@
- 2024-06-14
+ 2024-06-20
@@ -6804,7 +6804,7 @@
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/exceptions/index.html b/devel/api/pydvl/utils/exceptions/index.html
index 1b91d7f22..15aa112db 100644
--- a/devel/api/pydvl/utils/exceptions/index.html
+++ b/devel/api/pydvl/utils/exceptions/index.html
@@ -3630,7 +3630,7 @@
- 2024-06-14
+ 2024-06-20
@@ -3640,7 +3640,7 @@
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/functional/index.html b/devel/api/pydvl/utils/functional/index.html
index 94e032981..2c6179368 100644
--- a/devel/api/pydvl/utils/functional/index.html
+++ b/devel/api/pydvl/utils/functional/index.html
@@ -3754,7 +3754,7 @@
2024-06-14
+ 2024-06-20
@@ -3764,7 +3764,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/index.html b/devel/api/pydvl/utils/index.html
index 3020cfc9e..5668f651b 100644
--- a/devel/api/pydvl/utils/index.html
+++ b/devel/api/pydvl/utils/index.html
@@ -3342,7 +3342,7 @@
2024-06-14
+ 2024-06-20
@@ -3352,7 +3352,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/numeric/index.html b/devel/api/pydvl/utils/numeric/index.html
index 66807ed6f..1b84e8137 100644
--- a/devel/api/pydvl/utils/numeric/index.html
+++ b/devel/api/pydvl/utils/numeric/index.html
@@ -4858,11 +4858,11 @@
running_moments(
- previous_avg: float | NDArray[float_],
- previous_variance: float | NDArray[float_],
+ previous_avg: float | NDArray[float64],
+ previous_variance: float | NDArray[float64],
count: int,
- new_value: float | NDArray[float_],
-) -> Tuple[float | NDArray[float_], float | NDArray[float_]]
+ new_value: float | NDArray[float64],
+) -> Tuple[float | NDArray[float64], float | NDArray[float64]]
TYPE:
- float | NDArray[float_]
+ float | NDArray[float64]
TYPE:
- float | NDArray[float_]
+ float | NDArray[float64]
Tuple[float | NDArray[float_], float | NDArray[float_]]
+ Tuple[float | NDArray[float64], float | NDArray[float64]]
def running_moments(
- previous_avg: float | NDArray[np.float_],
- previous_variance: float | NDArray[np.float_],
+ previous_avg: float | NDArray[np.float64],
+ previous_variance: float | NDArray[np.float64],
count: int,
- new_value: float | NDArray[np.float_],
-) -> Tuple[float | NDArray[np.float_], float | NDArray[np.float_]]:
+ new_value: float | NDArray[np.float64],
+) -> Tuple[float | NDArray[np.float64], float | NDArray[np.float64]]:
"""Uses Welford's algorithm to calculate the running average and variance of
a set of numbers.
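A small sketch of how a Welford-style update like this is typically driven: feed one value at a time and carry only the running mean, variance and count. The import path pydvl.utils.numeric comes from the file being patched; the convention that count is the number of points seen before the update is an assumption.

    import numpy as np
    from pydvl.utils.numeric import running_moments

    data = [1.0, 4.0, 2.0, 8.0, 5.0]
    avg, var, count = 0.0, 0.0, 0
    for x in data:
        # count is assumed to be the number of points seen before this update
        avg, var = running_moments(avg, var, count, x)
        count += 1

    print(avg, var)                     # running mean and variance
    print(np.mean(data), np.var(data))  # batch counterparts, for comparison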
@@ -5058,7 +5058,7 @@
top_k_value_accuracy(
- y_true: NDArray[float_], y_pred: NDArray[float_], k: int = 3
+ y_true: NDArray[float64], y_pred: NDArray[float64], k: int = 3
) -> float
@@ -5086,7 +5086,7 @@
TYPE:
- NDArray[float_]
+ NDArray[float64]
def top_k_value_accuracy(
- y_true: NDArray[np.float_], y_pred: NDArray[np.float_], k: int = 3
+ y_true: NDArray[np.float64], y_pred: NDArray[np.float64], k: int = 3
) -> float:
"""Computes the top-k accuracy for the estimated values by comparing indices
of the highest k values.
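The idea can be spelled out directly in NumPy. The sketch below mirrors the docstring (overlap of the k highest-valued indices) but is an illustrative re-implementation, not necessarily identical to pyDVL's:

    import numpy as np

    def top_k_overlap(y_true: np.ndarray, y_pred: np.ndarray, k: int = 3) -> float:
        """Fraction of the k highest-valued indices shared by both rankings."""
        top_true = np.argsort(y_true)[-k:]
        top_pred = np.argsort(y_pred)[-k:]
        return len(np.intersect1d(top_true, top_pred)) / k

    y_true = np.array([0.1, 0.9, 0.3, 0.7])
    y_pred = np.array([0.2, 0.8, 0.6, 0.1])
    print(top_k_overlap(y_true, y_pred, k=2))  # 0.5: only index 1 is in both top-2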
@@ -5223,7 +5223,7 @@
2024-06-14
+ 2024-06-20
-
@@ -5233,7 +5233,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/progress/index.html b/devel/api/pydvl/utils/progress/index.html
index d9068993d..98374744d 100644
--- a/devel/api/pydvl/utils/progress/index.html
+++ b/devel/api/pydvl/utils/progress/index.html
@@ -3648,7 +3648,7 @@
2024-06-14
+ 2024-06-20
@@ -3658,7 +3658,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/score/index.html b/devel/api/pydvl/utils/score/index.html
index 4f6b3831d..ec689869e 100644
--- a/devel/api/pydvl/utils/score/index.html
+++ b/devel/api/pydvl/utils/score/index.html
@@ -3954,7 +3954,7 @@
2024-06-14
+ 2024-06-20
@@ -3964,7 +3964,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/status/index.html b/devel/api/pydvl/utils/status/index.html
index ee6fd289e..aeb12f747 100644
--- a/devel/api/pydvl/utils/status/index.html
+++ b/devel/api/pydvl/utils/status/index.html
@@ -3538,7 +3538,7 @@ Boolean casting
- 2024-06-14
+ 2024-06-20
@@ -3548,7 +3548,7 @@ Boolean casting
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/types/index.html b/devel/api/pydvl/utils/types/index.html
index bea8d7558..7057304ae 100644
--- a/devel/api/pydvl/utils/types/index.html
+++ b/devel/api/pydvl/utils/types/index.html
@@ -3958,7 +3958,7 @@
2024-06-14
+ 2024-06-20
@@ -3968,7 +3968,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/utils/utility/index.html b/devel/api/pydvl/utils/utility/index.html
index 94fd033a8..e0eaddb6c 100644
--- a/devel/api/pydvl/utils/utility/index.html
+++ b/devel/api/pydvl/utils/utility/index.html
@@ -4225,7 +4225,7 @@
2024-06-14
+ 2024-06-20
@@ -4235,7 +4235,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/games/index.html b/devel/api/pydvl/value/games/index.html
index 4fdfe7c56..54027b829 100644
--- a/devel/api/pydvl/value/games/index.html
+++ b/devel/api/pydvl/value/games/index.html
@@ -5856,7 +5856,7 @@
2024-06-14
+ 2024-06-20
@@ -5866,7 +5866,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/index.html b/devel/api/pydvl/value/index.html
index 89672be6f..62074877b 100644
--- a/devel/api/pydvl/value/index.html
+++ b/devel/api/pydvl/value/index.html
@@ -3347,7 +3347,7 @@
2024-06-14
+ 2024-06-20
@@ -3357,7 +3357,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/least_core/common/index.html b/devel/api/pydvl/value/least_core/common/index.html
index 1e8f48783..d03454da5 100644
--- a/devel/api/pydvl/value/least_core/common/index.html
+++ b/devel/api/pydvl/value/least_core/common/index.html
@@ -3625,7 +3625,7 @@
solver_options=solver_options,
)
- values: Optional[NDArray[np.float_]]
+ values: Optional[NDArray[np.float64]]
if subsidy is None:
logger.debug("No values were found")
@@ -4038,7 +4038,7 @@
- 2024-06-14
+ 2024-06-20
@@ -4048,7 +4048,7 @@
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/least_core/index.html b/devel/api/pydvl/value/least_core/index.html
index 214fd2c8b..b4f59fc58 100644
--- a/devel/api/pydvl/value/least_core/index.html
+++ b/devel/api/pydvl/value/least_core/index.html
@@ -3754,7 +3754,7 @@
- 2024-06-14
+ 2024-06-20
@@ -3764,7 +3764,7 @@
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/least_core/montecarlo/index.html b/devel/api/pydvl/value/least_core/montecarlo/index.html
index e47c97fca..7c256f6d4 100644
--- a/devel/api/pydvl/value/least_core/montecarlo/index.html
+++ b/devel/api/pydvl/value/least_core/montecarlo/index.html
@@ -4013,7 +4013,7 @@
- 2024-06-14
+ 2024-06-20
@@ -4023,7 +4023,7 @@
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/least_core/naive/index.html b/devel/api/pydvl/value/least_core/naive/index.html
index 90e6ab933..b20670008 100644
--- a/devel/api/pydvl/value/least_core/naive/index.html
+++ b/devel/api/pydvl/value/least_core/naive/index.html
@@ -3782,7 +3782,7 @@
- 2024-06-14
+ 2024-06-20
@@ -3792,7 +3792,7 @@
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/loo/index.html b/devel/api/pydvl/value/loo/index.html
index 948d485c5..b8910a6bf 100644
--- a/devel/api/pydvl/value/loo/index.html
+++ b/devel/api/pydvl/value/loo/index.html
@@ -3344,7 +3344,7 @@
2024-06-14
+ 2024-06-20
@@ -3354,7 +3354,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/loo/loo/index.html b/devel/api/pydvl/value/loo/loo/index.html
index 3affed8bc..250571d5b 100644
--- a/devel/api/pydvl/value/loo/loo/index.html
+++ b/devel/api/pydvl/value/loo/loo/index.html
@@ -3771,7 +3771,7 @@
2024-06-14
+ 2024-06-20
@@ -3781,7 +3781,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/oob/index.html b/devel/api/pydvl/value/oob/index.html
index 995e30bfb..18f050403 100644
--- a/devel/api/pydvl/value/oob/index.html
+++ b/devel/api/pydvl/value/oob/index.html
@@ -3344,7 +3344,7 @@
2024-06-14
+ 2024-06-20
@@ -3354,7 +3354,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/oob/oob/index.html b/devel/api/pydvl/value/oob/oob/index.html
index 49c62b40f..cc51562c7 100644
--- a/devel/api/pydvl/value/oob/oob/index.html
+++ b/devel/api/pydvl/value/oob/oob/index.html
@@ -4142,7 +4142,7 @@
2024-06-14
+ 2024-06-20
@@ -4152,7 +4152,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/result/index.html b/devel/api/pydvl/value/result/index.html
index a5d2389ad..3d815ab2a 100644
--- a/devel/api/pydvl/value/result/index.html
+++ b/devel/api/pydvl/value/result/index.html
@@ -3991,8 +3991,8 @@
ValuationResult(
*,
- values: NDArray[float_],
- variances: Optional[NDArray[float_]] = None,
+ values: NDArray[float64],
+ variances: Optional[NDArray[float64]] = None,
counts: Optional[NDArray[int_]] = None,
indices: Optional[NDArray[IndexT]] = None,
data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,
@@ -4069,7 +4069,7 @@ Operating on r
TYPE:
- Optional[NDArray[float_]]
+ Optional[NDArray[float64]]
DEFAULT:
@@ -4304,8 +4304,8 @@ Operating on r
@@ -3956,7 +3956,7 @@ def __init__(
self,
*,
- values: NDArray[np.float_],
- variances: Optional[NDArray[np.float_]] = None,
+ values: NDArray[np.float64],
+ variances: Optional[NDArray[np.float64]] = None,
counts: Optional[NDArray[np.int_]] = None,
indices: Optional[NDArray[IndexT]] = None,
data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,
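A hedged construction sketch based only on the keyword arguments visible in this signature (other optional parameters of ValuationResult exist but are omitted; the numbers are illustrative):

    import numpy as np
    from pydvl.value.result import ValuationResult

    # Illustrative only: three data points with values, variances and update counts
    result = ValuationResult(
        values=np.array([0.1, 0.5, 0.4], dtype=np.float64),
        variances=np.array([0.01, 0.02, 0.01], dtype=np.float64),
        counts=np.array([10, 10, 10], dtype=np.int_),
        indices=np.arange(3),
        data_names=["a", "b", "c"],
    )
    print(result.values)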
@@ -4378,7 +4378,7 @@
-values: NDArray[float_]
+values: NDArray[float64]
-variances: NDArray[float_]
+variances: NDArray[float64]
-stderr: NDArray[float_]
+stderr: NDArray[float64]
2024-06-14
+ 2024-06-20
@@ -6274,7 +6274,7 @@
-
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/sampler/index.html b/devel/api/pydvl/value/sampler/index.html
index f979d7180..3f445e78b 100644
--- a/devel/api/pydvl/value/sampler/index.html
+++ b/devel/api/pydvl/value/sampler/index.html
@@ -5877,7 +5877,7 @@
-
- 2024-06-14
+ 2024-06-20
@@ -5887,7 +5887,7 @@
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/semivalues/index.html b/devel/api/pydvl/value/semivalues/index.html
index eac92d297..075363f71 100644
--- a/devel/api/pydvl/value/semivalues/index.html
+++ b/devel/api/pydvl/value/semivalues/index.html
@@ -7191,7 +7191,7 @@
2024-06-14
+ 2024-06-20
@@ -7201,7 +7201,7 @@
-
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/shapley/classwise/index.html b/devel/api/pydvl/value/shapley/classwise/index.html
index a05557ef5..19fd832ad 100644
--- a/devel/api/pydvl/value/shapley/classwise/index.html
+++ b/devel/api/pydvl/value/shapley/classwise/index.html
@@ -3776,7 +3776,7 @@
def estimate_in_class_and_out_of_class_score(
self,
model: SupervisedModel,
- x_test: NDArray[np.float_],
+ x_test: NDArray[np.float64],
y_test: NDArray[np.int_],
rescale_scores: bool = True,
) -> Tuple[float, float]:
@@ -4662,7 +4662,7 @@
- 2024-06-14
+ 2024-06-20
@@ -4672,7 +4672,7 @@
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/shapley/common/index.html b/devel/api/pydvl/value/shapley/common/index.html
index 22585a897..4381c5d04 100644
--- a/devel/api/pydvl/value/shapley/common/index.html
+++ b/devel/api/pydvl/value/shapley/common/index.html
@@ -3912,7 +3912,7 @@
- 2024-06-14
+ 2024-06-20
@@ -3922,7 +3922,7 @@
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/shapley/gt/index.html b/devel/api/pydvl/value/shapley/gt/index.html
index 54a24407f..12040cbc6 100644
--- a/devel/api/pydvl/value/shapley/gt/index.html
+++ b/devel/api/pydvl/value/shapley/gt/index.html
@@ -4114,7 +4114,7 @@
results_it: Iterable[Tuple[NDArray, NDArray]]
) -> Tuple[NDArray, NDArray]:
return np.concatenate(list(x[0] for x in results_it)).astype(
- np.float_
+ np.float64
), np.concatenate(list(x[1] for x in results_it)).astype(np.int_)
seed_sequence = ensure_seed_sequence(seed)
@@ -4206,7 +4206,7 @@
2024-06-14
+ 2024-06-20
-
@@ -4216,7 +4216,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/shapley/index.html b/devel/api/pydvl/value/shapley/index.html
index 56a4bfa98..99ac3ec6c 100644
--- a/devel/api/pydvl/value/shapley/index.html
+++ b/devel/api/pydvl/value/shapley/index.html
@@ -3352,7 +3352,7 @@
2024-06-14
+ 2024-06-20
@@ -3362,7 +3362,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/shapley/knn/index.html b/devel/api/pydvl/value/shapley/knn/index.html
index e3a2d3412..cbdd34103 100644
--- a/devel/api/pydvl/value/shapley/knn/index.html
+++ b/devel/api/pydvl/value/shapley/knn/index.html
@@ -3654,7 +3654,7 @@
# closest to farthest
_, indices = nns.kneighbors(u.data.x_test)
- values: NDArray[np.float_] = np.zeros_like(u.data.indices, dtype=np.float_)
+ values: NDArray[np.float64] = np.zeros_like(u.data.indices, dtype=np.float64)
n = len(u.data)
yt = u.data.y_train
iterator = enumerate(zip(u.data.y_test, indices), start=1)
@@ -3714,7 +3714,7 @@
2024-06-14
+ 2024-06-20
-
@@ -3724,7 +3724,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/shapley/montecarlo/index.html b/devel/api/pydvl/value/shapley/montecarlo/index.html
index 5cb8de52d..fed01f7c8 100644
--- a/devel/api/pydvl/value/shapley/montecarlo/index.html
+++ b/devel/api/pydvl/value/shapley/montecarlo/index.html
@@ -4332,7 +4332,7 @@
- 2024-06-14
+ 2024-06-20
@@ -4342,7 +4342,7 @@
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/shapley/naive/index.html b/devel/api/pydvl/value/shapley/naive/index.html
index 17bd679a7..635fa655e 100644
--- a/devel/api/pydvl/value/shapley/naive/index.html
+++ b/devel/api/pydvl/value/shapley/naive/index.html
@@ -3959,7 +3959,7 @@
- 2024-06-14
+ 2024-06-20
@@ -3969,7 +3969,7 @@
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/shapley/owen/index.html b/devel/api/pydvl/value/shapley/owen/index.html
index bbc242ed5..eb7a52dfe 100644
--- a/devel/api/pydvl/value/shapley/owen/index.html
+++ b/devel/api/pydvl/value/shapley/owen/index.html
@@ -4009,7 +4009,7 @@
2024-06-14
+ 2024-06-20
@@ -4019,7 +4019,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/shapley/truncated/index.html b/devel/api/pydvl/value/shapley/truncated/index.html
index 63f18986d..f20d5399f 100644
--- a/devel/api/pydvl/value/shapley/truncated/index.html
+++ b/devel/api/pydvl/value/shapley/truncated/index.html
@@ -4782,7 +4782,7 @@
- 2024-06-14
+ 2024-06-20
@@ -4792,7 +4792,7 @@
- 2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/shapley/types/index.html b/devel/api/pydvl/value/shapley/types/index.html
index 10a314e8f..c4467ec45 100644
--- a/devel/api/pydvl/value/shapley/types/index.html
+++ b/devel/api/pydvl/value/shapley/types/index.html
@@ -3454,7 +3454,7 @@
2024-06-14
+ 2024-06-20
@@ -3464,7 +3464,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/api/pydvl/value/stopping/index.html b/devel/api/pydvl/value/stopping/index.html
index 81fd27893..e53dea332 100644
--- a/devel/api/pydvl/value/stopping/index.html
+++ b/devel/api/pydvl/value/stopping/index.html
@@ -6022,7 +6022,7 @@
raise ValueError("rtol must be in (0, 1)")
self.rtol = rtol
self.burn_in = burn_in
- self._memory: NDArray[np.float_] | None = None
+ self._memory: NDArray[np.float64] | None = None
self._corr = 0.0
self._completion = 0.0
self._iterations = 0
@@ -6421,7 +6421,7 @@
2024-06-14
+ 2024-06-20
-
@@ -6431,7 +6431,7 @@
2024-06-14
+ 2024-06-20
diff --git a/devel/search/search_index.json b/devel/search/search_index.json
index 93b1271d3..e240ed274 100644
--- a/devel/search/search_index.json
+++ b/devel/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"The python library for data valuation","text":"
pyDVL collects algorithms for data valuation and influence function computation. For the full list see Methods. It supports out-of-core and distributed computation, as well as local or distributed caching of results.
If you're a first-time user of pyDVL, we recommend going through Getting started.
- Getting started: Steps to install and requirements
- Example gallery: Notebooks with worked-out examples of data valuation and influence functions
- Data valuation: Basics of data valuation and description of the main algorithms
- Influence Function: An introduction to the influence function and its computation with pyDVL
- Supported methods: List of all methods implemented with references.
- API Reference: Full documentation of the API
"},{"location":"CHANGELOG/","title":"Changelog","text":""},{"location":"CHANGELOG/#unreleased","title":"Unreleased","text":""},{"location":"CHANGELOG/#added","title":"Added","text":" - New method
InverseHarmonicMeanInfluence
, implementation for the paper DataInf: Efficiently Estimating Data Influence in LoRA-tuned LLMs and Diffusion Models
PR #582 - Add new backend implementations for influence computation to account for block-diagonal approximations PR #582
- Extend
DirectInfluence
with block-diagonal and Gauss-Newton approximation PR #591 - Extend
LissaInfluence
with block-diagonal and Gauss-Newton approximation PR #593 - Extend
NystroemSketchInfluence
with block-diagonal and Gauss-Newton approximation PR #596
"},{"location":"CHANGELOG/#changed","title":"Changed","text":" - Breaking Changes
- Rename parameter
hessian_regularization
of DirectInfluence
to regularization
and change the type annotation to allow for block-wise regularization parameters PR #591 - Rename parameter
hessian_regularization
of LissaInfluence
to regularization
and change the type annotation to allow for block-wise regularization parameters PR #593 - Remove parameter
h0
from init of LissaInfluence
PR #593 - Rename parameter
hessian_regularization
of NystroemSketchInfluence
to regularization
and change the type annotation to allow for block-wise regularization parameters PR #596
"},{"location":"CHANGELOG/#092-bug-fixes-logging-improvement","title":"0.9.2 - \ud83c\udfd7 Bug fixes, logging improvement","text":""},{"location":"CHANGELOG/#added_1","title":"Added","text":" - Add progress bars to the computation of
LazyChunkSequence
and NestedLazyChunkSequence
PR #567 - Add a device fixture for
pytest
, which depending on the availability and user input (pytest --with-cuda
) resolves to cuda device PR #574
"},{"location":"CHANGELOG/#fixed","title":"Fixed","text":" - Fixed logging issue in decorator
log_duration
PR #567 - Fixed missing move of tensors to model device in
EkfacInfluence
implementation PR #570 - Missing move to device of
preconditioner
in CgInfluence
implementation PR #572 - Raise a more specific error message, when a
RunTimeError
occurs in torch.linalg.eigh
, so the user can check if it is related to a known issue PR #578 - Fix an edge case (empty train data) in the test
test_classwise_scorer_accuracies_manual_derivation
, which resulted in undefined behavior (np.nan
to int
conversion with different results depending on OS) PR #579
"},{"location":"CHANGELOG/#changed_1","title":"Changed","text":" - Changed logging behavior of iterative methods
LissaInfluence
and CgInfluence
to warn on not achieving desired tolerance within maxiter
, add parameter warn_on_max_iteration
to set the level for this information to logging.DEBUG
PR #567
"},{"location":"CHANGELOG/#091-bug-fixes-logging-improvement","title":"0.9.1 - Bug fixes, logging improvement","text":""},{"location":"CHANGELOG/#fixed_1","title":"Fixed","text":" FutureWarning
for ParallelConfig
constantly raised without actually instantiating the object PR #562
"},{"location":"CHANGELOG/#090-new-methods-better-docs-and-bugfixes","title":"0.9.0 - \ud83c\udd95 New methods, better docs and bugfixes \ud83d\udcda\ud83d\udc1e","text":""},{"location":"CHANGELOG/#added_2","title":"Added","text":" - New method
MSR Banzhaf
with accompanying notebook, and new stopping criterion RankCorrelation
PR #520 - New method:
NystroemSketchInfluence
PR #504 - New preconditioned block variant of conjugate gradient PR #507
- Improvements to documentation: fixes, links, text, example gallery, LFS and more PR #532, PR #543
- Glossary of data valuation and influence terms in the documentation [PR #537](https://github.com/aai-institute/pyDVL/pull/537)
- Documentation about writing notes for new features, changes or deprecations PR #557
"},{"location":"CHANGELOG/#fixed_2","title":"Fixed","text":" - Bug in
LissaInfluence
, when not using CPU device PR #495 - Memory issue with
CgInfluence
and ArnoldiInfluence
PR #498 - Raising specific error message with install instruction, when trying to load
pydvl.utils.cache.memcached
without pymemcache
installed. If pymemcache
is available, all symbols from pydvl.utils.cache.memcached
are available through pydvl.utils.cache
PR #509
"},{"location":"CHANGELOG/#changed_2","title":"Changed","text":" - Add property
model_dtype
to instances of type TorchInfluenceFunctionModel
- Bump versions of CI actions to avoid warnings PR #502
- Add Python Version 3.11 to supported versions PR #510
- Documentation improvements and cleanup PR #521, PR #522
- Simplified parallel backend configuration PR #549
"},{"location":"CHANGELOG/#081-new-method-and-notebook-games-with-exact-shapley-values-bug-fixes-and-cleanup","title":"0.8.1 - \ud83c\udd95 \ud83c\udfd7 New method and notebook, Games with exact shapley values, bug fixes and cleanup","text":""},{"location":"CHANGELOG/#added_3","title":"Added","text":" - Implement new method:
EkfacInfluence
PR #451 - New notebook to showcase ekfac for LLMs PR #483
- Implemented exact games in Castro et al. 2009 and 2017 PR #341
"},{"location":"CHANGELOG/#fixed_3","title":"Fixed","text":" - Bug in using
DaskInfluenceCalcualator
with TorchnumpyConverter
for single dimensional arrays PR #485 - Fix implementations of
to
methods of TorchInfluenceFunctionModel
implementations PR #487 - Fixed bug with checking for converged values in semivalues PR #341
"},{"location":"CHANGELOG/#changed_3","title":"Changed","text":" - Add applications of data valuation section, display examples more prominently, make all sections visible in table of contents, use mkdocs material cards in the home page PR #492
"},{"location":"CHANGELOG/#080-new-interfaces-scaling-computation-bug-fixes-and-improvements","title":"0.8.0 - \ud83c\udd95 New interfaces, scaling computation, bug fixes and improvements \ud83c\udf81","text":""},{"location":"CHANGELOG/#added_4","title":"Added","text":" - New cache backends: InMemoryCacheBackend and DiskCacheBackend PR #458
- New influence function interface
InfluenceFunctionModel
- Data parallel computation with
DaskInfluenceCalculator
PR #26 - Sequential batch-wise computation and write to disk with
SequentialInfluenceCalculator
PR #377 - Adapt notebooks to new influence abstractions PR #430
"},{"location":"CHANGELOG/#changed_4","title":"Changed","text":" - Refactor and simplify caching implementation PR #458
- Simplify display of computation progress PR #466
- Improve readme and explain better the examples PR #465
- Simplify and improve tests, add CodeCov code coverage PR #429
- Breaking Changes
- Removed
compute_influences
and all related code. Replaced by new InfluenceFunctionModel
interface. Removed modules: - influence.general
- influence.inversion
- influence.twice_differentiable
- influence.torch.torch_differentiable
"},{"location":"CHANGELOG/#fixed_4","title":"Fixed","text":" - Import bug in README PR #457
"},{"location":"CHANGELOG/#071-new-methods-bug-fixes-and-improvements-for-local-tests","title":"0.7.1 - \ud83c\udd95 New methods, bug fixes and improvements for local tests \ud83d\udc1e\ud83e\uddea","text":""},{"location":"CHANGELOG/#added_5","title":"Added","text":" - New method: Class-wise Shapley values PR #338
- New method: Data-OOB by @BastienZim PR #426, PR #431
- Added
AntitheticPermutationSampler
PR #439 - Faster semi-value computation with per-index check of stopping criteria (optional) PR #437
"},{"location":"CHANGELOG/#fixed_5","title":"Fixed","text":" - Fix initialization of
data_names
in ValuationResult.zeros()
PR #443
"},{"location":"CHANGELOG/#changed_5","title":"Changed","text":" - No longer using docker within tests to start a memcached server PR #444
- Using pytest-xdist for faster local tests PR #440
- Improvements and fixes to notebooks PR #436
- Refactoring of parallel module. Old imports will stop working in v0.9.0 PR #421
"},{"location":"CHANGELOG/#070-documentation-and-if-overhaul-new-methods-and-bug-fixes","title":"0.7.0 - \ud83d\udcda\ud83c\udd95 Documentation and IF overhaul, new methods and bug fixes \ud83d\udca5\ud83d\udc1e","text":"This is our first \u03b2 release! We have worked hard to deliver improvements across the board, with a focus on documentation and usability. We have also reworked the internals of the influence
module, improved parallelism and handling of randomness.
"},{"location":"CHANGELOG/#added_6","title":"Added","text":" - Implemented solving the Hessian equation via spectral low-rank approximation PR #365
- Enabled parallel computation for Leave-One-Out values PR #406
- Added more abbreviations to documentation PR #415
- Added seed to functions from
pydvl.utils.numeric
, pydvl.value.shapley
and pydvl.value.semivalues
. Introduced new type Seed
and conversion function ensure_seed_sequence
. PR #396 - Added
batch_size
parameter to compute_banzhaf_semivalues
, compute_beta_shapley_semivalues
, compute_shapley_semivalues
and compute_generic_semivalues
. PR #428 - Added classwise Shapley as proposed by (Schoch et al. 2021) [https://arxiv.org/abs/2211.06800] PR #338
"},{"location":"CHANGELOG/#changed_6","title":"Changed","text":" - Replaced sphinx with mkdocs for documentation. Major overhaul of documentation PR #352
- Made ray an optional dependency, relying on joblib as default parallel backend PR #408
- Decoupled
ray.init
from ParallelConfig
PR #373 - Breaking Changes
- Signature change: return information about Hessian inversion from
compute_influence_factors
PR #375 - Major changes to IF interface and functionality. Foundation for a framework abstraction for IF computation. PR #278 PR #394
- Renamed
semivalues
to compute_generic_semivalues
PR #413 - New
joblib
backend as default instead of ray. Simplify MapReduceJob. PR #355 - Bump torch dependency for influence package to 2.0 PR #365
"},{"location":"CHANGELOG/#fixed_6","title":"Fixed","text":" - Fixes to parallel computation of generic semi-values: properly handle all samplers and stopping criteria, irrespective of parallel backend. PR #372
- Optimises memory usage in IF calculation PR #375
- Fix adding valuation results with overlapping indices and different lengths PR #370
- Fixed bugs in conjugate gradient and
linear_solve
PR #358 - Fix installation of dev requirements for Python3.10 PR #382
- Improvements to IF documentation PR #371
"},{"location":"CHANGELOG/#061-bug-fixes-and-small-improvements","title":"0.6.1 - \ud83c\udfd7 Bug fixes and small improvements","text":" - Fix parsing keyword arguments of
compute_semivalues
dispatch function PR #333 - Create new
RayExecutor
class based on the concurrent.futures API, use the new class to fix an issue with Truncated Monte Carlo Shapley (TMCS) starting too many processes and dying, plus other small changes PR #329 - Fix creation of GroupedDataset objects using the
from_arrays
and from_sklearn
class methods PR #324 - Fix release job not triggering on CI when a new tag is pushed PR #331
- Added alias
ApproShapley
from Castro et al. 2009 for permutation Shapley PR #332
"},{"location":"CHANGELOG/#060-new-algorithms-cleanup-and-bug-fixes","title":"0.6.0 - \ud83c\udd95 New algorithms, cleanup and bug fixes \ud83c\udfd7","text":" - Fixes in
ValuationResult
: bugs around data names, semantics of empty()
, new method zeros()
and normalised random values PR #327 - New method: Implements generalised semi-values for data valuation, including Data Banzhaf and Beta Shapley, with configurable sampling strategies PR #319
- Adds kwargs parameter to
from_array
and from_sklearn
Dataset and GroupedDataset class methods PR #316 - PEP-561 conformance: added
py.typed
PR #307 - Removed default non-negativity constraint on least core subsidy and added instead a
non_negative_subsidy
boolean flag. Renamed options
to solver_options
and pass it as dict. Change default least-core solver to SCS with 10000 max_iters. PR #304 - Cleanup: removed unnecessary decorator
@unpackable
PR #233 - Stopping criteria: fixed problem with
StandardError
and enable proper composition of index convergence statuses. Fixed a bug with n_jobs
in truncated_montecarlo_shapley
. PR #300 and PR #305 - Shuffling code around to allow for simpler user imports, some cleanup and documentation fixes. PR #284
- Bug fix: Warn instead of raising an error when
n_iterations
is less than the size of the dataset in Monte Carlo Least Core PR #281
"},{"location":"CHANGELOG/#050-fixes-nicer-interfaces-and-more-breaking-changes","title":"0.5.0 - \ud83d\udca5 Fixes, nicer interfaces and... more breaking changes \ud83d\ude12","text":" - Fixed parallel and antithetic Owen sampling for Shapley values. Simplified and extended tests. PR #267
- Added
Scorer
class for a cleaner interface. Fixed minor bugs around Group-Testing Shapley, added more tests and switched to cvxpy for the solver. PR #264 - Generalised stopping criteria for valuation algorithms. Improved classes
ValuationResult
and Status
with more operations. Some minor issues fixed. PR #252 - Fixed a bug whereby
compute_shapley_values
would only spawn one process when using n_jobs=-1
and Monte Carlo methods. PR #270 - Bugfix in
RayParallelBackend
: wrong semantics for kwargs
. PR #268 - Splitting of problem preparation and solution in Least-Core computation. Umbrella function for LC methods. PR #257
- Operations on
ValuationResult
and Status
and some cleanup PR #248 - Bug fix and minor improvements: Fixes bug in TMCS with remote Ray cluster, raises an error for dummy sequential parallel backend with TMCS, clones model inside
Utility
before fitting by default, with flag clone_before_fit
to disable it, catches all warnings in Utility
when show_warnings
is False
. Adds Miner and Gloves toy games utilities PR #247
"},{"location":"CHANGELOG/#040-new-algorithms-and-more-breaking-changes","title":"0.4.0 - \ud83c\udfed\ud83d\udca5 New algorithms and more breaking changes","text":" - GH action to mark issues as stale PR #201
- Disabled caching of Utility values as well as repeated evaluations by default PR #211
- Test and officially support Python version 3.9 and 3.10 PR #208
- Breaking change: Introduces a class ValuationResult to gather and inspect results from all valuation algorithms PR #214
- Fixes bug in Influence calculation with multidimensional input and adds new example notebook PR #195
- Breaking change: Passes the input to
MapReduceJob
at initialization, removes chunkify_inputs
argument from MapReduceJob
, removes n_runs
argument from MapReduceJob
, calls the parallel backend's put()
method for each generated chunk in _chunkify()
, renames ParallelConfig's num_workers
attribute to n_local_workers
, fixes a bug in MapReduceJob
's chunkification when n_runs
>= n_jobs
, and defines a sequential parallel backend to run all jobs in the current thread PR #232 - New method: Implements exact and monte carlo Least Core for data valuation, adds
from_arrays()
class method to the Dataset
and GroupedDataset
classes, adds extra_values
argument to ValuationResult
, adds compute_removal_score()
and compute_random_removal_score()
helper functions PR #237 - New method: Group Testing Shapley for valuation, from Jia et al. 2019 PR #240
- Fixes bug in ray initialization in
RayParallelBackend
class PR #239 - Implements \"Egalitarian Least Core\", adds cvxpy as a dependency and uses it instead of scipy as optimizer PR #243
"},{"location":"CHANGELOG/#030-breaking-changes","title":"0.3.0 - \ud83d\udca5 Breaking changes","text":" - Simplified and fixed powerset sampling and testing PR #181
- Simplified and fixed publishing to PyPI from CI PR #183
- Fixed bug in release script and updated contributing docs. PR #184
- Added Pull Request template PR #185
- Modified Pull Request template to automatically link PR to issue PR ##186
- First implementation of Owen Sampling, squashed scores, better testing PR #194
- Improved documentation on caching, Shapley, caveats of values, bibtex PR #194
- Breaking change: Rearranging of modules to accommodate for new methods PR #194
"},{"location":"CHANGELOG/#020-better-docs","title":"0.2.0 - \ud83d\udcda Better docs","text":"Mostly API documentation and notebooks, plus some bugfixes.
"},{"location":"CHANGELOG/#added_7","title":"Added","text":"In PR #161: - Support for $$ math in sphinx docs. - Usage of sphinx extension for external links (introducing new directives like :gh:
, :issue:
and :tfl:
to construct standardised links to external resources). - Only update auto-generated documentation files if there are changes. Some minor additions to update_docs.py
. - Parallelization of exact combinatorial Shapley. - Integrated KNN shapley into the main interface compute_shapley_values
.
"},{"location":"CHANGELOG/#changed_7","title":"Changed","text":"In PR #161: - Improved main docs and Shapley notebooks. Added or fixed many docstrings, readme and documentation for contributors. Typos, grammar and style in code, documentation and notebooks. - Internal renaming and rearranging in the parallelization and caching modules.
"},{"location":"CHANGELOG/#fixed_7","title":"Fixed","text":" - Bug in random matrix generation PR #161.
- Bugs in MapReduceJob's
_chunkify
and _backpressure
methods PR #176.
"},{"location":"CHANGELOG/#010-first-release","title":"0.1.0 - \ud83c\udf89 first release","text":"This is very first release of pyDVL.
It contains:
-
Data Valuation Methods:
-
Leave-One-Out
- Influence Functions
- Shapley:
- Exact Permutation and Combinatorial
- Montecarlo Permutation and Combinatorial
- Truncated Montecarlo Permutation
- Caching of results with Memcached
- Parallelization of computations with Ray
- Documentation
- Notebooks containing examples of different use cases
"},{"location":"CONTRIBUTING/","title":"Contributing to pyDVL","text":"The goal of pyDVL is to be a repository of successful algorithms for the valuation of data, in a broader sense. Contributions are welcome from anyone in the form of pull requests, bug reports and feature requests.
We will consider for inclusion any (tested) implementation of an algorithm appearing in a peer-reviewed journal (even if the method does not improve the state of the art, for benchmarking and comparison purposes). We are also open to improvements to the currently implemented methods and other ideas. Please open a ticket with yours.
If you are interested in setting up a similar project, consider the template pymetrius.
"},{"location":"CONTRIBUTING/#local-development","title":"Local development","text":"This project uses black to format code and pre-commit to invoke it as a git pre-commit hook. Consider installing any of black's IDE integrations to make your life easier.
Run the following to set up the pre-commit git hook to run before pushes:
pre-commit install --hook-type pre-push\n
Additionally, we use Git LFS for some files like images. Install with
git lfs install\n
"},{"location":"CONTRIBUTING/#setting-up-your-environment","title":"Setting up your environment","text":"We strongly suggest using some form of virtual environment for working with the library. E.g. with venv:
python -m venv ./venv\n. venv/bin/activate # `venv\\Scripts\\activate` in windows\npip install -r requirements-dev.txt -r requirements-docs.txt\n
With conda:
conda create -n pydvl python=3.8\nconda activate pydvl\npip install -r requirements-dev.txt -r requirements-docs.txt\n
A very convenient way of working with your library during development is to install it in editable mode into your environment by running
pip install -e .\n
In order to build the documentation locally (which is done as part of the tox suite) you need to install additional non-python dependencies as described in the documentation of mkdocs-material.
In addition, pandoc is required. Except for OSX, it should be installed automatically as a dependency with requirements-docs.txt
. Under OSX you can install pandoc (you'll need at least version 2.11) with:
brew install pandoc\n
Remember to mark all autogenerated directories as excluded in your IDE. In particular docs_build
and .tox
should be marked as excluded to avoid slowdowns when searching or refactoring code.
If you use remote execution, don't forget to exclude data paths from deployment (unless you really want to sync them).
"},{"location":"CONTRIBUTING/#testing","title":"Testing","text":"Automated builds, tests, generation of documentation and publishing are handled by CI pipelines. Before pushing your changes to the remote we recommend to execute tox
locally in order to detect mistakes early on and to avoid failing pipelines. tox will: * run the test suite * build the documentation * build and test installation of the package. * generate coverage and pylint reports in html, as well as badges.
You can configure pytest, coverage and pylint by adjusting pyproject.toml.
Besides the usual unit tests, most algorithms are tested using pytest. This requires ray for the parallelization and Memcached for caching. Please install both before running the tests. We run tests in CI as well.
It is possible to pass optional command line arguments to pytest, for example to run only certain tests using patterns (-k
) or marker (-m
).
tox -e tests -- <optional arguments>\n
There are a few important arguments:
--memcached-service
allows to change the default of localhost:11211
(memcached's default) to a different address.
Memcached is needed for testing caching as well as speeding certain methods (e.g. Permutation Shapley).
To start memcached locally in the background with Docker use:
docker run --name pydvl-memcache -p 11211:11211 -d memcached\n
-n
sets the number of parallel workers for pytest-xdist.
There are two layers of parallelization in the tests. An inner one within the tests themselves, i.e. the parallelism in the algorithms, and an outer one by pytest-xdist. The latter is controlled by the -n
argument. If you experience segmentation faults with the tests, try running them with -n 0
to disable parallelization.
-
--slow-tests
enables running slow tests. See below for a description of slow tests.
-
--with-cuda
sets the device fixture in tests/influence/torch/conftest.py to cuda
if it is available. Using this fixture within tests, you can run parts of your tests on a cuda
device. Be aware, that you still have to take care of the usage of the device manually in a specific test. Setting this flag does not result in running all tests on a GPU.
"},{"location":"CONTRIBUTING/#markers","title":"Markers","text":"We use a few different markers to differentiate between tests and runs groups of them of separately. Use pytest --markers
to get a list and description of all available markers.
Two important markers are:
pytest.mark.slow
which is used to mark slow tests and skip them by default.
A slow test is any test that takes 45 seconds or more to run and that can be skipped most of the time. In some cases a test is slow, but it is required in order to ensure that a feature works as expected and that are no bugs. In those cases, we should not use this marker.
Slow tests are always run on CI. Locally, they are skipped by default but can be additionally run using: pytest --slow-tests
.
pytest.mark.torch
which is used to mark tests that require PyTorch.
To test modules that rely on PyTorch, use:
tox -e tests -- -m \"torch\"\n
"},{"location":"CONTRIBUTING/#other-things","title":"Other Things","text":"To test the notebooks separately, run (see below for details):
tox -e notebook-tests\n
To create a package locally, run:
python setup.py sdist bdist_wheel\n
"},{"location":"CONTRIBUTING/#notebooks","title":"Notebooks","text":"We use notebooks both as documentation (copied over to docs/examples
) and as integration tests. All notebooks in the notebooks
directory are executed during the test run. Because run times are typically too long for large datasets, you must check for the CI
environment variable to work with smaller ones. For example, you can select a subset of the data:
# In CI we only use a subset of the training set\nif os.environ.get('CI'):\n training_data = training_data[:10]\n
This switching should happen in a separate notebook cell tagged with hide
to hide the cell's input and output when rendering it as part of the documents. We want to avoid as much clutter and boilerplate as possible in the notebooks themselves.
Because we want documentation to include the full dataset, we commit notebooks with their outputs running with full datasets to the repo. The notebooks are then added by CI to the section Examples of the documentation.
"},{"location":"CONTRIBUTING/#hiding-cells-in-notebooks","title":"Hiding cells in notebooks","text":"Switching between CI or not, importing generic modules and plotting results are all examples of boilerplate code irrelevant to a reader interested in pyDVL's functionality. For this reason we choose to isolate this code into separate cells which are then hidden in the documentation.
In order to do this, cells are marked with tags understood by the mkdocs plugin mkdocs-jupyter
, namely adding the following to the metadata of the relevant cells:
\"tags\": [\n \"hide\"\n]\n
To hide the cell's input and output.
Or:
\"tags\": [\n \"hide-input\"\n]\n
To only hide the input and
\"tags\": [\n \"hide-output\"\n]\n
for hiding the output only. It is important to leave a warning at the top of the document to avoid confusion. Examples for hidden imports and plots are available in the notebooks, e.g. in notebooks/shapley_basic_spotify.ipynb.
"},{"location":"CONTRIBUTING/#plots-in-notebooks","title":"Plots in Notebooks","text":"If you add a plot to a notebook, which should also render nicely in browser dark mode, add the tag invertible-output, i.e.
\"tags\": [\n \"invertible-output\"\n]\n
This applies a simple CSS-filter to the output image of the cell."},{"location":"CONTRIBUTING/#documentation","title":"Documentation","text":"API documentation and examples from notebooks are built with mkdocs, using a number of plugins, including mkdoctrings, with versioning handled by mike.
Notebooks are an integral part of the documentation as well, please read the section on notebooks above.
If you want to build the documentation locally, please make sure you followed the instructions in the section Setting up your environment.
Use the following command to build the documentation the same way it is done in CI:
mkdocs build\n
Locally, you can use this command instead to continuously rebuild documentation on changes to the docs
and src
folder:
mkdocs serve\n
This will rebuild the documentation on changes to .md
files inside docs
, notebooks and python files.
On OSX, it is possible that the cairo lib file is not properly linked when installed via homebrew. In this case you might encounter an error like this
OSError: no library called \"cairo-2\" was found\nno library called \"cairo\" was found\nno library called \"libcairo-2\" was found\n
when calling mkdocs build
or mkdocs serve
. This can be resolved via setting the environment variable DYLD_FALLBACK_LIBRARY_PATH
: export DYLD_FALLBACK_LIBRARY_PATH=$DYLD_FALLBACK_LIBRARY_PATH:/opt/homebrew/lib\n
"},{"location":"CONTRIBUTING/#adding-new-pages","title":"Adding new pages","text":"Navigation is configured in mkdocs.yaml
using the nav section. We use the plugin mkdoc-literate-nav which allows fine-grained control of the navigation structure. However, most pages are explicitly listed and manually arranged in the nav
section of the configuration.
"},{"location":"CONTRIBUTING/#creating-stable-references-for-autorefs","title":"Creating stable references for autorefs","text":"mkdocstrings includes the plugin autorefs to enable automatic linking across pages with e.g. [a link][to-something]
. Anchors are autogenerated from section titles, and are not guaranteed to be unique. In order to ensure that a link will remain valid, add a custom anchor to the section title:
## Some section { #permanent-anchor-to-some-section }\n
(note the space after the opening brace). You can then refer to it within another markdown file with [Some section][permanent-anchor-to-some-section]
.
"},{"location":"CONTRIBUTING/#adding-notes-about-new-features-changes-or-deprecations","title":"Adding notes about new features, changes or deprecations","text":"We use the admonition extension of Mkdocs Material to create admonitions, also known as call-outs, that hold information about when a certain feature was added, changed or deprecated and optionally a description with more details. We put the admonition directly in a module's, a function's or class' docstring.
We use the following syntax:
!!! tip \"<Event Type> in version <Version Number>\"\n\n <Optional Description>\n
The description is useful when the note is about a smaller change such as a parameter.
- For a new feature, we use:
!!! tip \"New in version <Version Number>\"\n\n <Optional Description>\n
- For a change to an existing feature we use:
!!! tip \"Changed in version <Version Number>\"\n\n <Optional Description>\n
For example, for a change in version 1.2.3
that adds kwargs to a class' constructor we would write:
!!! tip \"Changed in version 1.2.3\"\n\n Added kwargs to the constructor.\n
- For a deprecation we use:
!!! tip \"Deprecated in version <Version Number>\"\n\n <Optional Description>\n
"},{"location":"CONTRIBUTING/#using-bibliography","title":"Using bibliography","text":"Bibliographic citations are managed with the plugin mkdocs-bibtex. To enter a citation first add the entry to docs/pydvl.bib
. For team contributor this should be an export of the Zotero folder software/pydvl
in the TransferLab Zotero library. All other contributors just add the bibtex data, and a maintainer will add it to the group library upon merging.
To add a citation inside a markdown file, use the notation [@citekey]
. Alas, because of when mkdocs-bibtex enters the pipeline, it won't process docstrings. For module documentation, we manually inject html into the markdown files. For example, in pydvl.value.shapley.montecarlo
we have:
\"\"\"\nModule docstring...\n\n## References\n\n[^1]: <a name=\"ghorbani_data_2019\"></a>Ghorbani, A., Zou, J., 2019.\n [Data Shapley: Equitable Valuation of Data for Machine\n Learning](https://proceedings.mlr.press/v97/ghorbani19c.html).\n In: Proceedings of the 36th International Conference on Machine Learning,\n PMLR, pp. 2242\u20132251.\n\"\"\"\n
and then later in the file, inside a function's docstring:
This function implements (Ghorbani and Zou, 2019)<sup><a \n href=\"#ghorbani_data_2019\">1</a></sup>\n
"},{"location":"CONTRIBUTING/#writing-mathematics","title":"Writing mathematics","text":"Use LaTeX delimiters $
and $$
for inline and displayed mathematics respectively.
Warning: backslashes must be escaped in docstrings! (although there are exceptions). For simplicity, declare the string as \"raw\" with the prefix r
:
# This will work\ndef f(x: float) -> float:\n r\"\"\" Computes \n $${ f(x) = \\frac{1}{x^2} }$$\n \"\"\"\n return 1/(x*x)\n\n# This throws an obscure error\ndef f(x: float) -> float:\n \"\"\" Computes \n $$\\frac{1}{x^2}$$\n \"\"\"\n return 1/(x*x)\n
Note how there is no space after the dollar signs. This is important! You can use braces for legibility like in the first example.
"},{"location":"CONTRIBUTING/#abbreviations","title":"Abbreviations","text":"We keep the abbreviations used in the documentation inside the docs_include/abbreviations.md file.
The syntax for abbreviations is:
*[ABBR]: Abbreviation\n
"},{"location":"CONTRIBUTING/#ci","title":"CI","text":"We use workflows to:
- Run the tests.
- Publish documentation.
- Publish packages to testpypi / pypi.
- Mark issues as stale after 30 days. We do this only for issues with the label
awaiting-reply
which indicates that we have answered a question / feature request / PR and are waiting for the OP to reply / update his work.
"},{"location":"CONTRIBUTING/#tests","title":"Tests","text":"We test all algorithms with simple datasets in CI jobs. This can amount to a sizeable amount of time, so care must be taken not to overdo it: 1. All algorithm tests must be on very simple datasets and as quick as possible 2. We try not to trigger CI pipelines when unnecessary (see Skipping CI runs). 3. We split the tests based on their duration into groups and run them in parallel.
For that we use pytest-split to first store the duration of all tests with tox -e tests -- --store-durations --slow-tests
in a .test_durations
file.
Alternatively, we can use pytest directly: pytest --store-durations --slow-tests
.
Note This does not have to be done each time a new test or test case is added. For new tests and test cases, pytest-split assumes the average test execution time (calculated from the stored information) for every test that does not have duration information stored. Thus, there's no need to store durations after every change to the test suite. However, when there are major changes in the suite compared to what's stored in .test_durations, it's recommended to update the duration information with --store-durations
to ensure that the splitting is in balance.
Then we can have as many splits as we want:
tox -e tests -- --splits 3 --group 1\ntox -e tests -- --splits 3 --group 2\ntox -e tests -- --splits 3 --group 3\n
Alternatively, we can use pytest directly: pytest --splits 3 --group 1
.
Each one of these commands should be run in a separate shell/job to run the test groups in parallel and decrease the total runtime.
"},{"location":"CONTRIBUTING/#running-github-actions-locally","title":"Running Github Actions locally","text":"To run Github Actions locally we use act. It uses the workflows defined in .github/workflows
and determines the set of actions that need to be run. It uses the Docker API to either pull or build the necessary images, as defined in our workflow files and finally determines the execution path based on the dependencies that were defined.
Once it has the execution path, it then uses the Docker API to run containers for each action based on the images prepared earlier. The environment variables and filesystem are all configured to match what GitHub provides.
You can install it manually using:
curl -s https://raw.githubusercontent.com/nektos/act/master/install.sh | sudo bash -s -- -d -b ~/bin \n
And then simply add it to your PATH variable: PATH=~/bin:$PATH
Refer to its official readme for more installation options.
"},{"location":"CONTRIBUTING/#act-cheatsheet","title":"act cheatsheet","text":"By default, act
will run all workflows in .github/workflows
. You can use the -W
flag to specify a specific workflow file to run, or you can rely on the job id to be unique (but then you'll see warnings for the workflows without that job id).
# Run only the main tests for python 3.8 after a push event (implicit) \nact -W .github/workflows/run-tests-workflow.yaml \\\n -j run-tests \\\n --input tests_to_run=base\\\n --input python_version=3.8\n
Other common flags are:
# List all actions for all events:\nact -l\n\n# List the actions for a specific event:\nact workflow_dispatch -l\n\n# List the actions for a specific job:\nact -j lint -l\n\n# Run the default (`push`) event:\nact\n\n# Run a specific event:\nact pull_request\n\n# Run a specific job:\nact -j lint\n\n# Collect artifacts to the /tmp/artifacts folder:\nact --artifact-server-path /tmp/artifacts\n\n# Run a job in a specific workflow (useful if you have duplicate job names)\nact -j lint -W .github/workflows/tox.yml\n\n# Run in dry-run mode:\nact -n\n\n# Enable verbose-logging (can be used with any of the above commands)\nact -v\n
"},{"location":"CONTRIBUTING/#example","title":"Example","text":"To run the publish
job (the most difficult one to test) you would simply use:
- When triggered by a release:
act release -j publish --eventpath events.json\n
With events.json
containing:
{\n \"act\": true\n}\n
This will use your current branch. If you want to test a specific branch you have to use the workflow_dispatch
event (see below).
- To instead run it as if it had been manually triggered (i.e.
workflow_dispatch
) you would use:
act workflow_dispatch -j publish --eventpath events.json\n
With events.json
containing:
{\n \"act\": true,\n \"inputs\": {\n \"tag_name\": \"v0.6.0\"\n }\n}\n
"},{"location":"CONTRIBUTING/#skipping-ci-runs","title":"Skipping CI runs","text":"One sometimes would like to skip CI for certain commits (e.g. updating the readme). In order to do this, simply prefix the commit message with [skip ci]
. The string can be anywhere, but adding it to the beginning of the commit message makes it more evident when looking at commits in a PR.
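For example, a documentation-only change could be committed with (hypothetical message):
git commit -m \"[skip ci] Fix typo in README\"\n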
Refer to the official GitHub documentation for more information.
"},{"location":"CONTRIBUTING/#release-processes","title":"Release processes","text":""},{"location":"CONTRIBUTING/#automatic-release-process","title":"Automatic release process","text":"In order to create an automatic release, a few prerequisites need to be satisfied:
- The project's virtualenv needs to be active
- The repository needs to be on the
develop
branch - The repository must be clean (including no untracked files)
Then, a new release can be created using the script build_scripts/release-version.sh
(leave out the version parameter to have bumpversion
automatically derive the next release version by bumping the patch part):
build_scripts/release-version.sh 0.1.6\n
To find out how to use the script, pass the -h
or --help
flags:
build_scripts/release-version.sh --help\n
If running in interactive mode (without -y|--yes
), the script will output a summary of pending changes and ask for confirmation before executing the actions.
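As a sketch (the exact argument order may differ; consult --help), a fully non-interactive run for a hypothetical version would look like:
build_scripts/release-version.sh --yes 0.1.6\n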
Once this is done, a tag will be created on the repository. You should then create a GitHub release for that tag. That will trigger a CI pipeline that automatically creates a package and publishes it from CI to PyPI.
"},{"location":"CONTRIBUTING/#manual-release-process","title":"Manual release process","text":"If the automatic release process doesn't cover your use case, you can also create a new release manually by following these steps:
- (Repeat as needed) implement features on feature branches merged into
develop
. Each merge into develop will publish a new pre-release version to TestPyPI. These versions can be installed using pip install --pre --index-url https://test.pypi.org/simple/
. - When ready to release: From the develop branch create the release branch and perform release activities (update changelog, news, ...). For your own convenience, define an env variable for the release version
export RELEASE_VERSION=\"vX.Y.Z\"\ngit checkout develop\ngit branch release/${RELEASE_VERSION} && git checkout release/${RELEASE_VERSION}\n
- Run
bumpversion --commit release
if the release is only a patch release, otherwise the full version can be specified using bumpversion --commit --new-version X.Y.Z release
(the release
part is ignored but required by bumpversion ). - Merge the release branch into
master
, tag the merge commit, and push back to the repo. The CI pipeline publishes the package based on the tagged commit. git checkout master\ngit merge --no-ff release/${RELEASE_VERSION}\ngit tag -a ${RELEASE_VERSION} -m\"Release ${RELEASE_VERSION}\"\ngit push --follow-tags origin master\n
- Switch back to the release branch
release/vX.Y.Z
and pre-bump the version: bumpversion --commit patch
. This ensures that develop
pre-releases are always strictly more recent than the last published release version from master
. - Merge the release branch into
develop
: git checkout develop\ngit merge --no-ff release/${RELEASE_VERSION}\ngit push origin develop\n
- Delete the release branch if necessary:
git branch -d release/${RELEASE_VERSION}
- Create a GitHub release for the created tag.
- Pour yourself a cup of coffee, you earned it!
- A package will be automatically created and published from CI to PyPI.
"},{"location":"CONTRIBUTING/#ci-and-requirements-for-publishing","title":"CI and requirements for publishing","text":"In order to publish new versions of the package from the development branch, the CI pipeline requires the following secret variables set up:
TEST_PYPI_USERNAME\nTEST_PYPI_PASSWORD\nPYPI_USERNAME\nPYPI_PASSWORD\n
The first 2 are used after tests run on the develop branch's CI workflow to automatically publish packages to TestPyPI.
The last 2 are used in the publish.yaml CI workflow to publish packages to PyPI from develop
after a GitHub release.
"},{"location":"CONTRIBUTING/#publish-to-testpypi","title":"Publish to TestPyPI","text":"We use bump2version to bump the build part of the version number without commiting or tagging the change and then publish a package to TestPyPI from CI using Twine. The version has the GitHub run number appended.
For more details refer to the files .github/workflows/publish.yaml and .github/workflows/tox.yaml.
"},{"location":"api/pydvl/","title":"API Reference","text":""},{"location":"api/pydvl/#pydvl","title":"pydvl","text":""},{"location":"api/pydvl/#pydvl--the-python-data-valuation-library-api","title":"The Python Data Valuation Library API","text":"This is the API documentation for the Python Data Valuation Library (PyDVL). Use the table of contents to access the documentation for each module.
The two main modules you will want to look at are value and influence.
"},{"location":"api/pydvl/influence/","title":"Influence","text":""},{"location":"api/pydvl/influence/#pydvl.influence","title":"pydvl.influence","text":"This package contains algorithms for the computation of the influence function.
See The Influence function for an introduction to the concepts and methods implemented here.
Warning
Much of the code in this package is experimental or untested and is subject to modification. In particular, the package structure and basic API will probably change.
"},{"location":"api/pydvl/influence/array/","title":"Array","text":""},{"location":"api/pydvl/influence/array/#pydvl.influence.array","title":"pydvl.influence.array","text":"This module provides classes and utilities for handling large arrays that are chunked and lazily evaluated. It includes abstract base classes for converting between tensor types and NumPy arrays, aggregating blocks of data, and abstract representations of lazy arrays. Concrete implementations are provided for handling chunked lazy arrays (chunked in one resp. two dimensions), with support for efficient storage and retrieval using the Zarr library.
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NumpyConverter","title":"NumpyConverter","text":" Bases: Generic[TensorType]
, ABC
Base class for converting TensorType objects into numpy arrays and vice versa.
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NumpyConverter.to_numpy","title":"to_numpy abstractmethod
","text":"to_numpy(x: TensorType) -> NDArray\n
Override this method for converting a TensorType object into a numpy array
Source code in src/pydvl/influence/array.py
@abstractmethod\ndef to_numpy(self, x: TensorType) -> NDArray:\n \"\"\"Override this method for converting a TensorType object into a numpy array\"\"\"\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NumpyConverter.from_numpy","title":"from_numpy abstractmethod
","text":"from_numpy(x: NDArray) -> TensorType\n
Override this method for converting a numpy array into a TensorType object
Source code in src/pydvl/influence/array.py
@abstractmethod\ndef from_numpy(self, x: NDArray) -> TensorType:\n \"\"\"Override this method for converting a numpy array into a TensorType object\"\"\"\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.SequenceAggregator","title":"SequenceAggregator","text":" Bases: Generic[TensorType]
, ABC
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.SequenceAggregator.__call__","title":"__call__ abstractmethod
","text":"__call__(tensor_sequence: LazyChunkSequence)\n
Aggregates tensors from a sequence.
Implement this method to define how a sequence of tensors, provided by a generator, should be combined.
Source code in src/pydvl/influence/array.py
@abstractmethod\ndef __call__(\n self,\n tensor_sequence: LazyChunkSequence,\n):\n \"\"\"\n Aggregates tensors from a sequence.\n\n Implement this method to define how a sequence of tensors, provided by a\n generator, should be combined.\n \"\"\"\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.ListAggregator","title":"ListAggregator","text":" Bases: SequenceAggregator
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.ListAggregator.__call__","title":"__call__","text":"__call__(tensor_sequence: LazyChunkSequence) -> List[TensorType]\n
Aggregates tensors from a single-level generator into a list. This method simply collects each tensor emitted by the generator into a single list.
PARAMETER DESCRIPTION tensor_sequence
Object wrapping a generator that yields TensorType
objects.
TYPE: LazyChunkSequence
RETURNS DESCRIPTION List[TensorType]
A list containing all the tensors provided by the tensor_generator.
Source code in src/pydvl/influence/array.py
def __call__(\n self,\n tensor_sequence: LazyChunkSequence,\n) -> List[TensorType]:\n \"\"\"\n Aggregates tensors from a single-level generator into a list. This method simply\n collects each tensor emitted by the generator into a single list.\n\n Args:\n tensor_sequence: Object wrapping a generator that yields `TensorType`\n objects.\n\n Returns:\n A list containing all the tensors provided by the tensor_generator.\n \"\"\"\n\n gen = cast(Iterator[TensorType], tensor_sequence.generator_factory())\n\n if tensor_sequence.len_generator is not None:\n gen = cast(\n Iterator[TensorType],\n tqdm(gen, total=tensor_sequence.len_generator, desc=\"Blocks\"),\n )\n\n return [t for t in gen]\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NestedSequenceAggregator","title":"NestedSequenceAggregator","text":" Bases: Generic[TensorType]
, ABC
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NestedSequenceAggregator.__call__","title":"__call__ abstractmethod
","text":"__call__(nested_sequence_of_tensors: NestedLazyChunkSequence)\n
Aggregates tensors from a nested sequence of tensors.
Implement this method to specify how tensors, nested in two layers of generators, should be combined. Useful for complex data structures where tensors are not directly accessible in a flat list.
Source code in src/pydvl/influence/array.py
@abstractmethod\ndef __call__(self, nested_sequence_of_tensors: NestedLazyChunkSequence):\n \"\"\"\n Aggregates tensors from a nested sequence of tensors.\n\n Implement this method to specify how tensors, nested in two layers of\n generators, should be combined. Useful for complex data structures where tensors\n are not directly accessible in a flat list.\n \"\"\"\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NestedListAggregator","title":"NestedListAggregator","text":" Bases: NestedSequenceAggregator
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NestedListAggregator.__call__","title":"__call__","text":"__call__(\n nested_sequence_of_tensors: NestedLazyChunkSequence,\n) -> List[List[TensorType]]\n
Aggregates tensors from a nested generator structure into a list of lists. Each inner generator is converted into a list of tensors, resulting in a nested list structure.
Args: nested_sequence_of_tensors: Object wrapping a generator of generators, where each inner generator yields TensorType objects.
RETURNS DESCRIPTION List[List[TensorType]]
A list of lists, where each inner list contains tensors returned from one of the inner generators.
Source code in src/pydvl/influence/array.py
def __call__(\n self,\n nested_sequence_of_tensors: NestedLazyChunkSequence,\n) -> List[List[TensorType]]:\n \"\"\"\n Aggregates tensors from a nested generator structure into a list of lists.\n Each inner generator is converted into a list of tensors, resulting in a nested\n list structure.\n\n Args:\n nested_sequence_of_tensors: Object wrapping a generator of generators,\n where each inner generator yields TensorType objects.\n\n Returns:\n A list of lists, where each inner list contains tensors returned from one\n of the inner generators.\n \"\"\"\n outer_gen = cast(\n Iterator[Iterator[TensorType]],\n nested_sequence_of_tensors.generator_factory(),\n )\n len_outer_gen = nested_sequence_of_tensors.len_outer_generator\n if len_outer_gen is not None:\n outer_gen = cast(\n Iterator[Iterator[TensorType]],\n tqdm(outer_gen, total=len_outer_gen, desc=\"Row blocks\"),\n )\n\n return [list(tensor_gen) for tensor_gen in outer_gen]\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.LazyChunkSequence","title":"LazyChunkSequence","text":"LazyChunkSequence(\n generator_factory: Callable[[], Generator[TensorType, None, None]],\n len_generator: Optional[int] = None,\n)\n
Bases: Generic[TensorType]
A class representing a chunked, and lazily evaluated array, where the chunking is restricted to the first dimension
This class is designed to handle large arrays that don't fit in memory. It works by generating chunks of the array on demand and can also convert these chunks to a Zarr array for efficient storage and retrieval.
ATTRIBUTE DESCRIPTION generator_factory
A factory function that returns a generator. This generator yields chunks of the large array when called.
len_generator
if the number of elements from the generator is known from the context, this optional parameter can be used to improve logging by adding a progressbar.
Source code in src/pydvl/influence/array.py
def __init__(\n self,\n generator_factory: Callable[[], Generator[TensorType, None, None]],\n len_generator: Optional[int] = None,\n):\n self.generator_factory = generator_factory\n self.len_generator = len_generator\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.LazyChunkSequence.compute","title":"compute","text":"compute(aggregator: Optional[SequenceAggregator] = None)\n
Computes and optionally aggregates the chunks of the array using the provided aggregator. This method initiates the generation of chunks and then combines them according to the aggregator's logic.
PARAMETER DESCRIPTION aggregator
An optional aggregator for combining the chunks of the array. If None, a default ListAggregator is used to simply collect the chunks into a list.
TYPE: Optional[SequenceAggregator]
DEFAULT: None
RETURNS DESCRIPTION The aggregated result of all chunks of the array, the format of which depends on the aggregator used.
Source code in src/pydvl/influence/array.py
@log_duration(log_level=logging.INFO)\ndef compute(self, aggregator: Optional[SequenceAggregator] = None):\n \"\"\"\n Computes and optionally aggregates the chunks of the array using the provided\n aggregator. This method initiates the generation of chunks and then\n combines them according to the aggregator's logic.\n\n Args:\n aggregator: An optional aggregator for combining the chunks of\n the array. If None, a default ListAggregator is used to simply collect\n the chunks into a list.\n\n Returns:\n The aggregated result of all chunks of the array, the format of which\n depends on the aggregator used.\n\n \"\"\"\n if aggregator is None:\n aggregator = ListAggregator()\n return aggregator(self)\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.LazyChunkSequence.to_zarr","title":"to_zarr","text":"to_zarr(\n path_or_url: Union[str, StoreLike],\n converter: NumpyConverter,\n return_stored: bool = False,\n overwrite: bool = False,\n) -> Optional[Array]\n
Converts the array into Zarr format, a storage format optimized for large arrays, and stores it at the specified path or URL. This method is suitable for scenarios where the data needs to be saved for later use or for large datasets requiring efficient storage.
PARAMETER DESCRIPTION path_or_url
The file path or URL where the Zarr array will be stored. Also excepts instances of zarr stores.
TYPE: Union[str, StoreLike]
converter
A converter for transforming blocks into NumPy arrays compatible with Zarr.
TYPE: NumpyConverter
return_stored
If True, the method returns the stored Zarr array; otherwise, it returns None.
TYPE: bool
DEFAULT: False
overwrite
If True, overwrites existing data at the given path_or_url. If False, an error is raised in case of existing data.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION Optional[Array]
The Zarr array if return_stored is True; otherwise, None.
Source code in src/pydvl/influence/array.py
@log_duration(log_level=logging.INFO)\ndef to_zarr(\n self,\n path_or_url: Union[str, StoreLike],\n converter: NumpyConverter,\n return_stored: bool = False,\n overwrite: bool = False,\n) -> Optional[zarr.Array]:\n \"\"\"\n Converts the array into Zarr format, a storage format optimized for large\n arrays, and stores it at the specified path or URL. This method is suitable for\n scenarios where the data needs to be saved for later use or for large datasets\n requiring efficient storage.\n\n Args:\n path_or_url: The file path or URL where the Zarr array will be stored.\n Also excepts instances of zarr stores.\n converter: A converter for transforming blocks into NumPy arrays\n compatible with Zarr.\n return_stored: If True, the method returns the stored Zarr array; otherwise,\n it returns None.\n overwrite: If True, overwrites existing data at the given path_or_url.\n If False, an error is raised in case of existing data.\n\n Returns:\n The Zarr array if return_stored is True; otherwise, None.\n \"\"\"\n row_idx = 0\n z = None\n\n gen = cast(Iterator[TensorType], self.generator_factory())\n\n if self.len_generator is not None:\n gen = cast(\n Iterator[TensorType], tqdm(gen, total=self.len_generator, desc=\"Blocks\")\n )\n\n for block in gen:\n numpy_block = converter.to_numpy(block)\n\n if z is None:\n z = self._initialize_zarr_array(numpy_block, path_or_url, overwrite)\n\n new_shape = self._new_shape_according_to_block(numpy_block, row_idx)\n z.resize(new_shape)\n\n z[row_idx : row_idx + numpy_block.shape[0]] = numpy_block\n row_idx += numpy_block.shape[0]\n\n return z if return_stored else None\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NestedLazyChunkSequence","title":"NestedLazyChunkSequence","text":"NestedLazyChunkSequence(\n generator_factory: Callable[\n [], Generator[Generator[TensorType, None, None], None, None]\n ],\n len_outer_generator: Optional[int] = None,\n)\n
Bases: Generic[TensorType]
A class representing chunked, and lazily evaluated array, where the chunking is restricted to the first two dimensions.
This class is designed for handling large arrays where individual chunks are loaded and processed lazily. It supports converting these chunks into a Zarr array for efficient storage and retrieval, with chunking applied along the first two dimensions.
ATTRIBUTE DESCRIPTION generator_factory
A factory function that returns a generator of generators. Each inner generator yields chunks
len_outer_generator
if the number of elements from the outer generator is known from the context, this optional parameter can be used to improve logging by adding a progressbar.
Source code in src/pydvl/influence/array.py
def __init__(\n self,\n generator_factory: Callable[\n [], Generator[Generator[TensorType, None, None], None, None]\n ],\n len_outer_generator: Optional[int] = None,\n):\n self.generator_factory = generator_factory\n self.len_outer_generator = len_outer_generator\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NestedLazyChunkSequence.compute","title":"compute","text":"compute(aggregator: Optional[NestedSequenceAggregator] = None)\n
Computes and optionally aggregates the chunks of the array using the provided aggregator. This method initiates the generation of chunks and then combines them according to the aggregator's logic.
PARAMETER DESCRIPTION aggregator
An optional aggregator for combining the chunks of the array. If None, a default NestedListAggregator is used to simply collect the chunks into a list of lists.
TYPE: Optional[NestedSequenceAggregator]
DEFAULT: None
RETURNS DESCRIPTION The aggregated result of all chunks of the array, the format of which
depends on the aggregator used.
Source code in src/pydvl/influence/array.py
@log_duration(log_level=logging.INFO)\ndef compute(self, aggregator: Optional[NestedSequenceAggregator] = None):\n \"\"\"\n Computes and optionally aggregates the chunks of the array using the provided\n aggregator. This method initiates the generation of chunks and then\n combines them according to the aggregator's logic.\n\n Args:\n aggregator: An optional aggregator for combining the chunks of\n the array. If None, a default\n [NestedListAggregator][pydvl.influence.array.NestedListAggregator]\n is used to simply collect the chunks into a list of lists.\n\n Returns:\n The aggregated result of all chunks of the array, the format of which\n depends on the aggregator used.\n\n \"\"\"\n if aggregator is None:\n aggregator = NestedListAggregator()\n return aggregator(self)\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NestedLazyChunkSequence.to_zarr","title":"to_zarr","text":"to_zarr(\n path_or_url: Union[str, StoreLike],\n converter: NumpyConverter,\n return_stored: bool = False,\n overwrite: bool = False,\n) -> Optional[Array]\n
Converts the array into Zarr format, a storage format optimized for large arrays, and stores it at the specified path or URL. This method is suitable for scenarios where the data needs to be saved for later use or for large datasets requiring efficient storage.
PARAMETER DESCRIPTION path_or_url
The file path or URL where the Zarr array will be stored. Also excepts instances of zarr stores.
TYPE: Union[str, StoreLike]
converter
A converter for transforming blocks into NumPy arrays compatible with Zarr.
TYPE: NumpyConverter
return_stored
If True, the method returns the stored Zarr array; otherwise, it returns None.
TYPE: bool
DEFAULT: False
overwrite
If True, overwrites existing data at the given path_or_url. If False, an error is raised in case of existing data.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION Optional[Array]
The Zarr array if return_stored is True; otherwise, None.
Source code in src/pydvl/influence/array.py
@log_duration(log_level=logging.INFO)\ndef to_zarr(\n self,\n path_or_url: Union[str, StoreLike],\n converter: NumpyConverter,\n return_stored: bool = False,\n overwrite: bool = False,\n) -> Optional[zarr.Array]:\n \"\"\"\n Converts the array into Zarr format, a storage format optimized for large\n arrays, and stores it at the specified path or URL. This method is suitable for\n scenarios where the data needs to be saved for later use or for large datasets\n requiring efficient storage.\n\n Args:\n path_or_url: The file path or URL where the Zarr array will be stored.\n Also excepts instances of zarr stores.\n converter: A converter for transforming blocks into NumPy arrays\n compatible with Zarr.\n return_stored: If True, the method returns the stored Zarr array;\n otherwise, it returns None.\n overwrite: If True, overwrites existing data at the given path_or_url.\n If False, an error is raised in case of existing data.\n\n Returns:\n The Zarr array if return_stored is True; otherwise, None.\n \"\"\"\n\n row_idx = 0\n z = None\n numpy_block = None\n block_generator = cast(Iterator[Iterator[TensorType]], self.generator_factory())\n\n if self.len_outer_generator is not None:\n block_generator = cast(\n Iterator[Iterator[TensorType]],\n tqdm(\n block_generator, total=self.len_outer_generator, desc=\"Row blocks\"\n ),\n )\n\n for row_blocks in block_generator:\n col_idx = 0\n for block in row_blocks:\n numpy_block = converter.to_numpy(block)\n if z is None:\n z = self._initialize_zarr_array(numpy_block, path_or_url, overwrite)\n new_shape = self._new_shape_according_to_block(\n z, numpy_block, row_idx, col_idx\n )\n z.resize(new_shape)\n idx_slice_to_update = self._idx_slice_for_update(\n numpy_block, row_idx, col_idx\n )\n z[idx_slice_to_update] = numpy_block\n\n col_idx += numpy_block.shape[1]\n\n if numpy_block is None:\n raise ValueError(\"Generator is empty\")\n\n row_idx += numpy_block.shape[0]\n\n return z if return_stored else None\n
"},{"location":"api/pydvl/influence/base_influence_function_model/","title":"Base influence function model","text":""},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model","title":"pydvl.influence.base_influence_function_model","text":""},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel","title":"InfluenceFunctionModel","text":" Bases: Generic[TensorType, DataLoaderType]
, ABC
Generic abstract base class for computing influence related quantities. For a specific influence algorithm and tensor framework, inherit from this base class
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel.n_parameters","title":"n_parameters abstractmethod
property
","text":"n_parameters\n
Number of trainable parameters of the underlying model
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel.is_thread_safe","title":"is_thread_safe abstractmethod
property
","text":"is_thread_safe: bool\n
Whether the influence computation is thread safe
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel.is_fitted","title":"is_fitted abstractmethod
property
","text":"is_fitted\n
Override this, to expose the fitting status of the instance.
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel.fit","title":"fit abstractmethod
","text":"fit(data: DataLoaderType) -> InfluenceFunctionModel\n
Override this method to fit the influence function model to training data, e.g. pre-compute hessian matrix or matrix decompositions
PARAMETER DESCRIPTION data
TYPE: DataLoaderType
RETURNS DESCRIPTION InfluenceFunctionModel
The fitted instance
Source code in src/pydvl/influence/base_influence_function_model.py
@abstractmethod\ndef fit(self, data: DataLoaderType) -> InfluenceFunctionModel:\n \"\"\"\n Override this method to fit the influence function model to training data,\n e.g. pre-compute hessian matrix or matrix decompositions\n\n Args:\n data:\n\n Returns:\n The fitted instance\n \"\"\"\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel.influence_factors","title":"influence_factors","text":"influence_factors(x: TensorType, y: TensorType) -> TensorType\n
Computes the approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed, that the first dimension is the batch dimension.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/base_influence_function_model.py
def influence_factors(self, x: TensorType, y: TensorType) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return self._influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel.influences","title":"influences","text":"influences(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computes the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{test}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
def influences(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n if x is None and y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n\n if x is not None and y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n\n return self._influences(x_test, y_test, x, y, mode)\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel.influences_from_factors","title":"influences_from_factors abstractmethod
","text":"influences_from_factors(\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Override this method to implement the computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
@abstractmethod\ndef influences_from_factors(\n self,\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Override this method to implement the computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence","title":"ComposableInfluence","text":" Bases: InfluenceFunctionModel
, Generic[TensorType, BatchType, DataLoaderType, BlockMapperType]
, ABC
Generic abstract base class, that allow for block-wise computation of influence quantities. Inherit from this base class for specific influence algorithms and tensor frameworks.
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.n_parameters","title":"n_parameters abstractmethod
property
","text":"n_parameters\n
Number of trainable parameters of the underlying model
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.is_thread_safe","title":"is_thread_safe abstractmethod
property
","text":"is_thread_safe: bool\n
Whether the influence computation is thread safe
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: TensorType, y: TensorType) -> TensorType\n
Computes the approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed, that the first dimension is the batch dimension.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/base_influence_function_model.py
def influence_factors(self, x: TensorType, y: TensorType) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return self._influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.influences","title":"influences","text":"influences(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computes the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{test}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
def influences(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n if x is None and y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n\n if x is not None and y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n\n return self._influences(x_test, y_test, x, y, mode)\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.fit","title":"fit","text":"fit(data: DataLoaderType) -> InfluenceFunctionModel\n
Fitting to provided data, by internally creating a block mapper instance from it. Args: data: iterable of tensors
RETURNS DESCRIPTION InfluenceFunctionModel
Fitted instance
Source code in src/pydvl/influence/base_influence_function_model.py
@log_duration(log_level=logging.INFO)\ndef fit(self, data: DataLoaderType) -> InfluenceFunctionModel:\n \"\"\"\n Fitting to provided data, by internally creating a block mapper instance from\n it.\n Args:\n data: iterable of tensors\n\n Returns:\n Fitted instance\n \"\"\"\n self.block_mapper = self._create_block_mapper(data)\n return self\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.influences_by_block","title":"influences_by_block","text":"influences_by_block(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise influence values for the provided data, i.e. an approximation of
\\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of the approximation of \\(H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{test}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_by_block(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise influence values for the provided data, i.e. an\n approximation of\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of the approximation of\n $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block.\n\n \"\"\"\n left_batch = self._create_batch(x_test, y_test)\n\n if x is None:\n if y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n right_batch = left_batch\n else:\n if y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n right_batch = self._create_batch(x, y)\n\n return self.block_mapper.interactions(left_batch, right_batch, mode)\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.influence_factors_by_block","title":"influence_factors_by_block","text":"influence_factors_by_block(\n x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise
OrderedDict[str, TensorType]
approximate inverse Hessian matrix vector products per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influence_factors_by_block(\n self, x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Ordered dictionary of tensors representing the element-wise\n approximate inverse Hessian matrix vector products per block.\n\n \"\"\"\n return self.block_mapper.transformed_grads(self._create_batch(x, y))\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.influences_from_factors_by_block","title":"influences_from_factors_by_block","text":"influences_from_factors_by_block(\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Block-wise computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: OrderedDict[str, TensorType]
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors_by_block(\n self,\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Block-wise computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block\n\n \"\"\"\n return self.block_mapper.interactions_from_transformed_grads(\n z_test_factors, self._create_batch(x, y), mode\n )\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors(\n self,\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n tensors = self.block_mapper.generate_interactions_from_transformed_grads(\n z_test_factors,\n self._create_batch(x, y),\n mode,\n )\n result: TensorType = next(tensors)\n for tensor in tensors:\n result = result + tensor\n return result\n
"},{"location":"api/pydvl/influence/influence_calculator/","title":"Influence calculator","text":""},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator","title":"pydvl.influence.influence_calculator","text":"This module provides functionality for calculating influences for large amount of data. The computation is based on a chunk computation model in the form of an instance of InfluenceFunctionModel, which is mapped over collection of chunks.
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.DisableClientSingleThreadCheck","title":"DisableClientSingleThreadCheck","text":"This type can be provided to the initialization of a DaskInfluenceCalculator instead of a distributed client object. It is useful in those scenarios, where the user want to disable the checking for thread-safety in the initialization phase, e.g. when using the single machine synchronous scheduler for debugging purposes.
Example from pydvl.influence import DisableClientSingleThreadCheck\n\nda_calc = DaskInfluenceCalculator(if_model,\n TorchNumpyConverter(),\n DisableClientSingleThreadCheck)\nda_influences = da_calc.influences(da_x_test, da_y_test, da_x, da_y)\nda_influences.compute(scheduler='synchronous')\n
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.DaskInfluenceCalculator","title":"DaskInfluenceCalculator","text":"DaskInfluenceCalculator(\n influence_function_model: InfluenceFunctionModel,\n converter: NumpyConverter,\n client: Union[Client, Type[DisableClientSingleThreadCheck]],\n)\n
This class is designed to compute influences over dask.array.Array collections, leveraging the capabilities of Dask for distributed computing and parallel processing. It requires an influence computation model of type InfluenceFunctionModel, which defines how influences are computed on a chunk of data. Essentially, this class functions by mapping the influence function model across the various chunks of a dask.array.Array collection.
PARAMETER DESCRIPTION influence_function_model
instance of type InfluenceFunctionModel, that specifies the computation logic for influence on data chunks. It's a pivotal part of the calculator, determining how influence is computed and applied across the data array.
TYPE: InfluenceFunctionModel
converter
A utility for converting numpy arrays to TensorType objects, facilitating the interaction between numpy arrays and the influence function model.
TYPE: NumpyConverter
client
This parameter accepts either of two types:
- A distributed Client object
- The special type DisableClientSingleThreadCheck, which serves as a flag to bypass certain checks.
During initialization, the system verifies that all workers are operating in single-threaded mode when the provided influence_function_model is designated as not thread-safe (indicated by the is_thread_safe property). If this condition is not met, initialization will raise an error, signaling a potential thread-safety conflict.
To intentionally skip this safety check (e.g., for debugging purposes using the single machine synchronous scheduler), you can supply the DisableClientSingleThreadCheck type.
TYPE: Union[Client, Type[DisableClientSingleThreadCheck]]
Warning
Make sure to set threads_per_worker=1 when using the distributed scheduler for computing, if your implementation of InfluenceFunctionModel is not thread-safe.
client = Client(threads_per_worker=1)\n
For details on dask schedulers see the official documentation. Example import torch\nfrom torch.utils.data import Dataset, DataLoader\nfrom pydvl.influence import DaskInfluenceCalculator\nfrom pydvl.influence.torch import CgInfluence\nfrom pydvl.influence.torch.util import (\n torch_dataset_to_dask_array,\n TorchNumpyConverter,\n)\nfrom distributed import Client\n\n# Potentially large datasets that do not fit into memory\ntrain_data_set: Dataset = LargeDataSet(...)\ntest_data_set: Dataset = LargeDataSet(...)\n\ntrain_dataloader = DataLoader(train_data_set)\ninfl_model = CgInfluence(model, loss, hessian_regularization=0.01)\ninfl_model = infl_model.fit(train_dataloader)\n\n# wrap your input data into dask arrays\nchunk_size = 10\nda_x, da_y = torch_dataset_to_dask_array(train_data_set, chunk_size=chunk_size)\nda_x_test, da_y_test = torch_dataset_to_dask_array(test_data_set,\n chunk_size=chunk_size)\n\n# use only one thread for scheduling, due to non-thread safety of some torch\n# operations\nclient = Client(n_workers=4, threads_per_worker=1)\n\ninfl_calc = DaskInfluenceCalculator(infl_model,\n TorchNumpyConverter(device=torch.device(\"cpu\")),\n client)\nda_influences = infl_calc.influences(da_x_test, da_y_test, da_x, da_y)\n# da_influences is a dask.array.Array\n\n# trigger computation and write chunks to disk in parallel\nda_influences.to_zarr(\"path/or/url\")\n
Source code in src/pydvl/influence/influence_calculator.py
def __init__(\n self,\n influence_function_model: InfluenceFunctionModel,\n converter: NumpyConverter,\n client: Union[Client, Type[DisableClientSingleThreadCheck]],\n):\n self._n_parameters = influence_function_model.n_parameters\n self.influence_function_model = influence_function_model\n self.numpy_converter = converter\n\n if isinstance(client, type(DisableClientSingleThreadCheck)):\n logger.warning(DisableClientSingleThreadCheck.warning_msg())\n self.influence_function_model = delayed(influence_function_model)\n elif isinstance(client, Client):\n self._validate_client(client, influence_function_model)\n self.influence_function_model = client.scatter(\n influence_function_model, broadcast=True\n )\n else:\n raise ValueError(\n \"The 'client' parameter \"\n \"must either be a distributed.Client object or the\"\n \"type 'DisableClientSingleThreadCheck'.\"\n )\n
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.DaskInfluenceCalculator.n_parameters","title":"n_parameters property
","text":"n_parameters\n
Number of trainable parameters of the underlying model used in the batch computation
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.DaskInfluenceCalculator.influence_factors","title":"influence_factors","text":"influence_factors(x: Array, y: Array) -> Array\n
Computes the expression
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradients are computed for the chunks of \\((x, y)\\).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: Array
y
label tensor to compute gradients
TYPE: Array
RETURNS DESCRIPTION Array
dask.array.Array representing the element-wise inverse Hessian matrix vector products for the provided batch.
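For instance, reusing da_x, da_y and infl_calc from the class example above, the factors can be computed lazily and written to disk chunk by chunk; this is only a sketch, and the zarr path is a placeholder:

# lazily map the fitted influence model over the chunks of the dask arrays
da_factors = infl_calc.influence_factors(da_x, da_y)
# da_factors has shape (N, n_parameters); nothing has been computed yet
da_factors.to_zarr("factors/path/or/url")  # triggers the chunk-wise computation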
Source code in src/pydvl/influence/influence_calculator.py
def influence_factors(self, x: da.Array, y: da.Array) -> da.Array:\n r\"\"\"\n Computes the expression\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradients are computed for the chunks of $(x, y)$.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n [dask.array.Array][dask.array.Array] representing the element-wise inverse\n Hessian matrix vector products for the provided batch.\n\n \"\"\"\n\n self._validate_aligned_chunking(x, y)\n self._validate_dimensions_not_chunked(x)\n self._validate_dimensions_not_chunked(y)\n\n def func(x_numpy: NDArray, y_numpy: NDArray, model: InfluenceFunctionModel):\n factors = model.influence_factors(\n self.numpy_converter.from_numpy(x_numpy),\n self.numpy_converter.from_numpy(y_numpy),\n )\n return self.numpy_converter.to_numpy(factors)\n\n chunks = []\n for x_chunk, y_chunk, chunk_size in zip(\n x.to_delayed(), y.to_delayed(), x.chunks[0]\n ):\n chunk_shape = (chunk_size, self.n_parameters)\n chunk_array = da.from_delayed(\n delayed(func)(\n x_chunk.squeeze()[()],\n y_chunk.squeeze()[()],\n self.influence_function_model,\n ),\n dtype=x.dtype,\n shape=chunk_shape,\n )\n chunks.append(chunk_array)\n\n return da.concatenate(chunks)\n
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.DaskInfluenceCalculator.influences","title":"influences","text":"influences(\n x_test: Array,\n y_test: Array,\n x: Optional[Array] = None,\n y: Optional[Array] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Array\n
Compute approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The computation is done block-wise for the chunks of the provided dask arrays.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Array
y_test
label tensor to compute gradients
TYPE: Array
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[Array]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[Array]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Array
dask.array.Array representing the element-wise scalar products for the provided batch.
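Because the result is a lazy dask array, it can be sliced before triggering the computation, so that in general only the blocks covering the slice are evaluated. A short sketch, reusing the arrays from the class example above:

da_influences = infl_calc.influences(da_x_test, da_y_test, da_x, da_y)
# evaluate only the influences of the first 100 training points on the
# first 10 test points
subset = da_influences[:10, :100].compute()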
Source code in src/pydvl/influence/influence_calculator.py
def influences(\n self,\n x_test: da.Array,\n y_test: da.Array,\n x: Optional[da.Array] = None,\n y: Optional[da.Array] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> da.Array:\n r\"\"\"\n Compute approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The computation is done block-wise\n for the chunks of the provided dask arrays.\n\n Args:\n x_test: model input to use in the gradient computations of\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n [dask.array.Array][dask.array.Array] representing the element-wise scalar\n products for the provided batch.\n\n \"\"\"\n\n self._validate_aligned_chunking(x_test, y_test)\n self._validate_dimensions_not_chunked(x_test)\n self._validate_dimensions_not_chunked(y_test)\n\n if (x is None) != (y is None):\n if x is None:\n raise ValueError(\n \"Providing labels y without providing model input x \"\n \"is not supported\"\n )\n if y is None:\n raise ValueError(\n \"Providing model input x without labels y is not supported\"\n )\n elif x is not None:\n self._validate_aligned_chunking(x, y)\n self._validate_dimensions_not_chunked(x)\n self._validate_dimensions_not_chunked(y)\n else:\n x, y = x_test, y_test\n\n def func(\n x_test_numpy: NDArray,\n y_test_numpy: NDArray,\n x_numpy: NDArray,\n y_numpy: NDArray,\n model: InfluenceFunctionModel,\n ):\n values = model.influences(\n self.numpy_converter.from_numpy(x_test_numpy),\n self.numpy_converter.from_numpy(y_test_numpy),\n self.numpy_converter.from_numpy(x_numpy),\n self.numpy_converter.from_numpy(y_numpy),\n mode,\n )\n return self.numpy_converter.to_numpy(values)\n\n un_chunked_x_shapes = [s[0] for s in x_test.chunks[1:]]\n x_test_chunk_sizes = x_test.chunks[0]\n x_chunk_sizes = x.chunks[0]\n blocks = []\n block_shape: Tuple[int, ...]\n\n for x_test_chunk, y_test_chunk, test_chunk_size in zip(\n x_test.to_delayed(), y_test.to_delayed(), x_test_chunk_sizes\n ):\n row = []\n for x_chunk, y_chunk, chunk_size in zip(\n x.to_delayed(), y.to_delayed(), x_chunk_sizes # type:ignore\n ):\n if mode == InfluenceMode.Up:\n block_shape = (test_chunk_size, chunk_size)\n elif mode == InfluenceMode.Perturbation:\n block_shape = (test_chunk_size, chunk_size, *un_chunked_x_shapes)\n else:\n raise UnsupportedInfluenceModeException(mode)\n\n block_array = da.from_delayed(\n delayed(func)(\n x_test_chunk.squeeze()[()],\n y_test_chunk.squeeze()[()],\n x_chunk.squeeze()[()],\n y_chunk.squeeze()[()],\n self.influence_function_model,\n ),\n shape=block_shape,\n dtype=x_test.dtype,\n )\n\n if mode == InfluenceMode.Perturbation:\n n_dims = block_array.ndim\n new_order = tuple(range(2, n_dims)) + (0, 1)\n block_array = block_array.transpose(new_order)\n\n row.append(block_array)\n 
blocks.append(row)\n\n values_array = da.block(blocks)\n\n if mode == InfluenceMode.Perturbation:\n n_dims = values_array.ndim\n new_order = (n_dims - 2, n_dims - 1) + tuple(range(n_dims - 2))\n values_array = values_array.transpose(new_order)\n\n return values_array\n
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.DaskInfluenceCalculator.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: Array,\n x: Array,\n y: Array,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Array\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Array
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Array
y
optional label tensor to compute gradients
TYPE: Array
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Array
dask.array.Array representing the element-wise scalar product of the provided batch
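This allows splitting the computation into two stages, so that the expensive inverse-Hessian application is performed once and the resulting factors are reused. A minimal sketch under the same assumptions as the class example above (the zarr path is a placeholder):

# stage 1: apply the (approximate) inverse Hessian to the test gradients
da_factors = infl_calc.influence_factors(da_x_test, da_y_test)
# stage 2: contract the factors against the training gradients
da_values = infl_calc.influences_from_factors(da_factors, da_x, da_y)
da_values.to_zarr("influences/path/or/url")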
Source code in src/pydvl/influence/influence_calculator.py
def influences_from_factors(\n self,\n z_test_factors: da.Array,\n x: da.Array,\n y: da.Array,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> da.Array:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant\n to be per sample of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: optional model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n [dask.array.Array][dask.array.Array] representing the element-wise scalar\n product of the provided batch\n\n \"\"\"\n self._validate_aligned_chunking(x, y)\n self._validate_dimensions_not_chunked(x)\n self._validate_dimensions_not_chunked(y)\n self._validate_dimensions_not_chunked(z_test_factors)\n\n def func(\n z_test_numpy: NDArray,\n x_numpy: NDArray,\n y_numpy: NDArray,\n model: InfluenceFunctionModel,\n ):\n ups = model.influences_from_factors(\n self.numpy_converter.from_numpy(z_test_numpy),\n self.numpy_converter.from_numpy(x_numpy),\n self.numpy_converter.from_numpy(y_numpy),\n mode=mode,\n )\n return self.numpy_converter.to_numpy(ups)\n\n un_chunked_x_shape = [s[0] for s in x.chunks[1:]]\n x_chunk_sizes = x.chunks[0]\n z_test_chunk_sizes = z_test_factors.chunks[0]\n blocks = []\n block_shape: Tuple[int, ...]\n\n for z_test_chunk, z_test_chunk_size in zip(\n z_test_factors.to_delayed(), z_test_chunk_sizes\n ):\n row = []\n for x_chunk, y_chunk, chunk_size in zip(\n x.to_delayed(), y.to_delayed(), x_chunk_sizes\n ):\n if mode == InfluenceMode.Perturbation:\n block_shape = (z_test_chunk_size, chunk_size, *un_chunked_x_shape)\n elif mode == InfluenceMode.Up:\n block_shape = (z_test_chunk_size, chunk_size)\n else:\n raise UnsupportedInfluenceModeException(mode)\n\n block_array = da.from_delayed(\n delayed(func)(\n z_test_chunk.squeeze()[()],\n x_chunk.squeeze()[()],\n y_chunk.squeeze()[()],\n self.influence_function_model,\n ),\n shape=block_shape,\n dtype=z_test_factors.dtype,\n )\n\n if mode == InfluenceMode.Perturbation:\n n_dims = block_array.ndim\n new_order = tuple(range(2, n_dims)) + (0, 1)\n block_array = block_array.transpose(*new_order)\n\n row.append(block_array)\n blocks.append(row)\n\n values_array = da.block(blocks)\n\n if mode == InfluenceMode.Perturbation:\n n_dims = values_array.ndim\n new_order = (n_dims - 2, n_dims - 1) + tuple(range(n_dims - 2))\n values_array = values_array.transpose(*new_order)\n\n return values_array\n
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.SequentialInfluenceCalculator","title":"SequentialInfluenceCalculator","text":"SequentialInfluenceCalculator(influence_function_model: InfluenceFunctionModel)\n
This class serves as a simple wrapper for processing batches of data in a sequential manner. It is particularly useful in scenarios where parallel or distributed processing is not required or not feasible. The core functionality of this class is to apply a specified influence computation model, of type InfluenceFunctionModel, to batches of data one at a time.
PARAMETER DESCRIPTION influence_function_model
An instance of type InfluenceFunctionModel that specifies the computation logic for influence on data chunks.
TYPE: InfluenceFunctionModel
Example from pydvl.influence import SequentialInfluenceCalculator\nfrom pydvl.influence.torch.util import (\nNestedTorchCatAggregator,\nTorchNumpyConverter,\n)\nfrom pydvl.influence.torch import CgInfluence\n\nbatch_size = 10\ntrain_dataloader = DataLoader(..., batch_size=batch_size)\ntest_dataloader = DataLoader(..., batch_size=batch_size)\n\ninfl_model = CgInfluence(model, loss, hessian_regularization=0.01)\ninfl_model = infl_model.fit(train_dataloader)\n\ninfl_calc = SequentialInfluenceCalculator(infl_model)\n\n# this does not trigger the computation\nlazy_influences = infl_calc.influences(test_dataloader, train_dataloader)\n\n# trigger computation and pull the result into main memory, result is the full\n# tensor for all combinations of the two loaders\ninfluences = lazy_influences.compute(aggregator=NestedTorchCatAggregator())\n# or\n# trigger computation and write results chunk-wise to disk using zarr in a\n# sequential manner\nlazy_influences.to_zarr(\"local_path/or/url\", TorchNumpyConverter())\n
Source code in src/pydvl/influence/influence_calculator.py
def __init__(\n self,\n influence_function_model: InfluenceFunctionModel,\n):\n self.influence_function_model = influence_function_model\n
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.SequentialInfluenceCalculator.influence_factors","title":"influence_factors","text":"influence_factors(\n data_iterable: Iterable[Tuple[TensorType, TensorType]]\n) -> LazyChunkSequence\n
Compute the expression
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient are computed for the chunks \\((x, y)\\) of the data_iterable in a sequential manner.
PARAMETER DESCRIPTION data_iterable
An iterable that returns tuples of tensors. Each tuple consists of a pair of tensors (x, y), representing input data and corresponding targets.
TYPE: Iterable[Tuple[TensorType, TensorType]]
RETURNS DESCRIPTION LazyChunkSequence
A lazy data structure representing the chunks of the resulting tensor
Source code in src/pydvl/influence/influence_calculator.py
def influence_factors(\n self,\n data_iterable: Iterable[Tuple[TensorType, TensorType]],\n) -> LazyChunkSequence:\n r\"\"\"\n Compute the expression\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient are computed for the chunks $(x, y)$ of the data_iterable in\n a sequential manner.\n\n Args:\n data_iterable: An iterable that returns tuples of tensors.\n Each tuple consists of a pair of tensors (x, y), representing input data\n and corresponding targets.\n\n Returns:\n A lazy data structure representing the chunks of the resulting tensor\n \"\"\"\n try:\n len_iterable = len(cast(Sized, data_iterable))\n except Exception as e:\n logger.debug(f\"Failed to retrieve len of data iterable: {e}\")\n len_iterable = None\n\n tensors_gen_factory = partial(self._influence_factors_gen, data_iterable)\n return LazyChunkSequence(tensors_gen_factory, len_generator=len_iterable)\n
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.SequentialInfluenceCalculator.influences","title":"influences","text":"influences(\n test_data_iterable: Iterable[Tuple[TensorType, TensorType]],\n train_data_iterable: Iterable[Tuple[TensorType, TensorType]],\n mode: InfluenceMode = InfluenceMode.Up,\n) -> NestedLazyChunkSequence\n
Compute approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The computation is done block-wise for the chunks of the provided data iterables and aggregated into a single tensor in memory.
PARAMETER DESCRIPTION test_data_iterable
An iterable that returns tuples of tensors. Each tuple consists of a pair of tensors (x, y), representing input data and corresponding targets.
TYPE: Iterable[Tuple[TensorType, TensorType]]
train_data_iterable
An iterable that returns tuples of tensors. Each tuple consists of a pair of tensors (x, y), representing input data and corresponding targets.
TYPE: Iterable[Tuple[TensorType, TensorType]]
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION NestedLazyChunkSequence
A lazy data structure representing the chunks of the resulting tensor
Source code in src/pydvl/influence/influence_calculator.py
def influences(\n self,\n test_data_iterable: Iterable[Tuple[TensorType, TensorType]],\n train_data_iterable: Iterable[Tuple[TensorType, TensorType]],\n mode: InfluenceMode = InfluenceMode.Up,\n) -> NestedLazyChunkSequence:\n r\"\"\"\n Compute approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The computation is done block-wise for\n the chunks of the provided\n data iterables and aggregated into a single tensor in memory.\n\n Args:\n test_data_iterable: An iterable that returns tuples of tensors.\n Each tuple consists of a pair of tensors (x, y), representing input data\n and corresponding targets.\n train_data_iterable: An iterable that returns tuples of tensors.\n Each tuple consists of a pair of tensors (x, y), representing input data\n and corresponding targets.\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n A lazy data structure representing the chunks of the resulting tensor\n\n \"\"\"\n nested_tensor_gen_factory = partial(\n self._influences_gen,\n test_data_iterable,\n train_data_iterable,\n mode,\n )\n\n try:\n len_iterable = len(cast(Sized, test_data_iterable))\n except Exception as e:\n logger.debug(f\"Failed to retrieve len of test data iterable: {e}\")\n len_iterable = None\n\n return NestedLazyChunkSequence(\n nested_tensor_gen_factory, len_outer_generator=len_iterable\n )\n
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.SequentialInfluenceCalculator.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: Iterable[TensorType],\n train_data_iterable: Iterable[Tuple[TensorType, TensorType]],\n mode: InfluenceMode = InfluenceMode.Up,\n) -> NestedLazyChunkSequence\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
Pre-computed iterable of tensors, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Iterable[TensorType]
train_data_iterable
An iterable that returns tuples of tensors. Each tuple consists of a pair of tensors (x, y), representing input data and corresponding targets.
TYPE: Iterable[Tuple[TensorType, TensorType]]
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION NestedLazyChunkSequence
A lazy data structure representing the chunks of the resulting tensor
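For example, the factors can be materialized once per test batch (here with a plain list comprehension; any Iterable of tensors works) and then contracted lazily against the training data. A sketch, reusing infl_model, infl_calc and the data loaders from the class example above:

from pydvl.influence.torch.util import NestedTorchCatAggregator

# materialize the test factors batch by batch
z_test_factors = [
    infl_model.influence_factors(x, y) for x, y in test_dataloader
]

# lazy, chunk-wise contraction against the training gradients
lazy_values = infl_calc.influences_from_factors(
    z_test_factors, train_dataloader
)
values = lazy_values.compute(aggregator=NestedTorchCatAggregator())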
Source code in src/pydvl/influence/influence_calculator.py
def influences_from_factors(\n self,\n z_test_factors: Iterable[TensorType],\n train_data_iterable: Iterable[Tuple[TensorType, TensorType]],\n mode: InfluenceMode = InfluenceMode.Up,\n) -> NestedLazyChunkSequence:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x))\n \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta}\n \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: Pre-computed iterable of tensors, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n train_data_iterable: An iterable that returns tuples of tensors.\n Each tuple consists of a pair of tensors (x, y), representing input data\n and corresponding targets.\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n A lazy data structure representing the chunks of the resulting tensor\n\n \"\"\"\n nested_tensor_gen = partial(\n self._influences_from_factors_gen,\n z_test_factors,\n train_data_iterable,\n mode,\n )\n\n try:\n len_iterable = len(cast(Sized, z_test_factors))\n except Exception as e:\n logger.debug(f\"Failed to retrieve len of factors iterable: {e}\")\n len_iterable = None\n\n return NestedLazyChunkSequence(\n nested_tensor_gen, len_outer_generator=len_iterable\n )\n
"},{"location":"api/pydvl/influence/types/","title":"Types","text":""},{"location":"api/pydvl/influence/types/#pydvl.influence.types","title":"pydvl.influence.types","text":"This module offers a set of generic types, which can be used to build modular and flexible components for influence computation for different tensor frameworks.
Key components include:
- GradientProvider: A generic abstract base class designed to provide methods for computing per-sample gradients and other related computations for given data batches.
- BilinearForm: A generic abstract base class for representing bilinear forms for computing inner products involving gradients.
- Operator: A generic abstract base class for operators that can apply transformations to vectors and matrices and can be represented as bilinear forms.
- OperatorGradientComposition: A generic abstract composition class that integrates an operator with a gradient provider to compute interactions between batches of data.
- BlockMapper: A generic abstract base class for mapping operations across multiple compositional blocks, given by objects of type OperatorGradientComposition, and aggregating the results.
To see how these types are used, see the implementation ComposableInfluence. Using these components allows for the straightforward implementation of various combinations of approximations of inverse Hessian applications (or Gauss-Newton approximations), different blocking strategies (e.g. layer-wise or block-wise), and different ways to compute gradients.
To use these types with a specific tensor framework, they must be subclassed. An example for torch is provided in the module pydvl.influence.torch.base and the base class TorchComposableInfluence.
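To make the interplay of these components concrete, the following is a self-contained toy sketch in plain numpy. It does not use pydvl's actual abstract classes; it only mimics the roles of a gradient provider (per-sample gradients), an operator (standing in for an approximate inverse Hessian) and their composition into an up-weighting interaction:

import numpy as np

class ToyGradientProvider:
    """Per-sample gradients of a squared-error loss for a linear model."""
    def __init__(self, theta):
        self.theta = theta

    def flat_grads(self, x, y):
        # gradient of 0.5 * (x @ theta - y)**2 per sample, shape (N, d)
        residual = x @ self.theta - y
        return residual[:, None] * x

class ToyOperator:
    """Stands in for an (approximate) inverse Hessian application."""
    def __init__(self, diag):
        self.diag = diag

    def apply(self, grads):
        # shape-preserving transformation of the per-sample gradients
        return grads / self.diag

def up_weighting_interactions(op, gp, x_test, y_test, x_train, y_train):
    # <Op(grad l(test)), grad l(train)>, resulting shape (N_test, N_train)
    left = op.apply(gp.flat_grads(x_test, y_test))
    right = gp.flat_grads(x_train, y_train)
    return left @ right.T

rng = np.random.default_rng(0)
gp = ToyGradientProvider(theta=rng.normal(size=3))
op = ToyOperator(diag=np.full(3, 2.0))
x_train, y_train = rng.normal(size=(5, 3)), rng.normal(size=5)
x_test, y_test = rng.normal(size=(2, 3)), rng.normal(size=2)
print(up_weighting_interactions(op, gp, x_test, y_test, x_train, y_train).shape)  # (2, 5)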
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.InfluenceMode","title":"InfluenceMode","text":" Bases: str
, Enum
Enum representation for the types of influence.
ATTRIBUTE DESCRIPTION Up
Approximating the influence of a point
Perturbation
Perturbation definition of the influence score
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.Batch","title":"Batch dataclass
","text":"Batch(x: TensorType, y: TensorType)\n
Bases: Generic[TensorType]
Represents a batch of data containing features and labels.
ATTRIBUTE DESCRIPTION x
Represents the input features of the batch.
TYPE: TensorType
y
Represents the labels or targets associated with the input features.
TYPE: TensorType
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.GradientProvider","title":"GradientProvider","text":" Bases: Generic[BatchType, TensorType]
, ABC
Provides an interface for calculating per-sample gradients and other related computations for a given batch of data.
This class must be subclassed with implementations for its abstract methods tailored to specific gradient computation needs, e.g. using an autograd engine for a model loss function. Consider a function
\\[ \\ell: \\mathbb{R}^{d_1} \\times \\mathbb{R}^{d_2} \\times \\mathbb{R}^{n} \\times \\mathbb{R}^{n}, \\quad \\ell(\\omega_1, \\omega_2, x, y) = \\operatorname{loss}(f(\\omega_1, \\omega_2; x), y) \\] e.g. a two layer neural network \\(f\\) with a loss function, then this object should compute the expressions:
\\[ \\nabla_{\\omega_{i}}\\ell(\\omega_1, \\omega_2, x, y), \\nabla_{\\omega_{i}}\\nabla_{x}\\ell(\\omega_1, \\omega_2, x, y), \\nabla_{\\omega}\\ell(\\omega_1, \\omega_2, x, y) \\cdot v\\]"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.GradientProvider.jacobian_prod","title":"jacobian_prod abstractmethod
","text":"jacobian_prod(batch: BatchType, g: TensorType) -> TensorType\n
Computes the matrix-Jacobian product for the provided batch and input tensor. Given the example in the class docstring, this means
\\[ (\\nabla_{\\omega_{1}}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y}), \\nabla_{\\omega_{2}}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y})) \\cdot g^T\\] where g must be a tensor of shape \\((K, d_1+d_2)\\), so the resulting tensor is of shape \\((N, K)\\).
PARAMETER DESCRIPTION batch
The batch of data for which to compute the Jacobian.
TYPE: BatchType
g
The tensor to be used in the matrix-Jacobian product calculation.
TYPE: TensorType
RETURNS DESCRIPTION TensorType
The resulting tensor from the matrix-Jacobian product computation.
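In terms of plain arrays this is simply a contraction over the parameter dimension. A small numpy shape sketch of the contract described above (an illustration only, not pydvl code):

import numpy as np

N, d1, d2, K = 4, 3, 2, 5
per_sample_grads = np.random.rand(N, d1 + d2)  # flat per-sample gradients
g = np.random.rand(K, d1 + d2)                 # input tensor of shape (K, d1 + d2)
result = per_sample_grads @ g.T                # matrix-Jacobian product, shape (N, K)
assert result.shape == (N, K)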
Source code in src/pydvl/influence/types.py
@abstractmethod\ndef jacobian_prod(\n self,\n batch: BatchType,\n g: TensorType,\n) -> TensorType:\n r\"\"\"\n Computes the matrix-Jacobian product for the provided batch and input tensor.\n Given the example in the class docstring, this means\n\n $$ (\\nabla_{\\omega_{1}}\\ell(\\omega_1, \\omega_2,\n \\text{batch.x}, \\text{batch.y}),\n \\nabla_{\\omega_{2}}\\ell(\\omega_1, \\omega_2,\n \\text{batch.x}, \\text{batch.y})) \\cdot g^T$$\n\n where g must be a tensor of shape $(K, d_1+d_2)$, so the resulting tensor\n is of shape $(N, K)$.\n\n Args:\n batch: The batch of data for which to compute the Jacobian.\n g: The tensor to be used in the matrix-Jacobian product\n calculation.\n\n Returns:\n The resulting tensor from the matrix-Jacobian product computation.\n \"\"\"\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.GradientProvider.flat_grads","title":"flat_grads abstractmethod
","text":"flat_grads(batch: BatchType) -> TensorType\n
Computes and returns the flat per-sample gradients for the provided batch. Given the example in the class docstring, this means
\\[ (\\nabla_{\\omega_{1}}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y}), \\nabla_{\\omega_{2}}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y}))\\] where the first dimension of the resulting tensor is always considered to be the batch dimension, so the shape of the resulting tensor is \\((N, d_1+d_2)\\), where \\(N\\) is the number of samples in the batch.
PARAMETER DESCRIPTION batch
The batch of data for which to compute the gradients.
TYPE: BatchType
RETURNS DESCRIPTION TensorType
A tensor containing the flat gradients computed per sample.
Source code in src/pydvl/influence/types.py
@abstractmethod\ndef flat_grads(self, batch: BatchType) -> TensorType:\n r\"\"\"\n Computes and returns the flat per-sample gradients for the provided batch.\n Given the example in the class docstring, this means\n\n $$ (\\nabla_{\\omega_{1}}\\ell(\\omega_1, \\omega_2,\n \\text{batch.x}, \\text{batch.y}),\n \\nabla_{\\omega_{2}}\\ell(\\omega_1, \\omega_2,\n \\text{batch.x}, \\text{batch.y}))$$\n\n where the first dimension of the resulting tensor is always considered to be\n the batch dimension, so the shape of the resulting tensor is $(N, d_1+d_2)$,\n where $N$ is the number of samples in the batch.\n\n Args:\n batch: The batch of data for which to compute the gradients.\n\n Returns:\n A tensor containing the flat gradients computed per sample.\n \"\"\"\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.GradientProvider.flat_mixed_grads","title":"flat_mixed_grads abstractmethod
","text":"flat_mixed_grads(batch: BatchType) -> TensorType\n
Computes and returns the flat per-sample mixed gradients for the provided batch. Given the example in the class docstring, this means
\\[ (\\nabla_{\\omega_1}\\nabla_{x}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y}), \\nabla_{\\omega_1}\\nabla_{x}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y} ))\\] where the first dimension of the resulting tensor is always considered to be the batch dimension and the last to be the non-batch input related derivatives. So the shape of the resulting tensor is \\((N, n, d_1 + d_2)\\), where \\(N\\) is the number of samples in the batch.
PARAMETER DESCRIPTION batch
The batch of data for which to compute the flat mixed gradients.
TYPE: BatchType
RETURNS DESCRIPTION TensorType
A tensor containing the flat mixed gradients computed per sample.
Source code in src/pydvl/influence/types.py
@abstractmethod\ndef flat_mixed_grads(self, batch: BatchType) -> TensorType:\n r\"\"\"\n Computes and returns the flat per-sample mixed gradients for the provided batch.\n Given the example in the class docstring, this means\n\n $$ (\\nabla_{\\omega_1}\\nabla_{x}\\ell(\\omega_1,\n \\omega_2, \\text{batch.x}, \\text{batch.y}),\n \\nabla_{\\omega_1}\\nabla_{x}\\ell(\\omega_1,\n \\omega_2, \\text{batch.x}, \\text{batch.y} ))$$\n\n where the first dimension of the resulting tensor is always considered to be\n the batch dimension and the last to be the non-batch input related derivatives.\n So the shape of the resulting tensor is $(N, n, d_1 + d_2)$,\n where $N$ is the number of samples in the batch.\n\n Args:\n batch: The batch of data for which to compute the flat mixed gradients.\n\n Returns:\n A tensor containing the flat mixed gradients computed per sample.\n \"\"\"\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BilinearForm","title":"BilinearForm","text":" Bases: Generic[TensorType, BatchType, GradientProviderType]
, ABC
Abstract base class for bilinear forms, which facilitates the computation of inner products involving gradients of batches of data.
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BilinearForm.inner_prod","title":"inner_prod abstractmethod
","text":"inner_prod(left: TensorType, right: Optional[TensorType]) -> TensorType\n
Computes the inner product of two vectors, i.e.
\\[ \\langle x, y \\rangle_{B}\\] if we denote the bilinear-form by \\(\\langle \\cdot, \\cdot \\rangle_{B}\\). The implementations must take care of according vectorization to make it applicable to the case, where left
and right
are not one-dimensional. In this case, the trailing dimension of the left
and right
tensors are considered for the computation of the inner product. For example, if left
is a tensor of shape \\((N, D)\\) and, right
is of shape \\((M,..., D)\\), then the result is of shape \\((N, M, ...)\\).
PARAMETER DESCRIPTION left
The first tensor in the inner product computation.
TYPE: TensorType
right
The second tensor, optional; if not provided, the inner product will use the left tensor for both arguments.
TYPE: Optional[TensorType]
RETURNS DESCRIPTION TensorType
A tensor representing the inner product.
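The vectorization contract for higher-dimensional arguments can be illustrated with a plain einsum over the trailing dimension (this shows only the standard, unweighted inner product and is not pydvl's implementation):

import numpy as np

N, M, D, extra = 3, 4, 5, 2
left = np.random.rand(N, D)
right = np.random.rand(M, extra, D)
# contract over the trailing dimension D; the result has shape (N, M, extra)
result = np.einsum("nd,m...d->nm...", left, right)
assert result.shape == (N, M, extra)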
Source code in src/pydvl/influence/types.py
@abstractmethod\ndef inner_prod(self, left: TensorType, right: Optional[TensorType]) -> TensorType:\n r\"\"\"\n Computes the inner product of two vectors, i.e.\n\n $$ \\langle x, y \\rangle_{B}$$\n\n if we denote the bilinear-form by $\\langle \\cdot, \\cdot \\rangle_{B}$.\n The implementations must take care of according vectorization to make\n it applicable to the case, where `left` and `right` are not one-dimensional.\n In this case, the trailing dimension of the `left` and `right` tensors are\n considered for the computation of the inner product. For example,\n if `left` is a tensor of shape $(N, D)$ and, `right` is of shape $(M,..., D)$,\n then the result is of shape $(N, M, ...)$.\n\n Args:\n left: The first tensor in the inner product computation.\n right: The second tensor, optional; if not provided, the inner product will\n use `left` tensor for both arguments.\n\n Returns:\n A tensor representing the inner product.\n \"\"\"\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BilinearForm.grads_inner_prod","title":"grads_inner_prod","text":"grads_inner_prod(\n left: BatchType,\n right: Optional[BatchType],\n gradient_provider: GradientProviderType,\n) -> TensorType\n
Computes the gradient inner product of two batches of data, i.e.
\\[ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}), \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}\\] where \\(\\nabla_{\\omega}\\ell(\\omega, \\cdot, \\cdot)\\) is represented by the gradient_provider
and the expression must be understood sample-wise.
PARAMETER DESCRIPTION left
The first batch for gradient and inner product computation
TYPE: BatchType
right
The second batch for gradient and inner product computation, optional; if not provided, the inner product will use the gradient computed for left for both arguments.
TYPE: Optional[BatchType]
gradient_provider
The gradient provider to compute the gradients.
TYPE: GradientProviderType
RETURNS DESCRIPTION TensorType
A tensor representing the inner products of the per-sample gradients
Source code in src/pydvl/influence/types.py
def grads_inner_prod(\n self,\n left: BatchType,\n right: Optional[BatchType],\n gradient_provider: GradientProviderType,\n) -> TensorType:\n r\"\"\"\n Computes the gradient inner product of two batches of data, i.e.\n\n $$ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}),\n \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}$$\n\n where $\\nabla_{\\omega}\\ell(\\omega, \\cdot, \\cdot)$ is represented by the\n `gradient_provider` and the expression must be understood sample-wise.\n\n Args:\n left: The first batch for gradient and inner product computation\n right: The second batch for gradient and inner product computation,\n optional; if not provided, the inner product will use the gradient\n computed for `left` for both arguments.\n gradient_provider: The gradient provider to compute the gradients.\n\n Returns:\n A tensor representing the inner products of the per-sample gradients\n \"\"\"\n left_grad = gradient_provider.flat_grads(left)\n if right is None:\n right_grad = left_grad\n else:\n right_grad = gradient_provider.flat_grads(right)\n return self.inner_prod(left_grad, right_grad)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BilinearForm.mixed_grads_inner_prod","title":"mixed_grads_inner_prod","text":"mixed_grads_inner_prod(\n left: BatchType,\n right: Optional[BatchType],\n gradient_provider: GradientProviderType,\n) -> TensorType\n
Computes the mixed gradient inner product of two batches of data, i.e.
\\[ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}), \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}\\] where \\(\\nabla_{\\omega}\\ell(\\omega, \\cdot)\\) and \\(\\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\cdot)\\) are represented by the gradient_provider
. The expression must be understood sample-wise.
PARAMETER DESCRIPTION left
The first batch for gradient and inner product computation
TYPE: BatchType
right
The second batch for gradient and inner product computation
TYPE: Optional[BatchType]
gradient_provider
The gradient provider to compute the gradients.
TYPE: GradientProviderType
RETURNS DESCRIPTION TensorType
A tensor representing the inner products of the mixed per-sample gradients
Source code in src/pydvl/influence/types.py
def mixed_grads_inner_prod(\n self,\n left: BatchType,\n right: Optional[BatchType],\n gradient_provider: GradientProviderType,\n) -> TensorType:\n r\"\"\"\n Computes the mixed gradient inner product of two batches of data, i.e.\n\n $$ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}),\n \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y})\n \\rangle_{B}$$\n\n where $\\nabla_{\\omega}\\ell(\\omega, \\cdot)$ and\n $\\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\cdot)$ are represented by the\n `gradient_provider`. The expression must be understood sample-wise.\n\n Args:\n left: The first batch for gradient and inner product computation\n right: The second batch for gradient and inner product computation\n gradient_provider: The gradient provider to compute the gradients.\n\n Returns:\n A tensor representing the inner products of the mixed per-sample gradients\n \"\"\"\n left_grad = gradient_provider.flat_grads(left)\n if right is None:\n right = left\n right_mixed_grad = gradient_provider.flat_mixed_grads(right)\n return self.inner_prod(left_grad, right_mixed_grad)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.Operator","title":"Operator","text":" Bases: Generic[TensorType, BilinearFormType]
, ABC
Abstract base class for operators, capable of applying transformations to vectors and matrices, and can be represented as a bilinear form.
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.Operator.input_size","title":"input_size abstractmethod
property
","text":"input_size: int\n
Abstract property to get the needed size for inputs to the operator instance
RETURNS DESCRIPTION int
An integer representing the input size.
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.Operator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.Operator.as_bilinear_form","title":"as_bilinear_form abstractmethod
","text":"as_bilinear_form() -> BilinearFormType\n
Represents the operator as a bilinear form, i.e. the weighted inner product
\\[ \\langle \\operatorname{Op}(x), y \\rangle\\] RETURNS DESCRIPTION BilinearFormType
An instance of type BilinearForm representing this operator.
Source code in src/pydvl/influence/types.py
@abstractmethod\ndef as_bilinear_form(self) -> BilinearFormType:\n r\"\"\"\n Represents the operator as a bilinear form, i.e. the weighted inner product\n\n $$ \\langle \\operatorname{Op}(x), y \\rangle$$\n\n Returns:\n An instance of type [BilinearForm][pydvl.influence.types.BilinearForm]\n representing this operator.\n \"\"\"\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.OperatorGradientComposition","title":"OperatorGradientComposition","text":"OperatorGradientComposition(op: OperatorType, gp: GradientProviderType)\n
Bases: Generic[TensorType, BatchType, OperatorType, GradientProviderType]
Generic base class representing a composable block that integrates an operator and a gradient provider to compute interactions between batches of data.
This block is designed to be flexible, handling different computational modes via an abstract operator and gradient provider.
ATTRIBUTE DESCRIPTION op
The operator used for transformations and influence computations.
gp
The gradient provider used for obtaining necessary gradients.
Source code in src/pydvl/influence/types.py
def __init__(self, op: OperatorType, gp: GradientProviderType):\n self.gp = gp\n self.op = op\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.OperatorGradientComposition.interactions","title":"interactions","text":"interactions(\n left_batch: BatchType, right_batch: Optional[BatchType], mode: InfluenceMode\n)\n
Computes the interaction between the gradients on two batches of data based on the specified mode weighted by the operator action, i.e.
\\[ \\langle \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y})), \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle\\] for the case InfluenceMode.Up
and
\\[ \\langle \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y})), \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle \\] for the case InfluenceMode.Perturbation
.
PARAMETER DESCRIPTION left_batch
The left data batch for gradient computation.
TYPE: BatchType
right_batch
The right data batch for gradient computation.
TYPE: Optional[BatchType]
mode
An instance of InfluenceMode determining the type of influence computation.
TYPE: InfluenceMode
RETURNS DESCRIPTION The result of the influence computation as dictated by the mode.
Source code in src/pydvl/influence/types.py
def interactions(\n self,\n left_batch: BatchType,\n right_batch: Optional[BatchType],\n mode: InfluenceMode,\n):\n r\"\"\"\n Computes the interaction between the gradients on two batches of data based on\n the specified mode weighted by the operator action,\n i.e.\n\n $$ \\langle \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{left.x},\n \\text{left.y})),\n \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle$$\n\n for the case `InfluenceMode.Up` and\n\n $$ \\langle \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{left.x},\n \\text{left.y})),\n \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle $$\n\n for the case `InfluenceMode.Perturbation`.\n\n Args:\n left_batch: The left data batch for gradient computation.\n right_batch: The right data batch for gradient computation.\n mode: An instance of InfluenceMode determining the type of influence\n computation.\n\n Returns:\n The result of the influence computation as dictated by the mode.\n \"\"\"\n bilinear_form = self.op.as_bilinear_form()\n if mode == InfluenceMode.Up:\n return bilinear_form.grads_inner_prod(left_batch, right_batch, self.gp)\n elif mode == InfluenceMode.Perturbation:\n return bilinear_form.mixed_grads_inner_prod(\n left_batch, right_batch, self.gp\n )\n else:\n raise UnsupportedInfluenceModeException(mode)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.OperatorGradientComposition.transformed_grads","title":"transformed_grads","text":"transformed_grads(batch: BatchType)\n
Computes the gradients of a data batch, transformed by the operator application, i.e. the expression
\\[ \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{batch.x}, \\text{batch.y})) \\] PARAMETER DESCRIPTION batch
The data batch for gradient computation.
TYPE: BatchType
RETURNS DESCRIPTION A tensor representing the application of the operator to the gradients.
Source code in src/pydvl/influence/types.py
def transformed_grads(self, batch: BatchType):\n r\"\"\"\n Computes the gradients of a data batch, transformed by the operator application\n , i.e. the expressions\n\n $$ \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{batch.x},\n \\text{batch.y})) $$\n\n Args:\n batch: The data batch for gradient computation.\n\n Returns:\n A tensor representing the application of the operator to the gradients.\n\n \"\"\"\n grads = self.gp.flat_grads(batch)\n return self.op.apply(grads)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.OperatorGradientComposition.interactions_from_transformed_grads","title":"interactions_from_transformed_grads","text":"interactions_from_transformed_grads(\n left_factors: TensorType, right_batch: BatchType, mode: InfluenceMode\n)\n
Computes the interaction between the transformed gradients on two batches of data using pre-computed factors and a batch of data, based on the specified mode. This means
\\[ \\langle \\text{left_factors}, \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle\\] for the case InfluenceMode.Up
and
\\[ \\langle \\text{left_factors}, \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle \\] for the case InfluenceMode.Perturbation
.
PARAMETER DESCRIPTION left_factors
Pre-computed tensor factors from a left batch.
TYPE: TensorType
right_batch
The right data batch for influence computation.
TYPE: BatchType
mode
An instance of InfluenceMode determining the type of influence computation.
TYPE: InfluenceMode
RETURNS DESCRIPTION The result of the interaction computation using the provided factors and batch gradients.
Source code in src/pydvl/influence/types.py
def interactions_from_transformed_grads(\n self, left_factors: TensorType, right_batch: BatchType, mode: InfluenceMode\n):\n r\"\"\"\n Computes the interaction between the transformed gradients on two batches of\n data using pre-computed factors and a batch of data,\n based on the specified mode. This means\n\n $$ \\langle \\text{left_factors},\n \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle$$\n\n for the case `InfluenceMode.Up` and\n\n $$ \\langle \\text{left_factors},\n \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle $$\n\n for the case `InfluenceMode.Perturbation`.\n\n Args:\n left_factors: Pre-computed tensor factors from a left batch.\n right_batch: The right data batch for influence computation.\n mode: An instance of InfluenceMode determining the type of influence\n computation.\n\n Returns:\n The result of the interaction computation using the provided factors and\n batch gradients.\n \"\"\"\n if mode is InfluenceMode.Up:\n right_grads = self.gp.flat_grads(right_batch)\n else:\n right_grads = self.gp.flat_mixed_grads(right_batch)\n return self._tensor_inner_product(left_factors, right_grads)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BlockMapper","title":"BlockMapper","text":"BlockMapper(\n composable_block_dict: OrderedDict[str, OperatorGradientCompositionType]\n)\n
Bases: Generic[TensorType, BatchType, OperatorGradientCompositionType], ABC
Abstract base class for mapping operations across multiple compositional blocks.
This class takes a dictionary of compositional blocks and applies their methods to batches or tensors, and aggregates the results.
ATTRIBUTE DESCRIPTION composable_block_dict
A dictionary mapping string identifiers to composable blocks which define operations like transformations and interactions.
Source code in src/pydvl/influence/types.py
def __init__(\n self, composable_block_dict: OrderedDict[str, OperatorGradientCompositionType]\n):\n self.composable_block_dict = composable_block_dict\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BlockMapper.transformed_grads","title":"transformed_grads","text":"transformed_grads(batch: BatchType) -> OrderedDict[str, TensorType]\n
Computes and returns the transformed gradients for a batch in a dictionary whose keys are defined by the block names.
PARAMETER DESCRIPTION batch
The batch of data for which to compute transformed gradients.
TYPE: BatchType
RETURNS DESCRIPTION OrderedDict[str, TensorType]
An ordered dictionary of transformed gradients by block.
Source code in src/pydvl/influence/types.py
def transformed_grads(\n self,\n batch: BatchType,\n) -> OrderedDict[str, TensorType]:\n \"\"\"\n Computes and returns the transformed gradients for a batch in dictionary\n with the keys defined by the block names.\n\n Args:\n batch: The batch of data for which to compute transformed gradients.\n\n Returns:\n An ordered dictionary of transformed gradients by block.\n \"\"\"\n tensor_gen = self.generate_transformed_grads(batch)\n return self._to_ordered_dict(tensor_gen)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BlockMapper.interactions","title":"interactions","text":"interactions(\n left_batch: BatchType, right_batch: BatchType, mode: InfluenceMode\n) -> OrderedDict[str, TensorType]\n
Computes interactions between two batches, aggregated by block, based on a specified mode.
PARAMETER DESCRIPTION left_batch
The left batch for interaction computation.
TYPE: BatchType
right_batch
The right batch for interaction computation.
TYPE: BatchType
mode
The mode determining the type of interactions.
TYPE: InfluenceMode
RETURNS DESCRIPTION OrderedDict[str, TensorType]
An ordered dictionary of gradient interactions by block.
Source code in src/pydvl/influence/types.py
def interactions(\n self, left_batch: BatchType, right_batch: BatchType, mode: InfluenceMode\n) -> OrderedDict[str, TensorType]:\n \"\"\"\n Computes interactions between two batches, aggregated by block,\n based on a specified mode.\n\n Args:\n left_batch: The left batch for interaction computation.\n right_batch: The right batch for interaction computation.\n mode: The mode determining the type of interactions.\n\n Returns:\n An ordered dictionary of gradient interactions by block.\n \"\"\"\n tensor_gen = self.generate_interactions(left_batch, right_batch, mode)\n return self._to_ordered_dict(tensor_gen)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BlockMapper.interactions_from_transformed_grads","title":"interactions_from_transformed_grads","text":"interactions_from_transformed_grads(\n left_factors: OrderedDict[str, TensorType],\n right_batch: BatchType,\n mode: InfluenceMode,\n) -> OrderedDict[str, TensorType]\n
Computes interactions from transformed gradients and a right batch, aggregated by block and based on a mode.
PARAMETER DESCRIPTION left_factors
Pre-computed factors as a tensor or an ordered dictionary of tensors by block. If the input is a tensor, it is split into blocks according to the ordering in the composable_block_dict attribute.
TYPE: OrderedDict[str, TensorType]
right_batch
The right batch for interaction computation.
TYPE: BatchType
mode
The mode determining the type of interactions.
TYPE: InfluenceMode
RETURNS DESCRIPTION OrderedDict[str, TensorType]
An ordered dictionary of interactions from transformed gradients by block.
Source code in src/pydvl/influence/types.py
def interactions_from_transformed_grads(\n self,\n left_factors: OrderedDict[str, TensorType],\n right_batch: BatchType,\n mode: InfluenceMode,\n) -> OrderedDict[str, TensorType]:\n \"\"\"\n Computes interactions from transformed gradients and a right batch,\n aggregated by block and based on a mode.\n\n Args:\n left_factors: Pre-computed factors as a tensor or an ordered dictionary of\n tensors by block. If the input is a tensor, it is split into blocks\n according to the ordering in the `composable_block_dict` attribute.\n right_batch: The right batch for interaction computation.\n mode: The mode determining the type of interactions.\n\n Returns:\n An ordered dictionary of interactions from transformed gradients by block.\n \"\"\"\n tensor_gen = self.generate_interactions_from_transformed_grads(\n left_factors, right_batch, mode\n )\n return self._to_ordered_dict(tensor_gen)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BlockMapper.generate_transformed_grads","title":"generate_transformed_grads","text":"generate_transformed_grads(\n batch: BatchType,\n) -> Generator[TensorType, None, None]\n
Generator that yields transformed gradients for a given batch, processed by each block.
PARAMETER DESCRIPTION batch
The batch of data for which to generate transformed gradients.
TYPE: BatchType
YIELDS DESCRIPTION TensorType
Transformed gradients for each block.
Source code in src/pydvl/influence/types.py
def generate_transformed_grads(\n self, batch: BatchType\n) -> Generator[TensorType, None, None]:\n \"\"\"\n Generator that yields transformed gradients for a given batch,\n processed by each block.\n\n Args:\n batch: The batch of data for which to generate transformed gradients.\n\n Yields:\n Transformed gradients for each block.\n \"\"\"\n for comp_block in self.composable_block_dict.values():\n yield comp_block.transformed_grads(batch)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BlockMapper.generate_interactions","title":"generate_interactions","text":"generate_interactions(\n left_batch: BatchType, right_batch: Optional[BatchType], mode: InfluenceMode\n) -> Generator[TensorType, None, None]\n
Generator that yields gradient interactions between two batches, processed by each block based on a mode.
PARAMETER DESCRIPTION left_batch
The left batch for interaction computation.
TYPE: BatchType
right_batch
The right batch for interaction computation.
TYPE: Optional[BatchType]
mode
The mode determining the type of interactions.
TYPE: InfluenceMode
YIELDS DESCRIPTION TensorType
Gradient interactions for each block.
TYPE: TensorType
Source code in src/pydvl/influence/types.py
def generate_interactions(\n self,\n left_batch: BatchType,\n right_batch: Optional[BatchType],\n mode: InfluenceMode,\n) -> Generator[TensorType, None, None]:\n \"\"\"\n Generator that yields gradient interactions between two batches, processed by\n each block based on a mode.\n\n Args:\n left_batch: The left batch for interaction computation.\n right_batch: The right batch for interaction computation.\n mode: The mode determining the type of interactions.\n\n Yields:\n TensorType: Gradient interactions for each block.\n \"\"\"\n for comp_block in self.composable_block_dict.values():\n yield comp_block.interactions(left_batch, right_batch, mode)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BlockMapper.generate_interactions_from_transformed_grads","title":"generate_interactions_from_transformed_grads","text":"generate_interactions_from_transformed_grads(\n left_factors: Union[TensorType, OrderedDict[str, TensorType]],\n right_batch: BatchType,\n mode: InfluenceMode,\n) -> Generator[TensorType, None, None]\n
Generator that yields interactions computed from pre-computed factors and a right batch, processed by each block based on a mode.
PARAMETER DESCRIPTION left_factors
Pre-computed factors as a tensor or an ordered dictionary of tensors by block.
TYPE: Union[TensorType, OrderedDict[str, TensorType]]
right_batch
The right batch for interaction computation.
TYPE: BatchType
mode
The mode determining the type of interactions.
TYPE: InfluenceMode
YIELDS DESCRIPTION TensorType
Interactions for each block.
TYPE: TensorType
Source code in src/pydvl/influence/types.py
def generate_interactions_from_transformed_grads(\n self,\n left_factors: Union[TensorType, OrderedDict[str, TensorType]],\n right_batch: BatchType,\n mode: InfluenceMode,\n) -> Generator[TensorType, None, None]:\n \"\"\"\n Generator that yields interactions computed from pre-computed factors and a\n right batch, processed by each block based on a mode.\n\n Args:\n left_factors: Pre-computed factors as a tensor or an ordered dictionary of\n tensors by block.\n right_batch: The right batch for interaction computation.\n mode: The mode determining the type of interactions.\n\n Yields:\n TensorType: Interactions for each block.\n \"\"\"\n if not isinstance(left_factors, dict):\n left_factors_dict = self._split_to_blocks(left_factors)\n else:\n left_factors_dict = cast(OrderedDict[str, TensorType], left_factors)\n for k, comp_block in self.composable_block_dict.items():\n yield comp_block.interactions_from_transformed_grads(\n left_factors_dict[k], right_batch, mode\n )\n
"},{"location":"api/pydvl/influence/torch/","title":"Torch","text":""},{"location":"api/pydvl/influence/torch/#pydvl.influence.torch","title":"pydvl.influence.torch","text":""},{"location":"api/pydvl/influence/torch/base/","title":"Base","text":""},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base","title":"pydvl.influence.torch.base","text":""},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchBatch","title":"TorchBatch dataclass
","text":"TorchBatch(x: Tensor, y: Tensor)\n
Bases: Batch
A convenience class for handling batches of data. Validates the alignment of the first dimension (batch dimension) of the input and target tensors.
ATTRIBUTE DESCRIPTION x
The input tensor that contains features or data points.
TYPE: Tensor
y
The target tensor that contains labels corresponding to the inputs.
TYPE: Tensor
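A minimal construction sketch, assuming TorchBatch is importable from pydvl.influence.torch.base as suggested by the source path above.
import torch
from pydvl.influence.torch.base import TorchBatch  # import path assumed

x = torch.randn(8, 5)     # 8 samples with 5 features each
y = torch.randn(8, 1)     # targets aligned on the batch dimension
batch = TorchBatch(x, y)  # validation checks that x and y agree on the batch dimension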
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchGradientProvider","title":"TorchGradientProvider","text":"TorchGradientProvider(\n model: Module, loss: LossType, restrict_to: Optional[Dict[str, Parameter]]\n)\n
Bases: GradientProvider[TorchBatch, Tensor]
Compute per-sample gradients of a function defined by a torch.nn.Module and a loss function using torch.func.
Consider a function
\\[ \\ell: \\mathbb{R}^{d_1} \\times \\mathbb{R}^{d_2} \\times \\mathbb{R}^{n} \\times \\mathbb{R}^{n}, \\quad \\ell(\\omega_1, \\omega_2, x, y) = \\operatorname{loss}(f(\\omega_1, \\omega_2; x), y) \\] e.g. a two layer neural network \\(f\\) with a loss function, then this object should compute the expressions:
\\[ \\nabla_{\\omega_{i}}\\ell(\\omega_1, \\omega_2, x, y), \\nabla_{\\omega_{i}}\\nabla_{x}\\ell(\\omega_1, \\omega_2, x, y), \\nabla_{\\omega}\\ell(\\omega_1, \\omega_2, x, y) \\cdot v\\] Source code in src/pydvl/influence/torch/base.py
def __init__(\n self,\n model: torch.nn.Module,\n loss: LossType,\n restrict_to: Optional[Dict[str, torch.nn.Parameter]],\n):\n self.model = model\n self.loss = loss\n\n if restrict_to is None:\n restrict_to = ModelParameterDictBuilder(model).build_from_block_mode(\n BlockMode.FULL\n )\n\n self.params_to_restrict_to = restrict_to\n
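A construction sketch for a small model, with restrict_to=None so that gradients are taken with respect to all model parameters; the import path is an assumption.
import torch
from pydvl.influence.torch.base import TorchGradientProvider  # import path assumed

model = torch.nn.Linear(5, 1)

def loss(prediction: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    # mean squared error plays the role of \operatorname{loss} in the formula above
    return torch.nn.functional.mse_loss(prediction, target)

gradient_provider = TorchGradientProvider(model, loss, restrict_to=None)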
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchGradientProvider.grads","title":"grads","text":"grads(batch: TorchBatch) -> Dict[str, Tensor]\n
Computes and returns a dictionary mapping parameter names to their respective per-sample gradients. Given the example in the class docstring, this means
\\[ \\text{result}[\\omega_i] = \\nabla_{\\omega_{i}}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y}), \\] where the first dimension of the resulting tensors is always considered to be the batch dimension, so the shape of the resulting tensors are \\((N, d_i)\\), where \\(N\\) is the number of samples in the batch.
PARAMETER DESCRIPTION batch
The batch of data for which to compute gradients.
TYPE: TorchBatch
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary where keys are gradient identifiers and values are the gradients computed per sample.
Source code in src/pydvl/influence/torch/base.py
def grads(self, batch: TorchBatch) -> Dict[str, torch.Tensor]:\n r\"\"\"\n Computes and returns a dictionary mapping parameter names to their respective\n per-sample gradients. Given the example in the class docstring, this means\n\n $$ \\text{result}[\\omega_i] = \\nabla_{\\omega_{i}}\\ell(\\omega_1, \\omega_2,\n \\text{batch.x}, \\text{batch.y}), $$\n\n where the first dimension of the resulting tensors is always considered to be\n the batch dimension, so the shape of the resulting tensors are $(N, d_i)$,\n where $N$ is the number of samples in the batch.\n\n Args:\n batch: The batch of data for which to compute gradients.\n\n Returns:\n A dictionary where keys are gradient identifiers and values are the\n gradients computed per sample.\n \"\"\"\n gradient_dict = self._grads(batch.to(self.device))\n return self._detach_dict(gradient_dict)\n
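Illustrative sketch, continuing the construction above: per-sample gradients are returned as a dictionary keyed by parameter name, with the batch dimension first.
batch = TorchBatch(torch.randn(8, 5), torch.randn(8, 1))
per_sample_grads = gradient_provider.grads(batch)
for name, g in per_sample_grads.items():
    print(name, g.shape)  # the first dimension is the batch size, here 8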
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchGradientProvider.mixed_grads","title":"mixed_grads","text":"mixed_grads(batch: TorchBatch) -> Dict[str, Tensor]\n
Computes and returns a dictionary mapping gradient names to their respective per-sample mixed gradients. In this context, mixed gradients refer to computing gradients with respect to the parameters from the instance definition and, in addition, derivatives with respect to the input batch. Given the example in the class docstring, this means
\\[ \\text{result}[\\omega_i] = \\nabla_{\\omega_{i}}\\nabla_{x}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y}), \\] where the first dimension of the resulting tensors is always considered to be the batch dimension and the last to be the non-batch input related derivatives. So the shape of the resulting tensors are \\((N, n, d_i)\\), where \\(N\\) is the number of samples in the batch.
PARAMETER DESCRIPTION batch
The batch of data for which to compute mixed gradients.
TYPE: TorchBatch
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary where keys are gradient identifiers and values are the mixed gradients computed per sample.
Source code in src/pydvl/influence/torch/base.py
def mixed_grads(self, batch: TorchBatch) -> Dict[str, torch.Tensor]:\n r\"\"\"\n Computes and returns a dictionary mapping gradient names to their respective\n per-sample mixed gradients. In this context, mixed gradients refer to computing\n gradients with respect to the instance definition in addition to\n compute derivatives with respect to the input batch.\n Given the example in the class docstring, this means\n\n $$ \\text{result}[\\omega_i] = \\nabla_{\\omega_{i}}\\nabla_{x}\\ell(\\omega_1,\n \\omega_2, \\text{batch.x}, \\text{batch.y}), $$\n\n where the first dimension of the resulting tensors is always considered to be\n the batch dimension and the last to be the non-batch input related derivatives.\n So the shape of the resulting tensors are $(N, n, d_i)$,\n where $N$ is the number of samples in the batch.\n\n Args:\n batch: The batch of data for which to compute mixed gradients.\n\n Returns:\n A dictionary where keys are gradient identifiers and values are the\n mixed gradients computed per sample.\n \"\"\"\n gradient_dict = self._mixed_grads(batch.to(self.device))\n return self._detach_dict(gradient_dict)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchGradientProvider.jacobian_prod","title":"jacobian_prod","text":"jacobian_prod(batch: TorchBatch, g: Tensor) -> Tensor\n
Computes the matrix-Jacobian product for the provided batch and input tensor. Given the example in the class docstring, this means
\\[ (\\nabla_{\\omega_{1}}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y}), \\nabla_{\\omega_{2}}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y})) \\cdot g^T\\] where g must be a tensor of shape \\((K, d_1+d_2)\\), so the resulting tensor is of shape \\((N, K)\\).
PARAMETER DESCRIPTION batch
The batch of data for which to compute the Jacobian.
TYPE: TorchBatch
g
The tensor to be used in the matrix-Jacobian product calculation.
TYPE: Tensor
RETURNS DESCRIPTION Tensor
The resulting tensor from the matrix-Jacobian product computation.
Source code in src/pydvl/influence/torch/base.py
def jacobian_prod(\n self,\n batch: TorchBatch,\n g: torch.Tensor,\n) -> torch.Tensor:\n r\"\"\"\n Computes the matrix-Jacobian product for the provided batch and input tensor.\n Given the example in the class docstring, this means\n\n $$ (\\nabla_{\\omega_{1}}\\ell(\\omega_1, \\omega_2,\n \\text{batch.x}, \\text{batch.y}),\n \\nabla_{\\omega_{2}}\\ell(\\omega_1, \\omega_2,\n \\text{batch.x}, \\text{batch.y})) \\cdot g^T$$\n\n where g must be a tensor of shape $(K, d_1+d_2)$, so the resulting tensor\n is of shape $(N, K)$.\n\n Args:\n batch: The batch of data for which to compute the Jacobian.\n g: The tensor to be used in the matrix-Jacobian product\n calculation.\n\n Returns:\n The resulting tensor from the matrix-Jacobian product computation.\n \"\"\"\n result = self._jacobian_prod(batch.to(self.device), g.to(self.device))\n if result.requires_grad:\n result = result.detach()\n return result\n
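The following plain-torch sketch only illustrates the shapes involved (it is not pydvl code): with flattened per-sample gradients G of shape \((N, d_1+d_2)\) and g of shape \((K, d_1+d_2)\), the matrix-Jacobian product reduces to a matrix multiplication.
import torch

N, D, K = 8, 6, 3
G = torch.randn(N, D)  # stands for the flattened per-sample gradients
g = torch.randn(K, D)  # the tensor supplied to jacobian_prod
result = G @ g.T       # shape (N, K), matching the documented result shape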
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.OperatorBilinearForm","title":"OperatorBilinearForm","text":"OperatorBilinearForm(operator: 'TensorOperator')\n
Bases: BilinearForm[Tensor, TorchBatch, TorchGradientProvider]
Base class for bilinear forms based on an instance of TensorOperator. This means it computes weighted inner products of the form:
\\[ \\langle \\operatorname{Op}(x), y \\rangle \\] Source code in src/pydvl/influence/torch/base.py
def __init__(\n self,\n operator: \"TensorOperator\",\n):\n self.operator = operator\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.OperatorBilinearForm.grads_inner_prod","title":"grads_inner_prod","text":"grads_inner_prod(\n left: BatchType,\n right: Optional[BatchType],\n gradient_provider: GradientProviderType,\n) -> TensorType\n
Computes the gradient inner product of two batches of data, i.e.
\\[ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}), \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}\\] where \\(\\nabla_{\\omega}\\ell(\\omega, \\cdot, \\cdot)\\) is represented by the gradient_provider
and the expression must be understood sample-wise.
PARAMETER DESCRIPTION left
The first batch for gradient and inner product computation
TYPE: BatchType
right
The second batch for gradient and inner product computation, optional; if not provided, the inner product will use the gradient computed for left
for both arguments.
TYPE: Optional[BatchType]
gradient_provider
The gradient provider to compute the gradients.
TYPE: GradientProviderType
RETURNS DESCRIPTION TensorType
A tensor representing the inner products of the per-sample gradients
Source code in src/pydvl/influence/types.py
def grads_inner_prod(\n self,\n left: BatchType,\n right: Optional[BatchType],\n gradient_provider: GradientProviderType,\n) -> TensorType:\n r\"\"\"\n Computes the gradient inner product of two batches of data, i.e.\n\n $$ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}),\n \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}$$\n\n where $\\nabla_{\\omega}\\ell(\\omega, \\cdot, \\cdot)$ is represented by the\n `gradient_provider` and the expression must be understood sample-wise.\n\n Args:\n left: The first batch for gradient and inner product computation\n right: The second batch for gradient and inner product computation,\n optional; if not provided, the inner product will use the gradient\n computed for `left` for both arguments.\n gradient_provider: The gradient provider to compute the gradients.\n\n Returns:\n A tensor representing the inner products of the per-sample gradients\n \"\"\"\n left_grad = gradient_provider.flat_grads(left)\n if right is None:\n right_grad = left_grad\n else:\n right_grad = gradient_provider.flat_grads(right)\n return self.inner_prod(left_grad, right_grad)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.OperatorBilinearForm.mixed_grads_inner_prod","title":"mixed_grads_inner_prod","text":"mixed_grads_inner_prod(\n left: BatchType,\n right: Optional[BatchType],\n gradient_provider: GradientProviderType,\n) -> TensorType\n
Computes the mixed gradient inner product of two batches of data, i.e.
\\[ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}), \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}\\] where \\(\\nabla_{\\omega}\\ell(\\omega, \\cdot)\\) and \\(\\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\cdot)\\) are represented by the gradient_provider
. The expression must be understood sample-wise.
PARAMETER DESCRIPTION left
The first batch for gradient and inner product computation
TYPE: BatchType
right
The second batch for gradient and inner product computation
TYPE: Optional[BatchType]
gradient_provider
The gradient provider to compute the gradients.
TYPE: GradientProviderType
RETURNS DESCRIPTION TensorType
A tensor representing the inner products of the mixed per-sample gradients
Source code in src/pydvl/influence/types.py
def mixed_grads_inner_prod(\n self,\n left: BatchType,\n right: Optional[BatchType],\n gradient_provider: GradientProviderType,\n) -> TensorType:\n r\"\"\"\n Computes the mixed gradient inner product of two batches of data, i.e.\n\n $$ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}),\n \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y})\n \\rangle_{B}$$\n\n where $\\nabla_{\\omega}\\ell(\\omega, \\cdot)$ and\n $\\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\cdot)$ are represented by the\n `gradient_provider`. The expression must be understood sample-wise.\n\n Args:\n left: The first batch for gradient and inner product computation\n right: The second batch for gradient and inner product computation\n gradient_provider: The gradient provider to compute the gradients.\n\n Returns:\n A tensor representing the inner products of the mixed per-sample gradients\n \"\"\"\n left_grad = gradient_provider.flat_grads(left)\n if right is None:\n right = left\n right_mixed_grad = gradient_provider.flat_mixed_grads(right)\n return self.inner_prod(left_grad, right_mixed_grad)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.OperatorBilinearForm.inner_prod","title":"inner_prod","text":"inner_prod(left: Tensor, right: Optional[Tensor]) -> Tensor\n
Computes the weighted inner product of two vectors, i.e.
\\[ \\langle \\operatorname{Op}(\\text{left}), \\text{right} \\rangle \\] PARAMETER DESCRIPTION left
The first tensor in the inner product computation.
TYPE: Tensor
right
The second tensor, optional; if not provided, the inner product will use left
tensor for both arguments.
TYPE: Optional[Tensor]
RETURNS DESCRIPTION Tensor
A tensor representing the inner product.
Source code in src/pydvl/influence/torch/base.py
def inner_prod(\n self, left: torch.Tensor, right: Optional[torch.Tensor]\n) -> torch.Tensor:\n r\"\"\"\n Computes the weighted inner product of two vectors, i.e.\n\n $$ \\langle \\operatorname{Op}(\\text{left}), \\text{right} \\rangle $$\n\n Args:\n left: The first tensor in the inner product computation.\n right: The second tensor, optional; if not provided, the inner product will\n use `left` tensor for both arguments.\n\n Returns:\n A tensor representing the inner product.\n \"\"\"\n if right is None:\n right = left\n if left.shape[0] <= right.shape[0]:\n return self._inner_product(left, right)\n return self._inner_product(right, left).T\n
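A plain-torch sketch of the quantity computed (not pydvl code), assuming the operator can be represented by a matrix H acting on the trailing dimension; the result contains the sample-wise weighted inner products.
import torch

N, M, D = 4, 6, 5
H = torch.randn(D, D)            # stands for the operator \(\operatorname{Op}\)
left = torch.randn(N, D)
right = torch.randn(M, D)
result = (left @ H.T) @ right.T  # result[i, j] = <H @ left[i], right[j]>, shape (N, M)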
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.DictBilinearForm","title":"DictBilinearForm","text":"DictBilinearForm(operator: 'TensorDictOperator')\n
Bases: OperatorBilinearForm
Base class for bilinear forms based on an instance of TensorDictOperator. This means it computes weighted inner products of the form:
\\[ \\langle \\operatorname{Op}(x), y \\rangle \\] Source code in src/pydvl/influence/torch/base.py
def __init__(\n self,\n operator: \"TensorDictOperator\",\n):\n super().__init__(operator)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.DictBilinearForm.inner_prod","title":"inner_prod","text":"inner_prod(left: Tensor, right: Optional[Tensor]) -> Tensor\n
Computes the weighted inner product of two vectors, i.e.
\\[ \\langle \\operatorname{Op}(\\text{left}), \\text{right} \\rangle \\] PARAMETER DESCRIPTION left
The first tensor in the inner product computation.
TYPE: Tensor
right
The second tensor, optional; if not provided, the inner product will use left
tensor for both arguments.
TYPE: Optional[Tensor]
RETURNS DESCRIPTION Tensor
A tensor representing the inner product.
Source code in src/pydvl/influence/torch/base.py
def inner_prod(\n self, left: torch.Tensor, right: Optional[torch.Tensor]\n) -> torch.Tensor:\n r\"\"\"\n Computes the weighted inner product of two vectors, i.e.\n\n $$ \\langle \\operatorname{Op}(\\text{left}), \\text{right} \\rangle $$\n\n Args:\n left: The first tensor in the inner product computation.\n right: The second tensor, optional; if not provided, the inner product will\n use `left` tensor for both arguments.\n\n Returns:\n A tensor representing the inner product.\n \"\"\"\n if right is None:\n right = left\n if left.shape[0] <= right.shape[0]:\n return self._inner_product(left, right)\n return self._inner_product(right, left).T\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.DictBilinearForm.grads_inner_prod","title":"grads_inner_prod","text":"grads_inner_prod(\n left: TorchBatch,\n right: Optional[TorchBatch],\n gradient_provider: TorchGradientProvider,\n) -> Tensor\n
Computes the gradient inner product of two batches of data, i.e.
\\[ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}), \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}\\] where \\(\\nabla_{\\omega}\\ell(\\omega, \\cdot, \\cdot)\\) is represented by the gradient_provider
and the expression must be understood sample-wise.
PARAMETER DESCRIPTION left
The first batch for gradient and inner product computation
TYPE: TorchBatch
right
The second batch for gradient and inner product computation, optional; if not provided, the inner product will use the gradient computed for left
for both arguments.
TYPE: Optional[TorchBatch]
gradient_provider
The gradient provider to compute the gradients.
TYPE: TorchGradientProvider
RETURNS DESCRIPTION Tensor
A tensor representing the inner products of the per-sample gradients
Source code in src/pydvl/influence/torch/base.py
def grads_inner_prod(\n self,\n left: TorchBatch,\n right: Optional[TorchBatch],\n gradient_provider: TorchGradientProvider,\n) -> torch.Tensor:\n r\"\"\"\n Computes the gradient inner product of two batches of data, i.e.\n\n $$ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}),\n \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}$$\n\n where $\\nabla_{\\omega}\\ell(\\omega, \\cdot, \\cdot)$ is represented by the\n `gradient_provider` and the expression must be understood sample-wise.\n\n Args:\n left: The first batch for gradient and inner product computation\n right: The second batch for gradient and inner product computation,\n optional; if not provided, the inner product will use the gradient\n computed for `left` for both arguments.\n gradient_provider: The gradient provider to compute the gradients.\n\n Returns:\n A tensor representing the inner products of the per-sample gradients\n \"\"\"\n operator = cast(TensorDictOperator, self.operator)\n left_grads = gradient_provider.grads(left)\n if right is None:\n right_grads = left_grads\n else:\n right_grads = gradient_provider.grads(right)\n\n left_batch_size, right_batch_size = next(\n (\n (l.shape[0], r.shape[0])\n for r, l in zip(left_grads.values(), right_grads.values())\n )\n )\n\n if left_batch_size <= right_batch_size:\n left_grads = operator.apply_to_dict(left_grads)\n tensor_pairs = zip(left_grads.values(), right_grads.values())\n else:\n right_grads = operator.apply_to_dict(right_grads)\n tensor_pairs = zip(left_grads.values(), right_grads.values())\n\n tensors_to_reduce = (\n self._aggregate_grads(left, right) for left, right in tensor_pairs\n )\n\n return cast(torch.Tensor, sum(tensors_to_reduce))\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.DictBilinearForm.mixed_grads_inner_prod","title":"mixed_grads_inner_prod","text":"mixed_grads_inner_prod(\n left: TorchBatch,\n right: Optional[TorchBatch],\n gradient_provider: TorchGradientProvider,\n) -> Tensor\n
Computes the mixed gradient inner product of two batches of data, i.e.
\\[ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}), \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}\\] where \\(\\nabla_{\\omega}\\ell(\\omega, \\cdot)\\) and \\(\\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\cdot)\\) are represented by the gradient_provider
. The expression must be understood sample-wise.
PARAMETER DESCRIPTION left
The first batch for gradient and inner product computation
TYPE: TorchBatch
right
The second batch for gradient and inner product computation
TYPE: Optional[TorchBatch]
gradient_provider
The gradient provider to compute the gradients.
TYPE: TorchGradientProvider
RETURNS DESCRIPTION Tensor
A tensor representing the inner products of the mixed per-sample gradients
Source code in src/pydvl/influence/torch/base.py
def mixed_grads_inner_prod(\n self,\n left: TorchBatch,\n right: Optional[TorchBatch],\n gradient_provider: TorchGradientProvider,\n) -> torch.Tensor:\n r\"\"\"\n Computes the mixed gradient inner product of two batches of data, i.e.\n\n $$ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}),\n \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y})\n \\rangle_{B}$$\n\n where $\\nabla_{\\omega}\\ell(\\omega, \\cdot)$ and\n $\\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\cdot)$ are represented by the\n `gradient_provider`. The expression must be understood sample-wise.\n\n Args:\n left: The first batch for gradient and inner product computation\n right: The second batch for gradient and inner product computation\n gradient_provider: The gradient provider to compute the gradients.\n\n Returns:\n A tensor representing the inner products of the mixed per-sample gradients\n \"\"\"\n operator = cast(TensorDictOperator, self.operator)\n if right is None:\n right = left\n right_grads = gradient_provider.mixed_grads(right)\n left_grads = gradient_provider.grads(left)\n left_grads = operator.apply_to_dict(left_grads)\n left_grads_views = (t.reshape(t.shape[0], -1) for t in left_grads.values())\n right_grads_views = (\n t.reshape(*right.x.shape, -1) for t in right_grads.values()\n )\n tensor_pairs = zip(left_grads_views, right_grads_views)\n tensors_to_reduce = (\n self._aggregate_mixed_grads(left, right) for left, right in tensor_pairs\n )\n return cast(torch.Tensor, sum(tensors_to_reduce))\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TensorOperator","title":"TensorOperator","text":" Bases: Operator[Tensor, OperatorBilinearForm]
, ABC
Abstract base class for operators that can be applied to instances of torch.Tensor.
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TensorOperator.input_size","title":"input_size abstractmethod
property
","text":"input_size: int\n
Abstract property to get the needed size for inputs to the operator instance
RETURNS DESCRIPTION int
An integer representing the input size.
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TensorOperator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TensorDictOperator","title":"TensorDictOperator","text":" Bases: TensorOperator
, ABC
Abstract base class for operators that can be applied to instances of torch.Tensor and compatible dictionaries mapping strings to tensors. Input dictionaries must conform to the structure defined by the property input_dict_structure
. Useful for operators involving autograd functionality to avoid intermediate flattening and concatenating of gradient inputs.
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TensorDictOperator.input_size","title":"input_size abstractmethod
property
","text":"input_size: int\n
Abstract property to get the needed size for inputs to the operator instance
RETURNS DESCRIPTION int
An integer representing the input size.
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TensorDictOperator.input_dict_structure","title":"input_dict_structure abstractmethod
property
","text":"input_dict_structure: Dict[str, Tuple[int, ...]]\n
Implement this to expose the expected structure of the input tensor dict, i.e. a dictionary of shapes (excluding the first batch dimension), in order to validate the input tensor dicts.
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TensorDictOperator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TensorDictOperator.apply_to_dict","title":"apply_to_dict","text":"apply_to_dict(mat: Dict[str, Tensor]) -> Dict[str, Tensor]\n
Applies the operator to a dictionary of tensors, compatible to the structure defined by the property input_dict_structure
.
PARAMETER DESCRIPTION mat
dictionary of tensors, whose keys and shapes match the property input_dict_structure
.
TYPE: Dict[str, Tensor]
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary of tensors after applying the operator
Source code in src/pydvl/influence/torch/base.py
def apply_to_dict(self, mat: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:\n \"\"\"\n Applies the operator to a dictionary of tensors, compatible to the structure\n defined by the property `input_dict_structure`.\n\n Args:\n mat: dictionary of tensors, whose keys and shapes match the property\n `input_dict_structure`.\n\n Returns:\n A dictionary of tensors after applying the operator\n \"\"\"\n\n if not self._validate_mat_dict(mat):\n raise ValueError(\n f\"Incompatible input structure, expected (excluding batch\"\n f\"dimension): \\n {self.input_dict_structure}\"\n )\n\n return self._apply_to_dict(self._dict_to_device(mat))\n
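Illustrative usage sketch: this mirrors how the operator is used internally (see DictBilinearForm above), assuming op is a fitted TensorDictOperator whose input_dict_structure matches the parameters of the model behind gradient_provider from the earlier sketches.
grads = gradient_provider.grads(batch)  # dict of per-sample gradients by parameter name
transformed = op.apply_to_dict(grads)   # same keys, operator applied without flattening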
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchOperatorGradientComposition","title":"TorchOperatorGradientComposition","text":"TorchOperatorGradientComposition(\n op: TorchOperatorType, gp: TorchGradientProvider\n)\n
Bases: OperatorGradientComposition[Tensor, TorchBatch, TorchOperatorType, TorchGradientProvider]
Representing a composable block that integrates an TorchOperator and a TorchPerSampleGradientProvider
This block is designed to be flexible, handling different computational modes via an abstract operator and gradient provider.
Source code in src/pydvl/influence/torch/base.py
def __init__(self, op: TorchOperatorType, gp: TorchGradientProvider):\n super().__init__(op, gp)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchOperatorGradientComposition.interactions","title":"interactions","text":"interactions(\n left_batch: BatchType, right_batch: Optional[BatchType], mode: InfluenceMode\n)\n
Computes the interaction between the gradients on two batches of data, weighted by the operator action and depending on the specified mode, i.e.
\\[ \\langle \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y})), \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle\\] for the case InfluenceMode.Up
and
\\[ \\langle \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y})), \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle \\] for the case InfluenceMode.Perturbation
.
PARAMETER DESCRIPTION left_batch
The left data batch for gradient computation.
TYPE: BatchType
right_batch
The right data batch for gradient computation.
TYPE: Optional[BatchType]
mode
An instance of InfluenceMode determining the type of influence computation.
TYPE: InfluenceMode
RETURNS DESCRIPTION The result of the influence computation as dictated by the mode.
Source code in src/pydvl/influence/types.py
def interactions(\n self,\n left_batch: BatchType,\n right_batch: Optional[BatchType],\n mode: InfluenceMode,\n):\n r\"\"\"\n Computes the interaction between the gradients on two batches of data based on\n the specified mode weighted by the operator action,\n i.e.\n\n $$ \\langle \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{left.x},\n \\text{left.y})),\n \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle$$\n\n for the case `InfluenceMode.Up` and\n\n $$ \\langle \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{left.x},\n \\text{left.y})),\n \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle $$\n\n for the case `InfluenceMode.Perturbation`.\n\n Args:\n left_batch: The left data batch for gradient computation.\n right_batch: The right data batch for gradient computation.\n mode: An instance of InfluenceMode determining the type of influence\n computation.\n\n Returns:\n The result of the influence computation as dictated by the mode.\n \"\"\"\n bilinear_form = self.op.as_bilinear_form()\n if mode == InfluenceMode.Up:\n return bilinear_form.grads_inner_prod(left_batch, right_batch, self.gp)\n elif mode == InfluenceMode.Perturbation:\n return bilinear_form.mixed_grads_inner_prod(\n left_batch, right_batch, self.gp\n )\n else:\n raise UnsupportedInfluenceModeException(mode)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchOperatorGradientComposition.transformed_grads","title":"transformed_grads","text":"transformed_grads(batch: BatchType)\n
Computes the gradients of a data batch, transformed by the operator application, i.e. the expression
\\[ \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{batch.x}, \\text{batch.y})) \\] PARAMETER DESCRIPTION batch
The data batch for gradient computation.
TYPE: BatchType
RETURNS DESCRIPTION A tensor representing the application of the operator to the gradients.
Source code in src/pydvl/influence/types.py
def transformed_grads(self, batch: BatchType):\n r\"\"\"\n Computes the gradients of a data batch, transformed by the operator application\n , i.e. the expressions\n\n $$ \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{batch.x},\n \\text{batch.y})) $$\n\n Args:\n batch: The data batch for gradient computation.\n\n Returns:\n A tensor representing the application of the operator to the gradients.\n\n \"\"\"\n grads = self.gp.flat_grads(batch)\n return self.op.apply(grads)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchOperatorGradientComposition.interactions_from_transformed_grads","title":"interactions_from_transformed_grads","text":"interactions_from_transformed_grads(\n left_factors: TensorType, right_batch: BatchType, mode: InfluenceMode\n)\n
Computes the interaction between pre-computed transformed gradients (factors) from a left batch and the gradients of a right batch of data, based on the specified mode. This means
\\[ \\langle \\text{left_factors}, \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle\\] for the case InfluenceMode.Up
and
\\[ \\langle \\text{left_factors}, \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle \\] for the case InfluenceMode.Perturbation
.
PARAMETER DESCRIPTION left_factors
Pre-computed tensor factors from a left batch.
TYPE: TensorType
right_batch
The right data batch for influence computation.
TYPE: BatchType
mode
An instance of InfluenceMode determining the type of influence computation.
TYPE: InfluenceMode
RETURNS DESCRIPTION The result of the interaction computation using the provided factors and batch gradients.
Source code in src/pydvl/influence/types.py
def interactions_from_transformed_grads(\n self, left_factors: TensorType, right_batch: BatchType, mode: InfluenceMode\n):\n r\"\"\"\n Computes the interaction between the transformed gradients on two batches of\n data using pre-computed factors and a batch of data,\n based on the specified mode. This means\n\n $$ \\langle \\text{left_factors},\n \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle$$\n\n for the case `InfluenceMode.Up` and\n\n $$ \\langle \\text{left_factors},\n \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle $$\n\n for the case `InfluenceMode.Perturbation`.\n\n Args:\n left_factors: Pre-computed tensor factors from a left batch.\n right_batch: The right data batch for influence computation.\n mode: An instance of InfluenceMode determining the type of influence\n computation.\n\n Returns:\n The result of the interaction computation using the provided factors and\n batch gradients.\n \"\"\"\n if mode is InfluenceMode.Up:\n right_grads = self.gp.flat_grads(right_batch)\n else:\n right_grads = self.gp.flat_mixed_grads(right_batch)\n return self._tensor_inner_product(left_factors, right_grads)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchBlockMapper","title":"TorchBlockMapper","text":"TorchBlockMapper(\n composable_block_dict: OrderedDict[str, TorchOperatorGradientComposition]\n)\n
Bases: BlockMapper[Tensor, TorchBatch, TorchOperatorGradientComposition[TorchOperatorType]]
Class for mapping operations across multiple compositional blocks represented by instances of TorchOperatorGradientComposition.
This class takes a dictionary of compositional blocks and applies their methods to batches or tensors, and aggregates the results.
Source code in src/pydvl/influence/torch/base.py
def __init__(\n self, composable_block_dict: OrderedDict[str, TorchOperatorGradientComposition]\n):\n super().__init__(composable_block_dict)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchBlockMapper.transformed_grads","title":"transformed_grads","text":"transformed_grads(batch: BatchType) -> OrderedDict[str, TensorType]\n
Computes and returns the transformed gradients for a batch in dictionary with the keys defined by the block names.
PARAMETER DESCRIPTION batch
The batch of data for which to compute transformed gradients.
TYPE: BatchType
RETURNS DESCRIPTION OrderedDict[str, TensorType]
An ordered dictionary of transformed gradients by block.
Source code in src/pydvl/influence/types.py
def transformed_grads(\n self,\n batch: BatchType,\n) -> OrderedDict[str, TensorType]:\n \"\"\"\n Computes and returns the transformed gradients for a batch in dictionary\n with the keys defined by the block names.\n\n Args:\n batch: The batch of data for which to compute transformed gradients.\n\n Returns:\n An ordered dictionary of transformed gradients by block.\n \"\"\"\n tensor_gen = self.generate_transformed_grads(batch)\n return self._to_ordered_dict(tensor_gen)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchBlockMapper.interactions","title":"interactions","text":"interactions(\n left_batch: BatchType, right_batch: BatchType, mode: InfluenceMode\n) -> OrderedDict[str, TensorType]\n
Computes interactions between two batches, aggregated by block, based on a specified mode.
PARAMETER DESCRIPTION left_batch
The left batch for interaction computation.
TYPE: BatchType
right_batch
The right batch for interaction computation.
TYPE: BatchType
mode
The mode determining the type of interactions.
TYPE: InfluenceMode
RETURNS DESCRIPTION OrderedDict[str, TensorType]
An ordered dictionary of gradient interactions by block.
Source code in src/pydvl/influence/types.py
def interactions(\n self, left_batch: BatchType, right_batch: BatchType, mode: InfluenceMode\n) -> OrderedDict[str, TensorType]:\n \"\"\"\n Computes interactions between two batches, aggregated by block,\n based on a specified mode.\n\n Args:\n left_batch: The left batch for interaction computation.\n right_batch: The right batch for interaction computation.\n mode: The mode determining the type of interactions.\n\n Returns:\n An ordered dictionary of gradient interactions by block.\n \"\"\"\n tensor_gen = self.generate_interactions(left_batch, right_batch, mode)\n return self._to_ordered_dict(tensor_gen)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchBlockMapper.interactions_from_transformed_grads","title":"interactions_from_transformed_grads","text":"interactions_from_transformed_grads(\n left_factors: OrderedDict[str, TensorType],\n right_batch: BatchType,\n mode: InfluenceMode,\n) -> OrderedDict[str, TensorType]\n
Computes interactions from transformed gradients and a right batch, aggregated by block and based on a mode.
PARAMETER DESCRIPTION left_factors
Pre-computed factors as a tensor or an ordered dictionary of tensors by block. If the input is a tensor, it is split into blocks according to the ordering in the composable_block_dict
attribute.
TYPE: OrderedDict[str, TensorType]
right_batch
The right batch for interaction computation.
TYPE: BatchType
mode
The mode determining the type of interactions.
TYPE: InfluenceMode
RETURNS DESCRIPTION OrderedDict[str, TensorType]
An ordered dictionary of interactions from transformed gradients by block.
Source code in src/pydvl/influence/types.py
def interactions_from_transformed_grads(\n self,\n left_factors: OrderedDict[str, TensorType],\n right_batch: BatchType,\n mode: InfluenceMode,\n) -> OrderedDict[str, TensorType]:\n \"\"\"\n Computes interactions from transformed gradients and a right batch,\n aggregated by block and based on a mode.\n\n Args:\n left_factors: Pre-computed factors as a tensor or an ordered dictionary of\n tensors by block. If the input is a tensor, it is split into blocks\n according to the ordering in the `composable_block_dict` attribute.\n right_batch: The right batch for interaction computation.\n mode: The mode determining the type of interactions.\n\n Returns:\n An ordered dictionary of interactions from transformed gradients by block.\n \"\"\"\n tensor_gen = self.generate_interactions_from_transformed_grads(\n left_factors, right_batch, mode\n )\n return self._to_ordered_dict(tensor_gen)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchBlockMapper.generate_transformed_grads","title":"generate_transformed_grads","text":"generate_transformed_grads(\n batch: BatchType,\n) -> Generator[TensorType, None, None]\n
Generator that yields transformed gradients for a given batch, processed by each block.
PARAMETER DESCRIPTION batch
The batch of data for which to generate transformed gradients.
TYPE: BatchType
YIELDS DESCRIPTION TensorType
Transformed gradients for each block.
Source code in src/pydvl/influence/types.py
def generate_transformed_grads(\n self, batch: BatchType\n) -> Generator[TensorType, None, None]:\n \"\"\"\n Generator that yields transformed gradients for a given batch,\n processed by each block.\n\n Args:\n batch: The batch of data for which to generate transformed gradients.\n\n Yields:\n Transformed gradients for each block.\n \"\"\"\n for comp_block in self.composable_block_dict.values():\n yield comp_block.transformed_grads(batch)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchBlockMapper.generate_interactions","title":"generate_interactions","text":"generate_interactions(\n left_batch: BatchType, right_batch: Optional[BatchType], mode: InfluenceMode\n) -> Generator[TensorType, None, None]\n
Generator that yields gradient interactions between two batches, processed by each block based on a mode.
PARAMETER DESCRIPTION left_batch
The left batch for interaction computation.
TYPE: BatchType
right_batch
The right batch for interaction computation.
TYPE: Optional[BatchType]
mode
The mode determining the type of interactions.
TYPE: InfluenceMode
YIELDS DESCRIPTION TensorType
Gradient interactions for each block.
TYPE: TensorType
Source code in src/pydvl/influence/types.py
def generate_interactions(\n self,\n left_batch: BatchType,\n right_batch: Optional[BatchType],\n mode: InfluenceMode,\n) -> Generator[TensorType, None, None]:\n \"\"\"\n Generator that yields gradient interactions between two batches, processed by\n each block based on a mode.\n\n Args:\n left_batch: The left batch for interaction computation.\n right_batch: The right batch for interaction computation.\n mode: The mode determining the type of interactions.\n\n Yields:\n TensorType: Gradient interactions for each block.\n \"\"\"\n for comp_block in self.composable_block_dict.values():\n yield comp_block.interactions(left_batch, right_batch, mode)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchBlockMapper.generate_interactions_from_transformed_grads","title":"generate_interactions_from_transformed_grads","text":"generate_interactions_from_transformed_grads(\n left_factors: Union[TensorType, OrderedDict[str, TensorType]],\n right_batch: BatchType,\n mode: InfluenceMode,\n) -> Generator[TensorType, None, None]\n
Generator that yields interactions computed from pre-computed factors and a right batch, processed by each block based on a mode.
PARAMETER DESCRIPTION left_factors
Pre-computed factors as a tensor or an ordered dictionary of tensors by block.
TYPE: Union[TensorType, OrderedDict[str, TensorType]]
right_batch
The right batch for interaction computation.
TYPE: BatchType
mode
The mode determining the type of interactions.
TYPE: InfluenceMode
YIELDS DESCRIPTION TensorType
Interactions for each block.
TYPE: TensorType
Source code in src/pydvl/influence/types.py
def generate_interactions_from_transformed_grads(\n self,\n left_factors: Union[TensorType, OrderedDict[str, TensorType]],\n right_batch: BatchType,\n mode: InfluenceMode,\n) -> Generator[TensorType, None, None]:\n \"\"\"\n Generator that yields interactions computed from pre-computed factors and a\n right batch, processed by each block based on a mode.\n\n Args:\n left_factors: Pre-computed factors as a tensor or an ordered dictionary of\n tensors by block.\n right_batch: The right batch for interaction computation.\n mode: The mode determining the type of interactions.\n\n Yields:\n TensorType: Interactions for each block.\n \"\"\"\n if not isinstance(left_factors, dict):\n left_factors_dict = self._split_to_blocks(left_factors)\n else:\n left_factors_dict = cast(OrderedDict[str, TensorType], left_factors)\n for k, comp_block in self.composable_block_dict.items():\n yield comp_block.interactions_from_transformed_grads(\n left_factors_dict[k], right_batch, mode\n )\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence","title":"TorchComposableInfluence","text":"TorchComposableInfluence(\n model: Module,\n block_structure: Union[\n BlockMode, OrderedDict[str, List[str]]\n ] = BlockMode.FULL,\n regularization: Optional[Union[float, Dict[str, Optional[float]]]] = None,\n)\n
Bases: ComposableInfluence[Tensor, TorchBatch, DataLoader, TorchBlockMapper[TorchOperatorType]]
, ModelInfoMixin
, ABC
Abstract base class that allows for block-wise computation of influence quantities with the torch framework. Inherit from this base class to implement specific influence algorithms.
Source code in src/pydvl/influence/torch/base.py
def __init__(\n self,\n model: torch.nn.Module,\n block_structure: Union[BlockMode, OrderedDict[str, List[str]]] = BlockMode.FULL,\n regularization: Optional[Union[float, Dict[str, Optional[float]]]] = None,\n):\n parameter_dict_builder = ModelParameterDictBuilder(model)\n if isinstance(block_structure, BlockMode):\n self.parameter_dict = parameter_dict_builder.build_from_block_mode(\n block_structure\n )\n else:\n self.parameter_dict = parameter_dict_builder.build(block_structure)\n\n self._regularization_dict = self._build_regularization_dict(regularization)\n\n super().__init__(model)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.is_thread_safe","title":"is_thread_safe abstractmethod
property
","text":"is_thread_safe: bool\n
Whether the influence computation is thread safe
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.fit","title":"fit","text":"fit(data: DataLoaderType) -> InfluenceFunctionModel\n
Fits to the provided data by internally creating a block mapper instance from it.
PARAMETER DESCRIPTION data
iterable of tensors
TYPE: DataLoaderType
RETURNS DESCRIPTION InfluenceFunctionModel
Fitted instance
Source code in src/pydvl/influence/base_influence_function_model.py
@log_duration(log_level=logging.INFO)\ndef fit(self, data: DataLoaderType) -> InfluenceFunctionModel:\n \"\"\"\n Fitting to provided data, by internally creating a block mapper instance from\n it.\n Args:\n data: iterable of tensors\n\n Returns:\n Fitted instance\n \"\"\"\n self.block_mapper = self._create_block_mapper(data)\n return self\n
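Usage sketch: influence_model stands for an instance of a concrete TorchComposableInfluence subclass (a placeholder here, not a specific class from the library); fit returns the fitted instance itself.
import torch
from torch.utils.data import DataLoader, TensorDataset

train_data = TensorDataset(torch.randn(32, 5), torch.randn(32, 1))
train_dataloader = DataLoader(train_data, batch_size=8)
influence_model = influence_model.fit(train_dataloader)  # creates the internal block mapper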
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: TensorType, y: TensorType) -> TensorType\n
Computes the approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed, that the first dimension is the batch dimension.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/base_influence_function_model.py
def influence_factors(self, x: TensorType, y: TensorType) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return self._influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.influences","title":"influences","text":"influences(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computes the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \(H^{-1}\nabla_{\theta} \ell(y_{\text{test}}, f_{\theta}(x_{\text{test}}))\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \(\nabla_{\theta}\ell(y, f_{\theta}(x))\), resp. \(\nabla_{x}\nabla_{\theta}\ell(y, f_{\theta}(x))\), if None, use \(x=x_{\text{test}}\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
def influences(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n if x is None and y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n\n if x is not None and y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n\n return self._influences(x_test, y_test, x, y, mode)\n
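Usage sketch, continuing the fit sketch above: computing up-weighting influences of a batch of training points on a batch of test points; the variable names are placeholders and the InfluenceMode import path is an assumption.
import torch
from pydvl.influence import InfluenceMode  # import path assumed

x_test, y_test = torch.randn(4, 5), torch.randn(4, 1)
x_train, y_train = torch.randn(16, 5), torch.randn(16, 1)
values = influence_model.influences(x_test, y_test, x_train, y_train, mode=InfluenceMode.Up)
# one scalar product per (test sample, training sample) pair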
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors(\n self,\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n tensors = self.block_mapper.generate_interactions_from_transformed_grads(\n z_test_factors,\n self._create_batch(x, y),\n mode,\n )\n result: TensorType = next(tensors)\n for tensor in tensors:\n result = result + tensor\n return result\n
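Usage sketch, continuing the placeholders above: the factors can be pre-computed once via influence_factors and then reused against several training batches.
factors = influence_model.influence_factors(x_test, y_test)
values = influence_model.influences_from_factors(factors, x_train, y_train, mode=InfluenceMode.Up)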
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.influences_by_block","title":"influences_by_block","text":"influences_by_block(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise influence values for the provided data, i.e. an approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of the approximation of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products for the provided batch per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_by_block(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise influence values for the provided data, i.e. an\n approximation of\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of the approximation of\n $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block.\n\n \"\"\"\n left_batch = self._create_batch(x_test, y_test)\n\n if x is None:\n if y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n right_batch = left_batch\n else:\n if y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n right_batch = self._create_batch(x, y)\n\n return self.block_mapper.interactions(left_batch, right_batch, mode)\n
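A short sketch, assuming `if_model` was constructed with a block structure (e.g. one block per layer) and fitted; all names are placeholders:

block_values = if_model.influences_by_block(x_test, y_test, x_train, y_train)
for block_name, tensor in block_values.items():
    # One influence tensor per parameter block.
    print(block_name, tuple(tensor.shape))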
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.influence_factors_by_block","title":"influence_factors_by_block","text":"influence_factors_by_block(\n x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise approximate inverse Hessian matrix vector products per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influence_factors_by_block(\n self, x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Ordered dictionary of tensors representing the element-wise\n approximate inverse Hessian matrix vector products per block.\n\n \"\"\"\n return self.block_mapper.transformed_grads(self._create_batch(x, y))\n
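A brief sketch, continuing with the fitted block-structured `if_model` from above:

# One tensor of inverse-Hessian-vector products per parameter block.
z_test_blocks = if_model.influence_factors_by_block(x_test, y_test)
for name, factors in z_test_blocks.items():
    print(name, tuple(factors.shape))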
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.influences_from_factors_by_block","title":"influences_from_factors_by_block","text":"influences_from_factors_by_block(\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Block-wise computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: OrderedDict[str, TensorType]
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products for the provided batch per block
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors_by_block(\n self,\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Block-wise computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block\n\n \"\"\"\n return self.block_mapper.interactions_from_transformed_grads(\n z_test_factors, self._create_batch(x, y), mode\n )\n
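Continuing the previous sketch, the pre-computed block-wise factors can then be combined with the gradients of a training batch:

block_values = if_model.influences_from_factors_by_block(
    z_test_blocks, x_train, y_train, mode=InfluenceMode.Up
)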
"},{"location":"api/pydvl/influence/torch/batch_operation/","title":"Batch operation","text":""},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation","title":"pydvl.influence.torch.batch_operation","text":"This module contains abstractions and implementations for operations carried out on a batch \\(b\\). These operations are of the form
$$ m(b) \\cdot v, $$
where \\(m(b)\\) is a matrix defined by the data in the batch and \\(v\\) is a vector or matrix. These batch operations can be used to conveniently build aggregations or recursions over a sequence of batches, e.g. an average of the form
$$ \\frac{1}{|B|} \\sum_{b \\in B} m(b) \\cdot v, $$
which is useful when keeping \\(B\\) in memory is not feasible.
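To make the pattern concrete, here is a plain-torch sketch of the kind of aggregation these abstractions implement; `apply_batch` is a placeholder for any operation computing \\(m(b) \\cdot v\\) and is not part of the library API.

import torch

def averaged_product(apply_batch, batches, v: torch.Tensor) -> torch.Tensor:
    # Accumulate m(b) @ v batch by batch instead of materializing all of B.
    result = torch.zeros_like(v)
    n_batches = 0
    for batch in batches:
        result += apply_batch(batch, v)
        n_batches += 1
    return result / n_batches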
"},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation.HessianBatchOperation","title":"HessianBatchOperation","text":"HessianBatchOperation(\n model: Module,\n loss: LossType,\n restrict_to: Optional[Dict[str, Parameter]] = None,\n)\n
Bases: _ModelBasedBatchOperation
Given a model and loss function, computes the Hessian vector or matrix product with respect to the model parameters, i.e.
\\[\\begin{align*} &\\nabla^2_{\\theta} L(b;\\theta) \\cdot v \\\\\\ &L(b;\\theta) = \\left( \\frac{1}{|b|} \\sum_{(x,y) \\in b} \\text{loss}(\\text{model}(x; \\theta), y)\\right), \\end{align*}\\] where model is a torch.nn.Module and \\(v\\) is a vector or matrix.
PARAMETER DESCRIPTION model
The model.
TYPE: Module
loss
The loss function.
TYPE: LossType
restrict_to
The parameters to restrict the second order differentiation to, i.e. the corresponding sub-matrix of the Hessian. If None, the full Hessian is used. Make sure the input matches the correct dimension, i.e. the last dimension must be equal to the property input_size.
TYPE: Optional[Dict[str, Parameter]]
DEFAULT: None
Source code in src/pydvl/influence/torch/batch_operation.py
def __init__(\n self,\n model: torch.nn.Module,\n loss: LossType,\n restrict_to: Optional[Dict[str, torch.nn.Parameter]] = None,\n):\n super().__init__(model, restrict_to=restrict_to)\n self._batch_hvp = create_batch_hvp_function(model, loss, reverse_only=True)\n self.loss = loss\n
"},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation.HessianBatchOperation.apply","title":"apply","text":"apply(batch: TorchBatch, tensor: Tensor)\n
Applies the batch operation to a tensor. Args: batch: Batch of data for computation tensor: A tensor consistent with the operation, i.e. it must be at most 2-dimensional, and its trailing dimension must be equal to the property input_size.
RETURNS DESCRIPTION A tensor after applying the batch operation
Source code in src/pydvl/influence/torch/batch_operation.py
def apply(self, batch: TorchBatch, tensor: torch.Tensor):\n \"\"\"\n Applies the batch operation to a tensor.\n Args:\n batch: Batch of data for computation\n tensor: A tensor consistent to the operation, i.e. it must be\n at most 2-dim, and it's tailing dimension must\n be equal to the property `input_size`.\n\n Returns:\n A tensor after applying the batch operation\n \"\"\"\n\n if not tensor.ndim <= 2:\n raise ValueError(\n f\"The input tensor must be at most 2-dimensional, got {tensor.ndim}\"\n )\n\n if tensor.shape[-1] != self.input_size:\n raise ValueError(\n \"The last dimension of the input tensor must be equal to the \"\n \"property `input_size`.\"\n )\n\n if tensor.ndim == 2:\n return self._apply_to_mat(batch.to(self.device), tensor.to(self.device))\n return self._apply_to_vec(batch.to(self.device), tensor.to(self.device))\n
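A hedged usage sketch with a toy model; it assumes that a TorchBatch wrapping a pair of input/target tensors can be imported from pydvl.influence.torch.base and constructed positionally (an assumption, since the batch type is not documented on this page):

import torch
from pydvl.influence.torch.base import TorchBatch  # assumed location
from pydvl.influence.torch.batch_operation import HessianBatchOperation

model = torch.nn.Linear(5, 1)
loss = torch.nn.functional.mse_loss

op = HessianBatchOperation(model, loss)
batch = TorchBatch(torch.randn(8, 5), torch.randn(8, 1))  # assumed (x, y) constructor
vec = torch.randn(op.input_size)  # trailing dimension must equal input_size
hvp = op.apply(batch, vec)  # Hessian of the batch loss times vec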
"},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation.GaussNewtonBatchOperation","title":"GaussNewtonBatchOperation","text":"GaussNewtonBatchOperation(\n model: Module,\n loss: LossType,\n restrict_to: Optional[Dict[str, Parameter]] = None,\n)\n
Bases: _ModelBasedBatchOperation
Given a model and loss function, computes the Gauss-Newton vector or matrix product with respect to the model parameters, i.e.
\\[\\begin{align*} G(\\text{model}, \\text{loss}, b, \\theta) &\\cdot v, \\\\\\ G(\\text{model}, \\text{loss}, b, \\theta) &= \\frac{1}{|b|}\\sum_{(x, y) \\in b}\\nabla_{\\theta}\\ell (x,y; \\theta) \\nabla_{\\theta}\\ell (x,y; \\theta)^t, \\\\\\ \\ell(x,y; \\theta) &= \\text{loss}(\\text{model}(x; \\theta), y) \\end{align*}\\] where model is a torch.nn.Module and \\(v\\) is a vector or matrix.
PARAMETER DESCRIPTION model
The model.
TYPE: Module
loss
The loss function.
TYPE: LossType
restrict_to
The parameters to restrict the differentiation to, i.e. the corresponding sub-matrix of the Jacobian. If None, the full Jacobian is used. Make sure the input matches the correct dimension, i.e. the last dimension must be equal to the property input_size.
TYPE: Optional[Dict[str, Parameter]]
DEFAULT: None
Source code in src/pydvl/influence/torch/batch_operation.py
def __init__(\n self,\n model: torch.nn.Module,\n loss: LossType,\n restrict_to: Optional[Dict[str, torch.nn.Parameter]] = None,\n):\n super().__init__(model, restrict_to=restrict_to)\n self.gradient_provider = TorchGradientProvider(\n model, loss, self.params_to_restrict_to\n )\n
"},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation.GaussNewtonBatchOperation.apply","title":"apply","text":"apply(batch: TorchBatch, tensor: Tensor)\n
Applies the batch operation to a tensor. Args: batch: Batch of data for computation tensor: A tensor consistent with the operation, i.e. it must be at most 2-dimensional, and its trailing dimension must be equal to the property input_size.
RETURNS DESCRIPTION A tensor after applying the batch operation
Source code in src/pydvl/influence/torch/batch_operation.py
def apply(self, batch: TorchBatch, tensor: torch.Tensor):\n \"\"\"\n Applies the batch operation to a tensor.\n Args:\n batch: Batch of data for computation\n tensor: A tensor consistent to the operation, i.e. it must be\n at most 2-dim, and it's tailing dimension must\n be equal to the property `input_size`.\n\n Returns:\n A tensor after applying the batch operation\n \"\"\"\n\n if not tensor.ndim <= 2:\n raise ValueError(\n f\"The input tensor must be at most 2-dimensional, got {tensor.ndim}\"\n )\n\n if tensor.shape[-1] != self.input_size:\n raise ValueError(\n \"The last dimension of the input tensor must be equal to the \"\n \"property `input_size`.\"\n )\n\n if tensor.ndim == 2:\n return self._apply_to_mat(batch.to(self.device), tensor.to(self.device))\n return self._apply_to_vec(batch.to(self.device), tensor.to(self.device))\n
"},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation.InverseHarmonicMeanBatchOperation","title":"InverseHarmonicMeanBatchOperation","text":"InverseHarmonicMeanBatchOperation(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n regularization: float,\n restrict_to: Optional[Dict[str, Parameter]] = None,\n)\n
Bases: _ModelBasedBatchOperation
Given a model and loss function, computes an approximation of the inverse Gauss-Newton vector or matrix product. Viewing the damped Gauss-Newton matrix
\\[\\begin{align*} G_{\\lambda}(\\text{model}, \\text{loss}, b, \\theta) &= \\frac{1}{|b|}\\sum_{(x, y) \\in b}\\nabla_{\\theta}\\ell (x,y; \\theta) \\nabla_{\\theta}\\ell (x,y; \\theta)^t + \\lambda \\operatorname{I}, \\\\\\ \\ell(x,y; \\theta) &= \\text{loss}(\\text{model}(x; \\theta), y) \\end{align*}\\] as an arithmetic mean of the rank-\\(1\\) updates, this operation replaces it with the harmonic mean of the rank-\\(1\\) updates, i.e.
\\[ \\tilde{G}_{\\lambda}(\\text{model}, \\text{loss}, b, \\theta) = \\left(n \\sum_{(x, y) \\in b} \\left( \\nabla_{\\theta}\\ell (x,y; \\theta) \\nabla_{\\theta}\\ell (x,y; \\theta)^t + \\lambda \\operatorname{I}\\right)^{-1} \\right)^{-1}\\] and computes
\\[ \\tilde{G}_{\\lambda}^{-1}(\\text{model}, \\text{loss}, b, \\theta) \\cdot v.\\] where model is a torch.nn.Module and \\(v\\) is a vector or matrix. In other words, it switches the order of summation and inversion, which resolves to the inverse harmonic mean
of the rank-\\(1\\) updates.
The inverses of the rank-\\(1\\) updates are not calculated explicitly, but instead a vectorized version of the Sherman–Morrison formula is applied.
For more information, see Inverse Harmonic Mean.
PARAMETER DESCRIPTION model
The model.
TYPE: Module
loss
The loss function.
TYPE: Callable[[Tensor, Tensor], Tensor]
regularization
The positive regularization parameter \\(\\lambda\\), added to each rank-\\(1\\) update before inversion.
TYPE: float
restrict_to
The parameters to restrict the differentiation to, i.e. the corresponding sub-matrix of the Jacobian. If None, the full Jacobian is used. Make sure the input matches the correct dimension, i.e. the last dimension must be equal to the property input_size.
TYPE: Optional[Dict[str, Parameter]]
DEFAULT: None
Source code in src/pydvl/influence/torch/batch_operation.py
def __init__(\n self,\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n regularization: float,\n restrict_to: Optional[Dict[str, torch.nn.Parameter]] = None,\n):\n if regularization <= 0:\n raise ValueError(\"regularization must be positive\")\n self.regularization = regularization\n\n super().__init__(model, restrict_to=restrict_to)\n self.gradient_provider = TorchGradientProvider(\n model, loss, self.params_to_restrict_to\n )\n
"},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation.InverseHarmonicMeanBatchOperation.apply","title":"apply","text":"apply(batch: TorchBatch, tensor: Tensor)\n
Applies the batch operation to a tensor. Args: batch: Batch of data for computation tensor: A tensor consistent with the operation, i.e. it must be at most 2-dimensional, and its trailing dimension must be equal to the property input_size.
RETURNS DESCRIPTION A tensor after applying the batch operation
Source code in src/pydvl/influence/torch/batch_operation.py
def apply(self, batch: TorchBatch, tensor: torch.Tensor):\n \"\"\"\n Applies the batch operation to a tensor.\n Args:\n batch: Batch of data for computation\n tensor: A tensor consistent to the operation, i.e. it must be\n at most 2-dim, and it's tailing dimension must\n be equal to the property `input_size`.\n\n Returns:\n A tensor after applying the batch operation\n \"\"\"\n\n if not tensor.ndim <= 2:\n raise ValueError(\n f\"The input tensor must be at most 2-dimensional, got {tensor.ndim}\"\n )\n\n if tensor.shape[-1] != self.input_size:\n raise ValueError(\n \"The last dimension of the input tensor must be equal to the \"\n \"property `input_size`.\"\n )\n\n if tensor.ndim == 2:\n return self._apply_to_mat(batch.to(self.device), tensor.to(self.device))\n return self._apply_to_vec(batch.to(self.device), tensor.to(self.device))\n
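For comparison with the Hessian sketch above, constructing this operation only differs in the additional positive regularization argument (placeholder values, same assumed TorchBatch type):

from pydvl.influence.torch.batch_operation import InverseHarmonicMeanBatchOperation

op = InverseHarmonicMeanBatchOperation(model, loss, regularization=0.1)
# op.apply(batch, vec) then approximates the inverse Gauss-Newton product with vec.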
"},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation.ChunkAveraging","title":"ChunkAveraging","text":" Bases: _TensorAveraging[_TensorDictChunkAveraging]
Averages tensors, provided by a generator, and normalizes by the number of tensors.
"},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation.PointAveraging","title":"PointAveraging","text":"PointAveraging(batch_dim: int = 0)\n
Bases: _TensorAveraging[_TensorDictPointAveraging]
Averages tensors provided by a generator. The averaging is weighted by the number of points in each tensor and the final result is normalized by the number of total points.
PARAMETER DESCRIPTION batch_dim
Dimension to extract the number of points for the weighting.
TYPE: int
DEFAULT: 0
Source code in src/pydvl/influence/torch/batch_operation.py
def __init__(self, batch_dim: int = 0):\n self.batch_dim = batch_dim\n
"},{"location":"api/pydvl/influence/torch/functional/","title":"Functional","text":""},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional","title":"pydvl.influence.torch.functional","text":"This module provides methods for efficiently computing tensors related to first and second order derivatives of torch models, using functionality from torch.func. To indicate higher-order functions, i.e. functions which return functions, we use the naming convention create_**_function
.
In particular, the module contains functionality for
- Sample, batch-wise and empirical loss functions:
- create_per_sample_loss_function
- create_batch_loss_function
- create_empirical_loss_function
- Per sample gradient and jacobian product functions:
- create_per_sample_gradient_function
- create_per_sample_mixed_derivative_function
- create_matrix_jacobian_product_function
- Hessian, low rank approximation of Hessian and Hessian vector products:
- hvp
- create_hvp_function
- create_batch_hvp_function
- hessian
- model_hessian_low_rank
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.LowRankProductRepresentation","title":"LowRankProductRepresentation dataclass
","text":"LowRankProductRepresentation(eigen_vals: Tensor, projections: Tensor)\n
Representation of a low rank product of the form \\(H = V D V^T\\), where D is a diagonal matrix and V is orthogonal.
PARAMETER DESCRIPTION eigen_vals
Diagonal of D.
TYPE: Tensor
projections
The matrix V.
TYPE: Tensor
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.LowRankProductRepresentation.to","title":"to","text":"to(device: device)\n
Move the representing tensors to a device
Source code in src/pydvl/influence/torch/functional.py
def to(self, device: torch.device):\n \"\"\"\n Move the representing tensors to a device\n \"\"\"\n return LowRankProductRepresentation(\n self.eigen_vals.to(device), self.projections.to(device)\n )\n
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.hvp","title":"hvp","text":"hvp(\n func: Callable[[Dict[str, Tensor]], Tensor],\n params: Dict[str, Tensor],\n vec: Dict[str, Tensor],\n reverse_only: bool = True,\n) -> Dict[str, Tensor]\n
Computes the Hessian-vector product (HVP) for a given function at the given parameters, i.e.
\\[\\nabla_{\\theta} \\nabla_{\\theta} f (\\theta)\\cdot v\\] This function can operate in two modes, either reverse-mode autodiff only or both forward- and reverse-mode autodiff.
PARAMETER DESCRIPTION func
The scalar-valued function for which the HVP is computed.
TYPE: Callable[[Dict[str, Tensor]], Tensor]
params
The parameters at which the HVP is computed.
TYPE: Dict[str, Tensor]
vec
The vector with which the Hessian is multiplied.
TYPE: Dict[str, Tensor]
reverse_only
Whether to use only reverse-mode autodiff (True, default) or both forward- and reverse-mode autodiff (False).
TYPE: bool
DEFAULT: True
RETURNS DESCRIPTION Dict[str, Tensor]
The HVP of the function at the given parameters with the given vector.
Example >>> def f(z): return torch.sum(z**2)\n>>> u = torch.ones(10, requires_grad=True)\n>>> v = torch.ones(10)\n>>> hvp_vec = hvp(f, u, v)\n>>> assert torch.allclose(hvp_vec, torch.full((10, ), 2.0))\n
Source code in src/pydvl/influence/torch/functional.py
def hvp(\n func: Callable[[Dict[str, torch.Tensor]], torch.Tensor],\n params: Dict[str, torch.Tensor],\n vec: Dict[str, torch.Tensor],\n reverse_only: bool = True,\n) -> Dict[str, torch.Tensor]:\n r\"\"\"\n Computes the Hessian-vector product (HVP) for a given function at the given\n parameters, i.e.\n\n \\[\\nabla_{\\theta} \\nabla_{\\theta} f (\\theta)\\cdot v\\]\n\n This function can operate in two modes, either reverse-mode autodiff only or both\n forward- and reverse-mode autodiff.\n\n Args:\n func: The scalar-valued function for which the HVP is computed.\n params: The parameters at which the HVP is computed.\n vec: The vector with which the Hessian is multiplied.\n reverse_only: Whether to use only reverse-mode autodiff\n (True, default) or both forward- and reverse-mode autodiff (False).\n\n Returns:\n The HVP of the function at the given parameters with the given vector.\n\n ??? Example\n\n ```pycon\n >>> def f(z): return torch.sum(z**2)\n >>> u = torch.ones(10, requires_grad=True)\n >>> v = torch.ones(10)\n >>> hvp_vec = hvp(f, u, v)\n >>> assert torch.allclose(hvp_vec, torch.full((10, ), 2.0))\n ```\n \"\"\"\n\n output: Dict[str, torch.Tensor]\n\n if reverse_only:\n _, vjp_fn = vjp(grad(func), params)\n output = vjp_fn(vec)[0]\n else:\n output = jvp(grad(func), (params,), (vec,))[1]\n\n return output\n
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.create_batch_hvp_function","title":"create_batch_hvp_function","text":"create_batch_hvp_function(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n reverse_only: bool = True,\n) -> Callable[[Dict[str, Tensor], Tensor, Tensor, Tensor], Tensor]\n
Creates a function to compute Hessian-vector product (HVP) for a given model and loss function, where the Hessian information is computed for a provided batch.
This function takes a PyTorch model, a loss function, and an optional boolean parameter. It returns a callable that computes the Hessian-vector product for batches of input data and a given vector. The computation can be performed in reverse mode only, based on the reverse_only
parameter.
PARAMETER DESCRIPTION model
The PyTorch model for which the Hessian-vector product is to be computed.
TYPE: Module
loss
The loss function. It should take two torch.Tensor objects as input and return a torch.Tensor.
TYPE: Callable[[Tensor, Tensor], Tensor]
reverse_only
If True, the Hessian-vector product is computed in reverse mode only.
TYPE: bool
DEFAULT: True
RETURNS DESCRIPTION Callable[[Dict[str, Tensor], Tensor, Tensor, Tensor], Tensor]
A function that takes three torch.Tensor objects - input data (x), target data (y), and a vector (vec) - and returns the Hessian-vector product of the loss evaluated on x, y times vec.
Example # Assume `model` is a PyTorch model and `loss_fn` is a loss function.\nb_hvp_function = batch_hvp(model, loss_fn)\n\n# `x_batch`, `y_batch` are batches of input and target data,\n# and `vec` is a vector.\nhvp_result = b_hvp_function(x_batch, y_batch, vec)\n
Source code in src/pydvl/influence/torch/functional.py
def create_batch_hvp_function(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n reverse_only: bool = True,\n) -> Callable[\n [Dict[str, torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor\n]:\n r\"\"\"\n Creates a function to compute Hessian-vector product (HVP) for a given model and\n loss function, where the Hessian information is computed for a provided batch.\n\n This function takes a PyTorch model, a loss function,\n and an optional boolean parameter. It returns a callable\n that computes the Hessian-vector product for batches of input data\n and a given vector. The computation can be performed in reverse mode only,\n based on the `reverse_only` parameter.\n\n Args:\n model: The PyTorch model for which the Hessian-vector product is to be computed.\n loss: The loss function. It should take two\n torch.Tensor objects as input and return a torch.Tensor.\n reverse_only (bool, optional): If True, the Hessian-vector product is computed\n in reverse mode only.\n\n Returns:\n A function that takes three `torch.Tensor` objects - input data (`x`),\n target data (`y`), and a vector (`vec`),\n and returns the Hessian-vector product of the loss\n evaluated on `x`, `y` times `vec`.\n\n ??? Example\n ```python\n # Assume `model` is a PyTorch model and `loss_fn` is a loss function.\n b_hvp_function = batch_hvp(model, loss_fn)\n\n # `x_batch`, `y_batch` are batches of input and target data,\n # and `vec` is a vector.\n hvp_result = b_hvp_function(x_batch, y_batch, vec)\n ```\n \"\"\"\n\n def b_hvp(\n params: Dict[str, torch.Tensor],\n x: torch.Tensor,\n y: torch.Tensor,\n vec: torch.Tensor,\n ):\n return flatten_dimensions(\n hvp(\n lambda p: create_batch_loss_function(model, loss)(p, x, y),\n params,\n align_structure(params, vec),\n reverse_only=reverse_only,\n ).values()\n )\n\n return b_hvp\n
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.create_empirical_loss_function","title":"create_empirical_loss_function","text":"create_empirical_loss_function(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n data_loader: DataLoader,\n) -> Callable[[Dict[str, Tensor]], Tensor]\n
Creates a function to compute the empirical loss of a given model on a given dataset. If we denote the model parameters with \\( \\theta \\), the resulting function approximates:
\\[ f(\\theta) = \\frac{1}{N}\\sum_{i=1}^N \\operatorname{loss}(y_i, \\operatorname{model}(\\theta, x_i)) \\] for a loss function \\(\\operatorname{loss}\\) and a model \\(\\operatorname{model}\\) with model parameters \\(\\theta\\), where \\(N\\) is the number of all elements provided by the data_loader.
PARAMETER DESCRIPTION model
The model for which the loss should be computed.
TYPE: Module
loss
The loss function to be used.
TYPE: Callable[[Tensor, Tensor], Tensor]
data_loader
The data loader for iterating over the dataset.
TYPE: DataLoader
RETURNS DESCRIPTION Callable[[Dict[str, Tensor]], Tensor]
A function that computes the empirical loss of the model on the dataset for given model parameters.
Source code in src/pydvl/influence/torch/functional.py
def create_empirical_loss_function(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n data_loader: DataLoader,\n) -> Callable[[Dict[str, torch.Tensor]], torch.Tensor]:\n r\"\"\"\n Creates a function to compute the empirical loss of a given model\n on a given dataset. If we denote the model parameters with \\( \\theta \\),\n the resulting function approximates:\n\n \\[\n f(\\theta) = \\frac{1}{N}\\sum_{i=1}^N\n \\operatorname{loss}(y_i, \\operatorname{model}(\\theta, x_i))\n \\]\n\n for a loss function $\\operatorname{loss}$ and a model $\\operatorname{model}$\n with model parameters $\\theta$, where $N$ is the number of all elements provided\n by the data_loader.\n\n Args:\n model: The model for which the loss should be computed.\n loss: The loss function to be used.\n data_loader: The data loader for iterating over the dataset.\n\n Returns:\n A function that computes the empirical loss of the model on the dataset for\n given model parameters.\n\n \"\"\"\n\n def empirical_loss(params: Dict[str, torch.Tensor]):\n total_loss = to_model_device(torch.zeros((), requires_grad=True), model)\n total_samples = to_model_device(torch.zeros(()), model)\n\n for x, y in iter(data_loader):\n output = functional_call(\n model,\n params,\n (to_model_device(x, model),),\n )\n loss_value = loss(output, to_model_device(y, model))\n total_loss = total_loss + loss_value * x.size(0)\n total_samples += x.size(0)\n\n return total_loss / total_samples\n\n return empirical_loss\n
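A small usage sketch with a toy model and data loader (all concrete values are illustrative):

import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.functional import create_empirical_loss_function

model = torch.nn.Linear(3, 1)
loss = torch.nn.functional.mse_loss
loader = DataLoader(TensorDataset(torch.randn(32, 3), torch.randn(32, 1)), batch_size=8)

empirical_loss = create_empirical_loss_function(model, loss, loader)
params = {k: p.detach() for k, p in model.named_parameters()}
value = empirical_loss(params)  # scalar tensor: average loss over the whole dataset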
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.create_batch_loss_function","title":"create_batch_loss_function","text":"create_batch_loss_function(\n model: Module, loss: Callable[[Tensor, Tensor], Tensor]\n) -> Callable[[Dict[str, Tensor], Tensor, Tensor], Tensor]\n
Creates a function to compute the loss of a given model on a given batch of data, i.e. the function
\\[f(\\theta, x, y) = \\frac{1}{N} \\sum_{i=1}^N \\operatorname{loss}(\\operatorname{model}(\\theta, x_i), y_i)\\] for a loss function \\(\\operatorname{loss}\\) and a model \\(\\operatorname{model}\\) with model parameters \\(\\theta\\), where \\(N\\) is the number of elements in the batch. Args: model: The model for which the loss should be computed. loss: The loss function to be used, which should be able to handle a batch dimension
RETURNS DESCRIPTION Callable[[Dict[str, Tensor], Tensor, Tensor], Tensor]
A function that computes the loss of the model on a batch for given model parameters. The model parameter input to the function must take the form of a dict conforming to model.named_parameters(), i.e. the keys must be a subset of the parameters and the corresponding tensor shapes must align. For the data input, the first dimension has to be the batch dimension.
Source code in src/pydvl/influence/torch/functional.py
def create_batch_loss_function(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n) -> Callable[[Dict[str, torch.Tensor], torch.Tensor, torch.Tensor], torch.Tensor]:\n r\"\"\"\n Creates a function to compute the loss of a given model on a given batch of data,\n i.e. the function\n\n \\[f(\\theta, x, y) = \\frac{1}{N} \\sum_{i=1}^N\n \\operatorname{loss}(\\operatorname{model}(\\theta, x_i), y_i)\\]\n\n for a loss function $\\operatorname{loss}$ and a model $\\operatorname{model}$\n with model parameters $\\theta$, where $N$ is the number of elements in the batch.\n Args:\n model: The model for which the loss should be computed.\n loss: The loss function to be used, which should be able to handle\n a batch dimension\n\n Returns:\n A function that computes the loss of the model on a batch for given\n model parameters. The model parameter input to the function must take\n the form of a dict conform to model.named_parameters(), i.e. the keys\n must be a subset of the parameters and the corresponding tensor shapes\n must align. For the data input, the first dimension has to be the batch\n dimension.\n \"\"\"\n\n def batch_loss(params: Dict[str, torch.Tensor], x: torch.Tensor, y: torch.Tensor):\n outputs = functional_call(model, params, (to_model_device(x, model),))\n return loss(outputs, y)\n\n return batch_loss\n
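A brief sketch showing how the batch loss function composes with torch.func, e.g. to obtain the gradient of the batch loss with respect to the parameters (model and data are placeholders):

import torch
from pydvl.influence.torch.functional import create_batch_loss_function

model = torch.nn.Linear(3, 1)
loss = torch.nn.functional.mse_loss
batch_loss = create_batch_loss_function(model, loss)

params = {k: p.detach() for k, p in model.named_parameters()}
x, y = torch.randn(8, 3), torch.randn(8, 1)
grads = torch.func.grad(batch_loss)(params, x, y)  # dict of per-parameter gradients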
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.create_hvp_function","title":"create_hvp_function","text":"create_hvp_function(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n data_loader: DataLoader,\n precompute_grad: bool = True,\n use_average: bool = True,\n reverse_only: bool = True,\n track_gradients: bool = False,\n) -> Callable[[Tensor], Tensor]\n
Returns a function that calculates the approximate Hessian-vector product for a given vector. If you want to compute the exact Hessian-vector product, i.e. pull all the data into memory and perform the full computation, use the function hvp.
PARAMETER DESCRIPTION model
A PyTorch module representing the model whose loss function's Hessian is to be computed.
TYPE: Module
loss
A callable that takes the model's output and target as input and returns the scalar loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
data_loader
A DataLoader instance that provides batches of data for calculating the Hessian-vector product. Each batch from the DataLoader is assumed to return a tuple where the first element is the model's input and the second element is the target output.
TYPE: DataLoader
precompute_grad
If True, the full data gradient is precomputed and kept in memory, which can speed up the Hessian-vector product computation. Set this to False if you can't afford to keep the full computation graph in memory.
TYPE: bool
DEFAULT: True
use_average
If True, the returned function uses batch-wise computation via a batch loss function and averages the results. If False, the function uses backpropagation on the full empirical loss function, which is more accurate than averaging the batch Hessians, but typically has much higher memory usage.
TYPE: bool
DEFAULT: True
reverse_only
Whether to use only reverse-mode autodiff or both forward- and reverse-mode autodiff. Ignored if precompute_grad
is True
.
TYPE: bool
DEFAULT: True
track_gradients
Whether to track gradients for the resulting tensor of the Hessian-vector products.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION Callable[[Tensor], Tensor]
A function that takes a single argument, a vector, and returns the product of the Hessian of the loss function with respect to the model's parameters and the input vector.
Source code in src/pydvl/influence/torch/functional.py
def create_hvp_function(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n data_loader: DataLoader,\n precompute_grad: bool = True,\n use_average: bool = True,\n reverse_only: bool = True,\n track_gradients: bool = False,\n) -> Callable[[torch.Tensor], torch.Tensor]:\n \"\"\"\n Returns a function that calculates the approximate Hessian-vector product\n for a given vector. If you want to compute the exact hessian,\n i.e., pulling all data into memory and compute a full gradient computation, use\n the function [hvp][pydvl.influence.torch.functional.hvp].\n\n Args:\n model: A PyTorch module representing the model whose loss function's\n Hessian is to be computed.\n loss: A callable that takes the model's output and target as input and\n returns the scalar loss.\n data_loader: A DataLoader instance that provides batches of data for\n calculating the Hessian-vector product. Each batch from the\n DataLoader is assumed to return a tuple where the first element is\n the model's input and the second element is the target output.\n precompute_grad: If `True`, the full data gradient is precomputed and\n kept in memory, which can speed up the hessian vector product\n computation. Set this to `False`, if you can't afford to keep the\n full computation graph in memory.\n use_average: If `True`, the returned function uses batch-wise\n computation via\n [a batch loss function][pydvl.influence.torch.functional.create_batch_loss_function]\n and averages the results.\n If `False`, the function uses backpropagation on the full\n [empirical loss function]\n [pydvl.influence.torch.functional.create_empirical_loss_function],\n which is more accurate than averaging the batch hessians, but\n probably has a way higher memory usage.\n reverse_only: Whether to use only reverse-mode autodiff or\n both forward- and reverse-mode autodiff. 
Ignored if\n `precompute_grad` is `True`.\n track_gradients: Whether to track gradients for the resulting tensor of\n the Hessian-vector products.\n\n Returns:\n A function that takes a single argument, a vector, and returns the\n product of the Hessian of the `loss` function with respect to the\n `model`'s parameters and the input vector.\n \"\"\"\n\n if precompute_grad:\n model_params = {k: p for k, p in model.named_parameters() if p.requires_grad}\n\n if use_average:\n model_dtype = next(p.dtype for p in model.parameters() if p.requires_grad)\n total_grad_xy = torch.empty(0, dtype=model_dtype)\n total_points = 0\n grad_func = torch.func.grad(create_batch_loss_function(model, loss))\n for x, y in iter(data_loader):\n grad_xy = grad_func(\n model_params, to_model_device(x, model), to_model_device(y, model)\n )\n grad_xy = flatten_dimensions(grad_xy.values())\n if total_grad_xy.nelement() == 0:\n total_grad_xy = torch.zeros_like(grad_xy)\n total_grad_xy += grad_xy * len(x)\n total_points += len(x)\n total_grad_xy /= total_points\n else:\n total_grad_xy = torch.func.grad(\n create_empirical_loss_function(model, loss, data_loader)\n )(model_params)\n total_grad_xy = flatten_dimensions(total_grad_xy.values())\n\n def precomputed_grads_hvp_function(\n precomputed_grads: torch.Tensor, vec: torch.Tensor\n ) -> torch.Tensor:\n vec = to_model_device(vec, model)\n if vec.ndim == 1:\n vec = vec.unsqueeze(0)\n\n z = (precomputed_grads * torch.autograd.Variable(vec)).sum(dim=1)\n\n mvp = []\n for i in range(len(z)):\n mvp.append(\n flatten_dimensions(\n torch.autograd.grad(\n z[i], list(model_params.values()), retain_graph=True\n )\n )\n )\n result = torch.stack([arr.contiguous().view(-1) for arr in mvp])\n\n if not track_gradients:\n result = result.detach()\n\n return result\n\n return partial(precomputed_grads_hvp_function, total_grad_xy)\n\n def hvp_function(vec: torch.Tensor) -> torch.Tensor:\n params = get_model_parameters(model, detach=not track_gradients)\n v = align_structure(params, vec)\n empirical_loss = create_empirical_loss_function(model, loss, data_loader)\n return flatten_dimensions(\n hvp(empirical_loss, params, v, reverse_only=reverse_only).values()\n )\n\n def avg_hvp_function(vec: torch.Tensor) -> torch.Tensor:\n n_batches = len(data_loader)\n avg_hessian = to_model_device(torch.zeros_like(vec), model)\n b_hvp = create_batch_hvp_function(model, loss, reverse_only)\n params = get_model_parameters(model, detach=not track_gradients)\n for t_x, t_y in iter(data_loader):\n t_x, t_y = to_model_device(t_x, model), to_model_device(t_y, model)\n avg_hessian += b_hvp(params, t_x, t_y, to_model_device(vec, model))\n\n return avg_hessian / float(n_batches)\n\n return avg_hvp_function if use_average else hvp_function\n
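A usage sketch with the default settings and a toy setup (illustrative names only):

import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.functional import create_hvp_function

model = torch.nn.Linear(3, 1)
loss = torch.nn.functional.mse_loss
loader = DataLoader(TensorDataset(torch.randn(32, 3), torch.randn(32, 1)), batch_size=8)

hvp_fn = create_hvp_function(model, loss, loader)
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
result = hvp_fn(torch.ones(n_params))  # approximate Hessian times the all-ones vector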
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.hessian","title":"hessian","text":"hessian(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n data_loader: DataLoader,\n use_hessian_avg: bool = True,\n track_gradients: bool = False,\n restrict_to: Optional[Dict[str, Tensor]] = None,\n) -> Tensor\n
Computes the Hessian matrix for a given model and loss function.
PARAMETER DESCRIPTION model
The PyTorch model for which the Hessian is computed.
TYPE: Module
loss
A callable that computes the loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
data_loader
DataLoader providing batches of input data and corresponding ground truths.
TYPE: DataLoader
use_hessian_avg
Flag to indicate whether the average Hessian across mini-batches should be computed. If False, the empirical loss across the entire dataset is used.
TYPE: bool
DEFAULT: True
track_gradients
Whether to track gradients for the resulting tensor of the hessian vector products.
TYPE: bool
DEFAULT: False
restrict_to
The parameters to restrict the second order differentiation to, i.e. the corresponding sub-matrix of the Hessian. If None, the full Hessian is computed.
TYPE: Optional[Dict[str, Tensor]]
DEFAULT: None
RETURNS DESCRIPTION Tensor
A tensor representing the Hessian matrix. The shape of the tensor will be (n_parameters, n_parameters), where n_parameters is the number of trainable parameters in the model.
Source code in src/pydvl/influence/torch/functional.py
def hessian(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n data_loader: DataLoader,\n use_hessian_avg: bool = True,\n track_gradients: bool = False,\n restrict_to: Optional[Dict[str, torch.Tensor]] = None,\n) -> torch.Tensor:\n \"\"\"\n Computes the Hessian matrix for a given model and loss function.\n\n Args:\n model: The PyTorch model for which the Hessian is computed.\n loss: A callable that computes the loss.\n data_loader: DataLoader providing batches of input data and corresponding\n ground truths.\n use_hessian_avg: Flag to indicate whether the average Hessian across\n mini-batches should be computed.\n If False, the empirical loss across the entire dataset is used.\n track_gradients: Whether to track gradients for the resulting tensor of\n the hessian vector products.\n restrict_to: The parameters to restrict the second order differentiation to,\n i.e. the corresponding sub-matrix of the Hessian. If None, the full Hessian\n is computed.\n\n Returns:\n A tensor representing the Hessian matrix. The shape of the tensor will be\n (n_parameters, n_parameters), where n_parameters is the number of trainable\n parameters in the model.\n \"\"\"\n params = restrict_to\n\n if params is None:\n params = get_model_parameters(model, detach=not track_gradients)\n n_parameters = sum([p.numel() for p in params.values()])\n model_dtype = next((p.dtype for p in params.values()))\n\n flat_params = flatten_dimensions(params.values())\n\n if use_hessian_avg:\n n_samples = 0\n hessian_mat = to_model_device(\n torch.zeros((n_parameters, n_parameters), dtype=model_dtype), model\n )\n batch_loss = create_batch_loss_function(model, loss)\n\n def flat_input_batch_loss(\n p: torch.Tensor, t_x: torch.Tensor, t_y: torch.Tensor\n ):\n return batch_loss(align_structure(params, p), t_x, t_y)\n\n for x, y in iter(data_loader):\n n_samples += x.shape[0]\n batch_hessian = torch.func.hessian(flat_input_batch_loss)(\n flat_params, to_model_device(x, model), to_model_device(y, model)\n )\n if not track_gradients and batch_hessian.requires_grad:\n batch_hessian = batch_hessian.detach()\n hessian_mat += x.shape[0] * batch_hessian\n\n hessian_mat /= n_samples\n else:\n\n def flat_input_empirical_loss(p: torch.Tensor):\n return create_empirical_loss_function(model, loss, data_loader)(\n align_with_model(p, model)\n )\n\n hessian_mat = torch.func.jacrev(torch.func.jacrev(flat_input_empirical_loss))(\n flat_params\n )\n\n return hessian_mat\n
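A sketch computing the full Hessian of a tiny model (all names are placeholders):

import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.functional import hessian

model = torch.nn.Linear(2, 1)  # 3 trainable parameters in total
loss = torch.nn.functional.mse_loss
loader = DataLoader(TensorDataset(torch.randn(16, 2), torch.randn(16, 1)), batch_size=4)

H = hessian(model, loss, loader, use_hessian_avg=True)
assert H.shape == (3, 3)  # (n_parameters, n_parameters)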
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.gauss_newton","title":"gauss_newton","text":"gauss_newton(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n data_loader: DataLoader,\n restrict_to: Optional[Dict[str, Tensor]] = None,\n)\n
Compute the Gauss-Newton matrix, i.e.
$$ \\sum_{i=1}^N \\nabla_{\\theta}\\ell(m(x_i; \\theta), y_i) \\nabla_{\\theta}\\ell(m(x_i; \\theta), y_i)^t,$$ for a loss function \\(\\ell\\) and a model \\(m\\) with model parameters \\(\\theta\\).
PARAMETER DESCRIPTION model
The PyTorch model.
TYPE: Module
loss
A callable that computes the loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
data_loader
A PyTorch DataLoader providing batches of input data and corresponding output data.
TYPE: DataLoader
restrict_to
The parameters to restrict the differentiation to, i.e. the corresponding sub-matrix of the Jacobian. If None, the full Jacobian is used.
TYPE: Optional[Dict[str, Tensor]]
DEFAULT: None
RETURNS DESCRIPTION The Gauss-Newton matrix.
Source code in src/pydvl/influence/torch/functional.py
def gauss_newton(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n data_loader: DataLoader,\n restrict_to: Optional[Dict[str, torch.Tensor]] = None,\n):\n r\"\"\"\n Compute the Gauss-Newton matrix, i.e.\n\n $$ \\sum_{i=1}^N \\nabla_{\\theta}\\ell(m(x_i; \\theta), y)\n \\nabla_{\\theta}\\ell(m(x_i; \\theta), y)^t,$$\n for a loss function $\\ell$ and a model $m$ with model parameters $\\theta$.\n\n Args:\n model: The PyTorch model.\n loss: A callable that computes the loss.\n data_loader: A PyTorch DataLoader providing batches of input data and\n corresponding output data.\n restrict_to: The parameters to restrict the differentiation to,\n i.e. the corresponding sub-matrix of the Jacobian. If None, the full\n Jacobian is used.\n\n Returns:\n The Gauss-Newton matrix.\n \"\"\"\n\n per_sample_grads = create_per_sample_gradient_function(model, loss)\n\n params = restrict_to\n if params is None:\n params = get_model_parameters(model)\n\n def generate_batch_matrices():\n for x, y in data_loader:\n grads = flatten_dimensions(\n per_sample_grads(params, x, y).values(), shape=(x.shape[0], -1)\n )\n batch_mat = grads.t() @ grads\n yield batch_mat.detach()\n\n n_points = 0\n tensors = generate_batch_matrices()\n result = next(tensors)\n\n for t in tensors:\n result += t\n n_points += t.shape[0]\n\n return result / n_points\n
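A minimal sketch analogous to the Hessian example above (toy model and data):

import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.functional import gauss_newton

model = torch.nn.Linear(2, 1)
loss = torch.nn.functional.mse_loss
loader = DataLoader(TensorDataset(torch.randn(16, 2), torch.randn(16, 1)), batch_size=4)

G = gauss_newton(model, loss, loader)  # shape (n_parameters, n_parameters)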
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.create_per_sample_loss_function","title":"create_per_sample_loss_function","text":"create_per_sample_loss_function(\n model: Module, loss: Callable[[Tensor, Tensor], Tensor]\n) -> Callable[[Dict[str, Tensor], Tensor, Tensor], Tensor]\n
Generates a function to compute per-sample losses using PyTorch's vmap, i.e. the vector-valued function
\\[ f(\\theta, x, y) = (\\operatorname{loss}(\\operatorname{model}(\\theta, x_1), y_1), \\dots, \\operatorname{loss}(\\operatorname{model}(\\theta, x_N), y_N)), \\] for a loss function \\(\\operatorname{loss}\\) and a model \\(\\operatorname{model}\\) with model parameters \\(\\theta\\), where \\(N\\) is the number of elements in the batch.
PARAMETER DESCRIPTION model
The PyTorch model for which per-sample losses will be computed.
TYPE: Module
loss
A callable that computes the loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
RETURNS DESCRIPTION Callable[[Dict[str, Tensor], Tensor, Tensor], Tensor]
A callable that computes the loss for each sample in the batch, given a dictionary of model parameters, the model's input, and the target values. The callable will return a tensor where each entry corresponds to the loss of the corresponding sample.
Source code in src/pydvl/influence/torch/functional.py
def create_per_sample_loss_function(\n model: torch.nn.Module, loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor]\n) -> Callable[[Dict[str, torch.Tensor], torch.Tensor, torch.Tensor], torch.Tensor]:\n r\"\"\"\n Generates a function to compute per-sample losses using PyTorch's vmap,\n i.e. the vector-valued function\n\n \\[ f(\\theta, x, y) = (\\operatorname{loss}(\\operatorname{model}(\\theta, x_1), y_1),\n \\dots,\n \\operatorname{loss}(\\operatorname{model}(\\theta, x_N), y_N)), \\]\n\n for a loss function $\\operatorname{loss}$ and a model $\\operatorname{model}$ with\n model parameters $\\theta$, where $N$ is the number of elements in the batch.\n\n Args:\n model: The PyTorch model for which per-sample losses will be computed.\n loss: A callable that computes the loss.\n\n Returns:\n A callable that computes the loss for each sample in the batch,\n given a dictionary of model inputs, the model's predictions,\n and the true values. The callable will return a tensor where\n each entry corresponds to the loss of the corresponding sample.\n \"\"\"\n\n def compute_loss(\n params: Dict[str, torch.Tensor], x: torch.Tensor, y: torch.Tensor\n ) -> torch.Tensor:\n outputs = functional_call(\n model, params, (to_model_device(x.unsqueeze(0), model),)\n )\n return loss(outputs, y.unsqueeze(0))\n\n vmap_loss: Callable[\n [Dict[str, torch.Tensor], torch.Tensor, torch.Tensor], torch.Tensor\n ] = torch.vmap(compute_loss, in_dims=(None, 0, 0))\n return vmap_loss\n
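A brief sketch with a placeholder linear model, showing the per-sample output shape:

import torch
from pydvl.influence.torch.functional import create_per_sample_loss_function

model = torch.nn.Linear(3, 1)
loss = torch.nn.functional.mse_loss
per_sample_loss = create_per_sample_loss_function(model, loss)

params = dict(model.named_parameters())
x, y = torch.randn(8, 3), torch.randn(8, 1)
losses = per_sample_loss(params, x, y)  # one loss value per sample, shape (8,)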
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.create_per_sample_gradient_function","title":"create_per_sample_gradient_function","text":"create_per_sample_gradient_function(\n model: Module, loss: Callable[[Tensor, Tensor], Tensor]\n) -> Callable[[Dict[str, Tensor], Tensor, Tensor], Dict[str, Tensor]]\n
Generates a function that computes the per-sample gradient of the loss with respect to the model's parameters, i.e. the tensor-valued function
\\[ f(\\theta, x, y) = (\\nabla_{\\theta}\\operatorname{loss} (\\operatorname{model}(\\theta, x_1), y_1), \\dots, \\nabla_{\\theta}\\operatorname{loss}(\\operatorname{model}(\\theta, x_N), y_N)) \\] for a loss function \\(\\operatorname{loss}\\) and a model \\(\\operatorname{model}\\) with model parameters \\(\\theta\\), where \\(N\\) is the number of elements in the batch.
PARAMETER DESCRIPTION model
The PyTorch model for which per-sample gradients will be computed.
TYPE: Module
loss
A callable that computes the loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
RETURNS DESCRIPTION Callable[[Dict[str, Tensor], Tensor, Tensor], Dict[str, Tensor]]
A callable that takes a dictionary of model parameters, the model's input, and the labels. It returns a dictionary with the same keys as the model's named parameters. Each entry in the returned dictionary corresponds to the gradient of the corresponding model parameter for each sample in the batch.
Source code in src/pydvl/influence/torch/functional.py
def create_per_sample_gradient_function(\n model: torch.nn.Module, loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor]\n) -> Callable[\n [Dict[str, torch.Tensor], torch.Tensor, torch.Tensor], Dict[str, torch.Tensor]\n]:\n r\"\"\"\n Generates a function to computes the per-sample gradient of the loss with respect to\n the model's parameters, i.e. the tensor-valued function\n\n \\[ f(\\theta, x, y) = (\\nabla_{\\theta}\\operatorname{loss}\n (\\operatorname{model}(\\theta, x_1), y_1), \\dots,\n \\nabla_{\\theta}\\operatorname{loss}(\\operatorname{model}(\\theta, x_N), y_N) \\]\n\n for a loss function $\\operatorname{loss}$ and a model $\\operatorname{model}$ with\n model parameters $\\theta$, where $N$ is the number of elements in the batch.\n\n Args:\n model: The PyTorch model for which per-sample gradients will be computed.\n loss: A callable that computes the loss.\n\n Returns:\n A callable that takes a dictionary of model parameters, the model's input,\n and the labels. It returns a dictionary with the same keys as the model's\n named parameters. Each entry in the returned dictionary corresponds to\n the gradient of the corresponding model parameter for each sample\n in the batch.\n\n \"\"\"\n\n per_sample_grad: Callable[\n [Dict[str, torch.Tensor], torch.Tensor, torch.Tensor], Dict[str, torch.Tensor]\n ] = torch.func.jacrev(create_per_sample_loss_function(model, loss))\n return per_sample_grad\n
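A brief sketch with a placeholder linear model so the returned shapes are easy to read:

import torch
from pydvl.influence.torch.functional import create_per_sample_gradient_function

model = torch.nn.Linear(3, 1)
loss = torch.nn.functional.mse_loss
per_sample_grad = create_per_sample_gradient_function(model, loss)

params = dict(model.named_parameters())
x, y = torch.randn(8, 3), torch.randn(8, 1)
grads = per_sample_grad(params, x, y)
# grads["weight"] has shape (8, 1, 3): one gradient per sample and parameter entry.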
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.create_matrix_jacobian_product_function","title":"create_matrix_jacobian_product_function","text":"create_matrix_jacobian_product_function(\n model: Module, loss: Callable[[Tensor, Tensor], Tensor], g: Tensor\n) -> Callable[[Dict[str, Tensor], Tensor, Tensor], Tensor]\n
Generates a function that computes the matrix-Jacobian product (MJP) of the per-sample loss with respect to the model's parameters, i.e. the function
\\[ f(\\theta, x, y) = g \\, @ \\, (\\nabla_{\\theta}\\operatorname{loss} (\\operatorname{model}(\\theta, x_i), y_i))_i^T \\] for a loss function \\(\\operatorname{loss}\\) and a model \\(\\operatorname{model}\\) with model parameters \\(\\theta\\).
PARAMETER DESCRIPTION model
The PyTorch model for which the MJP will be computed.
TYPE: Module
loss
A callable that computes the loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
g
Matrix for which the product with the Jacobian will be computed. The shape of this matrix should be consistent with the shape of the jacobian.
TYPE: Tensor
RETURNS DESCRIPTION Callable[[Dict[str, Tensor], Tensor, Tensor], Tensor]
A callable that takes a dictionary of model parameters, the model's input, and the labels. The callable returns the matrix-Jacobian product of the per-sample loss with respect to the model's parameters for the given matrix g
.
Source code in src/pydvl/influence/torch/functional.py
def create_matrix_jacobian_product_function(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n g: torch.Tensor,\n) -> Callable[[Dict[str, torch.Tensor], torch.Tensor, torch.Tensor], torch.Tensor]:\n r\"\"\"\n Generates a function to computes the matrix-Jacobian product (MJP) of the\n per-sample loss with respect to the model's parameters, i.e. the function\n\n \\[ f(\\theta, x, y) = g \\, @ \\, (\\nabla_{\\theta}\\operatorname{loss}\n (\\operatorname{model}(\\theta, x_i), y_i))_i^T \\]\n\n for a loss function $\\operatorname{loss}$ and a model $\\operatorname{model}$ with\n model parameters $\\theta$.\n\n Args:\n model: The PyTorch model for which the MJP will be computed.\n loss: A callable that computes the loss.\n g: Matrix for which the product with the Jacobian will be computed.\n The shape of this matrix should be consistent with the shape of\n the jacobian.\n\n Returns:\n A callable that takes a dictionary of model inputs, the model's input,\n and the labels. The callable returns the matrix-Jacobian product of the\n per-sample loss with respect to the model's parameters for the given\n matrix `g`.\n\n \"\"\"\n\n def single_jvp(\n params: Dict[str, torch.Tensor],\n x: torch.Tensor,\n y: torch.Tensor,\n _g: torch.Tensor,\n ):\n return torch.func.jvp(\n lambda p: create_per_sample_loss_function(model, loss)(p, x, y),\n (params,),\n (align_with_model(_g, model),),\n )[1]\n\n def full_jvp(params: Dict[str, torch.Tensor], x: torch.Tensor, y: torch.Tensor):\n return torch.func.vmap(single_jvp, in_dims=(None, None, None, 0))(\n params, x, y, g\n )\n\n return full_jvp\n
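A sketch with a small linear model so the expected shape of g is easy to check (names are illustrative):

import torch
from pydvl.influence.torch.functional import create_matrix_jacobian_product_function

model = torch.nn.Linear(3, 1)  # 4 parameters in total
loss = torch.nn.functional.mse_loss

g = torch.randn(5, 4)  # one row per product; columns match the number of parameters
mjp = create_matrix_jacobian_product_function(model, loss, g)

params = dict(model.named_parameters())
x, y = torch.randn(8, 3), torch.randn(8, 1)
result = mjp(params, x, y)  # shape (5, 8): rows of g times per-sample gradients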
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.create_per_sample_mixed_derivative_function","title":"create_per_sample_mixed_derivative_function","text":"create_per_sample_mixed_derivative_function(\n model: Module, loss: Callable[[Tensor, Tensor], Tensor]\n) -> Callable[[Dict[str, Tensor], Tensor, Tensor], Dict[str, Tensor]]\n
Generates a function that computes the mixed derivatives of the per-sample loss with respect to the model parameters and the input, i.e. the function
\\[ f(\\theta, x, y) = \\nabla_{\\theta}\\nabla_{x}\\operatorname{loss} (\\operatorname{model}(\\theta, x), y) \\] for a loss function \\(\\operatorname{loss}\\) and a model \\(\\operatorname{model}\\) with model parameters \\(\\theta\\).
PARAMETER DESCRIPTION model
The PyTorch model for which the mixed derivatives are computed.
TYPE: Module
loss
A callable that computes the loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
RETURNS DESCRIPTION Callable[[Dict[str, Tensor], Tensor, Tensor], Dict[str, Tensor]]
A callable that takes a dictionary of model parameters, the model's input, and the labels. The callable returns the mixed derivatives of the per-sample loss with respect to the model's parameters and input.
Source code in src/pydvl/influence/torch/functional.py
def create_per_sample_mixed_derivative_function(\n model: torch.nn.Module, loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor]\n) -> Callable[\n [Dict[str, torch.Tensor], torch.Tensor, torch.Tensor], Dict[str, torch.Tensor]\n]:\n r\"\"\"\n Generates a function to computes the mixed derivatives, of the per-sample loss with\n respect to the model parameters and the input, i.e. the function\n\n \\[ f(\\theta, x, y) = \\nabla_{\\theta}\\nabla_{x}\\operatorname{loss}\n (\\operatorname{model}(\\theta, x), y) \\]\n\n for a loss function $\\operatorname{loss}$ and a model $\\operatorname{model}$ with\n model parameters $\\theta$.\n\n Args:\n model: The PyTorch model for which the mixed derivatives are computed.\n loss: A callable that computes the loss.\n\n Returns:\n A callable that takes a dictionary of model inputs, the model's input,\n and the labels. The callable returns the mixed derivatives of the\n per-sample loss with respect to the model's parameters and input.\n\n \"\"\"\n\n def compute_loss(params: Dict[str, torch.Tensor], x: torch.Tensor, y: torch.Tensor):\n outputs = functional_call(\n model, params, (to_model_device(x.unsqueeze(0), model),)\n )\n return loss(outputs, y.unsqueeze(0))\n\n per_samp_mix_derivative: Callable[\n [Dict[str, torch.Tensor], torch.Tensor, torch.Tensor], Dict[str, torch.Tensor]\n ] = torch.vmap(\n torch.func.jacrev(torch.func.grad(compute_loss, argnums=1)),\n in_dims=(None, 0, 0),\n )\n return per_samp_mix_derivative\n
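A minimal usage sketch under the same illustrative assumptions as the previous example (toy model, random data):

import torch
from pydvl.influence.torch.functional import create_per_sample_mixed_derivative_function

model = torch.nn.Linear(5, 1)
loss = torch.nn.functional.mse_loss
params = {k: p.detach() for k, p in model.named_parameters()}

mixed_fn = create_per_sample_mixed_derivative_function(model, loss)
x, y = torch.randn(8, 5), torch.randn(8, 1)
mixed = mixed_fn(params, x, y)
# `mixed` is a dictionary keyed by parameter name, holding the per-sample
# derivatives of the input gradient with respect to each parameter.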
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.lanzcos_low_rank_hessian_approx","title":"lanzcos_low_rank_hessian_approx","text":"lanzcos_low_rank_hessian_approx(\n hessian_vp: Callable[[Tensor], Tensor],\n matrix_shape: Tuple[int, int],\n hessian_perturbation: float = 0.0,\n rank_estimate: int = 10,\n krylov_dimension: Optional[int] = None,\n tol: float = 1e-06,\n max_iter: Optional[int] = None,\n device: Optional[device] = None,\n eigen_computation_on_gpu: bool = False,\n torch_dtype: Optional[dtype] = None,\n) -> LowRankProductRepresentation\n
Calculates a low-rank approximation of the Hessian matrix of a scalar-valued function using the implicitly restarted Lanczos algorithm, i.e.:
\\[ H_{\\text{approx}} = V D V^T\\] where \\(D\\) is a diagonal matrix with the top (in absolute value) rank_estimate
eigenvalues of the Hessian and \\(V\\) contains the corresponding eigenvectors.
PARAMETER DESCRIPTION hessian_vp
A function that takes a vector and returns the product of the Hessian of the loss function with that vector.
TYPE: Callable[[Tensor], Tensor]
matrix_shape
The shape of the matrix represented by the Hessian-vector product.
TYPE: Tuple[int, int]
hessian_perturbation
Regularization parameter added to the Hessian-vector product for numerical stability.
TYPE: float
DEFAULT: 0.0
rank_estimate
The number of eigenvalues and corresponding eigenvectors to compute. Represents the desired rank of the Hessian approximation.
TYPE: int
DEFAULT: 10
krylov_dimension
The number of Krylov vectors to use for the Lanczos method. If not provided, it defaults to \\( \\min(\\text{model.n_parameters}, \\max(2 \\times \\text{rank_estimate} + 1, 20)) \\).
TYPE: Optional[int]
DEFAULT: None
tol
The stopping criterion for the Lanczos algorithm, which stops when the difference in the approximated eigenvalue is less than tol
. Defaults to 1e-6.
TYPE: float
DEFAULT: 1e-06
max_iter
The maximum number of iterations for the Lanczos method. If not provided, it defaults to \\( 10 \\cdot \\text{model.n_parameters}\\).
TYPE: Optional[int]
DEFAULT: None
device
The device to use for executing the hessian vector product.
TYPE: Optional[device]
DEFAULT: None
eigen_computation_on_gpu
If True, tries to execute the eigen pair approximation on the provided device via the cupy implementation. Ensure that either your model is small enough or you use a small rank_estimate to fit into your device's memory. If False, the eigen pair approximation is executed on the CPU with scipy's wrapper to ARPACK.
TYPE: bool
DEFAULT: False
torch_dtype
If not provided, the current torch default dtype is used for conversion to torch.
TYPE: Optional[dtype]
DEFAULT: None
RETURNS DESCRIPTION LowRankProductRepresentation
LowRankProductRepresentation instance that contains the top (up until rank_estimate) eigenvalues and corresponding eigenvectors of the Hessian.
Source code in src/pydvl/influence/torch/functional.py
def lanzcos_low_rank_hessian_approx(\n hessian_vp: Callable[[torch.Tensor], torch.Tensor],\n matrix_shape: Tuple[int, int],\n hessian_perturbation: float = 0.0,\n rank_estimate: int = 10,\n krylov_dimension: Optional[int] = None,\n tol: float = 1e-6,\n max_iter: Optional[int] = None,\n device: Optional[torch.device] = None,\n eigen_computation_on_gpu: bool = False,\n torch_dtype: Optional[torch.dtype] = None,\n) -> LowRankProductRepresentation:\n r\"\"\"\n Calculates a low-rank approximation of the Hessian matrix of a scalar-valued\n function using the implicitly restarted Lanczos algorithm, i.e.:\n\n \\[ H_{\\text{approx}} = V D V^T\\]\n\n where \\(D\\) is a diagonal matrix with the top (in absolute value) `rank_estimate`\n eigenvalues of the Hessian and \\(V\\) contains the corresponding eigenvectors.\n\n Args:\n hessian_vp: A function that takes a vector and returns the product of\n the Hessian of the loss function.\n matrix_shape: The shape of the matrix, represented by the hessian vector\n product.\n hessian_perturbation: Regularization parameter added to the\n Hessian-vector product for numerical stability.\n rank_estimate: The number of eigenvalues and corresponding eigenvectors\n to compute. Represents the desired rank of the Hessian approximation.\n krylov_dimension: The number of Krylov vectors to use for the Lanczos\n method. If not provided, it defaults to\n \\( \\min(\\text{model.n_parameters},\n \\max(2 \\times \\text{rank_estimate} + 1, 20)) \\).\n tol: The stopping criteria for the Lanczos algorithm, which stops when\n the difference in the approximated eigenvalue is less than `tol`.\n Defaults to 1e-6.\n max_iter: The maximum number of iterations for the Lanczos method. If\n not provided, it defaults to \\( 10 \\cdot \\text{model.n_parameters}\\).\n device: The device to use for executing the hessian vector product.\n eigen_computation_on_gpu: If True, tries to execute the eigen pair\n approximation on the provided device via [cupy](https://cupy.dev/)\n implementation. Ensure that either your model is small enough, or you\n use a small rank_estimate to fit your device's memory. 
If False, the\n eigen pair approximation is executed on the CPU with scipy's wrapper to\n ARPACK.\n torch_dtype: If not provided, the current torch default dtype is used for\n conversion to torch.\n\n Returns:\n [LowRankProductRepresentation]\n [pydvl.influence.torch.functional.LowRankProductRepresentation]\n instance that contains the top (up until rank_estimate) eigenvalues\n and corresponding eigenvectors of the Hessian.\n \"\"\"\n\n torch_dtype = torch.get_default_dtype() if torch_dtype is None else torch_dtype\n\n if eigen_computation_on_gpu:\n try:\n import cupy as cp\n from cupyx.scipy.sparse.linalg import LinearOperator, eigsh\n from torch.utils.dlpack import from_dlpack, to_dlpack\n except ImportError as e:\n raise ImportError(\n f\"Try to install missing dependencies or set eigen_computation_on_gpu \"\n f\"to False: {e}\"\n )\n\n if device is None:\n raise ValueError(\n \"Without setting an explicit device, cupy is not supported\"\n )\n\n def to_torch_conversion_function(x: cp.NDArray) -> torch.Tensor:\n return from_dlpack(x.toDlpack()).to(torch_dtype)\n\n def mv(x):\n x = to_torch_conversion_function(x)\n y = hessian_vp(x) + hessian_perturbation * x\n return cp.from_dlpack(to_dlpack(y))\n\n else:\n from scipy.sparse.linalg import LinearOperator, eigsh\n\n def mv(x):\n x_torch = torch.as_tensor(x, device=device, dtype=torch_dtype)\n y = (\n (hessian_vp(x_torch) + hessian_perturbation * x_torch)\n .detach()\n .cpu()\n .numpy()\n )\n return y\n\n to_torch_conversion_function = partial(torch.as_tensor, dtype=torch_dtype)\n\n try:\n eigen_vals, eigen_vecs = eigsh(\n LinearOperator(matrix_shape, matvec=mv),\n k=rank_estimate,\n maxiter=max_iter,\n tol=tol,\n ncv=krylov_dimension,\n return_eigenvectors=True,\n )\n\n except ArpackNoConvergence as e:\n logger.warning(\n f\"ARPACK did not converge for parameters {max_iter=}, {tol=}, \"\n f\"{krylov_dimension=}, {rank_estimate=}. \\n \"\n f\"Returning the best approximation found so far. \"\n f\"Use those with care or modify parameters.\\n Original error: {e}\"\n )\n\n eigen_vals, eigen_vecs = e.eigenvalues, e.eigenvectors\n\n eigen_vals = to_torch_conversion_function(eigen_vals)\n eigen_vecs = to_torch_conversion_function(eigen_vecs)\n\n return LowRankProductRepresentation(eigen_vals, eigen_vecs)\n
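A minimal sketch using an explicit symmetric matrix in place of a model Hessian; the matrix, its size and the rank are illustrative:

import torch
from pydvl.influence.torch.functional import lanzcos_low_rank_hessian_approx

mat = torch.randn(50, 50)
mat = mat + mat.T  # symmetric stand-in for a Hessian

low_rank = lanzcos_low_rank_hessian_approx(
    hessian_vp=lambda v: mat @ v,
    matrix_shape=(50, 50),
    rank_estimate=5,
)
# `low_rank` is a LowRankProductRepresentation holding the top (in absolute
# value) 5 eigenvalues of `mat` and the corresponding eigenvectors.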
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.model_hessian_low_rank","title":"model_hessian_low_rank","text":"model_hessian_low_rank(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n training_data: DataLoader,\n hessian_perturbation: float = 0.0,\n rank_estimate: int = 10,\n krylov_dimension: Optional[int] = None,\n tol: float = 1e-06,\n max_iter: Optional[int] = None,\n eigen_computation_on_gpu: bool = False,\n precompute_grad: bool = False,\n) -> LowRankProductRepresentation\n
Calculates a low-rank approximation of the Hessian matrix of the model's loss function using the implicitly restarted Lanczos algorithm, i.e.
\\[ H_{\\text{approx}} = V D V^T\\] where \\(D\\) is a diagonal matrix with the top (in absolute value) rank_estimate
eigenvalues of the Hessian and \\(V\\) contains the corresponding eigenvectors.
PARAMETER DESCRIPTION model
A PyTorch model instance. The Hessian will be calculated with respect to this model's parameters.
TYPE: Module
loss
A callable that computes the loss.
training_data
A DataLoader instance that provides the model's training data. Used in calculating the Hessian-vector products.
TYPE: DataLoader
hessian_perturbation
Optional regularization parameter added to the Hessian-vector product for numerical stability.
TYPE: float
DEFAULT: 0.0
rank_estimate
The number of eigenvalues and corresponding eigenvectors to compute. Represents the desired rank of the Hessian approximation.
TYPE: int
DEFAULT: 10
krylov_dimension
The number of Krylov vectors to use for the Lanczos method. If not provided, it defaults to min(model.n_parameters, max(2*rank_estimate + 1, 20)).
TYPE: Optional[int]
DEFAULT: None
tol
The stopping criterion for the Lanczos algorithm, which stops when the difference in the approximated eigenvalue is less than tol
. Defaults to 1e-6.
TYPE: float
DEFAULT: 1e-06
max_iter
The maximum number of iterations for the Lanczos method. If not provided, it defaults to 10*model.n_parameters.
TYPE: Optional[int]
DEFAULT: None
eigen_computation_on_gpu
If True, tries to execute the eigen pair approximation on the provided device via the cupy implementation. Make sure that either your model is small enough or you use a small rank_estimate to fit into your device's memory. If False, the eigen pair approximation is executed on the CPU via scipy's wrapper to ARPACK.
TYPE: bool
DEFAULT: False
precompute_grad
If True, the full data gradient is precomputed and kept in memory, which can speed up the Hessian-vector product computation. Set this to False if you can't afford to keep the full computation graph in memory.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION LowRankProductRepresentation
LowRankProductRepresentation instance that contains the top (up until rank_estimate) eigenvalues and corresponding eigenvectors of the Hessian.
Source code in src/pydvl/influence/torch/functional.py
def model_hessian_low_rank(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n training_data: DataLoader,\n hessian_perturbation: float = 0.0,\n rank_estimate: int = 10,\n krylov_dimension: Optional[int] = None,\n tol: float = 1e-6,\n max_iter: Optional[int] = None,\n eigen_computation_on_gpu: bool = False,\n precompute_grad: bool = False,\n) -> LowRankProductRepresentation:\n r\"\"\"\n Calculates a low-rank approximation of the Hessian matrix of the model's\n loss function using the implicitly restarted Lanczos algorithm, i.e.\n\n \\[ H_{\\text{approx}} = V D V^T\\]\n\n where \\(D\\) is a diagonal matrix with the top (in absolute value) `rank_estimate`\n eigenvalues of the Hessian and \\(V\\) contains the corresponding eigenvectors.\n\n\n Args:\n model: A PyTorch model instance. The Hessian will be calculated with respect to\n this model's parameters.\n loss : A callable that computes the loss.\n training_data: A DataLoader instance that provides the model's training data.\n Used in calculating the Hessian-vector products.\n hessian_perturbation: Optional regularization parameter added to the\n Hessian-vector product for numerical stability.\n rank_estimate: The number of eigenvalues and corresponding eigenvectors to\n compute. Represents the desired rank of the Hessian approximation.\n krylov_dimension: The number of Krylov vectors to use for the Lanczos method.\n If not provided, it defaults to min(model.n_parameters,\n max(2*rank_estimate + 1, 20)).\n tol: The stopping criteria for the Lanczos algorithm,\n which stops when the difference in the approximated eigenvalue is less than\n `tol`. Defaults to 1e-6.\n max_iter: The maximum number of iterations for the Lanczos method.\n If not provided, it defaults to 10*model.n_parameters.\n eigen_computation_on_gpu: If True, tries to execute the eigen pair approximation\n on the provided device via cupy implementation.\n Make sure, that either your model is small enough or you use a\n small rank_estimate to fit your device's memory.\n If False, the eigen pair approximation is executed on the CPU by\n scipy wrapper to ARPACK.\n precompute_grad: If True, the full data gradient is precomputed and kept\n in memory, which can speed up the hessian vector product computation.\n Set this to False, if you can't afford to keep the full computation graph\n in memory.\n\n Returns:\n [LowRankProductRepresentation]\n [pydvl.influence.torch.functional.LowRankProductRepresentation]\n instance that contains the top (up until rank_estimate) eigenvalues\n and corresponding eigenvectors of the Hessian.\n \"\"\"\n raw_hvp = create_hvp_function(\n model, loss, training_data, use_average=True, precompute_grad=precompute_grad\n )\n n_params = sum([p.numel() for p in model.parameters() if p.requires_grad])\n device = next(model.parameters()).device\n return lanzcos_low_rank_hessian_approx(\n hessian_vp=raw_hvp,\n matrix_shape=(n_params, n_params),\n hessian_perturbation=hessian_perturbation,\n rank_estimate=rank_estimate,\n krylov_dimension=krylov_dimension,\n tol=tol,\n max_iter=max_iter,\n device=device,\n eigen_computation_on_gpu=eigen_computation_on_gpu,\n )\n
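A usage sketch with a toy regression model and DataLoader; all names, sizes and parameter values are illustrative:

import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.functional import model_hessian_low_rank

model = torch.nn.Linear(20, 1)
loss = torch.nn.functional.mse_loss
train_data = TensorDataset(torch.randn(100, 20), torch.randn(100, 1))
train_loader = DataLoader(train_data, batch_size=25)

low_rank = model_hessian_low_rank(
    model, loss, train_loader, hessian_perturbation=0.01, rank_estimate=5
)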
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.randomized_nystroem_approximation","title":"randomized_nystroem_approximation","text":"randomized_nystroem_approximation(\n mat_mat_prod: Union[Tensor, Callable[[Tensor], Tensor]],\n input_dim: int,\n rank: int,\n input_type: dtype,\n shift_func: Optional[Callable[[Tensor], Tensor]] = None,\n mat_vec_device: device = torch.device(\"cpu\"),\n) -> LowRankProductRepresentation\n
Given a matrix vector product function (representing a symmetric positive definite matrix \\(A\\) ), computes a random Nystr\u00f6m low rank approximation of \\(A\\) in factored form, i.e.
\\[ A_{\\text{nys}} = (A \\Omega)(\\Omega^T A \\Omega)^{\\dagger}(A \\Omega)^T = U \\Sigma U^T \\] where \\(\\Omega\\) is a standard normal random matrix.
PARAMETER DESCRIPTION mat_mat_prod
A tensor or a callable representing the matrix vector product
TYPE: Union[Tensor, Callable[[Tensor], Tensor]]
input_dim
dimension of the input for the matrix vector product
TYPE: int
input_type
data_type of inputs
TYPE: dtype
rank
rank of the approximation
TYPE: int
shift_func
optional function for computing the stabilizing shift in the construction of the randomized nystroem approximation, defaults to
\\[ \\sqrt{\\operatorname{\\text{input_dim}}} \\cdot \\varepsilon(\\operatorname{\\text{input_type}}) \\cdot \\|A\\Omega\\|_2,\\] where \\(\\varepsilon(\\operatorname{\\text{input_type}})\\) is the value of the machine precision corresponding to the data type.
TYPE: Optional[Callable[[Tensor], Tensor]]
DEFAULT: None
mat_vec_device
device where the matrix vector product has to be executed
TYPE: device
DEFAULT: device('cpu')
RETURNS DESCRIPTION LowRankProductRepresentation
object containing \\(U\\) and \\(\\Sigma\\)
Source code in src/pydvl/influence/torch/functional.py
def randomized_nystroem_approximation(\n mat_mat_prod: Union[torch.Tensor, Callable[[torch.Tensor], torch.Tensor]],\n input_dim: int,\n rank: int,\n input_type: torch.dtype,\n shift_func: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,\n mat_vec_device: torch.device = torch.device(\"cpu\"),\n) -> LowRankProductRepresentation:\n r\"\"\"\n Given a matrix vector product function (representing a symmetric positive definite\n matrix $A$ ), computes a random Nystr\u00f6m low rank approximation of\n $A$ in factored form, i.e.\n\n $$ A_{\\text{nys}} = (A \\Omega)(\\Omega^T A \\Omega)^{\\dagger}(A \\Omega)^T\n = U \\Sigma U^T $$\n\n where $\\Omega$ is a standard normal random matrix.\n\n Args:\n mat_mat_prod: A callable representing the matrix vector product\n input_dim: dimension of the input for the matrix vector product\n input_type: data_type of inputs\n rank: rank of the approximation\n shift_func: optional function for computing the stabilizing shift in the\n construction of the randomized nystroem approximation, defaults to\n\n $$ \\sqrt{\\operatorname{\\text{input_dim}}} \\cdot\n \\varepsilon(\\operatorname{\\text{input_type}}) \\cdot \\|A\\Omega\\|_2,$$\n\n where $\\varepsilon(\\operatorname{\\text{input_type}})$ is the value of the\n machine precision corresponding to the data type.\n mat_vec_device: device where the matrix vector product has to be executed\n\n Returns:\n object containing, $U$ and $\\Sigma$\n \"\"\"\n\n if shift_func is None:\n\n def shift_func(x: torch.Tensor):\n return (\n torch.sqrt(torch.as_tensor(input_dim))\n * torch.finfo(x.dtype).eps\n * torch.linalg.norm(x)\n )\n\n _mat_mat_prod: Callable[[torch.Tensor], torch.Tensor]\n\n if isinstance(mat_mat_prod, torch.Tensor):\n\n def _mat_mat_prod(x: torch.Tensor):\n return mat_mat_prod @ x\n\n else:\n _mat_mat_prod = mat_mat_prod\n\n random_sample_matrix = torch.randn(\n input_dim, rank, device=mat_vec_device, dtype=input_type\n )\n random_sample_matrix, _ = torch.linalg.qr(random_sample_matrix)\n\n sketch_mat = _mat_mat_prod(random_sample_matrix)\n\n shift = shift_func(sketch_mat)\n sketch_mat += shift * random_sample_matrix\n cholesky_mat = torch.matmul(random_sample_matrix.t(), sketch_mat)\n try:\n triangular_mat = torch.linalg.cholesky(cholesky_mat)\n except _LinAlgError as e:\n logger.warning(\n f\"Encountered error in cholesky decomposition: {e}.\\n \"\n f\"Increasing shift by smallest eigenvalue and re-compute\"\n )\n eigen_vals, eigen_vectors = torch.linalg.eigh(cholesky_mat)\n shift += torch.abs(torch.min(eigen_vals))\n eigen_vals += shift\n triangular_mat = torch.linalg.cholesky(\n torch.mm(eigen_vectors, torch.mm(torch.diag(eigen_vals), eigen_vectors.T))\n )\n\n svd_input = torch.linalg.solve_triangular(\n triangular_mat.t(), sketch_mat, upper=True, left=False\n )\n left_singular_vecs, singular_vals, _ = torch.linalg.svd(\n svd_input, full_matrices=False\n )\n singular_vals = torch.clamp(singular_vals**2 - shift, min=0)\n\n return LowRankProductRepresentation(singular_vals, left_singular_vecs)\n
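A minimal sketch on an explicit symmetric positive definite matrix; per the signature, a tensor can be passed directly in place of the callable, and the matrix size and rank are illustrative:

import torch
from pydvl.influence.torch.functional import randomized_nystroem_approximation

base = torch.randn(100, 100)
spd_matrix = base @ base.T + 1e-3 * torch.eye(100)  # symmetric positive definite

nystroem = randomized_nystroem_approximation(
    mat_mat_prod=spd_matrix,
    input_dim=100,
    rank=10,
    input_type=spd_matrix.dtype,
)
# The result is a LowRankProductRepresentation containing U and Sigma.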
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.model_hessian_nystroem_approximation","title":"model_hessian_nystroem_approximation","text":"model_hessian_nystroem_approximation(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n data_loader: DataLoader,\n rank: int,\n shift_func: Optional[Callable[[Tensor], Tensor]] = None,\n) -> LowRankProductRepresentation\n
Given a model, loss and a data_loader, computes a random Nystr\u00f6m low rank approximation of the corresponding Hessian matrix in factored form, i.e.
\\[ H_{\\text{nys}} = (H \\Omega)(\\Omega^T H \\Omega)^{+}(H \\Omega)^T = U \\Sigma U^T \\] PARAMETER DESCRIPTION model
A PyTorch model instance. The Hessian will be calculated with respect to this model's parameters.
TYPE: Module
loss
A callable that computes the loss.
data_loader
A DataLoader instance that provides the model's training data. Used in calculating the Hessian-vector products.
TYPE: DataLoader
rank
rank of the approximation
TYPE: int
shift_func
optional function for computing the stabilizing shift in the construction of the randomized nystroem approximation, defaults to
\\[ \\sqrt{\\operatorname{\\text{input_dim}}} \\cdot \\varepsilon(\\operatorname{\\text{input_type}}) \\cdot \\|A\\Omega\\|_2,\\] where \\(\\varepsilon(\\operatorname{\\text{input_type}})\\) is the value of the machine precision corresponding to the data type.
TYPE: Optional[Callable[[Tensor], Tensor]]
DEFAULT: None
RETURNS DESCRIPTION LowRankProductRepresentation
object containing \\(U\\) and \\(\\Sigma\\)
Source code in src/pydvl/influence/torch/functional.py
def model_hessian_nystroem_approximation(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n data_loader: DataLoader,\n rank: int,\n shift_func: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,\n) -> LowRankProductRepresentation:\n r\"\"\"\n Given a model, loss and a data_loader, computes a random Nystr\u00f6m low rank approximation of\n the corresponding Hessian matrix in factored form, i.e.\n\n $$ H_{\\text{nys}} = (H \\Omega)(\\Omega^T H \\Omega)^{+}(H \\Omega)^T\n = U \\Sigma U^T $$\n\n Args:\n model: A PyTorch model instance. The Hessian will be calculated with respect to\n this model's parameters.\n loss : A callable that computes the loss.\n data_loader: A DataLoader instance that provides the model's training data.\n Used in calculating the Hessian-vector products.\n rank: rank of the approximation\n shift_func: optional function for computing the stabilizing shift in the\n construction of the randomized nystroem approximation, defaults to\n\n $$ \\sqrt{\\operatorname{\\text{input_dim}}} \\cdot\n \\varepsilon(\\operatorname{\\text{input_type}}) \\cdot \\|A\\Omega\\|_2,$$\n\n where $\\varepsilon(\\operatorname{\\text{input_type}})$ is the value of the\n machine precision corresponding to the data type.\n\n Returns:\n object containing, $U$ and $\\Sigma$\n \"\"\"\n\n model_hvp = create_hvp_function(\n model, loss, data_loader, precompute_grad=False, use_average=True\n )\n device = next((p.device for p in model.parameters()))\n dtype = next((p.dtype for p in model.parameters()))\n in_dim = sum((p.numel() for p in model.parameters() if p.requires_grad))\n\n def model_hessian_mat_mat_prod(x: torch.Tensor):\n return torch.func.vmap(model_hvp, in_dims=1, randomness=\"same\")(x).t()\n\n return randomized_nystroem_approximation(\n model_hessian_mat_mat_prod,\n in_dim,\n rank,\n dtype,\n shift_func=shift_func,\n mat_vec_device=device,\n )\n
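A usage sketch analogous to the low-rank Lanczos helper above; the toy model, data and rank are illustrative:

import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.functional import model_hessian_nystroem_approximation

model = torch.nn.Linear(20, 1)
loss = torch.nn.functional.mse_loss
train_loader = DataLoader(
    TensorDataset(torch.randn(100, 20), torch.randn(100, 1)), batch_size=25
)

hessian_nys = model_hessian_nystroem_approximation(model, loss, train_loader, rank=5)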
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.operator_nystroem_approximation","title":"operator_nystroem_approximation","text":"operator_nystroem_approximation(\n operator: \"TensorOperator\",\n rank: int,\n shift_func: Optional[Callable[[Tensor], Tensor]] = None,\n)\n
Given an operator (representing a symmetric positive definite matrix \\(A\\) ), computes a random Nystr\u00f6m low rank approximation of \\(A\\) in factored form, i.e.
\\[ A_{\\text{nys}} = (A \\Omega)(\\Omega^T A \\Omega)^{\\dagger}(A \\Omega)^T = U \\Sigma U^T \\] where \\(\\Omega\\) is a standard normal random matrix.
PARAMETER DESCRIPTION operator
the operator to approximate
TYPE: 'TensorOperator'
rank
rank of the approximation
TYPE: int
shift_func
optional function for computing the stabilizing shift in the construction of the randomized nystroem approximation, defaults to
\\[ \\sqrt{\\operatorname{\\text{input_dim}}} \\cdot \\varepsilon(\\operatorname{\\text{input_type}}) \\cdot \\|A\\Omega\\|_2,\\] where \\(\\varepsilon(\\operatorname{\\text{input_type}})\\) is the value of the machine precision corresponding to the data type.
TYPE: Optional[Callable[[Tensor], Tensor]]
DEFAULT: None
RETURNS DESCRIPTION object containing \\(U\\) and \\(\\Sigma\\)
Source code in src/pydvl/influence/torch/functional.py
def operator_nystroem_approximation(\n operator: \"TensorOperator\",\n rank: int,\n shift_func: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,\n):\n r\"\"\"\n Given an operator (representing a symmetric positive definite\n matrix $A$ ), computes a random Nystr\u00f6m low rank approximation of\n $A$ in factored form, i.e.\n\n $$ A_{\\text{nys}} = (A \\Omega)(\\Omega^T A \\Omega)^{\\dagger}(A \\Omega)^T\n = U \\Sigma U^T $$\n\n where $\\Omega$ is a standard normal random matrix.\n\n Args:\n operator: the operator to approximate\n rank: rank of the approximation\n shift_func: optional function for computing the stabilizing shift in the\n construction of the randomized nystroem approximation, defaults to\n\n $$ \\sqrt{\\operatorname{\\text{input_dim}}} \\cdot\n \\varepsilon(\\operatorname{\\text{input_type}}) \\cdot \\|A\\Omega\\|_2,$$\n\n where $\\varepsilon(\\operatorname{\\text{input_type}})$ is the value of the\n machine precision corresponding to the data type.\n\n Returns:\n object containing, $U$ and $\\Sigma$\n \"\"\"\n\n def mat_mat_prod(x: torch.Tensor):\n return operator.apply(x.t()).t()\n\n return randomized_nystroem_approximation(\n mat_mat_prod,\n operator.input_size,\n rank,\n operator.dtype,\n shift_func=shift_func,\n mat_vec_device=operator.device,\n )\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/","title":"Influence function model","text":""},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model","title":"pydvl.influence.torch.influence_function_model","text":"This module implements several implementations of InfluenceFunctionModel utilizing PyTorch.
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.TorchInfluenceFunctionModel","title":"TorchInfluenceFunctionModel","text":"TorchInfluenceFunctionModel(\n model: Module, loss: Callable[[Tensor, Tensor], Tensor]\n)\n
Bases: InfluenceFunctionModel[Tensor, DataLoader]
, ABC
Abstract base class for influence computation related to torch models
Source code in src/pydvl/influence/torch/influence_function_model.py
def __init__(\n self,\n model: nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n):\n self.loss = loss\n self.model = model\n self._n_parameters = sum(\n [p.numel() for p in model.parameters() if p.requires_grad]\n )\n self._model_device = next(\n (p.device for p in model.parameters() if p.requires_grad)\n )\n self._model_params = {\n k: p.detach() for k, p in self.model.named_parameters() if p.requires_grad\n }\n self._model_dtype = next(\n (p.dtype for p in model.parameters() if p.requires_grad)\n )\n super().__init__()\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.TorchInfluenceFunctionModel.is_fitted","title":"is_fitted abstractmethod
property
","text":"is_fitted\n
Override this, to expose the fitting status of the instance.
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.TorchInfluenceFunctionModel.fit","title":"fit abstractmethod
","text":"fit(data: DataLoaderType) -> InfluenceFunctionModel\n
Override this method to fit the influence function model to training data, e.g. to pre-compute the Hessian matrix or matrix decompositions.
PARAMETER DESCRIPTION data
TYPE: DataLoaderType
RETURNS DESCRIPTION InfluenceFunctionModel
The fitted instance
Source code in src/pydvl/influence/base_influence_function_model.py
@abstractmethod\ndef fit(self, data: DataLoaderType) -> InfluenceFunctionModel:\n \"\"\"\n Override this method to fit the influence function model to training data,\n e.g. pre-compute hessian matrix or matrix decompositions\n\n Args:\n data:\n\n Returns:\n The fitted instance\n \"\"\"\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.TorchInfluenceFunctionModel.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.TorchInfluenceFunctionModel.influences","title":"influences","text":"influences(\n x_test: Tensor,\n y_test: Tensor,\n x: Optional[Tensor] = None,\n y: Optional[Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Tensor\n
Compute the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x))\\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. For all input tensors it is assumed that the first dimension is the batch dimension (in case you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
y_test
label tensor to compute gradients
TYPE: Tensor
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[Tensor]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[Tensor]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences(\n self,\n x_test: torch.Tensor,\n y_test: torch.Tensor,\n x: Optional[torch.Tensor] = None,\n y: Optional[torch.Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> torch.Tensor:\n r\"\"\"\n Compute the approximation of\n\n \\[\n \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x))\\rangle\n \\]\n\n for the case of up-weighting influence, resp.\n\n \\[\n \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle\n \\]\n\n for the perturbation type influence case. For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n t: torch.Tensor = super().influences(x_test, y_test, x, y, mode=mode)\n return t\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.TorchInfluenceFunctionModel.influence_factors","title":"influence_factors","text":"influence_factors(x: Tensor, y: Tensor) -> Tensor\n
Compute approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension (in case you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/torch/influence_function_model.py
def influence_factors(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:\n r\"\"\"\n Compute approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n return super().influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.TorchInfluenceFunctionModel.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: Tensor,\n x: Tensor,\n y: Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Tensor\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension (in case you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION z_test_factors
pre-computed tensor, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\)
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences_from_factors(\n self,\n z_test_factors: torch.Tensor,\n x: torch.Tensor,\n y: torch.Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> torch.Tensor:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$. For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n z_test_factors: pre-computed tensor, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if mode == InfluenceMode.Up:\n return (\n z_test_factors.to(self.model_device)\n @ self._loss_grad(x.to(self.model_device), y.to(self.model_device)).T\n )\n elif mode == InfluenceMode.Perturbation:\n return torch.einsum(\n \"ia,j...a->ij...\",\n z_test_factors.to(self.model_device),\n self._flat_loss_mixed_grad(\n x.to(self.model_device), y.to(self.model_device)\n ),\n )\n else:\n raise UnsupportedInfluenceModeException(mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence","title":"DirectInfluence","text":"DirectInfluence(\n model: Module,\n loss: LossType,\n regularization: Optional[Union[float, Dict[str, Optional[float]]]] = None,\n block_structure: Union[\n BlockMode, OrderedDict[str, List[str]]\n ] = BlockMode.FULL,\n second_order_mode: SecondOrderMode = SecondOrderMode.HESSIAN,\n)\n
Bases: TorchComposableInfluence[DirectSolveOperator]
Given a model and training data, it finds x such that \\(Hx = b\\), with \\(H\\) being the model Hessian or Gauss-Newton matrix.
PARAMETER DESCRIPTION model
The model.
TYPE: Module
loss
The loss function.
TYPE: LossType
regularization
The regularization parameter. In case a dictionary is provided, the keys must be a subset of the block identifiers.
TYPE: Optional[Union[float, Dict[str, Optional[float]]]]
DEFAULT: None
block_structure
The blocking structure, either a pre-defined enum or a custom block structure; see the information regarding block-diagonal approximation.
TYPE: Union[BlockMode, OrderedDict[str, List[str]]]
DEFAULT: FULL
second_order_mode
The second order mode, either SecondOrderMode.HESSIAN
or SecondOrderMode.GAUSS_NEWTON
.
TYPE: SecondOrderMode
DEFAULT: HESSIAN
Source code in src/pydvl/influence/torch/influence_function_model.py
def __init__(\n self,\n model: nn.Module,\n loss: LossType,\n regularization: Optional[Union[float, Dict[str, Optional[float]]]] = None,\n block_structure: Union[BlockMode, OrderedDict[str, List[str]]] = BlockMode.FULL,\n second_order_mode: SecondOrderMode = SecondOrderMode.HESSIAN,\n):\n super().__init__(\n model,\n block_structure=block_structure,\n regularization=regularization,\n )\n self.second_order_mode = second_order_mode\n self.loss = loss\n
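A usage sketch for the typical workflow (construct, fit on a DataLoader, then query influences); the toy model, data and regularization value are illustrative:

import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.influence_function_model import DirectInfluence

model = torch.nn.Linear(10, 1)
loss = torch.nn.functional.mse_loss
train_loader = DataLoader(
    TensorDataset(torch.randn(100, 10), torch.randn(100, 1)), batch_size=20
)

if_model = DirectInfluence(model, loss, regularization=0.01).fit(train_loader)

x_test, y_test = torch.randn(5, 10), torch.randn(5, 1)
x_train, y_train = torch.randn(20, 10), torch.randn(20, 1)
# element-wise scalar products: one row per test sample, one column per train sample
values = if_model.influences(x_test, y_test, x_train, y_train)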
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.fit","title":"fit","text":"fit(data: DataLoaderType) -> InfluenceFunctionModel\n
Fits to the provided data by internally creating a block mapper instance from it. Args: data: iterable of tensors
RETURNS DESCRIPTION InfluenceFunctionModel
Fitted instance
Source code in src/pydvl/influence/base_influence_function_model.py
@log_duration(log_level=logging.INFO)\ndef fit(self, data: DataLoaderType) -> InfluenceFunctionModel:\n \"\"\"\n Fitting to provided data, by internally creating a block mapper instance from\n it.\n Args:\n data: iterable of tensors\n\n Returns:\n Fitted instance\n \"\"\"\n self.block_mapper = self._create_block_mapper(data)\n return self\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: TensorType, y: TensorType) -> TensorType\n
Computes the approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/base_influence_function_model.py
def influence_factors(self, x: TensorType, y: TensorType) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return self._influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.influences","title":"influences","text":"influences(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computes the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{test}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
def influences(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n if x is None and y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n\n if x is not None and y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n\n return self._influences(x_test, y_test, x, y, mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors(\n self,\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n tensors = self.block_mapper.generate_interactions_from_transformed_grads(\n z_test_factors,\n self._create_batch(x, y),\n mode,\n )\n result: TensorType = next(tensors)\n for tensor in tensors:\n result = result + tensor\n return result\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.influences_by_block","title":"influences_by_block","text":"influences_by_block(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise influence values for the provided data, i.e. an approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of the approximation of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{test}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_by_block(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise influence values for the provided data, i.e. an\n approximation of\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of the approximation of\n $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block.\n\n \"\"\"\n left_batch = self._create_batch(x_test, y_test)\n\n if x is None:\n if y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n right_batch = left_batch\n else:\n if y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n right_batch = self._create_batch(x, y)\n\n return self.block_mapper.interactions(left_batch, right_batch, mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.influence_factors_by_block","title":"influence_factors_by_block","text":"influence_factors_by_block(\n x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise
OrderedDict[str, TensorType]
approximate inverse Hessian matrix vector products per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influence_factors_by_block(\n self, x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Ordered dictionary of tensors representing the element-wise\n approximate inverse Hessian matrix vector products per block.\n\n \"\"\"\n return self.block_mapper.transformed_grads(self._create_batch(x, y))\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.influences_from_factors_by_block","title":"influences_from_factors_by_block","text":"influences_from_factors_by_block(\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Block-wise computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: OrderedDict[str, TensorType]
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors_by_block(\n self,\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Block-wise computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block\n\n \"\"\"\n return self.block_mapper.interactions_from_transformed_grads(\n z_test_factors, self._create_batch(x, y), mode\n )\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.with_regularization","title":"with_regularization","text":"with_regularization(\n regularization: Union[float, Dict[str, Optional[float]]]\n) -> TorchComposableInfluence\n
Update the regularization parameter. Args: regularization: Either a positive float or a dictionary with the block names as keys and the regularization values as values.
RETURNS DESCRIPTION TorchComposableInfluence
The modified instance
Source code in src/pydvl/influence/torch/influence_function_model.py
def with_regularization(\n self, regularization: Union[float, Dict[str, Optional[float]]]\n) -> TorchComposableInfluence:\n \"\"\"\n Update the regularization parameter.\n Args:\n regularization: Either a positive float or a dictionary with the\n block names as keys and the regularization values as values.\n\n Returns:\n The modified instance\n\n \"\"\"\n self._regularization_dict = self._build_regularization_dict(regularization)\n for k, reg in self._regularization_dict.items():\n self.block_mapper.composable_block_dict[k].op.regularization = reg\n return self\n
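A short sketch of updating the regularization on an already fitted instance (the value and the block names in the comment are illustrative assumptions):
# `if_model` is assumed to be a fitted DirectInfluence instance, e.g. as above.
if_model = if_model.with_regularization(0.1)  # same value for every block
# Per-block values keyed by (illustrative) block names are also possible:
# if_model = if_model.with_regularization({'layer_0': 0.1, 'layer_1': None})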
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.CgInfluence","title":"CgInfluence","text":"CgInfluence(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n hessian_regularization: float = 0.0,\n x0: Optional[Tensor] = None,\n rtol: float = 1e-07,\n atol: float = 1e-07,\n maxiter: Optional[int] = None,\n progress: bool = False,\n precompute_grad: bool = False,\n pre_conditioner: Optional[PreConditioner] = None,\n use_block_cg: bool = False,\n warn_on_max_iteration: bool = True,\n)\n
Bases: TorchInfluenceFunctionModel
Given a model and training data, it uses the conjugate gradient method to approximate inverse-Hessian vector products. More precisely, it finds x such that \(Hx = b\), with \(H\) being the model Hessian. For more info, see Conjugate Gradient.
PARAMETER DESCRIPTION model
A PyTorch model. The Hessian will be calculated with respect to this model's parameters.
TYPE: Module
loss
A callable that takes the model's output and target as input and returns the scalar loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
hessian_regularization
Optional regularization parameter added to the Hessian-vector product for numerical stability.
TYPE: float
DEFAULT: 0.0
x0
Initial guess for hvp. If None, defaults to b.
TYPE: Optional[Tensor]
DEFAULT: None
rtol
Maximum relative tolerance of result.
TYPE: float
DEFAULT: 1e-07
atol
Absolute tolerance of result.
TYPE: float
DEFAULT: 1e-07
maxiter
Maximum number of iterations. If None, defaults to 10*len(b).
TYPE: Optional[int]
DEFAULT: None
progress
If True, display progress bars when computing in the non-block mode (use_block_cg=False).
TYPE: bool
DEFAULT: False
precompute_grad
If True, the full data gradient is precomputed and kept in memory, which can speed up the Hessian-vector product computation. Set this to False if you cannot afford to keep the full computation graph in memory.
TYPE: bool
DEFAULT: False
pre_conditioner
Optional pre-conditioner to improve convergence of conjugate gradient method
TYPE: Optional[PreConditioner]
DEFAULT: None
use_block_cg
If True, use the block variant of the conjugate gradient method, which solves several right-hand sides simultaneously
TYPE: bool
DEFAULT: False
warn_on_max_iteration
If True, logs a warning if the desired tolerance is not achieved within maxiter iterations. If False, this information is logged at level logging.DEBUG instead.
TYPE: bool
DEFAULT: True
Source code in src/pydvl/influence/torch/influence_function_model.py
def __init__(\n self,\n model: nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n hessian_regularization: float = 0.0,\n x0: Optional[torch.Tensor] = None,\n rtol: float = 1e-7,\n atol: float = 1e-7,\n maxiter: Optional[int] = None,\n progress: bool = False,\n precompute_grad: bool = False,\n pre_conditioner: Optional[PreConditioner] = None,\n use_block_cg: bool = False,\n warn_on_max_iteration: bool = True,\n):\n super().__init__(model, loss)\n self.warn_on_max_iteration = warn_on_max_iteration\n self.use_block_cg = use_block_cg\n self.pre_conditioner = pre_conditioner\n self.precompute_grad = precompute_grad\n self.progress = progress\n self.maxiter = maxiter\n self.atol = atol\n self.rtol = rtol\n self.x0 = x0\n self.hessian_regularization = hessian_regularization\n
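A minimal usage sketch (the toy model, data and loss are illustrative assumptions; only the constructor arguments and methods documented here are used):
import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.influence_function_model import CgInfluence

# Toy regression setup, for illustration only.
model = torch.nn.Linear(5, 1)
loss = torch.nn.functional.mse_loss
x_train, y_train = torch.randn(32, 5), torch.randn(32, 1)
x_test, y_test = torch.randn(8, 5), torch.randn(8, 1)

if_model = CgInfluence(
    model,
    loss,
    hessian_regularization=0.01,  # stabilizes the Hessian-vector products
    rtol=1e-5,
    atol=1e-7,
    use_block_cg=True,  # solve several right-hand sides simultaneously
)
if_model = if_model.fit(DataLoader(TensorDataset(x_train, y_train), batch_size=8))

# Up-weighting influence of every training point on every test point.
values = if_model.influences(x_test, y_test, x_train, y_train)
print(values.shape)  # (8, 32): test points x training points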
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.CgInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.CgInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: Tensor, y: Tensor) -> Tensor\n
Compute approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed, that the first dimension is the batch dimension (in case, you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/torch/influence_function_model.py
def influence_factors(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:\n r\"\"\"\n Compute approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n return super().influence_factors(x, y)\n
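For example, a single sample must be given a batch dimension first (a sketch; `if_model`, `x_test` and `y_test` are assumed to come from the usage example above):
z_x, z_y = x_test[0], y_test[0]  # a single sample without batch dimension
factors = if_model.influence_factors(z_x.unsqueeze(0), z_y.unsqueeze(0))
# `factors` holds one row of inverse Hessian-vector products for the single sample.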
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.CgInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: Tensor,\n x: Tensor,\n y: Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Tensor\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed, that the first dimension is the batch dimension (in case, you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION z_test_factors
pre-computed tensor, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\)
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences_from_factors(\n self,\n z_test_factors: torch.Tensor,\n x: torch.Tensor,\n y: torch.Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> torch.Tensor:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$. For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n z_test_factors: pre-computed tensor, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if mode == InfluenceMode.Up:\n return (\n z_test_factors.to(self.model_device)\n @ self._loss_grad(x.to(self.model_device), y.to(self.model_device)).T\n )\n elif mode == InfluenceMode.Perturbation:\n return torch.einsum(\n \"ia,j...a->ij...\",\n z_test_factors.to(self.model_device),\n self._flat_loss_mixed_grad(\n x.to(self.model_device), y.to(self.model_device)\n ),\n )\n else:\n raise UnsupportedInfluenceModeException(mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.CgInfluence.influences","title":"influences","text":"influences(\n x_test: Tensor,\n y_test: Tensor,\n x: Optional[Tensor] = None,\n y: Optional[Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Tensor\n
Compute an approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle, \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of perturbation-type influence. The approximate action of \\(H^{-1}\\) is achieved via the conjugate gradient method.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
y_test
label tensor to compute gradients
TYPE: Tensor
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[Tensor]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[Tensor]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Tensor
A tensor representing the element-wise scalar products for the provided batch.
Source code in src/pydvl/influence/torch/influence_function_model.py
@log_duration\ndef influences(\n self,\n x_test: torch.Tensor,\n y_test: torch.Tensor,\n x: Optional[torch.Tensor] = None,\n y: Optional[torch.Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> torch.Tensor:\n r\"\"\"\n Compute an approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle, \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of perturbation-type influence. The approximate action of\n $H^{-1}$ is achieved via the [conjugate gradient\n method](https://en.wikipedia.org/wiki/Conjugate_gradient_method).\n\n Args:\n x_test: model input to use in the gradient computations of\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n A tensor representing the element-wise scalar products for the\n provided batch.\n\n \"\"\"\n return super().influences(x_test, y_test, x, y, mode=mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence","title":"LissaInfluence","text":"LissaInfluence(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n regularization: Optional[Union[float, Dict[str, Optional[float]]]] = None,\n maxiter: int = 1000,\n dampen: float = 0.0,\n scale: float = 10.0,\n rtol: float = 0.0001,\n progress: bool = False,\n warn_on_max_iteration: bool = True,\n block_structure: Union[\n BlockMode, OrderedDict[str, List[str]]\n ] = BlockMode.FULL,\n second_order_mode: SecondOrderMode = SecondOrderMode.HESSIAN,\n)\n
Bases: TorchComposableInfluence[LissaOperator[BatchOperationType]]
Uses LiSSA, the Linear time Stochastic Second-Order Algorithm, to iteratively approximate the inverse Hessian. More precisely, it finds x such that \(Hx = b\), with \(H\) being the model's second derivative with respect to the parameters. This is done with the update
\[H^{-1}_{j+1} b = b + \left((1 - d)I - \frac{H}{s}\right) H^{-1}_j b,\] where \(I\) is the identity matrix, \(d\) is a dampening term and \(s\) a scaling factor, both applied to help convergence. For details, see Linear time Stochastic Second-Order Approximation (LiSSA)
PARAMETER DESCRIPTION model
A PyTorch model. The Hessian will be calculated with respect to this model's parameters.
TYPE: Module
loss
A callable that takes the model's output and target as input and returns the scalar loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
regularization
Optional regularization parameter added to the Hessian-vector product for numerical stability.
TYPE: Optional[Union[float, Dict[str, Optional[float]]]]
DEFAULT: None
maxiter
Maximum number of iterations.
TYPE: int
DEFAULT: 1000
dampen
Dampening factor, defaults to 0 for no dampening.
TYPE: float
DEFAULT: 0.0
scale
Scaling factor, defaults to 10.
TYPE: float
DEFAULT: 10.0
rtol
tolerance to use for early stopping
TYPE: float
DEFAULT: 0.0001
progress
If True, display progress bars.
TYPE: bool
DEFAULT: False
warn_on_max_iteration
If True, logs a warning if the desired tolerance is not achieved within maxiter iterations. If False, this information is logged at level logging.DEBUG instead.
TYPE: bool
DEFAULT: True
block_structure
The blocking structure, either a pre-defined enum or a custom block structure, see the information regarding block-diagonal approximation.
TYPE: Union[BlockMode, OrderedDict[str, List[str]]]
DEFAULT: FULL
second_order_mode
The second order mode, either SecondOrderMode.HESSIAN
or SecondOrderMode.GAUSS_NEWTON
.
TYPE: SecondOrderMode
DEFAULT: HESSIAN
Source code in src/pydvl/influence/torch/influence_function_model.py
def __init__(\n self,\n model: nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n regularization: Optional[Union[float, Dict[str, Optional[float]]]] = None,\n maxiter: int = 1000,\n dampen: float = 0.0,\n scale: float = 10.0,\n rtol: float = 1e-4,\n progress: bool = False,\n warn_on_max_iteration: bool = True,\n block_structure: Union[BlockMode, OrderedDict[str, List[str]]] = BlockMode.FULL,\n second_order_mode: SecondOrderMode = SecondOrderMode.HESSIAN,\n):\n super().__init__(model, block_structure, regularization)\n self.maxiter = maxiter\n self.progress = progress\n self.rtol = rtol\n self.scale = scale\n self.dampen = dampen\n self.loss = loss\n self.second_order_mode = second_order_mode\n self.warn_on_max_iteration = warn_on_max_iteration\n
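A minimal usage sketch (the toy model, data, loss and the chosen hyperparameters are illustrative assumptions):
import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.influence_function_model import LissaInfluence

# Toy regression setup, for illustration only.
model = torch.nn.Linear(5, 1)
loss = torch.nn.functional.mse_loss
x_train, y_train = torch.randn(64, 5), torch.randn(64, 1)
x_test, y_test = torch.randn(8, 5), torch.randn(8, 1)

if_model = LissaInfluence(
    model,
    loss,
    regularization=0.01,
    maxiter=500,
    scale=10.0,  # usually chosen larger than the top Hessian eigenvalue
    rtol=1e-4,
)
if_model = if_model.fit(DataLoader(TensorDataset(x_train, y_train), batch_size=16))
values = if_model.influences(x_test, y_test, x_train, y_train)  # shape (8, 64)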
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.fit","title":"fit","text":"fit(data: DataLoaderType) -> InfluenceFunctionModel\n
Fits to the provided data by internally creating a block mapper instance from it. PARAMETER DESCRIPTION data iterable of tensors TYPE: DataLoaderType
RETURNS DESCRIPTION InfluenceFunctionModel
Fitted instance
Source code in src/pydvl/influence/base_influence_function_model.py
@log_duration(log_level=logging.INFO)\ndef fit(self, data: DataLoaderType) -> InfluenceFunctionModel:\n \"\"\"\n Fitting to provided data, by internally creating a block mapper instance from\n it.\n Args:\n data: iterable of tensors\n\n Returns:\n Fitted instance\n \"\"\"\n self.block_mapper = self._create_block_mapper(data)\n return self\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: TensorType, y: TensorType) -> TensorType\n
Computes the approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed, that the first dimension is the batch dimension.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/base_influence_function_model.py
def influence_factors(self, x: TensorType, y: TensorType) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return self._influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.influences","title":"influences","text":"influences(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computes the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \(H^{-1}\nabla_{\theta} \ell(y_{test}, f_{\theta}(x_{test}))\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \(\nabla_{\theta}\ell(y, f_{\theta}(x))\), resp. \(\nabla_{x}\nabla_{\theta}\ell(y, f_{\theta}(x))\), if None, use \(x=x_{test}\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
def influences(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n if x is None and y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n\n if x is not None and y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n\n return self._influences(x_test, y_test, x, y, mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
x
model input to use in the gradient computations \(\nabla_{\theta}\ell(y, f_{\theta}(x))\), resp. \(\nabla_{x}\nabla_{\theta}\ell(y, f_{\theta}(x))\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors(\n self,\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n tensors = self.block_mapper.generate_interactions_from_transformed_grads(\n z_test_factors,\n self._create_batch(x, y),\n mode,\n )\n result: TensorType = next(tensors)\n for tensor in tensors:\n result = result + tensor\n return result\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.influences_by_block","title":"influences_by_block","text":"influences_by_block(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise influence values for the provided data, i.e. an approximation of
\\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of the approximation of \(H^{-1}\nabla_{\theta} \ell(y_{test}, f_{\theta}(x_{test}))\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \(\nabla_{\theta}\ell(y, f_{\theta}(x))\), resp. \(\nabla_{x}\nabla_{\theta}\ell(y, f_{\theta}(x))\), if None, use \(x=x_{test}\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_by_block(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise influence values for the provided data, i.e. an\n approximation of\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of the approximation of\n $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block.\n\n \"\"\"\n left_batch = self._create_batch(x_test, y_test)\n\n if x is None:\n if y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n right_batch = left_batch\n else:\n if y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n right_batch = self._create_batch(x, y)\n\n return self.block_mapper.interactions(left_batch, right_batch, mode)\n
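A short sketch of the block-wise variant (assuming `if_model`, `x_train`, `y_train`, `x_test` and `y_test` from the usage example above):
values_per_block = if_model.influences_by_block(x_test, y_test, x_train, y_train)
for name, block_values in values_per_block.items():
    print(name, block_values.shape)  # one (n_test, n_train) tensor per block
# Summing the per-block values recovers the block-diagonal approximation of the
# full influence values.
total = sum(values_per_block.values())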
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.influence_factors_by_block","title":"influence_factors_by_block","text":"influence_factors_by_block(\n x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise
OrderedDict[str, TensorType]
approximate inverse Hessian matrix vector products per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influence_factors_by_block(\n self, x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Ordered dictionary of tensors representing the element-wise\n approximate inverse Hessian matrix vector products per block.\n\n \"\"\"\n return self.block_mapper.transformed_grads(self._create_batch(x, y))\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.influences_from_factors_by_block","title":"influences_from_factors_by_block","text":"influences_from_factors_by_block(\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Block-wise computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: OrderedDict[str, TensorType]
x
model input to use in the gradient computations \(\nabla_{\theta}\ell(y, f_{\theta}(x))\), resp. \(\nabla_{x}\nabla_{\theta}\ell(y, f_{\theta}(x))\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors_by_block(\n self,\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Block-wise computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block\n\n \"\"\"\n return self.block_mapper.interactions_from_transformed_grads(\n z_test_factors, self._create_batch(x, y), mode\n )\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.with_regularization","title":"with_regularization","text":"with_regularization(\n regularization: Union[float, Dict[str, Optional[float]]]\n) -> TorchComposableInfluence\n
Update the regularization parameter. PARAMETER DESCRIPTION regularization Either a positive float or a dictionary with the block names as keys and the regularization values as values. TYPE: Union[float, Dict[str, Optional[float]]]
RETURNS DESCRIPTION TorchComposableInfluence
The modified instance
Source code in src/pydvl/influence/torch/influence_function_model.py
def with_regularization(\n self, regularization: Union[float, Dict[str, Optional[float]]]\n) -> TorchComposableInfluence:\n \"\"\"\n Update the regularization parameter.\n Args:\n regularization: Either a positive float or a dictionary with the\n block names as keys and the regularization values as values.\n\n Returns:\n The modified instance\n\n \"\"\"\n self._regularization_dict = self._build_regularization_dict(regularization)\n for k, reg in self._regularization_dict.items():\n self.block_mapper.composable_block_dict[k].op.regularization = reg\n return self\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.ArnoldiInfluence","title":"ArnoldiInfluence","text":"ArnoldiInfluence(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n hessian_regularization: float = 0.0,\n rank_estimate: int = 10,\n krylov_dimension: Optional[int] = None,\n tol: float = 1e-06,\n max_iter: Optional[int] = None,\n eigen_computation_on_gpu: bool = False,\n precompute_grad: bool = False,\n)\n
Bases: TorchInfluenceFunctionModel
Solves the linear system Hx = b, where H is the Hessian of the model's loss function and b is the given right-hand side vector. It employs the implicitly restarted Arnoldi method (https://en.wikipedia.org/wiki/Arnoldi_iteration) to compute a partial eigendecomposition, which is used for the inversion, i.e.
\\[x = V D^{-1} V^T b\\] where \\(D\\) is a diagonal matrix with the top (in absolute value) rank_estimate
eigenvalues of the Hessian and \\(V\\) contains the corresponding eigenvectors. For more information, see Arnoldi.
PARAMETER DESCRIPTION model
A PyTorch model. The Hessian will be calculated with respect to this model's parameters.
TYPE: Module
loss
A callable that takes the model's output and target as input and returns the scalar loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
hessian_regularization
Optional regularization parameter added to the Hessian-vector product for numerical stability.
TYPE: float
DEFAULT: 0.0
rank_estimate
The number of eigenvalues and corresponding eigenvectors to compute. Represents the desired rank of the Hessian approximation.
TYPE: int
DEFAULT: 10
krylov_dimension
The number of Krylov vectors to use for the Lanczos method. Defaults to min(model's number of parameters, max(2 times rank_estimate + 1, 20)).
TYPE: Optional[int]
DEFAULT: None
tol
The stopping criterion for the Lanczos algorithm. Ignored if low_rank_representation
is provided.
TYPE: float
DEFAULT: 1e-06
max_iter
The maximum number of iterations for the Lanczos method. Ignored if low_rank_representation
is provided.
TYPE: Optional[int]
DEFAULT: None
eigen_computation_on_gpu
If True, tries to execute the eigen pair approximation on the model's device via a cupy implementation. Ensure the model size or rank_estimate is appropriate for device memory. If False, the eigen pair approximation is executed on the CPU by the scipy wrapper to ARPACK.
TYPE: bool
DEFAULT: False
precompute_grad
If True, the full data gradient is precomputed and kept in memory, which can speed up the Hessian-vector product computation. Set this to False if you cannot afford to keep the full computation graph in memory.
TYPE: bool
DEFAULT: False
Source code in src/pydvl/influence/torch/influence_function_model.py
def __init__(\n self,\n model: nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n hessian_regularization: float = 0.0,\n rank_estimate: int = 10,\n krylov_dimension: Optional[int] = None,\n tol: float = 1e-6,\n max_iter: Optional[int] = None,\n eigen_computation_on_gpu: bool = False,\n precompute_grad: bool = False,\n):\n super().__init__(model, loss)\n self.hessian_regularization = hessian_regularization\n self.rank_estimate = rank_estimate\n self.tol = tol\n self.max_iter = max_iter\n self.krylov_dimension = krylov_dimension\n self.eigen_computation_on_gpu = eigen_computation_on_gpu\n self.precompute_grad = precompute_grad\n
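A minimal usage sketch (the toy model, data, loss and rank_estimate are illustrative assumptions):
import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.influence_function_model import ArnoldiInfluence

# Toy regression setup, for illustration only.
model = torch.nn.Linear(10, 1)
loss = torch.nn.functional.mse_loss
x_train, y_train = torch.randn(128, 10), torch.randn(128, 1)
x_test, y_test = torch.randn(8, 10), torch.randn(8, 1)

if_model = ArnoldiInfluence(
    model,
    loss,
    hessian_regularization=0.01,
    rank_estimate=5,  # keep only the 5 largest (in absolute value) eigenpairs
)
# fit() computes the low-rank decomposition V D^{-1} V^T of the Hessian.
if_model = if_model.fit(DataLoader(TensorDataset(x_train, y_train), batch_size=32))
values = if_model.influences(x_test, y_test, x_train, y_train)  # shape (8, 128)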
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.ArnoldiInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.ArnoldiInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: Tensor, y: Tensor) -> Tensor\n
Compute approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed, that the first dimension is the batch dimension (in case, you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/torch/influence_function_model.py
def influence_factors(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:\n r\"\"\"\n Compute approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n return super().influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.ArnoldiInfluence.influences","title":"influences","text":"influences(\n x_test: Tensor,\n y_test: Tensor,\n x: Optional[Tensor] = None,\n y: Optional[Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Tensor\n
Compute the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x))\\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. For all input tensors it is assumed, that the first dimension is the batch dimension (in case, you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
y_test
label tensor to compute gradients
TYPE: Tensor
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[Tensor]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[Tensor]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences(\n self,\n x_test: torch.Tensor,\n y_test: torch.Tensor,\n x: Optional[torch.Tensor] = None,\n y: Optional[torch.Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> torch.Tensor:\n r\"\"\"\n Compute the approximation of\n\n \\[\n \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x))\\rangle\n \\]\n\n for the case of up-weighting influence, resp.\n\n \\[\n \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle\n \\]\n\n for the perturbation type influence case. For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n t: torch.Tensor = super().influences(x_test, y_test, x, y, mode=mode)\n return t\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.ArnoldiInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: Tensor,\n x: Tensor,\n y: Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Tensor\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed, that the first dimension is the batch dimension (in case, you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION z_test_factors
pre-computed tensor, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\)
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences_from_factors(\n self,\n z_test_factors: torch.Tensor,\n x: torch.Tensor,\n y: torch.Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> torch.Tensor:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$. For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n z_test_factors: pre-computed tensor, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if mode == InfluenceMode.Up:\n return (\n z_test_factors.to(self.model_device)\n @ self._loss_grad(x.to(self.model_device), y.to(self.model_device)).T\n )\n elif mode == InfluenceMode.Perturbation:\n return torch.einsum(\n \"ia,j...a->ij...\",\n z_test_factors.to(self.model_device),\n self._flat_loss_mixed_grad(\n x.to(self.model_device), y.to(self.model_device)\n ),\n )\n else:\n raise UnsupportedInfluenceModeException(mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.ArnoldiInfluence.fit","title":"fit","text":"fit(data: DataLoader) -> ArnoldiInfluence\n
Fitting corresponds to the computation of the low rank decomposition
\\[ V D^{-1} V^T \\] of the Hessian defined by the provided data loader.
PARAMETER DESCRIPTION data
The data to compute the Hessian with.
TYPE: DataLoader
RETURNS DESCRIPTION ArnoldiInfluence
The fitted instance.
Source code in src/pydvl/influence/torch/influence_function_model.py
@log_duration(log_level=logging.INFO)\ndef fit(self, data: DataLoader) -> ArnoldiInfluence:\n r\"\"\"\n Fitting corresponds to the computation of the low rank decomposition\n\n \\[ V D^{-1} V^T \\]\n\n of the Hessian defined by the provided data loader.\n\n Args:\n data: The data to compute the Hessian with.\n\n Returns:\n The fitted instance.\n\n \"\"\"\n low_rank_representation = model_hessian_low_rank(\n self.model,\n self.loss,\n data,\n hessian_perturbation=0.0, # regularization is applied, when computing values\n rank_estimate=self.rank_estimate,\n krylov_dimension=self.krylov_dimension,\n tol=self.tol,\n max_iter=self.max_iter,\n eigen_computation_on_gpu=self.eigen_computation_on_gpu,\n precompute_grad=self.precompute_grad,\n )\n self.low_rank_representation = low_rank_representation.to(self.model_device)\n return self\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence","title":"EkfacInfluence","text":"EkfacInfluence(\n model: Module,\n update_diagonal: bool = False,\n hessian_regularization: float = 0.0,\n progress: bool = False,\n)\n
Bases: TorchInfluenceFunctionModel
Approximately solves the linear system Hx = b, where H is the Hessian of a model with the empirical categorical cross-entropy as loss function and b is the given right-hand side vector. It employs the EK-FAC method, which is based on the Kronecker factorization of the Hessian.
In contrast to the other influence function methods, this implementation can only be used for classification tasks with a cross-entropy loss function. However, it is much faster than the other methods and can be used efficiently for very large datasets and models. For more information, see Eigenvalue Corrected K-FAC.
PARAMETER DESCRIPTION model
A PyTorch model. The Hessian will be calculated with respect to this model's parameters.
TYPE: Module
update_diagonal
If True, the diagonal values in the EK-FAC representation are refitted from the training data after calculating the KFAC blocks. This provides a more accurate approximation of the Hessian, but it is computationally more expensive.
TYPE: bool
DEFAULT: False
hessian_regularization
Regularization of the Hessian.
TYPE: float
DEFAULT: 0.0
progress
If True, display progress bars.
TYPE: bool
DEFAULT: False
Source code in src/pydvl/influence/torch/influence_function_model.py
def __init__(\n self,\n model: nn.Module,\n update_diagonal: bool = False,\n hessian_regularization: float = 0.0,\n progress: bool = False,\n):\n super().__init__(model, torch.nn.functional.cross_entropy)\n self.hessian_regularization = hessian_regularization\n self.update_diagonal = update_diagonal\n self.active_layers = self._parse_active_layers()\n self.progress = progress\n
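A minimal usage sketch for a classification task (the toy model and data are illustrative assumptions; the loss is fixed to cross-entropy by the method):
import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.influence_function_model import EkfacInfluence

# Toy 3-class classification setup, for illustration only.
model = torch.nn.Sequential(
    torch.nn.Linear(20, 16), torch.nn.ReLU(), torch.nn.Linear(16, 3)
)
x_train, y_train = torch.randn(256, 20), torch.randint(0, 3, (256,))
x_test, y_test = torch.randn(16, 20), torch.randint(0, 3, (16,))

# No loss argument: EK-FAC always uses categorical cross-entropy.
if_model = EkfacInfluence(model, update_diagonal=True, hessian_regularization=0.01)
if_model = if_model.fit(DataLoader(TensorDataset(x_train, y_train), batch_size=32))
values = if_model.influences(x_test, y_test, x_train, y_train)  # shape (16, 256)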
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: Tensor, y: Tensor) -> Tensor\n
Compute approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed, that the first dimension is the batch dimension (in case, you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/torch/influence_function_model.py
def influence_factors(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:\n r\"\"\"\n Compute approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n return super().influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.influences","title":"influences","text":"influences(\n x_test: Tensor,\n y_test: Tensor,\n x: Optional[Tensor] = None,\n y: Optional[Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Tensor\n
Compute the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x))\\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. For all input tensors it is assumed, that the first dimension is the batch dimension (in case, you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
y_test
label tensor to compute gradients
TYPE: Tensor
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[Tensor]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[Tensor]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences(\n self,\n x_test: torch.Tensor,\n y_test: torch.Tensor,\n x: Optional[torch.Tensor] = None,\n y: Optional[torch.Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> torch.Tensor:\n r\"\"\"\n Compute the approximation of\n\n \\[\n \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x))\\rangle\n \\]\n\n for the case of up-weighting influence, resp.\n\n \\[\n \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle\n \\]\n\n for the perturbation type influence case. For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n t: torch.Tensor = super().influences(x_test, y_test, x, y, mode=mode)\n return t\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: Tensor,\n x: Tensor,\n y: Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Tensor\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed, that the first dimension is the batch dimension (in case, you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION z_test_factors
pre-computed tensor, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\)
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences_from_factors(\n self,\n z_test_factors: torch.Tensor,\n x: torch.Tensor,\n y: torch.Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> torch.Tensor:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$. For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n z_test_factors: pre-computed tensor, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if mode == InfluenceMode.Up:\n return (\n z_test_factors.to(self.model_device)\n @ self._loss_grad(x.to(self.model_device), y.to(self.model_device)).T\n )\n elif mode == InfluenceMode.Perturbation:\n return torch.einsum(\n \"ia,j...a->ij...\",\n z_test_factors.to(self.model_device),\n self._flat_loss_mixed_grad(\n x.to(self.model_device), y.to(self.model_device)\n ),\n )\n else:\n raise UnsupportedInfluenceModeException(mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.fit","title":"fit","text":"fit(data: DataLoader) -> EkfacInfluence\n
Computes the KFAC blocks for each layer of the model using the provided data, then creates an EkfacRepresentation object that stores the KFAC blocks for each layer, their eigenvalue decompositions and diagonal values.
Source code in src/pydvl/influence/torch/influence_function_model.py
@log_duration(log_level=logging.INFO)\ndef fit(self, data: DataLoader) -> EkfacInfluence:\n \"\"\"\n Compute the KFAC blocks for each layer of the model, using the provided data.\n It then creates an EkfacRepresentation object that stores the KFAC blocks for\n each layer, their eigenvalue decomposition and diagonal values.\n \"\"\"\n forward_x, grad_y = self._get_kfac_blocks(data)\n layers_evecs_a = {}\n layers_evect_g = {}\n layers_diags = {}\n for key in self.active_layers.keys():\n evals_a, evecs_a = safe_torch_linalg_eigh(forward_x[key])\n evals_g, evecs_g = safe_torch_linalg_eigh(grad_y[key])\n layers_evecs_a[key] = evecs_a\n layers_evect_g[key] = evecs_g\n layers_diags[key] = torch.kron(evals_g.view(-1, 1), evals_a.view(-1, 1))\n\n self.ekfac_representation = EkfacRepresentation(\n self.active_layers.keys(),\n self.active_layers.values(),\n layers_evecs_a.values(),\n layers_evect_g.values(),\n layers_diags.values(),\n )\n if self.update_diagonal:\n self._update_diag(data)\n return self\n
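As a usage illustration (not part of the generated reference), the following minimal sketch fits an EkfacInfluence instance and computes up-weighting influences. The model, data and the hessian_regularization constructor argument are assumptions for the sake of the example (EK-FAC operates layer-wise on torch.nn.Linear layers); check the class documentation for the exact constructor signature.

import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.influence_function_model import EkfacInfluence

# Toy classifier built from linear layers (EK-FAC is defined layer-wise).
model = torch.nn.Sequential(
    torch.nn.Linear(10, 32), torch.nn.ReLU(), torch.nn.Linear(32, 3)
)

x_train = torch.randn(100, 10)
y_train = torch.randint(0, 3, (100,))
train_loader = DataLoader(TensorDataset(x_train, y_train), batch_size=20)

# `hessian_regularization` is an assumed constructor argument for this sketch.
if_model = EkfacInfluence(model, hessian_regularization=1e-3)
if_model = if_model.fit(train_loader)

x_test = torch.randn(8, 10)
y_test = torch.randint(0, 3, (8,))
# Up-weighting influences of 16 training points on the 8 test points.
values = if_model.influences(x_test, y_test, x_train[:16], y_train[:16])
print(values.shape)  # expected: torch.Size([8, 16])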
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.influences_by_layer","title":"influences_by_layer","text":"influences_by_layer(\n x_test: Tensor,\n y_test: Tensor,\n x: Optional[Tensor] = None,\n y: Optional[Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Dict[str, Tensor]\n
Compute the influence of the data on the test data for each layer of the model.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
y_test
label tensor to compute gradients
TYPE: Tensor
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[Tensor]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[Tensor]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary containing the influence of the data on the test data for each
Dict[str, Tensor]
layer of the model, with the layer name as key.
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences_by_layer(\n self,\n x_test: torch.Tensor,\n y_test: torch.Tensor,\n x: Optional[torch.Tensor] = None,\n y: Optional[torch.Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Dict[str, torch.Tensor]:\n r\"\"\"\n Compute the influence of the data on the test data for each layer of the model.\n\n Args:\n x_test: model input to use in the gradient computations of\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n A dictionary containing the influence of the data on the test data for each\n layer of the model, with the layer name as key.\n \"\"\"\n if not self.is_fitted:\n raise ValueError(\n \"Instance must be fitted before calling influence methods on it\"\n )\n\n if x is None:\n if y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n\n return self._symmetric_values_by_layer(\n x_test.to(self.model_device),\n y_test.to(self.model_device),\n mode,\n )\n\n if y is None:\n raise ValueError(\n \"Providing model input x without providing labels y is not supported\"\n )\n\n return self._non_symmetric_values_by_layer(\n x_test.to(self.model_device),\n y_test.to(self.model_device),\n x.to(self.model_device),\n y.to(self.model_device),\n mode,\n )\n
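Continuing the sketch above, the per-layer variant returns a dictionary keyed by layer name; summing its values should correspond to the aggregate influences, an assumption based on the layer-wise decomposition.

# Per-layer influences: a dict mapping layer names to (8, 16) tensors.
by_layer = if_model.influences_by_layer(x_test, y_test, x_train[:16], y_train[:16])
for name, tensor in by_layer.items():
    print(name, tuple(tensor.shape))

# Aggregating over layers (assumed to match `influences` up to numerical error).
total = sum(by_layer.values())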
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.influence_factors_by_layer","title":"influence_factors_by_layer","text":"influence_factors_by_layer(x: Tensor, y: Tensor) -> Dict[str, Tensor]\n
Computes the approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] for each layer of the model separately.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary containing the influence factors for each layer of the model,
Dict[str, Tensor]
with the layer name as key.
Source code in src/pydvl/influence/torch/influence_function_model.py
def influence_factors_by_layer(\n self,\n x: torch.Tensor,\n y: torch.Tensor,\n) -> Dict[str, torch.Tensor]:\n r\"\"\"\n Computes the approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n for each layer of the model separately.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n A dictionary containing the influence factors for each layer of the model,\n with the layer name as key.\n \"\"\"\n if not self.is_fitted:\n raise ValueError(\n \"Instance must be fitted before calling influence methods on it\"\n )\n\n return self._solve_hvp_by_layer(\n self._loss_grad(x.to(self.model_device), y.to(self.model_device)),\n self.ekfac_representation,\n self.hessian_regularization,\n )\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.influences_from_factors_by_layer","title":"influences_from_factors_by_layer","text":"influences_from_factors_by_layer(\n z_test_factors: Dict[str, Tensor],\n x: Tensor,\n y: Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Dict[str, Tensor]\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case for each layer of the model separately. The gradients are meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed tensor, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Dict[str, Tensor]
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\)
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary containing the influence of the data on the test data
Dict[str, Tensor]
for each layer of the model, with the layer name as key.
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences_from_factors_by_layer(\n self,\n z_test_factors: Dict[str, torch.Tensor],\n x: torch.Tensor,\n y: torch.Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Dict[str, torch.Tensor]:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case for each layer of the model\n separately. The gradients are meant to be per sample of the batch $(x,\n y)$.\n\n Args:\n z_test_factors: pre-computed tensor, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n A dictionary containing the influence of the data on the test data\n for each layer of the model, with the layer name as key.\n \"\"\"\n if mode == InfluenceMode.Up:\n total_grad = self._loss_grad(\n x.to(self.model_device), y.to(self.model_device)\n )\n start_idx = 0\n influences = {}\n for layer_id, layer_z_test in z_test_factors.items():\n end_idx = start_idx + layer_z_test.shape[1]\n influences[layer_id] = (\n layer_z_test.to(self.model_device)\n @ total_grad[:, start_idx:end_idx].T\n )\n start_idx = end_idx\n return influences\n elif mode == InfluenceMode.Perturbation:\n total_mixed_grad = self._flat_loss_mixed_grad(\n x.to(self.model_device), y.to(self.model_device)\n )\n start_idx = 0\n influences = {}\n for layer_id, layer_z_test in z_test_factors.items():\n end_idx = start_idx + layer_z_test.shape[1]\n influences[layer_id] = torch.einsum(\n \"ia,j...a->ij...\",\n layer_z_test.to(self.model_device),\n total_mixed_grad[:, start_idx:end_idx],\n )\n start_idx = end_idx\n return influences\n else:\n raise UnsupportedInfluenceModeException(mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.explore_hessian_regularization","title":"explore_hessian_regularization","text":"explore_hessian_regularization(\n x: Tensor, y: Tensor, regularization_values: List[float]\n) -> Dict[float, Dict[str, Tensor]]\n
Efficiently computes the influence for input x and label y for each layer of the model, for different values of the Hessian regularization parameter. This is done by computing the gradient of the loss function for the input x and label y only once, and then solving the inverse Hessian vector product for each regularization value. This is useful for finding the optimal regularization value and for exploring how robust the influence values are to changes in the regularization value.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
regularization_values
list of regularization values to use
TYPE: List[float]
RETURNS DESCRIPTION Dict[float, Dict[str, Tensor]]
A dictionary whose keys are the regularization values and whose values
Dict[float, Dict[str, Tensor]]
are dictionaries containing the influences for each layer of the model,
Dict[float, Dict[str, Tensor]]
with the layer name as key.
Source code in src/pydvl/influence/torch/influence_function_model.py
def explore_hessian_regularization(\n self,\n x: torch.Tensor,\n y: torch.Tensor,\n regularization_values: List[float],\n) -> Dict[float, Dict[str, torch.Tensor]]:\n \"\"\"\n Efficiently computes the influence for input x and label y for each layer of the\n model, for different values of the hessian regularization parameter. This is done\n by computing the gradient of the loss function for the input x and label y only once\n and then solving the Hessian Vector Product for each regularization value. This is\n useful for finding the optimal regularization value and for exploring\n how robust the influence values are to changes in the regularization value.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n regularization_values: list of regularization values to use\n\n Returns:\n A dictionary containing with keys being the regularization values and values\n being dictionaries containing the influences for each layer of the model,\n with the layer name as key.\n \"\"\"\n grad = self._loss_grad(x.to(self.model_device), y.to(self.model_device))\n influences_by_reg_value = {}\n for reg_value in regularization_values:\n reg_factors = self._solve_hvp_by_layer(\n grad, self.ekfac_representation, reg_value\n )\n values = {}\n start_idx = 0\n for layer_id, layer_fac in reg_factors.items():\n end_idx = start_idx + layer_fac.shape[1]\n values[layer_id] = layer_fac @ grad[:, start_idx:end_idx].T\n start_idx = end_idx\n influences_by_reg_value[reg_value] = values\n return influences_by_reg_value\n
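A short sketch of how the sweep might be used to inspect sensitivity to the regularization value, reusing the fitted instance from the earlier example; the chosen values are purely illustrative.

reg_values = [1e-4, 1e-3, 1e-2, 1e-1]
sweep = if_model.explore_hessian_regularization(x_train[:16], y_train[:16], reg_values)

# One dictionary of per-layer influence tensors per regularization value.
for reg, per_layer in sweep.items():
    norms = {name: round(t.norm().item(), 4) for name, t in per_layer.items()}
    print(reg, norms)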
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence","title":"NystroemSketchInfluence","text":"NystroemSketchInfluence(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n regularization: Union[float, Dict[str, float]],\n rank: int,\n block_structure: Union[\n BlockMode, OrderedDict[str, List[str]]\n ] = BlockMode.FULL,\n second_order_mode: SecondOrderMode = SecondOrderMode.HESSIAN,\n)\n
Bases: TorchComposableInfluence[LowRankOperator]
Given a model and training data, it uses a low-rank approximation of the Hessian (derived via random projection Nystr\u00f6m approximation) in combination with the Sherman\u2013Morrison\u2013Woodbury formula to approximate inverse Hessian vector products. More concretely, it computes a low-rank approximation
\\[\\begin{align*} H_{\\text{nys}} &= (H\\Omega)(\\Omega^TH\\Omega)^{+}(H\\Omega)^T \\\\\\ &= U \\Lambda U^T \\end{align*}\\] in factorized form and approximates the action of the inverse Hessian via
\\[ (H_{\\text{nys}} + \\lambda I)^{-1} = U(\\Lambda+\\lambda I)^{-1}U^T + \\frac{1}{\\lambda}(I - UU^T). \\] PARAMETER DESCRIPTION model
A PyTorch model. The Hessian will be calculated with respect to this model's parameters.
TYPE: Module
loss
A callable that takes the model's output and target as input and returns the scalar loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
regularization
Optional regularization parameter added to the Hessian-vector product for numerical stability.
TYPE: Union[float, Dict[str, float]]
rank
rank of the low-rank approximation
TYPE: int
Source code in src/pydvl/influence/torch/influence_function_model.py
def __init__(\n self,\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n regularization: Union[float, Dict[str, float]],\n rank: int,\n block_structure: Union[BlockMode, OrderedDict[str, List[str]]] = BlockMode.FULL,\n second_order_mode: SecondOrderMode = SecondOrderMode.HESSIAN,\n):\n super().__init__(\n model,\n block_structure,\n regularization=cast(\n Union[float, Dict[str, Optional[float]]], regularization\n ),\n )\n self.second_order_mode = second_order_mode\n self.rank = rank\n self.loss = loss\n
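A construction-and-fit sketch following the signature above; the model, loss and data are placeholders, and the rank and regularization values are illustrative rather than recommended settings.

import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.influence_function_model import NystroemSketchInfluence

model = torch.nn.Sequential(
    torch.nn.Linear(10, 32), torch.nn.Tanh(), torch.nn.Linear(32, 1)
)
loss = torch.nn.MSELoss()

x_train = torch.randn(128, 10)
y_train = torch.randn(128, 1)
train_loader = DataLoader(TensorDataset(x_train, y_train), batch_size=32)

# Rank-10 Nystroem sketch of the Hessian, damped by `regularization`.
if_model = NystroemSketchInfluence(model, loss, regularization=1e-3, rank=10)
if_model = if_model.fit(train_loader)

x_test, y_test = torch.randn(4, 10), torch.randn(4, 1)
values = if_model.influences(x_test, y_test, x_train[:8], y_train[:8])  # shape (4, 8)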
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence.fit","title":"fit","text":"fit(data: DataLoaderType) -> InfluenceFunctionModel\n
Fits to the provided data by internally creating a block mapper instance from it. The data argument is an iterable of tensors.
RETURNS DESCRIPTION InfluenceFunctionModel
Fitted instance
Source code in src/pydvl/influence/base_influence_function_model.py
@log_duration(log_level=logging.INFO)\ndef fit(self, data: DataLoaderType) -> InfluenceFunctionModel:\n \"\"\"\n Fitting to provided data, by internally creating a block mapper instance from\n it.\n Args:\n data: iterable of tensors\n\n Returns:\n Fitted instance\n \"\"\"\n self.block_mapper = self._create_block_mapper(data)\n return self\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: TensorType, y: TensorType) -> TensorType\n
Computes the approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/base_influence_function_model.py
def influence_factors(self, x: TensorType, y: TensorType) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return self._influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence.influences","title":"influences","text":"influences(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computes the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{test}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
def influences(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n if x is None and y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n\n if x is not None and y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n\n return self._influences(x_test, y_test, x, y, mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors(\n self,\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n tensors = self.block_mapper.generate_interactions_from_transformed_grads(\n z_test_factors,\n self._create_batch(x, y),\n mode,\n )\n result: TensorType = next(tensors)\n for tensor in tensors:\n result = result + tensor\n return result\n
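Splitting the computation into factors and products avoids recomputing the inverse-Hessian part when the same test points are compared against many training batches; a short sketch continuing the example above.

# Compute the test factors once ...
z_test = if_model.influence_factors(x_test, y_test)

# ... and reuse them against every training batch.
for xb, yb in train_loader:
    batch_values = if_model.influences_from_factors(z_test, xb, yb)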
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence.influences_by_block","title":"influences_by_block","text":"influences_by_block(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise influence values for the provided data, i.e. an approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of the approximation of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{test}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_by_block(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise influence values for the provided data, i.e. an\n approximation of\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of the approximation of\n $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block.\n\n \"\"\"\n left_batch = self._create_batch(x_test, y_test)\n\n if x is None:\n if y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n right_batch = left_batch\n else:\n if y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n right_batch = self._create_batch(x, y)\n\n return self.block_mapper.interactions(left_batch, right_batch, mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence.influence_factors_by_block","title":"influence_factors_by_block","text":"influence_factors_by_block(\n x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise
OrderedDict[str, TensorType]
approximate inverse Hessian matrix vector products per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influence_factors_by_block(\n self, x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Ordered dictionary of tensors representing the element-wise\n approximate inverse Hessian matrix vector products per block.\n\n \"\"\"\n return self.block_mapper.transformed_grads(self._create_batch(x, y))\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence.influences_from_factors_by_block","title":"influences_from_factors_by_block","text":"influences_from_factors_by_block(\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Block-wise computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: OrderedDict[str, TensorType]
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors_by_block(\n self,\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Block-wise computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block\n\n \"\"\"\n return self.block_mapper.interactions_from_transformed_grads(\n z_test_factors, self._create_batch(x, y), mode\n )\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence","title":"InverseHarmonicMeanInfluence","text":"InverseHarmonicMeanInfluence(\n model: Module,\n loss: LossType,\n regularization: Union[float, Dict[str, float]],\n block_structure: Union[\n BlockMode, OrderedDict[str, List[str]]\n ] = BlockMode.FULL,\n)\n
Bases: TorchComposableInfluence[InverseHarmonicMeanOperator]
This implementation replaces the inverse Hessian matrix in the influence computation with an approximation of the inverse Gauss-Newton vector product.
Viewing the damped Gauss-Newton matrix
\\[\\begin{align*} G_{\\lambda}(\\theta) &= \\frac{1}{N}\\sum_{i}^N\\nabla_{\\theta}\\ell (x_i,y_i; \\theta) \\nabla_{\\theta}\\ell (x_i, y_i; \\theta)^t + \\lambda \\operatorname{I}, \\\\\\ \\ell(x,y; \\theta) &= \\text{loss}(\\text{model}(x; \\theta), y) \\end{align*}\\] as an arithmetic mean of the rank-\\(1\\) updates, this implementation replaces it with the harmonic mean of the rank-\\(1\\) updates, i.e.
\\[ \\tilde{G}_{\\lambda}(\\theta) = \\left(N \\cdot \\sum_{i=1}^N \\left( \\nabla_{\\theta}\\ell (x_i,y_i; \\theta) \\nabla_{\\theta}\\ell (x_i,y_i; \\theta)^t + \\lambda \\operatorname{I}\\right)^{-1} \\right)^{-1}\\] and uses the matrix
\\[ \\tilde{G}_{\\lambda}^{-1}(\\theta)\\] instead of the inverse Hessian.
In other words, it switches the order of summation and inversion, which yields the inverse harmonic mean of the rank-\\(1\\) updates. The results are averaged over the batches provided by the data loader.
The inverses of the rank-\\(1\\) updates are not calculated explicitly, but instead a vectorized version of the Sherman\u2013Morrison formula is applied.
For more information, see Inverse Harmonic Mean.
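To make the Sherman\u2013Morrison step concrete, the following sketch numerically checks the rank-\\(1\\) identity \\((\\lambda I + gg^t)^{-1} = \\frac{1}{\\lambda}\\left(I - \\frac{gg^t}{\\lambda + g^tg}\\right)\\); it only illustrates the formula and is not the library's vectorized implementation.

import torch

torch.manual_seed(0)
d, lam = 5, 0.1
g = torch.randn(d, 1)

# Direct inverse of the damped rank-1 update ...
direct = torch.linalg.inv(lam * torch.eye(d) + g @ g.T)
# ... versus the Sherman-Morrison closed form.
sherman_morrison = (torch.eye(d) - (g @ g.T) / (lam + (g.T @ g))) / lam

# Both expressions agree up to numerical error.
assert torch.allclose(direct, sherman_morrison, atol=1e-5)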
PARAMETER DESCRIPTION model
The model.
TYPE: Module
loss
The loss function.
TYPE: LossType
regularization
The regularization parameter. If a dictionary is provided, the keys must match the blocking structure and the specification must be complete, i.e. every block needs a positive regularization value. This differs from the description in block-diagonal approximation.
TYPE: Union[float, Dict[str, float]]
block_structure
The blocking structure, either a pre-defined enum or a custom block structure, see the information regarding block-diagonal approximation.
TYPE: Union[BlockMode, OrderedDict[str, List[str]]]
DEFAULT: FULL
Source code in src/pydvl/influence/torch/influence_function_model.py
def __init__(\n self,\n model: torch.nn.Module,\n loss: LossType,\n regularization: Union[float, Dict[str, float]],\n block_structure: Union[BlockMode, OrderedDict[str, List[str]]] = BlockMode.FULL,\n):\n super().__init__(\n model,\n block_structure,\n regularization=cast(\n Union[float, Dict[str, Optional[float]]], regularization\n ),\n )\n self.loss = loss\n
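A minimal construction sketch following the signature above, reusing the model, loss and data-loader placeholders from the Nystroem example; the block structure is left at its default and a single float is used as regularization.

from pydvl.influence.torch.influence_function_model import InverseHarmonicMeanInfluence

if_model = InverseHarmonicMeanInfluence(model, loss, regularization=1e-2)
if_model = if_model.fit(train_loader)

values = if_model.influences(x_test, y_test, x_train[:8], y_train[:8])  # shape (4, 8)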
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.fit","title":"fit","text":"fit(data: DataLoaderType) -> InfluenceFunctionModel\n
Fits to the provided data by internally creating a block mapper instance from it. The data argument is an iterable of tensors.
RETURNS DESCRIPTION InfluenceFunctionModel
Fitted instance
Source code in src/pydvl/influence/base_influence_function_model.py
@log_duration(log_level=logging.INFO)\ndef fit(self, data: DataLoaderType) -> InfluenceFunctionModel:\n \"\"\"\n Fitting to provided data, by internally creating a block mapper instance from\n it.\n Args:\n data: iterable of tensors\n\n Returns:\n Fitted instance\n \"\"\"\n self.block_mapper = self._create_block_mapper(data)\n return self\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: TensorType, y: TensorType) -> TensorType\n
Computes the approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/base_influence_function_model.py
def influence_factors(self, x: TensorType, y: TensorType) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return self._influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.influences","title":"influences","text":"influences(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computes the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{test}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
def influences(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n if x is None and y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n\n if x is not None and y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n\n return self._influences(x_test, y_test, x, y, mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors(\n self,\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n tensors = self.block_mapper.generate_interactions_from_transformed_grads(\n z_test_factors,\n self._create_batch(x, y),\n mode,\n )\n result: TensorType = next(tensors)\n for tensor in tensors:\n result = result + tensor\n return result\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.influences_by_block","title":"influences_by_block","text":"influences_by_block(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise influence values for the provided data, i.e. an approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of the approximation of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{test}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_by_block(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise influence values for the provided data, i.e. an\n approximation of\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of the approximation of\n $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block.\n\n \"\"\"\n left_batch = self._create_batch(x_test, y_test)\n\n if x is None:\n if y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n right_batch = left_batch\n else:\n if y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n right_batch = self._create_batch(x, y)\n\n return self.block_mapper.interactions(left_batch, right_batch, mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.influence_factors_by_block","title":"influence_factors_by_block","text":"influence_factors_by_block(\n x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise
OrderedDict[str, TensorType]
approximate inverse Hessian matrix vector products per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influence_factors_by_block(\n self, x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Ordered dictionary of tensors representing the element-wise\n approximate inverse Hessian matrix vector products per block.\n\n \"\"\"\n return self.block_mapper.transformed_grads(self._create_batch(x, y))\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.influences_from_factors_by_block","title":"influences_from_factors_by_block","text":"influences_from_factors_by_block(\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Block-wise computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: OrderedDict[str, TensorType]
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors_by_block(\n self,\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Block-wise computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block\n\n \"\"\"\n return self.block_mapper.interactions_from_transformed_grads(\n z_test_factors, self._create_batch(x, y), mode\n )\n
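A block-wise sketch continuing the example above: with the default full block structure there is a single block, while a finer structure yields one entry per block; the block names depend on the chosen structure.

# Per-block factors for the test points ...
factors = if_model.influence_factors_by_block(x_test, y_test)

# ... combined block-wise with the gradients of the training points.
per_block = if_model.influences_from_factors_by_block(factors, x_train[:8], y_train[:8])
for block_name, tensor in per_block.items():
    print(block_name, tuple(tensor.shape))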
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.with_regularization","title":"with_regularization","text":"with_regularization(\n regularization: Union[float, Dict[str, Optional[float]]]\n) -> TorchComposableInfluence\n
Updates the regularization parameter. The regularization argument is either a positive float or a dictionary with the block names as keys and the regularization values as values.
RETURNS DESCRIPTION TorchComposableInfluence
The modified instance
Source code in src/pydvl/influence/torch/influence_function_model.py
def with_regularization(\n self, regularization: Union[float, Dict[str, Optional[float]]]\n) -> TorchComposableInfluence:\n \"\"\"\n Update the regularization parameter.\n Args:\n regularization: Either a positive float or a dictionary with the\n block names as keys and the regularization values as values.\n\n Returns:\n The modified instance\n\n \"\"\"\n self._regularization_dict = self._build_regularization_dict(regularization)\n for k, reg in self._regularization_dict.items():\n self.block_mapper.composable_block_dict[k].op.regularization = reg\n return self\n
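Updating the damping without re-fitting, a short sketch; for per-block dictionaries the keys must match the block structure as described above.

# Switch to a stronger regularization and recompute the values.
if_model = if_model.with_regularization(1e-1)
new_values = if_model.influences(x_test, y_test, x_train[:8], y_train[:8])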
"},{"location":"api/pydvl/influence/torch/operator/","title":"Operator","text":""},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator","title":"pydvl.influence.torch.operator","text":""},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.GaussNewtonOperator","title":"GaussNewtonOperator","text":"GaussNewtonOperator(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n dataloader: DataLoader,\n restrict_to: Optional[Dict[str, Parameter]] = None,\n)\n
Bases: _AveragingBatchOperator[GaussNewtonBatchOperation, PointAveraging]
Given a model and loss function, this operator computes the Gauss-Newton vector or matrix product with respect to the model parameters on a batch, i.e.
\\[\\begin{align*} G(\\text{model}, \\text{loss}, b, \\theta) &\\cdot v, \\\\\\ G(\\text{model}, \\text{loss}, b, \\theta) &= \\frac{1}{|b|}\\sum_{(x, y) \\in b}\\nabla_{\\theta}\\ell (x,y; \\theta) \\nabla_{\\theta}\\ell (x,y; \\theta)^t, \\\\\\ \\ell(x,y; \\theta) &= \\text{loss}(\\text{model}(x; \\theta), y) \\end{align*}\\] where model is a torch.nn.Module and \\(v\\) is a vector or matrix, and averages the results over the batches provided by the data loader.
PARAMETER DESCRIPTION model
The model.
TYPE: Module
loss
The loss function.
TYPE: Callable[[Tensor, Tensor], Tensor]
dataloader
The data loader providing batches of data.
TYPE: DataLoader
restrict_to
The parameters to restrict the differentiation to, i.e. the corresponding sub-matrix of the Jacobian. If None, the full Jacobian is used. Make sure the input matches the correct dimension, i.e. the last dimension must be equal to the property input_size
.
TYPE: Optional[Dict[str, Parameter]]
DEFAULT: None
Source code in src/pydvl/influence/torch/operator.py
def __init__(\n self,\n model: nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n dataloader: DataLoader,\n restrict_to: Optional[Dict[str, nn.Parameter]] = None,\n):\n batch_op = GaussNewtonBatchOperation(\n model,\n loss,\n restrict_to=restrict_to,\n )\n averaging = PointAveraging()\n super().__init__(batch_op, dataloader, averaging)\n
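An application sketch for the operator on a toy model; the trailing dimension of the input must equal the operator's input_size, which for restrict_to=None is the number of flattened model parameters. Model, loss and data are placeholders.

import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.operator import GaussNewtonOperator

model = torch.nn.Linear(10, 1)
loss = torch.nn.MSELoss()
data = DataLoader(TensorDataset(torch.randn(64, 10), torch.randn(64, 1)), batch_size=16)

op = GaussNewtonOperator(model, loss, data)
n_params = sum(p.numel() for p in model.parameters())  # 11 for this model

v = torch.randn(n_params)
gnv = op.apply(v)  # averaged Gauss-Newton vector product, same shape as v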
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.GaussNewtonOperator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.GaussNewtonOperator.apply_to_dict","title":"apply_to_dict","text":"apply_to_dict(mat: Dict[str, Tensor]) -> Dict[str, Tensor]\n
Applies the operator to a dictionary of tensors, compatible with the structure defined by the property input_dict_structure
.
PARAMETER DESCRIPTION mat
dictionary of tensors, whose keys and shapes match the property input_dict_structure
.
TYPE: Dict[str, Tensor]
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary of tensors after applying the operator
Source code in src/pydvl/influence/torch/base.py
def apply_to_dict(self, mat: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:\n \"\"\"\n Applies the operator to a dictionary of tensors, compatible to the structure\n defined by the property `input_dict_structure`.\n\n Args:\n mat: dictionary of tensors, whose keys and shapes match the property\n `input_dict_structure`.\n\n Returns:\n A dictionary of tensors after applying the operator\n \"\"\"\n\n if not self._validate_mat_dict(mat):\n raise ValueError(\n f\"Incompatible input structure, expected (excluding batch\"\n f\"dimension): \\n {self.input_dict_structure}\"\n )\n\n return self._apply_to_dict(self._dict_to_device(mat))\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.HessianOperator","title":"HessianOperator","text":"HessianOperator(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n dataloader: DataLoader,\n restrict_to: Optional[Dict[str, Parameter]] = None,\n)\n
Bases: _AveragingBatchOperator[HessianBatchOperation, ChunkAveraging]
Given a model and loss function, computes the Hessian vector or matrix product with respect to the model parameters for a given batch, i.e.
\\[\\begin{align*} &\\nabla^2_{\\theta} L(b;\\theta) \\cdot v \\\\\\ &L(b;\\theta) = \\left( \\frac{1}{|b|} \\sum_{(x,y) \\in b} \\text{loss}(\\text{model}(x; \\theta), y)\\right), \\end{align*}\\] where model is a torch.nn.Module and \\(v\\) is a vector or matrix, and average the results over the batches provided by the data loader.
PARAMETER DESCRIPTION model
The model.
TYPE: Module
loss
The loss function.
TYPE: Callable[[Tensor, Tensor], Tensor]
dataloader
The data loader providing batches of data.
TYPE: DataLoader
restrict_to
The parameters to restrict the second-order differentiation to, i.e. the corresponding sub-matrix of the Hessian. If None, the full Hessian is used. Make sure the input matches the correct dimension, i.e. the last dimension must be equal to the property input_size
.
TYPE: Optional[Dict[str, Parameter]]
DEFAULT: None
Source code in src/pydvl/influence/torch/operator.py
def __init__(\n self,\n model: nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n dataloader: DataLoader,\n restrict_to: Optional[Dict[str, nn.Parameter]] = None,\n):\n batch_op = HessianBatchOperation(model, loss, restrict_to=restrict_to)\n averaging = ChunkAveraging()\n super().__init__(batch_op, dataloader, averaging)\n
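A minimal sketch of computing a Hessian-vector product on the same kind of toy setup; the import path is assumed from the module location above:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from pydvl.influence.torch.operator import HessianOperator  # assumed import path

model = torch.nn.Linear(3, 1)
loss = torch.nn.functional.mse_loss
loader = DataLoader(TensorDataset(torch.rand(8, 3), torch.rand(8, 1)), batch_size=4)

hvp_op = HessianOperator(model, loss, loader)
v = torch.rand(4)                  # trailing dimension = number of parameters (3 + 1)
print(hvp_op.apply(v))             # Hessian-vector product, averaged over the batches
```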
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.HessianOperator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.HessianOperator.apply_to_dict","title":"apply_to_dict","text":"apply_to_dict(mat: Dict[str, Tensor]) -> Dict[str, Tensor]\n
Applies the operator to a dictionary of tensors, compatible to the structure defined by the property input_dict_structure
.
PARAMETER DESCRIPTION mat
dictionary of tensors, whose keys and shapes match the property input_dict_structure
.
TYPE: Dict[str, Tensor]
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary of tensors after applying the operator
Source code in src/pydvl/influence/torch/base.py
def apply_to_dict(self, mat: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:\n \"\"\"\n Applies the operator to a dictionary of tensors, compatible to the structure\n defined by the property `input_dict_structure`.\n\n Args:\n mat: dictionary of tensors, whose keys and shapes match the property\n `input_dict_structure`.\n\n Returns:\n A dictionary of tensors after applying the operator\n \"\"\"\n\n if not self._validate_mat_dict(mat):\n raise ValueError(\n f\"Incompatible input structure, expected (excluding batch\"\n f\"dimension): \\n {self.input_dict_structure}\"\n )\n\n return self._apply_to_dict(self._dict_to_device(mat))\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.InverseHarmonicMeanOperator","title":"InverseHarmonicMeanOperator","text":"InverseHarmonicMeanOperator(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n dataloader: DataLoader,\n regularization: float,\n restrict_to: Optional[Dict[str, Parameter]] = None,\n)\n
Bases: _AveragingBatchOperator[InverseHarmonicMeanBatchOperation, PointAveraging]
Given a model and loss function, computes an approximation of the inverse Gauss-Newton vector or matrix product per batch and averages the results.
Viewing the damped Gauss-Newton matrix
\\[\\begin{align*} G_{\\lambda}(\\text{model}, \\text{loss}, b, \\theta) &= \\frac{1}{|b|}\\sum_{(x, y) \\in b}\\nabla_{\\theta}\\ell (x,y; \\theta) \\nabla_{\\theta}\\ell (x,y; \\theta)^t + \\lambda \\operatorname{I}, \\\\\\ \\ell(x,y; \\theta) &= \\text{loss}(\\text{model}(x; \\theta), y) \\end{align*}\\] as an arithmetic mean of the rank-\\(1\\) updates, this operator replaces it with the harmonic mean of the rank-\\(1\\) updates, i.e.
\\[ \\tilde{G}_{\\lambda}(\\text{model}, \\text{loss}, b, \\theta) = \\left(n \\sum_{(x, y) \\in b} \\left( \\nabla_{\\theta}\\ell (x,y; \\theta) \\nabla_{\\theta}\\ell (x,y; \\theta)^t + \\lambda \\operatorname{I}\\right)^{-1} \\right)^{-1}\\] and computes
\\[ \\tilde{G}_{\\lambda}^{-1}(\\text{model}, \\text{loss}, b, \\theta) \\cdot v.\\] for any given batch \\(b\\), where model is a torch.nn.Module and \\(v\\) is a vector or matrix.
In other words, it switches the order of summation and inversion, which yields the inverse harmonic mean of the rank-\(1\) updates. The results are averaged over the batches provided by the data loader.
The inverses of the rank-\\(1\\) updates are not calculated explicitly, but instead a vectorized version of the Sherman\u2013Morrison formula is applied.
For more information, see Inverse Harmonic Mean.
PARAMETER DESCRIPTION model
The model.
TYPE: Module
loss
The loss function.
TYPE: Callable[[Tensor, Tensor], Tensor]
dataloader
The data loader providing batches of data.
TYPE: DataLoader
restrict_to
The parameters to restrict the differentiation to, i.e. the corresponding sub-matrix of the Jacobian. If None, the full Jacobian is used. Make sure the input matches the correct dimension, i.e. the last dimension must be equal to the property input_size
.
TYPE: Optional[Dict[str, Parameter]]
DEFAULT: None
Source code in src/pydvl/influence/torch/operator.py
def __init__(\n self,\n model: nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n dataloader: DataLoader,\n regularization: float,\n restrict_to: Optional[Dict[str, nn.Parameter]] = None,\n):\n if regularization <= 0:\n raise ValueError(\"regularization must be positive\")\n\n self._regularization = regularization\n\n batch_op = InverseHarmonicMeanBatchOperation(\n model,\n loss,\n regularization,\n restrict_to=restrict_to,\n )\n averaging = PointAveraging()\n super().__init__(batch_op, dataloader, averaging)\n
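A small sketch, assuming the import path above; note that regularization must be a positive float:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from pydvl.influence.torch.operator import InverseHarmonicMeanOperator  # assumed path

model = torch.nn.Linear(3, 1)
loss = torch.nn.functional.mse_loss
loader = DataLoader(TensorDataset(torch.rand(8, 3), torch.rand(8, 1)), batch_size=4)

op = InverseHarmonicMeanOperator(model, loss, loader, regularization=0.1)
v = torch.rand(5, 4)               # trailing dimension = number of parameters
print(op.apply(v).shape)           # should be torch.Size([5, 4])
```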
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.InverseHarmonicMeanOperator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.InverseHarmonicMeanOperator.apply_to_dict","title":"apply_to_dict","text":"apply_to_dict(mat: Dict[str, Tensor]) -> Dict[str, Tensor]\n
Applies the operator to a dictionary of tensors, compatible to the structure defined by the property input_dict_structure
.
PARAMETER DESCRIPTION mat
dictionary of tensors, whose keys and shapes match the property input_dict_structure
.
TYPE: Dict[str, Tensor]
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary of tensors after applying the operator
Source code in src/pydvl/influence/torch/base.py
def apply_to_dict(self, mat: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:\n \"\"\"\n Applies the operator to a dictionary of tensors, compatible to the structure\n defined by the property `input_dict_structure`.\n\n Args:\n mat: dictionary of tensors, whose keys and shapes match the property\n `input_dict_structure`.\n\n Returns:\n A dictionary of tensors after applying the operator\n \"\"\"\n\n if not self._validate_mat_dict(mat):\n raise ValueError(\n f\"Incompatible input structure, expected (excluding batch\"\n f\"dimension): \\n {self.input_dict_structure}\"\n )\n\n return self._apply_to_dict(self._dict_to_device(mat))\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.DirectSolveOperator","title":"DirectSolveOperator","text":"DirectSolveOperator(\n matrix: Tensor,\n regularization: Optional[float] = None,\n in_place_regularization: bool = False,\n)\n
Bases: TensorOperator
Given a matrix \\(A\\) and an optional regularization parameter \\(\\lambda\\), computes the solution of the system \\((A+\\lambda I)x = b\\), where \\(b\\) is a vector or a matrix. Internally, it uses the routine torch.linalg.solve.
PARAMETER DESCRIPTION matrix
the system matrix
TYPE: Tensor
regularization
the regularization parameter
TYPE: Optional[float]
DEFAULT: None
in_place_regularization
If True, the input matrix is modified in-place, by adding the regularization value to the diagonal.
TYPE: bool
DEFAULT: False
Source code in src/pydvl/influence/torch/operator.py
def __init__(\n self,\n matrix: torch.Tensor,\n regularization: Optional[float] = None,\n in_place_regularization: bool = False,\n):\n if regularization is None:\n self.matrix = matrix\n else:\n self.matrix = self._update_diagonal(\n matrix if in_place_regularization else matrix.clone(), regularization\n )\n self._regularization = regularization\n
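A self-contained sketch solving \((A + \lambda I)x = b\) for a small symmetric matrix; only the documented constructor and apply are used, and the import path is assumed from the module location above:

```python
import torch

from pydvl.influence.torch.operator import DirectSolveOperator  # assumed import path

torch.manual_seed(0)
A = torch.rand(4, 4)
A = A @ A.T                                   # symmetric positive semi-definite matrix
b = torch.rand(4)

op = DirectSolveOperator(A, regularization=0.1)
x = op.apply(b)                               # solves (A + 0.1 * I) x = b

# sanity check, up to solver tolerance
print(torch.allclose((A + 0.1 * torch.eye(4)) @ x, b, atol=1e-4))
```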
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.DirectSolveOperator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.LissaOperator","title":"LissaOperator","text":"LissaOperator(\n batch_operation: BatchOperationType,\n data: DataLoader,\n regularization: Optional[float] = None,\n maxiter: int = 1000,\n dampen: float = 0.0,\n scale: float = 10.0,\n rtol: float = 0.0001,\n progress: bool = False,\n warn_on_max_iteration: bool = True,\n)\n
Bases: TensorOperator
, Generic[BatchOperationType]
Uses LISSA, Linear time Stochastic Second-Order Algorithm, to iteratively approximate the solution of the system \\((A + \\lambda I)x = b\\). This is done with the update
\\[(A + \\lambda I)^{-1}_{j+1} b = b + (I - d) \\ (A + \\lambda I) - \\frac{(A + \\lambda I)^{-1}_j b}{s},\\] where \\(I\\) is the identity matrix, \\(d\\) is a dampening term and \\(s\\) a scaling factor that are applied to help convergence. For details, see Linear time Stochastic Second-Order Approximation (LiSSA)
PARAMETER DESCRIPTION batch_operation
The BatchOperation
representing the action of A on a batch of the data loader.
TYPE: BatchOperationType
data
a pytorch dataloader
TYPE: DataLoader
regularization
Optional regularization parameter added to the Hessian-vector product for numerical stability.
TYPE: Optional[float]
DEFAULT: None
maxiter
Maximum number of iterations.
TYPE: int
DEFAULT: 1000
dampen
Dampening factor, defaults to 0 for no dampening.
TYPE: float
DEFAULT: 0.0
scale
Scaling factor, defaults to 10.
TYPE: float
DEFAULT: 10.0
rtol
tolerance to use for early stopping
TYPE: float
DEFAULT: 0.0001
progress
If True, display progress bars.
TYPE: bool
DEFAULT: False
warn_on_max_iteration
If True, logs a warning, if the desired tolerance is not achieved within maxiter
iterations. If False, the log level for this information is logging.DEBUG
TYPE: bool
DEFAULT: True
Source code in src/pydvl/influence/torch/operator.py
def __init__(\n self,\n batch_operation: BatchOperationType,\n data: DataLoader,\n regularization: Optional[float] = None,\n maxiter: int = 1000,\n dampen: float = 0.0,\n scale: float = 10.0,\n rtol: float = 1e-4,\n progress: bool = False,\n warn_on_max_iteration: bool = True,\n):\n\n if regularization is not None and regularization < 0:\n raise ValueError(\"regularization must be non-negative\")\n\n self.data = data\n self.warn_on_max_iteration = warn_on_max_iteration\n self.progress = progress\n self.rtol = rtol\n self.scale = scale\n self.dampen = dampen\n self.maxiter = maxiter\n self.batch_operation = batch_operation\n self._regularization = regularization\n
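A sketch combining a Hessian batch operation with the LiSSA iteration; both import paths are assumptions based on the module layout of this package:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

# assumed import paths
from pydvl.influence.torch.batch_operation import HessianBatchOperation
from pydvl.influence.torch.operator import LissaOperator

model = torch.nn.Linear(3, 1)
loss = torch.nn.functional.mse_loss
loader = DataLoader(TensorDataset(torch.rand(8, 3), torch.rand(8, 1)), batch_size=4)

batch_op = HessianBatchOperation(model, loss)
op = LissaOperator(batch_op, loader, regularization=0.1, maxiter=500, scale=10.0)

v = torch.rand(4)                  # trailing dimension = number of parameters
print(op.apply(v))                 # iterative approximation of (H + 0.1 I)^{-1} v
```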
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.LissaOperator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.LowRankOperator","title":"LowRankOperator","text":"LowRankOperator(\n low_rank_representation: LowRankProductRepresentation,\n regularization: float,\n exact: bool = True,\n)\n
Bases: TensorOperator
Given a low rank representation of a matrix
\\[ A = V D V^T\\] with a diagonal matrix \\(D\\) and an optional regularization parameter \\(\\lambda\\), computes
\[ (V D V^T+\lambda I)^{-1}b. \]
Depending on the value of the exact
flag, the inverse action is computed exactly using the [Sherman\u2013Morrison\u2013Woodbury formula](https://en.wikipedia.org/wiki/Woodbury_matrix_identity). If exact
is set to False
, the inverse action is approximated by
\\[ V^T(D+\\lambda I)^{-1}Vb\\] Args:
Source code in src/pydvl/influence/torch/operator.py
def __init__(\n self,\n low_rank_representation: LowRankProductRepresentation,\n regularization: float,\n exact: bool = True,\n):\n\n if exact and (regularization is None or regularization <= 0):\n raise ValueError(\"regularization must be positive when exact=True\")\n elif regularization is not None and regularization < 0:\n raise ValueError(\"regularization must be non-negative\")\n\n self._regularization = regularization\n self._exact = exact\n self._low_rank_representation = low_rank_representation\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.LowRankOperator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/torch/pre_conditioner/","title":"Pre conditioner","text":""},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner","title":"pydvl.influence.torch.pre_conditioner","text":""},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.PreConditioner","title":"PreConditioner","text":" Bases: ABC
Abstract base class for implementing pre-conditioners for improving the convergence of CG for systems of the form
\\[ ( A + \\lambda \\operatorname{I})x = \\operatorname{rhs} \\] i.e. a matrix \\(M\\) such that \\(M^{-1}(A + \\lambda \\operatorname{I})\\) has a better condition number than \\(A + \\lambda \\operatorname{I}\\).
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.PreConditioner.fit","title":"fit abstractmethod
","text":"fit(\n mat_mat_prod: Callable[[Tensor], Tensor],\n size: int,\n dtype: dtype,\n device: device,\n regularization: float = 0.0,\n)\n
Implement this to fit the pre-conditioner to the matrix represented by the mat_mat_prod
PARAMETER DESCRIPTION mat_mat_prod
a callable that computes the matrix-matrix product
size
size of the matrix represented by mat_mat_prod
dtype
data type of the matrix represented by mat_mat_prod
device
device of the matrix represented by mat_mat_prod
regularization
regularization parameter \(\lambda\) in the equation \(( A + \lambda \operatorname{I})x = \operatorname{rhs}\)
RETURNS DESCRIPTION self
Source code in src/pydvl/influence/torch/pre_conditioner.py
@abstractmethod\ndef fit(\n self,\n mat_mat_prod: Callable[[torch.Tensor], torch.Tensor],\n size: int,\n dtype: torch.dtype,\n device: torch.device,\n regularization: float = 0.0,\n):\n r\"\"\"\n Implement this to fit the pre-conditioner to the matrix represented by the\n mat_mat_prod\n Args:\n mat_mat_prod: a callable that computes the matrix-matrix product\n size: size of the matrix represented by `mat_mat_prod`\n dtype: data type of the matrix represented by `mat_mat_prod`\n device: device of the matrix represented by `mat_mat_prod`\n regularization: regularization parameter $\\lambda$ in the equation\n $ ( A + \\lambda \\operatorname{I})x = \\operatorname{rhs} $\n Returns:\n self\n \"\"\"\n pass\n
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.PreConditioner.solve","title":"solve","text":"solve(rhs: Tensor)\n
Solve the equation \(M@Z = \operatorname{rhs}\).
PARAMETER DESCRIPTION rhs
right-hand side of the equation, corresponds to the residual vector (or matrix) in the conjugate gradient method
RETURNS DESCRIPTION solution \\(M^{-1}\\operatorname{rhs}\\)
Source code in src/pydvl/influence/torch/pre_conditioner.py
def solve(self, rhs: torch.Tensor):\n r\"\"\"\n Solve the equation $M@Z = \\operatorname{rhs}$\n Args:\n rhs: right hand side of the equation, corresponds to the residuum vector\n (or matrix) in the conjugate gradient method\n\n Returns:\n solution $M^{-1}\\operatorname{rhs}$\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n return self._solve(rhs)\n
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.PreConditioner.to","title":"to abstractmethod
","text":"to(device: device) -> PreConditioner\n
Implement this to move the (potentially fitted) preconditioner to a specific device
Source code in src/pydvl/influence/torch/pre_conditioner.py
@abstractmethod\ndef to(self, device: torch.device) -> PreConditioner:\n \"\"\"Implement this to move the (potentially fitted) preconditioner to a\n specific device\"\"\"\n
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.JacobiPreConditioner","title":"JacobiPreConditioner","text":"JacobiPreConditioner(num_samples_estimator: int = 1)\n
Bases: PreConditioner
Pre-conditioner for improving the convergence of CG for systems of the form
\\[ ( A + \\lambda \\operatorname{I})x = \\operatorname{rhs} \\] The JacobiPreConditioner uses the diagonal information of the matrix \\(A\\). The diagonal elements are not computed directly but estimated via Hutchinson's estimator.
\\[ M = \\frac{1}{m} \\sum_{i=1}^m u_i \\odot Au_i + \\lambda \\operatorname{I} \\] where \\(u_i\\) are i.i.d. Gaussian random vectors. Works well in the case the matrix \\(A + \\lambda \\operatorname{I}\\) is diagonal dominant. For more information, see the documentation of Conjugate Gradient Args: num_samples_estimator: number of samples to use in computation of Hutchinson's estimator
Source code in src/pydvl/influence/torch/pre_conditioner.py
def __init__(self, num_samples_estimator: int = 1):\n self.num_samples_estimator = num_samples_estimator\n
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.JacobiPreConditioner.solve","title":"solve","text":"solve(rhs: Tensor)\n
Solve the equation \(M@Z = \operatorname{rhs}\).
PARAMETER DESCRIPTION rhs
right-hand side of the equation, corresponds to the residual vector (or matrix) in the conjugate gradient method
RETURNS DESCRIPTION solution \\(M^{-1}\\operatorname{rhs}\\)
Source code in src/pydvl/influence/torch/pre_conditioner.py
def solve(self, rhs: torch.Tensor):\n r\"\"\"\n Solve the equation $M@Z = \\operatorname{rhs}$\n Args:\n rhs: right hand side of the equation, corresponds to the residuum vector\n (or matrix) in the conjugate gradient method\n\n Returns:\n solution $M^{-1}\\operatorname{rhs}$\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n return self._solve(rhs)\n
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.JacobiPreConditioner.fit","title":"fit","text":"fit(\n mat_mat_prod: Callable[[Tensor], Tensor],\n size: int,\n dtype: dtype,\n device: device,\n regularization: float = 0.0,\n)\n
Fits by computing an estimate of the diagonal of the matrix represented by mat_mat_prod
via Hutchinson's estimator
PARAMETER DESCRIPTION mat_mat_prod
a callable representing the matrix-matrix product
TYPE: Callable[[Tensor], Tensor]
size
size of the square matrix
TYPE: int
dtype
needed data type of inputs for the mat_mat_prod
TYPE: dtype
device
needed device for inputs of mat_mat_prod
TYPE: device
regularization
regularization parameter \\(\\lambda\\) in \\((A+\\lambda I)x=b\\)
TYPE: float
DEFAULT: 0.0
Source code in src/pydvl/influence/torch/pre_conditioner.py
def fit(\n self,\n mat_mat_prod: Callable[[torch.Tensor], torch.Tensor],\n size: int,\n dtype: torch.dtype,\n device: torch.device,\n regularization: float = 0.0,\n):\n r\"\"\"\n Fits by computing an estimate of the diagonal of the matrix represented by\n `mat_mat_prod` via Hutchinson's estimator\n\n Args:\n mat_mat_prod: a callable representing the matrix-matrix product\n size: size of the square matrix\n dtype: needed data type of inputs for the mat_mat_prod\n device: needed device for inputs of mat_mat_prod\n regularization: regularization parameter\n $\\lambda$ in $(A+\\lambda I)x=b$\n \"\"\"\n random_samples = torch.randn(\n size, self.num_samples_estimator, device=device, dtype=dtype\n )\n diagonal_estimate = torch.sum(\n torch.mul(random_samples, mat_mat_prod(random_samples)), dim=1\n )\n diagonal_estimate /= self.num_samples_estimator\n self._diag = diagonal_estimate\n self._reg = regularization\n
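A sketch fitting the pre-conditioner to an explicit matrix through its matrix-matrix product and then applying the resulting diagonal approximation; the import path is assumed from the module location above:

```python
import torch

from pydvl.influence.torch.pre_conditioner import JacobiPreConditioner  # assumed path

torch.manual_seed(0)
A = torch.diag(torch.tensor([1.0, 10.0, 100.0]))    # strongly diagonal test matrix

pre = JacobiPreConditioner(num_samples_estimator=32)
pre.fit(
    lambda x: A @ x,        # mat_mat_prod
    size=3,
    dtype=A.dtype,
    device=A.device,
    regularization=0.0,
)

rhs = torch.ones(3)
print(pre.solve(rhs))       # roughly A^{-1} rhs, since A is diagonal here
```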
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.NystroemPreConditioner","title":"NystroemPreConditioner","text":"NystroemPreConditioner(rank: int)\n
Bases: PreConditioner
Pre-conditioner for improving the convergence of CG for systems of the form
\\[ (A + \\lambda \\operatorname{I})x = \\operatorname{rhs} \\] The NystroemPreConditioner computes a low-rank approximation
\\[ A_{\\text{nys}} = (A \\Omega)(\\Omega^T A \\Omega)^{\\dagger}(A \\Omega)^T = U \\Sigma U^T, \\] where \\((\\cdot)^{\\dagger}\\) denotes the Moore-Penrose inverse, and uses the matrix
\\[ M^{-1} = (\\lambda + \\sigma_{\\text{rank}})U(\\Sigma+ \\lambda \\operatorname{I})^{-1}U^T+(\\operatorname{I} - UU^T) \\] for pre-conditioning, where \\( \\sigma_{\\text{rank}} \\) is the smallest eigenvalue of the low-rank approximation.
Source code in src/pydvl/influence/torch/pre_conditioner.py
def __init__(self, rank: int):\n self._rank = rank\n
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.NystroemPreConditioner.solve","title":"solve","text":"solve(rhs: Tensor)\n
Solve the equation \(M@Z = \operatorname{rhs}\).
PARAMETER DESCRIPTION rhs
right-hand side of the equation, corresponds to the residual vector (or matrix) in the conjugate gradient method
RETURNS DESCRIPTION solution \\(M^{-1}\\operatorname{rhs}\\)
Source code in src/pydvl/influence/torch/pre_conditioner.py
def solve(self, rhs: torch.Tensor):\n r\"\"\"\n Solve the equation $M@Z = \\operatorname{rhs}$\n Args:\n rhs: right hand side of the equation, corresponds to the residuum vector\n (or matrix) in the conjugate gradient method\n\n Returns:\n solution $M^{-1}\\operatorname{rhs}$\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n return self._solve(rhs)\n
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.NystroemPreConditioner.fit","title":"fit","text":"fit(\n mat_mat_prod: Callable[[Tensor], Tensor],\n size: int,\n dtype: dtype,\n device: device,\n regularization: float = 0.0,\n)\n
Fits by computing a low-rank approximation of the matrix represented by mat_mat_prod
via Nystroem approximation
PARAMETER DESCRIPTION mat_mat_prod
a callable representing the matrix-matrix product
TYPE: Callable[[Tensor], Tensor]
size
size of the square matrix
TYPE: int
dtype
needed data type of inputs for the mat_mat_prod
TYPE: dtype
device
needed device for inputs of mat_mat_prod
TYPE: device
regularization
regularization parameter \\(\\lambda\\) in \\((A+\\lambda I)x=b\\)
TYPE: float
DEFAULT: 0.0
Source code in src/pydvl/influence/torch/pre_conditioner.py
def fit(\n self,\n mat_mat_prod: Callable[[torch.Tensor], torch.Tensor],\n size: int,\n dtype: torch.dtype,\n device: torch.device,\n regularization: float = 0.0,\n):\n r\"\"\"\n Fits by computing a low-rank approximation of the matrix represented by\n `mat_mat_prod` via Nystroem approximation\n\n Args:\n mat_mat_prod: a callable representing the matrix-matrix product\n size: size of the square matrix\n dtype: needed data type of inputs for the mat_mat_prod\n device: needed device for inputs of mat_mat_prod\n regularization: regularization parameter\n $\\lambda$ in $(A+\\lambda I)x=b$\n \"\"\"\n\n self._low_rank_approx = randomized_nystroem_approximation(\n mat_mat_prod, size, self._rank, dtype, mat_vec_device=device\n )\n self._regularization = regularization\n
"},{"location":"api/pydvl/influence/torch/util/","title":"Util","text":""},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util","title":"pydvl.influence.torch.util","text":""},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.TorchTensorContainerType","title":"TorchTensorContainerType module-attribute
","text":"TorchTensorContainerType = Union[\n Tensor, Collection[Tensor], Mapping[str, Tensor]\n]\n
Type for a PyTorch tensor or a container thereof.
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.TorchNumpyConverter","title":"TorchNumpyConverter","text":"TorchNumpyConverter(device: Optional[device] = None)\n
Bases: NumpyConverter[Tensor]
Helper class for converting between torch.Tensor and numpy.ndarray
PARAMETER DESCRIPTION device
Optional device parameter to move the resulting torch tensors to the specified device
TYPE: Optional[device]
DEFAULT: None
Source code in src/pydvl/influence/torch/util.py
def __init__(self, device: Optional[torch.device] = None):\n self.device = device\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.TorchNumpyConverter.to_numpy","title":"to_numpy","text":"to_numpy(x: Tensor) -> NDArray\n
Convert a detached torch.Tensor to numpy.ndarray
Source code in src/pydvl/influence/torch/util.py
def to_numpy(self, x: torch.Tensor) -> NDArray:\n \"\"\"\n Convert a detached [torch.Tensor][torch.Tensor] to\n [numpy.ndarray][numpy.ndarray]\n \"\"\"\n arr: NDArray = x.cpu().numpy()\n return arr\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.TorchNumpyConverter.from_numpy","title":"from_numpy","text":"from_numpy(x: NDArray) -> Tensor\n
Convert a numpy.ndarray to torch.Tensor and optionally move it to a provided device
Source code in src/pydvl/influence/torch/util.py
def from_numpy(self, x: NDArray) -> torch.Tensor:\n \"\"\"\n Convert a [numpy.ndarray][numpy.ndarray] to [torch.Tensor][torch.Tensor] and\n optionally move it to a provided device\n \"\"\"\n t = torch.from_numpy(x)\n if self.device is not None:\n t = t.to(self.device)\n return t\n
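A round-trip sketch between torch tensors and numpy arrays, assuming the import path above:

```python
import torch

from pydvl.influence.torch.util import TorchNumpyConverter  # assumed import path

converter = TorchNumpyConverter()        # optionally pass a torch.device
t = torch.rand(2, 3)
arr = converter.to_numpy(t.detach())     # numpy.ndarray on CPU
t_back = converter.from_numpy(arr)       # back to a torch.Tensor
print(torch.allclose(t, t_back))
```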
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.TorchCatAggregator","title":"TorchCatAggregator","text":" Bases: SequenceAggregator[Tensor]
An aggregator that concatenates tensors using PyTorch's torch.cat function. Concatenation is done along the first dimension of the chunks.
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.TorchCatAggregator.__call__","title":"__call__","text":"__call__(tensor_sequence: LazyChunkSequence[Tensor])\n
Aggregates tensors from a single-level generator into a single tensor by concatenating them. This method is a straightforward way to combine a sequence of tensors into one larger tensor.
PARAMETER DESCRIPTION tensor_sequence
Object wrapping a generator that yields torch.Tensor
objects.
TYPE: LazyChunkSequence[Tensor]
RETURNS DESCRIPTION A single tensor formed by concatenating all tensors from the generator. The concatenation is performed along the default dimension (0).
Source code in src/pydvl/influence/torch/util.py
def __call__(\n self,\n tensor_sequence: LazyChunkSequence[torch.Tensor],\n):\n \"\"\"\n Aggregates tensors from a single-level generator into a single tensor by\n concatenating them. This method is a straightforward way to combine a sequence\n of tensors into one larger tensor.\n\n Args:\n tensor_sequence: Object wrapping a generator that yields `torch.Tensor`\n objects.\n\n Returns:\n A single tensor formed by concatenating all tensors from the generator.\n The concatenation is performed along the default dimension (0).\n \"\"\"\n t_gen = cast(Iterator[torch.Tensor], tensor_sequence.generator_factory())\n len_generator = tensor_sequence.len_generator\n if len_generator is not None:\n t_gen = cast(\n Iterator[torch.Tensor], tqdm(t_gen, total=len_generator, desc=\"Blocks\")\n )\n\n return torch.cat(list(t_gen))\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.NestedTorchCatAggregator","title":"NestedTorchCatAggregator","text":" Bases: NestedSequenceAggregator[Tensor]
An aggregator that concatenates tensors using PyTorch's torch.cat function. Concatenation is done along the first two dimensions of the chunks.
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.NestedTorchCatAggregator.__call__","title":"__call__","text":"__call__(nested_sequence_of_tensors: NestedLazyChunkSequence[Tensor])\n
Aggregates tensors from a nested generator structure into a single tensor by concatenating. Each inner generator is first concatenated along dimension 1 into a tensor, and then these tensors are concatenated along dimension 0 together to form the final tensor.
PARAMETER DESCRIPTION nested_sequence_of_tensors
Object wrapping a generator of generators, where each inner generator yields torch.Tensor
objects.
TYPE: NestedLazyChunkSequence[Tensor]
RETURNS DESCRIPTION A single tensor formed by concatenating all tensors from the nested
generators.
Source code in src/pydvl/influence/torch/util.py
def __call__(\n self, nested_sequence_of_tensors: NestedLazyChunkSequence[torch.Tensor]\n):\n \"\"\"\n Aggregates tensors from a nested generator structure into a single tensor by\n concatenating. Each inner generator is first concatenated along dimension 1 into\n a tensor, and then these tensors are concatenated along dimension 0 together to\n form the final tensor.\n\n Args:\n nested_sequence_of_tensors: Object wrapping a generator of generators,\n where each inner generator yields `torch.Tensor` objects.\n\n Returns:\n A single tensor formed by concatenating all tensors from the nested\n generators.\n\n \"\"\"\n\n outer_gen = cast(\n Iterator[Iterator[torch.Tensor]],\n nested_sequence_of_tensors.generator_factory(),\n )\n len_outer_generator = nested_sequence_of_tensors.len_outer_generator\n if len_outer_generator is not None:\n outer_gen = cast(\n Iterator[Iterator[torch.Tensor]],\n tqdm(outer_gen, total=len_outer_generator, desc=\"Row blocks\"),\n )\n\n return torch.cat(\n list(\n map(\n lambda tensor_gen: torch.cat(list(tensor_gen), dim=1),\n outer_gen,\n )\n )\n )\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.EkfacRepresentation","title":"EkfacRepresentation dataclass
","text":"EkfacRepresentation(\n layer_names: Iterable[str],\n layers_module: Iterable[Module],\n evecs_a: Iterable[Tensor],\n evecs_g: Iterable[Tensor],\n diags: Iterable[Tensor],\n)\n
Container class for the EKFAC representation of the Hessian. It can be iterated over to get the layers names and their corresponding module, eigenvectors and diagonal elements of the factorized Hessian matrix.
PARAMETER DESCRIPTION layer_names
Names of the layers.
TYPE: Iterable[str]
layers_module
The layers.
TYPE: Iterable[Module]
evecs_a
The a eigenvectors of the ekfac representation.
TYPE: Iterable[Tensor]
evecs_g
The g eigenvectors of the ekfac representation.
TYPE: Iterable[Tensor]
diags
The diagonal elements of the factorized Hessian matrix.
TYPE: Iterable[Tensor]
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.EkfacRepresentation.get_layer_evecs","title":"get_layer_evecs","text":"get_layer_evecs() -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]\n
It returns two dictionaries, one for the a eigenvectors and one for the g eigenvectors, with the layer names as keys. The eigenvectors are in the same order as the layers in the model.
Source code in src/pydvl/influence/torch/util.py
def get_layer_evecs(\n self,\n) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]:\n \"\"\"\n It returns two dictionaries, one for the a eigenvectors and one for the g\n eigenvectors, with the layer names as keys. The eigenvectors are in the same\n order as the layers in the model.\n \"\"\"\n evecs_a_dict = {layer_name: evec_a for layer_name, (_, evec_a, _, _) in self}\n evecs_g_dict = {layer_name: evec_g for layer_name, (_, _, evec_g, _) in self}\n return evecs_a_dict, evecs_g_dict\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.TorchLinalgEighException","title":"TorchLinalgEighException","text":"TorchLinalgEighException(original_exception: RuntimeError)\n
Bases: Exception
Exception to wrap a RuntimeError raised by torch.linalg.eigh when used with large matrices; see https://github.com/pytorch/pytorch/issues/92141
Source code in src/pydvl/influence/torch/util.py
def __init__(self, original_exception: RuntimeError):\n func = torch.linalg.eigh\n err_msg = (\n f\"A RunTimeError occurred in '{func.__module__}.{func.__qualname__}'. \"\n \"This might be related to known issues with \"\n \"[torch.linalg.eigh][torch.linalg.eigh] on certain matrix sizes.\\n \"\n \"For more details, refer to \"\n \"https://github.com/pytorch/pytorch/issues/92141. \\n\"\n \"In this case, consider to use a different implementation, which does not \"\n \"depend on the usage of [torch.linalg.eigh][torch.linalg.eigh].\\n\"\n f\" Inspect the original exception message: \\n{str(original_exception)}\"\n )\n super().__init__(err_msg)\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.BlockMode","title":"BlockMode","text":" Bases: Enum
Enumeration for different modes of grouping model parameters.
ATTRIBUTE DESCRIPTION LAYER_WISE
Groups parameters by layers of the model.
TYPE: str
PARAMETER_WISE
Groups parameters individually.
TYPE: str
FULL
Groups all parameters together.
TYPE: str
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.ModelParameterDictBuilder","title":"ModelParameterDictBuilder dataclass
","text":"ModelParameterDictBuilder(model: Module, detach: bool = True)\n
A builder class for creating ordered dictionaries of model parameters based on specified block modes or custom blocking structures.
ATTRIBUTE DESCRIPTION model
The neural network model.
TYPE: Module
detach
Whether to detach the parameters from the computation graph.
TYPE: bool
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.ModelParameterDictBuilder.build","title":"build","text":"build(\n block_structure: OrderedDict[str, List[str]]\n) -> Dict[str, Dict[str, Parameter]]\n
Builds an ordered dictionary of model parameters based on the specified block structure represented by an ordered dictionary, where the keys are block identifiers and the values are lists of model parameter names contained in this block.
PARAMETER DESCRIPTION block_structure
The block structure specifying how to group the parameters.
TYPE: OrderedDict[str, List[str]]
RETURNS DESCRIPTION Dict[str, Dict[str, Parameter]]
An ordered dictionary of ordered dictionaries, where the outer dictionary's
Dict[str, Dict[str, Parameter]]
keys are block identifiers and the inner dictionaries map parameter names
Dict[str, Dict[str, Parameter]]
to parameters.
Source code in src/pydvl/influence/torch/util.py
def build(\n self, block_structure: OrderedDict[str, List[str]]\n) -> Dict[str, Dict[str, torch.nn.Parameter]]:\n \"\"\"\n Builds an ordered dictionary of model parameters based on the specified block\n structure represented by an ordered dictionary, where the keys are block\n identifiers and the values are lists of model parameter names contained in\n this block.\n\n Args:\n block_structure: The block structure specifying how to group the parameters.\n\n Returns:\n An ordered dictionary of ordered dictionaries, where the outer dictionary's\n keys are block identifiers and the inner dictionaries map parameter names\n to parameters.\n \"\"\"\n parameter_dict = {}\n\n for block_name, parameter_names in block_structure.items():\n inner_ordered_dict = {}\n for parameter_name in parameter_names:\n parameter = self._extract_parameter_by_name(parameter_name)\n if parameter.requires_grad:\n inner_ordered_dict[parameter_name] = self._optional_detach(\n parameter\n )\n else:\n warnings.warn(\n f\"The parameter {parameter_name} from the block \"\n f\"{block_name} is mark as not trainable in the model \"\n f\"and will be excluded from the computation.\"\n )\n parameter_dict[block_name] = inner_ordered_dict\n\n return parameter_dict\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.ModelParameterDictBuilder.build_from_block_mode","title":"build_from_block_mode","text":"build_from_block_mode(block_mode: BlockMode) -> Dict[str, Dict[str, Parameter]]\n
Builds an ordered dictionary of model parameters based on the specified block mode or custom blocking structure represented by an ordered dictionary, where the keys are block identifiers and the values are lists of model parameter names contained in this block.
PARAMETER DESCRIPTION block_mode
The block mode specifying how to group the parameters.
TYPE: BlockMode
RETURNS DESCRIPTION Dict[str, Dict[str, Parameter]]
An ordered dictionary of ordered dictionaries, where the outer dictionary's
Dict[str, Dict[str, Parameter]]
keys are block identifiers and the inner dictionaries map parameter names
Dict[str, Dict[str, Parameter]]
to parameters.
Source code in src/pydvl/influence/torch/util.py
def build_from_block_mode(\n self, block_mode: BlockMode\n) -> Dict[str, Dict[str, torch.nn.Parameter]]:\n \"\"\"\n Builds an ordered dictionary of model parameters based on the specified block\n mode or custom blocking structure represented by an ordered dictionary, where\n the keys are block identifiers and the values are lists of model parameter names\n contained in this block.\n\n Args:\n block_mode: The block mode specifying how to group the parameters.\n\n Returns:\n An ordered dictionary of ordered dictionaries, where the outer dictionary's\n keys are block identifiers and the inner dictionaries map parameter names\n to parameters.\n \"\"\"\n\n block_mode_mapping = {\n BlockMode.FULL: self._build_full,\n BlockMode.PARAMETER_WISE: self._build_parameter_wise,\n BlockMode.LAYER_WISE: self._build_layer_wise,\n }\n\n parameter_dict_func = block_mode_mapping.get(block_mode, None)\n\n if parameter_dict_func is None:\n raise ValueError(f\"Unknown block mode {block_mode}.\")\n\n return self.build(parameter_dict_func())\n
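A sketch grouping the parameters of a small sequential model by layer; the import path is assumed, and the printed block names depend on the model's module names rather than being guaranteed by this example:

```python
import torch

from pydvl.influence.torch.util import BlockMode, ModelParameterDictBuilder  # assumed path

model = torch.nn.Sequential(torch.nn.Linear(3, 4), torch.nn.Linear(4, 1))
builder = ModelParameterDictBuilder(model, detach=True)

blocks = builder.build_from_block_mode(BlockMode.LAYER_WISE)
for block_name, params in blocks.items():
    print(block_name, {name: tuple(p.shape) for name, p in params.items()})
```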
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.ModelInfoMixin","title":"ModelInfoMixin","text":"ModelInfoMixin(model: Module)\n
A mixin class for classes that contain information about a model.
Source code in src/pydvl/influence/torch/util.py
def __init__(self, model: torch.nn.Module):\n self.model = model\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.to_model_device","title":"to_model_device","text":"to_model_device(x: Tensor, model: Module) -> Tensor\n
Returns the tensor x
moved to the device of the model
, if device of model is set
PARAMETER DESCRIPTION x
The tensor to be moved to the device of the model.
TYPE: Tensor
model
The model whose device will be used to move the tensor.
TYPE: Module
RETURNS DESCRIPTION Tensor
The tensor x
moved to the device of the model
, if device of model is set.
Source code in src/pydvl/influence/torch/util.py
def to_model_device(x: torch.Tensor, model: torch.nn.Module) -> torch.Tensor:\n \"\"\"\n Returns the tensor `x` moved to the device of the `model`, if device of model is set\n\n Args:\n x: The tensor to be moved to the device of the model.\n model: The model whose device will be used to move the tensor.\n\n Returns:\n The tensor `x` moved to the device of the `model`, if device of model is set.\n \"\"\"\n device = next(model.parameters()).device\n return x.to(device)\n
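A short sketch, assuming the import path above:

```python
import torch

from pydvl.influence.torch.util import to_model_device  # assumed import path

model = torch.nn.Linear(3, 1)          # parameters live on the CPU here
x = torch.rand(5, 3)
x = to_model_device(x, model)
print(x.device)                        # matches next(model.parameters()).device
```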
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.reshape_vector_to_tensors","title":"reshape_vector_to_tensors","text":"reshape_vector_to_tensors(\n input_vector: Tensor, target_shapes: Iterable[Tuple[int, ...]]\n) -> Tuple[Tensor, ...]\n
Reshape a 1D tensor into multiple tensors with specified shapes.
This function takes a 1D tensor (input_vector) and reshapes it into a series of tensors with shapes given by 'target_shapes'. The reshaped tensors are returned as a tuple in the same order as their corresponding shapes.
Note The total number of elements in 'input_vector' must be equal to the sum of the products of the shapes in 'target_shapes'.
PARAMETER DESCRIPTION input_vector
The 1D tensor to be reshaped. Must be 1D.
TYPE: Tensor
target_shapes
An iterable of tuples. Each tuple defines the shape of a tensor to be reshaped from the 'input_vector'.
TYPE: Iterable[Tuple[int, ...]]
RETURNS DESCRIPTION Tuple[Tensor, ...]
A tuple of reshaped tensors.
RAISES DESCRIPTION ValueError
If 'input_vector' is not a 1D tensor or if the total number of elements in 'input_vector' does not match the sum of the products of the shapes in 'target_shapes'.
Source code in src/pydvl/influence/torch/util.py
def reshape_vector_to_tensors(\n input_vector: torch.Tensor, target_shapes: Iterable[Tuple[int, ...]]\n) -> Tuple[torch.Tensor, ...]:\n \"\"\"\n Reshape a 1D tensor into multiple tensors with specified shapes.\n\n This function takes a 1D tensor (input_vector) and reshapes it into a series of\n tensors with shapes given by 'target_shapes'.\n The reshaped tensors are returned as a tuple in the same order\n as their corresponding shapes.\n\n Note:\n The total number of elements in 'input_vector' must be equal to the\n sum of the products of the shapes in 'target_shapes'.\n\n Args:\n input_vector: The 1D tensor to be reshaped. Must be 1D.\n target_shapes: An iterable of tuples. Each tuple defines the shape of a tensor\n to be reshaped from the 'input_vector'.\n\n Returns:\n A tuple of reshaped tensors.\n\n Raises:\n ValueError: If 'input_vector' is not a 1D tensor or if the total\n number of elements in 'input_vector' does not\n match the sum of the products of the shapes in 'target_shapes'.\n \"\"\"\n\n if input_vector.dim() != 1:\n raise ValueError(\"Input vector must be a 1D tensor\")\n\n total_elements = sum(math.prod(shape) for shape in target_shapes)\n\n if total_elements != input_vector.shape[0]:\n raise ValueError(\n f\"The total elements in shapes {total_elements} \"\n f\"does not match the vector length {input_vector.shape[0]}\"\n )\n\n tensors = []\n start = 0\n for shape in target_shapes:\n size = math.prod(shape) # compute the total size of the tensor with this shape\n tensors.append(\n input_vector[start : start + size].view(shape)\n ) # slice the vector and reshape it\n start += size\n return tuple(tensors)\n
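A short sketch splitting a flat vector back into per-parameter tensors, following the documented contract that the target shapes must account for all elements:

```python
import torch

from pydvl.influence.torch.util import reshape_vector_to_tensors  # assumed import path

flat = torch.arange(10, dtype=torch.float32)          # 10 = 2 * 3 + 4
a, b = reshape_vector_to_tensors(flat, [(2, 3), (4,)])
print(a.shape, b.shape)                               # torch.Size([2, 3]) torch.Size([4])
```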
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.align_structure","title":"align_structure","text":"align_structure(\n source: Mapping[str, Tensor], target: TorchTensorContainerType\n) -> Dict[str, Tensor]\n
This function transforms target
to have the same structure as source
, i.e., it should be a dictionary with the same keys as source
and each corresponding value in target
should have the same shape as the value in source
.
PARAMETER DESCRIPTION source
The reference dictionary containing PyTorch tensors.
TYPE: Mapping[str, Tensor]
target
The input to be harmonized. It can be a dictionary, tuple, or tensor.
TYPE: TorchTensorContainerType
RETURNS DESCRIPTION Dict[str, Tensor]
The harmonized version of target
.
RAISES DESCRIPTION ValueError
If target
cannot be harmonized to match source
.
Source code in src/pydvl/influence/torch/util.py
def align_structure(\n source: Mapping[str, torch.Tensor],\n target: TorchTensorContainerType,\n) -> Dict[str, torch.Tensor]:\n \"\"\"\n This function transforms `target` to have the same structure as `source`, i.e.,\n it should be a dictionary with the same keys as `source` and each corresponding\n value in `target` should have the same shape as the value in `source`.\n\n Args:\n source: The reference dictionary containing PyTorch tensors.\n target: The input to be harmonized. It can be a dictionary, tuple, or tensor.\n\n Returns:\n The harmonized version of `target`.\n\n Raises:\n ValueError: If `target` cannot be harmonized to match `source`.\n \"\"\"\n\n tangent_dict: Dict[str, torch.Tensor]\n\n if isinstance(target, dict):\n if list(target.keys()) != list(source.keys()):\n raise ValueError(\"The keys in 'target' do not match the keys in 'source'.\")\n\n if [v.shape for v in target.values()] != [v.shape for v in source.values()]:\n raise ValueError(\n \"The shapes of the values in 'target' do not match the shapes \"\n \"of the values in 'source'.\"\n )\n\n tangent_dict = target\n\n elif isinstance(target, tuple) or isinstance(target, list):\n if [v.shape for v in target] != [v.shape for v in source.values()]:\n raise ValueError(\n \"'target' is a tuple/list but its elements' shapes do not match \"\n \"the shapes of the values in 'source'.\"\n )\n\n tangent_dict = dict(zip(source.keys(), target))\n\n elif isinstance(target, torch.Tensor):\n try:\n tangent_dict = dict(\n zip(\n source.keys(),\n reshape_vector_to_tensors(\n target, [p.shape for p in source.values()]\n ),\n )\n )\n except Exception as e:\n raise ValueError(\n f\"'target' is a tensor but cannot be reshaped to match 'source'. \"\n f\"Original error: {e}\"\n )\n\n else:\n raise ValueError(f\"'target' is of type {type(target)} which is not supported.\")\n\n return tangent_dict\n
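A sketch harmonizing a flat vector with a model's parameter structure, using get_model_parameters (documented further below) to build the reference dictionary; the import path is assumed:

```python
import torch

from pydvl.influence.torch.util import align_structure, get_model_parameters  # assumed path

model = torch.nn.Linear(3, 1)
params = get_model_parameters(model)                   # e.g. {'weight': (1, 3), 'bias': (1,)}

flat = torch.rand(sum(p.numel() for p in params.values()))
aligned = align_structure(params, flat)                # reshaped to match the parameter shapes
print({k: tuple(v.shape) for k, v in aligned.items()})
```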
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.align_with_model","title":"align_with_model","text":"align_with_model(x: TorchTensorContainerType, model: Module)\n
Aligns an input to the model's parameter structure, i.e. transforms it into a dict with the same keys as model.named_parameters() and matching tensor shapes
PARAMETER DESCRIPTION x
The input to be aligned. It can be a dictionary, tuple, or tensor.
TYPE: TorchTensorContainerType
model
model to use for alignment
TYPE: Module
RETURNS DESCRIPTION The aligned version of x
.
RAISES DESCRIPTION ValueError
If x
cannot be aligned to match the model's parameters.
Source code in src/pydvl/influence/torch/util.py
def align_with_model(x: TorchTensorContainerType, model: torch.nn.Module):\n \"\"\"\n Aligns an input to the model's parameter structure, i.e. transforms it into a dict\n with the same keys as model.named_parameters() and matching tensor shapes\n\n Args:\n x: The input to be aligned. It can be a dictionary, tuple, or tensor.\n model: model to use for alignment\n\n Returns:\n The aligned version of `x`.\n\n Raises:\n ValueError: If `x` cannot be aligned to match the model's parameters .\n\n \"\"\"\n model_params = get_model_parameters(model, detach=False)\n return align_structure(model_params, x)\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.flatten_dimensions","title":"flatten_dimensions","text":"flatten_dimensions(\n tensors: Iterable[Tensor],\n shape: Optional[Tuple[int, ...]] = None,\n concat_at: int = -1,\n) -> Tensor\n
Flattens the dimensions of each tensor in the given iterable and concatenates them along a specified dimension.
This function takes an iterable of PyTorch tensors and flattens each tensor. Optionally, each tensor can be reshaped to a specified shape before concatenation. The concatenation is performed along the dimension specified by concat_at
.
PARAMETER DESCRIPTION tensors
An iterable containing PyTorch tensors to be flattened and concatenated.
TYPE: Iterable[Tensor]
shape
A tuple representing the desired shape to which each tensor is reshaped before concatenation. If None, tensors are flattened to 1D.
TYPE: Optional[Tuple[int, ...]]
DEFAULT: None
concat_at
The dimension along which to concatenate the tensors.
TYPE: int
DEFAULT: -1
RETURNS DESCRIPTION Tensor
A single tensor resulting from the concatenation of the input tensors,
Tensor
each either flattened or reshaped as specified.
Example >>> tensors = [torch.tensor([[1, 2], [3, 4]]), torch.tensor([[5, 6], [7, 8]])]\n>>> flatten_dimensions(tensors)\ntensor([1, 2, 3, 4, 5, 6, 7, 8])\n\n>>> flatten_dimensions(tensors, shape=(2, 2), concat_at=0)\ntensor([[1, 2],\n [3, 4],\n [5, 6],\n [7, 8]])\n
Source code in src/pydvl/influence/torch/util.py
def flatten_dimensions(\n tensors: Iterable[torch.Tensor],\n shape: Optional[Tuple[int, ...]] = None,\n concat_at: int = -1,\n) -> torch.Tensor:\n \"\"\"\n Flattens the dimensions of each tensor in the given iterable and concatenates them\n along a specified dimension.\n\n This function takes an iterable of PyTorch tensors and flattens each tensor.\n Optionally, each tensor can be reshaped to a specified shape before concatenation.\n The concatenation is performed along the dimension specified by `concat_at`.\n\n Args:\n tensors: An iterable containing PyTorch tensors to be flattened\n and concatenated.\n shape: A tuple representing the desired shape to which each tensor is reshaped\n before concatenation. If None, tensors are flattened to 1D.\n concat_at: The dimension along which to concatenate the tensors.\n\n Returns:\n A single tensor resulting from the concatenation of the input tensors,\n each either flattened or reshaped as specified.\n\n ??? Example\n ```pycon\n >>> tensors = [torch.tensor([[1, 2], [3, 4]]), torch.tensor([[5, 6], [7, 8]])]\n >>> flatten_dimensions(tensors)\n tensor([1, 2, 3, 4, 5, 6, 7, 8])\n\n >>> flatten_dimensions(tensors, shape=(2, 2), concat_at=0)\n tensor([[1, 2],\n [3, 4],\n [5, 6],\n [7, 8]])\n ```\n \"\"\"\n return torch.cat(\n [t.reshape(-1) if shape is None else t.reshape(*shape) for t in tensors],\n dim=concat_at,\n )\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.torch_dataset_to_dask_array","title":"torch_dataset_to_dask_array","text":"torch_dataset_to_dask_array(\n dataset: Dataset,\n chunk_size: int,\n total_size: Optional[int] = None,\n resulting_dtype: Type[number] = np.float32,\n) -> Tuple[Array, ...]\n
Construct tuple of dask arrays from a PyTorch dataset, using dask.delayed
PARAMETER DESCRIPTION dataset
A PyTorch dataset
TYPE: Dataset
chunk_size
The size of the chunks for the resulting Dask arrays.
TYPE: int
total_size
If the dataset does not implement len, provide the length via this parameter. If None, the length of the dataset is inferred by accessing the dataset once.
TYPE: Optional[int]
DEFAULT: None
resulting_dtype
The dtype of the resulting dask.array.Array
TYPE: Type[number]
DEFAULT: float32
Example import torch\nfrom torch.utils.data import TensorDataset\nx = torch.rand((20, 3))\ny = torch.rand((20, 1))\ndataset = TensorDataset(x, y)\nda_x, da_y = torch_dataset_to_dask_array(dataset, 4)\n
RETURNS DESCRIPTION Tuple[Array, ...]
Tuple of Dask arrays corresponding to each tensor in the dataset.
Source code in src/pydvl/influence/torch/util.py
def torch_dataset_to_dask_array(\n dataset: Dataset,\n chunk_size: int,\n total_size: Optional[int] = None,\n resulting_dtype: Type[np.number] = np.float32,\n) -> Tuple[da.Array, ...]:\n \"\"\"\n Construct tuple of dask arrays from a PyTorch dataset, using dask.delayed\n\n Args:\n dataset: A PyTorch [dataset][torch.utils.data.Dataset]\n chunk_size: The size of the chunks for the resulting Dask arrays.\n total_size: If the dataset does not implement len, provide the length\n via this parameter. If None\n the length of the dataset is inferred via accessing the dataset once.\n resulting_dtype: The dtype of the resulting [dask.array.Array][dask.array.Array]\n\n ??? Example\n ```python\n import torch\n from torch.utils.data import TensorDataset\n x = torch.rand((20, 3))\n y = torch.rand((20, 1))\n dataset = TensorDataset(x, y)\n da_x, da_y = torch_dataset_to_dask_array(dataset, 4)\n ```\n\n Returns:\n Tuple of Dask arrays corresponding to each tensor in the dataset.\n \"\"\"\n\n def _infer_data_len(d_set: Dataset):\n try:\n n_data = len(d_set)\n if total_size is not None and n_data != total_size:\n raise ValueError(\n f\"The number of samples in the dataset ({n_data}), derived \"\n f\"from calling \u00b4len\u00b4, does not match the provided \"\n f\"total number of samples ({total_size}). \"\n f\"Call the function without total_size.\"\n )\n return n_data\n except TypeError as e:\n err_msg = (\n f\"Could not infer the number of samples in the dataset from \"\n f\"calling \u00b4len\u00b4. Original error: {e}.\"\n )\n if total_size is not None:\n logger.warning(\n err_msg\n + f\" Using the provided total number of samples {total_size}.\"\n )\n return total_size\n else:\n logger.warning(\n err_msg + \" Infer the number of samples from the dataset, \"\n \"via iterating the dataset once. \"\n \"This might induce severe overhead, so consider\"\n \"providing total_size, if you know the number of samples \"\n \"beforehand.\"\n )\n idx = 0\n while True:\n try:\n t = d_set[idx]\n if all(_t.numel() == 0 for _t in t):\n return idx\n idx += 1\n\n except IndexError:\n return idx\n\n sample = dataset[0]\n if not isinstance(sample, tuple):\n sample = (sample,)\n\n def _get_chunk(\n start_idx: int, stop_idx: int, d_set: Dataset\n ) -> Tuple[torch.Tensor, ...]:\n try:\n t = d_set[start_idx:stop_idx]\n if not isinstance(t, tuple):\n t = (t,)\n return t # type:ignore\n except Exception:\n nested_tensor_list = [\n [d_set[idx][k] for idx in range(start_idx, stop_idx)]\n for k in range(len(sample))\n ]\n return tuple(map(torch.stack, nested_tensor_list))\n\n n_samples = _infer_data_len(dataset)\n chunk_indices = [\n (i, min(i + chunk_size, n_samples)) for i in range(0, n_samples, chunk_size)\n ]\n delayed_dataset = dask.delayed(dataset)\n delayed_chunks = [\n dask.delayed(partial(_get_chunk, start, stop))(delayed_dataset)\n for (start, stop) in chunk_indices\n ]\n\n delayed_arrays_dict: Dict[int, List[da.Array]] = {k: [] for k in range(len(sample))}\n\n for chunk, (start, stop) in zip(delayed_chunks, chunk_indices):\n for tensor_idx, sample_tensor in enumerate(sample):\n delayed_tensor = da.from_delayed(\n dask.delayed(lambda t: t.cpu().numpy())(chunk[tensor_idx]),\n shape=(stop - start, *sample_tensor.shape),\n dtype=resulting_dtype,\n )\n\n delayed_arrays_dict[tensor_idx].append(delayed_tensor)\n\n return tuple(\n da.concatenate(array_list) for array_list in delayed_arrays_dict.values()\n )\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.empirical_cross_entropy_loss_fn","title":"empirical_cross_entropy_loss_fn","text":"empirical_cross_entropy_loss_fn(\n model_output: Tensor, *args, **kwargs\n) -> Tensor\n
Computes the empirical cross entropy loss of the model output. This is the cross entropy loss of the model output without the labels. The function takes all the usual arguments and keyword arguments of the cross entropy loss function, so that it is compatible with the PyTorch cross entropy loss function. However, it ignores everything except the first argument, which is the model output.
PARAMETER DESCRIPTION model_output
The output of the model.
TYPE: Tensor
Source code in src/pydvl/influence/torch/util.py
def empirical_cross_entropy_loss_fn(\n model_output: torch.Tensor, *args, **kwargs\n) -> torch.Tensor:\n \"\"\"\n Computes the empirical cross entropy loss of the model output. This is the\n cross entropy loss of the model output without the labels. The function takes\n all the usual arguments and keyword arguments of the cross entropy loss\n function, so that it is compatible with the PyTorch cross entropy loss\n function. However, it ignores everything except the first argument, which is\n the model output.\n\n Args:\n model_output: The output of the model.\n \"\"\"\n probs_ = torch.softmax(model_output, dim=1)\n log_probs_ = torch.log(probs_)\n log_probs_ = torch.where(\n torch.isfinite(log_probs_), log_probs_, torch.zeros_like(log_probs_)\n )\n return torch.sum(log_probs_ * probs_.detach() ** 0.5)\n
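A minimal usage sketch (the random logits below are purely illustrative):
import torch
from pydvl.influence.torch.util import empirical_cross_entropy_loss_fn

logits = torch.randn(8, 3, requires_grad=True)   # model output: 8 samples, 3 classes
loss = empirical_cross_entropy_loss_fn(logits)   # any extra args/kwargs are ignored
loss.backward()                                  # differentiable w.r.t. the model output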
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.safe_torch_linalg_eigh","title":"safe_torch_linalg_eigh","text":"safe_torch_linalg_eigh(*args, **kwargs)\n
A wrapper around torch.linalg.eigh
that safely handles potential runtime errors by raising a custom TorchLinalgEighException
with more context, especially related to the issues reported in https://github.com/pytorch/pytorch/issues/92141.
PARAMETER DESCRIPTION *args
Positional arguments passed to torch.linalg.eigh
.
DEFAULT: ()
**kwargs
Keyword arguments passed to torch.linalg.eigh
.
DEFAULT: {}
RETURNS DESCRIPTION The result of calling torch.linalg.eigh
with the provided arguments.
RAISES DESCRIPTION TorchLinalgEighException
If a RuntimeError
occurs during the execution of torch.linalg.eigh
.
Source code in src/pydvl/influence/torch/util.py
@catch_and_raise_exception(RuntimeError, lambda e: TorchLinalgEighException(e))\ndef safe_torch_linalg_eigh(*args, **kwargs):\n \"\"\"\n A wrapper around `torch.linalg.eigh` that safely handles potential runtime errors\n by raising a custom `TorchLinalgEighException` with more context,\n especially related to the issues reported in\n [https://github.com/pytorch/pytorch/issues/92141](\n https://github.com/pytorch/pytorch/issues/92141).\n\n Args:\n *args: Positional arguments passed to `torch.linalg.eigh`.\n **kwargs: Keyword arguments passed to `torch.linalg.eigh`.\n\n Returns:\n The result of calling `torch.linalg.eigh` with the provided arguments.\n\n Raises:\n TorchLinalgEighException: If a `RuntimeError` occurs during the execution of\n `torch.linalg.eigh`.\n \"\"\"\n return torch.linalg.eigh(*args, **kwargs)\n
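A short sketch of the happy path; on problematic inputs the wrapper raises TorchLinalgEighException instead of a bare RuntimeError:
import torch
from pydvl.influence.torch.util import safe_torch_linalg_eigh

matrix = torch.eye(3)
eigenvalues, eigenvectors = safe_torch_linalg_eigh(matrix)  # same return value as torch.linalg.eigh
assert torch.allclose(eigenvalues, torch.ones(3))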
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.get_model_parameters","title":"get_model_parameters","text":"get_model_parameters(\n model: Module, detach: bool = True, require_grad_only: bool = True\n) -> Dict[str, Tensor]\n
Returns a dictionary of model parameters, optionally restricted to parameters requiring gradients and optionally detaching them from the computation graph.
PARAMETER DESCRIPTION model
The neural network model.
TYPE: Module
detach
Whether to detach the parameters from the computation graph.
TYPE: bool
DEFAULT: True
require_grad_only
Whether to include only parameters that require gradients.
TYPE: bool
DEFAULT: True
RETURNS DESCRIPTION Dict[str, Tensor]
A dict of named model parameters.
Source code in src/pydvl/influence/torch/util.py
def get_model_parameters(\n model: torch.nn.Module, detach: bool = True, require_grad_only: bool = True\n) -> Dict[str, torch.Tensor]:\n \"\"\"\n Returns a dictionary of model parameters, optionally restricted to parameters\n requiring gradients and optionally detaching them from the computation\n graph.\n\n Args:\n model: The neural network model.\n detach: Whether to detach the parameters from the computation graph.\n require_grad_only: Whether to include only parameters that require gradients.\n\n Returns:\n A dict of named model parameters.\n \"\"\"\n\n parameter_dict = {}\n for k, p in model.named_parameters():\n if require_grad_only and not p.requires_grad:\n continue\n parameter_dict[k] = p.detach() if detach else p\n\n return parameter_dict\n
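A short usage sketch with a toy module:
import torch
from pydvl.influence.torch.util import get_model_parameters

model = torch.nn.Linear(4, 2)
params = get_model_parameters(model, detach=True, require_grad_only=True)
assert set(params) == {"weight", "bias"}
assert not params["weight"].requires_grad   # detached copies do not track gradients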
"},{"location":"api/pydvl/parallel/","title":"Parallel","text":""},{"location":"api/pydvl/parallel/#pydvl.parallel","title":"pydvl.parallel","text":"This module provides a common interface to parallelization backends. The list of supported backends is here. Backends should be instantiated directly and passed to the respective valuation method.
We use executors that implement the Executor interface to submit tasks in parallel. The basic high-level pattern is:
from pydvl.parallel import JoblibParallelBackend\n\nparallel_backend = JoblibParallelBackend()\nwith parallel_backend.executor(max_workers=2) as executor:\n future = executor.submit(lambda x: x + 1, 1)\n result = future.result()\nassert result == 2\n
Running a map-style job is also easy:
from pydvl.parallel import JoblibParallelBackend\n\nparallel_backend = JoblibParallelBackend()\nwith parallel_backend.executor(max_workers=2) as executor:\n results = list(executor.map(lambda x: x + 1, range(5)))\nassert results == [1, 2, 3, 4, 5]\n
Passing large objects

When running tasks which accept heavy inputs, it is important to first use put()
on the object and use the returned reference as argument to the callable within submit()
. For example:
u_ref = parallel_backend.put(u)\n...\nexecutor.submit(task, utility=u_ref)\n
Note that task()
does not need to be changed in any way: the backend will get()
the object and pass it to the function upon invocation. There is an alternative map-reduce implementation MapReduceJob which internally uses joblib's higher level API with Parallel()
which then indirectly also supports the use of Dask and Ray.
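Putting the two patterns together, a minimal sketch (here task and the heavy object u are hypothetical placeholders for user code, and put() is assumed to hand back a reference that the backend resolves on invocation):
from pydvl.parallel import JoblibParallelBackend

def task(x, utility):          # hypothetical user-defined callable
    return utility + x

u = 10                         # stands in for a heavy object, e.g. a Utility
parallel_backend = JoblibParallelBackend()
u_ref = parallel_backend.put(u)
with parallel_backend.executor(max_workers=2) as executor:
    future = executor.submit(task, 1, utility=u_ref)
    result = future.result()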
"},{"location":"api/pydvl/parallel/backend/","title":"Backend","text":""},{"location":"api/pydvl/parallel/backend/#pydvl.parallel.backend","title":"pydvl.parallel.backend","text":""},{"location":"api/pydvl/parallel/backend/#pydvl.parallel.backend.CancellationPolicy","title":"CancellationPolicy","text":" Bases: Flag
Policy to use when cancelling futures after exiting an Executor.
Note
Not all backends support all policies.
ATTRIBUTE DESCRIPTION NONE
Do not cancel any futures.
PENDING
Cancel all pending futures, but not running ones.
RUNNING
Cancel all running futures, but not pending ones.
ALL
Cancel all pending and running futures.
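Because CancellationPolicy is a Flag, members can presumably be combined and tested with bitwise operators; a hedged sketch:
from pydvl.parallel.backend import CancellationPolicy

policy = CancellationPolicy.PENDING | CancellationPolicy.RUNNING   # combine flags
if policy & CancellationPolicy.PENDING:
    print("pending futures will be cancelled")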
"},{"location":"api/pydvl/parallel/backend/#pydvl.parallel.backend.ParallelBackend","title":"ParallelBackend","text":"Abstract base class for all parallel backends.
"},{"location":"api/pydvl/parallel/backend/#pydvl.parallel.backend.ParallelBackend.executor","title":"executor abstractmethod
classmethod
","text":"executor(\n max_workers: int | None = None,\n *,\n config: ParallelConfig | None = None,\n cancel_futures: CancellationPolicy | bool = CancellationPolicy.PENDING\n) -> Executor\n
Returns a futures executor for the parallel backend.
Source code in src/pydvl/parallel/backend.py
@classmethod\n@abstractmethod\ndef executor(\n cls,\n max_workers: int | None = None,\n *,\n config: ParallelConfig | None = None,\n cancel_futures: CancellationPolicy | bool = CancellationPolicy.PENDING,\n) -> Executor:\n \"\"\"Returns a futures executor for the parallel backend.\"\"\"\n ...\n
"},{"location":"api/pydvl/parallel/backend/#pydvl.parallel.backend.init_parallel_backend","title":"init_parallel_backend","text":"init_parallel_backend(\n config: ParallelConfig | None = None, backend_name: str | None = None\n) -> ParallelBackend\n
Initializes the parallel backend and returns an instance of it.
The following example creates a parallel backend instance with the default configuration, which is a local joblib backend.
If you don't pass any arguments, then by default it will instantiate the JoblibParallelBackend:
Example parallel_backend = init_parallel_backend()\n
To create a parallel backend instance with, for example, ray
as the backend, you can pass the backend name as a string:
Example parallel_backend = init_parallel_backend(backend_name=\"ray\")\n
The following is an example of the deprecated way for instantiating a parallel backend:
Example config = ParallelConfig()\nparallel_backend = init_parallel_backend(config)\n
PARAMETER DESCRIPTION backend_name
Name of the backend to instantiate.
TYPE: str | None
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: ParallelConfig | None
DEFAULT: None
Source code in src/pydvl/parallel/backend.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef init_parallel_backend(\n config: ParallelConfig | None = None, backend_name: str | None = None\n) -> ParallelBackend:\n \"\"\"Initializes the parallel backend and returns an instance of it.\n\n The following example creates a parallel backend instance with the default\n configuration, which is a local joblib backend.\n\n If you don't pass any arguments, then by default it will instantiate\n the JoblibParallelBackend:\n\n ??? Example\n ```python\n parallel_backend = init_parallel_backend()\n ```\n\n To create a parallel backend instance with for example `ray` as a backend,\n you can pass the backend name as a string:.\n\n ??? Example\n ```python\n parallel_backend = init_parallel_backend(backend_name=\"ray\")\n ```\n\n\n The following is an example of the deprecated\n way for instantiating a parallel backend:\n\n ??? Example\n ``` python\n config = ParallelConfig()\n parallel_backend = init_parallel_backend(config)\n ```\n\n Args:\n backend_name: Name of the backend to instantiate.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n\n\n \"\"\"\n if backend_name is None:\n if config is None:\n backend_name = \"joblib\"\n else:\n backend_name = config.backend\n\n try:\n parallel_backend_cls = ParallelBackend.BACKENDS[backend_name]\n except KeyError:\n raise NotImplementedError(f\"Unexpected parallel backend {backend_name}\")\n return parallel_backend_cls(config) # type: ignore\n
"},{"location":"api/pydvl/parallel/backend/#pydvl.parallel.backend.available_cpus","title":"available_cpus","text":"available_cpus() -> int\n
Platform-independent count of available cores.
FIXME: do we really need this or is os.cpu_count
enough? Is this portable?
RETURNS DESCRIPTION int
Number of cores, or 1 if it is not possible to determine.
Source code in src/pydvl/parallel/backend.py
def available_cpus() -> int:\n \"\"\"Platform-independent count of available cores.\n\n FIXME: do we really need this or is `os.cpu_count` enough? Is this portable?\n\n Returns:\n Number of cores, or 1 if it is not possible to determine.\n \"\"\"\n from platform import system\n\n if system() != \"Linux\":\n return os.cpu_count() or 1\n return len(os.sched_getaffinity(0)) # type: ignore\n
"},{"location":"api/pydvl/parallel/config/","title":"Config","text":""},{"location":"api/pydvl/parallel/config/#pydvl.parallel.config","title":"pydvl.parallel.config","text":""},{"location":"api/pydvl/parallel/config/#pydvl.parallel.config.ParallelConfig","title":"ParallelConfig dataclass
","text":"ParallelConfig(\n backend: Literal[\"joblib\", \"ray\"] = \"joblib\",\n address: Optional[Union[str, Tuple[str, int]]] = None,\n n_cpus_local: Optional[int] = None,\n logging_level: Optional[int] = None,\n wait_timeout: float = 1.0,\n)\n
Configuration for parallel computation backend.
PARAMETER DESCRIPTION backend
Type of backend to use. Defaults to 'joblib'
TYPE: Literal['joblib', 'ray']
DEFAULT: 'joblib'
address
(DEPRECATED) Address of existing remote or local cluster to use.
TYPE: Optional[Union[str, Tuple[str, int]]]
DEFAULT: None
n_cpus_local
(DEPRECATED) Number of CPUs to use when creating a local ray cluster. This has no effect when using an existing ray cluster.
TYPE: Optional[int]
DEFAULT: None
logging_level
(DEPRECATED) Logging level for the parallel backend's worker.
TYPE: Optional[int]
DEFAULT: None
wait_timeout
(DEPRECATED) Timeout in seconds for waiting on futures.
TYPE: float
DEFAULT: 1.0
"},{"location":"api/pydvl/parallel/map_reduce/","title":"Map reduce","text":""},{"location":"api/pydvl/parallel/map_reduce/#pydvl.parallel.map_reduce","title":"pydvl.parallel.map_reduce","text":"This module contains a wrapper around joblib's Parallel()
class that makes it easy to run map-reduce jobs.
Deprecation
This interface might be deprecated or changed in a future release before 1.0
"},{"location":"api/pydvl/parallel/map_reduce/#pydvl.parallel.map_reduce.MapReduceJob","title":"MapReduceJob","text":"MapReduceJob(\n inputs: Union[Collection[T], T],\n map_func: MapFunction[R],\n reduce_func: ReduceFunction[R] = identity,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n *,\n map_kwargs: Optional[Dict] = None,\n reduce_kwargs: Optional[Dict] = None,\n n_jobs: int = -1,\n timeout: Optional[float] = None\n)\n
Bases: Generic[T, R]
Takes an embarrassingly parallel function and runs it in n_jobs
parallel jobs, splitting the data evenly into a number of chunks equal to the number of jobs.
Typing information for objects of this class requires the type of the inputs that are split for map_func
and the type of its output.
PARAMETER DESCRIPTION inputs
The input that will be split and passed to map_func
. If it is not a sequence object, it will be repeated n_jobs
times.
TYPE: Union[Collection[T], T]
map_func
Function that will be applied to the input chunks in each job.
TYPE: MapFunction[R]
reduce_func
Function that will be applied to the results of map_func
to reduce them.
TYPE: ReduceFunction[R]
DEFAULT: identity
map_kwargs
Keyword arguments that will be passed to map_func
in each job. Alternatively, one can use functools.partial.
TYPE: Optional[Dict]
DEFAULT: None
reduce_kwargs
Keyword arguments that will be passed to reduce_func
in each job. Alternatively, one can use functools.partial.
TYPE: Optional[Dict]
DEFAULT: None
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
n_jobs
Number of parallel jobs to run. Does not accept 0
TYPE: int
DEFAULT: -1
Example A simple usage example with 2 jobs:
>>> from pydvl.parallel import MapReduceJob\n>>> import numpy as np\n>>> map_reduce_job: MapReduceJob[np.ndarray, np.ndarray] = MapReduceJob(\n... np.arange(5),\n... map_func=np.sum,\n... reduce_func=np.sum,\n... n_jobs=2,\n... )\n>>> map_reduce_job()\n10\n
When passed a single object as input, it will be repeated for each job:
>>> from pydvl.parallel import MapReduceJob\n>>> import numpy as np\n>>> map_reduce_job: MapReduceJob[int, np.ndarray] = MapReduceJob(\n... 5,\n... map_func=lambda x: np.array([x]),\n... reduce_func=np.sum,\n... n_jobs=2,\n... )\n>>> map_reduce_job()\n10\n
Source code in src/pydvl/parallel/map_reduce.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef __init__(\n self,\n inputs: Union[Collection[T], T],\n map_func: MapFunction[R],\n reduce_func: ReduceFunction[R] = identity,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n *,\n map_kwargs: Optional[Dict] = None,\n reduce_kwargs: Optional[Dict] = None,\n n_jobs: int = -1,\n timeout: Optional[float] = None,\n):\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n\n self.parallel_backend = parallel_backend\n\n self.timeout = timeout\n\n self._n_jobs = -1\n # This uses the setter defined below\n self.n_jobs = n_jobs\n\n self.inputs_ = inputs\n\n self.map_kwargs = map_kwargs if map_kwargs is not None else dict()\n self.reduce_kwargs = reduce_kwargs if reduce_kwargs is not None else dict()\n\n self._map_func = reduce(maybe_add_argument, [\"job_id\", \"seed\"], map_func)\n self._reduce_func = reduce_func\n
"},{"location":"api/pydvl/parallel/map_reduce/#pydvl.parallel.map_reduce.MapReduceJob.n_jobs","title":"n_jobs property
writable
","text":"n_jobs: int\n
Effective number of jobs according to the used ParallelBackend instance.
"},{"location":"api/pydvl/parallel/map_reduce/#pydvl.parallel.map_reduce.MapReduceJob.__call__","title":"__call__","text":"__call__(seed: Optional[Union[Seed, SeedSequence]] = None) -> R\n
Runs the map-reduce job.
PARAMETER DESCRIPTION seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Union[Seed, SeedSequence]]
DEFAULT: None
RETURNS DESCRIPTION R
The result of the reduce function.
Source code in src/pydvl/parallel/map_reduce.py
def __call__(\n self,\n seed: Optional[Union[Seed, SeedSequence]] = None,\n) -> R:\n \"\"\"\n Runs the map-reduce job.\n\n Args:\n seed: Either an instance of a numpy random number generator or a seed for\n it.\n\n Returns:\n The result of the reduce function.\n \"\"\"\n seed_seq = ensure_seed_sequence(seed)\n\n if hasattr(self.parallel_backend, \"_joblib_backend_name\"):\n backend = getattr(self.parallel_backend, \"_joblib_backend_name\")\n else:\n warnings.warn(\n \"Parallel backend \"\n f\"{self.parallel_backend.__class__.__name__}. \"\n \"should have a `_joblib_backend_name` attribute in order to work \"\n \"property with MapReduceJob. \"\n \"Defaulting to joblib loky backend\"\n )\n backend = \"loky\"\n\n with Parallel(backend=backend, prefer=\"processes\") as parallel:\n chunks = self._chunkify(self.inputs_, n_chunks=self.n_jobs)\n map_results: List[R] = parallel(\n delayed(self._map_func)(\n next_chunk, job_id=j, seed=seed, **self.map_kwargs\n )\n for j, (next_chunk, seed) in enumerate(\n zip(chunks, seed_seq.spawn(len(chunks)))\n )\n )\n\n reduce_results: R = self._reduce_func(map_results, **self.reduce_kwargs)\n return reduce_results\n
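For reproducible runs a seed can be passed to the call; a short sketch reusing the summation example above:
import numpy as np
from pydvl.parallel import MapReduceJob

map_reduce_job: MapReduceJob[np.ndarray, np.ndarray] = MapReduceJob(
    np.arange(5), map_func=np.sum, reduce_func=np.sum, n_jobs=2
)
# identical seeds lead to identical chunk seeds being spawned internally
assert map_reduce_job(seed=42) == map_reduce_job(seed=42)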
"},{"location":"api/pydvl/parallel/backends/","title":"Backends","text":""},{"location":"api/pydvl/parallel/backends/#pydvl.parallel.backends","title":"pydvl.parallel.backends","text":""},{"location":"api/pydvl/parallel/backends/joblib/","title":"Joblib","text":""},{"location":"api/pydvl/parallel/backends/joblib/#pydvl.parallel.backends.joblib","title":"pydvl.parallel.backends.joblib","text":""},{"location":"api/pydvl/parallel/backends/joblib/#pydvl.parallel.backends.joblib.JoblibParallelBackend","title":"JoblibParallelBackend","text":"JoblibParallelBackend(config: ParallelConfig | None = None)\n
Bases: ParallelBackend
Class used to wrap joblib to make it transparent to algorithms.
Example
from pydvl.parallel import JoblibParallelBackend\nparallel_backend = JoblibParallelBackend()\n
Source code in src/pydvl/parallel/backends/joblib.py
@deprecated(\n target=True,\n args_mapping={\"config\": None},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef __init__(self, config: ParallelConfig | None = None) -> None:\n n_jobs: int | None = None\n if config is not None:\n n_jobs = config.n_cpus_local\n self.config = {\n \"n_jobs\": n_jobs,\n }\n
"},{"location":"api/pydvl/parallel/backends/joblib/#pydvl.parallel.backends.joblib.JoblibParallelBackend.executor","title":"executor classmethod
","text":"executor(\n max_workers: int | None = None,\n *,\n config: ParallelConfig | None = None,\n cancel_futures: CancellationPolicy | bool = CancellationPolicy.NONE\n) -> Executor\n
Returns a futures executor for the parallel backend.
Example
from pydvl.parallel import JoblibParallelBackend\nparallel_backend = JoblibParallelBackend()\nwith parallel_backend.executor() as executor:\n executor.submit(...)\n
PARAMETER DESCRIPTION max_workers
Maximum number of parallel workers.
TYPE: int | None
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: ParallelConfig | None
DEFAULT: None
cancel_futures
Policy to use when cancelling futures after exiting an Executor.
TYPE: CancellationPolicy | bool
DEFAULT: NONE
RETURNS DESCRIPTION Executor
Instance of _ReusablePoolExecutor.
Source code in src/pydvl/parallel/backends/joblib.py
@classmethod\ndef executor(\n cls,\n max_workers: int | None = None,\n *,\n config: ParallelConfig | None = None,\n cancel_futures: CancellationPolicy | bool = CancellationPolicy.NONE,\n) -> Executor:\n \"\"\"Returns a futures executor for the parallel backend.\n\n !!! Example\n ``` python\n from pydvl.parallel import JoblibParallelBackend\n parallel_backend = JoblibParallelBackend()\n with parallel_backend.executor() as executor:\n executor.submit(...)\n ```\n\n Args:\n max_workers: Maximum number of parallel workers.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n cancel_futures: Policy to use when cancelling futures\n after exiting an Executor.\n\n Returns:\n Instance of [_ReusablePoolExecutor][joblib.externals.loky.reusable_executor._ReusablePoolExecutor].\n \"\"\"\n if config is not None:\n warnings.warn(\n \"The `JoblibParallelBackend` uses deprecated arguments: \"\n \"`config`. They were deprecated since v0.9.0 \"\n \"and will be removed in v0.10.0.\",\n FutureWarning,\n )\n\n if cancel_futures not in (CancellationPolicy.NONE, False):\n warnings.warn(\n \"Cancellation of futures is not supported by the joblib backend\",\n )\n return cast(Executor, get_reusable_executor(max_workers=max_workers))\n
"},{"location":"api/pydvl/parallel/backends/joblib/#pydvl.parallel.backends.joblib.JoblibParallelBackend.wrap","title":"wrap","text":"wrap(fun: Callable, **kwargs) -> Callable\n
Wraps a function as a joblib delayed.
PARAMETER DESCRIPTION fun
the function to wrap
TYPE: Callable
RETURNS DESCRIPTION Callable
The delayed function.
Source code in src/pydvl/parallel/backends/joblib.py
def wrap(self, fun: Callable, **kwargs) -> Callable:\n \"\"\"Wraps a function as a joblib delayed.\n\n Args:\n fun: the function to wrap\n\n Returns:\n The delayed function.\n \"\"\"\n return delayed(fun) # type: ignore\n
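The wrapped function is a plain joblib delayed, so it can be used directly inside joblib's Parallel; a small sketch:
from joblib import Parallel
from pydvl.parallel import JoblibParallelBackend

def add_one(x):
    return x + 1

parallel_backend = JoblibParallelBackend()
delayed_add_one = parallel_backend.wrap(add_one)      # a plain joblib delayed
results = Parallel(n_jobs=2)(delayed_add_one(i) for i in range(4))
assert results == [1, 2, 3, 4]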
"},{"location":"api/pydvl/parallel/backends/ray/","title":"Ray","text":""},{"location":"api/pydvl/parallel/backends/ray/#pydvl.parallel.backends.ray","title":"pydvl.parallel.backends.ray","text":""},{"location":"api/pydvl/parallel/backends/ray/#pydvl.parallel.backends.ray.RayParallelBackend","title":"RayParallelBackend","text":"RayParallelBackend(config: ParallelConfig | None = None)\n
Bases: ParallelBackend
Class used to wrap ray to make it transparent to algorithms.
Example
import ray\nfrom pydvl.parallel import RayParallelBackend\nray.init()\nparallel_backend = RayParallelBackend()\n
Source code in src/pydvl/parallel/backends/ray.py
@deprecated(\n target=True,\n args_mapping={\"config\": None},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef __init__(self, config: ParallelConfig | None = None) -> None:\n if not ray.is_initialized():\n raise RuntimeError(\n \"Starting from v0.9.0, ray is no longer automatically initialized. \"\n \"Please use `ray.init()` with the desired configuration \"\n \"before using this class.\"\n )\n # Register ray joblib backend\n register_ray()\n
"},{"location":"api/pydvl/parallel/backends/ray/#pydvl.parallel.backends.ray.RayParallelBackend.executor","title":"executor classmethod
","text":"executor(\n max_workers: int | None = None,\n *,\n config: ParallelConfig | None = None,\n cancel_futures: CancellationPolicy | bool = CancellationPolicy.PENDING\n) -> Executor\n
Returns a futures executor for the parallel backend.
Example
import ray\nfrom pydvl.parallel import RayParallelBackend\nray.init()\nparallel_backend = RayParallelBackend()\nwith parallel_backend.executor() as executor:\n executor.submit(...)\n
PARAMETER DESCRIPTION max_workers
Maximum number of parallel workers.
TYPE: int | None
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: ParallelConfig | None
DEFAULT: None
cancel_futures
Policy to use when cancelling futures after exiting an Executor.
TYPE: CancellationPolicy | bool
DEFAULT: PENDING
RETURNS DESCRIPTION Executor
Instance of RayExecutor.
Source code in src/pydvl/parallel/backends/ray.py
@classmethod\ndef executor(\n cls,\n max_workers: int | None = None,\n *,\n config: ParallelConfig | None = None,\n cancel_futures: CancellationPolicy | bool = CancellationPolicy.PENDING,\n) -> Executor:\n \"\"\"Returns a futures executor for the parallel backend.\n\n !!! Example\n ``` python\n import ray\n from pydvl.parallel import RayParallelBackend\n ray.init()\n parallel_backend = RayParallelBackend()\n with parallel_backend.executor() as executor:\n executor.submit(...)\n ```\n\n Args:\n max_workers: Maximum number of parallel workers.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n cancel_futures: Policy to use when cancelling futures\n after exiting an Executor.\n\n Returns:\n Instance of [RayExecutor][pydvl.parallel.futures.ray.RayExecutor].\n \"\"\"\n # Imported here to avoid circular import errors\n from pydvl.parallel.futures.ray import RayExecutor\n\n if config is not None:\n warnings.warn(\n \"The `RayParallelBackend` uses deprecated arguments: \"\n \"`config`. They were deprecated since v0.9.0 \"\n \"and will be removed in v0.10.0.\",\n FutureWarning,\n )\n\n return RayExecutor(max_workers, cancel_futures=cancel_futures) # type: ignore\n
"},{"location":"api/pydvl/parallel/backends/ray/#pydvl.parallel.backends.ray.RayParallelBackend.wrap","title":"wrap","text":"wrap(fun: Callable, **kwargs) -> Callable\n
Wraps a function as a ray remote.
PARAMETER DESCRIPTION fun
the function to wrap
TYPE: Callable
kwargs
keyword arguments to pass to @ray.remote
DEFAULT: {}
RETURNS DESCRIPTION Callable
The .remote
method of the ray RemoteFunction
.
Source code in src/pydvl/parallel/backends/ray.py
def wrap(self, fun: Callable, **kwargs) -> Callable:\n \"\"\"Wraps a function as a ray remote.\n\n Args:\n fun: the function to wrap\n kwargs: keyword arguments to pass to @ray.remote\n\n Returns:\n The `.remote` method of the ray `RemoteFunction`.\n \"\"\"\n if len(kwargs) > 0:\n return ray.remote(**kwargs)(fun).remote # type: ignore\n return ray.remote(fun).remote # type: ignore\n
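A hedged sketch, assuming a local Ray instance has been initialized; keyword arguments are forwarded to ray.remote:
import ray
from pydvl.parallel import RayParallelBackend

def add_one(x):
    return x + 1

ray.init()
parallel_backend = RayParallelBackend()
remote_add_one = parallel_backend.wrap(add_one, num_cpus=1)   # returns the .remote method
object_ref = remote_add_one(41)                               # launches a Ray task
assert ray.get(object_ref) == 42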
"},{"location":"api/pydvl/parallel/futures/","title":"Futures","text":""},{"location":"api/pydvl/parallel/futures/#pydvl.parallel.futures","title":"pydvl.parallel.futures","text":""},{"location":"api/pydvl/parallel/futures/#pydvl.parallel.futures.init_executor","title":"init_executor","text":"init_executor(\n max_workers: Optional[int] = None,\n config: Optional[ParallelConfig] = None,\n **kwargs\n) -> Generator[Executor, None, None]\n
Initializes a futures executor for the given parallel configuration.
PARAMETER DESCRIPTION max_workers
Maximum number of concurrent tasks.
TYPE: Optional[int]
DEFAULT: None
config
instance of ParallelConfig with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
kwargs
Other optional parameter that will be passed to the executor.
DEFAULT: {}
Examples from pydvl.parallel.futures import init_executor, ParallelConfig\n\nconfig = ParallelConfig(backend=\"ray\")\nwith init_executor(max_workers=1, config=config) as executor:\n future = executor.submit(lambda x: x + 1, 1)\n result = future.result()\nassert result == 2\n
from pydvl.parallel.futures import init_executor\nwith init_executor() as executor:\n results = list(executor.map(lambda x: x + 1, range(5)))\nassert results == [1, 2, 3, 4, 5]\n
Source code in src/pydvl/parallel/futures/__init__.py
@contextmanager\n@deprecated(\n target=None,\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef init_executor(\n max_workers: Optional[int] = None,\n config: Optional[ParallelConfig] = None,\n **kwargs,\n) -> Generator[Executor, None, None]:\n \"\"\"Initializes a futures executor for the given parallel configuration.\n\n Args:\n max_workers: Maximum number of concurrent tasks.\n config: instance of [ParallelConfig][pydvl.utils.config.ParallelConfig]\n with cluster address, number of cpus, etc.\n kwargs: Other optional parameter that will be passed to the executor.\n\n\n ??? Examples\n ``` python\n from pydvl.parallel.futures import init_executor, ParallelConfig\n\n config = ParallelConfig(backend=\"ray\")\n with init_executor(max_workers=1, config=config) as executor:\n future = executor.submit(lambda x: x + 1, 1)\n result = future.result()\n assert result == 2\n ```\n ``` python\n from pydvl.parallel.futures import init_executor\n with init_executor() as executor:\n results = list(executor.map(lambda x: x + 1, range(5)))\n assert results == [1, 2, 3, 4, 5]\n ```\n \"\"\"\n\n if config is None:\n config = ParallelConfig()\n\n try:\n cls = ParallelBackend.BACKENDS[config.backend]\n with cls.executor(max_workers=max_workers, config=config, **kwargs) as e:\n yield e\n except KeyError:\n raise NotImplementedError(f\"Unexpected parallel backend {config.backend}\")\n
"},{"location":"api/pydvl/parallel/futures/ray/","title":"Ray","text":""},{"location":"api/pydvl/parallel/futures/ray/#pydvl.parallel.futures.ray","title":"pydvl.parallel.futures.ray","text":""},{"location":"api/pydvl/parallel/futures/ray/#pydvl.parallel.futures.ray.RayExecutor","title":"RayExecutor","text":"RayExecutor(\n max_workers: Optional[int] = None,\n *,\n config: Optional[ParallelConfig] = None,\n cancel_futures: Union[CancellationPolicy, bool] = CancellationPolicy.ALL\n)\n
Bases: Executor
Asynchronous executor using Ray that implements the concurrent.futures API.
PARAMETER DESCRIPTION max_workers
Maximum number of concurrent tasks. Each task can itself request any number of vCPUs. You must ensure the product of this value and the n_cpus_per_job parameter passed to submit() does not exceed available cluster resources. If set to None
, it will default to the total number of vCPUs in the ray cluster.
TYPE: Optional[int]
DEFAULT: None
cancel_futures
Select which futures will be cancelled when exiting this context manager. Pending
cancels all pending futures, but not running ones, as done by concurrent.futures.ProcessPoolExecutor. All
(the default here) cancels all pending and running futures, and None
does not cancel any. See CancellationPolicy
TYPE: Union[CancellationPolicy, bool]
DEFAULT: ALL
Source code in src/pydvl/parallel/futures/ray.py
@deprecated(\n target=True,\n args_mapping={\"config\": None},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef __init__(\n self,\n max_workers: Optional[int] = None,\n *,\n config: Optional[ParallelConfig] = None,\n cancel_futures: Union[CancellationPolicy, bool] = CancellationPolicy.ALL,\n):\n if max_workers is not None:\n if max_workers <= 0:\n raise ValueError(\"max_workers must be greater than 0\")\n max_workers = max_workers\n\n if isinstance(cancel_futures, CancellationPolicy):\n self._cancel_futures = cancel_futures\n else:\n self._cancel_futures = (\n CancellationPolicy.PENDING\n if cancel_futures\n else CancellationPolicy.NONE\n )\n\n if not ray.is_initialized():\n raise RuntimeError(\n \"Starting from v0.9.0, ray is no longer automatically initialized. \"\n \"Please use `ray.init()` with the desired configuration \"\n \"before using this class.\"\n )\n\n self._max_workers = max_workers\n if self._max_workers is None:\n self._max_workers = int(ray._private.state.cluster_resources()[\"CPU\"])\n\n self._shutdown = False\n self._shutdown_lock = threading.Lock()\n self._queue_lock = threading.Lock()\n self._work_queue: \"queue.Queue[Optional[_WorkItem]]\" = queue.Queue(\n maxsize=self._max_workers\n )\n self._pending_queue: \"queue.SimpleQueue[Optional[_WorkItem]]\" = (\n queue.SimpleQueue()\n )\n\n # Work Item Manager Thread\n self._work_item_manager_thread: Optional[_WorkItemManagerThread] = None\n
"},{"location":"api/pydvl/parallel/futures/ray/#pydvl.parallel.futures.ray.RayExecutor.submit","title":"submit","text":"submit(fn: Callable[..., T], *args, **kwargs) -> Future[T]\n
Submits a callable to be executed with the given arguments.
Schedules the callable to be executed as fn(*args, **kwargs) and returns a Future instance representing the execution of the callable.
PARAMETER DESCRIPTION fn
Callable.
TYPE: Callable[..., T]
args
Positional arguments that will be passed to fn
.
DEFAULT: ()
kwargs
Keyword arguments that will be passed to fn
. It can also optionally contain options for the ray remote function as a dictionary as the keyword argument remote_function_options
.
DEFAULT: {}
RETURNS DESCRIPTION Future[T]
A Future representing the given call.
RAISES DESCRIPTION RuntimeError
If a task is submitted after the executor has been shut down.
Source code in src/pydvl/parallel/futures/ray.py
def submit(self, fn: Callable[..., T], *args, **kwargs) -> \"Future[T]\":\n r\"\"\"Submits a callable to be executed with the given arguments.\n\n Schedules the callable to be executed as fn(\\*args, \\**kwargs)\n and returns a Future instance representing the execution of the callable.\n\n Args:\n fn: Callable.\n args: Positional arguments that will be passed to `fn`.\n kwargs: Keyword arguments that will be passed to `fn`.\n It can also optionally contain options for the ray remote function\n as a dictionary as the keyword argument `remote_function_options`.\n Returns:\n A Future representing the given call.\n\n Raises:\n RuntimeError: If a task is submitted after the executor has been shut down.\n \"\"\"\n with self._shutdown_lock:\n logger.debug(\"executor acquired shutdown lock\")\n if self._shutdown:\n raise RuntimeError(\"cannot schedule new futures after shutdown\")\n\n logging.debug(\"Creating future and putting work item in work queue\")\n future: \"Future[T]\" = Future()\n remote_function_options = kwargs.pop(\"remote_function_options\", None)\n w = _WorkItem(\n future,\n fn,\n args,\n kwargs,\n remote_function_options=remote_function_options,\n )\n self._put_work_item_in_queue(w)\n # We delay starting the thread until the first call to submit\n self._start_work_item_manager_thread()\n return future\n
"},{"location":"api/pydvl/parallel/futures/ray/#pydvl.parallel.futures.ray.RayExecutor.shutdown","title":"shutdown","text":"shutdown(wait: bool = True, *, cancel_futures: Optional[bool] = None) -> None\n
Clean up the resources associated with the Executor.
This method tries to mimic the behaviour of Executor.shutdown while allowing one more value for cancel_futures
which instructs it to use the CancellationPolicy defined upon construction.
PARAMETER DESCRIPTION wait
Whether to wait for pending futures to finish.
TYPE: bool
DEFAULT: True
cancel_futures
Overrides the executor's default policy for cancelling futures on exit. If True
, all pending futures are cancelled, and if False
, no futures are cancelled. If None
(default), the executor's policy set at initialization is used.
TYPE: Optional[bool]
DEFAULT: None
Source code in src/pydvl/parallel/futures/ray.py
def shutdown(\n self, wait: bool = True, *, cancel_futures: Optional[bool] = None\n) -> None:\n \"\"\"Clean up the resources associated with the Executor.\n\n This method tries to mimic the behaviour of\n [Executor.shutdown][concurrent.futures.Executor.shutdown]\n while allowing one more value for ``cancel_futures`` which instructs it\n to use the [CancellationPolicy][pydvl.parallel.backend.CancellationPolicy]\n defined upon construction.\n\n Args:\n wait: Whether to wait for pending futures to finish.\n cancel_futures: Overrides the executor's default policy for\n cancelling futures on exit. If ``True``, all pending futures are\n cancelled, and if ``False``, no futures are cancelled. If ``None``\n (default), the executor's policy set at initialization is used.\n \"\"\"\n logger.debug(\"executor shutting down\")\n with self._shutdown_lock:\n logger.debug(\"executor acquired shutdown lock\")\n self._shutdown = True\n self._cancel_futures = {\n None: self._cancel_futures,\n True: CancellationPolicy.PENDING,\n False: CancellationPolicy.NONE,\n }[cancel_futures]\n\n if wait:\n logger.debug(\"executor waiting for futures to finish\")\n if self._work_item_manager_thread is not None:\n # Putting None in the queue to signal\n # to work item manager thread that we are shutting down\n self._put_work_item_in_queue(None)\n logger.debug(\n \"executor waiting for work item manager thread to terminate\"\n )\n self._work_item_manager_thread.join()\n # To reduce the risk of opening too many files, remove references to\n # objects that use file descriptors.\n self._work_item_manager_thread = None\n del self._work_queue\n del self._pending_queue\n
"},{"location":"api/pydvl/parallel/futures/ray/#pydvl.parallel.futures.ray.RayExecutor.__exit__","title":"__exit__","text":"__exit__(exc_type, exc_val, exc_tb)\n
Exit the runtime context related to the RayExecutor object.
Source code in src/pydvl/parallel/futures/ray.py
def __exit__(self, exc_type, exc_val, exc_tb):\n \"\"\"Exit the runtime context related to the RayExecutor object.\"\"\"\n self.shutdown()\n return False\n
"},{"location":"api/pydvl/reporting/","title":"Reporting","text":""},{"location":"api/pydvl/reporting/#pydvl.reporting","title":"pydvl.reporting","text":""},{"location":"api/pydvl/reporting/plots/","title":"Plots","text":""},{"location":"api/pydvl/reporting/plots/#pydvl.reporting.plots","title":"pydvl.reporting.plots","text":""},{"location":"api/pydvl/reporting/plots/#pydvl.reporting.plots.shaded_mean_std","title":"shaded_mean_std","text":"shaded_mean_std(\n data: ndarray,\n abscissa: Optional[Sequence[Any]] = None,\n num_std: float = 1.0,\n mean_color: Optional[str] = \"dodgerblue\",\n shade_color: Optional[str] = \"lightblue\",\n title: Optional[str] = None,\n xlabel: Optional[str] = None,\n ylabel: Optional[str] = None,\n ax: Optional[Axes] = None,\n **kwargs\n) -> Axes\n
The usual mean \\(\\pm\\) std deviation plot to aggregate runs of experiments.
Deprecation notice
This function is bogus and will be removed in the future in favour of properly computed confidence intervals.
PARAMETER DESCRIPTION data
axis 0 is to be aggregated on (e.g. runs) and axis 1 is the data for each run.
TYPE: ndarray
abscissa
values for the x-axis. Leave empty to use increasing integers.
TYPE: Optional[Sequence[Any]]
DEFAULT: None
num_std
number of standard deviations to shade around the mean.
TYPE: float
DEFAULT: 1.0
mean_color
color for the mean
TYPE: Optional[str]
DEFAULT: 'dodgerblue'
shade_color
color for the shaded region
TYPE: Optional[str]
DEFAULT: 'lightblue'
title
Title text. To use mathematics, use LaTeX notation.
TYPE: Optional[str]
DEFAULT: None
xlabel
Text for the horizontal axis.
TYPE: Optional[str]
DEFAULT: None
ylabel
Text for the vertical axis
TYPE: Optional[str]
DEFAULT: None
ax
If passed, axes object into which to insert the figure. Otherwise, a new figure is created and returned.
TYPE: Optional[Axes]
DEFAULT: None
kwargs
these are forwarded to the ax.plot() call for the mean.
DEFAULT: {}
RETURNS DESCRIPTION Axes
The axes used (or created)
Source code in src/pydvl/reporting/plots.py
@deprecated(target=None, deprecated_in=\"0.7.1\", remove_in=\"0.9.0\")\ndef shaded_mean_std(\n data: np.ndarray,\n abscissa: Optional[Sequence[Any]] = None,\n num_std: float = 1.0,\n mean_color: Optional[str] = \"dodgerblue\",\n shade_color: Optional[str] = \"lightblue\",\n title: Optional[str] = None,\n xlabel: Optional[str] = None,\n ylabel: Optional[str] = None,\n ax: Optional[Axes] = None,\n **kwargs,\n) -> Axes:\n r\"\"\"The usual mean \\(\\pm\\) std deviation plot to aggregate runs of\n experiments.\n\n !!! warning \"Deprecation notice\"\n This function is bogus and will be removed in the future in favour of\n properly computed confidence intervals.\n\n Args:\n data: axis 0 is to be aggregated on (e.g. runs) and axis 1 is the\n data for each run.\n abscissa: values for the x-axis. Leave empty to use increasing integers.\n num_std: number of standard deviations to shade around the mean.\n mean_color: color for the mean\n shade_color: color for the shaded region\n title: Title text. To use mathematics, use LaTeX notation.\n xlabel: Text for the horizontal axis.\n ylabel: Text for the vertical axis\n ax: If passed, axes object into which to insert the figure. Otherwise,\n a new figure is created and returned\n kwargs: these are forwarded to the ax.plot() call for the mean.\n\n Returns:\n The axes used (or created)\n \"\"\"\n assert len(data.shape) == 2\n mean = data.mean(axis=0)\n std = num_std * data.std(axis=0)\n\n if ax is None:\n fig, ax = plt.subplots()\n if abscissa is None:\n abscissa = list(range(data.shape[1]))\n\n ax.fill_between(abscissa, mean - std, mean + std, alpha=0.3, color=shade_color)\n ax.plot(abscissa, mean, color=mean_color, **kwargs)\n\n ax.set_title(title)\n ax.set_xlabel(xlabel)\n ax.set_ylabel(ylabel)\n\n return ax\n
"},{"location":"api/pydvl/reporting/plots/#pydvl.reporting.plots.plot_ci_array","title":"plot_ci_array","text":"plot_ci_array(\n data: NDArray,\n level: float,\n type: Literal[\"normal\", \"t\", \"auto\"] = \"normal\",\n abscissa: Optional[Sequence[str]] = None,\n mean_color: Optional[str] = \"dodgerblue\",\n shade_color: Optional[str] = \"lightblue\",\n ax: Optional[Axes] = None,\n **kwargs\n) -> Axes\n
Plot values and a confidence interval from a 2D array.
Supported intervals are based on the normal and the t distributions.
PARAMETER DESCRIPTION data
A 2D array with M different values for each of the N indices.
TYPE: NDArray
level
The confidence level.
TYPE: float
type
The type of confidence interval to use.
TYPE: Literal['normal', 't', 'auto']
DEFAULT: 'normal'
abscissa
The values for the x-axis. Leave empty to use increasing integers.
TYPE: Optional[Sequence[str]]
DEFAULT: None
mean_color
The color of the mean line.
TYPE: Optional[str]
DEFAULT: 'dodgerblue'
shade_color
The color of the confidence interval.
TYPE: Optional[str]
DEFAULT: 'lightblue'
ax
If passed, axes object into which to insert the figure. Otherwise, a new figure is created and the axes returned.
TYPE: Optional[Axes]
DEFAULT: None
**kwargs
Additional arguments to pass to the plot function.
DEFAULT: {}
RETURNS DESCRIPTION Axes
The matplotlib axes.
Source code in src/pydvl/reporting/plots.py
def plot_ci_array(\n data: NDArray,\n level: float,\n type: Literal[\"normal\", \"t\", \"auto\"] = \"normal\",\n abscissa: Optional[Sequence[str]] = None,\n mean_color: Optional[str] = \"dodgerblue\",\n shade_color: Optional[str] = \"lightblue\",\n ax: Optional[plt.Axes] = None,\n **kwargs,\n) -> plt.Axes:\n \"\"\"Plot values and a confidence interval from a 2D array.\n\n Supported intervals are based on the normal and the t distributions.\n\n Args:\n data: A 2D array with M different values for each of the N indices.\n level: The confidence level.\n type: The type of confidence interval to use.\n abscissa: The values for the x-axis. Leave empty to use increasing\n integers.\n mean_color: The color of the mean line.\n shade_color: The color of the confidence interval.\n ax: If passed, axes object into which to insert the figure. Otherwise,\n a new figure is created and the axes returned.\n **kwargs: Additional arguments to pass to the plot function.\n\n Returns:\n The matplotlib axes.\n \"\"\"\n\n m, n = data.shape\n\n means = np.mean(data, axis=0)\n variances = np.var(data, axis=0, ddof=1)\n\n dummy = ValuationResult[np.int_, np.object_](\n algorithm=\"dummy\",\n values=means,\n variances=variances,\n counts=np.ones_like(means, dtype=np.int_) * m,\n indices=np.arange(n),\n data_names=np.array(abscissa, dtype=str)\n if abscissa is not None\n else np.arange(n, dtype=str),\n )\n\n return plot_ci_values(\n dummy,\n level=level,\n type=type,\n mean_color=mean_color,\n shade_color=shade_color,\n ax=ax,\n **kwargs,\n )\n
"},{"location":"api/pydvl/reporting/plots/#pydvl.reporting.plots.plot_ci_values","title":"plot_ci_values","text":"plot_ci_values(\n values: ValuationResult,\n level: float,\n type: Literal[\"normal\", \"t\", \"auto\"] = \"auto\",\n abscissa: Optional[Sequence[str]] = None,\n mean_color: Optional[str] = \"dodgerblue\",\n shade_color: Optional[str] = \"lightblue\",\n ax: Optional[Axes] = None,\n **kwargs\n)\n
Plot values and a confidence interval.
Uses values.data_names
for the x-axis.
Supported intervals are based on the normal and the t distributions.
PARAMETER DESCRIPTION values
The valuation result.
TYPE: ValuationResult
level
The confidence level.
TYPE: float
type
The type of confidence interval to use. If \"auto\", uses \"norm\" if the minimum number of updates for all indices is greater than 30, otherwise uses \"t\".
TYPE: Literal['normal', 't', 'auto']
DEFAULT: 'auto'
abscissa
The values for the x-axis. Leave empty to use increasing integers.
TYPE: Optional[Sequence[str]]
DEFAULT: None
mean_color
The color of the mean line.
TYPE: Optional[str]
DEFAULT: 'dodgerblue'
shade_color
The color of the confidence interval.
TYPE: Optional[str]
DEFAULT: 'lightblue'
ax
If passed, axes object into which to insert the figure. Otherwise, a new figure is created and the axes returned.
TYPE: Optional[Axes]
DEFAULT: None
**kwargs
Additional arguments to pass to the plot function.
DEFAULT: {}
RETURNS DESCRIPTION The matplotlib axes.
Source code in src/pydvl/reporting/plots.py
def plot_ci_values(\n values: ValuationResult,\n level: float,\n type: Literal[\"normal\", \"t\", \"auto\"] = \"auto\",\n abscissa: Optional[Sequence[str]] = None,\n mean_color: Optional[str] = \"dodgerblue\",\n shade_color: Optional[str] = \"lightblue\",\n ax: Optional[plt.Axes] = None,\n **kwargs,\n):\n \"\"\"Plot values and a confidence interval.\n\n Uses `values.data_names` for the x-axis.\n\n Supported intervals are based on the normal and the t distributions.\n\n Args:\n values: The valuation result.\n level: The confidence level.\n type: The type of confidence interval to use. If \"auto\", uses \"norm\" if\n the minimum number of updates for all indices is greater than 30,\n otherwise uses \"t\".\n abscissa: The values for the x-axis. Leave empty to use increasing\n integers.\n mean_color: The color of the mean line.\n shade_color: The color of the confidence interval.\n ax: If passed, axes object into which to insert the figure. Otherwise,\n a new figure is created and the axes returned.\n **kwargs: Additional arguments to pass to the plot function.\n\n Returns:\n The matplotlib axes.\n \"\"\"\n\n ppfs = {\n \"normal\": norm.ppf,\n \"t\": partial(t.ppf, df=values.counts - 1),\n \"auto\": norm.ppf\n if np.min(values.counts) > 30\n else partial(t.ppf, df=values.counts - 1),\n }\n\n try:\n score = ppfs[type](1 - level / 2)\n except KeyError:\n raise ValueError(\n f\"Unknown confidence interval type requested: {type}.\"\n ) from None\n\n if abscissa is None:\n abscissa = [str(i) for i, _ in enumerate(values)]\n bound = score * values.stderr\n\n if ax is None:\n fig, ax = plt.subplots()\n\n ax.fill_between(\n abscissa,\n values.values - bound,\n values.values + bound,\n alpha=0.3,\n color=shade_color,\n )\n ax.plot(abscissa, values.values, color=mean_color, **kwargs)\n return ax\n
"},{"location":"api/pydvl/reporting/plots/#pydvl.reporting.plots.spearman_correlation","title":"spearman_correlation","text":"spearman_correlation(vv: List[OrderedDict], num_values: int, pvalue: float)\n
Simple matrix plots with spearman correlation for each pair in vv.
PARAMETER DESCRIPTION vv
list of OrderedDicts with index: value. Spearman correlation is computed for the keys.
TYPE: List[OrderedDict]
num_values
Use only these many values from the data (from the start of the OrderedDicts)
TYPE: int
pvalue
correlation coefficients for which the p-value is below the threshold pvalue/len(vv)
will be discarded.
TYPE: float
Source code in src/pydvl/reporting/plots.py
def spearman_correlation(vv: List[OrderedDict], num_values: int, pvalue: float):\n \"\"\"Simple matrix plots with spearman correlation for each pair in vv.\n\n Args:\n vv: list of OrderedDicts with index: value. Spearman correlation\n is computed for the keys.\n num_values: Use only these many values from the data (from the start\n of the OrderedDicts)\n pvalue: correlation coefficients for which the p-value is below the\n threshold `pvalue/len(vv)` will be discarded.\n \"\"\"\n r: np.ndarray = np.ndarray((len(vv), len(vv)))\n p: np.ndarray = np.ndarray((len(vv), len(vv)))\n for i, a in enumerate(vv):\n for j, b in enumerate(vv):\n from scipy.stats._stats_py import SpearmanrResult\n\n spearman: SpearmanrResult = sp.stats.spearmanr(\n list(a.keys())[:num_values], list(b.keys())[:num_values]\n )\n r[i][j] = (\n spearman.correlation if spearman.pvalue < pvalue / len(vv) else np.nan\n ) # Bonferroni correction\n p[i][j] = spearman.pvalue\n fig, axs = plt.subplots(1, 2, figsize=(16, 7))\n plot1 = axs[0].matshow(r, vmin=-1, vmax=1)\n axs[0].set_title(f\"Spearman correlation (top {num_values} values)\")\n axs[0].set_xlabel(\"Runs\")\n axs[0].set_ylabel(\"Runs\")\n fig.colorbar(plot1, ax=axs[0])\n plot2 = axs[1].matshow(p, vmin=0, vmax=1)\n axs[1].set_title(\"p-value\")\n axs[1].set_xlabel(\"Runs\")\n axs[1].set_ylabel(\"Runs\")\n fig.colorbar(plot2, ax=axs[1])\n\n return fig\n
"},{"location":"api/pydvl/reporting/plots/#pydvl.reporting.plots.plot_shapley","title":"plot_shapley","text":"plot_shapley(\n df: DataFrame,\n *,\n level: float = 0.05,\n ax: Optional[Axes] = None,\n title: Optional[str] = None,\n xlabel: Optional[str] = None,\n ylabel: Optional[str] = None,\n prefix: Optional[str] = \"data_value\"\n) -> Axes\n
Plots the Shapley values, as returned from compute_shapley_values, with error bars corresponding to an \(\alpha\)-level Normal confidence interval.
PARAMETER DESCRIPTION df
dataframe with the shapley values
TYPE: DataFrame
level
confidence level for the error bars
TYPE: float
DEFAULT: 0.05
ax
axes to plot on or None if a new subplots should be created
TYPE: Optional[Axes]
DEFAULT: None
title
string, title of the plot
TYPE: Optional[str]
DEFAULT: None
xlabel
string, x label of the plot
TYPE: Optional[str]
DEFAULT: None
ylabel
string, y label of the plot
TYPE: Optional[str]
DEFAULT: None
RETURNS DESCRIPTION Axes
The axes created or used
Source code in src/pydvl/reporting/plots.py
def plot_shapley(\n df: pd.DataFrame,\n *,\n level: float = 0.05,\n ax: Optional[plt.Axes] = None,\n title: Optional[str] = None,\n xlabel: Optional[str] = None,\n ylabel: Optional[str] = None,\n prefix: Optional[str] = \"data_value\",\n) -> plt.Axes:\n r\"\"\"Plots the shapley values, as returned from\n [compute_shapley_values][pydvl.value.shapley.common.compute_shapley_values],\n with error bars corresponding to an $\\alpha$-level Normal confidence\n interval.\n\n Args:\n df: dataframe with the shapley values\n level: confidence level for the error bars\n ax: axes to plot on or None if a new subplots should be created\n title: string, title of the plot\n xlabel: string, x label of the plot\n ylabel: string, y label of the plot\n\n Returns:\n The axes created or used\n \"\"\"\n if ax is None:\n _, ax = plt.subplots()\n\n yerr = norm.ppf(1 - level / 2) * df[f\"{prefix}_stderr\"]\n\n ax.errorbar(x=df.index, y=df[prefix], yerr=yerr, fmt=\"o\", capsize=6)\n ax.set_xlabel(xlabel)\n ax.set_ylabel(ylabel)\n ax.set_title(title)\n plt.xticks(rotation=60)\n return ax\n
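A minimal sketch with a hand-made dataframe; the column names follow the default prefix "data_value" expected by the function:
import pandas as pd
from pydvl.reporting.plots import plot_shapley

df = pd.DataFrame(
    {"data_value": [0.10, 0.32, -0.05], "data_value_stderr": [0.02, 0.05, 0.01]},
    index=["a", "b", "c"],
)
ax = plot_shapley(df, level=0.05, title="Shapley values", xlabel="sample", ylabel="value")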
"},{"location":"api/pydvl/reporting/plots/#pydvl.reporting.plots.plot_influence_distribution","title":"plot_influence_distribution","text":"plot_influence_distribution(\n influences: NDArray[float_], index: int, title_extra: str = \"\"\n) -> Axes\n
Plots the histogram of the influence that all samples in the training set have over a single sample index.
PARAMETER DESCRIPTION influences
array of influences (training samples x test samples)
TYPE: NDArray[float_]
index
Index of the test sample for which the influences will be plotted.
TYPE: int
title_extra
Additional text that will be appended to the title.
TYPE: str
DEFAULT: ''
Source code in src/pydvl/reporting/plots.py
def plot_influence_distribution(\n influences: NDArray[np.float_], index: int, title_extra: str = \"\"\n) -> plt.Axes:\n \"\"\"Plots the histogram of the influence that all samples in the training set\n have over a single sample index.\n\n Args:\n influences: array of influences (training samples x test samples)\n index: Index of the test sample for which the influences\n will be plotted.\n title_extra: Additional text that will be appended to the title.\n \"\"\"\n _, ax = plt.subplots()\n ax.hist(influences[:, index], alpha=0.7)\n ax.set_xlabel(\"Influence values\")\n ax.set_ylabel(\"Number of samples\")\n ax.set_title(f\"Distribution of influences {title_extra}\")\n return ax\n
"},{"location":"api/pydvl/reporting/plots/#pydvl.reporting.plots.plot_influence_distribution_by_label","title":"plot_influence_distribution_by_label","text":"plot_influence_distribution_by_label(\n influences: NDArray[float_], labels: NDArray[float_], title_extra: str = \"\"\n)\n
Plots the histogram of the influence that all samples in the training set have over a single sample index, separated by labels.
PARAMETER DESCRIPTION influences
array of influences (training samples x test samples)
TYPE: NDArray[float_]
labels
labels for the training set.
TYPE: NDArray[float_]
title_extra
Additional text that will be appended to the title.
TYPE: str
DEFAULT: ''
Source code in src/pydvl/reporting/plots.py
def plot_influence_distribution_by_label(\n influences: NDArray[np.float_], labels: NDArray[np.float_], title_extra: str = \"\"\n):\n \"\"\"Plots the histogram of the influence that all samples in the training set\n have over a single sample index, separated by labels.\n\n Args:\n influences: array of influences (training samples x test samples)\n labels: labels for the training set.\n title_extra: Additional text that will be appended to the title.\n \"\"\"\n _, ax = plt.subplots()\n unique_labels = np.unique(labels)\n for label in unique_labels:\n ax.hist(influences[labels == label], label=label, alpha=0.7)\n ax.set_xlabel(\"Influence values\")\n ax.set_ylabel(\"Number of samples\")\n ax.set_title(f\"Distribution of influences {title_extra}\")\n ax.legend()\n plt.show()\n
"},{"location":"api/pydvl/reporting/scores/","title":"Scores","text":""},{"location":"api/pydvl/reporting/scores/#pydvl.reporting.scores","title":"pydvl.reporting.scores","text":""},{"location":"api/pydvl/reporting/scores/#pydvl.reporting.scores.compute_removal_score","title":"compute_removal_score","text":"compute_removal_score(\n u: Utility,\n values: ValuationResult,\n percentages: Union[NDArray[float_], Iterable[float]],\n *,\n remove_best: bool = False,\n progress: bool = False\n) -> Dict[float, float]\n
Fits model and computes score on the test set after incrementally removing a percentage of data points from the training set, based on their values.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
values
Data values of data instances in the training set.
TYPE: ValuationResult
percentages
Sequence of removal percentages.
TYPE: Union[NDArray[float_], Iterable[float]]
remove_best
If True, removes data points in order of decreasing valuation.
TYPE: bool
DEFAULT: False
progress
If True, display a progress bar.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION Dict[float, float]
Dictionary that maps the percentages to their respective scores.
Source code in src/pydvl/reporting/scores.py
def compute_removal_score(\n u: Utility,\n values: ValuationResult,\n percentages: Union[NDArray[np.float_], Iterable[float]],\n *,\n remove_best: bool = False,\n progress: bool = False,\n) -> Dict[float, float]:\n r\"\"\"Fits model and computes score on the test set after incrementally removing\n a percentage of data points from the training set, based on their values.\n\n Args:\n u: Utility object with model, data, and scoring function.\n values: Data values of data instances in the training set.\n percentages: Sequence of removal percentages.\n remove_best: If True, removes data points in order of decreasing valuation.\n progress: If True, display a progress bar.\n\n Returns:\n Dictionary that maps the percentages to their respective scores.\n \"\"\"\n # Sanity checks\n if np.any([x >= 1.0 or x < 0.0 for x in percentages]):\n raise ValueError(\"All percentages should be in the range [0.0, 1.0)\")\n\n if len(values) != len(u.data.indices):\n raise ValueError(\n f\"The number of values, {len(values) }, should be equal to the number of data indices, {len(u.data.indices)}\"\n )\n\n scores = {}\n\n # We sort in descending order if we want to remove the best values\n values.sort(reverse=remove_best)\n\n for pct in tqdm(percentages, disable=not progress, desc=\"Removal Scores\"):\n n_removal = int(pct * len(u.data))\n indices = values.indices[n_removal:]\n score = u(indices)\n scores[pct] = score\n return scores\n
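A hedged usage sketch; u (a Utility) and values (a ValuationResult) are placeholders assumed to have been computed elsewhere, e.g. with one of the Shapley methods:
import numpy as np
from pydvl.reporting.scores import compute_removal_score

# u: Utility and values: ValuationResult are placeholders defined elsewhere
percentages = np.linspace(0.0, 0.9, 10)   # all values must lie in [0.0, 1.0)
scores = compute_removal_score(u, values, percentages, remove_best=True, progress=True)
# scores maps each removal percentage to the test score of the re-fitted model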
"},{"location":"api/pydvl/utils/","title":"Utils","text":""},{"location":"api/pydvl/utils/#pydvl.utils","title":"pydvl.utils","text":""},{"location":"api/pydvl/utils/config/","title":"Config","text":""},{"location":"api/pydvl/utils/config/#pydvl.utils.config","title":"pydvl.utils.config","text":""},{"location":"api/pydvl/utils/config/#pydvl.utils.config.ParallelConfig","title":"ParallelConfig dataclass
","text":"ParallelConfig(\n backend: Literal[\"joblib\", \"ray\"] = \"joblib\",\n address: Optional[Union[str, Tuple[str, int]]] = None,\n n_cpus_local: Optional[int] = None,\n logging_level: Optional[int] = None,\n wait_timeout: float = 1.0,\n)\n
Configuration for parallel computation backend.
PARAMETER DESCRIPTION backend
Type of backend to use. Defaults to 'joblib'
TYPE: Literal['joblib', 'ray']
DEFAULT: 'joblib'
address
(DEPRECATED) Address of existing remote or local cluster to use.
TYPE: Optional[Union[str, Tuple[str, int]]]
DEFAULT: None
n_cpus_local
(DEPRECATED) Number of CPUs to use when creating a local ray cluster. This has no effect when using an existing ray cluster.
TYPE: Optional[int]
DEFAULT: None
logging_level
(DEPRECATED) Logging level for the parallel backend's worker.
TYPE: Optional[int]
DEFAULT: None
wait_timeout
(DEPRECATED) Timeout in seconds for waiting on futures.
TYPE: float
DEFAULT: 1.0
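Most of the fields above are deprecated; a minimal construction sketch selecting only the backend:
Example >>> from pydvl.utils.config import ParallelConfig\n>>> config = ParallelConfig(backend='joblib')\n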
"},{"location":"api/pydvl/utils/config/#pydvl.utils.config.CachedFuncConfig","title":"CachedFuncConfig dataclass
","text":"CachedFuncConfig(\n hash_prefix: Optional[str] = None,\n ignore_args: Collection[str] = list(),\n time_threshold: float = 0.3,\n allow_repeated_evaluations: bool = False,\n rtol_stderr: float = 0.1,\n min_repetitions: int = 3,\n)\n
Configuration for cached functions and methods, providing memoization of function calls.
Instances of this class are typically used as arguments for the construction of a Utility.
PARAMETER DESCRIPTION hash_prefix
Optional string prefix that will be prepended to the cache key. This can be provided in order to guarantee cache reuse across runs.
TYPE: Optional[str]
DEFAULT: None
ignore_args
Do not take these keyword arguments into account when hashing the wrapped function to compute its cache key. This allows sharing the cache among different jobs for the same experiment run if the callable happens to have "nuisance" parameters like job_id
which do not affect the result of the computation.
TYPE: Collection[str]
DEFAULT: list()
time_threshold
Computations taking less time than this many seconds are not cached. A value of 0 means that it will always cache results.
TYPE: float
DEFAULT: 0.3
allow_repeated_evaluations
If True
, repeated calls to a function with the same arguments will be allowed and outputs averaged until the running standard deviation of the mean stabilizes below rtol_stderr * mean
.
TYPE: bool
DEFAULT: False
rtol_stderr
relative tolerance for repeated evaluations. More precisely, memcached() will stop evaluating the function once the standard deviation of the mean is smaller than rtol_stderr * mean
.
TYPE: float
DEFAULT: 0.1
min_repetitions
minimum number of times that a function evaluation on the same arguments is repeated before returning cached values. Useful for stochastic functions only. If the model training is very noisy, set this number to higher values to reduce variance.
TYPE: int
DEFAULT: 3
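A minimal construction sketch, assuming the cached callable has a nuisance job_id keyword argument that should not affect the cache key:
Example >>> from pydvl.utils.config import CachedFuncConfig\n>>> cache_config = CachedFuncConfig(ignore_args=['job_id'], time_threshold=0.5)\n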
"},{"location":"api/pydvl/utils/dataset/","title":"Dataset","text":""},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset","title":"pydvl.utils.dataset","text":"This module contains convenience classes to handle data and groups thereof.
Shapley and Least Core value computations require evaluation of a scoring function (the utility). This is typically the performance of the model on a test set (as an approximation to its true expected performance). It is therefore convenient to keep both the training data and the test data together to be passed around to methods in shapley and least_core. This is done with Dataset.
This abstraction layer also allows grouping data points together seamlessly if one is interested in computing their value as a group, see GroupedDataset.
Objects of both types are used to construct a Utility object.
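A minimal sketch of this workflow, assuming a scikit-learn model and the Utility class from pydvl.utils.utility:
Example >>> from sklearn.datasets import load_iris\n>>> from sklearn.linear_model import LogisticRegression\n>>> from pydvl.utils import Dataset\n>>> from pydvl.utils.utility import Utility\n>>> dataset = Dataset.from_sklearn(load_iris())\n>>> utility = Utility(LogisticRegression(), dataset)\n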
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.Dataset","title":"Dataset","text":"Dataset(\n x_train: Union[NDArray, DataFrame],\n y_train: Union[NDArray, DataFrame],\n x_test: Union[NDArray, DataFrame],\n y_test: Union[NDArray, DataFrame],\n feature_names: Optional[Sequence[str]] = None,\n target_names: Optional[Sequence[str]] = None,\n data_names: Optional[Sequence[str]] = None,\n description: Optional[str] = None,\n is_multi_output: bool = False,\n)\n
A convenience class to handle datasets.
It holds a dataset, split into training and test data, together with metadata such as feature names, data point names and a description.
PARAMETER DESCRIPTION x_train
training data
TYPE: Union[NDArray, DataFrame]
y_train
labels for training data
TYPE: Union[NDArray, DataFrame]
x_test
test data
TYPE: Union[NDArray, DataFrame]
y_test
labels for test data
TYPE: Union[NDArray, DataFrame]
feature_names
name of the features of input data
TYPE: Optional[Sequence[str]]
DEFAULT: None
target_names
names of the features of target data
TYPE: Optional[Sequence[str]]
DEFAULT: None
data_names
names assigned to data points. For example, if the dataset is a time series, each entry can be a timestamp which can be referenced directly instead of using a row number.
TYPE: Optional[Sequence[str]]
DEFAULT: None
description
A textual description of the dataset.
TYPE: Optional[str]
DEFAULT: None
is_multi_output
set to False
if labels are scalars, or to True
if they are vectors of dimension > 1.
TYPE: bool
DEFAULT: False
Source code in src/pydvl/utils/dataset.py
def __init__(\n self,\n x_train: Union[NDArray, pd.DataFrame],\n y_train: Union[NDArray, pd.DataFrame],\n x_test: Union[NDArray, pd.DataFrame],\n y_test: Union[NDArray, pd.DataFrame],\n feature_names: Optional[Sequence[str]] = None,\n target_names: Optional[Sequence[str]] = None,\n data_names: Optional[Sequence[str]] = None,\n description: Optional[str] = None,\n # FIXME: use same parameter name as in check_X_y()\n is_multi_output: bool = False,\n):\n \"\"\"Constructs a Dataset from data and labels.\n\n Args:\n x_train: training data\n y_train: labels for training data\n x_test: test data\n y_test: labels for test data\n feature_names: name of the features of input data\n target_names: names of the features of target data\n data_names: names assigned to data points.\n For example, if the dataset is a time series, each entry can be a\n timestamp which can be referenced directly instead of using a row\n number.\n description: A textual description of the dataset.\n is_multi_output: set to `False` if labels are scalars, or to\n `True` if they are vectors of dimension > 1.\n \"\"\"\n self.x_train, self.y_train = check_X_y(\n x_train, y_train, multi_output=is_multi_output\n )\n self.x_test, self.y_test = check_X_y(\n x_test, y_test, multi_output=is_multi_output\n )\n\n if x_train.shape[-1] != x_test.shape[-1]:\n raise ValueError(\n f\"Mismatching number of features: \"\n f\"{x_train.shape[-1]} and {x_test.shape[-1]}\"\n )\n if x_train.shape[0] != y_train.shape[0]:\n raise ValueError(\n f\"Mismatching number of samples: \"\n f\"{x_train.shape[-1]} and {x_test.shape[-1]}\"\n )\n if x_test.shape[0] != y_test.shape[0]:\n raise ValueError(\n f\"Mismatching number of samples: \"\n f\"{x_test.shape[-1]} and {y_test.shape[-1]}\"\n )\n\n def make_names(s: str, a: np.ndarray) -> List[str]:\n n = a.shape[1] if len(a.shape) > 1 else 1\n return [f\"{s}{i:0{1 + int(np.log10(n))}d}\" for i in range(1, n + 1)]\n\n self.feature_names = feature_names\n self.target_names = target_names\n\n if self.feature_names is None:\n if isinstance(x_train, pd.DataFrame):\n self.feature_names = x_train.columns.tolist()\n else:\n self.feature_names = make_names(\"x\", x_train)\n\n if self.target_names is None:\n if isinstance(y_train, pd.DataFrame):\n self.target_names = y_train.columns.tolist()\n else:\n self.target_names = make_names(\"y\", y_train)\n\n if len(self.x_train.shape) > 1:\n if (\n len(self.feature_names) != self.x_train.shape[-1]\n or len(self.feature_names) != self.x_test.shape[-1]\n ):\n raise ValueError(\"Mismatching number of features and names\")\n if len(self.y_train.shape) > 1:\n if (\n len(self.target_names) != self.y_train.shape[-1]\n or len(self.target_names) != self.y_test.shape[-1]\n ):\n raise ValueError(\"Mismatching number of targets and names\")\n\n self.description = description or \"No description\"\n self._indices = np.arange(len(self.x_train), dtype=np.int_)\n self._data_names = (\n np.array(data_names, dtype=object)\n if data_names is not None\n else self._indices.astype(object)\n )\n
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.Dataset.indices","title":"indices property
","text":"indices: NDArray[int_]\n
Index of positions in data.x_train.
Contiguous integers from 0 to len(Dataset) - 1.
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.Dataset.data_names","title":"data_names property
","text":"data_names: NDArray[object_]\n
Names of each individual datapoint.
Used for reporting Shapley values.
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.Dataset.dim","title":"dim property
","text":"dim: int\n
Returns the number of dimensions of a sample.
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.Dataset.get_training_data","title":"get_training_data","text":"get_training_data(\n indices: Optional[Iterable[int]] = None,\n) -> Tuple[NDArray, NDArray]\n
Given a set of indices, returns the training data that refer to those indices.
This is used mainly by Utility to retrieve subsets of the data from indices. It is typically not needed in algorithms.
PARAMETER DESCRIPTION indices
Optional indices that will be used to select points from the training data. If None
, the entire training data will be returned.
TYPE: Optional[Iterable[int]]
DEFAULT: None
RETURNS DESCRIPTION Tuple[NDArray, NDArray]
If indices
is not None
, the selected x and y arrays from the training data. Otherwise, the entire dataset.
Source code in src/pydvl/utils/dataset.py
def get_training_data(\n self, indices: Optional[Iterable[int]] = None\n) -> Tuple[NDArray, NDArray]:\n \"\"\"Given a set of indices, returns the training data that refer to those\n indices.\n\n This is used mainly by [Utility][pydvl.utils.utility.Utility] to retrieve\n subsets of the data from indices. It is typically **not needed in\n algorithms**.\n\n Args:\n indices: Optional indices that will be used to select points from\n the training data. If `None`, the entire training data will be\n returned.\n\n Returns:\n If `indices` is not `None`, the selected x and y arrays from the\n training data. Otherwise, the entire dataset.\n \"\"\"\n if indices is None:\n return self.x_train, self.y_train\n x = self.x_train[indices]\n y = self.y_train[indices]\n return x, y\n
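A minimal sketch, assuming dataset is a Dataset instance as constructed above:
Example >>> x_sub, y_sub = dataset.get_training_data(indices=[0, 1, 2])\n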
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.Dataset.get_test_data","title":"get_test_data","text":"get_test_data(\n indices: Optional[Iterable[int]] = None,\n) -> Tuple[NDArray, NDArray]\n
Returns the entire test set regardless of the passed indices.
The passed indices will not be used because for data valuation we generally want to score the trained model on the entire test data.
Additionally, given the way this method is used in the Utility class, the passed indices will be those of the training data and would not be valid for the test data.
There may be cases where it is desired to use parts of the test data. In those cases, it is recommended to inherit from Dataset and override get_test_data().
For example, the following snippet shows how one could go about mapping the training data indices into test data indices inside get_test_data():
Example >>> from pydvl.utils import Dataset\n>>> import numpy as np\n>>> class DatasetWithTestDataIndices(Dataset):\n... def get_test_data(self, indices=None):\n... if indices is None:\n... return self.x_test, self.y_test\n... fraction = len(list(indices)) / len(self)\n... mapped_indices = len(self.x_test) / len(self) * np.asarray(indices)\n... mapped_indices = np.unique(mapped_indices.astype(int))\n... return self.x_test[mapped_indices], self.y_test[mapped_indices]\n...\n>>> X = np.random.rand(100, 10)\n>>> y = np.random.randint(0, 2, 100)\n>>> dataset = DatasetWithTestDataIndices.from_arrays(X, y)\n>>> indices = np.random.choice(dataset.indices, 30, replace=False)\n>>> _ = dataset.get_training_data(indices)\n>>> _ = dataset.get_test_data(indices)\n
PARAMETER DESCRIPTION indices
Optional indices into the test data. This argument is unused and only kept for compatibility with get_training_data().
TYPE: Optional[Iterable[int]]
DEFAULT: None
RETURNS DESCRIPTION Tuple[NDArray, NDArray]
The entire test data.
Source code in src/pydvl/utils/dataset.py
def get_test_data(\n self, indices: Optional[Iterable[int]] = None\n) -> Tuple[NDArray, NDArray]:\n \"\"\"Returns the entire test set regardless of the passed indices.\n\n The passed indices will not be used because for data valuation\n we generally want to score the trained model on the entire test data.\n\n Additionally, the way this method is used in the\n [Utility][pydvl.utils.utility.Utility] class, the passed indices will\n be those of the training data and would not work on the test data.\n\n There may be cases where it is desired to use parts of the test data.\n In those cases, it is recommended to inherit from\n [Dataset][pydvl.utils.dataset.Dataset] and override\n [get_test_data()][pydvl.utils.dataset.Dataset.get_test_data].\n\n For example, the following snippet shows how one could go about\n mapping the training data indices into test data indices\n inside [get_test_data()][pydvl.utils.dataset.Dataset.get_test_data]:\n\n ??? Example\n ```pycon\n >>> from pydvl.utils import Dataset\n >>> import numpy as np\n >>> class DatasetWithTestDataIndices(Dataset):\n ... def get_test_data(self, indices=None):\n ... if indices is None:\n ... return self.x_test, self.y_test\n ... fraction = len(list(indices)) / len(self)\n ... mapped_indices = len(self.x_test) / len(self) * np.asarray(indices)\n ... mapped_indices = np.unique(mapped_indices.astype(int))\n ... return self.x_test[mapped_indices], self.y_test[mapped_indices]\n ...\n >>> X = np.random.rand(100, 10)\n >>> y = np.random.randint(0, 2, 100)\n >>> dataset = DatasetWithTestDataIndices.from_arrays(X, y)\n >>> indices = np.random.choice(dataset.indices, 30, replace=False)\n >>> _ = dataset.get_training_data(indices)\n >>> _ = dataset.get_test_data(indices)\n ```\n\n Args:\n indices: Optional indices into the test data. This argument is\n unused left for compatibility with\n [get_training_data()][pydvl.utils.dataset.Dataset.get_training_data].\n\n Returns:\n The entire test data.\n \"\"\"\n return self.x_test, self.y_test\n
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.Dataset.from_sklearn","title":"from_sklearn classmethod
","text":"from_sklearn(\n data: Bunch,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n **kwargs\n) -> Dataset\n
Constructs a Dataset object from a sklearn.utils.Bunch, as returned by the load_*
functions in scikit-learn toy datasets.
Example >>> from pydvl.utils import Dataset\n>>> from sklearn.datasets import load_boston\n>>> dataset = Dataset.from_sklearn(load_boston())\n
PARAMETER DESCRIPTION data
scikit-learn Bunch object. The following attributes are supported:
data
: covariates. target
: target variables (labels). feature_names
(optional): the feature names. target_names
(optional): the target names. DESCR
(optional): a description.
TYPE: Bunch
train_size
size of the training dataset. Used in train_test_split
TYPE: float
DEFAULT: 0.8
random_state
seed for train / test split
TYPE: Optional[int]
DEFAULT: None
stratify_by_target
If True
, data is split in a stratified fashion, using the target variable as labels. Read more in scikit-learn's user guide.
TYPE: bool
DEFAULT: False
kwargs
Additional keyword arguments to pass to the Dataset constructor. Use this to pass e.g. is_multi_output
.
DEFAULT: {}
RETURNS DESCRIPTION Dataset
Object with the sklearn dataset
Changed in version 0.6.0
Added kwargs to pass to the Dataset constructor.
Source code in src/pydvl/utils/dataset.py
@classmethod\ndef from_sklearn(\n cls,\n data: Bunch,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n **kwargs,\n) -> \"Dataset\":\n \"\"\"Constructs a [Dataset][pydvl.utils.Dataset] object from a\n [sklearn.utils.Bunch][], as returned by the `load_*`\n functions in [scikit-learn toy datasets](https://scikit-learn.org/stable/datasets/toy_dataset.html).\n\n ??? Example\n ```pycon\n >>> from pydvl.utils import Dataset\n >>> from sklearn.datasets import load_boston\n >>> dataset = Dataset.from_sklearn(load_boston())\n ```\n\n Args:\n data: scikit-learn Bunch object. The following attributes are supported:\n\n - `data`: covariates.\n - `target`: target variables (labels).\n - `feature_names` (**optional**): the feature names.\n - `target_names` (**optional**): the target names.\n - `DESCR` (**optional**): a description.\n train_size: size of the training dataset. Used in `train_test_split`\n random_state: seed for train / test split\n stratify_by_target: If `True`, data is split in a stratified\n fashion, using the target variable as labels. Read more in\n [scikit-learn's user guide](https://scikit-learn.org/stable/modules/cross_validation.html#stratification).\n kwargs: Additional keyword arguments to pass to the\n [Dataset][pydvl.utils.Dataset] constructor. Use this to pass e.g. `is_multi_output`.\n\n Returns:\n Object with the sklearn dataset\n\n !!! tip \"Changed in version 0.6.0\"\n Added kwargs to pass to the [Dataset][pydvl.utils.Dataset] constructor.\n \"\"\"\n x_train, x_test, y_train, y_test = train_test_split(\n data.data,\n data.target,\n train_size=train_size,\n random_state=random_state,\n stratify=data.target if stratify_by_target else None,\n )\n return cls(\n x_train,\n y_train,\n x_test,\n y_test,\n feature_names=data.get(\"feature_names\"),\n target_names=data.get(\"target_names\"),\n description=data.get(\"DESCR\"),\n **kwargs,\n )\n
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.Dataset.from_arrays","title":"from_arrays classmethod
","text":"from_arrays(\n X: NDArray,\n y: NDArray,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n **kwargs\n) -> Dataset\n
Constructs a Dataset object from X and y numpy arrays as returned by the make_*
functions in sklearn generated datasets.
Example >>> from pydvl.utils import Dataset\n>>> from sklearn.datasets import make_regression\n>>> X, y = make_regression()\n>>> dataset = Dataset.from_arrays(X, y)\n
PARAMETER DESCRIPTION X
numpy array of shape (n_samples, n_features)
TYPE: NDArray
y
numpy array of shape (n_samples,)
TYPE: NDArray
train_size
size of the training dataset. Used in train_test_split
TYPE: float
DEFAULT: 0.8
random_state
seed for train / test split
TYPE: Optional[int]
DEFAULT: None
stratify_by_target
If True
, data is split in a stratified fashion, using the y variable as labels. Read more in sklearn's user guide.
TYPE: bool
DEFAULT: False
kwargs
Additional keyword arguments to pass to the Dataset constructor. Use this to pass e.g. feature_names
or target_names
.
DEFAULT: {}
RETURNS DESCRIPTION Dataset
Object with the passed X and y arrays split across training and test sets.
New in version 0.4.0
Changed in version 0.6.0
Added kwargs to pass to the Dataset constructor.
Source code in src/pydvl/utils/dataset.py
@classmethod\ndef from_arrays(\n cls,\n X: NDArray,\n y: NDArray,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n **kwargs,\n) -> \"Dataset\":\n \"\"\"Constructs a [Dataset][pydvl.utils.Dataset] object from X and y numpy arrays as\n returned by the `make_*` functions in [sklearn generated datasets](https://scikit-learn.org/stable/datasets/sample_generators.html).\n\n ??? Example\n ```pycon\n >>> from pydvl.utils import Dataset\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression()\n >>> dataset = Dataset.from_arrays(X, y)\n ```\n\n Args:\n X: numpy array of shape (n_samples, n_features)\n y: numpy array of shape (n_samples,)\n train_size: size of the training dataset. Used in `train_test_split`\n random_state: seed for train / test split\n stratify_by_target: If `True`, data is split in a stratified fashion,\n using the y variable as labels. Read more in [sklearn's user\n guide](https://scikit-learn.org/stable/modules/cross_validation.html#stratification).\n kwargs: Additional keyword arguments to pass to the\n [Dataset][pydvl.utils.Dataset] constructor. Use this to pass e.g. `feature_names`\n or `target_names`.\n\n Returns:\n Object with the passed X and y arrays split across training and test sets.\n\n !!! tip \"New in version 0.4.0\"\n\n !!! tip \"Changed in version 0.6.0\"\n Added kwargs to pass to the [Dataset][pydvl.utils.Dataset] constructor.\n \"\"\"\n x_train, x_test, y_train, y_test = train_test_split(\n X,\n y,\n train_size=train_size,\n random_state=random_state,\n stratify=y if stratify_by_target else None,\n )\n return cls(x_train, y_train, x_test, y_test, **kwargs)\n
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset","title":"GroupedDataset","text":"GroupedDataset(\n x_train: NDArray,\n y_train: NDArray,\n x_test: NDArray,\n y_test: NDArray,\n data_groups: Sequence,\n feature_names: Optional[Sequence[str]] = None,\n target_names: Optional[Sequence[str]] = None,\n group_names: Optional[Sequence[str]] = None,\n description: Optional[str] = None,\n **kwargs\n)\n
Bases: Dataset
Used for calculating Shapley values of subsets of the data considered as logical units. For instance, one can group by value of a categorical feature, by bin into which a continuous feature falls, or by label.
PARAMETER DESCRIPTION x_train
training data
TYPE: NDArray
y_train
labels of training data
TYPE: NDArray
x_test
test data
TYPE: NDArray
y_test
labels of test data
TYPE: NDArray
data_groups
Iterable of the same length as x_train
containing a group label for each training data point. The label can be of any type, e.g. str
or int
. Data points with the same label will then be grouped by this object and considered as one for effects of valuation.
TYPE: Sequence
feature_names
names of the covariates' features.
TYPE: Optional[Sequence[str]]
DEFAULT: None
target_names
names of the labels or targets y
TYPE: Optional[Sequence[str]]
DEFAULT: None
group_names
names of the groups. If not provided, the labels from data_groups
will be used.
TYPE: Optional[Sequence[str]]
DEFAULT: None
description
A textual description of the dataset
TYPE: Optional[str]
DEFAULT: None
kwargs
Additional keyword arguments to pass to the Dataset constructor.
DEFAULT: {}
Changed in version 0.6.0
Added group_names
and forwarding of kwargs
Source code in src/pydvl/utils/dataset.py
def __init__(\n self,\n x_train: NDArray,\n y_train: NDArray,\n x_test: NDArray,\n y_test: NDArray,\n data_groups: Sequence,\n feature_names: Optional[Sequence[str]] = None,\n target_names: Optional[Sequence[str]] = None,\n group_names: Optional[Sequence[str]] = None,\n description: Optional[str] = None,\n **kwargs,\n):\n \"\"\"Class for grouping datasets.\n\n Used for calculating Shapley values of subsets of the data considered\n as logical units. For instance, one can group by value of a categorical\n feature, by bin into which a continuous feature falls, or by label.\n\n Args:\n x_train: training data\n y_train: labels of training data\n x_test: test data\n y_test: labels of test data\n data_groups: Iterable of the same length as `x_train` containing\n a group label for each training data point. The label can be of any\n type, e.g. `str` or `int`. Data points with the same label will\n then be grouped by this object and considered as one for effects of\n valuation.\n feature_names: names of the covariates' features.\n target_names: names of the labels or targets y\n group_names: names of the groups. If not provided, the labels\n from `data_groups` will be used.\n description: A textual description of the dataset\n kwargs: Additional keyword arguments to pass to the\n [Dataset][pydvl.utils.Dataset] constructor.\n\n !!! tip \"Changed in version 0.6.0\"\n Added `group_names` and forwarding of `kwargs`\n \"\"\"\n super().__init__(\n x_train=x_train,\n y_train=y_train,\n x_test=x_test,\n y_test=y_test,\n feature_names=feature_names,\n target_names=target_names,\n description=description,\n **kwargs,\n )\n\n if len(data_groups) != len(x_train):\n raise ValueError(\n f\"data_groups and x_train must have the same length.\"\n f\"Instead got {len(data_groups)=} and {len(x_train)=}\"\n )\n\n self.groups: OrderedDict[Any, List[int]] = OrderedDict(\n {k: [] for k in set(data_groups)}\n )\n for idx, group in enumerate(data_groups):\n self.groups[group].append(idx)\n self.group_items = list(self.groups.items())\n self._indices = np.arange(len(self.groups.keys()))\n self._data_names = (\n np.array(group_names, dtype=object)\n if group_names is not None\n else np.array(list(self.groups.keys()), dtype=object)\n )\n
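A minimal sketch of grouping by label, using from_dataset (documented below) on an already split Dataset:
Example >>> import numpy as np\n>>> from pydvl.utils import Dataset, GroupedDataset\n>>> X, y = np.arange(16.0).reshape(8, 2), np.array([0, 0, 0, 0, 1, 1, 1, 1])\n>>> dataset = Dataset.from_arrays(X, y, train_size=0.5, stratify_by_target=True)\n>>> grouped = GroupedDataset.from_dataset(dataset, data_groups=dataset.y_train)\n>>> len(grouped.indices)\n2\n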
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset.dim","title":"dim property
","text":"dim: int\n
Returns the number of dimensions of a sample.
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset.indices","title":"indices property
","text":"indices\n
Indices of the groups.
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset.data_names","title":"data_names property
","text":"data_names\n
Names of the groups.
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset.get_test_data","title":"get_test_data","text":"get_test_data(\n indices: Optional[Iterable[int]] = None,\n) -> Tuple[NDArray, NDArray]\n
Returns the entire test set regardless of the passed indices.
The passed indices will not be used because for data valuation we generally want to score the trained model on the entire test data.
Additionally, given the way this method is used in the Utility class, the passed indices will be those of the training data and would not be valid for the test data.
There may be cases where it is desired to use parts of the test data. In those cases, it is recommended to inherit from Dataset and override get_test_data().
For example, the following snippet shows how one could go about mapping the training data indices into test data indices inside get_test_data():
Example >>> from pydvl.utils import Dataset\n>>> import numpy as np\n>>> class DatasetWithTestDataIndices(Dataset):\n... def get_test_data(self, indices=None):\n... if indices is None:\n... return self.x_test, self.y_test\n... fraction = len(list(indices)) / len(self)\n... mapped_indices = len(self.x_test) / len(self) * np.asarray(indices)\n... mapped_indices = np.unique(mapped_indices.astype(int))\n... return self.x_test[mapped_indices], self.y_test[mapped_indices]\n...\n>>> X = np.random.rand(100, 10)\n>>> y = np.random.randint(0, 2, 100)\n>>> dataset = DatasetWithTestDataIndices.from_arrays(X, y)\n>>> indices = np.random.choice(dataset.indices, 30, replace=False)\n>>> _ = dataset.get_training_data(indices)\n>>> _ = dataset.get_test_data(indices)\n
PARAMETER DESCRIPTION indices
Optional indices into the test data. This argument is unused and only kept for compatibility with get_training_data().
TYPE: Optional[Iterable[int]]
DEFAULT: None
RETURNS DESCRIPTION Tuple[NDArray, NDArray]
The entire test data.
Source code in src/pydvl/utils/dataset.py
def get_test_data(\n self, indices: Optional[Iterable[int]] = None\n) -> Tuple[NDArray, NDArray]:\n \"\"\"Returns the entire test set regardless of the passed indices.\n\n The passed indices will not be used because for data valuation\n we generally want to score the trained model on the entire test data.\n\n Additionally, the way this method is used in the\n [Utility][pydvl.utils.utility.Utility] class, the passed indices will\n be those of the training data and would not work on the test data.\n\n There may be cases where it is desired to use parts of the test data.\n In those cases, it is recommended to inherit from\n [Dataset][pydvl.utils.dataset.Dataset] and override\n [get_test_data()][pydvl.utils.dataset.Dataset.get_test_data].\n\n For example, the following snippet shows how one could go about\n mapping the training data indices into test data indices\n inside [get_test_data()][pydvl.utils.dataset.Dataset.get_test_data]:\n\n ??? Example\n ```pycon\n >>> from pydvl.utils import Dataset\n >>> import numpy as np\n >>> class DatasetWithTestDataIndices(Dataset):\n ... def get_test_data(self, indices=None):\n ... if indices is None:\n ... return self.x_test, self.y_test\n ... fraction = len(list(indices)) / len(self)\n ... mapped_indices = len(self.x_test) / len(self) * np.asarray(indices)\n ... mapped_indices = np.unique(mapped_indices.astype(int))\n ... return self.x_test[mapped_indices], self.y_test[mapped_indices]\n ...\n >>> X = np.random.rand(100, 10)\n >>> y = np.random.randint(0, 2, 100)\n >>> dataset = DatasetWithTestDataIndices.from_arrays(X, y)\n >>> indices = np.random.choice(dataset.indices, 30, replace=False)\n >>> _ = dataset.get_training_data(indices)\n >>> _ = dataset.get_test_data(indices)\n ```\n\n Args:\n indices: Optional indices into the test data. This argument is\n unused left for compatibility with\n [get_training_data()][pydvl.utils.dataset.Dataset.get_training_data].\n\n Returns:\n The entire test data.\n \"\"\"\n return self.x_test, self.y_test\n
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset.get_training_data","title":"get_training_data","text":"get_training_data(\n indices: Optional[Iterable[int]] = None,\n) -> Tuple[NDArray, NDArray]\n
Returns the data and labels of all samples in the given groups.
PARAMETER DESCRIPTION indices
Indices of the groups whose elements should be returned. If None
, all data from all groups are returned.
TYPE: Optional[Iterable[int]]
DEFAULT: None
RETURNS DESCRIPTION Tuple[NDArray, NDArray]
Tuple of training data x and labels y.
Source code in src/pydvl/utils/dataset.py
def get_training_data(\n self, indices: Optional[Iterable[int]] = None\n) -> Tuple[NDArray, NDArray]:\n \"\"\"Returns the data and labels of all samples in the given groups.\n\n Args:\n indices: group indices whose elements to return. If `None`,\n all data from all groups are returned.\n\n Returns:\n Tuple of training data x and labels y.\n \"\"\"\n if indices is None:\n indices = self.indices\n data_indices = [\n idx for group_id in indices for idx in self.group_items[group_id][1]\n ]\n return super().get_training_data(data_indices)\n
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset.from_sklearn","title":"from_sklearn classmethod
","text":"from_sklearn(\n data: Bunch,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n data_groups: Optional[Sequence] = None,\n **kwargs\n) -> GroupedDataset\n
Constructs a GroupedDataset object from a sklearn.utils.Bunch as returned by the load_*
functions in scikit-learn toy datasets and groups it.
Example >>> from sklearn.datasets import load_iris\n>>> from pydvl.utils import GroupedDataset\n>>> iris = load_iris()\n>>> data_groups = iris.data[:, 0] // 0.5\n>>> dataset = GroupedDataset.from_sklearn(iris, data_groups=data_groups)\n
PARAMETER DESCRIPTION data
scikit-learn Bunch object. The following attributes are supported:
data
: covariates. target
: target variables (labels). feature_names
(optional): the feature names. target_names
(optional): the target names. DESCR
(optional): a description.
TYPE: Bunch
train_size
size of the training dataset. Used in train_test_split
.
TYPE: float
DEFAULT: 0.8
random_state
seed for train / test split.
TYPE: Optional[int]
DEFAULT: None
stratify_by_target
If True
, data is split in a stratified fashion, using the target variable as labels. Read more in sklearn's user guide.
TYPE: bool
DEFAULT: False
data_groups
an array holding the group index or name for each data point. The length of this array must be equal to the number of data points in the dataset.
TYPE: Optional[Sequence]
DEFAULT: None
kwargs
Additional keyword arguments to pass to the Dataset constructor.
DEFAULT: {}
RETURNS DESCRIPTION GroupedDataset
Dataset with the selected sklearn data
Source code in src/pydvl/utils/dataset.py
@classmethod\ndef from_sklearn(\n cls,\n data: Bunch,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n data_groups: Optional[Sequence] = None,\n **kwargs,\n) -> \"GroupedDataset\":\n \"\"\"Constructs a [GroupedDataset][pydvl.utils.GroupedDataset] object from a\n [sklearn.utils.Bunch][sklearn.utils.Bunch] as returned by the `load_*` functions in\n [scikit-learn toy datasets](https://scikit-learn.org/stable/datasets/toy_dataset.html) and groups\n it.\n\n ??? Example\n ```pycon\n >>> from sklearn.datasets import load_iris\n >>> from pydvl.utils import GroupedDataset\n >>> iris = load_iris()\n >>> data_groups = iris.data[:, 0] // 0.5\n >>> dataset = GroupedDataset.from_sklearn(iris, data_groups=data_groups)\n ```\n\n Args:\n data: scikit-learn Bunch object. The following attributes are supported:\n\n - `data`: covariates.\n - `target`: target variables (labels).\n - `feature_names` (**optional**): the feature names.\n - `target_names` (**optional**): the target names.\n - `DESCR` (**optional**): a description.\n train_size: size of the training dataset. Used in `train_test_split`.\n random_state: seed for train / test split.\n stratify_by_target: If `True`, data is split in a stratified\n fashion, using the target variable as labels. Read more in\n [sklearn's user guide](https://scikit-learn.org/stable/modules/cross_validation.html#stratification).\n data_groups: an array holding the group index or name for each\n data point. The length of this array must be equal to the number of\n data points in the dataset.\n kwargs: Additional keyword arguments to pass to the\n [Dataset][pydvl.utils.Dataset] constructor.\n\n Returns:\n Dataset with the selected sklearn data\n \"\"\"\n if data_groups is None:\n raise ValueError(\n \"data_groups must be provided when constructing a GroupedDataset\"\n )\n\n x_train, x_test, y_train, y_test, data_groups_train, _ = train_test_split(\n data.data,\n data.target,\n data_groups,\n train_size=train_size,\n random_state=random_state,\n stratify=data.target if stratify_by_target else None,\n )\n\n dataset = Dataset(\n x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test, **kwargs\n )\n return cls.from_dataset(dataset, data_groups_train) # type: ignore\n
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset.from_arrays","title":"from_arrays classmethod
","text":"from_arrays(\n X: NDArray,\n y: NDArray,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n data_groups: Optional[Sequence] = None,\n **kwargs\n) -> Dataset\n
Constructs a GroupedDataset object from X and y numpy arrays as returned by the make_*
functions in scikit-learn generated datasets.
Example >>> from sklearn.datasets import make_classification\n>>> from pydvl.utils import GroupedDataset\n>>> X, y = make_classification(\n... n_samples=100,\n... n_features=4,\n... n_informative=2,\n... n_redundant=0,\n... random_state=0,\n... shuffle=False\n... )\n>>> data_groups = X[:, 0] // 0.5\n>>> dataset = GroupedDataset.from_arrays(X, y, data_groups=data_groups)\n
PARAMETER DESCRIPTION X
array of shape (n_samples, n_features)
TYPE: NDArray
y
array of shape (n_samples,)
TYPE: NDArray
train_size
size of the training dataset. Used in train_test_split
.
TYPE: float
DEFAULT: 0.8
random_state
seed for train / test split.
TYPE: Optional[int]
DEFAULT: None
stratify_by_target
If True
, data is split in a stratified fashion, using the y variable as labels. Read more in sklearn's user guide.
TYPE: bool
DEFAULT: False
data_groups
an array holding the group index or name for each data point. The length of this array must be equal to the number of data points in the dataset.
TYPE: Optional[Sequence]
DEFAULT: None
kwargs
Additional keyword arguments that will be passed to the Dataset constructor.
DEFAULT: {}
RETURNS DESCRIPTION Dataset
Dataset with the passed X and y arrays split across training and test sets.
New in version 0.4.0
Changed in version 0.6.0
Added kwargs to pass to the Dataset constructor.
Source code in src/pydvl/utils/dataset.py
@classmethod\ndef from_arrays(\n cls,\n X: NDArray,\n y: NDArray,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n data_groups: Optional[Sequence] = None,\n **kwargs,\n) -> \"Dataset\":\n \"\"\"Constructs a [GroupedDataset][pydvl.utils.GroupedDataset] object from X and y numpy arrays\n as returned by the `make_*` functions in\n [scikit-learn generated datasets](https://scikit-learn.org/stable/datasets/sample_generators.html).\n\n ??? Example\n ```pycon\n >>> from sklearn.datasets import make_classification\n >>> from pydvl.utils import GroupedDataset\n >>> X, y = make_classification(\n ... n_samples=100,\n ... n_features=4,\n ... n_informative=2,\n ... n_redundant=0,\n ... random_state=0,\n ... shuffle=False\n ... )\n >>> data_groups = X[:, 0] // 0.5\n >>> dataset = GroupedDataset.from_arrays(X, y, data_groups=data_groups)\n ```\n\n Args:\n X: array of shape (n_samples, n_features)\n y: array of shape (n_samples,)\n train_size: size of the training dataset. Used in `train_test_split`.\n random_state: seed for train / test split.\n stratify_by_target: If `True`, data is split in a stratified\n fashion, using the y variable as labels. Read more in\n [sklearn's user guide](https://scikit-learn.org/stable/modules/cross_validation.html#stratification).\n data_groups: an array holding the group index or name for each data\n point. The length of this array must be equal to the number of\n data points in the dataset.\n kwargs: Additional keyword arguments that will be passed to the\n [Dataset][pydvl.utils.Dataset] constructor.\n\n Returns:\n Dataset with the passed X and y arrays split across training and\n test sets.\n\n !!! tip \"New in version 0.4.0\"\n\n !!! tip \"Changed in version 0.6.0\"\n Added kwargs to pass to the [Dataset][pydvl.utils.Dataset] constructor.\n \"\"\"\n if data_groups is None:\n raise ValueError(\n \"data_groups must be provided when constructing a GroupedDataset\"\n )\n x_train, x_test, y_train, y_test, data_groups_train, _ = train_test_split(\n X,\n y,\n data_groups,\n train_size=train_size,\n random_state=random_state,\n stratify=y if stratify_by_target else None,\n )\n dataset = Dataset(\n x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test, **kwargs\n )\n return cls.from_dataset(dataset, data_groups_train)\n
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset.from_dataset","title":"from_dataset classmethod
","text":"from_dataset(dataset: Dataset, data_groups: Sequence[Any]) -> GroupedDataset\n
Creates a GroupedDataset object from the data of a Dataset object and a mapping of data groups.
Example >>> import numpy as np\n>>> from pydvl.utils import Dataset, GroupedDataset\n>>> dataset = Dataset.from_arrays(\n... X=np.asarray([[1, 2], [3, 4], [5, 6], [7, 8]]),\n... y=np.asarray([0, 1, 0, 1]),\n... )\n>>> dataset = GroupedDataset.from_dataset(dataset, data_groups=[0, 0, 1, 1])\n
PARAMETER DESCRIPTION dataset
The original data.
TYPE: Dataset
data_groups
An array holding the group index or name for each data point. The length of this array must be equal to the number of data points in the dataset.
TYPE: Sequence[Any]
RETURNS DESCRIPTION GroupedDataset
A GroupedDataset with the initial Dataset grouped by data_groups.
Source code in src/pydvl/utils/dataset.py
@classmethod\ndef from_dataset(\n cls, dataset: Dataset, data_groups: Sequence[Any]\n) -> \"GroupedDataset\":\n \"\"\"Creates a [GroupedDataset][pydvl.utils.GroupedDataset] object from the data a\n [Dataset][pydvl.utils.Dataset] object and a mapping of data groups.\n\n ??? Example\n ```pycon\n >>> import numpy as np\n >>> from pydvl.utils import Dataset, GroupedDataset\n >>> dataset = Dataset.from_arrays(\n ... X=np.asarray([[1, 2], [3, 4], [5, 6], [7, 8]]),\n ... y=np.asarray([0, 1, 0, 1]),\n ... )\n >>> dataset = GroupedDataset.from_dataset(dataset, data_groups=[0, 0, 1, 1])\n ```\n\n Args:\n dataset: The original data.\n data_groups: An array holding the group index or name for each data\n point. The length of this array must be equal to the number of\n data points in the dataset.\n\n Returns:\n A [GroupedDataset][pydvl.utils.GroupedDataset] with the initial\n [Dataset][pydvl.utils.Dataset] grouped by data_groups.\n \"\"\"\n return cls(\n x_train=dataset.x_train,\n y_train=dataset.y_train,\n x_test=dataset.x_test,\n y_test=dataset.y_test,\n data_groups=data_groups,\n feature_names=dataset.feature_names,\n target_names=dataset.target_names,\n description=dataset.description,\n )\n
"},{"location":"api/pydvl/utils/exceptions/","title":"Exceptions","text":""},{"location":"api/pydvl/utils/exceptions/#pydvl.utils.exceptions","title":"pydvl.utils.exceptions","text":""},{"location":"api/pydvl/utils/exceptions/#pydvl.utils.exceptions.catch_and_raise_exception","title":"catch_and_raise_exception","text":"catch_and_raise_exception(\n catch_exception_type: Type[CatchExceptionType],\n raise_exception_factory: Callable[[CatchExceptionType], RaiseExceptionType],\n) -> Callable\n
A decorator that catches exceptions of a specified exception type and raises another specified exception.
PARAMETER DESCRIPTION catch_exception_type
The type of the exception to catch.
TYPE: Type[CatchExceptionType]
raise_exception_factory
A factory function that creates a new exception.
TYPE: Callable[[CatchExceptionType], RaiseExceptionType]
RETURNS DESCRIPTION Callable
A decorator function that wraps the target function.
Example @catch_and_raise_exception(RuntimeError, lambda e: TorchLinalgEighException(e))\ndef safe_torch_linalg_eigh(*args, **kwargs):\n '''\n A wrapper around `torch.linalg.eigh` that safely handles potential runtime errors\n by raising a custom `TorchLinalgEighException` with more context,\n especially related to the issues reported in\n https://github.com/pytorch/pytorch/issues/92141.\n\n Args:\n *args: Positional arguments passed to `torch.linalg.eigh`.\n **kwargs: Keyword arguments passed to `torch.linalg.eigh`.\n\n Returns:\n The result of calling `torch.linalg.eigh` with the provided arguments.\n\n Raises:\n TorchLinalgEighException: If a `RuntimeError` occurs during the execution of\n `torch.linalg.eigh`.\n '''\n return torch.linalg.eigh(*args, **kwargs)\n
Source code in src/pydvl/utils/exceptions.py
def catch_and_raise_exception(\n catch_exception_type: Type[CatchExceptionType],\n raise_exception_factory: Callable[[CatchExceptionType], RaiseExceptionType],\n) -> Callable:\n \"\"\"\n A decorator that catches exceptions of a specified exception type and raises\n another specified exception.\n\n Args:\n catch_exception_type: The type of the exception to catch.\n raise_exception_factory: A factory function that creates a new exception.\n\n Returns:\n A decorator function that wraps the target function.\n\n ??? Example\n\n ```python\n @catch_and_raise_exception(RuntimeError, lambda e: TorchLinalgEighException(e))\n def safe_torch_linalg_eigh(*args, **kwargs):\n '''\n A wrapper around `torch.linalg.eigh` that safely handles potential runtime errors\n by raising a custom `TorchLinalgEighException` with more context,\n especially related to the issues reported in\n https://github.com/pytorch/pytorch/issues/92141.\n\n Args:\n *args: Positional arguments passed to `torch.linalg.eigh`.\n **kwargs: Keyword arguments passed to `torch.linalg.eigh`.\n\n Returns:\n The result of calling `torch.linalg.eigh` with the provided arguments.\n\n Raises:\n TorchLinalgEighException: If a `RuntimeError` occurs during the execution of\n `torch.linalg.eigh`.\n '''\n return torch.linalg.eigh(*args, **kwargs)\n ```\n \"\"\"\n\n def decorator(func):\n @wraps(func)\n def wrapper(*args, **kwargs):\n try:\n return func(*args, **kwargs)\n except catch_exception_type as e:\n raise raise_exception_factory(e) from e\n\n return wrapper\n\n return decorator\n
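A self-contained sketch with a toy exception mapping (not the library's torch wrapper shown above):
Example >>> from pydvl.utils.exceptions import catch_and_raise_exception\n>>> @catch_and_raise_exception(KeyError, lambda e: ValueError(f'missing key: {e}'))\n... def lookup(d, key):\n...     return d[key]\n>>> lookup({'a': 1}, 'a')\n1\n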
"},{"location":"api/pydvl/utils/functional/","title":"Functional","text":""},{"location":"api/pydvl/utils/functional/#pydvl.utils.functional","title":"pydvl.utils.functional","text":"Supporting utilities for manipulating arguments of functions.
"},{"location":"api/pydvl/utils/functional/#pydvl.utils.functional.free_arguments","title":"free_arguments","text":"free_arguments(fun: Union[Callable, partial]) -> Set[str]\n
Computes the set of free arguments for a function or functools.partial object.
All arguments of a function are considered free unless they are set by a partial. For example, if f = partial(g, a=1)
, then a
is not a free argument of f
.
PARAMETER DESCRIPTION fun
A callable or a functools.partial object.
TYPE: Union[Callable, partial]
RETURNS DESCRIPTION Set[str]
The set of free arguments of fun
.
New in version 0.7.0
Source code in src/pydvl/utils/functional.py
def free_arguments(fun: Union[Callable, partial]) -> Set[str]:\n \"\"\"Computes the set of free arguments for a function or\n [functools.partial][] object.\n\n All arguments of a function are considered free unless they are set by a\n partial. For example, if `f = partial(g, a=1)`, then `a` is not a free\n argument of `f`.\n\n Args:\n fun: A callable or a [partial object][].\n\n Returns:\n The set of free arguments of `fun`.\n\n !!! tip \"New in version 0.7.0\"\n \"\"\"\n args_set_by_partial: Set[str] = set()\n\n def _rec_unroll_partial_function_args(g: Union[Callable, partial]) -> Callable:\n \"\"\"Stores arguments and recursively call itself if `g` is a\n [functools.partial][] object. In the end, returns the initially wrapped\n function.\n\n This handles the construct `partial(_accept_additional_argument, *args,\n **kwargs)` that is used by `maybe_add_argument`.\n\n Args:\n g: A partial or a function to unroll.\n\n Returns:\n Initial wrapped function.\n \"\"\"\n nonlocal args_set_by_partial\n\n if isinstance(g, partial) and g.func == _accept_additional_argument:\n arg = g.keywords[\"arg\"]\n if arg in args_set_by_partial:\n args_set_by_partial.remove(arg)\n return _rec_unroll_partial_function_args(g.keywords[\"fun\"])\n elif isinstance(g, partial):\n args_set_by_partial.update(g.keywords.keys())\n args_set_by_partial.update(g.args)\n return _rec_unroll_partial_function_args(g.func)\n else:\n return g\n\n wrapped_fn = _rec_unroll_partial_function_args(fun)\n sig = inspect.signature(wrapped_fn)\n return args_set_by_partial | set(sig.parameters.keys())\n
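A minimal sketch checking that an argument not fixed by the partial is reported as free:
Example >>> from functools import partial\n>>> from pydvl.utils.functional import free_arguments\n>>> def g(a, b, c):\n...     return a + b + c\n>>> 'b' in free_arguments(partial(g, a=1))\nTrue\n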
"},{"location":"api/pydvl/utils/functional/#pydvl.utils.functional.maybe_add_argument","title":"maybe_add_argument","text":"maybe_add_argument(fun: Callable, new_arg: str) -> Callable\n
Wraps a function to accept the given keyword parameter if it doesn't already.
If fun
already takes a keyword parameter of name new_arg
, then it is returned as is. Otherwise, a wrapper is returned which merely ignores the argument.
PARAMETER DESCRIPTION fun
The function to wrap
TYPE: Callable
new_arg
The name of the argument that the new function will accept (and ignore).
TYPE: str
RETURNS DESCRIPTION Callable
A new function accepting one more keyword argument.
Changed in version 0.7.0
Ability to work with partials.
Source code in src/pydvl/utils/functional.py
def maybe_add_argument(fun: Callable, new_arg: str) -> Callable:\n \"\"\"Wraps a function to accept the given keyword parameter if it doesn't\n already.\n\n If `fun` already takes a keyword parameter of name `new_arg`, then it is\n returned as is. Otherwise, a wrapper is returned which merely ignores the\n argument.\n\n Args:\n fun: The function to wrap\n new_arg: The name of the argument that the new function will accept\n (and ignore).\n\n Returns:\n A new function accepting one more keyword argument.\n\n !!! tip \"Changed in version 0.7.0\"\n Ability to work with partials.\n \"\"\"\n if new_arg in free_arguments(fun):\n return fun\n\n return partial(_accept_additional_argument, fun=fun, arg=new_arg)\n
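A minimal sketch, assuming the returned wrapper simply ignores the extra keyword as described above:
Example >>> from pydvl.utils.functional import maybe_add_argument\n>>> def accuracy(model):\n...     return 0.9\n>>> wrapped = maybe_add_argument(accuracy, 'seed')\n>>> wrapped('my-model', seed=42)\n0.9\n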
"},{"location":"api/pydvl/utils/numeric/","title":"Numeric","text":""},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric","title":"pydvl.utils.numeric","text":"This module contains routines for numerical computations used across the library.
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.powerset","title":"powerset","text":"powerset(s: NDArray[T]) -> Iterator[Collection[T]]\n
Returns an iterator for the power set of the argument.
Subsets are generated in sequence by growing size. See random_powerset() for random sampling.
Example >>> import numpy as np\n>>> from pydvl.utils.numeric import powerset\n>>> list(powerset(np.array((1,2))))\n[(), (1,), (2,), (1, 2)]\n
PARAMETER DESCRIPTION s
The set to use
TYPE: NDArray[T]
RETURNS DESCRIPTION Iterator[Collection[T]]
An iterator over all subsets of the set of indices s
.
Source code in src/pydvl/utils/numeric.py
def powerset(s: NDArray[T]) -> Iterator[Collection[T]]:\n \"\"\"Returns an iterator for the power set of the argument.\n\n Subsets are generated in sequence by growing size. See\n [random_powerset()][pydvl.utils.numeric.random_powerset] for random\n sampling.\n\n ??? Example\n ``` pycon\n >>> import numpy as np\n >>> from pydvl.utils.numeric import powerset\n >>> list(powerset(np.array((1,2))))\n [(), (1,), (2,), (1, 2)]\n ```\n\n Args:\n s: The set to use\n\n Returns:\n An iterator over all subsets of the set of indices `s`.\n \"\"\"\n return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))\n
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.num_samples_permutation_hoeffding","title":"num_samples_permutation_hoeffding","text":"num_samples_permutation_hoeffding(\n eps: float, delta: float, u_range: float\n) -> int\n
Lower bound on the number of samples required for MonteCarlo Shapley to obtain an (ε,δ)-approximation.
That is: with probability 1-δ, the estimated value for one data point will be ε-close to the true quantity, if at least this many permutations are sampled.
PARAMETER DESCRIPTION eps
ε > 0
TYPE: float
delta
0 < δ <= 1
TYPE: float
u_range
Range of the Utility function
TYPE: float
RETURNS DESCRIPTION int
Number of permutations required to guarantee ε-correct Shapley values with probability 1-δ
Source code in src/pydvl/utils/numeric.py
def num_samples_permutation_hoeffding(eps: float, delta: float, u_range: float) -> int:\n \"\"\"Lower bound on the number of samples required for MonteCarlo Shapley to\n obtain an (\u03b5,\u03b4)-approximation.\n\n That is: with probability 1-\u03b4, the estimated value for one data point will\n be \u03b5-close to the true quantity, if at least this many permutations are\n sampled.\n\n Args:\n eps: \u03b5 > 0\n delta: 0 < \u03b4 <= 1\n u_range: Range of the [Utility][pydvl.utils.utility.Utility] function\n\n Returns:\n Number of _permutations_ required to guarantee \u03b5-correct Shapley\n values with probability 1-\u03b4\n \"\"\"\n return int(np.ceil(np.log(2 / delta) * 2 * u_range**2 / eps**2))\n
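For instance, for a utility with range 1, ε = 0.1 and δ = 0.05, the bound ceil(log(2/δ) · 2 · u_range² / ε²) evaluates to 738 permutations:
Example >>> from pydvl.utils.numeric import num_samples_permutation_hoeffding\n>>> num_samples_permutation_hoeffding(eps=0.1, delta=0.05, u_range=1.0)\n738\n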
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.random_subset","title":"random_subset","text":"random_subset(\n s: NDArray[T], q: float = 0.5, seed: Optional[Seed] = None\n) -> NDArray[T]\n
Returns one subset at random from s
.
PARAMETER DESCRIPTION s
set to sample from
TYPE: NDArray[T]
q
Sampling probability for elements. The default 0.5 yields a uniform distribution over the power set of s.
TYPE: float
DEFAULT: 0.5
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION NDArray[T]
The subset
Source code in src/pydvl/utils/numeric.py
def random_subset(\n s: NDArray[T], q: float = 0.5, seed: Optional[Seed] = None\n) -> NDArray[T]:\n \"\"\"Returns one subset at random from ``s``.\n\n Args:\n s: set to sample from\n q: Sampling probability for elements. The default 0.5 yields a\n uniform distribution over the power set of s.\n seed: Either an instance of a numpy random number generator or a seed\n for it.\n\n Returns:\n The subset\n \"\"\"\n rng = np.random.default_rng(seed)\n selection = rng.uniform(size=len(s)) > q\n return s[selection]\n
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.random_powerset","title":"random_powerset","text":"random_powerset(\n s: NDArray[T],\n n_samples: Optional[int] = None,\n q: float = 0.5,\n seed: Optional[Seed] = None,\n) -> Generator[NDArray[T], None, None]\n
Samples subsets from the power set of the argument, without pre-generating all subsets and in no particular order.
See powerset if you wish to deterministically generate all subsets.
To generate subsets, len(s)
Bernoulli draws with probability q
are drawn. The default value of q = 0.5
provides a uniform distribution over the power set of s
. Other choices can be used e.g. to implement owen_sampling_shapley.
PARAMETER DESCRIPTION s
set to sample from
TYPE: NDArray[T]
n_samples
if set, stop the generator after this many steps. Defaults to np.iinfo(np.int32).max
TYPE: Optional[int]
DEFAULT: None
q
Sampling probability for elements. The default 0.5 yields a uniform distribution over the power set of s.
TYPE: float
DEFAULT: 0.5
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION Generator[NDArray[T], None, None]
Samples from the power set of s
.
RAISES DESCRIPTION ValueError
if the element sampling probability is not in [0,1]
Source code in src/pydvl/utils/numeric.py
def random_powerset(\n s: NDArray[T],\n n_samples: Optional[int] = None,\n q: float = 0.5,\n seed: Optional[Seed] = None,\n) -> Generator[NDArray[T], None, None]:\n \"\"\"Samples subsets from the power set of the argument, without\n pre-generating all subsets and in no order.\n\n See [powerset][pydvl.utils.numeric.powerset] if you wish to deterministically generate all subsets.\n\n To generate subsets, `len(s)` Bernoulli draws with probability `q` are\n drawn. The default value of `q = 0.5` provides a uniform distribution over\n the power set of `s`. Other choices can be used e.g. to implement\n [owen_sampling_shapley][pydvl.value.shapley.owen.owen_sampling_shapley].\n\n Args:\n s: set to sample from\n n_samples: if set, stop the generator after this many steps.\n Defaults to `np.iinfo(np.int32).max`\n q: Sampling probability for elements. The default 0.5 yields a\n uniform distribution over the power set of s.\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n Samples from the power set of `s`.\n\n Raises:\n ValueError: if the element sampling probability is not in [0,1]\n\n \"\"\"\n if q < 0 or q > 1:\n raise ValueError(\"Element sampling probability must be in [0,1]\")\n\n rng = np.random.default_rng(seed)\n total = 1\n if n_samples is None:\n n_samples = np.iinfo(np.int32).max\n while total <= n_samples:\n yield random_subset(s, q, seed=rng)\n total += 1\n
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.random_powerset_label_min","title":"random_powerset_label_min","text":"random_powerset_label_min(\n s: NDArray[T],\n labels: NDArray[int_],\n min_elements_per_label: int = 1,\n seed: Optional[Seed] = None,\n) -> Generator[NDArray[T], None, None]\n
Draws random subsets from s
, while ensuring that at least min_elements_per_label
elements per label are included in the draw. It can be used for classification problems to ensure that a set contains information for all labels (or not if min_elements_per_label=0
).
PARAMETER DESCRIPTION s
Set to sample from
TYPE: NDArray[T]
labels
Labels for the samples
TYPE: NDArray[int_]
min_elements_per_label
Minimum number of elements for each label.
TYPE: int
DEFAULT: 1
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION Generator[NDArray[T], None, None]
Generated draw from the powerset of s with min_elements_per_label
for each label.
RAISES DESCRIPTION ValueError
If s
and labels
are of different length or min_elements_per_label
is smaller than 0.
Source code in src/pydvl/utils/numeric.py
def random_powerset_label_min(\n s: NDArray[T],\n labels: NDArray[np.int_],\n min_elements_per_label: int = 1,\n seed: Optional[Seed] = None,\n) -> Generator[NDArray[T], None, None]:\n \"\"\"Draws random subsets from `s`, while ensuring that at least\n `min_elements_per_label` elements per label are included in the draw. It can be used\n for classification problems to ensure that a set contains information for all labels\n (or not if `min_elements_per_label=0`).\n\n Args:\n s: Set to sample from\n labels: Labels for the samples\n min_elements_per_label: Minimum number of elements for each label.\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n Generated draw from the powerset of s with `min_elements_per_label` for each\n label.\n\n Raises:\n ValueError: If `s` and `labels` are of different length or\n `min_elements_per_label` is smaller than 0.\n \"\"\"\n if len(labels) != len(s):\n raise ValueError(\"Set and labels have to be of same size.\")\n\n if min_elements_per_label < 0:\n raise ValueError(\n f\"Parameter min_elements={min_elements_per_label} needs to be bigger or \"\n f\"equal to 0.\"\n )\n\n rng = np.random.default_rng(seed)\n unique_labels = np.unique(labels)\n\n while True:\n subsets: List[NDArray[T]] = []\n for label in unique_labels:\n label_indices = np.asarray(np.where(labels == label)[0])\n subset_size = int(\n rng.integers(\n min(min_elements_per_label, len(label_indices)),\n len(label_indices) + 1,\n )\n )\n if subset_size > 0:\n subsets.append(\n random_subset_of_size(s[label_indices], subset_size, seed=rng)\n )\n\n if len(subsets) > 0:\n subset = np.concatenate(tuple(subsets))\n rng.shuffle(subset)\n yield subset\n else:\n yield np.array([], dtype=s.dtype)\n
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.random_subset_of_size","title":"random_subset_of_size","text":"random_subset_of_size(\n s: NDArray[T], size: int, seed: Optional[Seed] = None\n) -> NDArray[T]\n
Samples a random subset of given size uniformly from the powerset of s
.
PARAMETER DESCRIPTION s
Set to sample from
TYPE: NDArray[T]
size
Size of the subset to generate
TYPE: int
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION NDArray[T]
The subset
RAISES DESCRIPTION ValueError
If size > len(s)
Source code in src/pydvl/utils/numeric.py
def random_subset_of_size(\n s: NDArray[T], size: int, seed: Optional[Seed] = None\n) -> NDArray[T]:\n \"\"\"Samples a random subset of given size uniformly from the powerset\n of `s`.\n\n Args:\n s: Set to sample from\n size: Size of the subset to generate\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n The subset\n\n Raises\n ValueError: If size > len(s)\n \"\"\"\n if size > len(s):\n raise ValueError(\"Cannot sample subset larger than set\")\n rng = np.random.default_rng(seed)\n return rng.choice(s, size=size, replace=False)\n
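For example (a sketch with an arbitrary set and size):
import numpy as np
from pydvl.utils.numeric import random_subset_of_size

s = np.arange(10)
# Three distinct elements of s, drawn uniformly without replacement
subset = random_subset_of_size(s, size=3, seed=0)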
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.random_matrix_with_condition_number","title":"random_matrix_with_condition_number","text":"random_matrix_with_condition_number(\n n: int, condition_number: float, seed: Optional[Seed] = None\n) -> NDArray\n
Constructs a square matrix with a given condition number.
Taken from: https://gist.github.com/bstellato/23322fe5d87bb71da922fbc41d658079#file-random_mat_condition_number-py
Also see: https://math.stackexchange.com/questions/1351616/condition-number-of-ata.
PARAMETER DESCRIPTION n
size of the matrix
TYPE: int
condition_number
The desired condition number of the resulting matrix
TYPE: float
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION NDArray
An (n,n) matrix with the requested condition number.
Source code in src/pydvl/utils/numeric.py
def random_matrix_with_condition_number(\n n: int, condition_number: float, seed: Optional[Seed] = None\n) -> NDArray:\n \"\"\"Constructs a square matrix with a given condition number.\n\n Taken from:\n [https://gist.github.com/bstellato/23322fe5d87bb71da922fbc41d658079#file-random_mat_condition_number-py](\n https://gist.github.com/bstellato/23322fe5d87bb71da922fbc41d658079#file-random_mat_condition_number-py)\n\n Also see:\n [https://math.stackexchange.com/questions/1351616/condition-number-of-ata](\n https://math.stackexchange.com/questions/1351616/condition-number-of-ata).\n\n Args:\n n: size of the matrix\n condition_number: duh\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n An (n,n) matrix with the requested condition number.\n \"\"\"\n if n < 2:\n raise ValueError(\"Matrix size must be at least 2\")\n\n if condition_number <= 1:\n raise ValueError(\"Condition number must be greater than 1\")\n\n rng = np.random.default_rng(seed)\n log_condition_number = np.log(condition_number)\n exp_vec = np.arange(\n -log_condition_number / 4.0,\n log_condition_number * (n + 1) / (4 * (n - 1)),\n log_condition_number / (2.0 * (n - 1)),\n )\n exp_vec = exp_vec[:n]\n s: np.ndarray = np.exp(exp_vec)\n S = np.diag(s)\n U, _ = np.linalg.qr((rng.uniform(size=(n, n)) - 5.0) * 200)\n V, _ = np.linalg.qr((rng.uniform(size=(n, n)) - 5.0) * 200)\n P: np.ndarray = U.dot(S).dot(V.T)\n P = P.dot(P.T)\n return P\n
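A quick sketch to check the construction (matrix size and condition number chosen arbitrarily):
import numpy as np
from pydvl.utils.numeric import random_matrix_with_condition_number

mat = random_matrix_with_condition_number(5, condition_number=10.0, seed=0)
# The condition number of the result should be close to the requested one
print(np.linalg.cond(mat))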
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.running_moments","title":"running_moments","text":"running_moments(\n previous_avg: float | NDArray[float_],\n previous_variance: float | NDArray[float_],\n count: int,\n new_value: float | NDArray[float_],\n) -> Tuple[float | NDArray[float_], float | NDArray[float_]]\n
Uses Welford's algorithm to calculate the running average and variance of a set of numbers.
See Welford's algorithm in wikipedia
Warning
This is not really using Welford's correction for numerical stability for the variance. (FIXME)
Todo
This could be generalised to arbitrary moments. See this paper
PARAMETER DESCRIPTION previous_avg
average value at previous step
TYPE: float | NDArray[float_]
previous_variance
variance at previous step
TYPE: float | NDArray[float_]
count
number of points seen so far
TYPE: int
new_value
new value in the series of numbers
TYPE: float | NDArray[float_]
RETURNS DESCRIPTION Tuple[float | NDArray[float_], float | NDArray[float_]]
new_average, new_variance, calculated with the new count
Source code in src/pydvl/utils/numeric.py
def running_moments(\n previous_avg: float | NDArray[np.float_],\n previous_variance: float | NDArray[np.float_],\n count: int,\n new_value: float | NDArray[np.float_],\n) -> Tuple[float | NDArray[np.float_], float | NDArray[np.float_]]:\n \"\"\"Uses Welford's algorithm to calculate the running average and variance of\n a set of numbers.\n\n See [Welford's algorithm in wikipedia](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm)\n\n !!! Warning\n This is not really using Welford's correction for numerical stability\n for the variance. (FIXME)\n\n !!! Todo\n This could be generalised to arbitrary moments. See [this paper](https://www.osti.gov/biblio/1028931)\n\n Args:\n previous_avg: average value at previous step\n previous_variance: variance at previous step\n count: number of points seen so far\n new_value: new value in the series of numbers\n\n Returns:\n new_average, new_variance, calculated with the new count\n \"\"\"\n # broadcasted operations seem not to be supported by mypy, so we ignore the type\n new_average = (new_value + count * previous_avg) / (count + 1) # type: ignore\n new_variance = previous_variance + (\n (new_value - previous_avg) * (new_value - new_average) - previous_variance\n ) / (count + 1)\n return new_average, new_variance\n
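A minimal sketch of how the running update is used (the sequence of values is arbitrary):
import numpy as np
from pydvl.utils.numeric import running_moments

values = [1.0, 2.0, 3.0, 4.0]
avg, var = 0.0, 0.0
for count, v in enumerate(values):
    avg, var = running_moments(avg, var, count, v)
# avg and var now equal np.mean(values) and np.var(values) (population variance)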
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.top_k_value_accuracy","title":"top_k_value_accuracy","text":"top_k_value_accuracy(\n y_true: NDArray[float_], y_pred: NDArray[float_], k: int = 3\n) -> float\n
Computes the top-k accuracy for the estimated values by comparing indices of the highest k values.
PARAMETER DESCRIPTION y_true
Exact/true value
TYPE: NDArray[float_]
y_pred
Predicted/estimated value
TYPE: NDArray[float_]
k
Number of the highest values taken into account
TYPE: int
DEFAULT: 3
RETURNS DESCRIPTION float
Accuracy
Source code in src/pydvl/utils/numeric.py
def top_k_value_accuracy(\n y_true: NDArray[np.float_], y_pred: NDArray[np.float_], k: int = 3\n) -> float:\n \"\"\"Computes the top-k accuracy for the estimated values by comparing indices\n of the highest k values.\n\n Args:\n y_true: Exact/true value\n y_pred: Predicted/estimated value\n k: Number of the highest values taken into account\n\n Returns:\n Accuracy\n \"\"\"\n top_k_exact_values = np.argsort(y_true)[-k:]\n top_k_pred_values = np.argsort(y_pred)[-k:]\n top_k_accuracy = len(np.intersect1d(top_k_exact_values, top_k_pred_values)) / k\n return top_k_accuracy\n
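For example (made-up values; indices 1 and 2 hold the two largest entries of both arrays, so the top-2 accuracy is 1.0):
import numpy as np
from pydvl.utils.numeric import top_k_value_accuracy

y_true = np.array([0.1, 0.9, 0.8, 0.2])
y_pred = np.array([0.2, 0.8, 0.7, 0.1])
accuracy = top_k_value_accuracy(y_true, y_pred, k=2)  # 1.0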
"},{"location":"api/pydvl/utils/progress/","title":"Progress","text":""},{"location":"api/pydvl/utils/progress/#pydvl.utils.progress","title":"pydvl.utils.progress","text":""},{"location":"api/pydvl/utils/progress/#pydvl.utils.progress.repeat_indices","title":"repeat_indices","text":"repeat_indices(\n indices: Collection[int],\n result: ValuationResult,\n done: StoppingCriterion,\n **kwargs\n) -> Iterator[int]\n
Helper function to cycle indefinitely over a collection of indices until the stopping criterion is satisfied while displaying progress.
PARAMETER DESCRIPTION indices
Collection of indices that will be cycled until done.
TYPE: Collection[int]
result
Object containing the current results.
TYPE: ValuationResult
done
Stopping criterion.
TYPE: StoppingCriterion
kwargs
Keyword arguments passed to tqdm.
DEFAULT: {}
Source code in src/pydvl/utils/progress.py
def repeat_indices(\n indices: Collection[int],\n result: \"ValuationResult\",\n done: \"StoppingCriterion\",\n **kwargs,\n) -> Iterator[int]:\n \"\"\"Helper function to cycle indefinitely over a collection of indices\n until the stopping criterion is satisfied while displaying progress.\n\n Args:\n indices: Collection of indices that will be cycled until done.\n result: Object containing the current results.\n done: Stopping criterion.\n kwargs: Keyword arguments passed to tqdm.\n \"\"\"\n with tqdm(total=100, unit=\"%\", **kwargs) as pbar:\n it = takewhile(lambda _: not done(result), cycle(indices))\n for i in it:\n yield i\n pbar.update(100 * done.completion() - pbar.n)\n pbar.refresh()\n
"},{"location":"api/pydvl/utils/progress/#pydvl.utils.progress.log_duration","title":"log_duration","text":"log_duration(_func=None, *, log_level=logging.DEBUG)\n
Decorator to log execution time of a function with a configurable logging level. It can be used with or without specifying a log level.
Source code in src/pydvl/utils/progress.py
def log_duration(_func=None, *, log_level=logging.DEBUG):\n \"\"\"\n Decorator to log execution time of a function with a configurable logging level.\n It can be used with or without specifying a log level.\n \"\"\"\n\n def decorator_log_duration(func):\n @wraps(func)\n def wrapper_log_duration(*args, **kwargs):\n func_name = func.__qualname__\n logger.log(log_level, f\"Function '{func_name}' is starting.\")\n start_time = time()\n result = func(*args, **kwargs)\n duration = time() - start_time\n logger.log(\n log_level,\n f\"Function '{func_name}' completed. \" f\"Duration: {duration:.2f} sec\",\n )\n return result\n\n return wrapper_log_duration\n\n if _func is None:\n # If log_duration was called without arguments, return decorator\n return decorator_log_duration\n else:\n # If log_duration was called with a function, apply decorator directly\n return decorator_log_duration(_func)\n
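A small usage sketch (the decorated function is a stand-in for some long-running computation):
import logging
from pydvl.utils.progress import log_duration

logging.basicConfig(level=logging.INFO)

@log_duration(log_level=logging.INFO)
def expensive_step():
    # stand-in for an expensive computation
    return sum(range(1_000_000))

expensive_step()  # logs the start and the duration of 'expensive_step' at INFO level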
"},{"location":"api/pydvl/utils/score/","title":"Score","text":""},{"location":"api/pydvl/utils/score/#pydvl.utils.score","title":"pydvl.utils.score","text":"This module provides a Scorer class that wraps scoring functions with additional information.
Scorers are the fundamental building block of many data valuation methods. They are typically used by the Utility class to evaluate the quality of a model when trained on subsets of the training data.
Scorers can be constructed in the same way as in scikit-learn: either from known strings or from a callable. Greater values must be better. If they are not, a negated version can be used, see scikit-learn's make_scorer().
Scorer provides additional information about the scoring function, like its range and default values, which can be used by some data valuation methods (like group_testing_shapley()) to estimate the number of samples required for a certain quality of approximation.
"},{"location":"api/pydvl/utils/score/#pydvl.utils.score.squashed_r2","title":"squashed_r2 module-attribute
","text":"squashed_r2 = compose_score(Scorer('r2'), _sigmoid, (0, 1), 'squashed r2')\n
A scorer that squashes the R² score into the range [0, 1] using a sigmoid.
"},{"location":"api/pydvl/utils/score/#pydvl.utils.score.squashed_variance","title":"squashed_variance module-attribute
","text":"squashed_variance = compose_score(\n Scorer(\"explained_variance\"),\n _sigmoid,\n (0, 1),\n \"squashed explained variance\",\n)\n
A scorer that squashes the explained variance score into the range [0, 1] using a sigmoid.
"},{"location":"api/pydvl/utils/score/#pydvl.utils.score.ScorerCallable","title":"ScorerCallable","text":" Bases: Protocol
Signature for a scorer
"},{"location":"api/pydvl/utils/score/#pydvl.utils.score.Scorer","title":"Scorer","text":"Scorer(\n scoring: Union[str, ScorerCallable],\n default: float = np.nan,\n range: Tuple = (-np.inf, np.inf),\n name: Optional[str] = None,\n)\n
A scoring callable that takes a model, data, and labels and returns a scalar.
PARAMETER DESCRIPTION scoring
Either a string or callable that can be passed to get_scorer.
TYPE: Union[str, ScorerCallable]
default
score to be used when a model cannot be fit, e.g. when too little data is passed, or errors arise.
TYPE: float
DEFAULT: nan
range
numerical range of the score function. Some Monte Carlo methods can use this to estimate the number of samples required for a certain quality of approximation. If not provided, it can be read from the scoring
object if it provides it, for instance if it was constructed with compose_score().
TYPE: Tuple
DEFAULT: (-inf, inf)
name
The name of the scorer. If not provided, the name of the function passed will be used.
TYPE: Optional[str]
DEFAULT: None
New in version 0.5.0
Source code in src/pydvl/utils/score.py
def __init__(\n self,\n scoring: Union[str, ScorerCallable],\n default: float = np.nan,\n range: Tuple = (-np.inf, np.inf),\n name: Optional[str] = None,\n):\n if name is None and isinstance(scoring, str):\n name = scoring\n self._scorer = get_scorer(scoring)\n self.default = default\n # TODO: auto-fill from known scorers ?\n self.range = np.array(range)\n self._name = getattr(self._scorer, \"__name__\", name or \"scorer\")\n
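For instance, a scorer can be built from a scikit-learn scoring string, with an explicit default and range (the values below are illustrative):
from pydvl.utils.score import Scorer

accuracy_scorer = Scorer("accuracy", default=0.0, range=(0, 1))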
"},{"location":"api/pydvl/utils/score/#pydvl.utils.score.compose_score","title":"compose_score","text":"compose_score(\n scorer: Scorer,\n transformation: Callable[[float], float],\n range: Tuple[float, float],\n name: str,\n) -> Scorer\n
Composes a scoring function with an arbitrary scalar transformation.
Useful to squash unbounded scores into ranges manageable by data valuation methods.
Example:
sigmoid = lambda x: 1/(1+np.exp(-x))\ncompose_score(Scorer(\"r2\"), sigmoid, range=(0,1), name=\"squashed r2\")\n
PARAMETER DESCRIPTION scorer
The object to be composed.
TYPE: Scorer
transformation
A scalar transformation
TYPE: Callable[[float], float]
range
The range of the transformation. This will be used e.g. by Utility for the range of the composed scorer.
TYPE: Tuple[float, float]
name
A string representation for the composition, for str()
.
TYPE: str
RETURNS DESCRIPTION Scorer
The composite Scorer.
Source code in src/pydvl/utils/score.py
def compose_score(\n scorer: Scorer,\n transformation: Callable[[float], float],\n range: Tuple[float, float],\n name: str,\n) -> Scorer:\n \"\"\"Composes a scoring function with an arbitrary scalar transformation.\n\n Useful to squash unbounded scores into ranges manageable by data valuation\n methods.\n\n Example:\n\n ```python\n sigmoid = lambda x: 1/(1+np.exp(-x))\n compose_score(Scorer(\"r2\"), sigmoid, range=(0,1), name=\"squashed r2\")\n ```\n\n Args:\n scorer: The object to be composed.\n transformation: A scalar transformation\n range: The range of the transformation. This will be used e.g. by\n [Utility][pydvl.utils.utility.Utility] for the range of the composed.\n name: A string representation for the composition, for `str()`.\n\n Returns:\n The composite [Scorer][pydvl.utils.score.Scorer].\n \"\"\"\n\n class CompositeScorer(Scorer):\n def __call__(self, model: SupervisedModel, X: NDArray, y: NDArray) -> float:\n score = self._scorer(model=model, X=X, y=y)\n return transformation(score)\n\n return CompositeScorer(scorer, range=range, name=name)\n
"},{"location":"api/pydvl/utils/status/","title":"Status","text":""},{"location":"api/pydvl/utils/status/#pydvl.utils.status","title":"pydvl.utils.status","text":""},{"location":"api/pydvl/utils/status/#pydvl.utils.status.Status","title":"Status","text":" Bases: Enum
Status of a computation.
Statuses can be combined using bitwise or (|
) and bitwise and (&
) to get the status of a combined computation. For example, if we have two computations, one that has converged and one that has failed, then the combined status is Status.Converged | Status.Failed == Status.Converged
, but Status.Converged & Status.Failed == Status.Failed
.
"},{"location":"api/pydvl/utils/status/#pydvl.utils.status.Status--or","title":"OR","text":"The result of bitwise or-ing two valuation statuses with |
is given by the following table:
      P   C   F
  P   P   C   P
  C   C   C   C
  F   P   C   F
where P = Pending, C = Converged, F = Failed.
"},{"location":"api/pydvl/utils/status/#pydvl.utils.status.Status--and","title":"AND","text":"The result of bitwise and-ing two valuation statuses with &
is given by the following table:
      P   C   F
  P   P   P   F
  C   P   C   F
  F   F   F   F
where P = Pending, C = Converged, F = Failed.
"},{"location":"api/pydvl/utils/status/#pydvl.utils.status.Status--not","title":"NOT","text":"The result of bitwise negation of a Status with ~
is Failed
if the status is Converged
, or Converged
otherwise:
~P == C, ~C == F, ~F == C\n
"},{"location":"api/pydvl/utils/status/#pydvl.utils.status.Status--boolean-casting","title":"Boolean casting","text":"A Status evaluates to True
iff it's Converged
or Failed
:
bool(Status.Pending) == False\nbool(Status.Converged) == True\nbool(Status.Failed) == True\n
Warning
These truth values are inconsistent with the usual boolean operations. In particular the XOR of two instances of Status
is not the same as the XOR of their boolean values.
"},{"location":"api/pydvl/utils/types/","title":"Types","text":""},{"location":"api/pydvl/utils/types/#pydvl.utils.types","title":"pydvl.utils.types","text":"This module contains types, protocols, decorators and generic function transformations. Some of it probably belongs elsewhere.
"},{"location":"api/pydvl/utils/types/#pydvl.utils.types.SupervisedModel","title":"SupervisedModel","text":" Bases: Protocol
This is the minimal Protocol that valuation methods require from models in order to work.
All that is needed are the standard sklearn methods fit()
, predict()
and score()
.
"},{"location":"api/pydvl/utils/types/#pydvl.utils.types.SupervisedModel.fit","title":"fit","text":"fit(x: NDArray, y: NDArray)\n
Fit the model to the data
PARAMETER DESCRIPTION x
Independent variables
TYPE: NDArray
y
Dependent variable
TYPE: NDArray
Source code in src/pydvl/utils/types.py
def fit(self, x: NDArray, y: NDArray):\n \"\"\"Fit the model to the data\n\n Args:\n x: Independent variables\n y: Dependent variable\n \"\"\"\n pass\n
"},{"location":"api/pydvl/utils/types/#pydvl.utils.types.SupervisedModel.predict","title":"predict","text":"predict(x: NDArray) -> NDArray\n
Compute predictions for the input
PARAMETER DESCRIPTION x
Independent variables for which to compute predictions
TYPE: NDArray
RETURNS DESCRIPTION NDArray
Predictions for the input
Source code in src/pydvl/utils/types.py
def predict(self, x: NDArray) -> NDArray:\n \"\"\"Compute predictions for the input\n\n Args:\n x: Independent variables for which to compute predictions\n\n Returns:\n Predictions for the input\n \"\"\"\n pass\n
"},{"location":"api/pydvl/utils/types/#pydvl.utils.types.SupervisedModel.score","title":"score","text":"score(x: NDArray, y: NDArray) -> float\n
Compute the score of the model given test data
PARAMETER DESCRIPTION x
Independent variables
TYPE: NDArray
y
Dependent variable
TYPE: NDArray
RETURNS DESCRIPTION float
The score of the model on (x, y)
Source code in src/pydvl/utils/types.py
def score(self, x: NDArray, y: NDArray) -> float:\n \"\"\"Compute the score of the model given test data\n\n Args:\n x: Independent variables\n y: Dependent variable\n\n Returns:\n The score of the model on `(x, y)`\n \"\"\"\n pass\n
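To make the structural requirement concrete, here is a minimal sketch of a class that satisfies the protocol (a hypothetical model, not part of pyDVL):
import numpy as np
from numpy.typing import NDArray

class MeanPredictor:
    # Always predicts the mean of the training targets
    def fit(self, x: NDArray, y: NDArray):
        self._mean = float(np.mean(y))

    def predict(self, x: NDArray) -> NDArray:
        return np.full(len(x), self._mean)

    def score(self, x: NDArray, y: NDArray) -> float:
        # Negative mean squared error, so that greater is better
        return -float(np.mean((self.predict(x) - y) ** 2))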
"},{"location":"api/pydvl/utils/types/#pydvl.utils.types.ensure_seed_sequence","title":"ensure_seed_sequence","text":"ensure_seed_sequence(\n seed: Optional[Union[Seed, SeedSequence]] = None\n) -> SeedSequence\n
If the passed seed is a SeedSequence object, it is returned as is. If it is a Generator, the internal protected seed sequence of the generator is extracted. Otherwise, a new SeedSequence object is created from the passed (optional) seed.
PARAMETER DESCRIPTION seed
Either an int, a Generator object, a SeedSequence object, or None.
TYPE: Optional[Union[Seed, SeedSequence]]
DEFAULT: None
RETURNS DESCRIPTION SeedSequence
A SeedSequence object.
New in version 0.7.0
Source code in src/pydvl/utils/types.py
def ensure_seed_sequence(\n seed: Optional[Union[Seed, SeedSequence]] = None\n) -> SeedSequence:\n \"\"\"\n If the passed seed is a SeedSequence object then it is returned as is. If it is\n a Generator the internal protected seed sequence from the generator gets extracted.\n Otherwise, a new SeedSequence object is created from the passed (optional) seed.\n\n Args:\n seed: Either an int, a Generator object a SeedSequence object or None.\n\n Returns:\n A SeedSequence object.\n\n !!! tip \"New in version 0.7.0\"\n \"\"\"\n if isinstance(seed, SeedSequence):\n return seed\n elif isinstance(seed, Generator):\n return cast(SeedSequence, seed.bit_generator.seed_seq) # type: ignore\n else:\n return SeedSequence(seed)\n
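A short usage sketch (seeds chosen arbitrarily):
import numpy as np
from pydvl.utils.types import ensure_seed_sequence

ss = ensure_seed_sequence(42)                          # from an int
ss2 = ensure_seed_sequence(np.random.default_rng(0))   # from a Generator
child_a, child_b = ss.spawn(2)                         # e.g. independent seeds for two workers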
"},{"location":"api/pydvl/utils/utility/","title":"Utility","text":""},{"location":"api/pydvl/utils/utility/#pydvl.utils.utility","title":"pydvl.utils.utility","text":"This module contains classes to manage and learn utility functions for the computation of values. Please see the documentation on Computing Data Values for more information.
Utility holds information about the model, the data and the scoring function (the latter being what one usually understands by utility in the general definition of Shapley value). Its evaluations are automatically cached, also across machines, when the cache is configured and enabled upon construction.
DataUtilityLearning adds support for learning the scoring function to avoid repeated re-training of the model to compute the score.
This module also contains derived Utility
classes for toy games that are used for testing and for demonstration purposes.
"},{"location":"api/pydvl/utils/utility/#pydvl.utils.utility--references","title":"References","text":" -
Wang, T., Yang, Y. and Jia, R., 2021. Improving cooperative game theory-based data valuation via data utility learning. arXiv preprint arXiv:2107.06336.
"},{"location":"api/pydvl/utils/utility/#pydvl.utils.utility.Utility","title":"Utility","text":"Utility(\n model: SupervisedModel,\n data: Dataset,\n scorer: Optional[Union[str, Scorer]] = None,\n *,\n default_score: float = 0.0,\n score_range: Tuple[float, float] = (-np.inf, np.inf),\n catch_errors: bool = True,\n show_warnings: bool = False,\n cache_backend: Optional[CacheBackend] = None,\n cached_func_options: Optional[CachedFuncConfig] = None,\n clone_before_fit: bool = True\n)\n
Convenience wrapper with configurable memoization of the scoring function.
An instance of Utility
holds the triple of model, dataset and scoring function which determines the value of data points. This is used for the computation of all game-theoretic values like Shapley values and the Least Core.
The Utility expect the model to fulfill the SupervisedModel interface i.e. to have fit()
, predict()
, and score()
methods.
When calling the utility, the model will be cloned if it is a Sci-Kit Learn model, otherwise a copy is created using copy.deepcopy
Since evaluating the scoring function requires retraining the model and that can be time-consuming, this class wraps it and caches the results of each execution. Caching is available both locally and across nodes, but must always be enabled for your project first, see the documentation and the module documentation.
ATTRIBUTE DESCRIPTION model
The supervised model.
TYPE: SupervisedModel
data
An object containing the split data.
TYPE: Dataset
scorer
A scoring function. If None, the score()
method of the model will be used. See score for ways to create and compose scorers, in particular how to set default values and ranges.
TYPE: Scorer
PARAMETER DESCRIPTION model
Any supervised model. Typical choices can be found in the scikit-learn documentation (https://scikit-learn.org/stable/supervised_learning.html).
TYPE: SupervisedModel
data
Dataset or GroupedDataset instance.
TYPE: Dataset
scorer
A scoring object. If None, the score()
method of the model will be used. See score for ways to create and compose scorers, in particular how to set default values and ranges. For convenience, a string can be passed, which will be used to construct a Scorer.
TYPE: Optional[Union[str, Scorer]]
DEFAULT: None
default_score
As a convenience when no scorer
object is passed (where a default value can be provided), this argument also allows to set the default score for models that have not been fit, e.g. when too little data is passed, or errors arise.
TYPE: float
DEFAULT: 0.0
score_range
As with default_score
, this is a convenience argument for when no scorer
argument is provided, to set the numerical range of the score function. Some Monte Carlo methods can use this to estimate the number of samples required for a certain quality of approximation.
TYPE: Tuple[float, float]
DEFAULT: (-inf, inf)
catch_errors
set to True
to catch the errors when fit()
fails. This could happen in several steps of the pipeline, e.g. when too little training data is passed, which happens often during Shapley value calculations. When this happens, the default_score
is returned as a score and computation continues.
TYPE: bool
DEFAULT: True
show_warnings
Set to False
to suppress warnings thrown by fit()
.
TYPE: bool
DEFAULT: False
cache_backend
Optional instance of CacheBackend used to wrap the _utility method of the Utility instance. By default, this is set to None and that means that the utility evaluations will not be cached.
TYPE: Optional[CacheBackend]
DEFAULT: None
cached_func_options
Optional configuration object for cached utility evaluation.
TYPE: Optional[CachedFuncConfig]
DEFAULT: None
clone_before_fit
If True
, the model will be cloned before calling fit()
.
TYPE: bool
DEFAULT: True
Example >>> from pydvl.utils import Utility, DataUtilityLearning, Dataset\n>>> from sklearn.linear_model import LinearRegression, LogisticRegression\n>>> from sklearn.datasets import load_iris\n>>> dataset = Dataset.from_sklearn(load_iris(), random_state=16)\n>>> u = Utility(LogisticRegression(random_state=16), dataset)\n>>> u(dataset.indices)\n0.9\n
With caching enabled:
>>> from pydvl.utils import Utility, DataUtilityLearning, Dataset\n>>> from pydvl.utils.caching.memory import InMemoryCacheBackend\n>>> from sklearn.linear_model import LinearRegression, LogisticRegression\n>>> from sklearn.datasets import load_iris\n>>> dataset = Dataset.from_sklearn(load_iris(), random_state=16)\n>>> cache_backend = InMemoryCacheBackend()\n>>> u = Utility(LogisticRegression(random_state=16), dataset, cache_backend=cache_backend)\n>>> u(dataset.indices)\n0.9\n
Source code in src/pydvl/utils/utility.py
def __init__(\n self,\n model: SupervisedModel,\n data: Dataset,\n scorer: Optional[Union[str, Scorer]] = None,\n *,\n default_score: float = 0.0,\n score_range: Tuple[float, float] = (-np.inf, np.inf),\n catch_errors: bool = True,\n show_warnings: bool = False,\n cache_backend: Optional[CacheBackend] = None,\n cached_func_options: Optional[CachedFuncConfig] = None,\n clone_before_fit: bool = True,\n):\n self.model = self._clone_model(model)\n self.data = data\n if isinstance(scorer, str):\n scorer = Scorer(scorer, default=default_score, range=score_range)\n self.scorer = check_scoring(self.model, scorer)\n self.default_score = scorer.default if scorer is not None else default_score\n # TODO: auto-fill from known scorers ?\n self.score_range = scorer.range if scorer is not None else np.array(score_range)\n self.clone_before_fit = clone_before_fit\n self.catch_errors = catch_errors\n self.show_warnings = show_warnings\n self.cache = cache_backend\n if cached_func_options is None:\n cached_func_options = CachedFuncConfig()\n # TODO: Find a better way to do this.\n if cached_func_options.hash_prefix is None:\n # FIX: This does not handle reusing the same across runs.\n cached_func_options.hash_prefix = str(hash((model, data, scorer)))\n self.cached_func_options = cached_func_options\n self._initialize_utility_wrapper()\n
"},{"location":"api/pydvl/utils/utility/#pydvl.utils.utility.Utility.cache_stats","title":"cache_stats property
","text":"cache_stats: Optional[CacheStats]\n
Cache statistics are gathered when cache is enabled. See CacheStats for all fields returned.
"},{"location":"api/pydvl/utils/utility/#pydvl.utils.utility.Utility.__call__","title":"__call__","text":"__call__(indices: Iterable[int]) -> float\n
PARAMETER DESCRIPTION indices
a subset of valid indices for the x_train
attribute of Dataset.
TYPE: Iterable[int]
Source code in src/pydvl/utils/utility.py
def __call__(self, indices: Iterable[int]) -> float:\n \"\"\"\n Args:\n indices: a subset of valid indices for the\n `x_train` attribute of [Dataset][pydvl.utils.dataset.Dataset].\n \"\"\"\n utility: float = self._utility_wrapper(frozenset(indices))\n return utility\n
"},{"location":"api/pydvl/utils/utility/#pydvl.utils.utility.DataUtilityLearning","title":"DataUtilityLearning","text":"DataUtilityLearning(u: Utility, training_budget: int, model: SupervisedModel)\n
Implementation of Data Utility Learning (Wang et al., 2022)1.
This object wraps a Utility and delegates calls to it, up until a given budget (number of iterations). Every tuple of input and output (a so-called utility sample) is stored. Once the budget is exhausted, DataUtilityLearning
fits the given model to the utility samples. Subsequent calls will use the learned model to predict the utility instead of delegating.
PARAMETER DESCRIPTION u
The Utility to learn.
TYPE: Utility
training_budget
Number of utility samples to collect before fitting the given model.
TYPE: int
model
A supervised regression model
TYPE: SupervisedModel
Example >>> from pydvl.utils import Utility, DataUtilityLearning, Dataset\n>>> from sklearn.linear_model import LinearRegression, LogisticRegression\n>>> from sklearn.datasets import load_iris\n>>> dataset = Dataset.from_sklearn(load_iris())\n>>> u = Utility(LogisticRegression(), dataset)\n>>> wrapped_u = DataUtilityLearning(u, 3, LinearRegression())\n... # First 3 calls will be computed normally\n>>> for i in range(3):\n... _ = wrapped_u((i,))\n>>> wrapped_u((1, 2, 3)) # Subsequent calls will be computed using the fit model for DUL\n0.0\n
Source code in src/pydvl/utils/utility.py
def __init__(\n self, u: Utility, training_budget: int, model: SupervisedModel\n) -> None:\n self.utility = u\n self.training_budget = training_budget\n self.model = model\n self._current_iteration = 0\n self._is_model_fit = False\n self._utility_samples: Dict[FrozenSet, Tuple[NDArray[np.bool_], float]] = {}\n
"},{"location":"api/pydvl/utils/utility/#pydvl.utils.utility.DataUtilityLearning.data","title":"data property
","text":"data: Dataset\n
Returns the wrapped utility's Dataset.
"},{"location":"api/pydvl/utils/caching/","title":"Caching","text":""},{"location":"api/pydvl/utils/caching/#pydvl.utils.caching","title":"pydvl.utils.caching","text":"This module provides caching of functions.
PyDVL can cache (memoize) the computation of the utility function and speed up some computations for data valuation.
Warning
Function evaluations are cached with a key based on the function's signature and code. This can lead to undesired cache hits, see Cache reuse.
Remember not to reuse utility objects for different datasets.
"},{"location":"api/pydvl/utils/caching/#pydvl.utils.caching--configuration","title":"Configuration","text":"Caching is disabled by default but can be enabled easily, see Setting up the cache. When enabled, it will be added to any callable used to construct a Utility (done with the wrap method of CacheBackend). Depending on the nature of the utility you might want to enable the computation of a running average of function values, see Usage with stochastic functions. You can see all configuration options under CachedFuncConfig.
"},{"location":"api/pydvl/utils/caching/#pydvl.utils.caching--supported-backends","title":"Supported Backends","text":"pyDVL supports 3 different caching backends:
- InMemoryCacheBackend: an in-memory cache backend that uses a dictionary to store and retrieve cached values. This is used to share cached values between threads in a single process.
- DiskCacheBackend: a disk-based cache backend that uses pickled values written to and read from disk. This is used to share cached values between processes in a single machine.
-
MemcachedCacheBackend: a Memcached-based cache backend that uses pickled values written to and read from a Memcached server. This is used to share cached values between processes across multiple machines.
Info
This specific backend requires optional dependencies not installed by default. See Extra dependencies for more information.
"},{"location":"api/pydvl/utils/caching/#pydvl.utils.caching--usage-with-stochastic-functions","title":"Usage with stochastic functions","text":"In addition to standard memoization, the wrapped functions can compute running average and standard error of repeated evaluations for the same input. This can be useful for stochastic functions with high variance (e.g. model training for small sample sizes), but drastically reduces the speed benefits of memoization.
This behaviour can be activated with the option allow_repeated_evaluations.
"},{"location":"api/pydvl/utils/caching/#pydvl.utils.caching--cache-reuse","title":"Cache reuse","text":"When working directly with CachedFunc, it is essential to only cache pure functions. If they have any kind of state, either internal or external (e.g. a closure over some data that may change), then the cache will fail to notice this and the same value will be returned.
When a function is wrapped with CachedFunc for memoization, its signature (input and output names) and code are used as a key for the cache.
If you are running experiments with the same Utility but different datasets, this will lead to evaluations of the utility on new data returning old values because utilities only use sample indices as arguments (so there is no way to tell the difference between '1' for dataset A and '1' for dataset 2 from the point of view of the cache). One solution is to empty the cache between runs by calling the clear
method of the cache backend instance, but the preferred one is to use a different Utility object for each dataset.
"},{"location":"api/pydvl/utils/caching/#pydvl.utils.caching--unexpected-cache-misses","title":"Unexpected cache misses","text":"Because all arguments to a function are used as part of the key for the cache, sometimes one must exclude some of them. For example, If a function is going to run across multiple processes and some reporting arguments are added (like a job_id
for logging purposes), these will be part of the signature and make the functions distinct to the eyes of the cache. This can be avoided with the use of ignore_args option in the configuration.
"},{"location":"api/pydvl/utils/caching/base/","title":"Base","text":""},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base","title":"pydvl.utils.caching.base","text":""},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CacheStats","title":"CacheStats dataclass
","text":"CacheStats(\n sets: int = 0,\n misses: int = 0,\n hits: int = 0,\n timeouts: int = 0,\n errors: int = 0,\n reconnects: int = 0,\n)\n
Class used to store statistics gathered by cached functions.
ATTRIBUTE DESCRIPTION sets
Number of times a value was set in the cache.
TYPE: int
misses
Number of times a value was not found in the cache.
TYPE: int
hits
Number of times a value was found in the cache.
TYPE: int
timeouts
Number of times a timeout occurred.
TYPE: int
errors
Number of times an error occurred.
TYPE: int
reconnects
Number of times the client reconnected to the server.
TYPE: int
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CacheResult","title":"CacheResult dataclass
","text":"CacheResult(value: float, count: int = 1, variance: float = 0.0)\n
A class used to store the cached result of a computation as well as count and variance when using repeated evaluation.
ATTRIBUTE DESCRIPTION value
Cached value.
TYPE: float
count
Number of times this value has been computed.
TYPE: int
variance
Variance associated with the cached value.
TYPE: float
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CacheBackend","title":"CacheBackend","text":"CacheBackend()\n
Bases: ABC
Abstract base class for cache backends.
Defines interface for cache access including wrapping callables, getting/setting results, clearing cache, and combining cache keys.
ATTRIBUTE DESCRIPTION stats
Cache statistics tracker.
Source code in src/pydvl/utils/caching/base.py
def __init__(self) -> None:\n self.stats = CacheStats()\n
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CacheBackend.wrap","title":"wrap","text":"wrap(\n func: Callable, *, config: Optional[CachedFuncConfig] = None\n) -> CachedFunc\n
Wraps a function to cache its results.
PARAMETER DESCRIPTION func
The function to wrap.
TYPE: Callable
config
Optional caching options for the wrapped function.
TYPE: Optional[CachedFuncConfig]
DEFAULT: None
RETURNS DESCRIPTION CachedFunc
The wrapped cached function.
Source code in src/pydvl/utils/caching/base.py
def wrap(\n self,\n func: Callable,\n *,\n config: Optional[CachedFuncConfig] = None,\n) -> \"CachedFunc\":\n \"\"\"Wraps a function to cache its results.\n\n Args:\n func: The function to wrap.\n config: Optional caching options for the wrapped function.\n\n Returns:\n The wrapped cached function.\n \"\"\"\n return CachedFunc(\n func,\n cache_backend=self,\n config=config,\n )\n
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CacheBackend.get","title":"get abstractmethod
","text":"get(key: str) -> Optional[CacheResult]\n
Abstract method to retrieve a cached result.
Implemented by subclasses.
PARAMETER DESCRIPTION key
The cache key.
TYPE: str
RETURNS DESCRIPTION Optional[CacheResult]
The cached result or None if not found.
Source code in src/pydvl/utils/caching/base.py
@abstractmethod\ndef get(self, key: str) -> Optional[CacheResult]:\n \"\"\"Abstract method to retrieve a cached result.\n\n Implemented by subclasses.\n\n Args:\n key: The cache key.\n\n Returns:\n The cached result or None if not found.\n \"\"\"\n pass\n
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CacheBackend.set","title":"set abstractmethod
","text":"set(key: str, value: CacheResult) -> None\n
Abstract method to set a cached result.
Implemented by subclasses.
PARAMETER DESCRIPTION key
The cache key.
TYPE: str
value
The result to cache.
TYPE: CacheResult
Source code in src/pydvl/utils/caching/base.py
@abstractmethod\ndef set(self, key: str, value: CacheResult) -> None:\n \"\"\"Abstract method to set a cached result.\n\n Implemented by subclasses.\n\n Args:\n key: The cache key.\n value: The result to cache.\n \"\"\"\n pass\n
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CacheBackend.clear","title":"clear abstractmethod
","text":"clear() -> None\n
Abstract method to clear the entire cache.
Source code in src/pydvl/utils/caching/base.py
@abstractmethod\ndef clear(self) -> None:\n \"\"\"Abstract method to clear the entire cache.\"\"\"\n pass\n
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CacheBackend.combine_hashes","title":"combine_hashes abstractmethod
","text":"combine_hashes(*args: str) -> str\n
Abstract method to combine cache keys.
Source code in src/pydvl/utils/caching/base.py
@abstractmethod\ndef combine_hashes(self, *args: str) -> str:\n \"\"\"Abstract method to combine cache keys.\"\"\"\n pass\n
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CachedFunc","title":"CachedFunc","text":"CachedFunc(\n func: Callable[..., float],\n *,\n cache_backend: CacheBackend,\n config: Optional[CachedFuncConfig] = None\n)\n
Caches callable function results with a provided cache backend.
Wraps a callable function to cache its results using a provided an instance of a subclass of CacheBackend.
This class is heavily inspired from that of joblib.memory.MemorizedFunc.
This class caches calls to the wrapped callable by generating a hash key based on the wrapped callable's code, the arguments passed to it and the optional hash_prefix.
Warning
This class only works with hashable arguments to the wrapped callable.
PARAMETER DESCRIPTION func
Callable to wrap.
TYPE: Callable[..., float]
cache_backend
Instance of CacheBackendBase that handles setting and getting values.
TYPE: CacheBackend
config
Configuration for wrapped function.
TYPE: Optional[CachedFuncConfig]
DEFAULT: None
Source code in src/pydvl/utils/caching/base.py
def __init__(\n self,\n func: Callable[..., float],\n *,\n cache_backend: CacheBackend,\n config: Optional[CachedFuncConfig] = None,\n) -> None:\n self.func = func\n self.cache_backend = cache_backend\n if config is None:\n config = CachedFuncConfig()\n self.config = config\n\n self.__doc__ = f\"A wrapper around {func.__name__}() with caching enabled.\\n\" + (\n CachedFunc.__doc__ or \"\"\n )\n self.__name__ = f\"cached_{func.__name__}\"\n path = list(reversed(func.__qualname__.split(\".\")))\n patched = [f\"cached_{path[0]}\"] + path[1:]\n self.__qualname__ = \".\".join(reversed(patched))\n
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CachedFunc.stats","title":"stats property
","text":"stats: CacheStats\n
Cache backend statistics.
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CachedFunc.__call__","title":"__call__","text":"__call__(*args, **kwargs) -> float\n
Call the wrapped cached function.
Executes the wrapped function, caching and returning the result.
Source code in src/pydvl/utils/caching/base.py
def __call__(self, *args, **kwargs) -> float:\n \"\"\"Call the wrapped cached function.\n\n Executes the wrapped function, caching and returning the result.\n \"\"\"\n return self._cached_call(args, kwargs)\n
"},{"location":"api/pydvl/utils/caching/config/","title":"Config","text":""},{"location":"api/pydvl/utils/caching/config/#pydvl.utils.caching.config","title":"pydvl.utils.caching.config","text":""},{"location":"api/pydvl/utils/caching/config/#pydvl.utils.caching.config.CachedFuncConfig","title":"CachedFuncConfig dataclass
","text":"CachedFuncConfig(\n hash_prefix: Optional[str] = None,\n ignore_args: Collection[str] = list(),\n time_threshold: float = 0.3,\n allow_repeated_evaluations: bool = False,\n rtol_stderr: float = 0.1,\n min_repetitions: int = 3,\n)\n
Configuration for cached functions and methods, providing memoization of function calls.
Instances of this class are typically used as arguments for the construction of a Utility.
PARAMETER DESCRIPTION hash_prefix
Optional string prefix that be prepended to the cache key. This can be provided in order to guarantee cache reuse across runs.
TYPE: Optional[str]
DEFAULT: None
ignore_args
Do not take these keyword arguments into account when hashing the wrapped function for usage as key. This allows sharing the cache among different jobs for the same experiment run if the callable happens to have \"nuisance\" parameters like job_id
which do not affect the result of the computation.
TYPE: Collection[str]
DEFAULT: list()
time_threshold
Computations taking less time than this many seconds are not cached. A value of 0 means that it will always cache results.
TYPE: float
DEFAULT: 0.3
allow_repeated_evaluations
If True
, repeated calls to a function with the same arguments will be allowed and outputs averaged until the running standard deviation of the mean stabilizes below rtol_stderr * mean
.
TYPE: bool
DEFAULT: False
rtol_stderr
relative tolerance for repeated evaluations. More precisely, memcached() will stop evaluating the function once the standard deviation of the mean is smaller than rtol_stderr * mean
.
TYPE: float
DEFAULT: 0.1
min_repetitions
minimum number of times that a function evaluation on the same arguments is repeated before returning cached values. Useful for stochastic functions only. If the model training is very noisy, set this number to higher values to reduce variance.
TYPE: int
DEFAULT: 3
"},{"location":"api/pydvl/utils/caching/disk/","title":"Disk","text":""},{"location":"api/pydvl/utils/caching/disk/#pydvl.utils.caching.disk","title":"pydvl.utils.caching.disk","text":""},{"location":"api/pydvl/utils/caching/disk/#pydvl.utils.caching.disk.DiskCacheBackend","title":"DiskCacheBackend","text":"DiskCacheBackend(cache_dir: Optional[Union[PathLike, str]] = None)\n
Bases: CacheBackend
Disk cache backend that stores results in files.
Implements the CacheBackend interface for a disk-based cache. Stores cache entries as pickled files on disk, keyed by cache key. This allows sharing evaluations across processes in a single node/computer.
PARAMETER DESCRIPTION cache_dir
Base directory for cache storage.
TYPE: Optional[Union[PathLike, str]]
DEFAULT: None
ATTRIBUTE DESCRIPTION cache_dir
Base directory for cache storage.
Example Basic usage:
>>> from pydvl.utils.caching.disk import DiskCacheBackend\n>>> cache_backend = DiskCacheBackend()\n>>> cache_backend.clear()\n>>> value = 42\n>>> cache_backend.set(\"key\", value)\n>>> cache_backend.get(\"key\")\n42\n
Callable wrapping:
>>> from pydvl.utils.caching.disk import DiskCacheBackend\n>>> cache_backend = DiskCacheBackend()\n>>> cache_backend.clear()\n>>> value = 42\n>>> def foo(x: int):\n... return x + 1\n...\n>>> wrapped_foo = cache_backend.wrap(foo)\n>>> wrapped_foo(value)\n43\n>>> wrapped_foo.stats.misses\n1\n>>> wrapped_foo.stats.hits\n0\n>>> wrapped_foo(value)\n43\n>>> wrapped_foo.stats.misses\n1\n>>> wrapped_foo.stats.hits\n1\n
PARAMETER DESCRIPTION cache_dir
Base directory for cache storage. If not provided, this defaults to a newly created temporary directory.
TYPE: Optional[Union[PathLike, str]]
DEFAULT: None
Source code in src/pydvl/utils/caching/disk.py
def __init__(\n self,\n cache_dir: Optional[Union[os.PathLike, str]] = None,\n) -> None:\n \"\"\"Initialize the disk cache backend.\n\n Args:\n cache_dir: Base directory for cache storage.\n If not provided, this defaults to a newly created\n temporary directory.\n \"\"\"\n super().__init__()\n if cache_dir is None:\n cache_dir = tempfile.mkdtemp(prefix=\"pydvl\")\n self.cache_dir = Path(cache_dir)\n self.cache_dir.mkdir(exist_ok=True, parents=True)\n
"},{"location":"api/pydvl/utils/caching/disk/#pydvl.utils.caching.disk.DiskCacheBackend.wrap","title":"wrap","text":"wrap(\n func: Callable, *, config: Optional[CachedFuncConfig] = None\n) -> CachedFunc\n
Wraps a function to cache its results.
PARAMETER DESCRIPTION func
The function to wrap.
TYPE: Callable
config
Optional caching options for the wrapped function.
TYPE: Optional[CachedFuncConfig]
DEFAULT: None
RETURNS DESCRIPTION CachedFunc
The wrapped cached function.
Source code in src/pydvl/utils/caching/base.py
def wrap(\n self,\n func: Callable,\n *,\n config: Optional[CachedFuncConfig] = None,\n) -> \"CachedFunc\":\n \"\"\"Wraps a function to cache its results.\n\n Args:\n func: The function to wrap.\n config: Optional caching options for the wrapped function.\n\n Returns:\n The wrapped cached function.\n \"\"\"\n return CachedFunc(\n func,\n cache_backend=self,\n config=config,\n )\n
"},{"location":"api/pydvl/utils/caching/disk/#pydvl.utils.caching.disk.DiskCacheBackend.get","title":"get","text":"get(key: str) -> Optional[Any]\n
Get a value from the cache.
PARAMETER DESCRIPTION key
Cache key.
TYPE: str
RETURNS DESCRIPTION Optional[Any]
Cached value or None if not found.
Source code in src/pydvl/utils/caching/disk.py
def get(self, key: str) -> Optional[Any]:\n \"\"\"Get a value from the cache.\n\n Args:\n key: Cache key.\n\n Returns:\n Cached value or None if not found.\n \"\"\"\n cache_file = self.cache_dir / key\n if not cache_file.exists():\n self.stats.misses += 1\n return None\n self.stats.hits += 1\n with cache_file.open(\"rb\") as f:\n return cloudpickle.load(f)\n
"},{"location":"api/pydvl/utils/caching/disk/#pydvl.utils.caching.disk.DiskCacheBackend.set","title":"set","text":"set(key: str, value: Any) -> None\n
Set a value in the cache.
PARAMETER DESCRIPTION key
Cache key.
TYPE: str
value
Value to cache.
TYPE: Any
Source code in src/pydvl/utils/caching/disk.py
def set(self, key: str, value: Any) -> None:\n \"\"\"Set a value in the cache.\n\n Args:\n key: Cache key.\n value: Value to cache.\n \"\"\"\n cache_file = self.cache_dir / key\n self.stats.sets += 1\n with cache_file.open(\"wb\") as f:\n cloudpickle.dump(value, f, protocol=PICKLE_VERSION)\n
"},{"location":"api/pydvl/utils/caching/disk/#pydvl.utils.caching.disk.DiskCacheBackend.clear","title":"clear","text":"clear() -> None\n
Deletes cache directory and recreates it.
Source code in src/pydvl/utils/caching/disk.py
def clear(self) -> None:\n \"\"\"Deletes cache directory and recreates it.\"\"\"\n shutil.rmtree(self.cache_dir)\n self.cache_dir.mkdir(exist_ok=True, parents=True)\n
"},{"location":"api/pydvl/utils/caching/disk/#pydvl.utils.caching.disk.DiskCacheBackend.combine_hashes","title":"combine_hashes","text":"combine_hashes(*args: str) -> str\n
Join cache key components.
Source code in src/pydvl/utils/caching/disk.py
def combine_hashes(self, *args: str) -> str:\n \"\"\"Join cache key components.\"\"\"\n return os.pathsep.join(args)\n
"},{"location":"api/pydvl/utils/caching/memcached/","title":"Memcached","text":""},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached","title":"pydvl.utils.caching.memcached","text":""},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedClientConfig","title":"MemcachedClientConfig dataclass
","text":"MemcachedClientConfig(\n server: Tuple[str, int] = (\"localhost\", 11211),\n connect_timeout: float = 1.0,\n timeout: float = 1.0,\n no_delay: bool = True,\n serde: PickleSerde = PickleSerde(pickle_version=PICKLE_VERSION),\n)\n
Configuration of the memcached client.
PARAMETER DESCRIPTION server
A tuple of (IP|domain name, port).
TYPE: Tuple[str, int]
DEFAULT: ('localhost', 11211)
connect_timeout
How many seconds to wait before raising ConnectionRefusedError
on failure to connect.
TYPE: float
DEFAULT: 1.0
timeout
Duration in seconds to wait for send or recv calls on the socket connected to memcached.
TYPE: float
DEFAULT: 1.0
no_delay
If True, set the TCP_NODELAY
flag, which may help with performance in some cases.
TYPE: bool
DEFAULT: True
serde
Serializer / Deserializer (\"serde\"). The default PickleSerde
should work in most cases. See pymemcache.client.base.Client for details.
TYPE: PickleSerde
DEFAULT: PickleSerde(pickle_version=PICKLE_VERSION)
"},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedCacheBackend","title":"MemcachedCacheBackend","text":"MemcachedCacheBackend(config: MemcachedClientConfig = MemcachedClientConfig())\n
Bases: CacheBackend
Memcached cache backend for the distributed caching of functions.
Implements the CacheBackend interface for a memcached based cache. This allows sharing evaluations across processes and nodes in a cluster. You can run memcached as a service, locally or remotely, see the caching documentation.
PARAMETER DESCRIPTION config
Memcached client configuration.
TYPE: MemcachedClientConfig
DEFAULT: MemcachedClientConfig()
ATTRIBUTE DESCRIPTION config
Memcached client configuration.
client
Memcached client instance.
Example Basic usage:
>>> from pydvl.utils.caching.memcached import MemcachedCacheBackend\n>>> cache_backend = MemcachedCacheBackend()\n>>> cache_backend.clear()\n>>> value = 42\n>>> cache_backend.set(\"key\", value)\n>>> cache_backend.get(\"key\")\n42\n
Callable wrapping:
>>> from pydvl.utils.caching.memcached import MemcachedCacheBackend\n>>> cache_backend = MemcachedCacheBackend()\n>>> cache_backend.clear()\n>>> value = 42\n>>> def foo(x: int):\n... return x + 1\n...\n>>> wrapped_foo = cache_backend.wrap(foo)\n>>> wrapped_foo(value)\n43\n>>> wrapped_foo.stats.misses\n1\n>>> wrapped_foo.stats.hits\n0\n>>> wrapped_foo(value)\n43\n>>> wrapped_foo.stats.misses\n1\n>>> wrapped_foo.stats.hits\n1\n
PARAMETER DESCRIPTION config
Memcached client configuration.
TYPE: MemcachedClientConfig
DEFAULT: MemcachedClientConfig()
Source code in src/pydvl/utils/caching/memcached.py
def __init__(self, config: MemcachedClientConfig = MemcachedClientConfig()) -> None:\n \"\"\"Initialize memcached cache backend.\n\n Args:\n config: Memcached client configuration.\n \"\"\"\n\n super().__init__()\n self.config = config\n self.client = self._connect(self.config)\n
"},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedCacheBackend.wrap","title":"wrap","text":"wrap(\n func: Callable, *, config: Optional[CachedFuncConfig] = None\n) -> CachedFunc\n
Wraps a function to cache its results.
PARAMETER DESCRIPTION func
The function to wrap.
TYPE: Callable
config
Optional caching options for the wrapped function.
TYPE: Optional[CachedFuncConfig]
DEFAULT: None
RETURNS DESCRIPTION CachedFunc
The wrapped cached function.
Source code in src/pydvl/utils/caching/base.py
def wrap(\n self,\n func: Callable,\n *,\n config: Optional[CachedFuncConfig] = None,\n) -> \"CachedFunc\":\n \"\"\"Wraps a function to cache its results.\n\n Args:\n func: The function to wrap.\n config: Optional caching options for the wrapped function.\n\n Returns:\n The wrapped cached function.\n \"\"\"\n return CachedFunc(\n func,\n cache_backend=self,\n config=config,\n )\n
"},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedCacheBackend.get","title":"get","text":"get(key: str) -> Optional[Any]\n
Get value from memcached.
PARAMETER DESCRIPTION key
Cache key.
TYPE: str
RETURNS DESCRIPTION Optional[Any]
Cached value or None if not found or client disconnected.
Source code in src/pydvl/utils/caching/memcached.py
def get(self, key: str) -> Optional[Any]:\n \"\"\"Get value from memcached.\n\n Args:\n key: Cache key.\n\n Returns:\n Cached value or None if not found or client disconnected.\n \"\"\"\n result = None\n try:\n result = self.client.get(key)\n except socket.timeout as e:\n self.stats.timeouts += 1\n warnings.warn(f\"{type(self).__name__}: {str(e)}\", RuntimeWarning)\n except OSError as e:\n self.stats.errors += 1\n warnings.warn(f\"{type(self).__name__}: {str(e)}\", RuntimeWarning)\n except AttributeError as e:\n # FIXME: this depends on _recv() failing on invalid sockets\n # See pymemcache.base.py,\n self.stats.reconnects += 1\n warnings.warn(f\"{type(self).__name__}: {str(e)}\", RuntimeWarning)\n self.client = self._connect(self.config)\n if result is None:\n self.stats.misses += 1\n else:\n self.stats.hits += 1\n return result\n
"},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedCacheBackend.set","title":"set","text":"set(key: str, value: Any) -> None\n
Set value in memcached.
PARAMETER DESCRIPTION key
Cache key.
TYPE: str
value
Value to cache.
TYPE: Any
Source code in src/pydvl/utils/caching/memcached.py
def set(self, key: str, value: Any) -> None:\n \"\"\"Set value in memcached.\n\n Args:\n key: Cache key.\n value: Value to cache.\n \"\"\"\n self.client.set(key, value, noreply=True)\n self.stats.sets += 1\n
"},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedCacheBackend.clear","title":"clear","text":"clear() -> None\n
Flush all values from memcached.
Source code in src/pydvl/utils/caching/memcached.py
def clear(self) -> None:\n \"\"\"Flush all values from memcached.\"\"\"\n self.client.flush_all(noreply=True)\n
"},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedCacheBackend.combine_hashes","title":"combine_hashes","text":"combine_hashes(*args: str) -> str\n
Join cache key components for Memcached.
Source code in src/pydvl/utils/caching/memcached.py
def combine_hashes(self, *args: str) -> str:\n \"\"\"Join cache key components for Memcached.\"\"\"\n return \":\".join(args)\n
"},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedCacheBackend.__getstate__","title":"__getstate__","text":"__getstate__() -> Dict\n
Enables pickling after a socket has been opened to the memcached server, by removing the client from the stored data.
Source code in src/pydvl/utils/caching/memcached.py
def __getstate__(self) -> Dict:\n \"\"\"Enables pickling after a socket has been opened to the\n memcached server, by removing the client from the stored\n data.\"\"\"\n odict = self.__dict__.copy()\n del odict[\"client\"]\n return odict\n
"},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedCacheBackend.__setstate__","title":"__setstate__","text":"__setstate__(d: Dict)\n
Restores a client connection after loading from a pickle.
Source code in src/pydvl/utils/caching/memcached.py
def __setstate__(self, d: Dict):\n \"\"\"Restores a client connection after loading from a pickle.\"\"\"\n self.config = d[\"config\"]\n self.stats = d[\"stats\"]\n self.client = self._connect(self.config)\n
"},{"location":"api/pydvl/utils/caching/memory/","title":"Memory","text":""},{"location":"api/pydvl/utils/caching/memory/#pydvl.utils.caching.memory","title":"pydvl.utils.caching.memory","text":""},{"location":"api/pydvl/utils/caching/memory/#pydvl.utils.caching.memory.InMemoryCacheBackend","title":"InMemoryCacheBackend","text":"InMemoryCacheBackend()\n
Bases: CacheBackend
In-memory cache backend that stores results in a dictionary.
Implements the CacheBackend interface for an in-memory cache. Stores cache entries as values in a dictionary, keyed by cache key. This allows sharing evaluations across threads in a single process.
The implementation is not thread-safe.
ATTRIBUTE DESCRIPTION cached_values
Dictionary used to store cached values.
TYPE: Dict[str, Any]
Example Basic usage:
>>> from pydvl.utils.caching.memory import InMemoryCacheBackend\n>>> cache_backend = InMemoryCacheBackend()\n>>> cache_backend.clear()\n>>> value = 42\n>>> cache_backend.set(\"key\", value)\n>>> cache_backend.get(\"key\")\n42\n
Callable wrapping:
>>> from pydvl.utils.caching.memory import InMemoryCacheBackend\n>>> cache_backend = InMemoryCacheBackend()\n>>> cache_backend.clear()\n>>> value = 42\n>>> def foo(x: int):\n... return x + 1\n...\n>>> wrapped_foo = cache_backend.wrap(foo)\n>>> wrapped_foo(value)\n43\n>>> wrapped_foo.stats.misses\n1\n>>> wrapped_foo.stats.hits\n0\n>>> wrapped_foo(value)\n43\n>>> wrapped_foo.stats.misses\n1\n>>> wrapped_foo.stats.hits\n1\n
Source code in src/pydvl/utils/caching/memory.py
def __init__(self) -> None:\n \"\"\"Initialize the in-memory cache backend.\"\"\"\n super().__init__()\n self.cached_values: Dict[str, Any] = {}\n
"},{"location":"api/pydvl/utils/caching/memory/#pydvl.utils.caching.memory.InMemoryCacheBackend.wrap","title":"wrap","text":"wrap(\n func: Callable, *, config: Optional[CachedFuncConfig] = None\n) -> CachedFunc\n
Wraps a function to cache its results.
PARAMETER DESCRIPTION func
The function to wrap.
TYPE: Callable
config
Optional caching options for the wrapped function.
TYPE: Optional[CachedFuncConfig]
DEFAULT: None
RETURNS DESCRIPTION CachedFunc
The wrapped cached function.
Source code in src/pydvl/utils/caching/base.py
def wrap(\n self,\n func: Callable,\n *,\n config: Optional[CachedFuncConfig] = None,\n) -> \"CachedFunc\":\n \"\"\"Wraps a function to cache its results.\n\n Args:\n func: The function to wrap.\n config: Optional caching options for the wrapped function.\n\n Returns:\n The wrapped cached function.\n \"\"\"\n return CachedFunc(\n func,\n cache_backend=self,\n config=config,\n )\n
"},{"location":"api/pydvl/utils/caching/memory/#pydvl.utils.caching.memory.InMemoryCacheBackend.get","title":"get","text":"get(key: str) -> Optional[Any]\n
Get a value from the cache.
PARAMETER DESCRIPTION key
Cache key.
TYPE: str
RETURNS DESCRIPTION Optional[Any]
Cached value or None if not found.
Source code in src/pydvl/utils/caching/memory.py
def get(self, key: str) -> Optional[Any]:\n \"\"\"Get a value from the cache.\n\n Args:\n key: Cache key.\n\n Returns:\n Cached value or None if not found.\n \"\"\"\n value = self.cached_values.get(key, None)\n if value is not None:\n self.stats.hits += 1\n else:\n self.stats.misses += 1\n return value\n
"},{"location":"api/pydvl/utils/caching/memory/#pydvl.utils.caching.memory.InMemoryCacheBackend.set","title":"set","text":"set(key: str, value: Any) -> None\n
Set a value in the cache.
PARAMETER DESCRIPTION key
Cache key.
TYPE: str
value
Value to cache.
TYPE: Any
Source code in src/pydvl/utils/caching/memory.py
def set(self, key: str, value: Any) -> None:\n \"\"\"Set a value in the cache.\n\n Args:\n key: Cache key.\n value: Value to cache.\n \"\"\"\n self.cached_values[key] = value\n self.stats.sets += 1\n
"},{"location":"api/pydvl/utils/caching/memory/#pydvl.utils.caching.memory.InMemoryCacheBackend.clear","title":"clear","text":"clear() -> None\n
Deletes cache dictionary and recreates it.
Source code in src/pydvl/utils/caching/memory.py
def clear(self) -> None:\n \"\"\"Deletes cache dictionary and recreates it.\"\"\"\n del self.cached_values\n self.cached_values = {}\n
"},{"location":"api/pydvl/utils/caching/memory/#pydvl.utils.caching.memory.InMemoryCacheBackend.combine_hashes","title":"combine_hashes","text":"combine_hashes(*args: str) -> str\n
Join cache key components.
Source code in src/pydvl/utils/caching/memory.py
def combine_hashes(self, *args: str) -> str:\n \"\"\"Join cache key components.\"\"\"\n return os.pathsep.join(args)\n
"},{"location":"api/pydvl/value/","title":"Value","text":""},{"location":"api/pydvl/value/#pydvl.value","title":"pydvl.value","text":"This module implements algorithms for the exact and approximate computation of values and semi-values.
See Data valuation for an introduction to the concepts and methods implemented here.
"},{"location":"api/pydvl/value/games/","title":"Games","text":""},{"location":"api/pydvl/value/games/#pydvl.value.games","title":"pydvl.value.games","text":"This module provides several predefined games and, depending on the game, the corresponding Shapley values, Least Core values or both of them, for benchmarking purposes.
"},{"location":"api/pydvl/value/games/#pydvl.value.games--references","title":"References","text":" -
Castro, J., G\u00f3mez, D. and Tejada, J., 2009. Polynomial calculation of the Shapley value based on sampling. Computers & Operations Research, 36(5), pp.1726-1730.\u00a0\u21a9
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyGameDataset","title":"DummyGameDataset","text":"DummyGameDataset(n_players: int, description: Optional[str] = None)\n
Bases: Dataset
Dummy game dataset.
Initializes a dummy game dataset with n_players and an optional description.
This class is used internally inside the Game class.
PARAMETER DESCRIPTION n_players
Number of players that participate in the game.
TYPE: int
description
Optional description of the dataset.
TYPE: Optional[str]
DEFAULT: None
Source code in src/pydvl/value/games.py
def __init__(self, n_players: int, description: Optional[str] = None) -> None:\n x = np.arange(0, n_players, 1).reshape(-1, 1)\n nil = np.zeros_like(x)\n super().__init__(\n x,\n nil.copy(),\n nil.copy(),\n nil.copy(),\n feature_names=[\"x\"],\n target_names=[\"y\"],\n description=description,\n )\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyGameDataset.indices","title":"indices property
","text":"indices: NDArray[int_]\n
Index of positions in data.x_train.
Contiguous integers from 0 to len(Dataset).
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyGameDataset.data_names","title":"data_names property
","text":"data_names: NDArray[object_]\n
Names of each individual datapoint.
Used for reporting Shapley values.
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyGameDataset.dim","title":"dim property
","text":"dim: int\n
Returns the number of dimensions of a sample.
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyGameDataset.get_training_data","title":"get_training_data","text":"get_training_data(\n indices: Optional[Iterable[int]] = None,\n) -> Tuple[NDArray, NDArray]\n
Given a set of indices, returns the training data that refer to those indices.
This is used mainly by Utility to retrieve subsets of the data from indices. It is typically not needed in algorithms.
PARAMETER DESCRIPTION indices
Optional indices that will be used to select points from the training data. If None
, the entire training data will be returned.
TYPE: Optional[Iterable[int]]
DEFAULT: None
RETURNS DESCRIPTION Tuple[NDArray, NDArray]
If indices
is not None
, the selected x and y arrays from the training data. Otherwise, the entire dataset.
Source code in src/pydvl/utils/dataset.py
def get_training_data(\n self, indices: Optional[Iterable[int]] = None\n) -> Tuple[NDArray, NDArray]:\n \"\"\"Given a set of indices, returns the training data that refer to those\n indices.\n\n This is used mainly by [Utility][pydvl.utils.utility.Utility] to retrieve\n subsets of the data from indices. It is typically **not needed in\n algorithms**.\n\n Args:\n indices: Optional indices that will be used to select points from\n the training data. If `None`, the entire training data will be\n returned.\n\n Returns:\n If `indices` is not `None`, the selected x and y arrays from the\n training data. Otherwise, the entire dataset.\n \"\"\"\n if indices is None:\n return self.x_train, self.y_train\n x = self.x_train[indices]\n y = self.y_train[indices]\n return x, y\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyGameDataset.from_sklearn","title":"from_sklearn classmethod
","text":"from_sklearn(\n data: Bunch,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n **kwargs\n) -> Dataset\n
Constructs a Dataset object from a sklearn.utils.Bunch, as returned by the load_*
functions in scikit-learn toy datasets.
Example >>> from pydvl.utils import Dataset\n>>> from sklearn.datasets import load_boston\n>>> dataset = Dataset.from_sklearn(load_boston())\n
PARAMETER DESCRIPTION data
scikit-learn Bunch object. The following attributes are supported:
data
: covariates. target
: target variables (labels). feature_names
(optional): the feature names. target_names
(optional): the target names. DESCR
(optional): a description.
TYPE: Bunch
train_size
size of the training dataset. Used in train_test_split
TYPE: float
DEFAULT: 0.8
random_state
seed for train / test split
TYPE: Optional[int]
DEFAULT: None
stratify_by_target
If True
, data is split in a stratified fashion, using the target variable as labels. Read more in scikit-learn's user guide.
TYPE: bool
DEFAULT: False
kwargs
Additional keyword arguments to pass to the Dataset constructor. Use this to pass e.g. is_multi_output
.
DEFAULT: {}
RETURNS DESCRIPTION Dataset
Object with the sklearn dataset
Changed in version 0.6.0
Added kwargs to pass to the Dataset constructor.
Source code in src/pydvl/utils/dataset.py
@classmethod\ndef from_sklearn(\n cls,\n data: Bunch,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n **kwargs,\n) -> \"Dataset\":\n \"\"\"Constructs a [Dataset][pydvl.utils.Dataset] object from a\n [sklearn.utils.Bunch][], as returned by the `load_*`\n functions in [scikit-learn toy datasets](https://scikit-learn.org/stable/datasets/toy_dataset.html).\n\n ??? Example\n ```pycon\n >>> from pydvl.utils import Dataset\n >>> from sklearn.datasets import load_boston\n >>> dataset = Dataset.from_sklearn(load_boston())\n ```\n\n Args:\n data: scikit-learn Bunch object. The following attributes are supported:\n\n - `data`: covariates.\n - `target`: target variables (labels).\n - `feature_names` (**optional**): the feature names.\n - `target_names` (**optional**): the target names.\n - `DESCR` (**optional**): a description.\n train_size: size of the training dataset. Used in `train_test_split`\n random_state: seed for train / test split\n stratify_by_target: If `True`, data is split in a stratified\n fashion, using the target variable as labels. Read more in\n [scikit-learn's user guide](https://scikit-learn.org/stable/modules/cross_validation.html#stratification).\n kwargs: Additional keyword arguments to pass to the\n [Dataset][pydvl.utils.Dataset] constructor. Use this to pass e.g. `is_multi_output`.\n\n Returns:\n Object with the sklearn dataset\n\n !!! tip \"Changed in version 0.6.0\"\n Added kwargs to pass to the [Dataset][pydvl.utils.Dataset] constructor.\n \"\"\"\n x_train, x_test, y_train, y_test = train_test_split(\n data.data,\n data.target,\n train_size=train_size,\n random_state=random_state,\n stratify=data.target if stratify_by_target else None,\n )\n return cls(\n x_train,\n y_train,\n x_test,\n y_test,\n feature_names=data.get(\"feature_names\"),\n target_names=data.get(\"target_names\"),\n description=data.get(\"DESCR\"),\n **kwargs,\n )\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyGameDataset.from_arrays","title":"from_arrays classmethod
","text":"from_arrays(\n X: NDArray,\n y: NDArray,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n **kwargs\n) -> Dataset\n
Constructs a Dataset object from X and y numpy arrays as returned by the make_*
functions in sklearn generated datasets.
Example >>> from pydvl.utils import Dataset\n>>> from sklearn.datasets import make_regression\n>>> X, y = make_regression()\n>>> dataset = Dataset.from_arrays(X, y)\n
PARAMETER DESCRIPTION X
numpy array of shape (n_samples, n_features)
TYPE: NDArray
y
numpy array of shape (n_samples,)
TYPE: NDArray
train_size
size of the training dataset. Used in train_test_split
TYPE: float
DEFAULT: 0.8
random_state
seed for train / test split
TYPE: Optional[int]
DEFAULT: None
stratify_by_target
If True
, data is split in a stratified fashion, using the y variable as labels. Read more in sklearn's user guide.
TYPE: bool
DEFAULT: False
kwargs
Additional keyword arguments to pass to the Dataset constructor. Use this to pass e.g. feature_names
or target_names
.
DEFAULT: {}
RETURNS DESCRIPTION Dataset
Object with the passed X and y arrays split across training and test sets.
New in version 0.4.0
Changed in version 0.6.0
Added kwargs to pass to the Dataset constructor.
Source code in src/pydvl/utils/dataset.py
@classmethod\ndef from_arrays(\n cls,\n X: NDArray,\n y: NDArray,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n **kwargs,\n) -> \"Dataset\":\n \"\"\"Constructs a [Dataset][pydvl.utils.Dataset] object from X and y numpy arrays as\n returned by the `make_*` functions in [sklearn generated datasets](https://scikit-learn.org/stable/datasets/sample_generators.html).\n\n ??? Example\n ```pycon\n >>> from pydvl.utils import Dataset\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression()\n >>> dataset = Dataset.from_arrays(X, y)\n ```\n\n Args:\n X: numpy array of shape (n_samples, n_features)\n y: numpy array of shape (n_samples,)\n train_size: size of the training dataset. Used in `train_test_split`\n random_state: seed for train / test split\n stratify_by_target: If `True`, data is split in a stratified fashion,\n using the y variable as labels. Read more in [sklearn's user\n guide](https://scikit-learn.org/stable/modules/cross_validation.html#stratification).\n kwargs: Additional keyword arguments to pass to the\n [Dataset][pydvl.utils.Dataset] constructor. Use this to pass e.g. `feature_names`\n or `target_names`.\n\n Returns:\n Object with the passed X and y arrays split across training and test sets.\n\n !!! tip \"New in version 0.4.0\"\n\n !!! tip \"Changed in version 0.6.0\"\n Added kwargs to pass to the [Dataset][pydvl.utils.Dataset] constructor.\n \"\"\"\n x_train, x_test, y_train, y_test = train_test_split(\n X,\n y,\n train_size=train_size,\n random_state=random_state,\n stratify=y if stratify_by_target else None,\n )\n return cls(x_train, y_train, x_test, y_test, **kwargs)\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyGameDataset.get_test_data","title":"get_test_data","text":"get_test_data(\n indices: Optional[Iterable[int]] = None,\n) -> Tuple[NDArray, NDArray]\n
Returns the subsets of the train set instead of the test set.
PARAMETER DESCRIPTION indices
Indices into the training data.
TYPE: Optional[Iterable[int]]
DEFAULT: None
RETURNS DESCRIPTION Tuple[NDArray, NDArray]
Subset of the train data.
Source code in src/pydvl/value/games.py
def get_test_data(\n self, indices: Optional[Iterable[int]] = None\n) -> Tuple[NDArray, NDArray]:\n \"\"\"Returns the subsets of the train set instead of the test set.\n\n Args:\n indices: Indices into the training data.\n\n Returns:\n Subset of the train data.\n \"\"\"\n if indices is None:\n return self.x_train, self.y_train\n x = self.x_train[indices]\n y = self.y_train[indices]\n return x, y\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyModel","title":"DummyModel","text":"DummyModel()\n
Bases: SupervisedModel
Dummy model class.
A dummy supervised model used for testing purposes only.
Source code in src/pydvl/value/games.py
def __init__(self) -> None:\n pass\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.Game","title":"Game","text":"Game(\n n_players: int,\n score_range: Tuple[float, float] = (-np.inf, np.inf),\n description: Optional[str] = None,\n)\n
Bases: ABC
Base class for games
Any Game subclass has to implement the abstract _score
method to assign a score to each coalition/subset and at least one of shapley_values
, least_core_values
.
PARAMETER DESCRIPTION n_players
Number of players that participate in the game.
TYPE: int
score_range
Minimum and maximum values of the _score
method.
TYPE: Tuple[float, float]
DEFAULT: (-inf, inf)
description
Optional string description of the dummy dataset that will be created.
TYPE: Optional[str]
DEFAULT: None
ATTRIBUTE DESCRIPTION n_players
Number of players that participate in the game.
data
Dummy dataset object.
u
Utility object with a dummy model and dataset.
Source code in src/pydvl/value/games.py
def __init__(\n self,\n n_players: int,\n score_range: Tuple[float, float] = (-np.inf, np.inf),\n description: Optional[str] = None,\n):\n self.n_players = n_players\n self.data = DummyGameDataset(self.n_players, description)\n self.u = Utility(\n DummyModel(),\n self.data,\n scorer=Scorer(self._score, range=score_range),\n catch_errors=False,\n show_warnings=True,\n )\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.SymmetricVotingGame","title":"SymmetricVotingGame","text":"SymmetricVotingGame(n_players: int)\n
Bases: Game
Toy game that is used for testing and demonstration purposes.
A symmetric voting game defined in (Castro et al., 2009)1 Section 4.1
For this game, the utility of a coalition is 1 if its cardinality is greater than half the number of players, and 0 otherwise.
\\[{ v(S) = \\left\\{\\begin{array}{ll} 1, & \\text{ if} \\quad \\mid S \\mid > \\frac{N}{2} \\\\ 0, & \\text{ otherwise} \\end{array}\\right. }\\] PARAMETER DESCRIPTION n_players
Number of players that participate in the game.
TYPE: int
Source code in src/pydvl/value/games.py
def __init__(self, n_players: int) -> None:\n if n_players % 2 != 0:\n raise ValueError(\"n_players must be an even number.\")\n description = \"Dummy data for the symmetric voting game in Castro et al. 2009\"\n super().__init__(\n n_players,\n score_range=(0, 1),\n description=description,\n )\n
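The constructor above only validates its arguments; the utility itself is defined by the abstract _score method of the Game base class. A standalone sketch of a majority-vote score consistent with the formula above (not the library's actual implementation, whose signature may differ):

import numpy as np
from numpy.typing import NDArray

def symmetric_voting_score(coalition: NDArray, n_players: int) -> float:
    # Utility of a coalition: 1 if it holds a strict majority of the players, else 0.
    return 1.0 if len(coalition) > n_players / 2 else 0.0

# With 4 players, a coalition of 3 wins while a coalition of 2 does not:
assert symmetric_voting_score(np.array([0, 1, 2]), n_players=4) == 1.0
assert symmetric_voting_score(np.array([0, 1]), n_players=4) == 0.0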
"},{"location":"api/pydvl/value/games/#pydvl.value.games.AsymmetricVotingGame","title":"AsymmetricVotingGame","text":"AsymmetricVotingGame(n_players: int = 51)\n
Bases: Game
Toy game that is used for testing and demonstration purposes.
An asymmetric voting game defined in (Castro et al., 2009)1 Section 4.2.
For this game the player set is \\(N = \\{1,\\dots,51\\}\\) and the utility of a coalition is given by:
\\[{ v(S) = \\left\\{\\begin{array}{ll} 1, & \\text{ if} \\quad \\sum\\limits_{i \\in S} w_i > \\sum\\limits_{j \\in N}\\frac{w_j}{2} \\\\ 0, & \\text{ otherwise} \\end{array}\\right. }\\] where \\(w = [w_1,\\dots, w_{51}]\\) is a list of weights associated with each player.
PARAMETER DESCRIPTION n_players
Number of players that participate in the game.
TYPE: int
DEFAULT: 51
Source code in src/pydvl/value/games.py
def __init__(self, n_players: int = 51) -> None:\n if n_players != 51:\n raise ValueError(\n f\"{self.__class__.__name__} only supports n_players=51 but got {n_players=}.\"\n )\n description = \"Dummy data for the asymmetric voting game in Castro et al. 2009\"\n super().__init__(\n n_players,\n score_range=(0, 1),\n description=description,\n )\n\n ranges = [\n range(0, 1),\n range(1, 2),\n range(2, 3),\n range(3, 5),\n range(5, 6),\n range(6, 7),\n range(7, 9),\n range(9, 10),\n range(10, 12),\n range(12, 15),\n range(15, 16),\n range(16, 20),\n range(20, 24),\n range(24, 26),\n range(26, 30),\n range(30, 34),\n range(34, 35),\n range(35, 44),\n range(44, 51),\n ]\n\n ranges_weights = [\n 45,\n 41,\n 27,\n 26,\n 25,\n 21,\n 17,\n 14,\n 13,\n 12,\n 11,\n 10,\n 9,\n 8,\n 7,\n 6,\n 5,\n 4,\n 3,\n ]\n ranges_values = [\n \"0.08831\",\n \"0.07973\",\n \"0.05096\",\n \"0.04898\",\n \"0.047\",\n \"0.03917\",\n \"0.03147\",\n \"0.02577\",\n \"0.02388\",\n \"0.022\",\n \"0.02013\",\n \"0.01827\",\n \"0.01641\",\n \"0.01456\",\n \"0.01272\",\n \"0.01088\",\n \"0.009053\",\n \"0.00723\",\n \"0.005412\",\n ]\n\n self.weight_table = np.zeros(self.n_players)\n exact_values = np.zeros(self.n_players)\n for r, w, v in zip(ranges, ranges_weights, ranges_values):\n self.weight_table[r] = w\n exact_values[r] = v\n\n self.exact_values = exact_values\n self.threshold = np.sum(self.weight_table) / 2\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.ShoesGame","title":"ShoesGame","text":"ShoesGame(left: int, right: int)\n
Bases: Game
Toy game that is used for testing and demonstration purposes.
A shoes game defined in (Castro et al., 2009)1.
In this game, some players have a left shoe and others a right shoe. Single shoes have a worth of zero while pairs have a worth of 1.
The payoff of a coalition \\(S\\) is:
\\[{ v(S) = \\min( \\mid S \\cap L \\mid, \\mid S \\cap R \\mid ) }\\] Where \\(L\\), respectively \\(R\\), is the set of players with left shoes, respectively right shoes.
PARAMETER DESCRIPTION left
Number of players with a left shoe.
TYPE: int
right
Number of players with a right shoe.
TYPE: int
Source code in src/pydvl/value/games.py
def __init__(self, left: int, right: int) -> None:\n self.left = left\n self.right = right\n n_players = self.left + self.right\n description = \"Dummy data for the shoe game in Castro et al. 2009\"\n max_score = n_players // 2\n super().__init__(n_players, score_range=(0, max_score), description=description)\n
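Because this game is so small, its Shapley values can be checked by hand. A self-contained brute-force computation for the two-player case ShoesGame(left=1, right=1), where the single pair is worth 1 and, by symmetry, each player should receive 1/2:

from itertools import permutations

def v(coalition: set) -> float:
    # Payoff of the two-player shoes game: players 0 (left) and 1 (right) form the only pair.
    return 1.0 if {0, 1} <= coalition else 0.0

n = 2
shapley = [0.0] * n
for order in permutations(range(n)):
    members = set()
    for player in order:
        before = v(members)
        members.add(player)
        shapley[player] += (v(members) - before) / 2  # 2 = n! permutations

print(shapley)  # [0.5, 0.5]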
"},{"location":"api/pydvl/value/games/#pydvl.value.games.AirportGame","title":"AirportGame","text":"AirportGame(n_players: int = 100)\n
Bases: Game
Toy game that is used for testing and demonstration purposes.
An airport game defined in (Castro et al., 2009)1 Section 4.3
PARAMETER DESCRIPTION n_players
Number of players that participate in the game.
TYPE: int
DEFAULT: 100
Source code in src/pydvl/value/games.py
def __init__(self, n_players: int = 100) -> None:\n if n_players != 100:\n raise ValueError(\n f\"{self.__class__.__name__} only supports n_players=100 but got {n_players=}.\"\n )\n description = \"A dummy dataset for the airport game in Castro et al. 2009\"\n super().__init__(n_players, score_range=(0, 100), description=description)\n ranges = [\n range(0, 8),\n range(8, 20),\n range(20, 26),\n range(26, 40),\n range(40, 48),\n range(48, 57),\n range(57, 70),\n range(70, 80),\n range(80, 90),\n range(90, 100),\n ]\n exact = [\n 0.01,\n 0.020869565,\n 0.033369565,\n 0.046883079,\n 0.063549745,\n 0.082780515,\n 0.106036329,\n 0.139369662,\n 0.189369662,\n 0.289369662,\n ]\n c = list(range(1, 10))\n score_table = np.zeros(100)\n exact_values = np.zeros(100)\n\n for r, v in zip(ranges, exact):\n score_table[r] = c\n exact_values[r] = v\n\n self.exact_values = exact_values\n self.score_table = score_table\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.MinimumSpanningTreeGame","title":"MinimumSpanningTreeGame","text":"MinimumSpanningTreeGame(n_players: int = 100)\n
Bases: Game
Toy game that is used for testing and demonstration purposes.
A minimum spanning tree game defined in (Castro et al., 2009)1.
Let \\(G = (N \\cup \\{0\\},E)\\) be a valued graph where \\(N = \\{1,\\dots,100\\}\\), and the cost associated to an edge \\((i, j)\\) is:
\\[{ c_{ij} = \\left\\{\\begin{array}{lll} 1, & \\text{ if} & i = j + 1 \\text{ or } i = j - 1 \\\\ & & \\text{ or } (i = 1 \\text{ and } j = 100) \\text{ or } (i = 100 \\text{ and } j = 1) \\\\ 101, & \\text{ if} & i = 0 \\text{ or } j = 0 \\\\ \\infty, & \\text{ otherwise} \\end{array}\\right. }\\] A minimum spanning tree game \\((N, c)\\) is a cost game, where for a given coalition \\(S \\subset N\\), \\(v(S)\\) is the sum of the edge cost of the minimum spanning tree, i.e. \\(v(S)\\) = Minimum Spanning Tree of the graph \\(G|_{S\\cup\\{0\\}}\\), which is the partial graph restricted to the players \\(S\\) and the source node \\(0\\).
PARAMETER DESCRIPTION n_players
Number of players that participate in the game.
TYPE: int
DEFAULT: 100
Source code in src/pydvl/value/games.py
def __init__(self, n_players: int = 100) -> None:\n if n_players != 100:\n raise ValueError(\n f\"{self.__class__.__name__} only supports n_players=100 but got {n_players=}.\"\n )\n description = (\n \"A dummy dataset for the minimum spanning tree game in Castro et al. 2009\"\n )\n super().__init__(n_players, score_range=(0, np.inf), description=description)\n\n graph = np.zeros(shape=(self.n_players, self.n_players))\n\n for i in range(self.n_players):\n for j in range(self.n_players):\n if (\n i == j + 1\n or i == j - 1\n or (i == 1 and j == self.n_players - 1)\n or (i == self.n_players - 1 and j == 1)\n ):\n graph[i, j] = 1\n elif i == 0 or j == 0:\n graph[i, j] = 0\n else:\n graph[i, j] = np.inf\n assert np.all(graph == graph.T)\n\n self.graph = graph\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.MinerGame","title":"MinerGame","text":"MinerGame(n_players: int)\n
Bases: Game
Toy game that is used for testing and demonstration purposes.
Consider a group of n miners, who have discovered large bars of gold.
If two miners can carry one piece of gold, then the payoff of a coalition \\(S\\) is:
\\[{ v(S) = \\left\\{\\begin{array}{lll} \\mid S \\mid / 2, & \\text{ if} & \\mid S \\mid \\text{ is even} \\\\ ( \\mid S \\mid - 1)/2, & \\text{ otherwise} \\end{array}\\right. }\\] If there are more than two miners and there is an even number of miners, then the core consists of the single payoff where each miner gets 1/2.
If there is an odd number of miners, then the core is empty.
Taken from Wikipedia
PARAMETER DESCRIPTION n_players
Number of miners that participate in the game.
TYPE: int
Source code in src/pydvl/value/games.py
def __init__(self, n_players: int) -> None:\n if n_players <= 2:\n raise ValueError(f\"n_players, {n_players}, should be > 2\")\n description = \"Dummy data for Miner Game taken from https://en.wikipedia.org/wiki/Core_(game_theory)\"\n super().__init__(\n n_players,\n score_range=(0, n_players // 2),\n description=description,\n )\n
"},{"location":"api/pydvl/value/result/","title":"Result","text":""},{"location":"api/pydvl/value/result/#pydvl.value.result","title":"pydvl.value.result","text":"This module collects types and methods for the inspection of the results of valuation algorithms.
The most important class is ValuationResult, which provides access to raw values, as well as convenient behaviour as a Sequence
with extended indexing and updating abilities, and conversion to pandas DataFrames.
"},{"location":"api/pydvl/value/result/#pydvl.value.result--operating-on-results","title":"Operating on results","text":"Results can be added together with the standard +
operator. Because values are typically running averages of iterative algorithms, addition behaves like a weighted average of the two results, with the weights being the number of updates in each result: adding two results is the same as generating one result with the mean of the values of the two results as values. The variances are updated accordingly. See ValuationResult for details.
Results can also be sorted by value, variance or number of updates, see sort(). The arrays of ValuationResult.values, ValuationResult.variances, ValuationResult.counts, ValuationResult.indices, ValuationResult.names are sorted in the same way.
Indexing and slicing of results is supported and ValueItem objects are returned. These objects can be compared with the usual operators, which take only the ValueItem.value into account.
"},{"location":"api/pydvl/value/result/#pydvl.value.result--creating-result-objects","title":"Creating result objects","text":"The most commonly used factory method is ValuationResult.zeros(), which creates a result object with all values, variances and counts set to zero. ValuationResult.empty() creates an empty result object, which can be used as a starting point for adding results together. Empty results are discarded when added to other results. Finally, ValuationResult.from_random() samples random values uniformly.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValueItem","title":"ValueItem dataclass
","text":"ValueItem(\n index: IndexT,\n name: NameT,\n value: float,\n variance: Optional[float],\n count: Optional[int],\n)\n
Bases: Generic[IndexT, NameT]
The result of a value computation for one datum.
ValueItems
can be compared with the usual operators, forming a total order. Comparisons take only the value
into account.
Todo
Maybe have a mode of comparing similar to np.isclose
, or taking the variance
into account.
ATTRIBUTE DESCRIPTION index
Index of the sample with this value in the original Dataset
TYPE: IndexT
name
Name of the sample if it was provided. Otherwise, str(index)
TYPE: NameT
value
The value
TYPE: float
variance
Variance of the value if it was computed with an approximate method
TYPE: Optional[float]
count
Number of updates for this value
TYPE: Optional[int]
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValueItem.stderr","title":"stderr property
","text":"stderr: Optional[float]\n
Standard error of the value.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult","title":"ValuationResult","text":"ValuationResult(\n *,\n values: NDArray[float_],\n variances: Optional[NDArray[float_]] = None,\n counts: Optional[NDArray[int_]] = None,\n indices: Optional[NDArray[IndexT]] = None,\n data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,\n algorithm: str = \"\",\n status: Status = Status.Pending,\n sort: bool = False,\n **extra_values\n)\n
Bases: Sequence
, Iterable[ValueItem[IndexT, NameT]]
, Generic[IndexT, NameT]
Objects of this class hold the results of valuation algorithms.
These include indices in the original Dataset, any data names (e.g. group names in GroupedDataset), the values themselves, and variance of the computation in the case of Monte Carlo methods. ValuationResults
can be iterated over like any Sequence
: iter(valuation_result)
returns a generator of ValueItem in the order in which the object is sorted.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult--indexing","title":"Indexing","text":"Indexing can be position-based, when accessing any of the attributes values, variances, counts and indices, as well as when iterating over the object, or using the item access operator, both getter and setter. The \"position\" is either the original sequence in which the data was passed to the constructor, or the sequence in which the object is sorted, see below.
Alternatively, indexing can be data-based, i.e. using the indices in the original dataset. This is the case for the methods get() and update().
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult--sorting","title":"Sorting","text":"Results can be sorted in-place with sort(), or alternatively using python's standard sorted()
and reversed()
Note that sorting values affects how iterators and the object itself as Sequence
behave: values[0]
returns a ValueItem with the highest or lowest ranking point if this object is sorted by descending or ascending value, respectively. If unsorted, values[0]
returns the ValueItem
at position 0, which has data index indices[0]
in the Dataset.
The same applies to direct indexing of the ValuationResult
: the index is positional, according to the sorting. It does not refer to the \"data index\". To sort according to data index, use sort() with key=\"index\"
.
In order to access ValueItem objects by their data index, use get().
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult--operating-on-results","title":"Operating on results","text":"Results can be added to each other with the +
operator. Means and variances are correctly updated, using the counts
attribute.
Results can also be updated with new values using update(). Means and variances are updated accordingly using the Welford algorithm.
Empty objects behave in a special way, see empty().
PARAMETER DESCRIPTION values
An array of values. If omitted, defaults to an empty array or to an array of zeros if indices
are given.
TYPE: NDArray[float_]
indices
An optional array of indices in the original dataset. If omitted, defaults to np.arange(len(values))
. Warning: It is common to pass the indices of a Dataset here. Attention must be paid in a parallel context to copy them to the local process. Just do indices=np.copy(data.indices)
.
TYPE: Optional[NDArray[IndexT]]
DEFAULT: None
variances
An optional array of variances in the computation of each value.
TYPE: Optional[NDArray[float_]]
DEFAULT: None
counts
An optional array with the number of updates for each value. Defaults to an array of ones.
TYPE: Optional[NDArray[int_]]
DEFAULT: None
data_names
Names for the data points. Defaults to index numbers if not set.
TYPE: Optional[Sequence[NameT] | NDArray[NameT]]
DEFAULT: None
algorithm
The method used.
TYPE: str
DEFAULT: ''
status
The end status of the algorithm.
TYPE: Status
DEFAULT: Pending
sort
Whether to sort the indices by ascending value. See above how this affects usage as an iterable or sequence.
TYPE: bool
DEFAULT: False
extra_values
Additional values that can be passed as keyword arguments. This can contain, for example, the least core value.
DEFAULT: {}
RAISES DESCRIPTION ValueError
If input arrays have mismatching lengths.
Source code in src/pydvl/value/result.py
def __init__(\n self,\n *,\n values: NDArray[np.float_],\n variances: Optional[NDArray[np.float_]] = None,\n counts: Optional[NDArray[np.int_]] = None,\n indices: Optional[NDArray[IndexT]] = None,\n data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,\n algorithm: str = \"\",\n status: Status = Status.Pending,\n sort: bool = False,\n **extra_values,\n):\n if variances is not None and len(variances) != len(values):\n raise ValueError(\"Lengths of values and variances do not match\")\n if data_names is not None and len(data_names) != len(values):\n raise ValueError(\"Lengths of values and data_names do not match\")\n if indices is not None and len(indices) != len(values):\n raise ValueError(\"Lengths of values and indices do not match\")\n\n self._algorithm = algorithm\n self._status = Status(status) # Just in case we are given a string\n self._values = values\n self._variances = np.zeros_like(values) if variances is None else variances\n self._counts = np.ones_like(values) if counts is None else counts\n self._sort_order = None\n self._extra_values = extra_values or {}\n\n # Yuk...\n if data_names is None:\n if indices is not None:\n self._names = np.copy(indices)\n else:\n self._names = np.arange(len(self._values), dtype=np.int_)\n elif not isinstance(data_names, np.ndarray):\n self._names = np.array(data_names)\n else:\n self._names = data_names.copy()\n if len(np.unique(self._names)) != len(self._names):\n raise ValueError(\"Data names must be unique\")\n\n if indices is None:\n indices = np.arange(len(self._values), dtype=np.int_)\n self._indices = indices\n self._positions = {idx: pos for pos, idx in enumerate(indices)}\n\n self._sort_positions: NDArray[np.int_] = np.arange(\n len(self._values), dtype=np.int_\n )\n if sort:\n self.sort()\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.values","title":"values property
","text":"values: NDArray[float_]\n
The values, possibly sorted.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.variances","title":"variances property
","text":"variances: NDArray[float_]\n
The variances, possibly sorted.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.stderr","title":"stderr property
","text":"stderr: NDArray[float_]\n
The raw standard errors, possibly sorted.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.counts","title":"counts property
","text":"counts: NDArray[int_]\n
The raw counts, possibly sorted.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.indices","title":"indices property
","text":"indices: NDArray[IndexT]\n
The indices for the values, possibly sorted.
If the object is unsorted, then these are the same as declared at construction or np.arange(len(values))
if none were passed.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.names","title":"names property
","text":"names: NDArray[NameT]\n
The names for the values, possibly sorted. If the object is unsorted, then these are the same as declared at construction or np.arange(len(values))
if none were passed.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.sort","title":"sort","text":"sort(\n reverse: bool = False,\n key: Literal[\"value\", \"variance\", \"index\", \"name\"] = \"value\",\n) -> None\n
Sorts the indices in place by key
.
Once sorted, iteration over the results, and indexing of all the properties ValuationResult.values, ValuationResult.variances, ValuationResult.counts, ValuationResult.indices and ValuationResult.names will follow the same order.
PARAMETER DESCRIPTION reverse
Whether to sort in descending order by value.
TYPE: bool
DEFAULT: False
key
The key to sort by. Defaults to ValueItem.value.
TYPE: Literal['value', 'variance', 'index', 'name']
DEFAULT: 'value'
Source code in src/pydvl/value/result.py
def sort(\n self,\n reverse: bool = False,\n # Need a \"Comparable\" type here\n key: Literal[\"value\", \"variance\", \"index\", \"name\"] = \"value\",\n) -> None:\n \"\"\"Sorts the indices in place by `key`.\n\n Once sorted, iteration over the results, and indexing of all the\n properties\n [ValuationResult.values][pydvl.value.result.ValuationResult.values],\n [ValuationResult.variances][pydvl.value.result.ValuationResult.variances],\n [ValuationResult.counts][pydvl.value.result.ValuationResult.counts],\n [ValuationResult.indices][pydvl.value.result.ValuationResult.indices]\n and [ValuationResult.names][pydvl.value.result.ValuationResult.names]\n will follow the same order.\n\n Args:\n reverse: Whether to sort in descending order by value.\n key: The key to sort by. Defaults to\n [ValueItem.value][pydvl.value.result.ValueItem].\n \"\"\"\n keymap = {\n \"index\": \"_indices\",\n \"value\": \"_values\",\n \"variance\": \"_variances\",\n \"name\": \"_names\",\n }\n self._sort_positions = np.argsort(getattr(self, keymap[key]))\n if reverse:\n self._sort_positions = self._sort_positions[::-1]\n self._sort_order = reverse\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.__getattr__","title":"__getattr__","text":"__getattr__(attr: str) -> Any\n
Allows access to extra values as if they were properties of the instance.
Source code in src/pydvl/value/result.py
def __getattr__(self, attr: str) -> Any:\n \"\"\"Allows access to extra values as if they were properties of the instance.\"\"\"\n # This is here to avoid a RecursionError when copying or pickling the object\n if attr == \"_extra_values\":\n raise AttributeError()\n try:\n return self._extra_values[attr]\n except KeyError as e:\n raise AttributeError(\n f\"{self.__class__.__name__} object has no attribute {attr}\"\n ) from e\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.__iter__","title":"__iter__","text":"__iter__() -> Iterator[ValueItem[IndexT, NameT]]\n
Iterate over the results returning ValueItem objects. To sort in place before iteration, use sort().
Source code in src/pydvl/value/result.py
def __iter__(self) -> Iterator[ValueItem[IndexT, NameT]]:\n \"\"\"Iterate over the results returning [ValueItem][pydvl.value.result.ValueItem] objects.\n To sort in place before iteration, use [sort()][pydvl.value.result.ValuationResult.sort].\n \"\"\"\n for pos in self._sort_positions:\n yield ValueItem(\n self._indices[pos],\n self._names[pos],\n self._values[pos],\n self._variances[pos],\n self._counts[pos],\n )\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.__add__","title":"__add__","text":"__add__(\n other: ValuationResult[IndexT, NameT]\n) -> ValuationResult[IndexT, NameT]\n
Adds two ValuationResults.
The values must have been computed with the same algorithm. An exception to this is if one argument has empty values, in which case the other argument is returned.
Warning
Abusing this will introduce numerical errors.
Means and standard errors are correctly handled. Statuses are added with bit-wise &
, see Status. data_names
are taken from the left summand, or if unavailable from the right one. The algorithm
string is carried over if both terms have the same one or concatenated.
It is possible to add ValuationResults of different lengths, and with different or overlapping indices. The result will have the union of the indices, with the corresponding values, variances and counts merged.
Warning
FIXME: Arbitrary extra_values
aren't handled.
Source code in src/pydvl/value/result.py
def __add__(\n self, other: ValuationResult[IndexT, NameT]\n) -> ValuationResult[IndexT, NameT]:\n \"\"\"Adds two ValuationResults.\n\n The values must have been computed with the same algorithm. An exception\n to this is if one argument has empty values, in which case the other\n argument is returned.\n\n !!! Warning\n Abusing this will introduce numerical errors.\n\n Means and standard errors are correctly handled. Statuses are added with\n bit-wise `&`, see [Status][pydvl.value.result.Status].\n `data_names` are taken from the left summand, or if unavailable from\n the right one. The `algorithm` string is carried over if both terms\n have the same one or concatenated.\n\n It is possible to add ValuationResults of different lengths, and with\n different or overlapping indices. The result will have the union of\n indices, and the values.\n\n !!! Warning\n FIXME: Arbitrary `extra_values` aren't handled.\n\n \"\"\"\n # empty results\n if len(self.values) == 0:\n return other\n if len(other.values) == 0:\n return self\n\n self._check_compatible(other)\n\n indices = np.union1d(self._indices, other._indices).astype(self._indices.dtype)\n this_pos = np.searchsorted(indices, self._indices)\n other_pos = np.searchsorted(indices, other._indices)\n\n n: NDArray[np.int_] = np.zeros_like(indices, dtype=int)\n m: NDArray[np.int_] = np.zeros_like(indices, dtype=int)\n xn: NDArray[np.int_] = np.zeros_like(indices, dtype=float)\n xm: NDArray[np.int_] = np.zeros_like(indices, dtype=float)\n vn: NDArray[np.int_] = np.zeros_like(indices, dtype=float)\n vm: NDArray[np.int_] = np.zeros_like(indices, dtype=float)\n\n n[this_pos] = self._counts\n xn[this_pos] = self._values\n vn[this_pos] = self._variances\n m[other_pos] = other._counts\n xm[other_pos] = other._values\n vm[other_pos] = other._variances\n\n # np.maximum(1, n + m) covers case n = m = 0.\n n_m_sum = np.maximum(1, n + m)\n\n # Sample mean of n+m samples from two means of n and m samples\n xnm = (n * xn + m * xm) / n_m_sum\n\n # Sample variance of n+m samples from two sample variances of n and m samples\n vnm = (n * (vn + xn**2) + m * (vm + xm**2)) / n_m_sum - xnm**2\n\n if np.any(vnm < 0):\n if np.any(vnm < -1e-6):\n logger.warning(\n \"Numerical error in variance computation. 
\"\n f\"Negative sample variances clipped to 0 in {vnm}\"\n )\n vnm[np.where(vnm < 0)] = 0\n\n # Merging of names:\n # If an index has the same name in both results, it must be the same.\n # If an index has a name in one result but not the other, the name is\n # taken from the result with the name.\n if self._names.dtype != other._names.dtype:\n if np.can_cast(other._names.dtype, self._names.dtype, casting=\"safe\"):\n other._names = other._names.astype(self._names.dtype)\n logger.warning(\n f\"Casting ValuationResult.names from {other._names.dtype} to {self._names.dtype}\"\n )\n else:\n raise TypeError(\n f\"Cannot cast ValuationResult.names from \"\n f\"{other._names.dtype} to {self._names.dtype}\"\n )\n\n both_pos = np.intersect1d(this_pos, other_pos)\n\n if len(both_pos) > 0:\n this_names: NDArray = np.empty_like(indices, dtype=object)\n other_names: NDArray = np.empty_like(indices, dtype=object)\n this_names[this_pos] = self._names\n other_names[other_pos] = other._names\n\n this_shared_names = np.take(this_names, both_pos)\n other_shared_names = np.take(other_names, both_pos)\n\n if np.any(this_shared_names != other_shared_names):\n raise ValueError(f\"Mismatching names in ValuationResults\")\n\n names = np.empty_like(indices, dtype=self._names.dtype)\n names[this_pos] = self._names\n names[other_pos] = other._names\n\n return ValuationResult(\n algorithm=self.algorithm or other.algorithm or \"\",\n status=self.status & other.status,\n indices=indices,\n values=xnm,\n variances=vnm,\n counts=n + m,\n data_names=names,\n # FIXME: What to do with extra_values? This is not commutative:\n # extra_values=self._extra_values.update(other._extra_values),\n )\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.update","title":"update","text":"update(idx: int, new_value: float) -> ValuationResult[IndexT, NameT]\n
Updates the result in place with a new value, using running mean and variance.
PARAMETER DESCRIPTION idx
Data index of the value to update.
TYPE: int
new_value
New value to add to the result.
TYPE: float
RETURNS DESCRIPTION ValuationResult[IndexT, NameT]
A reference to the same, modified result.
RAISES DESCRIPTION IndexError
If the index is not found.
Source code in src/pydvl/value/result.py
def update(self, idx: int, new_value: float) -> ValuationResult[IndexT, NameT]:\n \"\"\"Updates the result in place with a new value, using running mean\n and variance.\n\n Args:\n idx: Data index of the value to update.\n new_value: New value to add to the result.\n\n Returns:\n A reference to the same, modified result.\n\n Raises:\n IndexError: If the index is not found.\n \"\"\"\n try:\n pos = self._positions[idx]\n except KeyError:\n raise IndexError(f\"Index {idx} not found in ValuationResult\")\n val, var = running_moments(\n self._values[pos], self._variances[pos], self._counts[pos], new_value\n )\n self[pos] = ValueItem(\n index=cast(IndexT, idx), # FIXME\n name=self._names[pos],\n value=val,\n variance=var,\n count=self._counts[pos] + 1,\n )\n return self\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.scale","title":"scale","text":"scale(factor: float, indices: Optional[NDArray[IndexT]] = None)\n
Scales the values and variances of the result by a coefficient.
PARAMETER DESCRIPTION factor
Factor to scale by.
TYPE: float
indices
Indices to scale. If None, all values are scaled.
TYPE: Optional[NDArray[IndexT]]
DEFAULT: None
Source code in src/pydvl/value/result.py
def scale(self, factor: float, indices: Optional[NDArray[IndexT]] = None):\n \"\"\"\n Scales the values and variances of the result by a coefficient.\n\n Args:\n factor: Factor to scale by.\n indices: Indices to scale. If None, all values are scaled.\n \"\"\"\n self._values[self._sort_positions[indices]] *= factor\n self._variances[self._sort_positions[indices]] *= factor**2\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.get","title":"get","text":"get(idx: Integral) -> ValueItem\n
Retrieves a ValueItem by data index, as opposed to sort index, like the indexing operator.
RAISES DESCRIPTION IndexError
If the index is not found.
Source code in src/pydvl/value/result.py
def get(self, idx: Integral) -> ValueItem:\n \"\"\"Retrieves a ValueItem by data index, as opposed to sort index, like\n the indexing operator.\n\n Raises:\n IndexError: If the index is not found.\n \"\"\"\n try:\n pos = self._positions[idx]\n except KeyError:\n raise IndexError(f\"Index {idx} not found in ValuationResult\")\n\n return ValueItem(\n self._indices[pos],\n self._names[pos],\n self._values[pos],\n self._variances[pos],\n self._counts[pos],\n )\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.to_dataframe","title":"to_dataframe","text":"to_dataframe(\n column: Optional[str] = None, use_names: bool = False\n) -> DataFrame\n
Returns values as a dataframe.
PARAMETER DESCRIPTION column
Name for the column holding the data value. Defaults to the name of the algorithm used.
TYPE: Optional[str]
DEFAULT: None
use_names
Whether to use data names instead of indices for the DataFrame's index.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION DataFrame
A dataframe with two columns, one for the values, with name given as explained in column
, and another with standard errors for approximate algorithms. The latter will be named column+'_stderr'
.
Source code in src/pydvl/value/result.py
def to_dataframe(\n self, column: Optional[str] = None, use_names: bool = False\n) -> pd.DataFrame:\n \"\"\"Returns values as a dataframe.\n\n Args:\n column: Name for the column holding the data value. Defaults to\n the name of the algorithm used.\n use_names: Whether to use data names instead of indices for the\n DataFrame's index.\n\n Returns:\n A dataframe with two columns, one for the values, with name\n given as explained in `column`, and another with standard errors for\n approximate algorithms. The latter will be named `column+'_stderr'`.\n \"\"\"\n column = column or self._algorithm\n df = pd.DataFrame(\n self._values[self._sort_positions],\n index=(\n self._names[self._sort_positions]\n if use_names\n else self._indices[self._sort_positions]\n ),\n columns=[column],\n )\n df[column + \"_stderr\"] = self.stderr[self._sort_positions]\n df[column + \"_updates\"] = self.counts[self._sort_positions]\n return df\n
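A minimal usage sketch (the column name is chosen here for illustration):

import numpy as np

from pydvl.value.result import ValuationResult

result = ValuationResult(values=np.array([0.1, 0.4]), algorithm="demo")
df = result.to_dataframe(column="demo_value")
# df has the columns "demo_value", "demo_value_stderr" and "demo_value_updates",
# indexed by data index; pass use_names=True to index by data name instead.
print(df)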
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.from_random","title":"from_random classmethod
","text":"from_random(\n size: int,\n total: Optional[float] = None,\n seed: Optional[Seed] = None,\n **kwargs\n) -> \"ValuationResult\"\n
Creates a ValuationResult object and fills it with an array of random values from a uniform distribution in [-1,1]. The values can be made to sum up to a given total number (doing so will change their range).
PARAMETER DESCRIPTION size
Number of values to generate
TYPE: int
total
If set, the values are normalized to sum to this number (\"efficiency\" property of Shapley values).
TYPE: Optional[float]
DEFAULT: None
kwargs
Additional options to pass to the constructor of ValuationResult. Use to override status, names, etc.
DEFAULT: {}
RETURNS DESCRIPTION 'ValuationResult'
A valuation result with its status set to
'ValuationResult'
Status.Converged by default.
RAISES DESCRIPTION ValueError
If size
is less than 1.
Changed in version 0.6.0
Added parameter total
. Check for zero size
Source code in src/pydvl/value/result.py
@classmethod\ndef from_random(\n cls,\n size: int,\n total: Optional[float] = None,\n seed: Optional[Seed] = None,\n **kwargs,\n) -> \"ValuationResult\":\n \"\"\"Creates a [ValuationResult][pydvl.value.result.ValuationResult] object and fills it with an array\n of random values from a uniform distribution in [-1,1]. The values can\n be made to sum up to a given total number (doing so will change their range).\n\n Args:\n size: Number of values to generate\n total: If set, the values are normalized to sum to this number\n (\"efficiency\" property of Shapley values).\n kwargs: Additional options to pass to the constructor of\n [ValuationResult][pydvl.value.result.ValuationResult]. Use to override status, names, etc.\n\n Returns:\n A valuation result with its status set to\n [Status.Converged][pydvl.utils.status.Status] by default.\n\n Raises:\n ValueError: If `size` is less than 1.\n\n !!! tip \"Changed in version 0.6.0\"\n Added parameter `total`. Check for zero size\n \"\"\"\n if size < 1:\n raise ValueError(\"Size must be a positive integer\")\n\n rng = np.random.default_rng(seed)\n values = rng.uniform(low=-1, high=1, size=size)\n if total is not None:\n values *= total / np.sum(values)\n\n options = dict(values=values, status=Status.Converged, algorithm=\"random\")\n options.update(kwargs)\n return cls(**options) # type: ignore\n
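Usage sketch (exact values depend on the seed; only the normalization is asserted):

import numpy as np

from pydvl.value.result import ValuationResult

result = ValuationResult.from_random(size=4, total=1.0, seed=42)
assert np.isclose(result.values.sum(), 1.0)  # "efficiency": values rescaled to sum to total
print(result.status)                         # Status.Converged by default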
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.empty","title":"empty classmethod
","text":"empty(\n algorithm: str = \"\",\n indices: Optional[Sequence[IndexT] | NDArray[IndexT]] = None,\n data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,\n n_samples: int = 0,\n) -> ValuationResult\n
Creates an empty ValuationResult object.
Empty results are characterised by having an empty array of values. When another result is added to an empty one, the empty one is discarded.
PARAMETER DESCRIPTION algorithm
Name of the algorithm used to compute the values
TYPE: str
DEFAULT: ''
indices
Optional sequence or array of indices.
TYPE: Optional[Sequence[IndexT] | NDArray[IndexT]]
DEFAULT: None
data_names
Optional sequences or array of names for the data points. Defaults to index numbers if not set.
TYPE: Optional[Sequence[NameT] | NDArray[NameT]]
DEFAULT: None
n_samples
Number of valuation result entries.
TYPE: int
DEFAULT: 0
RETURNS DESCRIPTION ValuationResult
Object with the results.
Source code in src/pydvl/value/result.py
@classmethod\ndef empty(\n cls,\n algorithm: str = \"\",\n indices: Optional[Sequence[IndexT] | NDArray[IndexT]] = None,\n data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,\n n_samples: int = 0,\n) -> ValuationResult:\n \"\"\"Creates an empty [ValuationResult][pydvl.value.result.ValuationResult] object.\n\n Empty results are characterised by having an empty array of values. When\n another result is added to an empty one, the empty one is discarded.\n\n Args:\n algorithm: Name of the algorithm used to compute the values\n indices: Optional sequence or array of indices.\n data_names: Optional sequences or array of names for the data points.\n Defaults to index numbers if not set.\n n_samples: Number of valuation result entries.\n\n Returns:\n Object with the results.\n \"\"\"\n if indices is not None or data_names is not None or n_samples != 0:\n return cls.zeros(\n algorithm=algorithm,\n indices=indices,\n data_names=data_names,\n n_samples=n_samples,\n )\n return cls(algorithm=algorithm, status=Status.Pending, values=np.array([]))\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.zeros","title":"zeros classmethod
","text":"zeros(\n algorithm: str = \"\",\n indices: Optional[Sequence[IndexT] | NDArray[IndexT]] = None,\n data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,\n n_samples: int = 0,\n) -> ValuationResult\n
Creates an empty ValuationResult object.
Empty results are characterised by having an empty array of values. When another result is added to an empty one, the empty one is ignored.
PARAMETER DESCRIPTION algorithm
Name of the algorithm used to compute the values
TYPE: str
DEFAULT: ''
indices
Data indices to use. A copy will be made. If not given, the indices will be set to the range [0, n_samples)
.
TYPE: Optional[Sequence[IndexT] | NDArray[IndexT]]
DEFAULT: None
data_names
Data names to use. A copy will be made. If not given, the names will be set to the string representation of the indices.
TYPE: Optional[Sequence[NameT] | NDArray[NameT]]
DEFAULT: None
n_samples
Number of data points whose values are computed. If not given, the length of indices
will be used.
TYPE: int
DEFAULT: 0
RETURNS DESCRIPTION ValuationResult
Object with the results.
Source code in src/pydvl/value/result.py
@classmethod\ndef zeros(\n cls,\n algorithm: str = \"\",\n indices: Optional[Sequence[IndexT] | NDArray[IndexT]] = None,\n data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,\n n_samples: int = 0,\n) -> ValuationResult:\n \"\"\"Creates an empty [ValuationResult][pydvl.value.result.ValuationResult] object.\n\n Empty results are characterised by having an empty array of values. When\n another result is added to an empty one, the empty one is ignored.\n\n Args:\n algorithm: Name of the algorithm used to compute the values\n indices: Data indices to use. A copy will be made. If not given,\n the indices will be set to the range `[0, n_samples)`.\n data_names: Data names to use. A copy will be made. If not given,\n the names will be set to the string representation of the indices.\n n_samples: Number of data points whose values are computed. If\n not given, the length of `indices` will be used.\n\n Returns:\n Object with the results.\n \"\"\"\n if indices is None:\n indices = np.arange(n_samples, dtype=np.int_)\n else:\n indices = np.array(indices, dtype=np.int_)\n\n if data_names is None:\n data_names = np.array(indices)\n else:\n data_names = np.array(data_names)\n\n return cls(\n algorithm=algorithm,\n status=Status.Pending,\n indices=indices,\n data_names=data_names,\n values=np.zeros(len(indices)),\n variances=np.zeros(len(indices)),\n counts=np.zeros(len(indices), dtype=np.int_),\n )\n
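A typical pattern is to use zeros() as an accumulator that is then filled with update() (a sketch; data indices default to the range [0, n_samples)):

from pydvl.value.result import ValuationResult

acc = ValuationResult.zeros(algorithm="montecarlo", n_samples=3)
acc.update(1, 0.5)         # first observed marginal contribution for data index 1
acc.update(1, 1.5)         # running mean and variance are maintained internally
print(acc.get(1).value)    # 1.0, the mean of the two updates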
"},{"location":"api/pydvl/value/sampler/","title":"Sampler","text":""},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler","title":"pydvl.value.sampler","text":"Samplers iterate over subsets of indices.
The classes in this module are used to iterate over an index set \(I\) and over subsets of the complement of each index, as required for the computation of marginal utility for semi-values. The elements returned when iterating over any subclass of PowersetSampler are tuples of the form \((i, S)\), where \(i\) is the index of interest (the element being added to the subset), and \(S \subset I \setminus \{i\}\) is a subset of its complement.
The iteration happens in two nested loops. An outer loop iterates over \\(I\\), and an inner loop iterates over the powerset of \\(I \\setminus \\{i\\}\\). The outer iteration can be either sequential or at random.
Note
This is the natural mode of iteration for the combinatorial definition of semi-values, in particular Shapley value. For the computation using permutations, adhering to this interface is not ideal, but we stick to it for consistency.
The samplers are used in the semivalues module to compute any semi-value, in particular Shapley and Beta values, and Banzhaf indices.
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler--slicing-of-samplers","title":"Slicing of samplers","text":"The samplers can be sliced for parallel computation. For those which are embarrassingly parallel, this is done by slicing the set of \"outer\" indices and returning new samplers over those slices. This includes all truly powerset-based samplers, such as DeterministicUniformSampler and UniformSampler. In contrast, slicing a PermutationSampler creates a new sampler which iterates over the same indices.
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler--references","title":"References","text":" -
Mitchell, Rory, Joshua Cooper, Eibe Frank, and Geoffrey Holmes. Sampling Permutations for Shapley Value Estimation. Journal of Machine Learning Research 23, no. 43 (2022): 1\u201346.\u00a0\u21a9
-
Wang, J.T. and Jia, R., 2023. Data Banzhaf: A Robust Data Valuation Framework for Machine Learning. In: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics, pp. 6388-6421.\u00a0\u21a9
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PowersetSampler","title":"PowersetSampler","text":"PowersetSampler(\n indices: NDArray[IndexT],\n index_iteration: IndexIteration = IndexIteration.Sequential,\n outer_indices: NDArray[IndexT] | None = None,\n **kwargs\n)\n
Bases: ABC
, Iterable[SampleT]
, Generic[IndexT]
Samplers are custom iterables over subsets of indices.
Calling iter()
on a sampler returns an iterator over tuples of the form \\((i, S)\\), where \\(i\\) is an index of interest, and \\(S \\subset I \\setminus \\{i\\}\\) is a subset of the complement of \\(i\\).
This is done in two nested loops, where the outer loop iterates over the set of indices, and the inner loop iterates over subsets of the complement of the current index. The outer iteration can be either sequential or at random.
Note
Samplers are not iterators themselves, so that each call to iter()
e.g. in a for loop creates a new iterator.
Example >>> for idx, s in DeterministicUniformSampler(np.arange(2)):\n>>>     print(s, end=\"\")\n[][1,][][0,]\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PowersetSampler--methods-required-in-subclasses","title":"Methods required in subclasses","text":"Samplers must implement a weight() function to be used as a multiplier in Monte Carlo sums, so that the limit expectation coincides with the semi-value.
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PowersetSampler--slicing-of-samplers","title":"Slicing of samplers","text":"The samplers can be sliced for parallel computation. For those which are embarrassingly parallel, this is done by slicing the set of \"outer\" indices and returning new samplers over those slices.
indices: The set of items (indices) to sample from.\nindex_iteration: the order in which indices are iterated over\nouter_indices: The set of items (indices) over which to iterate\n    when sampling. Subsets are taken from the complement of each index\n    in succession. For embarrassingly parallel computations, this set\n    is sliced and the samplers are used to iterate over the slices.\n
Source code in src/pydvl/value/sampler.py
def __init__(\n self,\n indices: NDArray[IndexT],\n index_iteration: IndexIteration = IndexIteration.Sequential,\n outer_indices: NDArray[IndexT] | None = None,\n **kwargs,\n):\n \"\"\"\n Args:\n indices: The set of items (indices) to sample from.\n index_iteration: the order in which indices are iterated over\n outer_indices: The set of items (indices) over which to iterate\n when sampling. Subsets are taken from the complement of each index\n in succession. For embarrassingly parallel computations, this set\n is sliced and the samplers are used to iterate over the slices.\n \"\"\"\n self._indices = indices\n self._index_iteration = index_iteration\n self._outer_indices = outer_indices if outer_indices is not None else indices\n self._n = len(indices)\n self._n_samples = 0\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PowersetSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PowersetSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PowersetSampler.weight","title":"weight abstractmethod
classmethod
","text":"weight(n: int, subset_len: int) -> float\n
Factor by which to multiply Monte Carlo samples, so that the mean converges to the desired expression.
By the Law of Large Numbers, the sample mean of \\(\\delta_i(S_j)\\) converges to the expectation under the distribution from which \\(S_j\\) is sampled.
\\[ \\frac{1}{m} \\sum_{j = 1}^m \\delta_i (S_j) c (S_j) \\longrightarrow \\underset{S \\sim \\mathcal{D}_{- i}}{\\mathbb{E}} [\\delta_i (S) c ( S)]\\] We add a factor \\(c(S_j)\\) in order to have this expectation coincide with the desired expression.
Source code in src/pydvl/value/sampler.py
@classmethod\n@abc.abstractmethod\ndef weight(cls, n: int, subset_len: int) -> float:\n r\"\"\"Factor by which to multiply Monte Carlo samples, so that the\n mean converges to the desired expression.\n\n By the Law of Large Numbers, the sample mean of $\\delta_i(S_j)$\n converges to the expectation under the distribution from which $S_j$ is\n sampled.\n\n $$ \\frac{1}{m} \\sum_{j = 1}^m \\delta_i (S_j) c (S_j) \\longrightarrow\n \\underset{S \\sim \\mathcal{D}_{- i}}{\\mathbb{E}} [\\delta_i (S) c (\n S)]$$\n\n We add a factor $c(S_j)$ in order to have this expectation coincide with\n the desired expression.\n \"\"\"\n ...\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.StochasticSamplerMixin","title":"StochasticSamplerMixin","text":"StochasticSamplerMixin(*args, seed: Optional[Seed] = None, **kwargs)\n
Mixin class for samplers which use a random number generator.
Source code in src/pydvl/value/sampler.py
def __init__(self, *args, seed: Optional[Seed] = None, **kwargs):\n super().__init__(*args, **kwargs)\n self._rng = np.random.default_rng(seed)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.DeterministicUniformSampler","title":"DeterministicUniformSampler","text":"DeterministicUniformSampler(indices: NDArray[IndexT], *args, **kwargs)\n
Bases: PowersetSampler[IndexT]
For every index \\(i\\), each subset of the complement indices - {i}
is returned.
Note
Indices are always iterated over sequentially, irrespective of the value of index_iteration
upon construction.
Example >>> for idx, s in DeterministicUniformSampler(np.arange(2)):\n>>>     print(f\"{idx} - {s}\", end=\", \")\n0 - [], 0 - [1], 1 - [], 1 - [0],\n
PARAMETER DESCRIPTION indices
The set of items (indices) to sample from.
TYPE: NDArray[IndexT]
Source code in src/pydvl/value/sampler.py
def __init__(self, indices: NDArray[IndexT], *args, **kwargs):\n \"\"\"An iterator to perform uniform deterministic sampling of subsets.\n\n For every index $i$, each subset of the complement `indices - {i}` is\n returned.\n\n !!! Note\n Indices are always iterated over sequentially, irrespective of\n the value of `index_iteration` upon construction.\n\n ??? Example\n ``` pycon\n >>> for idx, s in DeterministicUniformSampler(np.arange(2)):\n >>> print(f\"{idx} - {s}\", end=\", \")\n 1 - [], 1 - [2], 2 - [], 2 - [1],\n ```\n\n Args:\n indices: The set of items (indices) to sample from.\n \"\"\"\n # Force sequential iteration\n kwargs.update({\"index_iteration\": PowersetSampler.IndexIteration.Sequential})\n super().__init__(indices, *args, **kwargs)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.DeterministicUniformSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.DeterministicUniformSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.UniformSampler","title":"UniformSampler","text":"UniformSampler(*args, seed: Optional[Seed] = None, **kwargs)\n
Bases: StochasticSamplerMixin
, PowersetSampler[IndexT]
An iterator to perform uniform random sampling of subsets.
Iterating over every index \\(i\\), either in sequence or at random depending on the value of index_iteration
, one subset of the complement indices - {i}
is sampled with equal probability \(2^{-(n-1)}\). The iterator never ends.
Example The code
for idx, s in UniformSampler(np.arange(5)):\n    print(f\"{idx} - {s}\", end=\", \")\n
Produces the output: 0 - [1 4], 1 - [2 3], 2 - [0 1 3], 3 - [], 4 - [2], 0 - [1 3 4], 1 - [0 2]\n(...)\n
Source code in src/pydvl/value/sampler.py
def __init__(self, *args, seed: Optional[Seed] = None, **kwargs):\n super().__init__(*args, **kwargs)\n self._rng = np.random.default_rng(seed)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.UniformSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.UniformSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.UniformSampler.weight","title":"weight classmethod
","text":"weight(n: int, subset_len: int) -> float\n
Correction coming from Monte Carlo integration so that the mean of the marginals converges to the value: the uniform distribution over the powerset of a set with n-1 elements has mass 2^{-(n-1)} on each subset, hence the correction factor of 2^{n-1}.
Source code in src/pydvl/value/sampler.py
@classmethod\ndef weight(cls, n: int, subset_len: int) -> float:\n \"\"\"Correction coming from Monte Carlo integration so that the mean of\n the marginals converges to the value: the uniform distribution over the\n powerset of a set with n-1 elements has mass 2^{n-1} over each subset.\"\"\"\n return float(2 ** (n - 1)) if n > 0 else 1.0\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.MSRSampler","title":"MSRSampler","text":"MSRSampler(*args, seed: Optional[Seed] = None, **kwargs)\n
Bases: StochasticSamplerMixin
, PowersetSampler[IndexT]
An iterator to perform sampling of random subsets.
This sampler does not return any index; it only returns subsets of the data. It is used in (Wang et al.)2.
Source code in src/pydvl/value/sampler.py
def __init__(self, *args, seed: Optional[Seed] = None, **kwargs):\n super().__init__(*args, **kwargs)\n self._rng = np.random.default_rng(seed)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.MSRSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.MSRSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.AntitheticSampler","title":"AntitheticSampler","text":"AntitheticSampler(*args, seed: Optional[Seed] = None, **kwargs)\n
Bases: StochasticSamplerMixin
, PowersetSampler[IndexT]
An iterator to perform uniform random sampling of subsets, and their complements.
Works as UniformSampler, but for every tuple \\((i,S)\\), it subsequently returns \\((i,S^c)\\), where \\(S^c\\) is the complement of the set \\(S\\) in the set of indices, excluding \\(i\\).
Source code in src/pydvl/value/sampler.py
def __init__(self, *args, seed: Optional[Seed] = None, **kwargs):\n super().__init__(*args, **kwargs)\n self._rng = np.random.default_rng(seed)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.AntitheticSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.AntitheticSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PermutationSampler","title":"PermutationSampler","text":"PermutationSampler(*args, seed: Optional[Seed] = None, **kwargs)\n
Bases: StochasticSamplerMixin
, PowersetSampler[IndexT]
Sample permutations of indices and iterate through each returning increasing subsets, as required for the permutation definition of semi-values.
This sampler does not implement the two loops described in PowersetSampler. Instead, for a permutation (3,1,4,2)
, it returns in sequence the tuples of index and sets: (3, {})
, (1, {3})
, (4, {3,1})
and (2, {3,1,4})
.
Note that the full index set is never returned.
Warning
This sampler requires caching to be enabled or computation will be doubled wrt. a \"direct\" implementation of permutation MC
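A sketch of the iteration pattern (the output depends on the sampled permutation; the seed keyword is accepted through StochasticSamplerMixin):
import numpy as np\nfrom pydvl.value.sampler import PermutationSampler\n\n# For a permutation such as (3, 1, 4, 2, 0) the tuples produced are\n# (3, {}), (1, {3}), (4, {3, 1}), (2, {3, 1, 4}), (0, {3, 1, 4, 2}), ...\nfor i, (idx, subset) in enumerate(PermutationSampler(np.arange(5), seed=42)):\n    print(idx, subset)\n    if i >= 4:  # stop after one permutation; sampling would continue otherwise\n        break\n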
Source code in src/pydvl/value/sampler.py
def __init__(self, *args, seed: Optional[Seed] = None, **kwargs):\n super().__init__(*args, **kwargs)\n self._rng = np.random.default_rng(seed)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PermutationSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PermutationSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PermutationSampler.__getitem__","title":"__getitem__","text":"__getitem__(key: slice | list[int]) -> PowersetSampler[IndexT]\n
Permutation samplers cannot be split across indices, so we return a copy of the full sampler.
Source code in src/pydvl/value/sampler.py
def __getitem__(self, key: slice | list[int]) -> PowersetSampler[IndexT]:\n \"\"\"Permutation samplers cannot be split across indices, so we return\n a copy of the full sampler.\"\"\"\n return super().__getitem__(slice(None))\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.AntitheticPermutationSampler","title":"AntitheticPermutationSampler","text":"AntitheticPermutationSampler(*args, seed: Optional[Seed] = None, **kwargs)\n
Bases: PermutationSampler[IndexT]
Samples permutations like PermutationSampler, but after each permutation, it returns the same permutation in reverse order.
This sampler was suggested in (Mitchell et al. 2022)1
New in version 0.7.1
Source code in src/pydvl/value/sampler.py
def __init__(self, *args, seed: Optional[Seed] = None, **kwargs):\n super().__init__(*args, **kwargs)\n self._rng = np.random.default_rng(seed)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.AntitheticPermutationSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.AntitheticPermutationSampler.__getitem__","title":"__getitem__","text":"__getitem__(key: slice | list[int]) -> PowersetSampler[IndexT]\n
Permutation samplers cannot be split across indices, so we return a copy of the full sampler.
Source code in src/pydvl/value/sampler.py
def __getitem__(self, key: slice | list[int]) -> PowersetSampler[IndexT]:\n \"\"\"Permutation samplers cannot be split across indices, so we return\n a copy of the full sampler.\"\"\"\n return super().__getitem__(slice(None))\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.AntitheticPermutationSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.DeterministicPermutationSampler","title":"DeterministicPermutationSampler","text":"DeterministicPermutationSampler(*args, seed: Optional[Seed] = None, **kwargs)\n
Bases: PermutationSampler[IndexT]
Samples all n! permutations of the indices deterministically, and iterates through them, returning sets as required for the permutation-based definition of semi-values.
Warning
This sampler requires caching to be enabled or computation will be doubled wrt. a \"direct\" implementation of permutation MC
Warning
This sampler is not parallelizable, as it always iterates over the whole set of permutations in the same order. Different processes would always return the same values for all indices.
Source code in src/pydvl/value/sampler.py
def __init__(self, *args, seed: Optional[Seed] = None, **kwargs):\n super().__init__(*args, **kwargs)\n self._rng = np.random.default_rng(seed)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.DeterministicPermutationSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.DeterministicPermutationSampler.__getitem__","title":"__getitem__","text":"__getitem__(key: slice | list[int]) -> PowersetSampler[IndexT]\n
Permutation samplers cannot be split across indices, so we return a copy of the full sampler.
Source code in src/pydvl/value/sampler.py
def __getitem__(self, key: slice | list[int]) -> PowersetSampler[IndexT]:\n \"\"\"Permutation samplers cannot be split across indices, so we return\n a copy of the full sampler.\"\"\"\n return super().__getitem__(slice(None))\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.DeterministicPermutationSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.RandomHierarchicalSampler","title":"RandomHierarchicalSampler","text":"RandomHierarchicalSampler(*args, seed: Optional[Seed] = None, **kwargs)\n
Bases: StochasticSamplerMixin
, PowersetSampler[IndexT]
For every index, sample a set size, then a set of that size.
Todo
This is unnecessary, but a step towards proper stratified sampling.
Source code in src/pydvl/value/sampler.py
def __init__(self, *args, seed: Optional[Seed] = None, **kwargs):\n super().__init__(*args, **kwargs)\n self._rng = np.random.default_rng(seed)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.RandomHierarchicalSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.RandomHierarchicalSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/semivalues/","title":"Semivalues","text":""},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues","title":"pydvl.value.semivalues","text":"This module provides the core functionality for the computation of generic semi-values. A semi-value is any valuation function with the form:
\\[v_\\text{semi}(i) = \\sum_{i=1}^n w(k) \\sum_{S \\subset D_{-i}^{(k)}} [U(S_{+i})-U(S)],\\] where the coefficients \\(w(k)\\) satisfy the property:
\\[\\sum_{k=1}^n w(k) = 1.\\] Note For implementation consistency, we slightly depart from the common definition of semi-values, which includes a factor \\(1/n\\) in the sum over subsets. Instead, we subsume this factor into the coefficient \\(w(k)\\).
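As an illustration of such coefficients (a sketch under the convention above; the module ships its own shapley_coefficient and banzhaf_coefficient, which the wrappers below use), the Shapley and Banzhaf choices are:
import math\n\ndef shapley_w(n: int, k: int) -> float:\n    # 1/n times the inverse of the number of subsets of size k of D_{-i}\n    return 1.0 / (n * math.comb(n - 1, k))\n\ndef banzhaf_w(n: int, k: int) -> float:\n    # Uniform weight over the 2^(n-1) subsets of D_{-i}\n    return 1.0 / 2 ** (n - 1)\n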
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues--main-components","title":"Main components","text":"The computation of a semi-value requires two components:
- A subset sampler that generates subsets of the set \\(D\\) of interest.
- A coefficient \\(w(k)\\) that assigns a weight to each subset size \\(k\\).
Samplers can be found in sampler, and can be classified into two categories: powerset samplers and permutation samplers. Powerset samplers generate subsets of \\(D_{-i}\\), while the permutation sampler generates permutations of \\(D\\). The former conform to the above definition of semi-values, while the latter reformulates it as:
\\[ v(i) = \\frac{1}{n!} \\sum_{\\sigma \\in \\Pi(n)} \\tilde{w}( | \\sigma_{:i} | )[U(\\sigma_{:i} \\cup \\{i\\}) \u2212 U(\\sigma_{:i})], \\] where \\(\\sigma_{:i}\\) denotes the set of indices in permutation sigma before the position where \\(i\\) appears (see Data valuation for details), and
\\[ \\tilde{w} (k) = n \\binom{n - 1}{k} w (k) \\] is the weight correction due to the reformulation.
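For the Shapley coefficient sketched above this correction is constant: \( \tilde{w}(k) = n \binom{n-1}{k} \cdot \frac{1}{n \binom{n-1}{k}} = 1 \), which is why permutation-based Shapley estimates need no per-size reweighting.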
Warning
Both PermutationSampler and DeterministicPermutationSampler require caching to be enabled or computation will be doubled wrt. a 'direct' implementation of permutation MC.
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues--computing-semi-values","title":"Computing semi-values","text":"Samplers and coefficients can be arbitrarily mixed by means of the main entry point of this module, compute_generic_semivalues. There are several pre-defined coefficients, including the Shapley value of (Ghorbani and Zou, 2019)1, the Banzhaf index of (Wang and Jia)3, and the Beta coefficient of (Kwon and Zou, 2022)2. For each of these methods, there is a convenience wrapper function. Respectively, these are: compute_shapley_semivalues, compute_banzhaf_semivalues, and compute_beta_shapley_semivalues. instead.
Parallelization and batching
In order to ensure reproducibility and fine-grained control of parallelization, samples are generated in the main process and then distributed to worker processes for evaluation. For small sample sizes, this can lead to a significant overhead. To avoid this, we temporarily provide an additional argument batch_size
to all methods which can improve performance with small models up to an order of magnitude. Note that this argument will be removed before version 1.0 in favour of a more general solution.
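An end-to-end sketch of mixing a sampler and a coefficient (the dataset, model and argument values are placeholders; Dataset.from_sklearn and Utility follow the usual pyDVL construction, adapt them to your setup):
from sklearn.datasets import load_breast_cancer\nfrom sklearn.linear_model import LogisticRegression\nfrom pydvl.utils import Dataset, Utility\nfrom pydvl.value.sampler import UniformSampler\nfrom pydvl.value.semivalues import banzhaf_coefficient, compute_generic_semivalues\nfrom pydvl.value.stopping import MaxUpdates\n\ndata = Dataset.from_sklearn(load_breast_cancer(), train_size=0.7)\nu = Utility(LogisticRegression(max_iter=1000), data)\nvalues = compute_generic_semivalues(\n    UniformSampler(u.data.indices, seed=16),\n    u,\n    banzhaf_coefficient,\n    MaxUpdates(500),\n    batch_size=8,  # experimental, see the note above\n    n_jobs=4,\n)\n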
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues--references","title":"References","text":" -
Ghorbani, A., Zou, J., 2019. Data Shapley: Equitable Valuation of Data for Machine Learning. In: Proceedings of the 36th International Conference on Machine Learning, PMLR, pp. 2242\u20132251.\u00a0\u21a9
-
Kwon, Y. and Zou, J., 2022. Beta Shapley: A Unified and Noise-reduced Data Valuation Framework for Machine Learning. In: Proceedings of the 25th International Conference on Artificial Intelligence and Statistics (AISTATS) 2022, Vol. 151. PMLR, Valencia, Spain.\u00a0\u21a9
-
Wang, J.T. and Jia, R., 2023. Data Banzhaf: A Robust Data Valuation Framework for Machine Learning. In: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics, pp. 6388-6421.\u00a0\u21a9
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.SVCoefficient","title":"SVCoefficient","text":" Bases: Protocol
The protocol that coefficients for the computation of semi-values must fulfill.
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.SVCoefficient.__call__","title":"__call__","text":"__call__(n: int, k: int) -> float\n
Computes the coefficient for a given subset size.
PARAMETER DESCRIPTION n
Total number of elements in the set.
TYPE: int
k
Size of the subset for which the coefficient is being computed
TYPE: int
Source code in src/pydvl/value/semivalues.py
def __call__(self, n: int, k: int) -> float:\n \"\"\"Computes the coefficient for a given subset size.\n\n Args:\n n: Total number of elements in the set.\n k: Size of the subset for which the coefficient is being computed\n \"\"\"\n ...\n
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.DefaultMarginal","title":"DefaultMarginal","text":" Bases: MarginalFunction
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.DefaultMarginal.__call__","title":"__call__","text":"__call__(\n u: Utility, coefficient: SVCoefficient, samples: Iterable[SampleT]\n) -> Tuple[MarginalT, ...]\n
Computation of marginal utility. This is a helper function for compute_generic_semivalues.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
coefficient
The semivalue coefficient and sampler weight
TYPE: SVCoefficient
samples
A collection of samples. Each sample is a tuple of index and subset of indices to compute a marginal utility.
TYPE: Iterable[SampleT]
RETURNS DESCRIPTION Tuple[MarginalT, ...]
A collection of marginals. Each marginal is a tuple with index and its marginal utility.
Source code in src/pydvl/value/semivalues.py
def __call__(\n self, u: Utility, coefficient: SVCoefficient, samples: Iterable[SampleT]\n) -> Tuple[MarginalT, ...]:\n \"\"\"Computation of marginal utility. This is a helper function for\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues].\n\n Args:\n u: Utility object with model, data, and scoring function.\n coefficient: The semivalue coefficient and sampler weight\n samples: A collection of samples. Each sample is a tuple of index and subset of\n indices to compute a marginal utility.\n\n Returns:\n A collection of marginals. Each marginal is a tuple with index and its marginal\n utility.\n \"\"\"\n n = len(u.data)\n marginals: List[MarginalT] = []\n for idx, s in samples:\n marginal = (u({idx}.union(s)) - u(s)) * coefficient(n, len(s))\n marginals.append((idx, marginal))\n return tuple(marginals)\n
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.RawUtility","title":"RawUtility","text":" Bases: MarginalFunction
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.RawUtility.__call__","title":"__call__","text":"__call__(\n u: Utility, coefficient: SVCoefficient, samples: Iterable[SampleT]\n) -> Tuple[MarginalT, ...]\n
Computation of raw utility without marginalization. This is a helper function for compute_generic_semivalues.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
coefficient
The semivalue coefficient and sampler weight
TYPE: SVCoefficient
samples
A collection of samples. Each sample is a tuple of index and subset of indices to compute a marginal utility.
TYPE: Iterable[SampleT]
RETURNS DESCRIPTION Tuple[MarginalT, ...]
A collection of marginals. Each marginal is a tuple with index and its raw utility.
Source code in src/pydvl/value/semivalues.py
def __call__(\n self, u: Utility, coefficient: SVCoefficient, samples: Iterable[SampleT]\n) -> Tuple[MarginalT, ...]:\n \"\"\"Computation of raw utility without marginalization. This is a helper function for\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues].\n\n Args:\n u: Utility object with model, data, and scoring function.\n coefficient: The semivalue coefficient and sampler weight\n samples: A collection of samples. Each sample is a tuple of index and subset of\n indices to compute a marginal utility.\n\n Returns:\n A collection of marginals. Each marginal is a tuple with index and its raw utility.\n \"\"\"\n marginals: List[MarginalT] = []\n for idx, s in samples:\n marginals.append((s, u(s)))\n return tuple(marginals)\n
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.FutureProcessor","title":"FutureProcessor","text":"The FutureProcessor class used to process the results of the parallel marginal evaluations.
The marginals are evaluated in parallel by n_jobs
threads, but some algorithms require a central method to postprocess the marginal results. This can be achieved through the future processor. This base class does not perform any postprocessing, it is a noop used in most data valuation algorithms.
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.MSRFutureProcessor","title":"MSRFutureProcessor","text":"MSRFutureProcessor(u: Utility)\n
Bases: FutureProcessor
This FutureProcessor processes the raw marginals in a way that MSR sampling requires.
MSR sampling evaluates the utility once, and then updates all data semivalues based on this one evaluation. In order to do this, the RawUtility value needs to be postprocessed through this class. For more details on MSR, please refer to the paper (Wang et al.)3. This processor keeps track of the current values and computes marginals for all data points, so that the values in the ValuationResult can be updated properly down the line.
Source code in src/pydvl/value/semivalues.py
def __init__(self, u: Utility):\n self.n = len(u.data)\n self.all_indices = u.data.indices.copy()\n self.point_in_subset = np.zeros((self.n,))\n self.positive_sums = np.zeros((self.n,))\n self.negative_sums = np.zeros((self.n,))\n self.total_evaluations = 0\n
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.MSRFutureProcessor.__call__","title":"__call__","text":"__call__(\n future_result: List[Tuple[List[IndexT], float]]\n) -> List[List[MarginalT]]\n
Computation of marginal utility using Maximum Sample Reuse.
This processor requires the marginal function to be set to RawUtility. Then, this processor computes marginals based on the utility value and the index set provided.
The final formula that gives the Banzhaf semivalue using MSR is: \[ \hat{\phi}_{MSR}(i) = \frac{1}{|\mathbf{S}_{\ni i}|} \sum_{S \in \mathbf{S}_{\ni i}} U(S) - \frac{1}{|\mathbf{S}_{\not{\ni} i}|} \sum_{S \in \mathbf{S}_{\not{\ni} i}} U(S) \]
Args:\n future_result: Result of the parallel computing jobs comprised of\n a list of indices that were used to evaluate the utility, and the evaluation result (metric).\n\n Returns:\n A collection of marginals. Each marginal is a tuple with index and its marginal\n utility.\n
Source code in src/pydvl/value/semivalues.py
def __call__(\n self, future_result: List[Tuple[List[IndexT], float]]\n) -> List[List[MarginalT]]:\n \"\"\"Computation of marginal utility using Maximum Sample Reuse.\n\n This processor requires the Marginal Function to be set to RawUtility.\n Then, this processor computes marginals based on the utility value and the index set provided.\n\n The final formula that gives the Banzhaf semivalue using MSR is:\n $$\\hat{\\phi}_{MSR}(i) = \\frac{1}{|\\mathbf{S}_{\\ni i}|} \\sum_{S \\in \\mathbf{S}_{\\ni i}} U(S)\n - \\frac{1}{|\\mathbf{S}_{\\not{\\ni} i}|} \\sum_{S \\in \\mathbf{S}_{\\not{\\ni} i}} U(S)$$\n\n Args:\n future_result: Result of the parallel computing jobs comprised of\n a list of indices that were used to evaluate the utility, and the evaluation result (metric).\n\n Returns:\n A collection of marginals. Each marginal is a tuple with index and its marginal\n utility.\n \"\"\"\n marginals: List[List[MarginalT]] = []\n for batch_id, (s, evaluation) in enumerate(future_result):\n previous_values = self.compute_values()\n self.total_evaluations += 1\n self.point_in_subset[s] += 1\n self.positive_sums[s] += evaluation\n not_s = np.setdiff1d(self.all_indices, s)\n self.negative_sums[not_s] += evaluation\n new_values = self.compute_values()\n # Hack to work around the update mechanic that does not work out of the box for MSR\n marginal_vals = (\n self.total_evaluations * new_values\n - (self.total_evaluations - 1) * previous_values\n )\n marginals.append([])\n for data_index in range(self.n):\n marginals[batch_id].append(\n (data_index, float(marginal_vals[data_index]))\n )\n return marginals\n
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.SemiValueMode","title":"SemiValueMode","text":" Bases: str
, Enum
Enumeration of semi-value modes.
Deprecation notice
This enum and the associated methods are deprecated and will be removed in 0.8.0.
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.compute_generic_semivalues","title":"compute_generic_semivalues","text":"compute_generic_semivalues(\n sampler: PowersetSampler[IndexT],\n u: Utility,\n coefficient: SVCoefficient,\n done: StoppingCriterion,\n *,\n marginal: MarginalFunction = DefaultMarginal(),\n future_processor: FutureProcessor = FutureProcessor(),\n batch_size: int = 1,\n skip_converged: bool = False,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False\n) -> ValuationResult\n
Computes semi-values for a given utility function and subset sampler.
PARAMETER DESCRIPTION sampler
The subset sampler to use for utility computations.
TYPE: PowersetSampler[IndexT]
u
Utility object with model, data, and scoring function.
TYPE: Utility
coefficient
The semi-value coefficient
TYPE: SVCoefficient
done
Stopping criterion.
TYPE: StoppingCriterion
marginal
Marginal function to be used for computing the semivalues
TYPE: MarginalFunction
DEFAULT: DefaultMarginal()
future_processor
Additional postprocessing steps required for some algorithms
TYPE: FutureProcessor
DEFAULT: FutureProcessor()
batch_size
Number of marginal evaluations per single parallel job.
TYPE: int
DEFAULT: 1
skip_converged
Whether to skip marginal evaluations for indices that have already converged. CAUTION: This is only entirely safe if the stopping criterion is MaxUpdates. For any other stopping criterion, the convergence status of indices may change during the computation, or they may be marked as having converged even though in fact the estimated values are far from the true values (e.g. for AbsoluteStandardError, you will probably have to carefully adjust the threshold).
TYPE: bool
DEFAULT: False
n_jobs
Number of parallel jobs to use.
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display a progress bar.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION ValuationResult
Object with the results.
Deprecation notice
Parameter batch_size
is for experimental use and will be removed in future versions.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/semivalues.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef compute_generic_semivalues(\n sampler: PowersetSampler[IndexT],\n u: Utility,\n coefficient: SVCoefficient,\n done: StoppingCriterion,\n *,\n marginal: MarginalFunction = DefaultMarginal(),\n future_processor: FutureProcessor = FutureProcessor(),\n batch_size: int = 1,\n skip_converged: bool = False,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n) -> ValuationResult:\n \"\"\"Computes semi-values for a given utility function and subset sampler.\n\n Args:\n sampler: The subset sampler to use for utility computations.\n u: Utility object with model, data, and scoring function.\n coefficient: The semi-value coefficient\n done: Stopping criterion.\n marginal: Marginal function to be used for computing the semivalues\n future_processor: Additional postprocessing steps required for some algorithms\n batch_size: Number of marginal evaluations per single parallel job.\n skip_converged: Whether to skip marginal evaluations for indices that\n have already converged. **CAUTION**: This is only entirely safe if\n the stopping criterion is [MaxUpdates][pydvl.value.stopping.MaxUpdates].\n For any other stopping criterion, the convergence status of indices\n may change during the computation, or they may be marked as having\n converged even though in fact the estimated values are far from the\n true values (e.g. for\n [AbsoluteStandardError][pydvl.value.stopping.AbsoluteStandardError],\n you will probably have to carefully adjust the threshold).\n n_jobs: Number of parallel jobs to use.\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display a progress bar.\n\n Returns:\n Object with the results.\n\n !!! warning \"Deprecation notice\"\n Parameter `batch_size` is for experimental use and will be removed in\n future versions.\n\n !!! tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n if isinstance(sampler, PermutationSampler) and u.cache is None:\n log.warning(\n \"PermutationSampler requires caching to be enabled or computation \"\n \"will be doubled wrt. 
a 'direct' implementation of permutation MC\"\n )\n\n if batch_size != 1:\n warnings.warn(\n \"Parameter `batch_size` is for experimental use and will be\"\n \" removed in future versions\",\n DeprecationWarning,\n )\n\n result = ValuationResult.zeros(\n algorithm=f\"semivalue-{str(sampler)}-{coefficient.__name__}\", # type: ignore\n indices=u.data.indices,\n data_names=u.data.data_names,\n )\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n u = parallel_backend.put(u)\n correction = parallel_backend.put(\n lambda n, k: coefficient(n, k) * sampler.weight(n, k)\n )\n\n max_workers = parallel_backend.effective_n_jobs(n_jobs)\n n_submitted_jobs = 2 * max_workers # number of jobs in the queue\n\n sampler_it = iter(sampler)\n pbar = tqdm(disable=not progress, total=100, unit=\"%\")\n\n with parallel_backend.executor(\n max_workers=max_workers, cancel_futures=True\n ) as executor:\n pending: set[Future] = set()\n while True:\n pbar.n = 100 * done.completion()\n pbar.refresh()\n\n completed, pending = wait(pending, timeout=1, return_when=FIRST_COMPLETED)\n for future in completed:\n processed_future = future_processor(\n future.result()\n ) # List of tuples or\n for batch_future in processed_future:\n if isinstance(batch_future, list): # Case when batch size is > 1\n for idx, marginal_val in batch_future:\n result.update(idx, marginal_val)\n else: # Batch size 1\n idx, marginal_val = batch_future\n result.update(idx, marginal_val)\n if done(result):\n return result\n\n # Ensure that we always have n_submitted_jobs running\n try:\n while len(pending) < n_submitted_jobs:\n samples = tuple(islice(sampler_it, batch_size))\n if len(samples) == 0:\n raise StopIteration\n\n # Filter out samples for indices that have already converged\n filtered_samples = samples\n if skip_converged and np.count_nonzero(done.converged) > 0:\n # TODO: cloudpickle can't pickle result of `filter` on python 3.8\n filtered_samples = tuple(\n filter(lambda t: not done.converged[t[0]], samples)\n )\n\n if filtered_samples:\n pending.add(\n executor.submit(\n marginal,\n u=u,\n coefficient=correction,\n samples=filtered_samples,\n )\n )\n except StopIteration:\n if len(pending) == 0:\n return result\n
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.compute_shapley_semivalues","title":"compute_shapley_semivalues","text":"compute_shapley_semivalues(\n u: Utility,\n *,\n done: StoppingCriterion,\n sampler_t: Type[StochasticSampler] = PermutationSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Computes Shapley values for a given utility function.
This is a convenience wrapper for compute_generic_semivalues with the Shapley coefficient. Use compute_shapley_values for a more flexible interface and additional methods, including TMCS.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
done
Stopping criterion.
TYPE: StoppingCriterion
sampler_t
The sampler type to use. See the sampler module for a list.
TYPE: Type[StochasticSampler]
DEFAULT: PermutationSampler
batch_size
Number of marginal evaluations per single parallel job.
TYPE: int
DEFAULT: 1
n_jobs
Number of parallel jobs to use.
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
progress
Whether to display a progress bar.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION ValuationResult
Object with the results.
Deprecation notice
Parameter batch_size
is for experimental use and will be removed in future versions.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/semivalues.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef compute_shapley_semivalues(\n u: Utility,\n *,\n done: StoppingCriterion,\n sampler_t: Type[StochasticSampler] = PermutationSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> ValuationResult:\n \"\"\"Computes Shapley values for a given utility function.\n\n This is a convenience wrapper for\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues]\n with the Shapley coefficient. Use\n [compute_shapley_values][pydvl.value.shapley.common.compute_shapley_values]\n for a more flexible interface and additional methods, including TMCS.\n\n Args:\n u: Utility object with model, data, and scoring function.\n done: Stopping criterion.\n sampler_t: The sampler type to use. See the\n [sampler][pydvl.value.sampler] module for a list.\n batch_size: Number of marginal evaluations per single parallel job.\n n_jobs: Number of parallel jobs to use.\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n seed: Either an instance of a numpy random number generator or a seed\n for it.\n progress: Whether to display a progress bar.\n\n Returns:\n Object with the results.\n\n !!! warning \"Deprecation notice\"\n Parameter `batch_size` is for experimental use and will be removed in\n future versions.\n\n !!! tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n # HACK: cannot infer return type because of useless IndexT, NameT\n return compute_generic_semivalues( # type: ignore\n sampler_t(u.data.indices, seed=seed),\n u,\n shapley_coefficient,\n done,\n batch_size=batch_size,\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n config=config,\n progress=progress,\n )\n
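A minimal usage sketch (u is a Utility built as in the example further up; sorting relies on ValuationResult.sort):
from pydvl.value.semivalues import compute_shapley_semivalues\nfrom pydvl.value.stopping import MaxUpdates\n\nvalues = compute_shapley_semivalues(u, done=MaxUpdates(500), n_jobs=4, seed=16)\nvalues.sort(key=\"value\")\nprint(values.names[:10])  # the ten lowest-valued data points\n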
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.compute_banzhaf_semivalues","title":"compute_banzhaf_semivalues","text":"compute_banzhaf_semivalues(\n u: Utility,\n *,\n done: StoppingCriterion,\n sampler_t: Type[StochasticSampler] = PermutationSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Computes Banzhaf values for a given utility function.
This is a convenience wrapper for compute_generic_semivalues with the Banzhaf coefficient.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
done
Stopping criterion.
TYPE: StoppingCriterion
sampler_t
The sampler type to use. See the sampler module for a list.
TYPE: Type[StochasticSampler]
DEFAULT: PermutationSampler
batch_size
Number of marginal evaluations per single parallel job.
TYPE: int
DEFAULT: 1
n_jobs
Number of parallel jobs to use.
TYPE: int
DEFAULT: 1
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display a progress bar.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION ValuationResult
Object with the results.
Deprecation notice
Parameter batch_size
is for experimental use and will be removed in future versions.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/semivalues.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef compute_banzhaf_semivalues(\n u: Utility,\n *,\n done: StoppingCriterion,\n sampler_t: Type[StochasticSampler] = PermutationSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> ValuationResult:\n \"\"\"Computes Banzhaf values for a given utility function.\n\n This is a convenience wrapper for\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues]\n with the Banzhaf coefficient.\n\n Args:\n u: Utility object with model, data, and scoring function.\n done: Stopping criterion.\n sampler_t: The sampler type to use. See the\n [sampler][pydvl.value.sampler] module for a list.\n batch_size: Number of marginal evaluations per single parallel job.\n n_jobs: Number of parallel jobs to use.\n seed: Either an instance of a numpy random number generator or a seed\n for it.\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display a progress bar.\n\n Returns:\n Object with the results.\n\n !!! warning \"Deprecation notice\"\n Parameter `batch_size` is for experimental use and will be removed in\n future versions.\n\n !!! tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n # HACK: cannot infer return type because of useless IndexT, NameT\n return compute_generic_semivalues( # type: ignore\n sampler_t(u.data.indices, seed=seed),\n u,\n banzhaf_coefficient,\n done,\n batch_size=batch_size,\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n config=config,\n progress=progress,\n )\n
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.compute_msr_banzhaf_semivalues","title":"compute_msr_banzhaf_semivalues","text":"compute_msr_banzhaf_semivalues(\n u: Utility,\n *,\n done: StoppingCriterion,\n sampler_t: Type[StochasticSampler] = MSRSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Computes MSR sampled Banzhaf values for a given utility function.
This is a convenience wrapper for compute_generic_semivalues with the Banzhaf coefficient and MSR sampling.
This algorithm works by sampling random subsets and then evaluating the utility on that subset only once. Based on the evaluation and the subset indices, the MSRFutureProcessor then computes the marginal updates like in the paper (Wang et al.)3. Their approach updates the semivalues for all data points every time a new evaluation is computed. This increases sample efficiency compared to normal Monte Carlo updates.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
done
Stopping criterion.
TYPE: StoppingCriterion
sampler_t
The sampler type to use. See the sampler module for a list.
TYPE: Type[StochasticSampler]
DEFAULT: MSRSampler
batch_size
Number of marginal evaluations per single parallel job.
TYPE: int
DEFAULT: 1
n_jobs
Number of parallel jobs to use.
TYPE: int
DEFAULT: 1
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
config
Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display a progress bar.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION ValuationResult
Object with the results.
Deprecation notice
Parameter batch_size
is for experimental use and will be removed in future versions.
Source code in src/pydvl/value/semivalues.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef compute_msr_banzhaf_semivalues(\n u: Utility,\n *,\n done: StoppingCriterion,\n sampler_t: Type[StochasticSampler] = MSRSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> ValuationResult:\n \"\"\"Computes MSR sampled Banzhaf values for a given utility function.\n\n This is a convenience wrapper for\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues]\n with the Banzhaf coefficient and MSR sampling.\n\n This algorithm works by sampling random subsets and then evaluating the utility\n on that subset only once. Based on the evaluation and the subset indices,\n the MSRFutureProcessor then computes the marginal updates like in the paper\n (Wang et. al.)<sup><a href=\"wang_data_2023\">3</a></sup>.\n Their approach updates the semivalues for all data points every time a new evaluation\n is computed. This increases sample efficiency compared to normal Monte Carlo updates.\n\n Args:\n u: Utility object with model, data, and scoring function.\n done: Stopping criterion.\n sampler_t: The sampler type to use. See the\n [sampler][pydvl.value.sampler] module for a list.\n batch_size: Number of marginal evaluations per single parallel job.\n n_jobs: Number of parallel jobs to use.\n seed: Either an instance of a numpy random number generator or a seed\n for it.\n config: Object configuring parallel computation, with cluster address,\n number of cpus, etc.\n progress: Whether to display a progress bar.\n\n Returns:\n Object with the results.\n\n !!! warning \"Deprecation notice\"\n Parameter `batch_size` is for experimental use and will be removed in\n future versions.\n \"\"\"\n # HACK: cannot infer return type because of useless IndexT, NameT\n return compute_generic_semivalues( # type: ignore\n sampler_t(u.data.indices, seed=seed),\n u,\n always_one_coefficient,\n done,\n marginal=RawUtility(),\n future_processor=MSRFutureProcessor(u),\n batch_size=batch_size,\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n config=config,\n progress=progress,\n )\n
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.compute_beta_shapley_semivalues","title":"compute_beta_shapley_semivalues","text":"compute_beta_shapley_semivalues(\n u: Utility,\n *,\n alpha: float = 1,\n beta: float = 1,\n done: StoppingCriterion,\n sampler_t: Type[StochasticSampler] = PermutationSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Computes Beta Shapley values for a given utility function.
This is a convenience wrapper for compute_generic_semivalues with the Beta Shapley coefficient.
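A minimal usage sketch, assuming a Utility object u (the Beta parameters and the stopping criterion are placeholders):
from pydvl.value import AbsoluteStandardError, MaxUpdates\nfrom pydvl.value.semivalues import compute_beta_shapley_semivalues\n\nu = ...  # Utility object with model, data and scorer\nvalues = compute_beta_shapley_semivalues(\n    u,\n    alpha=1,\n    beta=16,  # parameters of the Beta distribution weighting subset sizes\n    done=AbsoluteStandardError(threshold=1e-3, burn_in=32) | MaxUpdates(1000),\n    n_jobs=4,\n)\n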
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
alpha
Alpha parameter of the Beta distribution.
TYPE: float
DEFAULT: 1
beta
Beta parameter of the Beta distribution.
TYPE: float
DEFAULT: 1
done
Stopping criterion.
TYPE: StoppingCriterion
sampler_t
The sampler type to use. See the sampler module for a list.
TYPE: Type[StochasticSampler]
DEFAULT: PermutationSampler
batch_size
Number of marginal evaluations per (parallelized) task.
TYPE: int
DEFAULT: 1
n_jobs
Number of parallel jobs to use.
TYPE: int
DEFAULT: 1
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display a progress bar.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION ValuationResult
Object with the results.
Deprecation notice
Parameter batch_size
is for experimental use and will be removed in future versions.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/semivalues.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef compute_beta_shapley_semivalues(\n u: Utility,\n *,\n alpha: float = 1,\n beta: float = 1,\n done: StoppingCriterion,\n sampler_t: Type[StochasticSampler] = PermutationSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> ValuationResult:\n \"\"\"Computes Beta Shapley values for a given utility function.\n\n This is a convenience wrapper for\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues]\n with the Beta Shapley coefficient.\n\n Args:\n u: Utility object with model, data, and scoring function.\n alpha: Alpha parameter of the Beta distribution.\n beta: Beta parameter of the Beta distribution.\n done: Stopping criterion.\n sampler_t: The sampler type to use. See the\n [sampler][pydvl.value.sampler] module for a list.\n batch_size: Number of marginal evaluations per (parallelized) task.\n n_jobs: Number of parallel jobs to use.\n seed: Either an instance of a numpy random number generator or a seed for it.\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display a progress bar.\n\n Returns:\n Object with the results.\n\n !!! warning \"Deprecation notice\"\n Parameter `batch_size` is for experimental use and will be removed in\n future versions.\n\n !!! tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n # HACK: cannot infer return type because of useless IndexT, NameT\n return compute_generic_semivalues( # type: ignore\n sampler_t(u.data.indices, seed=seed),\n u,\n beta_coefficient(alpha, beta),\n done,\n batch_size=batch_size,\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n config=config,\n progress=progress,\n )\n
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.compute_semivalues","title":"compute_semivalues","text":"compute_semivalues(\n u: Utility,\n *,\n done: StoppingCriterion,\n mode: SemiValueMode = SemiValueMode.Shapley,\n sampler_t: Type[StochasticSampler] = PermutationSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n seed: Optional[Seed] = None,\n **kwargs\n) -> ValuationResult\n
Convenience entry point for most common semi-value computations.
Deprecation warning
This method is deprecated and will be replaced in 0.8.0 by the more general implementation of compute_generic_semivalues. Use compute_shapley_semivalues, compute_banzhaf_semivalues, or compute_beta_shapley_semivalues instead.
The modes supported with this interface are the following. For greater flexibility use compute_generic_semivalues directly.
- SemiValueMode.Shapley: Shapley values.
- SemiValueMode.BetaShapley: Implements the Beta Shapley semi-value as introduced in (Kwon and Zou, 2022)1. Pass additional keyword arguments alpha and beta to set the parameters of the Beta distribution (both default to 1).
- SemiValueMode.Banzhaf: Implements the Banzhaf semi-value as introduced in (Wang and Jia, 2022)1.
See Data valuation for an overview of valuation.
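Although the dedicated functions above are preferred, a call through this interface looks roughly as follows (sketch; u and all argument values are placeholders):
from pydvl.value import MaxUpdates\nfrom pydvl.value.semivalues import SemiValueMode, compute_semivalues\n\nu = ...  # Utility object with model, data and scorer\nvalues = compute_semivalues(\n    u,\n    done=MaxUpdates(500),\n    mode=SemiValueMode.BetaShapley,\n    alpha=1,\n    beta=4,  # forwarded to the Beta Shapley coefficient\n)\n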
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
done
Stopping criterion.
TYPE: StoppingCriterion
mode
The semi-value mode to use. See SemiValueMode for a list.
TYPE: SemiValueMode
DEFAULT: Shapley
sampler_t
The sampler type to use. See sampler for a list.
TYPE: Type[StochasticSampler]
DEFAULT: PermutationSampler
batch_size
Number of marginal evaluations per (parallelized) task.
TYPE: int
DEFAULT: 1
n_jobs
Number of parallel jobs to use.
TYPE: int
DEFAULT: 1
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
kwargs
Additional keyword arguments passed to compute_generic_semivalues.
DEFAULT: {}
RETURNS DESCRIPTION ValuationResult
Object with the results.
Deprecation notice
Parameter batch_size
is for experimental use and will be removed in future versions.
Source code in src/pydvl/value/semivalues.py
@deprecated(target=True, deprecated_in=\"0.7.0\", remove_in=\"0.8.0\")\ndef compute_semivalues(\n u: Utility,\n *,\n done: StoppingCriterion,\n mode: SemiValueMode = SemiValueMode.Shapley,\n sampler_t: Type[StochasticSampler] = PermutationSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n seed: Optional[Seed] = None,\n **kwargs,\n) -> ValuationResult:\n \"\"\"Convenience entry point for most common semi-value computations.\n\n !!! warning \"Deprecation warning\"\n This method is deprecated and will be replaced in 0.8.0 by the more\n general implementation of\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues].\n Use\n [compute_shapley_semivalues][pydvl.value.semivalues.compute_shapley_semivalues],\n [compute_banzhaf_semivalues][pydvl.value.semivalues.compute_banzhaf_semivalues],\n or\n [compute_beta_shapley_semivalues][pydvl.value.semivalues.compute_beta_shapley_semivalues]\n instead.\n\n The modes supported with this interface are the following. For greater\n flexibility use\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues]\n directly.\n\n - [SemiValueMode.Shapley][pydvl.value.semivalues.SemiValueMode]:\n Shapley values.\n - [SemiValueMode.BetaShapley][pydvl.value.semivalues.SemiValueMode]:\n Implements the Beta Shapley semi-value as introduced in\n (Kwon and Zou, 2022)<sup><a href=\"#kwon_beta_2022\">1</a></sup>.\n Pass additional keyword arguments `alpha` and `beta` to set the\n parameters of the Beta distribution (both default to 1).\n - [SemiValueMode.Banzhaf][pydvl.value.semivalues.SemiValueMode]: Implements\n the Banzhaf semi-value as introduced in (Wang and Jia, 2022)<sup><a\n href=\"#wang_data_2023\">1</a></sup>.\n\n See [Data valuation][data-valuation] for an overview of valuation.\n\n Args:\n u: Utility object with model, data, and scoring function.\n done: Stopping criterion.\n mode: The semi-value mode to use. See\n [SemiValueMode][pydvl.value.semivalues.SemiValueMode] for a list.\n sampler_t: The sampler type to use. See [sampler][pydvl.value.sampler]\n for a list.\n batch_size: Number of marginal evaluations per (parallelized) task.\n n_jobs: Number of parallel jobs to use.\n seed: Either an instance of a numpy random number generator or a seed for it.\n kwargs: Additional keyword arguments passed to\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues].\n\n Returns:\n Object with the results.\n\n !!! warning \"Deprecation notice\"\n Parameter `batch_size` is for experimental use and will be removed in\n future versions.\n \"\"\"\n if mode == SemiValueMode.Shapley:\n coefficient = shapley_coefficient\n elif mode == SemiValueMode.BetaShapley:\n alpha = kwargs.pop(\"alpha\", 1)\n beta = kwargs.pop(\"beta\", 1)\n coefficient = beta_coefficient(alpha, beta)\n elif mode == SemiValueMode.Banzhaf:\n coefficient = banzhaf_coefficient\n else:\n raise ValueError(f\"Unknown mode {mode}\")\n coefficient = cast(SVCoefficient, coefficient)\n\n # HACK: cannot infer return type because of useless IndexT, NameT\n return compute_generic_semivalues( # type: ignore\n sampler_t(u.data.indices, seed=seed),\n u,\n coefficient,\n done,\n n_jobs=n_jobs,\n batch_size=batch_size,\n **kwargs,\n )\n
"},{"location":"api/pydvl/value/stopping/","title":"Stopping","text":""},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping","title":"pydvl.value.stopping","text":"Stopping criteria for value computations.
This module provides a basic set of stopping criteria, like MaxUpdates, MaxTime, or HistoryDeviation among others. These can behave in different ways depending on the context. For example, MaxUpdates limits the number of updates to values, which depending on the algorithm may mean a different number of utility evaluations or imply other computations like solving a linear or quadratic program.
Stopping criteria are callables that are evaluated on a ValuationResult and return a Status object. They can be combined using boolean operators.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping--how-convergence-is-determined","title":"How convergence is determined","text":"Most stopping criteria keep track of the convergence of each index separately but make global decisions based on the overall convergence of some fraction of all indices. For example, if we have a stopping criterion that checks whether the standard error of 90% of values is below a threshold, then methods will keep updating all indices until 90% of them have converged, irrespective of the quality of the individual estimates, and without freezing updates for indices along the way as values individually attain low standard error.
This has some practical implications, because some values do tend to converge sooner than others. For example, assume we use the criterion AbsoluteStandardError(0.02) | MaxUpdates(1000)
. Then values close to 0 might be marked as \"converged\" rather quickly because they fulfill the first criterion, say after 20 iterations, despite being poor estimates. Because other indices take much longer to have low standard error and the criterion is a global check, the \"converged\" ones keep being updated and end up being good estimates. In this case, this has been beneficial, but one might not wish for converged values to be updated, if one is sure that the criterion is adequate for individual values.
Semi-value methods include a parameter skip_converged
that allows skipping the computation of values that have converged. The way to avoid doing this too early is to use a more stringent check, e.g. AbsoluteStandardError(1e-3) | MaxUpdates(1000)
. With skip_converged=True
this check can still take less time than the first one, despite requiring more iterations for some indices.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping--choosing-a-stopping-criterion","title":"Choosing a stopping criterion","text":"The choice of a stopping criterion greatly depends on the algorithm and the context. A safe bet is to combine a MaxUpdates or a MaxTime with a HistoryDeviation or an AbsoluteStandardError. The former will ensure that the computation does not run for too long, while the latter will try to achieve results that are stable enough. Note however that if the threshold is too strict, one will always end up running until a maximum number of iterations or time. Also keep in mind that different values converge at different times, so you might want to use tight thresholds and skip_converged
as described above for semi-values.
Example from pydvl.value import AbsoluteStandardError, MaxUpdates, compute_banzhaf_semivalues\n\nutility = ... # some utility object\ncriterion = AbsoluteStandardError(threshold=1e-3, burn_in=32) | MaxUpdates(1000)\nvalues = compute_banzhaf_semivalues(\n utility,\n criterion,\n skip_converged=True, # skip values that have converged (CAREFUL!)\n)\n
This will compute the Banzhaf semivalues for utility
until either the absolute standard error is below 1e-3
or 1000
updates have been performed. The burn_in
parameter is used to discard the first 32
updates from the computation of the standard error. The skip_converged
parameter is used to avoid computing more marginals for indices that have converged, which is useful if AbsoluteStandardError is met before MaxUpdates for some indices.
Warning
Be careful not to reuse the same stopping criterion for different computations. The object has state and will not be reset between calls to value computation methods. If you need to reuse the same criterion, you should create a new instance.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping--creating-stopping-criteria","title":"Creating stopping criteria","text":"The easiest way is to declare a function implementing the interface StoppingCriterionCallable and wrap it with make_criterion(). This creates a StoppingCriterion object that can be composed with other stopping criteria.
Alternatively, and in particular if reporting of completion is required, one can inherit from this class and implement the abstract methods _check
and completion.
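A minimal sketch of the first approach; the stopping condition itself is purely illustrative:
from pydvl.utils.status import Status\nfrom pydvl.value import MaxUpdates\nfrom pydvl.value.result import ValuationResult\nfrom pydvl.value.stopping import make_criterion\n\ndef non_negative(result: ValuationResult) -> Status:\n    # Illustrative condition: report convergence once no value estimate is negative.\n    return Status.Converged if (result.values >= 0).all() else Status.Pending\n\nNonNegative = make_criterion(non_negative, name=\"NonNegative\")\ndone = NonNegative() | MaxUpdates(1000)  # composable like any other criterion\n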
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping--combining-stopping-criteria","title":"Combining stopping criteria","text":"Objects of type StoppingCriterion can be combined with the binary operators &
(and), and |
(or), following the truth tables of Status. The unary operator ~
(not) is also supported. See StoppingCriterion for details on how these operations affect the behavior of the stopping criteria.
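For instance, with illustrative thresholds:
from pydvl.value.stopping import AbsoluteStandardError, MaxTime, MaxUpdates\n\n# Stop when 90% of the values have a standard error below 1e-3, or after\n# 5000 updates per index, or after one hour, whichever happens first.\ndone = AbsoluteStandardError(1e-3, fraction=0.9) | MaxUpdates(5000) | MaxTime(3600)\n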
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping--references","title":"References","text":" -
Ghorbani, A., Zou, J., 2019. Data Shapley: Equitable Valuation of Data for Machine Learning. In: Proceedings of the 36th International Conference on Machine Learning, PMLR, pp. 2242\u20132251.
-
Wang, J.T. and Jia, R., 2023. Data Banzhaf: A Robust Data Valuation Framework for Machine Learning. In: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics, pp. 6388-6421.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.StoppingCriterionCallable","title":"StoppingCriterionCallable","text":" Bases: Protocol
Signature for a stopping criterion
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.StoppingCriterion","title":"StoppingCriterion","text":"StoppingCriterion(modify_result: bool = True)\n
Bases: ABC
A composable callable object to determine whether a computation must stop.
A StoppingCriterion
is a callable taking a ValuationResult and returning a Status. It also keeps track of individual convergence of values with converged, and reports the overall completion of the computation with completion.
Instances of StoppingCriterion
can be composed with the binary operators &
(and), and |
(or), following the truth tables of Status. The unary operator ~
(not) is also supported. These boolean operations act according to the following rules:
- The results of
check()
are combined with the operator. See Status for the truth tables. - The results of converged are combined with the operator (returning another boolean array).
- The completion method returns the min, max, or the complement to 1 of the completions of the operands, for AND, OR and NOT respectively. This is required for cases where one of the criteria does not keep track of the convergence of single values, e.g. MaxUpdates, because completion by default returns the mean of the boolean convergence array.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.StoppingCriterion--subclassing","title":"Subclassing","text":"Subclassing this class requires implementing a check()
method that returns a Status object based on a given ValuationResult. This method should update the attribute _converged
, which is a boolean array indicating whether the value for each index has converged. When this does not make sense for a particular stopping criterion, completion should be overridden to provide an overall completion value, since its default implementation attempts to compute the mean of _converged
.
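A minimal sketch of such a subclass (illustrative only; it essentially mimics MinUpdates):
from pydvl.utils.status import Status\nfrom pydvl.value.result import ValuationResult\nfrom pydvl.value.stopping import StoppingCriterion\n\nclass AllSeenTwice(StoppingCriterion):\n    # Illustrative criterion: converged once every index was updated at least twice.\n    def _check(self, result: ValuationResult) -> Status:\n        self._converged = result.counts >= 2  # per-index convergence flags\n        return Status.Converged if self._converged.all() else Status.Pending\n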
PARAMETER DESCRIPTION modify_result
If True
the status of the input ValuationResult is modified in place after the call.
TYPE: bool
DEFAULT: True
Source code in src/pydvl/value/stopping.py
def __init__(self, modify_result: bool = True):\n self.modify_result = modify_result\n self._converged = np.full(0, False)\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.StoppingCriterion.converged","title":"converged property
","text":"converged: NDArray[bool_]\n
Returns a boolean array indicating whether the values have converged for each data point.
Inheriting classes must set the _converged
attribute in their check()
.
RETURNS DESCRIPTION NDArray[bool_]
A boolean array indicating whether the values have converged for
NDArray[bool_]
each data point.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.StoppingCriterion.completion","title":"completion","text":"completion() -> float\n
Returns a value between 0 and 1 indicating the completion of the computation.
Source code in src/pydvl/value/stopping.py
def completion(self) -> float:\n \"\"\"Returns a value between 0 and 1 indicating the completion of the\n computation.\n \"\"\"\n if self.converged.size == 0:\n return 0.0\n return float(np.mean(self.converged).item())\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.StoppingCriterion.__call__","title":"__call__","text":"__call__(result: ValuationResult) -> Status\n
Calls check()
, maybe updating the result.
Source code in src/pydvl/value/stopping.py
def __call__(self, result: ValuationResult) -> Status:\n \"\"\"Calls `check()`, maybe updating the result.\"\"\"\n if len(result) == 0:\n logger.warning(\n \"At least one iteration finished but no results where generated. \"\n \"Please check that your scorer and utility return valid numbers.\"\n )\n status = self._check(result)\n if self.modify_result: # FIXME: this is not nice\n result._status = status\n return status\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.AbsoluteStandardError","title":"AbsoluteStandardError","text":"AbsoluteStandardError(\n threshold: float,\n fraction: float = 1.0,\n burn_in: int = 4,\n modify_result: bool = True,\n)\n
Bases: StoppingCriterion
Determine convergence based on the standard error of the values.
If \\(s_i\\) is the standard error for datum \\(i\\), then this criterion returns Converged if \\(s_i < \\epsilon\\) for all \\(i\\) and a threshold value \\(\\epsilon \\gt 0\\).
PARAMETER DESCRIPTION threshold
A value is considered to have converged if the standard error is below this threshold. A way of choosing it is to pick some percentage of the range of the values. For Shapley values this is the difference between the maximum and minimum of the utility function (to see this substitute the maximum and minimum values of the utility into the marginal contribution formula).
TYPE: float
fraction
The fraction of values that must have converged for the criterion to return Converged.
TYPE: float
DEFAULT: 1.0
burn_in
The number of iterations to ignore before checking for convergence. This is required because computations typically start with zero variance, as a result of using zeros(). The default is set to an arbitrary minimum which is usually enough but may need to be increased.
TYPE: int
DEFAULT: 4
Source code in src/pydvl/value/stopping.py
def __init__(\n self,\n threshold: float,\n fraction: float = 1.0,\n burn_in: int = 4,\n modify_result: bool = True,\n):\n super().__init__(modify_result=modify_result)\n self.threshold = threshold\n self.fraction = fraction\n self.burn_in = burn_in\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.AbsoluteStandardError.converged","title":"converged property
","text":"converged: NDArray[bool_]\n
Returns a boolean array indicating whether the values have converged for each data point.
Inheriting classes must set the _converged
attribute in their check()
.
RETURNS DESCRIPTION NDArray[bool_]
A boolean array indicating whether the values have converged for
NDArray[bool_]
each data point.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.AbsoluteStandardError.completion","title":"completion","text":"completion() -> float\n
Returns a value between 0 and 1 indicating the completion of the computation.
Source code in src/pydvl/value/stopping.py
def completion(self) -> float:\n \"\"\"Returns a value between 0 and 1 indicating the completion of the\n computation.\n \"\"\"\n if self.converged.size == 0:\n return 0.0\n return float(np.mean(self.converged).item())\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.AbsoluteStandardError.__call__","title":"__call__","text":"__call__(result: ValuationResult) -> Status\n
Calls check()
, maybe updating the result.
Source code in src/pydvl/value/stopping.py
def __call__(self, result: ValuationResult) -> Status:\n \"\"\"Calls `check()`, maybe updating the result.\"\"\"\n if len(result) == 0:\n logger.warning(\n \"At least one iteration finished but no results where generated. \"\n \"Please check that your scorer and utility return valid numbers.\"\n )\n status = self._check(result)\n if self.modify_result: # FIXME: this is not nice\n result._status = status\n return status\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxChecks","title":"MaxChecks","text":"MaxChecks(n_checks: Optional[int], modify_result: bool = True)\n
Bases: StoppingCriterion
Terminate as soon as the number of checks exceeds the threshold.
A \"check\" is one call to the criterion.
PARAMETER DESCRIPTION n_checks
Threshold: if None
, no _check is performed, effectively creating a (never) stopping criterion that always returns Pending
.
TYPE: Optional[int]
Source code in src/pydvl/value/stopping.py
def __init__(self, n_checks: Optional[int], modify_result: bool = True):\n super().__init__(modify_result=modify_result)\n if n_checks is not None and n_checks < 1:\n raise ValueError(\"n_iterations must be at least 1 or None\")\n self.n_checks = n_checks\n self._count = 0\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxChecks.converged","title":"converged property
","text":"converged: NDArray[bool_]\n
Returns a boolean array indicating whether the values have converged for each data point.
Inheriting classes must set the _converged
attribute in their check()
.
RETURNS DESCRIPTION NDArray[bool_]
A boolean array indicating whether the values have converged for
NDArray[bool_]
each data point.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxChecks.__call__","title":"__call__","text":"__call__(result: ValuationResult) -> Status\n
Calls check()
, maybe updating the result.
Source code in src/pydvl/value/stopping.py
def __call__(self, result: ValuationResult) -> Status:\n \"\"\"Calls `check()`, maybe updating the result.\"\"\"\n if len(result) == 0:\n logger.warning(\n \"At least one iteration finished but no results where generated. \"\n \"Please check that your scorer and utility return valid numbers.\"\n )\n status = self._check(result)\n if self.modify_result: # FIXME: this is not nice\n result._status = status\n return status\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxUpdates","title":"MaxUpdates","text":"MaxUpdates(n_updates: Optional[int], modify_result: bool = True)\n
Bases: StoppingCriterion
Terminate if any number of value updates exceeds or equals the given threshold.
Note
If you want to ensure that all values have been updated, you probably want MinUpdates instead.
This checks the counts
field of a ValuationResult, i.e. the number of times that each index has been updated. For powerset samplers, the maximum of this number coincides with the maximum number of subsets sampled. For permutation samplers, it coincides with the number of permutations sampled.
PARAMETER DESCRIPTION n_updates
Threshold: if None
, no _check is performed, effectively creating a (never) stopping criterion that always returns Pending
.
TYPE: Optional[int]
Source code in src/pydvl/value/stopping.py
def __init__(self, n_updates: Optional[int], modify_result: bool = True):\n super().__init__(modify_result=modify_result)\n if n_updates is not None and n_updates < 1:\n raise ValueError(\"n_updates must be at least 1 or None\")\n self.n_updates = n_updates\n self.last_max = 0\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxUpdates.converged","title":"converged property
","text":"converged: NDArray[bool_]\n
Returns a boolean array indicating whether the values have converged for each data point.
Inheriting classes must set the _converged
attribute in their check()
.
RETURNS DESCRIPTION NDArray[bool_]
A boolean array indicating whether the values have converged for
NDArray[bool_]
each data point.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxUpdates.__call__","title":"__call__","text":"__call__(result: ValuationResult) -> Status\n
Calls check()
, maybe updating the result.
Source code in src/pydvl/value/stopping.py
def __call__(self, result: ValuationResult) -> Status:\n \"\"\"Calls `check()`, maybe updating the result.\"\"\"\n if len(result) == 0:\n logger.warning(\n \"At least one iteration finished but no results where generated. \"\n \"Please check that your scorer and utility return valid numbers.\"\n )\n status = self._check(result)\n if self.modify_result: # FIXME: this is not nice\n result._status = status\n return status\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MinUpdates","title":"MinUpdates","text":"MinUpdates(n_updates: Optional[int], modify_result: bool = True)\n
Bases: StoppingCriterion
Terminate as soon as all value updates exceed or equal the given threshold.
This checks the counts
field of a ValuationResult, i.e. the number of times that each index has been updated. For powerset samplers, the minimum of this number is a lower bound for the number of subsets sampled. For permutation samplers, it lower-bounds the amount of permutations sampled.
PARAMETER DESCRIPTION n_updates
Threshold: if None
, no _check is performed, effectively creating a (never) stopping criterion that always returns Pending
.
TYPE: Optional[int]
Source code in src/pydvl/value/stopping.py
def __init__(self, n_updates: Optional[int], modify_result: bool = True):\n super().__init__(modify_result=modify_result)\n self.n_updates = n_updates\n self.last_min = 0\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MinUpdates.converged","title":"converged property
","text":"converged: NDArray[bool_]\n
Returns a boolean array indicating whether the values have converged for each data point.
Inheriting classes must set the _converged
attribute in their check()
.
RETURNS DESCRIPTION NDArray[bool_]
A boolean array indicating whether the values have converged for
NDArray[bool_]
each data point.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MinUpdates.__call__","title":"__call__","text":"__call__(result: ValuationResult) -> Status\n
Calls check()
, maybe updating the result.
Source code in src/pydvl/value/stopping.py
def __call__(self, result: ValuationResult) -> Status:\n \"\"\"Calls `check()`, maybe updating the result.\"\"\"\n if len(result) == 0:\n logger.warning(\n \"At least one iteration finished but no results where generated. \"\n \"Please check that your scorer and utility return valid numbers.\"\n )\n status = self._check(result)\n if self.modify_result: # FIXME: this is not nice\n result._status = status\n return status\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxTime","title":"MaxTime","text":"MaxTime(seconds: Optional[float], modify_result: bool = True)\n
Bases: StoppingCriterion
Terminate if the computation time exceeds the given number of seconds.
Checks the elapsed time since construction
PARAMETER DESCRIPTION seconds
Threshold: The computation is terminated if the elapsed time between object construction and a _check exceeds this value. If None
, no _check is performed, effectively creating a (never) stopping criterion that always returns Pending
.
TYPE: Optional[float]
Source code in src/pydvl/value/stopping.py
def __init__(self, seconds: Optional[float], modify_result: bool = True):\n super().__init__(modify_result=modify_result)\n self.max_seconds = seconds or np.inf\n if self.max_seconds <= 0:\n raise ValueError(\"Number of seconds for MaxTime must be positive or None\")\n self.start = time()\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxTime.converged","title":"converged property
","text":"converged: NDArray[bool_]\n
Returns a boolean array indicating whether the values have converged for each data point.
Inheriting classes must set the _converged
attribute in their check()
.
RETURNS DESCRIPTION NDArray[bool_]
A boolean array indicating whether the values have converged for
NDArray[bool_]
each data point.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxTime.__call__","title":"__call__","text":"__call__(result: ValuationResult) -> Status\n
Calls check()
, maybe updating the result.
Source code in src/pydvl/value/stopping.py
def __call__(self, result: ValuationResult) -> Status:\n \"\"\"Calls `check()`, maybe updating the result.\"\"\"\n if len(result) == 0:\n logger.warning(\n \"At least one iteration finished but no results where generated. \"\n \"Please check that your scorer and utility return valid numbers.\"\n )\n status = self._check(result)\n if self.modify_result: # FIXME: this is not nice\n result._status = status\n return status\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.HistoryDeviation","title":"HistoryDeviation","text":"HistoryDeviation(\n n_steps: int,\n rtol: float,\n pin_converged: bool = True,\n modify_result: bool = True,\n)\n
Bases: StoppingCriterion
A simple check for relative distance to a previous step in the computation.
The method used by (Ghorbani and Zou, 2019)1 computes the relative distances between the current values \\(v_i^t\\) and the values at the previous checkpoint \\(v_i^{t-\\tau}\\). If the sum is below a given threshold, the computation is terminated.
\\[\\sum_{i=1}^n \\frac{\\left| v_i^t - v_i^{t-\\tau} \\right|}{v_i^t} < \\epsilon.\\] When the denominator is zero, the summand is set to the value of \\(v_i^{ t-\\tau}\\).
This implementation is slightly generalised to allow for different number of updates to individual indices, as happens with powerset samplers instead of permutations. Every subset of indices that is found to converge can be pinned to that state. Once all indices have converged the method has converged.
Warning
This criterion is meant for the reproduction of the results in the paper, but we do not recommend using it in practice.
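If it is nevertheless needed, e.g. to reproduce that setup, an illustrative configuration could be:
from pydvl.value.stopping import HistoryDeviation, MaxUpdates\n\n# Compare against a checkpoint taken 100 updates earlier and stop when the\n# summed relative change falls below 5% (illustrative values).\ndone = HistoryDeviation(n_steps=100, rtol=0.05) | MaxUpdates(5000)\n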
PARAMETER DESCRIPTION n_steps
Checkpoint values every so many updates and use these saved values to compare.
TYPE: int
rtol
Relative tolerance for convergence (\\(\\epsilon\\) in the formula).
TYPE: float
pin_converged
If True
, once an index has converged, it is pinned to that state for the rest of the computation
TYPE: bool
DEFAULT: True
Source code in src/pydvl/value/stopping.py
def __init__(\n self,\n n_steps: int,\n rtol: float,\n pin_converged: bool = True,\n modify_result: bool = True,\n):\n super().__init__(modify_result=modify_result)\n if n_steps < 1:\n raise ValueError(\"n_steps must be at least 1\")\n if rtol <= 0 or rtol >= 1:\n raise ValueError(\"rtol must be in (0, 1)\")\n\n self.n_steps = n_steps\n self.rtol = rtol\n self.update_op = np.logical_or if pin_converged else np.logical_and\n self._memory = None # type: ignore\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.HistoryDeviation.converged","title":"converged property
","text":"converged: NDArray[bool_]\n
Returns a boolean array indicating whether the values have converged for each data point.
Inheriting classes must set the _converged
attribute in their check()
.
RETURNS DESCRIPTION NDArray[bool_]
A boolean array indicating whether the values have converged for
NDArray[bool_]
each data point.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.HistoryDeviation.completion","title":"completion","text":"completion() -> float\n
Returns a value between 0 and 1 indicating the completion of the computation.
Source code in src/pydvl/value/stopping.py
def completion(self) -> float:\n \"\"\"Returns a value between 0 and 1 indicating the completion of the\n computation.\n \"\"\"\n if self.converged.size == 0:\n return 0.0\n return float(np.mean(self.converged).item())\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.HistoryDeviation.__call__","title":"__call__","text":"__call__(result: ValuationResult) -> Status\n
Calls check()
, maybe updating the result.
Source code in src/pydvl/value/stopping.py
def __call__(self, result: ValuationResult) -> Status:\n \"\"\"Calls `check()`, maybe updating the result.\"\"\"\n if len(result) == 0:\n logger.warning(\n \"At least one iteration finished but no results where generated. \"\n \"Please check that your scorer and utility return valid numbers.\"\n )\n status = self._check(result)\n if self.modify_result: # FIXME: this is not nice\n result._status = status\n return status\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.RankCorrelation","title":"RankCorrelation","text":"RankCorrelation(rtol: float, burn_in: int, modify_result: bool = True)\n
Bases: StoppingCriterion
A check for stability of Spearman correlation between checks.
When the change in rank correlation between two successive iterations is below a given threshold, the computation is terminated. The criterion computes the Spearman correlation between two successive iterations. The Spearman correlation uses the ordering indices of the given values and correlates them, which means it focuses on the order of the elements instead of their exact values. If the order stops changing (meaning the Banzhaf semivalue estimates converge), the criterion stops the algorithm.
This criterion is used in (Wang et al.)2.
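An illustrative configuration (tolerance and burn-in are placeholders):
from pydvl.value.stopping import MaxUpdates, RankCorrelation\n\n# Stop once the ranking of the values stops changing between checks, but\n# never exceed 10000 updates per index.\ndone = RankCorrelation(rtol=1e-4, burn_in=64) | MaxUpdates(10000)\n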
PARAMETER DESCRIPTION rtol
Relative tolerance for convergence (\\(\\epsilon\\) in the formula)
TYPE: float
modify_result
If True
, the status of the input ValuationResult is modified in place after the call.
TYPE: bool
DEFAULT: True
burn_in
The minimum number of iterations before checking for convergence. This is required because the first correlation is meaningless.
TYPE: int
Added in 0.9.0
Source code in src/pydvl/value/stopping.py
def __init__(\n self,\n rtol: float,\n burn_in: int,\n modify_result: bool = True,\n):\n super().__init__(modify_result=modify_result)\n if rtol <= 0 or rtol >= 1:\n raise ValueError(\"rtol must be in (0, 1)\")\n self.rtol = rtol\n self.burn_in = burn_in\n self._memory: NDArray[np.float_] | None = None\n self._corr = 0.0\n self._completion = 0.0\n self._iterations = 0\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.RankCorrelation.converged","title":"converged property
","text":"converged: NDArray[bool_]\n
Returns a boolean array indicating whether the values have converged for each data point.
Inheriting classes must set the _converged
attribute in their check()
.
RETURNS DESCRIPTION NDArray[bool_]
A boolean array indicating whether the values have converged for
NDArray[bool_]
each data point.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.RankCorrelation.__call__","title":"__call__","text":"__call__(result: ValuationResult) -> Status\n
Calls check()
, maybe updating the result.
Source code in src/pydvl/value/stopping.py
def __call__(self, result: ValuationResult) -> Status:\n \"\"\"Calls `check()`, maybe updating the result.\"\"\"\n if len(result) == 0:\n logger.warning(\n \"At least one iteration finished but no results where generated. \"\n \"Please check that your scorer and utility return valid numbers.\"\n )\n status = self._check(result)\n if self.modify_result: # FIXME: this is not nice\n result._status = status\n return status\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.make_criterion","title":"make_criterion","text":"make_criterion(\n fun: StoppingCriterionCallable,\n converged: Callable[[], NDArray[bool_]] | None = None,\n completion: Callable[[], float] | None = None,\n name: str | None = None,\n) -> Type[StoppingCriterion]\n
Create a new StoppingCriterion from a function. Use this to enable simpler functions to be composed with bitwise operators
PARAMETER DESCRIPTION fun
The callable to wrap.
TYPE: StoppingCriterionCallable
converged
A callable that returns a boolean array indicating what values have converged.
TYPE: Callable[[], NDArray[bool_]] | None
DEFAULT: None
completion
A callable that returns a value between 0 and 1 indicating the rate of completion of the computation. If not provided, the fraction of converged values is used.
TYPE: Callable[[], float] | None
DEFAULT: None
name
The name of the new criterion. If None
, the __name__
of the function is used.
TYPE: str | None
DEFAULT: None
RETURNS DESCRIPTION Type[StoppingCriterion]
A new subclass of StoppingCriterion.
Source code in src/pydvl/value/stopping.py
def make_criterion(\n fun: StoppingCriterionCallable,\n converged: Callable[[], NDArray[np.bool_]] | None = None,\n completion: Callable[[], float] | None = None,\n name: str | None = None,\n) -> Type[StoppingCriterion]:\n \"\"\"Create a new [StoppingCriterion][pydvl.value.stopping.StoppingCriterion] from a function.\n Use this to enable simpler functions to be composed with bitwise operators\n\n Args:\n fun: The callable to wrap.\n converged: A callable that returns a boolean array indicating what\n values have converged.\n completion: A callable that returns a value between 0 and 1 indicating\n the rate of completion of the computation. If not provided, the fraction\n of converged values is used.\n name: The name of the new criterion. If `None`, the `__name__` of\n the function is used.\n\n Returns:\n A new subclass of [StoppingCriterion][pydvl.value.stopping.StoppingCriterion].\n \"\"\"\n\n class WrappedCriterion(StoppingCriterion):\n def __init__(self, modify_result: bool = True):\n super().__init__(modify_result=modify_result)\n self._name = name or getattr(fun, \"__name__\", \"WrappedCriterion\")\n\n def _check(self, result: ValuationResult) -> Status:\n return fun(result)\n\n @property\n def converged(self) -> NDArray[np.bool_]:\n if converged is None:\n return super().converged\n return converged()\n\n def __str__(self):\n return self._name\n\n def completion(self) -> float:\n if completion is None:\n return super().completion()\n return completion()\n\n return WrappedCriterion\n
"},{"location":"api/pydvl/value/least_core/","title":"Least core","text":""},{"location":"api/pydvl/value/least_core/#pydvl.value.least_core","title":"pydvl.value.least_core","text":"New in version 0.4.0
This package holds all routines for the computation of Least Core data values.
Please refer to Data valuation for an overview.
In addition to the standard interface via compute_least_core_values(), because computing the Least Core values requires the solution of a linear and a quadratic problem after computing all the utility values, there is the possibility of performing each step separately. This is useful when running multiple experiments: use lc_prepare_problem() or mclc_prepare_problem() to prepare a list of problems to solve, then solve them in parallel with lc_solve_problems().
Note that mclc_prepare_problem() is parallelized itself, so preparing the problems should be done in sequence in this case. The solution of the linear systems can then be done in parallel.
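A sketch of this workflow (the number of problems, iterations and jobs are placeholders):
from pydvl.value.least_core.common import lc_solve_problems\nfrom pydvl.value.least_core.montecarlo import mclc_prepare_problem\n\nu = ...  # Utility object with model, data and scorer\n# Prepare the problems one after the other (mclc_prepare_problem is itself parallel)...\nproblems = [mclc_prepare_problem(u, 5000, n_jobs=8) for _ in range(10)]\n# ...then solve the resulting linear and quadratic programs in parallel.\nresults = lc_solve_problems(problems, u=u, algorithm=\"montecarlo_least_core\", n_jobs=10)\n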
"},{"location":"api/pydvl/value/least_core/#pydvl.value.least_core.LeastCoreMode","title":"LeastCoreMode","text":" Bases: Enum
Available Least Core algorithms.
"},{"location":"api/pydvl/value/least_core/#pydvl.value.least_core.compute_least_core_values","title":"compute_least_core_values","text":"compute_least_core_values(\n u: Utility,\n *,\n n_jobs: int = 1,\n n_iterations: Optional[int] = None,\n mode: LeastCoreMode = LeastCoreMode.MonteCarlo,\n non_negative_subsidy: bool = False,\n solver_options: Optional[dict] = None,\n progress: bool = False,\n **kwargs\n) -> ValuationResult\n
Umbrella method to compute Least Core values with any of the available algorithms.
See Data valuation for an overview.
The following algorithms are available. Note that the exact method can only work with very small datasets and is thus intended only for testing.
- exact: uses the complete powerset of the training set for the constraints. Implemented in exact_least_core().
- montecarlo: uses the approximate Monte Carlo Least Core algorithm. Implemented in montecarlo_least_core().
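A minimal usage sketch, assuming a Utility object u (the iteration budget is a placeholder):
from pydvl.value.least_core import LeastCoreMode, compute_least_core_values\n\nu = ...  # Utility object with model, data and scorer\nvalues = compute_least_core_values(\n    u,\n    mode=LeastCoreMode.MonteCarlo,\n    n_iterations=5000,  # number of sampled subsets\n    n_jobs=8,\n    non_negative_subsidy=True,\n)\n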
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
n_jobs
Number of jobs to run in parallel. Only used for Monte Carlo Least Core.
TYPE: int
DEFAULT: 1
n_iterations
Number of subsets to sample and evaluate the utility on. Only used for Monte Carlo Least Core.
TYPE: Optional[int]
DEFAULT: None
mode
Algorithm to use. See LeastCoreMode for available options.
TYPE: LeastCoreMode
DEFAULT: MonteCarlo
non_negative_subsidy
If True, the least core subsidy \\(e\\) is constrained to be non-negative.
TYPE: bool
DEFAULT: False
solver_options
Optional dictionary of options passed to the solvers.
TYPE: Optional[dict]
DEFAULT: None
RETURNS DESCRIPTION ValuationResult
Object with the computed values.
New in version 0.5.0
Source code in src/pydvl/value/least_core/__init__.py
def compute_least_core_values(\n u: Utility,\n *,\n n_jobs: int = 1,\n n_iterations: Optional[int] = None,\n mode: LeastCoreMode = LeastCoreMode.MonteCarlo,\n non_negative_subsidy: bool = False,\n solver_options: Optional[dict] = None,\n progress: bool = False,\n **kwargs,\n) -> ValuationResult:\n \"\"\"Umbrella method to compute Least Core values with any of the available\n algorithms.\n\n See [Data valuation][data-valuation] for an overview.\n\n The following algorithms are available. Note that the exact method can only\n work with very small datasets and is thus intended only for testing.\n\n - `exact`: uses the complete powerset of the training set for the constraints\n [combinatorial_exact_shapley()][pydvl.value.shapley.naive.combinatorial_exact_shapley].\n - `montecarlo`: uses the approximate Monte Carlo Least Core algorithm.\n Implemented in [montecarlo_least_core()][pydvl.value.least_core.montecarlo.montecarlo_least_core].\n\n Args:\n u: Utility object with model, data, and scoring function\n n_jobs: Number of jobs to run in parallel. Only used for Monte Carlo\n Least Core.\n n_iterations: Number of subsets to sample and evaluate the utility on.\n Only used for Monte Carlo Least Core.\n mode: Algorithm to use. See\n [LeastCoreMode][pydvl.value.least_core.LeastCoreMode] for available\n options.\n non_negative_subsidy: If True, the least core subsidy $e$ is constrained\n to be non-negative.\n solver_options: Optional dictionary of options passed to the solvers.\n\n Returns:\n Object with the computed values.\n\n !!! tip \"New in version 0.5.0\"\n \"\"\"\n\n if mode == LeastCoreMode.MonteCarlo:\n # TODO fix progress showing in remote case\n progress = False\n if n_iterations is None:\n raise ValueError(\"n_iterations cannot be None for Monte Carlo Least Core\")\n return montecarlo_least_core( # type: ignore\n u=u,\n n_iterations=n_iterations,\n n_jobs=n_jobs,\n progress=progress,\n non_negative_subsidy=non_negative_subsidy,\n solver_options=solver_options,\n **kwargs,\n )\n elif mode == LeastCoreMode.Exact:\n return exact_least_core(\n u=u,\n progress=progress,\n non_negative_subsidy=non_negative_subsidy,\n solver_options=solver_options,\n )\n\n raise ValueError(f\"Invalid value encountered in {mode=}\")\n
"},{"location":"api/pydvl/value/least_core/common/","title":"Common","text":""},{"location":"api/pydvl/value/least_core/common/#pydvl.value.least_core.common","title":"pydvl.value.least_core.common","text":""},{"location":"api/pydvl/value/least_core/common/#pydvl.value.least_core.common.lc_solve_problem","title":"lc_solve_problem","text":"lc_solve_problem(\n problem: LeastCoreProblem,\n *,\n u: Utility,\n algorithm: str,\n non_negative_subsidy: bool = False,\n solver_options: Optional[dict] = None\n) -> ValuationResult\n
Solves a linear problem as prepared by mclc_prepare_problem(). Useful for parallel execution of multiple experiments by running this as a remote task.
See exact_least_core() or montecarlo_least_core() for argument descriptions.
Source code in src/pydvl/value/least_core/common.py
def lc_solve_problem(\n problem: LeastCoreProblem,\n *,\n u: Utility,\n algorithm: str,\n non_negative_subsidy: bool = False,\n solver_options: Optional[dict] = None,\n) -> ValuationResult:\n \"\"\"Solves a linear problem as prepared by\n [mclc_prepare_problem()][pydvl.value.least_core.montecarlo.mclc_prepare_problem].\n Useful for parallel execution of multiple experiments by running this as a\n remote task.\n\n See [exact_least_core()][pydvl.value.least_core.naive.exact_least_core] or\n [montecarlo_least_core()][pydvl.value.least_core.montecarlo.montecarlo_least_core] for\n argument descriptions.\n \"\"\"\n n = len(u.data)\n\n if np.any(np.isnan(problem.utility_values)):\n warnings.warn(\n f\"Calculation returned \"\n f\"{np.sum(np.isnan(problem.utility_values))} NaN \"\n f\"values out of {problem.utility_values.size}\",\n RuntimeWarning,\n )\n\n if solver_options is None:\n solver_options = {}\n\n if \"solver\" not in solver_options:\n solver_options[\"solver\"] = cp.SCS\n\n if \"max_iters\" not in solver_options and solver_options[\"solver\"] == cp.SCS:\n solver_options[\"max_iters\"] = 10000\n\n logger.debug(\"Removing possible duplicate values in lower bound array\")\n b_lb = problem.utility_values\n A_lb, unique_indices = np.unique(problem.A_lb, return_index=True, axis=0)\n b_lb = b_lb[unique_indices]\n\n logger.debug(\"Building equality constraint\")\n A_eq = np.ones((1, n))\n # We might have already computed the total utility one or more times.\n # This is the index of the row(s) in A_lb with all ones.\n total_utility_indices = np.where(A_lb.sum(axis=1) == n)[0]\n if len(total_utility_indices) == 0:\n b_eq = np.array([u(u.data.indices)])\n else:\n b_eq = b_lb[total_utility_indices]\n # Remove the row(s) corresponding to the total utility\n # from the lower bound constraints\n # because given the equality constraint\n # it is the same as using the constraint e >= 0\n # (i.e. setting non_negative_subsidy = True).\n mask: NDArray[np.bool_] = np.ones_like(b_lb, dtype=bool)\n mask[total_utility_indices] = False\n b_lb = b_lb[mask]\n A_lb = A_lb[mask]\n\n # Remove the row(s) corresponding to the empty subset\n # because, given u(\u2205) = (which is almost always the case,\n # it is the same as using the constraint e >= 0\n # (i.e. setting non_negative_subsidy = True).\n emptyset_utility_indices = np.where(A_lb.sum(axis=1) == 0)[0]\n if len(emptyset_utility_indices) > 0:\n mask = np.ones_like(b_lb, dtype=bool)\n mask[emptyset_utility_indices] = False\n b_lb = b_lb[mask]\n A_lb = A_lb[mask]\n\n _, subsidy = _solve_least_core_linear_program(\n A_eq=A_eq,\n b_eq=b_eq,\n A_lb=A_lb,\n b_lb=b_lb,\n non_negative_subsidy=non_negative_subsidy,\n solver_options=solver_options,\n )\n\n values: Optional[NDArray[np.float_]]\n\n if subsidy is None:\n logger.debug(\"No values were found\")\n status = Status.Failed\n values = np.empty(n)\n values[:] = np.nan\n subsidy = np.nan\n else:\n values = _solve_egalitarian_least_core_quadratic_program(\n subsidy,\n A_eq=A_eq,\n b_eq=b_eq,\n A_lb=A_lb,\n b_lb=b_lb,\n solver_options=solver_options,\n )\n\n if values is None:\n logger.debug(\"No values were found\")\n status = Status.Failed\n values = np.empty(n)\n values[:] = np.nan\n subsidy = np.nan\n else:\n status = Status.Converged\n\n return ValuationResult(\n algorithm=algorithm,\n status=status,\n values=values,\n subsidy=subsidy,\n stderr=None,\n data_names=u.data.data_names,\n )\n
"},{"location":"api/pydvl/value/least_core/common/#pydvl.value.least_core.common.lc_solve_problems","title":"lc_solve_problems","text":"lc_solve_problems(\n problems: Sequence[LeastCoreProblem],\n u: Utility,\n algorithm: str,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n n_jobs: int = 1,\n non_negative_subsidy: bool = True,\n solver_options: Optional[dict] = None,\n **options\n) -> List[ValuationResult]\n
Solves a list of linear problems in parallel.
PARAMETER DESCRIPTION u
Utility.
TYPE: Utility
problems
Least Core problems to solve, as returned by mclc_prepare_problem().
TYPE: Sequence[LeastCoreProblem]
algorithm
Name of the valuation algorithm.
TYPE: str
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
n_jobs
Number of parallel jobs to run.
TYPE: int
DEFAULT: 1
non_negative_subsidy
If True, the least core subsidy \\(e\\) is constrained to be non-negative.
TYPE: bool
DEFAULT: True
solver_options
Additional options to pass to the solver.
TYPE: Optional[dict]
DEFAULT: None
RETURNS DESCRIPTION List[ValuationResult]
List of solutions.
Source code in src/pydvl/value/least_core/common.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef lc_solve_problems(\n problems: Sequence[LeastCoreProblem],\n u: Utility,\n algorithm: str,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n n_jobs: int = 1,\n non_negative_subsidy: bool = True,\n solver_options: Optional[dict] = None,\n **options,\n) -> List[ValuationResult]:\n \"\"\"Solves a list of linear problems in parallel.\n\n Args:\n u: Utility.\n problems: Least Core problems to solve, as returned by\n [mclc_prepare_problem()][pydvl.value.least_core.montecarlo.mclc_prepare_problem].\n algorithm: Name of the valuation algorithm.\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n n_jobs: Number of parallel jobs to run.\n non_negative_subsidy: If True, the least core subsidy $e$ is constrained\n to be non-negative.\n solver_options: Additional options to pass to the solver.\n\n Returns:\n List of solutions.\n \"\"\"\n\n def _map_func(\n problems: List[LeastCoreProblem], *args, **kwargs\n ) -> List[ValuationResult]:\n return [lc_solve_problem(p, *args, **kwargs) for p in problems]\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n\n map_reduce_job: MapReduceJob[\n \"LeastCoreProblem\", \"List[ValuationResult]\"\n ] = MapReduceJob(\n inputs=problems,\n map_func=_map_func,\n map_kwargs=dict(\n u=u,\n algorithm=algorithm,\n non_negative_subsidy=non_negative_subsidy,\n solver_options=solver_options,\n **options,\n ),\n reduce_func=lambda x: list(itertools.chain(*x)),\n parallel_backend=parallel_backend,\n n_jobs=n_jobs,\n )\n solutions = map_reduce_job()\n\n return solutions\n
"},{"location":"api/pydvl/value/least_core/montecarlo/","title":"Montecarlo","text":""},{"location":"api/pydvl/value/least_core/montecarlo/#pydvl.value.least_core.montecarlo","title":"pydvl.value.least_core.montecarlo","text":""},{"location":"api/pydvl/value/least_core/montecarlo/#pydvl.value.least_core.montecarlo.montecarlo_least_core","title":"montecarlo_least_core","text":"montecarlo_least_core(\n u: Utility,\n n_iterations: int,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n non_negative_subsidy: bool = False,\n solver_options: Optional[dict] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Computes approximate Least Core values using a Monte Carlo approach.
\\[ \\begin{array}{lll} \\text{minimize} & \\displaystyle{e} & \\\\ \\text{subject to} & \\displaystyle\\sum_{i\\in N} x_{i} = v(N) & \\\\ & \\displaystyle\\sum_{i\\in S} x_{i} + e \\geq v(S) & , \\forall S \\in \\{S_1, S_2, \\dots, S_m \\overset{\\mathrm{iid}}{\\sim} U(2^N) \\} \\end{array} \\] Where:
- \\(U(2^N)\\) is the uniform distribution over the powerset of \\(N\\).
- \\(m\\) is the number of subsets that will be sampled and whose utility will be computed and used to compute the data values.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
n_iterations
total number of iterations to use
TYPE: int
n_jobs
number of jobs across which to distribute the computation
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None, the JoblibParallelBackend is used. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
non_negative_subsidy
If True, the least core subsidy \\(e\\) is constrained to be non-negative.
TYPE: bool
DEFAULT: False
solver_options
Dictionary of options that will be used to select a solver and to configure it. Refer to cvxpy's documentation for all possible options.
TYPE: Optional[dict]
DEFAULT: None
progress
If True, shows a tqdm progress bar
TYPE: bool
DEFAULT: False
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION ValuationResult
Object with the data values and the least core value.
Changed in version 0.9.0
Deprecated config argument and added a parallel_backend argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/least_core/montecarlo.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef montecarlo_least_core(\n u: Utility,\n n_iterations: int,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n non_negative_subsidy: bool = False,\n solver_options: Optional[dict] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> ValuationResult:\n r\"\"\"Computes approximate Least Core values using a Monte Carlo approach.\n\n $$\n \\begin{array}{lll}\n \\text{minimize} & \\displaystyle{e} & \\\\\n \\text{subject to} & \\displaystyle\\sum_{i\\in N} x_{i} = v(N) & \\\\\n & \\displaystyle\\sum_{i\\in S} x_{i} + e \\geq v(S) & ,\n \\forall S \\in \\{S_1, S_2, \\dots, S_m \\overset{\\mathrm{iid}}{\\sim} U(2^N) \\}\n \\end{array}\n $$\n\n Where:\n\n * $U(2^N)$ is the uniform distribution over the powerset of $N$.\n * $m$ is the number of subsets that will be sampled and whose utility will\n be computed and used to compute the data values.\n\n Args:\n u: Utility object with model, data, and scoring function\n n_iterations: total number of iterations to use\n n_jobs: number of jobs across which to distribute the computation\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n non_negative_subsidy: If True, the least core subsidy $e$ is constrained\n to be non-negative.\n solver_options: Dictionary of options that will be used to select a solver\n and to configure it. Refer to [cvxpy's\n documentation](https://www.cvxpy.org/tutorial/advanced/index.html#setting-solver-options)\n for all possible options.\n progress: If True, shows a tqdm progress bar\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n Object with the data values and the least core value.\n\n !!! tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n problem = mclc_prepare_problem(\n u,\n n_iterations,\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n config=config,\n progress=progress,\n seed=seed,\n )\n return lc_solve_problem(\n problem,\n u=u,\n algorithm=\"montecarlo_least_core\",\n non_negative_subsidy=non_negative_subsidy,\n solver_options=solver_options,\n )\n
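A minimal usage sketch (illustrative, not from the original reference): it assumes the Dataset and Utility helpers from pydvl.utils and a small sklearn model; dataset and iteration counts are placeholders.

```python
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Utility
from pydvl.value.least_core.montecarlo import montecarlo_least_core

dataset = Dataset.from_sklearn(load_breast_cancer(), train_size=0.8)
u = Utility(LogisticRegression(max_iter=1000), dataset, "accuracy")

# Sample enough subsets to cover the individual rationality constraints
# (at least one per training point) and solve the resulting linear program.
values = montecarlo_least_core(
    u,
    n_iterations=1000,
    n_jobs=4,
    non_negative_subsidy=True,
    progress=True,
    seed=42,
)
print(values.values)
```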
"},{"location":"api/pydvl/value/least_core/montecarlo/#pydvl.value.least_core.montecarlo.mclc_prepare_problem","title":"mclc_prepare_problem","text":"mclc_prepare_problem(\n u: Utility,\n n_iterations: int,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> LeastCoreProblem\n
Prepares a linear problem by sampling subsets of the data. Use this to separate the problem preparation from the solving with lc_solve_problem(). Useful for parallel execution of multiple experiments.
See montecarlo_least_core for argument descriptions.
Changed in version 0.9.0
Deprecated config argument and added a parallel_backend argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/least_core/montecarlo.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef mclc_prepare_problem(\n u: Utility,\n n_iterations: int,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> LeastCoreProblem:\n \"\"\"Prepares a linear problem by sampling subsets of the data. Use this to\n separate the problem preparation from the solving with\n [lc_solve_problem()][pydvl.value.least_core.common.lc_solve_problem]. Useful\n for parallel execution of multiple experiments.\n\n See\n [montecarlo_least_core][pydvl.value.least_core.montecarlo.montecarlo_least_core]\n for argument descriptions.\n\n !!! note \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n n = len(u.data)\n\n if n_iterations < n:\n warnings.warn(\n f\"Number of iterations '{n_iterations}' is smaller the size of the dataset '{n}'. \"\n f\"This is not optimal because in the worst case we need at least '{n}' constraints \"\n \"to satisfy the individual rationality condition.\"\n )\n\n if n_iterations > 2**n:\n warnings.warn(\n f\"Passed n_iterations is greater than the number subsets! \"\n f\"Setting it to 2^{n}\",\n RuntimeWarning,\n )\n n_iterations = 2**n\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n\n iterations_per_job = max(\n 1, n_iterations // parallel_backend.effective_n_jobs(n_jobs)\n )\n\n map_reduce_job: MapReduceJob[\"Utility\", \"LeastCoreProblem\"] = MapReduceJob(\n inputs=u,\n map_func=_montecarlo_least_core,\n reduce_func=_reduce_func,\n map_kwargs=dict(n_iterations=iterations_per_job, progress=progress),\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n )\n\n return map_reduce_job(seed=seed)\n
"},{"location":"api/pydvl/value/least_core/naive/","title":"Naive","text":""},{"location":"api/pydvl/value/least_core/naive/#pydvl.value.least_core.naive","title":"pydvl.value.least_core.naive","text":""},{"location":"api/pydvl/value/least_core/naive/#pydvl.value.least_core.naive.exact_least_core","title":"exact_least_core","text":"exact_least_core(\n u: Utility,\n *,\n non_negative_subsidy: bool = False,\n solver_options: Optional[dict] = None,\n progress: bool = True\n) -> ValuationResult\n
Computes the exact Least Core values.
Note
If the training set contains more than 20 instances, a warning is printed because the computation is very expensive. This method is mostly used for internal testing and simple use cases. Please refer to the Monte Carlo method for practical applications.
The least core is the solution to the following Linear Programming problem:
\\[ \\begin{array}{lll} \\text{minimize} & \\displaystyle{e} & \\\\ \\text{subject to} & \\displaystyle\\sum_{i\\in N} x_{i} = v(N) & \\\\ & \\displaystyle\\sum_{i\\in S} x_{i} + e \\geq v(S) &, \\forall S \\subseteq N \\\\ \\end{array} \\] Where \\(N = \\{1, 2, \\dots, n\\}\\) are the training set's indices.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
non_negative_subsidy
If True, the least core subsidy \\(e\\) is constrained to be non-negative.
TYPE: bool
DEFAULT: False
solver_options
Dictionary of options that will be used to select a solver and to configure it. Refer to cvxpy's documentation for all possible options.
TYPE: Optional[dict]
DEFAULT: None
progress
If True, shows a tqdm progress bar
TYPE: bool
DEFAULT: True
RETURNS DESCRIPTION ValuationResult
Object with the data values and the least core value.
Source code in src/pydvl/value/least_core/naive.py
def exact_least_core(\n u: Utility,\n *,\n non_negative_subsidy: bool = False,\n solver_options: Optional[dict] = None,\n progress: bool = True,\n) -> ValuationResult:\n r\"\"\"Computes the exact Least Core values.\n\n !!! Note\n If the training set contains more than 20 instances a warning is printed\n because the computation is very expensive. This method is mostly used for\n internal testing and simple use cases. Please refer to the\n [Monte Carlo method][pydvl.value.least_core.montecarlo.montecarlo_least_core]\n for practical applications.\n\n The least core is the solution to the following Linear Programming problem:\n\n $$\n \\begin{array}{lll}\n \\text{minimize} & \\displaystyle{e} & \\\\\n \\text{subject to} & \\displaystyle\\sum_{i\\in N} x_{i} = v(N) & \\\\\n & \\displaystyle\\sum_{i\\in S} x_{i} + e \\geq v(S) &, \\forall S \\subseteq N \\\\\n \\end{array}\n $$\n\n Where $N = \\{1, 2, \\dots, n\\}$ are the training set's indices.\n\n Args:\n u: Utility object with model, data, and scoring function\n non_negative_subsidy: If True, the least core subsidy $e$ is constrained\n to be non-negative.\n solver_options: Dictionary of options that will be used to select a solver\n and to configure it. Refer to the [cvxpy's\n documentation](https://www.cvxpy.org/tutorial/advanced/index.html#setting-solver-options)\n for all possible options.\n progress: If True, shows a tqdm progress bar\n\n Returns:\n Object with the data values and the least core value.\n \"\"\"\n n = len(u.data)\n if n > 20: # Arbitrary choice, will depend on time required, caching, etc.\n warnings.warn(f\"Large dataset! Computation requires 2^{n} calls to model.fit()\")\n\n problem = lc_prepare_problem(u, progress=progress)\n return lc_solve_problem(\n problem=problem,\n u=u,\n algorithm=\"exact_least_core\",\n non_negative_subsidy=non_negative_subsidy,\n solver_options=solver_options,\n )\n
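A hedged sketch of how the exact method might be called (not from the original docs): the synthetic data, the Dataset.from_arrays helper and the model choice are assumptions for illustration only, and the training set is kept tiny on purpose.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Utility
from pydvl.value.least_core.naive import exact_least_core

# Keep the training set tiny: the exact method needs 2^n utility evaluations.
rng = np.random.default_rng(16)
x = rng.normal(size=(24, 2))
y = (x[:, 0] + x[:, 1] > 0).astype(int)
dataset = Dataset.from_arrays(x, y, train_size=0.5)  # ~12 training points -> ~4096 subsets

u = Utility(LogisticRegression(), dataset, "accuracy")
result = exact_least_core(u, non_negative_subsidy=True)
print(result.values)
```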
"},{"location":"api/pydvl/value/least_core/naive/#pydvl.value.least_core.naive.lc_prepare_problem","title":"lc_prepare_problem","text":"lc_prepare_problem(u: Utility, progress: bool = False) -> LeastCoreProblem\n
Prepares a linear problem with all subsets of the data. Use this to separate the problem preparation from the solving with lc_solve_problem(). Useful for parallel execution of multiple experiments.
See exact_least_core() for argument descriptions.
Source code in src/pydvl/value/least_core/naive.py
def lc_prepare_problem(u: Utility, progress: bool = False) -> LeastCoreProblem:\n \"\"\"Prepares a linear problem with all subsets of the data\n Use this to separate the problem preparation from the solving with\n [lc_solve_problem()][pydvl.value.least_core.common.lc_solve_problem]. Useful for\n parallel execution of multiple experiments.\n\n See [exact_least_core()][pydvl.value.least_core.naive.exact_least_core] for argument\n descriptions.\n \"\"\"\n n = len(u.data)\n\n logger.debug(\"Building vectors and matrices for linear programming problem\")\n powerset_size = 2**n\n A_lb = np.zeros((powerset_size, n))\n\n logger.debug(\"Iterating over all subsets\")\n utility_values = np.zeros(powerset_size)\n for i, subset in enumerate( # type: ignore\n tqdm(\n powerset(u.data.indices),\n disable=not progress,\n total=powerset_size - 1,\n position=0,\n )\n ):\n indices: NDArray[np.bool_] = np.zeros(n, dtype=bool)\n indices[list(subset)] = True\n A_lb[i, indices] = 1\n utility_values[i] = u(subset) # type: ignore\n\n return LeastCoreProblem(utility_values, A_lb)\n
"},{"location":"api/pydvl/value/loo/","title":"Loo","text":""},{"location":"api/pydvl/value/loo/#pydvl.value.loo","title":"pydvl.value.loo","text":""},{"location":"api/pydvl/value/loo/loo/","title":"Loo","text":""},{"location":"api/pydvl/value/loo/loo/#pydvl.value.loo.loo","title":"pydvl.value.loo.loo","text":""},{"location":"api/pydvl/value/loo/loo/#pydvl.value.loo.loo.compute_loo","title":"compute_loo","text":"compute_loo(\n u: Utility,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = True\n) -> ValuationResult\n
Computes leave one out value:
\\[v(i) = u(D) - u(D \\setminus \\{i\\}) \\] PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
progress
If True, display a progress bar
TYPE: bool
DEFAULT: True
n_jobs
Number of parallel jobs to use
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None, the JoblibParallelBackend is used. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
RETURNS DESCRIPTION ValuationResult
Object with the data values.
New in version 0.7.0
Renamed from naive_loo and added parallel computation.
Changed in version 0.9.0
Deprecated config argument and added a parallel_backend argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/loo/loo.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef compute_loo(\n u: Utility,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = True,\n) -> ValuationResult:\n r\"\"\"Computes leave one out value:\n\n $$v(i) = u(D) - u(D \\setminus \\{i\\}) $$\n\n Args:\n u: Utility object with model, data, and scoring function\n progress: If True, display a progress bar\n n_jobs: Number of parallel jobs to use\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: If True, display a progress bar\n\n Returns:\n Object with the data values.\n\n !!! tip \"New in version 0.7.0\"\n Renamed from `naive_loo` and added parallel computation.\n\n !!! tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n if len(u.data) < 3:\n raise ValueError(\"Dataset must have at least 2 elements\")\n\n result = ValuationResult.zeros(\n algorithm=\"loo\",\n indices=u.data.indices,\n data_names=u.data.data_names,\n )\n\n all_indices = set(u.data.indices)\n total_utility = u(u.data.indices)\n\n def fun(idx: int) -> tuple[int, float]:\n return idx, total_utility - u(all_indices.difference({idx}))\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n max_workers = parallel_backend.effective_n_jobs(n_jobs)\n n_submitted_jobs = 2 * max_workers # number of jobs in the queue\n\n # NOTE: this could be done with a simple executor.map(), but we want to\n # display a progress bar\n\n with parallel_backend.executor(\n max_workers=max_workers, cancel_futures=True\n ) as executor:\n pending: set[Future] = set()\n index_it = iter(u.data.indices)\n\n pbar = tqdm(disable=not progress, total=100, unit=\"%\")\n while True:\n pbar.n = 100 * sum(result.counts) / len(u.data)\n pbar.refresh()\n completed, pending = wait(pending, timeout=0.1, return_when=FIRST_COMPLETED)\n for future in completed:\n idx, marginal = future.result()\n result.update(idx, marginal)\n\n # Ensure that we always have n_submitted_jobs running\n try:\n for _ in range(n_submitted_jobs - len(pending)):\n pending.add(executor.submit(fun, next(index_it)))\n except StopIteration:\n if len(pending) == 0:\n return result\n
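A minimal usage sketch (illustrative, not part of the original reference); the sklearn dataset and model and the pydvl.utils imports are assumptions:

```python
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

from pydvl.utils import Dataset, Utility
from pydvl.value.loo.loo import compute_loo

dataset = Dataset.from_sklearn(load_iris(), train_size=0.8)
u = Utility(KNeighborsClassifier(n_neighbors=5), dataset, "accuracy")

# One utility evaluation (model fit + score) per training point, spread over 4 workers.
result = compute_loo(u, n_jobs=4, progress=True)
print(result.values)
```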
"},{"location":"api/pydvl/value/oob/","title":"Oob","text":""},{"location":"api/pydvl/value/oob/#pydvl.value.oob","title":"pydvl.value.oob","text":""},{"location":"api/pydvl/value/oob/oob/","title":"Oob","text":""},{"location":"api/pydvl/value/oob/oob/#pydvl.value.oob.oob","title":"pydvl.value.oob.oob","text":""},{"location":"api/pydvl/value/oob/oob/#pydvl.value.oob.oob--references","title":"References","text":" -
Kwon et al. Data-OOB: Out-of-bag Estimate as a Simple and Efficient Data Value. Published at ICML 2023.
"},{"location":"api/pydvl/value/oob/oob/#pydvl.value.oob.oob.compute_data_oob","title":"compute_data_oob","text":"compute_data_oob(\n u: Utility,\n *,\n n_est: int = 10,\n max_samples: float = 0.8,\n loss: Optional[LossFunction] = None,\n n_jobs: Optional[int] = None,\n seed: Optional[Seed] = None,\n progress: bool = False\n) -> ValuationResult\n
Computes Data out of bag values
This implements the method described in (Kwon and Zou, 2023)1. It fits several base estimators, using the model provided in u.model, through a bagging process. The value of a data point is the average loss of the estimators that were not fit on it.
\\(w_{bj}\\in Z\\) is the number of times the j-th datum \\((x_j, y_j)\\) is selected in the b-th bootstrap dataset.
\\[\\psi((x_i,y_i),\\Theta_B):=\\frac{\\sum_{b=1}^{B}\\mathbb{1}(w_{bi}=0)T(y_i, \\hat{f}_b(x_i))}{\\sum_{b=1}^{B} \\mathbb{1} (w_{bi}=0)}\\] With:
\\[ T: Y \\times Y \\rightarrow \\mathbb{R} \\] T is a score function that represents the goodness of a weak learner \\(\\hat{f}_b\\) at the i-th datum \\((x_i, y_i)\\).
n_est and max_samples must be tuned jointly to ensure that every sample is out-of-bag at least once; otherwise the result could include a NaN value for that datum.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
n_est
Number of estimators used in the bagging procedure.
TYPE: int
DEFAULT: 10
max_samples
The fraction of samples to draw to train each base estimator.
TYPE: float
DEFAULT: 0.8
loss
A function taking the true labels and the model predictions (y_true, y_pred) as parameters and returning an array of point-wise errors.
TYPE: Optional[LossFunction]
DEFAULT: None
n_jobs
The number of jobs to run in parallel used in the bagging procedure for both fit and predict.
TYPE: Optional[int]
DEFAULT: None
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
progress
If True, display a progress bar.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION ValuationResult
Object with the data values.
Source code in src/pydvl/value/oob/oob.py
def compute_data_oob(\n u: Utility,\n *,\n n_est: int = 10,\n max_samples: float = 0.8,\n loss: Optional[LossFunction] = None,\n n_jobs: Optional[int] = None,\n seed: Optional[Seed] = None,\n progress: bool = False,\n) -> ValuationResult:\n r\"\"\"Computes Data out of bag values\n\n This implements the method described in\n (Kwon and Zou, 2023)<sup><a href=\"kwon_data_2023\">1</a></sup>.\n It fits several base estimators provided through u.model through a bagging\n process. The point value corresponds to the average loss of estimators which\n were not fit on it.\n\n $w_{bj}\\in Z$ is the number of times the j-th datum $(x_j, y_j)$ is selected\n in the b-th bootstrap dataset.\n\n $$\\psi((x_i,y_i),\\Theta_B):=\\frac{\\sum_{b=1}^{B}\\mathbb{1}(w_{bi}=0)T(y_i,\n \\hat{f}_b(x_i))}{\\sum_{b=1}^{B}\n \\mathbb{1}\n (w_{bi}=0)}$$\n\n With:\n\n $$\n T: Y \\times Y\n \\rightarrow \\mathbb{R}\n $$\n\n T is a score function that represents the goodness of a weak learner\n $\\hat{f}_b$ at the i-th datum $(x_i, y_i)$.\n\n `n_est` and `max_samples` must be tuned jointly to ensure that all samples\n are at least 1 time out-of-bag, otherwise the result could include a NaN\n value for that datum.\n\n Args:\n u: Utility object with model, data, and scoring function.\n n_est: Number of estimator used in the bagging procedure.\n max_samples: The fraction of samples to draw to train each base\n estimator.\n loss: A function taking as parameters model prediction and corresponding\n data labels(y_true, y_pred) and returning an array of point-wise errors.\n n_jobs: The number of jobs to run in parallel used in the bagging\n procedure for both fit and predict.\n seed: Either an instance of a numpy random number generator or a seed\n for it.\n progress: If True, display a progress bar.\n\n Returns:\n Object with the data values.\n \"\"\"\n rng = np.random.default_rng(seed)\n random_state = np.random.RandomState(rng.bit_generator)\n\n result: ValuationResult[np.int_, np.object_] = ValuationResult.empty(\n algorithm=\"data_oob\", indices=u.data.indices, data_names=u.data.data_names\n )\n\n if is_classifier(u.model):\n bag = BaggingClassifier(\n u.model,\n n_estimators=n_est,\n max_samples=max_samples,\n n_jobs=n_jobs,\n random_state=random_state,\n )\n if loss is None:\n loss = point_wise_accuracy\n elif is_regressor(u.model):\n bag = BaggingRegressor(\n u.model,\n n_estimators=n_est,\n max_samples=max_samples,\n n_jobs=n_jobs,\n random_state=random_state,\n )\n if loss is None:\n loss = neg_l2_distance\n else:\n raise Exception(\n \"Model has to be a classifier or a regressor in sklearn format.\"\n )\n\n bag.fit(u.data.x_train, u.data.y_train)\n\n for est, samples in tqdm(\n zip(bag.estimators_, bag.estimators_samples_), disable=not progress, total=n_est\n ): # The bottleneck is the bag fitting not this part so TQDM is not very useful here\n oob_idx = np.setxor1d(u.data.indices, np.unique(samples))\n array_loss = loss(\n y_true=u.data.y_train[oob_idx],\n y_pred=est.predict(u.data.x_train[oob_idx]),\n )\n result += ValuationResult(\n algorithm=\"data_oob\",\n indices=oob_idx,\n values=array_loss,\n counts=np.ones_like(array_loss, dtype=u.data.indices.dtype),\n )\n return result\n
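A usage sketch (illustrative, not from the original reference): the dataset, model and estimator count are placeholders, and the pydvl.utils imports are assumed.

```python
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier

from pydvl.utils import Dataset, Utility
from pydvl.value.oob.oob import compute_data_oob

dataset = Dataset.from_sklearn(load_breast_cancer(), train_size=0.8)
# Data-OOB derives values from the bagging loss, not from the utility's scorer.
u = Utility(DecisionTreeClassifier(), dataset, "accuracy")

# Use enough estimators that every point is out-of-bag at least once with high
# probability (see the note on n_est and max_samples above).
result = compute_data_oob(u, n_est=100, max_samples=0.8, n_jobs=4, seed=42)
print(result.values)
```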
"},{"location":"api/pydvl/value/oob/oob/#pydvl.value.oob.oob.point_wise_accuracy","title":"point_wise_accuracy","text":"point_wise_accuracy(y_true: NDArray[T], y_pred: NDArray[T]) -> NDArray[T]\n
Point-wise 0-1 loss between two arrays
PARAMETER DESCRIPTION y_true
Array of true values (e.g. labels)
TYPE: NDArray[T]
y_pred
Array of estimated values (e.g. model predictions)
TYPE: NDArray[T]
RETURNS DESCRIPTION NDArray[T]
Array with point-wise 0-1 losses between labels and model predictions
Source code in src/pydvl/value/oob/oob.py
def point_wise_accuracy(y_true: NDArray[T], y_pred: NDArray[T]) -> NDArray[T]:\n r\"\"\"Point-wise 0-1 loss between two arrays\n\n Args:\n y_true: Array of true values (e.g. labels)\n y_pred: Array of estimated values (e.g. model predictions)\n\n Returns:\n Array with point-wise 0-1 losses between labels and model predictions\n \"\"\"\n return np.array(y_pred == y_true, dtype=y_pred.dtype)\n
"},{"location":"api/pydvl/value/oob/oob/#pydvl.value.oob.oob.neg_l2_distance","title":"neg_l2_distance","text":"neg_l2_distance(y_true: NDArray[T], y_pred: NDArray[T]) -> NDArray[T]\n
Point-wise negative \\(l_2\\) distance between two arrays
PARAMETER DESCRIPTION y_true
Array of true values (e.g. labels)
TYPE: NDArray[T]
y_pred
Array of estimated values (e.g. model predictions)
TYPE: NDArray[T]
RETURNS DESCRIPTION NDArray[T]
Array with point-wise negative \\(l_2\\) distances between labels and model predictions
Source code in src/pydvl/value/oob/oob.py
def neg_l2_distance(y_true: NDArray[T], y_pred: NDArray[T]) -> NDArray[T]:\n r\"\"\"Point-wise negative $l_2$ distance between two arrays\n\n Args:\n y_true: Array of true values (e.g. labels)\n y_pred: Array of estimated values (e.g. model predictions)\n\n Returns:\n Array with point-wise negative $l_2$ distances between labels and model\n predictions\n \"\"\"\n return -np.square(np.array(y_pred - y_true), dtype=y_pred.dtype)\n
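Both helpers follow the LossFunction signature loss(y_true, y_pred) returning an array of point-wise errors, so a custom loss can be plugged into compute_data_oob in the same way. A hedged sketch; the weighting scheme below is purely illustrative and not part of the library.

```python
import numpy as np
from numpy.typing import NDArray


def weighted_point_wise_accuracy(y_true: NDArray, y_pred: NDArray) -> NDArray:
    """Illustrative custom loss: point-wise 0-1 agreement, down-weighting class 0."""
    weights = np.where(y_true == 0, 0.5, 1.0)
    return weights * (y_pred == y_true).astype(float)


# Passed to compute_data_oob exactly like the built-in losses:
# result = compute_data_oob(u, n_est=100, loss=weighted_point_wise_accuracy)
```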
"},{"location":"api/pydvl/value/shapley/","title":"Shapley","text":""},{"location":"api/pydvl/value/shapley/#pydvl.value.shapley","title":"pydvl.value.shapley","text":"This package holds all routines for the computation of Shapley Data value. Users will want to use compute_shapley_values or compute_semivalues as interfaces to most methods defined in the modules.
Please refer to the guide on data valuation for an overview of all methods.
"},{"location":"api/pydvl/value/shapley/classwise/","title":"Classwise","text":""},{"location":"api/pydvl/value/shapley/classwise/#pydvl.value.shapley.classwise","title":"pydvl.value.shapley.classwise","text":"Class-wise Shapley (Schoch et al., 2022)1 offers a Shapley framework tailored for classification problems. Let \\(D\\) be a dataset, \\(D_{y_i}\\) be the subset of \\(D\\) with labels \\(y_i\\), and \\(D_{-y_i}\\) be the complement of \\(D_{y_i}\\) in \\(D\\). The key idea is that a sample \\((x_i, y_i)\\), might enhance the overall performance on \\(D\\), while being detrimental for the performance on \\(D_{y_i}\\). The Class-wise value is defined as:
\\[ v_u(i) = \\frac{1}{2^{|D_{-y_i}|}} \\sum_{S_{-y_i}} \\frac{1}{|D_{y_i}|!} \\sum_{S_{y_i}} \\binom{|D_{y_i}|-1}{|S_{y_i}|}^{-1} [u( S_{y_i} \\cup \\{i\\} | S_{-y_i} ) \u2212 u( S_{y_i} | S_{-y_i})], \\] where \\(S_{y_i} \\subseteq D_{y_i} \\setminus \\{i\\}\\) and \\(S_{-y_i} \\subseteq D_{-y_i}\\).
Analysis of Class-wise Shapley
For a detailed analysis of the method, with comparison to other valuation techniques, please refer to the main documentation.
In practice, the quantity above is estimated using Monte Carlo sampling of the powerset and the set of index permutations. This results in the estimator
\\[ v_u(i) = \\frac{1}{K} \\sum_k \\frac{1}{L} \\sum_l [u(\\sigma^{(l)}_{:i} \\cup \\{i\\} | S^{(k)} ) \u2212 u( \\sigma^{(l)}_{:i} | S^{(k)})], \\] with \\(S^{(1)}, \\dots, S^{(K)} \\subseteq T_{-y_i},\\) \\(\\sigma^{(1)}, \\dots, \\sigma^{(L)} \\in \\Pi(T_{y_i}\\setminus\\{i\\}),\\) and \\(\\sigma^{(l)}_{:i}\\) denoting the set of indices in permutation \\(\\sigma^{(l)}\\) before the position where \\(i\\) appears. The sets \\(T_{y_i}\\) and \\(T_{-y_i}\\) are the training sets for the labels \\(y_i\\) and \\(-y_i\\), respectively.
Notes for derivation of test cases The unit tests include the following manually constructed data: Let \\(D=\\{(1,0),(2,0),(3,0),(4,1)\\}\\) be the test set and \\(T=\\{(1,0),(2,0),(3,1),(4,1)\\}\\) the train set. This specific dataset is chosen because it allows solving the model
\\[y = \\max(0, \\min(1, \\text{round}(\\beta^T x)))\\] in closed form \\(\\beta = \\frac{\\text{dot}(x, y)}{\\text{dot}(x, x)}\\). From the closed-form solution, the tables for in-class accuracy \\(a_S(D_{y_i})\\) and out-of-class accuracy \\(a_S(D_{-y_i})\\) can be calculated. By using these tables and setting \\(\\{S^{(1)}, \\dots, S^{(K)}\\} = 2^{T_{-y_i}}\\) and \\(\\{\\sigma^{(1)}, \\dots, \\sigma^{(L)}\\} = \\Pi(T_{y_i}\\setminus\\{i\\})\\), the Monte Carlo estimator can be evaluated (\\(2^M\\) is the powerset of \\(M\\)). The details of the derivation are left to the eager reader.
"},{"location":"api/pydvl/value/shapley/classwise/#pydvl.value.shapley.classwise--references","title":"References","text":" -
Schoch, Stephanie, Haifeng Xu, and Yangfeng Ji. CS-Shapley: Class-wise Shapley Values for Data Valuation in Classification. In Proc. of the Thirty-Sixth Conference on Neural Information Processing Systems (NeurIPS). New Orleans, Louisiana, USA, 2022.
"},{"location":"api/pydvl/value/shapley/classwise/#pydvl.value.shapley.classwise.ClasswiseScorer","title":"ClasswiseScorer","text":"ClasswiseScorer(\n scoring: Union[str, ScorerCallable] = \"accuracy\",\n default: float = 0.0,\n range: Tuple[float, float] = (0, 1),\n in_class_discount_fn: Callable[[float], float] = lambda x: x,\n out_of_class_discount_fn: Callable[[float], float] = np.exp,\n initial_label: Optional[int] = None,\n name: Optional[str] = None,\n)\n
Bases: Scorer
A Scorer designed for evaluation in classification problems. Its value is computed from an in-class and an out-of-class \"inner score\" (Schoch et al., 2022) 1. Let \\(S\\) be the training set and \\(D\\) be the valuation set. For each label \\(c\\), \\(D\\) is factorized into two disjoint sets: \\(D_c\\) for in-class instances and \\(D_{-c}\\) for out-of-class instances. The score combines an in-class metric of performance, adjusted by a discounted out-of-class metric. These inner scores must be provided upon construction or default to accuracy. They are combined into:
\\[ u(S_{y_i}) = f(a_S(D_{y_i}))\\ g(a_S(D_{-y_i})), \\] where \\(f\\) and \\(g\\) are continuous, monotonic functions. For a detailed explanation, refer to section four of (Schoch et al., 2022) 1.
Warning
Metrics must support multiple class labels if you intend to apply them to a multi-class problem. For instance, the metric 'accuracy' supports multiple classes, but the metric f1 does not. For a two-class classification problem, using f1_weighted is essentially equivalent to using accuracy.
PARAMETER DESCRIPTION scoring
Name of the scoring function or a callable that can be passed to Scorer.
TYPE: Union[str, ScorerCallable]
DEFAULT: 'accuracy'
default
Score to use when a model fails to provide a number, e.g. when too little data was used to train it, or when errors arise.
TYPE: float
DEFAULT: 0.0
range
Numerical range of the score function. Some Monte Carlo methods can use this to estimate the number of samples required for a certain quality of approximation. If not provided, it can be read from the scoring object if it provides it, for instance if it was constructed with compose_score.
TYPE: Tuple[float, float]
DEFAULT: (0, 1)
in_class_discount_fn
Continuous, monotonic increasing function used to discount the in-class score.
TYPE: Callable[[float], float]
DEFAULT: lambda x: x
out_of_class_discount_fn
Continuous, monotonic increasing function used to discount the out-of-class score.
TYPE: Callable[[float], float]
DEFAULT: exp
initial_label
Set initial label (for the first iteration)
TYPE: Optional[int]
DEFAULT: None
name
Name of the scorer. If not provided, the name of the inner scoring function will be prefixed by classwise.
TYPE: Optional[str]
DEFAULT: None
New in version 0.7.1
Source code in src/pydvl/value/shapley/classwise.py
def __init__(\n self,\n scoring: Union[str, ScorerCallable] = \"accuracy\",\n default: float = 0.0,\n range: Tuple[float, float] = (0, 1),\n in_class_discount_fn: Callable[[float], float] = lambda x: x,\n out_of_class_discount_fn: Callable[[float], float] = np.exp,\n initial_label: Optional[int] = None,\n name: Optional[str] = None,\n):\n disc_score_in_class = in_class_discount_fn(range[1])\n disc_score_out_of_class = out_of_class_discount_fn(range[1])\n transformed_range = (0, disc_score_in_class * disc_score_out_of_class)\n super().__init__(\n scoring=scoring,\n range=transformed_range,\n default=default,\n name=name or f\"classwise {str(scoring)}\",\n )\n self._in_class_discount_fn = in_class_discount_fn\n self._out_of_class_discount_fn = out_of_class_discount_fn\n self.label = initial_label\n
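As a usage illustration (not from the original docs), a ClasswiseScorer can be constructed with custom discount functions; the choices below simply mirror the documented defaults and are otherwise arbitrary.

```python
import numpy as np

from pydvl.value.shapley.classwise import ClasswiseScorer

# In-class score enters linearly, out-of-class score is discounted exponentially,
# matching the defaults documented above. Both discount functions can be swapped.
scorer = ClasswiseScorer(
    "accuracy",
    in_class_discount_fn=lambda x: x,
    out_of_class_discount_fn=np.exp,
    initial_label=0,
)
# The scorer is then set on the Utility consumed by compute_classwise_shapley_values.
```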
"},{"location":"api/pydvl/value/shapley/classwise/#pydvl.value.shapley.classwise.ClasswiseScorer.estimate_in_class_and_out_of_class_score","title":"estimate_in_class_and_out_of_class_score","text":"estimate_in_class_and_out_of_class_score(\n model: SupervisedModel,\n x_test: NDArray[float_],\n y_test: NDArray[int_],\n rescale_scores: bool = True,\n) -> Tuple[float, float]\n
Computes in-class and out-of-class scores using the provided inner scoring function. The result is
\\[ a_S(D=\\{(x_1, y_1), \\dots, (x_K, y_K)\\}) = \\frac{1}{N} \\sum_k s(y(x_k), y_k). \\] In this context, for label \\(c\\) calculations are executed twice: once for \\(D_c\\) and once for \\(D_{-c}\\) to determine the in-class and out-of-class scores, respectively. By default, the raw scores are multiplied by \\(\\frac{|D_c|}{|D|}\\) and \\(\\frac{|D_{-c}|}{|D|}\\), respectively. This is done to ensure that both scores are of the same order of magnitude. This normalization is particularly useful when the inner score function \\(a_S\\) is calculated by an estimator of the form \\(\\frac{1}{N} \\sum_i x_i\\), e.g. the accuracy.
PARAMETER DESCRIPTION model
Model used for computing the score on the validation set.
TYPE: SupervisedModel
x_test
Array containing the features of the classification problem.
TYPE: NDArray[float_]
y_test
Array containing the labels of the classification problem.
TYPE: NDArray[int_]
rescale_scores
If set to True, the scores will be denormalized. This is particularly useful when the inner score function \\(a_S\\) is calculated by an estimator of the form \\(\\frac{1}{N} \\sum_i x_i\\).
TYPE: bool
DEFAULT: True
RETURNS DESCRIPTION Tuple[float, float]
Tuple containing the in-class and out-of-class scores.
Source code in src/pydvl/value/shapley/classwise.py
def estimate_in_class_and_out_of_class_score(\n self,\n model: SupervisedModel,\n x_test: NDArray[np.float_],\n y_test: NDArray[np.int_],\n rescale_scores: bool = True,\n) -> Tuple[float, float]:\n r\"\"\"\n Computes in-class and out-of-class scores using the provided inner\n scoring function. The result is\n\n $$\n a_S(D=\\{(x_1, y_1), \\dots, (x_K, y_K)\\}) = \\frac{1}{N} \\sum_k s(y(x_k), y_k).\n $$\n\n In this context, for label $c$ calculations are executed twice: once for $D_c$\n and once for $D_{-c}$ to determine the in-class and out-of-class scores,\n respectively. By default, the raw scores are multiplied by $\\frac{|D_c|}{|D|}$\n and $\\frac{|D_{-c}|}{|D|}$, respectively. This is done to ensure that both\n scores are of the same order of magnitude. This normalization is particularly\n useful when the inner score function $a_S$ is calculated by an estimator of the\n form $\\frac{1}{N} \\sum_i x_i$, e.g. the accuracy.\n\n Args:\n model: Model used for computing the score on the validation set.\n x_test: Array containing the features of the classification problem.\n y_test: Array containing the labels of the classification problem.\n rescale_scores: If set to True, the scores will be denormalized. This is\n particularly useful when the inner score function $a_S$ is calculated by\n an estimator of the form $\\frac{1}{N} \\sum_i x_i$.\n\n Returns:\n Tuple containing the in-class and out-of-class scores.\n \"\"\"\n scorer = self._scorer\n label_set_match = y_test == self.label\n label_set = np.where(label_set_match)[0]\n num_classes = len(np.unique(y_test))\n\n if len(label_set) == 0:\n return 0, 1 / (num_classes - 1)\n\n complement_label_set = np.where(~label_set_match)[0]\n in_class_score = scorer(model, x_test[label_set], y_test[label_set])\n out_of_class_score = scorer(\n model, x_test[complement_label_set], y_test[complement_label_set]\n )\n\n if rescale_scores:\n n_in_class = np.count_nonzero(y_test == self.label)\n n_out_of_class = len(y_test) - n_in_class\n in_class_score *= n_in_class / (n_in_class + n_out_of_class)\n out_of_class_score *= n_out_of_class / (n_in_class + n_out_of_class)\n\n return in_class_score, out_of_class_score\n
"},{"location":"api/pydvl/value/shapley/classwise/#pydvl.value.shapley.classwise.compute_classwise_shapley_values","title":"compute_classwise_shapley_values","text":"compute_classwise_shapley_values(\n u: Utility,\n *,\n done: StoppingCriterion,\n truncation: TruncationPolicy,\n done_sample_complements: Optional[StoppingCriterion] = None,\n normalize_values: bool = True,\n use_default_scorer_value: bool = True,\n min_elements_per_label: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Computes an approximate Class-wise Shapley value by sampling independent permutations of the index set for each label and index sets sampled from the powerset of the complement (with respect to the currently evaluated label), approximating the sum:
\\[ v_u(i) = \\frac{1}{K} \\sum_k \\frac{1}{L} \\sum_l [u(\\sigma^{(l)}_{:i} \\cup \\{i\\} | S^{(k)} ) \u2212 u( \\sigma^{(l)}_{:i} | S^{(k)})], \\] where \\(\\sigma_{:i}\\) denotes the set of indices in the permutation \\(\\sigma\\) before the position where \\(i\\) appears, and \\(S\\) is a subset of the index set of all other labels (see the main documentation for details).
PARAMETER DESCRIPTION u
Utility object containing model, data, and scoring function. The scorer must be of type ClasswiseScorer.
TYPE: Utility
done
Function that checks whether the computation needs to stop.
TYPE: StoppingCriterion
truncation
Callable function that decides whether to interrupt processing a permutation and set subsequent marginals to zero.
TYPE: TruncationPolicy
done_sample_complements
Function checking whether computation needs to stop. Otherwise, it will resample conditional sets until the stopping criterion is met.
TYPE: Optional[StoppingCriterion]
DEFAULT: None
normalize_values
Indicates whether to normalize the values by the variation in each class times their in-class accuracy.
TYPE: bool
DEFAULT: True
use_default_scorer_value
The first set of indices is the sampled complement set. Unless otherwise specified, the default scorer value is used for it. If set to False, the base score is calculated from the utility instead.
TYPE: bool
DEFAULT: True
min_elements_per_label
The minimum number of elements for each opposite label.
TYPE: int
DEFAULT: 1
n_jobs
Number of parallel jobs to run.
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None, the JoblibParallelBackend is used. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display a progress bar.
TYPE: bool
DEFAULT: False
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION ValuationResult
ValuationResult object containing computed data values.
New in version 0.7.1
Source code in src/pydvl/value/shapley/classwise.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef compute_classwise_shapley_values(\n u: Utility,\n *,\n done: StoppingCriterion,\n truncation: TruncationPolicy,\n done_sample_complements: Optional[StoppingCriterion] = None,\n normalize_values: bool = True,\n use_default_scorer_value: bool = True,\n min_elements_per_label: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> ValuationResult:\n r\"\"\"\n Computes an approximate Class-wise Shapley value by sampling independent\n permutations of the index set for each label and index sets sampled from the\n powerset of the complement (with respect to the currently evaluated label),\n approximating the sum:\n\n $$\n v_u(i) = \\frac{1}{K} \\sum_k \\frac{1}{L} \\sum_l\n [u(\\sigma^{(l)}_{:i} \\cup \\{i\\} | S^{(k)} ) \u2212 u( \\sigma^{(l)}_{:i} | S^{(k)})],\n $$\n\n where $\\sigma_{:i}$ denotes the set of indices in permutation sigma before\n the position where $i$ appears and $S$ is a subset of the index set of all\n other labels (see [the main documentation][class-wise-shapley] for\n details).\n\n Args:\n u: Utility object containing model, data, and scoring function. The\n scorer must be of type\n [ClasswiseScorer][pydvl.value.shapley.classwise.ClasswiseScorer].\n done: Function that checks whether the computation needs to stop.\n truncation: Callable function that decides whether to interrupt processing a\n permutation and set subsequent marginals to zero.\n done_sample_complements: Function checking whether computation needs to stop.\n Otherwise, it will resample conditional sets until the stopping criterion is\n met.\n normalize_values: Indicates whether to normalize the values by the variation\n in each class times their in-class accuracy.\n done_sample_complements: Number of times to resample the complement set\n for each permutation.\n use_default_scorer_value: The first set of indices is the sampled complement\n set. Unless not otherwise specified, the default scorer value is used for\n this. If it is set to false, the base score is calculated from the utility.\n min_elements_per_label: The minimum number of elements for each opposite\n label.\n n_jobs: Number of parallel jobs to run.\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display a progress bar.\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n ValuationResult object containing computed data values.\n\n !!! tip \"New in version 0.7.1\"\n \"\"\"\n dim_correct = u.data.y_train.ndim == 1 and u.data.y_test.ndim == 1\n is_integral = all(\n map(\n lambda v: isinstance(v, numbers.Integral), (*u.data.y_train, *u.data.y_test)\n )\n )\n if not dim_correct or not is_integral:\n raise ValueError(\n \"The supplied dataset has to be a 1-dimensional classification dataset.\"\n )\n\n if not isinstance(u.scorer, ClasswiseScorer):\n raise ValueError(\n \"Please set a subclass of ClasswiseScorer object as scorer object of the\"\n \" utility. 
See scoring argument of Utility.\"\n )\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n u_ref = parallel_backend.put(u)\n n_jobs = parallel_backend.effective_n_jobs(n_jobs)\n n_submitted_jobs = 2 * n_jobs\n\n pbar = tqdm(disable=not progress, position=0, total=100, unit=\"%\")\n algorithm = \"classwise_shapley\"\n accumulated_result = ValuationResult.zeros(\n algorithm=algorithm, indices=u.data.indices, data_names=u.data.data_names\n )\n terminate_exec = False\n seed_sequence = ensure_seed_sequence(seed)\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n\n with parallel_backend.executor(max_workers=n_jobs) as executor:\n pending: Set[Future] = set()\n while True:\n completed_futures, pending = wait(\n pending, timeout=60, return_when=FIRST_COMPLETED\n )\n for future in completed_futures:\n accumulated_result += future.result()\n if done(accumulated_result):\n terminate_exec = True\n break\n\n pbar.n = 100 * done.completion()\n pbar.refresh()\n if terminate_exec:\n break\n\n n_remaining_slots = n_submitted_jobs - len(pending)\n seeds = seed_sequence.spawn(n_remaining_slots)\n for i in range(n_remaining_slots):\n future = executor.submit(\n _permutation_montecarlo_classwise_shapley_one_step,\n u_ref,\n truncation=truncation,\n done_sample_complements=done_sample_complements,\n use_default_scorer_value=use_default_scorer_value,\n min_elements_per_label=min_elements_per_label,\n algorithm_name=algorithm,\n seed=seeds[i],\n )\n pending.add(future)\n\n result = accumulated_result\n if normalize_values:\n result = _normalize_classwise_shapley_values(result, u)\n\n return result\n
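A hedged usage sketch (not part of the original reference): it assumes the pydvl.utils helpers, the MaxChecks criterion from pydvl.value.stopping and NoTruncation from pydvl.value.shapley.truncated referenced elsewhere in these docs; dataset, model and budgets are illustrative.

```python
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Utility
from pydvl.value.shapley.classwise import (
    ClasswiseScorer,
    compute_classwise_shapley_values,
)
from pydvl.value.shapley.truncated import NoTruncation
from pydvl.value.stopping import MaxChecks

# A 1-dimensional classification dataset with integer labels, as required above.
dataset = Dataset.from_sklearn(load_breast_cancer(), train_size=0.8)
u = Utility(LogisticRegression(max_iter=1000), dataset, ClasswiseScorer("accuracy"))

result = compute_classwise_shapley_values(
    u,
    done=MaxChecks(1000),       # any StoppingCriterion from pydvl.value.stopping
    truncation=NoTruncation(),  # any TruncationPolicy
    normalize_values=True,
    n_jobs=4,
    progress=True,
)
print(result.values)
```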
"},{"location":"api/pydvl/value/shapley/common/","title":"Common","text":""},{"location":"api/pydvl/value/shapley/common/#pydvl.value.shapley.common","title":"pydvl.value.shapley.common","text":""},{"location":"api/pydvl/value/shapley/common/#pydvl.value.shapley.common.compute_shapley_values","title":"compute_shapley_values","text":"compute_shapley_values(\n u: Utility,\n *,\n done: StoppingCriterion = MaxChecks(None),\n mode: ShapleyMode = ShapleyMode.TruncatedMontecarlo,\n n_jobs: int = 1,\n seed: Optional[Seed] = None,\n **kwargs\n) -> ValuationResult\n
Umbrella method to compute Shapley values with any of the available algorithms.
See Data valuation for an overview.
The following algorithms are available. Note that the exact methods can only work with very small datasets and are thus intended only for testing. Some algorithms also accept additional arguments, please refer to the documentation of each particular method.
- combinatorial_exact: uses the combinatorial implementation of data Shapley. Implemented in combinatorial_exact_shapley().
- combinatorial_montecarlo: uses the approximate Monte Carlo implementation of combinatorial data Shapley. Implemented in combinatorial_montecarlo_shapley().
- permutation_exact: uses the permutation-based implementation of data Shapley. Computation is not parallelized. Implemented in permutation_exact_shapley().
- permutation_montecarlo: uses the approximate Monte Carlo implementation of permutation data Shapley. Accepts a TruncationPolicy to stop computing marginals. Implemented in permutation_montecarlo_shapley().
- owen_sampling: Uses the Owen continuous extension of the utility function to the unit cube. Implemented in owen_sampling_shapley(). This method does not take a StoppingCriterion but instead requires a parameter q_max for the number of subdivisions of the unit interval to use for integration, and another parameter n_samples for the number of subsets to sample for each \\(q\\).
- owen_halved: Same as 'owen_sampling' but uses correlated samples in the expectation. Implemented in owen_sampling_shapley(). This method requires an additional parameter q_max for the number of subdivisions of the interval [0, 0.5] to use for integration, and another parameter n_samples for the number of subsets to sample for each \\(q\\).
- group_testing: estimates differences of Shapley values and solves a constraint satisfaction problem. High sample complexity, not recommended. Implemented in group_testing_shapley(). This method does not take a StoppingCriterion but instead requires a parameter n_samples for the number of iterations to run.
Additionally, one can use model-specific methods:
- knn: Exact method for K-Nearest neighbour models. Implemented in knn_shapley().
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
done
Object used to determine when to stop the computation for Monte Carlo methods. The default is to stop after 100 iterations. See the available criteria in stopping. It is possible to combine several of them using boolean operators. Some methods ignore this argument, others require specific subtypes.
TYPE: StoppingCriterion
DEFAULT: MaxChecks(None)
n_jobs
Number of parallel jobs (available only to some methods)
TYPE: int
DEFAULT: 1
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
mode
Choose which Shapley algorithm to use. See ShapleyMode for a list of allowed values.
TYPE: ShapleyMode
DEFAULT: TruncatedMontecarlo
RETURNS DESCRIPTION ValuationResult
Object with the results.
Source code in src/pydvl/value/shapley/common.py
def compute_shapley_values(\n u: Utility,\n *,\n done: StoppingCriterion = MaxChecks(None),\n mode: ShapleyMode = ShapleyMode.TruncatedMontecarlo,\n n_jobs: int = 1,\n seed: Optional[Seed] = None,\n **kwargs,\n) -> ValuationResult:\n \"\"\"Umbrella method to compute Shapley values with any of the available\n algorithms.\n\n See [Data valuation][data-valuation] for an overview.\n\n The following algorithms are available. Note that the exact methods can only\n work with very small datasets and are thus intended only for testing. Some\n algorithms also accept additional arguments, please refer to the\n documentation of each particular method.\n\n - `combinatorial_exact`: uses the combinatorial implementation of data\n Shapley. Implemented in\n [combinatorial_exact_shapley()][pydvl.value.shapley.naive.combinatorial_exact_shapley].\n - `combinatorial_montecarlo`: uses the approximate Monte Carlo\n implementation of combinatorial data Shapley. Implemented in\n [combinatorial_montecarlo_shapley()][pydvl.value.shapley.montecarlo.combinatorial_montecarlo_shapley].\n - `permutation_exact`: uses the permutation-based implementation of data\n Shapley. Computation is **not parallelized**. Implemented in\n [permutation_exact_shapley()][pydvl.value.shapley.naive.permutation_exact_shapley].\n - `permutation_montecarlo`: uses the approximate Monte Carlo\n implementation of permutation data Shapley. Accepts a\n [TruncationPolicy][pydvl.value.shapley.truncated.TruncationPolicy] to stop\n computing marginals. Implemented in\n [permutation_montecarlo_shapley()][pydvl.value.shapley.montecarlo.permutation_montecarlo_shapley].\n - `owen_sampling`: Uses the Owen continuous extension of the utility\n function to the unit cube. Implemented in\n [owen_sampling_shapley()][pydvl.value.shapley.owen.owen_sampling_shapley]. This\n method does not take a [StoppingCriterion][pydvl.value.stopping.StoppingCriterion]\n but instead requires a parameter `q_max` for the number of subdivisions\n of the unit interval to use for integration, and another parameter\n `n_samples` for the number of subsets to sample for each $q$.\n - `owen_halved`: Same as 'owen_sampling' but uses correlated samples in the\n expectation. Implemented in\n [owen_sampling_shapley()][pydvl.value.shapley.owen.owen_sampling_shapley].\n This method requires an additional parameter `q_max` for the number of\n subdivisions of the interval [0,0.5] to use for integration, and another\n parameter `n_samples` for the number of subsets to sample for each $q$.\n - `group_testing`: estimates differences of Shapley values and solves a\n constraint satisfaction problem. High sample complexity, not recommended.\n Implemented in [group_testing_shapley()][pydvl.value.shapley.gt.group_testing_shapley]. This\n method does not take a [StoppingCriterion][pydvl.value.stopping.StoppingCriterion]\n but instead requires a parameter `n_samples` for the number of\n iterations to run.\n\n Additionally, one can use model-specific methods:\n\n - `knn`: Exact method for K-Nearest neighbour models. Implemented in\n [knn_shapley()][pydvl.value.shapley.knn.knn_shapley].\n\n Args:\n u: [Utility][pydvl.utils.utility.Utility] object with model, data, and\n scoring function.\n done: Object used to determine when to stop the computation for Monte\n Carlo methods. The default is to stop after 100 iterations. See the\n available criteria in [stopping][pydvl.value.stopping]. It is\n possible to combine several of them using boolean operators. 
Some\n methods ignore this argument, others require specific subtypes.\n n_jobs: Number of parallel jobs (available only to some methods)\n seed: Either an instance of a numpy random number generator or a seed\n for it.\n mode: Choose which shapley algorithm to use. See\n [ShapleyMode][pydvl.value.shapley.ShapleyMode] for a list of allowed\n value.\n\n Returns:\n Object with the results.\n\n \"\"\"\n progress: bool = kwargs.pop(\"progress\", False)\n\n if mode not in list(ShapleyMode):\n raise ValueError(f\"Invalid value encountered in {mode=}\")\n\n if mode in (\n ShapleyMode.PermutationMontecarlo,\n ShapleyMode.ApproShapley,\n ShapleyMode.TruncatedMontecarlo,\n ):\n truncation = kwargs.pop(\"truncation\", NoTruncation())\n return permutation_montecarlo_shapley( # type: ignore\n u=u,\n done=done,\n truncation=truncation,\n n_jobs=n_jobs,\n seed=seed,\n progress=progress,\n **kwargs,\n )\n elif mode == ShapleyMode.CombinatorialMontecarlo:\n return combinatorial_montecarlo_shapley( # type: ignore\n u, done=done, n_jobs=n_jobs, seed=seed, progress=progress\n )\n elif mode == ShapleyMode.CombinatorialExact:\n return combinatorial_exact_shapley(u, n_jobs=n_jobs, progress=progress) # type: ignore\n elif mode == ShapleyMode.PermutationExact:\n return permutation_exact_shapley(u, progress=progress)\n elif mode == ShapleyMode.Owen or mode == ShapleyMode.OwenAntithetic:\n if kwargs.get(\"n_samples\") is None:\n raise ValueError(\"n_samples cannot be None for Owen methods\")\n if kwargs.get(\"max_q\") is None:\n raise ValueError(\"Owen Sampling requires max_q for the outer integral\")\n\n method = (\n OwenAlgorithm.Standard\n if mode == ShapleyMode.Owen\n else OwenAlgorithm.Antithetic\n )\n return owen_sampling_shapley( # type: ignore\n u,\n n_samples=int(kwargs.get(\"n_samples\", -1)),\n max_q=int(kwargs.get(\"max_q\", -1)),\n method=method,\n n_jobs=n_jobs,\n seed=seed,\n )\n elif mode == ShapleyMode.KNN:\n return knn_shapley(u, progress=progress)\n elif mode == ShapleyMode.GroupTesting:\n n_samples = kwargs.pop(\"n_samples\")\n if n_samples is None:\n raise ValueError(\"n_samples cannot be None for Group Testing\")\n epsilon = kwargs.pop(\"epsilon\")\n if epsilon is None:\n raise ValueError(\"Group Testing requires error bound epsilon\")\n delta = kwargs.pop(\"delta\", 0.05)\n return group_testing_shapley( # type: ignore\n u,\n epsilon=float(epsilon),\n delta=delta,\n n_samples=int(n_samples),\n n_jobs=n_jobs,\n progress=progress,\n seed=seed,\n **kwargs,\n )\n else:\n raise ValueError(f\"Invalid value encountered in {mode=}\")\n
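A minimal usage sketch of the umbrella method (illustrative, not from the original reference): the imports of ShapleyMode, MaxChecks and the pydvl.utils helpers follow the cross-references in these docs; dataset, model and budgets are placeholders.

```python
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

from pydvl.utils import Dataset, Utility
from pydvl.value.shapley import ShapleyMode
from pydvl.value.shapley.common import compute_shapley_values
from pydvl.value.stopping import MaxChecks

dataset = Dataset.from_sklearn(load_iris(), train_size=0.8)
u = Utility(KNeighborsClassifier(n_neighbors=5), dataset, "accuracy")

# Default mode: truncated Monte Carlo permutation sampling with a stopping criterion.
result = compute_shapley_values(
    u, done=MaxChecks(1000), mode=ShapleyMode.TruncatedMontecarlo, n_jobs=4
)

# Model-specific exact method for k-NN models; it ignores the stopping criterion.
knn_result = compute_shapley_values(u, mode=ShapleyMode.KNN, progress=True)
print(result.values, knn_result.values)
```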
"},{"location":"api/pydvl/value/shapley/gt/","title":"Gt","text":""},{"location":"api/pydvl/value/shapley/gt/#pydvl.value.shapley.gt","title":"pydvl.value.shapley.gt","text":"This module implements Group Testing for the approximation of Shapley values, as introduced in (Jia, R. et al., 2019)1. The sampling of index subsets is done in such a way that an approximation to the true Shapley values can be computed with guarantees.
Warning
This method is very inefficient. Potential improvements to the implementation notwithstanding, convergence seems to be very slow (in terms of evaluations of the utility required). We recommend other Monte Carlo methods instead.
You can read more in the documentation.
New in version 0.4.0
"},{"location":"api/pydvl/value/shapley/gt/#pydvl.value.shapley.gt--references","title":"References","text":" -
Jia, R. et al., 2019. Towards Efficient Data Valuation Based on the Shapley Value. In: Proceedings of the 22nd International Conference on Artificial Intelligence and Statistics, pp. 1167\u20131176. PMLR.
"},{"location":"api/pydvl/value/shapley/gt/#pydvl.value.shapley.gt.num_samples_eps_delta","title":"num_samples_eps_delta","text":"num_samples_eps_delta(\n eps: float, delta: float, n: int, utility_range: float\n) -> int\n
Implements the formula in Theorem 3 of (Jia, R. et al., 2019)1, which gives a lower bound on the number of samples required to obtain an (\u03b5/\u221an, \u03b4/(N(N-1)))-approximation to all pair-wise differences of Shapley values, w.r.t. the \\(\\ell_2\\) norm.
PARAMETER DESCRIPTION eps
\u03b5
TYPE: float
delta
\u03b4
TYPE: float
n
Number of data points
TYPE: int
utility_range
Range of the Utility function
TYPE: float
Returns: Number of samples from \\(2^{[n]}\\) guaranteeing \u03b5/\u221an-correct Shapley pair-wise differences of values with probability 1-\u03b4/(N(N-1)).
New in version 0.4.0
Source code in src/pydvl/value/shapley/gt.py
def num_samples_eps_delta(\n eps: float, delta: float, n: int, utility_range: float\n) -> int:\n r\"\"\"Implements the formula in Theorem 3 of (Jia, R. et al., 2019)<sup><a href=\"#jia_efficient_2019\">1</a></sup>\n which gives a lower bound on the number of samples required to obtain an\n (\u03b5/\u221an,\u03b4/(N(N-1))-approximation to all pair-wise differences of Shapley\n values, wrt. $\\ell_2$ norm.\n\n Args:\n eps: \u03b5\n delta: \u03b4\n n: Number of data points\n utility_range: Range of the [Utility][pydvl.utils.utility.Utility] function\n Returns:\n Number of samples from $2^{[n]}$ guaranteeing \u03b5/\u221an-correct Shapley\n pair-wise differences of values with probability 1-\u03b4/(N(N-1)).\n\n !!! tip \"New in version 0.4.0\"\n\n \"\"\"\n constants = _constants(n=n, epsilon=eps, delta=delta, utility_range=utility_range)\n return int(constants.T)\n
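A hedged sketch of how the bound can feed into group_testing_shapley (illustrative, not from the original reference): the dataset, model and (ε, δ) choices are placeholders, and the pydvl.utils imports are assumed.

```python
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Utility
from pydvl.value.shapley.gt import group_testing_shapley, num_samples_eps_delta

dataset = Dataset.from_sklearn(load_iris(), train_size=0.8)
u = Utility(LogisticRegression(max_iter=500), dataset, "accuracy")

eps, delta = 0.1, 0.05
# Accuracy lies in [0, 1], so the utility range is at most 1.
n_samples = num_samples_eps_delta(eps, delta, len(u.data), utility_range=1.0)
print(f"Theorem 3 bound: {n_samples} utility evaluations")

# The bound is often impractically large; a smaller budget drops the guarantee
# but still yields an estimate.
result = group_testing_shapley(
    u, n_samples=min(n_samples, 5000), epsilon=eps, delta=delta, n_jobs=4, seed=42
)
print(result.values)
```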
"},{"location":"api/pydvl/value/shapley/gt/#pydvl.value.shapley.gt.group_testing_shapley","title":"group_testing_shapley","text":"group_testing_shapley(\n u: Utility,\n n_samples: int,\n epsilon: float,\n delta: float,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n **options: dict\n) -> ValuationResult\n
Implements group testing for approximation of Shapley values as described in (Jia, R. et al., 2019)1.
Warning
This method is very inefficient. It requires several orders of magnitude more evaluations of the utility than others in montecarlo. It also uses several intermediate objects like the results from the runners and the constraint matrices which can become rather large.
By picking a specific distribution over subsets, the differences in Shapley values can be approximated with a Monte Carlo sum. These are then used to solve for the individual values in a feasibility problem.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
n_samples
Number of tests to perform. Use num_samples_eps_delta to estimate this.
TYPE: int
epsilon
From the (\u03b5,\u03b4) sample bound. Use the same as for the estimation of n_iterations
.
TYPE: float
delta
From the (\u03b5,\u03b4) sample bound. Use the same as for the estimation of n_iterations
.
TYPE: float
n_jobs
Number of parallel jobs to use. Each worker performs a chunk of all tests (i.e. utility evaluations).
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display progress bars for each job.
TYPE: bool
DEFAULT: False
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
options
Additional options to pass to cvxpy.Problem.solve(). E.g. to change the solver (which defaults to cvxpy.SCS
) pass solver=cvxpy.CVXOPT
.
TYPE: dict
DEFAULT: {}
RETURNS DESCRIPTION ValuationResult
Object with the data values.
New in version 0.4.0
Changed in version 0.5.0
Changed the solver to cvxpy instead of scipy's linprog. Added the ability to pass arbitrary options to it.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/shapley/gt.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef group_testing_shapley(\n u: Utility,\n n_samples: int,\n epsilon: float,\n delta: float,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n **options: dict,\n) -> ValuationResult:\n \"\"\"Implements group testing for approximation of Shapley values as described\n in (Jia, R. et al., 2019)<sup><a href=\"#jia_efficient_2019\">1</a></sup>.\n\n !!! Warning\n This method is very inefficient. It requires several orders of magnitude\n more evaluations of the utility than others in\n [montecarlo][pydvl.value.shapley.montecarlo]. It also uses several intermediate\n objects like the results from the runners and the constraint matrices\n which can become rather large.\n\n By picking a specific distribution over subsets, the differences in Shapley\n values can be approximated with a Monte Carlo sum. These are then used to\n solve for the individual values in a feasibility problem.\n\n Args:\n u: Utility object with model, data, and scoring function\n n_samples: Number of tests to perform. Use\n [num_samples_eps_delta][pydvl.value.shapley.gt.num_samples_eps_delta]\n to estimate this.\n epsilon: From the (\u03b5,\u03b4) sample bound. Use the same as for the\n estimation of `n_iterations`.\n delta: From the (\u03b5,\u03b4) sample bound. Use the same as for the\n estimation of `n_iterations`.\n n_jobs: Number of parallel jobs to use. Each worker performs a chunk\n of all tests (i.e. utility evaluations).\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display progress bars for each job.\n seed: Either an instance of a numpy random number generator or a seed for it.\n options: Additional options to pass to\n [cvxpy.Problem.solve()](https://www.cvxpy.org/tutorial/advanced/index.html#solve-method-options).\n E.g. to change the solver (which defaults to `cvxpy.SCS`) pass\n `solver=cvxpy.CVXOPT`.\n\n Returns:\n Object with the data values.\n\n !!! tip \"New in version 0.4.0\"\n\n !!! tip \"Changed in version 0.5.0\"\n Changed the solver to cvxpy instead of scipy's linprog. Added the ability\n to pass arbitrary options to it.\n\n !!! 
tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n\n n = len(u.data.indices)\n\n const = _constants(\n n=n,\n epsilon=epsilon,\n delta=delta,\n utility_range=u.score_range.max() - u.score_range.min(),\n )\n T = n_samples\n if T < const.T:\n log.warning(\n f\"n_samples of {T} are below the required {const.T} for the \"\n f\"\u03b5={epsilon:.02f} guarantee at \u03b4={1 - delta:.02f} probability\"\n )\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n\n samples_per_job = max(1, n_samples // parallel_backend.effective_n_jobs(n_jobs))\n\n def reducer(\n results_it: Iterable[Tuple[NDArray, NDArray]]\n ) -> Tuple[NDArray, NDArray]:\n return np.concatenate(list(x[0] for x in results_it)).astype(\n np.float_\n ), np.concatenate(list(x[1] for x in results_it)).astype(np.int_)\n\n seed_sequence = ensure_seed_sequence(seed)\n map_reduce_seed_sequence, cvxpy_seed = tuple(seed_sequence.spawn(2))\n\n map_reduce_job: MapReduceJob[Utility, Tuple[NDArray, NDArray]] = MapReduceJob(\n u,\n map_func=_group_testing_shapley,\n reduce_func=reducer,\n map_kwargs=dict(n_samples=samples_per_job, progress=progress),\n parallel_backend=parallel_backend,\n n_jobs=n_jobs,\n )\n uu, betas = map_reduce_job(seed=map_reduce_seed_sequence)\n\n # Matrix of estimated differences. See Eqs. (3) and (4) in the paper.\n C = np.zeros(shape=(n, n))\n for i in range(n):\n for j in range(i + 1, n):\n C[i, j] = np.dot(uu, betas[:, i] - betas[:, j])\n C *= const.Z / T\n total_utility = u(u.data.indices)\n\n ###########################################################################\n # Solution of the constraint problem with cvxpy\n\n v = cp.Variable(n)\n constraints = [cp.sum(v) == total_utility]\n for i in range(n):\n for j in range(i + 1, n):\n constraints.append(v[i] - v[j] <= epsilon + C[i, j])\n constraints.append(v[j] - v[i] <= epsilon - C[i, j])\n\n problem = cp.Problem(cp.Minimize(0), constraints)\n solver = options.pop(\"solver\", cp.SCS)\n problem.solve(solver=solver, **options)\n\n if problem.status != \"optimal\":\n log.warning(f\"cvxpy returned status {problem.status}\")\n values = (\n np.nan * np.ones_like(u.data.indices)\n if not hasattr(v.value, \"__len__\")\n else v.value\n )\n status = Status.Failed\n else:\n values = v.value\n status = Status.Converged\n\n return ValuationResult(\n algorithm=\"group_testing_shapley\",\n status=status,\n values=values,\n data_names=u.data.data_names,\n solver_status=problem.status,\n )\n
"},{"location":"api/pydvl/value/shapley/knn/","title":"Knn","text":""},{"location":"api/pydvl/value/shapley/knn/#pydvl.value.shapley.knn","title":"pydvl.value.shapley.knn","text":"This module contains Shapley computations for K-Nearest Neighbours.
Todo
Implement approximate KNN computation for sublinear complexity
"},{"location":"api/pydvl/value/shapley/knn/#pydvl.value.shapley.knn--references","title":"References","text":" -
Jia, R. et al., 2019. Efficient Task-Specific Data Valuation for Nearest Neighbor Algorithms. In: Proceedings of the VLDB Endowment, Vol. 12, No. 11, pp. 1610\u20131623.\u00a0\u21a9
"},{"location":"api/pydvl/value/shapley/knn/#pydvl.value.shapley.knn.knn_shapley","title":"knn_shapley","text":"knn_shapley(u: Utility, *, progress: bool = True) -> ValuationResult\n
Computes exact Shapley values for a KNN classifier.
This implements the method described in (Jia, R. et al., 2019)1. It exploits the local structure of K-Nearest Neighbours to reduce the number of calls to the utility function to a constant number per index, thus reducing computation time to \\(O(n)\\).
PARAMETER DESCRIPTION u
Utility with a KNN model to extract parameters from. The object will not be modified nor used other than to call get_params()
TYPE: Utility
progress
Whether to display a progress bar.
TYPE: bool
DEFAULT: True
RETURNS DESCRIPTION ValuationResult
Object with the data values.
RAISES DESCRIPTION TypeError
If the model in the utility is not a sklearn.neighbors.KNeighborsClassifier.
New in version 0.1.0
Source code in src/pydvl/value/shapley/knn.py
def knn_shapley(u: Utility, *, progress: bool = True) -> ValuationResult:\n \"\"\"Computes exact Shapley values for a KNN classifier.\n\n This implements the method described in (Jia, R. et al., 2019)<sup><a href=\"#jia_efficient_2019a\">1</a></sup>.\n It exploits the local structure of K-Nearest Neighbours to reduce the number\n of calls to the utility function to a constant number per index, thus\n reducing computation time to $O(n)$.\n\n Args:\n u: Utility with a KNN model to extract parameters from. The object\n will not be modified nor used other than to call [get_params()](\n <https://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html#sklearn.base.BaseEstimator.get_params>)\n progress: Whether to display a progress bar.\n\n Returns:\n Object with the data values.\n\n Raises:\n TypeError: If the model in the utility is not a\n [sklearn.neighbors.KNeighborsClassifier][].\n\n !!! tip \"New in version 0.1.0\"\n\n \"\"\"\n if not isinstance(u.model, KNeighborsClassifier):\n raise TypeError(\"KNN Shapley requires a K-Nearest Neighbours model\")\n\n defaults: Dict[str, Union[int, str]] = {\n \"algorithm\": \"ball_tree\" if u.data.dim >= 20 else \"kd_tree\",\n \"metric\": \"minkowski\",\n \"p\": 2,\n }\n defaults.update(u.model.get_params())\n # HACK: NearestNeighbors doesn't support this. There will be more...\n del defaults[\"weights\"]\n n_neighbors: int = int(defaults[\"n_neighbors\"])\n defaults[\"n_neighbors\"] = len(u.data) # We want all training points sorted\n\n assert n_neighbors < len(u.data)\n # assert data.target_dim == 1\n\n nns = NearestNeighbors(**defaults).fit(u.data.x_train)\n # closest to farthest\n _, indices = nns.kneighbors(u.data.x_test)\n\n values: NDArray[np.float_] = np.zeros_like(u.data.indices, dtype=np.float_)\n n = len(u.data)\n yt = u.data.y_train\n iterator = enumerate(zip(u.data.y_test, indices), start=1)\n for j, (y, ii) in tqdm(iterator, disable=not progress):\n value_at_x = int(yt[ii[-1]] == y) / n\n values[ii[-1]] += (value_at_x - values[ii[-1]]) / j\n for i in range(n - 2, n_neighbors, -1): # farthest to closest\n value_at_x = (\n values[ii[i + 1]] + (int(yt[ii[i]] == y) - int(yt[ii[i + 1]] == y)) / i\n )\n values[ii[i]] += (value_at_x - values[ii[i]]) / j\n for i in range(n_neighbors, -1, -1): # farthest to closest\n value_at_x = (\n values[ii[i + 1]]\n + (int(yt[ii[i]] == y) - int(yt[ii[i + 1]] == y)) / n_neighbors\n )\n values[ii[i]] += (value_at_x - values[ii[i]]) / j\n\n return ValuationResult(\n algorithm=\"knn_shapley\",\n status=Status.Converged,\n values=values,\n data_names=u.data.data_names,\n )\n
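A short usage sketch for knn_shapley. As required above, the utility must wrap a sklearn KNeighborsClassifier; the iris data is simply an illustrative choice.

```python
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

from pydvl.utils import Dataset, Scorer, Utility
from pydvl.value.shapley.knn import knn_shapley

X, y = load_iris(return_X_y=True)
data = Dataset.from_arrays(X, y, random_state=16)
u = Utility(KNeighborsClassifier(n_neighbors=5), data, Scorer("accuracy", default=0.0))

# Exact values in O(n) thanks to the local structure of KNN
values = knn_shapley(u, progress=False)
print(values.to_dataframe().head())
```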
"},{"location":"api/pydvl/value/shapley/montecarlo/","title":"Montecarlo","text":""},{"location":"api/pydvl/value/shapley/montecarlo/#pydvl.value.shapley.montecarlo","title":"pydvl.value.shapley.montecarlo","text":"Monte Carlo approximations to Shapley Data values.
Warning
You probably want to use the common interface provided by compute_shapley_values() instead of directly using the functions in this module.
Because exact computation of Shapley values requires \\(\\mathcal{O}(2^n)\\) re-trainings of the model, several Monte Carlo approximations are available. The first two sample from the powerset of the training data directly: combinatorial_montecarlo_shapley() and owen_sampling_shapley(). The latter uses a reformulation in terms of a continuous extension of the utility.
Alternatively, using a reformulation of the expression above as a sum over permutations, one has the implementation in permutation_montecarlo_shapley(), with the option to pass an early stopping strategy to reduce computation, as done in Truncated Monte Carlo Shapley (TMCS).
Also see
It is also possible to use group_testing_shapley() to reduce the number of evaluations of the utility. The method is however typically outperformed by others in this module.
Also see
Additionally, you can consider grouping your data points using GroupedDataset and computing the values of the groups instead. This is not to be confused with \"group testing\" as implemented in group_testing_shapley(): any of the algorithms mentioned above, including Group Testing, can work to valuate groups of samples as units.
"},{"location":"api/pydvl/value/shapley/montecarlo/#pydvl.value.shapley.montecarlo--references","title":"References","text":" -
Ghorbani, A., Zou, J., 2019. Data Shapley: Equitable Valuation of Data for Machine Learning. In: Proceedings of the 36th International Conference on Machine Learning, PMLR, pp. 2242\u20132251.\u00a0\u21a9
"},{"location":"api/pydvl/value/shapley/montecarlo/#pydvl.value.shapley.montecarlo.permutation_montecarlo_shapley","title":"permutation_montecarlo_shapley","text":"permutation_montecarlo_shapley(\n u: Utility,\n done: StoppingCriterion,\n *,\n truncation: TruncationPolicy = NoTruncation(),\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Computes an approximate Shapley value by sampling independent permutations of the index set, approximating the sum:
\\[ v_u(x_i) = \\frac{1}{n!} \\sum_{\\sigma \\in \\Pi(n)} \\tilde{w}( | \\sigma_{:i} | )[u(\\sigma_{:i} \\cup \\{i\\}) \u2212 u(\\sigma_{:i})], \\] where \\(\\sigma_{:i}\\) denotes the set of indices in permutation sigma before the position where \\(i\\) appears (see [[data-valuation]] for details).
This implements the method described in (Ghorbani and Zou, 2019)1 with a double stopping criterion.
Todo
Think of how to add Gelman-Rubin or some other more principled stopping criterion.
Instead of naively implementing the expectation, we sequentially add points to coalitions from a permutation and incrementally compute marginal utilities. We stop computing marginals for a given permutation based on a TruncationPolicy. (Ghorbani and Zou, 2019)1 mention two policies: one that stops after a certain fraction of marginals are computed, implemented in FixedTruncation, and one that stops if the last computed utility (\"score\") is close to the total utility using the standard deviation of the utility as a measure of proximity, implemented in BootstrapTruncation.
We keep sampling permutations and updating all shapley values until the StoppingCriterion returns True
.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
done
function checking whether computation must stop.
TYPE: StoppingCriterion
truncation
An optional callable which decides whether to interrupt processing a permutation and set all subsequent marginals to zero. Typically used to stop computation when the marginal is small.
TYPE: TruncationPolicy
DEFAULT: NoTruncation()
n_jobs
number of jobs across which to distribute the computation.
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display a progress bar.
TYPE: bool
DEFAULT: False
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION ValuationResult
Object with the data values.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/shapley/montecarlo.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef permutation_montecarlo_shapley(\n u: Utility,\n done: StoppingCriterion,\n *,\n truncation: TruncationPolicy = NoTruncation(),\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> ValuationResult:\n r\"\"\"Computes an approximate Shapley value by sampling independent\n permutations of the index set, approximating the sum:\n\n $$\n v_u(x_i) = \\frac{1}{n!} \\sum_{\\sigma \\in \\Pi(n)}\n \\tilde{w}( | \\sigma_{:i} | )[u(\\sigma_{:i} \\cup \\{i\\}) \u2212 u(\\sigma_{:i})],\n $$\n\n where $\\sigma_{:i}$ denotes the set of indices in permutation sigma before\n the position where $i$ appears (see [[data-valuation]] for details).\n\n This implements the method described in (Ghorbani and Zou, 2019)<sup><a\n href=\"#ghorbani_data_2019\">1</a></sup> with a double stopping criterion.\n\n !!! Todo\n Think of how to add Robin-Gelman or some other more principled stopping\n criterion.\n\n Instead of naively implementing the expectation, we sequentially add points\n to coalitions from a permutation and incrementally compute marginal utilities.\n We stop computing marginals for a given permutation based on a\n [TruncationPolicy][pydvl.value.shapley.truncated.TruncationPolicy].\n (Ghorbani and Zou, 2019)<sup><a href=\"#ghorbani_data_2019\">1</a></sup>\n mention two policies: one that stops after a certain\n fraction of marginals are computed, implemented in\n [FixedTruncation][pydvl.value.shapley.truncated.FixedTruncation],\n and one that stops if the last computed utility (\"score\") is close to the\n total utility using the standard deviation of the utility as a measure of\n proximity, implemented in\n [BootstrapTruncation][pydvl.value.shapley.truncated.BootstrapTruncation].\n\n We keep sampling permutations and updating all shapley values\n until the [StoppingCriterion][pydvl.value.stopping.StoppingCriterion] returns\n `True`.\n\n Args:\n u: Utility object with model, data, and scoring function.\n done: function checking whether computation must stop.\n truncation: An optional callable which decides whether to interrupt\n processing a permutation and set all subsequent marginals to\n zero. Typically used to stop computation when the marginal is small.\n n_jobs: number of jobs across which to distribute the computation.\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display a progress bar.\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n Object with the data values.\n\n !!! 
tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n algorithm = \"permutation_montecarlo_shapley\"\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n u = parallel_backend.put(u)\n max_workers = parallel_backend.effective_n_jobs(n_jobs)\n n_submitted_jobs = 2 * max_workers # number of jobs in the executor's queue\n\n seed_sequence = ensure_seed_sequence(seed)\n result = ValuationResult.zeros(\n algorithm=algorithm, indices=u.data.indices, data_names=u.data.data_names\n )\n\n pbar = tqdm(disable=not progress, total=100, unit=\"%\")\n\n with parallel_backend.executor(\n max_workers=max_workers, cancel_futures=CancellationPolicy.ALL\n ) as executor:\n pending: set[Future] = set()\n while True:\n pbar.n = 100 * done.completion()\n pbar.refresh()\n\n completed, pending = wait(pending, timeout=1.0, return_when=FIRST_COMPLETED)\n for future in completed:\n result += future.result()\n # we could check outside the loop, but that means more\n # submissions if the stopping criterion is unstable\n if done(result):\n return result\n\n # Ensure that we always have n_submitted_jobs in the queue or running\n n_remaining_slots = n_submitted_jobs - len(pending)\n seeds = seed_sequence.spawn(n_remaining_slots)\n for i in range(n_remaining_slots):\n future = executor.submit(\n _permutation_montecarlo_one_step,\n u,\n truncation,\n algorithm,\n seed=seeds[i],\n )\n pending.add(future)\n
"},{"location":"api/pydvl/value/shapley/montecarlo/#pydvl.value.shapley.montecarlo.combinatorial_montecarlo_shapley","title":"combinatorial_montecarlo_shapley","text":"combinatorial_montecarlo_shapley(\n u: Utility,\n done: StoppingCriterion,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Computes an approximate Shapley value using the combinatorial definition:
\\[v_u(i) = \\frac{1}{n} \\sum_{S \\subseteq N \\setminus \\{i\\}} \\binom{n-1}{ | S | }^{-1} [u(S \\cup \\{i\\}) \u2212 u(S)]\\] This consists of randomly sampling subsets of the power set of the training indices in u.data, and computing their marginal utilities. See Data valuation for details.
Note that because sampling is done with replacement, the approximation is poor even for \\(2^{m}\\) subsets with \\(m>n\\), even though there are \\(2^{n-1}\\) subsets for each \\(i\\). Prefer permutation_montecarlo_shapley().
Parallelization is done by splitting the set of indices across processes and computing the sum over subsets \\(S \\subseteq N \\setminus \\{i\\}\\) separately.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
done
Stopping criterion for the computation.
TYPE: StoppingCriterion
n_jobs
number of parallel jobs across which to distribute the computation. Each worker receives a chunk of indices
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display progress bars for each job.
TYPE: bool
DEFAULT: False
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION ValuationResult
Object with the data values.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/shapley/montecarlo.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef combinatorial_montecarlo_shapley(\n u: Utility,\n done: StoppingCriterion,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> ValuationResult:\n r\"\"\"Computes an approximate Shapley value using the combinatorial\n definition:\n\n $$v_u(i) = \\frac{1}{n} \\sum_{S \\subseteq N \\setminus \\{i\\}}\n \\binom{n-1}{ | S | }^{-1} [u(S \\cup \\{i\\}) \u2212 u(S)]$$\n\n This consists of randomly sampling subsets of the power set of the training\n indices in [u.data][pydvl.utils.utility.Utility], and computing their\n marginal utilities. See [Data valuation][data-valuation] for details.\n\n Note that because sampling is done with replacement, the approximation is\n poor even for $2^{m}$ subsets with $m>n$, even though there are $2^{n-1}$\n subsets for each $i$. Prefer\n [permutation_montecarlo_shapley()][pydvl.value.shapley.montecarlo.permutation_montecarlo_shapley].\n\n Parallelization is done by splitting the set of indices across processes and\n computing the sum over subsets $S \\subseteq N \\setminus \\{i\\}$ separately.\n\n Args:\n u: Utility object with model, data, and scoring function\n done: Stopping criterion for the computation.\n n_jobs: number of parallel jobs across which to distribute the\n computation. Each worker receives a chunk of\n [indices][pydvl.utils.dataset.Dataset.indices]\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display progress bars for each job.\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n Object with the data values.\n\n !!! tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n\n map_reduce_job: MapReduceJob[NDArray, ValuationResult] = MapReduceJob(\n u.data.indices,\n map_func=_combinatorial_montecarlo_shapley,\n reduce_func=lambda results: reduce(operator.add, results),\n map_kwargs=dict(u=u, done=done, progress=progress),\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n )\n return map_reduce_job(seed=seed)\n
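For comparison, a call to combinatorial_montecarlo_shapley is analogous; as noted above, the permutation-based sampler is usually the better choice. Dataset and model are again illustrative.

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Scorer, Utility
from pydvl.value.shapley.montecarlo import combinatorial_montecarlo_shapley
from pydvl.value.stopping import MaxUpdates

X, y = make_classification(n_samples=100, random_state=5)
data = Dataset.from_arrays(X, y, random_state=5)
u = Utility(LogisticRegression(max_iter=1000), data, Scorer("accuracy", default=0.0))

# Samples subsets of the powerset directly; each worker handles a chunk of indices
values = combinatorial_montecarlo_shapley(u, done=MaxUpdates(500), n_jobs=2, seed=5)
```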
"},{"location":"api/pydvl/value/shapley/naive/","title":"Naive","text":""},{"location":"api/pydvl/value/shapley/naive/#pydvl.value.shapley.naive","title":"pydvl.value.shapley.naive","text":"This module implements exact Shapley values using either the combinatorial or permutation definition.
The exact computation of \\(n\\) values takes \\(\\mathcal{O}(2^n)\\) evaluations of the utility and is therefore only possible for small datasets. For larger datasets, consider using any of the approximations, such as Monte Carlo, or proxy models like kNN.
See Data valuation for details.
"},{"location":"api/pydvl/value/shapley/naive/#pydvl.value.shapley.naive.permutation_exact_shapley","title":"permutation_exact_shapley","text":"permutation_exact_shapley(\n u: Utility, *, progress: bool = True\n) -> ValuationResult\n
Computes the exact Shapley value using the formulation with permutations:
\\[v_u(x_i) = \\frac{1}{n!} \\sum_{\\sigma \\in \\Pi(n)} [u(\\sigma_{i-1} \\cup {i}) \u2212 u(\\sigma_{i})].\\] See Data valuation for details.
When the length of the training set is > 10 this prints a warning since the computation becomes too expensive. Used mostly for internal testing and simple use cases. Please refer to the Monte Carlo approximations for practical applications.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
progress
Whether to display progress bars for each job.
TYPE: bool
DEFAULT: True
RETURNS DESCRIPTION ValuationResult
Object with the data values.
Source code in src/pydvl/value/shapley/naive.py
def permutation_exact_shapley(u: Utility, *, progress: bool = True) -> ValuationResult:\n r\"\"\"Computes the exact Shapley value using the formulation with permutations:\n\n $$v_u(x_i) = \\frac{1}{n!} \\sum_{\\sigma \\in \\Pi(n)} [u(\\sigma_{i-1}\n \\cup {i}) \u2212 u(\\sigma_{i})].$$\n\n See [Data valuation][data-valuation] for details.\n\n When the length of the training set is > 10 this prints a warning since the\n computation becomes too expensive. Used mostly for internal testing and\n simple use cases. Please refer to the [Monte Carlo\n approximations][pydvl.value.shapley.montecarlo] for practical applications.\n\n Args:\n u: Utility object with model, data, and scoring function\n progress: Whether to display progress bars for each job.\n\n Returns:\n Object with the data values.\n \"\"\"\n\n n = len(u.data)\n # Note that the cache in utility saves most of the refitting because we\n # use frozenset for the input.\n if n > 10:\n warnings.warn(\n f\"Large dataset! Computation requires {n}! calls to utility()\",\n RuntimeWarning,\n )\n\n values = np.zeros(n)\n for p in tqdm(\n permutations(u.data.indices),\n disable=not progress,\n desc=\"Permutation\",\n total=math.factorial(n),\n ):\n for i, idx in enumerate(p):\n values[idx] += u(p[: i + 1]) - u(p[:i])\n values /= math.factorial(n)\n\n return ValuationResult(\n algorithm=\"permutation_exact_shapley\",\n status=Status.Converged,\n values=values,\n data_names=u.data.data_names,\n )\n
"},{"location":"api/pydvl/value/shapley/naive/#pydvl.value.shapley.naive.combinatorial_exact_shapley","title":"combinatorial_exact_shapley","text":"combinatorial_exact_shapley(\n u: Utility,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False\n) -> ValuationResult\n
Computes the exact Shapley value using the combinatorial definition.
\\[v_u(i) = \\frac{1}{n} \\sum_{S \\subseteq N \\setminus \\{i\\}} \\binom{n-1}{ | S | }^{-1} [u(S \\cup \\{i\\}) \u2212 u(S)].\\] See Data valuation for details.
Note
If the length of the training set is > n_jobs*20 this prints a warning because the computation is very expensive. Used mostly for internal testing and simple use cases. Please refer to the Monte Carlo approximations for practical applications.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
n_jobs
Number of parallel jobs to use
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display progress bars for each job.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION ValuationResult
Object with the data values.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/shapley/naive.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef combinatorial_exact_shapley(\n u: Utility,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n) -> ValuationResult:\n r\"\"\"Computes the exact Shapley value using the combinatorial definition.\n\n $$v_u(i) = \\frac{1}{n} \\sum_{S \\subseteq N \\setminus \\{i\\}}\n \\binom{n-1}{ | S | }^{-1} [u(S \\cup \\{i\\}) \u2212 u(S)].$$\n\n See [Data valuation][data-valuation] for details.\n\n !!! Note\n If the length of the training set is > n_jobs*20 this prints a warning\n because the computation is very expensive. Used mostly for internal\n testing and simple use cases. Please refer to the\n [Monte Carlo][pydvl.value.shapley.montecarlo] approximations for\n practical applications.\n\n Args:\n u: Utility object with model, data, and scoring function\n n_jobs: Number of parallel jobs to use\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display progress bars for each job.\n\n Returns:\n Object with the data values.\n\n !!! tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n # Arbitrary choice, will depend on time required, caching, etc.\n if len(u.data) // n_jobs > 20:\n warnings.warn(\n f\"Large dataset! Computation requires 2^{len(u.data)} calls to model.fit()\"\n )\n\n def reduce_fun(results: List[NDArray]) -> NDArray:\n return np.array(results).sum(axis=0) # type: ignore\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n\n map_reduce_job: MapReduceJob[NDArray, NDArray] = MapReduceJob(\n u.data.indices,\n map_func=_combinatorial_exact_shapley,\n map_kwargs=dict(u=u, progress=progress),\n reduce_func=reduce_fun,\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n )\n values = map_reduce_job()\n return ValuationResult(\n algorithm=\"combinatorial_exact_shapley\",\n status=Status.Converged,\n values=values,\n data_names=u.data.data_names,\n )\n
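As a sanity check, the two exact formulations should agree up to numerical error. This sketch keeps the dataset tiny so that the factorial and powerset enumerations stay cheap; the model and sizes are illustrative.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Scorer, Utility
from pydvl.value.shapley.naive import (
    combinatorial_exact_shapley,
    permutation_exact_shapley,
)

# Tiny dataset: exact computation needs n! permutations / 2^n subsets
X, y = make_classification(n_samples=8, n_features=4, n_redundant=0, random_state=7)
data = Dataset.from_arrays(X, y, random_state=7)
u = Utility(LogisticRegression(max_iter=1000), data, Scorer("accuracy", default=0.0))

v_perm = permutation_exact_shapley(u, progress=False)
v_comb = combinatorial_exact_shapley(u, n_jobs=1, progress=False)
print(np.allclose(v_perm.values, v_comb.values, atol=1e-6))
```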
"},{"location":"api/pydvl/value/shapley/owen/","title":"Owen","text":""},{"location":"api/pydvl/value/shapley/owen/#pydvl.value.shapley.owen","title":"pydvl.value.shapley.owen","text":""},{"location":"api/pydvl/value/shapley/owen/#pydvl.value.shapley.owen--references","title":"References","text":" -
Okhrati, R., Lipani, A., 2021. A Multilinear Sampling Algorithm to Estimate Shapley Values. In: 2020 25th International Conference on Pattern Recognition (ICPR), pp. 7992\u20137999. IEEE.\u00a0\u21a9
"},{"location":"api/pydvl/value/shapley/owen/#pydvl.value.shapley.owen.OwenAlgorithm","title":"OwenAlgorithm","text":" Bases: Enum
Choices for the Owen sampling method.
ATTRIBUTE DESCRIPTION Standard
Use q \u2208 [0, 1]
Antithetic
Use q \u2208 [0, 0.5] and correlated samples
"},{"location":"api/pydvl/value/shapley/owen/#pydvl.value.shapley.owen.owen_sampling_shapley","title":"owen_sampling_shapley","text":"owen_sampling_shapley(\n u: Utility,\n n_samples: int,\n max_q: int,\n *,\n method: OwenAlgorithm = OwenAlgorithm.Standard,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Owen sampling of Shapley values as described in (Okhrati and Lipani, 2021)1.
This function computes a Monte Carlo approximation to
\\[v_u(i) = \\int_0^1 \\mathbb{E}_{S \\sim P_q(D_{\\backslash \\{i\\}})} [u(S \\cup \\{i\\}) - u(S)]\\] using one of two methods. The first one, selected with the argument mode = OwenAlgorithm.Standard
, approximates the integral with:
\\[\\hat{v}_u(i) = \\frac{1}{Q M} \\sum_{j=0}^Q \\sum_{m=1}^M [u(S^{(q_j)}_m \\cup \\{i\\}) - u(S^{(q_j)}_m)],\\] where \\(q_j = \\frac{j}{Q} \\in [0,1]\\) and the sets \\(S^{(q_j)}\\) are such that a sample \\(x \\in S^{(q_j)}\\) if a draw from a \\(Ber(q_j)\\) distribution is 1.
The second method, selected with the argument mode = OwenAlgorithm.Antithetic
, uses correlated samples in the inner sum to reduce the variance:
\\[\\hat{v}_u(i) = \\frac{1}{2 Q M} \\sum_{j=0}^Q \\sum_{m=1}^M [u(S^{(q_j)}_m \\cup \\{i\\}) - u(S^{(q_j)}_m) + u((S^{(q_j)}_m)^c \\cup \\{i\\}) - u((S^{( q_j)}_m)^c)],\\] where now \\(q_j = \\frac{j}{2Q} \\in [0,\\frac{1}{2}]\\), and \\(S^c\\) is the complement of \\(S\\).
Note
The outer integration could be done instead with a quadrature rule.
PARAMETER DESCRIPTION u
Utility object holding data, model and scoring function.
TYPE: Utility
n_samples
Number of sets to sample for each value of q
TYPE: int
max_q
Number of subdivisions for q \u2208 [0,1] (the element sampling probability) used to approximate the outer integral.
TYPE: int
method
Selects the algorithm to use, see the description. Either OwenAlgorithm.Standard for \\(q \\in [0,1]\\) or OwenAlgorithm.Antithetic for \\(q \\in [0,0.5]\\) with correlated samples
TYPE: OwenAlgorithm
DEFAULT: Standard
n_jobs
Number of parallel jobs to use. Each worker receives a chunk of the total of max_q
values for q.
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display progress bars for each job.
TYPE: bool
DEFAULT: False
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION ValuationResult
Object with the data values.
New in version 0.3.0
Changed in version 0.5.0
Support for parallel computation and enable antithetic sampling.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/shapley/owen.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef owen_sampling_shapley(\n u: Utility,\n n_samples: int,\n max_q: int,\n *,\n method: OwenAlgorithm = OwenAlgorithm.Standard,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult:\n r\"\"\"Owen sampling of Shapley values as described in\n (Okhrati and Lipani, 2021)<sup><a href=\"#okhrati_multilinear_2021\">1</a></sup>.\n\n This function computes a Monte Carlo approximation to\n\n $$v_u(i) = \\int_0^1 \\mathbb{E}_{S \\sim P_q(D_{\\backslash \\{i\\}})}\n [u(S \\cup \\{i\\}) - u(S)]$$\n\n using one of two methods. The first one, selected with the argument ``mode =\n OwenAlgorithm.Standard``, approximates the integral with:\n\n $$\\hat{v}_u(i) = \\frac{1}{Q M} \\sum_{j=0}^Q \\sum_{m=1}^M [u(S^{(q_j)}_m\n \\cup \\{i\\}) - u(S^{(q_j)}_m)],$$\n\n where $q_j = \\frac{j}{Q} \\in [0,1]$ and the sets $S^{(q_j)}$ are such that a\n sample $x \\in S^{(q_j)}$ if a draw from a $Ber(q_j)$ distribution is 1.\n\n The second method, selected with the argument ``mode =\n OwenAlgorithm.Antithetic``, uses correlated samples in the inner sum to\n reduce the variance:\n\n $$\\hat{v}_u(i) = \\frac{1}{2 Q M} \\sum_{j=0}^Q \\sum_{m=1}^M [u(S^{(q_j)}_m\n \\cup \\{i\\}) - u(S^{(q_j)}_m) + u((S^{(q_j)}_m)^c \\cup \\{i\\}) - u((S^{(\n q_j)}_m)^c)],$$\n\n where now $q_j = \\frac{j}{2Q} \\in [0,\\frac{1}{2}]$, and $S^c$ is the\n complement of $S$.\n\n !!! Note\n The outer integration could be done instead with a quadrature rule.\n\n Args:\n u: [Utility][pydvl.utils.utility.Utility] object holding data, model\n and scoring function.\n n_samples: Numer of sets to sample for each value of q\n max_q: Number of subdivisions for q \u2208 [0,1] (the element sampling\n probability) used to approximate the outer integral.\n method: Selects the algorithm to use, see the description. Either\n [OwenAlgorithm.Full][pydvl.value.shapley.owen.OwenAlgorithm] for\n $q \\in [0,1]$ or\n [OwenAlgorithm.Halved][pydvl.value.shapley.owen.OwenAlgorithm] for\n $q \\in [0,0.5]$ and correlated samples\n n_jobs: Number of parallel jobs to use. Each worker receives a chunk\n of the total of `max_q` values for q.\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display progress bars for each job.\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n Object with the data values.\n\n !!! tip \"New in version 0.3.0\"\n\n !!! tip \"Changed in version 0.5.0\"\n Support for parallel computation and enable antithetic sampling.\n\n !!! 
tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n\n \"\"\"\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n\n map_reduce_job: MapReduceJob[NDArray, ValuationResult] = MapReduceJob(\n u.data.indices,\n map_func=_owen_sampling_shapley,\n reduce_func=lambda results: reduce(operator.add, results),\n map_kwargs=dict(\n u=u,\n method=OwenAlgorithm(method),\n n_samples=n_samples,\n max_q=max_q,\n progress=progress,\n ),\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n )\n\n return map_reduce_job(seed=seed)\n
"},{"location":"api/pydvl/value/shapley/truncated/","title":"Truncated","text":""},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated","title":"pydvl.value.shapley.truncated","text":""},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated--references","title":"References","text":" -
Ghorbani, A., Zou, J., 2019. Data Shapley: Equitable Valuation of Data for Machine Learning. In: Proceedings of the 36th International Conference on Machine Learning, PMLR, pp. 2242\u20132251.\u00a0\u21a9
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.TruncationPolicy","title":"TruncationPolicy","text":"TruncationPolicy()\n
Bases: ABC
A policy for deciding whether to stop computing marginals in a permutation.
Statistics are kept on the number of calls and truncations as n_calls
and n_truncations
respectively.
ATTRIBUTE DESCRIPTION n_calls
Number of calls to the policy.
TYPE: int
n_truncations
Number of truncations made by the policy.
TYPE: int
Todo
Because the policy objects are copied to the workers, the statistics are not accessible from the coordinating process. We need to add methods for this.
Source code in src/pydvl/value/shapley/truncated.py
def __init__(self) -> None:\n self.n_calls: int = 0\n self.n_truncations: int = 0\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.TruncationPolicy.reset","title":"reset abstractmethod
","text":"reset(u: Optional[Utility] = None)\n
Reset the policy to a state ready for a new permutation.
Source code in src/pydvl/value/shapley/truncated.py
@abc.abstractmethod\ndef reset(self, u: Optional[Utility] = None):\n \"\"\"Reset the policy to a state ready for a new permutation.\"\"\"\n ...\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.TruncationPolicy.__call__","title":"__call__","text":"__call__(idx: int, score: float) -> bool\n
Check whether the computation should be interrupted.
PARAMETER DESCRIPTION idx
Position in the permutation currently being computed.
TYPE: int
score
Last utility computed.
TYPE: float
RETURNS DESCRIPTION bool
True
if the computation should be interrupted.
Source code in src/pydvl/value/shapley/truncated.py
def __call__(self, idx: int, score: float) -> bool:\n \"\"\"Check whether the computation should be interrupted.\n\n Args:\n idx: Position in the permutation currently being computed.\n score: Last utility computed.\n\n Returns:\n `True` if the computation should be interrupted.\n \"\"\"\n ret = self._check(idx, score)\n self.n_calls += 1\n self.n_truncations += 1 if ret else 0\n return ret\n
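The concrete policies below all follow the same pattern: the shared __call__ shown above delegates the decision to a _check method implemented by each subclass, together with reset for per-permutation state. As a rough sketch of that pattern (a hypothetical policy, not part of the library), a policy that truncates once the last computed utility reaches a fixed threshold could look like this:

```python
from typing import Optional

from pydvl.utils import Utility
from pydvl.value.shapley.truncated import TruncationPolicy


class ThresholdTruncation(TruncationPolicy):
    """Hypothetical policy: truncate a permutation once the last computed
    utility ("score") reaches a fixed absolute threshold."""

    def __init__(self, threshold: float):
        super().__init__()
        self.threshold = threshold

    def reset(self, u: Optional[Utility] = None):
        pass  # no per-permutation state to clear

    def _check(self, idx: int, score: float) -> bool:
        return score >= self.threshold
```

Such a policy would then be passed as truncation=ThresholdTruncation(0.9) to permutation_montecarlo_shapley.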
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.NoTruncation","title":"NoTruncation","text":"NoTruncation()\n
Bases: TruncationPolicy
A policy which never interrupts the computation.
Source code in src/pydvl/value/shapley/truncated.py
def __init__(self) -> None:\n self.n_calls: int = 0\n self.n_truncations: int = 0\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.NoTruncation.__call__","title":"__call__","text":"__call__(idx: int, score: float) -> bool\n
Check whether the computation should be interrupted.
PARAMETER DESCRIPTION idx
Position in the permutation currently being computed.
TYPE: int
score
Last utility computed.
TYPE: float
RETURNS DESCRIPTION bool
True
if the computation should be interrupted.
Source code in src/pydvl/value/shapley/truncated.py
def __call__(self, idx: int, score: float) -> bool:\n \"\"\"Check whether the computation should be interrupted.\n\n Args:\n idx: Position in the permutation currently being computed.\n score: Last utility computed.\n\n Returns:\n `True` if the computation should be interrupted.\n \"\"\"\n ret = self._check(idx, score)\n self.n_calls += 1\n self.n_truncations += 1 if ret else 0\n return ret\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.FixedTruncation","title":"FixedTruncation","text":"FixedTruncation(u: Utility, fraction: float)\n
Bases: TruncationPolicy
Break a permutation after computing a fixed number of marginals.
The experiments in Appendix B of (Ghorbani and Zou, 2019)1 show that when the training set size is large enough, one can simply truncate the iteration over permutations after a fixed number of steps. This happens because beyond a certain number of samples in a training set, the model becomes insensitive to new ones. Alas, this strongly depends on the data distribution and the model and there is no automatic way of estimating this number.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
fraction
Fraction of marginals in a permutation to compute before stopping (e.g. 0.5 to compute half of the marginals).
TYPE: float
Source code in src/pydvl/value/shapley/truncated.py
def __init__(self, u: Utility, fraction: float):\n super().__init__()\n if fraction <= 0 or fraction > 1:\n raise ValueError(\"fraction must be in (0, 1]\")\n self.max_marginals = len(u.data) * fraction\n self.count = 0\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.FixedTruncation.__call__","title":"__call__","text":"__call__(idx: int, score: float) -> bool\n
Check whether the computation should be interrupted.
PARAMETER DESCRIPTION idx
Position in the permutation currently being computed.
TYPE: int
score
Last utility computed.
TYPE: float
RETURNS DESCRIPTION bool
True
if the computation should be interrupted.
Source code in src/pydvl/value/shapley/truncated.py
def __call__(self, idx: int, score: float) -> bool:\n \"\"\"Check whether the computation should be interrupted.\n\n Args:\n idx: Position in the permutation currently being computed.\n score: Last utility computed.\n\n Returns:\n `True` if the computation should be interrupted.\n \"\"\"\n ret = self._check(idx, score)\n self.n_calls += 1\n self.n_truncations += 1 if ret else 0\n return ret\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.RelativeTruncation","title":"RelativeTruncation","text":"RelativeTruncation(u: Utility, rtol: float)\n
Bases: TruncationPolicy
Break a permutation if the marginal utility is too low.
This is called \"performance tolerance\" in (Ghorbani and Zou, 2019)1.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
rtol
Relative tolerance. The permutation is broken if the last computed utility is less than total_utility * rtol
.
TYPE: float
Source code in src/pydvl/value/shapley/truncated.py
def __init__(self, u: Utility, rtol: float):\n super().__init__()\n self.rtol = rtol\n logger.info(\"Computing total utility for permutation truncation.\")\n self.total_utility = self.reset(u)\n self._u = u\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.RelativeTruncation.__call__","title":"__call__","text":"__call__(idx: int, score: float) -> bool\n
Check whether the computation should be interrupted.
PARAMETER DESCRIPTION idx
Position in the permutation currently being computed.
TYPE: int
score
Last utility computed.
TYPE: float
RETURNS DESCRIPTION bool
True
if the computation should be interrupted.
Source code in src/pydvl/value/shapley/truncated.py
def __call__(self, idx: int, score: float) -> bool:\n \"\"\"Check whether the computation should be interrupted.\n\n Args:\n idx: Position in the permutation currently being computed.\n score: Last utility computed.\n\n Returns:\n `True` if the computation should be interrupted.\n \"\"\"\n ret = self._check(idx, score)\n self.n_calls += 1\n self.n_truncations += 1 if ret else 0\n return ret\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.BootstrapTruncation","title":"BootstrapTruncation","text":"BootstrapTruncation(u: Utility, n_samples: int, sigmas: float = 1)\n
Bases: TruncationPolicy
Break a permutation if the last computed utility is close to the total utility, measured as a multiple of the standard deviation of the utilities.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
n_samples
Number of bootstrap samples to use to compute the variance of the utilities.
TYPE: int
sigmas
Number of standard deviations to use as a threshold.
TYPE: float
DEFAULT: 1
Source code in src/pydvl/value/shapley/truncated.py
def __init__(self, u: Utility, n_samples: int, sigmas: float = 1):\n super().__init__()\n self.n_samples = n_samples\n logger.info(\"Computing total utility for permutation truncation.\")\n self.total_utility = u(u.data.indices)\n self.count: int = 0\n self.variance: float = 0\n self.mean: float = 0\n self.sigmas: float = sigmas\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.BootstrapTruncation.__call__","title":"__call__","text":"__call__(idx: int, score: float) -> bool\n
Check whether the computation should be interrupted.
PARAMETER DESCRIPTION idx
Position in the permutation currently being computed.
TYPE: int
score
Last utility computed.
TYPE: float
RETURNS DESCRIPTION bool
True
if the computation should be interrupted.
Source code in src/pydvl/value/shapley/truncated.py
def __call__(self, idx: int, score: float) -> bool:\n \"\"\"Check whether the computation should be interrupted.\n\n Args:\n idx: Position in the permutation currently being computed.\n score: Last utility computed.\n\n Returns:\n `True` if the computation should be interrupted.\n \"\"\"\n ret = self._check(idx, score)\n self.n_calls += 1\n self.n_truncations += 1 if ret else 0\n return ret\n
"},{"location":"api/pydvl/value/shapley/types/","title":"Types","text":""},{"location":"api/pydvl/value/shapley/types/#pydvl.value.shapley.types","title":"pydvl.value.shapley.types","text":""},{"location":"api/pydvl/value/shapley/types/#pydvl.value.shapley.types.ShapleyMode","title":"ShapleyMode","text":" Bases: str
, Enum
Supported algorithms for the computation of Shapley values.
Todo
Make algorithms register themselves here.
"},{"location":"examples/","title":"Examples","text":""},{"location":"examples/#data-valuation","title":"Data valuation","text":" -
Shapley values
An introduction using the Spotify dataset, showcasing grouped datasets and applying valuation to improve model performance and identify bogus data.
-
KNN Shapley
A showcase of a fast model-specific valuation method using the iris dataset.
-
Data utility learning
Learning a utility function from a few evaluations and using it to estimate the value of the remaining data.
-
Least Core
An alternative solution concept from game theory, illustrated on a classification problem.
-
Data OOB
A different and fast strategy for data valuation, using the out-of-bag error of a bagging model.
-
Faster Banzhaf values
Using Banzhaf values to estimate the value of data points in MNIST, and evaluating convergence speed of MSR.
"},{"location":"examples/#influence-functions","title":"Influence functions","text":" -
For CNNs
Detecting corrupted labels with influence functions on the ImageNet dataset.
-
For language models
Using the IMDB dataset for sentiment analysis and a fine-tuned BERT model.
-
For mislabeled data
Detecting corrupted labels using a synthetic dataset.
-
For outlier detection
Using the wine dataset
"},{"location":"examples/data_oob/","title":"Data OOB","text":" This notebook introduces the Data- OOB method, an implementation based on a publication from Kwon and Zou \" Data- OOB : Out-of-bag Estimate as a Simple and Efficient Data Value \" ICML 2023 , using pyDVL.
The main objective of the paper is to overcome the computational bottleneck of Shapley-based data valuation methods, which require fitting a significant number of models to accurately estimate marginal contributions. The algorithm computes data values from out-of-bag estimates using a bagging model.
The values can be interpreted as a partition of the OOB estimate, which was originally introduced to estimate the prediction error. This OOB estimate is given as:
\\[ \\sum_{i=1}^n\\frac{\\sum_{b=1}^{B}\\mathbb{1}(w_{bi}=0)T(y_i, \\hat{f}_b(x_i))}{\\sum_{b=1}^{B} \\mathbb{1} (w_{bi}=0)} \\] %autoreload\nfrom pydvl.utils import Dataset, Scorer, Seed, Utility, ensure_seed_sequence\nfrom pydvl.value import ValuationResult, compute_data_oob\n
We will work with the adult classification dataset from the UCI repository. The objective is to predict whether a person earns more than 50k a year based on a set of features such as age, education, occupation, etc.
With a helper function we download the data and obtain the following pandas dataframe, where the categorical features have been removed:
\nFound cached file: adult_data.pkl.\n
\n
data_adult.head()\n
age fnlwgt education-num capital-gain capital-loss hours-per-week income 0 39 77516 13 2174 0 40 <=50K 1 50 83311 13 0 0 13 <=50K 2 38 215646 9 0 0 40 <=50K 3 53 234721 7 0 0 40 <=50K 4 28 338409 13 0 0 40 <=50K data = Dataset.from_arrays(\n X=data_adult.drop(columns=[\"income\"]).values,\n y=data_adult.loc[:, \"income\"].cat.codes.values,\n random_state=random_state,\n)\n\nmodel = KNeighborsClassifier(n_neighbors=5)\n\nutility = Utility(model, data, Scorer(\"accuracy\", default=0.0))\n
n_estimators = [100, 500]\noob_values = [\n compute_data_oob(utility, n_est=n_est, max_samples=0.95, seed=random_state)\n for n_est in n_estimators\n]\n
The two results are stored in an array of ValuationResult objects. Here's their distribution. The left-hand side depicts value as it increases with rank and a 99% t-confidence interval. The right-hand side shows the histogram of values.
Observe how adding estimators reduces the variance of the values, but doesn't change their distribution much.
"},{"location":"examples/data_oob/#bagging-for-data-valuation","title":"Bagging for data valuation","text":""},{"location":"examples/data_oob/#setup","title":"Setup","text":" We begin by importing the main libraries and setting some defaults.
If you are reading this in the documentation, some boilerplate (including most plotting code) has been omitted for convenience."},{"location":"examples/data_oob/#computing-the-oob-values","title":"Computing the OOB values","text":" The main idea of Data-OOB is to take an existing classifier or regression model and compute a per-sample out-of-bag performance estimate via bagging.
For this example, we use a simple KNN classifier with \\(k=5\\) neighbours on the data and compute the data-oob values with two choices for the number of estimators in the bagging. For that we construct a Utility object using the Scorer class to specify the metric to use for the evaluation. Note how we pass a random seed to Dataset.from_arrays in order to ensure that we always get the same split when running this notebook multiple times. This will be particularly important when running the standard point removal experiments later.
We then use the compute_data_oob function to compute the data-oob values.
"},{"location":"examples/data_oob/#point-removal-experiments","title":"Point removal experiments","text":" The standard procedure for the evaluation of data valuation schemes is the point removal experiment. The objective is to measure the evolution of performance when the best/worst points are removed from the training set. This can be done with the function compute_removal_score , which takes precomputed values and computes the performance of the model as points are removed.
In order to test the true performance of DataOOB, we repeat the whole task of computing the values and the point removal experiment multiple times, including the splitting of the dataset into training and valuation sets. It is important to remember to pass random state adequately for full reproducibility.
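A minimal sketch of a single run of the removal experiment might look as follows, using compute_removal_score from pydvl.reporting.scores together with the utility and the first of the values computed above (the removal percentages are an arbitrary choice, and the exact keyword arguments may differ between pyDVL versions):
import numpy as np
from pydvl.reporting.scores import compute_removal_score

percentages = np.arange(0, 0.5, 0.05)
# Remove increasing fractions of the lowest-valued points and re-score the model
removal_scores = compute_removal_score(
    u=utility,
    values=oob_values[0],
    percentages=percentages,
    remove_best=False,
)
print(removal_scores)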
"},{"location":"examples/influence_imagenet/","title":"For CNNs","text":"If you are reading this in the documentation, some boilerplate has been omitted for convenience. from pydvl.influence.torch import CgInfluence\nfrom pydvl.reporting.plots import plot_influence_distribution_by_label\nfrom sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score\n
label_names = {90: \"tables\", 100: \"boats\"}\ntrain_ds, val_ds, test_ds = load_preprocess_imagenet(\n train_size=0.8,\n test_size=0.1,\n keep_labels=label_names,\n downsampling_ratio=1,\n)\n\nprint(\"Normalised image dtype:\", train_ds[\"normalized_images\"][0].dtype)\nprint(\"Label type:\", type(train_ds[\"labels\"][0]))\nprint(\"Image type:\", type(train_ds[\"images\"][0]))\ntrain_ds.info()\n
Let's take a closer look at a few image samples
Let's now further pre-process the data and prepare for model training. The helper function process_io
converts the normalized images into tensors and the labels to the indices 0 and 1 to train the classifier.
def process_io(df: pd.DataFrame, labels: dict) -> Tuple[torch.Tensor, torch.Tensor]:\n x = df[\"normalized_images\"]\n y = df[\"labels\"]\n ds_label_to_model_label = {\n ds_label: idx for idx, ds_label in enumerate(labels.values())\n }\n x_nn = torch.stack(x.tolist()).to(DEVICE)\n y_nn = torch.tensor([ds_label_to_model_label[yi] for yi in y], device=DEVICE)\n return x_nn, y_nn\n\n\ntrain_x, train_y = process_io(train_ds, label_names)\nval_x, val_y = process_io(val_ds, label_names)\ntest_x, test_y = process_io(test_ds, label_names)\n\nbatch_size = 768\ntrain_data = DataLoader(TensorDataset(train_x, train_y), batch_size=batch_size)\ntest_data = DataLoader(TensorDataset(test_x, test_y), batch_size=batch_size)\nval_data = DataLoader(TensorDataset(val_x, val_y), batch_size=batch_size)\n
device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\nmodel_ft = new_resnet_model(output_size=len(label_names))\nmgr = TrainingManager(\n \"model_ft\",\n model_ft,\n nn.CrossEntropyLoss(),\n train_data,\n val_data,\n MODEL_PATH,\n device=device,\n)\n# Set use_cache=False to retrain the model\ntrain_loss, val_loss = mgr.train(n_epochs=50, use_cache=True)\n
plot_losses(Losses(train_loss, val_loss))\n
The confusion matrix and \\(F_1\\) score look good, especially considering the low resolution of the images and their complexity (they contain different objects)
pred_y_test = np.argmax(model_ft(test_x).cpu().detach(), axis=1).cpu()\nmodel_score = f1_score(test_y.cpu(), pred_y_test, average=\"weighted\")\n\ncm = confusion_matrix(test_y.cpu(), pred_y_test)\ndisp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names.values())\nprint(\"f1_score of model:\", model_score)\ndisp.plot();\n
\nf1_score of model: 0.9062805208898536\n
\n
influence_model = CgInfluence(mgr.model, mgr.loss, hessian_reg, progress=True)\ninfluence_model = influence_model.fit(train_data)\n
On the instantiated influence object, we can call the method influences , which takes some test data and some input dataset with labels (which typically is the training data, or a subset of it). The influence type will be up
. The other option, perturbation
, is beyond the scope of this notebook, but more info can be found in the notebook using the Wine dataset or in the documentation for pyDVL.
influences = influence_model.influences(test_x, test_y, train_x, train_y, mode=\"up\")\n
The output is a matrix of size test_set_length
x training_set_length
. Each row represents a test data point, and each column a training data point, so that entry \\((i,j)\\) represents the influence of training point \\(j\\) on test point \\(i\\) .
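For instance, to rank the training points by their influence on a single test image, one could do something like the following (the test index is an arbitrary choice for illustration):
test_idx = 0
top_vals, top_train_idx = torch.topk(influences[test_idx], k=5)
low_vals, low_train_idx = torch.topk(-influences[test_idx], k=5)
print("Most influential training indices:", top_train_idx.tolist())
print("Least influential training indices:", low_train_idx.tolist())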
Now we plot the histogram of the influence that all training images have on the image selected above, separated by their label.
Rather unsurprisingly, the training points with the highest influence have the same label. Now we can take the training images with the same label and show those with highest and lowest scores.
Looking at the images, it is difficult to explain why those on the right are more influential than those on the left. At first sight, the choice seems to be random (or at the very least noisy). Let's dig in a bit more by looking at average influences:
avg_influences = np.mean(influences.cpu().numpy(), axis=0)\n
Once again, let's plot the histogram of influence values by label.
Next, for each class (you can change value by changing label key) we can have a look at the top and bottom images by average influence, i.e. we can show the images that have the highest and lowest average influence over all test images.
Once again, it is not easy to explain why the images on the left have a lower influence than the ones on the right.
corrupted_model = new_resnet_model(output_size=len(label_names))\ncorrupted_dataset, corrupted_indices = corrupt_imagenet(\n dataset=train_ds,\n fraction_to_corrupt=0.1,\n avg_influences=avg_influences,\n)\n\ncorrupted_train_x, corrupted_train_y = process_io(corrupted_dataset, label_names)\ncorrupted_data = DataLoader(\n TensorDataset(corrupted_train_x, corrupted_train_y), batch_size=batch_size\n)\n\nmgr = TrainingManager(\n \"corrupted_model\",\n corrupted_model,\n nn.CrossEntropyLoss(),\n corrupted_data,\n val_data,\n MODEL_PATH,\n device=device,\n)\ntraining_loss, validation_loss = mgr.train(n_epochs=50, use_cache=True)\n
plot_losses(Losses(training_loss, validation_loss))\n
\nF1 score of model with corrupted data: 0.8541666666666666\n
\n
Interestingly, despite being trained on a corrupted dataset, the model has a fairly high \\(F_1\\) score. Let's now calculate the influence of the corrupted training data points over the test data points.
influence_model = CgInfluence(mgr.model, mgr.loss, hessian_reg, progress=True)\ninfluence_model = influence_model.fit(corrupted_data)\ninfluences = influence_model.influences(\n test_x, test_y, corrupted_train_x, corrupted_train_y\n)\n
As before, since we are interested in the average influence on the test dataset, we take the average of influences across rows, and then plot the highest and lowest influences for a chosen label
avg_corrupted_influences = np.mean(influences.cpu().numpy(), axis=0)\n
As expected, the samples with lowest (negative) influence for the label \"boats\" are those that have been corrupted: all the images on the left are tables! We can compare the average influence of corrupted data with non-corrupted ones
   label  avg_non_corrupted_infl  avg_corrupted_infl  score_diff
0  tables               -0.405254          -12.999691   12.594438
1   boats               -0.544211          -13.080050   12.535838
And indeed, corrupted points have a more negative influence on average than clean ones!
Despite this being a useful property, influence functions are known to be unreliable for tasks of data valuation, especially in deep learning where the fundamental assumption of the theory (convexity) is grossly violated. A lot of factors (e.g. the size of the network, the training process or the Hessian regularization term) can interfere with the computation, to the point that often the results that we obtain cannot be trusted. This has been extensively studied in the recent paper:
Basu, S., P. Pope, and S. Feizi. Influence Functions in Deep Learning Are Fragile. International Conference on Learning Representations (ICLR). 2021 .
Nevertheless, influence functions offer a relatively quick and mathematically rigorous way to evaluate (at first order) the importance of a training point for a model's prediction.
"},{"location":"examples/influence_imagenet/#influence-functions-for-neural-networks","title":"Influence functions for neural networks","text":" This notebook explores the use of influence functions for convolutional neural networks. In the first part we will investigate the usefulness, or lack thereof, of influence functions for the interpretation of a classifier's outputs.
For our study we choose a pre-trained ResNet18, fine-tuned on the tiny-imagenet dataset . This dataset was created for a Stanford course on Deep Learning for Computer Vision , and is a subset of the famous ImageNet with 200 classes instead of 1000, and images down-sampled to a lower resolution of 64x64 pixels.
After tuning the last layers of the network, we will use pyDVL to find the most and the least influential training images for the test set. This can sometimes be used to explain inference errors, or to direct efforts during data collection, although we will face inconclusive results with our model and data. This illustrates well-known issues of influence functions for neural networks.
However, in the final part of the notebook we will see that influence functions are an effective tool for finding anomalous or corrupted data points.
We conclude with an appendix with some basic theoretical concepts used.
"},{"location":"examples/influence_imagenet/#imports-and-setup","title":"Imports and setup","text":""},{"location":"examples/influence_imagenet/#loading-and-preprocessing-the-dataset","title":"Loading and preprocessing the dataset","text":" We pick two classes arbitrarily to work with: 90 and 100, corresponding respectively to dining tables, and boats in Venice (you can of course select any other two classes, or more of them, although that would imply longer training times and some modifications in the notebook below). The dataset is loaded with load_preprocess_imagenet()
, which returns three pandas DataFrames
with training, validation and test sets respectively. Each dataframe has three columns: normalized images, labels and the original images. Note that you can load a subset of the data by decreasing downsampling_ratio.
"},{"location":"examples/influence_imagenet/#model-definition-and-training","title":"Model definition and training","text":" We use a ResNet18 from torchvision
with final layers modified for binary classification.
For training, we use the convenience class TrainingManager
which transparently handles persistence after training. It is not part of the main pyDVL package but just a way to reduce clutter in this notebook.
We train the model for 50 epochs and save the results. Then we plot the train and validation loss curves.
"},{"location":"examples/influence_imagenet/#influence-computation","title":"Influence computation","text":" Let's now calculate influences! The central interface for computing influences is InfluenceFunctionModel . Since Resnet18 is quite big, we pick the conjugate gradient implementation CgInfluence , which takes a trained torch.nn.Module , the training loss and the training data. Other important parameters are the Hessian regularization term, which should be chosen as small as possible for the computation to converge (further details on why this is important can be found in the Appendix ).
"},{"location":"examples/influence_imagenet/#analysing-influences","title":"Analysing influences","text":" With the computed influences we can study single images or all of them together:
"},{"location":"examples/influence_imagenet/#influence-on-a-single-test-image","title":"Influence on a single test image","text":" Let's take any image in the test set:
"},{"location":"examples/influence_imagenet/#analysing-the-average-influence-on-test-samples","title":"Analysing the average influence on test samples","text":" By averaging across the rows of the influence matrix, we obtain the average influence of each training sample on the whole test set:
"},{"location":"examples/influence_imagenet/#detecting-corrupted-data","title":"Detecting corrupted data","text":" After facing the shortcomings of influence functions for explaining decisions, we move to an application with clear-cut results. Influences can be successfully used to detect corrupted or mislabeled samples, making them an effective tool to \"debug\" training data.
We begin by training a new model (with the same architecture as before) on a dataset with some corrupted labels. The helper function corrupt_imagenet
will take the training dataset and corrupt a certain fraction of the labels by flipping them. We use the same number of epochs and optimizer as before.
"},{"location":"examples/influence_imagenet/#theory-of-influence-functions-for-neural-networks","title":"Theory of influence functions for neural networks","text":" In this appendix we will briefly go through the basic ideas of influence functions adapted for neural networks as introduced in Koh, Pang Wei, and Percy Liang. \"Understanding Black-box Predictions via Influence Functions\" International conference on machine learning. PMLR, 2017.
Note however that this paper departs from the standard and established theory and notation for influence functions. For a rigorous introduction to the topic we recommend classical texts like Hampel, Frank R., Elvezio M. Ronchetti, Peter J. Rousseeuw, and Werner A. Stahel. Robust Statistics: The Approach Based on Influence Functions. 1st edition. Wiley Series in Probability and Statistics. New York: Wiley-Interscience, 2005. https://doi.org/10.1002/9781118186435.
"},{"location":"examples/influence_imagenet/#upweighting-points","title":"Upweighting points","text":" Let's start by considering some input space \\(\\mathcal{X}\\) to a model (e.g. images) and an output space \\(\\mathcal{Y}\\) (e.g. labels). Let's take \\(z_i = (x_i, y_i)\\) to be the \\(i\\) -th training point, and \\(\\theta\\) to be the (potentially highly) multi-dimensional parameters of the neural network (i.e. \\(\\theta\\) is a big array with very many parameters). We will indicate with \\(L(z, \\theta)\\) the loss of the model for point \\(z\\) and parameters \\(\\theta\\) . When training the model we minimize the loss over all points, i.e. the optimal parameters are calculated through gradient descent on the following formula:
\\[ \\hat{\\theta} = \\arg \\min_\\theta \\frac{1}{n}\\sum_{i=1}^n L(z_i, \\theta) \\] where \\(n\\) is the total number of training data points.
For notational convenience, let's define
\\[ \\hat{\\theta}_{-z} = \\arg \\min_\\theta \\frac{1}{n}\\sum_{z_i \\ne z} L(z_i, \\theta) \\ , \\] i.e. \\(\\hat{\\theta}_{-z}\\) are the model parameters that minimize the total loss when \\(z\\) is not in the training dataset.
In order to check the impact of each training point on the model, we would need to calculate \\(\\hat{\\theta}_{-z}\\) for each \\(z\\) in the training dataset, thus re-training the model at least ~ \\(n\\) times (more if model training is noisy). This is computationally very expensive, especially for big neural networks. To circumvent this problem, we can instead calculate a first-order approximation of \\(\\hat{\\theta}_{-z}\\) . This can be done through a single backpropagation pass and without re-training the full model.
Let's define
\\[ \\hat{\\theta}_{\\epsilon, z} = \\arg \\min_\\theta \\frac{1}{n}\\sum_{i=1}^n L(z_i, \\theta) + \\epsilon L(z, \\theta) \\ , \\] which is the optimal \\(\\hat{\\theta}\\) if we were to up-weigh \\(z\\) by an amount \\(\\epsilon\\) .
From a classical result (a simple derivation is available in Appendix A of Koh and Liang's paper), we know that:
\\[ \\frac{d \\ \\hat{\\theta}_{\\epsilon, z}}{d \\epsilon} \\Big|_{\\epsilon=0} = -H_{\\hat{\\theta}}^{-1} \\nabla_\\theta L(z, \\hat{\\theta}) \\] where \\(H_{\\hat{\\theta}} = \\frac{1}{n} \\sum_{i=1}^n \\nabla_\\theta^2 L(z_i, \\hat{\\theta})\\) is the Hessian of \\(L\\) . Importantly, notice that this expression is only valid when \\(\\hat{\\theta}\\) is a minimum of \\(L\\) , or otherwise \\(H_{\\hat{\\theta}}\\) cannot be inverted!
"},{"location":"examples/influence_imagenet/#approximating-the-influence-of-a-point","title":"Approximating the influence of a point","text":" We will define the influence of training point \\(z\\) on test point \\(z_{\\text{test}}\\) as \\(\\mathcal{I}(z, z_{\\text{test}}) = L(z_{\\text{test}}, \\hat{\\theta}_{-z}) - L(z_{\\text{test}}, \\hat{\\theta})\\) (notice that it is higher for points \\(z\\) which positively impact the model score, since if they are excluded, the loss is higher). In practice, however, we will always use the infinitesimal approximation \\(\\mathcal{I}_{up}(z, z_{\\text{test}})\\) , defined as
\\[ \\mathcal{I}_{up}(z, z_{\\text{test}}) = - \\frac{d L(z_{\\text{test}}, \\hat{\\theta}_{\\epsilon, z})}{d \\epsilon} \\Big|_{\\epsilon=0} \\] Using the chain rule and the results calculated above, we thus have:
\\[ \\mathcal{I}_{up}(z, z_{\\text{test}}) = - \\nabla_\\theta L(z_{\\text{test}}, \\hat{\\theta})^\\top \\ \\frac{d \\hat{\\theta}_{\\epsilon, z}}{d \\epsilon} \\Big|_{\\epsilon=0} = \\nabla_\\theta L(z_{\\text{test}}, \\hat{\\theta})^\\top \\ H_{\\hat{\\theta}}^{-1} \\ \\nabla_\\theta L(z, \\hat{\\theta}) \\] In order to calculate this expression we need the gradient and the Hessian of the loss wrt. the model parameters \\(\\hat{\\theta}\\) . This can be easily done through a single backpropagation pass.
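As a toy illustration of this expression (entirely separate from the ResNet example; the data, the tiny linear model and the small regularization term are all made up), one can compute it with plain torch.autograd:
import torch

torch.manual_seed(0)
X = torch.randn(20, 3)                      # training inputs
y = torch.randn(20)                         # training targets
x_test, y_test = torch.randn(3), torch.randn(1)

theta = torch.zeros(3, requires_grad=True)  # parameters of a linear model

def point_loss(params, x, target):
    return ((x @ params - target) ** 2).mean()

def empirical_risk(params):
    return ((X @ params - y) ** 2).mean()

# Hessian of the empirical risk at theta, slightly regularized to be invertible
H = torch.autograd.functional.hessian(empirical_risk, theta)
H_reg = H + 1e-3 * torch.eye(3)

grad_train = torch.autograd.grad(point_loss(theta, X[0], y[0]), theta)[0]
grad_test = torch.autograd.grad(point_loss(theta, x_test, y_test), theta)[0]

# I_up(z, z_test) = grad L(z_test)^T  H^{-1}  grad L(z)
i_up = grad_test @ torch.linalg.solve(H_reg, grad_train)
print(i_up.item())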
"},{"location":"examples/influence_imagenet/#regularizing-the-hessian","title":"Regularizing the Hessian","text":" One very important assumption that we make when approximating influence is that \\(\\hat{\\theta}\\) is at least a local minimum of the loss. However, we clearly cannot guarantee this except for convex models, and despite good apparent convergence, \\(\\hat{\\theta}\\) might be located in a region with flat curvature or close to a saddle point. In particular, the Hessian might have vanishing eigenvalues making its direct inversion impossible.
To circumvent this problem, instead of inverting the true Hessian \\(H_{\\hat{\\theta}}\\) , one can invert a small perturbation thereof: \\(H_{\\hat{\\theta}} + \\lambda \\mathbb{I}\\) , with \\(\\mathbb{I}\\) being the identity matrix. This standard trick ensures that the eigenvalues of \\(H_{\\hat{\\theta}}\\) are bounded away from zero and therefore the matrix is invertible. In order for this regularization not to corrupt the outcome too much, the parameter \\(\\lambda\\) should be as small as possible while still allowing a reliable inversion of \\(H_{\\hat{\\theta}} + \\lambda \\mathbb{I}\\) .
"},{"location":"examples/influence_sentiment_analysis/","title":"For language models","text":" This notebooks showcases the use of influence functions for large language models. In particular, it focuses on sentiment analysis using the IMDB dataset and a fine-tuned BERT model.
Not all the methods for influence function calculation can scale to large models and datasets. In this notebook we will use the Kronecker-Factored Approximate Curvature method, which is the only one that can scale to current state-of-the-art language models.
The notebook is structured as follows:
- Setup imports the required libraries and downloads the dataset and the model.
- Sentiment analysis loads the model and the dataset and goes through a few examples of sentiment analysis.
- Model and data preparation prepares the model and the dataset for influence function calculation. In particular, it assigns all the linear layers to require gradients and wraps the model so that only logits are returned (and not the loss or attention masks).
- Influence function computation : shows how to calculate the influence function for a few test and train examples.
- Analysis of influence values : analyses the influence values, trying to extract general information about the model and how it is affected by corruption in the training data.
- Influence functions by layer : since ekfac is based on a block diagonal approximation of the Fisher information matrix, we can compute the influence function separately for each layer of the neural network. This section shows how to do that and how to analyse the results.
Finally, the Appendix shows how to select the Hessian regularization parameter to obtain the best influence function approximation.
If you are reading this in the documentation, some boilerplate has been omitted for convenience. Let's start by importing the required libraries. If not already installed, you can install them with pip install -r requirements-notebooks.txt
.
import os\nfrom copy import deepcopy\nfrom typing import Sequence\n\nimport matplotlib.pyplot as plt\nimport torch\nimport torch.nn.functional as F\nfrom datasets import load_dataset\nfrom IPython.display import HTML, display\nfrom sklearn.metrics import f1_score\nfrom transformers import AutoModelForSequenceClassification, AutoTokenizer\n\nfrom pydvl.influence.torch import EkfacInfluence\nfrom support.torch import ImdbDataset, ModelLogitsWrapper\n
Sentiment analysis is the task of classifying a sentence as having a positive or negative sentiment. For example, the sentence \"I love this movie\" has a positive sentiment, while \"I hate this movie\" has a negative sentiment. In this notebook we will use the IMDB dataset, which contains 50,000 movie reviews with corresponding labels. The dataset is split into 25,000 reviews for training and 25,000 reviews for testing. The dataset is balanced, meaning that there are the same number of positive and negative reviews in the training and test set.
imdb = load_dataset(\"imdb\")\n
Let's print an example of review and its label
sample_review = imdb[\"train\"].select([24])\n\nprint(f\"Here is a sample review with label {sample_review['label'][0]}: \\n\")\n\ndisplay(HTML(sample_review[\"text\"][0].split(\"<br/>\")[0]))\ndisplay(HTML(sample_review[\"text\"][0].split(\"<br/>\")[-1]))\n
\nHere is a sample review with label 0: \n\n
\n
Without wishing to be a killjoy, Brad Sykes is responsible for at least two of the most dull and clich\u00e9d films i've ever seen - this being one of them, and Camp Blood being another. I bought this for \u00a31, but remember, you can't put a price on 71 minutes of your life. You'd do well to avoid this turkey, even at a bargain basement price. The review is negative, and so label 0 is associated to negative sentiment.
The model is a BERT model fine-tuned on the IMDB dataset. BERT is a large language model that has been pre-trained on a large corpus of text. The model was fine-tuned on the IMDB dataset by AssemblyAI and is available on the HuggingFace model hub. We also load its tokenizer, which is used to convert sentences into numeric tokens.
tokenizer = AutoTokenizer.from_pretrained(\"assemblyai/distilbert-base-uncased-sst2\")\nmodel = AutoModelForSequenceClassification.from_pretrained(\n \"assemblyai/distilbert-base-uncased-sst2\"\n)\n
Even if the model is trained on movie reviews, it can be used to classify any sentence as positive or negative. Let's try it on a simple sentence created by us.
example_phrase = (\n \"Pydvl is the best data valuation library, and it is fully open-source!\"\n)\n\ntokenized_example = tokenizer(\n [example_phrase],\n return_tensors=\"pt\",\n truncation=True,\n)\n\nmodel_output = model(\n input_ids=tokenized_example.input_ids,\n)\n
The model output is a SequenceClassificationOutput
object, which contains the logits and other information.
\nModel Output:\n SequenceClassifierOutput(loss=None, logits=tensor([[-2.6237, 2.8350]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)\n
\n
For calculating probabilities and for the influence functions we only need the logits. Then the softmax function converts the logits into probabilities.
model_predictions = F.softmax(model_output.logits, dim=1)\n
The model is quite confident that the sentence has a positive sentiment, which is correct.
\nPositive probability: 99.6%\nNegative probability: 0.4%\n
\n
Let's examine the model's f1 score on a small subset of the test set.
sample_test_set = imdb[\"test\"].shuffle(seed=seed).select(range(50 if not is_CI else 5))\nsample_test_set = sample_test_set.map(\n lambda example: tokenizer(example[\"text\"], truncation=True, padding=\"max_length\"),\n batched=True,\n)\nsample_test_set.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"label\"])\nmodel.eval()\nwith torch.no_grad():\n logits = model(\n input_ids=sample_test_set[\"input_ids\"],\n attention_mask=sample_test_set[\"attention_mask\"],\n ).logits\n predictions = torch.argmax(logits, dim=1)\n
f1_score_value = f1_score(sample_test_set[\"label\"], predictions)\nprint(f\"F1 Score: {round(f1_score_value, 3)}\")\n
\nF1 Score: 0.955\n
\n
In this section we will define two helper functions that will be used in the rest of the notebook.
def print_sentiment_preds(\n model: ModelLogitsWrapper, model_input: torch.Tensor, true_label: int\n):\n \"\"\"\n Prints the sentiment predictions in a human-readable format given a model and an\n input. It also prints the true label.\n \"\"\"\n model_predictions = F.softmax(model(model_input.unsqueeze(0)), dim=1)\n print(\n \"Positive probability: \"\n + str(round(model_predictions[0][1].item(), 3) * 100)\n + \"%\"\n )\n print(\n \"Negative probability: \"\n + str(round(model_predictions[0][0].item(), 3) * 100)\n + \"%\"\n )\n\n true_label = \"Positive\" if true_label == 1 else \"Negative\"\n print(f\"True label: {true_label} \\n\")\n\n\ndef strip_layer_names(param_names: Sequence[str]):\n \"\"\"\n Helper function that strips the parameter names of the model and the transformer,\n so that they can be printed and compared more easily.\n \"\"\"\n stripped_param_names = []\n for name in param_names:\n name = name.replace(\"model.\", \"\")\n if name.startswith(\"distilbert.transformer.\"):\n name = name.replace(\"distilbert.transformer.\", \"\")\n stripped_param_names.append(name)\n return stripped_param_names\n
Importantly, we will need to assign all the linear layers to require gradients, so that we can compute the influence function with respect to them. Keep in mind that the current implementation of Ekfac only supports linear layers, so if any other type of layer in the model requires gradients the initialisation of the influence function class will fail.
for param in model.named_parameters():\n param[1].requires_grad = False\n\nfor m_name, module in model.named_modules():\n if len(list(module.children())) == 0 and len(list(module.parameters())) > 0:\n if isinstance(module, torch.nn.Linear):\n for p_name, param in module.named_parameters():\n if (\n (\"ffn\" in m_name and not is_CI)\n or \"pre_classifier\" in m_name\n or \"classifier\" in m_name\n ):\n param.requires_grad = True\n
Although this restriction may seem limiting, linear layers constitute a large fraction of the parameters of most large language models, so our analysis still captures a lot of information about the full neural network.
\nTotal parameters: 66.96 millions\nParameters requiring gradients: 28.93 millions\nRatio of Linear over other layer types: 43.20%\n
\n
We are now ready to compute the influence function for a few testing and training examples. Let's start by selecting a subset of the full training and testing dataset and wrapping them in a DataLoader
object, so that we can easily do batching.
NUM_TRAIN_EXAMPLES = 100 if not is_CI else 7\nNUM_TEST_EXAMPLES = 100 if not is_CI else 5\n\nsmall_train_dataset = (\n imdb[\"train\"]\n .shuffle(seed=seed)\n .select([i for i in list(range(NUM_TRAIN_EXAMPLES))])\n)\nsmall_test_dataset = (\n imdb[\"test\"].shuffle(seed=seed).select([i for i in list(range(NUM_TEST_EXAMPLES))])\n)\n\ntrain_dataset = ImdbDataset(small_train_dataset, tokenizer=tokenizer)\ntest_dataset = ImdbDataset(small_test_dataset, tokenizer=tokenizer)\n\ntrain_dataloader = torch.utils.data.DataLoader(\n train_dataset, batch_size=7, shuffle=True\n)\ntest_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=5, shuffle=True)\n
For influence computation we need to put the model in evaluation mode, so that no dropout or batch normalization is applied. Then, we can fit the Ekfac representation.
wrapped_model = ModelLogitsWrapper(model)\nwrapped_model.eval()\n\nekfac_influence_model = EkfacInfluence(\n wrapped_model,\n progress=True,\n)\nekfac_influence_model = ekfac_influence_model.fit(train_dataloader)\n
\nK-FAC blocks - batch progress: 0%| | 0/15 [00:00<?, ?it/s]
\n
And the approximate Hessian is thus obtained. Considering that the model has almost 30 million parameters requiring gradients, this was very fast! Of course, this Hessian is computed using only a very small fraction (~0.4%) of the training data, and for a better approximation we should use a larger subset.
Before continuing, we need to set the Hessian regularization parameter to an appropriate value. A way to choose a suitable value can be found in the Appendix . Here, we will just set it to 1e-5.
ekfac_influence_model.hessian_regularization = 1e-5\n
We calculate the influence of the first batch of training data over the first batch of test data. This is because influence functions are very expensive to compute, and so to keep the runtime of this notebook within a few minutes we need to restrict ourselves to a small number of examples.
test_input, test_labels, test_text = next(iter(test_dataloader))\ntrain_input, train_labels, train_text = next(iter(train_dataloader))\n
And let's finally compute the influence function values
ekfac_train_influences = ekfac_influence_model.influences(\n test_input,\n test_labels,\n train_input,\n train_labels,\n)\n
\n/home/jakob/Documents/pyDVL/venv/lib/python3.10/site-packages/transformers/models/distilbert/modeling_distilbert.py:222: UserWarning: There is a performance drop because we have not yet implemented the batching rule for aten::masked_fill.Tensor. Please file us an issue on GitHub so that we can prioritize its implementation. (Triggered internally at ../aten/src/ATen/functorch/BatchedFallback.cpp:82.)\n scores = scores.masked_fill(\n
\n
Now that we have calculated the influences for a few examples, let's analyse some of the extreme values.
Let's plot the influence values as a heatmap for easily spotting patterns.
Most of the test and training examples have similar influence, close to zero. However, one test sample and one training sample stand out. In particular, their cross influence is very large and negative. Let's examine them more closely.
\nTraining example with idx 3: \n\nPositive probability: 18.099999999999998%\nNegative probability: 81.89999999999999%\nTrue label: Positive \n\nSentence:\n
\n
In the process of trying to establish the audiences' empathy with Jake Roedel (Tobey Maguire) the filmmakers slander the North and the Jayhawkers. Missouri never withdrew from the Union and the Union Army was not an invading force. The Southerners fought for State's Rights: the right to own slaves, elect crooked legislatures and judges, and employ a political spoils system. There's nothing noble in that. The Missourians could have easily traveled east and joined the Confederate Army. It seems to me that the story has nothing to do with ambiguity. When Jake leaves the Bushwhackers, it's not because he saw error in his way, he certainly doesn't give himself over to the virtue of the cause of abolition. We can see that, despite being positive, this review is quite hard to classify. Its language is overall negative, mostly associated to the facts narrated rather than the movie itself. Notice how several terms are related to war and invasion.
\nTest example with idx 4: \n\nPositive probability: 39.6%\nNegative probability: 60.4%\nTrue label: Negative \n\nSentence:\n
\n
\"An astronaut (Michael Emmet) dies while returning from a mission and his body is recovered by the military. The base where the dead astronaut is taken to becomes the scene of a bizarre invasion plan from outer space. Alien embryos inside the dead astronaut resurrect the corpse and begin a terrifying assault on the military staff in the hopes of conquering the world,\" according to the DVD sleeve's synopsis. A Roger Corman \"American International\" production. The man who fell to Earth impregnated, Mr. Emmet (as John Corcoran), does all right. Angela Greene is his pretty conflicted fianc\u00e9e. And, Ed Nelson (as Dave Randall) is featured as prominently. With a bigger budget, better opening, and a re-write for crisper characterizations, this could have been something approaching classic 1950s science fiction. *** Night of the Blood Beast (1958) Bernard L. Kowalski, Roger Corman ~ Michael Emmet, Angela Greene, Ed Nelson This review is also quite hard to classify. This time it has a negative sentiment towards the movie, but it also contains several words with positive connotation. The parallel with the previous review is quite interesting since both talk about an invasion.
As is often the case when analysing influence functions, it is hard to understand why these examples have such a large influence. We have seen some interesting patterns, mostly related to similarities in the language and words used, but it is hard to say with certainty whether these are the reasons for such a large influence.
A recent paper has explored this topic in great detail, even for much larger language models than BERT (up to ~50 billion parameters!). Among the most interesting findings is that smaller models tend to rely a lot on word-to-word correspondences, while larger models are more capable of extracting higher-level concepts, drawing connections between words across multiple phrases.
For more info, you can visit our blog on influence functions for large language models
In this section we want to get an idea of how influence values change when training examples are corrupted. In the next cell we will flip the labels of all the training examples and compute the influences on the same test batch as before.
modified_train_labels = deepcopy(train_labels)\nmodified_train_labels = 1 - train_labels\n\ncorrupted_ekfac_train_influences = ekfac_influence_model.influences(\n test_input,\n test_labels,\n train_input,\n modified_train_labels,\n)\n
Overall, when the labels are corrupted the influences tend to become negative, as expected. Nevertheless, there are cases where values go from slightly negative to positive, mostly isolated to the second and last test samples. Single values can be quite noisy, so it is difficult to generalise this result, but it would be interesting to see how common these cases are in the full test dataset.
Since ekfac is based on a block diagonal approximation of the Fisher information matrix, we can compute the influence functions separately for each layer of the neural network. In this section we show how to do that and we briefly analyse the results.
influences_by_layer = ekfac_influence_model.influences_by_layer(\n test_input,\n test_labels,\n train_input,\n train_labels,\n)\n
The method influences_by_layer
returns a dictionary containing the influence function values for each layer of the neural network as a tensor. To recover the full influence values as returned by the influences
method (as done in the previous section), we need to sum each layer's values.
influences = torch.zeros_like(ekfac_train_influences)\nfor layer_id, value in influences_by_layer.items():\n influences += value.detach()\n
And if we plot the result as a heatmap we can see that the results are the same as in Negative influence training examples
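A quick numerical sanity check (added here for illustration, not part of the original analysis) confirms that the summed per-layer values match the full influences computed earlier:
print(torch.allclose(influences, ekfac_train_influences, rtol=1e-4, atol=1e-6))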
Let's analyse how the influence values change across different layers for given test and train examples.
The plot above shows the influences for test index 0 and all training indices apart from index 3 (excluded for clarity since it has a very large absolute value). We can see that the scores tend to keep their sign across layers, but in almost all cases tend to decrease when approaching the output layer. This is not always the case, and in fact other test examples show different patterns. Understanding why this happens is an interesting research direction.
Ekfac is a powerful approximate method for computing the influence function of models that use a cross-entropy loss. In this notebook we applied it to sentiment analysis with BERT on the IMDB dataset. However, this method can be applied to much larger models and problems, e.g. to analyse the influence of entire sentences generated by GPT, Llama or Claude. For more info, you can visit our paper pill on influence functions for large language models
The Hessian regularization value has a large impact on the quality of the influence function approximation. In general, the value should be chosen as small as possible so that the results are finite. In practice, even when finite, the influence values can be too large and lead to numerical instabilities. In this section we show how to efficiently analyse the impact of the Hessian regularization value with the ekfac method.
Let's start with a few additional imports.
import pandas as pd\nfrom scipy.stats import pearsonr, spearmanr\n
The method explore_hessian_regularization
will calculate the influence values of the training examples with each other for a range of Hessian regularization values. The method optimises gradient calculation and Hessian inversion to minimise the computation time.
influences_by_reg_value = ekfac_influence_model.explore_hessian_regularization(\n train_input,\n train_labels,\n regularization_values=[1e-15, 1e-9, 1e-5, 1],\n)\n
\n/home/jakob/Documents/pyDVL/venv/lib/python3.10/site-packages/transformers/models/distilbert/modeling_distilbert.py:222: UserWarning: There is a performance drop because we have not yet implemented the batching rule for aten::masked_fill.Tensor. Please file us an issue on GitHub so that we can prioritize its implementation. (Triggered internally at ../aten/src/ATen/functorch/BatchedFallback.cpp:82.)\n scores = scores.masked_fill(\n
\n
The resulting object, influences_by_reg_value
is a dictionary that maps each regularization value to the influences of each layer of the neural network. This is a lot of data, so we will first organise it in a pandas dataframe and take the average across training examples.
cols = [\"reg_value\", \"layer_id\", \"mean_infl\"]\ninfl_df = pd.DataFrame(influences_by_reg_value, columns=cols)\nfor reg_value in influences_by_reg_value:\n for layer_id, layer_influences in influences_by_reg_value[reg_value].items():\n mean_infl = torch.mean(layer_influences, dim=0).detach().numpy()\n infl_df = pd.concat(\n [infl_df, pd.DataFrame([[reg_value, layer_id, mean_infl]], columns=cols)]\n )\n
\n/tmp/ipykernel_8503/1081261490.py:6: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n infl_df = pd.concat(\n
\n
With this dataframe, we can take contiguous values of regularization and, for each layer, calculate the Pearson and Spearman correlation coefficients. This will give us an idea of how the influence values change with the regularization value.
result_corr = {}\nfor layer_id, group_df in infl_df.groupby(\"layer_id\"):\n result_corr[layer_id + \"_pearson\"] = {}\n result_corr[layer_id + \"_spearman\"] = {}\n for idx, mean_infl in enumerate(group_df[\"mean_infl\"]):\n if idx == 0:\n continue\n reg_value_diff = f\"Reg: {group_df['reg_value'].iloc[idx-1]} -> {group_df['reg_value'].iloc[idx]}\"\n pearson = pearsonr(mean_infl, group_df[\"mean_infl\"].iloc[idx - 1]).statistic\n spearman = spearmanr(mean_infl, group_df[\"mean_infl\"].iloc[idx - 1]).statistic\n result_corr[layer_id + \"_pearson\"].update({f\"{reg_value_diff}\": pearson})\n result_corr[layer_id + \"_spearman\"].update({f\"{reg_value_diff}\": spearman})\nresult_df = pd.DataFrame(result_corr).T\n
Let's plot the correlations heatmap. The y-axis reports Spearman and Pearson correlations for each layer, while the x-axis reports pairs of regularization values. High correlations mean that influences are stable across regularization values.
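The plotting code itself is part of the omitted boilerplate; a minimal version with matplotlib might look like this (the styling choices are guesses, not the notebook's original code):
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(result_df.values.astype(float), cmap="viridis", vmin=-1, vmax=1)
ax.set_xticks(range(len(result_df.columns)), result_df.columns, rotation=45, ha="right")
ax.set_yticks(range(len(result_df.index)), result_df.index)
fig.colorbar(im, ax=ax, label="correlation")
plt.show()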
In our case, we can see that for regularization = 1 the Spearman correlation degrades significantly. However, for a large range of regularization values smaller than 1 the sample rankings are stable. This is a good indicator that the model is not too sensitive to the regularization value. We therefore chose the value 1e-5 for our analysis.
"},{"location":"examples/influence_sentiment_analysis/#influence-functions-for-large-language-models","title":"Influence functions for Large Language Models","text":""},{"location":"examples/influence_sentiment_analysis/#setup","title":"Setup","text":""},{"location":"examples/influence_sentiment_analysis/#sentiment-analysis","title":"Sentiment Analysis","text":""},{"location":"examples/influence_sentiment_analysis/#model-and-data-preparation","title":"Model and Data Preparation","text":""},{"location":"examples/influence_sentiment_analysis/#influence-function-computation","title":"Influence function computation","text":""},{"location":"examples/influence_sentiment_analysis/#analysis-of-influence-values","title":"Analysis of influence values","text":""},{"location":"examples/influence_sentiment_analysis/#negative-influence-training-examples","title":"Negative influence training examples","text":""},{"location":"examples/influence_sentiment_analysis/#influence-of-corrupted-training-examples","title":"Influence of corrupted training examples","text":""},{"location":"examples/influence_sentiment_analysis/#influence-functions-by-layer","title":"Influence functions by layer","text":""},{"location":"examples/influence_sentiment_analysis/#conclusion","title":"Conclusion","text":""},{"location":"examples/influence_sentiment_analysis/#appendix-choosing-the-hessian-regularization-value","title":"Appendix: Choosing the Hessian regularization value","text":""},{"location":"examples/influence_synthetic/","title":"For mislabeled data","text":"If you are reading this in the documentation, some boilerplate has been omitted for convenience. %autoreload\n%matplotlib inline\n\nimport os\nimport random\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nimport matplotlib.pyplot as plt\nfrom pydvl.influence.torch import DirectInfluence, CgInfluence\nfrom support.shapley import (\n synthetic_classification_dataset,\n decision_boundary_fixed_variance_2d,\n)\nfrom support.common import (\n plot_gaussian_blobs,\n plot_losses,\n plot_influences,\n)\nfrom support.torch import (\n fit_torch_model,\n TorchLogisticRegression,\n)\nfrom sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\nfrom torch.optim import AdamW, lr_scheduler\nfrom torch.utils.data import DataLoader, TensorDataset\n
The following code snippet generates the aforementioned dataset.
train_data, val_data, test_data = synthetic_classification_dataset(\n means, sigma, num_samples, train_size=0.7, test_size=0.2\n)\n
Given the simplicity of the dataset, we can calculate the optimal decision boundary exactly (the one that maximizes accuracy). The following code maps a continuous line of z values to a 2-dimensional vector in feature space (more details are in the appendix to this notebook).
decision_boundary_fn = decision_boundary_fixed_variance_2d(means[0], means[1])\ndecision_boundary = decision_boundary_fn(np.linspace(-1.5, 1.5, 100))\n
plot_gaussian_blobs(\n train_data,\n test_data,\n xlabel=\"$x_0$\",\n ylabel=\"$x_1$\",\n legend_title=\"$y - labels$\",\n line=decision_boundary,\n s=10,\n suptitle=\"Plot of train-test data\",\n)\n
Note that there are samples which fall on the wrong side of the optimal decision boundary and will be wrongly labelled. The optimal decision boundary cannot discriminate these, as the mislabelling is a consequence of random noise.
model = TorchLogisticRegression(num_features)\ndevice = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\nmodel.to(device)\n\nnum_epochs = 50\nlr = 0.05\nweight_decay = 0.05\nbatch_size = 256\n\ntrain_data_loader = DataLoader(\n TensorDataset(\n torch.as_tensor(train_data[0]),\n torch.as_tensor(train_data[1], dtype=torch.float64).unsqueeze(-1),\n ),\n batch_size=batch_size,\n shuffle=True,\n)\n\nval_data_loader = DataLoader(\n TensorDataset(\n torch.as_tensor(val_data[0]),\n torch.as_tensor(val_data[1], dtype=torch.float64).unsqueeze(-1),\n ),\n batch_size=batch_size,\n shuffle=True,\n)\n\noptimizer = AdamW(params=model.parameters(), lr=lr, weight_decay=weight_decay)\nscheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)\nlosses = fit_torch_model(\n model=model,\n training_data=train_data_loader,\n val_data=val_data_loader,\n loss=F.binary_cross_entropy,\n optimizer=optimizer,\n scheduler=scheduler,\n num_epochs=num_epochs,\n device=device,\n)\n
And let's check that the model is not overfitting
plot_losses(losses)\n
A look at the confusion matrix also shows good results
It is important that the model converges to a point near the optimum, since the influence values assume that we are at a minimum (or close) in the loss landscape. The function
\\[ I(x_1, y_1, x_2, y_2) \\colon \\mathbb{R}^d \\times \\mathbb{R}^d \\to \\mathbb{R}\\] measures the influence of the data point \\(x_1\\) on \\(x_2\\) , conditioned on the training targets \\(y_1\\) and \\(y_2\\) , through some model parameters \\(\\theta\\) . If the loss function \\(L\\) is differentiable, we can take \\(I\\) to be
\\[ I(x_1, y_1, x_2, y_2) = \\nabla_\\theta\\; L(x_1, y_1)^\\mathsf{T} \\; H_\\theta^{-1} \\; \\nabla_\\theta\\; L(x_2, y_2) \\] See \"Understanding Black-box Predictions via Influence Functions\" for a detailed derivation of this formula.
Let's take a subset of the training data points, which we will calculate the influence values of.
x = train_data[0][:100]\ny = train_data[1][:100]\n
In pyDVL, the influence of the training points on the test points can be calculated with the following
train_x = torch.as_tensor(x)\ntrain_y = torch.as_tensor(y, dtype=torch.float64).unsqueeze(-1)\ntest_x = torch.as_tensor(test_data[0])\ntest_y = torch.as_tensor(test_data[1], dtype=torch.float64).unsqueeze(-1)\n\ntrain_data_loader = DataLoader(\n TensorDataset(train_x, train_y),\n batch_size=batch_size,\n)\n\ninfluence_model = DirectInfluence(\n model,\n F.binary_cross_entropy,\n regularization=0.0,\n)\ninfluence_model = influence_model.fit(train_data_loader)\n\ninfluence_values = influence_model.influences(\n test_x, test_y, train_x, train_y, mode=\"up\"\n)\n
The above explicitly constructs the Hessian. This can often be computationally expensive, so for bigger models an approximate method such as conjugate gradient should be used instead.
With the influence type 'up', training influences have shape [N x M], where N is the number of test samples and M is the number of training samples. They therefore associate to each training sample its influence on each test sample. Influence type 'perturbation', instead, returns an array of shape [N x M x F], where F is the number of input features, i.e. the length of x.
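As a quick check of these shapes with the influence model fitted above (this cell is illustrative and not part of the original notebook):
up_values = influence_model.influences(test_x, test_y, train_x, train_y, mode="up")
print(up_values.shape)              # (n_test, n_train)
perturbation_values = influence_model.influences(
    test_x, test_y, train_x, train_y, mode="perturbation"
)
print(perturbation_values.shape)    # (n_test, n_train, n_features)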
In our case, in order to obtain the total average influence of each training point, we can just average over the test samples.
mean_train_influences = np.mean(influence_values.cpu().numpy(), axis=0)\n
Let's plot the results (adjust colorbar_limits for better color gradient)
plot_influences(\n x,\n mean_train_influences,\n line=decision_boundary,\n xlabel=\"$x_0$\",\n ylabel=\"$x_1$\",\n suptitle=\"Influences of input points\",\n legend_title=\"influence values\",\n # colorbar_limits=(-0.3,),\n);\n
We can see that, as we approach the separation line, the influences tend to move away from zero, i.e. the points become more decisive for model training, some in a positive way, some negative.
As a further test, let's introduce some labelling errors into \\(y\\) and see how the distribution of the influences changes. Let's flip the first 10 labels and calculate influences
y_corrupted = np.copy(y)\ny_corrupted[:10] = [1 - yi for yi in y[:10]]\ntrain_y_corrupted = torch.as_tensor(y_corrupted, dtype=torch.float64).unsqueeze(-1)\ntrain_corrupted_data_loader = DataLoader(\n TensorDataset(\n train_x,\n train_y_corrupted,\n ),\n batch_size=batch_size,\n)\n\ninfluence_model = DirectInfluence(\n model,\n F.binary_cross_entropy,\n regularization=0.0,\n)\ninfluence_model = influence_model.fit(train_corrupted_data_loader)\ninfluence_values = influence_model.influences(\n test_x, test_y, train_x, train_y_corrupted, mode=\"up\"\n)\n\nmean_train_influences = np.mean(influence_values.cpu().numpy(), axis=0)\n
\nAverage mislabelled data influence: -0.8618301488627411\nAverage correct data influence: 0.011604730452803018\n
\n
Red circles indicate the points which have been corrupted. We can see that the mislabelled data have a more negative average influence on the model, especially those that are farther away from the decision boundary.
The \"direct\" method that we have used above involves the inversion of the Hessian matrix of the model. If a model has \\(n\\) training points and \\(\\theta \\in \\mathbb{R}^p\\) parameters, this requires \\(O(n \\ p^2 + p^3)\\) operations, which for larger models, like neural networks, becomes quickly unfeasible. Conjugate gradient avoids the explicit computation of the Hessian via a technique called implicit Hessian-vector products (HVPs), which typically takes \\(O(n \\ p)\\) operations.
In the next cell we will use conjugate gradient to compute the influence factors. Since logistic regression is a very simple model, \"cg\" actually slows down the computation compared to the direct method, which in this case is a much better choice. Nevertheless, we can verify that the influences calculated with \"cg\" match those calculated directly, up to a small error.
influence_model = CgInfluence(\n model,\n F.binary_cross_entropy,\n hessian_regularization=0.0,\n)\ninfluence_model = influence_model.fit(train_corrupted_data_loader)\ninfluence_values = influence_model.influences(\n test_x, test_y, train_x, train_y_corrupted\n)\nmean_train_influences = np.mean(influence_values.cpu().numpy(), axis=0)\n\nprint(\"Average mislabelled data influence:\", np.mean(mean_train_influences[:10]))\nprint(\"Average correct data influence:\", np.mean(mean_train_influences[10:]))\n
\nAverage mislabelled data influence: -0.30172696155741363\nAverage correct data influence: 0.0\n
\n
The averages are very similar to those calculated through the direct method. The same is true for the plot.
"},{"location":"examples/influence_synthetic/#influence-functions-for-data-mislabeling","title":"Influence functions for data mislabeling","text":" In this notebook, we will take a closer look at the theory of influence functions with the help of a synthetic dataset. Data mislabeling occurs whenever some examples from a usually big dataset are wrongly-labeled. In real-life this happens fairly often, e.g. as a consequence of human error, or noise in the data.
Let's consider a classification problem with the following notation:
\\[ \\begin{align*} x_i &\\in \\mathbb{R}^d \\\\ y_i &\\in \\{0, 1\\} \\\\ \\forall i &\\in [ N ] \\end{align*} \\] In other words, we have a dataset containing \\(N\\) samples, each with label 1 or 0. As typical example you can think of y indicating whether a patient has a disease based on some feature representation \\(x\\) .
Let's now introduce a toy model that will help us delve into the theory and practical utility of influence functions. We will assume that \\(y\\) is a binary Bernoulli random variable, while the input \\(x\\) follows a d-dimensional Gaussian distribution whose mean depends on the label \\(y\\) . More precisely:
\\[ y_i \\sim \\text{Ber}\\left (0.5 \\right) \\\\ x_i \\sim \\mathcal{N}\\left ((1 - y_i) \\mu_1 + y_i \\mu_2, \\sigma^2 I \\right), \\] with fixed means and diagonal covariance. Implementing the sampling scheme in Python is straightforward and can be achieved by first sampling \\(y\\) and then \\(x\\) , as in the sketch below.
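Here is a minimal sketch of that sampling scheme (the means, variance and sample count are placeholder values; the notebook itself uses the helper synthetic_classification_dataset):
import numpy as np

rng = np.random.default_rng(16)
mu_1, mu_2 = np.array([0.0, 0.0]), np.array([1.0, 1.0])
sigma, num_samples = 0.2, 1000

y = rng.binomial(1, 0.5, size=num_samples)       # y_i ~ Ber(0.5)
means = np.where(y[:, None] == 0, mu_1, mu_2)    # (1 - y_i) mu_1 + y_i mu_2
x = rng.normal(loc=means, scale=sigma)           # x_i ~ N(mean, sigma^2 I)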
"},{"location":"examples/influence_synthetic/#imports","title":"Imports","text":""},{"location":"examples/influence_synthetic/#dataset","title":"Dataset","text":""},{"location":"examples/influence_synthetic/#plotting-the-dataset","title":"Plotting the dataset","text":" Let's plot the dataset is plotted with their respective labels and the optimal decision line
"},{"location":"examples/influence_synthetic/#training-the-model","title":"Training the model","text":" We will now train a logistic regression model on the training data. This can be done with the following
"},{"location":"examples/influence_synthetic/#calculating-influences","title":"Calculating influences","text":""},{"location":"examples/influence_synthetic/#inversion-through-conjugate-gradient","title":"Inversion through conjugate gradient","text":""},{"location":"examples/influence_synthetic/#appendix-calculating-the-decision-boundary","title":"Appendix: Calculating the decision boundary","text":" For obtaining the optimal discriminator one has to solve the equation
\\[p(x|y=0)=p(x|y=1)\\] and determine the solution set \\(X\\) . Let's take the following probabilities
\\[ \\begin{align*} p(x|y=0)&=\\mathcal{N}\\left (\\mu_1, \\sigma^2 I \\right) \\\\ p(x|y=1)&=\\mathcal{N}\\left (\\mu_2, \\sigma^2 I \\right) \\end{align*} \\] For a single fixed diagonal variance parameterized by \\(\\sigma\\) , the optimal discriminator lies at points which are equidistant from the means of the two distributions, i.e.
\\[ \\begin{align*} \\| x - \\mu_1 \\|^2 &= \\| x - \\mu_2 \\|^2 \\\\ \\| \\mu_1 \\|^2 -2 x^\\mathsf{T} \\mu_1 &= \\| \\mu_2 \\|^2 -2 x^\\mathsf{T} \\mu_2 \\\\ \\implies 0 &= 2 (\\mu_2 - \\mu_1)^\\mathsf{T} x + \\| \\mu_1 \\|^2 - \\| \\mu_2 \\|^2 \\\\ 0 &= \\mu_1^\\mathsf{T}x - \\mu_2^\\mathsf{T}x - \\frac{1}{2} \\mu_1^\\mathsf{T} \\mu_1 + \\frac{1}{2} \\mu_2^\\mathsf{T} \\mu_2 \\end{align*} \\] This is just the implicit description of the line. Solving for the explicit form can be achieved by enforcing a functional form \\(f(z) = x = a z + b\\) with \\(z \\in \\mathbb{R}\\) onto \\(x\\) . After the term is inserted in the previous equation
\\[ 0 = (\\mu_2 - \\mu_1)^\\mathsf{T} (az + b) + \\frac{1}{2} \\left( \\| \\mu_1 \\|^2 - \\| \\mu_2 \\|^2 \\right) \\] We can write \\(a\\) explicitly since, by symmetry, it is expected to be orthogonal to \\(\\mu_2 - \\mu_1\\) . Then, solving for \\(b\\) , the solution is found to be
\\[ f(z) = \\underbrace{\\begin{bmatrix} 0 & 1 \\\\ -1 & 0 \\end{bmatrix} (\\mu_2 - \\mu_1)}_a z + \\underbrace{\\frac{\\mu_1 + \\mu_2}{2}}_b \\]"},{"location":"examples/influence_wine/","title":"For outlier detection","text":"If you are reading this in the documentation, some boilerplate has been omitted for convenience. Let's start by loading the imports, the dataset and splitting it into train, validation and test sets. We will use a large test set to have a less noisy estimate of the average influence.
%autoreload\n%matplotlib inline\n\nimport os\nimport random\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nfrom support.common import plot_losses\nfrom support.torch import TorchMLP, fit_torch_model\nfrom pydvl.influence.torch import (\n DirectInfluence,\n CgInfluence,\n ArnoldiInfluence,\n EkfacInfluence,\n NystroemSketchInfluence,\n LissaInfluence,\n)\nfrom pydvl.influence import InfluenceMode\nfrom support.shapley import load_wine_dataset\nfrom sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score\nfrom torch.optim import Adam, lr_scheduler\nfrom torch.utils.data import DataLoader, TensorDataset\nfrom scipy.stats import pearsonr, spearmanr\n
training_data, val_data, test_data, feature_names = load_wine_dataset(\n train_size=0.6, test_size=0.3\n)\n
We will corrupt some of the training points by flipping their labels
num_corrupted_idxs = 10\ntraining_data[1][:num_corrupted_idxs] = torch.tensor(\n [(val + 1) % 3 for val in training_data[1][:num_corrupted_idxs]]\n)\n
and let's wrap it in a pytorch data loader
training_data_loader = DataLoader(\n TensorDataset(*training_data), batch_size=32, shuffle=False\n)\nval_data_loader = DataLoader(TensorDataset(*val_data), batch_size=32, shuffle=False)\ntest_data_loader = DataLoader(TensorDataset(*test_data), batch_size=32, shuffle=False)\n
feature_dimension = 13\nnum_classes = 3\nnetwork_size = [16, 16]\nlayers_size = [feature_dimension, *network_size, num_classes]\nnum_epochs = 300\nlr = 0.005\nweight_decay = 0.01\n\nnn_model = TorchMLP(layers_size)\nnn_model.to(device)\n\noptimizer = Adam(params=nn_model.parameters(), lr=lr, weight_decay=weight_decay)\nscheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)\n\nlosses = fit_torch_model(\n model=nn_model,\n training_data=training_data_loader,\n val_data=val_data_loader,\n loss=F.cross_entropy,\n optimizer=optimizer,\n scheduler=scheduler,\n num_epochs=num_epochs,\n device=device,\n)\n
Let's check that the training has found a stable minimum by plotting the training and validation loss
plot_losses(losses)\n
Since it is a classification problem, let's also take a look at the confusion matrix on the test set
And let's compute the f1 score of the model
f1_score(test_data[1], pred_y_test, average=\"weighted\")\n
\n0.943730275125624
\n
Let's now move to calculating influences of each point on the total score.
influence_model = DirectInfluence(\n nn_model,\n F.cross_entropy,\n regularization=0.1,\n)\ninfluence_model = influence_model.fit(training_data_loader)\ntrain_influences = influence_model.influences(*test_data, *training_data, mode=\"up\")\n
The returned matrix, train_influences, has as many columns as there are points in the training set and as many rows as there are points in the test set. Each element \\(a_{i,j}\\) stores the influence that training point \\(j\\) has on the classification of test point \\(i\\) .
If we take the average across every column of the influences matrix, we obtain an estimate of the overall influence of a training point on the total accuracy of the network.
mean_train_influences = np.mean(train_influences.cpu().numpy(), axis=0)\nmean_train_influences.shape\n
\n(106,)
\n
The following histogram shows that there are big differences in score within the training set (notice the log-scale on the y axis).
We can see that the corrupted points tend to have a negative effect on the model, as expected
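The cell that produces the following output is not shown; a minimal sketch of the comparison, assuming mean_train_influences and num_corrupted_idxs as defined above, is:
avg_corrupted = np.mean(mean_train_influences[:num_corrupted_idxs])\navg_clean = np.mean(mean_train_influences[num_corrupted_idxs:])\nprint(f\"Average influence of corrupted points: {avg_corrupted}\")\nprint(f\"Average influence of other points: {avg_clean}\")\n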
\nAverage influence of corrupted points: -1.0840776\nAverage influence of other points: 0.11192768\n
\n
We have seen how to calculate the influence of single training points on each test point using mode 'up'. Using mode 'perturbation' we can also calculate the influence of the input features of each point. In the next cell we will calculate the average influence of each feature on training and test points, and ultimately assess which are the most relevant to model performance.
influence_model.regularization = 1.0\nfeature_influences = influence_model.influences(\n *test_data, *training_data, mode=\"perturbation\"\n)\n
The explicit calculation of the Hessian matrix is numerically challenging and, due to its high memory requirements, infeasible for larger models. pyDVL allows using several approximation methods for the action of the inverse Hessian matrix to overcome this bottleneck:
- Iteration-based:
- Conjugate Gradients (Cg)
- Linear time Stochastic Second-Order Approximation ( LiSSA )
- Low-rank Approximations:
- Arnoldi
- Nystr\u00f6m Sketch-and-Solve (Nystr\u00f6m)
- Factorization-based:
- Eigenvalue-corrected Kronecker Factorization ( EKFAC )
In the following, we show the usage of these approximation methods and investigate their performance.
Since the Hessian is symmetric and positive definite (at least after applying a sufficient regularization), we can utilize the Conjugate Gradients Algorithm to approximately solve the equations
\\[ (H + \\lambda \\operatorname{I}) x = b\\] Most importantly, the algorithm does not require the computation of the full Hessian matrix, but only the implementation of Hessian-vector products. pyDVL implements a stable block variant of the preconditioned conjugate gradient algorithm.
from pydvl.influence.torch.pre_conditioner import NystroemPreConditioner\n\nnn_model.to(\"cpu\")\ncg_influence_model = CgInfluence(\n nn_model,\n F.cross_entropy,\n hessian_regularization=0.1,\n progress=True,\n use_block_cg=True,\n pre_conditioner=NystroemPreConditioner(rank=5),\n)\ncg_influence_model = cg_influence_model.fit(training_data_loader)\ncg_train_influences = cg_influence_model.influences(\n *test_data, *training_data, mode=\"up\"\n)\nmean_cg_train_influences = np.mean(cg_train_influences.numpy(), axis=0)\n
Let's compare the results obtained through conjugate gradient with those from the direct method
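The comparison cell is omitted in this rendered version. One way to obtain numbers like those below, using the arrays computed above (the exact error metric used by the notebook is an assumption here), is:
error = np.linalg.norm(mean_cg_train_influences - mean_train_influences) / np.linalg.norm(\n    mean_train_influences\n)\nprint(f\"Percentage error of Cg over direct method:{error * 100} %\")\nprint(\"Pearson Correlation Cg vs direct\", pearsonr(mean_cg_train_influences, mean_train_influences)[0])\nprint(\"Spearman Correlation Cg vs direct\", spearmanr(mean_cg_train_influences, mean_train_influences)[0])\n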
\nPercentage error of Cg over direct method:38.18922936916351 %\n
\n
\nPearson Correlation Cg vs direct 0.9978821390094573\nSpearman Correlation Cg vs direct 0.9946595460614153\n
\n
The LiSSA method is a stochastic approximation of the inverse Hessian vector product. Compared to conjugate gradient it is faster but less accurate and typically suffers from instability.
In order to find the solution of the HVP, LiSSA iteratively approximates the inverse of the Hessian matrix with the following update:
\\[H^{-1}_{j+1} b = b + (I - d) \\, H^{-1}_j b - \\frac{H}{s} H^{-1}_j b,\\] where \\(d\\) and \\(s\\) are a dampening and a scaling factor.
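As a purely illustrative sketch of this recursion (not the pyDVL implementation; the dampening and scaling values are hypothetical), for a small dense Hessian one could write:
def lissa_solve(H, b, dampen=0.0, scale=10.0, n_steps=1000):\n    # Iterate x_{j+1} = b + (I - d) x_j - H x_j / s, which converges to (d I + H / s)^{-1} b\n    x = b.clone()\n    for _ in range(n_steps):\n        x = b + (1 - dampen) * x - H @ x / scale\n    # undo the scaling to recover an approximation of H^{-1} b (exact for d = 0)\n    return x / scale\n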
lissa_influence_model = LissaInfluence(\n nn_model,\n F.cross_entropy,\n regularization=0.1,\n progress=True,\n)\nlissa_influence_model = lissa_influence_model.fit(training_data_loader)\nlissa_train_influences = lissa_influence_model.influences(\n *test_data, *training_data, mode=\"up\"\n)\nmean_lissa_train_influences = np.mean(lissa_train_influences.numpy(), axis=0)\n
\nPercentage error of Lissa over direct method:119.32581663131714 %\n
\n
\nPearson Correlation Lissa vs direct 0.9875324674899437\nSpearman Correlation Lissa vs direct 0.9758067360253924\n
\n
The Arnoldi method leverages a low rank approximation of the Hessian matrix to reduce the memory requirements. It is generally much faster than the conjugate gradient method and can achieve similar accuracy.
arnoldi_influence_model = ArnoldiInfluence(\n nn_model,\n F.cross_entropy,\n rank_estimate=30,\n hessian_regularization=0.1,\n)\narnoldi_influence_model = arnoldi_influence_model.fit(training_data_loader)\narnoldi_train_influences = arnoldi_influence_model.influences(\n *test_data, *training_data, mode=\"up\"\n)\nmean_arnoldi_train_influences = np.mean(arnoldi_train_influences.numpy(), axis=0)\n
\nPercentage error of Arnoldi over direct method:40.1591956615448 %\n
\n
\nPearson Correlation Arnoldi vs direct 0.9913769850406638\nSpearman Correlation Arnoldi vs direct 0.9818122276242538\n
\n
Similar to the Arnoldi method, the Nystr\u00f6m method uses a low-rank approximation, which is computed from random projections of the Hessian matrix. In general the approximation is expected to be worse than the Arnoldi approximation, but it is cheaper to compute.
nystroem_influence_model = NystroemSketchInfluence(\n nn_model,\n F.cross_entropy,\n rank=30,\n regularization=0.1,\n)\nnystroem_influence_model = nystroem_influence_model.fit(training_data_loader)\nnystroem_train_influences = nystroem_influence_model.influences(\n *test_data, *training_data, mode=\"up\"\n)\nmean_nystroem_train_influences = np.mean(nystroem_train_influences.numpy(), axis=0)\n
\nPercentage error of Nystr\u00f6m over direct method:106.66680335998535 %\n
\n
\nPearson Correlation Nystr\u00f6m vs direct 0.9951186619181842\nSpearman Correlation Nystr\u00f6m vs direct 0.9858830642114014\n
\n
The EKFAC method is a more recent technique that leverages the Kronecker product structure of the Hessian matrix to reduce the memory requirements. It is generally much faster than iterative methods like conjugate gradient and Arnoldi, and it allows for easier handling of memory. It is therefore the only technique that can scale to very large models (e.g. billions of parameters). Its accuracy is, however, much worse. Let's see how it performs on our example.
ekfac_influence_model = EkfacInfluence(\n nn_model,\n update_diagonal=True,\n hessian_regularization=0.1,\n)\nekfac_influence_model = ekfac_influence_model.fit(training_data_loader)\nekfac_train_influences = ekfac_influence_model.influences(\n *test_data, *training_data, mode=\"up\"\n)\nmean_ekfac_train_influences = np.mean(ekfac_train_influences.numpy(), axis=0)\n
\nPercentage error of EK-FAC over direct method:1995.9354400634766 %\n
\n
The accuracy is not good, and it is not recommended to use this method for small models. Nevertheless, a look at the actual influence values reveals that the EK-FAC estimates are not completely off.
The above plot shows a good correlation between the EK-FAC and the direct method. Corrupted points have been circled in red, and in both the direct and the approximate case they are correctly identified as having negative influence on the model's accuracy. This is confirmed by explicit calculation of the Pearson and Spearman correlation coefficients.
\nPearson Correlation EK-FAC vs direct 0.9595030844711058\nSpearman Correlation EK-FAC vs direct 0.8974028264100562\n
\n
The correlation between the EK-FAC and the direct method is quite good, and it improves significantly if we keep only the top-20 influences with the highest absolute values.
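The selection of the top-20 influences is not shown above; a minimal sketch, assuming they are chosen by the absolute value of the direct estimates, is:
top_idx = np.argsort(np.abs(mean_train_influences))[-20:]\nprint(\n    \"Pearson Correlation EK-FAC vs direct - top-20 influences\",\n    pearsonr(mean_ekfac_train_influences[top_idx], mean_train_influences[top_idx])[0],\n)\nprint(\n    \"Spearman Correlation EK-FAC vs direct - top-20 influences\",\n    spearmanr(mean_ekfac_train_influences[top_idx], mean_train_influences[top_idx])[0],\n)\n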
\nPearson Correlation EK-FAC vs direct - top-20 influences 0.9891339337484283\nSpearman Correlation EK-FAC vs direct - top-20 influences 0.9593984962406013\n
\n
When we calculate influence scores, typically we are more interested in assessing which training points have the highest or lowest impact on the model rather than having a precise estimate of the influence value. EK-FAC then provides a fast and memory-efficient way to calculate a coarse influence ranking of the training points which scales very well even to the largest neural networks.
This was a quick introduction to the pyDVL interface for influence functions. Despite their speed and simplicity, influence functions are known to be a very noisy estimator of data quality, as pointed out in the paper \"Influence functions in deep learning are fragile\" . The size of the network, the weight decay, the inversion method used for calculating influences, the size of the test set: they all add up to the total amount of noise. Experiments may therefore give quantitatively and qualitatively different results if not averaged across several realisations. Shapley values, on the contrary, have been shown to be more robust, but this comes at the cost of high computational requirements. PyDVL employs several parallelization and caching techniques to optimize such calculations.
"},{"location":"examples/influence_wine/#influence-functions-for-outlier-detection","title":"Influence functions for outlier detection","text":" This notebook shows how to calculate influences on a NN model using pyDVL for an arbitrary dataset, and how this can be used to find anomalous or corrupted data points.
It uses the wine dataset from sklearn: given a set of 13 different input parameters regarding a particular bottle, each related to some physical property (e.g. concentration of magnesium, malic acidity, alcoholic percentage, etc.), the model will need to predict to which of 3 classes the wine belongs. For more details, please refer to the sklearn documentation .
"},{"location":"examples/influence_wine/#imports","title":"Imports","text":""},{"location":"examples/influence_wine/#dataset","title":"Dataset","text":""},{"location":"examples/influence_wine/#fit-a-neural-network-to-the-data","title":"Fit a neural network to the data","text":" We will train a 2-layer neural network. PyDVL has some convenience wrappers to initialize a pytorch NN. If you already have a model loaded and trained, you can skip this section.
"},{"location":"examples/influence_wine/#calculating-influences-for-small-neural-networks","title":"Calculating influences for small neural networks","text":" The following cell calculates the influences of each training data point on the neural network. Neural networks have typically a very bumpy parameter space, which, during training, is explored until the configuration that minimises the loss is found. There is an important assumption in influence functions that the model lays at a (at least local) minimum of such loss, and if this is not fulfilled many issues can arise. In order to avoid this scenario, a regularisation term should be used whenever dealing with big and noisy models.
"},{"location":"examples/influence_wine/#influence-of-training-features","title":"Influence of training features","text":""},{"location":"examples/influence_wine/#speeding-up-influences-for-big-models","title":"Speeding up influences for big models","text":""},{"location":"examples/influence_wine/#cg","title":"Cg","text":""},{"location":"examples/influence_wine/#lissa","title":"Lissa","text":""},{"location":"examples/influence_wine/#arnoldi","title":"Arnoldi","text":""},{"location":"examples/influence_wine/#nystrom","title":"Nystr\u00f6m","text":""},{"location":"examples/influence_wine/#ekfac","title":"EKFAC","text":""},{"location":"examples/influence_wine/#conclusions","title":"Conclusions","text":""},{"location":"examples/least_core_basic/","title":"Least Core","text":" We will be using the following functions and classes from pyDVL.
%autoreload\nfrom pydvl.utils import (\n Dataset,\n Utility,\n)\nfrom pydvl.value import compute_least_core_values, LeastCoreMode, ValuationResult\nfrom pydvl.reporting.plots import shaded_mean_std\nfrom pydvl.reporting.scores import compute_removal_score\n
X, y = make_classification(\n n_samples=dataset_size,\n n_features=50,\n n_informative=25,\n n_classes=3,\n random_state=random_state,\n)\n
full_dataset = Dataset.from_arrays(\n X, y, stratify_by_target=True, random_state=random_state\n)\nsmall_dataset = Dataset.from_arrays(\n X,\n y,\n stratify_by_target=True,\n train_size=train_size,\n random_state=random_state,\n)\n
model = LogisticRegression(max_iter=500, solver=\"liblinear\")\n
model.fit(full_dataset.x_train, full_dataset.y_train)\nprint(\n f\"Training accuracy: {100 * model.score(full_dataset.x_train, full_dataset.y_train):0.2f}%\"\n)\nprint(\n f\"Testing accuracy: {100 * model.score(full_dataset.x_test, full_dataset.y_test):0.2f}%\"\n)\n
\nTraining accuracy: 86.25%\nTesting accuracy: 70.00%\n
\n
model.fit(small_dataset.x_train, small_dataset.y_train)\nprint(\n f\"Training accuracy: {100 * model.score(small_dataset.x_train, small_dataset.y_train):0.2f}%\"\n)\nprint(\n f\"Testing accuracy: {100 * model.score(small_dataset.x_test, small_dataset.y_test):0.2f}%\"\n)\n
\nTraining accuracy: 100.00%\nTesting accuracy: 47.89%\n
\n
utility = Utility(model=model, data=small_dataset)\n
exact_values = compute_least_core_values(\n u=utility,\n mode=LeastCoreMode.Exact,\n progress=True,\n)\n
exact_values_df = exact_values.to_dataframe(column=\"exact_value\").T\nexact_values_df = exact_values_df[sorted(exact_values_df.columns)]\n
budget_array = np.linspace(200, 2 ** len(small_dataset), num=10, dtype=int)\n\nall_estimated_values_df = []\nall_errors = {budget: [] for budget in budget_array}\n\nfor budget in tqdm(budget_array):\n dfs = []\n errors = []\n column_name = f\"estimated_value_{budget}\"\n for i in range(20):\n values = compute_least_core_values(\n u=utility,\n mode=LeastCoreMode.MonteCarlo,\n n_iterations=budget,\n n_jobs=n_jobs,\n )\n df = (\n values.to_dataframe(column=column_name)\n .drop(columns=[f\"{column_name}_stderr\", f\"{column_name}_updates\"])\n .T\n )\n df = df[sorted(df.columns)]\n error = mean_squared_error(\n exact_values_df.loc[\"exact_value\"].values, np.nan_to_num(df.values.ravel())\n )\n all_errors[budget].append(error)\n df[\"budget\"] = budget\n dfs.append(df)\n estimated_values_df = pd.concat(dfs)\n all_estimated_values_df.append(estimated_values_df)\n\nvalues_df = pd.concat(all_estimated_values_df)\nerrors_df = pd.DataFrame(all_errors)\n
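The plotting code for the figure discussed next is part of the omitted boilerplate; a minimal matplotlib sketch (assuming matplotlib.pyplot is imported as plt ) could be:
mean_errors = errors_df.mean(axis=0)\nstd_errors = errors_df.std(axis=0)\nplt.plot(budget_array, mean_errors, marker=\"o\")\nplt.fill_between(budget_array, mean_errors - std_errors, mean_errors + std_errors, alpha=0.3)\nplt.xlabel(\"Budget (number of sampled subsets)\")\nplt.ylabel(\"Mean squared error vs. exact values\")\nplt.show()\n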
We can see that the approximation error decreases, on average, as we increase the budget.
Still, the error does not necessarily decrease with every increase in the number of iterations, because the Monte Carlo method samples subsets with replacement, i.e. subsets may be repeated.
utility = Utility(model=model, data=full_dataset)\n
method_names = [\"Random\", \"Least Core\"]\nremoval_percentages = np.arange(0, 0.41, 0.05)\n
all_scores = []\n\nfor i in trange(5):\n for method_name in method_names:\n if method_name == \"Random\":\n values = ValuationResult.from_random(size=len(utility.data))\n else:\n values = compute_least_core_values(\n u=utility,\n mode=LeastCoreMode.MonteCarlo,\n n_iterations=n_iterations,\n n_jobs=n_jobs,\n )\n scores = compute_removal_score(\n u=utility,\n values=values,\n percentages=removal_percentages,\n remove_best=True,\n )\n scores[\"method_name\"] = method_name\n all_scores.append(scores)\n\nscores_df = pd.DataFrame(all_scores)\n
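The plot referred to in the next paragraph is produced by code omitted here; a compact sketch, assuming pandas and matplotlib.pyplot (as plt ) from the omitted setup, would be:
mean_scores = scores_df.groupby(\"method_name\").mean().T\nmean_scores.plot(marker=\"o\", xlabel=\"Fraction of removed (best) points\", ylabel=\"Accuracy\")\nplt.show()\n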
We can clearly see that removing the most valuable data points, as given by the Least Core method, leads to, on average, a decrease in the model's performance and that the method outperforms random removal of data points.
all_scores = []\n\nfor i in trange(5):\n for method_name in method_names:\n if method_name == \"Random\":\n values = ValuationResult.from_random(size=len(utility.data))\n else:\n values = compute_least_core_values(\n u=utility,\n mode=LeastCoreMode.MonteCarlo,\n n_iterations=n_iterations,\n n_jobs=n_jobs,\n )\n scores = compute_removal_score(\n u=utility,\n values=values,\n percentages=removal_percentages,\n )\n scores[\"method_name\"] = method_name\n all_scores.append(scores)\n\nscores_df = pd.DataFrame(all_scores)\n
We can clearly see that removing the least valuable data points, as given by the Least Core method, leads to, on average, an increase in the model's performance and that the method outperforms the random removal of data points.
"},{"location":"examples/least_core_basic/#least-core-for-data-valuation","title":"Least Core for Data Valuation","text":" This notebook introduces Least Core methods for the computation of data values using pyDVL.
Shapley values define a fair way of distributing the worth of the whole training set when every data point is part of it. But they do not consider the question of stability of subsets: Could some data points obtain a higher payoff if they formed smaller subsets? It is argued that this might be relevant if data providers are paid based on data value, since Shapley values can incentivise them not to contribute their data to the \"grand coalition\", but instead try to form smaller ones. Whether this is of actual practical relevance is debatable, but in any case, the least core is an alternative tool available for any task of Data Valuation.
The Core is another approach to compute data values originating in cooperative game theory that attempts to answer those questions. It is the set of feasible payoffs that cannot be improved upon by a coalition of the participants.
Its use for Data Valuation was first described in the paper If You Like Shapley Then You\u2019ll Love the Core by Tom Yan and Ariel D. Procaccia.
The Least Core value \\(v\\) of the \\(i\\) -th sample in dataset \\(D\\) wrt. utility \\(u\\) is computed by solving the following Linear Program:
\\[ \\begin{array}{lll} \\text{minimize} & \\displaystyle{e} & \\\\ \\text{subject to} & \\displaystyle\\sum_{x_i\\in D} v_u(x_i) = u(D) & \\\\ & \\displaystyle\\sum_{x_i\\in S} v_u(x_i) + e \\geq u(S) &, \\forall S \\subset D, S \\neq \\emptyset \\\\ \\end{array} \\] To illustrate this method we will use a synthetic dataset. We will first use a subset of 10 data points to compute the exact values and use them to assess the Monte Carlo approximation. Afterwards, we will conduct the data removal experiments as described by Ghorbani and Zou in their paper Data Shapley: Equitable Valuation of Data for Machine Learning : We compute the data valuation given different computation budgets and incrementally remove a percentage of the best, respectively worst, data points and observe how that affects the utility.
"},{"location":"examples/least_core_basic/#setup","title":"Setup","text":" We begin by importing the main libraries and setting some defaults.
If you are reading this in the documentation, some boilerplate (including most plotting code) has been omitted for convenience."},{"location":"examples/least_core_basic/#dataset","title":"Dataset","text":" We generate a synthetic dataset using the make_classification
function from scikit-learn.
We sample 200 data points from a 50-dimensional Gaussian distribution with 25 informative features and 25 non-informative features (generated as random linear combinations of the informative features).
The 200 samples are uniformly distributed across 3 classes with a small percentage of noise added to the labels to make the task a bit more difficult.
"},{"location":"examples/least_core_basic/#estimating-least-core-values","title":"Estimating Least Core Values","text":" In this first section we will use a smaller subset of the dataset containing 10 samples in order to be able to compute exact values in a reasonable amount of time. Afterwards, we will use the Monte Carlo method with a limited budget (maximum number of subsets) to approximate these values.
"},{"location":"examples/least_core_basic/#data-removal","title":"Data Removal","text":" We now move on to the data removal experiments using the full dataset.
In these experiments, we first rank the data points from most valuable to least valuable using the values estimated by the Monte Carlo Least Core method. Then, we gradually remove from 5 to 40 percent, by increments of 5 percentage points, of the most valuable/least valuable ones, train the model on this subset and compute its accuracy.
"},{"location":"examples/least_core_basic/#remove-best","title":"Remove Best","text":" We start by removing the best data points and seeing how the model's accuracy evolves.
"},{"location":"examples/least_core_basic/#remove-worst","title":"Remove Worst","text":" We then proceed to removing the worst data points and seeing how the model's accuracy evolves.
"},{"location":"examples/msr_banzhaf_digits/","title":"Banzhaf Semivalues","text":" We will be using the following functions from pyDVL. The main entry point is the function compute_banzhaf_semivalues()
. In order to use it we need the classes Dataset , Utility and Scorer .
%autoreload\nfrom pydvl.reporting.plots import plot_shapley\nfrom support.banzhaf import load_digits_dataset\nfrom pydvl.value import *\n
training_data, _, test_data = load_digits_dataset(\n test_size=0.3, random_state=random_state\n)\n
Training and test data are then used to instantiate a Dataset object:
dataset = Dataset(*training_data, *test_data)\n
import torch\nfrom support.banzhaf import TorchCNNModel\n\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\nmodel = TorchCNNModel(lr=0.001, epochs=40, batch_size=32, device=device)\nmodel.fit(x=training_data[0], y=training_data[1])\n
\nTrain Accuracy: 0.705\nTest Accuracy: 0.630\n
\n
The final component is the scoring function. It can be anything like accuracy or \\(R^2\\) , and is set with a string from the standard sklearn scoring methods . Please refer to that documentation on information on how to define your own scoring function.
We group dataset, model and scoring function into an instance of Utility and compute the Banzhaf semi-values. We take all defaults, and choose to stop computation using the MaxChecks stopping criterion, which terminates after a fixed number of calls to it. With the default batch_size
of 1 this means that we will retrain the model.
Note how we enable caching using memcached (assuming memcached runs with the default configuration for localhost). This is necessary in the current preliminary implementation of permutation sampling , which is the default for compute_banzhaf_semivalues .
from pydvl.utils import MemcachedCacheBackend, MemcachedClientConfig\n\n# Compute regular Banzhaf semivalue\nutility = Utility(\n model=model,\n data=dataset,\n scorer=Scorer(\"accuracy\", default=0.0, range=(0, 1)),\n cache_backend=MemcachedCacheBackend(MemcachedClientConfig()),\n)\nvalues = compute_banzhaf_semivalues(\n utility, done=MaxChecks(max_checks), n_jobs=n_jobs, progress=True\n)\nvalues.sort(key=\"value\")\ndf = values.to_dataframe(column=\"banzhaf_value\", use_names=True)\n
The returned dataframe contains the mean and variance of the Monte Carlo estimates for the values:
banzhaf_value banzhaf_value_stderr banzhaf_value_updates 156 -1.097920 6.662418e-02 5 21 -0.925489 1.230752e-01 5 152 -0.913313 3.358054e-02 5 73 -0.778884 3.668419e-05 5 85 -0.644435 3.454322e-08 5 Let us plot the results. In the next cell we will take the 30 images with the lowest score and plot their values with 95% Normal confidence intervals. Keep in mind that Permutation Monte Carlo Banzhaf is typically very noisy, and it can take many steps to arrive at a clean estimate.
\nAverage value of first 10 data points: 0.650003277874342\nExact values:\n39 0.432836\n45 0.455392\n158 0.533221\n144 0.571260\n36 0.633091\n161 0.697940\n77 0.698507\n28 0.752367\n35 0.838752\n175 0.886668\nName: banzhaf_value, dtype: float64\n
\n
For the first 5 images, we will falsify their label, for images 6-10, we will add some noise.
x_train_anomalous = training_data[0].copy()\ny_train_anomalous = training_data[1].copy()\nanomalous_indices = high_dvl.index.map(int).values[:10]\n\n# Set label of first 5 images to 0\ny_train_anomalous[high_dvl.index.map(int).values[:5]] = 0\n\n# Add noise to images 6-10\nindices = high_dvl.index.values[5:10].astype(int)\ncurrent_images = x_train_anomalous[indices]\nnoisy_images = current_images + 0.5 * np.random.randn(*current_images.shape)\nnoisy_images[noisy_images < 0] = 0.0\nnoisy_images[noisy_images > 1] = 1.0\nx_train_anomalous[indices] = noisy_images\n
anomalous_dataset = Dataset(\n x_train=x_train_anomalous,\n y_train=y_train_anomalous,\n x_test=test_data[0],\n y_test=test_data[1],\n)\n\nanomalous_utility = Utility(\n model=TorchCNNModel(),\n data=anomalous_dataset,\n scorer=Scorer(\"accuracy\", default=0.0, range=(0, 1)),\n cache_backend=MemcachedCacheBackend(MemcachedClientConfig()),\n)\nanomalous_values = compute_banzhaf_semivalues(\n anomalous_utility, done=MaxChecks(max_checks), n_jobs=n_jobs, progress=True\n)\nanomalous_values.sort(key=\"value\")\nanomalous_df = anomalous_values.to_dataframe(column=\"banzhaf_value\", use_names=True)\n
Let us now take a look at the low-value images and check how many of our anomalous images are part of it.
As can be seen in this figure, the valuation of the data points has decreased significantly by adding noise or falsifying their labels. This shows the potential of using Banzhaf values or other data valuation methods to detect mislabeled data points or noisy input data.
\nAverage value of original data points: 0.650003277874342\nAverage value of modified, anomalous data points: -0.02501543656281746\nFor reference, these are the average data values of all data points used for training (anomalous):\nbanzhaf_value 0.006044\nbanzhaf_value_stderr 0.103098\nbanzhaf_value_updates 5.000000\ndtype: float64\nThese are the average data values of all points (original data):\nbanzhaf_value 0.005047\nbanzhaf_value_stderr 0.115262\nbanzhaf_value_updates 5.000000\ndtype: float64\n
\n
utility = Utility(\n model=TorchCNNModel(),\n data=dataset,\n scorer=Scorer(\"accuracy\", default=0.0, range=(0, 1)),\n cache_backend=MemcachedCacheBackend(MemcachedClientConfig()),\n)\n
Computing the values is the same, but we now use a better stopping criterion. Instead of fixing the number of utility evaluations with MaxChecks , we use RankCorrelation to stop when the change in Spearman correlation between the ranking of two successive iterations is below a threshold.
values = compute_msr_banzhaf_semivalues(\n utility,\n done=RankCorrelation(rtol=0.0001, burn_in=10),\n n_jobs=n_jobs,\n progress=True,\n)\nvalues.sort(key=\"value\")\nmsr_df = values.to_dataframe(column=\"banzhaf_value\", use_names=True)\n
Inspection of the values reveals (generally) much lower variances. Notice the number of updates to each value as well.
banzhaf_value banzhaf_value_stderr banzhaf_value_updates 137 -0.264918 0.093597 11 20 -0.217394 0.127022 11 19 -0.210309 0.087179 11 41 -0.210119 0.071534 11 192 -0.191667 0.130774 11 from sklearn.linear_model import SGDClassifier\n\nif is_CI:\n utility = Utility(\n model=SGDClassifier(max_iter=2),\n data=dataset,\n scorer=Scorer(\"accuracy\", default=0.0, range=(0, 1)),\n )\nelse:\n utility = Utility(\n model=TorchCNNModel(),\n data=dataset,\n scorer=Scorer(\"accuracy\", default=0.0, range=(0, 1)),\n )\n
def get_semivalues_and_history(\n sampler_t, max_checks=max_checks, n_jobs=n_jobs, progress=True\n):\n _history = HistoryDeviation(n_steps=max_checks, rtol=1e-9)\n if sampler_t == MSRSampler:\n semivalue_function = compute_msr_banzhaf_semivalues\n else:\n semivalue_function = compute_banzhaf_semivalues\n _values = semivalue_function(\n utility,\n sampler_t=sampler_t,\n done=MaxChecks(max_checks + 2) | _history,\n n_jobs=n_jobs,\n progress=progress,\n )\n return _history, _values\n
# Monte Carlo Permutation Sampling Banzhaf semivalues\nhistory_permutation, permutation_values = get_semivalues_and_history(PermutationSampler)\n
# MSR Banzhaf values\nhistory_msr, msr_values = get_semivalues_and_history(MSRSampler)\n
# UniformSampler\nhistory_uniform, uniform_values = get_semivalues_and_history(UniformSampler)\n
# AntitheticSampler\nhistory_antithetic, antithetic_values = get_semivalues_and_history(AntitheticSampler)\n
# RandomHierarchicalSampler\nhistory_random, random_values = get_semivalues_and_history(RandomHierarchicalSampler)\n
The plot above visualizes the convergence speed of different samplers used for Banzhaf semivalue calculation. It shows the average magnitude of the updates to the semivalues at every step of the algorithm.
As you can see, MSR Banzhaf stabilizes much faster. After 1000 iterations (subsets sampled and evaluated with the utility), Permutation Monte Carlo Banzhaf has evaluated the marginal function about 5 times per data point (we are using 200 data points). For MSR , the semivalue of each data point was updated 1000 times. Due to this, the values converge much faster wrt. the number of utility evaluations, which is the key advantage of MSR sampling.
MSR sampling does come at a cost, however, which is that the updates to the semivalues are more noisy than in other methods. We will analyze the impact of this tradeoff in the next sections. First, let us look at how similar all the computed semivalues are. They are all Banzhaf values, so in a perfect world, all samplers should result in the exact same semivalues. However, due to randomness in the utility (recall that we use a neural network) and randomness in the samplers, the resulting values are likely never exactly the same. Another quality measure is that a good sampler would lead to very consistent values, a bad one to less consistent values. Let us first examine how similar the results are, then we'll look at consistency.
This plot shows that the samplers lead to quite different Banzhaf semivalues, however, all of them have some points in common. The MSR Sampler does not seem to be significantly worse than any others.
In an ideal setting without randomness, the overlap of points would be higher; however, the stochastic nature of the CNN model that we use, together with the fact that we use only 200 data points for training, might overshadow these results. As a matter of fact, we have the following rather discouraging result:
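The overlap computation itself is omitted above. A minimal sketch, assuming each ValuationResult is sorted in place by sort() and exposes its data indices through the indices property, could be:
all_results = [permutation_values, msr_values, uniform_values, antithetic_values, random_values]\ntop_sets = []\nfor result in all_results:\n    result.sort(key=\"value\")\n    # indices of the 20 data points with the highest estimated values\n    top_sets.append(set(result.indices[-20:].tolist()))\ncommon_top = set.intersection(*top_sets)\nprint(f\"Total number of top 20 points that all samplers have in common: {len(common_top)}\")\n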
\nTotal number of top 20 points that all samplers have in common: 0\n
\n
"},{"location":"examples/msr_banzhaf_digits/#banzhaf-semi-values-for-data-valuation","title":"Banzhaf Semi-values for data valuation","text":" This notebook showcases Data Banzhaf: A Robust Data Valuation Framework for Machine Learning by Wang, and Jia.
Computing Banzhaf semi-values using pyDVL follows basically the same procedure as all other semi-value-based methods like Shapley values. However, Data-Banzhaf tends to be more robust to stochasticity in the training process than other semi-values. A property that we study here.
Additionally, we compare two sampling techniques: the standard permutation-based Monte Carlo sampling, and the so-called MSR (Maximum Sample Reuse) principle.
In order to highlight the strengths of Data-Banzhaf, we require a stochastic model. For this reason, we use a CNN to classify handwritten digits from the scikit-learn toy datasets .
"},{"location":"examples/msr_banzhaf_digits/#setup","title":"Setup","text":"If you are reading this in the documentation, some boilerplate (including most plotting code) has been omitted for convenience."},{"location":"examples/msr_banzhaf_digits/#loading-the-dataset","title":"Loading the dataset","text":" We use a support function, load_digits_dataset()
, which downloads the data and prepares it for usage. It returns four arrays that we then use to construct a Dataset . The data consists of grayscale images of shape 8x8 pixels with 16 shades of gray. These images contain handwritten digits from 0 to 9.
"},{"location":"examples/msr_banzhaf_digits/#creating-the-utility-and-computing-banzhaf-semivalues","title":"Creating the utility and computing Banzhaf semivalues","text":" Now we can calculate the contribution of each training sample to the model performance. First we need a model and a Scorer .
As a model, we use a simple CNN written in torch, wrapped into an object that converts numpy arrays into tensors (as of v0.9.0, valuation methods in pyDVL work only with numpy arrays). Note that any model that implements the protocol pydvl.utils.types.SupervisedModel , which is just the standard sklearn interface of fit()
, predict()
and score()
can be used to construct the utility.
"},{"location":"examples/msr_banzhaf_digits/#evaluation-on-anomalous-data","title":"Evaluation on anomalous data","text":" An interesting use-case for data valuation is finding anomalous data. Maybe some of the data is really noisy or has been mislabeled. To simulate this, we will change some of the labels of our dataset and add noise to some others. Intuitively, these anomalous data points should then have a lower value.
To evaluate this, let us first check the average value of the first 10 data points, as these will be the ones that we modify. Currently, these are the 10 data points with the highest values:
"},{"location":"examples/msr_banzhaf_digits/#maximum-sample-reuse-banzhaf","title":"Maximum Sample Reuse Banzhaf","text":" Despite the previous results already being useful, we had to retrain the model a number of times and yet the variance of the value estimates was high. This has consequences for the stability of the top-k ranking of points, which decreases the applicability of the method. We now introduce a different sampling method called Maximum Sample Reuse ( MSR ) which reuses every sample for updating the Banzhaf values. The method was introduced by the authors of Data-Banzhaf and is much more sample-efficient, as we will show.
We next construct a new utility. Note how this time we don't use a cache: the chance of hitting twice the same subset of the training set is low enough that one can dispense with it (nevertheless it can still be useful, e.g. when running many experiments).
"},{"location":"examples/msr_banzhaf_digits/#compare-convergence-speed-of-banzhaf-and-msr-banzhaf-values","title":"Compare convergence speed of Banzhaf and MSR Banzhaf Values","text":" Conventional margin-based samplers produce require evaluating the utility twice to do one update of the value, and permutation samplers do instead \\(n+1\\) evaluations for \\(n\\) updates. Maximum Sample Reuse ( MSR ) updates instead all indices in every sample that the utility evaluates. We compare the convergence rates of these methods.
In order to do so, we will compute the semi-values using different samplers and use a high number of iterations to make sure that the values have converged.
"},{"location":"examples/msr_banzhaf_digits/#similarity-of-the-semivalues-computed-using-different-samplers","title":"Similarity of the semivalues computed using different samplers","text":""},{"location":"examples/msr_banzhaf_digits/#consistency-of-the-semivalues","title":"Consistency of the semivalues","text":" Finally, we want to analyze how consistent the semivalues returned by the different samplers are. In order to do this, we compute semivalues multiple times and check how many of the data points in the top and lowest 20% of valuation of the data overlap.
"},{"location":"examples/msr_banzhaf_digits/#conclusion","title":"Conclusion","text":" MSR sampling updates the semivalue estimates for every index in the sample, much more frequently than any other sampler available, which leads to much faster convergence . Additionally, the sampler is more consistent with its value estimates than the other samplers, which might be caused by the higher number of value updates.
There is alas no general recommendation. It is best to try different samplers when computing semivalues and test which one is best suited for your use case. Nevertheless, the MSR sampler seems like a more efficient sampler which may bring fast results and is well-suited for stochastic models.
"},{"location":"examples/shapley_basic_spotify/","title":"Shapley values","text":" This notebook introduces Shapley methods for the computation of data value using pyDVL.
In order to illustrate the practical advantages, we will predict the popularity of songs in the dataset Top Hits Spotify from 2000-2019 , and highlight how data valuation can help investigate and boost the performance of the models. In doing so, we will describe the basic usage patterns of pyDVL.
Recall that data value is a function of three things:
- The dataset.
- The model.
- The performance metric or scoring function.
Below we will describe how to instantiate each one of these objects and how to use them for data valuation. Please also see the documentation on data valuation .
We will be using the following functions from pyDVL. The main entry point is the function compute_shapley_values() , which provides a facade to all Shapley methods. In order to use it we need the classes Dataset , Utility and Scorer .
%autoreload\nfrom pydvl.reporting.plots import plot_shapley\nfrom pydvl.utils.dataset import GroupedDataset\nfrom support.shapley import load_spotify_dataset\nfrom pydvl.value import *\n
training_data, val_data, test_data = load_spotify_dataset(\n val_size=0.3, test_size=0.3, target_column=\"popularity\", random_state=random_state\n)\n
training_data[0].head()\n
artist song duration_ms explicit year danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo genre 1561 Fetty Wap 679 (feat. Remy Boyz) 196693 True 2015 0.618 0.717 7 -5.738 1 0.3180 0.00256 0.000000 0.6250 0.603 190.050 8 1410 Meghan Trainor All About That Bass 187920 True 2015 0.807 0.887 9 -3.726 1 0.0503 0.05730 0.000003 0.1240 0.961 134.052 14 1772 Katy Perry Chained To The Rhythm 237733 False 2017 0.562 0.800 0 -5.404 1 0.1120 0.08140 0.000000 0.1990 0.471 95.029 14 1670 Sigala Sweet Lovin' - Radio Edit 202149 False 2015 0.683 0.910 10 -1.231 1 0.0515 0.05530 0.000005 0.3360 0.674 124.977 15 1780 Liam Payne Strip That Down 204502 False 2017 0.869 0.485 6 -5.595 1 0.0545 0.24600 0.000000 0.0765 0.527 106.028 14 The dataset has many high-level features, some quite intuitive ('duration_ms' or 'tempo'), while others are a bit more cryptic ('valence'?). For information on each feature, please consult the dataset's website .
In our analysis, we will use all the columns, except for 'artist' and 'song', to predict the 'popularity' of each song. We will nonetheless keep the information on song and artist in a separate object for future reference.
song_name = training_data[0][\"song\"]\nartist = training_data[0][\"artist\"]\ntraining_data[0] = training_data[0].drop([\"song\", \"artist\"], axis=1)\ntest_data[0] = test_data[0].drop([\"song\", \"artist\"], axis=1)\nval_data[0] = val_data[0].drop([\"song\", \"artist\"], axis=1)\n
Input and label data are then used to instantiate a Dataset object:
dataset = Dataset(*training_data, *val_data)\n
The calculation of exact Shapley values is computationally very expensive (exponentially so!) because it requires training the model on every possible subset of the training set. For this reason, PyDVL implements techniques to speed up the calculation, such as Monte Carlo approximations , surrogate models or caching of intermediate results and grouping of data to calculate group Shapley values instead of single data points.
In our case, we will group songs by artist and calculate the Shapley value for the artists. Given the pandas Series for 'artist', to group the dataset by it, one does the following:
grouped_dataset = GroupedDataset.from_dataset(dataset=dataset, data_groups=artist)\n
utility = Utility(\n model=GradientBoostingRegressor(n_estimators=3),\n data=grouped_dataset,\n scorer=Scorer(\"neg_mean_absolute_error\", default=0.0),\n)\nvalues = compute_shapley_values(\n utility,\n mode=ShapleyMode.TruncatedMontecarlo,\n # Stop if the standard error is below 1% of the range of the values (which is ~2),\n # or if the number of updates exceeds 1000\n done=AbsoluteStandardError(threshold=0.2, fraction=0.9) | MaxUpdates(1000),\n truncation=RelativeTruncation(utility, rtol=0.01),\n n_jobs=-1,\n)\nvalues.sort(key=\"value\")\ndf = values.to_dataframe(column=\"data_value\", use_names=True)\n
\nCancellation of futures is not supported by the joblib backend\n
\n
The function compute_shapley_values() serves as a common access point to all Shapley methods. For most of them, we must choose a StoppingCriterion
with the argument done=
. In this case we choose to stop when the ratio of standard error to value is below 0.2 for at least 90% of the training points, or if the number of updates of any index exceeds 1000. The mode
argument specifies the Shapley method to use. In this case, we use the Truncated Monte Carlo approximation , which is the fastest of the Monte Carlo methods, owing both to using the permutation definition of Shapley values and the ability to truncate the iteration over a given permutation. We configure this to happen when the contribution of the remaining elements is below 1% of the total utility with the parameter truncation=
and the policy RelativeTruncation .
Let's take a look at the returned dataframe:
df.head()\n
data_value data_value_stderr Years & Years -1.150663 0.195376 Reik -1.123071 0.126558 Astrid S -0.945702 0.331619 Liam Payne -0.886687 0.112654 DB Boulevard -0.847957 0.057503 The first thing to notice is that we sorted the results in ascending order of Shapley value. The index holds the labels for each data group: in this case, artist names. The column data_value
is just that: the Shapley Data value, and data_value_stderr
is its estimated standard error because we are using a Monte Carlo approximation.
Let us plot the results. In the next cell we will take the 30 artists with the lowest score and plot their values with 95% Normal confidence intervals. Keep in mind that Monte Carlo Shapley is typically very noisy, and it can take many steps to arrive at a clean estimate.
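The plotting cell itself is omitted in this rendered version. A minimal matplotlib sketch of such a plot (the notebook presumably uses the plot_shapley helper imported above; here the 1.96 factor corresponds to the 95% Normal interval and plt is assumed to be imported) could be:
lowest = df.iloc[:30]\nci = 1.96 * lowest[\"data_value_stderr\"]\nplt.figure(figsize=(10, 4))\nplt.errorbar(range(len(lowest)), lowest[\"data_value\"], yerr=ci, fmt=\"o\", capsize=3)\nplt.xticks(range(len(lowest)), lowest.index, rotation=90)\nplt.ylabel(\"Shapley value\")\nplt.show()\n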
We can immediately see that many artists (groups of samples) have very low, even negative value, which means that they tend to decrease the total score of the model when present in the training set! What happens if we remove them?
In the next cell we create a new training set excluding the artists with the lowest scores:
low_dvl_artists = df.iloc[: int(0.2 * len(df))].index.to_list()\nartist_filter = ~artist.isin(low_dvl_artists)\nX_train_good_dvl = training_data[0][artist_filter]\ny_train_good_dvl = training_data[1][artist_filter]\n
Now we will use this \"cleaned\" dataset to retrain the same model and compare its mean absolute error to the one trained on the full dataset. Notice that the score now is calculated using the test set, while in the calculation of the Shapley values we were using the validation set.
model_good_data = GradientBoostingRegressor(n_estimators=3).fit(\n X_train_good_dvl, y_train_good_dvl\n)\nerror_good_data = mean_absolute_error(\n model_good_data.predict(test_data[0]), test_data[1]\n)\n\nmodel_all_data = GradientBoostingRegressor(n_estimators=3).fit(\n training_data[0], training_data[1]\n)\nerror_all_data = mean_absolute_error(model_all_data.predict(test_data[0]), test_data[1])\n\nprint(f\"Improvement: {100*(error_all_data - error_good_data)/error_all_data:02f}%\")\n
\nImprovement: 15.314214%\n
\n
The score has improved by about 15%! This is quite an important result, as it shows a consistent process to improve the performance of a model by excluding data points from its training set.
One must however proceed with caution instead of simply throwing away data. For one, `mean_absolute_error` is an estimate of generalization error on unseen data, so the improvement we see on the test set might not be as large upon deployment. It would be advisable to cross-validate this whole process to obtain more conservative estimates. It is also advisable to manually inspect the artists with low value and to try to understand the reason why the model behaves like it does. Finally, remember that **the value depends on the model chosen**! Artists that are detrimental to the Gradient Boosting Regressor might be informative for a different model (although it is likely that the worst ones share some characteristic making them \"bad\" for other regressors). Let us take all the songs by Billie Eilish, set their score to 0 and re-calculate the Shapley values.
y_train_anomalous = training_data[1].copy(deep=True)\ny_train_anomalous[artist == \"Billie Eilish\"] = 0\nanomalous_dataset = Dataset(\n x_train=training_data[0],\n y_train=y_train_anomalous,\n x_test=val_data[0],\n y_test=val_data[1],\n)\ngrouped_anomalous_dataset = GroupedDataset.from_dataset(anomalous_dataset, artist)\nanomalous_utility = Utility(\n model=GradientBoostingRegressor(n_estimators=3),\n data=grouped_anomalous_dataset,\n scorer=Scorer(\"neg_mean_absolute_error\", default=0.0),\n)\nvalues = compute_shapley_values(\n anomalous_utility,\n mode=ShapleyMode.TruncatedMontecarlo,\n done=AbsoluteStandardError(threshold=0.2, fraction=0.9) | MaxUpdates(1000),\n n_jobs=-1,\n)\nvalues.sort(key=\"value\")\ndf = values.to_dataframe(column=\"data_value\", use_names=True)\n
\nCancellation of futures is not supported by the joblib backend\n
\n
Let us now consider the low-value artists (at least for predictive purposes, no claims are made about their artistic value!) and plot the results
And Billie Eilish (our anomalous data group) has moved from top contributor to having negative impact on the performance of the model, as expected!
What is going on? A popularity of 0 for Billie Eilish's songs is inconsistent with listening patterns for other artists. In artificially setting this, we degrade the predictive power of the model.
By dropping low-value groups or samples, one can often increase model performance, but by inspecting them, it is possible to identify bogus data sources or acquisition methods.
"},{"location":"examples/shapley_basic_spotify/#shapley-for-data-valuation","title":"Shapley for data valuation","text":""},{"location":"examples/shapley_basic_spotify/#setup","title":"Setup","text":" We begin by importing the main libraries and setting some defaults.
If you are reading this in the documentation, some boilerplate (including most plotting code) has been omitted for convenience."},{"location":"examples/shapley_basic_spotify/#loading-and-grouping-the-dataset","title":"Loading and grouping the dataset","text":" pyDVL provides a support function for this notebook, load_spotify_dataset()
, which downloads data on songs published after 2014, and splits 30% of data for testing, and 30% of the remaining data for validation. The return value is a triple of training, validation and test data as lists of the form [X_input, Y_label]
.
"},{"location":"examples/shapley_basic_spotify/#creating-the-utility-and-computing-values","title":"Creating the utility and computing values","text":" Now we can calculate the contribution of each group to the model performance.
As a model, we use scikit-learn's GradientBoostingRegressor , but pyDVL can work with any model from sklearn, xgboost or lightgbm. More precisely, any model that implements the protocol pydvl.utils.types.SupervisedModel , which is just the standard sklearn interface of fit()
, predict()
and score()
can be used to construct the utility.
The third and final component is the scoring function. It can be anything like accuracy or \\(R^2\\) , and is set with a string from the standard sklearn scoring methods . Please refer to that documentation on information on how to define your own scoring function.
We group dataset, model and scoring function into an instance of Utility .
"},{"location":"examples/shapley_basic_spotify/#evaluation-on-anomalous-data","title":"Evaluation on anomalous data","text":" One interesting test is to corrupt some data and to monitor how their value changes. To do this, we will take one of the artists with the highest value and set the popularity of all their songs to 0.
"},{"location":"examples/shapley_knn_flowers/","title":"KNN Shapley","text":" This notebook shows how to calculate Shapley values for the K-Nearest Neighbours algorithm. By making use of the local structure of KNN, it is possible to compute an exact value in almost linear time, as opposed to exponential complexity of exact, model-agnostic Shapley.
The main idea is to exploit the fact that adding or removing points beyond the k-ball doesn't influence the score. Because the algorithm then essentially only needs to do a search, it runs in \\(\\mathcal{O}(N \\log N)\\) time.
By further using approximate nearest neighbours, it is possible to achieve \\((\\epsilon,\\delta)\\) -approximations in sublinear time. However, this is not implemented in pyDVL yet.
We refer to the original paper that pyDVL implements for details: Jia, Ruoxi, David Dao, Boxin Wang, Frances Ann Hubis, Nezihe Merve Gurel, Bo Li, Ce Zhang, Costas Spanos, and Dawn Song. Efficient Task-Specific Data Valuation for Nearest Neighbor Algorithms . Proceedings of the VLDB Endowment 12, no. 11 (1 July 2019): 1610\u201323.
The main entry point is the function compute_shapley_values() , which provides a facade to all Shapley methods. In order to use it we need the classes Dataset , Utility and Scorer , all of which can be imported from pydvl.value
:
from pydvl.value import *\n
sklearn_dataset = datasets.load_iris()\ndata = Dataset.from_sklearn(sklearn_dataset)\nknn = sk.neighbors.KNeighborsClassifier(n_neighbors=5)\nutility = Utility(knn, data)\n
shapley_values = compute_shapley_values(utility, mode=ShapleyMode.KNN, progress=True)\nshapley_values.sort(key=\"value\")\nvalues = shapley_values.values\n
\n0it [00:00, ?it/s]
\n
If we now look at the distribution of Shapley values for each class, we see that each has samples with both high and low scores. This is expected, because an accurate model uses information of all classes.
corrupted_data = deepcopy(data)\nn_corrupted = 10\ncorrupted_data.y_train[:n_corrupted] = (corrupted_data.y_train[:n_corrupted] + 1) % 3\nknn = sk.neighbors.KNeighborsClassifier(n_neighbors=5)\ncontaminated_values = compute_shapley_values(\n Utility(knn, corrupted_data), mode=ShapleyMode.KNN\n)\n
Taking the average corrupted value and comparing it to non-corrupted ones, we notice that on average anomalous points have a much lower score, i.e. they tend to be much less valuable to the model.
To do this, first we make sure that we access the results by data index with a call to ValuationResult.sort()
, then we split the values into two groups: corrupted and non-corrupted. Note how we access property values
of the ValuationResult
object. This is a numpy array of values, sorted however the object was sorted. Finally, we compute the quantiles of the two groups and compare them. We see that the corrupted mean is in the lowest percentile of the value distribution, while the correct mean is in the 70th percentile.
contaminated_values.sort(\n key=\"index\"\n) # This is redundant, but illustrates sorting, which is in-place\n\ncorrupted_shapley_values = contaminated_values.values[:n_corrupted]\ncorrect_shapley_values = contaminated_values.values[n_corrupted:]\n\nmean_corrupted = np.mean(corrupted_shapley_values)\nmean_correct = np.mean(correct_shapley_values)\npercentile_corrupted = np.round(100 * np.mean(values < mean_corrupted), 0)\npercentile_correct = np.round(100 * np.mean(values < mean_correct), 0)\n\nprint(\n f\"The corrupted mean is at percentile {percentile_corrupted:.0f} of the value distribution.\"\n)\nprint(\n f\"The correct mean is percentile {percentile_correct:.0f} of the value distribution.\"\n)\n
\nThe corrupted mean is at percentile 2 of the value distribution.\nThe correct mean is percentile 71 of the value distribution.\n
\n
This is confirmed if we plot the distribution of Shapley values and circle corrupt points in red. They all tend to have low Shapley scores, regardless of their position in space and assigned label:
"},{"location":"examples/shapley_knn_flowers/#knn-shapley","title":"KNN Shapley","text":""},{"location":"examples/shapley_knn_flowers/#setup","title":"Setup","text":" We begin by importing the main libraries and setting some defaults.
If you are reading this in the documentation, some boilerplate (including most plotting code) has been omitted for convenience."},{"location":"examples/shapley_knn_flowers/#building-a-dataset-and-a-utility","title":"Building a Dataset and a Utility","text":" We use the sklearn iris dataset and wrap it into a pydvl.utils.dataset.Dataset calling the factory pydvl.utils.dataset.Dataset.from_sklearn() . This automatically creates a train/test split for us which will be used to compute the utility.
We then create a model and instantiate a Utility using data and model. The model needs to implement the protocol pydvl.utils.types.SupervisedModel , which is just the standard sklearn interface of fit()
, predict()
and score()
. In constructing the Utility
one can also choose a scoring function, but we pick the default which is just the model's knn.score()
.
"},{"location":"examples/shapley_knn_flowers/#computing-values","title":"Computing values","text":" Calculating the Shapley values is straightforward. We just call compute_shapley_values() with the utility object we created above. The function returns a ValuationResult . This object contains the values themselves, data indices and labels.
"},{"location":"examples/shapley_knn_flowers/#inspecting-the-results","title":"Inspecting the results","text":" Let us first look at the labels' distribution as a function of petal and sepal length:
"},{"location":"examples/shapley_knn_flowers/#corrupting-labels","title":"Corrupting labels","text":" To test how informative values are, we can corrupt some training labels and see how their Shapley values change with respect to the non-corrupted points.
"},{"location":"examples/shapley_utility_learning/","title":"Data utility learning","text":" This notebook introduces Data Utility Learning , a method of approximating Data Shapley values by learning to estimate the utility function.
The idea is to employ a model to learn the performance of the learning algorithm of interest on unseen data combinations (i.e. subsets of the dataset). The method was originally described in Wang, Tianhao, Yu Yang, and Ruoxi Jia. Improving Cooperative Game Theory-Based Data Valuation via Data Utility Learning . arXiv, 2022 .
Warning: Work on Data Utility Learning is preliminary. It remains to be seen when or whether it can be put effectively into application. For this further testing and benchmarking are required.
Recall the definition of Shapley value \\(v_u(i)\\) for data point \\(i\\) :
\\[\\begin{equation} v_u(i) = \\frac{1}{n} \\sum_{S \\subseteq N \\setminus \\{i\\}} \\binom{n-1}{|S|}^{-1} [u(S \\cup \\{i\\}) \u2212 u(S)] , \\tag{1} \\label{eq:shapley-def} \\end{equation}\\] where \\(N\\) is the set of all indices in the training set and \\(u\\) is the utility.
In Data Utility Learning, to avoid the exponential cost of computing this sum, one learns a surrogate model for \\(u\\) . We start by sampling so-called utility samples to form a training set \\(S_\\mathrm{train}\\) for our utility model. Each utility sample is a tuple consisting of a subset of indices \\(S_j\\) in the dataset and its utility \\(u(S_j)\\) :
\\[\\mathcal{S}_\\mathrm{train} = \\{(S_j, u(S_j)): j = 1 , ..., m_\\mathrm{train}\\}\\] where \\(m_\\mathrm{train}\\) denotes the training budget for the learned utility function.
The subsets are then transformed into boolean vectors \\(\\phi\\) in which a \\(1\\) at index \\(k\\) means that the \\(k\\) -th sample of the dataset is present in the subset:
\\[S_j \\mapsto \\phi_j \\in \\{ 0, 1 \\}^{N}\\] We fit a regression model \\(\\tilde{u}\\) , called data utility model , on the transformed utility samples \\(\\phi (\\mathcal{S}_\\mathrm{train}) := \\{(\\phi(S_j), u(S_j)): j = 1 , ..., m_\\mathrm{train}\\}\\) and use it to predict the utility, instead of computing it, for any \\(S_j \\notin \\mathcal{S}_\\mathrm{train}\\) . We abuse notation and identify \\(\\tilde{u}\\) with the composition \\(\\tilde{u} \\circ \\phi : 2^{N} \\rightarrow \\mathbb{R}\\) .
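As a small, purely illustrative example of the encoding \\(\\phi\\) (the dataset size and subset below are hypothetical):
import numpy as np\n\nn = 6  # hypothetical dataset size\nS_j = [0, 2, 3]  # a sampled subset of indices\nphi_j = np.zeros(n, dtype=int)\nphi_j[S_j] = 1  # phi_j is now [1, 0, 1, 1, 0, 0]\n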
The main assumption is that it is much faster to fit and use \\(\\tilde{u}\\) than it is to compute \\(u\\) and that for most \\(i\\) , \\(v_\\tilde{u}(i) \\approx v_u(i)\\) in some sense.
As is the case with all other Shapley methods, the main entry point is the function compute_shapley_values() , which provides a facade to all algorithms in this family. We use it with the usual classes Dataset and Utility . In addition, we must import the core class for learning a utility, DataUtilityLearning .
%autoreload\nfrom pydvl.utils import DataUtilityLearning, top_k_value_accuracy\nfrom pydvl.reporting.plots import shaded_mean_std\nfrom pydvl.value import *\n
dataset = Dataset.from_sklearn(\n load_iris(),\n train_size=train_size,\n random_state=random_state,\n stratify_by_target=True,\n)\n
We verify that, as in the paper, if we fit a Support-Vector Classifier to the training data, we obtain an accuracy of around 92%:
model = LinearSVC()\nmodel.fit(dataset.x_train, dataset.y_train)\nprint(f\"Mean accuracy: {100 * model.score(dataset.x_test, dataset.y_test):0.2f}%\")\n
\nMean accuracy: 92.59%\n
\n
computation_times = {}\n
utility = Utility(model=model, data=dataset)\n
start_time = time.monotonic()\n\nresult = compute_shapley_values(\n u=utility,\n mode=ShapleyMode.CombinatorialExact,\n n_jobs=-1,\n progress=False,\n)\n\ncomputation_time = time.monotonic() - start_time\ncomputation_times[\"exact\"] = computation_time\n\ndf = result.to_dataframe(column=\"exact\").drop(columns=[\"exact_stderr\"])\n
We now estimate the Data Shapley values using the DataUtilityLearning wrapper. This class wraps a Utility and delegates calls to it, up until a given budget. Every call yields a utility sample which is saved under the hood for training of the given utility model. Once the budget is exhausted, DataUtilityLearning fits the model to the utility samples and all subsequent calls use the learned model to predict the wrapped utility instead of delegating to it.
For the utility model we follow the paper and use a fully connected neural network. To train it we use a total of training_budget utility samples. We repeat this multiple times for each training budget.
Note how we use a Monte Carlo approximation instead of the exact combinatorial computation used before. This is because the exact computation samples subsets in a particular order, from the smallest size to the largest. Since the training budget for the model learning the utility is only around a quarter of the total number of subsets, we would never see utility samples for the larger sizes and the model would be biased (try it!).
mlp_kwargs = dict(\n hidden_layer_sizes=(20, 10),\n activation=\"relu\",\n solver=\"adam\",\n learning_rate_init=0.001,\n batch_size=batch_size,\n max_iter=800,\n)\n\nprint(\n f\"Doing {n_runs} runs for each of {len(training_budget_values)} different training budgets.\"\n)\n\npbar = tqdm(\n product(range(n_runs), training_budget_values),\n total=n_runs * len(training_budget_values),\n)\nfor idx, budget in pbar:\n pbar.set_postfix_str(f\"Run {idx} for training budget: {budget}\")\n dul_utility = DataUtilityLearning(\n u=utility, training_budget=budget, model=MLPRegressor(**mlp_kwargs)\n )\n\n start_time = time.monotonic()\n\n # DUL will kick in after training_budget calls to utility\n result = compute_shapley_values(\n u=dul_utility,\n mode=ShapleyMode.PermutationMontecarlo,\n done=MaxUpdates(300),\n n_jobs=-1,\n )\n\n computation_time = time.monotonic() - start_time\n if budget in computation_times:\n computation_times[budget].append(computation_time)\n else:\n computation_times[budget] = [computation_time]\n\n dul_df = result.to_dataframe(column=f\"{budget}_{idx}\").drop(\n columns=[f\"{budget}_{idx}_stderr\"]\n )\n df = pd.concat([df, dul_df], axis=1)\n\ncomputation_times_df = pd.DataFrame(computation_times)\n
\nDoing 10 runs for each of 10 different training budgets.\n
\n
\n 0%| | 0/100 [00:00<?, ?it/s]
\n
Next we compute the \\(l_1\\) error for the different training budgets across all runs and plot mean and standard deviation. We obtain results analogous to Figure 1 of the paper, verifying that the method indeed works for estimating the Data Shapley values (at least in this context).
In the plot we also display the mean and standard deviation of the computation time taken for each training budget.
errors = np.zeros((len(training_budget_values), n_runs), dtype=float)\naccuracies = np.zeros((len(training_budget_values), n_runs), dtype=float)\n\ntop_k = 3\n\nfor i, budget in enumerate(training_budget_values):\n for j in range(n_runs):\n y_true = df[\"exact\"].values\n y_estimated = df[f\"{budget}_{j}\"].values\n errors[i, j] = np.linalg.norm(y_true - y_estimated, ord=2)\n accuracies[i, j] = top_k_value_accuracy(y_true, y_estimated, k=top_k)\n\nerror_from_mean = np.linalg.norm(df[\"exact\"].values - df[\"exact\"].values.mean(), ord=2)\n
Let us next look at how well the ranking of values resulting from using the surrogate \\(\\tilde{u}\\) matches the ranking by the exact values. For this we fix \\(k=3\\) and consider the \\(k\\) samples with the highest value according to \\(\\tilde{u}\\) and \\(u\\) :
Finally, for each sample, we look at the distance of the estimates to the exact value across runs. Boxes are centered at the 50th percentile with whiskers at the 25th and 75th. We plot relative distances, as a percentage. We observe a general tendency to underestimate the value:
highest_value_index = df.index[df[\"exact\"].argmax()]\ny_train_corrupted = dataset.y_train.copy()\ny_train_corrupted[highest_value_index] = (\n y_train_corrupted[highest_value_index] + 1\n) % 3\n\ncorrupted_dataset = Dataset(\n x_train=dataset.x_train,\n y_train=y_train_corrupted,\n x_test=dataset.x_test,\n y_test=dataset.y_test,\n)\n
We retrain the model on the new dataset and verify that the accuracy decreases:
model = LinearSVC()\nmodel.fit(dataset.x_train, y_train_corrupted)\nprint(f\"Mean accuracy: {100 * model.score(dataset.x_test, dataset.y_test):0.2f}%\")\n
\nMean accuracy: 82.96%\n
\n
Finally, we recompute the values of all samples using the exact method and the best training budget previously obtained and then plot the resulting scores.
best_training_budget = training_budget_values[errors.mean(axis=1).argmin()]\n\nutility = Utility(\n model=LinearSVC(),\n data=corrupted_dataset,\n)\n\nresult = compute_shapley_values(\n u=utility,\n mode=ShapleyMode.CombinatorialExact,\n n_jobs=-1,\n progress=False,\n)\ndf_corrupted = result.to_dataframe(column=\"exact\").drop(columns=[\"exact_stderr\"])\n\ndul_utility = DataUtilityLearning(\n u=utility, training_budget=best_training_budget, model=MLPRegressor(**mlp_kwargs)\n)\n\nresult = compute_shapley_values(\n u=dul_utility,\n mode=ShapleyMode.PermutationMontecarlo,\n done=MaxUpdates(300),\n n_jobs=-1,\n)\ndul_df = result.to_dataframe(column=\"estimated\").drop(columns=[\"estimated_stderr\"])\ndf_corrupted = pd.concat([df_corrupted, dul_df], axis=1)\n
We can see in the figure that both methods assign the lowest value to the sample with the corrupted label.
As mentioned above, despite the previous results, this work is preliminary and the usefulness of Data Utility Learning remains to be tested in practice."},{"location":"examples/shapley_utility_learning/#data-utility-learning","title":"Data Utility Learning","text":""},{"location":"examples/shapley_utility_learning/#setup","title":"Setup","text":" We begin by importing the main libraries and setting some defaults.
If you are reading this in the documentation, some boilerplate (including most plotting code) has been omitted for convenience."},{"location":"examples/shapley_utility_learning/#dataset","title":"Dataset","text":" Following the paper, we take 15 samples (10%) from the Iris dataset and compute their Data Shapley values by using all the remaining samples as test set for computing the utility, which in this case is accuracy.
"},{"location":"examples/shapley_utility_learning/#data-shapley","title":"Data Shapley","text":" We start by defining the utility using the model and computing the exact Data Shapley values by definition \\(\\ref{eq:shapley-def}\\) .
"},{"location":"examples/shapley_utility_learning/#evaluation-on-anomalous-data","title":"Evaluation on anomalous data","text":" One interesting way to assess the Data Utility Learning approach is to corrupt some data and monitor how the value changes. To do this, we will take the sample with the highest score and change its label.
"},{"location":"getting-started/","title":"Getting started","text":"If you want to jump straight in, install pyDVL and then check out the examples. You will probably want to install with support for influence function computation.
We have introductions to the ideas behind Data valuation and Influence functions, as well as a short overview of common applications.
"},{"location":"getting-started/#","title":"Installing pyDVL","text":"To install the latest release use:
pip install pyDVL\n
See Extras for optional dependencies, in particular if you are interested in influence functions. You can also install the latest development version from TestPyPI:
pip install pyDVL --index-url https://test.pypi.org/simple/\n
In order to check the installation you can use:
python -c \"import pydvl; print(pydvl.__version__)\"\n
"},{"location":"getting-started/#dependencies","title":"Dependencies","text":"pyDVL requires Python >= 3.8, numpy, scikit-learn, scipy, cvxpy for the core methods, and joblib for parallelization locally. Additionally,the Influence functions module requires PyTorch (see Extras below).
"},{"location":"getting-started/#installation-extras","title":"Extras","text":"pyDVL has a few extra dependencies that can be optionally installed:
"},{"location":"getting-started/#installation-influences","title":"Influence functions","text":"To use the module on influence functions, pydvl.influence, run:
pip install pyDVL[influence]\n
This includes a dependency on PyTorch (Version 2.0 and above) and thus is left out by default.
"},{"location":"getting-started/#cupy","title":"CuPy","text":"In case that you have a supported version of CUDA installed (v11.2 to 11.8 as of this writing), you can enable eigenvalue computations for low-rank approximations with CuPy on the GPU by using:
pip install pyDVL[cupy]\n
This installs cupy-cuda11x.
If you use a different version of CUDA, please install CuPy manually.
"},{"location":"getting-started/#ray","title":"Ray","text":"If you want to use Ray to distribute data valuation workloads across nodes in a cluster (it can be used locally as well, but for this we recommend joblib instead) install pyDVL using:
pip install pyDVL[ray]\n
See the intro to parallelization for more details on how to use it.
"},{"location":"getting-started/#memcached","title":"Memcached","text":"If you want to use Memcached for caching utility evaluations, use:
pip install pyDVL[memcached]\n
This installs pymemcache additionally. Be aware that you still have to start a memcached server manually. See Setting up the Memcached cache.
"},{"location":"getting-started/advanced-usage/","title":"Advanced usage","text":"Besides the dos and don'ts of data valuation itself, which are the subject of the examples and the documentation of each method, there are two main things to keep in mind when using pyDVL namely Parallelization and Caching.
"},{"location":"getting-started/advanced-usage/#setting-up-parallelization","title":"Parallelization","text":"pyDVL uses parallelization to scale and speed up computations. It does so using one of Dask, Ray or Joblib. The first is used in the influence package whereas the other two are used in the value package.
"},{"location":"getting-started/advanced-usage/#data-valuation","title":"Data valuation","text":"For data valuation, pyDVL uses joblib for local parallelization (within one machine) and supports using Ray for distributed parallelization (across multiple machines).
The former works out of the box but for the latter you will need to install additional dependencies (see Extras) and to provide a running cluster (or run ray in local mode).
Info
As of v0.9.0 pyDVL does not allow requesting resources per task sent to the cluster, so you will need to make sure that each worker has enough resources to handle the tasks it receives. A data valuation task using game-theoretic methods will typically make a copy of the whole model and dataset to each worker, even if the re-training only happens on a subset of the data. This means that you should make sure that each worker has enough memory to handle the whole dataset.
We use backend classes for both joblib and ray as well as two types of executors for the different algorithms: the first uses a map reduce pattern as seen in the MapReduceJob class and the second implements the futures executor interface from concurrent.futures.
As a convenience, you can also instantiate a parallel backend class by using the init_parallel_backend function:
from pydvl.parallel import init_parallel_backend\nparallel_backend = init_parallel_backend(backend_name=\"joblib\")\n
Info
The executor classes are not meant to be instantiated and used by users of pyDVL. They are used internally as part of the computations of the different methods.
Deprecation notice
We are currently planning to deprecate MapReduceJob in favour of the futures executor interface because it allows for more diverse computation patterns with interruptions.
"},{"location":"getting-started/advanced-usage/#joblib","title":"Joblib","text":"Please follow the instructions in Joblib's documentation for all possible configuration options that you can pass to the parallel_config context manager.
To use the joblib parallel backend with the loky backend and verbosity set to 100 to compute exact Shapley values, you would use:
import joblib\nfrom pydvl.parallel import JoblibParallelBackend\nfrom pydvl.value.shapley import combinatorial_exact_shapley\nfrom pydvl.utils.utility import Utility\n\nparallel_backend = JoblibParallelBackend() \nu = Utility(...)\n\nwith joblib.parallel_config(backend=\"loky\", verbose=100):\n values = combinatorial_exact_shapley(u, parallel_backend=parallel_backend)\n
"},{"location":"getting-started/advanced-usage/#ray","title":"Ray","text":"Additional dependencies
The Ray parallel backend requires optional dependencies. See Extras for more information.
Please follow the instructions in Ray's documentation to set up a remote cluster. You could alternatively use a local cluster and in that case you don't have to set anything up.
Before starting a computation, you should initialize ray by calling ray.init with the appropriate parameters:
To set up and start a local ray cluster with 4 CPUs you would use:
import ray\n\nray.init(num_cpus=4)\n
Whereas for a remote ray cluster you would use:
import ray\n\naddress = \"<Hypothetical Ray Cluster IP Address>\"\nray.init(address)\n
To use the ray parallel backend to compute exact Shapley values, you would use:
import ray\nfrom pydvl.parallel import RayParallelBackend\nfrom pydvl.value.shapley import combinatorial_exact_shapley\nfrom pydvl.utils.utility import Utility\n\nray.init()\nparallel_backend = RayParallelBackend()\nu = Utility(...)\nvalues = combinatorial_exact_shapley(u, parallel_backend=parallel_backend)\n
"},{"location":"getting-started/advanced-usage/#futures-executor","title":"Futures executor","text":"For the futures executor interface, we have implemented an executor class for ray in RayExecutor and rely on joblib's loky get_reusable_executor function to instantiate an executor for local parallelization.
They are both compatible with the built-in ThreadPoolExecutor and ProcessPoolExecutor classes.
>>> from joblib.externals.loky import _ReusablePoolExecutor\n>>> from pydvl.parallel import JoblibParallelBackend\n>>> parallel_backend = JoblibParallelBackend() \n>>> with parallel_backend.executor() as executor:\n... results = list(executor.map(lambda x: x + 1, range(3)))\n...\n>>> results\n[1, 2, 3]\n
"},{"location":"getting-started/advanced-usage/#map-reduce","title":"Map-reduce","text":"The map-reduce interface is older and more limited in the patterns it allows us to use.
To reproduce the previous example using MapReduceJob, we would use:
>>> from pydvl.parallel import JoblibParallelBackend, MapReduceJob\n>>> parallel_backend = JoblibParallelBackend() \n>>> map_reduce_job = MapReduceJob(\n... list(range(3)),\n... map_func=lambda x: x[0] + 1,\n... parallel_backend=parallel_backend,\n... )\n>>> results = map_reduce_job()\n>>> results\n[1, 2, 3]\n
"},{"location":"getting-started/advanced-usage/#influence-functions","title":"Influence functions","text":"Refer to Scaling influence computation for explanations about parallelization for Influence Functions.
"},{"location":"getting-started/advanced-usage/#getting-started-cache","title":"Caching","text":"PyDVL can cache (memoize) the computation of the utility function and speed up some computations for data valuation. It is however disabled by default. When it is enabled it takes into account the data indices passed as argument and the utility function wrapped into the Utility object. This means that care must be taken when reusing the same utility function with different data, see the documentation for the caching package for more information.
In general, caching won't play a major role in the computation of Shapley values because the probability of sampling the same subset twice, and hence needing the same utility function computation, is very low. However, it can be very useful when comparing methods that use the same utility function, or when running multiple experiments with the same data.
pyDVL supports 3 different caching backends:
- InMemoryCacheBackend: an in-memory cache backend that uses a dictionary to store and retrieve cached values. This is used to share cached values between threads in a single process.
- DiskCacheBackend: a disk-based cache backend that uses pickled values written to and read from disk. This is used to share cached values between processes in a single machine.
- MemcachedCacheBackend: a Memcached-based cache backend that uses pickled values written to and read from a Memcached server. This is used to share cached values between processes across multiple machines.
Memcached extras: The Memcached backend requires optional dependencies. See Extras for more information.
As an example, here's how one would use the disk-based cached backend with a utility:
from pydvl.utils.caching.disk import DiskCacheBackend\nfrom pydvl.utils.utility import Utility\n\ncache_backend = DiskCacheBackend()\nu = Utility(..., cache_backend=cache_backend)\n
Please refer to the documentation and examples of each backend class for more details.
When is the cache really necessary?
Crucially, semi-value computations with the PermutationSampler require caching to be enabled, or they will take twice as long as the direct implementation in compute_shapley_values.
Using the cache
Continue reading about the cache in the documentation for the caching package.
"},{"location":"getting-started/advanced-usage/#setting-up-memcached","title":"Setting up the Memcached cache","text":"Memcached is an in-memory key-value store accessible over the network. pyDVL can use it to cache the computation of the utility function and speed up some computations (in particular, semi-value computations with the PermutationSampler but other methods may benefit as well).
You can either install it as a package or run it inside a docker container (the simplest). For installation instructions, refer to the Getting started section in memcached's wiki. Then you can run it with:
memcached -u user\n
To run memcached inside a container in daemon mode instead, use:
docker container run -d --rm -p 11211:11211 memcached:latest\n
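Once the server is running you can point pyDVL at it. The following sketch assumes the server listens on the default localhost:11211 and that the backend's default configuration matches it; please check the MemcachedCacheBackend documentation for the exact configuration options:
from pydvl.utils.caching.memcached import MemcachedCacheBackend\nfrom pydvl.utils.utility import Utility\n\n# Assumes a memcached server reachable at the default localhost:11211\ncache_backend = MemcachedCacheBackend()\nu = Utility(..., cache_backend=cache_backend)\n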
"},{"location":"getting-started/applications/","title":"Applications of data valuation","text":"Data valuation methods can improve various aspects of data engineering and machine learning workflows. When applied judiciously, these methods can enhance data quality, model performance, and cost-effectiveness.
However, the results can be inconsistent. Values have a strong dependency on the training procedure and the performance metric used. For instance, accuracy is a poor metric for imbalanced sets and this has a stark effect on data values. Some models exhibit great variance in some regimes and this again has a detrimental effect on values. See Problems of data values for more on this.
Here we quickly enumerate the most common uses of data valuation. For a comprehensive overview, along with concrete examples, please refer to the Transferlab blog post on this topic.
"},{"location":"getting-started/applications/#data-engineering","title":"Data engineering","text":"Some of the promising applications in data engineering include:
- Removing low-value data points to increase model performance.
- Pruning redundant samples enables more efficient training of large models.
- Active learning. Points predicted to have high value can be prioritized for labeling, reducing the cost of data collection.
- Analyzing high- and low-value data to guide data collection and improve upstream data processes. Low-value points may reveal data issues to address.
- Identifying irrelevant or duplicated data when evaluating offerings from data providers.
"},{"location":"getting-started/applications/#model-development","title":"Model development","text":"Some of the useful applications include:
- Data attribution for interpretation and debugging: Analyzing the most or least valuable samples for a class can reveal cases where the model relies on confounding features instead of true signal. Investigating influential points for misclassified examples highlights limitations to address.
- Sensitivity / robustness analysis: (Broderick et al., 2021)1 shows that removing a small fraction of highly influential data can completely flip model conclusions. This can reveal potential issues with the modeling approach, data collection process, or intrinsic difficulties of the problem that require further inspection.
- Continual learning: in order to avoid forgetting when training on new data, a subset of previously seen data is presented again. Data valuation can help in the selection of the most valuable samples to retain.
"},{"location":"getting-started/applications/#attacks","title":"Attacks","text":"Data valuation techniques have applications in detecting data manipulation and contamination, although the feasibility of such attacks is limited.
- Watermark removal: Points with low value on a correct validation set may be part of a watermarking mechanism.
- Poisoning attacks: Influential points can be shifted to induce large changes in model estimators.
"},{"location":"getting-started/applications/#data-markets","title":"Data markets","text":"Additionally, one of the motivating applications for the whole field is that of data markets, where data valuation can be the key component to determine the price of data.
Game-theoretic valuation methods like Shapley values can help assign fair prices, but have limitations around handling duplicates or adversarial data. Model-free methods like LAVA (Just et al., 2023)2 and CRAIG are particularly well suited for this, as they use the Wasserstein distance between a vendor's data and the buyer's to determine the value of the former.
However, this is a complex problem which faces practical hurdles, such as data owners being unwilling to disclose their data for valuation, even to a broker.
-
Broderick, T., Giordano, R., Meager, R., 2021. An Automatic Finite-Sample Robustness Metric: When Can Dropping a Little Data Make a Big Difference? \u21a9
-
Just, H.A., Kang, F., Wang, T., Zeng, Y., Ko, M., Jin, M., Jia, R., 2023. LAVA: Data Valuation without Pre-Specified Learning Algorithms. Presented at the The Eleventh International Conference on Learning Representations (ICLR 2023).\u00a0\u21a9
"},{"location":"getting-started/benchmarking/","title":"Benchmarking tasks","text":"Because the magnitudes of values or influences from different algorithms, or datasets, are not comparable to each other, evaluation of the methods is typically done with downstream tasks.
"},{"location":"getting-started/benchmarking/#benchmarking-valuation-methods","title":"Benchmarking valuation methods","text":"Data valuation is particularly useful for data selection, pruning and inspection in general. For this reason, the most common benchmarks are data removal and noisy label detection.
"},{"location":"getting-started/benchmarking/#high-value-point-removal","title":"High-value point removal","text":"After computing the values for all data in \\(T = \\{ \\mathbf{z}_i : i = 1, \\ldots, n \\}\\), the set is sorted by decreasing value. We denote by \\(T_{[i :]}\\) the sorted sequence of points \\((\\mathbf{z}_i, \\mathbf{z}_{i + 1}, \\ldots, \\mathbf{z}_n)\\) for \\(1 \\leqslant i \\leqslant n\\). Now train successively \\(f_{T [i :]}\\) and compute its accuracy \\(a_{T_{[i :]}} (D_{\\operatorname{test}})\\) on the held-out test set, then plot all numbers. By using \\(D_{\\operatorname{test}}\\) one approximates the expected accuracy drop on unseen data. Because the points removed have a high value, one expects performance to drop visibly wrt. a random baseline.
"},{"location":"getting-started/benchmarking/#low-value-point-removal","title":"Low-value point removal","text":"The complementary experiment removes data in increasing order, with the lowest valued points first. Here one expects performance to increase relatively to randomly removing points before training. Additionally, every real dataset will include slightly out-of-distribution points, so one should also expect an absolute increase in performance when some of the lowest valued points are removed.
"},{"location":"getting-started/benchmarking/#value-transfer","title":"Value transfer","text":"This experiment explores the extent to which data values computed with one (cheap) model can be transferred to another (potentially more complex) one. Different classifiers are used as a source to calculate data values. These values are then used in the point removal tasks described above, but using a different (target) model for evaluation of the accuracies \\(a_{T [i :]}\\). A multi-layer perceptron is added for evaluation as well.
"},{"location":"getting-started/benchmarking/#noisy-label-detection","title":"Noisy label detection","text":"This experiment tests the ability of a method to detect mislabeled instances in the data. A fixed fraction \\(\\alpha\\) of the training data are picked at random and their labels flipped. Data values are computed, then the \\(\\alpha\\)-fraction of lowest-valued points are selected, and the overlap with the subset of flipped points is computed. This synthetic experiment is however hard to put into practical use, since the fraction \\(\\alpha\\) is of course unknown in practice.
"},{"location":"getting-started/benchmarking/#rank-stability","title":"Rank stability","text":"Introduced in [@wang_data_2022], one can look at how stable the top \\(k\\)% of the values is across runs. Rank stability of a method is necessary but not sufficient for good results. Ideally one wants to identify high-value points reliably (good precision and recall) and consistently (good rank stability).
"},{"location":"getting-started/benchmarking/#benchmarking-influence-function-methods","title":"Benchmarking Influence function methods","text":"Todo
This section is basically a stub
Although in principle one can compute the average influence over the test set and run the same tasks as above, because influences are computed for each pair of training and test sample, they typically require different experiments to compare their efficacy.
"},{"location":"getting-started/benchmarking/#approximation-quality","title":"Approximation quality","text":"The biggest difficulty when computing influences is the approximation of the inverse Hessian-vector product. For this reason one often sees in the literature the quality of the approximation to LOO as an indicator of performance, the exact Influence Function being a first order approximation to it. However, as shown by (Bae et al., 2022)1, the different approximation errors ensuing for lack of convexity, approximate Hessian-vector products and so on, lead to this being a poor benchmark overall.
"},{"location":"getting-started/benchmarking/#data-re-labelling","title":"Data re-labelling","text":"(Kong et al., 2022)2 introduce a method using IFs to re-label harmful training samples in order to improve accuracy. One can then take the obtained improvement as a measure of the quality of the IF method.
"},{"location":"getting-started/benchmarking/#post-hoc-fairness-adjustment","title":"Post-hoc fairness adjustment","text":"Introduced in [@...], the idea is to compute influences over a carefully selected fair set, and using them to re-weight the training data.
-
Bae, J., Ng, N., Lo, A., Ghassemi, M., Grosse, R.B., 2022. If Influence Functions are the Answer, Then What is the Question?, in: Advances in Neural Information Processing Systems. Presented at the NeurIPS 2022, pp. 17953\u201317967.\u00a0\u21a9
-
Kong, S., Shen, Y., Huang, L., 2022. Resolving Training Biases via Influence-based Data Relabeling. Presented at the International Conference on Learning Representations (ICLR 2022).\u00a0\u21a9
"},{"location":"getting-started/first-steps/","title":"First steps","text":"Warning
Make sure you have read Getting started before using the library. In particular read about which extra dependencies you may need.
"},{"location":"getting-started/first-steps/#main-concepts","title":"Main concepts","text":"pyDVL aims to be a repository of production-ready, reference implementations of algorithms for data valuation and influence functions. Even though we only briefly introduce key concepts in the documentation, the following sections should be enough to get you started.
- Basics of data valuation for key objects and usage patterns for Shapley value computation and related methods.
- Computing Influence Values for instructions on how to compute influence functions.
"},{"location":"getting-started/first-steps/#running-the-examples","title":"Running the examples","text":"If you are somewhat familiar with the concepts of data valuation, you can start by browsing our worked-out examples illustrating pyDVL's capabilities either:
- In the examples under Basics of data valuation and Computing Influence Values.
- Using binder notebooks, deployed from each example's page.
- Locally, by starting a jupyter server at the root of the project. You will have to install jupyter first manually since it's not a dependency of the library.
"},{"location":"getting-started/first-steps/#advanced-usage","title":"Advanced usage","text":"Refer to the Advanced usage page for explanations on how to enable and use parallelization and caching.
"},{"location":"getting-started/glossary/","title":"Glossary","text":"This glossary is meant to provide only brief explanations of each term, helping to clarify the concepts and techniques used in the library. For more detailed information, please refer to the relevant literature or resources.
Warning
This glossary is still a work in progress. Pull requests are welcome!
Terms in data valuation and influence functions:
"},{"location":"getting-started/glossary/#arnoldi-method","title":"Arnoldi Method","text":"The Arnoldi method approximately computes eigenvalue, eigenvector pairs of a symmetric matrix. For influence functions, it is used to approximate the iHVP. Introduced by (Schioppa et al., 2022)1 in the context of influence functions.
- Implementation (torch)
- Documentation (torch)
"},{"location":"getting-started/glossary/#block-conjugate-gradient","title":"Block Conjugate Gradient","text":"A blocked version of CG, which solves several linear systems simultaneously. For Influence Functions, it is used to approximate the iHVP.
- Implementation (torch)
- Documentation (torch)
"},{"location":"getting-started/glossary/#class-wise-shapley","title":"Class-wise Shapley","text":"Class-wise Shapley is a Shapley valuation method which introduces a utility function that balances in-class, and out-of-class accuracy, with the goal of favoring points that improve the model's performance on the class they belong to. It is estimated to be particularly useful in imbalanced datasets, but more research is needed to confirm this. Introduced by (Schoch et al., 2022)2.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#conjugate-gradient","title":"Conjugate Gradient","text":"CG is an algorithm for solving linear systems with a symmetric and positive-definite coefficient matrix. For Influence Functions, it is used to approximate the iHVP.
- Implementation (torch)
- Documentation (torch)
"},{"location":"getting-started/glossary/#data-utility-learning","title":"Data Utility Learning","text":"Data Utility Learning is a method that uses an ML model to learn the utility function. Essentially, it learns to predict the performance of a model when trained on a given set of indices from the dataset. The cost of training this model is quickly amortized by avoiding costly re-evaluations of the original utility. Introduced by (Wang et al., 2022)3.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#eigenvalue-corrected-kronecker-factored-approximate-curvature","title":"Eigenvalue-corrected Kronecker-Factored Approximate Curvature","text":"EKFAC builds on K-FAC by correcting for the approximation errors in the eigenvalues of the blocks of the Kronecker-factored approximate curvature matrix. This correction aims to refine the accuracy of natural gradient approximations, thus potentially offering better training efficiency and stability in neural networks.
- Implementation (torch)
- Documentation (torch)
"},{"location":"getting-started/glossary/#group-testing","title":"Group Testing","text":"Group Testing is a strategy for identifying characteristics within groups of items efficiently, by testing groups rather than individuals to quickly narrow down the search for items with specific properties. Introduced into data valuation by (Jia et al., 2019)4.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#influence-function","title":"Influence Function","text":"The Influence Function measures the impact of a single data point on a statistical estimator. In machine learning, it's used to understand how much a particular data point affects the model's prediction. Introduced into data valuation by (Koh and Liang, 2017)5.
- Documentation
"},{"location":"getting-started/glossary/#inverse-hessian-vector-product","title":"Inverse Hessian-vector product","text":"iHVP is the operation of calculating the product of the inverse Hessian matrix of a function and a vector, without explicitly constructing nor inverting the full Hessian matrix first. This is essential for influence function computation.
"},{"location":"getting-started/glossary/#kronecker-factored-approximate-curvature","title":"Kronecker-Factored Approximate Curvature","text":"K-FAC is an optimization technique that approximates the Fisher Information matrix's inverse efficiently. It uses the Kronecker product to factor the matrix, significantly speeding up the computation of natural gradient updates and potentially improving training efficiency.
"},{"location":"getting-started/glossary/#least-core","title":"Least Core","text":"The Least Core is a solution concept in cooperative game theory, referring to the smallest set of payoffs to players that cannot be improved upon by any coalition, ensuring stability in the allocation of value. In data valuation, it implies solving a linear and a quadratic system whose constraints are determined by the evaluations of the utility function on every subset of the training data. Introduced as data valuation method by (Yan and Procaccia, 2021)6.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#linear-time-stochastic-second-order-algorithm","title":"Linear-time Stochastic Second-order Algorithm","text":"LiSSA is an efficient algorithm for approximating the inverse Hessian-vector product, enabling faster computations in large-scale machine learning problems, particularly for second-order optimization. For Influence Functions, it is used to approximate the iHVP. Introduced by (Agarwal et al., 2017)7.
- Implementation (torch)
- Documentation (torch)
"},{"location":"getting-started/glossary/#leave-one-out","title":"Leave-One-Out","text":"LOO in the context of data valuation refers to the process of evaluating the impact of removing individual data points on the model's performance. The value of a training point is defined as the marginal change in the model's performance when that point is removed from the training set.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#maximum-sample-reuse","title":"Maximum Sample Reuse","text":"MSR is a sampling method for data valuation that updates the value of every data point in one sample. This method can achieve much faster convergence. Introduced by (Wang and Jia, 2023)8
- Implementation
"},{"location":"getting-started/glossary/#monte-carlo-least-core","title":"Monte Carlo Least Core","text":"MCLC is a variation of the Least Core that uses a reduced amount of constraints, sampled randomly from the powerset of the training data. Introduced by (Yan and Procaccia, 2021)6.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#monte-carlo-shapley","title":"Monte Carlo Shapley","text":"MCS estimates the Shapley Value using a Monte Carlo approximation to the sum over subsets of the training set. This reduces computation to polynomial time at the cost of accuracy, but this loss is typically irrelevant for downstream applications in ML. Introduced into data valuation by (Ghorbani and Zou, 2019)9.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#nystrom-low-rank-approximation","title":"Nystr\u00f6m Low-Rank Approximation","text":"The Nystr\u00f6m approximation computes a low-rank approximation to a symmetric positive-definite matrix via random projections. For influence functions, it is used to approximate the iHVP. Introduced as sketch and solve algorithm in (Hataya and Yamada, 2023)10, and as preconditioner for PCG in (Frangella et al., 2023)11.
- Implementation Sketch-and-Solve (torch)
- Documentation Sketch-and-Solve (torch)
- Implementation Preconditioner (torch)
"},{"location":"getting-started/glossary/#point-removal-task","title":"Point removal task","text":"A task in data valuation where the quality of a valuation method is measured through the impact of incrementally removing data points on the model's performance, where the points are removed in order of their value. See
- Benchmarking tasks
"},{"location":"getting-started/glossary/#preconditioned-block-conjugate-gradient","title":"Preconditioned Block Conjugate Gradient","text":"A blocked version of PCG, which solves several linear systems simultaneously. For Influence Functions, it is used to approximate the iHVP.
- Implementation CG (torch)
- Implementation Preconditioner (torch)
- Documentation (torch)
"},{"location":"getting-started/glossary/#preconditioned-conjugate-gradient","title":"Preconditioned Conjugate Gradient","text":"A preconditioned version of CG for improved convergence, depending on the characteristics of the matrix and the preconditioner. For Influence Functions, it is used to approximate the iHVP.
- Implementation CG (torch)
- Implementation Preconditioner (torch)
- Documentation (torch)
"},{"location":"getting-started/glossary/#shapley-value","title":"Shapley Value","text":"Shapley Value is a concept from cooperative game theory that allocates payouts to players based on their contribution to the total payoff. In data valuation, players are data points. The method assigns a value to each data point based on a weighted average of its marginal contributions to the model's performance when trained on each subset of the training set. This requires \\(\\mathcal{O}(2^{n-1})\\) re-trainings of the model, which is infeasible for even trivial data set sizes, so one resorts to approximations like TMCS. Introduced into data valuation by (Ghorbani and Zou, 2019)9.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#truncated-monte-carlo-shapley","title":"Truncated Monte Carlo Shapley","text":"TMCS is an efficient approach to estimating the Shapley Value using a truncated version of the Monte Carlo method, reducing computation time while maintaining accuracy in large datasets. Introduced by (Ghorbani and Zou, 2019)9.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#weighted-accuracy-drop","title":"Weighted Accuracy Drop","text":"WAD is a metric to evaluate the impact of sequentially removing data points on the performance of a machine learning model, weighted by their rank, i.e. by the time at which they were removed. Introduced by (Schoch et al., 2022)2.
"},{"location":"getting-started/glossary/#other-terms","title":"Other terms","text":""},{"location":"getting-started/glossary/#coefficient-of-variation","title":"Coefficient of Variation","text":"CV is a statistical measure of the dispersion of data points in a data series around the mean, expressed as a percentage. It's used to compare the degree of variation from one data series to another, even if the means are drastically different.
"},{"location":"getting-started/glossary/#constraint-satisfaction-problem","title":"Constraint Satisfaction Problem","text":"A CSP involves finding values for variables within specified constraints or conditions, commonly used in scheduling, planning, and design problems where solutions must satisfy a set of restrictions.
"},{"location":"getting-started/glossary/#out-of-bag","title":"Out-of-Bag","text":"OOB refers to data samples in an ensemble learning context (like random forests) that are not selected for training a specific model within the ensemble. These OOB samples are used as a validation set to estimate the model's accuracy, providing a convenient internal cross-validation mechanism.
"},{"location":"getting-started/glossary/#machine-learning-reproducibility-challenge","title":"Machine Learning Reproducibility Challenge","text":"The MLRC is an initiative that encourages the verification and replication of machine learning research findings, promoting transparency and reliability in the field. Papers are published in Transactions on Machine Learning Research (TMLR).
-
Schioppa, A., Zablotskaia, P., Vilar, D., Sokolov, A., 2022. Scaling Up Influence Functions. Proc. AAAI Conf. Artif. Intell. 36, 8179\u20138186. https://doi.org/10.1609/aaai.v36i8.20791 \u21a9
-
Schoch, S., Xu, H., Ji, Y., 2022. CS-Shapley: Class-wise Shapley Values for Data Valuation in Classification, in: Proc. Of the Thirty-Sixth Conference on Neural Information Processing Systems (NeurIPS). Presented at the Advances in Neural Information Processing Systems (NeurIPS 2022).\u00a0\u21a9\u21a9
-
Wang, T., Yang, Y., Jia, R., 2022. Improving Cooperative Game Theory-based Data Valuation via Data Utility Learning. Presented at the International Conference on Learning Representations (ICLR 2022). Workshop on Socially Responsible Machine Learning, arXiv. https://doi.org/10.48550/arXiv.2107.06336 \u21a9
-
Jia, R., Dao, D., Wang, B., Hubis, F.A., Gurel, N.M., Li, B., Zhang, C., Spanos, C., Song, D., 2019. Efficient task-specific data valuation for nearest neighbor algorithms. Proc. VLDB Endow. 12, 1610\u20131623. https://doi.org/10.14778/3342263.3342637 \u21a9
-
Koh, P.W., Liang, P., 2017. Understanding Black-box Predictions via Influence Functions, in: Proceedings of the 34th International Conference on Machine Learning. Presented at the International Conference on Machine Learning, PMLR, pp. 1885\u20131894.\u00a0\u21a9
-
Yan, T., Procaccia, A.D., 2021. If You Like Shapley Then You\u2019ll Love the Core, in: Proceedings of the 35th AAAI Conference on Artificial Intelligence, 2021. Presented at the AAAI Conference on Artificial Intelligence, Association for the Advancement of Artificial Intelligence, pp. 5751\u20135759. https://doi.org/10.1609/aaai.v35i6.16721 \u21a9\u21a9
-
Agarwal, N., Bullins, B., Hazan, E., 2017. Second-Order Stochastic Optimization for Machine Learning in Linear Time. JMLR 18, 1\u201340.\u00a0\u21a9
-
Wang, J.T., Jia, R., 2023. Data Banzhaf: A Robust Data Valuation Framework for Machine Learning, in: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics. Presented at the International Conference on Artificial Intelligence and Statistics, PMLR, pp. 6388\u20136421.\u00a0\u21a9
-
Ghorbani, A., Zou, J., 2019. Data Shapley: Equitable Valuation of Data for Machine Learning, in: Proceedings of the 36th International Conference on Machine Learning, PMLR. Presented at the International Conference on Machine Learning (ICML 2019), PMLR, pp. 2242\u20132251.\u00a0\u21a9\u21a9\u21a9
-
Hataya, R., Yamada, M., 2023. Nystr\u00f6m Method for Accurate and Scalable Implicit Differentiation, in: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics. Presented at the International Conference on Artificial Intelligence and Statistics, PMLR, pp. 4643\u20134654.\u00a0\u21a9
-
Frangella, Z., Tropp, J.A., Udell, M., 2023. Randomized Nystr\u00f6m Preconditioning. SIAM J. Matrix Anal. Appl. 44, 718\u2013752. https://doi.org/10.1137/21M1466244 \u21a9
"},{"location":"getting-started/methods/","title":"Methods","text":"We currently implement the following methods:
"},{"location":"getting-started/methods/#data-valuation","title":"Data valuation","text":" -
LOO.
-
Permutation Shapley (also called ApproxShapley) (Castro et al., 2009)1.
-
TMCS (Ghorbani and Zou, 2019)2.
-
Data Banzhaf [@wang_data_2022].
-
Beta Shapley (Kwon and Zou, 2022)3.
-
CS-Shapley (Schoch et al., 2022)4.
-
Least Core (Yan and Procaccia, 2021)5.
-
Owen Sampling (Okhrati and Lipani, 2021)6.
-
Data Utility Learning (Wang et al., 2022)7.
-
kNN-Shapley (Jia et al., 2019)8.
-
Group Testing (Jia et al., 2019)9
-
Data-OOB (Kwon and Zou, 2023)10.
"},{"location":"getting-started/methods/#influence-functions","title":"Influence functions","text":" -
CG Influence. (Koh and Liang, 2017)11.
-
Direct Influence (Koh and Liang, 2017)11.
-
LiSSA (Agarwal et al., 2017)12.
-
Arnoldi Influence (Schioppa et al., 2022)13.
-
EKFAC Influence (George et al., 2018; Martens and Grosse, 2015)1415.
-
Nystr\u00f6m Influence, based on the ideas in (Hataya and Yamada, 2023)16 for bi-level optimization.
-
Inverse-harmonic-mean Influence (Kwon et al., 2023)17.
-
Castro, J., G\u00f3mez, D., Tejada, J., 2009. Polynomial calculation of the Shapley value based on sampling. Computers & Operations Research, Selected papers presented at the Tenth International Symposium on Locational Decisions (ISOLDE X) 36, 1726\u20131730. https://doi.org/10.1016/j.cor.2008.04.004 \u21a9
-
Ghorbani, A., Zou, J., 2019. Data Shapley: Equitable Valuation of Data for Machine Learning, in: Proceedings of the 36th International Conference on Machine Learning, PMLR. Presented at the International Conference on Machine Learning (ICML 2019), PMLR, pp. 2242\u20132251.\u00a0\u21a9
-
Kwon, Y., Zou, J., 2022. Beta Shapley: A Unified and Noise-reduced Data Valuation Framework for Machine Learning, in: Proceedings of the 25th International Conference on Artificial Intelligence and Statistics (AISTATS) 2022,. Presented at the AISTATS 2022, PMLR.\u00a0\u21a9
-
Schoch, S., Xu, H., Ji, Y., 2022. CS-Shapley: Class-wise Shapley Values for Data Valuation in Classification, in: Proc. Of the Thirty-Sixth Conference on Neural Information Processing Systems (NeurIPS). Presented at the Advances in Neural Information Processing Systems (NeurIPS 2022).\u00a0\u21a9
-
Yan, T., Procaccia, A.D., 2021. If You Like Shapley Then You\u2019ll Love the Core, in: Proceedings of the 35th AAAI Conference on Artificial Intelligence, 2021. Presented at the AAAI Conference on Artificial Intelligence, Association for the Advancement of Artificial Intelligence, pp. 5751\u20135759. https://doi.org/10.1609/aaai.v35i6.16721 \u21a9
-
Okhrati, R., Lipani, A., 2021. A Multilinear Sampling Algorithm to Estimate Shapley Values, in: 2020 25th International Conference on Pattern Recognition (ICPR). Presented at the 2020 25th International Conference on Pattern Recognition (ICPR), IEEE, pp. 7992\u20137999. https://doi.org/10.1109/ICPR48806.2021.9412511 \u21a9
-
Wang, T., Yang, Y., Jia, R., 2022. Improving Cooperative Game Theory-based Data Valuation via Data Utility Learning. Presented at the International Conference on Learning Representations (ICLR 2022). Workshop on Socially Responsible Machine Learning, arXiv. https://doi.org/10.48550/arXiv.2107.06336 \u21a9
-
Jia, R., Dao, D., Wang, B., Hubis, F.A., Gurel, N.M., Li, B., Zhang, C., Spanos, C., Song, D., 2019. Efficient task-specific data valuation for nearest neighbor algorithms. Proc. VLDB Endow. 12, 1610\u20131623. https://doi.org/10.14778/3342263.3342637 \u21a9
-
Jia, R., Dao, D., Wang, B., Hubis, F.A., Hynes, N., G\u00fcrel, N.M., Li, B., Zhang, C., Song, D., Spanos, C.J., 2019. Towards Efficient Data Valuation Based on the Shapley Value, in: Proceedings of the 22nd International Conference on Artificial Intelligence and Statistics. Presented at the International Conference on Artificial Intelligence and Statistics (AISTATS), PMLR, pp. 1167\u20131176.\u00a0\u21a9
-
Kwon, Y., Zou, J., 2023. Data-OOB: Out-of-bag Estimate as a Simple and Efficient Data Value, in: Proceedings of the 40th International Conference on Machine Learning. Presented at the International Conference on Machine Learning, PMLR, pp. 18135\u201318152.\u00a0\u21a9
-
Koh, P.W., Liang, P., 2017. Understanding Black-box Predictions via Influence Functions, in: Proceedings of the 34th International Conference on Machine Learning. Presented at the International Conference on Machine Learning, PMLR, pp. 1885\u20131894.\u00a0\u21a9\u21a9
-
Agarwal, N., Bullins, B., Hazan, E., 2017. Second-Order Stochastic Optimization for Machine Learning in Linear Time. JMLR 18, 1\u201340.\u00a0\u21a9
-
Schioppa, A., Zablotskaia, P., Vilar, D., Sokolov, A., 2022. Scaling Up Influence Functions. Proc. AAAI Conf. Artif. Intell. 36, 8179\u20138186. https://doi.org/10.1609/aaai.v36i8.20791 \u21a9
-
George, T., Laurent, C., Bouthillier, X., Ballas, N., Vincent, P., 2018. Fast Approximate Natural Gradient Descent in a Kronecker Factored Eigenbasis, in: Advances in Neural Information Processing Systems. Curran Associates, Inc.\u00a0\u21a9
-
Martens, J., Grosse, R., 2015. Optimizing Neural Networks with Kronecker-factored Approximate Curvature, in: Proceedings of the 32nd International Conference on Machine Learning. Presented at the International Conference on Machine Learning, PMLR, pp. 2408\u20132417.\u00a0\u21a9
-
Hataya, R., Yamada, M., 2023. Nystr\u00f6m Method for Accurate and Scalable Implicit Differentiation, in: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics. Presented at the International Conference on Artificial Intelligence and Statistics, PMLR, pp. 4643\u20134654.\u00a0\u21a9
-
Kwon, Y., Wu, E., Wu, K., Zou, J., 2023. DataInf: Efficiently Estimating Data Influence in LoRA-tuned LLMs and Diffusion Models. Presented at the The Twelfth International Conference on Learning Representations. https://doi.org/10.48550/arXiv.2310.00902 \u21a9
"},{"location":"influence/","title":"The influence function","text":""},{"location":"influence/#the-influence-function","title":"The influence function","text":"Warning
The code in the package pydvl.influence is experimental. Package structure and basic API are bound to change before v1.0.0
The influence function (IF) is a method to quantify the effect (influence) that each training point has on the parameters of a model, and by extension on any function thereof. In particular, it allows one to estimate how much each training sample affects the error on a test point, making the IF useful for understanding and debugging models.
Alas, influence functions rely on some assumptions that can make their application difficult. Yet another drawback is that they require the computation of the inverse of the Hessian of the model wrt. its parameters, which is intractable for large models like deep neural networks. Much of the recent research tackles this issue using approximations, like a Neumann series (Agarwal et al., 2017)1, with the most successful solution using a low-rank approximation that iteratively finds increasing eigenspaces of the Hessian (Schioppa et al., 2022)2.
pyDVL implements several methods for the efficient computation of the IF for machine learning. In the examples we document some of the difficulties that can arise when using the IF.
"},{"location":"influence/#construction","title":"Construction","text":"First introduced in the context of robust statistics in (Hampel, 1974)3, the IF was popularized in the context of machine learning in (Koh and Liang, 2017)4.
Following their formulation, consider an input space \\(\\mathcal{X}\\) (e.g. images) and an output space \\(\\mathcal{Y}\\) (e.g. labels). Let's take \\(z_i = (x_i, y_i)\\), for \\(i \\in \\{1,...,n\\}\\) to be the \\(i\\)-th training point, and \\(\\theta\\) to be the (potentially highly) multi-dimensional parameters of a model (e.g. \\(\\theta\\) is a big array with all of a neural network's parameters, including biases and/or dropout rates). We will denote with \\(L(z, \\theta)\\) the loss of the model for point \\(z\\) when the parameters are \\(\\theta.\\)
To train a model, we typically minimize the loss over all \\(z_i\\), i.e. the optimal parameters are
\\[\\hat{\\theta} = \\arg \\min_\\theta \\sum_{i=1}^n L(z_i, \\theta).\\] In practice, lack of convexity means that one doesn't really obtain the minimizer of the loss, and the training is stopped when the validation loss stops decreasing.
For notational convenience, let's define
\\[\\hat{\\theta}_{-z} = \\arg \\min_\\theta \\sum_{z_i \\ne z} L(z_i, \\theta), \\] i.e. \\(\\hat{\\theta}_{-z}\\) are the model parameters that minimize the total loss when \\(z\\) is not in the training dataset.
In order to compute the impact of each training point on the model, we would need to calculate \(\hat{\theta}_{-z}\) for each \(z\) in the training dataset, thus re-training the model at least ~\(n\) times (more if model training is stochastic). This is computationally very expensive, especially for big neural networks. To circumvent this problem, we can just calculate a first-order approximation of \(\hat{\theta}\). This can be done with a single backpropagation pass and without re-training the full model.
pyDVL supports two ways of computing the empirical influence function, namely up-weighting of samples and perturbation influences.
"},{"location":"influence/#approximating-the-influence-of-a-point","title":"Approximating the influence of a point","text":"Let's define
\\[\\hat{\\theta}_{\\epsilon, z} = \\arg \\min_\\theta \\frac{1}{n}\\sum_{i=1}^n L(z_i, \\theta) + \\epsilon L(z, \\theta), \\] which is the optimal \\(\\hat{\\theta}\\) when we up-weight \\(z\\) by an amount \\(\\epsilon \\gt 0\\).
From a classical result (a simple derivation is available in Appendix A of (Koh and Liang, 2017)4), we know that:
\\[\\frac{d \\ \\hat{\\theta}_{\\epsilon, z}}{d \\epsilon} \\Big|_{\\epsilon=0} = -H_{\\hat{\\theta}}^{-1} \\nabla_\\theta L(z, \\hat{\\theta}), \\] where \\(H_{\\hat{\\theta}} = \\frac{1}{n} \\sum_{i=1}^n \\nabla_\\theta^2 L(z_i, \\hat{\\theta})\\) is the Hessian of \\(L\\). These quantities are also known as influence factors.
Importantly, notice that this expression is only valid when \(\hat{\theta}\) is a minimum of \(L\), since otherwise \(H_{\hat{\theta}}\) cannot be inverted! At the same time, in machine learning full convergence is rarely achieved, so direct Hessian inversion is not possible. Approximations need to be developed that circumvent the problem of inverting the Hessian of the model in all those (frequent) cases where it is not positive definite.
The influence of training point \\(z\\) on test point \\(z_{\\text{test}}\\) is defined as:
\\[\\mathcal{I}(z, z_{\\text{test}}) = L(z_{\\text{test}}, \\hat{\\theta}_{-z}) - L(z_{\\text{test}}, \\hat{\\theta}). \\] Notice that \\(\\mathcal{I}\\) is higher for points \\(z\\) which positively impact the model score, since the loss is higher when they are excluded from training. In practice, one needs to rely on the following infinitesimal approximation:
\\[\\mathcal{I}_{up}(z, z_{\\text{test}}) = - \\frac{d L(z_{\\text{test}}, \\hat{\\theta}_{\\epsilon, z})}{d \\epsilon} \\Big|_{\\epsilon=0} \\] Using the chain rule and the results calculated above, we get:
\\[\\mathcal{I}_{up}(z, z_{\\text{test}}) = - \\nabla_\\theta L(z_{\\text{test}}, \\hat{\\theta})^\\top \\ \\frac{d \\hat{\\theta}_{\\epsilon, z}}{d \\epsilon} \\Big|_{\\epsilon=0} = \\nabla_\\theta L(z_{\\text{test}}, \\hat{\\theta})^\\top \\ H_{\\hat{\\theta}}^{-1} \\ \\nabla_\\theta L(z, \\hat{\\theta}) \\] All the resulting factors are gradients of the loss wrt. the model parameters \\(\\hat{\\theta}\\). This can be easily computed through one or more backpropagation passes.
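To make this concrete, here is a small, self-contained illustration (not pyDVL's implementation) of the formula above for a toy model with two parameters, where the Hessian is small enough to form and invert directly; the data and parameters are made up:
import torch\n\n# Toy linear regression: L(z, theta) = 0.5 * (theta @ x - y)^2\nx_train = torch.tensor([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])\ny_train = torch.tensor([1.0, 2.0, 3.0])\nx_test, y_test = torch.tensor([1.0, 2.0]), torch.tensor(5.0)\ntheta = torch.tensor([1.0, 2.0])  # pretend these are the trained parameters\n\ndef loss(t, x, y):\n    return 0.5 * (t @ x - y) ** 2\n\ndef grad(x, y):\n    return torch.autograd.functional.jacobian(lambda t: loss(t, x, y), theta)\n\n# Empirical Hessian of the training loss (explicit; fine for two parameters)\nH = torch.stack(\n    [\n        torch.autograd.functional.hessian(lambda t: loss(t, x, y), theta)\n        for x, y in zip(x_train, y_train)\n    ]\n).mean(dim=0)\n\n# I_up(z, z_test) = grad L(z_test)^T H^{-1} grad L(z) for every training z\ninfluences = torch.stack(\n    [\n        grad(x_test, y_test) @ torch.linalg.solve(H, grad(x, y))\n        for x, y in zip(x_train, y_train)\n    ]\n)\nprint(influences)\n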
"},{"location":"influence/#perturbation-definition-of-the-influence-score","title":"Perturbation definition of the influence score","text":"How would the loss of the model change if, instead of up-weighting an individual point \\(z\\), we were to up-weight only a single feature of that point? Given \\(z = (x, y)\\), we can define \\(z_{\\delta} = (x+\\delta, y)\\), where \\(\\delta\\) is a vector of zeros except for a 1 in the position of the feature we want to up-weight. In order to approximate the effect of modifying a single feature of a single point on the model score we can define
\\[\\hat{\\theta}_{\\epsilon, z_{\\delta} ,-z} = \\arg \\min_\\theta \\frac{1}{n}\\sum_{i=1}^n L(z_{i}, \\theta) + \\epsilon L(z_{\\delta}, \\theta) - \\epsilon L(z, \\theta), \\] Similarly to what was done above, we up-weight point \\(z_{\\delta}\\), but then we also remove the up-weighting for all the features that are not modified by \\(\\delta\\). From the calculations in the previous section, it is then easy to see that
\\[\\frac{d \\ \\hat{\\theta}_{\\epsilon, z_{\\delta} ,-z}}{d \\epsilon} \\Big|_{\\epsilon=0} = -H_{\\hat{\\theta}}^{-1} \\nabla_\\theta \\Big( L(z_{\\delta}, \\hat{\\theta}) - L(z, \\hat{\\theta}) \\Big) \\] and if the feature space is continuous and as \\(\\delta \\to 0\\) we can write
\\[\\frac{d \\ \\hat{\\theta}_{\\epsilon, z_{\\delta} ,-z}}{d \\epsilon} \\Big|_{\\epsilon=0} = -H_{\\hat{\\theta}}^{-1} \\ \\nabla_x \\nabla_\\theta L(z, \\hat{\\theta}) \\delta + o(\\delta) \\] The influence of each feature of \\(z\\) on the loss of the model can therefore be estimated through the following quantity:
\\[\\mathcal{I}_{pert}(z, z_{\\text{test}}) = - \\lim_{\\delta \\to 0} \\ \\frac{1}{\\delta} \\frac{d L(z_{\\text{test}}, \\hat{\\theta}_{\\epsilon, \\ z_{\\delta}, \\ -z})}{d \\epsilon} \\Big|_{\\epsilon=0} \\] which, using the chain rule and the results calculated above, is equal to
\\[\\mathcal{I}_{pert}(z, z_{\\text{test}}) = - \\nabla_\\theta L(z_{\\text{test}}, \\hat{\\theta})^\\top \\ \\frac{d \\hat{\\theta}_{\\epsilon, z_{\\delta} ,-z}}{d \\epsilon} \\Big|_{\\epsilon=0} = \\nabla_\\theta L(z_{\\text{test}}, \\hat{\\theta})^\\top \\ H_{\\hat{\\theta}}^{-1} \\ \\nabla_x \\nabla_\\theta L(z, \\hat{\\theta}) \\] The perturbation definition of the influence score is not straightforward to understand, but it has a simple interpretation: it tells how much the loss of the model changes when a certain feature of point z is up-weighted. A positive perturbation influence score indicates that the feature might have a positive effect on the accuracy of the model.
It is worth noting that the perturbation influence score is a very rough estimate of the impact of a point on the model's loss and is subject to large approximation errors. It can nonetheless be used to build training-set attacks, as done in (Koh and Liang, 2017)4.
"},{"location":"influence/#computation","title":"Computation","text":"The main abstraction of the library for influence calculation is InfluenceFunctionModel. On implementations of this abstraction, you can call the method influences
to compute influences.
pyDVL provides implementations for use with pytorch models in pydvl.influence.torch. For detailed information on the available implementations, see the documentation of InfluenceFunctionModel.
Given a pre-trained pytorch model and a loss, a basic example would look like
from torch.utils.data import DataLoader\nfrom pydvl.influence.torch import DirectInfluence\n\ntraining_data_loader = DataLoader(...)\ninfl_model = DirectInfluence(model, loss)\ninfl_model = infl_model.fit(training_data_loader)\n\ninfluences = infl_model.influences(x_test, y_test, x, y)\n
for batches \\(z_{\\text{test}} = (x_{\\text{test}}, y_{\\text{test}})\\) and \\(z = (x, y)\\) of data. The result is a tensor with one row per test point in \\(z_{\\text{test}}\\) and one column per point in \\(z\\). Thus, each entry \\((i, j)\\) represents the influence of training point \\(z[j]\\) on test point \\(z_{\\text{test}}[i]\\). Warning
Compared to the mathematical definitions above, we switch the ordering of \\(z\\) and \\(z_{\\text{test}}\\), in order to make the input ordering consistent with the dimensions of the resulting tensor. More concretely, if the first dimension of \\(z_{\\text{test}}\\) is \\(N\\) and that of \\(z\\) is \\(M\\), the resulting tensor is of shape \\(N \\times M\\).
A large positive influence indicates that training point \\(j\\) tends to improve the performance of the model on test point \\(i\\), and vice versa, a large negative influence indicates that training point \\(j\\) tends to worsen the performance of the model on test point \\(i\\).
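For example, continuing the snippet above, one can read off the most helpful and most harmful training points for a given test point (a hypothetical follow-up using plain tensor operations):
i = 0                                  # index of a test point\nmost_helpful = influences[i].argmax()  # training point with the largest positive influence\nmost_harmful = influences[i].argmin()  # training point with the largest negative influence\n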
"},{"location":"influence/#hessian-regularization","title":"Hessian regularization","text":"Additionally, and as discussed in the introduction, in machine learning training rarely converges to a global minimum of the loss. Despite good apparent convergence, \\(\\hat{\\theta}\\) might be located in a region with flat curvature or close to a saddle point. In particular, the Hessian might have vanishing eigenvalues making its direct inversion impossible. Certain methods, such as the Arnoldi method are robust against these problems, but most are not.
To circumvent this problem, many approximate methods can be implemented. The simplest one adds a small Hessian perturbation term, i.e. it uses \\(H_{\\hat{\\theta}} + \\lambda \\mathbb{I}\\), with \\(\\mathbb{I}\\) being the identity matrix.
from torch.utils.data import DataLoader\nfrom pydvl.influence.torch import DirectInfluence\n\ntraining_data_loader = DataLoader(...)\ninfl_model = DirectInfluence(model, loss, regularization=0.01)\ninfl_model = infl_model.fit(training_data_loader)\n
This standard trick ensures that the eigenvalues of \\(H_{\\hat{\\theta}}\\) are bounded away from zero and therefore the matrix is invertible. In order for this regularization not to corrupt the outcome too much, the parameter \\(\\lambda\\) should be as small as possible while still allowing a reliable inversion of \\(H_{\\hat{\\theta}} + \\lambda \\mathbb{I}\\).
"},{"location":"influence/#block-diagonal-approximation","title":"Block-diagonal approximation","text":"This implementation is capable of using a block-diagonal approximation. The full matrix is approximated by a block-diagonal version, which reduces both the time and memory consumption. The blocking structure can be specified via the block_structure
parameter. The block_structure
parameter can either be a BlockMode enum (which provides layer-wise or parameter-wise blocking) or a custom block structure defined by an ordered dictionary with the keys being the block identifiers (arbitrary strings) and the values being lists of parameter names contained in the block.
from collections import OrderedDict\nfrom torch.utils.data import DataLoader\nfrom pydvl.influence.torch import DirectInfluence, BlockMode, SecondOrderMode\n\ntraining_data_loader = DataLoader(...)\n# layer-wise block-diagonal approximation\ninfl_model = DirectInfluence(model, loss,\n regularization=0.1,\n block_structure=BlockMode.LAYER_WISE)\n\nblock_structure = OrderedDict((\n (\"custom_block1\", [\"0.weight\", \"1.bias\"]), \n (\"custom_block2\", [\"1.weight\", \"0.bias\"]),\n))\n# custom block-diagonal structure\ninfl_model = DirectInfluence(model, loss,\n regularization=0.1,\n block_structure=block_structure)\ninfl_model = infl_model.fit(training_data_loader)\n
If you would like to apply a block-specific regularization, you can provide a dictionary with the block names as keys and the regularization values as values. If no value is provided for a specific key, no regularization is applied for the corresponding block. regularization = {\n\"custom_block1\": 0.1,\n\"custom_block2\": 0.2,\n}\ninfl_model = DirectInfluence(model, loss,\n regularization=regularization,\n block_structure=block_structure)\ninfl_model = infl_model.fit(training_data_loader)\n
Accordingly, if you choose a layer-wise or parameter-wise structure (by providing BlockMode.LAYER_WISE
or BlockMode.PARAMETER_WISE
for block_structure
) the keys must be the layer names or parameter names, respectively. You can retrieve the block-wise influence information from the methods with suffix _by_block
. By default, block_structure
is set to BlockMode.FULL
and in this case these methods will return a dictionary with the empty string being the only key."},{"location":"influence/#gauss-newton-approximation","title":"Gauss-Newton approximation","text":"In the computation of the influence values, the inversion of the Hessian can be replaced by the inversion of the Gauss-Newton matrix
\\[ G_{\\hat{\\theta}}=n^{-1} \\sum_{i=1}^n \\nabla_{\\theta}L(z_i, \\hat{\\theta}) \\nabla_{\\theta}L(z_i, \\hat{\\theta})^T \\] so the computed values are of the form
\\[\\nabla_\\theta L(z_{\\text{test}}, \\hat{\\theta})^\\top \\ G_{\\hat{\\theta}}^{-1} \\ \\nabla_\\theta L(z, \\hat{\\theta}). \\] The parameter second_order_mode
is used to configure this approximation.
from torch.utils.data import DataLoader\nfrom pydvl.influence.torch import DirectInfluence, BlockMode, SecondOrderMode\n\ntraining_data_loader = DataLoader(...)\ninfl_model = DirectInfluence(model, loss,\n regularization={\"layer_1\": 0.1, \"layer_2\": 0.2},\n block_structure=BlockMode.LAYER_WISE,\n second_order_mode=SecondOrderMode.GAUSS_NEWTON)\ninfl_model = infl_model.fit(training_data_loader)\n
"},{"location":"influence/#perturbation-influences","title":"Perturbation influences","text":"The method of empirical influence computation can be selected with the parameter mode
:
from pydvl.influence import InfluenceMode\n\ninfluences = infl_model.influences(x_test, y_test, x, y,\n mode=InfluenceMode.Perturbation)\n
The result is a tensor with at least three dimensions. The first two dimensions are the same as in the mode=InfluenceMode.Up
case, i.e. one row per test point and one column per training point. The remaining dimensions match the shape of the input features. Therefore, each entry in the tensor represents the influence of a single feature of a training point on a test point."},{"location":"influence/#influence-factors","title":"Influence factors","text":"The influence factors (refer to the previous section for a definition) are typically the most computationally demanding part of influence calculation. They can be obtained by calling the influence_factors
method, saved, and later used for influence calculation on different subsets of the training dataset.
influence_factors = infl_model.influence_factors(x_test, y_test)\ninfluences = infl_model.influences_from_factors(influence_factors, x, y)\n
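The returned factors can be persisted with any serialization mechanism; for instance (a sketch using plain torch.save and torch.load, not a pyDVL-specific facility):
import torch\n\ntorch.save(influence_factors, \"influence_factors.pt\")\n# ... later, possibly for a different subset (x, y) of the training data\ninfluence_factors = torch.load(\"influence_factors.pt\")\ninfluences = infl_model.influences_from_factors(influence_factors, x, y)\n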
-
Agarwal, N., Bullins, B., Hazan, E., 2017. Second-Order Stochastic Optimization for Machine Learning in Linear Time. JMLR 18, 1\u201340.\u00a0\u21a9
-
Schioppa, A., Zablotskaia, P., Vilar, D., Sokolov, A., 2022. Scaling Up Influence Functions. Proc. AAAI Conf. Artif. Intell. 36, 8179\u20138186. https://doi.org/10.1609/aaai.v36i8.20791 \u21a9
-
Hampel, F.R., 1974. The Influence Curve and Its Role in Robust Estimation. J. Am. Stat. Assoc. 69, 383\u2013393. https://doi.org/10.2307/2285666 \u21a9
-
Koh, P.W., Liang, P., 2017. Understanding Black-box Predictions via Influence Functions, in: Proceedings of the 34th International Conference on Machine Learning. Presented at the International Conference on Machine Learning, PMLR, pp. 1885\u20131894.\u00a0\u21a9\u21a9\u21a9
"},{"location":"influence/influence_function_model/","title":"Influence Function Model","text":"In almost every practical application it is not possible to construct, even less invert the complete Hessian in memory. pyDVL offers several implementations of the interface InfluenceFunctionModel , which do not compute the full Hessian (in contrast to DirectInfluence ).
"},{"location":"influence/influence_function_model/#conjugate-gradient","title":"Conjugate Gradient","text":"This classical procedure for solving linear systems of equations is an iterative method that does not require the explicit inversion of the Hessian. Instead, it only requires the calculation of Hessian-vector products, making it a good choice for large datasets or models with many parameters. It is nevertheless much slower to converge than the direct inversion method and not as accurate.
More info on the theory of conjugate gradient can be found on Wikipedia, or in textbooks such as (Trefethen and Bau, 1997, Lecture 38)1.
pyDVL also implements a stable block variant of the conjugate gradient method, defined in (Ji and Li, 2017)2, which solves several right hand sides simultaneously.
Optionally, the user can provide a pre-conditioner to improve convergence, such as a Jacobi pre-conditioner, which is a simple diagonal pre-conditioner based on Hutchinson's diagonal estimator (Bekas et al., 2007)3, or a Nystr\u00f6m approximation based pre-conditioner, described in (Frangella et al., 2023)4.
from pydvl.influence.torch import CgInfluence\nfrom pydvl.influence.torch.pre_conditioner import NystroemPreConditioner\n\nif_model = CgInfluence(\n model,\n loss,\n hessian_regularization=0.0,\n rtol=1e-7,\n atol=1e-7,\n maxiter=None,\n use_block_cg=True,\n pre_conditioner=NystroemPreConditioner(rank=10)\n)\nif_model.fit(train_loader)\n
The additional optional parameters rtol
, atol
, maxiter
, use_block_cg
and pre_conditioner
are, respectively, the relative tolerance, the absolute tolerance, the maximum number of iterations, a flag indicating whether to use the block variant of CG, and an optional pre-conditioner.
"},{"location":"influence/influence_function_model/#linear-time-stochastic-second-order-approximation-lissa","title":"Linear time Stochastic Second-Order Approximation (LiSSA)","text":"The LiSSA method is a stochastic approximation of the inverse Hessian vector product. Compared to conjugate gradient it is faster but less accurate and typically suffers from instability.
In order to find the solution of the HVP, LiSSA iteratively approximates the inverse of the Hessian matrix with the following update:
\\[H^{-1}_{j+1} b = b + (I - d) H^{-1}_j b - \\frac{H H^{-1}_j b}{s},\\] where \\(d\\) and \\(s\\) are a dampening and a scaling factor, which are essential for the convergence of the method and need to be chosen carefully, and \\(I\\) is the identity matrix. More info on the theory of LiSSA can be found in the original paper (Agarwal et al., 2017)5.
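As a sanity check of the recursion, the following toy NumPy sketch (not pyDVL code) iterates the update for a small, well-conditioned matrix; the fixed point is \\(s H^{-1} b\\), so the estimate is divided by \\(s\\) at the end:
import numpy as np\n\nrng = np.random.default_rng(0)\nA = rng.standard_normal((20, 20))\nH = A @ A.T / 20 + 0.1 * np.eye(20)   # symmetric positive definite stand-in for the Hessian\nb = rng.standard_normal(20)\n\nd, s = 0.0, 30.0                      # dampening and scaling factors\nx = b.copy()\nfor _ in range(5000):\n    x = b + (1 - d) * x - (H @ x) / s # the LiSSA update\nx /= s                                # the fixed point is s * H^{-1} b\n# x is now close to np.linalg.solve(H, b)\n
The corresponding pyDVL implementation is LissaInfluence: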
from pydvl.influence.torch import LissaInfluence, BlockMode, SecondOrderMode\nif_model = LissaInfluence(\n model,\n loss,\n regularization=0.0,\n maxiter=1000,\n dampen=0.0,\n scale=10.0,\n rtol=1e-4,\n block_structure=BlockMode.FULL,\n second_order_mode=SecondOrderMode.GAUSS_NEWTON\n)\nif_model.fit(train_loader)\n
with the additional optional parameters maxiter
, dampen
, scale
, and rtol
, being the maximum number of iterations, the dampening factor, the scaling factor and the relative tolerance, respectively. This implementation is capable of using a block-matrix approximation, see Block-diagonal approximation, and can handle Gauss-Newton approximation.
"},{"location":"influence/influence_function_model/#arnoldi","title":"Arnoldi","text":"The Arnoldi method is a Krylov subspace method for approximating dominating eigenvalues and eigenvectors. Under a low rank assumption on the Hessian at a minimizer (which is typically observed for deep neural networks), this approximation captures the essential action of the Hessian. More concretely, for \\(Hx=b\\) the solution is approximated by
\\[x \\approx V D^{-1} V^T b\\] where \\(D\\) is a diagonal matrix with the top (in absolute value) eigenvalues of the Hessian and \\(V\\) contains the corresponding eigenvectors. See also (Schioppa et al., 2022)6.
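The following toy NumPy sketch (not pyDVL code, and using a full eigendecomposition in place of the Arnoldi iteration, purely for illustration) shows the resulting low-rank solve:
import numpy as np\n\nrng = np.random.default_rng(0)\nA = rng.standard_normal((50, 50))\nH = A @ A.T / 50                          # symmetric PSD stand-in for the Hessian\nb = rng.standard_normal(50)\n\neigvals, eigvecs = np.linalg.eigh(H)\ntop = np.argsort(np.abs(eigvals))[-10:]   # top-10 eigenpairs by magnitude\nD, V = eigvals[top], eigvecs[:, top]\nx_approx = V @ ((V.T @ b) / D)            # x = V D^{-1} V^T b\n
In pyDVL, the method is provided by ArnoldiInfluence: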
from pydvl.influence.torch import ArnoldiInfluence\nif_model = ArnoldiInfluence(\n model,\n loss,\n hessian_regularization=0.0,\n rank_estimate=10,\n tol=1e-6,\n)\nif_model.fit(train_loader)\n
"},{"location":"influence/influence_function_model/#eigenvalue-corrected-k-fac","title":"Eigenvalue Corrected K-FAC","text":"K-FAC, short for Kronecker-Factored Approximate Curvature, is a method that approximates the Fisher information matrix FIM of a model. It is possible to show that for classification models with appropriate loss functions the FIM is equal to the Hessian of the model\u2019s loss over the dataset. In this restricted but nonetheless important context K-FAC offers an efficient way to approximate the Hessian and hence the influence scores. For more info and details refer to the original paper (Martens and Grosse, 2015)7.
The K-FAC method is implemented in the class EkfacInfluence . The following code snippet shows how to use the K-FAC method to calculate the influence function of a model. Note that, in contrast to the other methods for influence function calculation, K-FAC does not require the loss function as an input. This is because the current implementation is only applicable to classification models with a cross entropy loss function.
from pydvl.influence.torch import EkfacInfluence\nif_model = EkfacInfluence(\n model,\n hessian_regularization=0.0,\n)\nif_model.fit(train_loader)\n
Upon initialization, the K-FAC method will parse the model and extract which layers require grad and which do not. Then it will only calculate the influence scores for the layers that require grad. The current implementation of the K-FAC method is only available for linear layers, and therefore, if the model contains non-linear layers that require gradients, the K-FAC method will raise a NotImplementedLayerRepresentationException. A further improvement of the K-FAC method is the Eigenvalue Corrected K-FAC (EKFAC) method (George et al., 2018)8, which allows further re-fitting of the eigenvalues of the Hessian, thus providing a more accurate approximation. On top of the K-FAC method, the EKFAC method is implemented by setting update_diagonal=True
when initialising EkfacInfluence . The following code snippet shows how to use the EKFAC method to calculate the influence function of a model.
from pydvl.influence.torch import EkfacInfluence\nif_model = EkfacInfluence(\n model,\n update_diagonal=True,\n hessian_regularization=0.0,\n)\nif_model.fit(train_loader)\n
"},{"location":"influence/influence_function_model/#nystrom-sketch-and-solve","title":"Nystr\u00f6m Sketch-and-Solve","text":"This approximation is based on a Nystr\u00f6m low-rank approximation of the form
\\[\\begin{align*} H_{\\text{nys}} &= (H\\Omega)(\\Omega^TH\\Omega)^{\\dagger}(H\\Omega)^T \\\\\\ &= U \\Lambda U^T, \\end{align*}\\] where \\((\\cdot)^{\\dagger}\\) denotes the Moore-Penrose inverse, in combination with the Sherman\u2013Morrison\u2013Woodbury formula to calculate the action of its inverse:
\\[\\begin{equation*} (H_{\\text{nys}} + \\lambda I)^{-1}x = U(\\Lambda+\\lambda I)^{-1}U^Tx + \\frac{1}{\\lambda}(I - UU^T)x, \\end{equation*}\\] see also (Hataya and Yamada, 2023)9 and (Frangella et al., 2023)4. The essential parameter is the rank of the approximation.
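A toy NumPy sketch (not pyDVL code) of the approximation and of the inverse formula above, with rank 10 and \\(\\lambda = 0.1\\):
import numpy as np\n\nrng = np.random.default_rng(0)\nA = rng.standard_normal((100, 100))\nH = A @ A.T / 100                          # symmetric PSD stand-in for the Hessian\nOmega = rng.standard_normal((100, 10))     # random sketching matrix of rank 10\nHO = H @ Omega\nH_nys = HO @ np.linalg.pinv(Omega.T @ HO) @ HO.T\n\nlam = 0.1\neigvals, U = np.linalg.eigh(H_nys)\nLam, U = eigvals[-10:], U[:, -10:]         # the (at most) rank-10 non-zero part\nx = rng.standard_normal(100)\ninv_x = U @ ((U.T @ x) / (Lam + lam)) + (x - U @ (U.T @ x)) / lam\n# inv_x agrees (up to numerical error) with np.linalg.solve(H_nys + lam * np.eye(100), x)\n
In pyDVL: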
from pydvl.influence.torch import NystroemSketchInfluence, BlockMode, SecondOrderMode\nif_model = NystroemSketchInfluence(\n model,\n loss,\n rank=10,\n regularization=0.0,\n block_structure=BlockMode.FULL,\n second_order_mode=SecondOrderMode.HESSIAN\n)\nif_model.fit(train_loader)\n
This implementation is capable of using a block-matrix approximation, see Block-diagonal approximation, and can handle Gauss-Newton approximation."},{"location":"influence/influence_function_model/#inverse-harmonic-mean","title":"Inverse Harmonic Mean","text":"This implementation replaces the inverse Hessian matrix in the influence computation with an approximation of the inverse Gauss-Newton vector product and was proposed in (Kwon et al., 2023)10.
The approximation method comprises the following steps:
-
Replace the Hessian \\(H(\\theta)\\) with the Gauss-Newton matrix \\(G(\\theta)\\):
\\[\\begin{equation*} G(\\theta)=n^{-1} \\sum_{i=1}^n \\nabla_{\\theta}\\ell_i\\nabla_{\\theta}\\ell_i^T \\end{equation*}\\] which results in
\\[\\begin{equation*} \\mathcal{I}(z_{t}, z) \\approx \\nabla_{\\theta} \\ell(z_{t}, \\theta)^T (G(\\theta) + \\lambda I_d)^{-1} \\nabla_{\\theta} \\ell(z, \\theta) \\end{equation*}\\] -
Simplify the problem by breaking it down into a block-diagonal structure, where each block \\(G_l(\\theta)\\) corresponds to the \\(l\\)-th group of parameters:
\\[\\begin{equation*} G_{l}(\\theta) = n^{-1} \\sum_{i=1}^n \\nabla_{\\theta_l} \\ell_i \\nabla_{\\theta_l} \\ell_i^{T} + \\lambda_l I_{d_l}, \\end{equation*}\\] which leads to
\\[\\begin{equation*} \\mathcal{I}(z_{t}, z) \\approx \\nabla_{\\theta} \\ell(z_{t}, \\theta)^T \\operatorname{diag}(G_1(\\theta)^{-1}, \\dots, G_L(\\theta)^{-1}) \\nabla_{\\theta} \\ell(z, \\theta) \\end{equation*}\\] -
Substitute the arithmetic mean of the rank-\\(1\\) updates in \\(G_l(\\theta)\\) with the inverse harmonic mean \\(R_l(\\theta)\\) of the rank-\\(1\\) updates:
\\[\\begin{align*} G_l(\\theta)^{-1} &= \\left( n^{-1} \\sum_{i=1}^n \\nabla_{\\theta_l} \\ell(z_i, \\theta) \\nabla_{\\theta_l} \\ell(z_i, \\theta)^{T} + \\lambda_l I_{d_l}\\right)^{-1} \\\\\\ R_{l}(\\theta)&= n^{-1} \\sum_{i=1}^n \\left( \\nabla_{\\theta_l} \\ell(z_i, \\theta) \\nabla_{\\theta_l} \\ell(z_i, \\theta)^{T} + \\lambda_l I_{d_l} \\right)^{-1} \\end{align*}\\] -
Use the Sherman\u2013Morrison formula to get an explicit representation of the inverses in the definition of \\(R_l(\\theta):\\)
\\[\\begin{align*} R_l(\\theta) &= n^{-1} \\sum_{i=1}^n \\left( \\nabla_{\\theta_l} \\ell_i \\nabla_{\\theta_l} \\ell_i^{T} + \\lambda_l I_{d_l}\\right)^{-1} \\\\\\ &= n^{-1} \\sum_{i=1}^n \\lambda_l^{-1} \\left(I_{d_l} - \\frac{\\nabla_{\\theta_l} \\ell_i \\nabla_{\\theta_l} \\ell_i^{T}}{\\lambda_l + \\\\|\\nabla_{\\theta_l} \\ell_i\\\\|_2^2}\\right) , \\end{align*}\\] which means application of \\(R_l(\\theta)\\) boils down to computing \\(n\\) rank-\\(1\\) updates.
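A toy NumPy sketch (not pyDVL code) of \\(R_l(\\theta)\\) for a single block, computed with the Sherman-Morrison identity derived above:
import numpy as np\n\nrng = np.random.default_rng(0)\nn, d = 200, 10\ngrads = rng.standard_normal((n, d))   # per-sample gradients for one parameter block\nlam = 0.1\n\nR = np.zeros((d, d))\nfor g in grads:\n    # (g g^T + lam I)^{-1} = (I - g g^T / (lam + ||g||^2)) / lam\n    R += (np.eye(d) - np.outer(g, g) / (lam + g @ g)) / lam\nR /= n\n
In pyDVL: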
from pydvl.influence.torch import InverseHarmonicMeanInfluence, BlockMode\n\nif_model = InverseHarmonicMeanInfluence(\n model,\n loss,\n regularization=1e-1,\n block_structure=BlockMode.LAYER_WISE\n)\nif_model.fit(train_loader)\n
This implementation is capable of using a block-matrix approximation, see Block-diagonal approximation. All implementations above represent the calculation logic on in-memory tensors. To scale up to large collections of data, we map these influence function models over those collections. For a detailed discussion see the documentation page Scaling Computation.
-
Trefethen, L.N., Bau, D., Iii, 1997. Numerical Linear Algebra. Society for Industrial and Applied Mathematics. https://doi.org/10.1137/1.9780898719574 \u21a9
-
Ji, H., Li, Y., 2017. A breakdown-free block conjugate gradient method. Bit Numer Math 57, 379\u2013403. https://doi.org/10.1007/s10543-016-0631-z \u21a9
-
Bekas, C., Kokiopoulou, E., Saad, Y., 2007. An estimator for the diagonal of a matrix. Applied Numerical Mathematics, Numerical Algorithms, Parallelism and Applications (2) 57, 1214\u20131229. https://doi.org/10.1016/j.apnum.2007.01.003 \u21a9
-
Frangella, Z., Tropp, J.A., Udell, M., 2023. Randomized Nystr\u00f6m Preconditioning. SIAM J. Matrix Anal. Appl. 44, 718\u2013752. https://doi.org/10.1137/21M1466244 \u21a9\u21a9
-
Agarwal, N., Bullins, B., Hazan, E., 2017. Second-Order Stochastic Optimization for Machine Learning in Linear Time. JMLR 18, 1\u201340.\u00a0\u21a9
-
Schioppa, A., Zablotskaia, P., Vilar, D., Sokolov, A., 2022. Scaling Up Influence Functions. Proc. AAAI Conf. Artif. Intell. 36, 8179\u20138186. https://doi.org/10.1609/aaai.v36i8.20791 \u21a9
-
Martens, J., Grosse, R., 2015. Optimizing Neural Networks with Kronecker-factored Approximate Curvature, in: Proceedings of the 32nd International Conference on Machine Learning. Presented at the International Conference on Machine Learning, PMLR, pp. 2408\u20132417.\u00a0\u21a9
-
George, T., Laurent, C., Bouthillier, X., Ballas, N., Vincent, P., 2018. Fast Approximate Natural Gradient Descent in a Kronecker Factored Eigenbasis, in: Advances in Neural Information Processing Systems. Curran Associates, Inc.\u00a0\u21a9
-
Hataya, R., Yamada, M., 2023. Nystr\u00f6m Method for Accurate and Scalable Implicit Differentiation, in: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics. Presented at the International Conference on Artificial Intelligence and Statistics, PMLR, pp. 4643\u20134654.\u00a0\u21a9
-
Kwon, Y., Wu, E., Wu, K., Zou, J., 2023. DataInf: Efficiently Estimating Data Influence in LoRA-tuned LLMs and Diffusion Models. Presented at the The Twelfth International Conference on Learning Representations. https://doi.org/10.48550/arXiv.2310.00902 \u21a9
"},{"location":"influence/scaling_computation/","title":"Scaling Influence Computation","text":"The implementations of InfluenceFunctionModel provide a convenient way to calculate influences for in memory tensors.
Nevertheless, there is often a need to compute influences on batches of data. This might happen if your input data does not fit into memory (e.g. it is very high-dimensional), if for large models the derivative computations exceed your memory, or any combination of these. For this scenario, we want to map our influence function model over collections of batches (or chunks) of data.
"},{"location":"influence/scaling_computation/#sequential","title":"Sequential","text":"The simplest way is to use a double for-loop to iterate over the batches sequentially and collect them. pyDVL provides the simple convenience class SequentialInfluenceCalculator to do this. The batch size should be chosen as large as possible, such that the corresponding batches fit into memory.
from torch.utils.data import DataLoader\nfrom pydvl.influence import SequentialInfluenceCalculator\nfrom pydvl.influence.torch.util import (\n TorchNumpyConverter, NestedTorchCatAggregator,\n)\nfrom pydvl.influence.torch import CgInfluence\n\nbatch_size = 10\ntrain_dataloader = DataLoader(..., batch_size=batch_size)\ntest_dataloader = DataLoader(..., batch_size=batch_size)\n\ninfl_model = CgInfluence(model, loss, hessian_regularization=0.01)\ninfl_model = infl_model.fit(train_dataloader)\n\ninfl_calc = SequentialInfluenceCalculator(infl_model)\n\n# this does not trigger the computation\nlazy_influences = infl_calc.influences(test_dataloader, train_dataloader)\n\n# trigger computation and pull the result into main memory, \n# result is the full tensor for all combinations of the two loaders\ninfluences = lazy_influences.compute(aggregator=NestedTorchCatAggregator())\n# or\n# trigger computation and write results chunk-wise to disk using zarr \n# in a sequential manner\nlazy_influences.to_zarr(\"local_path/or/url\", TorchNumpyConverter())\n
When invoking the compute
method, you have the option to specify a custom aggregator by implementing NestedSequenceAggregator. This allows for the aggregation of computed chunks. Such an approach is particularly beneficial for straightforward aggregation tasks, commonly seen in sequential computation models. Examples include operations like concatenation, as implemented in NestedTorchCatAggregator, or basic min and max operations. For more intricate aggregations, such as an argmax operation, it's advisable to use the DaskInfluenceCalculator (refer to Parallel for more details). This is because it returns data structures in the form of dask.array.Array objects, which offer an API almost fully compatible with NumPy arrays.
"},{"location":"influence/scaling_computation/#parallel","title":"Parallel","text":"While the sequential calculation helps in the case the resulting tensors are too large to fit into memory, the batches are computed one after another. Because the influence computation itself is completely data parallel, you may want to use a parallel processing framework.
pyDVL provides an implementation of a parallel computation model using dask. The wrapper class DaskInfluenceCalculator has convenience methods to map the influence function computation over chunks of data in a parallel manner.
Again, choosing an appropriate chunk size can be crucial. For a better understanding see the official dask best practice documentation and the following blog entry.
Warning
Make sure to set threads_per_worker=1
, when using the distributed scheduler for computing, if your implementation of InfluenceFunctionModel is not thread-safe.
client = Client(threads_per_worker=1)\n
For details on dask schedulers see the official documentation. import torch\nfrom torch.utils.data import Dataset, DataLoader\nfrom pydvl.influence import DaskInfluenceCalculator\nfrom pydvl.influence.torch import CgInfluence\nfrom pydvl.influence.torch.util import (\n torch_dataset_to_dask_array,\n TorchNumpyConverter,\n)\nfrom distributed import Client\n\ntrain_data_set: Dataset = LargeDataSet(\n ...) # Possible some out of memory large Dataset\ntest_data_set: Dataset = LargeDataSet(\n ...) # Possible some out of memory large Dataset\n\ntrain_dataloader = DataLoader(train_data_set)\ninfl_model = CgInfluence(model, loss, hessian_regularization=0.01)\ninfl_model = infl_model.fit(train_dataloader)\n\n# wrap your input data into dask arrays\nchunk_size = 10\nda_x, da_y = torch_dataset_to_dask_array(train_data_set, chunk_size=chunk_size)\nda_x_test, da_y_test = torch_dataset_to_dask_array(test_data_set,\n chunk_size=chunk_size)\n\n# use only one thread for scheduling, \n# due to non-thread safety of some torch operations\nclient = Client(n_workers=4, threads_per_worker=1)\n\ninfl_calc = DaskInfluenceCalculator(infl_model, \n converter=TorchNumpyConverter(\n device=torch.device(\"cpu\")\n ),\n client=client)\nda_influences = infl_calc.influences(da_x_test, da_y_test, da_x, da_y)\n# da_influences is a dask.array.Array\n# trigger computation and write chunks to disk in parallel\nda_influences.to_zarr(\"path/or/url\")\n
During initialization of the DaskInfluenceCalculator, the system verifies if all workers are operating in single-threaded mode when the provided influence_function_model is designated as not thread-safe (indicated by the is_thread_safe
property). If this condition is not met, the initialization will raise a specific error, signaling a potential thread-safety conflict. To intentionally skip this safety check (e.g., for debugging purposes using the single machine synchronous scheduler), you can supply the DisableClientSingleThreadCheck type.
from pydvl.influence import DisableClientSingleThreadCheck\n\ninfl_calc = DaskInfluenceCalculator(infl_model,\n TorchNumpyConverter(device=torch.device(\"cpu\")),\n DisableClientSingleThreadCheck)\nda_influences = infl_calc.influences(da_x_test, da_y_test, da_x, da_y)\nda_influences.compute(scheduler=\"synchronous\")\n
"},{"location":"value/","title":"Data valuation","text":"Info
If you want to jump right into it, skip ahead to Computing data values. If you want a quick list of applications, see Applications of data valuation. For a list of all algorithms implemented in pyDVL, see Methods.
Data valuation is the task of assigning a number to each element of a training set which reflects its contribution to the final performance of some model trained on it. Some methods attempt to be model-agnostic, but in most cases the model is an integral part of the method. In these cases, this number is not an intrinsic property of the element of interest, but typically a function of three factors:
-
The dataset \\(D\\), or more generally, the distribution it was sampled from: In some cases one only cares about values wrt. a given data set, in others value would ideally be the (expected) contribution of a data point to any random set \\(D\\) sampled from the same distribution. pyDVL implements methods of the first kind.
-
The algorithm \\(\\mathcal{A}\\) mapping the data \\(D\\) to some estimator \\(f\\) in a model class \\(\\mathcal{F}\\). E.g. MSE minimization to find the parameters of a linear model.
-
The performance metric of interest \\(u\\) for the problem. When value depends on a model, it must be measured in some way which uses it. E.g. the \\(R^2\\) score or the negative MSE over a test set. This metric will be computed over a held-out valuation set.
pyDVL collects algorithms for the computation of data values in this sense, mostly those derived from cooperative game theory. The methods can be found in the package [[pydvl.value]], with support from modules pydvl.utils.dataset and pydvl.utils.utility, as detailed below.
Warning
Be sure to read the section on the difficulties using data values.
There are three main families of methods for data valuation: game-theoretic, influence-based and intrinsic. As of v0.8.1 pyDVL supports the first two. Here, we focus on game-theoretic concepts and refer to the main documentation on the influence function for the second.
"},{"location":"value/#game-theoretical-methods","title":"Game theoretical methods","text":"The main contenders in game-theoretic approaches are Shapley values (Ghorbani and Zou, 2019)1, (Kwon et al., 2021)2, (Schoch et al., 2022)3, their generalization to so-called semi-values by (Kwon and Zou, 2022)4 and [@wang_data_2022], and the Core (Yan and Procaccia, 2021)5. All of these are implemented in pyDVL. For a full list see Methods
In these methods, data points are considered players in a cooperative game whose outcome is the performance of the model when trained on subsets (coalitions) of the data, measured on a held-out valuation set. This outcome, or utility, must typically be computed for every subset of the training set, so that an exact computation is \\(\\mathcal{O} (2^n)\\) in the number of samples \\(n\\), with each iteration requiring a full re-fitting of the model using a coalition as training set. Consequently, most methods involve Monte Carlo approximations, and sometimes approximate utilities which are faster to compute, e.g. proxy models (Wang et al., 2022)6 or constant-cost approximations like Neural Tangent Kernels (Wu et al., 2022)7.
The reasoning behind using game theory is that, in order to be useful, an assignment of value, dubbed valuation function, is usually required to fulfil certain requirements of consistency and \"fairness\". For instance, in some applications value should not depend on the order in which data are considered, or it should be equal for samples that contribute equally to any subset of the data (of equal size). When considering aggregated value for (sub-)sets of data there are additional desiderata, like having a value function that does not increase with repeated samples. Game-theoretic methods are all rooted in axioms that by construction ensure different desiderata, but despite their practical usefulness, none of them are either necessary or sufficient for all applications. For instance, SV methods try to equitably distribute all value among all samples, failing to identify repeated ones as unnecessary, with e.g. a zero value.
"},{"location":"value/#computing-data-values","title":"Computing data values","text":"Using pyDVL to compute data values is a simple process that can be broken down into three steps:
- Creating a Dataset object from your data.
- Creating a Utility which ties your model to the dataset and a scoring function.
- Computing values with a method of your choice, e.g. via compute_shapley_values.
"},{"location":"value/#creating-a-dataset","title":"Creating a Dataset","text":"The first item in the tuple \\((D, \\mathcal{A}, u)\\) characterising data value is the dataset. The class Dataset is a simple convenience wrapper for the train and test splits that is used throughout pyDVL. The test set will be used to evaluate a scoring function for the model.
It can be used as follows:
import numpy as np\nfrom pydvl.utils import Dataset\nfrom sklearn.model_selection import train_test_split\nX, y = np.arange(100).reshape((50, 2)), np.arange(50)\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.5, random_state=16\n)\ndataset = Dataset(X_train, X_test, y_train, y_test)\n
It is also possible to construct Datasets from sklearn toy datasets for illustrative purposes using from_sklearn.
"},{"location":"value/#grouping-data","title":"Grouping data","text":"Be it because data valuation methods are computationally very expensive, or because we are interested in the groups themselves, it can be often useful or necessary to group samples to valuate them together. GroupedDataset provides an alternative to Dataset with the same interface which allows this.
You can see an example in action in the Spotify notebook, but here's a simple example grouping a pre-existing Dataset
. First we construct an array mapping each index in the dataset to a group, then use from_dataset:
import numpy as np\nfrom pydvl.utils import GroupedDataset\n\n# Randomly assign elements to any one of num_groups:\ndata_groups = np.random.randint(0, num_groups, len(dataset))\ngrouped_dataset = GroupedDataset.from_dataset(dataset, data_groups)\n
"},{"location":"value/#creating-a-utility","title":"Creating a Utility","text":"In pyDVL we have slightly overloaded the name \"utility\" and use it to refer to an object that keeps track of all three items in \\((D, \\mathcal{A}, u)\\). This will be an instance of Utility which, as mentioned, is a convenient wrapper for the dataset, model and scoring function used for valuation methods.
Here's a minimal example:
import sklearn as sk\nfrom pydvl.utils import Dataset, Utility\n\ndataset = Dataset.from_sklearn(sk.datasets.load_iris())\nmodel = sk.svm.SVC()\nutility = Utility(model, dataset)\n
The object utility
is a callable that data valuation methods will execute with different subsets of training data. Each call will retrain the model on a subset and evaluate it on the test data using a scoring function. By default, Utility will use model.score()
, but it is possible to use any scoring function (greater values must be better). In particular, the constructor accepts the same types of argument as sklearn.model_selection.cross_validate: a string, a scorer callable or None for the default.
utility = Utility(model, dataset, \"explained_variance\")\n
Utility
will wrap the fit()
method of the model to cache its results. This greatly reduces computation times of Monte Carlo methods. Because of how caching is implemented, it is important not to reuse Utility
objects for different datasets. You can read more about setting up the cache in the installation guide, and in the documentation of the caching module.
"},{"location":"value/#using-custom-scorers","title":"Using custom scorers","text":"The scoring
argument of Utility can be used to specify a custom Scorer object. This is a simple wrapper for a callable that takes a model, and test data and returns a score.
More importantly, the object provides information about the range of the score, which is used by some methods to estimate the number of samples necessary, and about what default value to use when the model fails to train.
Note
The most important property of a Scorer
is its default value. Because many models will fail to fit on small subsets of the data, it is important to provide a sensible default value for the score.
It is possible to skip the construction of the Scorer when constructing the Utility
object. The two following calls are equivalent:
from pydvl.utils import Utility, Scorer\n\nutility = Utility(\n model, dataset, \"explained_variance\", score_range=(-np.inf, 1), default_score=0.0\n)\nutility = Utility(\n model, dataset, Scorer(\"explained_variance\", range=(-np.inf, 1), default=0.0)\n)\n
"},{"location":"value/#learning-the-utility","title":"Learning the utility","text":"Because each evaluation of the utility entails a full retrain of the model with a new subset of the training set, it is natural to try to learn this mapping from subsets to scores. This is the idea behind Data Utility Learning (DUL) (Wang et al., 2022)6 and in pyDVL it's as simple as wrapping the Utility
inside DataUtilityLearning:
from pydvl.utils import Utility, DataUtilityLearning, Dataset\nfrom sklearn.linear_model import LinearRegression, LogisticRegression\nfrom sklearn.datasets import load_iris\n\ndataset = Dataset.from_sklearn(load_iris())\nu = Utility(LogisticRegression(), dataset)\ntraining_budget = 3\nwrapped_u = DataUtilityLearning(u, training_budget, LinearRegression())\n\n# First 3 calls will be computed normally\nfor i in range(training_budget):\n _ = wrapped_u((i,))\n# Subsequent calls will be computed using the fit model for DUL\nwrapped_u((1, 2, 3))\n
As you can see, all that is required is a model to learn the utility itself and the fitting and using of the learned model happens behind the scenes.
There is a longer example with an investigation of the results achieved by DUL in a dedicated notebook.
"},{"location":"value/#leave-one-out-values","title":"Leave-One-Out values","text":"LOO is the simplest approach to valuation. It assigns to each sample its marginal utility as value:
\\[v_u(i) = u(D) \u2212 u(D_{-i}).\\] For notational simplicity, we consider the valuation function as defined over the indices of the dataset \\(D\\), and \\(i \\in D\\) is the index of the sample, \\(D_{-i}\\) is the training set without the sample \\(x_i\\), and \\(u\\) is the utility function. See the section on notation for more.
For the purposes of data valuation, this is rarely useful beyond serving as a baseline for benchmarking, although in some benchmarks it can perform astonishingly well on occasion. One particular weakness is that it does not necessarily correlate with an intrinsic value of a sample: since it is a marginal utility, it is affected by diminishing returns. Often, the training set is large enough for a single sample not to have any significant effect on training performance, despite any qualities it may possess. Whether this is indicative of low value or not depends on one's goals and definitions, but other methods are typically preferable.
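To make the definition concrete, here is a hand-rolled sketch for a single sample, reusing a Utility object as constructed in the previous section (and assuming the dataset exposes its training indices as dataset.indices):
indices = list(dataset.indices)\nu_full = utility(tuple(indices))                 # u(D)\nu_minus_first = utility(tuple(indices[1:]))      # u(D_{-i}) for the first index\nv_first = u_full - u_minus_first                 # LOO value of that sample\n
pyDVL implements the full (and parallelized) computation in compute_loo: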
from pydvl.value.loo import compute_loo\n\nvalues = compute_loo(utility, n_jobs=-1)\n
The return value of all valuation functions is an object of type ValuationResult. This can be iterated over, indexed with integers, slices and Iterables, as well as converted to a pandas.DataFrame.
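For instance, continuing the snippet above (a short sketch of typical post-processing):
df = values.to_dataframe()   # one row per sample (or group)\nprint(df.head())\nprint(values[0])             # individual entries can be accessed by integer index\n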
"},{"location":"value/#problems-of-data-values","title":"Problems of data values","text":"There are a number of factors that affect how useful values can be for your project. In particular, regression can be especially tricky, but the particular nature of every (non-trivial) ML problem can have an effect:
-
Variance of the utility: Classical applications of game theoretic value concepts operate with deterministic utilities, as do many of the bounds in the literature. But in ML we use an evaluation of the model on a validation set as a proxy for the true risk. Even if the utility is bounded, its variance will affect final values, and even more so any Monte Carlo estimates. Several works have tried to cope with variance. (Wang and Jia, 2023) show that by relaxing one of the Shapley axioms and considering the general class of semi-values, of which Shapley is an instance, a choice of constant weights is the best one can do in a utility-agnostic setting. This method, dubbed Data Banzhaf, is available in pyDVL as compute_banzhaf_semivalues.
Averaging repeated utility evaluations One workaround in pyDVL is to configure the caching system to allow multiple evaluations of the utility for every index set. A moving average is computed and returned once the standard error is small, see CachedFuncConfig. Note however that in practice, the likelihood of cache hits is low, so one would have to force recomputation manually somehow.
-
Unbounded utility: Choosing a scorer for a classifier is simple: accuracy or some F-score provides a bounded number with a clear interpretation. However, in regression problems most scores, like \\(R^2\\), are not bounded because regressors can be arbitrarily bad. This leads to great variability in the utility for low sample sizes, and hence unreliable Monte Carlo approximations to the values. Nevertheless, in practice it is only the ranking of samples that matters, and this tends to be accurate (wrt. the true ranking) despite inaccurate values.
Squashing scores pyDVL offers a dedicated function composition for scorer functions which can be used to squash a score. The following is defined in module score:
import numpy as np\nfrom pydvl.utils import compose_score\n\ndef sigmoid(x: float) -> float:\n return float(1 / (1 + np.exp(-x)))\n\nsquashed_r2 = compose_score(\"r2\", sigmoid, \"squashed r2\")\n\nsquashed_variance = compose_score(\n \"explained_variance\", sigmoid, \"squashed explained variance\"\n)\n
These squashed scores can prove useful in regression problems, but they can also introduce issues in the low-value regime. -
Data set size: Computing exact Shapley values is NP-hard, and Monte Carlo approximations can converge slowly. Massive datasets are thus impractical, at least with game-theoretical methods. A workaround is to group samples and investigate their value together. You can do this using GroupedDataset. There is a fully worked-out example here. Some algorithms also provide different sampling strategies to reduce the variance, but due to a no-free-lunch-type theorem, no single strategy can be optimal for all utilities. Finally, model specific methods like kNN-Shapley (Jia et al., 2019)8, or altogether different and typically faster approaches like Data-OOB (Kwon and Zou, 2023)9 can also be used.
-
Model size: Since every evaluation of the utility entails retraining the whole model on a subset of the data, large models require great amounts of computation. But also, they will effortlessly interpolate small to medium datasets, leading to great variance in the evaluation of performance on the dedicated validation set. One mitigation for this problem is cross-validation, but this would incur massive computational cost. As of v0.8.1 there are no facilities in pyDVL for cross-validating the utility (note that this would require cross-validating the whole value computation).
"},{"location":"value/#notation-and-nomenclature","title":"Notation and nomenclature","text":"Todo
Organize this section better and use its content consistently throughout the documentation.
The following notation is used throughout the documentation:
Let \\(D = \\{x_1, \\ldots, x_n\\}\\) be a training set of \\(n\\) samples.
The utility function \\(u:\\mathcal{D} \\rightarrow \\mathbb{R}\\) maps subsets of \\(D\\) to real numbers. In pyDVL, we typically call this mapping a score for consistency with sklearn, and reserve the term utility for the triple of dataset \\(D\\), model \\(f\\) and score \\(u\\), since they are used together to compute the value.
The value \\(v\\) of the \\(i\\)-th sample in dataset \\(D\\) wrt. utility \\(u\\) is denoted as \\(v_u(x_i)\\) or simply \\(v(i)\\).
For any \\(S \\subseteq D\\), we denote by \\(S_{-i}\\) the set of samples in \\(D\\) excluding \\(x_i\\), and \\(S_{+i}\\) denotes the set \\(S\\) with \\(x_i\\) added.
The marginal utility of adding sample \\(x_i\\) to a subset \\(S\\) is denoted as \\(\\delta(i) := u(S_{+i}) - u(S)\\).
The set \\(D_{-i}^{(k)}\\) contains all subsets of \\(D\\) of size \\(k\\) that do not include sample \\(x_i\\).
-
Ghorbani, A., Zou, J., 2019. Data Shapley: Equitable Valuation of Data for Machine Learning, in: Proceedings of the 36th International Conference on Machine Learning, PMLR. Presented at the International Conference on Machine Learning (ICML 2019), PMLR, pp. 2242\u20132251.\u00a0\u21a9
-
Kwon, Y., Rivas, M.A., Zou, J., 2021. Efficient Computation and Analysis of Distributional Shapley Values, in: Proceedings of the 24th International Conference on Artificial Intelligence and Statistics. Presented at the International Conference on Artificial Intelligence and Statistics, PMLR, pp. 793\u2013801.\u00a0\u21a9
-
Schoch, S., Xu, H., Ji, Y., 2022. CS-Shapley: Class-wise Shapley Values for Data Valuation in Classification, in: Proc. Of the Thirty-Sixth Conference on Neural Information Processing Systems (NeurIPS). Presented at the Advances in Neural Information Processing Systems (NeurIPS 2022).\u00a0\u21a9
-
Kwon, Y., Zou, J., 2022. Beta Shapley: A Unified and Noise-reduced Data Valuation Framework for Machine Learning, in: Proceedings of the 25th International Conference on Artificial Intelligence and Statistics (AISTATS) 2022,. Presented at the AISTATS 2022, PMLR.\u00a0\u21a9
-
Yan, T., Procaccia, A.D., 2021. If You Like Shapley Then You\u2019ll Love the Core, in: Proceedings of the 35th AAAI Conference on Artificial Intelligence, 2021. Presented at the AAAI Conference on Artificial Intelligence, Association for the Advancement of Artificial Intelligence, pp. 5751\u20135759. https://doi.org/10.1609/aaai.v35i6.16721 \u21a9
-
Wang, T., Yang, Y., Jia, R., 2022. Improving Cooperative Game Theory-based Data Valuation via Data Utility Learning. Presented at the International Conference on Learning Representations (ICLR 2022). Workshop on Socially Responsible Machine Learning, arXiv. https://doi.org/10.48550/arXiv.2107.06336 \u21a9\u21a9
-
Wu, Z., Shu, Y., Low, B.K.H., 2022. DAVINZ: Data Valuation using Deep Neural Networks at Initialization, in: Proceedings of the 39th International Conference on Machine Learning. Presented at the International Conference on Machine Learning, PMLR, pp. 24150\u201324176.\u00a0\u21a9
-
Jia, R., Dao, D., Wang, B., Hubis, F.A., Gurel, N.M., Li, B., Zhang, C., Spanos, C., Song, D., 2019. Efficient task-specific data valuation for nearest neighbor algorithms. Proc. VLDB Endow. 12, 1610\u20131623. https://doi.org/10.14778/3342263.3342637 \u21a9
-
Kwon, Y., Zou, J., 2023. Data-OOB: Out-of-bag Estimate as a Simple and Efficient Data Value, in: Proceedings of the 40th International Conference on Machine Learning. Presented at the International Conference on Machine Learning, PMLR, pp. 18135\u201318152.\u00a0\u21a9
"},{"location":"value/classwise-shapley/","title":"Class-wise Shapley","text":"Class-wise Shapley (CWS) (Schoch et al., 2022)1 offers a Shapley framework tailored for classification problems. Given a sample \\(x_i\\) with label \\(y_i \\in \\mathbb{N}\\), let \\(D_{y_i}\\) be the subset of \\(D\\) with labels \\(y_i\\), and \\(D_{-y_i}\\) be the complement of \\(D_{y_i}\\) in \\(D\\). The key idea is that the sample \\((x_i, y_i)\\) might improve the overall model performance on \\(D\\), while being detrimental for the performance on \\(D_{y_i},\\) e.g. because of a wrong label. To address this issue, the authors introduced
\\[ v_u(i) = \\frac{1}{2^{|D_{-y_i}|}} \\sum_{S_{-y_i}} \\left [ \\frac{1}{|D_{y_i}|}\\sum_{S_{y_i}} \\binom{|D_{y_i}|-1}{|S_{y_i}|}^{-1} \\delta(S_{y_i} | S_{-y_i}) \\right ], \\] where \\(S_{y_i} \\subseteq D_{y_i} \\setminus \\{i\\}\\) and \\(S_{-y_i} \\subseteq D_{-y_i}\\) is arbitrary (in particular, not the complement of \\(S_{y_i}\\)). The function \\(\\delta\\) is called set-conditional marginal Shapley value and is defined as
\\[ \\delta(S | C) = u( S_{+i} | C ) \u2212 u(S | C), \\] for any set \\(S\\) such that \\(i \\notin S, C\\) and \\(S \\cap C = \\emptyset\\).
In practical applications, estimating this quantity is done both with Monte Carlo sampling of the powerset, and the set of index permutations (Castro et al., 2009)2. Typically, this requires fewer samples than the original Shapley value, although the actual speed-up depends on the model and the dataset.
Computing classwise Shapley values
Like all other game-theoretic valuation methods, CWS requires a Utility object constructed with model and dataset, with the peculiarity of requiring a specific ClasswiseScorer. The entry point is the function compute_classwise_shapley_values:
from pydvl.value import *\n\nmodel = ...\ndata = Dataset(...)\nscorer = ClasswiseScorer(...)\nutility = Utility(model, data, scorer)\nvalues = compute_classwise_shapley_values(\n utility,\n done=HistoryDeviation(n_steps=500, rtol=5e-2) | MaxUpdates(5000),\n truncation=RelativeTruncation(utility, rtol=0.01),\n done_sample_complements=MaxChecks(1),\n normalize_values=True\n)\n
"},{"location":"value/classwise-shapley/#the-class-wise-scorer","title":"The class-wise scorer","text":"In order to use the classwise Shapley value, one needs to define a ClasswiseScorer. This scorer is defined as
\\[ u(S) = f(a_S(D_{y_i})) g(a_S(D_{-y_i})), \\] where \\(f\\) and \\(g\\) are monotonically increasing functions, \\(a_S(D_{y_i})\\) is the in-class accuracy, and \\(a_S(D_{-y_i})\\) is the out-of-class accuracy (the names originate from a choice by the authors to use accuracy, but in principle any other score, like \\(F_1\\) can be used).
The authors show that \\(f(x)=x\\) and \\(g(x)=e^x\\) have favorable properties and are therefore the defaults, but we leave the option to set different functions \\(f\\) and \\(g\\) for an exploration with different base scores.
The default class-wise scorer
Constructing the CWS scorer requires choosing a metric and the functions \\(f\\) and \\(g\\):
import numpy as np\nfrom pydvl.value.shapley.classwise import ClasswiseScorer\n\n# These are the defaults\nidentity = lambda x: x\nscorer = ClasswiseScorer(\n \"accuracy\",\n in_class_discount_fn=identity,\n out_of_class_discount_fn=np.exp\n)\n
Surface of the discounted utility function The level curves for \\(f(x)=x\\) and \\(g(x)=e^x\\) are depicted below. The lines illustrate the contour lines, annotated with their respective gradients. Level curves of the class-wise utility
"},{"location":"value/classwise-shapley/#evaluation","title":"Evaluation","text":"We illustrate the method with two experiments: point removal and noise removal, as well as an analysis of the distribution of the values. For this we employ the nine datasets used in (Schoch et al., 2022)1, using the same pre-processing. For images, PCA is used to reduce down to 32 the features found by a pre-trained Resnet18
model. Standard loc-scale normalization is performed for all models except gradient boosting, since the latter is not sensitive to the scale of the features.
Datasets used for evaluation

| Dataset | Data Type | Classes | Input Dims | OpenML ID |
|---|---|---|---|---|
| Diabetes | Tabular | 2 | 8 | 37 |
| Click | Tabular | 2 | 11 | 1216 |
| CPU | Tabular | 2 | 21 | 197 |
| Covertype | Tabular | 7 | 54 | 1596 |
| Phoneme | Tabular | 2 | 5 | 1489 |
| FMNIST | Image | 2 | 32 | 40996 |
| CIFAR10 | Image | 2 | 32 | 40927 |
| MNIST (binary) | Image | 2 | 32 | 554 |
| MNIST (multi) | Image | 10 | 32 | 554 |

We show mean and coefficient of variation (CV) \\(\\frac{\\sigma}{\\mu}\\) of an \"inner metric\". The former shows the performance of the method, whereas the latter displays its stability: we normalize by the mean to see the relative effect of the standard deviation. Ideally the mean value is maximal and CV minimal.
Finally, we note that for all sampling-based valuation methods the same number of evaluations of the marginal utility was used. This is important to make the algorithms comparable, but in practice one should consider using a more sophisticated stopping criterion.
"},{"location":"value/classwise-shapley/#dataset-pruning-for-logistic-regression-point-removal","title":"Dataset pruning for logistic regression (point removal)","text":"In (best-)point removal, one first computes values for the training set and then removes in sequence the points with the highest values. After each removal, the remaining points are used to train the model from scratch and performance is measured on a test set. This produces a curve of performance vs. number of points removed which we show below.
As a scalar summary of this curve, (Schoch et al., 2022)1 define Weighted Accuracy Drop (WAD) as:
\\[ \\text{WAD} = \\sum_{j=1}^{n} \\left ( \\frac{1}{j} \\sum_{i=1}^{j} a_{T_{-\\{1 \\colon i-1 \\}}}(D) - a_{T_{-\\{1 \\colon i \\}}}(D) \\right) = a_T(D) - \\sum_{j=1}^{n} \\frac{a_{T_{-\\{1 \\colon j \\}}}(D)}{j} , \\] where \\(a_T(D)\\) is the accuracy of the model (trained on \\(T\\)) evaluated on \\(D\\) and \\(T_{-\\{1 \\colon j \\}}\\) is the set \\(T\\) without elements from \\(\\{1, \\dots , j \\}\\).
We run the point removal experiment for a logistic regression model five times and compute WAD for each run, then report the mean \\(\\mu_\\text{WAD}\\) and standard deviation \\(\\sigma_\\text{WAD}\\).
Mean WAD for best-point removal on logistic regression. Values computed using LOO, CWS, Beta Shapley, and TMCS
We see that CWS is competitive with all three other methods. In all problems except MNIST (multi)
it outperforms TMCS, while in that case TMCS has a slight advantage.
In order to understand the variability of WAD we look at its coefficient of variation (lower is better):
Coefficient of Variation of WAD for best-point removal on logistic regression. Values computed using LOO, CWS, Beta Shapley, and TMCS
CWS is not the best method in terms of CV. For CIFAR10
, Click
, CPU
and MNIST (binary)
Beta Shapley has the lowest CV. For Diabetes
, MNIST (multi)
and Phoneme
CWS is the winner and for FMNIST
and Covertype
TMCS takes the lead. Besides LOO, TMCS has the highest relative standard deviation.
The following plot shows accuracy vs number of samples removed. Random values serve as a baseline. The shaded area represents the 95% bootstrap confidence interval of the mean across 5 runs.
Accuracy after best-sample removal using values from logistic regression
Because samples are removed from high to low valuation order, we expect a steep decrease in the curve.
Overall we conclude that in terms of mean WAD, CWS and TMCS perform best, with CWS's CV on par with Beta Shapley's, making CWS a competitive method.
"},{"location":"value/classwise-shapley/#dataset-pruning-for-a-neural-network-by-value-transfer","title":"Dataset pruning for a neural network by value transfer","text":"Transfer of values from one model to another is probably of greater practical relevance: values are computed using a cheap model and used to prune the dataset before training a more expensive one.
The following plot shows accuracy vs number of samples removed for transfer from logistic regression to a neural network. The shaded area represents the 95% bootstrap confidence interval of the mean across 5 runs.
Accuracy after sample removal using values transferred from logistic regression to an MLP
As in the previous experiment, samples are removed from high to low valuation order and hence we expect a steep decrease in the curve. CWS is competitive with the other methods, especially on very unbalanced datasets like Click. On other datasets, like Covertype, Diabetes and MNIST (multi), the performance is on par with TMCS.
"},{"location":"value/classwise-shapley/#detection-of-mis-labeled-data-points","title":"Detection of mis-labeled data points","text":"The next experiment tries to detect mis-labeled data points in binary classification tasks. 20% of the indices is flipped at random (we don't consider multi-class datasets because there isn't a unique flipping strategy). The following table shows the mean of the area under the curve (AUC) for five runs.
Mean AUC for mis-labeled data point detection. Values computed using LOO, CWS, Beta Shapley, and TMCS
In the majority of cases TMCS has a slight advantage over CWS, except for Click, where CWS has a slight edge, most probably due to the unbalanced nature of the dataset. The following plot shows the CV for the AUC of the five runs.
Coefficient of variation of AUC for mis-labeled data point detection. Values computed using LOO, CWS, Beta Shapley, and TMCS
In terms of CV, CWS has a clear edge over TMCS and Beta Shapley.
Finally, we look at the ROC curves obtained when training the classifier on the first \\(n\\) samples in increasing order of valuation (i.e. starting with the worst):
Mean ROC across 5 runs with 95% bootstrap CI
Although at first sight TMCS seems to be the winner, CWS stays competitive after factoring in running time. For a perfectly balanced dataset, CWS needs on average fewer samples than TMCS.
"},{"location":"value/classwise-shapley/#value-distribution","title":"Value distribution","text":"For illustration, we compare the distribution of values computed by TMCS and CWS.
Histogram and estimated density of the values computed by TMCS and CWS on all nine datasets
For Click, TMCS has a multi-modal distribution of values. We hypothesize that this is due to the highly unbalanced nature of the dataset, and notice that CWS has a single mode, leading to its greater performance on this dataset.
"},{"location":"value/classwise-shapley/#conclusion","title":"Conclusion","text":"CWS is an effective way to handle classification problems, in particular for unbalanced datasets. It reduces the computing requirements by considering in-class and out-of-class points separately.
-
Schoch, S., Xu, H., Ji, Y., 2022. CS-Shapley: Class-wise Shapley Values for Data Valuation in Classification, in: Proc. Of the Thirty-Sixth Conference on Neural Information Processing Systems (NeurIPS). Presented at the Advances in Neural Information Processing Systems (NeurIPS 2022).\u00a0\u21a9\u21a9\u21a9
-
Castro, J., G\u00f3mez, D., Tejada, J., 2009. Polynomial calculation of the Shapley value based on sampling. Computers & Operations Research, Selected papers presented at the Tenth International Symposium on Locational Decisions (ISOLDE X) 36, 1726\u20131730. https://doi.org/10.1016/j.cor.2008.04.004 \u21a9
"},{"location":"value/semi-values/","title":"Semi-values","text":"SV is a particular case of a more general concept called semi-value, which is a generalization to different weighting schemes. A semi-value is any valuation function with the form:
\\[ v_\\text{semi}(i) = \\sum_{k=1}^n w(k) \\sum_{S \\subset D_{-i}^{(k)}} [u(S_{+i}) - u(S)], \\] where the coefficients \\(w(k)\\) satisfy the property:
\\[\\sum_{k=1}^n w(k) = 1,\\] the set \\(D_{-i}^{(k)}\\) contains all subsets of \\(D\\) of size \\(k\\) that do not include sample \\(x_i\\), \\(S_{+i}\\) is the set \\(S\\) with \\(x_i\\) added, and \\(u\\) is the utility function.
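For illustration only, the definition can be evaluated verbatim for very small \\(n\\). The following sketch (not pyDVL's implementation, exponential cost) assumes a utility u acting on sets of indices and a per-subset weight w(k):
from itertools import combinations\n\ndef naive_semivalue(i, indices, u, w):\n    # Sum w(k) * [u(S + {i}) - u(S)] over all subsets S of D_{-i} of size k.\n    rest = [j for j in indices if j != i]\n    value = 0.0\n    for k in range(len(rest) + 1):\n        for S in combinations(rest, k):\n            value += w(k) * (u(set(S) | {i}) - u(set(S)))\n    return value\n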
Two instances of this are Banzhaf indices (Wang and Jia, 2023)1, and Beta Shapley (Kwon and Zou, 2022)2, with better numerical and rank stability in certain situations.
Note
Shapley values are a particular case of semi-values and can therefore also be computed with the methods described here. However, as of version 0.8.1, we recommend using compute_shapley_values instead, in particular because it implements truncation policies for TMCS.
"},{"location":"value/semi-values/#beta-shapley","title":"Beta Shapley","text":"For some machine learning applications, where the utility is typically the performance when trained on a set \\(S \\subset D\\), diminishing returns are often observed when computing the marginal utility of adding a new data point.
Beta Shapley is a weighting scheme that uses the Beta function to place more weight on subsets deemed to be more informative. The weights are defined as:
\\[ w(k) := \\frac{B(k+\\beta, n-k+1+\\alpha)}{B(\\alpha, \\beta)}, \\] where \\(B\\) is the Beta function, and \\(\\alpha\\) and \\(\\beta\\) are parameters that control the weighting of the subsets. Setting both to 1 recovers Shapley values, and setting \\(\\alpha = 1\\) and \\(\\beta = 16\\) is reported in (Kwon and Zou, 2022)2 to be a good choice for some applications. Beta Shapley values are available in pyDVL through compute_beta_shapley_semivalues:
from pydvl.value import *\n\nutility = Utility(model, data)\nvalues = compute_beta_shapley_semivalues(\n u=utility, done=AbsoluteStandardError(threshold=1e-4), alpha=1, beta=16\n)\n
See however the Banzhaf indices section for an alternative choice of weights which is reported to work better.
"},{"location":"value/semi-values/#banzhaf-indices","title":"Banzhaf indices","text":"As noted in the section Problems of Data Values, the Shapley value can be very sensitive to variance in the utility function. For machine learning applications, where the utility is typically the performance when trained on a set \\(S \\subset D\\), this variance is often largest for smaller subsets \\(S\\). It is therefore reasonable to try reducing the relative contribution of these subsets with adequate weights.
One such choice of weights is the Banzhaf index, which is defined as the constant:
\\[w(k) := \\frac{1}{2^{n-1}},\\] for all set sizes \\(k\\). The intuition for picking a constant weight is that for any choice of weight function \\(w\\), one can always construct a utility with higher variance where \\(w\\) is greater. Therefore, in a worst-case sense, the best one can do is to pick a constant weight.
The authors of (Wang and Jia, 2023)1 show that Banzhaf indices are more robust to variance in the utility function than Shapley and Beta Shapley values. They are available in pyDVL through compute_banzhaf_semivalues:
from pydvl.value import *\n\nutility = Utility(model, data)\nvalues = compute_banzhaf_semivalues(\n u=utility, done=AbsoluteStandardError(threshold=1e-4)\n)\n
"},{"location":"value/semi-values/#banzhaf-semi-values-with-msr-sampling","title":"Banzhaf semi-values with MSR sampling","text":"Wang et. al. propose a more sample-efficient method for computing Banzhaf semivalues in their paper Data Banzhaf: A Robust Data Valuation Framework for Machine Learning (Wang and Jia, 2023)1. This method updates all semivalues per evaluation of the utility (i.e. per model trained) based on whether a specific data point was included in the data subset or not. The expression for computing the semivalues is
\\[\\hat{\\phi}_{MSR}(i) = \\frac{1}{|\\mathbf{S}_{\\ni i}|} \\sum_{S \\in \\mathbf{S}_{\\ni i}} U(S) - \\frac{1}{|\\mathbf{S}_{\\not{\\ni} i}|} \\sum_{S \\in \\mathbf{S}_{\\not{\\ni} i}} U(S)\\] where \\(\\mathbf{S}_{\\ni i}\\) are the subsets that contain the index \\(i\\) and \\(\\mathbf{S}_{\\not{\\ni} i}\\) are the subsets not containing the index \\(i\\).
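For intuition, the estimator itself is simple to write down. A minimal NumPy sketch (not pyDVL's implementation), taking a list of sampled subsets and their utilities, might look like this:
import numpy as np\n\ndef msr_banzhaf_estimate(n, subsets, utilities):\n    # For each index i: mean utility over subsets containing i, minus the mean\n    # utility over subsets not containing i. Assumes both groups are non-empty.\n    utilities = np.asarray(utilities, dtype=float)\n    values = np.zeros(n)\n    for i in range(n):\n        mask = np.array([i in S for S in subsets])\n        values[i] = utilities[mask].mean() - utilities[~mask].mean()\n    return values\n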
The function implementing this method is compute_msr_banzhaf_semivalues.
from pydvl.value import compute_msr_banzhaf_semivalues, RankCorrelation, Utility\n\nutility = Utility(model, data)\nvalues = compute_msr_banzhaf_semivalues(\n u=utility, done=RankCorrelation(rtol=0.001),\n )\n
For further details on how to use this method and a comparison of the sample efficiency, we suggest taking a look at the example notebook msr_banzhaf_spotify."},{"location":"value/semi-values/#general-semi-values","title":"General semi-values","text":"As explained above, both Beta Shapley and Banzhaf indices are special cases of semi-values. In pyDVL we provide a general method for computing these with any combination of the three ingredients that define a semi-value:
- A utility function \\(u\\).
- A sampling method
- A weighting scheme \\(w\\).
You can construct any combination of these three ingredients with compute_generic_semivalues. The utility function is the same as for Shapley values, and the sampling method can be any of the types defined in the samplers module. For instance, the following snippet is equivalent to the above:
from pydvl.value import *\n\ndata = Dataset(...)\nutility = Utility(model, data)\nvalues = compute_generic_semivalues(\n sampler=PermutationSampler(data.indices),\n u=utility,\n coefficient=beta_coefficient(alpha=1, beta=16),\n done=AbsoluteStandardError(threshold=1e-4),\n)\n
Allowing any coefficient can help when experimenting with models which are more sensitive to changes in training set size. However, Data Banzhaf indices are proven to be the most robust to variance in the utility function, in the sense of rank stability, across a range of models and datasets (Wang and Jia, 2023)1.
Careful with permutation sampling
This generic implementation of semi-values allowing for any combination of sampling and weighting schemes is very flexible and, in principle, it recovers the original Shapley value, so that compute_shapley_values is no longer necessary. However, it loses the optimization in permutation sampling that reuses the utility computation from the last iteration when iterating over a permutation. This doubles the computation requirements (and slightly increases variance) when using permutation sampling, unless the cache is enabled. In addition, as mentioned above, truncation policies are not supported by this generic implementation (as of v0.8.1). For these reasons it is preferable to use compute_shapley_values whenever not computing other semi-values.
-
Wang, J.T., Jia, R., 2023. Data Banzhaf: A Robust Data Valuation Framework for Machine Learning, in: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics. Presented at the International Conference on Artificial Intelligence and Statistics, PMLR, pp. 6388\u20136421.\u00a0\u21a9\u21a9\u21a9\u21a9
-
Kwon, Y., Zou, J., 2022. Beta Shapley: A Unified and Noise-reduced Data Valuation Framework for Machine Learning, in: Proceedings of the 25th International Conference on Artificial Intelligence and Statistics (AISTATS) 2022,. Presented at the AISTATS 2022, PMLR.\u00a0\u21a9\u21a9
"},{"location":"value/shapley/","title":"Shapley value","text":""},{"location":"value/shapley/#shapley-value","title":"Shapley value","text":"The Shapley method is an approach to compute data values originating in cooperative game theory. Shapley values are a common way of assigning payoffs to each participant in a cooperative game (i.e. one in which players can form coalitions) in a way that ensures that certain axioms are fulfilled.
pyDVL implements several methods for the computation and approximation of Shapley values. They can all be accessed via the facade function compute_shapley_values. The supported methods are enumerated in ShapleyMode.
Empirically, the most useful method is the so-called Truncated Monte Carlo Shapley (Ghorbani and Zou, 2019)1, which is a Monte Carlo approximation of the permutation Shapley value.
"},{"location":"value/shapley/#combinatorial-shapley","title":"Combinatorial Shapley","text":"The first algorithm is just a verbatim implementation of the definition. As such it returns as exact a value as the utility function allows (see what this means in Problems of Data Values).
The value \\(v\\) of the \\(i\\)-th sample in dataset \\(D\\) wrt. utility \\(u\\) is computed as a weighted sum of its marginal utility wrt. every possible coalition of training samples within the training set:
\\[ v(i) = \\frac{1}{n} \\sum_{S \\subseteq D_{-i}} \\binom{n-1}{ | S | }^{-1} [u(S_{+i}) - u(S)] ,\\] where \\(D_{-i}\\) denotes the set of samples in \\(D\\) excluding \\(x_i\\), and \\(S_{+i}\\) denotes the set \\(S\\) with \\(x_i\\) added.
from pydvl.value import compute_shapley_values\n\nvalues = compute_shapley_values(utility, mode=\"combinatorial_exact\")\ndf = values.to_dataframe(column='value')\n
We can convert the return value to a pandas.DataFrame and name the column holding the results value. Please refer to the documentation in shapley and ValuationResult for more information.
"},{"location":"value/shapley/#monte-carlo-combinatorial-shapley","title":"Monte Carlo Combinatorial Shapley","text":"Because the number of subsets \\(S \\subseteq D_{-i}\\) is \\(2^{ | D | - 1 }\\), one typically must resort to approximations. The simplest one is done via Monte Carlo sampling of the powerset \\(\\mathcal{P}(D)\\). In pyDVL this simple technique is called \"Monte Carlo Combinatorial\". The method has very poor converge rate and others are preferred, but if desired, usage follows the same pattern:
from pydvl.value import compute_shapley_values, MaxUpdates\n\nvalues = compute_shapley_values(\n utility, mode=\"combinatorial_montecarlo\", done=MaxUpdates(1000)\n)\ndf = values.to_dataframe(column='cmc')\n
The DataFrames returned by most Monte Carlo methods will contain approximate standard errors as an additional column, in this case named cmc_stderr.
Note the usage of the object MaxUpdates as the stop condition. This is an instance of a StoppingCriterion. Other examples are MaxTime and AbsoluteStandardError.
"},{"location":"value/shapley/#owen-sampling","title":"Owen sampling","text":"Owen Sampling (Okhrati and Lipani, 2021)2 is a practical algorithm based on the combinatorial definition. It uses a continuous extension of the utility from \\(\\{0,1\\}^n\\), where a 1 in position \\(i\\) means that sample \\(x_i\\) is used to train the model, to \\([0,1]^n\\). The ensuing expression for Shapley value uses integration instead of discrete weights:
\\[ v_u(i) = \\int_0^1 \\mathbb{E}_{S \\sim P_q(D_{-i})} [u(S_{+i}) - u(S)] \\, dq. \\] Using Owen sampling follows the same pattern as every other method for Shapley values in pyDVL. First construct the dataset and utility, then call compute_shapley_values:
from pydvl.value import compute_shapley_values\n\nvalues = compute_shapley_values(\n u=utility, mode=\"owen\", n_iterations=4, max_q=200\n)\n
There are more details on Owen sampling, and its variant Antithetic Owen Sampling in the documentation for the function doing the work behind the scenes: owen_sampling_shapley.
Note that in this case we do not pass a StoppingCriterion to the function, but instead the number of iterations and the maximum number of samples to use in the integration.
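For intuition, here is a minimal sketch of the idea behind the estimator (not pyDVL's implementation): discretize the outer integral over \\(q\\) and, for each \\(q\\), sample subsets in which every other point is included independently with probability \\(q\\). The parameter names below are only illustrative:
import numpy as np\n\ndef owen_shapley_sketch(n, u, n_q=200, n_samples_per_q=4, seed=None):\n    # u takes a set of indices and returns a utility value.\n    rng = np.random.default_rng(seed)\n    values = np.zeros(n)\n    qs = np.linspace(0, 1, n_q)\n    for i in range(n):\n        rest = [j for j in range(n) if j != i]\n        inner = []\n        for q in qs:\n            marginals = []\n            for _ in range(n_samples_per_q):\n                S = {j for j in rest if rng.random() < q}\n                marginals.append(u(S | {i}) - u(S))\n            inner.append(np.mean(marginals))\n        values[i] = np.mean(inner)  # approximates the integral over q\n    return values\n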
"},{"location":"value/shapley/#permutation-shapley","title":"Permutation Shapley","text":"An equivalent way of computing Shapley values (ApproShapley
) appeared in (Castro et al., 2009)3 and is the basis for the method most often used in practice. It uses permutations over indices instead of subsets:
\\[ v_u(x_i) = \\frac{1}{n!} \\sum_{\\sigma \\in \\Pi(n)} [u(\\sigma_{:i} \\cup \\{x_i\\}) - u(\\sigma_{:i})], \\] where \\(\\sigma_{:i}\\) denotes the set of indices in permutation \\(\\sigma\\) before the position where \\(i\\) appears. To approximate this sum (which has \\(\\mathcal{O}(n!)\\) terms!) one uses Monte Carlo sampling of permutations, something which has surprisingly low sample complexity. One notable difference wrt. the combinatorial approach above is that the approximations always fulfill the efficiency axiom of Shapley, namely \\(\\sum_{i=1}^n \\hat{v}_i = u(D)\\) (see (Castro et al., 2009)3, Proposition 3.2).
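A minimal sketch of the plain permutation Monte Carlo estimator (not pyDVL's implementation) makes the reuse of the prefix utility along each permutation apparent:
import numpy as np\n\ndef permutation_shapley_sketch(n, u, n_permutations, seed=None):\n    # u takes a set of indices {0, ..., n-1} and returns a utility value.\n    rng = np.random.default_rng(seed)\n    values = np.zeros(n)\n    for _ in range(n_permutations):\n        prefix = set()\n        prev = u(prefix)  # utility of the empty prefix\n        for idx in rng.permutation(n):\n            prefix = prefix | {idx}\n            curr = u(prefix)  # one utility evaluation per position\n            values[idx] += curr - prev\n            prev = curr\n    return values / n_permutations\n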
By adding two types of early stopping, the result is the so-called Truncated Monte Carlo Shapley (Ghorbani and Zou, 2019)1, which is efficient enough to be useful in applications. The first is simply a convergence criterion, of which there are several to choose from. The second is a criterion to truncate the iteration over single permutations. RelativeTruncation chooses to stop iterating over samples in a permutation when the marginal utility becomes too small.
from pydvl.value import compute_shapley_values, MaxUpdates, RelativeTruncation\n\nvalues = compute_shapley_values(\n u=utility,\n mode=\"permutation_montecarlo\",\n done=MaxUpdates(1000),\n truncation=RelativeTruncation(utility, rtol=0.01)\n)\n
You can see this method in action in this example using the Spotify dataset.
"},{"location":"value/shapley/#exact-shapley-for-knn","title":"Exact Shapley for KNN","text":"It is possible to exploit the local structure of K-Nearest Neighbours to reduce the amount of subsets to consider: because no sample besides the K closest affects the score, most are irrelevant and it is possible to compute a value in linear time. This method was introduced by (Jia et al., 2019)4, and can be used in pyDVL with:
from pydvl.utils import Dataset, Utility\nfrom pydvl.value import compute_shapley_values\nfrom sklearn.neighbors import KNeighborsClassifier\n\nmodel = KNeighborsClassifier(n_neighbors=5)\ndata = Dataset(...)\nutility = Utility(model, data)\nvalues = compute_shapley_values(u=utility, mode=\"knn\")\n
"},{"location":"value/shapley/#group-testing","title":"Group testing","text":"An alternative method for the approximation of Shapley values introduced in (Jia et al., 2019)4 first estimates the differences of values with a Monte Carlo sum. With
\\[\\hat{\\Delta}_{i j} \\approx v_i - v_j,\\] one then solves the following linear constraint satisfaction problem (CSP) to infer the final values:
\\[ \\begin{array}{lll} \\sum_{i = 1}^N v_i & = & U (D)\\\\ | v_i - v_j - \\hat{\\Delta}_{i j} | & \\leqslant & \\frac{\\varepsilon}{2 \\sqrt{N}} \\end{array} \\] Warning
We have reproduced this method in pyDVL for completeness and benchmarking, but we don't advocate its use because of the speed and memory cost. Despite our best efforts, the number of samples required in practice for convergence can be several orders of magnitude worse than with e.g. TMCS. Additionally, the CSP can sometimes turn out to be infeasible.
Usage follows the same pattern as every other Shapley method, but with the addition of an epsilon parameter required for the solution of the CSP. It should be the same value used to compute the minimum number of samples required. This can be done with num_samples_eps_delta, but note that the number returned will be huge! In practice, fewer samples can be enough, but the actual number will strongly depend on the utility, in particular its variance.
from pydvl.utils import Dataset, Utility\nfrom pydvl.value import compute_shapley_values\n\nmodel = ...\ndata = Dataset(...)\nutility = Utility(model, data, score_range=(_min, _max))\nmin_iterations = num_samples_eps_delta(eps, delta, n, utility.score_range)\nvalues = compute_shapley_values(\n u=utility, mode=\"group_testing\", n_iterations=min_iterations, eps=eps\n)\n
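To make the constraint satisfaction problem above concrete, here is a purely illustrative sketch of how it could be written down with cvxpy (this is not pyDVL's implementation; delta_hat is a hypothetical matrix of estimated pairwise differences):
import cvxpy as cp\nimport numpy as np\n\ndef group_testing_csp_sketch(delta_hat, total_utility, eps):\n    # Find v with sum(v) = u(D) and |v_i - v_j - delta_hat[i, j]| <= eps / (2 sqrt(N)).\n    n = delta_hat.shape[0]\n    v = cp.Variable(n)\n    constraints = [cp.sum(v) == total_utility]\n    for i in range(n):\n        for j in range(n):\n            if i != j:\n                constraints.append(\n                    cp.abs(v[i] - v[j] - delta_hat[i, j]) <= eps / (2 * np.sqrt(n))\n                )\n    cp.Problem(cp.Minimize(0), constraints).solve()\n    return v.value\n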
-
Ghorbani, A., Zou, J., 2019. Data Shapley: Equitable Valuation of Data for Machine Learning, in: Proceedings of the 36th International Conference on Machine Learning, PMLR. Presented at the International Conference on Machine Learning (ICML 2019), PMLR, pp. 2242\u20132251.\u00a0\u21a9\u21a9
-
Okhrati, R., Lipani, A., 2021. A Multilinear Sampling Algorithm to Estimate Shapley Values, in: 2020 25th International Conference on Pattern Recognition (ICPR). Presented at the 2020 25th International Conference on Pattern Recognition (ICPR), IEEE, pp. 7992\u20137999. https://doi.org/10.1109/ICPR48806.2021.9412511 \u21a9
-
Castro, J., G\u00f3mez, D., Tejada, J., 2009. Polynomial calculation of the Shapley value based on sampling. Computers & Operations Research, Selected papers presented at the Tenth International Symposium on Locational Decisions (ISOLDE X) 36, 1726\u20131730. https://doi.org/10.1016/j.cor.2008.04.004 \u21a9\u21a9
-
Jia, R., Dao, D., Wang, B., Hubis, F.A., Gurel, N.M., Li, B., Zhang, C., Spanos, C., Song, D., 2019. Efficient task-specific data valuation for nearest neighbor algorithms. Proc. VLDB Endow. 12, 1610\u20131623. https://doi.org/10.14778/3342263.3342637 \u21a9\u21a9
"},{"location":"value/the-core/","title":"Core values","text":"Shapley values define a fair way to distribute payoffs amongst all participants (training points) when they form a grand coalition, i.e. when the model is trained on the whole dataset. But they do not consider the question of stability: under which conditions do all participants in a game form the grand coalition? Are the payoffs distributed in such a way that prioritizes its formation?
The Core is another solution concept in cooperative game theory that attempts to ensure stability in the sense that it provides the set of feasible payoffs that cannot be improved upon by a sub-coalition. This can be interesting for some applications of data valuation because it yields values consistent with training on the whole dataset, avoiding the spurious selection of subsets.
It satisfies the following 2 properties:
-
Efficiency: The payoffs are distributed such that it is not possible to make any participant better off without making another one worse off. \\(\\sum_{i \\in D} v(i) = u(D).\\)
-
Coalitional rationality: The sum of payoffs to the agents in any coalition \\(S\\) is at least as large as the amount that these agents could earn by forming a coalition on their own. \\(\\sum_{i \\in S} v(i) \\geq u(S), \\forall S \\subset D.\\)
The Core was first introduced into data valuation by (Yan and Procaccia, 2021)1, in the following form.
"},{"location":"value/the-core/#least-core-values","title":"Least Core values","text":"Unfortunately, for many cooperative games the Core may be empty. By relaxing the coalitional rationality property by a subsidy \\(e \\gt 0\\), we are then able to find approximate payoffs:
\\[ \\sum_{i\\in S} v(i) + e \\geq u(S), \\forall S \\subset D, S \\neq \\emptyset \\ ,\\] The Least Core (LC) values \\(\\{v\\}\\) for utility \\(u\\) are computed by solving the following linear program:
\\[ \\begin{array}{lll} \\text{minimize} & e & \\\\ \\text{subject to} & \\sum_{i\\in D} v(i) = u(D) & \\\\ & \\sum_{i\\in S} v(i) + e \\geq u(S) &, \\forall S \\subset D, S \\neq \\emptyset \\\\ \\end{array} \\] Note that solving this program yields a set of solutions \\(\\{v_j:N \\rightarrow \\mathbb{R}\\}\\), whereas the Shapley value is a single function \\(v\\). In order to obtain a single valuation to use, one breaks ties by solving a quadratic program to select the \\(v\\) in the LC with the smallest \\(\\ell_2\\) norm. This is called the egalitarian least core.
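For very small problems the linear program above can be written down verbatim, e.g. with cvxpy. The following sketch is only an illustration (it is not pyDVL's implementation and omits the egalitarian tie-breaking step); u is assumed to map a tuple of indices to a utility value:
import itertools\n\nimport cvxpy as cp\n\ndef least_core_lp_sketch(n, u):\n    # Enumerate all non-empty proper subsets: exponential cost, illustration only.\n    v = cp.Variable(n)\n    e = cp.Variable()\n    constraints = [cp.sum(v) == u(tuple(range(n)))]\n    for k in range(1, n):\n        for S in itertools.combinations(range(n), k):\n            constraints.append(cp.sum(v[list(S)]) + e >= u(S))\n    cp.Problem(cp.Minimize(e), constraints).solve()\n    return v.value, e.value\n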
"},{"location":"value/the-core/#exact-least-core","title":"Exact Least Core","text":"This first algorithm is just a verbatim implementation of the definition, in compute_least_core_values. It computes all constraints for the linear problem by evaluating the utility on every subset of the training data, and returns as exact a value as the utility function allows (see what this means in Problems of Data Values).
from pydvl.value import compute_least_core_values\n\nvalues = compute_least_core_values(utility, mode=\"exact\")\n
"},{"location":"value/the-core/#monte-carlo-least-core","title":"Monte Carlo Least Core","text":"Because the number of subsets \\(S \\subseteq D \\setminus \\{i\\}\\) is \\(2^{ | D | - 1 }\\), one typically must resort to approximations.
The simplest one consists in using a fraction of all subsets for the constraints. (Yan and Procaccia, 2021)1 show that a quantity of order \\(\\mathcal{O}((n - \\log \\Delta ) / \\delta^2)\\) is enough to obtain a so-called \\(\\delta\\)-approximate least core with high probability. I.e. the following property holds with probability \\(1-\\Delta\\) over the choice of subsets:
\\[ \\mathbb{P}_{S\\sim D}\\left[\\sum_{i\\in S} v(i) + e^{*} \\geq u(S)\\right] \\geq 1 - \\delta, \\] where \\(e^{*}\\) is the optimal least core subsidy. This approximation is also implemented in compute_least_core_values:
from pydvl.value import compute_least_core_values\n\nvalues = compute_least_core_values(\n utility, mode=\"montecarlo\", n_iterations=n_iterations\n)\n
Note
Although any number is supported, it is best to choose n_iterations to be at least equal to the number of data points.
Because computing the Least Core values requires the solution of a linear and a quadratic problem after computing all the utility values, we offer the possibility of splitting the latter from the former. This is useful when running multiple experiments: use mclc_prepare_problem to prepare a list of problems to solve, then solve them in parallel with lc_solve_problems.
from pydvl.value.least_core import mclc_prepare_problem, lc_solve_problems\n\nn_experiments = 10\nproblems = [mclc_prepare_problem(utility, n_iterations=n_iterations)\n for _ in range(n_experiments)]\nvalues = lc_solve_problems(problems)\n
"},{"location":"value/the-core/#method-comparison","title":"Method comparison","text":"The TransferLab team reproduced the results of the original paper in a publication for the 2022 MLRC (Benmerzoug and Benito Delgado, 2023)2.
Best sample removal on binary image classification
Roughly speaking, MCLC performs better in identifying high value points, as measured by best-sample removal tasks. In all other aspects, it performs worse or similarly to TMCS at comparable sample budgets. But using an equal number of subsets is more computationally expensive because of the need to solve large linear and quadratic optimization problems.
Worst sample removal on binary image classification
For these reasons we recommend some variation of SV like TMCS for outlier detection, data cleaning and pruning, and perhaps MCLC for the selection of interesting points to be inspected for the improvement of data collection or model design.
-
Yan, T., Procaccia, A.D., 2021. If You Like Shapley Then You\u2019ll Love the Core, in: Proceedings of the 35th AAAI Conference on Artificial Intelligence, 2021. Presented at the AAAI Conference on Artificial Intelligence, Association for the Advancement of Artificial Intelligence, pp. 5751\u20135759. https://doi.org/10.1609/aaai.v35i6.16721 \u21a9\u21a9
-
Benmerzoug, A., Benito Delgado, M. de, 2023. [Re] If you like Shapley, then you\u2019ll love the core. ReScience C 9. https://doi.org/10.5281/zenodo.8173733 \u21a9
"}]}
\ No newline at end of file
+{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"The python library for data valuation","text":"pyDVL collects algorithms for data valuation and influence function computation. For the full list see Methods. It supports out-of-core and distributed computation, as well as local or distributed caching of results.
If you're a first time user of pyDVL, we recommend going through Getting started.
-
Getting started
Steps to install and requirements
-
Example gallery
Notebooks with worked-out examples of data valuation and influence functions
-
Data valuation
Basics of data valuation and description of the main algorithms
-
Influence Function
An introduction to the influence function and its computation with pyDVL
-
Supported methods
List of all methods implemented with references.
-
API Reference
Full documentation of the API
"},{"location":"CHANGELOG/","title":"Changelog","text":""},{"location":"CHANGELOG/#unreleased","title":"Unreleased","text":""},{"location":"CHANGELOG/#added","title":"Added","text":" - New method
InverseHarmonicMeanInfluence
, implementation for the paper DataInf: Efficiently Estimating Data Influence in LoRA-tuned LLMs and Diffusion Models
PR #582 - Add new backend implementations for influence computation to account for block-diagonal approximations PR #582
- Extend
DirectInfluence
with block-diagonal and Gauss-Newton approximation PR #591 - Extend
LissaInfluence
with block-diagonal and Gauss-Newton approximation PR #593 - Extend
NystroemSketchInfluence
with block-diagonal and Gauss-Newton approximation PR #596
"},{"location":"CHANGELOG/#fixed","title":"Fixed","text":" - Replace
np.float_
with np.float64
and np.alltrue
with np.all
, as the old aliases are removed in NumPy 2.0 PR #604
"},{"location":"CHANGELOG/#changed","title":"Changed","text":" - Breaking Changes
- Rename parameter
hessian_regularization
of DirectInfluence
to regularization
and change the type annotation to allow for block-wise regularization parameters PR #591 - Rename parameter
hessian_regularization
of LissaInfluence
to regularization
and change the type annotation to allow for block-wise regularization parameters PR #593 - Remove parameter
h0
from init of LissaInfluence
PR #593 - Rename parameter
hessian_regularization
of NystroemSketchInfluence
to regularization
and change the type annotation to allow for block-wise regularization parameters PR #596
"},{"location":"CHANGELOG/#092-bug-fixes-logging-improvement","title":"0.9.2 - \ud83c\udfd7 Bug fixes, logging improvement","text":""},{"location":"CHANGELOG/#added_1","title":"Added","text":" - Add progress bars to the computation of
LazyChunkSequence
and NestedLazyChunkSequence
PR #567 - Add a device fixture for
pytest
, which depending on the availability and user input (pytest --with-cuda
) resolves to cuda device PR #574
"},{"location":"CHANGELOG/#fixed_1","title":"Fixed","text":" - Fixed logging issue in decorator
log_duration
PR #567 - Fixed missing move of tensors to model device in
EkfacInfluence
implementation PR #570 - Missing move to device of
preconditioner
in CgInfluence
implementation PR #572 - Raise a more specific error message, when a
RunTimeError
occurs in torch.linalg.eigh
, so the user can check if it is related to a known issue PR #578 - Fix an edge case (empty train data) in the test
test_classwise_scorer_accuracies_manual_derivation
, which resulted in undefined behavior (np.nan
to int
conversion with different results depending on OS) PR #579
"},{"location":"CHANGELOG/#changed_1","title":"Changed","text":" - Changed logging behavior of iterative methods
LissaInfluence
and CgInfluence
to warn on not achieving desired tolerance within maxiter
, add parameter warn_on_max_iteration
to set the level for this information to logging.DEBUG
PR #567
"},{"location":"CHANGELOG/#091-bug-fixes-logging-improvement","title":"0.9.1 - Bug fixes, logging improvement","text":""},{"location":"CHANGELOG/#fixed_2","title":"Fixed","text":" FutureWarning
for ParallelConfig
constantly raised without actually instantiating the object PR #562
"},{"location":"CHANGELOG/#090-new-methods-better-docs-and-bugfixes","title":"0.9.0 - \ud83c\udd95 New methods, better docs and bugfixes \ud83d\udcda\ud83d\udc1e","text":""},{"location":"CHANGELOG/#added_2","title":"Added","text":" - New method
MSR Banzhaf
with accompanying notebook, and new stopping criterion RankCorrelation
PR #520 - New method:
NystroemSketchInfluence
PR #504 - New preconditioned block variant of conjugate gradient PR #507
- Improvements to documentation: fixes, links, text, example gallery, LFS and more PR #532, PR #543
- Glossary of data valuation and influence terms in the documentation PR #537
- Documentation about writing notes for new features, changes or deprecations PR #557
"},{"location":"CHANGELOG/#fixed_3","title":"Fixed","text":" - Bug in
LissaInfluence
, when not using CPU device PR #495 - Memory issue with
CgInfluence
and ArnoldiInfluence
PR #498 - Raising specific error message with install instruction, when trying to load
pydvl.utils.cache.memcached
without pymemcache
installed. If pymemcache
is available, all symbols from pydvl.utils.cache.memcached
are available through pydvl.utils.cache
PR #509
"},{"location":"CHANGELOG/#changed_2","title":"Changed","text":" - Add property
model_dtype
to instances of type TorchInfluenceFunctionModel
- Bump versions of CI actions to avoid warnings PR #502
- Add Python Version 3.11 to supported versions PR #510
- Documentation improvements and cleanup PR #521, PR #522
- Simplified parallel backend configuration PR #549
"},{"location":"CHANGELOG/#081-new-method-and-notebook-games-with-exact-shapley-values-bug-fixes-and-cleanup","title":"0.8.1 - \ud83c\udd95 \ud83c\udfd7 New method and notebook, Games with exact shapley values, bug fixes and cleanup","text":""},{"location":"CHANGELOG/#added_3","title":"Added","text":" - Implement new method:
EkfacInfluence
PR #451 - New notebook to showcase ekfac for LLMs PR #483
- Implemented exact games in Castro et al. 2009 and 2017 PR #341
"},{"location":"CHANGELOG/#fixed_4","title":"Fixed","text":" - Bug in using
DaskInfluenceCalcualator
with TorchnumpyConverter
for single dimensional arrays PR #485 - Fix implementations of
to
methods of TorchInfluenceFunctionModel
implementations PR #487 - Fixed bug with checking for converged values in semivalues PR #341
"},{"location":"CHANGELOG/#changed_3","title":"Changed","text":" - Add applications of data valuation section, display examples more prominently, make all sections visible in table of contents, use mkdocs material cards in the home page PR #492
"},{"location":"CHANGELOG/#080-new-interfaces-scaling-computation-bug-fixes-and-improvements","title":"0.8.0 - \ud83c\udd95 New interfaces, scaling computation, bug fixes and improvements \ud83c\udf81","text":""},{"location":"CHANGELOG/#added_4","title":"Added","text":" - New cache backends: InMemoryCacheBackend and DiskCacheBackend PR #458
- New influence function interface
InfluenceFunctionModel
- Data parallel computation with
DaskInfluenceCalculator
PR #26 - Sequential batch-wise computation and write to disk with
SequentialInfluenceCalculator
PR #377 - Adapt notebooks to new influence abstractions PR #430
"},{"location":"CHANGELOG/#changed_4","title":"Changed","text":" - Refactor and simplify caching implementation PR #458
- Simplify display of computation progress PR #466
- Improve readme and explain better the examples PR #465
- Simplify and improve tests, add CodeCov code coverage PR #429
- Breaking Changes
- Removed
compute_influences
and all related code. Replaced by new InfluenceFunctionModel
interface. Removed modules: - influence.general
- influence.inversion
- influence.twice_differentiable
- influence.torch.torch_differentiable
"},{"location":"CHANGELOG/#fixed_5","title":"Fixed","text":" - Import bug in README PR #457
"},{"location":"CHANGELOG/#071-new-methods-bug-fixes-and-improvements-for-local-tests","title":"0.7.1 - \ud83c\udd95 New methods, bug fixes and improvements for local tests \ud83d\udc1e\ud83e\uddea","text":""},{"location":"CHANGELOG/#added_5","title":"Added","text":" - New method: Class-wise Shapley values PR #338
- New method: Data-OOB by @BastienZim PR #426, PR #431 - Added
- Added
AntitheticPermutationSampler
PR #439 - Faster semi-value computation with per-index check of stopping criteria (optional) PR #437
"},{"location":"CHANGELOG/#fixed_6","title":"Fixed","text":" - Fix initialization of
data_names
in ValuationResult.zeros()
PR #443
"},{"location":"CHANGELOG/#changed_5","title":"Changed","text":" - No longer using docker within tests to start a memcached server PR #444
- Using pytest-xdist for faster local tests PR #440
- Improvements and fixes to notebooks PR #436
- Refactoring of parallel module. Old imports will stop working in v0.9.0 PR #421
"},{"location":"CHANGELOG/#070-documentation-and-if-overhaul-new-methods-and-bug-fixes","title":"0.7.0 - \ud83d\udcda\ud83c\udd95 Documentation and IF overhaul, new methods and bug fixes \ud83d\udca5\ud83d\udc1e","text":"This is our first \u03b2 release! We have worked hard to deliver improvements across the board, with a focus on documentation and usability. We have also reworked the internals of the influence
module, improved parallelism and handling of randomness.
"},{"location":"CHANGELOG/#added_6","title":"Added","text":" - Implemented solving the Hessian equation via spectral low-rank approximation PR #365
- Enabled parallel computation for Leave-One-Out values PR #406
- Added more abbreviations to documentation PR #415
- Added seed to functions from
pydvl.utils.numeric
, pydvl.value.shapley
and pydvl.value.semivalues
. Introduced new type Seed
and conversion function ensure_seed_sequence
. PR #396 - Added
batch_size
parameter to compute_banzhaf_semivalues
, compute_beta_shapley_semivalues
, compute_shapley_semivalues
and compute_generic_semivalues
. PR #428 - Added classwise Shapley as proposed by (Schoch et al. 2021) [https://arxiv.org/abs/2211.06800] PR #338
"},{"location":"CHANGELOG/#changed_6","title":"Changed","text":" - Replaced sphinx with mkdocs for documentation. Major overhaul of documentation PR #352
- Made ray an optional dependency, relying on joblib as default parallel backend PR #408
- Decoupled
ray.init
from ParallelConfig
PR #373 - Breaking Changes
- Signature change: return information about Hessian inversion from
compute_influence_factors
PR #375 - Major changes to IF interface and functionality. Foundation for a framework abstraction for IF computation. PR #278 PR #394
- Renamed
semivalues
to compute_generic_semivalues
PR #413 - New
joblib
backend as default instead of ray. Simplify MapReduceJob. PR #355 - Bump torch dependency for influence package to 2.0 PR #365
"},{"location":"CHANGELOG/#fixed_7","title":"Fixed","text":" - Fixes to parallel computation of generic semi-values: properly handle all samplers and stopping criteria, irrespective of parallel backend. PR #372
- Optimises memory usage in IF calculation PR #375
- Fix adding valuation results with overlapping indices and different lengths PR #370
- Fixed bugs in conjugate gradient and
linear_solve
PR #358 - Fix installation of dev requirements for Python3.10 PR #382
- Improvements to IF documentation PR #371
"},{"location":"CHANGELOG/#061-bug-fixes-and-small-improvements","title":"0.6.1 - \ud83c\udfd7 Bug fixes and small improvements","text":" - Fix parsing keyword arguments of
compute_semivalues
dispatch function PR #333 - Create new
RayExecutor
class based on the concurrent.futures API, use the new class to fix an issue with Truncated Monte Carlo Shapley (TMCS) starting too many processes and dying, plus other small changes PR #329 - Fix creation of GroupedDataset objects using the
from_arrays
and from_sklearn
class methods PR #324 - Fix release job not triggering on CI when a new tag is pushed PR #331
- Added alias
ApproShapley
from Castro et al. 2009 for permutation Shapley PR #332
"},{"location":"CHANGELOG/#060-new-algorithms-cleanup-and-bug-fixes","title":"0.6.0 - \ud83c\udd95 New algorithms, cleanup and bug fixes \ud83c\udfd7","text":" - Fixes in
ValuationResult
: bugs around data names, semantics of empty()
, new method zeros()
and normalised random values PR #327 - New method: Implements generalised semi-values for data valuation, including Data Banzhaf and Beta Shapley, with configurable sampling strategies PR #319
- Adds kwargs parameter to
from_array
and from_sklearn
Dataset and GroupedDataset class methods PR #316 - PEP-561 conformance: added
py.typed
PR #307 - Removed default non-negativity constraint on least core subsidy and added instead a
non_negative_subsidy
boolean flag. Renamed options
to solver_options
and pass it as dict. Change default least-core solver to SCS with 10000 max_iters. PR #304 - Cleanup: removed unnecessary decorator
@unpackable
PR #233 - Stopping criteria: fixed problem with
StandardError
and enable proper composition of index convergence statuses. Fixed a bug with n_jobs
in truncated_montecarlo_shapley
. PR #300 and PR #305 - Shuffling code around to allow for simpler user imports, some cleanup and documentation fixes. PR #284
- Bug fix: Warn instead of raising an error when
n_iterations
is less than the size of the dataset in Monte Carlo Least Core PR #281
"},{"location":"CHANGELOG/#050-fixes-nicer-interfaces-and-more-breaking-changes","title":"0.5.0 - \ud83d\udca5 Fixes, nicer interfaces and... more breaking changes \ud83d\ude12","text":" - Fixed parallel and antithetic Owen sampling for Shapley values. Simplified and extended tests. PR #267
- Added
Scorer
class for a cleaner interface. Fixed minor bugs around Group-Testing Shapley, added more tests and switched to cvxpy for the solver. PR #264 - Generalised stopping criteria for valuation algorithms. Improved classes
ValuationResult
and Status
with more operations. Some minor issues fixed. PR #252 - Fixed a bug whereby
compute_shapley_values
would only spawn one process when using n_jobs=-1
and Monte Carlo methods. PR #270 - Bugfix in
RayParallelBackend
: wrong semantics for kwargs
. PR #268 - Splitting of problem preparation and solution in Least-Core computation. Umbrella function for LC methods. PR #257
- Operations on
ValuationResult
and Status
and some cleanup PR #248 - Bug fix and minor improvements: Fixes bug in TMCS with remote Ray cluster, raises an error for dummy sequential parallel backend with TMCS, clones model inside
Utility
before fitting by default, with flag clone_before_fit
to disable it, catches all warnings in Utility
when show_warnings
is False
. Adds Miner and Gloves toy games utilities PR #247
"},{"location":"CHANGELOG/#040-new-algorithms-and-more-breaking-changes","title":"0.4.0 - \ud83c\udfed\ud83d\udca5 New algorithms and more breaking changes","text":" - GH action to mark issues as stale PR #201
- Disabled caching of Utility values as well as repeated evaluations by default PR #211
- Test and officially support Python version 3.9 and 3.10 PR #208
- Breaking change: Introduces a class ValuationResult to gather and inspect results from all valuation algorithms PR #214
- Fixes bug in Influence calculation with multidimensional input and adds new example notebook PR #195
- Breaking change: Passes the input to
MapReduceJob
at initialization, removes chunkify_inputs
argument from MapReduceJob
, removes n_runs
argument from MapReduceJob
, calls the parallel backend's put()
method for each generated chunk in _chunkify()
, renames ParallelConfig's num_workers
attribute to n_local_workers
, fixes a bug in MapReduceJob
's chunkification when n_runs
>= n_jobs
, and defines a sequential parallel backend to run all jobs in the current thread PR #232 - New method: Implements exact and monte carlo Least Core for data valuation, adds
from_arrays()
class method to the Dataset
and GroupedDataset
classes, adds extra_values
argument to ValuationResult
, adds compute_removal_score()
and compute_random_removal_score()
helper functions PR #237 - New method: Group Testing Shapley for valuation, from Jia et al. 2019 PR #240
- Fixes bug in ray initialization in
RayParallelBackend
class PR #239 - Implements \"Egalitarian Least Core\", adds cvxpy as a dependency and uses it instead of scipy as optimizer PR #243
"},{"location":"CHANGELOG/#030-breaking-changes","title":"0.3.0 - \ud83d\udca5 Breaking changes","text":" - Simplified and fixed powerset sampling and testing PR #181
- Simplified and fixed publishing to PyPI from CI PR #183
- Fixed bug in release script and updated contributing docs. PR #184
- Added Pull Request template PR #185
- Modified Pull Request template to automatically link PR to issue PR #186
- First implementation of Owen Sampling, squashed scores, better testing PR #194
- Improved documentation on caching, Shapley, caveats of values, bibtex PR #194
- Breaking change: Rearranging of modules to accommodate for new methods PR #194
"},{"location":"CHANGELOG/#020-better-docs","title":"0.2.0 - \ud83d\udcda Better docs","text":"Mostly API documentation and notebooks, plus some bugfixes.
"},{"location":"CHANGELOG/#added_7","title":"Added","text":"In PR #161: - Support for $$ math in sphinx docs. - Usage of sphinx extension for external links (introducing new directives like :gh:
, :issue:
and :tfl:
to construct standardised links to external resources). - Only update auto-generated documentation files if there are changes. Some minor additions to update_docs.py
. - Parallelization of exact combinatorial Shapley. - Integrated KNN shapley into the main interface compute_shapley_values
.
"},{"location":"CHANGELOG/#changed_7","title":"Changed","text":"In PR #161: - Improved main docs and Shapley notebooks. Added or fixed many docstrings, readme and documentation for contributors. Typos, grammar and style in code, documentation and notebooks. - Internal renaming and rearranging in the parallelization and caching modules.
"},{"location":"CHANGELOG/#fixed_8","title":"Fixed","text":" - Bug in random matrix generation PR #161.
- Bugs in MapReduceJob's
_chunkify
and _backpressure
methods PR #176.
"},{"location":"CHANGELOG/#010-first-release","title":"0.1.0 - \ud83c\udf89 first release","text":"This is very first release of pyDVL.
It contains:
-
Data Valuation Methods:
-
Leave-One-Out
- Influence Functions
- Shapley:
- Exact Permutation and Combinatorial
- Montecarlo Permutation and Combinatorial
- Truncated Montecarlo Permutation
- Caching of results with Memcached
- Parallelization of computations with Ray
- Documentation
- Notebooks containing examples of different use cases
"},{"location":"CONTRIBUTING/","title":"Contributing to pyDVL","text":"The goal of pyDVL is to be a repository of successful algorithms for the valuation of data, in a broader sense. Contributions are welcome from anyone in the form of pull requests, bug reports and feature requests.
We will consider for inclusion any (tested) implementation of an algorithm appearing in a peer-reviewed journal (even if the method does not improve the state of the art, for benchmarking and comparison purposes). We are also open to improvements to the currently implemented methods and other ideas. Please open a ticket with yours.
If you are interested in setting up a similar project, consider the template pymetrius.
"},{"location":"CONTRIBUTING/#local-development","title":"Local development","text":"This project uses black to format code and pre-commit to invoke it as a git pre-commit hook. Consider installing any of black's IDE integrations to make your life easier.
Run the following to set up the pre-commit git hook to run before pushes:
pre-commit install --hook-type pre-push\n
Additionally, we use Git LFS for some files like images. Install with
git lfs install\n
"},{"location":"CONTRIBUTING/#setting-up-your-environment","title":"Setting up your environment","text":"We strongly suggest using some form of virtual environment for working with the library. E.g. with venv:
python -m venv ./venv\n. venv/bin/activate # `venv\\Scripts\\activate` in windows\npip install -r requirements-dev.txt -r requirements-docs.txt\n
With conda:
conda create -n pydvl python=3.8\nconda activate pydvl\npip install -r requirements-dev.txt -r requirements-docs.txt\n
A very convenient way of working with your library during development is to install it in editable mode into your environment by running
pip install -e .\n
In order to build the documentation locally (which is done as part of the tox suite) you need to install additional non-python dependencies as described in the documentation of mkdocs-material.
In addition, pandoc is required. Except for OSX, it should be installed automatically as a dependency with requirements-docs.txt
. Under OSX you can install pandoc (you'll need at least version 2.11) with:
brew install pandoc\n
Remember to mark all autogenerated directories as excluded in your IDE. In particular docs_build
and .tox
should be marked as excluded to avoid slowdowns when searching or refactoring code.
If you use remote execution, don't forget to exclude data paths from deployment (unless you really want to sync them).
"},{"location":"CONTRIBUTING/#testing","title":"Testing","text":"Automated builds, tests, generation of documentation and publishing are handled by CI pipelines. Before pushing your changes to the remote we recommend to execute tox
locally in order to detect mistakes early on and to avoid failing pipelines. tox will: * run the test suite * build the documentation * build and test installation of the package. * generate coverage and pylint reports in html, as well as badges.
You can configure pytest, coverage and pylint by adjusting pyproject.toml.
Besides the usual unit tests, most algorithms are tested using pytest. This requires ray for the parallelization and Memcached for caching. Please install both before running the tests. We run tests in CI as well.
It is possible to pass optional command line arguments to pytest, for example to run only certain tests using patterns (-k) or markers (-m).
tox -e tests -- <optional arguments>\n
There are a few important arguments:
--memcached-service allows changing the default of localhost:11211 (memcached's default) to a different address.
Memcached is needed for testing caching as well as for speeding up certain methods (e.g. Permutation Shapley).
To start memcached locally in the background with Docker use:
docker run --name pydvl-memcache -p 11211:11211 -d memcached\n
-n sets the number of parallel workers for pytest-xdist. There are two layers of parallelization in the tests: an inner one within the tests themselves, i.e. the parallelism in the algorithms, and an outer one by pytest-xdist. The latter is controlled by the -n argument. If you experience segmentation faults with the tests, try running them with -n 0 to disable parallelization.
-
--slow-tests
enables running slow tests. See below for a description of slow tests.
-
--with-cuda
sets the device fixture in tests/influence/torch/conftest.py to cuda
if it is available. Using this fixture within tests, you can run parts of your tests on a cuda
device. Be aware, that you still have to take care of the usage of the device manually in a specific test. Setting this flag does not result in running all tests on a GPU.
"},{"location":"CONTRIBUTING/#markers","title":"Markers","text":"We use a few different markers to differentiate between tests and runs groups of them of separately. Use pytest --markers
to get a list and description of all available markers.
Two important markers are:
pytest.mark.slow
which is used to mark slow tests and skip them by default.
A slow test is any test that takes 45 seconds or more to run and that can be skipped most of the time. In some cases a test is slow, but it is required in order to ensure that a feature works as expected and that there are no bugs. In those cases, we should not use this marker.
Slow tests are always run on CI. Locally, they are skipped by default but can be additionally run using: pytest --slow-tests
.
pytest.mark.torch
which is used to mark tests that require PyTorch.
To test modules that rely on PyTorch, use:
tox -e tests -- -m \"torch\"\n
"},{"location":"CONTRIBUTING/#other-things","title":"Other Things","text":"To test the notebooks separately, run (see below for details):
tox -e notebook-tests\n
To create a package locally, run:
python setup.py sdist bdist_wheel\n
"},{"location":"CONTRIBUTING/#notebooks","title":"Notebooks","text":"We use notebooks both as documentation (copied over to docs/examples
) and as integration tests. All notebooks in the notebooks
directory are executed during the test run. Because run times are typically too long for large datasets, you must check for the CI
environment variable to work with smaller ones. For example, you can select a subset of the data:
# In CI we only use a subset of the training set\nif os.environ.get('CI'):\n training_data = training_data[:10]\n
This switching should happen in a separate notebook cell tagged with hide
to hide the cell's input and output when rendering it as part of the documents. We want to avoid as much clutter and boilerplate as possible in the notebooks themselves.
Because we want documentation to include the full dataset, we commit notebooks with their outputs running with full datasets to the repo. The notebooks are then added by CI to the section Examples of the documentation.
"},{"location":"CONTRIBUTING/#hiding-cells-in-notebooks","title":"Hiding cells in notebooks","text":"Switching between CI or not, importing generic modules and plotting results are all examples of boilerplate code irrelevant to a reader interested in pyDVL's functionality. For this reason we choose to isolate this code into separate cells which are then hidden in the documentation.
In order to do this, cells are marked with tags understood by the mkdocs plugin mkdocs-jupyter
, namely adding the following to the metadata of the relevant cells:
\"tags\": [\n \"hide\"\n]\n
To hide the cell's input and output.
Or:
\"tags\": [\n \"hide-input\"\n]\n
To only hide the input and
\"tags\": [\n \"hide-output\"\n]\n
for hiding the output only. It is important to leave a warning at the top of the document to avoid confusion. Examples for hidden imports and plots are available in the notebooks, e.g. in notebooks/shapley_basic_spotify.ipynb.
"},{"location":"CONTRIBUTING/#plots-in-notebooks","title":"Plots in Notebooks","text":"If you add a plot to a notebook, which should also render nicely in browser dark mode, add the tag invertible-output, i.e.
\"tags\": [\n \"invertible-output\"\n]\n
This applies a simple CSS-filter to the output image of the cell."},{"location":"CONTRIBUTING/#documentation","title":"Documentation","text":"API documentation and examples from notebooks are built with mkdocs, using a number of plugins, including mkdoctrings, with versioning handled by mike.
Notebooks are an integral part of the documentation as well, please read the section on notebooks above.
If you want to build the documentation locally, please make sure you followed the instructions in the section Setting up your environment.
Use the following command to build the documentation the same way it is done in CI:
mkdocs build\n
Locally, you can use this command instead to continuously rebuild documentation on changes to the docs
and src
folder:
mkdocs serve\n
This will rebuild the documentation on changes to .md
files inside docs
, notebooks and python files.
On OSX, it is possible that the cairo lib file is not properly linked when installed via homebrew. In this case you might encounter an error like this
OSError: no library called \"cairo-2\" was found\nno library called \"cairo\" was found\nno library called \"libcairo-2\" was found\n
when calling mkdocs build
or mkdocs serve
. This can be resolved via setting the environment variable DYLD_FALLBACK_LIBRARY_PATH
: export DYLD_FALLBACK_LIBRARY_PATH=$DYLD_FALLBACK_LIBRARY_PATH:/opt/homebrew/lib\n
"},{"location":"CONTRIBUTING/#adding-new-pages","title":"Adding new pages","text":"Navigation is configured in mkdocs.yaml
using the nav section. We use the plugin mkdoc-literate-nav which allows fine-grained control of the navigation structure. However, most pages are explicitly listed and manually arranged in the nav
section of the configuration.
"},{"location":"CONTRIBUTING/#creating-stable-references-for-autorefs","title":"Creating stable references for autorefs","text":"mkdocstrings includes the plugin autorefs to enable automatic linking across pages with e.g. [a link][to-something]
. Anchors are autogenerated from section titles, and are not guaranteed to be unique. In order to ensure that a link will remain valid, add a custom anchor to the section title:
## Some section { #permanent-anchor-to-some-section }\n
(note the space after the opening brace). You can then refer to it within another markdown file with [Some section][permanent-anchor-to-some-section]
.
"},{"location":"CONTRIBUTING/#adding-notes-about-new-features-changes-or-deprecations","title":"Adding notes about new features, changes or deprecations","text":"We use the admonition extension of Mkdocs Material to create admonitions, also known as call-outs, that hold information about when a certain feature was added, changed or deprecated and optionally a description with more details. We put the admonition directly in a module's, a function's or class' docstring.
We use the following syntax:
!!! tip \"<Event Type> in version <Version Number>\"\n\n <Optional Description>\n
The description is useful when the note is about a smaller change such as a parameter.
- For a new feature, we use:
!!! tip \"New in version <Version Number>\"\n\n <Optional Description>\n
- For a change to an existing feature we use:
!!! tip \"Changed in version <Version Number>\"\n\n <Optional Description>\n
For example, for a change in version 1.2.3
that adds kwargs to a class' constructor we would write:
!!! tip \"Changed in version 1.2.3\"\n\n Added kwargs to the constructor.\n
- For a deprecation we use:
!!! tip \"Deprecated in version <Version Number>\"\n\n <Optional Description>\n
"},{"location":"CONTRIBUTING/#using-bibliography","title":"Using bibliography","text":"Bibliographic citations are managed with the plugin mkdocs-bibtex. To enter a citation first add the entry to docs/pydvl.bib
. For team contributors, this should be an export of the Zotero folder software/pydvl
in the TransferLab Zotero library. All other contributors just add the bibtex data, and a maintainer will add it to the group library upon merging.
To add a citation inside a markdown file, use the notation [@citekey]
. Alas, because of the stage at which mkdocs-bibtex enters the pipeline, it won't process docstrings. For module documentation, we manually inject HTML into the markdown files. For example, in pydvl.value.shapley.montecarlo
we have:
\"\"\"\nModule docstring...\n\n## References\n\n[^1]: <a name=\"ghorbani_data_2019\"></a>Ghorbani, A., Zou, J., 2019.\n [Data Shapley: Equitable Valuation of Data for Machine\n Learning](https://proceedings.mlr.press/v97/ghorbani19c.html).\n In: Proceedings of the 36th International Conference on Machine Learning,\n PMLR, pp. 2242\u20132251.\n\"\"\"\n
and then later in the file, inside a function's docstring:
This function implements (Ghorbani and Zou, 2019)<sup><a \n href=\"#ghorbani_data_2019\">1</a></sup>\n
"},{"location":"CONTRIBUTING/#writing-mathematics","title":"Writing mathematics","text":"Use LaTeX delimiters $
and $$
for inline and displayed mathematics respectively.
Warning: backslashes must be escaped in docstrings! (although there are exceptions). For simplicity, declare the string as \"raw\" with the prefix r
:
# This will work\ndef f(x: float) -> float:\n r\"\"\" Computes \n $${ f(x) = \\frac{1}{x^2} }$$\n \"\"\"\n return 1/(x*x)\n\n# This throws an obscure error\ndef f(x: float) -> float:\n \"\"\" Computes \n $$\\frac{1}{x^2}$$\n \"\"\"\n return 1/(x*x)\n
Note how there is no space after the dollar signs. This is important! You can use braces for legibility like in the first example.
"},{"location":"CONTRIBUTING/#abbreviations","title":"Abbreviations","text":"We keep the abbreviations used in the documentation inside the docs_include/abbreviations.md file.
The syntax for abbreviations is:
*[ABBR]: Abbreviation\n
"},{"location":"CONTRIBUTING/#ci","title":"CI","text":"We use workflows to:
- Run the tests.
- Publish documentation.
- Publish packages to TestPyPI / PyPI.
- Mark issues as stale after 30 days. We do this only for issues with the label
awaiting-reply
which indicates that we have answered a question / feature request / PR and are waiting for the OP to reply / update their work.
"},{"location":"CONTRIBUTING/#tests","title":"Tests","text":"We test all algorithms with simple datasets in CI jobs. This can amount to a sizeable amount of time, so care must be taken not to overdo it: 1. All algorithm tests must be on very simple datasets and as quick as possible 2. We try not to trigger CI pipelines when unnecessary (see Skipping CI runs). 3. We split the tests based on their duration into groups and run them in parallel.
For that we use pytest-split to first store the duration of all tests with tox -e tests -- --store-durations --slow-tests
in a .test_durations
file.
Alternatively, we can use pytest directly: pytest --store-durations --slow-tests
.
Note This does not have to be done each time a new test or test case is added. For new tests and test cases, pytest-split assumes the average test execution time (calculated from the stored information) for every test which does not have duration information stored. Thus, there's no need to store durations after every change to the test suite. However, when there are major changes in the suite compared to what's stored in .test_durations, it's recommended to update the duration information with --store-durations to keep the splits balanced.
Then we can have as many splits as we want:
tox -e tests -- --splits 3 --group 1\ntox -e tests -- --splits 3 --group 2\ntox -e tests -- --splits 3 --group 3\n
Alternatively, we can use pytest directly: pytest --splits 3 --group 1
.
Each one of these commands should be run in a separate shell/job to run the test groups in parallel and decrease the total runtime.
"},{"location":"CONTRIBUTING/#running-github-actions-locally","title":"Running Github Actions locally","text":"To run Github Actions locally we use act. It uses the workflows defined in .github/workflows
and determines the set of actions that need to be run. It uses the Docker API to either pull or build the necessary images, as defined in our workflow files, and finally determines the execution path based on the dependencies that were defined.
Once it has the execution path, it then uses the Docker API to run containers for each action based on the images prepared earlier. The environment variables and filesystem are all configured to match what GitHub provides.
You can install it manually using:
curl -s https://raw.githubusercontent.com/nektos/act/master/install.sh | sudo bash -s -- -d -b ~/bin \n
And then simply add it to your PATH variable: PATH=~/bin:$PATH
Refer to its official readme for more installation options.
"},{"location":"CONTRIBUTING/#act-cheatsheet","title":"act cheatsheet","text":"By default, act
will run all workflows in .github/workflows
. You can use the -W
flag to specify a specific workflow file to run, or you can rely on the job id to be unique (but then you'll see warnings for the workflows without that job id).
# Run only the main tests for python 3.8 after a push event (implicit) \nact -W .github/workflows/run-tests-workflow.yaml \\\n -j run-tests \\\n --input tests_to_run=base\\\n --input python_version=3.8\n
Other common flags are:
# List all actions for all events:\nact -l\n\n# List the actions for a specific event:\nact workflow_dispatch -l\n\n# List the actions for a specific job:\nact -j lint -l\n\n# Run the default (`push`) event:\nact\n\n# Run a specific event:\nact pull_request\n\n# Run a specific job:\nact -j lint\n\n# Collect artifacts to the /tmp/artifacts folder:\nact --artifact-server-path /tmp/artifacts\n\n# Run a job in a specific workflow (useful if you have duplicate job names)\nact -j lint -W .github/workflows/tox.yml\n\n# Run in dry-run mode:\nact -n\n\n# Enable verbose-logging (can be used with any of the above commands)\nact -v\n
"},{"location":"CONTRIBUTING/#example","title":"Example","text":"To run the publish
job (the most difficult one to test) you would simply use:
- When triggered by a release:
act release -j publish --eventpath events.json\n
With events.json
containing:
{\n \"act\": true\n}\n
This will use your current branch. If you want to test a specific branch you have to use the workflow_dispatch
event (see below).
- To run it as if it had been manually triggered (i.e.
workflow_dispatch
) you would instead use:
act workflow_dispatch -j publish --eventpath events.json\n
With events.json
containing:
{\n \"act\": true,\n \"inputs\": {\n \"tag_name\": \"v0.6.0\"\n }\n}\n
"},{"location":"CONTRIBUTING/#skipping-ci-runs","title":"Skipping CI runs","text":"One sometimes would like to skip CI for certain commits (e.g. updating the readme). In order to do this, simply prefix the commit message with [skip ci]
. The string can be anywhere, but adding it to the beginning of the commit message makes it more evident when looking at commits in a PR.
Refer to the official GitHub documentation for more information.
"},{"location":"CONTRIBUTING/#release-processes","title":"Release processes","text":""},{"location":"CONTRIBUTING/#automatic-release-process","title":"Automatic release process","text":"In order to create an automatic release, a few prerequisites need to be satisfied:
- The project's virtualenv needs to be active
- The repository needs to be on the
develop
branch - The repository must be clean (including no untracked files)
Then, a new release can be created using the script build_scripts/release-version.sh
(leave out the version parameter to have bumpversion
automatically derive the next release version by bumping the patch part):
build_scripts/release-version.sh 0.1.6\n
To find out how to use the script, pass the -h
or --help
flags:
build_scripts/release-version.sh --help\n
If running in interactive mode (without -y|--yes
), the script will output a summary of pending changes and ask for confirmation before executing the actions.
Once this is done, a tag will be created on the repository. You should then create a GitHub release for that tag. That will trigger a CI pipeline that automatically creates a package and publishes it from CI to PyPI.
"},{"location":"CONTRIBUTING/#manual-release-process","title":"Manual release process","text":"If the automatic release process doesn't cover your use case, you can also create a new release manually by following these steps:
- (Repeat as needed) implement features on feature branches merged into
develop
. Each merge into develop will publish a new pre-release version to TestPyPI. These versions can be installed using pip install --pre --index-url https://test.pypi.org/simple/
. - When ready to release: From the develop branch create the release branch and perform release activities (update changelog, news, ...). For your own convenience, define an env variable for the release version
export RELEASE_VERSION=\"vX.Y.Z\"\ngit checkout develop\ngit branch release/${RELEASE_VERSION} && git checkout release/${RELEASE_VERSION}\n
- Run
bumpversion --commit release
if the release is only a patch release, otherwise the full version can be specified using bumpversion --commit --new-version X.Y.Z release
(the release
part is ignored but required by bumpversion). - Merge the release branch into
master
, tag the merge commit, and push back to the repo. The CI pipeline publishes the package based on the tagged commit. git checkout master\ngit merge --no-ff release/${RELEASE_VERSION}\ngit tag -a ${RELEASE_VERSION} -m\"Release ${RELEASE_VERSION}\"\ngit push --follow-tags origin master\n
- Switch back to the release branch
release/vX.Y.Z
and pre-bump the version: bumpversion --commit patch
. This ensures that develop
pre-releases are always strictly more recent than the last published release version from master
. - Merge the release branch into
develop
: git checkout develop\ngit merge --no-ff release/${RELEASE_VERSION}\ngit push origin develop\n
- Delete the release branch if necessary:
git branch -d release/${RELEASE_VERSION}
- Create a GitHub release for the created tag.
- Pour yourself a cup of coffee, you earned it!
- A package will be automatically created and published from CI to PyPI.
"},{"location":"CONTRIBUTING/#ci-and-requirements-for-publishing","title":"CI and requirements for publishing","text":"In order to publish new versions of the package from the development branch, the CI pipeline requires the following secret variables set up:
TEST_PYPI_USERNAME\nTEST_PYPI_PASSWORD\nPYPI_USERNAME\nPYPI_PASSWORD\n
The first two are used after tests run in the develop branch's CI workflow to automatically publish packages to TestPyPI.
The last two are used in the publish.yaml CI workflow to publish packages to PyPI from develop
after a GitHub release.
"},{"location":"CONTRIBUTING/#publish-to-testpypi","title":"Publish to TestPyPI","text":"We use bump2version to bump the build part of the version number without commiting or tagging the change and then publish a package to TestPyPI from CI using Twine. The version has the GitHub run number appended.
For more details refer to the files .github/workflows/publish.yaml and .github/workflows/tox.yaml.
"},{"location":"api/pydvl/","title":"API Reference","text":""},{"location":"api/pydvl/#pydvl","title":"pydvl","text":""},{"location":"api/pydvl/#pydvl--the-python-data-valuation-library-api","title":"The Python Data Valuation Library API","text":"This is the API documentation for the Python Data Valuation Library (PyDVL). Use the table of contents to access the documentation for each module.
The two main modules you will want to look at are value and influence.
"},{"location":"api/pydvl/influence/","title":"Influence","text":""},{"location":"api/pydvl/influence/#pydvl.influence","title":"pydvl.influence","text":"This package contains algorithms for the computation of the influence function.
See The Influence function for an introduction to the concepts and methods implemented here.
Warning
Much of the code in this package is experimental or untested and is subject to modification. In particular, the package structure and basic API will probably change.
"},{"location":"api/pydvl/influence/array/","title":"Array","text":""},{"location":"api/pydvl/influence/array/#pydvl.influence.array","title":"pydvl.influence.array","text":"This module provides classes and utilities for handling large arrays that are chunked and lazily evaluated. It includes abstract base classes for converting between tensor types and NumPy arrays, aggregating blocks of data, and abstract representations of lazy arrays. Concrete implementations are provided for handling chunked lazy arrays (chunked in one resp. two dimensions), with support for efficient storage and retrieval using the Zarr library.
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NumpyConverter","title":"NumpyConverter","text":" Bases: Generic[TensorType]
, ABC
Base class for converting TensorType objects into numpy arrays and vice versa.
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NumpyConverter.to_numpy","title":"to_numpy abstractmethod
","text":"to_numpy(x: TensorType) -> NDArray\n
Override this method for converting a TensorType object into a numpy array
Source code in src/pydvl/influence/array.py
@abstractmethod\ndef to_numpy(self, x: TensorType) -> NDArray:\n \"\"\"Override this method for converting a TensorType object into a numpy array\"\"\"\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NumpyConverter.from_numpy","title":"from_numpy abstractmethod
","text":"from_numpy(x: NDArray) -> TensorType\n
Override this method for converting a numpy array into a TensorType object
Source code in src/pydvl/influence/array.py
@abstractmethod\ndef from_numpy(self, x: NDArray) -> TensorType:\n \"\"\"Override this method for converting a numpy array into a TensorType object\"\"\"\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.SequenceAggregator","title":"SequenceAggregator","text":" Bases: Generic[TensorType]
, ABC
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.SequenceAggregator.__call__","title":"__call__ abstractmethod
","text":"__call__(tensor_sequence: LazyChunkSequence)\n
Aggregates tensors from a sequence.
Implement this method to define how a sequence of tensors, provided by a generator, should be combined.
Source code in src/pydvl/influence/array.py
@abstractmethod\ndef __call__(\n self,\n tensor_sequence: LazyChunkSequence,\n):\n \"\"\"\n Aggregates tensors from a sequence.\n\n Implement this method to define how a sequence of tensors, provided by a\n generator, should be combined.\n \"\"\"\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.ListAggregator","title":"ListAggregator","text":" Bases: SequenceAggregator
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.ListAggregator.__call__","title":"__call__","text":"__call__(tensor_sequence: LazyChunkSequence) -> List[TensorType]\n
Aggregates tensors from a single-level generator into a list. This method simply collects each tensor emitted by the generator into a single list.
PARAMETER DESCRIPTION tensor_sequence
Object wrapping a generator that yields TensorType
objects.
TYPE: LazyChunkSequence
RETURNS DESCRIPTION List[TensorType]
A list containing all the tensors provided by the tensor_generator.
Source code in src/pydvl/influence/array.py
def __call__(\n self,\n tensor_sequence: LazyChunkSequence,\n) -> List[TensorType]:\n \"\"\"\n Aggregates tensors from a single-level generator into a list. This method simply\n collects each tensor emitted by the generator into a single list.\n\n Args:\n tensor_sequence: Object wrapping a generator that yields `TensorType`\n objects.\n\n Returns:\n A list containing all the tensors provided by the tensor_generator.\n \"\"\"\n\n gen = cast(Iterator[TensorType], tensor_sequence.generator_factory())\n\n if tensor_sequence.len_generator is not None:\n gen = cast(\n Iterator[TensorType],\n tqdm(gen, total=tensor_sequence.len_generator, desc=\"Blocks\"),\n )\n\n return [t for t in gen]\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NestedSequenceAggregator","title":"NestedSequenceAggregator","text":" Bases: Generic[TensorType]
, ABC
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NestedSequenceAggregator.__call__","title":"__call__ abstractmethod
","text":"__call__(nested_sequence_of_tensors: NestedLazyChunkSequence)\n
Aggregates tensors from a nested sequence of tensors.
Implement this method to specify how tensors, nested in two layers of generators, should be combined. Useful for complex data structures where tensors are not directly accessible in a flat list.
Source code in src/pydvl/influence/array.py
@abstractmethod\ndef __call__(self, nested_sequence_of_tensors: NestedLazyChunkSequence):\n \"\"\"\n Aggregates tensors from a nested sequence of tensors.\n\n Implement this method to specify how tensors, nested in two layers of\n generators, should be combined. Useful for complex data structures where tensors\n are not directly accessible in a flat list.\n \"\"\"\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NestedListAggregator","title":"NestedListAggregator","text":" Bases: NestedSequenceAggregator
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NestedListAggregator.__call__","title":"__call__","text":"__call__(\n nested_sequence_of_tensors: NestedLazyChunkSequence,\n) -> List[List[TensorType]]\n
Aggregates tensors from a nested generator structure into a list of lists. Each inner generator is converted into a list of tensors, resulting in a nested list structure.
Args: nested_sequence_of_tensors: Object wrapping a generator of generators, where each inner generator yields TensorType objects.
RETURNS DESCRIPTION List[List[TensorType]]
A list of lists, where each inner list contains tensors returned from one of the inner generators.
Source code in src/pydvl/influence/array.py
def __call__(\n self,\n nested_sequence_of_tensors: NestedLazyChunkSequence,\n) -> List[List[TensorType]]:\n \"\"\"\n Aggregates tensors from a nested generator structure into a list of lists.\n Each inner generator is converted into a list of tensors, resulting in a nested\n list structure.\n\n Args:\n nested_sequence_of_tensors: Object wrapping a generator of generators,\n where each inner generator yields TensorType objects.\n\n Returns:\n A list of lists, where each inner list contains tensors returned from one\n of the inner generators.\n \"\"\"\n outer_gen = cast(\n Iterator[Iterator[TensorType]],\n nested_sequence_of_tensors.generator_factory(),\n )\n len_outer_gen = nested_sequence_of_tensors.len_outer_generator\n if len_outer_gen is not None:\n outer_gen = cast(\n Iterator[Iterator[TensorType]],\n tqdm(outer_gen, total=len_outer_gen, desc=\"Row blocks\"),\n )\n\n return [list(tensor_gen) for tensor_gen in outer_gen]\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.LazyChunkSequence","title":"LazyChunkSequence","text":"LazyChunkSequence(\n generator_factory: Callable[[], Generator[TensorType, None, None]],\n len_generator: Optional[int] = None,\n)\n
Bases: Generic[TensorType]
A class representing a chunked, and lazily evaluated array, where the chunking is restricted to the first dimension
This class is designed to handle large arrays that don't fit in memory. It works by generating chunks of the array on demand and can also convert these chunks to a Zarr array for efficient storage and retrieval.
ATTRIBUTE DESCRIPTION generator_factory
A factory function that returns a generator. This generator yields chunks of the large array when called.
len_generator
if the number of elements from the generator is known from the context, this optional parameter can be used to improve logging by adding a progressbar.
Source code in src/pydvl/influence/array.py
def __init__(\n self,\n generator_factory: Callable[[], Generator[TensorType, None, None]],\n len_generator: Optional[int] = None,\n):\n self.generator_factory = generator_factory\n self.len_generator = len_generator\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.LazyChunkSequence.compute","title":"compute","text":"compute(aggregator: Optional[SequenceAggregator] = None)\n
Computes and optionally aggregates the chunks of the array using the provided aggregator. This method initiates the generation of chunks and then combines them according to the aggregator's logic.
PARAMETER DESCRIPTION aggregator
An optional aggregator for combining the chunks of the array. If None, a default ListAggregator is used to simply collect the chunks into a list.
TYPE: Optional[SequenceAggregator]
DEFAULT: None
RETURNS DESCRIPTION The aggregated result of all chunks of the array, the format of which depends on the aggregator used.
Source code in src/pydvl/influence/array.py
@log_duration(log_level=logging.INFO)\ndef compute(self, aggregator: Optional[SequenceAggregator] = None):\n \"\"\"\n Computes and optionally aggregates the chunks of the array using the provided\n aggregator. This method initiates the generation of chunks and then\n combines them according to the aggregator's logic.\n\n Args:\n aggregator: An optional aggregator for combining the chunks of\n the array. If None, a default ListAggregator is used to simply collect\n the chunks into a list.\n\n Returns:\n The aggregated result of all chunks of the array, the format of which\n depends on the aggregator used.\n\n \"\"\"\n if aggregator is None:\n aggregator = ListAggregator()\n return aggregator(self)\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.LazyChunkSequence.to_zarr","title":"to_zarr","text":"to_zarr(\n path_or_url: Union[str, StoreLike],\n converter: NumpyConverter,\n return_stored: bool = False,\n overwrite: bool = False,\n) -> Optional[Array]\n
Converts the array into Zarr format, a storage format optimized for large arrays, and stores it at the specified path or URL. This method is suitable for scenarios where the data needs to be saved for later use or for large datasets requiring efficient storage.
PARAMETER DESCRIPTION path_or_url
The file path or URL where the Zarr array will be stored. Also excepts instances of zarr stores.
TYPE: Union[str, StoreLike]
converter
A converter for transforming blocks into NumPy arrays compatible with Zarr.
TYPE: NumpyConverter
return_stored
If True, the method returns the stored Zarr array; otherwise, it returns None.
TYPE: bool
DEFAULT: False
overwrite
If True, overwrites existing data at the given path_or_url. If False, an error is raised in case of existing data.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION Optional[Array]
The Zarr array if return_stored is True; otherwise, None.
Source code in src/pydvl/influence/array.py
@log_duration(log_level=logging.INFO)\ndef to_zarr(\n self,\n path_or_url: Union[str, StoreLike],\n converter: NumpyConverter,\n return_stored: bool = False,\n overwrite: bool = False,\n) -> Optional[zarr.Array]:\n \"\"\"\n Converts the array into Zarr format, a storage format optimized for large\n arrays, and stores it at the specified path or URL. This method is suitable for\n scenarios where the data needs to be saved for later use or for large datasets\n requiring efficient storage.\n\n Args:\n path_or_url: The file path or URL where the Zarr array will be stored.\n Also excepts instances of zarr stores.\n converter: A converter for transforming blocks into NumPy arrays\n compatible with Zarr.\n return_stored: If True, the method returns the stored Zarr array; otherwise,\n it returns None.\n overwrite: If True, overwrites existing data at the given path_or_url.\n If False, an error is raised in case of existing data.\n\n Returns:\n The Zarr array if return_stored is True; otherwise, None.\n \"\"\"\n row_idx = 0\n z = None\n\n gen = cast(Iterator[TensorType], self.generator_factory())\n\n if self.len_generator is not None:\n gen = cast(\n Iterator[TensorType], tqdm(gen, total=self.len_generator, desc=\"Blocks\")\n )\n\n for block in gen:\n numpy_block = converter.to_numpy(block)\n\n if z is None:\n z = self._initialize_zarr_array(numpy_block, path_or_url, overwrite)\n\n new_shape = self._new_shape_according_to_block(numpy_block, row_idx)\n z.resize(new_shape)\n\n z[row_idx : row_idx + numpy_block.shape[0]] = numpy_block\n row_idx += numpy_block.shape[0]\n\n return z if return_stored else None\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NestedLazyChunkSequence","title":"NestedLazyChunkSequence","text":"NestedLazyChunkSequence(\n generator_factory: Callable[\n [], Generator[Generator[TensorType, None, None], None, None]\n ],\n len_outer_generator: Optional[int] = None,\n)\n
Bases: Generic[TensorType]
A class representing chunked, and lazily evaluated array, where the chunking is restricted to the first two dimensions.
This class is designed for handling large arrays where individual chunks are loaded and processed lazily. It supports converting these chunks into a Zarr array for efficient storage and retrieval, with chunking applied along the first two dimensions.
ATTRIBUTE DESCRIPTION generator_factory
A factory function that returns a generator of generators. Each inner generator yields chunks
len_outer_generator
if the number of elements from the outer generator is known from the context, this optional parameter can be used to improve logging by adding a progressbar.
Source code in src/pydvl/influence/array.py
def __init__(\n self,\n generator_factory: Callable[\n [], Generator[Generator[TensorType, None, None], None, None]\n ],\n len_outer_generator: Optional[int] = None,\n):\n self.generator_factory = generator_factory\n self.len_outer_generator = len_outer_generator\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NestedLazyChunkSequence.compute","title":"compute","text":"compute(aggregator: Optional[NestedSequenceAggregator] = None)\n
Computes and optionally aggregates the chunks of the array using the provided aggregator. This method initiates the generation of chunks and then combines them according to the aggregator's logic.
PARAMETER DESCRIPTION aggregator
An optional aggregator for combining the chunks of the array. If None, a default NestedListAggregator is used to simply collect the chunks into a list of lists.
TYPE: Optional[NestedSequenceAggregator]
DEFAULT: None
RETURNS DESCRIPTION The aggregated result of all chunks of the array, the format of which
depends on the aggregator used.
Source code in src/pydvl/influence/array.py
@log_duration(log_level=logging.INFO)\ndef compute(self, aggregator: Optional[NestedSequenceAggregator] = None):\n \"\"\"\n Computes and optionally aggregates the chunks of the array using the provided\n aggregator. This method initiates the generation of chunks and then\n combines them according to the aggregator's logic.\n\n Args:\n aggregator: An optional aggregator for combining the chunks of\n the array. If None, a default\n [NestedListAggregator][pydvl.influence.array.NestedListAggregator]\n is used to simply collect the chunks into a list of lists.\n\n Returns:\n The aggregated result of all chunks of the array, the format of which\n depends on the aggregator used.\n\n \"\"\"\n if aggregator is None:\n aggregator = NestedListAggregator()\n return aggregator(self)\n
"},{"location":"api/pydvl/influence/array/#pydvl.influence.array.NestedLazyChunkSequence.to_zarr","title":"to_zarr","text":"to_zarr(\n path_or_url: Union[str, StoreLike],\n converter: NumpyConverter,\n return_stored: bool = False,\n overwrite: bool = False,\n) -> Optional[Array]\n
Converts the array into Zarr format, a storage format optimized for large arrays, and stores it at the specified path or URL. This method is suitable for scenarios where the data needs to be saved for later use or for large datasets requiring efficient storage.
PARAMETER DESCRIPTION path_or_url
The file path or URL where the Zarr array will be stored. Also excepts instances of zarr stores.
TYPE: Union[str, StoreLike]
converter
A converter for transforming blocks into NumPy arrays compatible with Zarr.
TYPE: NumpyConverter
return_stored
If True, the method returns the stored Zarr array; otherwise, it returns None.
TYPE: bool
DEFAULT: False
overwrite
If True, overwrites existing data at the given path_or_url. If False, an error is raised in case of existing data.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION Optional[Array]
The Zarr array if return_stored is True; otherwise, None.
Source code in src/pydvl/influence/array.py
@log_duration(log_level=logging.INFO)\ndef to_zarr(\n self,\n path_or_url: Union[str, StoreLike],\n converter: NumpyConverter,\n return_stored: bool = False,\n overwrite: bool = False,\n) -> Optional[zarr.Array]:\n \"\"\"\n Converts the array into Zarr format, a storage format optimized for large\n arrays, and stores it at the specified path or URL. This method is suitable for\n scenarios where the data needs to be saved for later use or for large datasets\n requiring efficient storage.\n\n Args:\n path_or_url: The file path or URL where the Zarr array will be stored.\n Also excepts instances of zarr stores.\n converter: A converter for transforming blocks into NumPy arrays\n compatible with Zarr.\n return_stored: If True, the method returns the stored Zarr array;\n otherwise, it returns None.\n overwrite: If True, overwrites existing data at the given path_or_url.\n If False, an error is raised in case of existing data.\n\n Returns:\n The Zarr array if return_stored is True; otherwise, None.\n \"\"\"\n\n row_idx = 0\n z = None\n numpy_block = None\n block_generator = cast(Iterator[Iterator[TensorType]], self.generator_factory())\n\n if self.len_outer_generator is not None:\n block_generator = cast(\n Iterator[Iterator[TensorType]],\n tqdm(\n block_generator, total=self.len_outer_generator, desc=\"Row blocks\"\n ),\n )\n\n for row_blocks in block_generator:\n col_idx = 0\n for block in row_blocks:\n numpy_block = converter.to_numpy(block)\n if z is None:\n z = self._initialize_zarr_array(numpy_block, path_or_url, overwrite)\n new_shape = self._new_shape_according_to_block(\n z, numpy_block, row_idx, col_idx\n )\n z.resize(new_shape)\n idx_slice_to_update = self._idx_slice_for_update(\n numpy_block, row_idx, col_idx\n )\n z[idx_slice_to_update] = numpy_block\n\n col_idx += numpy_block.shape[1]\n\n if numpy_block is None:\n raise ValueError(\"Generator is empty\")\n\n row_idx += numpy_block.shape[0]\n\n return z if return_stored else None\n
"},{"location":"api/pydvl/influence/base_influence_function_model/","title":"Base influence function model","text":""},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model","title":"pydvl.influence.base_influence_function_model","text":""},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel","title":"InfluenceFunctionModel","text":" Bases: Generic[TensorType, DataLoaderType]
, ABC
Generic abstract base class for computing influence related quantities. For a specific influence algorithm and tensor framework, inherit from this base class
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel.n_parameters","title":"n_parameters abstractmethod
property
","text":"n_parameters\n
Number of trainable parameters of the underlying model
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel.is_thread_safe","title":"is_thread_safe abstractmethod
property
","text":"is_thread_safe: bool\n
Whether the influence computation is thread safe
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel.is_fitted","title":"is_fitted abstractmethod
property
","text":"is_fitted\n
Override this, to expose the fitting status of the instance.
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel.fit","title":"fit abstractmethod
","text":"fit(data: DataLoaderType) -> InfluenceFunctionModel\n
Override this method to fit the influence function model to training data, e.g. pre-compute hessian matrix or matrix decompositions
PARAMETER DESCRIPTION data
TYPE: DataLoaderType
RETURNS DESCRIPTION InfluenceFunctionModel
The fitted instance
Source code in src/pydvl/influence/base_influence_function_model.py
@abstractmethod\ndef fit(self, data: DataLoaderType) -> InfluenceFunctionModel:\n \"\"\"\n Override this method to fit the influence function model to training data,\n e.g. pre-compute hessian matrix or matrix decompositions\n\n Args:\n data:\n\n Returns:\n The fitted instance\n \"\"\"\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel.influence_factors","title":"influence_factors","text":"influence_factors(x: TensorType, y: TensorType) -> TensorType\n
Computes the approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed, that the first dimension is the batch dimension.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/base_influence_function_model.py
def influence_factors(self, x: TensorType, y: TensorType) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return self._influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel.influences","title":"influences","text":"influences(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computes the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{test}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
def influences(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n if x is None and y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n\n if x is not None and y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n\n return self._influences(x_test, y_test, x, y, mode)\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.InfluenceFunctionModel.influences_from_factors","title":"influences_from_factors abstractmethod
","text":"influences_from_factors(\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Override this method to implement the computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
@abstractmethod\ndef influences_from_factors(\n self,\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Override this method to implement the computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence","title":"ComposableInfluence","text":" Bases: InfluenceFunctionModel
, Generic[TensorType, BatchType, DataLoaderType, BlockMapperType]
, ABC
Generic abstract base class, that allow for block-wise computation of influence quantities. Inherit from this base class for specific influence algorithms and tensor frameworks.
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.n_parameters","title":"n_parameters abstractmethod
property
","text":"n_parameters\n
Number of trainable parameters of the underlying model
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.is_thread_safe","title":"is_thread_safe abstractmethod
property
","text":"is_thread_safe: bool\n
Whether the influence computation is thread safe
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: TensorType, y: TensorType) -> TensorType\n
Computes the approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed, that the first dimension is the batch dimension.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/base_influence_function_model.py
def influence_factors(self, x: TensorType, y: TensorType) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return self._influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.influences","title":"influences","text":"influences(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computes the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{test}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
def influences(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n if x is None and y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n\n if x is not None and y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n\n return self._influences(x_test, y_test, x, y, mode)\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.fit","title":"fit","text":"fit(data: DataLoaderType) -> InfluenceFunctionModel\n
Fitting to provided data, by internally creating a block mapper instance from it. Args: data: iterable of tensors
RETURNS DESCRIPTION InfluenceFunctionModel
Fitted instance
Source code in src/pydvl/influence/base_influence_function_model.py
@log_duration(log_level=logging.INFO)\ndef fit(self, data: DataLoaderType) -> InfluenceFunctionModel:\n \"\"\"\n Fitting to provided data, by internally creating a block mapper instance from\n it.\n Args:\n data: iterable of tensors\n\n Returns:\n Fitted instance\n \"\"\"\n self.block_mapper = self._create_block_mapper(data)\n return self\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.influences_by_block","title":"influences_by_block","text":"influences_by_block(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise influence values for the provided data, i.e. an approximation of
\\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of the approximation of \\(H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{test}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_by_block(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise influence values for the provided data, i.e. an\n approximation of\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of the approximation of\n $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block.\n\n \"\"\"\n left_batch = self._create_batch(x_test, y_test)\n\n if x is None:\n if y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n right_batch = left_batch\n else:\n if y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n right_batch = self._create_batch(x, y)\n\n return self.block_mapper.interactions(left_batch, right_batch, mode)\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.influence_factors_by_block","title":"influence_factors_by_block","text":"influence_factors_by_block(\n x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise
OrderedDict[str, TensorType]
approximate inverse Hessian matrix vector products per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influence_factors_by_block(\n self, x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Ordered dictionary of tensors representing the element-wise\n approximate inverse Hessian matrix vector products per block.\n\n \"\"\"\n return self.block_mapper.transformed_grads(self._create_batch(x, y))\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.influences_from_factors_by_block","title":"influences_from_factors_by_block","text":"influences_from_factors_by_block(\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Block-wise computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: OrderedDict[str, TensorType]
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors_by_block(\n self,\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Block-wise computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block\n\n \"\"\"\n return self.block_mapper.interactions_from_transformed_grads(\n z_test_factors, self._create_batch(x, y), mode\n )\n
"},{"location":"api/pydvl/influence/base_influence_function_model/#pydvl.influence.base_influence_function_model.ComposableInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors(\n self,\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n tensors = self.block_mapper.generate_interactions_from_transformed_grads(\n z_test_factors,\n self._create_batch(x, y),\n mode,\n )\n result: TensorType = next(tensors)\n for tensor in tensors:\n result = result + tensor\n return result\n
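The factor-based API is useful when the factors for a fixed test set are reused against many training batches. A minimal sketch, assuming infl_model is a fitted influence model for a torch module and train_dataloader yields (x, y) tensor pairs (all names are illustrative):

import torch

# compute the H^{-1}-transformed test gradients once
factors = infl_model.influence_factors(x_test, y_test)

# reuse them against every training batch
chunks = [infl_model.influences_from_factors(factors, x, y) for x, y in train_dataloader]
influences = torch.cat(chunks, dim=1)  # (n_test, n_train)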
"},{"location":"api/pydvl/influence/influence_calculator/","title":"Influence calculator","text":""},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator","title":"pydvl.influence.influence_calculator","text":"This module provides functionality for calculating influences for large amount of data. The computation is based on a chunk computation model in the form of an instance of InfluenceFunctionModel, which is mapped over collection of chunks.
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.DisableClientSingleThreadCheck","title":"DisableClientSingleThreadCheck","text":"This type can be provided to the initialization of a DaskInfluenceCalculator instead of a distributed client object. It is useful in those scenarios, where the user want to disable the checking for thread-safety in the initialization phase, e.g. when using the single machine synchronous scheduler for debugging purposes.
Example from pydvl.influence import DisableClientSingleThreadCheck\n\nda_calc = DaskInfluenceCalculator(if_model,\n TorchNumpyConverter(),\n DisableClientSingleThreadCheck)\nda_influences = da_calc.influences(da_x_test, da_y_test, da_x, da_y)\nda_influences.compute(scheduler='synchronous')\n
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.DaskInfluenceCalculator","title":"DaskInfluenceCalculator","text":"DaskInfluenceCalculator(\n influence_function_model: InfluenceFunctionModel,\n converter: NumpyConverter,\n client: Union[Client, Type[DisableClientSingleThreadCheck]],\n)\n
This class is designed to compute influences over dask.array.Array collections, leveraging the capabilities of Dask for distributed computing and parallel processing. It requires an influence computation model of type InfluenceFunctionModel, which defines how influences are computed on a chunk of data. Essentially, this class functions by mapping the influence function model across the various chunks of a dask.array.Array collection.
PARAMETER DESCRIPTION influence_function_model
instance of type InfluenceFunctionModel, that specifies the computation logic for influence on data chunks. It's a pivotal part of the calculator, determining how influence is computed and applied across the data array.
TYPE: InfluenceFunctionModel
converter
A utility for converting numpy arrays to TensorType objects, facilitating the interaction between numpy arrays and the influence function model.
TYPE: NumpyConverter
client
This parameter accepts either of two types:
- A distributed Client object
- The special type DisableClientSingleThreadCheck, which serves as a flag to bypass certain checks.
During initialization, the system verifies if all workers are operating in single-threaded mode when the provided influence_function_model is designated as not thread-safe (indicated by the is_thread_safe
property). If this condition is not met, the initialization will raise a specific error, signaling a potential thread-safety conflict.
To intentionally skip this safety check (e.g., for debugging purposes using the single machine synchronous scheduler), you can supply the DisableClientSingleThreadCheck type.
TYPE: Union[Client, Type[DisableClientSingleThreadCheck]]
Warning
Make sure to set threads_per_worker=1 when using the distributed scheduler for computing, if your implementation of InfluenceFunctionModel is not thread-safe.
client = Client(threads_per_worker=1)\n
For details on dask schedulers see the official documentation. Example import torch\nfrom torch.utils.data import Dataset, DataLoader\nfrom pydvl.influence import DaskInfluenceCalculator\nfrom pydvl.influence.torch import CgInfluence\nfrom pydvl.influence.torch.util import (\n torch_dataset_to_dask_array,\n TorchNumpyConverter,\n)\nfrom distributed import Client\n\n# Possibly large datasets that do not fit into memory\ntrain_data_set: Dataset = LargeDataSet(...)\ntest_data_set: Dataset = LargeDataSet(...)\n\ntrain_dataloader = DataLoader(train_data_set)\ninfl_model = CgInfluence(model, loss, hessian_regularization=0.01)\ninfl_model = infl_model.fit(train_dataloader)\n\n# wrap your input data into dask arrays\nchunk_size = 10\nda_x, da_y = torch_dataset_to_dask_array(train_data_set, chunk_size=chunk_size)\nda_x_test, da_y_test = torch_dataset_to_dask_array(test_data_set,\n chunk_size=chunk_size)\n\n# use only one thread for scheduling, due to non-thread safety of some torch\n# operations\nclient = Client(n_workers=4, threads_per_worker=1)\n\ninfl_calc = DaskInfluenceCalculator(infl_model,\n TorchNumpyConverter(device=torch.device(\"cpu\")),\n client)\nda_influences = infl_calc.influences(da_x_test, da_y_test, da_x, da_y)\n# da_influences is a dask.array.Array\n\n# trigger computation and write chunks to disk in parallel\nda_influences.to_zarr(\"path/or/url\")\n
Source code in src/pydvl/influence/influence_calculator.py
def __init__(\n self,\n influence_function_model: InfluenceFunctionModel,\n converter: NumpyConverter,\n client: Union[Client, Type[DisableClientSingleThreadCheck]],\n):\n self._n_parameters = influence_function_model.n_parameters\n self.influence_function_model = influence_function_model\n self.numpy_converter = converter\n\n if isinstance(client, type(DisableClientSingleThreadCheck)):\n logger.warning(DisableClientSingleThreadCheck.warning_msg())\n self.influence_function_model = delayed(influence_function_model)\n elif isinstance(client, Client):\n self._validate_client(client, influence_function_model)\n self.influence_function_model = client.scatter(\n influence_function_model, broadcast=True\n )\n else:\n raise ValueError(\n \"The 'client' parameter \"\n \"must either be a distributed.Client object or the\"\n \"type 'DisableClientSingleThreadCheck'.\"\n )\n
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.DaskInfluenceCalculator.n_parameters","title":"n_parameters property
","text":"n_parameters\n
Number of trainable parameters of the underlying model used in the batch computation
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.DaskInfluenceCalculator.influence_factors","title":"influence_factors","text":"influence_factors(x: Array, y: Array) -> Array\n
Computes the expression
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradients are computed for the chunks of \\((x, y)\\).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: Array
y
label tensor to compute gradients
TYPE: Array
RETURNS DESCRIPTION Array
dask.array.Array representing the element-wise inverse Hessian matrix vector products for the provided batch.
Source code in src/pydvl/influence/influence_calculator.py
def influence_factors(self, x: da.Array, y: da.Array) -> da.Array:\n r\"\"\"\n Computes the expression\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradients are computed for the chunks of $(x, y)$.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n [dask.array.Array][dask.array.Array] representing the element-wise inverse\n Hessian matrix vector products for the provided batch.\n\n \"\"\"\n\n self._validate_aligned_chunking(x, y)\n self._validate_dimensions_not_chunked(x)\n self._validate_dimensions_not_chunked(y)\n\n def func(x_numpy: NDArray, y_numpy: NDArray, model: InfluenceFunctionModel):\n factors = model.influence_factors(\n self.numpy_converter.from_numpy(x_numpy),\n self.numpy_converter.from_numpy(y_numpy),\n )\n return self.numpy_converter.to_numpy(factors)\n\n chunks = []\n for x_chunk, y_chunk, chunk_size in zip(\n x.to_delayed(), y.to_delayed(), x.chunks[0]\n ):\n chunk_shape = (chunk_size, self.n_parameters)\n chunk_array = da.from_delayed(\n delayed(func)(\n x_chunk.squeeze()[()],\n y_chunk.squeeze()[()],\n self.influence_function_model,\n ),\n dtype=x.dtype,\n shape=chunk_shape,\n )\n chunks.append(chunk_array)\n\n return da.concatenate(chunks)\n
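The returned dask array is lazy; nothing is computed until it is materialized. A short sketch, assuming infl_calc, da_x_test and da_y_test as in the class example above (the zarr path is a placeholder):

da_factors = infl_calc.influence_factors(da_x_test, da_y_test)
# lazy array of shape (n_test, n_parameters)

da_factors.to_zarr("factors/path/or/url")  # trigger computation, write chunk-wise
# or pull everything into memory:
factors = da_factors.compute()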
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.DaskInfluenceCalculator.influences","title":"influences","text":"influences(\n x_test: Array,\n y_test: Array,\n x: Optional[Array] = None,\n y: Optional[Array] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Array\n
Compute approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The computation is done block-wise for the chunks of the provided dask arrays.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Array
y_test
label tensor to compute gradients
TYPE: Array
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[Array]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[Array]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Array
dask.array.Array representing the element-wise scalar products for the provided batch.
Source code in src/pydvl/influence/influence_calculator.py
def influences(\n self,\n x_test: da.Array,\n y_test: da.Array,\n x: Optional[da.Array] = None,\n y: Optional[da.Array] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> da.Array:\n r\"\"\"\n Compute approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The computation is done block-wise\n for the chunks of the provided dask arrays.\n\n Args:\n x_test: model input to use in the gradient computations of\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n [dask.array.Array][dask.array.Array] representing the element-wise scalar\n products for the provided batch.\n\n \"\"\"\n\n self._validate_aligned_chunking(x_test, y_test)\n self._validate_dimensions_not_chunked(x_test)\n self._validate_dimensions_not_chunked(y_test)\n\n if (x is None) != (y is None):\n if x is None:\n raise ValueError(\n \"Providing labels y without providing model input x \"\n \"is not supported\"\n )\n if y is None:\n raise ValueError(\n \"Providing model input x without labels y is not supported\"\n )\n elif x is not None:\n self._validate_aligned_chunking(x, y)\n self._validate_dimensions_not_chunked(x)\n self._validate_dimensions_not_chunked(y)\n else:\n x, y = x_test, y_test\n\n def func(\n x_test_numpy: NDArray,\n y_test_numpy: NDArray,\n x_numpy: NDArray,\n y_numpy: NDArray,\n model: InfluenceFunctionModel,\n ):\n values = model.influences(\n self.numpy_converter.from_numpy(x_test_numpy),\n self.numpy_converter.from_numpy(y_test_numpy),\n self.numpy_converter.from_numpy(x_numpy),\n self.numpy_converter.from_numpy(y_numpy),\n mode,\n )\n return self.numpy_converter.to_numpy(values)\n\n un_chunked_x_shapes = [s[0] for s in x_test.chunks[1:]]\n x_test_chunk_sizes = x_test.chunks[0]\n x_chunk_sizes = x.chunks[0]\n blocks = []\n block_shape: Tuple[int, ...]\n\n for x_test_chunk, y_test_chunk, test_chunk_size in zip(\n x_test.to_delayed(), y_test.to_delayed(), x_test_chunk_sizes\n ):\n row = []\n for x_chunk, y_chunk, chunk_size in zip(\n x.to_delayed(), y.to_delayed(), x_chunk_sizes # type:ignore\n ):\n if mode == InfluenceMode.Up:\n block_shape = (test_chunk_size, chunk_size)\n elif mode == InfluenceMode.Perturbation:\n block_shape = (test_chunk_size, chunk_size, *un_chunked_x_shapes)\n else:\n raise UnsupportedInfluenceModeException(mode)\n\n block_array = da.from_delayed(\n delayed(func)(\n x_test_chunk.squeeze()[()],\n y_test_chunk.squeeze()[()],\n x_chunk.squeeze()[()],\n y_chunk.squeeze()[()],\n self.influence_function_model,\n ),\n shape=block_shape,\n dtype=x_test.dtype,\n )\n\n if mode == InfluenceMode.Perturbation:\n n_dims = block_array.ndim\n new_order = tuple(range(2, n_dims)) + (0, 1)\n block_array = block_array.transpose(new_order)\n\n row.append(block_array)\n 
blocks.append(row)\n\n values_array = da.block(blocks)\n\n if mode == InfluenceMode.Perturbation:\n n_dims = values_array.ndim\n new_order = (n_dims - 2, n_dims - 1) + tuple(range(n_dims - 2))\n values_array = values_array.transpose(new_order)\n\n return values_array\n
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.DaskInfluenceCalculator.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: Array,\n x: Array,\n y: Array,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Array\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Array
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Array
y
optional label tensor to compute gradients
TYPE: Array
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Array
dask.array.Array representing the element-wise scalar product of the provided batch
Source code in src/pydvl/influence/influence_calculator.py
def influences_from_factors(\n self,\n z_test_factors: da.Array,\n x: da.Array,\n y: da.Array,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> da.Array:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant\n to be per sample of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: optional model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n [dask.array.Array][dask.array.Array] representing the element-wise scalar\n product of the provided batch\n\n \"\"\"\n self._validate_aligned_chunking(x, y)\n self._validate_dimensions_not_chunked(x)\n self._validate_dimensions_not_chunked(y)\n self._validate_dimensions_not_chunked(z_test_factors)\n\n def func(\n z_test_numpy: NDArray,\n x_numpy: NDArray,\n y_numpy: NDArray,\n model: InfluenceFunctionModel,\n ):\n ups = model.influences_from_factors(\n self.numpy_converter.from_numpy(z_test_numpy),\n self.numpy_converter.from_numpy(x_numpy),\n self.numpy_converter.from_numpy(y_numpy),\n mode=mode,\n )\n return self.numpy_converter.to_numpy(ups)\n\n un_chunked_x_shape = [s[0] for s in x.chunks[1:]]\n x_chunk_sizes = x.chunks[0]\n z_test_chunk_sizes = z_test_factors.chunks[0]\n blocks = []\n block_shape: Tuple[int, ...]\n\n for z_test_chunk, z_test_chunk_size in zip(\n z_test_factors.to_delayed(), z_test_chunk_sizes\n ):\n row = []\n for x_chunk, y_chunk, chunk_size in zip(\n x.to_delayed(), y.to_delayed(), x_chunk_sizes\n ):\n if mode == InfluenceMode.Perturbation:\n block_shape = (z_test_chunk_size, chunk_size, *un_chunked_x_shape)\n elif mode == InfluenceMode.Up:\n block_shape = (z_test_chunk_size, chunk_size)\n else:\n raise UnsupportedInfluenceModeException(mode)\n\n block_array = da.from_delayed(\n delayed(func)(\n z_test_chunk.squeeze()[()],\n x_chunk.squeeze()[()],\n y_chunk.squeeze()[()],\n self.influence_function_model,\n ),\n shape=block_shape,\n dtype=z_test_factors.dtype,\n )\n\n if mode == InfluenceMode.Perturbation:\n n_dims = block_array.ndim\n new_order = tuple(range(2, n_dims)) + (0, 1)\n block_array = block_array.transpose(*new_order)\n\n row.append(block_array)\n blocks.append(row)\n\n values_array = da.block(blocks)\n\n if mode == InfluenceMode.Perturbation:\n n_dims = values_array.ndim\n new_order = (n_dims - 2, n_dims - 1) + tuple(range(n_dims - 2))\n values_array = values_array.transpose(*new_order)\n\n return values_array\n
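This enables a two-stage computation on dask collections: compute the factors once, then contract them against the training data. A sketch, assuming infl_calc and the dask arrays from the class example above (the zarr path is a placeholder):

da_factors = infl_calc.influence_factors(da_x_test, da_y_test)
da_values = infl_calc.influences_from_factors(da_factors, da_x, da_y)
# lazy array of shape (n_test, n_train), written chunk-wise on compute
da_values.to_zarr("influences/path/or/url")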
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.SequentialInfluenceCalculator","title":"SequentialInfluenceCalculator","text":"SequentialInfluenceCalculator(influence_function_model: InfluenceFunctionModel)\n
This class serves as a simple wrapper for processing batches of data in a sequential manner. It is particularly useful in scenarios where parallel or distributed processing is not required or not feasible. The core functionality of this class is to apply a specified influence computation model, of type InfluenceFunctionModel, to batches of data one at a time.
PARAMETER DESCRIPTION influence_function_model
An instance of type InfluenceFunctionModel that specifies the computation logic for influence on data chunks.
TYPE: InfluenceFunctionModel
Example from pydvl.influence import SequentialInfluenceCalculator\nfrom pydvl.influence.torch.util import (\nNestedTorchCatAggregator,\nTorchNumpyConverter,\n)\nfrom pydvl.influence.torch import CgInfluence\n\nbatch_size = 10\ntrain_dataloader = DataLoader(..., batch_size=batch_size)\ntest_dataloader = DataLoader(..., batch_size=batch_size)\n\ninfl_model = CgInfluence(model, loss, hessian_regularization=0.01)\ninfl_model = infl_model.fit(train_dataloader)\n\ninfl_calc = SequentialInfluenceCalculator(infl_model)\n\n# this does not trigger the computation\nlazy_influences = infl_calc.influences(test_dataloader, train_dataloader)\n\n# trigger computation and pull the result into main memory, result is the full\n# tensor for all combinations of the two loaders\ninfluences = lazy_influences.compute(aggregator=NestedTorchCatAggregator())\n# or\n# trigger computation and write results chunk-wise to disk using zarr in a\n# sequential manner\nlazy_influences.to_zarr(\"local_path/or/url\", TorchNumpyConverter())\n
Source code in src/pydvl/influence/influence_calculator.py
def __init__(\n self,\n influence_function_model: InfluenceFunctionModel,\n):\n self.influence_function_model = influence_function_model\n
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.SequentialInfluenceCalculator.influence_factors","title":"influence_factors","text":"influence_factors(\n data_iterable: Iterable[Tuple[TensorType, TensorType]]\n) -> LazyChunkSequence\n
Compute the expression
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradients are computed for the chunks \\((x, y)\\) of the data_iterable in a sequential manner.
PARAMETER DESCRIPTION data_iterable
An iterable that returns tuples of tensors. Each tuple consists of a pair of tensors (x, y), representing input data and corresponding targets.
TYPE: Iterable[Tuple[TensorType, TensorType]]
RETURNS DESCRIPTION LazyChunkSequence
A lazy data structure representing the chunks of the resulting tensor
Source code in src/pydvl/influence/influence_calculator.py
def influence_factors(\n self,\n data_iterable: Iterable[Tuple[TensorType, TensorType]],\n) -> LazyChunkSequence:\n r\"\"\"\n Compute the expression\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient are computed for the chunks $(x, y)$ of the data_iterable in\n a sequential manner.\n\n Args:\n data_iterable: An iterable that returns tuples of tensors.\n Each tuple consists of a pair of tensors (x, y), representing input data\n and corresponding targets.\n\n Returns:\n A lazy data structure representing the chunks of the resulting tensor\n \"\"\"\n try:\n len_iterable = len(cast(Sized, data_iterable))\n except Exception as e:\n logger.debug(f\"Failed to retrieve len of data iterable: {e}\")\n len_iterable = None\n\n tensors_gen_factory = partial(self._influence_factors_gen, data_iterable)\n return LazyChunkSequence(tensors_gen_factory, len_generator=len_iterable)\n
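The result is lazy and can either be streamed to disk or aggregated in memory. A sketch, assuming infl_calc wraps a fitted torch-based model as in the class example above; TorchCatAggregator is assumed to be available from pydvl.influence.torch.util, and the to_zarr/compute calls mirror the class example:

lazy_factors = infl_calc.influence_factors(test_dataloader)

# write the factor chunks sequentially to zarr ...
lazy_factors.to_zarr("factors/path/or/url", TorchNumpyConverter())
# ... or concatenate them into one tensor in memory
factors = lazy_factors.compute(aggregator=TorchCatAggregator())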
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.SequentialInfluenceCalculator.influences","title":"influences","text":"influences(\n test_data_iterable: Iterable[Tuple[TensorType, TensorType]],\n train_data_iterable: Iterable[Tuple[TensorType, TensorType]],\n mode: InfluenceMode = InfluenceMode.Up,\n) -> NestedLazyChunkSequence\n
Compute approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The computation is done block-wise for the chunks of the provided data iterables and aggregated into a single tensor in memory.
PARAMETER DESCRIPTION test_data_iterable
An iterable that returns tuples of tensors. Each tuple consists of a pair of tensors (x, y), representing input data and corresponding targets.
TYPE: Iterable[Tuple[TensorType, TensorType]]
train_data_iterable
An iterable that returns tuples of tensors. Each tuple consists of a pair of tensors (x, y), representing input data and corresponding targets.
TYPE: Iterable[Tuple[TensorType, TensorType]]
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION NestedLazyChunkSequence
A lazy data structure representing the chunks of the resulting tensor
Source code in src/pydvl/influence/influence_calculator.py
def influences(\n self,\n test_data_iterable: Iterable[Tuple[TensorType, TensorType]],\n train_data_iterable: Iterable[Tuple[TensorType, TensorType]],\n mode: InfluenceMode = InfluenceMode.Up,\n) -> NestedLazyChunkSequence:\n r\"\"\"\n Compute approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The computation is done block-wise for\n the chunks of the provided\n data iterables and aggregated into a single tensor in memory.\n\n Args:\n test_data_iterable: An iterable that returns tuples of tensors.\n Each tuple consists of a pair of tensors (x, y), representing input data\n and corresponding targets.\n train_data_iterable: An iterable that returns tuples of tensors.\n Each tuple consists of a pair of tensors (x, y), representing input data\n and corresponding targets.\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n A lazy data structure representing the chunks of the resulting tensor\n\n \"\"\"\n nested_tensor_gen_factory = partial(\n self._influences_gen,\n test_data_iterable,\n train_data_iterable,\n mode,\n )\n\n try:\n len_iterable = len(cast(Sized, test_data_iterable))\n except Exception as e:\n logger.debug(f\"Failed to retrieve len of test data iterable: {e}\")\n len_iterable = None\n\n return NestedLazyChunkSequence(\n nested_tensor_gen_factory, len_outer_generator=len_iterable\n )\n
"},{"location":"api/pydvl/influence/influence_calculator/#pydvl.influence.influence_calculator.SequentialInfluenceCalculator.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: Iterable[TensorType],\n train_data_iterable: Iterable[Tuple[TensorType, TensorType]],\n mode: InfluenceMode = InfluenceMode.Up,\n) -> NestedLazyChunkSequence\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
Pre-computed iterable of tensors, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Iterable[TensorType]
train_data_iterable
An iterable that returns tuples of tensors. Each tuple consists of a pair of tensors (x, y), representing input data and corresponding targets.
TYPE: Iterable[Tuple[TensorType, TensorType]]
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION NestedLazyChunkSequence
A lazy data structure representing the chunks of the resulting tensor
Source code in src/pydvl/influence/influence_calculator.py
def influences_from_factors(\n self,\n z_test_factors: Iterable[TensorType],\n train_data_iterable: Iterable[Tuple[TensorType, TensorType]],\n mode: InfluenceMode = InfluenceMode.Up,\n) -> NestedLazyChunkSequence:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x))\n \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta}\n \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: Pre-computed iterable of tensors, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n train_data_iterable: An iterable that returns tuples of tensors.\n Each tuple consists of a pair of tensors (x, y), representing input data\n and corresponding targets.\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n A lazy data structure representing the chunks of the resulting tensor\n\n \"\"\"\n nested_tensor_gen = partial(\n self._influences_from_factors_gen,\n z_test_factors,\n train_data_iterable,\n mode,\n )\n\n try:\n len_iterable = len(cast(Sized, z_test_factors))\n except Exception as e:\n logger.debug(f\"Failed to retrieve len of factors iterable: {e}\")\n len_iterable = None\n\n return NestedLazyChunkSequence(\n nested_tensor_gen, len_outer_generator=len_iterable\n )\n
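A sketch of the two-stage sequential pattern, assuming infl_model is the fitted influence model wrapped by infl_calc and the dataloaders from the class example above; the factors are pre-computed batch by batch to match the Iterable[TensorType] signature (the zarr path is a placeholder):

# pre-compute one factor tensor per test batch
z_test_factors = [
    infl_model.influence_factors(x, y) for x, y in test_dataloader
]

# lazily pair every factor chunk with every training batch
lazy_values = infl_calc.influences_from_factors(z_test_factors, train_dataloader)
lazy_values.to_zarr("values/path/or/url", TorchNumpyConverter())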
"},{"location":"api/pydvl/influence/types/","title":"Types","text":""},{"location":"api/pydvl/influence/types/#pydvl.influence.types","title":"pydvl.influence.types","text":"This module offers a set of generic types, which can be used to build modular and flexible components for influence computation for different tensor frameworks.
Key components include:
- GradientProvider: A generic abstract base class designed to provide methods for computing per-sample gradients and other related computations for given data batches.
- BilinearForm: A generic abstract base class for representing bilinear forms for computing inner products involving gradients.
- Operator: A generic abstract base class for operators that can apply transformations to vectors and matrices and can be represented as bilinear forms.
- OperatorGradientComposition: A generic abstract composition class that integrates an operator with a gradient provider to compute interactions between batches of data.
- BlockMapper: A generic abstract base class for mapping operations across multiple compositional blocks, given by objects of type OperatorGradientComposition, and aggregating the results.
For the usage of these types, see the implementation ComposableInfluence. Using these components allows the straightforward implementation of various combinations of approximations of inverse Hessian applications (or Gauss-Newton approximations), different blocking strategies (e.g. layer-wise or block-wise) and different ways to compute gradients.
For use with a specific tensor framework, these types must be subclassed. An example for torch is provided in the module pydvl.influence.torch.base and the base class TorchComposableInfluence.
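To make the composition concrete, the following toy sketch (plain torch, not using the abstract classes) shows what an operator/gradient-provider pair computes for up-weighting influence on a linear model with squared loss; all names and the regularization value are illustrative:

import torch

torch.manual_seed(0)
d, n_train, n_test = 3, 8, 2
w = torch.randn(d)  # parameters of the toy model f(x) = x @ w

def per_sample_grads(X, y):
    # gradient of 0.5 * (x @ w - y)^2 w.r.t. w, one row per sample
    return (X @ w - y).unsqueeze(1) * X  # shape (N, d)

X_train, y_train = torch.randn(n_train, d), torch.randn(n_train)
X_test, y_test = torch.randn(n_test, d), torch.randn(n_test)

# "operator": application of the inverse regularized Hessian H = X^T X / N + lam * I
H = X_train.T @ X_train / n_train + 0.1 * torch.eye(d)
apply_op = lambda G: torch.linalg.solve(H, G.T).T  # row-wise H^{-1} g

# composition: up-weighting influences are <Op(grad_test), grad_train>
influences = apply_op(per_sample_grads(X_test, y_test)) @ per_sample_grads(X_train, y_train).T
print(influences.shape)  # torch.Size([2, 8])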
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.InfluenceMode","title":"InfluenceMode","text":" Bases: str
, Enum
Enum representation for the types of influence.
ATTRIBUTE DESCRIPTION Up
Approximating the influence of a point
Perturbation
Perturbation definition of the influence score
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.Batch","title":"Batch dataclass
","text":"Batch(x: TensorType, y: TensorType)\n
Bases: Generic[TensorType]
Represents a batch of data containing features and labels.
ATTRIBUTE DESCRIPTION x
Represents the input features of the batch.
TYPE: TensorType
y
Represents the labels or targets associated with the input features.
TYPE: TensorType
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.GradientProvider","title":"GradientProvider","text":" Bases: Generic[BatchType, TensorType]
, ABC
Provides an interface for calculating per-sample gradients and other related computations for a given batch of data.
This class must be subclassed with implementations for its abstract methods tailored to specific gradient computation needs, e.g. using an autograd engine for a model loss function. Consider a function
\\[ \\ell: \\mathbb{R}^{d_1} \\times \\mathbb{R}^{d_2} \\times \\mathbb{R}^{n} \\times \\mathbb{R}^{n}, \\quad \\ell(\\omega_1, \\omega_2, x, y) = \\operatorname{loss}(f(\\omega_1, \\omega_2; x), y) \\] e.g. a two layer neural network \\(f\\) with a loss function, then this object should compute the expressions:
\\[ \\nabla_{\\omega_{i}}\\ell(\\omega_1, \\omega_2, x, y), \\nabla_{\\omega_{i}}\\nabla_{x}\\ell(\\omega_1, \\omega_2, x, y), \\nabla_{\\omega}\\ell(\\omega_1, \\omega_2, x, y) \\cdot v\\]"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.GradientProvider.jacobian_prod","title":"jacobian_prod abstractmethod
","text":"jacobian_prod(batch: BatchType, g: TensorType) -> TensorType\n
Computes the matrix-Jacobian product for the provided batch and input tensor. Given the example in the class docstring, this means
\\[ (\\nabla_{\\omega_{1}}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y}), \\nabla_{\\omega_{2}}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y})) \\cdot g^T\\] where g must be a tensor of shape \\((K, d_1+d_2)\\), so the resulting tensor is of shape \\((N, K)\\).
PARAMETER DESCRIPTION batch
The batch of data for which to compute the Jacobian.
TYPE: BatchType
g
The tensor to be used in the matrix-Jacobian product calculation.
TYPE: TensorType
RETURNS DESCRIPTION TensorType
The resulting tensor from the matrix-Jacobian product computation.
Source code in src/pydvl/influence/types.py
@abstractmethod\ndef jacobian_prod(\n self,\n batch: BatchType,\n g: TensorType,\n) -> TensorType:\n r\"\"\"\n Computes the matrix-Jacobian product for the provided batch and input tensor.\n Given the example in the class docstring, this means\n\n $$ (\\nabla_{\\omega_{1}}\\ell(\\omega_1, \\omega_2,\n \\text{batch.x}, \\text{batch.y}),\n \\nabla_{\\omega_{2}}\\ell(\\omega_1, \\omega_2,\n \\text{batch.x}, \\text{batch.y})) \\cdot g^T$$\n\n where g must be a tensor of shape $(K, d_1+d_2)$, so the resulting tensor\n is of shape $(N, K)$.\n\n Args:\n batch: The batch of data for which to compute the Jacobian.\n g: The tensor to be used in the matrix-Jacobian product\n calculation.\n\n Returns:\n The resulting tensor from the matrix-Jacobian product computation.\n \"\"\"\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.GradientProvider.flat_grads","title":"flat_grads abstractmethod
","text":"flat_grads(batch: BatchType) -> TensorType\n
Computes and returns the flat per-sample gradients for the provided batch. Given the example in the class docstring, this means
\\[ (\\nabla_{\\omega_{1}}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y}), \\nabla_{\\omega_{2}}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y}))\\] where the first dimension of the resulting tensor is always considered to be the batch dimension, so the shape of the resulting tensor is \\((N, d_1+d_2)\\), where \\(N\\) is the number of samples in the batch.
PARAMETER DESCRIPTION batch
The batch of data for which to compute the gradients.
TYPE: BatchType
RETURNS DESCRIPTION TensorType
A tensor containing the flat gradients computed per sample.
Source code in src/pydvl/influence/types.py
@abstractmethod\ndef flat_grads(self, batch: BatchType) -> TensorType:\n r\"\"\"\n Computes and returns the flat per-sample gradients for the provided batch.\n Given the example in the class docstring, this means\n\n $$ (\\nabla_{\\omega_{1}}\\ell(\\omega_1, \\omega_2,\n \\text{batch.x}, \\text{batch.y}),\n \\nabla_{\\omega_{2}}\\ell(\\omega_1, \\omega_2,\n \\text{batch.x}, \\text{batch.y}))$$\n\n where the first dimension of the resulting tensor is always considered to be\n the batch dimension, so the shape of the resulting tensor is $(N, d_1+d_2)$,\n where $N$ is the number of samples in the batch.\n\n Args:\n batch: The batch of data for which to compute the gradients.\n\n Returns:\n A tensor containing the flat gradients computed per sample.\n \"\"\"\n
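As an illustration of this contract (not pyDVL's implementation), flat per-sample gradients for a torch module can be obtained with torch.func; the model and shapes are placeholders:

import torch
from torch.func import functional_call, grad, vmap

model = torch.nn.Linear(3, 1)
params = dict(model.named_parameters())

def sample_loss(p, x, y):
    # loss for a single sample; add a batch dimension for the module call
    pred = functional_call(model, p, (x.unsqueeze(0),)).squeeze(0)
    return ((pred - y) ** 2).mean()

# differentiate w.r.t. the parameters and vectorize over the batch dimension
per_sample_grad_fn = vmap(grad(sample_loss), in_dims=(None, 0, 0))

x, y = torch.randn(8, 3), torch.randn(8, 1)
grads = per_sample_grad_fn(params, x, y)  # dict: name -> (8, *param_shape)
flat_grads = torch.cat([g.reshape(8, -1) for g in grads.values()], dim=1)
print(flat_grads.shape)  # torch.Size([8, 4]) for this toy model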
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.GradientProvider.flat_mixed_grads","title":"flat_mixed_grads abstractmethod
","text":"flat_mixed_grads(batch: BatchType) -> TensorType\n
Computes and returns the flat per-sample mixed gradients for the provided batch. Given the example in the class docstring, this means
\\[ (\\nabla_{\\omega_1}\\nabla_{x}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y}), \\nabla_{\\omega_1}\\nabla_{x}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y} ))\\] where the first dimension of the resulting tensor is always considered to be the batch dimension and the last to be the non-batch input related derivatives. So the shape of the resulting tensor is \\((N, n, d_1 + d_2)\\), where \\(N\\) is the number of samples in the batch.
PARAMETER DESCRIPTION batch
The batch of data for which to compute the flat mixed gradients.
TYPE: BatchType
RETURNS DESCRIPTION TensorType
A tensor containing the flat mixed gradients computed per sample.
Source code in src/pydvl/influence/types.py
@abstractmethod\ndef flat_mixed_grads(self, batch: BatchType) -> TensorType:\n r\"\"\"\n Computes and returns the flat per-sample mixed gradients for the provided batch.\n Given the example in the class docstring, this means\n\n $$ (\\nabla_{\\omega_1}\\nabla_{x}\\ell(\\omega_1,\n \\omega_2, \\text{batch.x}, \\text{batch.y}),\n \\nabla_{\\omega_1}\\nabla_{x}\\ell(\\omega_1,\n \\omega_2, \\text{batch.x}, \\text{batch.y} ))$$\n\n where the first dimension of the resulting tensor is always considered to be\n the batch dimension and the last to be the non-batch input related derivatives.\n So the shape of the resulting tensor is $(N, n, d_1 + d_2)$,\n where $N$ is the number of samples in the batch.\n\n Args:\n batch: The batch of data for which to compute the flat mixed gradients.\n\n Returns:\n A tensor containing the flat mixed gradients computed per sample.\n \"\"\"\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BilinearForm","title":"BilinearForm","text":" Bases: Generic[TensorType, BatchType, GradientProviderType]
, ABC
Abstract base class for bilinear forms, which facilitates the computation of inner products involving gradients of batches of data.
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BilinearForm.inner_prod","title":"inner_prod abstractmethod
","text":"inner_prod(left: TensorType, right: Optional[TensorType]) -> TensorType\n
Computes the inner product of two vectors, i.e.
\\[ \\langle x, y \\rangle_{B}\\] if we denote the bilinear form by \\(\\langle \\cdot, \\cdot \\rangle_{B}\\). The implementations must take care of appropriate vectorization to make it applicable to the case where left
and right
are not one-dimensional. In this case, the trailing dimension of the left
and right
tensors are considered for the computation of the inner product. For example, if left
is a tensor of shape \\((N, D)\\) and, right
is of shape \\((M,..., D)\\), then the result is of shape \\((N, M, ...)\\).
PARAMETER DESCRIPTION left
The first tensor in the inner product computation.
TYPE: TensorType
right
The second tensor, optional; if not provided, the inner product will use left
tensor for both arguments.
TYPE: Optional[TensorType]
RETURNS DESCRIPTION TensorType
A tensor representing the inner product.
Source code in src/pydvl/influence/types.py
@abstractmethod\ndef inner_prod(self, left: TensorType, right: Optional[TensorType]) -> TensorType:\n r\"\"\"\n Computes the inner product of two vectors, i.e.\n\n $$ \\langle x, y \\rangle_{B}$$\n\n if we denote the bilinear-form by $\\langle \\cdot, \\cdot \\rangle_{B}$.\n The implementations must take care of according vectorization to make\n it applicable to the case, where `left` and `right` are not one-dimensional.\n In this case, the trailing dimension of the `left` and `right` tensors are\n considered for the computation of the inner product. For example,\n if `left` is a tensor of shape $(N, D)$ and, `right` is of shape $(M,..., D)$,\n then the result is of shape $(N, M, ...)$.\n\n Args:\n left: The first tensor in the inner product computation.\n right: The second tensor, optional; if not provided, the inner product will\n use `left` tensor for both arguments.\n\n Returns:\n A tensor representing the inner product.\n \"\"\"\n
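The shape contract from the docstring can be illustrated with the plain (identity-weighted) inner product over the trailing dimension; the shapes are arbitrary placeholders:

import torch

left = torch.randn(4, 7)      # (N, D)
right = torch.randn(5, 3, 7)  # (M, ..., D)

# result[n, m, ...] = sum_d left[n, d] * right[m, ..., d]
result = torch.einsum("nd,m...d->nm...", left, right)
print(result.shape)  # torch.Size([4, 5, 3])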
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BilinearForm.grads_inner_prod","title":"grads_inner_prod","text":"grads_inner_prod(\n left: BatchType,\n right: Optional[BatchType],\n gradient_provider: GradientProviderType,\n) -> TensorType\n
Computes the gradient inner product of two batches of data, i.e.
\\[ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}), \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}\\] where \\(\\nabla_{\\omega}\\ell(\\omega, \\cdot, \\cdot)\\) is represented by the gradient_provider
and the expression must be understood sample-wise.
PARAMETER DESCRIPTION left
The first batch for gradient and inner product computation
TYPE: BatchType
right
The second batch for gradient and inner product computation, optional; if not provided, the inner product will use the gradient computed for left
for both arguments.
TYPE: Optional[BatchType]
gradient_provider
The gradient provider to compute the gradients.
TYPE: GradientProviderType
RETURNS DESCRIPTION TensorType
A tensor representing the inner products of the per-sample gradients
Source code in src/pydvl/influence/types.py
def grads_inner_prod(\n self,\n left: BatchType,\n right: Optional[BatchType],\n gradient_provider: GradientProviderType,\n) -> TensorType:\n r\"\"\"\n Computes the gradient inner product of two batches of data, i.e.\n\n $$ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}),\n \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}$$\n\n where $\\nabla_{\\omega}\\ell(\\omega, \\cdot, \\cdot)$ is represented by the\n `gradient_provider` and the expression must be understood sample-wise.\n\n Args:\n left: The first batch for gradient and inner product computation\n right: The second batch for gradient and inner product computation,\n optional; if not provided, the inner product will use the gradient\n computed for `left` for both arguments.\n gradient_provider: The gradient provider to compute the gradients.\n\n Returns:\n A tensor representing the inner products of the per-sample gradients\n \"\"\"\n left_grad = gradient_provider.flat_grads(left)\n if right is None:\n right_grad = left_grad\n else:\n right_grad = gradient_provider.flat_grads(right)\n return self.inner_prod(left_grad, right_grad)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BilinearForm.mixed_grads_inner_prod","title":"mixed_grads_inner_prod","text":"mixed_grads_inner_prod(\n left: BatchType,\n right: Optional[BatchType],\n gradient_provider: GradientProviderType,\n) -> TensorType\n
Computes the mixed gradient inner product of two batches of data, i.e.
\\[ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}), \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}\\] where \\(\\nabla_{\\omega}\\ell(\\omega, \\cdot)\\) and \\(\\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\cdot)\\) are represented by the gradient_provider
. The expression must be understood sample-wise.
PARAMETER DESCRIPTION left
The first batch for gradient and inner product computation
TYPE: BatchType
right
The second batch for gradient and inner product computation
TYPE: Optional[BatchType]
gradient_provider
The gradient provider to compute the gradients.
TYPE: GradientProviderType
RETURNS DESCRIPTION TensorType
A tensor representing the inner products of the mixed per-sample gradients
Source code in src/pydvl/influence/types.py
def mixed_grads_inner_prod(\n self,\n left: BatchType,\n right: Optional[BatchType],\n gradient_provider: GradientProviderType,\n) -> TensorType:\n r\"\"\"\n Computes the mixed gradient inner product of two batches of data, i.e.\n\n $$ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}),\n \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y})\n \\rangle_{B}$$\n\n where $\\nabla_{\\omega}\\ell(\\omega, \\cdot)$ and\n $\\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\cdot)$ are represented by the\n `gradient_provider`. The expression must be understood sample-wise.\n\n Args:\n left: The first batch for gradient and inner product computation\n right: The second batch for gradient and inner product computation\n gradient_provider: The gradient provider to compute the gradients.\n\n Returns:\n A tensor representing the inner products of the mixed per-sample gradients\n \"\"\"\n left_grad = gradient_provider.flat_grads(left)\n if right is None:\n right = left\n right_mixed_grad = gradient_provider.flat_mixed_grads(right)\n return self.inner_prod(left_grad, right_mixed_grad)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.Operator","title":"Operator","text":" Bases: Generic[TensorType, BilinearFormType]
, ABC
Abstract base class for operators, capable of applying transformations to vectors and matrices, and can be represented as a bilinear form.
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.Operator.input_size","title":"input_size abstractmethod
property
","text":"input_size: int\n
Abstract property to get the needed size for inputs to the operator instance
RETURNS DESCRIPTION int
An integer representing the input size.
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.Operator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.Operator.as_bilinear_form","title":"as_bilinear_form abstractmethod
","text":"as_bilinear_form() -> BilinearFormType\n
Represents the operator as a bilinear form, i.e. the weighted inner product
\\[ \\langle \\operatorname{Op}(x), y \\rangle\\] RETURNS DESCRIPTION BilinearFormType
An instance of type BilinearForm representing this operator.
Source code in src/pydvl/influence/types.py
@abstractmethod\ndef as_bilinear_form(self) -> BilinearFormType:\n r\"\"\"\n Represents the operator as a bilinear form, i.e. the weighted inner product\n\n $$ \\langle \\operatorname{Op}(x), y \\rangle$$\n\n Returns:\n An instance of type [BilinearForm][pydvl.influence.types.BilinearForm]\n representing this operator.\n \"\"\"\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.OperatorGradientComposition","title":"OperatorGradientComposition","text":"OperatorGradientComposition(op: OperatorType, gp: GradientProviderType)\n
Bases: Generic[TensorType, BatchType, OperatorType, GradientProviderType]
Generic base class representing a composable block that integrates an operator and a gradient provider to compute interactions between batches of data.
This block is designed to be flexible, handling different computational modes via an abstract operator and gradient provider.
ATTRIBUTE DESCRIPTION op
The operator used for transformations and influence computations.
gp
The gradient provider used for obtaining necessary gradients.
Source code in src/pydvl/influence/types.py
def __init__(self, op: OperatorType, gp: GradientProviderType):\n self.gp = gp\n self.op = op\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.OperatorGradientComposition.interactions","title":"interactions","text":"interactions(\n left_batch: BatchType, right_batch: Optional[BatchType], mode: InfluenceMode\n)\n
Computes the interaction between the gradients on two batches of data based on the specified mode weighted by the operator action, i.e.
\\[ \\langle \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y})), \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle\\] for the case InfluenceMode.Up
and
\\[ \\langle \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y})), \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle \\] for the case InfluenceMode.Perturbation
.
PARAMETER DESCRIPTION left_batch
The left data batch for gradient computation.
TYPE: BatchType
right_batch
The right data batch for gradient computation.
TYPE: Optional[BatchType]
mode
An instance of InfluenceMode determining the type of influence computation.
TYPE: InfluenceMode
RETURNS DESCRIPTION The result of the influence computation as dictated by the mode.
Source code in src/pydvl/influence/types.py
def interactions(\n self,\n left_batch: BatchType,\n right_batch: Optional[BatchType],\n mode: InfluenceMode,\n):\n r\"\"\"\n Computes the interaction between the gradients on two batches of data based on\n the specified mode weighted by the operator action,\n i.e.\n\n $$ \\langle \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{left.x},\n \\text{left.y})),\n \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle$$\n\n for the case `InfluenceMode.Up` and\n\n $$ \\langle \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{left.x},\n \\text{left.y})),\n \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle $$\n\n for the case `InfluenceMode.Perturbation`.\n\n Args:\n left_batch: The left data batch for gradient computation.\n right_batch: The right data batch for gradient computation.\n mode: An instance of InfluenceMode determining the type of influence\n computation.\n\n Returns:\n The result of the influence computation as dictated by the mode.\n \"\"\"\n bilinear_form = self.op.as_bilinear_form()\n if mode == InfluenceMode.Up:\n return bilinear_form.grads_inner_prod(left_batch, right_batch, self.gp)\n elif mode == InfluenceMode.Perturbation:\n return bilinear_form.mixed_grads_inner_prod(\n left_batch, right_batch, self.gp\n )\n else:\n raise UnsupportedInfluenceModeException(mode)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.OperatorGradientComposition.transformed_grads","title":"transformed_grads","text":"transformed_grads(batch: BatchType)\n
Computes the gradients of a data batch, transformed by the operator application, i.e. the expression
\\[ \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{batch.x}, \\text{batch.y})) \\] PARAMETER DESCRIPTION batch
The data batch for gradient computation.
TYPE: BatchType
RETURNS DESCRIPTION A tensor representing the application of the operator to the gradients.
Source code in src/pydvl/influence/types.py
def transformed_grads(self, batch: BatchType):\n r\"\"\"\n Computes the gradients of a data batch, transformed by the operator application\n , i.e. the expressions\n\n $$ \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{batch.x},\n \\text{batch.y})) $$\n\n Args:\n batch: The data batch for gradient computation.\n\n Returns:\n A tensor representing the application of the operator to the gradients.\n\n \"\"\"\n grads = self.gp.flat_grads(batch)\n return self.op.apply(grads)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.OperatorGradientComposition.interactions_from_transformed_grads","title":"interactions_from_transformed_grads","text":"interactions_from_transformed_grads(\n left_factors: TensorType, right_batch: BatchType, mode: InfluenceMode\n)\n
Computes the interaction between pre-computed transformed gradients (factors) from a left batch and the gradients of a right batch of data, based on the specified mode. This means
\\[ \\langle \\text{left_factors}, \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle\\] for the case InfluenceMode.Up
and
\\[ \\langle \\text{left_factors}, \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle \\] for the case InfluenceMode.Perturbation
.
PARAMETER DESCRIPTION left_factors
Pre-computed tensor factors from a left batch.
TYPE: TensorType
right_batch
The right data batch for influence computation.
TYPE: BatchType
mode
An instance of InfluenceMode determining the type of influence computation.
TYPE: InfluenceMode
RETURNS DESCRIPTION The result of the interaction computation using the provided factors and batch gradients.
Source code in src/pydvl/influence/types.py
def interactions_from_transformed_grads(\n self, left_factors: TensorType, right_batch: BatchType, mode: InfluenceMode\n):\n r\"\"\"\n Computes the interaction between the transformed gradients on two batches of\n data using pre-computed factors and a batch of data,\n based on the specified mode. This means\n\n $$ \\langle \\text{left_factors},\n \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle$$\n\n for the case `InfluenceMode.Up` and\n\n $$ \\langle \\text{left_factors},\n \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle $$\n\n for the case `InfluenceMode.Perturbation`.\n\n Args:\n left_factors: Pre-computed tensor factors from a left batch.\n right_batch: The right data batch for influence computation.\n mode: An instance of InfluenceMode determining the type of influence\n computation.\n\n Returns:\n The result of the interaction computation using the provided factors and\n batch gradients.\n \"\"\"\n if mode is InfluenceMode.Up:\n right_grads = self.gp.flat_grads(right_batch)\n else:\n right_grads = self.gp.flat_mixed_grads(right_batch)\n return self._tensor_inner_product(left_factors, right_grads)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BlockMapper","title":"BlockMapper","text":"BlockMapper(\n composable_block_dict: OrderedDict[str, OperatorGradientCompositionType]\n)\n
Bases: Generic[TensorType, BatchType, OperatorGradientCompositionType]
, ABC
Abstract base class for mapping operations across multiple compositional blocks.
This class takes a dictionary of compositional blocks and applies their methods to batches or tensors, and aggregates the results.
ATTRIBUTE DESCRIPTION composable_block_dict
A dictionary mapping string identifiers to composable blocks which define operations like transformations and interactions.
Source code in src/pydvl/influence/types.py
def __init__(\n self, composable_block_dict: OrderedDict[str, OperatorGradientCompositionType]\n):\n self.composable_block_dict = composable_block_dict\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BlockMapper.transformed_grads","title":"transformed_grads","text":"transformed_grads(batch: BatchType) -> OrderedDict[str, TensorType]\n
Computes and returns the transformed gradients for a batch in a dictionary with the keys defined by the block names.
PARAMETER DESCRIPTION batch
The batch of data for which to compute transformed gradients.
TYPE: BatchType
RETURNS DESCRIPTION OrderedDict[str, TensorType]
An ordered dictionary of transformed gradients by block.
Source code in src/pydvl/influence/types.py
def transformed_grads(\n self,\n batch: BatchType,\n) -> OrderedDict[str, TensorType]:\n \"\"\"\n Computes and returns the transformed gradients for a batch in dictionary\n with the keys defined by the block names.\n\n Args:\n batch: The batch of data for which to compute transformed gradients.\n\n Returns:\n An ordered dictionary of transformed gradients by block.\n \"\"\"\n tensor_gen = self.generate_transformed_grads(batch)\n return self._to_ordered_dict(tensor_gen)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BlockMapper.interactions","title":"interactions","text":"interactions(\n left_batch: BatchType, right_batch: BatchType, mode: InfluenceMode\n) -> OrderedDict[str, TensorType]\n
Computes interactions between two batches, aggregated by block, based on a specified mode.
PARAMETER DESCRIPTION left_batch
The left batch for interaction computation.
TYPE: BatchType
right_batch
The right batch for interaction computation.
TYPE: BatchType
mode
The mode determining the type of interactions.
TYPE: InfluenceMode
RETURNS DESCRIPTION OrderedDict[str, TensorType]
An ordered dictionary of gradient interactions by block.
Source code in src/pydvl/influence/types.py
def interactions(\n self, left_batch: BatchType, right_batch: BatchType, mode: InfluenceMode\n) -> OrderedDict[str, TensorType]:\n \"\"\"\n Computes interactions between two batches, aggregated by block,\n based on a specified mode.\n\n Args:\n left_batch: The left batch for interaction computation.\n right_batch: The right batch for interaction computation.\n mode: The mode determining the type of interactions.\n\n Returns:\n An ordered dictionary of gradient interactions by block.\n \"\"\"\n tensor_gen = self.generate_interactions(left_batch, right_batch, mode)\n return self._to_ordered_dict(tensor_gen)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BlockMapper.interactions_from_transformed_grads","title":"interactions_from_transformed_grads","text":"interactions_from_transformed_grads(\n left_factors: OrderedDict[str, TensorType],\n right_batch: BatchType,\n mode: InfluenceMode,\n) -> OrderedDict[str, TensorType]\n
Computes interactions from transformed gradients and a right batch, aggregated by block and based on a mode.
PARAMETER DESCRIPTION left_factors
Pre-computed factors as a tensor or an ordered dictionary of tensors by block. If the input is a tensor, it is split into blocks according to the ordering in the composable_block_dict
attribute.
TYPE: OrderedDict[str, TensorType]
right_batch
The right batch for interaction computation.
TYPE: BatchType
mode
The mode determining the type of interactions.
TYPE: InfluenceMode
RETURNS DESCRIPTION OrderedDict[str, TensorType]
An ordered dictionary of interactions from transformed gradients by block.
Source code in src/pydvl/influence/types.py
def interactions_from_transformed_grads(\n self,\n left_factors: OrderedDict[str, TensorType],\n right_batch: BatchType,\n mode: InfluenceMode,\n) -> OrderedDict[str, TensorType]:\n \"\"\"\n Computes interactions from transformed gradients and a right batch,\n aggregated by block and based on a mode.\n\n Args:\n left_factors: Pre-computed factors as a tensor or an ordered dictionary of\n tensors by block. If the input is a tensor, it is split into blocks\n according to the ordering in the `composable_block_dict` attribute.\n right_batch: The right batch for interaction computation.\n mode: The mode determining the type of interactions.\n\n Returns:\n An ordered dictionary of interactions from transformed gradients by block.\n \"\"\"\n tensor_gen = self.generate_interactions_from_transformed_grads(\n left_factors, right_batch, mode\n )\n return self._to_ordered_dict(tensor_gen)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BlockMapper.generate_transformed_grads","title":"generate_transformed_grads","text":"generate_transformed_grads(\n batch: BatchType,\n) -> Generator[TensorType, None, None]\n
Generator that yields transformed gradients for a given batch, processed by each block.
PARAMETER DESCRIPTION batch
The batch of data for which to generate transformed gradients.
TYPE: BatchType
YIELDS DESCRIPTION TensorType
Transformed gradients for each block.
Source code in src/pydvl/influence/types.py
def generate_transformed_grads(\n self, batch: BatchType\n) -> Generator[TensorType, None, None]:\n \"\"\"\n Generator that yields transformed gradients for a given batch,\n processed by each block.\n\n Args:\n batch: The batch of data for which to generate transformed gradients.\n\n Yields:\n Transformed gradients for each block.\n \"\"\"\n for comp_block in self.composable_block_dict.values():\n yield comp_block.transformed_grads(batch)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BlockMapper.generate_interactions","title":"generate_interactions","text":"generate_interactions(\n left_batch: BatchType, right_batch: Optional[BatchType], mode: InfluenceMode\n) -> Generator[TensorType, None, None]\n
Generator that yields gradient interactions between two batches, processed by each block based on a mode.
PARAMETER DESCRIPTION left_batch
The left batch for interaction computation.
TYPE: BatchType
right_batch
The right batch for interaction computation.
TYPE: Optional[BatchType]
mode
The mode determining the type of interactions.
TYPE: InfluenceMode
YIELDS DESCRIPTION TensorType
Gradient interactions for each block.
TYPE: TensorType
Source code in src/pydvl/influence/types.py
def generate_interactions(\n self,\n left_batch: BatchType,\n right_batch: Optional[BatchType],\n mode: InfluenceMode,\n) -> Generator[TensorType, None, None]:\n \"\"\"\n Generator that yields gradient interactions between two batches, processed by\n each block based on a mode.\n\n Args:\n left_batch: The left batch for interaction computation.\n right_batch: The right batch for interaction computation.\n mode: The mode determining the type of interactions.\n\n Yields:\n TensorType: Gradient interactions for each block.\n \"\"\"\n for comp_block in self.composable_block_dict.values():\n yield comp_block.interactions(left_batch, right_batch, mode)\n
"},{"location":"api/pydvl/influence/types/#pydvl.influence.types.BlockMapper.generate_interactions_from_transformed_grads","title":"generate_interactions_from_transformed_grads","text":"generate_interactions_from_transformed_grads(\n left_factors: Union[TensorType, OrderedDict[str, TensorType]],\n right_batch: BatchType,\n mode: InfluenceMode,\n) -> Generator[TensorType, None, None]\n
Generator that yields interactions computed from pre-computed factors and a right batch, processed by each block based on a mode.
PARAMETER DESCRIPTION left_factors
Pre-computed factors as a tensor or an ordered dictionary of tensors by block.
TYPE: Union[TensorType, OrderedDict[str, TensorType]]
right_batch
The right batch for interaction computation.
TYPE: BatchType
mode
The mode determining the type of interactions.
TYPE: InfluenceMode
YIELDS DESCRIPTION TensorType
Interactions for each block.
TYPE: TensorType
Source code in src/pydvl/influence/types.py
def generate_interactions_from_transformed_grads(\n self,\n left_factors: Union[TensorType, OrderedDict[str, TensorType]],\n right_batch: BatchType,\n mode: InfluenceMode,\n) -> Generator[TensorType, None, None]:\n \"\"\"\n Generator that yields interactions computed from pre-computed factors and a\n right batch, processed by each block based on a mode.\n\n Args:\n left_factors: Pre-computed factors as a tensor or an ordered dictionary of\n tensors by block.\n right_batch: The right batch for interaction computation.\n mode: The mode determining the type of interactions.\n\n Yields:\n TensorType: Interactions for each block.\n \"\"\"\n if not isinstance(left_factors, dict):\n left_factors_dict = self._split_to_blocks(left_factors)\n else:\n left_factors_dict = cast(OrderedDict[str, TensorType], left_factors)\n for k, comp_block in self.composable_block_dict.items():\n yield comp_block.interactions_from_transformed_grads(\n left_factors_dict[k], right_batch, mode\n )\n
"},{"location":"api/pydvl/influence/torch/","title":"Torch","text":""},{"location":"api/pydvl/influence/torch/#pydvl.influence.torch","title":"pydvl.influence.torch","text":""},{"location":"api/pydvl/influence/torch/base/","title":"Base","text":""},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base","title":"pydvl.influence.torch.base","text":""},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchBatch","title":"TorchBatch dataclass
","text":"TorchBatch(x: Tensor, y: Tensor)\n
Bases: Batch
A convenience class for handling batches of data. Validates the alignment of the first (batch) dimension of the input and target tensors.
ATTRIBUTE DESCRIPTION x
The input tensor that contains features or data points.
TYPE: Tensor
y
The target tensor that contains labels corresponding to the inputs.
TYPE: Tensor
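A minimal usage sketch, assuming TorchBatch can be imported from pydvl.influence.torch.base as the source path above suggests:

```python
import torch
from pydvl.influence.torch.base import TorchBatch

x = torch.randn(8, 3)  # 8 samples with 3 features
y = torch.randn(8, 1)  # 8 targets, aligned along the first (batch) dimension
batch = TorchBatch(x, y)
```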
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchGradientProvider","title":"TorchGradientProvider","text":"TorchGradientProvider(\n model: Module, loss: LossType, restrict_to: Optional[Dict[str, Parameter]]\n)\n
Bases: GradientProvider[TorchBatch, Tensor]
Compute per-sample gradients of a function defined by a torch.nn.Module and a loss function using torch.func.
Consider a function
\\[ \\ell: \\mathbb{R}^{d_1} \\times \\mathbb{R}^{d_2} \\times \\mathbb{R}^{n} \\times \\mathbb{R}^{n}, \\quad \\ell(\\omega_1, \\omega_2, x, y) = \\operatorname{loss}(f(\\omega_1, \\omega_2; x), y) \\] e.g. a two layer neural network \\(f\\) with a loss function, then this object should compute the expressions:
\\[ \\nabla_{\\omega_{i}}\\ell(\\omega_1, \\omega_2, x, y), \\nabla_{\\omega_{i}}\\nabla_{x}\\ell(\\omega_1, \\omega_2, x, y), \\nabla_{\\omega}\\ell(\\omega_1, \\omega_2, x, y) \\cdot v\\] Source code in src/pydvl/influence/torch/base.py
def __init__(\n self,\n model: torch.nn.Module,\n loss: LossType,\n restrict_to: Optional[Dict[str, torch.nn.Parameter]],\n):\n self.model = model\n self.loss = loss\n\n if restrict_to is None:\n restrict_to = ModelParameterDictBuilder(model).build_from_block_mode(\n BlockMode.FULL\n )\n\n self.params_to_restrict_to = restrict_to\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchGradientProvider.grads","title":"grads","text":"grads(batch: TorchBatch) -> Dict[str, Tensor]\n
Computes and returns a dictionary mapping parameter names to their respective per-sample gradients. Given the example in the class docstring, this means
\\[ \\text{result}[\\omega_i] = \\nabla_{\\omega_{i}}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y}), \\] where the first dimension of the resulting tensors is always considered to be the batch dimension, so the shape of the resulting tensors are \\((N, d_i)\\), where \\(N\\) is the number of samples in the batch.
PARAMETER DESCRIPTION batch
The batch of data for which to compute gradients.
TYPE: TorchBatch
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary where keys are gradient identifiers and values are the gradients computed per sample.
Source code in src/pydvl/influence/torch/base.py
def grads(self, batch: TorchBatch) -> Dict[str, torch.Tensor]:\n r\"\"\"\n Computes and returns a dictionary mapping parameter names to their respective\n per-sample gradients. Given the example in the class docstring, this means\n\n $$ \\text{result}[\\omega_i] = \\nabla_{\\omega_{i}}\\ell(\\omega_1, \\omega_2,\n \\text{batch.x}, \\text{batch.y}), $$\n\n where the first dimension of the resulting tensors is always considered to be\n the batch dimension, so the shape of the resulting tensors are $(N, d_i)$,\n where $N$ is the number of samples in the batch.\n\n Args:\n batch: The batch of data for which to compute gradients.\n\n Returns:\n A dictionary where keys are gradient identifiers and values are the\n gradients computed per sample.\n \"\"\"\n gradient_dict = self._grads(batch.to(self.device))\n return self._detach_dict(gradient_dict)\n
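A minimal sketch of per-sample gradient computation, assuming the imports below match the module layout shown above; the model and loss are placeholders:

```python
import torch
from pydvl.influence.torch.base import TorchBatch, TorchGradientProvider

model = torch.nn.Linear(3, 1)
loss = torch.nn.functional.mse_loss  # any loss(output, target) -> scalar

# restrict_to=None computes gradients with respect to all model parameters
gp = TorchGradientProvider(model, loss, restrict_to=None)

batch = TorchBatch(torch.randn(8, 3), torch.randn(8, 1))
per_sample_grads = gp.grads(batch)
for name, g in per_sample_grads.items():
    print(name, g.shape)  # first dimension is the batch size N=8
```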
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchGradientProvider.mixed_grads","title":"mixed_grads","text":"mixed_grads(batch: TorchBatch) -> Dict[str, Tensor]\n
Computes and returns a dictionary mapping gradient names to their respective per-sample mixed gradients. In this context, mixed gradients refer to computing gradients with respect to the parameters defining the instance in addition to derivatives with respect to the input batch. Given the example in the class docstring, this means
\\[ \\text{result}[\\omega_i] = \\nabla_{\\omega_{i}}\\nabla_{x}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y}), \\] where the first dimension of the resulting tensors is always considered to be the batch dimension and the last to be the non-batch input related derivatives. So the shape of the resulting tensors are \\((N, n, d_i)\\), where \\(N\\) is the number of samples in the batch.
PARAMETER DESCRIPTION batch
The batch of data for which to compute mixed gradients.
TYPE: TorchBatch
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary where keys are gradient identifiers and values are the mixed gradients computed per sample.
Source code in src/pydvl/influence/torch/base.py
def mixed_grads(self, batch: TorchBatch) -> Dict[str, torch.Tensor]:\n r\"\"\"\n Computes and returns a dictionary mapping gradient names to their respective\n per-sample mixed gradients. In this context, mixed gradients refer to computing\n gradients with respect to the instance definition in addition to\n compute derivatives with respect to the input batch.\n Given the example in the class docstring, this means\n\n $$ \\text{result}[\\omega_i] = \\nabla_{\\omega_{i}}\\nabla_{x}\\ell(\\omega_1,\n \\omega_2, \\text{batch.x}, \\text{batch.y}), $$\n\n where the first dimension of the resulting tensors is always considered to be\n the batch dimension and the last to be the non-batch input related derivatives.\n So the shape of the resulting tensors are $(N, n, d_i)$,\n where $N$ is the number of samples in the batch.\n\n Args:\n batch: The batch of data for which to compute mixed gradients.\n\n Returns:\n A dictionary where keys are gradient identifiers and values are the\n mixed gradients computed per sample.\n \"\"\"\n gradient_dict = self._mixed_grads(batch.to(self.device))\n return self._detach_dict(gradient_dict)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchGradientProvider.jacobian_prod","title":"jacobian_prod","text":"jacobian_prod(batch: TorchBatch, g: Tensor) -> Tensor\n
Computes the matrix-Jacobian product for the provided batch and input tensor. Given the example in the class docstring, this means
\\[ (\\nabla_{\\omega_{1}}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y}), \\nabla_{\\omega_{2}}\\ell(\\omega_1, \\omega_2, \\text{batch.x}, \\text{batch.y})) \\cdot g^T\\] where g must be a tensor of shape \\((K, d_1+d_2)\\), so the resulting tensor is of shape \\((N, K)\\).
PARAMETER DESCRIPTION batch
The batch of data for which to compute the Jacobian.
TYPE: TorchBatch
g
The tensor to be used in the matrix-Jacobian product calculation.
TYPE: Tensor
RETURNS DESCRIPTION Tensor
The resulting tensor from the matrix-Jacobian product computation.
Source code in src/pydvl/influence/torch/base.py
def jacobian_prod(\n self,\n batch: TorchBatch,\n g: torch.Tensor,\n) -> torch.Tensor:\n r\"\"\"\n Computes the matrix-Jacobian product for the provided batch and input tensor.\n Given the example in the class docstring, this means\n\n $$ (\\nabla_{\\omega_{1}}\\ell(\\omega_1, \\omega_2,\n \\text{batch.x}, \\text{batch.y}),\n \\nabla_{\\omega_{2}}\\ell(\\omega_1, \\omega_2,\n \\text{batch.x}, \\text{batch.y})) \\cdot g^T$$\n\n where g must be a tensor of shape $(K, d_1+d_2)$, so the resulting tensor\n is of shape $(N, K)$.\n\n Args:\n batch: The batch of data for which to compute the Jacobian.\n g: The tensor to be used in the matrix-Jacobian product\n calculation.\n\n Returns:\n The resulting tensor from the matrix-Jacobian product computation.\n \"\"\"\n result = self._jacobian_prod(batch.to(self.device), g.to(self.device))\n if result.requires_grad:\n result = result.detach()\n return result\n
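A shape-oriented sketch of the matrix-Jacobian product, under the same assumptions as above; here the model has \(d_1 + d_2 = 4\) parameters in total:

```python
import torch
from pydvl.influence.torch.base import TorchBatch, TorchGradientProvider

model = torch.nn.Linear(3, 1)  # 3 weights + 1 bias = 4 parameters
gp = TorchGradientProvider(model, torch.nn.functional.mse_loss, restrict_to=None)

batch = TorchBatch(torch.randn(8, 3), torch.randn(8, 1))  # N = 8
g = torch.randn(5, 4)                # K = 5 rows, each of length d_1 + d_2 = 4
result = gp.jacobian_prod(batch, g)
print(result.shape)                  # expected: torch.Size([8, 5]), i.e. (N, K)
```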
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.OperatorBilinearForm","title":"OperatorBilinearForm","text":"OperatorBilinearForm(operator: 'TensorOperator')\n
Bases: BilinearForm[Tensor, TorchBatch, TorchGradientProvider]
Base class for bilinear forms based on an instance of TensorOperator. This means it computes weighted inner products of the form:
\\[ \\langle \\operatorname{Op}(x), y \\rangle \\] Source code in src/pydvl/influence/torch/base.py
def __init__(\n self,\n operator: \"TensorOperator\",\n):\n self.operator = operator\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.OperatorBilinearForm.grads_inner_prod","title":"grads_inner_prod","text":"grads_inner_prod(\n left: BatchType,\n right: Optional[BatchType],\n gradient_provider: GradientProviderType,\n) -> TensorType\n
Computes the gradient inner product of two batches of data, i.e.
\\[ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}), \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}\\] where \\(\\nabla_{\\omega}\\ell(\\omega, \\cdot, \\cdot)\\) is represented by the gradient_provider
and the expression must be understood sample-wise.
PARAMETER DESCRIPTION left
The first batch for gradient and inner product computation
TYPE: BatchType
right
The second batch for gradient and inner product computation, optional; if not provided, the inner product will use the gradient computed for left
for both arguments.
TYPE: Optional[BatchType]
gradient_provider
The gradient provider to compute the gradients.
TYPE: GradientProviderType
RETURNS DESCRIPTION TensorType
A tensor representing the inner products of the per-sample gradients
Source code in src/pydvl/influence/types.py
def grads_inner_prod(\n self,\n left: BatchType,\n right: Optional[BatchType],\n gradient_provider: GradientProviderType,\n) -> TensorType:\n r\"\"\"\n Computes the gradient inner product of two batches of data, i.e.\n\n $$ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}),\n \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}$$\n\n where $\\nabla_{\\omega}\\ell(\\omega, \\cdot, \\cdot)$ is represented by the\n `gradient_provider` and the expression must be understood sample-wise.\n\n Args:\n left: The first batch for gradient and inner product computation\n right: The second batch for gradient and inner product computation,\n optional; if not provided, the inner product will use the gradient\n computed for `left` for both arguments.\n gradient_provider: The gradient provider to compute the gradients.\n\n Returns:\n A tensor representing the inner products of the per-sample gradients\n \"\"\"\n left_grad = gradient_provider.flat_grads(left)\n if right is None:\n right_grad = left_grad\n else:\n right_grad = gradient_provider.flat_grads(right)\n return self.inner_prod(left_grad, right_grad)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.OperatorBilinearForm.mixed_grads_inner_prod","title":"mixed_grads_inner_prod","text":"mixed_grads_inner_prod(\n left: BatchType,\n right: Optional[BatchType],\n gradient_provider: GradientProviderType,\n) -> TensorType\n
Computes the mixed gradient inner product of two batches of data, i.e.
\\[ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}), \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}\\] where \\(\\nabla_{\\omega}\\ell(\\omega, \\cdot)\\) and \\(\\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\cdot)\\) are represented by the gradient_provider
. The expression must be understood sample-wise.
PARAMETER DESCRIPTION left
The first batch for gradient and inner product computation
TYPE: BatchType
right
The second batch for gradient and inner product computation
TYPE: Optional[BatchType]
gradient_provider
The gradient provider to compute the gradients.
TYPE: GradientProviderType
RETURNS DESCRIPTION TensorType
A tensor representing the inner products of the mixed per-sample gradients
Source code in src/pydvl/influence/types.py
def mixed_grads_inner_prod(\n self,\n left: BatchType,\n right: Optional[BatchType],\n gradient_provider: GradientProviderType,\n) -> TensorType:\n r\"\"\"\n Computes the mixed gradient inner product of two batches of data, i.e.\n\n $$ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}),\n \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y})\n \\rangle_{B}$$\n\n where $\\nabla_{\\omega}\\ell(\\omega, \\cdot)$ and\n $\\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\cdot)$ are represented by the\n `gradient_provider`. The expression must be understood sample-wise.\n\n Args:\n left: The first batch for gradient and inner product computation\n right: The second batch for gradient and inner product computation\n gradient_provider: The gradient provider to compute the gradients.\n\n Returns:\n A tensor representing the inner products of the mixed per-sample gradients\n \"\"\"\n left_grad = gradient_provider.flat_grads(left)\n if right is None:\n right = left\n right_mixed_grad = gradient_provider.flat_mixed_grads(right)\n return self.inner_prod(left_grad, right_mixed_grad)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.OperatorBilinearForm.inner_prod","title":"inner_prod","text":"inner_prod(left: Tensor, right: Optional[Tensor]) -> Tensor\n
Computes the weighted inner product of two vectors, i.e.
\\[ \\langle \\operatorname{Op}(\\text{left}), \\text{right} \\rangle \\] PARAMETER DESCRIPTION left
The first tensor in the inner product computation.
TYPE: Tensor
right
The second tensor, optional; if not provided, the inner product will use left
tensor for both arguments.
TYPE: Optional[Tensor]
RETURNS DESCRIPTION Tensor
A tensor representing the inner product.
Source code in src/pydvl/influence/torch/base.py
def inner_prod(\n self, left: torch.Tensor, right: Optional[torch.Tensor]\n) -> torch.Tensor:\n r\"\"\"\n Computes the weighted inner product of two vectors, i.e.\n\n $$ \\langle \\operatorname{Op}(\\text{left}), \\text{right} \\rangle $$\n\n Args:\n left: The first tensor in the inner product computation.\n right: The second tensor, optional; if not provided, the inner product will\n use `left` tensor for both arguments.\n\n Returns:\n A tensor representing the inner product.\n \"\"\"\n if right is None:\n right = left\n if left.shape[0] <= right.shape[0]:\n return self._inner_product(left, right)\n return self._inner_product(right, left).T\n
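The branch on left.shape[0] <= right.shape[0] applies the operator to the smaller batch and transposes the result; for a symmetric operator (as Hessian-type operators are) both orders give the same matrix. A plain-torch illustration of that identity, not pydvl internals:

```python
import torch

d, n_left, n_right = 4, 2, 6
a = torch.randn(d, d)
op = a @ a.T                      # a symmetric positive semi-definite "operator"
left = torch.randn(n_left, d)
right = torch.randn(n_right, d)

direct = left @ op @ right.T                        # <Op(left_i), right_j>
swapped = (right @ op @ left.T).T                   # operator applied to the other side
print(torch.allclose(direct, swapped, atol=1e-5))   # True
```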
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.DictBilinearForm","title":"DictBilinearForm","text":"DictBilinearForm(operator: 'TensorDictOperator')\n
Bases: OperatorBilinearForm
Base class for bilinear forms based on an instance of TensorDictOperator. This means it computes weighted inner products of the form:
\\[ \\langle \\operatorname{Op}(x), y \\rangle \\] Source code in src/pydvl/influence/torch/base.py
def __init__(\n self,\n operator: \"TensorDictOperator\",\n):\n super().__init__(operator)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.DictBilinearForm.inner_prod","title":"inner_prod","text":"inner_prod(left: Tensor, right: Optional[Tensor]) -> Tensor\n
Computes the weighted inner product of two vectors, i.e.
\\[ \\langle \\operatorname{Op}(\\text{left}), \\text{right} \\rangle \\] PARAMETER DESCRIPTION left
The first tensor in the inner product computation.
TYPE: Tensor
right
The second tensor, optional; if not provided, the inner product will use left
tensor for both arguments.
TYPE: Optional[Tensor]
RETURNS DESCRIPTION Tensor
A tensor representing the inner product.
Source code in src/pydvl/influence/torch/base.py
def inner_prod(\n self, left: torch.Tensor, right: Optional[torch.Tensor]\n) -> torch.Tensor:\n r\"\"\"\n Computes the weighted inner product of two vectors, i.e.\n\n $$ \\langle \\operatorname{Op}(\\text{left}), \\text{right} \\rangle $$\n\n Args:\n left: The first tensor in the inner product computation.\n right: The second tensor, optional; if not provided, the inner product will\n use `left` tensor for both arguments.\n\n Returns:\n A tensor representing the inner product.\n \"\"\"\n if right is None:\n right = left\n if left.shape[0] <= right.shape[0]:\n return self._inner_product(left, right)\n return self._inner_product(right, left).T\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.DictBilinearForm.grads_inner_prod","title":"grads_inner_prod","text":"grads_inner_prod(\n left: TorchBatch,\n right: Optional[TorchBatch],\n gradient_provider: TorchGradientProvider,\n) -> Tensor\n
Computes the gradient inner product of two batches of data, i.e.
\\[ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}), \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}\\] where \\(\\nabla_{\\omega}\\ell(\\omega, \\cdot, \\cdot)\\) is represented by the gradient_provider
and the expression must be understood sample-wise.
PARAMETER DESCRIPTION left
The first batch for gradient and inner product computation
TYPE: TorchBatch
right
The second batch for gradient and inner product computation, optional; if not provided, the inner product will use the gradient computed for left
for both arguments.
TYPE: Optional[TorchBatch]
gradient_provider
The gradient provider to compute the gradients.
TYPE: TorchGradientProvider
RETURNS DESCRIPTION Tensor
A tensor representing the inner products of the per-sample gradients
Source code in src/pydvl/influence/torch/base.py
def grads_inner_prod(\n self,\n left: TorchBatch,\n right: Optional[TorchBatch],\n gradient_provider: TorchGradientProvider,\n) -> torch.Tensor:\n r\"\"\"\n Computes the gradient inner product of two batches of data, i.e.\n\n $$ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}),\n \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}$$\n\n where $\\nabla_{\\omega}\\ell(\\omega, \\cdot, \\cdot)$ is represented by the\n `gradient_provider` and the expression must be understood sample-wise.\n\n Args:\n left: The first batch for gradient and inner product computation\n right: The second batch for gradient and inner product computation,\n optional; if not provided, the inner product will use the gradient\n computed for `left` for both arguments.\n gradient_provider: The gradient provider to compute the gradients.\n\n Returns:\n A tensor representing the inner products of the per-sample gradients\n \"\"\"\n operator = cast(TensorDictOperator, self.operator)\n left_grads = gradient_provider.grads(left)\n if right is None:\n right_grads = left_grads\n else:\n right_grads = gradient_provider.grads(right)\n\n left_batch_size, right_batch_size = next(\n (\n (l.shape[0], r.shape[0])\n for r, l in zip(left_grads.values(), right_grads.values())\n )\n )\n\n if left_batch_size <= right_batch_size:\n left_grads = operator.apply_to_dict(left_grads)\n tensor_pairs = zip(left_grads.values(), right_grads.values())\n else:\n right_grads = operator.apply_to_dict(right_grads)\n tensor_pairs = zip(left_grads.values(), right_grads.values())\n\n tensors_to_reduce = (\n self._aggregate_grads(left, right) for left, right in tensor_pairs\n )\n\n return cast(torch.Tensor, sum(tensors_to_reduce))\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.DictBilinearForm.mixed_grads_inner_prod","title":"mixed_grads_inner_prod","text":"mixed_grads_inner_prod(\n left: TorchBatch,\n right: Optional[TorchBatch],\n gradient_provider: TorchGradientProvider,\n) -> Tensor\n
Computes the mixed gradient inner product of two batches of data, i.e.
\\[ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}), \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle_{B}\\] where \\(\\nabla_{\\omega}\\ell(\\omega, \\cdot)\\) and \\(\\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\cdot)\\) are represented by the gradient_provider
. The expression must be understood sample-wise.
PARAMETER DESCRIPTION left
The first batch for gradient and inner product computation
TYPE: TorchBatch
right
The second batch for gradient and inner product computation
TYPE: Optional[TorchBatch]
gradient_provider
The gradient provider to compute the gradients.
TYPE: TorchGradientProvider
RETURNS DESCRIPTION Tensor
A tensor representing the inner products of the mixed per-sample gradients
Source code in src/pydvl/influence/torch/base.py
def mixed_grads_inner_prod(\n self,\n left: TorchBatch,\n right: Optional[TorchBatch],\n gradient_provider: TorchGradientProvider,\n) -> torch.Tensor:\n r\"\"\"\n Computes the mixed gradient inner product of two batches of data, i.e.\n\n $$ \\langle \\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y}),\n \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y})\n \\rangle_{B}$$\n\n where $\\nabla_{\\omega}\\ell(\\omega, \\cdot)$ and\n $\\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\cdot)$ are represented by the\n `gradient_provider`. The expression must be understood sample-wise.\n\n Args:\n left: The first batch for gradient and inner product computation\n right: The second batch for gradient and inner product computation\n gradient_provider: The gradient provider to compute the gradients.\n\n Returns:\n A tensor representing the inner products of the mixed per-sample gradients\n \"\"\"\n operator = cast(TensorDictOperator, self.operator)\n if right is None:\n right = left\n right_grads = gradient_provider.mixed_grads(right)\n left_grads = gradient_provider.grads(left)\n left_grads = operator.apply_to_dict(left_grads)\n left_grads_views = (t.reshape(t.shape[0], -1) for t in left_grads.values())\n right_grads_views = (\n t.reshape(*right.x.shape, -1) for t in right_grads.values()\n )\n tensor_pairs = zip(left_grads_views, right_grads_views)\n tensors_to_reduce = (\n self._aggregate_mixed_grads(left, right) for left, right in tensor_pairs\n )\n return cast(torch.Tensor, sum(tensors_to_reduce))\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TensorOperator","title":"TensorOperator","text":" Bases: Operator[Tensor, OperatorBilinearForm]
, ABC
Abstract base class for operators that can be applied to instances of torch.Tensor.
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TensorOperator.input_size","title":"input_size abstractmethod
property
","text":"input_size: int\n
Abstract property to get the needed size for inputs to the operator instance
RETURNS DESCRIPTION int
An integer representing the input size.
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TensorOperator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TensorDictOperator","title":"TensorDictOperator","text":" Bases: TensorOperator
, ABC
Abstract base class for operators that can be applied to instances of torch.Tensor and compatible dictionaries mapping strings to tensors. Input dictionaries must conform to the structure defined by the property input_dict_structure
. Useful for operators involving autograd functionality to avoid intermediate flattening and concatenating of gradient inputs.
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TensorDictOperator.input_size","title":"input_size abstractmethod
property
","text":"input_size: int\n
Abstract property to get the needed size for inputs to the operator instance
RETURNS DESCRIPTION int
An integer representing the input size.
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TensorDictOperator.input_dict_structure","title":"input_dict_structure abstractmethod
property
","text":"input_dict_structure: Dict[str, Tuple[int, ...]]\n
Implement this to expose the expected structure of the input tensor dict, i.e. a dictionary of shapes (excluding the first batch dimension), in order to validate the input tensor dicts.
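A hypothetical example of such a structure for a single linear layer (the parameter names are illustrative):

```python
# Shapes exclude the leading batch dimension.
input_dict_structure = {"linear.weight": (1, 3), "linear.bias": (1,)}
```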
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TensorDictOperator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TensorDictOperator.apply_to_dict","title":"apply_to_dict","text":"apply_to_dict(mat: Dict[str, Tensor]) -> Dict[str, Tensor]\n
Applies the operator to a dictionary of tensors compatible with the structure defined by the property input_dict_structure
.
PARAMETER DESCRIPTION mat
dictionary of tensors, whose keys and shapes match the property input_dict_structure
.
TYPE: Dict[str, Tensor]
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary of tensors after applying the operator
Source code in src/pydvl/influence/torch/base.py
def apply_to_dict(self, mat: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:\n \"\"\"\n Applies the operator to a dictionary of tensors, compatible to the structure\n defined by the property `input_dict_structure`.\n\n Args:\n mat: dictionary of tensors, whose keys and shapes match the property\n `input_dict_structure`.\n\n Returns:\n A dictionary of tensors after applying the operator\n \"\"\"\n\n if not self._validate_mat_dict(mat):\n raise ValueError(\n f\"Incompatible input structure, expected (excluding batch\"\n f\"dimension): \\n {self.input_dict_structure}\"\n )\n\n return self._apply_to_dict(self._dict_to_device(mat))\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchOperatorGradientComposition","title":"TorchOperatorGradientComposition","text":"TorchOperatorGradientComposition(\n op: TorchOperatorType, gp: TorchGradientProvider\n)\n
Bases: OperatorGradientComposition[Tensor, TorchBatch, TorchOperatorType, TorchGradientProvider]
Represents a composable block that integrates a TorchOperator and a TorchGradientProvider.
This block is designed to be flexible, handling different computational modes via an abstract operator and gradient provider.
Source code in src/pydvl/influence/torch/base.py
def __init__(self, op: TorchOperatorType, gp: TorchGradientProvider):\n super().__init__(op, gp)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchOperatorGradientComposition.interactions","title":"interactions","text":"interactions(\n left_batch: BatchType, right_batch: Optional[BatchType], mode: InfluenceMode\n)\n
Computes the interaction between the gradients on two batches of data based on the specified mode weighted by the operator action, i.e.
\\[ \\langle \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y})), \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle\\] for the case InfluenceMode.Up
and
\\[ \\langle \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{left.x}, \\text{left.y})), \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle \\] for the case InfluenceMode.Perturbation
.
PARAMETER DESCRIPTION left_batch
The left data batch for gradient computation.
TYPE: BatchType
right_batch
The right data batch for gradient computation.
TYPE: Optional[BatchType]
mode
An instance of InfluenceMode determining the type of influence computation.
TYPE: InfluenceMode
RETURNS DESCRIPTION The result of the influence computation as dictated by the mode.
Source code in src/pydvl/influence/types.py
def interactions(\n self,\n left_batch: BatchType,\n right_batch: Optional[BatchType],\n mode: InfluenceMode,\n):\n r\"\"\"\n Computes the interaction between the gradients on two batches of data based on\n the specified mode weighted by the operator action,\n i.e.\n\n $$ \\langle \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{left.x},\n \\text{left.y})),\n \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle$$\n\n for the case `InfluenceMode.Up` and\n\n $$ \\langle \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{left.x},\n \\text{left.y})),\n \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle $$\n\n for the case `InfluenceMode.Perturbation`.\n\n Args:\n left_batch: The left data batch for gradient computation.\n right_batch: The right data batch for gradient computation.\n mode: An instance of InfluenceMode determining the type of influence\n computation.\n\n Returns:\n The result of the influence computation as dictated by the mode.\n \"\"\"\n bilinear_form = self.op.as_bilinear_form()\n if mode == InfluenceMode.Up:\n return bilinear_form.grads_inner_prod(left_batch, right_batch, self.gp)\n elif mode == InfluenceMode.Perturbation:\n return bilinear_form.mixed_grads_inner_prod(\n left_batch, right_batch, self.gp\n )\n else:\n raise UnsupportedInfluenceModeException(mode)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchOperatorGradientComposition.transformed_grads","title":"transformed_grads","text":"transformed_grads(batch: BatchType)\n
Computes the gradients of a data batch, transformed by the operator application, i.e. the expression
\\[ \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{batch.x}, \\text{batch.y})) \\] PARAMETER DESCRIPTION batch
The data batch for gradient computation.
TYPE: BatchType
RETURNS DESCRIPTION A tensor representing the application of the operator to the gradients.
Source code in src/pydvl/influence/types.py
def transformed_grads(self, batch: BatchType):\n r\"\"\"\n Computes the gradients of a data batch, transformed by the operator application\n , i.e. the expressions\n\n $$ \\operatorname{Op}(\\nabla_{\\omega}\\ell(\\omega, \\text{batch.x},\n \\text{batch.y})) $$\n\n Args:\n batch: The data batch for gradient computation.\n\n Returns:\n A tensor representing the application of the operator to the gradients.\n\n \"\"\"\n grads = self.gp.flat_grads(batch)\n return self.op.apply(grads)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchOperatorGradientComposition.interactions_from_transformed_grads","title":"interactions_from_transformed_grads","text":"interactions_from_transformed_grads(\n left_factors: TensorType, right_batch: BatchType, mode: InfluenceMode\n)\n
Computes the interaction between transformed gradients, using pre-computed factors from a left batch and the gradients of a right batch, based on the specified mode. This means
\\[ \\langle \\text{left_factors}, \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle\\] for the case InfluenceMode.Up
and
\\[ \\langle \\text{left_factors}, \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle \\] for the case InfluenceMode.Perturbation
.
PARAMETER DESCRIPTION left_factors
Pre-computed tensor factors from a left batch.
TYPE: TensorType
right_batch
The right data batch for influence computation.
TYPE: BatchType
mode
An instance of InfluenceMode determining the type of influence computation.
TYPE: InfluenceMode
RETURNS DESCRIPTION The result of the interaction computation using the provided factors and batch gradients.
Source code in src/pydvl/influence/types.py
def interactions_from_transformed_grads(\n self, left_factors: TensorType, right_batch: BatchType, mode: InfluenceMode\n):\n r\"\"\"\n Computes the interaction between the transformed gradients on two batches of\n data using pre-computed factors and a batch of data,\n based on the specified mode. This means\n\n $$ \\langle \\text{left_factors},\n \\nabla_{\\omega}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle$$\n\n for the case `InfluenceMode.Up` and\n\n $$ \\langle \\text{left_factors},\n \\nabla_{\\omega}\\nabla_{x}\\ell(\\omega, \\text{right.x}, \\text{right.y}) \\rangle $$\n\n for the case `InfluenceMode.Perturbation`.\n\n Args:\n left_factors: Pre-computed tensor factors from a left batch.\n right_batch: The right data batch for influence computation.\n mode: An instance of InfluenceMode determining the type of influence\n computation.\n\n Returns:\n The result of the interaction computation using the provided factors and\n batch gradients.\n \"\"\"\n if mode is InfluenceMode.Up:\n right_grads = self.gp.flat_grads(right_batch)\n else:\n right_grads = self.gp.flat_mixed_grads(right_batch)\n return self._tensor_inner_product(left_factors, right_grads)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchBlockMapper","title":"TorchBlockMapper","text":"TorchBlockMapper(\n composable_block_dict: OrderedDict[str, TorchOperatorGradientComposition]\n)\n
Bases: BlockMapper[Tensor, TorchBatch, TorchOperatorGradientComposition[TorchOperatorType]]
Class for mapping operations across multiple compositional blocks represented by instances of TorchOperatorGradientComposition.
This class takes a dictionary of compositional blocks and applies their methods to batches or tensors, and aggregates the results.
Source code in src/pydvl/influence/torch/base.py
def __init__(\n self, composable_block_dict: OrderedDict[str, TorchOperatorGradientComposition]\n):\n super().__init__(composable_block_dict)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchBlockMapper.transformed_grads","title":"transformed_grads","text":"transformed_grads(batch: BatchType) -> OrderedDict[str, TensorType]\n
Computes and returns the transformed gradients for a batch in a dictionary whose keys are the block names.
PARAMETER DESCRIPTION batch
The batch of data for which to compute transformed gradients.
TYPE: BatchType
RETURNS DESCRIPTION OrderedDict[str, TensorType]
An ordered dictionary of transformed gradients by block.
Source code in src/pydvl/influence/types.py
def transformed_grads(\n self,\n batch: BatchType,\n) -> OrderedDict[str, TensorType]:\n \"\"\"\n Computes and returns the transformed gradients for a batch in dictionary\n with the keys defined by the block names.\n\n Args:\n batch: The batch of data for which to compute transformed gradients.\n\n Returns:\n An ordered dictionary of transformed gradients by block.\n \"\"\"\n tensor_gen = self.generate_transformed_grads(batch)\n return self._to_ordered_dict(tensor_gen)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchBlockMapper.interactions","title":"interactions","text":"interactions(\n left_batch: BatchType, right_batch: BatchType, mode: InfluenceMode\n) -> OrderedDict[str, TensorType]\n
Computes interactions between two batches, aggregated by block, based on a specified mode.
PARAMETER DESCRIPTION left_batch
The left batch for interaction computation.
TYPE: BatchType
right_batch
The right batch for interaction computation.
TYPE: BatchType
mode
The mode determining the type of interactions.
TYPE: InfluenceMode
RETURNS DESCRIPTION OrderedDict[str, TensorType]
An ordered dictionary of gradient interactions by block.
Source code in src/pydvl/influence/types.py
def interactions(\n self, left_batch: BatchType, right_batch: BatchType, mode: InfluenceMode\n) -> OrderedDict[str, TensorType]:\n \"\"\"\n Computes interactions between two batches, aggregated by block,\n based on a specified mode.\n\n Args:\n left_batch: The left batch for interaction computation.\n right_batch: The right batch for interaction computation.\n mode: The mode determining the type of interactions.\n\n Returns:\n An ordered dictionary of gradient interactions by block.\n \"\"\"\n tensor_gen = self.generate_interactions(left_batch, right_batch, mode)\n return self._to_ordered_dict(tensor_gen)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchBlockMapper.interactions_from_transformed_grads","title":"interactions_from_transformed_grads","text":"interactions_from_transformed_grads(\n left_factors: OrderedDict[str, TensorType],\n right_batch: BatchType,\n mode: InfluenceMode,\n) -> OrderedDict[str, TensorType]\n
Computes interactions from transformed gradients and a right batch, aggregated by block and based on a mode.
PARAMETER DESCRIPTION left_factors
Pre-computed factors as a tensor or an ordered dictionary of tensors by block. If the input is a tensor, it is split into blocks according to the ordering in the composable_block_dict
attribute.
TYPE: OrderedDict[str, TensorType]
right_batch
The right batch for interaction computation.
TYPE: BatchType
mode
The mode determining the type of interactions.
TYPE: InfluenceMode
RETURNS DESCRIPTION OrderedDict[str, TensorType]
An ordered dictionary of interactions from transformed gradients by block.
Source code in src/pydvl/influence/types.py
def interactions_from_transformed_grads(\n self,\n left_factors: OrderedDict[str, TensorType],\n right_batch: BatchType,\n mode: InfluenceMode,\n) -> OrderedDict[str, TensorType]:\n \"\"\"\n Computes interactions from transformed gradients and a right batch,\n aggregated by block and based on a mode.\n\n Args:\n left_factors: Pre-computed factors as a tensor or an ordered dictionary of\n tensors by block. If the input is a tensor, it is split into blocks\n according to the ordering in the `composable_block_dict` attribute.\n right_batch: The right batch for interaction computation.\n mode: The mode determining the type of interactions.\n\n Returns:\n An ordered dictionary of interactions from transformed gradients by block.\n \"\"\"\n tensor_gen = self.generate_interactions_from_transformed_grads(\n left_factors, right_batch, mode\n )\n return self._to_ordered_dict(tensor_gen)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchBlockMapper.generate_transformed_grads","title":"generate_transformed_grads","text":"generate_transformed_grads(\n batch: BatchType,\n) -> Generator[TensorType, None, None]\n
Generator that yields transformed gradients for a given batch, processed by each block.
PARAMETER DESCRIPTION batch
The batch of data for which to generate transformed gradients.
TYPE: BatchType
YIELDS DESCRIPTION TensorType
Transformed gradients for each block.
Source code in src/pydvl/influence/types.py
def generate_transformed_grads(\n self, batch: BatchType\n) -> Generator[TensorType, None, None]:\n \"\"\"\n Generator that yields transformed gradients for a given batch,\n processed by each block.\n\n Args:\n batch: The batch of data for which to generate transformed gradients.\n\n Yields:\n Transformed gradients for each block.\n \"\"\"\n for comp_block in self.composable_block_dict.values():\n yield comp_block.transformed_grads(batch)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchBlockMapper.generate_interactions","title":"generate_interactions","text":"generate_interactions(\n left_batch: BatchType, right_batch: Optional[BatchType], mode: InfluenceMode\n) -> Generator[TensorType, None, None]\n
Generator that yields gradient interactions between two batches, processed by each block based on a mode.
PARAMETER DESCRIPTION left_batch
The left batch for interaction computation.
TYPE: BatchType
right_batch
The right batch for interaction computation.
TYPE: Optional[BatchType]
mode
The mode determining the type of interactions.
TYPE: InfluenceMode
YIELDS DESCRIPTION TensorType
Gradient interactions for each block.
TYPE: TensorType
Source code in src/pydvl/influence/types.py
def generate_interactions(\n self,\n left_batch: BatchType,\n right_batch: Optional[BatchType],\n mode: InfluenceMode,\n) -> Generator[TensorType, None, None]:\n \"\"\"\n Generator that yields gradient interactions between two batches, processed by\n each block based on a mode.\n\n Args:\n left_batch: The left batch for interaction computation.\n right_batch: The right batch for interaction computation.\n mode: The mode determining the type of interactions.\n\n Yields:\n TensorType: Gradient interactions for each block.\n \"\"\"\n for comp_block in self.composable_block_dict.values():\n yield comp_block.interactions(left_batch, right_batch, mode)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchBlockMapper.generate_interactions_from_transformed_grads","title":"generate_interactions_from_transformed_grads","text":"generate_interactions_from_transformed_grads(\n left_factors: Union[TensorType, OrderedDict[str, TensorType]],\n right_batch: BatchType,\n mode: InfluenceMode,\n) -> Generator[TensorType, None, None]\n
Generator that yields interactions computed from pre-computed factors and a right batch, processed by each block based on a mode.
PARAMETER DESCRIPTION left_factors
Pre-computed factors as a tensor or an ordered dictionary of tensors by block.
TYPE: Union[TensorType, OrderedDict[str, TensorType]]
right_batch
The right batch for interaction computation.
TYPE: BatchType
mode
The mode determining the type of interactions.
TYPE: InfluenceMode
YIELDS DESCRIPTION TensorType
Interactions for each block.
TYPE: TensorType
Source code in src/pydvl/influence/types.py
def generate_interactions_from_transformed_grads(\n self,\n left_factors: Union[TensorType, OrderedDict[str, TensorType]],\n right_batch: BatchType,\n mode: InfluenceMode,\n) -> Generator[TensorType, None, None]:\n \"\"\"\n Generator that yields interactions computed from pre-computed factors and a\n right batch, processed by each block based on a mode.\n\n Args:\n left_factors: Pre-computed factors as a tensor or an ordered dictionary of\n tensors by block.\n right_batch: The right batch for interaction computation.\n mode: The mode determining the type of interactions.\n\n Yields:\n TensorType: Interactions for each block.\n \"\"\"\n if not isinstance(left_factors, dict):\n left_factors_dict = self._split_to_blocks(left_factors)\n else:\n left_factors_dict = cast(OrderedDict[str, TensorType], left_factors)\n for k, comp_block in self.composable_block_dict.items():\n yield comp_block.interactions_from_transformed_grads(\n left_factors_dict[k], right_batch, mode\n )\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence","title":"TorchComposableInfluence","text":"TorchComposableInfluence(\n model: Module,\n block_structure: Union[\n BlockMode, OrderedDict[str, List[str]]\n ] = BlockMode.FULL,\n regularization: Optional[Union[float, Dict[str, Optional[float]]]] = None,\n)\n
Bases: ComposableInfluence[Tensor, TorchBatch, DataLoader, TorchBlockMapper[TorchOperatorType]]
, ModelInfoMixin
, ABC
Abstract base class that allows for block-wise computation of influence quantities with the torch framework. Inherit from this base class for specific influence algorithms.
Source code in src/pydvl/influence/torch/base.py
def __init__(\n self,\n model: torch.nn.Module,\n block_structure: Union[BlockMode, OrderedDict[str, List[str]]] = BlockMode.FULL,\n regularization: Optional[Union[float, Dict[str, Optional[float]]]] = None,\n):\n parameter_dict_builder = ModelParameterDictBuilder(model)\n if isinstance(block_structure, BlockMode):\n self.parameter_dict = parameter_dict_builder.build_from_block_mode(\n block_structure\n )\n else:\n self.parameter_dict = parameter_dict_builder.build(block_structure)\n\n self._regularization_dict = self._build_regularization_dict(regularization)\n\n super().__init__(model)\n
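A sketch of the two accepted forms of the block_structure argument; the OrderedDict keys and parameter names are illustrative and depend on the actual model, and the import path for BlockMode is an assumption:

```python
from collections import OrderedDict
from pydvl.influence.torch.util import BlockMode  # assumed location of BlockMode

# Predefined blocking: treat all parameters as a single block (the default).
block_structure = BlockMode.FULL

# Custom blocking: map block names to lists of parameter names.
custom_blocks = OrderedDict(
    first_layer=["linear1.weight", "linear1.bias"],
    second_layer=["linear2.weight", "linear2.bias"],
)
```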
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.is_thread_safe","title":"is_thread_safe abstractmethod
property
","text":"is_thread_safe: bool\n
Whether the influence computation is thread safe
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.fit","title":"fit","text":"fit(data: DataLoaderType) -> InfluenceFunctionModel\n
Fits to the provided data by internally creating a block mapper instance from it. Args: data: iterable of tensors
RETURNS DESCRIPTION InfluenceFunctionModel
Fitted instance
Source code in src/pydvl/influence/base_influence_function_model.py
@log_duration(log_level=logging.INFO)\ndef fit(self, data: DataLoaderType) -> InfluenceFunctionModel:\n \"\"\"\n Fitting to provided data, by internally creating a block mapper instance from\n it.\n Args:\n data: iterable of tensors\n\n Returns:\n Fitted instance\n \"\"\"\n self.block_mapper = self._create_block_mapper(data)\n return self\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: TensorType, y: TensorType) -> TensorType\n
Computes the approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed, that the first dimension is the batch dimension.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/base_influence_function_model.py
def influence_factors(self, x: TensorType, y: TensorType) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return self._influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.influences","title":"influences","text":"influences(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computes the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \(H^{-1}\nabla_{\theta} \ell(y_{\text{test}}, f_{\theta}(x_{\text{test}}))\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \(\nabla_{\theta}\ell(y, f_{\theta}(x))\), resp. \(\nabla_{x}\nabla_{\theta}\ell(y, f_{\theta}(x))\), if None, use \(x=x_{\text{test}}\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
def influences(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n if x is None and y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n\n if x is not None and y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n\n return self._influences(x_test, y_test, x, y, mode)\n
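Example (added sketch, same illustrative names as above; per the references in this docstring, InfluenceMode lives in pydvl.influence.base_influence_function_model):
from pydvl.influence.base_influence_function_model import InfluenceMode\n\n# pairwise up-weighting influences between the test batch and the training batch\nvalues = infl_model.influences(x_test, y_test, x_train, y_train, mode=InfluenceMode.Up)\n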
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors(\n self,\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n tensors = self.block_mapper.generate_interactions_from_transformed_grads(\n z_test_factors,\n self._create_batch(x, y),\n mode,\n )\n result: TensorType = next(tensors)\n for tensor in tensors:\n result = result + tensor\n return result\n
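Example (added sketch): the typical two-step pattern, computing the factors once for a test batch and reusing them against several training batches; infl_model and train_loader are hypothetical placeholders.
z_test = infl_model.influence_factors(x_test, y_test)\nfor x_train, y_train in train_loader:\n    values = infl_model.influences_from_factors(z_test, x_train, y_train)\n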
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.influences_by_block","title":"influences_by_block","text":"influences_by_block(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise influence values for the provided data, i.e. an approximation of
\\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of the approximation of \(H^{-1}\nabla_{\theta} \ell(y_{test}, f_{\theta}(x_{test}))\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \(\nabla_{\theta}\ell(y, f_{\theta}(x))\), resp. \(\nabla_{x}\nabla_{\theta}\ell(y, f_{\theta}(x))\), if None, use \(x=x_{test}\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products for the provided batch per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_by_block(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise influence values for the provided data, i.e. an\n approximation of\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of the approximation of\n $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block.\n\n \"\"\"\n left_batch = self._create_batch(x_test, y_test)\n\n if x is None:\n if y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n right_batch = left_batch\n else:\n if y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n right_batch = self._create_batch(x, y)\n\n return self.block_mapper.interactions(left_batch, right_batch, mode)\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.influence_factors_by_block","title":"influence_factors_by_block","text":"influence_factors_by_block(\n x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise approximate inverse Hessian matrix vector products per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influence_factors_by_block(\n self, x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Ordered dictionary of tensors representing the element-wise\n approximate inverse Hessian matrix vector products per block.\n\n \"\"\"\n return self.block_mapper.transformed_grads(self._create_batch(x, y))\n
"},{"location":"api/pydvl/influence/torch/base/#pydvl.influence.torch.base.TorchComposableInfluence.influences_from_factors_by_block","title":"influences_from_factors_by_block","text":"influences_from_factors_by_block(\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Block-wise computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: OrderedDict[str, TensorType]
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products for the provided batch per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors_by_block(\n self,\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Block-wise computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block\n\n \"\"\"\n return self.block_mapper.interactions_from_transformed_grads(\n z_test_factors, self._create_batch(x, y), mode\n )\n
"},{"location":"api/pydvl/influence/torch/batch_operation/","title":"Batch operation","text":""},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation","title":"pydvl.influence.torch.batch_operation","text":"This module contains abstractions and implementations for operations carried out on a batch \\(b\\). These operations are of the form
$$ m(b) \\cdot v$$,
where \\(m(b)\\) is a matrix defined by the data in the batch and \\(v\\) is a vector or matrix. These batch operations can be used to conveniently build aggregations or recursions over sequence of batches, e.g. an average of the form
$$ \\frac{1}{|B|} \\sum_{b in B}m(b)\\cdot v$$,
which is useful in the case that keeping \\(B\\) in memory is not feasible.
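A sketch of such an averaged aggregation, written against the apply interface documented below (batch_op and the batch iterable are placeholders for illustration):
import torch\n\ndef averaged_apply(batch_op, batches, v: torch.Tensor) -> torch.Tensor:\n    # accumulate m(b) applied to v over all batches, then divide by the number of batches\n    result = torch.zeros_like(v)\n    n_batches = 0\n    for b in batches:\n        result = result + batch_op.apply(b, v)\n        n_batches += 1\n    return result / n_batches\n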
"},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation.HessianBatchOperation","title":"HessianBatchOperation","text":"HessianBatchOperation(\n model: Module,\n loss: LossType,\n restrict_to: Optional[Dict[str, Parameter]] = None,\n)\n
Bases: _ModelBasedBatchOperation
Given a model and loss function computes the Hessian vector or matrix product with respect to the model parameters, i.e.
\\[\\begin{align*} &\\nabla^2_{\\theta} L(b;\\theta) \\cdot v \\\\\\ &L(b;\\theta) = \\left( \\frac{1}{|b|} \\sum_{(x,y) \\in b} \\text{loss}(\\text{model}(x; \\theta), y)\\right), \\end{align*}\\] where model is a torch.nn.Module and \\(v\\) is a vector or matrix.
PARAMETER DESCRIPTION model
The model.
TYPE: Module
loss
The loss function.
TYPE: LossType
restrict_to
The parameters to restrict the second order differentiation to, i.e. the corresponding sub-matrix of the Hessian. If None, the full Hessian is used. Make sure the input matches the correct dimension, i.e. the last dimension must be equal to the property input_size
.
TYPE: Optional[Dict[str, Parameter]]
DEFAULT: None
Source code in src/pydvl/influence/torch/batch_operation.py
def __init__(\n self,\n model: torch.nn.Module,\n loss: LossType,\n restrict_to: Optional[Dict[str, torch.nn.Parameter]] = None,\n):\n super().__init__(model, restrict_to=restrict_to)\n self._batch_hvp = create_batch_hvp_function(model, loss, reverse_only=True)\n self.loss = loss\n
"},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation.HessianBatchOperation.apply","title":"apply","text":"apply(batch: TorchBatch, tensor: Tensor)\n
Applies the batch operation to a tensor. Args: batch: Batch of data for computation tensor: A tensor consistent with the operation, i.e. it must be at most 2-dimensional, and its trailing dimension must be equal to the property input_size
.
RETURNS DESCRIPTION A tensor after applying the batch operation
Source code in src/pydvl/influence/torch/batch_operation.py
def apply(self, batch: TorchBatch, tensor: torch.Tensor):\n \"\"\"\n Applies the batch operation to a tensor.\n Args:\n batch: Batch of data for computation\n tensor: A tensor consistent to the operation, i.e. it must be\n at most 2-dim, and it's tailing dimension must\n be equal to the property `input_size`.\n\n Returns:\n A tensor after applying the batch operation\n \"\"\"\n\n if not tensor.ndim <= 2:\n raise ValueError(\n f\"The input tensor must be at most 2-dimensional, got {tensor.ndim}\"\n )\n\n if tensor.shape[-1] != self.input_size:\n raise ValueError(\n \"The last dimension of the input tensor must be equal to the \"\n \"property `input_size`.\"\n )\n\n if tensor.ndim == 2:\n return self._apply_to_mat(batch.to(self.device), tensor.to(self.device))\n return self._apply_to_vec(batch.to(self.device), tensor.to(self.device))\n
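Example (added sketch): model, x, y and vec are placeholders; the two-argument TorchBatch(x, y) construction is an assumption for illustration, the rest follows the documented signatures.
import torch\nfrom pydvl.influence.torch.batch_operation import HessianBatchOperation\n\nhbo = HessianBatchOperation(model, torch.nn.functional.cross_entropy)\n# vec must be at most 2-dimensional with its trailing dimension equal to hbo.input_size\nhvp_on_batch = hbo.apply(TorchBatch(x, y), vec)  # TorchBatch(x, y) construction is assumed\n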
"},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation.GaussNewtonBatchOperation","title":"GaussNewtonBatchOperation","text":"GaussNewtonBatchOperation(\n model: Module,\n loss: LossType,\n restrict_to: Optional[Dict[str, Parameter]] = None,\n)\n
Bases: _ModelBasedBatchOperation
Given a model and loss function computes the Gauss-Newton vector or matrix product with respect to the model parameters, i.e.
\\[\\begin{align*} G(\\text{model}, \\text{loss}, b, \\theta) &\\cdot v, \\\\\\ G(\\text{model}, \\text{loss}, b, \\theta) &= \\frac{1}{|b|}\\sum_{(x, y) \\in b}\\nabla_{\\theta}\\ell (x,y; \\theta) \\nabla_{\\theta}\\ell (x,y; \\theta)^t, \\\\\\ \\ell(x,y; \\theta) &= \\text{loss}(\\text{model}(x; \\theta), y) \\end{align*}\\] where model is a torch.nn.Module and \\(v\\) is a vector or matrix.
PARAMETER DESCRIPTION model
The model.
TYPE: Module
loss
The loss function.
TYPE: LossType
restrict_to
The parameters to restrict the differentiation to, i.e. the corresponding sub-matrix of the Jacobian. If None, the full Jacobian is used. Make sure the input matches the correct dimension, i.e. the last dimension must be equal to the property input_size
.
TYPE: Optional[Dict[str, Parameter]]
DEFAULT: None
Source code in src/pydvl/influence/torch/batch_operation.py
def __init__(\n self,\n model: torch.nn.Module,\n loss: LossType,\n restrict_to: Optional[Dict[str, torch.nn.Parameter]] = None,\n):\n super().__init__(model, restrict_to=restrict_to)\n self.gradient_provider = TorchGradientProvider(\n model, loss, self.params_to_restrict_to\n )\n
"},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation.GaussNewtonBatchOperation.apply","title":"apply","text":"apply(batch: TorchBatch, tensor: Tensor)\n
Applies the batch operation to a tensor. Args: batch: Batch of data for computation tensor: A tensor consistent with the operation, i.e. it must be at most 2-dimensional, and its trailing dimension must be equal to the property input_size
.
RETURNS DESCRIPTION A tensor after applying the batch operation
Source code in src/pydvl/influence/torch/batch_operation.py
def apply(self, batch: TorchBatch, tensor: torch.Tensor):\n \"\"\"\n Applies the batch operation to a tensor.\n Args:\n batch: Batch of data for computation\n tensor: A tensor consistent to the operation, i.e. it must be\n at most 2-dim, and it's tailing dimension must\n be equal to the property `input_size`.\n\n Returns:\n A tensor after applying the batch operation\n \"\"\"\n\n if not tensor.ndim <= 2:\n raise ValueError(\n f\"The input tensor must be at most 2-dimensional, got {tensor.ndim}\"\n )\n\n if tensor.shape[-1] != self.input_size:\n raise ValueError(\n \"The last dimension of the input tensor must be equal to the \"\n \"property `input_size`.\"\n )\n\n if tensor.ndim == 2:\n return self._apply_to_mat(batch.to(self.device), tensor.to(self.device))\n return self._apply_to_vec(batch.to(self.device), tensor.to(self.device))\n
"},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation.InverseHarmonicMeanBatchOperation","title":"InverseHarmonicMeanBatchOperation","text":"InverseHarmonicMeanBatchOperation(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n regularization: float,\n restrict_to: Optional[Dict[str, Parameter]] = None,\n)\n
Bases: _ModelBasedBatchOperation
Given a model and loss function computes an approximation of the inverse Gauss-Newton vector or matrix product. Viewing the damped Gauss-Newton matrix
\\[\\begin{align*} G_{\\lambda}(\\text{model}, \\text{loss}, b, \\theta) &= \\frac{1}{|b|}\\sum_{(x, y) \\in b}\\nabla_{\\theta}\\ell (x,y; \\theta) \\nabla_{\\theta}\\ell (x,y; \\theta)^t + \\lambda \\operatorname{I}, \\\\\\ \\ell(x,y; \\theta) &= \\text{loss}(\\text{model}(x; \\theta), y) \\end{align*}\\] as an arithmetic mean of the rank-\\(1\\) updates, this operation replaces it with the harmonic mean of the rank-\\(1\\) updates, i.e.
\\[ \\tilde{G}_{\\lambda}(\\text{model}, \\text{loss}, b, \\theta) = \\left(n \\sum_{(x, y) \\in b} \\left( \\nabla_{\\theta}\\ell (x,y; \\theta) \\nabla_{\\theta}\\ell (x,y; \\theta)^t + \\lambda \\operatorname{I}\\right)^{-1} \\right)^{-1}\\] and computes
\\[ \\tilde{G}_{\\lambda}^{-1}(\\text{model}, \\text{loss}, b, \\theta) \\cdot v.\\] where model is a torch.nn.Module and \\(v\\) is a vector or matrix. In other words, it switches the order of summation and inversion, which resolves to the inverse harmonic mean
of the rank-\\(1\\) updates.
The inverses of the rank-\\(1\\) updates are not calculated explicitly, but instead a vectorized version of the Sherman\u2013Morrison formula is applied.
For more information, see Inverse Harmonic Mean.
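For a single rank-one update with gradient \(g = \nabla_{\theta}\ell (x,y; \theta)\), this amounts to the identity (stated here for reference, not part of the original docstring) \[ \left(\lambda \operatorname{I} + g g^t\right)^{-1} v = \frac{1}{\lambda}\left( v - \frac{g \, (g^t v)}{\lambda + g^t g} \right). \]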
PARAMETER DESCRIPTION model
The model.
TYPE: Module
loss
The loss function.
TYPE: Callable[[Tensor, Tensor], Tensor]
restrict_to
The parameters to restrict the differentiation to, i.e. the corresponding sub-matrix of the Jacobian. If None, the full Jacobian is used. Make sure the input matches the correct dimension, i.e. the last dimension must be equal to the property input_size
.
TYPE: Optional[Dict[str, Parameter]]
DEFAULT: None
Source code in src/pydvl/influence/torch/batch_operation.py
def __init__(\n self,\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n regularization: float,\n restrict_to: Optional[Dict[str, torch.nn.Parameter]] = None,\n):\n if regularization <= 0:\n raise ValueError(\"regularization must be positive\")\n self.regularization = regularization\n\n super().__init__(model, restrict_to=restrict_to)\n self.gradient_provider = TorchGradientProvider(\n model, loss, self.params_to_restrict_to\n )\n
"},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation.InverseHarmonicMeanBatchOperation.apply","title":"apply","text":"apply(batch: TorchBatch, tensor: Tensor)\n
Applies the batch operation to a tensor. Args: batch: Batch of data for computation tensor: A tensor consistent with the operation, i.e. it must be at most 2-dimensional, and its trailing dimension must be equal to the property input_size
.
RETURNS DESCRIPTION A tensor after applying the batch operation
Source code in src/pydvl/influence/torch/batch_operation.py
def apply(self, batch: TorchBatch, tensor: torch.Tensor):\n \"\"\"\n Applies the batch operation to a tensor.\n Args:\n batch: Batch of data for computation\n tensor: A tensor consistent to the operation, i.e. it must be\n at most 2-dim, and it's tailing dimension must\n be equal to the property `input_size`.\n\n Returns:\n A tensor after applying the batch operation\n \"\"\"\n\n if not tensor.ndim <= 2:\n raise ValueError(\n f\"The input tensor must be at most 2-dimensional, got {tensor.ndim}\"\n )\n\n if tensor.shape[-1] != self.input_size:\n raise ValueError(\n \"The last dimension of the input tensor must be equal to the \"\n \"property `input_size`.\"\n )\n\n if tensor.ndim == 2:\n return self._apply_to_mat(batch.to(self.device), tensor.to(self.device))\n return self._apply_to_vec(batch.to(self.device), tensor.to(self.device))\n
"},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation.ChunkAveraging","title":"ChunkAveraging","text":" Bases: _TensorAveraging[_TensorDictChunkAveraging]
Averages tensors provided by a generator and normalizes by the number of tensors.
"},{"location":"api/pydvl/influence/torch/batch_operation/#pydvl.influence.torch.batch_operation.PointAveraging","title":"PointAveraging","text":"PointAveraging(batch_dim: int = 0)\n
Bases: _TensorAveraging[_TensorDictPointAveraging]
Averages tensors provided by a generator. The averaging is weighted by the number of points in each tensor and the final result is normalized by the total number of points.
PARAMETER DESCRIPTION batch_dim
Dimension to extract the number of points for the weighting.
TYPE: int
DEFAULT: 0
Source code in src/pydvl/influence/torch/batch_operation.py
def __init__(self, batch_dim: int = 0):\n self.batch_dim = batch_dim\n
"},{"location":"api/pydvl/influence/torch/functional/","title":"Functional","text":""},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional","title":"pydvl.influence.torch.functional","text":"This module provides methods for efficiently computing tensors related to first and second order derivatives of torch models, using functionality from torch.func. To indicate higher-order functions, i.e. functions which return functions, we use the naming convention create_**_function
.
In particular, the module contains functionality for
- Sample, batch-wise and empirical loss functions:
- create_per_sample_loss_function
- create_batch_loss_function
- create_empirical_loss_function
- Per sample gradient and jacobian product functions:
- create_per_sample_gradient_function
- create_per_sample_mixed_derivative_function
- create_matrix_jacobian_product_function
- Hessian, low rank approximation of Hessian and Hessian vector products:
- hvp
- create_hvp_function
- create_batch_hvp_function
- hessian
- model_hessian_low_rank
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.LowRankProductRepresentation","title":"LowRankProductRepresentation dataclass
","text":"LowRankProductRepresentation(eigen_vals: Tensor, projections: Tensor)\n
Representation of a low rank product of the form \\(H = V D V^T\\), where D is a diagonal matrix and V is orthogonal.
PARAMETER DESCRIPTION eigen_vals
Diagonal of D.
TYPE: Tensor
projections
The matrix V.
TYPE: Tensor
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.LowRankProductRepresentation.to","title":"to","text":"to(device: device)\n
Move the representing tensors to a device
Source code in src/pydvl/influence/torch/functional.py
def to(self, device: torch.device):\n \"\"\"\n Move the representing tensors to a device\n \"\"\"\n return LowRankProductRepresentation(\n self.eigen_vals.to(device), self.projections.to(device)\n )\n
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.hvp","title":"hvp","text":"hvp(\n func: Callable[[Dict[str, Tensor]], Tensor],\n params: Dict[str, Tensor],\n vec: Dict[str, Tensor],\n reverse_only: bool = True,\n) -> Dict[str, Tensor]\n
Computes the Hessian-vector product (HVP) for a given function at the given parameters, i.e.
\\[\\nabla_{\\theta} \\nabla_{\\theta} f (\\theta)\\cdot v\\] This function can operate in two modes, either reverse-mode autodiff only or both forward- and reverse-mode autodiff.
PARAMETER DESCRIPTION func
The scalar-valued function for which the HVP is computed.
TYPE: Callable[[Dict[str, Tensor]], Tensor]
params
The parameters at which the HVP is computed.
TYPE: Dict[str, Tensor]
vec
The vector with which the Hessian is multiplied.
TYPE: Dict[str, Tensor]
reverse_only
Whether to use only reverse-mode autodiff (True, default) or both forward- and reverse-mode autodiff (False).
TYPE: bool
DEFAULT: True
RETURNS DESCRIPTION Dict[str, Tensor]
The HVP of the function at the given parameters with the given vector.
Example >>> def f(z): return torch.sum(z**2)\n>>> u = torch.ones(10, requires_grad=True)\n>>> v = torch.ones(10)\n>>> hvp_vec = hvp(f, u, v)\n>>> assert torch.allclose(hvp_vec, torch.full((10, ), 2.0))\n
Source code in src/pydvl/influence/torch/functional.py
def hvp(\n func: Callable[[Dict[str, torch.Tensor]], torch.Tensor],\n params: Dict[str, torch.Tensor],\n vec: Dict[str, torch.Tensor],\n reverse_only: bool = True,\n) -> Dict[str, torch.Tensor]:\n r\"\"\"\n Computes the Hessian-vector product (HVP) for a given function at the given\n parameters, i.e.\n\n \\[\\nabla_{\\theta} \\nabla_{\\theta} f (\\theta)\\cdot v\\]\n\n This function can operate in two modes, either reverse-mode autodiff only or both\n forward- and reverse-mode autodiff.\n\n Args:\n func: The scalar-valued function for which the HVP is computed.\n params: The parameters at which the HVP is computed.\n vec: The vector with which the Hessian is multiplied.\n reverse_only: Whether to use only reverse-mode autodiff\n (True, default) or both forward- and reverse-mode autodiff (False).\n\n Returns:\n The HVP of the function at the given parameters with the given vector.\n\n ??? Example\n\n ```pycon\n >>> def f(z): return torch.sum(z**2)\n >>> u = torch.ones(10, requires_grad=True)\n >>> v = torch.ones(10)\n >>> hvp_vec = hvp(f, u, v)\n >>> assert torch.allclose(hvp_vec, torch.full((10, ), 2.0))\n ```\n \"\"\"\n\n output: Dict[str, torch.Tensor]\n\n if reverse_only:\n _, vjp_fn = vjp(grad(func), params)\n output = vjp_fn(vec)[0]\n else:\n output = jvp(grad(func), (params,), (vec,))[1]\n\n return output\n
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.create_batch_hvp_function","title":"create_batch_hvp_function","text":"create_batch_hvp_function(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n reverse_only: bool = True,\n) -> Callable[[Dict[str, Tensor], Tensor, Tensor, Tensor], Tensor]\n
Creates a function to compute Hessian-vector product (HVP) for a given model and loss function, where the Hessian information is computed for a provided batch.
This function takes a PyTorch model, a loss function, and an optional boolean parameter. It returns a callable that computes the Hessian-vector product for batches of input data and a given vector. The computation can be performed in reverse mode only, based on the reverse_only
parameter.
PARAMETER DESCRIPTION model
The PyTorch model for which the Hessian-vector product is to be computed.
TYPE: Module
loss
The loss function. It should take two torch.Tensor objects as input and return a torch.Tensor.
TYPE: Callable[[Tensor, Tensor], Tensor]
reverse_only
If True, the Hessian-vector product is computed in reverse mode only.
TYPE: bool
DEFAULT: True
RETURNS DESCRIPTION Callable[[Dict[str, Tensor], Tensor, Tensor, Tensor], Tensor]
A function that takes a dictionary of model parameters and three torch.Tensor objects - input data (x), target data (y), and a vector (vec) - and returns the Hessian-vector product of the loss evaluated on x, y times vec.
Example # Assume `model` is a PyTorch model and `loss_fn` is a loss function.\nb_hvp_function = create_batch_hvp_function(model, loss_fn)\n\n# `params` is a dict of the model's named parameters, `x_batch`, `y_batch` are\n# batches of input and target data, and `vec` is a vector.\nhvp_result = b_hvp_function(params, x_batch, y_batch, vec)\n
Source code in src/pydvl/influence/torch/functional.py
def create_batch_hvp_function(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n reverse_only: bool = True,\n) -> Callable[\n [Dict[str, torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor\n]:\n r\"\"\"\n Creates a function to compute Hessian-vector product (HVP) for a given model and\n loss function, where the Hessian information is computed for a provided batch.\n\n This function takes a PyTorch model, a loss function,\n and an optional boolean parameter. It returns a callable\n that computes the Hessian-vector product for batches of input data\n and a given vector. The computation can be performed in reverse mode only,\n based on the `reverse_only` parameter.\n\n Args:\n model: The PyTorch model for which the Hessian-vector product is to be computed.\n loss: The loss function. It should take two\n torch.Tensor objects as input and return a torch.Tensor.\n reverse_only (bool, optional): If True, the Hessian-vector product is computed\n in reverse mode only.\n\n Returns:\n A function that takes three `torch.Tensor` objects - input data (`x`),\n target data (`y`), and a vector (`vec`),\n and returns the Hessian-vector product of the loss\n evaluated on `x`, `y` times `vec`.\n\n ??? Example\n ```python\n # Assume `model` is a PyTorch model and `loss_fn` is a loss function.\n b_hvp_function = batch_hvp(model, loss_fn)\n\n # `x_batch`, `y_batch` are batches of input and target data,\n # and `vec` is a vector.\n hvp_result = b_hvp_function(x_batch, y_batch, vec)\n ```\n \"\"\"\n\n def b_hvp(\n params: Dict[str, torch.Tensor],\n x: torch.Tensor,\n y: torch.Tensor,\n vec: torch.Tensor,\n ):\n return flatten_dimensions(\n hvp(\n lambda p: create_batch_loss_function(model, loss)(p, x, y),\n params,\n align_structure(params, vec),\n reverse_only=reverse_only,\n ).values()\n )\n\n return b_hvp\n
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.create_empirical_loss_function","title":"create_empirical_loss_function","text":"create_empirical_loss_function(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n data_loader: DataLoader,\n) -> Callable[[Dict[str, Tensor]], Tensor]\n
Creates a function to compute the empirical loss of a given model on a given dataset. If we denote the model parameters with \\( \\theta \\), the resulting function approximates:
\\[ f(\\theta) = \\frac{1}{N}\\sum_{i=1}^N \\operatorname{loss}(y_i, \\operatorname{model}(\\theta, x_i)) \\] for a loss function \\(\\operatorname{loss}\\) and a model \\(\\operatorname{model}\\) with model parameters \\(\\theta\\), where \\(N\\) is the number of all elements provided by the data_loader.
PARAMETER DESCRIPTION model
The model for which the loss should be computed.
TYPE: Module
loss
The loss function to be used.
TYPE: Callable[[Tensor, Tensor], Tensor]
data_loader
The data loader for iterating over the dataset.
TYPE: DataLoader
RETURNS DESCRIPTION Callable[[Dict[str, Tensor]], Tensor]
A function that computes the empirical loss of the model on the dataset for given model parameters.
Source code in src/pydvl/influence/torch/functional.py
def create_empirical_loss_function(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n data_loader: DataLoader,\n) -> Callable[[Dict[str, torch.Tensor]], torch.Tensor]:\n r\"\"\"\n Creates a function to compute the empirical loss of a given model\n on a given dataset. If we denote the model parameters with \\( \\theta \\),\n the resulting function approximates:\n\n \\[\n f(\\theta) = \\frac{1}{N}\\sum_{i=1}^N\n \\operatorname{loss}(y_i, \\operatorname{model}(\\theta, x_i))\n \\]\n\n for a loss function $\\operatorname{loss}$ and a model $\\operatorname{model}$\n with model parameters $\\theta$, where $N$ is the number of all elements provided\n by the data_loader.\n\n Args:\n model: The model for which the loss should be computed.\n loss: The loss function to be used.\n data_loader: The data loader for iterating over the dataset.\n\n Returns:\n A function that computes the empirical loss of the model on the dataset for\n given model parameters.\n\n \"\"\"\n\n def empirical_loss(params: Dict[str, torch.Tensor]):\n total_loss = to_model_device(torch.zeros((), requires_grad=True), model)\n total_samples = to_model_device(torch.zeros(()), model)\n\n for x, y in iter(data_loader):\n output = functional_call(\n model,\n params,\n (to_model_device(x, model),),\n )\n loss_value = loss(output, to_model_device(y, model))\n total_loss = total_loss + loss_value * x.size(0)\n total_samples += x.size(0)\n\n return total_loss / total_samples\n\n return empirical_loss\n
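Example (added sketch): model, loss and train_loader are placeholders; the parameter dict follows the model.named_parameters() convention described above.
from pydvl.influence.torch.functional import create_empirical_loss_function\n\nempirical_loss = create_empirical_loss_function(model, loss, train_loader)\nparams = {k: p.detach() for k, p in model.named_parameters() if p.requires_grad}\nvalue = empirical_loss(params)  # scalar tensor: average loss over the whole loader\n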
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.create_batch_loss_function","title":"create_batch_loss_function","text":"create_batch_loss_function(\n model: Module, loss: Callable[[Tensor, Tensor], Tensor]\n) -> Callable[[Dict[str, Tensor], Tensor, Tensor], Tensor]\n
Creates a function to compute the loss of a given model on a given batch of data, i.e. the function
\\[f(\\theta, x, y) = \\frac{1}{N} \\sum_{i=1}^N \\operatorname{loss}(\\operatorname{model}(\\theta, x_i), y_i)\\] for a loss function \\(\\operatorname{loss}\\) and a model \\(\\operatorname{model}\\) with model parameters \\(\\theta\\), where \\(N\\) is the number of elements in the batch. Args: model: The model for which the loss should be computed. loss: The loss function to be used, which should be able to handle a batch dimension
RETURNS DESCRIPTION Callable[[Dict[str, Tensor], Tensor, Tensor], Tensor]
A function that computes the loss of the model on a batch for given model parameters. The model parameter input to the function must take the form of a dict conform to model.named_parameters(), i.e. the keys must be a subset of the parameters and the corresponding tensor shapes must align. For the data input, the first dimension has to be the batch dimension.
Source code in src/pydvl/influence/torch/functional.py
def create_batch_loss_function(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n) -> Callable[[Dict[str, torch.Tensor], torch.Tensor, torch.Tensor], torch.Tensor]:\n r\"\"\"\n Creates a function to compute the loss of a given model on a given batch of data,\n i.e. the function\n\n \\[f(\\theta, x, y) = \\frac{1}{N} \\sum_{i=1}^N\n \\operatorname{loss}(\\operatorname{model}(\\theta, x_i), y_i)\\]\n\n for a loss function $\\operatorname{loss}$ and a model $\\operatorname{model}$\n with model parameters $\\theta$, where $N$ is the number of elements in the batch.\n Args:\n model: The model for which the loss should be computed.\n loss: The loss function to be used, which should be able to handle\n a batch dimension\n\n Returns:\n A function that computes the loss of the model on a batch for given\n model parameters. The model parameter input to the function must take\n the form of a dict conform to model.named_parameters(), i.e. the keys\n must be a subset of the parameters and the corresponding tensor shapes\n must align. For the data input, the first dimension has to be the batch\n dimension.\n \"\"\"\n\n def batch_loss(params: Dict[str, torch.Tensor], x: torch.Tensor, y: torch.Tensor):\n outputs = functional_call(model, params, (to_model_device(x, model),))\n return loss(outputs, y)\n\n return batch_loss\n
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.create_hvp_function","title":"create_hvp_function","text":"create_hvp_function(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n data_loader: DataLoader,\n precompute_grad: bool = True,\n use_average: bool = True,\n reverse_only: bool = True,\n track_gradients: bool = False,\n) -> Callable[[Tensor], Tensor]\n
Returns a function that calculates the approximate Hessian-vector product for a given vector. If you want to compute the exact Hessian-vector product, i.e. pull all the data into memory and perform a full gradient computation, use the function hvp.
PARAMETER DESCRIPTION model
A PyTorch module representing the model whose loss function's Hessian is to be computed.
TYPE: Module
loss
A callable that takes the model's output and target as input and returns the scalar loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
data_loader
A DataLoader instance that provides batches of data for calculating the Hessian-vector product. Each batch from the DataLoader is assumed to return a tuple where the first element is the model's input and the second element is the target output.
TYPE: DataLoader
precompute_grad
If True
, the full data gradient is precomputed and kept in memory, which can speed up the hessian vector product computation. Set this to False
, if you can't afford to keep the full computation graph in memory.
TYPE: bool
DEFAULT: True
use_average
If True
, the returned function uses batch-wise computation via a batch loss function and averages the results. If False
, the function uses backpropagation on the full empirical loss function, which is more accurate than averaging the batch Hessians, but typically has a much higher memory usage.
TYPE: bool
DEFAULT: True
reverse_only
Whether to use only reverse-mode autodiff or both forward- and reverse-mode autodiff. Ignored if precompute_grad
is True
.
TYPE: bool
DEFAULT: True
track_gradients
Whether to track gradients for the resulting tensor of the Hessian-vector products.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION Callable[[Tensor], Tensor]
A function that takes a single argument, a vector, and returns the product of the Hessian of the loss function with respect to the model's parameters and the input vector.
Source code in src/pydvl/influence/torch/functional.py
def create_hvp_function(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n data_loader: DataLoader,\n precompute_grad: bool = True,\n use_average: bool = True,\n reverse_only: bool = True,\n track_gradients: bool = False,\n) -> Callable[[torch.Tensor], torch.Tensor]:\n \"\"\"\n Returns a function that calculates the approximate Hessian-vector product\n for a given vector. If you want to compute the exact hessian,\n i.e., pulling all data into memory and compute a full gradient computation, use\n the function [hvp][pydvl.influence.torch.functional.hvp].\n\n Args:\n model: A PyTorch module representing the model whose loss function's\n Hessian is to be computed.\n loss: A callable that takes the model's output and target as input and\n returns the scalar loss.\n data_loader: A DataLoader instance that provides batches of data for\n calculating the Hessian-vector product. Each batch from the\n DataLoader is assumed to return a tuple where the first element is\n the model's input and the second element is the target output.\n precompute_grad: If `True`, the full data gradient is precomputed and\n kept in memory, which can speed up the hessian vector product\n computation. Set this to `False`, if you can't afford to keep the\n full computation graph in memory.\n use_average: If `True`, the returned function uses batch-wise\n computation via\n [a batch loss function][pydvl.influence.torch.functional.create_batch_loss_function]\n and averages the results.\n If `False`, the function uses backpropagation on the full\n [empirical loss function]\n [pydvl.influence.torch.functional.create_empirical_loss_function],\n which is more accurate than averaging the batch hessians, but\n probably has a way higher memory usage.\n reverse_only: Whether to use only reverse-mode autodiff or\n both forward- and reverse-mode autodiff. 
Ignored if\n `precompute_grad` is `True`.\n track_gradients: Whether to track gradients for the resulting tensor of\n the Hessian-vector products.\n\n Returns:\n A function that takes a single argument, a vector, and returns the\n product of the Hessian of the `loss` function with respect to the\n `model`'s parameters and the input vector.\n \"\"\"\n\n if precompute_grad:\n model_params = {k: p for k, p in model.named_parameters() if p.requires_grad}\n\n if use_average:\n model_dtype = next(p.dtype for p in model.parameters() if p.requires_grad)\n total_grad_xy = torch.empty(0, dtype=model_dtype)\n total_points = 0\n grad_func = torch.func.grad(create_batch_loss_function(model, loss))\n for x, y in iter(data_loader):\n grad_xy = grad_func(\n model_params, to_model_device(x, model), to_model_device(y, model)\n )\n grad_xy = flatten_dimensions(grad_xy.values())\n if total_grad_xy.nelement() == 0:\n total_grad_xy = torch.zeros_like(grad_xy)\n total_grad_xy += grad_xy * len(x)\n total_points += len(x)\n total_grad_xy /= total_points\n else:\n total_grad_xy = torch.func.grad(\n create_empirical_loss_function(model, loss, data_loader)\n )(model_params)\n total_grad_xy = flatten_dimensions(total_grad_xy.values())\n\n def precomputed_grads_hvp_function(\n precomputed_grads: torch.Tensor, vec: torch.Tensor\n ) -> torch.Tensor:\n vec = to_model_device(vec, model)\n if vec.ndim == 1:\n vec = vec.unsqueeze(0)\n\n z = (precomputed_grads * torch.autograd.Variable(vec)).sum(dim=1)\n\n mvp = []\n for i in range(len(z)):\n mvp.append(\n flatten_dimensions(\n torch.autograd.grad(\n z[i], list(model_params.values()), retain_graph=True\n )\n )\n )\n result = torch.stack([arr.contiguous().view(-1) for arr in mvp])\n\n if not track_gradients:\n result = result.detach()\n\n return result\n\n return partial(precomputed_grads_hvp_function, total_grad_xy)\n\n def hvp_function(vec: torch.Tensor) -> torch.Tensor:\n params = get_model_parameters(model, detach=not track_gradients)\n v = align_structure(params, vec)\n empirical_loss = create_empirical_loss_function(model, loss, data_loader)\n return flatten_dimensions(\n hvp(empirical_loss, params, v, reverse_only=reverse_only).values()\n )\n\n def avg_hvp_function(vec: torch.Tensor) -> torch.Tensor:\n n_batches = len(data_loader)\n avg_hessian = to_model_device(torch.zeros_like(vec), model)\n b_hvp = create_batch_hvp_function(model, loss, reverse_only)\n params = get_model_parameters(model, detach=not track_gradients)\n for t_x, t_y in iter(data_loader):\n t_x, t_y = to_model_device(t_x, model), to_model_device(t_y, model)\n avg_hessian += b_hvp(params, t_x, t_y, to_model_device(vec, model))\n\n return avg_hessian / float(n_batches)\n\n return avg_hvp_function if use_average else hvp_function\n
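Example (added sketch): model, loss, train_loader and vec are placeholders; vec is a flat vector with one entry per trainable parameter.
from pydvl.influence.torch.functional import create_hvp_function\n\nhvp_fn = create_hvp_function(model, loss, train_loader, use_average=True)\nflat_hvp = hvp_fn(vec)\n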
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.hessian","title":"hessian","text":"hessian(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n data_loader: DataLoader,\n use_hessian_avg: bool = True,\n track_gradients: bool = False,\n restrict_to: Optional[Dict[str, Tensor]] = None,\n) -> Tensor\n
Computes the Hessian matrix for a given model and loss function.
PARAMETER DESCRIPTION model
The PyTorch model for which the Hessian is computed.
TYPE: Module
loss
A callable that computes the loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
data_loader
DataLoader providing batches of input data and corresponding ground truths.
TYPE: DataLoader
use_hessian_avg
Flag to indicate whether the average Hessian across mini-batches should be computed. If False, the empirical loss across the entire dataset is used.
TYPE: bool
DEFAULT: True
track_gradients
Whether to track gradients for the resulting tensor of the hessian vector products.
TYPE: bool
DEFAULT: False
restrict_to
The parameters to restrict the second order differentiation to, i.e. the corresponding sub-matrix of the Hessian. If None, the full Hessian is computed.
TYPE: Optional[Dict[str, Tensor]]
DEFAULT: None
RETURNS DESCRIPTION Tensor
A tensor representing the Hessian matrix. The shape of the tensor will be (n_parameters, n_parameters), where n_parameters is the number of trainable parameters in the model.
Source code in src/pydvl/influence/torch/functional.py
def hessian(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n data_loader: DataLoader,\n use_hessian_avg: bool = True,\n track_gradients: bool = False,\n restrict_to: Optional[Dict[str, torch.Tensor]] = None,\n) -> torch.Tensor:\n \"\"\"\n Computes the Hessian matrix for a given model and loss function.\n\n Args:\n model: The PyTorch model for which the Hessian is computed.\n loss: A callable that computes the loss.\n data_loader: DataLoader providing batches of input data and corresponding\n ground truths.\n use_hessian_avg: Flag to indicate whether the average Hessian across\n mini-batches should be computed.\n If False, the empirical loss across the entire dataset is used.\n track_gradients: Whether to track gradients for the resulting tensor of\n the hessian vector products.\n restrict_to: The parameters to restrict the second order differentiation to,\n i.e. the corresponding sub-matrix of the Hessian. If None, the full Hessian\n is computed.\n\n Returns:\n A tensor representing the Hessian matrix. The shape of the tensor will be\n (n_parameters, n_parameters), where n_parameters is the number of trainable\n parameters in the model.\n \"\"\"\n params = restrict_to\n\n if params is None:\n params = get_model_parameters(model, detach=not track_gradients)\n n_parameters = sum([p.numel() for p in params.values()])\n model_dtype = next((p.dtype for p in params.values()))\n\n flat_params = flatten_dimensions(params.values())\n\n if use_hessian_avg:\n n_samples = 0\n hessian_mat = to_model_device(\n torch.zeros((n_parameters, n_parameters), dtype=model_dtype), model\n )\n batch_loss = create_batch_loss_function(model, loss)\n\n def flat_input_batch_loss(\n p: torch.Tensor, t_x: torch.Tensor, t_y: torch.Tensor\n ):\n return batch_loss(align_structure(params, p), t_x, t_y)\n\n for x, y in iter(data_loader):\n n_samples += x.shape[0]\n batch_hessian = torch.func.hessian(flat_input_batch_loss)(\n flat_params, to_model_device(x, model), to_model_device(y, model)\n )\n if not track_gradients and batch_hessian.requires_grad:\n batch_hessian = batch_hessian.detach()\n hessian_mat += x.shape[0] * batch_hessian\n\n hessian_mat /= n_samples\n else:\n\n def flat_input_empirical_loss(p: torch.Tensor):\n return create_empirical_loss_function(model, loss, data_loader)(\n align_with_model(p, model)\n )\n\n hessian_mat = torch.func.jacrev(torch.func.jacrev(flat_input_empirical_loss))(\n flat_params\n )\n\n return hessian_mat\n
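Example (added sketch): model, loss and train_loader are placeholders.
from pydvl.influence.torch.functional import hessian\n\nH = hessian(model, loss, train_loader)  # shape (n_parameters, n_parameters)\n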
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.gauss_newton","title":"gauss_newton","text":"gauss_newton(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n data_loader: DataLoader,\n restrict_to: Optional[Dict[str, Tensor]] = None,\n)\n
Compute the Gauss-Newton matrix, i.e.
$$ \sum_{i=1}^N \nabla_{\theta}\ell(m(x_i; \theta), y_i) \nabla_{\theta}\ell(m(x_i; \theta), y_i)^t,$$ for a loss function \(\ell\) and a model \(m\) with model parameters \(\theta\).
PARAMETER DESCRIPTION model
The PyTorch model.
TYPE: Module
loss
A callable that computes the loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
data_loader
A PyTorch DataLoader providing batches of input data and corresponding output data.
TYPE: DataLoader
restrict_to
The parameters to restrict the differentiation to, i.e. the corresponding sub-matrix of the Jacobian. If None, the full Jacobian is used.
TYPE: Optional[Dict[str, Tensor]]
DEFAULT: None
RETURNS DESCRIPTION The Gauss-Newton matrix.
Source code in src/pydvl/influence/torch/functional.py
def gauss_newton(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n data_loader: DataLoader,\n restrict_to: Optional[Dict[str, torch.Tensor]] = None,\n):\n r\"\"\"\n Compute the Gauss-Newton matrix, i.e.\n\n $$ \\sum_{i=1}^N \\nabla_{\\theta}\\ell(m(x_i; \\theta), y)\n \\nabla_{\\theta}\\ell(m(x_i; \\theta), y)^t,$$\n for a loss function $\\ell$ and a model $m$ with model parameters $\\theta$.\n\n Args:\n model: The PyTorch model.\n loss: A callable that computes the loss.\n data_loader: A PyTorch DataLoader providing batches of input data and\n corresponding output data.\n restrict_to: The parameters to restrict the differentiation to,\n i.e. the corresponding sub-matrix of the Jacobian. If None, the full\n Jacobian is used.\n\n Returns:\n The Gauss-Newton matrix.\n \"\"\"\n\n per_sample_grads = create_per_sample_gradient_function(model, loss)\n\n params = restrict_to\n if params is None:\n params = get_model_parameters(model)\n\n def generate_batch_matrices():\n for x, y in data_loader:\n grads = flatten_dimensions(\n per_sample_grads(params, x, y).values(), shape=(x.shape[0], -1)\n )\n batch_mat = grads.t() @ grads\n yield batch_mat.detach()\n\n n_points = 0\n tensors = generate_batch_matrices()\n result = next(tensors)\n\n for t in tensors:\n result += t\n n_points += t.shape[0]\n\n return result / n_points\n
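Example (added sketch): model, loss and train_loader are placeholders.
from pydvl.influence.torch.functional import gauss_newton\n\nG = gauss_newton(model, loss, train_loader)  # shape (n_parameters, n_parameters)\n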
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.create_per_sample_loss_function","title":"create_per_sample_loss_function","text":"create_per_sample_loss_function(\n model: Module, loss: Callable[[Tensor, Tensor], Tensor]\n) -> Callable[[Dict[str, Tensor], Tensor, Tensor], Tensor]\n
Generates a function to compute per-sample losses using PyTorch's vmap, i.e. the vector-valued function
\\[ f(\\theta, x, y) = (\\operatorname{loss}(\\operatorname{model}(\\theta, x_1), y_1), \\dots, \\operatorname{loss}(\\operatorname{model}(\\theta, x_N), y_N)), \\] for a loss function \\(\\operatorname{loss}\\) and a model \\(\\operatorname{model}\\) with model parameters \\(\\theta\\), where \\(N\\) is the number of elements in the batch.
PARAMETER DESCRIPTION model
The PyTorch model for which per-sample losses will be computed.
TYPE: Module
loss
A callable that computes the loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
RETURNS DESCRIPTION Callable[[Dict[str, Tensor], Tensor, Tensor], Tensor]
A callable that computes the loss for each sample in the batch, given a dictionary of model inputs, the model's predictions, and the true values. The callable will return a tensor where each entry corresponds to the loss of the corresponding sample.
Source code in src/pydvl/influence/torch/functional.py
def create_per_sample_loss_function(\n model: torch.nn.Module, loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor]\n) -> Callable[[Dict[str, torch.Tensor], torch.Tensor, torch.Tensor], torch.Tensor]:\n r\"\"\"\n Generates a function to compute per-sample losses using PyTorch's vmap,\n i.e. the vector-valued function\n\n \\[ f(\\theta, x, y) = (\\operatorname{loss}(\\operatorname{model}(\\theta, x_1), y_1),\n \\dots,\n \\operatorname{loss}(\\operatorname{model}(\\theta, x_N), y_N)), \\]\n\n for a loss function $\\operatorname{loss}$ and a model $\\operatorname{model}$ with\n model parameters $\\theta$, where $N$ is the number of elements in the batch.\n\n Args:\n model: The PyTorch model for which per-sample losses will be computed.\n loss: A callable that computes the loss.\n\n Returns:\n A callable that computes the loss for each sample in the batch,\n given a dictionary of model inputs, the model's predictions,\n and the true values. The callable will return a tensor where\n each entry corresponds to the loss of the corresponding sample.\n \"\"\"\n\n def compute_loss(\n params: Dict[str, torch.Tensor], x: torch.Tensor, y: torch.Tensor\n ) -> torch.Tensor:\n outputs = functional_call(\n model, params, (to_model_device(x.unsqueeze(0), model),)\n )\n return loss(outputs, y.unsqueeze(0))\n\n vmap_loss: Callable[\n [Dict[str, torch.Tensor], torch.Tensor, torch.Tensor], torch.Tensor\n ] = torch.vmap(compute_loss, in_dims=(None, 0, 0))\n return vmap_loss\n
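Example (added sketch): model, loss, x and y are placeholders; params is a dict of the model's named parameters as described above.
from pydvl.influence.torch.functional import create_per_sample_loss_function\n\nper_sample_loss = create_per_sample_loss_function(model, loss)\nparams = {k: p.detach() for k, p in model.named_parameters() if p.requires_grad}\nlosses = per_sample_loss(params, x, y)  # one loss value per sample in the batch\n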
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.create_per_sample_gradient_function","title":"create_per_sample_gradient_function","text":"create_per_sample_gradient_function(\n model: Module, loss: Callable[[Tensor, Tensor], Tensor]\n) -> Callable[[Dict[str, Tensor], Tensor, Tensor], Dict[str, Tensor]]\n
Generates a function to compute the per-sample gradient of the loss with respect to the model's parameters, i.e. the tensor-valued function
\\[ f(\\theta, x, y) = (\\nabla_{\\theta}\\operatorname{loss} (\\operatorname{model}(\\theta, x_1), y_1), \\dots, \\nabla_{\\theta}\\operatorname{loss}(\\operatorname{model}(\\theta, x_N), y_N) \\] for a loss function \\(\\operatorname{loss}\\) and a model \\(\\operatorname{model}\\) with model parameters \\(\\theta\\), where \\(N\\) is the number of elements in the batch.
PARAMETER DESCRIPTION model
The PyTorch model for which per-sample gradients will be computed.
TYPE: Module
loss
A callable that computes the loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
RETURNS DESCRIPTION Callable[[Dict[str, Tensor], Tensor, Tensor], Dict[str, Tensor]]
A callable that takes a dictionary of model parameters, the model's input, and the labels. It returns a dictionary with the same keys as the model's named parameters. Each entry in the returned dictionary corresponds to the gradient of the corresponding model parameter for each sample in the batch.
Source code in src/pydvl/influence/torch/functional.py
def create_per_sample_gradient_function(\n model: torch.nn.Module, loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor]\n) -> Callable[\n [Dict[str, torch.Tensor], torch.Tensor, torch.Tensor], Dict[str, torch.Tensor]\n]:\n r\"\"\"\n Generates a function to computes the per-sample gradient of the loss with respect to\n the model's parameters, i.e. the tensor-valued function\n\n \\[ f(\\theta, x, y) = (\\nabla_{\\theta}\\operatorname{loss}\n (\\operatorname{model}(\\theta, x_1), y_1), \\dots,\n \\nabla_{\\theta}\\operatorname{loss}(\\operatorname{model}(\\theta, x_N), y_N) \\]\n\n for a loss function $\\operatorname{loss}$ and a model $\\operatorname{model}$ with\n model parameters $\\theta$, where $N$ is the number of elements in the batch.\n\n Args:\n model: The PyTorch model for which per-sample gradients will be computed.\n loss: A callable that computes the loss.\n\n Returns:\n A callable that takes a dictionary of model parameters, the model's input,\n and the labels. It returns a dictionary with the same keys as the model's\n named parameters. Each entry in the returned dictionary corresponds to\n the gradient of the corresponding model parameter for each sample\n in the batch.\n\n \"\"\"\n\n per_sample_grad: Callable[\n [Dict[str, torch.Tensor], torch.Tensor, torch.Tensor], Dict[str, torch.Tensor]\n ] = torch.func.jacrev(create_per_sample_loss_function(model, loss))\n return per_sample_grad\n
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.create_matrix_jacobian_product_function","title":"create_matrix_jacobian_product_function","text":"create_matrix_jacobian_product_function(\n model: Module, loss: Callable[[Tensor, Tensor], Tensor], g: Tensor\n) -> Callable[[Dict[str, Tensor], Tensor, Tensor], Tensor]\n
Generates a function to compute the matrix-Jacobian product (MJP) of the per-sample loss with respect to the model's parameters, i.e. the function
\\[ f(\\theta, x, y) = g \\, @ \\, (\\nabla_{\\theta}\\operatorname{loss} (\\operatorname{model}(\\theta, x_i), y_i))_i^T \\] for a loss function \\(\\operatorname{loss}\\) and a model \\(\\operatorname{model}\\) with model parameters \\(\\theta\\).
PARAMETER DESCRIPTION model
The PyTorch model for which the MJP will be computed.
TYPE: Module
loss
A callable that computes the loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
g
Matrix for which the product with the Jacobian will be computed. The shape of this matrix should be consistent with the shape of the jacobian.
TYPE: Tensor
RETURNS DESCRIPTION Callable[[Dict[str, Tensor], Tensor, Tensor], Tensor]
A callable that takes a dictionary of model parameters, the model's input, and the labels. The callable returns the matrix-Jacobian product of the per-sample loss with respect to the model's parameters for the given matrix g.
Source code in src/pydvl/influence/torch/functional.py
def create_matrix_jacobian_product_function(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n g: torch.Tensor,\n) -> Callable[[Dict[str, torch.Tensor], torch.Tensor, torch.Tensor], torch.Tensor]:\n r\"\"\"\n Generates a function to computes the matrix-Jacobian product (MJP) of the\n per-sample loss with respect to the model's parameters, i.e. the function\n\n \\[ f(\\theta, x, y) = g \\, @ \\, (\\nabla_{\\theta}\\operatorname{loss}\n (\\operatorname{model}(\\theta, x_i), y_i))_i^T \\]\n\n for a loss function $\\operatorname{loss}$ and a model $\\operatorname{model}$ with\n model parameters $\\theta$.\n\n Args:\n model: The PyTorch model for which the MJP will be computed.\n loss: A callable that computes the loss.\n g: Matrix for which the product with the Jacobian will be computed.\n The shape of this matrix should be consistent with the shape of\n the jacobian.\n\n Returns:\n A callable that takes a dictionary of model inputs, the model's input,\n and the labels. The callable returns the matrix-Jacobian product of the\n per-sample loss with respect to the model's parameters for the given\n matrix `g`.\n\n \"\"\"\n\n def single_jvp(\n params: Dict[str, torch.Tensor],\n x: torch.Tensor,\n y: torch.Tensor,\n _g: torch.Tensor,\n ):\n return torch.func.jvp(\n lambda p: create_per_sample_loss_function(model, loss)(p, x, y),\n (params,),\n (align_with_model(_g, model),),\n )[1]\n\n def full_jvp(params: Dict[str, torch.Tensor], x: torch.Tensor, y: torch.Tensor):\n return torch.func.vmap(single_jvp, in_dims=(None, None, None, 0))(\n params, x, y, g\n )\n\n return full_jvp\n
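An illustrative sketch under the same toy setup; it assumes each row of g is a flattened vector with as many entries as there are trainable model parameters.
import torch
from pydvl.influence.torch.functional import create_matrix_jacobian_product_function

model = torch.nn.Linear(3, 1)
loss = torch.nn.functional.mse_loss
params = {k: p.detach() for k, p in model.named_parameters()}
x, y = torch.randn(8, 3), torch.randn(8, 1)

n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
g = torch.randn(5, n_params)  # 5 vectors to multiply with the per-sample Jacobian

mjp = create_matrix_jacobian_product_function(model, loss, g)
products = mjp(params, x, y)  # one row per row of g, one column per sample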
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.create_per_sample_mixed_derivative_function","title":"create_per_sample_mixed_derivative_function","text":"create_per_sample_mixed_derivative_function(\n model: Module, loss: Callable[[Tensor, Tensor], Tensor]\n) -> Callable[[Dict[str, Tensor], Tensor, Tensor], Dict[str, Tensor]]\n
Generates a function to compute the mixed derivatives of the per-sample loss with respect to the model parameters and the input, i.e. the function
\\[ f(\\theta, x, y) = \\nabla_{\\theta}\\nabla_{x}\\operatorname{loss} (\\operatorname{model}(\\theta, x), y) \\] for a loss function \\(\\operatorname{loss}\\) and a model \\(\\operatorname{model}\\) with model parameters \\(\\theta\\).
PARAMETER DESCRIPTION model
The PyTorch model for which the mixed derivatives are computed.
TYPE: Module
loss
A callable that computes the loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
RETURNS DESCRIPTION Callable[[Dict[str, Tensor], Tensor, Tensor], Dict[str, Tensor]]
A callable that takes a dictionary of model parameters, the model's input, and the labels. The callable returns the mixed derivatives of the per-sample loss with respect to the model's parameters and input.
Source code in src/pydvl/influence/torch/functional.py
def create_per_sample_mixed_derivative_function(\n model: torch.nn.Module, loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor]\n) -> Callable[\n [Dict[str, torch.Tensor], torch.Tensor, torch.Tensor], Dict[str, torch.Tensor]\n]:\n r\"\"\"\n Generates a function to computes the mixed derivatives, of the per-sample loss with\n respect to the model parameters and the input, i.e. the function\n\n \\[ f(\\theta, x, y) = \\nabla_{\\theta}\\nabla_{x}\\operatorname{loss}\n (\\operatorname{model}(\\theta, x), y) \\]\n\n for a loss function $\\operatorname{loss}$ and a model $\\operatorname{model}$ with\n model parameters $\\theta$.\n\n Args:\n model: The PyTorch model for which the mixed derivatives are computed.\n loss: A callable that computes the loss.\n\n Returns:\n A callable that takes a dictionary of model inputs, the model's input,\n and the labels. The callable returns the mixed derivatives of the\n per-sample loss with respect to the model's parameters and input.\n\n \"\"\"\n\n def compute_loss(params: Dict[str, torch.Tensor], x: torch.Tensor, y: torch.Tensor):\n outputs = functional_call(\n model, params, (to_model_device(x.unsqueeze(0), model),)\n )\n return loss(outputs, y.unsqueeze(0))\n\n per_samp_mix_derivative: Callable[\n [Dict[str, torch.Tensor], torch.Tensor, torch.Tensor], Dict[str, torch.Tensor]\n ] = torch.vmap(\n torch.func.jacrev(torch.func.grad(compute_loss, argnums=1)),\n in_dims=(None, 0, 0),\n )\n return per_samp_mix_derivative\n
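A short sketch of the mixed-derivative function, again with the hypothetical toy regression model.
import torch
from pydvl.influence.torch.functional import (
    create_per_sample_mixed_derivative_function,
)

model = torch.nn.Linear(3, 1)
loss = torch.nn.functional.mse_loss
params = {k: p.detach() for k, p in model.named_parameters()}
x, y = torch.randn(8, 3), torch.randn(8, 1)

mixed = create_per_sample_mixed_derivative_function(model, loss)
mixed_derivatives = mixed(params, x, y)
# A dict keyed by parameter name: for every sample, the derivative of the
# parameter gradient with respect to the input features of that sample.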
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.lanzcos_low_rank_hessian_approx","title":"lanzcos_low_rank_hessian_approx","text":"lanzcos_low_rank_hessian_approx(\n hessian_vp: Callable[[Tensor], Tensor],\n matrix_shape: Tuple[int, int],\n hessian_perturbation: float = 0.0,\n rank_estimate: int = 10,\n krylov_dimension: Optional[int] = None,\n tol: float = 1e-06,\n max_iter: Optional[int] = None,\n device: Optional[device] = None,\n eigen_computation_on_gpu: bool = False,\n torch_dtype: Optional[dtype] = None,\n) -> LowRankProductRepresentation\n
Calculates a low-rank approximation of the Hessian matrix of a scalar-valued function using the implicitly restarted Lanczos algorithm, i.e.:
\\[ H_{\\text{approx}} = V D V^T\\] where \\(D\\) is a diagonal matrix with the top (in absolute value) rank_estimate
eigenvalues of the Hessian and \\(V\\) contains the corresponding eigenvectors.
PARAMETER DESCRIPTION hessian_vp
A function that takes a vector and returns the product of the Hessian of the loss function with that vector.
TYPE: Callable[[Tensor], Tensor]
matrix_shape
The shape of the matrix represented by the Hessian-vector product.
TYPE: Tuple[int, int]
hessian_perturbation
Regularization parameter added to the Hessian-vector product for numerical stability.
TYPE: float
DEFAULT: 0.0
rank_estimate
The number of eigenvalues and corresponding eigenvectors to compute. Represents the desired rank of the Hessian approximation.
TYPE: int
DEFAULT: 10
krylov_dimension
The number of Krylov vectors to use for the Lanczos method. If not provided, it defaults to \\( \\min(\\text{model.n_parameters}, \\max(2 \\times \\text{rank_estimate} + 1, 20)) \\).
TYPE: Optional[int]
DEFAULT: None
tol
The stopping criteria for the Lanczos algorithm, which stops when the difference in the approximated eigenvalue is less than tol
. Defaults to 1e-6.
TYPE: float
DEFAULT: 1e-06
max_iter
The maximum number of iterations for the Lanczos method. If not provided, it defaults to \\( 10 \\cdot \\text{model.n_parameters}\\).
TYPE: Optional[int]
DEFAULT: None
device
The device to use for executing the hessian vector product.
TYPE: Optional[device]
DEFAULT: None
eigen_computation_on_gpu
If True, tries to execute the eigenpair approximation on the provided device via the cupy implementation. Ensure that either your model is small enough or you use a small rank_estimate to fit your device's memory. If False, the eigenpair approximation is executed on the CPU with scipy's wrapper to ARPACK.
TYPE: bool
DEFAULT: False
torch_dtype
If not provided, the current torch default dtype is used for conversion to torch.
TYPE: Optional[dtype]
DEFAULT: None
RETURNS DESCRIPTION LowRankProductRepresentation
LowRankProductRepresentation instance that contains the top (up to rank_estimate) eigenvalues and corresponding eigenvectors of the Hessian.
Source code in src/pydvl/influence/torch/functional.py
def lanzcos_low_rank_hessian_approx(\n hessian_vp: Callable[[torch.Tensor], torch.Tensor],\n matrix_shape: Tuple[int, int],\n hessian_perturbation: float = 0.0,\n rank_estimate: int = 10,\n krylov_dimension: Optional[int] = None,\n tol: float = 1e-6,\n max_iter: Optional[int] = None,\n device: Optional[torch.device] = None,\n eigen_computation_on_gpu: bool = False,\n torch_dtype: Optional[torch.dtype] = None,\n) -> LowRankProductRepresentation:\n r\"\"\"\n Calculates a low-rank approximation of the Hessian matrix of a scalar-valued\n function using the implicitly restarted Lanczos algorithm, i.e.:\n\n \\[ H_{\\text{approx}} = V D V^T\\]\n\n where \\(D\\) is a diagonal matrix with the top (in absolute value) `rank_estimate`\n eigenvalues of the Hessian and \\(V\\) contains the corresponding eigenvectors.\n\n Args:\n hessian_vp: A function that takes a vector and returns the product of\n the Hessian of the loss function.\n matrix_shape: The shape of the matrix, represented by the hessian vector\n product.\n hessian_perturbation: Regularization parameter added to the\n Hessian-vector product for numerical stability.\n rank_estimate: The number of eigenvalues and corresponding eigenvectors\n to compute. Represents the desired rank of the Hessian approximation.\n krylov_dimension: The number of Krylov vectors to use for the Lanczos\n method. If not provided, it defaults to\n \\( \\min(\\text{model.n_parameters},\n \\max(2 \\times \\text{rank_estimate} + 1, 20)) \\).\n tol: The stopping criteria for the Lanczos algorithm, which stops when\n the difference in the approximated eigenvalue is less than `tol`.\n Defaults to 1e-6.\n max_iter: The maximum number of iterations for the Lanczos method. If\n not provided, it defaults to \\( 10 \\cdot \\text{model.n_parameters}\\).\n device: The device to use for executing the hessian vector product.\n eigen_computation_on_gpu: If True, tries to execute the eigen pair\n approximation on the provided device via [cupy](https://cupy.dev/)\n implementation. Ensure that either your model is small enough, or you\n use a small rank_estimate to fit your device's memory. 
If False, the\n eigen pair approximation is executed on the CPU with scipy's wrapper to\n ARPACK.\n torch_dtype: If not provided, the current torch default dtype is used for\n conversion to torch.\n\n Returns:\n [LowRankProductRepresentation]\n [pydvl.influence.torch.functional.LowRankProductRepresentation]\n instance that contains the top (up until rank_estimate) eigenvalues\n and corresponding eigenvectors of the Hessian.\n \"\"\"\n\n torch_dtype = torch.get_default_dtype() if torch_dtype is None else torch_dtype\n\n if eigen_computation_on_gpu:\n try:\n import cupy as cp\n from cupyx.scipy.sparse.linalg import LinearOperator, eigsh\n from torch.utils.dlpack import from_dlpack, to_dlpack\n except ImportError as e:\n raise ImportError(\n f\"Try to install missing dependencies or set eigen_computation_on_gpu \"\n f\"to False: {e}\"\n )\n\n if device is None:\n raise ValueError(\n \"Without setting an explicit device, cupy is not supported\"\n )\n\n def to_torch_conversion_function(x: cp.NDArray) -> torch.Tensor:\n return from_dlpack(x.toDlpack()).to(torch_dtype)\n\n def mv(x):\n x = to_torch_conversion_function(x)\n y = hessian_vp(x) + hessian_perturbation * x\n return cp.from_dlpack(to_dlpack(y))\n\n else:\n from scipy.sparse.linalg import LinearOperator, eigsh\n\n def mv(x):\n x_torch = torch.as_tensor(x, device=device, dtype=torch_dtype)\n y = (\n (hessian_vp(x_torch) + hessian_perturbation * x_torch)\n .detach()\n .cpu()\n .numpy()\n )\n return y\n\n to_torch_conversion_function = partial(torch.as_tensor, dtype=torch_dtype)\n\n try:\n eigen_vals, eigen_vecs = eigsh(\n LinearOperator(matrix_shape, matvec=mv),\n k=rank_estimate,\n maxiter=max_iter,\n tol=tol,\n ncv=krylov_dimension,\n return_eigenvectors=True,\n )\n\n except ArpackNoConvergence as e:\n logger.warning(\n f\"ARPACK did not converge for parameters {max_iter=}, {tol=}, \"\n f\"{krylov_dimension=}, {rank_estimate=}. \\n \"\n f\"Returning the best approximation found so far. \"\n f\"Use those with care or modify parameters.\\n Original error: {e}\"\n )\n\n eigen_vals, eigen_vecs = e.eigenvalues, e.eigenvectors\n\n eigen_vals = to_torch_conversion_function(eigen_vals)\n eigen_vecs = to_torch_conversion_function(eigen_vecs)\n\n return LowRankProductRepresentation(eigen_vals, eigen_vecs)\n
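A hedged sketch using an explicit symmetric matrix as a stand-in for a model's Hessian-vector product; all names are illustrative.
import torch
from pydvl.influence.torch.functional import lanzcos_low_rank_hessian_approx

dim = 50
a = torch.randn(dim, dim)
hessian = a @ a.T  # symmetric positive semi-definite stand-in for a Hessian

low_rank = lanzcos_low_rank_hessian_approx(
    hessian_vp=lambda v: hessian @ v,
    matrix_shape=(dim, dim),
    rank_estimate=5,
)
# low_rank is a LowRankProductRepresentation with the top-5 eigenvalues
# (by absolute value) and the corresponding eigenvectors.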
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.model_hessian_low_rank","title":"model_hessian_low_rank","text":"model_hessian_low_rank(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n training_data: DataLoader,\n hessian_perturbation: float = 0.0,\n rank_estimate: int = 10,\n krylov_dimension: Optional[int] = None,\n tol: float = 1e-06,\n max_iter: Optional[int] = None,\n eigen_computation_on_gpu: bool = False,\n precompute_grad: bool = False,\n) -> LowRankProductRepresentation\n
Calculates a low-rank approximation of the Hessian matrix of the model's loss function using the implicitly restarted Lanczos algorithm, i.e.
\\[ H_{\\text{approx}} = V D V^T\\] where \\(D\\) is a diagonal matrix with the top (in absolute value) rank_estimate
eigenvalues of the Hessian and \\(V\\) contains the corresponding eigenvectors.
PARAMETER DESCRIPTION model
A PyTorch model instance. The Hessian will be calculated with respect to this model's parameters.
TYPE: Module
loss
A callable that computes the loss.
training_data
A DataLoader instance that provides the model's training data. Used in calculating the Hessian-vector products.
TYPE: DataLoader
hessian_perturbation
Optional regularization parameter added to the Hessian-vector product for numerical stability.
TYPE: float
DEFAULT: 0.0
rank_estimate
The number of eigenvalues and corresponding eigenvectors to compute. Represents the desired rank of the Hessian approximation.
TYPE: int
DEFAULT: 10
krylov_dimension
The number of Krylov vectors to use for the Lanczos method. If not provided, it defaults to min(model.n_parameters, max(2*rank_estimate + 1, 20)).
TYPE: Optional[int]
DEFAULT: None
tol
The stopping criteria for the Lanczos algorithm, which stops when the difference in the approximated eigenvalue is less than tol
. Defaults to 1e-6.
TYPE: float
DEFAULT: 1e-06
max_iter
The maximum number of iterations for the Lanczos method. If not provided, it defaults to 10*model.n_parameters.
TYPE: Optional[int]
DEFAULT: None
eigen_computation_on_gpu
If True, tries to execute the eigenpair approximation on the provided device via the cupy implementation. Make sure that either your model is small enough or you use a small rank_estimate to fit your device's memory. If False, the eigenpair approximation is executed on the CPU with scipy's wrapper to ARPACK.
TYPE: bool
DEFAULT: False
precompute_grad
If True, the full data gradient is precomputed and kept in memory, which can speed up the Hessian-vector product computation. Set this to False if you cannot afford to keep the full computation graph in memory.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION LowRankProductRepresentation
LowRankProductRepresentation instance that contains the top (up to rank_estimate) eigenvalues and corresponding eigenvectors of the Hessian.
Source code in src/pydvl/influence/torch/functional.py
def model_hessian_low_rank(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n training_data: DataLoader,\n hessian_perturbation: float = 0.0,\n rank_estimate: int = 10,\n krylov_dimension: Optional[int] = None,\n tol: float = 1e-6,\n max_iter: Optional[int] = None,\n eigen_computation_on_gpu: bool = False,\n precompute_grad: bool = False,\n) -> LowRankProductRepresentation:\n r\"\"\"\n Calculates a low-rank approximation of the Hessian matrix of the model's\n loss function using the implicitly restarted Lanczos algorithm, i.e.\n\n \\[ H_{\\text{approx}} = V D V^T\\]\n\n where \\(D\\) is a diagonal matrix with the top (in absolute value) `rank_estimate`\n eigenvalues of the Hessian and \\(V\\) contains the corresponding eigenvectors.\n\n\n Args:\n model: A PyTorch model instance. The Hessian will be calculated with respect to\n this model's parameters.\n loss : A callable that computes the loss.\n training_data: A DataLoader instance that provides the model's training data.\n Used in calculating the Hessian-vector products.\n hessian_perturbation: Optional regularization parameter added to the\n Hessian-vector product for numerical stability.\n rank_estimate: The number of eigenvalues and corresponding eigenvectors to\n compute. Represents the desired rank of the Hessian approximation.\n krylov_dimension: The number of Krylov vectors to use for the Lanczos method.\n If not provided, it defaults to min(model.n_parameters,\n max(2*rank_estimate + 1, 20)).\n tol: The stopping criteria for the Lanczos algorithm,\n which stops when the difference in the approximated eigenvalue is less than\n `tol`. Defaults to 1e-6.\n max_iter: The maximum number of iterations for the Lanczos method.\n If not provided, it defaults to 10*model.n_parameters.\n eigen_computation_on_gpu: If True, tries to execute the eigen pair approximation\n on the provided device via cupy implementation.\n Make sure, that either your model is small enough or you use a\n small rank_estimate to fit your device's memory.\n If False, the eigen pair approximation is executed on the CPU by\n scipy wrapper to ARPACK.\n precompute_grad: If True, the full data gradient is precomputed and kept\n in memory, which can speed up the hessian vector product computation.\n Set this to False, if you can't afford to keep the full computation graph\n in memory.\n\n Returns:\n [LowRankProductRepresentation]\n [pydvl.influence.torch.functional.LowRankProductRepresentation]\n instance that contains the top (up until rank_estimate) eigenvalues\n and corresponding eigenvectors of the Hessian.\n \"\"\"\n raw_hvp = create_hvp_function(\n model, loss, training_data, use_average=True, precompute_grad=precompute_grad\n )\n n_params = sum([p.numel() for p in model.parameters() if p.requires_grad])\n device = next(model.parameters()).device\n return lanzcos_low_rank_hessian_approx(\n hessian_vp=raw_hvp,\n matrix_shape=(n_params, n_params),\n hessian_perturbation=hessian_perturbation,\n rank_estimate=rank_estimate,\n krylov_dimension=krylov_dimension,\n tol=tol,\n max_iter=max_iter,\n device=device,\n eigen_computation_on_gpu=eigen_computation_on_gpu,\n )\n
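A minimal sketch, assuming a toy regression model and random data; the DataLoader supplies the (x, y) batches used for the averaged Hessian-vector products.
import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.functional import model_hessian_low_rank

model = torch.nn.Linear(5, 1)
loss = torch.nn.functional.mse_loss
train_data = TensorDataset(torch.randn(64, 5), torch.randn(64, 1))

low_rank = model_hessian_low_rank(
    model, loss, DataLoader(train_data, batch_size=16), rank_estimate=4
)
# Contains the top-4 eigenvalues of the loss Hessian and their eigenvectors.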
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.randomized_nystroem_approximation","title":"randomized_nystroem_approximation","text":"randomized_nystroem_approximation(\n mat_mat_prod: Union[Tensor, Callable[[Tensor], Tensor]],\n input_dim: int,\n rank: int,\n input_type: dtype,\n shift_func: Optional[Callable[[Tensor], Tensor]] = None,\n mat_vec_device: device = torch.device(\"cpu\"),\n) -> LowRankProductRepresentation\n
Given a matrix vector product function (representing a symmetric positive definite matrix \\(A\\) ), computes a random Nystr\u00f6m low rank approximation of \\(A\\) in factored form, i.e.
\\[ A_{\\text{nys}} = (A \\Omega)(\\Omega^T A \\Omega)^{\\dagger}(A \\Omega)^T = U \\Sigma U^T \\] where \\(\\Omega\\) is a standard normal random matrix.
PARAMETER DESCRIPTION mat_mat_prod
A callable representing the matrix vector product
TYPE: Union[Tensor, Callable[[Tensor], Tensor]]
input_dim
dimension of the input for the matrix vector product
TYPE: int
input_type
data_type of inputs
TYPE: dtype
rank
rank of the approximation
TYPE: int
shift_func
optional function for computing the stabilizing shift in the construction of the randomized nystroem approximation, defaults to
\\[ \\sqrt{\\operatorname{\\text{input_dim}}} \\cdot \\varepsilon(\\operatorname{\\text{input_type}}) \\cdot \\|A\\Omega\\|_2,\\] where \\(\\varepsilon(\\operatorname{\\text{input_type}})\\) is the value of the machine precision corresponding to the data type.
TYPE: Optional[Callable[[Tensor], Tensor]]
DEFAULT: None
mat_vec_device
device where the matrix vector product has to be executed
TYPE: device
DEFAULT: device('cpu')
RETURNS DESCRIPTION LowRankProductRepresentation
object containing \\(U\\) and \\(\\Sigma\\)
Source code in src/pydvl/influence/torch/functional.py
def randomized_nystroem_approximation(\n mat_mat_prod: Union[torch.Tensor, Callable[[torch.Tensor], torch.Tensor]],\n input_dim: int,\n rank: int,\n input_type: torch.dtype,\n shift_func: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,\n mat_vec_device: torch.device = torch.device(\"cpu\"),\n) -> LowRankProductRepresentation:\n r\"\"\"\n Given a matrix vector product function (representing a symmetric positive definite\n matrix $A$ ), computes a random Nystr\u00f6m low rank approximation of\n $A$ in factored form, i.e.\n\n $$ A_{\\text{nys}} = (A \\Omega)(\\Omega^T A \\Omega)^{\\dagger}(A \\Omega)^T\n = U \\Sigma U^T $$\n\n where $\\Omega$ is a standard normal random matrix.\n\n Args:\n mat_mat_prod: A callable representing the matrix vector product\n input_dim: dimension of the input for the matrix vector product\n input_type: data_type of inputs\n rank: rank of the approximation\n shift_func: optional function for computing the stabilizing shift in the\n construction of the randomized nystroem approximation, defaults to\n\n $$ \\sqrt{\\operatorname{\\text{input_dim}}} \\cdot\n \\varepsilon(\\operatorname{\\text{input_type}}) \\cdot \\|A\\Omega\\|_2,$$\n\n where $\\varepsilon(\\operatorname{\\text{input_type}})$ is the value of the\n machine precision corresponding to the data type.\n mat_vec_device: device where the matrix vector product has to be executed\n\n Returns:\n object containing, $U$ and $\\Sigma$\n \"\"\"\n\n if shift_func is None:\n\n def shift_func(x: torch.Tensor):\n return (\n torch.sqrt(torch.as_tensor(input_dim))\n * torch.finfo(x.dtype).eps\n * torch.linalg.norm(x)\n )\n\n _mat_mat_prod: Callable[[torch.Tensor], torch.Tensor]\n\n if isinstance(mat_mat_prod, torch.Tensor):\n\n def _mat_mat_prod(x: torch.Tensor):\n return mat_mat_prod @ x\n\n else:\n _mat_mat_prod = mat_mat_prod\n\n random_sample_matrix = torch.randn(\n input_dim, rank, device=mat_vec_device, dtype=input_type\n )\n random_sample_matrix, _ = torch.linalg.qr(random_sample_matrix)\n\n sketch_mat = _mat_mat_prod(random_sample_matrix)\n\n shift = shift_func(sketch_mat)\n sketch_mat += shift * random_sample_matrix\n cholesky_mat = torch.matmul(random_sample_matrix.t(), sketch_mat)\n try:\n triangular_mat = torch.linalg.cholesky(cholesky_mat)\n except _LinAlgError as e:\n logger.warning(\n f\"Encountered error in cholesky decomposition: {e}.\\n \"\n f\"Increasing shift by smallest eigenvalue and re-compute\"\n )\n eigen_vals, eigen_vectors = torch.linalg.eigh(cholesky_mat)\n shift += torch.abs(torch.min(eigen_vals))\n eigen_vals += shift\n triangular_mat = torch.linalg.cholesky(\n torch.mm(eigen_vectors, torch.mm(torch.diag(eigen_vals), eigen_vectors.T))\n )\n\n svd_input = torch.linalg.solve_triangular(\n triangular_mat.t(), sketch_mat, upper=True, left=False\n )\n left_singular_vecs, singular_vals, _ = torch.linalg.svd(\n svd_input, full_matrices=False\n )\n singular_vals = torch.clamp(singular_vals**2 - shift, min=0)\n\n return LowRankProductRepresentation(singular_vals, left_singular_vecs)\n
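A sketch applying the Nyström approximation to an explicit symmetric positive definite matrix, passed directly instead of a matrix-free product; the shift function is left at its default.
import torch
from pydvl.influence.torch.functional import randomized_nystroem_approximation

dim, rank = 100, 10
a = torch.randn(dim, dim)
spd_matrix = a @ a.T + dim * torch.eye(dim)  # symmetric positive definite

low_rank = randomized_nystroem_approximation(
    spd_matrix, input_dim=dim, rank=rank, input_type=spd_matrix.dtype
)
# The result is a LowRankProductRepresentation of U and Sigma with
# U @ diag(Sigma) @ U.T approximating spd_matrix.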
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.model_hessian_nystroem_approximation","title":"model_hessian_nystroem_approximation","text":"model_hessian_nystroem_approximation(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n data_loader: DataLoader,\n rank: int,\n shift_func: Optional[Callable[[Tensor], Tensor]] = None,\n) -> LowRankProductRepresentation\n
Given a model, loss and a data_loader, computes a random Nystr\u00f6m low rank approximation of the corresponding Hessian matrix in factored form, i.e.
\\[ H_{\\text{nys}} = (H \\Omega)(\\Omega^T H \\Omega)^{+}(H \\Omega)^T = U \\Sigma U^T \\] PARAMETER DESCRIPTION model
A PyTorch model instance. The Hessian will be calculated with respect to this model's parameters.
TYPE: Module
loss
A callable that computes the loss.
data_loader
A DataLoader instance that provides the model's training data. Used in calculating the Hessian-vector products.
TYPE: DataLoader
rank
rank of the approximation
TYPE: int
shift_func
optional function for computing the stabilizing shift in the construction of the randomized nystroem approximation, defaults to
\\[ \\sqrt{\\operatorname{\\text{input_dim}}} \\cdot \\varepsilon(\\operatorname{\\text{input_type}}) \\cdot \\|A\\Omega\\|_2,\\] where \\(\\varepsilon(\\operatorname{\\text{input_type}})\\) is the value of the machine precision corresponding to the data type.
TYPE: Optional[Callable[[Tensor], Tensor]]
DEFAULT: None
RETURNS DESCRIPTION LowRankProductRepresentation
object containing \\(U\\) and \\(\\Sigma\\)
Source code in src/pydvl/influence/torch/functional.py
def model_hessian_nystroem_approximation(\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n data_loader: DataLoader,\n rank: int,\n shift_func: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,\n) -> LowRankProductRepresentation:\n r\"\"\"\n Given a model, loss and a data_loader, computes a random Nystr\u00f6m low rank approximation of\n the corresponding Hessian matrix in factored form, i.e.\n\n $$ H_{\\text{nys}} = (H \\Omega)(\\Omega^T H \\Omega)^{+}(H \\Omega)^T\n = U \\Sigma U^T $$\n\n Args:\n model: A PyTorch model instance. The Hessian will be calculated with respect to\n this model's parameters.\n loss : A callable that computes the loss.\n data_loader: A DataLoader instance that provides the model's training data.\n Used in calculating the Hessian-vector products.\n rank: rank of the approximation\n shift_func: optional function for computing the stabilizing shift in the\n construction of the randomized nystroem approximation, defaults to\n\n $$ \\sqrt{\\operatorname{\\text{input_dim}}} \\cdot\n \\varepsilon(\\operatorname{\\text{input_type}}) \\cdot \\|A\\Omega\\|_2,$$\n\n where $\\varepsilon(\\operatorname{\\text{input_type}})$ is the value of the\n machine precision corresponding to the data type.\n\n Returns:\n object containing, $U$ and $\\Sigma$\n \"\"\"\n\n model_hvp = create_hvp_function(\n model, loss, data_loader, precompute_grad=False, use_average=True\n )\n device = next((p.device for p in model.parameters()))\n dtype = next((p.dtype for p in model.parameters()))\n in_dim = sum((p.numel() for p in model.parameters() if p.requires_grad))\n\n def model_hessian_mat_mat_prod(x: torch.Tensor):\n return torch.func.vmap(model_hvp, in_dims=1, randomness=\"same\")(x).t()\n\n return randomized_nystroem_approximation(\n model_hessian_mat_mat_prod,\n in_dim,\n rank,\n dtype,\n shift_func=shift_func,\n mat_vec_device=device,\n )\n
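A sketch with the same hypothetical regression setup, computing a rank-4 Nyström factorisation of the model's Hessian directly from a DataLoader.
import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.functional import model_hessian_nystroem_approximation

model = torch.nn.Linear(5, 1)
loss = torch.nn.functional.mse_loss
train_data = TensorDataset(torch.randn(64, 5), torch.randn(64, 1))

low_rank = model_hessian_nystroem_approximation(
    model, loss, DataLoader(train_data, batch_size=16), rank=4
)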
"},{"location":"api/pydvl/influence/torch/functional/#pydvl.influence.torch.functional.operator_nystroem_approximation","title":"operator_nystroem_approximation","text":"operator_nystroem_approximation(\n operator: \"TensorOperator\",\n rank: int,\n shift_func: Optional[Callable[[Tensor], Tensor]] = None,\n)\n
Given an operator (representing a symmetric positive definite matrix \\(A\\) ), computes a random Nystr\u00f6m low rank approximation of \\(A\\) in factored form, i.e.
\\[ A_{\\text{nys}} = (A \\Omega)(\\Omega^T A \\Omega)^{\\dagger}(A \\Omega)^T = U \\Sigma U^T \\] where \\(\\Omega\\) is a standard normal random matrix.
PARAMETER DESCRIPTION operator
the operator to approximate
TYPE: 'TensorOperator'
rank
rank of the approximation
TYPE: int
shift_func
optional function for computing the stabilizing shift in the construction of the randomized nystroem approximation, defaults to
\\[ \\sqrt{\\operatorname{\\text{input_dim}}} \\cdot \\varepsilon(\\operatorname{\\text{input_type}}) \\cdot \\|A\\Omega\\|_2,\\] where \\(\\varepsilon(\\operatorname{\\text{input_type}})\\) is the value of the machine precision corresponding to the data type.
TYPE: Optional[Callable[[Tensor], Tensor]]
DEFAULT: None
RETURNS DESCRIPTION object containing \\(U\\) and \\(\\Sigma\\)
Source code in src/pydvl/influence/torch/functional.py
def operator_nystroem_approximation(\n operator: \"TensorOperator\",\n rank: int,\n shift_func: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,\n):\n r\"\"\"\n Given an operator (representing a symmetric positive definite\n matrix $A$ ), computes a random Nystr\u00f6m low rank approximation of\n $A$ in factored form, i.e.\n\n $$ A_{\\text{nys}} = (A \\Omega)(\\Omega^T A \\Omega)^{\\dagger}(A \\Omega)^T\n = U \\Sigma U^T $$\n\n where $\\Omega$ is a standard normal random matrix.\n\n Args:\n operator: the operator to approximate\n rank: rank of the approximation\n shift_func: optional function for computing the stabilizing shift in the\n construction of the randomized nystroem approximation, defaults to\n\n $$ \\sqrt{\\operatorname{\\text{input_dim}}} \\cdot\n \\varepsilon(\\operatorname{\\text{input_type}}) \\cdot \\|A\\Omega\\|_2,$$\n\n where $\\varepsilon(\\operatorname{\\text{input_type}})$ is the value of the\n machine precision corresponding to the data type.\n\n Returns:\n object containing, $U$ and $\\Sigma$\n \"\"\"\n\n def mat_mat_prod(x: torch.Tensor):\n return operator.apply(x.t()).t()\n\n return randomized_nystroem_approximation(\n mat_mat_prod,\n operator.input_size,\n rank,\n operator.dtype,\n shift_func=shift_func,\n mat_vec_device=operator.device,\n )\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/","title":"Influence function model","text":""},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model","title":"pydvl.influence.torch.influence_function_model","text":"This module implements several implementations of InfluenceFunctionModel utilizing PyTorch.
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.TorchInfluenceFunctionModel","title":"TorchInfluenceFunctionModel","text":"TorchInfluenceFunctionModel(\n model: Module, loss: Callable[[Tensor, Tensor], Tensor]\n)\n
Bases: InfluenceFunctionModel[Tensor, DataLoader]
, ABC
Abstract base class for influence computation related to torch models
Source code in src/pydvl/influence/torch/influence_function_model.py
def __init__(\n self,\n model: nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n):\n self.loss = loss\n self.model = model\n self._n_parameters = sum(\n [p.numel() for p in model.parameters() if p.requires_grad]\n )\n self._model_device = next(\n (p.device for p in model.parameters() if p.requires_grad)\n )\n self._model_params = {\n k: p.detach() for k, p in self.model.named_parameters() if p.requires_grad\n }\n self._model_dtype = next(\n (p.dtype for p in model.parameters() if p.requires_grad)\n )\n super().__init__()\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.TorchInfluenceFunctionModel.is_fitted","title":"is_fitted abstractmethod
property
","text":"is_fitted\n
Override this to expose the fitting status of the instance.
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.TorchInfluenceFunctionModel.fit","title":"fit abstractmethod
","text":"fit(data: DataLoaderType) -> InfluenceFunctionModel\n
Override this method to fit the influence function model to training data, e.g. to pre-compute the Hessian matrix or matrix decompositions.
PARAMETER DESCRIPTION data
TYPE: DataLoaderType
RETURNS DESCRIPTION InfluenceFunctionModel
The fitted instance
Source code in src/pydvl/influence/base_influence_function_model.py
@abstractmethod\ndef fit(self, data: DataLoaderType) -> InfluenceFunctionModel:\n \"\"\"\n Override this method to fit the influence function model to training data,\n e.g. pre-compute hessian matrix or matrix decompositions\n\n Args:\n data:\n\n Returns:\n The fitted instance\n \"\"\"\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.TorchInfluenceFunctionModel.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.TorchInfluenceFunctionModel.influences","title":"influences","text":"influences(\n x_test: Tensor,\n y_test: Tensor,\n x: Optional[Tensor] = None,\n y: Optional[Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Tensor\n
Compute the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x))\\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. For all input tensors it is assumed that the first dimension is the batch dimension (in case you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
y_test
label tensor to compute gradients
TYPE: Tensor
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[Tensor]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[Tensor]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences(\n self,\n x_test: torch.Tensor,\n y_test: torch.Tensor,\n x: Optional[torch.Tensor] = None,\n y: Optional[torch.Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> torch.Tensor:\n r\"\"\"\n Compute the approximation of\n\n \\[\n \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x))\\rangle\n \\]\n\n for the case of up-weighting influence, resp.\n\n \\[\n \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle\n \\]\n\n for the perturbation type influence case. For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n t: torch.Tensor = super().influences(x_test, y_test, x, y, mode=mode)\n return t\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.TorchInfluenceFunctionModel.influence_factors","title":"influence_factors","text":"influence_factors(x: Tensor, y: Tensor) -> Tensor\n
Compute approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension (in case you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/torch/influence_function_model.py
def influence_factors(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:\n r\"\"\"\n Compute approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n return super().influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.TorchInfluenceFunctionModel.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: Tensor,\n x: Tensor,\n y: Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Tensor\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension (in case you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION z_test_factors
pre-computed tensor, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\)
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences_from_factors(\n self,\n z_test_factors: torch.Tensor,\n x: torch.Tensor,\n y: torch.Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> torch.Tensor:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$. For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n z_test_factors: pre-computed tensor, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if mode == InfluenceMode.Up:\n return (\n z_test_factors.to(self.model_device)\n @ self._loss_grad(x.to(self.model_device), y.to(self.model_device)).T\n )\n elif mode == InfluenceMode.Perturbation:\n return torch.einsum(\n \"ia,j...a->ij...\",\n z_test_factors.to(self.model_device),\n self._flat_loss_mixed_grad(\n x.to(self.model_device), y.to(self.model_device)\n ),\n )\n else:\n raise UnsupportedInfluenceModeException(mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence","title":"DirectInfluence","text":"DirectInfluence(\n model: Module,\n loss: LossType,\n regularization: Optional[Union[float, Dict[str, Optional[float]]]] = None,\n block_structure: Union[\n BlockMode, OrderedDict[str, List[str]]\n ] = BlockMode.FULL,\n second_order_mode: SecondOrderMode = SecondOrderMode.HESSIAN,\n)\n
Bases: TorchComposableInfluence[DirectSolveOperator]
Given a model and training data, it finds x such that \\(Hx = b\\), with \\(H\\) being the model's Hessian or Gauss-Newton matrix.
PARAMETER DESCRIPTION model
The model.
TYPE: Module
loss
The loss function.
TYPE: LossType
regularization
The regularization parameter. In case a dictionary is provided, the keys must be a subset of the block identifiers.
TYPE: Optional[Union[float, Dict[str, Optional[float]]]]
DEFAULT: None
block_structure
The blocking structure, either a pre-defined enum or a custom block structure; see the information regarding block-diagonal approximation.
TYPE: Union[BlockMode, OrderedDict[str, List[str]]]
DEFAULT: FULL
second_order_mode
The second order mode, either SecondOrderMode.HESSIAN
or SecondOrderMode.GAUSS_NEWTON
.
TYPE: SecondOrderMode
DEFAULT: HESSIAN
Source code in src/pydvl/influence/torch/influence_function_model.py
def __init__(\n self,\n model: nn.Module,\n loss: LossType,\n regularization: Optional[Union[float, Dict[str, Optional[float]]]] = None,\n block_structure: Union[BlockMode, OrderedDict[str, List[str]]] = BlockMode.FULL,\n second_order_mode: SecondOrderMode = SecondOrderMode.HESSIAN,\n):\n super().__init__(\n model,\n block_structure=block_structure,\n regularization=regularization,\n )\n self.second_order_mode = second_order_mode\n self.loss = loss\n
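A compact end-to-end sketch for DirectInfluence, assuming a small regression model; the import path follows the module location shown above and all data is random.
import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.influence_function_model import DirectInfluence

model = torch.nn.Linear(5, 1)
loss = torch.nn.functional.mse_loss
train_data = TensorDataset(torch.randn(64, 5), torch.randn(64, 1))

infl_model = DirectInfluence(model, loss, regularization=0.01)
infl_model = infl_model.fit(DataLoader(train_data, batch_size=16))

x_test, y_test = torch.randn(4, 5), torch.randn(4, 1)
x_train, y_train = torch.randn(8, 5), torch.randn(8, 1)
# Up-weighting influences: one scalar product per (test, train) pair.
values = infl_model.influences(x_test, y_test, x_train, y_train)
Note that fit must be called before influences, otherwise a NotFittedException is raised (see fit_required below).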
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.fit","title":"fit","text":"fit(data: DataLoaderType) -> InfluenceFunctionModel\n
Fits to the provided data by internally creating a block mapper instance from it.
PARAMETER DESCRIPTION data
iterable of tensors
TYPE: DataLoaderType
RETURNS DESCRIPTION InfluenceFunctionModel
Fitted instance
Source code in src/pydvl/influence/base_influence_function_model.py
@log_duration(log_level=logging.INFO)\ndef fit(self, data: DataLoaderType) -> InfluenceFunctionModel:\n \"\"\"\n Fitting to provided data, by internally creating a block mapper instance from\n it.\n Args:\n data: iterable of tensors\n\n Returns:\n Fitted instance\n \"\"\"\n self.block_mapper = self._create_block_mapper(data)\n return self\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: TensorType, y: TensorType) -> TensorType\n
Computes the approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/base_influence_function_model.py
def influence_factors(self, x: TensorType, y: TensorType) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return self._influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.influences","title":"influences","text":"influences(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computes the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
def influences(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n if x is None and y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n\n if x is not None and y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n\n return self._influences(x_test, y_test, x, y, mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors(\n self,\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n tensors = self.block_mapper.generate_interactions_from_transformed_grads(\n z_test_factors,\n self._create_batch(x, y),\n mode,\n )\n result: TensorType = next(tensors)\n for tensor in tensors:\n result = result + tensor\n return result\n
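A sketch of the two-stage workflow: the test factors are computed once and then reused against several training batches; the setup mirrors the DirectInfluence sketch above.
import torch
from torch.utils.data import DataLoader, TensorDataset
from pydvl.influence.torch.influence_function_model import DirectInfluence

model = torch.nn.Linear(5, 1)
loss = torch.nn.functional.mse_loss
train_data = TensorDataset(torch.randn(64, 5), torch.randn(64, 1))
train_loader = DataLoader(train_data, batch_size=16)

infl_model = DirectInfluence(model, loss, regularization=0.01).fit(train_loader)

x_test, y_test = torch.randn(4, 5), torch.randn(4, 1)
factors = infl_model.influence_factors(x_test, y_test)  # computed once

# Reuse the pre-computed factors for every training batch.
for x_batch, y_batch in train_loader:
    values = infl_model.influences_from_factors(factors, x_batch, y_batch)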
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.influences_by_block","title":"influences_by_block","text":"influences_by_block(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise influence values for the provided data, i.e. an approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of the approximation of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_by_block(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise influence values for the provided data, i.e. an\n approximation of\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of the approximation of\n $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block.\n\n \"\"\"\n left_batch = self._create_batch(x_test, y_test)\n\n if x is None:\n if y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n right_batch = left_batch\n else:\n if y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n right_batch = self._create_batch(x, y)\n\n return self.block_mapper.interactions(left_batch, right_batch, mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.influence_factors_by_block","title":"influence_factors_by_block","text":"influence_factors_by_block(\n x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise
OrderedDict[str, TensorType]
approximate inverse Hessian matrix vector products per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influence_factors_by_block(\n self, x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Ordered dictionary of tensors representing the element-wise\n approximate inverse Hessian matrix vector products per block.\n\n \"\"\"\n return self.block_mapper.transformed_grads(self._create_batch(x, y))\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.influences_from_factors_by_block","title":"influences_from_factors_by_block","text":"influences_from_factors_by_block(\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Block-wise computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: OrderedDict[str, TensorType]
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products for the provided batch per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors_by_block(\n self,\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Block-wise computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block\n\n \"\"\"\n return self.block_mapper.interactions_from_transformed_grads(\n z_test_factors, self._create_batch(x, y), mode\n )\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.DirectInfluence.with_regularization","title":"with_regularization","text":"with_regularization(\n regularization: Union[float, Dict[str, Optional[float]]]\n) -> TorchComposableInfluence\n
Update the regularization parameter.
PARAMETER DESCRIPTION regularization
Either a positive float or a dictionary with the block names as keys and the regularization values as values.
TYPE: Union[float, Dict[str, Optional[float]]]
RETURNS DESCRIPTION TorchComposableInfluence
The modified instance
Source code in src/pydvl/influence/torch/influence_function_model.py
def with_regularization(\n self, regularization: Union[float, Dict[str, Optional[float]]]\n) -> TorchComposableInfluence:\n \"\"\"\n Update the regularization parameter.\n Args:\n regularization: Either a positive float or a dictionary with the\n block names as keys and the regularization values as values.\n\n Returns:\n The modified instance\n\n \"\"\"\n self._regularization_dict = self._build_regularization_dict(regularization)\n for k, reg in self._regularization_dict.items():\n self.block_mapper.composable_block_dict[k].op.regularization = reg\n return self\n
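A brief, hedged illustration of this call, assuming `if_model` is an already constructed composable influence model; the block names in the dictionary are hypothetical and must match the block structure chosen at construction.

```python
# Apply the same regularization value to every parameter block.
if_model = if_model.with_regularization(0.1)

# Alternatively, address blocks individually. The block names used here are
# assumptions and must correspond to the model's block structure.
if_model = if_model.with_regularization({"layer1": 0.1, "layer2": None})
```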
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.CgInfluence","title":"CgInfluence","text":"CgInfluence(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n hessian_regularization: float = 0.0,\n x0: Optional[Tensor] = None,\n rtol: float = 1e-07,\n atol: float = 1e-07,\n maxiter: Optional[int] = None,\n progress: bool = False,\n precompute_grad: bool = False,\n pre_conditioner: Optional[PreConditioner] = None,\n use_block_cg: bool = False,\n warn_on_max_iteration: bool = True,\n)\n
Bases: TorchInfluenceFunctionModel
Given a model and training data, it uses the conjugate gradient method to compute inverse Hessian-vector products. More precisely, it finds x such that \\(Hx = b\\), with \\(H\\) being the model's Hessian. For more info, see Conjugate Gradient.
PARAMETER DESCRIPTION model
A PyTorch model. The Hessian will be calculated with respect to this model's parameters.
TYPE: Module
loss
A callable that takes the model's output and target as input and returns the scalar loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
hessian_regularization
Optional regularization parameter added to the Hessian-vector product for numerical stability.
TYPE: float
DEFAULT: 0.0
x0
Initial guess for the solution of \\(Hx = b\\). If None, defaults to b.
TYPE: Optional[Tensor]
DEFAULT: None
rtol
Maximum relative tolerance of result.
TYPE: float
DEFAULT: 1e-07
atol
Absolute tolerance of result.
TYPE: float
DEFAULT: 1e-07
maxiter
Maximum number of iterations. If None, defaults to 10*len(b).
TYPE: Optional[int]
DEFAULT: None
progress
If True, display progress bars when computing in non-block mode (use_block_cg=False).
TYPE: bool
DEFAULT: False
precompute_grad
If True, the full data gradient is precomputed and kept in memory, which can speed up the Hessian-vector product computation. Set this to False if you cannot afford to keep the full computation graph in memory.
TYPE: bool
DEFAULT: False
pre_conditioner
Optional pre-conditioner to improve convergence of the conjugate gradient method.
TYPE: Optional[PreConditioner]
DEFAULT: None
use_block_cg
If True, use the block variant of the conjugate gradient method, which solves several right-hand sides simultaneously.
TYPE: bool
DEFAULT: False
warn_on_max_iteration
If True, logs a warning if the desired tolerance is not achieved within maxiter iterations. If False, the log level for this information is logging.DEBUG.
TYPE: bool
DEFAULT: True
Source code in src/pydvl/influence/torch/influence_function_model.py
def __init__(\n self,\n model: nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n hessian_regularization: float = 0.0,\n x0: Optional[torch.Tensor] = None,\n rtol: float = 1e-7,\n atol: float = 1e-7,\n maxiter: Optional[int] = None,\n progress: bool = False,\n precompute_grad: bool = False,\n pre_conditioner: Optional[PreConditioner] = None,\n use_block_cg: bool = False,\n warn_on_max_iteration: bool = True,\n):\n super().__init__(model, loss)\n self.warn_on_max_iteration = warn_on_max_iteration\n self.use_block_cg = use_block_cg\n self.pre_conditioner = pre_conditioner\n self.precompute_grad = precompute_grad\n self.progress = progress\n self.maxiter = maxiter\n self.atol = atol\n self.rtol = rtol\n self.x0 = x0\n self.hessian_regularization = hessian_regularization\n
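A minimal usage sketch for this constructor, assuming a toy model and synthetic data; the argument values are illustrative, not recommendations, and the import path follows this API reference.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from pydvl.influence.torch import CgInfluence

torch.manual_seed(0)
x_train, y_train = torch.randn(32, 5), torch.randn(32, 1)
x_test, y_test = torch.randn(8, 5), torch.randn(8, 1)
model = torch.nn.Linear(5, 1)

if_model = CgInfluence(
    model,
    torch.nn.functional.mse_loss,
    hessian_regularization=0.01,  # small damping for numerical stability
    use_block_cg=True,            # solve all right-hand sides simultaneously
)
if_model = if_model.fit(DataLoader(TensorDataset(x_train, y_train), batch_size=8))

# (n_test, n_train) tensor of up-weighting influence values
values = if_model.influences(x_test, y_test, x_train, y_train)
```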
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.CgInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.CgInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: Tensor, y: Tensor) -> Tensor\n
Compute approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension (in case you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/torch/influence_function_model.py
def influence_factors(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:\n r\"\"\"\n Compute approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n return super().influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.CgInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: Tensor,\n x: Tensor,\n y: Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Tensor\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension (in case you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION z_test_factors
pre-computed tensor, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\)
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences_from_factors(\n self,\n z_test_factors: torch.Tensor,\n x: torch.Tensor,\n y: torch.Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> torch.Tensor:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$. For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n z_test_factors: pre-computed tensor, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if mode == InfluenceMode.Up:\n return (\n z_test_factors.to(self.model_device)\n @ self._loss_grad(x.to(self.model_device), y.to(self.model_device)).T\n )\n elif mode == InfluenceMode.Perturbation:\n return torch.einsum(\n \"ia,j...a->ij...\",\n z_test_factors.to(self.model_device),\n self._flat_loss_mixed_grad(\n x.to(self.model_device), y.to(self.model_device)\n ),\n )\n else:\n raise UnsupportedInfluenceModeException(mode)\n
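When the same test batch is scored against many training batches, the factors can be computed once and reused. The following hedged sketch continues the CgInfluence example above, assuming `if_model`, `x_test`, `y_test`, `x_train` and `y_train` as defined there.

```python
from pydvl.influence.base_influence_function_model import InfluenceMode

# Stage 1: approximate H^{-1} grad(loss) once for the test batch.
factors = if_model.influence_factors(x_test, y_test)

# Stage 2: reuse the factors against any number of training batches.
values_up = if_model.influences_from_factors(factors, x_train, y_train)
values_pert = if_model.influences_from_factors(
    factors, x_train, y_train, mode=InfluenceMode.Perturbation
)
```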
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.CgInfluence.influences","title":"influences","text":"influences(\n x_test: Tensor,\n y_test: Tensor,\n x: Optional[Tensor] = None,\n y: Optional[Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Tensor\n
Compute an approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle, \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of perturbation-type influence. The approximate action of \\(H^{-1}\\) is achieved via the conjugate gradient method.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
y_test
label tensor to compute gradients
TYPE: Tensor
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[Tensor]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[Tensor]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Tensor
A tensor representing the element-wise scalar products for the provided batch.
Source code in src/pydvl/influence/torch/influence_function_model.py
@log_duration\ndef influences(\n self,\n x_test: torch.Tensor,\n y_test: torch.Tensor,\n x: Optional[torch.Tensor] = None,\n y: Optional[torch.Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> torch.Tensor:\n r\"\"\"\n Compute an approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle, \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of perturbation-type influence. The approximate action of\n $H^{-1}$ is achieved via the [conjugate gradient\n method](https://en.wikipedia.org/wiki/Conjugate_gradient_method).\n\n Args:\n x_test: model input to use in the gradient computations of\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n A tensor representing the element-wise scalar products for the\n provided batch.\n\n \"\"\"\n return super().influences(x_test, y_test, x, y, mode=mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence","title":"LissaInfluence","text":"LissaInfluence(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n regularization: Optional[Union[float, Dict[str, Optional[float]]]] = None,\n maxiter: int = 1000,\n dampen: float = 0.0,\n scale: float = 10.0,\n rtol: float = 0.0001,\n progress: bool = False,\n warn_on_max_iteration: bool = True,\n block_structure: Union[\n BlockMode, OrderedDict[str, List[str]]\n ] = BlockMode.FULL,\n second_order_mode: SecondOrderMode = SecondOrderMode.HESSIAN,\n)\n
Bases: TorchComposableInfluence[LissaOperator[BatchOperationType]]
Uses LiSSA, the Linear time Stochastic Second-Order Algorithm, to iteratively approximate the inverse Hessian. More precisely, it finds x such that \\(Hx = b\\), with \\(H\\) being the model's second derivative with respect to the parameters. This is done with the update
\\[H^{-1}_{j+1} b = b + \\left(I - d\\,I - \\frac{H}{s}\\right) H^{-1}_j b,\\] where \\(I\\) is the identity matrix, \\(d\\) is a dampening term and \\(s\\) a scaling factor that are applied to help convergence. For details, see Linear time Stochastic Second-Order Approximation (LiSSA).
PARAMETER DESCRIPTION model
A PyTorch model. The Hessian will be calculated with respect to this model's parameters.
TYPE: Module
loss
A callable that takes the model's output and target as input and returns the scalar loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
regularization
Optional regularization parameter added to the Hessian-vector product for numerical stability.
TYPE: Optional[Union[float, Dict[str, Optional[float]]]]
DEFAULT: None
maxiter
Maximum number of iterations.
TYPE: int
DEFAULT: 1000
dampen
Dampening factor, defaults to 0 for no dampening.
TYPE: float
DEFAULT: 0.0
scale
Scaling factor, defaults to 10.
TYPE: float
DEFAULT: 10.0
rtol
Tolerance to use for early stopping.
TYPE: float
DEFAULT: 0.0001
progress
If True, display progress bars.
TYPE: bool
DEFAULT: False
warn_on_max_iteration
If True, logs a warning if the desired tolerance is not achieved within maxiter iterations. If False, the log level for this information is logging.DEBUG.
TYPE: bool
DEFAULT: True
block_structure
The blocking structure, either a pre-defined enum or a custom block structure, see the information regarding block-diagonal approximation.
TYPE: Union[BlockMode, OrderedDict[str, List[str]]]
DEFAULT: FULL
second_order_mode
The second-order mode, either SecondOrderMode.HESSIAN or SecondOrderMode.GAUSS_NEWTON.
TYPE: SecondOrderMode
DEFAULT: HESSIAN
Source code in src/pydvl/influence/torch/influence_function_model.py
def __init__(\n self,\n model: nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n regularization: Optional[Union[float, Dict[str, Optional[float]]]] = None,\n maxiter: int = 1000,\n dampen: float = 0.0,\n scale: float = 10.0,\n rtol: float = 1e-4,\n progress: bool = False,\n warn_on_max_iteration: bool = True,\n block_structure: Union[BlockMode, OrderedDict[str, List[str]]] = BlockMode.FULL,\n second_order_mode: SecondOrderMode = SecondOrderMode.HESSIAN,\n):\n super().__init__(model, block_structure, regularization)\n self.maxiter = maxiter\n self.progress = progress\n self.rtol = rtol\n self.scale = scale\n self.dampen = dampen\n self.loss = loss\n self.second_order_mode = second_order_mode\n self.warn_on_max_iteration = warn_on_max_iteration\n
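A minimal usage sketch for this constructor, with a toy model and synthetic data standing in for real inputs; the chosen values are illustrative assumptions.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from pydvl.influence.torch import LissaInfluence

torch.manual_seed(0)
x_train, y_train = torch.randn(64, 5), torch.randn(64, 1)
x_test, y_test = torch.randn(8, 5), torch.randn(8, 1)
model = torch.nn.Linear(5, 1)

if_model = LissaInfluence(
    model,
    torch.nn.functional.mse_loss,
    regularization=0.01,
    maxiter=500,
    scale=10.0,  # the iteration typically requires the scale to dominate the largest Hessian eigenvalue
)
if_model = if_model.fit(DataLoader(TensorDataset(x_train, y_train), batch_size=16))

# (n_test, n_train) tensor of up-weighting influence values
values = if_model.influences(x_test, y_test, x_train, y_train)
```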
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.fit","title":"fit","text":"fit(data: DataLoaderType) -> InfluenceFunctionModel\n
Fit to the provided data by internally creating a block mapper instance from it.
PARAMETER DESCRIPTION data
Iterable of tensors.
TYPE: DataLoaderType
RETURNS DESCRIPTION InfluenceFunctionModel
Fitted instance
Source code in src/pydvl/influence/base_influence_function_model.py
@log_duration(log_level=logging.INFO)\ndef fit(self, data: DataLoaderType) -> InfluenceFunctionModel:\n \"\"\"\n Fitting to provided data, by internally creating a block mapper instance from\n it.\n Args:\n data: iterable of tensors\n\n Returns:\n Fitted instance\n \"\"\"\n self.block_mapper = self._create_block_mapper(data)\n return self\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: TensorType, y: TensorType) -> TensorType\n
Computes the approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/base_influence_function_model.py
def influence_factors(self, x: TensorType, y: TensorType) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return self._influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.influences","title":"influences","text":"influences(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computes the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\); if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
def influences(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n if x is None and y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n\n if x is not None and y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n\n return self._influences(x_test, y_test, x, y, mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors(\n self,\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n tensors = self.block_mapper.generate_interactions_from_transformed_grads(\n z_test_factors,\n self._create_batch(x, y),\n mode,\n )\n result: TensorType = next(tensors)\n for tensor in tensors:\n result = result + tensor\n return result\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.influences_by_block","title":"influences_by_block","text":"influences_by_block(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise influence values for the provided data, i.e. an approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of the approximation of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\); if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products for the provided batch per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_by_block(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise influence values for the provided data, i.e. an\n approximation of\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of the approximation of\n $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block.\n\n \"\"\"\n left_batch = self._create_batch(x_test, y_test)\n\n if x is None:\n if y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n right_batch = left_batch\n else:\n if y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n right_batch = self._create_batch(x, y)\n\n return self.block_mapper.interactions(left_batch, right_batch, mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.influence_factors_by_block","title":"influence_factors_by_block","text":"influence_factors_by_block(\n x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise approximate inverse Hessian matrix vector products per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influence_factors_by_block(\n self, x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Ordered dictionary of tensors representing the element-wise\n approximate inverse Hessian matrix vector products per block.\n\n \"\"\"\n return self.block_mapper.transformed_grads(self._create_batch(x, y))\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.influences_from_factors_by_block","title":"influences_from_factors_by_block","text":"influences_from_factors_by_block(\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Block-wise computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: OrderedDict[str, TensorType]
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products for the provided batch per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors_by_block(\n self,\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Block-wise computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block\n\n \"\"\"\n return self.block_mapper.interactions_from_transformed_grads(\n z_test_factors, self._create_batch(x, y), mode\n )\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.LissaInfluence.with_regularization","title":"with_regularization","text":"with_regularization(\n regularization: Union[float, Dict[str, Optional[float]]]\n) -> TorchComposableInfluence\n
Update the regularization parameter.
PARAMETER DESCRIPTION regularization
Either a positive float or a dictionary with the block names as keys and the regularization values as values.
TYPE: Union[float, Dict[str, Optional[float]]]
RETURNS DESCRIPTION TorchComposableInfluence
The modified instance
Source code in src/pydvl/influence/torch/influence_function_model.py
def with_regularization(\n self, regularization: Union[float, Dict[str, Optional[float]]]\n) -> TorchComposableInfluence:\n \"\"\"\n Update the regularization parameter.\n Args:\n regularization: Either a positive float or a dictionary with the\n block names as keys and the regularization values as values.\n\n Returns:\n The modified instance\n\n \"\"\"\n self._regularization_dict = self._build_regularization_dict(regularization)\n for k, reg in self._regularization_dict.items():\n self.block_mapper.composable_block_dict[k].op.regularization = reg\n return self\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.ArnoldiInfluence","title":"ArnoldiInfluence","text":"ArnoldiInfluence(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n hessian_regularization: float = 0.0,\n rank_estimate: int = 10,\n krylov_dimension: Optional[int] = None,\n tol: float = 1e-06,\n max_iter: Optional[int] = None,\n eigen_computation_on_gpu: bool = False,\n precompute_grad: bool = False,\n)\n
Bases: TorchInfluenceFunctionModel
Solves the linear system Hx = b, where H is the Hessian of the model's loss function and b is the given right-hand side vector. It employs the implicitly restarted Arnoldi method (https://en.wikipedia.org/wiki/Arnoldi_iteration) for computing a partial eigendecomposition, which is then used for the inversion, i.e.
\\[x = V D^{-1} V^T b\\] where \\(D\\) is a diagonal matrix with the top (in absolute value) rank_estimate eigenvalues of the Hessian and \\(V\\) contains the corresponding eigenvectors. For more information, see Arnoldi.
PARAMETER DESCRIPTION model
A PyTorch model. The Hessian will be calculated with respect to this model's parameters.
TYPE: Module
loss
A callable that takes the model's output and target as input and returns the scalar loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
hessian_regularization
Optional regularization parameter added to the Hessian-vector product for numerical stability.
TYPE: float
DEFAULT: 0.0
rank_estimate
The number of eigenvalues and corresponding eigenvectors to compute. Represents the desired rank of the Hessian approximation.
TYPE: int
DEFAULT: 10
krylov_dimension
The number of Krylov vectors to use for the Lanczos method. Defaults to min(model's number of parameters, max(2 times rank_estimate + 1, 20)).
TYPE: Optional[int]
DEFAULT: None
tol
The stopping criterion for the Lanczos algorithm. Ignored if low_rank_representation is provided.
TYPE: float
DEFAULT: 1e-06
max_iter
The maximum number of iterations for the Lanczos method. Ignored if low_rank_representation is provided.
TYPE: Optional[int]
DEFAULT: None
eigen_computation_on_gpu
If True, tries to execute the eigenpair approximation on the model's device via a CuPy implementation. Ensure the model size or rank_estimate is appropriate for device memory. If False, the eigenpair approximation is executed on the CPU by the SciPy wrapper around ARPACK.
TYPE: bool
DEFAULT: False
precompute_grad
If True, the full data gradient is precomputed and kept in memory, which can speed up the Hessian-vector product computation. Set this to False if you cannot afford to keep the full computation graph in memory.
TYPE: bool
DEFAULT: False
Source code in src/pydvl/influence/torch/influence_function_model.py
def __init__(\n self,\n model: nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n hessian_regularization: float = 0.0,\n rank_estimate: int = 10,\n krylov_dimension: Optional[int] = None,\n tol: float = 1e-6,\n max_iter: Optional[int] = None,\n eigen_computation_on_gpu: bool = False,\n precompute_grad: bool = False,\n):\n super().__init__(model, loss)\n self.hessian_regularization = hessian_regularization\n self.rank_estimate = rank_estimate\n self.tol = tol\n self.max_iter = max_iter\n self.krylov_dimension = krylov_dimension\n self.eigen_computation_on_gpu = eigen_computation_on_gpu\n self.precompute_grad = precompute_grad\n
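A minimal usage sketch for this constructor on a toy model, assuming synthetic data; rank_estimate is chosen small relative to the parameter count purely for illustration.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from pydvl.influence.torch import ArnoldiInfluence

torch.manual_seed(0)
x_train, y_train = torch.randn(64, 10), torch.randn(64, 1)
x_test, y_test = torch.randn(8, 10), torch.randn(8, 1)
model = torch.nn.Linear(10, 1)  # 11 parameters, so a rank-5 approximation is feasible

if_model = ArnoldiInfluence(
    model,
    torch.nn.functional.mse_loss,
    hessian_regularization=0.01,
    rank_estimate=5,  # keep only the 5 largest-magnitude eigenpairs of the Hessian
)
# fit() computes the low-rank decomposition of the Hessian once ...
if_model = if_model.fit(DataLoader(TensorDataset(x_train, y_train), batch_size=16))
# ... after which influence queries only require gradient computations.
values = if_model.influences(x_test, y_test, x_train, y_train)
```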
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.ArnoldiInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.ArnoldiInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: Tensor, y: Tensor) -> Tensor\n
Compute approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension (in case you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/torch/influence_function_model.py
def influence_factors(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:\n r\"\"\"\n Compute approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n return super().influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.ArnoldiInfluence.influences","title":"influences","text":"influences(\n x_test: Tensor,\n y_test: Tensor,\n x: Optional[Tensor] = None,\n y: Optional[Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Tensor\n
Compute the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x))\\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. For all input tensors it is assumed that the first dimension is the batch dimension (in case you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
y_test
label tensor to compute gradients
TYPE: Tensor
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[Tensor]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[Tensor]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences(\n self,\n x_test: torch.Tensor,\n y_test: torch.Tensor,\n x: Optional[torch.Tensor] = None,\n y: Optional[torch.Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> torch.Tensor:\n r\"\"\"\n Compute the approximation of\n\n \\[\n \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x))\\rangle\n \\]\n\n for the case of up-weighting influence, resp.\n\n \\[\n \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle\n \\]\n\n for the perturbation type influence case. For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n t: torch.Tensor = super().influences(x_test, y_test, x, y, mode=mode)\n return t\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.ArnoldiInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: Tensor,\n x: Tensor,\n y: Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Tensor\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension (in case you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION z_test_factors
pre-computed tensor, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\)
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences_from_factors(\n self,\n z_test_factors: torch.Tensor,\n x: torch.Tensor,\n y: torch.Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> torch.Tensor:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$. For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n z_test_factors: pre-computed tensor, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if mode == InfluenceMode.Up:\n return (\n z_test_factors.to(self.model_device)\n @ self._loss_grad(x.to(self.model_device), y.to(self.model_device)).T\n )\n elif mode == InfluenceMode.Perturbation:\n return torch.einsum(\n \"ia,j...a->ij...\",\n z_test_factors.to(self.model_device),\n self._flat_loss_mixed_grad(\n x.to(self.model_device), y.to(self.model_device)\n ),\n )\n else:\n raise UnsupportedInfluenceModeException(mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.ArnoldiInfluence.fit","title":"fit","text":"fit(data: DataLoader) -> ArnoldiInfluence\n
Fitting corresponds to the computation of the low rank decomposition
\\[ V D^{-1} V^T \\] of the Hessian defined by the provided data loader.
PARAMETER DESCRIPTION data
The data to compute the Hessian with.
TYPE: DataLoader
RETURNS DESCRIPTION ArnoldiInfluence
The fitted instance.
Source code in src/pydvl/influence/torch/influence_function_model.py
@log_duration(log_level=logging.INFO)\ndef fit(self, data: DataLoader) -> ArnoldiInfluence:\n r\"\"\"\n Fitting corresponds to the computation of the low rank decomposition\n\n \\[ V D^{-1} V^T \\]\n\n of the Hessian defined by the provided data loader.\n\n Args:\n data: The data to compute the Hessian with.\n\n Returns:\n The fitted instance.\n\n \"\"\"\n low_rank_representation = model_hessian_low_rank(\n self.model,\n self.loss,\n data,\n hessian_perturbation=0.0, # regularization is applied, when computing values\n rank_estimate=self.rank_estimate,\n krylov_dimension=self.krylov_dimension,\n tol=self.tol,\n max_iter=self.max_iter,\n eigen_computation_on_gpu=self.eigen_computation_on_gpu,\n precompute_grad=self.precompute_grad,\n )\n self.low_rank_representation = low_rank_representation.to(self.model_device)\n return self\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence","title":"EkfacInfluence","text":"EkfacInfluence(\n model: Module,\n update_diagonal: bool = False,\n hessian_regularization: float = 0.0,\n progress: bool = False,\n)\n
Bases: TorchInfluenceFunctionModel
Approximately solves the linear system Hx = b, where H is the Hessian of a model with the empirical categorical cross-entropy as loss function and b is the given right-hand side vector. It employs the EK-FAC method, which is based on the Kronecker factorization of the Hessian.
Contrary to the other influence function methods, this implementation can only be used for classification tasks with a cross entropy loss function. However, it is much faster than the other methods and can be used efficiently for very large datasets and models. For more information, see Eigenvalue Corrected K-FAC.
PARAMETER DESCRIPTION model
A PyTorch model. The Hessian will be calculated with respect to this model's parameters.
TYPE: Module
update_diagonal
If True, the diagonal values in the EK-FAC representation are refitted from the training data after calculating the K-FAC blocks. This provides a more accurate approximation of the Hessian, but it is computationally more expensive.
TYPE: bool
DEFAULT: False
hessian_regularization
Regularization of the Hessian.
TYPE: float
DEFAULT: 0.0
progress
If True, display progress bars.
TYPE: bool
DEFAULT: False
Source code in src/pydvl/influence/torch/influence_function_model.py
def __init__(\n self,\n model: nn.Module,\n update_diagonal: bool = False,\n hessian_regularization: float = 0.0,\n progress: bool = False,\n):\n super().__init__(model, torch.nn.functional.cross_entropy)\n self.hessian_regularization = hessian_regularization\n self.update_diagonal = update_diagonal\n self.active_layers = self._parse_active_layers()\n self.progress = progress\n
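A minimal usage sketch, assuming a small classifier and synthetic data; since EK-FAC is tied to cross-entropy classification, no loss argument is passed, and the fit-then-query workflow is assumed to mirror the other models documented above.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from pydvl.influence.torch import EkfacInfluence

torch.manual_seed(0)
x_train = torch.randn(64, 10)
y_train = torch.randint(0, 3, (64,))  # integer class labels for cross-entropy
x_test = torch.randn(8, 10)
y_test = torch.randint(0, 3, (8,))

# Only linear layers carry parameters here, as required by the K-FAC blocks.
model = torch.nn.Sequential(
    torch.nn.Linear(10, 16), torch.nn.ReLU(), torch.nn.Linear(16, 3)
)
if_model = EkfacInfluence(model, update_diagonal=True, hessian_regularization=0.01)
if_model = if_model.fit(DataLoader(TensorDataset(x_train, y_train), batch_size=16))

values = if_model.influences(x_test, y_test, x_train, y_train)  # shape (8, 64)
```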
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: Tensor, y: Tensor) -> Tensor\n
Compute approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension (in case you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/torch/influence_function_model.py
def influence_factors(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:\n r\"\"\"\n Compute approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n return super().influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.influences","title":"influences","text":"influences(\n x_test: Tensor,\n y_test: Tensor,\n x: Optional[Tensor] = None,\n y: Optional[Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Tensor\n
Compute the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x))\\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. For all input tensors it is assumed that the first dimension is the batch dimension (in case you want to provide a single sample z, call z.unsqueeze(0) if no batch dimension is present).
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
y_test
label tensor to compute gradients
TYPE: Tensor
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[Tensor]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[Tensor]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences(\n self,\n x_test: torch.Tensor,\n y_test: torch.Tensor,\n x: Optional[torch.Tensor] = None,\n y: Optional[torch.Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> torch.Tensor:\n r\"\"\"\n Compute the approximation of\n\n \\[\n \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x))\\rangle\n \\]\n\n for the case of up-weighting influence, resp.\n\n \\[\n \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle\n \\]\n\n for the perturbation type influence case. For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n t: torch.Tensor = super().influences(x_test, y_test, x, y, mode=mode)\n return t\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: Tensor,\n x: Tensor,\n y: Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Tensor\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension (if you want to provide a single sample z without a batch dimension, call z.unsqueeze(0)).
PARAMETER DESCRIPTION z_test_factors
pre-computed tensor, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\)
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Tensor
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences_from_factors(\n self,\n z_test_factors: torch.Tensor,\n x: torch.Tensor,\n y: torch.Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> torch.Tensor:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$. For all input tensors it is assumed,\n that the first dimension is the batch dimension (in case, you want to provide\n a single sample z, call z.unsqueeze(0) if no batch dimension is present).\n\n Args:\n z_test_factors: pre-computed tensor, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if mode == InfluenceMode.Up:\n return (\n z_test_factors.to(self.model_device)\n @ self._loss_grad(x.to(self.model_device), y.to(self.model_device)).T\n )\n elif mode == InfluenceMode.Perturbation:\n return torch.einsum(\n \"ia,j...a->ij...\",\n z_test_factors.to(self.model_device),\n self._flat_loss_mixed_grad(\n x.to(self.model_device), y.to(self.model_device)\n ),\n )\n else:\n raise UnsupportedInfluenceModeException(mode)\n
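A minimal usage sketch, not taken verbatim from the docs: it shows the pre-compute/reuse pattern behind influences_from_factors. Here `infl` is assumed to be an already fitted EkfacInfluence instance, and `x_test`, `y_test`, `train_loader` are user-provided placeholders.

```python
import torch

# approx. H^{-1} gradients of the test losses, computed once
z_test = infl.influence_factors(x_test, y_test)
scores = []
for x_batch, y_batch in train_loader:
    # up-weighting mode: one row per test point, one column per training point
    scores.append(infl.influences_from_factors(z_test, x_batch, y_batch))
all_scores = torch.cat(scores, dim=1)  # (n_test, n_train) influence matrix
```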
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.fit","title":"fit","text":"fit(data: DataLoader) -> EkfacInfluence\n
Computes the KFAC blocks for each layer of the model using the provided data, then creates an EkfacRepresentation object that stores the KFAC blocks for each layer, their eigenvalue decompositions and diagonal values.
Source code in src/pydvl/influence/torch/influence_function_model.py
@log_duration(log_level=logging.INFO)\ndef fit(self, data: DataLoader) -> EkfacInfluence:\n \"\"\"\n Compute the KFAC blocks for each layer of the model, using the provided data.\n It then creates an EkfacRepresentation object that stores the KFAC blocks for\n each layer, their eigenvalue decomposition and diagonal values.\n \"\"\"\n forward_x, grad_y = self._get_kfac_blocks(data)\n layers_evecs_a = {}\n layers_evect_g = {}\n layers_diags = {}\n for key in self.active_layers.keys():\n evals_a, evecs_a = safe_torch_linalg_eigh(forward_x[key])\n evals_g, evecs_g = safe_torch_linalg_eigh(grad_y[key])\n layers_evecs_a[key] = evecs_a\n layers_evect_g[key] = evecs_g\n layers_diags[key] = torch.kron(evals_g.view(-1, 1), evals_a.view(-1, 1))\n\n self.ekfac_representation = EkfacRepresentation(\n self.active_layers.keys(),\n self.active_layers.values(),\n layers_evecs_a.values(),\n layers_evect_g.values(),\n layers_diags.values(),\n )\n if self.update_diagonal:\n self._update_diag(data)\n return self\n
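A hedged end-to-end sketch of the fit-then-query workflow. `model`, `train_loader`, `x_test`, `y_test`, `x_train`, `y_train` are placeholders, and the import path mirrors the module shown above (a shorter re-export may also be available). Note that EkfacInfluence fixes the loss to cross-entropy, as the constructor above shows.

```python
from pydvl.influence.torch.influence_function_model import EkfacInfluence

infl = EkfacInfluence(model, update_diagonal=True, hessian_regularization=1e-3)
infl = infl.fit(train_loader)                                # computes the (E)KFAC blocks
values = infl.influences(x_test, y_test, x_train, y_train)   # up-weighting influences
```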
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.influences_by_layer","title":"influences_by_layer","text":"influences_by_layer(\n x_test: Tensor,\n y_test: Tensor,\n x: Optional[Tensor] = None,\n y: Optional[Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Dict[str, Tensor]\n
Compute the influence of the data on the test data for each layer of the model.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Tensor
y_test
label tensor to compute gradients
TYPE: Tensor
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: Optional[Tensor]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[Tensor]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary containing the influence of the data on the test data for each
Dict[str, Tensor]
layer of the model, with the layer name as key.
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences_by_layer(\n self,\n x_test: torch.Tensor,\n y_test: torch.Tensor,\n x: Optional[torch.Tensor] = None,\n y: Optional[torch.Tensor] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Dict[str, torch.Tensor]:\n r\"\"\"\n Compute the influence of the data on the test data for each layer of the model.\n\n Args:\n x_test: model input to use in the gradient computations of\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n A dictionary containing the influence of the data on the test data for each\n layer of the model, with the layer name as key.\n \"\"\"\n if not self.is_fitted:\n raise ValueError(\n \"Instance must be fitted before calling influence methods on it\"\n )\n\n if x is None:\n if y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n\n return self._symmetric_values_by_layer(\n x_test.to(self.model_device),\n y_test.to(self.model_device),\n mode,\n )\n\n if y is None:\n raise ValueError(\n \"Providing model input x without providing labels y is not supported\"\n )\n\n return self._non_symmetric_values_by_layer(\n x_test.to(self.model_device),\n y_test.to(self.model_device),\n x.to(self.model_device),\n y.to(self.model_device),\n mode,\n )\n
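A short sketch with assumed names: layer-resolved influences from a fitted EkfacInfluence instance. The keys of the returned dictionary are the model's layer names.

```python
per_layer = infl.influences_by_layer(x_test, y_test, x_train, y_train)
for name, vals in per_layer.items():
    print(name, tuple(vals.shape))
# Under the layer-wise (block-diagonal) approximation, summing over layers
# should recover the aggregate influence values.
total = sum(per_layer.values())
```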
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.influence_factors_by_layer","title":"influence_factors_by_layer","text":"influence_factors_by_layer(x: Tensor, y: Tensor) -> Dict[str, Tensor]\n
Computes the approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] for each layer of the model separately.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary containing the influence factors for each layer of the model,
Dict[str, Tensor]
with the layer name as key.
Source code in src/pydvl/influence/torch/influence_function_model.py
def influence_factors_by_layer(\n self,\n x: torch.Tensor,\n y: torch.Tensor,\n) -> Dict[str, torch.Tensor]:\n r\"\"\"\n Computes the approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n for each layer of the model separately.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n A dictionary containing the influence factors for each layer of the model,\n with the layer name as key.\n \"\"\"\n if not self.is_fitted:\n raise ValueError(\n \"Instance must be fitted before calling influence methods on it\"\n )\n\n return self._solve_hvp_by_layer(\n self._loss_grad(x.to(self.model_device), y.to(self.model_device)),\n self.ekfac_representation,\n self.hessian_regularization,\n )\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.influences_from_factors_by_layer","title":"influences_from_factors_by_layer","text":"influences_from_factors_by_layer(\n z_test_factors: Dict[str, Tensor],\n x: Tensor,\n y: Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Dict[str, Tensor]\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case for each layer of the model separately. The gradients are meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed tensor, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: Dict[str, Tensor]
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\)
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary containing the influence of the data on the test data
Dict[str, Tensor]
for each layer of the model, with the layer name as key.
Source code in src/pydvl/influence/torch/influence_function_model.py
def influences_from_factors_by_layer(\n self,\n z_test_factors: Dict[str, torch.Tensor],\n x: torch.Tensor,\n y: torch.Tensor,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> Dict[str, torch.Tensor]:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case for each layer of the model\n separately. The gradients are meant to be per sample of the batch $(x,\n y)$.\n\n Args:\n z_test_factors: pre-computed tensor, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n A dictionary containing the influence of the data on the test data\n for each layer of the model, with the layer name as key.\n \"\"\"\n if mode == InfluenceMode.Up:\n total_grad = self._loss_grad(\n x.to(self.model_device), y.to(self.model_device)\n )\n start_idx = 0\n influences = {}\n for layer_id, layer_z_test in z_test_factors.items():\n end_idx = start_idx + layer_z_test.shape[1]\n influences[layer_id] = (\n layer_z_test.to(self.model_device)\n @ total_grad[:, start_idx:end_idx].T\n )\n start_idx = end_idx\n return influences\n elif mode == InfluenceMode.Perturbation:\n total_mixed_grad = self._flat_loss_mixed_grad(\n x.to(self.model_device), y.to(self.model_device)\n )\n start_idx = 0\n influences = {}\n for layer_id, layer_z_test in z_test_factors.items():\n end_idx = start_idx + layer_z_test.shape[1]\n influences[layer_id] = torch.einsum(\n \"ia,j...a->ij...\",\n layer_z_test.to(self.model_device),\n total_mixed_grad[:, start_idx:end_idx],\n )\n start_idx = end_idx\n return influences\n else:\n raise UnsupportedInfluenceModeException(mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.EkfacInfluence.explore_hessian_regularization","title":"explore_hessian_regularization","text":"explore_hessian_regularization(\n x: Tensor, y: Tensor, regularization_values: List[float]\n) -> Dict[float, Dict[str, Tensor]]\n
Efficiently computes the influence for input x and label y for each layer of the model, for different values of the Hessian regularization parameter. This is done by computing the gradient of the loss function for the input x and label y only once and then solving the Hessian vector product for each regularization value. This is useful for finding the optimal regularization value and for exploring how robust the influence values are to changes in it.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: Tensor
y
label tensor to compute gradients
TYPE: Tensor
regularization_values
list of regularization values to use
TYPE: List[float]
RETURNS DESCRIPTION Dict[float, Dict[str, Tensor]]
A dictionary with keys being the regularization values and values
Dict[float, Dict[str, Tensor]]
being dictionaries containing the influences for each layer of the model,
Dict[float, Dict[str, Tensor]]
with the layer name as key.
Source code in src/pydvl/influence/torch/influence_function_model.py
def explore_hessian_regularization(\n self,\n x: torch.Tensor,\n y: torch.Tensor,\n regularization_values: List[float],\n) -> Dict[float, Dict[str, torch.Tensor]]:\n \"\"\"\n Efficiently computes the influence for input x and label y for each layer of the\n model, for different values of the hessian regularization parameter. This is done\n by computing the gradient of the loss function for the input x and label y only once\n and then solving the Hessian Vector Product for each regularization value. This is\n useful for finding the optimal regularization value and for exploring\n how robust the influence values are to changes in the regularization value.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n regularization_values: list of regularization values to use\n\n Returns:\n A dictionary containing with keys being the regularization values and values\n being dictionaries containing the influences for each layer of the model,\n with the layer name as key.\n \"\"\"\n grad = self._loss_grad(x.to(self.model_device), y.to(self.model_device))\n influences_by_reg_value = {}\n for reg_value in regularization_values:\n reg_factors = self._solve_hvp_by_layer(\n grad, self.ekfac_representation, reg_value\n )\n values = {}\n start_idx = 0\n for layer_id, layer_fac in reg_factors.items():\n end_idx = start_idx + layer_fac.shape[1]\n values[layer_id] = layer_fac @ grad[:, start_idx:end_idx].T\n start_idx = end_idx\n influences_by_reg_value[reg_value] = values\n return influences_by_reg_value\n
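A usage sketch with assumed names: scan a grid of damping values with a fitted EkfacInfluence instance `infl` and inspect how the layer-wise values react.

```python
values_by_reg = infl.explore_hessian_regularization(
    x, y, regularization_values=[1e-4, 1e-3, 1e-2, 1e-1]
)
for reg, per_layer in values_by_reg.items():
    # report the magnitude of the influence values per layer for each damping value
    norms = {name: v.norm().item() for name, v in per_layer.items()}
    print(reg, norms)
```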
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence","title":"NystroemSketchInfluence","text":"NystroemSketchInfluence(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n regularization: Union[float, Dict[str, float]],\n rank: int,\n block_structure: Union[\n BlockMode, OrderedDict[str, List[str]]\n ] = BlockMode.FULL,\n second_order_mode: SecondOrderMode = SecondOrderMode.HESSIAN,\n)\n
Bases: TorchComposableInfluence[LowRankOperator]
Given a model and training data, it uses a low-rank approximation of the Hessian (derived via a random-projection Nyström approximation) in combination with the Sherman–Morrison–Woodbury formula to calculate the inverse Hessian vector product. More concretely, it computes a low-rank approximation
\\[\\begin{align*} H_{\\text{nys}} &= (H\\Omega)(\\Omega^TH\\Omega)^{+}(H\\Omega)^T \\\\\\ &= U \\Lambda U^T \\end{align*}\\] in factorized form and approximates the action of the inverse Hessian via
\\[ (H_{\\text{nys}} + \\lambda I)^{-1} = U(\\Lambda+\\lambda I)^{-1}U^T + \\frac{1}{\\lambda}(I-UU^T). \\] PARAMETER DESCRIPTION model
A PyTorch model. The Hessian will be calculated with respect to this model's parameters.
TYPE: Module
loss
A callable that takes the model's output and target as input and returns the scalar loss.
TYPE: Callable[[Tensor, Tensor], Tensor]
regularization
Optional regularization parameter added to the Hessian-vector product for numerical stability.
TYPE: Union[float, Dict[str, float]]
rank
rank of the low-rank approximation
TYPE: int
Source code in src/pydvl/influence/torch/influence_function_model.py
def __init__(\n self,\n model: torch.nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n regularization: Union[float, Dict[str, float]],\n rank: int,\n block_structure: Union[BlockMode, OrderedDict[str, List[str]]] = BlockMode.FULL,\n second_order_mode: SecondOrderMode = SecondOrderMode.HESSIAN,\n):\n super().__init__(\n model,\n block_structure,\n regularization=cast(\n Union[float, Dict[str, Optional[float]]], regularization\n ),\n )\n self.second_order_mode = second_order_mode\n self.rank = rank\n self.loss = loss\n
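A construction sketch under assumptions: `model` and `train_loader` are user-provided and cross-entropy is an appropriate loss; the import path mirrors the module shown above.

```python
import torch.nn.functional as F
from pydvl.influence.torch.influence_function_model import NystroemSketchInfluence

infl = NystroemSketchInfluence(
    model,
    loss=F.cross_entropy,
    regularization=1e-3,  # the damping lambda in the formula above
    rank=32,              # rank of the Nystroem sketch of the Hessian
)
infl = infl.fit(train_loader)
```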
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence.fit","title":"fit","text":"fit(data: DataLoaderType) -> InfluenceFunctionModel\n
Fits to the provided data by internally creating a block mapper instance from it. Args: data: iterable of tensors
RETURNS DESCRIPTION InfluenceFunctionModel
Fitted instance
Source code in src/pydvl/influence/base_influence_function_model.py
@log_duration(log_level=logging.INFO)\ndef fit(self, data: DataLoaderType) -> InfluenceFunctionModel:\n \"\"\"\n Fitting to provided data, by internally creating a block mapper instance from\n it.\n Args:\n data: iterable of tensors\n\n Returns:\n Fitted instance\n \"\"\"\n self.block_mapper = self._create_block_mapper(data)\n return self\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: TensorType, y: TensorType) -> TensorType\n
Computes the approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/base_influence_function_model.py
def influence_factors(self, x: TensorType, y: TensorType) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return self._influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence.influences","title":"influences","text":"influences(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computes the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{test}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
def influences(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n if x is None and y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n\n if x is not None and y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n\n return self._influences(x_test, y_test, x, y, mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors(\n self,\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n tensors = self.block_mapper.generate_interactions_from_transformed_grads(\n z_test_factors,\n self._create_batch(x, y),\n mode,\n )\n result: TensorType = next(tensors)\n for tensor in tensors:\n result = result + tensor\n return result\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence.influences_by_block","title":"influences_by_block","text":"influences_by_block(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise influence values for the provided data, i.e. an approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of the approximation of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{test}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_by_block(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise influence values for the provided data, i.e. an\n approximation of\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of the approximation of\n $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block.\n\n \"\"\"\n left_batch = self._create_batch(x_test, y_test)\n\n if x is None:\n if y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n right_batch = left_batch\n else:\n if y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n right_batch = self._create_batch(x, y)\n\n return self.block_mapper.interactions(left_batch, right_batch, mode)\n
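A brief sketch of the block-wise interface: `infl` is assumed to be a fitted composable influence model, e.g. one constructed with a custom OrderedDict block structure, and the tensor names are placeholders.

```python
per_block = infl.influences_by_block(x_test, y_test, x_train, y_train)
for block_name, block_values in per_block.items():
    # each entry is an (n_test, n_train) tensor for that parameter block
    print(block_name, tuple(block_values.shape))
```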
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence.influence_factors_by_block","title":"influence_factors_by_block","text":"influence_factors_by_block(\n x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise
OrderedDict[str, TensorType]
approximate inverse Hessian matrix vector products per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influence_factors_by_block(\n self, x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Ordered dictionary of tensors representing the element-wise\n approximate inverse Hessian matrix vector products per block.\n\n \"\"\"\n return self.block_mapper.transformed_grads(self._create_batch(x, y))\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.NystroemSketchInfluence.influences_from_factors_by_block","title":"influences_from_factors_by_block","text":"influences_from_factors_by_block(\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Block-wise computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: OrderedDict[str, TensorType]
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors_by_block(\n self,\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Block-wise computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block\n\n \"\"\"\n return self.block_mapper.interactions_from_transformed_grads(\n z_test_factors, self._create_batch(x, y), mode\n )\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence","title":"InverseHarmonicMeanInfluence","text":"InverseHarmonicMeanInfluence(\n model: Module,\n loss: LossType,\n regularization: Union[float, Dict[str, float]],\n block_structure: Union[\n BlockMode, OrderedDict[str, List[str]]\n ] = BlockMode.FULL,\n)\n
Bases: TorchComposableInfluence[InverseHarmonicMeanOperator]
This implementation replaces the inverse Hessian matrix in the influence computation with an approximation of the inverse Gauss-Newton vector product.
Viewing the damped Gauss-Newton matrix
\\[\\begin{align*} G_{\\lambda}(\\theta) &= \\frac{1}{N}\\sum_{i}^N\\nabla_{\\theta}\\ell (x_i,y_i; \\theta) \\nabla_{\\theta}\\ell (x_i, y_i; \\theta)^t + \\lambda \\operatorname{I}, \\\\\\ \\ell(x,y; \\theta) &= \\text{loss}(\\text{model}(x; \\theta), y) \\end{align*}\\] as an arithmetic mean of the rank-\\(1\\) updates, this implementation replaces it with the harmonic mean of the rank-\\(1\\) updates, i.e.
\\[ \\tilde{G}_{\\lambda}(\\theta) = \\left(N \\cdot \\sum_{i=1}^N \\left( \\nabla_{\\theta}\\ell (x_i,y_i; \\theta) \\nabla_{\\theta}\\ell (x_i,y_i; \\theta)^t + \\lambda \\operatorname{I}\\right)^{-1} \\right)^{-1}\\] and uses the matrix
\\[ \\tilde{G}_{\\lambda}^{-1}(\\theta)\\] instead of the inverse Hessian.
In other words, it switches the order of summation and inversion, which results in the inverse harmonic mean
of the rank-\\(1\\) updates. The results are averaged over the batches provided by the data loader.
The inverses of the rank-\\(1\\) updates are not calculated explicitly, but instead a vectorized version of the Sherman–Morrison formula is applied.
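For reference, applied to a single damped rank-\\(1\\) update with \\(g_i = \\nabla_{\\theta}\\ell(x_i, y_i; \\theta)\\), the Sherman–Morrison identity gives \\[ \\left(g_i g_i^t + \\lambda \\operatorname{I}\\right)^{-1} = \\frac{1}{\\lambda}\\left(\\operatorname{I} - \\frac{g_i g_i^t}{\\lambda + g_i^t g_i}\\right), \\] which is what allows the harmonic-mean inverse to be evaluated without forming any explicit matrix inverses.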
For more information, see Inverse Harmonic Mean.
PARAMETER DESCRIPTION model
The model.
TYPE: Module
loss
The loss function.
TYPE: LossType
regularization
The regularization parameter. If a dictionary is provided, its keys must match the blocking structure and the specification must be complete, i.e. every block needs a positive regularization value; this differs from the description in block-diagonal approximation.
TYPE: Union[float, Dict[str, float]]
block_structure
The blocking structure, either a pre-defined enum or a custom block structure, see the information regarding block-diagonal approximation.
TYPE: Union[BlockMode, OrderedDict[str, List[str]]]
DEFAULT: FULL
Source code in src/pydvl/influence/torch/influence_function_model.py
def __init__(\n self,\n model: torch.nn.Module,\n loss: LossType,\n regularization: Union[float, Dict[str, float]],\n block_structure: Union[BlockMode, OrderedDict[str, List[str]]] = BlockMode.FULL,\n):\n super().__init__(\n model,\n block_structure,\n regularization=cast(\n Union[float, Dict[str, Optional[float]]], regularization\n ),\n )\n self.loss = loss\n
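A construction sketch with assumed names: a custom two-block structure over parameter names and a complete per-block regularization dictionary. The parameter names below are placeholders; use the names returned by `model.named_parameters()`.

```python
from collections import OrderedDict
import torch.nn.functional as F
from pydvl.influence.torch.influence_function_model import InverseHarmonicMeanInfluence

blocks = OrderedDict(
    first=["linear1.weight", "linear1.bias"],
    second=["linear2.weight", "linear2.bias"],
)
infl = InverseHarmonicMeanInfluence(
    model,
    loss=F.cross_entropy,
    regularization={"first": 1e-2, "second": 1e-3},  # every block needs a value
    block_structure=blocks,
)
infl = infl.fit(train_loader)
```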
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.fit","title":"fit","text":"fit(data: DataLoaderType) -> InfluenceFunctionModel\n
Fits to the provided data by internally creating a block mapper instance from it. Args: data: iterable of tensors
RETURNS DESCRIPTION InfluenceFunctionModel
Fitted instance
Source code in src/pydvl/influence/base_influence_function_model.py
@log_duration(log_level=logging.INFO)\ndef fit(self, data: DataLoaderType) -> InfluenceFunctionModel:\n \"\"\"\n Fitting to provided data, by internally creating a block mapper instance from\n it.\n Args:\n data: iterable of tensors\n\n Returns:\n Fitted instance\n \"\"\"\n self.block_mapper = self._create_block_mapper(data)\n return self\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.fit_required","title":"fit_required staticmethod
","text":"fit_required(method)\n
Decorator to enforce the fitted check
Source code in src/pydvl/influence/base_influence_function_model.py
@staticmethod\ndef fit_required(method):\n \"\"\"Decorator to enforce the fitted check\"\"\"\n\n @wraps(method)\n def wrapper(self, *args, **kwargs):\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return method(self, *args, **kwargs)\n\n return wrapper\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.influence_factors","title":"influence_factors","text":"influence_factors(x: TensorType, y: TensorType) -> TensorType\n
Computes the approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\). For all input tensors it is assumed that the first dimension is the batch dimension.
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise inverse Hessian matrix vector products
Source code in src/pydvl/influence/base_influence_function_model.py
def influence_factors(self, x: TensorType, y: TensorType) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n For all input tensors it is assumed,\n that the first dimension is the batch dimension.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Tensor representing the element-wise inverse Hessian matrix vector products\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n return self._influence_factors(x, y)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.influences","title":"influences","text":"influences(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computes the approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{test}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
def influences(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computes the approximation of\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n if x is None and y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n\n if x is not None and y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n\n return self._influences(x_test, y_test, x, y, mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.influences_from_factors","title":"influences_from_factors","text":"influences_from_factors(\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType\n
Computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: TensorType
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION TensorType
Tensor representing the element-wise scalar products for the provided batch
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors(\n self,\n z_test_factors: TensorType,\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> TensorType:\n r\"\"\"\n Computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Tensor representing the element-wise scalar products for the provided batch\n\n \"\"\"\n tensors = self.block_mapper.generate_interactions_from_transformed_grads(\n z_test_factors,\n self._create_batch(x, y),\n mode,\n )\n result: TensorType = next(tensors)\n for tensor in tensors:\n result = result + tensor\n return result\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.influences_by_block","title":"influences_by_block","text":"influences_by_block(\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise influence values for the provided data, i.e. an approximation of
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}})), \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test})), \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case.
PARAMETER DESCRIPTION x_test
model input to use in the gradient computations of the approximation of \\(H^{-1}\\nabla_{\\theta} \\ell(y_{test}, f_{\\theta}(x_{test}))\\)
TYPE: TensorType
y_test
label tensor to compute gradients
TYPE: TensorType
x
optional model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{test}\\)
TYPE: Optional[TensorType]
DEFAULT: None
y
optional label tensor to compute gradients
TYPE: Optional[TensorType]
DEFAULT: None
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_by_block(\n self,\n x_test: TensorType,\n y_test: TensorType,\n x: Optional[TensorType] = None,\n y: Optional[TensorType] = None,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise influence values for the provided data, i.e. an\n approximation of\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}})),\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test})),\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case.\n\n Args:\n x_test: model input to use in the gradient computations\n of the approximation of\n $H^{-1}\\nabla_{theta} \\ell(y_{test}, f_{\\theta}(x_{test}))$\n y_test: label tensor to compute gradients\n x: optional model input to use in the gradient computations\n $\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{test}$\n y: optional label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block.\n\n \"\"\"\n left_batch = self._create_batch(x_test, y_test)\n\n if x is None:\n if y is not None:\n raise ValueError(\n \"Providing labels y, without providing model input x \"\n \"is not supported\"\n )\n right_batch = left_batch\n else:\n if y is None:\n raise ValueError(\n \"Providing model input x, without providing labels y \"\n \"is not supported\"\n )\n right_batch = self._create_batch(x, y)\n\n return self.block_mapper.interactions(left_batch, right_batch, mode)\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.influence_factors_by_block","title":"influence_factors_by_block","text":"influence_factors_by_block(\n x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]\n
Compute the block-wise approximation of
\\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\] where the gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION x
model input to use in the gradient computations
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise
OrderedDict[str, TensorType]
approximate inverse Hessian matrix vector products per block.
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influence_factors_by_block(\n self, x: TensorType, y: TensorType\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Compute the block-wise approximation of\n\n \\[ H^{-1}\\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\]\n\n where the gradient is meant to be per sample of the batch $(x, y)$.\n\n Args:\n x: model input to use in the gradient computations\n y: label tensor to compute gradients\n\n Returns:\n Ordered dictionary of tensors representing the element-wise\n approximate inverse Hessian matrix vector products per block.\n\n \"\"\"\n return self.block_mapper.transformed_grads(self._create_batch(x, y))\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.influences_from_factors_by_block","title":"influences_from_factors_by_block","text":"influences_from_factors_by_block(\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]\n
Block-wise computation of
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the case of up-weighting influence, resp.
\\[ \\langle z_{\\text{test_factors}}, \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\] for the perturbation type influence case. The gradient is meant to be per sample of the batch \\((x, y)\\).
PARAMETER DESCRIPTION z_test_factors
pre-computed array, approximating \\(H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}}, f_{\\theta}(x_{\\text{test}}))\\)
TYPE: OrderedDict[str, TensorType]
x
model input to use in the gradient computations \\(\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), resp. \\(\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))\\), if None, use \\(x=x_{\\text{test}}\\)
TYPE: TensorType
y
label tensor to compute gradients
TYPE: TensorType
mode
enum value of InfluenceMode
TYPE: InfluenceMode
DEFAULT: Up
RETURNS DESCRIPTION OrderedDict[str, TensorType]
Ordered dictionary of tensors representing the element-wise scalar products
OrderedDict[str, TensorType]
for the provided batch per block
Source code in src/pydvl/influence/base_influence_function_model.py
@InfluenceFunctionModel.fit_required\ndef influences_from_factors_by_block(\n self,\n z_test_factors: OrderedDict[str, TensorType],\n x: TensorType,\n y: TensorType,\n mode: InfluenceMode = InfluenceMode.Up,\n) -> OrderedDict[str, TensorType]:\n r\"\"\"\n Block-wise computation of\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the case of up-weighting influence, resp.\n\n \\[ \\langle z_{\\text{test_factors}},\n \\nabla_{x} \\nabla_{\\theta} \\ell(y, f_{\\theta}(x)) \\rangle \\]\n\n for the perturbation type influence case. The gradient is meant to be per sample\n of the batch $(x, y)$.\n\n Args:\n z_test_factors: pre-computed array, approximating\n $H^{-1}\\nabla_{\\theta} \\ell(y_{\\text{test}},\n f_{\\theta}(x_{\\text{test}}))$\n x: model input to use in the gradient computations\n $\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n resp. $\\nabla_{x}\\nabla_{\\theta}\\ell(y, f_{\\theta}(x))$,\n if None, use $x=x_{\\text{test}}$\n y: label tensor to compute gradients\n mode: enum value of [InfluenceMode]\n [pydvl.influence.base_influence_function_model.InfluenceMode]\n\n Returns:\n Ordered dictionary of tensors representing the element-wise scalar products\n for the provided batch per block\n\n \"\"\"\n return self.block_mapper.interactions_from_transformed_grads(\n z_test_factors, self._create_batch(x, y), mode\n )\n
"},{"location":"api/pydvl/influence/torch/influence_function_model/#pydvl.influence.torch.influence_function_model.InverseHarmonicMeanInfluence.with_regularization","title":"with_regularization","text":"with_regularization(\n regularization: Union[float, Dict[str, Optional[float]]]\n) -> TorchComposableInfluence\n
Update the regularization parameter.
PARAMETER DESCRIPTION regularization
Either a positive float or a dictionary with the block names as keys and the regularization values as values.
TYPE: Union[float, Dict[str, Optional[float]]]
RETURNS DESCRIPTION TorchComposableInfluence
The modified instance
Source code in src/pydvl/influence/torch/influence_function_model.py
def with_regularization(\n self, regularization: Union[float, Dict[str, Optional[float]]]\n) -> TorchComposableInfluence:\n \"\"\"\n Update the regularization parameter.\n Args:\n regularization: Either a positive float or a dictionary with the\n block names as keys and the regularization values as values.\n\n Returns:\n The modified instance\n\n \"\"\"\n self._regularization_dict = self._build_regularization_dict(regularization)\n for k, reg in self._regularization_dict.items():\n self.block_mapper.composable_block_dict[k].op.regularization = reg\n return self\n
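To make the block-wise quantities above concrete, here is a self-contained sketch (plain PyTorch with made-up block names and random data, not the pyDVL API) of the scalar products that the up-weighting case computes per block:

```python
from collections import OrderedDict

import torch

torch.manual_seed(0)

# Hypothetical blocks with their (flattened) parameter dimensions.
block_dims = OrderedDict([("layer_0", 8), ("layer_1", 4)])

n_test, n_train = 3, 5

# Stand-ins for the pre-computed factors H^{-1} grad(test) and the
# per-sample training gradients, one tensor per block.
test_factors = OrderedDict(
    (name, torch.randn(n_test, dim)) for name, dim in block_dims.items()
)
train_grads = OrderedDict(
    (name, torch.randn(n_train, dim)) for name, dim in block_dims.items()
)

# Up-weighting influence per block: scalar products between every test
# factor and every training gradient, one (n_test, n_train) matrix per block.
influences_per_block = OrderedDict(
    (name, test_factors[name] @ train_grads[name].T) for name in block_dims
)

for name, values in influences_per_block.items():
    print(name, tuple(values.shape))  # (3, 5) per block
```

Summing the per-block matrices would give the block-diagonal approximation of the overall influence values.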
"},{"location":"api/pydvl/influence/torch/operator/","title":"Operator","text":""},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator","title":"pydvl.influence.torch.operator","text":""},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.GaussNewtonOperator","title":"GaussNewtonOperator","text":"GaussNewtonOperator(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n dataloader: DataLoader,\n restrict_to: Optional[Dict[str, Parameter]] = None,\n)\n
Bases: _AveragingBatchOperator[GaussNewtonBatchOperation, PointAveraging]
Given a model and loss function computes the Gauss-Newton vector or matrix product with respect to the model parameters on a batch, i.e.
\\[\\begin{align*} G(\\text{model}, \\text{loss}, b, \\theta) &\\cdot v, \\\\\\ G(\\text{model}, \\text{loss}, b, \\theta) &= \\frac{1}{|b|}\\sum_{(x, y) \\in b}\\nabla_{\\theta}\\ell (x,y; \\theta) \\nabla_{\\theta}\\ell (x,y; \\theta)^t, \\\\\\ \\ell(x,y; \\theta) &= \\text{loss}(\\text{model}(x; \\theta), y) \\end{align*}\\] where model is a torch.nn.Module and \\(v\\) is a vector or matrix, and average the results over the batches provided by the data loader.
PARAMETER DESCRIPTION model
The model.
TYPE: Module
loss
The loss function.
TYPE: Callable[[Tensor, Tensor], Tensor]
dataloader
The data loader providing batches of data.
TYPE: DataLoader
restrict_to
The parameters to restrict the differentiation to, i.e. the corresponding sub-matrix of the Jacobian. If None, the full Jacobian is used. Make sure the input matches the correct dimension, i.e. the last dimension must be equal to the property input_size
.
TYPE: Optional[Dict[str, Parameter]]
DEFAULT: None
Source code in src/pydvl/influence/torch/operator.py
def __init__(\n self,\n model: nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n dataloader: DataLoader,\n restrict_to: Optional[Dict[str, nn.Parameter]] = None,\n):\n batch_op = GaussNewtonBatchOperation(\n model,\n loss,\n restrict_to=restrict_to,\n )\n averaging = PointAveraging()\n super().__init__(batch_op, dataloader, averaging)\n
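For intuition, the following self-contained sketch (toy model and data, not the operator class itself) forms the batch Gauss-Newton product from per-sample gradients exactly as in the formula above:

```python
import torch

torch.manual_seed(0)

model = torch.nn.Linear(3, 1)
loss_fn = torch.nn.MSELoss()

x = torch.randn(8, 3)   # a single batch b
y = torch.randn(8, 1)

# Per-sample gradients of the loss w.r.t. the flattened parameters.
params = [p for p in model.parameters() if p.requires_grad]
per_sample_grads = []
for xi, yi in zip(x, y):
    loss = loss_fn(model(xi.unsqueeze(0)), yi.unsqueeze(0))
    grads = torch.autograd.grad(loss, params)
    per_sample_grads.append(torch.cat([g.reshape(-1) for g in grads]))
grad_matrix = torch.stack(per_sample_grads)          # shape (|b|, n_params)

# G = 1/|b| * sum_i grad_i grad_i^T, applied to a vector v.
v = torch.randn(grad_matrix.shape[1])
gauss_newton_v = grad_matrix.T @ (grad_matrix @ v) / grad_matrix.shape[0]
print(gauss_newton_v.shape)  # (n_params,)
```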
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.GaussNewtonOperator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.GaussNewtonOperator.apply_to_dict","title":"apply_to_dict","text":"apply_to_dict(mat: Dict[str, Tensor]) -> Dict[str, Tensor]\n
Applies the operator to a dictionary of tensors, compatible to the structure defined by the property input_dict_structure
.
PARAMETER DESCRIPTION mat
dictionary of tensors, whose keys and shapes match the property input_dict_structure
.
TYPE: Dict[str, Tensor]
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary of tensors after applying the operator
Source code in src/pydvl/influence/torch/base.py
def apply_to_dict(self, mat: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:\n \"\"\"\n Applies the operator to a dictionary of tensors, compatible to the structure\n defined by the property `input_dict_structure`.\n\n Args:\n mat: dictionary of tensors, whose keys and shapes match the property\n `input_dict_structure`.\n\n Returns:\n A dictionary of tensors after applying the operator\n \"\"\"\n\n if not self._validate_mat_dict(mat):\n raise ValueError(\n f\"Incompatible input structure, expected (excluding batch\"\n f\"dimension): \\n {self.input_dict_structure}\"\n )\n\n return self._apply_to_dict(self._dict_to_device(mat))\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.HessianOperator","title":"HessianOperator","text":"HessianOperator(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n dataloader: DataLoader,\n restrict_to: Optional[Dict[str, Parameter]] = None,\n)\n
Bases: _AveragingBatchOperator[HessianBatchOperation, ChunkAveraging]
Given a model and loss function computes the Hessian vector or matrix product with respect to the model parameters for a given batch, i.e.
\\[\\begin{align*} &\\nabla^2_{\\theta} L(b;\\theta) \\cdot v \\\\\\ &L(b;\\theta) = \\left( \\frac{1}{|b|} \\sum_{(x,y) \\in b} \\text{loss}(\\text{model}(x; \\theta), y)\\right), \\end{align*}\\] where model is a torch.nn.Module and \\(v\\) is a vector or matrix, and average the results over the batches provided by the data loader.
PARAMETER DESCRIPTION model
The model.
TYPE: Module
loss
The loss function.
TYPE: Callable[[Tensor, Tensor], Tensor]
dataloader
The data loader providing batches of data.
TYPE: DataLoader
restrict_to
The parameters to restrict the second order differentiation to, i.e. the corresponding sub-matrix of the Hessian. If None, the full Hessian is used. Make sure the input matches the correct dimension, i.e. the last dimension must be equal to the property input_size
.
TYPE: Optional[Dict[str, Parameter]]
DEFAULT: None
Source code in src/pydvl/influence/torch/operator.py
def __init__(\n self,\n model: nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n dataloader: DataLoader,\n restrict_to: Optional[Dict[str, nn.Parameter]] = None,\n):\n batch_op = HessianBatchOperation(model, loss, restrict_to=restrict_to)\n averaging = ChunkAveraging()\n super().__init__(batch_op, dataloader, averaging)\n
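The averaged Hessian-vector product can be reproduced for a single batch with a double backward pass; a minimal sketch with an illustrative toy model:

```python
import torch

torch.manual_seed(0)

model = torch.nn.Linear(3, 1)
loss_fn = torch.nn.MSELoss()

x = torch.randn(8, 3)   # a single batch b
y = torch.randn(8, 1)

params = [p for p in model.parameters() if p.requires_grad]
n_params = sum(p.numel() for p in params)
v = torch.randn(n_params)

# L(b; theta) = mean loss over the batch; keep the graph for a second backward.
loss = loss_fn(model(x), y)
grads = torch.autograd.grad(loss, params, create_graph=True)
flat_grad = torch.cat([g.reshape(-1) for g in grads])

# Hessian-vector product: differentiate <grad, v> once more w.r.t. theta.
hvp = torch.autograd.grad(flat_grad @ v, params)
flat_hvp = torch.cat([h.reshape(-1) for h in hvp])
print(flat_hvp.shape)  # (n_params,)
```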
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.HessianOperator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.HessianOperator.apply_to_dict","title":"apply_to_dict","text":"apply_to_dict(mat: Dict[str, Tensor]) -> Dict[str, Tensor]\n
Applies the operator to a dictionary of tensors, compatible to the structure defined by the property input_dict_structure
.
PARAMETER DESCRIPTION mat
dictionary of tensors, whose keys and shapes match the property input_dict_structure
.
TYPE: Dict[str, Tensor]
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary of tensors after applying the operator
Source code in src/pydvl/influence/torch/base.py
def apply_to_dict(self, mat: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:\n \"\"\"\n Applies the operator to a dictionary of tensors, compatible to the structure\n defined by the property `input_dict_structure`.\n\n Args:\n mat: dictionary of tensors, whose keys and shapes match the property\n `input_dict_structure`.\n\n Returns:\n A dictionary of tensors after applying the operator\n \"\"\"\n\n if not self._validate_mat_dict(mat):\n raise ValueError(\n f\"Incompatible input structure, expected (excluding batch\"\n f\"dimension): \\n {self.input_dict_structure}\"\n )\n\n return self._apply_to_dict(self._dict_to_device(mat))\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.InverseHarmonicMeanOperator","title":"InverseHarmonicMeanOperator","text":"InverseHarmonicMeanOperator(\n model: Module,\n loss: Callable[[Tensor, Tensor], Tensor],\n dataloader: DataLoader,\n regularization: float,\n restrict_to: Optional[Dict[str, Parameter]] = None,\n)\n
Bases: _AveragingBatchOperator[InverseHarmonicMeanBatchOperation, PointAveraging]
Given a model and loss function computes an approximation of the inverse Gauss-Newton vector or matrix product per batch and averages the results.
Viewing the damped Gauss-Newton matrix
\\[\\begin{align*} G_{\\lambda}(\\text{model}, \\text{loss}, b, \\theta) &= \\frac{1}{|b|}\\sum_{(x, y) \\in b}\\nabla_{\\theta}\\ell (x,y; \\theta) \\nabla_{\\theta}\\ell (x,y; \\theta)^t + \\lambda \\operatorname{I}, \\\\\\ \\ell(x,y; \\theta) &= \\text{loss}(\\text{model}(x; \\theta), y) \\end{align*}\\] as an arithmetic mean of the rank-\\(1\\) updates, this operator replaces it with the harmonic mean of the rank-\\(1\\) updates, i.e.
\\[ \\tilde{G}_{\\lambda}(\\text{model}, \\text{loss}, b, \\theta) = \\left(n \\sum_{(x, y) \\in b} \\left( \\nabla_{\\theta}\\ell (x,y; \\theta) \\nabla_{\\theta}\\ell (x,y; \\theta)^t + \\lambda \\operatorname{I}\\right)^{-1} \\right)^{-1}\\] and computes
\\[ \\tilde{G}_{\\lambda}^{-1}(\\text{model}, \\text{loss}, b, \\theta) \\cdot v.\\] for any given batch \\(b\\), where model is a torch.nn.Module and \\(v\\) is a vector or matrix.
In other words, it switches the order of summation and inversion, which yields the inverse harmonic mean of the rank-\(1\) updates. The results are averaged over the batches provided by the data loader.
The inverses of the rank-\(1\) updates are not calculated explicitly, but instead a vectorized version of the Sherman–Morrison formula is applied.
For more information, see Inverse Harmonic Mean.
PARAMETER DESCRIPTION model
The model.
TYPE: Module
loss
The loss function.
TYPE: Callable[[Tensor, Tensor], Tensor]
dataloader
The data loader providing batches of data.
TYPE: DataLoader
restrict_to
The parameters to restrict the differentiation to, i.e. the corresponding sub-matrix of the Jacobian. If None, the full Jacobian is used. Make sure the input matches the correct dimension, i.e. the last dimension must be equal to the property input_size
.
TYPE: Optional[Dict[str, Parameter]]
DEFAULT: None
Source code in src/pydvl/influence/torch/operator.py
def __init__(\n self,\n model: nn.Module,\n loss: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],\n dataloader: DataLoader,\n regularization: float,\n restrict_to: Optional[Dict[str, nn.Parameter]] = None,\n):\n if regularization <= 0:\n raise ValueError(\"regularization must be positive\")\n\n self._regularization = regularization\n\n batch_op = InverseHarmonicMeanBatchOperation(\n model,\n loss,\n regularization,\n restrict_to=restrict_to,\n )\n averaging = PointAveraging()\n super().__init__(batch_op, dataloader, averaging)\n
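The difference between the arithmetic and the harmonic mean of the rank-\(1\) updates is easy to check numerically; this sketch (arbitrary small dimensions, explicit matrices instead of the vectorized Sherman–Morrison implementation) builds both matrices directly from the formulas above:

```python
import torch

torch.manual_seed(0)

dim, n, lam = 4, 3, 0.1
grads = torch.randn(n, dim)          # stand-ins for per-sample gradients
eye = torch.eye(dim)

# Arithmetic mean of the rank-1 updates plus regularization (damped Gauss-Newton).
G = grads.T @ grads / n + lam * eye

# Harmonic-mean variant: invert each regularized rank-1 update, sum, and invert again.
summed_inverses = sum(
    torch.linalg.inv(torch.outer(g, g) + lam * eye) for g in grads
)
G_tilde = torch.linalg.inv(n * summed_inverses)

v = torch.randn(dim)
print(torch.linalg.solve(G, v))        # action of G_lambda^{-1}
print(torch.linalg.solve(G_tilde, v))  # action of tilde{G}_lambda^{-1}
```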
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.InverseHarmonicMeanOperator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.InverseHarmonicMeanOperator.apply_to_dict","title":"apply_to_dict","text":"apply_to_dict(mat: Dict[str, Tensor]) -> Dict[str, Tensor]\n
Applies the operator to a dictionary of tensors, compatible to the structure defined by the property input_dict_structure
.
PARAMETER DESCRIPTION mat
dictionary of tensors, whose keys and shapes match the property input_dict_structure
.
TYPE: Dict[str, Tensor]
RETURNS DESCRIPTION Dict[str, Tensor]
A dictionary of tensors after applying the operator
Source code in src/pydvl/influence/torch/base.py
def apply_to_dict(self, mat: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:\n \"\"\"\n Applies the operator to a dictionary of tensors, compatible to the structure\n defined by the property `input_dict_structure`.\n\n Args:\n mat: dictionary of tensors, whose keys and shapes match the property\n `input_dict_structure`.\n\n Returns:\n A dictionary of tensors after applying the operator\n \"\"\"\n\n if not self._validate_mat_dict(mat):\n raise ValueError(\n f\"Incompatible input structure, expected (excluding batch\"\n f\"dimension): \\n {self.input_dict_structure}\"\n )\n\n return self._apply_to_dict(self._dict_to_device(mat))\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.DirectSolveOperator","title":"DirectSolveOperator","text":"DirectSolveOperator(\n matrix: Tensor,\n regularization: Optional[float] = None,\n in_place_regularization: bool = False,\n)\n
Bases: TensorOperator
Given a matrix \\(A\\) and an optional regularization parameter \\(\\lambda\\), computes the solution of the system \\((A+\\lambda I)x = b\\), where \\(b\\) is a vector or a matrix. Internally, it uses the routine torch.linalg.solve.
PARAMETER DESCRIPTION matrix
the system matrix
TYPE: Tensor
regularization
the regularization parameter
TYPE: Optional[float]
DEFAULT: None
in_place_regularization
If True, the input matrix is modified in-place, by adding the regularization value to the diagonal.
TYPE: bool
DEFAULT: False
Source code in src/pydvl/influence/torch/operator.py
def __init__(\n self,\n matrix: torch.Tensor,\n regularization: Optional[float] = None,\n in_place_regularization: bool = False,\n):\n if regularization is None:\n self.matrix = matrix\n else:\n self.matrix = self._update_diagonal(\n matrix if in_place_regularization else matrix.clone(), regularization\n )\n self._regularization = regularization\n
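The operator's behaviour amounts to the following few lines of plain PyTorch (example matrix and right-hand side chosen arbitrarily):

```python
import torch

torch.manual_seed(0)

dim, lam = 5, 0.01
A = torch.randn(dim, dim)
A = A @ A.T                      # make the example matrix symmetric PSD
b = torch.randn(dim)

# Add the regularization to the diagonal and solve (A + lam*I) x = b directly.
A_reg = A + lam * torch.eye(dim)
x = torch.linalg.solve(A_reg, b)
print(torch.allclose(A_reg @ x, b, atol=1e-4))
```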
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.DirectSolveOperator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.LissaOperator","title":"LissaOperator","text":"LissaOperator(\n batch_operation: BatchOperationType,\n data: DataLoader,\n regularization: Optional[float] = None,\n maxiter: int = 1000,\n dampen: float = 0.0,\n scale: float = 10.0,\n rtol: float = 0.0001,\n progress: bool = False,\n warn_on_max_iteration: bool = True,\n)\n
Bases: TensorOperator
, Generic[BatchOperationType]
Uses LISSA, Linear time Stochastic Second-Order Algorithm, to iteratively approximate the solution of the system \\((A + \\lambda I)x = b\\). This is done with the update
\\[(A + \\lambda I)^{-1}_{j+1} b = b + (I - d) \\ (A + \\lambda I) - \\frac{(A + \\lambda I)^{-1}_j b}{s},\\] where \\(I\\) is the identity matrix, \\(d\\) is a dampening term and \\(s\\) a scaling factor that are applied to help convergence. For details, see Linear time Stochastic Second-Order Approximation (LiSSA)
PARAMETER DESCRIPTION batch_operation
The BatchOperation
representing the action of A on a batch of the data loader.
TYPE: BatchOperationType
data
a pytorch dataloader
TYPE: DataLoader
regularization
Optional regularization parameter added to the Hessian-vector product for numerical stability.
TYPE: Optional[float]
DEFAULT: None
maxiter
Maximum number of iterations.
TYPE: int
DEFAULT: 1000
dampen
Dampening factor, defaults to 0 for no dampening.
TYPE: float
DEFAULT: 0.0
scale
Scaling factor, defaults to 10.
TYPE: float
DEFAULT: 10.0
rtol
tolerance to use for early stopping
TYPE: float
DEFAULT: 0.0001
progress
If True, display progress bars.
TYPE: bool
DEFAULT: False
warn_on_max_iteration
If True, logs a warning if the desired tolerance is not achieved within maxiter
iterations. If False, the log level for this information is logging.DEBUG
TYPE: bool
DEFAULT: True
Source code in src/pydvl/influence/torch/operator.py
def __init__(\n self,\n batch_operation: BatchOperationType,\n data: DataLoader,\n regularization: Optional[float] = None,\n maxiter: int = 1000,\n dampen: float = 0.0,\n scale: float = 10.0,\n rtol: float = 1e-4,\n progress: bool = False,\n warn_on_max_iteration: bool = True,\n):\n\n if regularization is not None and regularization < 0:\n raise ValueError(\"regularization must be non-negative\")\n\n self.data = data\n self.warn_on_max_iteration = warn_on_max_iteration\n self.progress = progress\n self.rtol = rtol\n self.scale = scale\n self.dampen = dampen\n self.maxiter = maxiter\n self.batch_operation = batch_operation\n self._regularization = regularization\n
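The recursion can be tried on a small, well-conditioned example; the sketch below (illustrative parameters, a dense matrix instead of batch estimates) iterates the update and compares the rescaled result with a direct solve:

```python
import torch

torch.manual_seed(0)

dim, lam = 5, 1.0
A = torch.randn(dim, dim)
A = A @ A.T / dim                    # symmetric PSD example matrix
b = torch.randn(dim)
A_reg = A + lam * torch.eye(dim)

scale, dampen, maxiter = 10.0, 0.0, 1000

# Iterate x_{j+1} = b + (1 - d) x_j - (A + lam*I) x_j / s ...
x = b.clone()
for _ in range(maxiter):
    x = b + (1.0 - dampen) * x - A_reg @ x / scale
approx = x / scale                   # ... and rescale by 1/s at the end.

# The difference to the direct solution should be tiny.
print((approx - torch.linalg.solve(A_reg, b)).abs().max())
```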
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.LissaOperator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.LowRankOperator","title":"LowRankOperator","text":"LowRankOperator(\n low_rank_representation: LowRankProductRepresentation,\n regularization: float,\n exact: bool = True,\n)\n
Bases: TensorOperator
Given a low rank representation of a matrix
\\[ A = V D V^T\\] with a diagonal matrix \\(D\\) and an optional regularization parameter \\(\\lambda\\), computes
\[ (V D V^T+\lambda I)^{-1}b. \]
Depending on the value of the exact
flag, the inverse action is computed exactly using the [Sherman–Morrison–Woodbury formula](https://en.wikipedia.org/wiki/Woodbury_matrix_identity). If exact
is set to False
, the inverse action is approximated by
\\[ V^T(D+\\lambda I)^{-1}Vb\\] Args:
Source code in src/pydvl/influence/torch/operator.py
def __init__(\n self,\n low_rank_representation: LowRankProductRepresentation,\n regularization: float,\n exact: bool = True,\n):\n\n if exact and (regularization is None or regularization <= 0):\n raise ValueError(\"regularization must be positive when exact=True\")\n elif regularization is not None and regularization < 0:\n raise ValueError(\"regularization must be non-negative\")\n\n self._regularization = regularization\n self._exact = exact\n self._low_rank_representation = low_rank_representation\n
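Both modes can be illustrated on a small dense example; the sketch below (illustrative shapes, explicit matrices instead of the low-rank representation class) computes the exact action via the Woodbury identity and the approximate action via the projected formula:

```python
import torch

torch.manual_seed(0)

dim, rank, lam = 6, 2, 0.5
V = torch.linalg.qr(torch.randn(dim, rank)).Q   # orthonormal columns
D = torch.diag(torch.tensor([3.0, 1.5]))        # diagonal factor
b = torch.randn(dim)

A = V @ D @ V.T + lam * torch.eye(dim)

# Exact inverse action via the Sherman-Morrison-Woodbury identity.
inner = torch.linalg.inv(torch.linalg.inv(D) + V.T @ V / lam)
exact = b / lam - V @ (inner @ (V.T @ b)) / lam**2

# Approximate inverse action (projected formula described above).
approx = V @ torch.diag(1.0 / (torch.diag(D) + lam)) @ (V.T @ b)

print((exact - torch.linalg.solve(A, b)).abs().max())   # ~0, Woodbury is exact
print((approx - torch.linalg.solve(A, b)).abs().max())  # nonzero: the approximation
# drops the component of b orthogonal to range(V)
```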
"},{"location":"api/pydvl/influence/torch/operator/#pydvl.influence.torch.operator.LowRankOperator.apply","title":"apply","text":"apply(tensor: TensorType) -> TensorType\n
Applies the operator to a tensor.
PARAMETER DESCRIPTION tensor
A tensor whose trailing dimension must conform to the operator's input size
TYPE: TensorType
RETURNS DESCRIPTION TensorType
A tensor representing the result of the operator application.
Source code in src/pydvl/influence/types.py
def apply(self, tensor: TensorType) -> TensorType:\n \"\"\"\n Applies the operator to a tensor.\n\n Args:\n tensor: A tensor, whose tailing dimension must conform to the\n operator's input size\n\n Returns:\n A tensor representing the result of the operator application.\n \"\"\"\n self._validate_tensor_input(tensor)\n return self._apply(tensor)\n
"},{"location":"api/pydvl/influence/torch/pre_conditioner/","title":"Pre conditioner","text":""},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner","title":"pydvl.influence.torch.pre_conditioner","text":""},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.PreConditioner","title":"PreConditioner","text":" Bases: ABC
Abstract base class for implementing pre-conditioners for improving the convergence of CG for systems of the form
\\[ ( A + \\lambda \\operatorname{I})x = \\operatorname{rhs} \\] i.e. a matrix \\(M\\) such that \\(M^{-1}(A + \\lambda \\operatorname{I})\\) has a better condition number than \\(A + \\lambda \\operatorname{I}\\).
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.PreConditioner.fit","title":"fit abstractmethod
","text":"fit(\n mat_mat_prod: Callable[[Tensor], Tensor],\n size: int,\n dtype: dtype,\n device: device,\n regularization: float = 0.0,\n)\n
Implement this to fit the pre-conditioner to the matrix represented by the mat_mat_prod.
PARAMETER DESCRIPTION mat_mat_prod
a callable that computes the matrix-matrix product
TYPE: Callable[[Tensor], Tensor]
size
size of the matrix represented by mat_mat_prod
TYPE: int
dtype
data type of the matrix represented by mat_mat_prod
TYPE: dtype
device
device of the matrix represented by mat_mat_prod
TYPE: device
regularization
regularization parameter \(\lambda\) in the equation \( ( A + \lambda \operatorname{I})x = \operatorname{rhs} \)
TYPE: float
DEFAULT: 0.0
RETURNS DESCRIPTION self
Source code in src/pydvl/influence/torch/pre_conditioner.py
@abstractmethod\ndef fit(\n self,\n mat_mat_prod: Callable[[torch.Tensor], torch.Tensor],\n size: int,\n dtype: torch.dtype,\n device: torch.device,\n regularization: float = 0.0,\n):\n r\"\"\"\n Implement this to fit the pre-conditioner to the matrix represented by the\n mat_mat_prod\n Args:\n mat_mat_prod: a callable that computes the matrix-matrix product\n size: size of the matrix represented by `mat_mat_prod`\n dtype: data type of the matrix represented by `mat_mat_prod`\n device: device of the matrix represented by `mat_mat_prod`\n regularization: regularization parameter $\\lambda$ in the equation\n $ ( A + \\lambda \\operatorname{I})x = \\operatorname{rhs} $\n Returns:\n self\n \"\"\"\n pass\n
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.PreConditioner.solve","title":"solve","text":"solve(rhs: Tensor)\n
Solve the equation \\(M@Z = \\operatorname{rhs}\\) Args: rhs: right hand side of the equation, corresponds to the residuum vector (or matrix) in the conjugate gradient method
RETURNS DESCRIPTION solution \\(M^{-1}\\operatorname{rhs}\\)
Source code in src/pydvl/influence/torch/pre_conditioner.py
def solve(self, rhs: torch.Tensor):\n r\"\"\"\n Solve the equation $M@Z = \\operatorname{rhs}$\n Args:\n rhs: right hand side of the equation, corresponds to the residuum vector\n (or matrix) in the conjugate gradient method\n\n Returns:\n solution $M^{-1}\\operatorname{rhs}$\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n return self._solve(rhs)\n
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.PreConditioner.to","title":"to abstractmethod
","text":"to(device: device) -> PreConditioner\n
Implement this to move the (potentially fitted) preconditioner to a specific device
Source code in src/pydvl/influence/torch/pre_conditioner.py
@abstractmethod\ndef to(self, device: torch.device) -> PreConditioner:\n \"\"\"Implement this to move the (potentially fitted) preconditioner to a\n specific device\"\"\"\n
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.JacobiPreConditioner","title":"JacobiPreConditioner","text":"JacobiPreConditioner(num_samples_estimator: int = 1)\n
Bases: PreConditioner
Pre-conditioner for improving the convergence of CG for systems of the form
\\[ ( A + \\lambda \\operatorname{I})x = \\operatorname{rhs} \\] The JacobiPreConditioner uses the diagonal information of the matrix \\(A\\). The diagonal elements are not computed directly but estimated via Hutchinson's estimator.
\\[ M = \\frac{1}{m} \\sum_{i=1}^m u_i \\odot Au_i + \\lambda \\operatorname{I} \\] where \\(u_i\\) are i.i.d. Gaussian random vectors. Works well in the case the matrix \\(A + \\lambda \\operatorname{I}\\) is diagonal dominant. For more information, see the documentation of Conjugate Gradient Args: num_samples_estimator: number of samples to use in computation of Hutchinson's estimator
Source code in src/pydvl/influence/torch/pre_conditioner.py
def __init__(self, num_samples_estimator: int = 1):\n self.num_samples_estimator = num_samples_estimator\n
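Hutchinson's estimator only requires matrix products with random vectors; a minimal sketch (arbitrary example matrix, not the pre-conditioner class) estimating the diagonal as described above:

```python
import torch

torch.manual_seed(0)

dim, num_samples = 6, 5000
A = torch.randn(dim, dim)
A = A @ A.T                                  # example symmetric matrix

def mat_mat_prod(x: torch.Tensor) -> torch.Tensor:
    return A @ x

# diag(A) ~ 1/m * sum_i u_i * (A u_i) with u_i ~ N(0, I).
u = torch.randn(dim, num_samples)
diag_estimate = torch.sum(u * mat_mat_prod(u), dim=1) / num_samples

print(torch.diag(A))
print(diag_estimate)                         # close to the true diagonal
```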
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.JacobiPreConditioner.solve","title":"solve","text":"solve(rhs: Tensor)\n
Solve the equation \\(M@Z = \\operatorname{rhs}\\) Args: rhs: right hand side of the equation, corresponds to the residuum vector (or matrix) in the conjugate gradient method
RETURNS DESCRIPTION solution \\(M^{-1}\\operatorname{rhs}\\)
Source code in src/pydvl/influence/torch/pre_conditioner.py
def solve(self, rhs: torch.Tensor):\n r\"\"\"\n Solve the equation $M@Z = \\operatorname{rhs}$\n Args:\n rhs: right hand side of the equation, corresponds to the residuum vector\n (or matrix) in the conjugate gradient method\n\n Returns:\n solution $M^{-1}\\operatorname{rhs}$\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n return self._solve(rhs)\n
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.JacobiPreConditioner.fit","title":"fit","text":"fit(\n mat_mat_prod: Callable[[Tensor], Tensor],\n size: int,\n dtype: dtype,\n device: device,\n regularization: float = 0.0,\n)\n
Fits by computing an estimate of the diagonal of the matrix represented by mat_mat_prod
via Hutchinson's estimator
PARAMETER DESCRIPTION mat_mat_prod
a callable representing the matrix-matrix product
TYPE: Callable[[Tensor], Tensor]
size
size of the square matrix
TYPE: int
dtype
needed data type of inputs for the mat_mat_prod
TYPE: dtype
device
needed device for inputs of mat_mat_prod
TYPE: device
regularization
regularization parameter \\(\\lambda\\) in \\((A+\\lambda I)x=b\\)
TYPE: float
DEFAULT: 0.0
Source code in src/pydvl/influence/torch/pre_conditioner.py
def fit(\n self,\n mat_mat_prod: Callable[[torch.Tensor], torch.Tensor],\n size: int,\n dtype: torch.dtype,\n device: torch.device,\n regularization: float = 0.0,\n):\n r\"\"\"\n Fits by computing an estimate of the diagonal of the matrix represented by\n `mat_mat_prod` via Hutchinson's estimator\n\n Args:\n mat_mat_prod: a callable representing the matrix-matrix product\n size: size of the square matrix\n dtype: needed data type of inputs for the mat_mat_prod\n device: needed device for inputs of mat_mat_prod\n regularization: regularization parameter\n $\\lambda$ in $(A+\\lambda I)x=b$\n \"\"\"\n random_samples = torch.randn(\n size, self.num_samples_estimator, device=device, dtype=dtype\n )\n diagonal_estimate = torch.sum(\n torch.mul(random_samples, mat_mat_prod(random_samples)), dim=1\n )\n diagonal_estimate /= self.num_samples_estimator\n self._diag = diagonal_estimate\n self._reg = regularization\n
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.NystroemPreConditioner","title":"NystroemPreConditioner","text":"NystroemPreConditioner(rank: int)\n
Bases: PreConditioner
Pre-conditioner for improving the convergence of CG for systems of the form
\\[ (A + \\lambda \\operatorname{I})x = \\operatorname{rhs} \\] The NystroemPreConditioner computes a low-rank approximation
\\[ A_{\\text{nys}} = (A \\Omega)(\\Omega^T A \\Omega)^{\\dagger}(A \\Omega)^T = U \\Sigma U^T, \\] where \\((\\cdot)^{\\dagger}\\) denotes the Moore-Penrose inverse, and uses the matrix
\\[ M^{-1} = (\\lambda + \\sigma_{\\text{rank}})U(\\Sigma+ \\lambda \\operatorname{I})^{-1}U^T+(\\operatorname{I} - UU^T) \\] for pre-conditioning, where \\( \\sigma_{\\text{rank}} \\) is the smallest eigenvalue of the low-rank approximation.
Source code in src/pydvl/influence/torch/pre_conditioner.py
def __init__(self, rank: int):\n self._rank = rank\n
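The low-rank factor \(A_{\text{nys}}\) can be computed with a few lines of dense linear algebra; the sketch below (small illustrative sizes, pseudo-inverse via torch.linalg.pinv) builds the Nystroem approximation directly from its definition:

```python
import torch

torch.manual_seed(0)

dim, rank = 8, 3
B = torch.randn(dim, rank + 2)
A = B @ B.T                       # PSD example matrix of rank 5

# Nystroem approximation with a random sketch matrix Omega of width `rank`.
Omega = torch.randn(dim, rank)
A_Omega = A @ Omega
A_nys = A_Omega @ torch.linalg.pinv(Omega.T @ A_Omega) @ A_Omega.T

# Relative Frobenius error of the rank-3 approximation (nonzero, since A has rank 5).
print(torch.linalg.matrix_norm(A - A_nys) / torch.linalg.matrix_norm(A))
```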
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.NystroemPreConditioner.solve","title":"solve","text":"solve(rhs: Tensor)\n
Solve the equation \\(M@Z = \\operatorname{rhs}\\) Args: rhs: right hand side of the equation, corresponds to the residuum vector (or matrix) in the conjugate gradient method
RETURNS DESCRIPTION solution \\(M^{-1}\\operatorname{rhs}\\)
Source code in src/pydvl/influence/torch/pre_conditioner.py
def solve(self, rhs: torch.Tensor):\n r\"\"\"\n Solve the equation $M@Z = \\operatorname{rhs}$\n Args:\n rhs: right hand side of the equation, corresponds to the residuum vector\n (or matrix) in the conjugate gradient method\n\n Returns:\n solution $M^{-1}\\operatorname{rhs}$\n\n \"\"\"\n if not self.is_fitted:\n raise NotFittedException(type(self))\n\n return self._solve(rhs)\n
"},{"location":"api/pydvl/influence/torch/pre_conditioner/#pydvl.influence.torch.pre_conditioner.NystroemPreConditioner.fit","title":"fit","text":"fit(\n mat_mat_prod: Callable[[Tensor], Tensor],\n size: int,\n dtype: dtype,\n device: device,\n regularization: float = 0.0,\n)\n
Fits by computing a low-rank approximation of the matrix represented by mat_mat_prod
via Nystroem approximation
PARAMETER DESCRIPTION mat_mat_prod
a callable representing the matrix-matrix product
TYPE: Callable[[Tensor], Tensor]
size
size of the square matrix
TYPE: int
dtype
needed data type of inputs for the mat_mat_prod
TYPE: dtype
device
needed device for inputs of mat_mat_prod
TYPE: device
regularization
regularization parameter \\(\\lambda\\) in \\((A+\\lambda I)x=b\\)
TYPE: float
DEFAULT: 0.0
Source code in src/pydvl/influence/torch/pre_conditioner.py
def fit(\n self,\n mat_mat_prod: Callable[[torch.Tensor], torch.Tensor],\n size: int,\n dtype: torch.dtype,\n device: torch.device,\n regularization: float = 0.0,\n):\n r\"\"\"\n Fits by computing a low-rank approximation of the matrix represented by\n `mat_mat_prod` via Nystroem approximation\n\n Args:\n mat_mat_prod: a callable representing the matrix-matrix product\n size: size of the square matrix\n dtype: needed data type of inputs for the mat_mat_prod\n device: needed device for inputs of mat_mat_prod\n regularization: regularization parameter\n $\\lambda$ in $(A+\\lambda I)x=b$\n \"\"\"\n\n self._low_rank_approx = randomized_nystroem_approximation(\n mat_mat_prod, size, self._rank, dtype, mat_vec_device=device\n )\n self._regularization = regularization\n
"},{"location":"api/pydvl/influence/torch/util/","title":"Util","text":""},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util","title":"pydvl.influence.torch.util","text":""},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.TorchTensorContainerType","title":"TorchTensorContainerType module-attribute
","text":"TorchTensorContainerType = Union[\n Tensor, Collection[Tensor], Mapping[str, Tensor]\n]\n
Type for a PyTorch tensor or a container thereof.
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.TorchNumpyConverter","title":"TorchNumpyConverter","text":"TorchNumpyConverter(device: Optional[device] = None)\n
Bases: NumpyConverter[Tensor]
Helper class for converting between torch.Tensor and numpy.ndarray
PARAMETER DESCRIPTION device
Optional device parameter to move the resulting torch tensors to the specified device
TYPE: Optional[device]
DEFAULT: None
Source code in src/pydvl/influence/torch/util.py
def __init__(self, device: Optional[torch.device] = None):\n self.device = device\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.TorchNumpyConverter.to_numpy","title":"to_numpy","text":"to_numpy(x: Tensor) -> NDArray\n
Convert a detached torch.Tensor to numpy.ndarray
Source code in src/pydvl/influence/torch/util.py
def to_numpy(self, x: torch.Tensor) -> NDArray:\n \"\"\"\n Convert a detached [torch.Tensor][torch.Tensor] to\n [numpy.ndarray][numpy.ndarray]\n \"\"\"\n arr: NDArray = x.cpu().numpy()\n return arr\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.TorchNumpyConverter.from_numpy","title":"from_numpy","text":"from_numpy(x: NDArray) -> Tensor\n
Convert a numpy.ndarray to torch.Tensor and optionally move it to a provided device
Source code in src/pydvl/influence/torch/util.py
def from_numpy(self, x: NDArray) -> torch.Tensor:\n \"\"\"\n Convert a [numpy.ndarray][numpy.ndarray] to [torch.Tensor][torch.Tensor] and\n optionally move it to a provided device\n \"\"\"\n t = torch.from_numpy(x)\n if self.device is not None:\n t = t.to(self.device)\n return t\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.TorchCatAggregator","title":"TorchCatAggregator","text":" Bases: SequenceAggregator[Tensor]
An aggregator that concatenates tensors using PyTorch's torch.cat function. Concatenation is done along the first dimension of the chunks.
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.TorchCatAggregator.__call__","title":"__call__","text":"__call__(tensor_sequence: LazyChunkSequence[Tensor])\n
Aggregates tensors from a single-level generator into a single tensor by concatenating them. This method is a straightforward way to combine a sequence of tensors into one larger tensor.
PARAMETER DESCRIPTION tensor_sequence
Object wrapping a generator that yields torch.Tensor
objects.
TYPE: LazyChunkSequence[Tensor]
RETURNS DESCRIPTION A single tensor formed by concatenating all tensors from the generator. The concatenation is performed along the default dimension (0).
Source code in src/pydvl/influence/torch/util.py
def __call__(\n self,\n tensor_sequence: LazyChunkSequence[torch.Tensor],\n):\n \"\"\"\n Aggregates tensors from a single-level generator into a single tensor by\n concatenating them. This method is a straightforward way to combine a sequence\n of tensors into one larger tensor.\n\n Args:\n tensor_sequence: Object wrapping a generator that yields `torch.Tensor`\n objects.\n\n Returns:\n A single tensor formed by concatenating all tensors from the generator.\n The concatenation is performed along the default dimension (0).\n \"\"\"\n t_gen = cast(Iterator[torch.Tensor], tensor_sequence.generator_factory())\n len_generator = tensor_sequence.len_generator\n if len_generator is not None:\n t_gen = cast(\n Iterator[torch.Tensor], tqdm(t_gen, total=len_generator, desc=\"Blocks\")\n )\n\n return torch.cat(list(t_gen))\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.NestedTorchCatAggregator","title":"NestedTorchCatAggregator","text":" Bases: NestedSequenceAggregator[Tensor]
An aggregator that concatenates tensors using PyTorch's torch.cat function. Concatenation is done along the first two dimensions of the chunks.
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.NestedTorchCatAggregator.__call__","title":"__call__","text":"__call__(nested_sequence_of_tensors: NestedLazyChunkSequence[Tensor])\n
Aggregates tensors from a nested generator structure into a single tensor by concatenating. Each inner generator is first concatenated along dimension 1 into a tensor, and these tensors are then concatenated along dimension 0 to form the final tensor.
PARAMETER DESCRIPTION nested_sequence_of_tensors
Object wrapping a generator of generators, where each inner generator yields torch.Tensor
objects.
TYPE: NestedLazyChunkSequence[Tensor]
RETURNS DESCRIPTION A single tensor formed by concatenating all tensors from the nested
generators.
Source code in src/pydvl/influence/torch/util.py
def __call__(\n self, nested_sequence_of_tensors: NestedLazyChunkSequence[torch.Tensor]\n):\n \"\"\"\n Aggregates tensors from a nested generator structure into a single tensor by\n concatenating. Each inner generator is first concatenated along dimension 1 into\n a tensor, and then these tensors are concatenated along dimension 0 together to\n form the final tensor.\n\n Args:\n nested_sequence_of_tensors: Object wrapping a generator of generators,\n where each inner generator yields `torch.Tensor` objects.\n\n Returns:\n A single tensor formed by concatenating all tensors from the nested\n generators.\n\n \"\"\"\n\n outer_gen = cast(\n Iterator[Iterator[torch.Tensor]],\n nested_sequence_of_tensors.generator_factory(),\n )\n len_outer_generator = nested_sequence_of_tensors.len_outer_generator\n if len_outer_generator is not None:\n outer_gen = cast(\n Iterator[Iterator[torch.Tensor]],\n tqdm(outer_gen, total=len_outer_generator, desc=\"Row blocks\"),\n )\n\n return torch.cat(\n list(\n map(\n lambda tensor_gen: torch.cat(list(tensor_gen), dim=1),\n outer_gen,\n )\n )\n )\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.EkfacRepresentation","title":"EkfacRepresentation dataclass
","text":"EkfacRepresentation(\n layer_names: Iterable[str],\n layers_module: Iterable[Module],\n evecs_a: Iterable[Tensor],\n evecs_g: Iterable[Tensor],\n diags: Iterable[Tensor],\n)\n
Container class for the EKFAC representation of the Hessian. It can be iterated over to get the layer names and their corresponding module, eigenvectors and diagonal elements of the factorized Hessian matrix.
PARAMETER DESCRIPTION layer_names
Names of the layers.
TYPE: Iterable[str]
layers_module
The layers.
TYPE: Iterable[Module]
evecs_a
The a eigenvectors of the ekfac representation.
TYPE: Iterable[Tensor]
evecs_g
The g eigenvectors of the ekfac representation.
TYPE: Iterable[Tensor]
diags
The diagonal elements of the factorized Hessian matrix.
TYPE: Iterable[Tensor]
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.EkfacRepresentation.get_layer_evecs","title":"get_layer_evecs","text":"get_layer_evecs() -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]\n
It returns two dictionaries, one for the a eigenvectors and one for the g eigenvectors, with the layer names as keys. The eigenvectors are in the same order as the layers in the model.
Source code in src/pydvl/influence/torch/util.py
def get_layer_evecs(\n self,\n) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]:\n \"\"\"\n It returns two dictionaries, one for the a eigenvectors and one for the g\n eigenvectors, with the layer names as keys. The eigenvectors are in the same\n order as the layers in the model.\n \"\"\"\n evecs_a_dict = {layer_name: evec_a for layer_name, (_, evec_a, _, _) in self}\n evecs_g_dict = {layer_name: evec_g for layer_name, (_, _, evec_g, _) in self}\n return evecs_a_dict, evecs_g_dict\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.TorchLinalgEighException","title":"TorchLinalgEighException","text":"TorchLinalgEighException(original_exception: RuntimeError)\n
Bases: Exception
Exception to wrap a RunTimeError raised by torch.linalg.eigh, when used with large matrices, see https://github.com/pytorch/pytorch/issues/92141
Source code in src/pydvl/influence/torch/util.py
def __init__(self, original_exception: RuntimeError):\n func = torch.linalg.eigh\n err_msg = (\n f\"A RunTimeError occurred in '{func.__module__}.{func.__qualname__}'. \"\n \"This might be related to known issues with \"\n \"[torch.linalg.eigh][torch.linalg.eigh] on certain matrix sizes.\\n \"\n \"For more details, refer to \"\n \"https://github.com/pytorch/pytorch/issues/92141. \\n\"\n \"In this case, consider to use a different implementation, which does not \"\n \"depend on the usage of [torch.linalg.eigh][torch.linalg.eigh].\\n\"\n f\" Inspect the original exception message: \\n{str(original_exception)}\"\n )\n super().__init__(err_msg)\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.BlockMode","title":"BlockMode","text":" Bases: Enum
Enumeration for different modes of grouping model parameters.
ATTRIBUTE DESCRIPTION LAYER_WISE
Groups parameters by layers of the model.
TYPE: str
PARAMETER_WISE
Groups parameters individually.
TYPE: str
FULL
Groups all parameters together.
TYPE: str
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.ModelParameterDictBuilder","title":"ModelParameterDictBuilder dataclass
","text":"ModelParameterDictBuilder(model: Module, detach: bool = True)\n
A builder class for creating ordered dictionaries of model parameters based on specified block modes or custom blocking structures.
ATTRIBUTE DESCRIPTION model
The neural network model.
TYPE: Module
detach
Whether to detach the parameters from the computation graph.
TYPE: bool
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.ModelParameterDictBuilder.build","title":"build","text":"build(\n block_structure: OrderedDict[str, List[str]]\n) -> Dict[str, Dict[str, Parameter]]\n
Builds an ordered dictionary of model parameters based on the specified block structure represented by an ordered dictionary, where the keys are block identifiers and the values are lists of model parameter names contained in this block.
PARAMETER DESCRIPTION block_structure
The block structure specifying how to group the parameters.
TYPE: OrderedDict[str, List[str]]
RETURNS DESCRIPTION Dict[str, Dict[str, Parameter]]
An ordered dictionary of ordered dictionaries, where the outer dictionary's
Dict[str, Dict[str, Parameter]]
keys are block identifiers and the inner dictionaries map parameter names
Dict[str, Dict[str, Parameter]]
to parameters.
Source code in src/pydvl/influence/torch/util.py
def build(\n self, block_structure: OrderedDict[str, List[str]]\n) -> Dict[str, Dict[str, torch.nn.Parameter]]:\n \"\"\"\n Builds an ordered dictionary of model parameters based on the specified block\n structure represented by an ordered dictionary, where the keys are block\n identifiers and the values are lists of model parameter names contained in\n this block.\n\n Args:\n block_structure: The block structure specifying how to group the parameters.\n\n Returns:\n An ordered dictionary of ordered dictionaries, where the outer dictionary's\n keys are block identifiers and the inner dictionaries map parameter names\n to parameters.\n \"\"\"\n parameter_dict = {}\n\n for block_name, parameter_names in block_structure.items():\n inner_ordered_dict = {}\n for parameter_name in parameter_names:\n parameter = self._extract_parameter_by_name(parameter_name)\n if parameter.requires_grad:\n inner_ordered_dict[parameter_name] = self._optional_detach(\n parameter\n )\n else:\n warnings.warn(\n f\"The parameter {parameter_name} from the block \"\n f\"{block_name} is mark as not trainable in the model \"\n f\"and will be excluded from the computation.\"\n )\n parameter_dict[block_name] = inner_ordered_dict\n\n return parameter_dict\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.ModelParameterDictBuilder.build_from_block_mode","title":"build_from_block_mode","text":"build_from_block_mode(block_mode: BlockMode) -> Dict[str, Dict[str, Parameter]]\n
Builds an ordered dictionary of model parameters based on the specified block mode, which determines how the parameters are grouped into blocks (see BlockMode).
PARAMETER DESCRIPTION block_mode
The block mode specifying how to group the parameters.
TYPE: BlockMode
RETURNS DESCRIPTION Dict[str, Dict[str, Parameter]]
An ordered dictionary of ordered dictionaries, where the outer dictionary's
Dict[str, Dict[str, Parameter]]
keys are block identifiers and the inner dictionaries map parameter names
Dict[str, Dict[str, Parameter]]
to parameters.
Source code in src/pydvl/influence/torch/util.py
def build_from_block_mode(\n self, block_mode: BlockMode\n) -> Dict[str, Dict[str, torch.nn.Parameter]]:\n \"\"\"\n Builds an ordered dictionary of model parameters based on the specified block\n mode or custom blocking structure represented by an ordered dictionary, where\n the keys are block identifiers and the values are lists of model parameter names\n contained in this block.\n\n Args:\n block_mode: The block mode specifying how to group the parameters.\n\n Returns:\n An ordered dictionary of ordered dictionaries, where the outer dictionary's\n keys are block identifiers and the inner dictionaries map parameter names\n to parameters.\n \"\"\"\n\n block_mode_mapping = {\n BlockMode.FULL: self._build_full,\n BlockMode.PARAMETER_WISE: self._build_parameter_wise,\n BlockMode.LAYER_WISE: self._build_layer_wise,\n }\n\n parameter_dict_func = block_mode_mapping.get(block_mode, None)\n\n if parameter_dict_func is None:\n raise ValueError(f\"Unknown block mode {block_mode}.\")\n\n return self.build(parameter_dict_func())\n
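A minimal usage sketch, assuming pyDVL is installed; the two-layer toy model is made up, and the resulting block names are simply whatever the builder derives from the model's named parameters:

```python
import torch

from pydvl.influence.torch.util import BlockMode, ModelParameterDictBuilder

model = torch.nn.Sequential(
    torch.nn.Linear(4, 8),
    torch.nn.ReLU(),
    torch.nn.Linear(8, 1),
)

builder = ModelParameterDictBuilder(model)
blocks = builder.build_from_block_mode(BlockMode.LAYER_WISE)

# One entry per layer; each value maps parameter names to (detached) tensors.
for block_name, params in blocks.items():
    print(block_name, list(params.keys()))
```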
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.ModelInfoMixin","title":"ModelInfoMixin","text":"ModelInfoMixin(model: Module)\n
A mixin class for classes that contain information about a model.
Source code in src/pydvl/influence/torch/util.py
def __init__(self, model: torch.nn.Module):\n self.model = model\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.to_model_device","title":"to_model_device","text":"to_model_device(x: Tensor, model: Module) -> Tensor\n
Returns the tensor x
moved to the device of the model
, if device of model is set
PARAMETER DESCRIPTION x
The tensor to be moved to the device of the model.
TYPE: Tensor
model
The model whose device will be used to move the tensor.
TYPE: Module
RETURNS DESCRIPTION Tensor
The tensor x
moved to the device of the model
, if device of model is set.
Source code in src/pydvl/influence/torch/util.py
def to_model_device(x: torch.Tensor, model: torch.nn.Module) -> torch.Tensor:\n \"\"\"\n Returns the tensor `x` moved to the device of the `model`, if device of model is set\n\n Args:\n x: The tensor to be moved to the device of the model.\n model: The model whose device will be used to move the tensor.\n\n Returns:\n The tensor `x` moved to the device of the `model`, if device of model is set.\n \"\"\"\n device = next(model.parameters()).device\n return x.to(device)\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.reshape_vector_to_tensors","title":"reshape_vector_to_tensors","text":"reshape_vector_to_tensors(\n input_vector: Tensor, target_shapes: Iterable[Tuple[int, ...]]\n) -> Tuple[Tensor, ...]\n
Reshape a 1D tensor into multiple tensors with specified shapes.
This function takes a 1D tensor (input_vector) and reshapes it into a series of tensors with shapes given by 'target_shapes'. The reshaped tensors are returned as a tuple in the same order as their corresponding shapes.
Note The total number of elements in 'input_vector' must be equal to the sum of the products of the shapes in 'target_shapes'.
PARAMETER DESCRIPTION input_vector
The 1D tensor to be reshaped. Must be 1D.
TYPE: Tensor
target_shapes
An iterable of tuples. Each tuple defines the shape of a tensor to be reshaped from the 'input_vector'.
TYPE: Iterable[Tuple[int, ...]]
RETURNS DESCRIPTION Tuple[Tensor, ...]
A tuple of reshaped tensors.
RAISES DESCRIPTION ValueError
If 'input_vector' is not a 1D tensor or if the total number of elements in 'input_vector' does not match the sum of the products of the shapes in 'target_shapes'.
Source code in src/pydvl/influence/torch/util.py
def reshape_vector_to_tensors(\n input_vector: torch.Tensor, target_shapes: Iterable[Tuple[int, ...]]\n) -> Tuple[torch.Tensor, ...]:\n \"\"\"\n Reshape a 1D tensor into multiple tensors with specified shapes.\n\n This function takes a 1D tensor (input_vector) and reshapes it into a series of\n tensors with shapes given by 'target_shapes'.\n The reshaped tensors are returned as a tuple in the same order\n as their corresponding shapes.\n\n Note:\n The total number of elements in 'input_vector' must be equal to the\n sum of the products of the shapes in 'target_shapes'.\n\n Args:\n input_vector: The 1D tensor to be reshaped. Must be 1D.\n target_shapes: An iterable of tuples. Each tuple defines the shape of a tensor\n to be reshaped from the 'input_vector'.\n\n Returns:\n A tuple of reshaped tensors.\n\n Raises:\n ValueError: If 'input_vector' is not a 1D tensor or if the total\n number of elements in 'input_vector' does not\n match the sum of the products of the shapes in 'target_shapes'.\n \"\"\"\n\n if input_vector.dim() != 1:\n raise ValueError(\"Input vector must be a 1D tensor\")\n\n total_elements = sum(math.prod(shape) for shape in target_shapes)\n\n if total_elements != input_vector.shape[0]:\n raise ValueError(\n f\"The total elements in shapes {total_elements} \"\n f\"does not match the vector length {input_vector.shape[0]}\"\n )\n\n tensors = []\n start = 0\n for shape in target_shapes:\n size = math.prod(shape) # compute the total size of the tensor with this shape\n tensors.append(\n input_vector[start : start + size].view(shape)\n ) # slice the vector and reshape it\n start += size\n return tuple(tensors)\n
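A short usage example (shapes chosen arbitrarily):

```python
import torch

from pydvl.influence.torch.util import reshape_vector_to_tensors

flat = torch.arange(10, dtype=torch.float32)
a, b = reshape_vector_to_tensors(flat, [(2, 3), (4,)])
print(a.shape, b.shape)  # torch.Size([2, 3]) torch.Size([4])
```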
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.align_structure","title":"align_structure","text":"align_structure(\n source: Mapping[str, Tensor], target: TorchTensorContainerType\n) -> Dict[str, Tensor]\n
This function transforms target
to have the same structure as source
, i.e., it should be a dictionary with the same keys as source
and each corresponding value in target
should have the same shape as the value in source
.
PARAMETER DESCRIPTION source
The reference dictionary containing PyTorch tensors.
TYPE: Mapping[str, Tensor]
target
The input to be harmonized. It can be a dictionary, tuple, or tensor.
TYPE: TorchTensorContainerType
RETURNS DESCRIPTION Dict[str, Tensor]
The harmonized version of target
.
RAISES DESCRIPTION ValueError
If target
cannot be harmonized to match source
.
Source code in src/pydvl/influence/torch/util.py
def align_structure(\n source: Mapping[str, torch.Tensor],\n target: TorchTensorContainerType,\n) -> Dict[str, torch.Tensor]:\n \"\"\"\n This function transforms `target` to have the same structure as `source`, i.e.,\n it should be a dictionary with the same keys as `source` and each corresponding\n value in `target` should have the same shape as the value in `source`.\n\n Args:\n source: The reference dictionary containing PyTorch tensors.\n target: The input to be harmonized. It can be a dictionary, tuple, or tensor.\n\n Returns:\n The harmonized version of `target`.\n\n Raises:\n ValueError: If `target` cannot be harmonized to match `source`.\n \"\"\"\n\n tangent_dict: Dict[str, torch.Tensor]\n\n if isinstance(target, dict):\n if list(target.keys()) != list(source.keys()):\n raise ValueError(\"The keys in 'target' do not match the keys in 'source'.\")\n\n if [v.shape for v in target.values()] != [v.shape for v in source.values()]:\n raise ValueError(\n \"The shapes of the values in 'target' do not match the shapes \"\n \"of the values in 'source'.\"\n )\n\n tangent_dict = target\n\n elif isinstance(target, tuple) or isinstance(target, list):\n if [v.shape for v in target] != [v.shape for v in source.values()]:\n raise ValueError(\n \"'target' is a tuple/list but its elements' shapes do not match \"\n \"the shapes of the values in 'source'.\"\n )\n\n tangent_dict = dict(zip(source.keys(), target))\n\n elif isinstance(target, torch.Tensor):\n try:\n tangent_dict = dict(\n zip(\n source.keys(),\n reshape_vector_to_tensors(\n target, [p.shape for p in source.values()]\n ),\n )\n )\n except Exception as e:\n raise ValueError(\n f\"'target' is a tensor but cannot be reshaped to match 'source'. \"\n f\"Original error: {e}\"\n )\n\n else:\n raise ValueError(f\"'target' is of type {type(target)} which is not supported.\")\n\n return tangent_dict\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.align_with_model","title":"align_with_model","text":"align_with_model(x: TorchTensorContainerType, model: Module)\n
Aligns an input to the model's parameter structure, i.e. transforms it into a dict with the same keys as model.named_parameters() and matching tensor shapes
PARAMETER DESCRIPTION x
The input to be aligned. It can be a dictionary, tuple, or tensor.
TYPE: TorchTensorContainerType
model
model to use for alignment
TYPE: Module
RETURNS DESCRIPTION The aligned version of x
.
RAISES DESCRIPTION ValueError
If x
cannot be aligned to match the model's parameters.
Source code in src/pydvl/influence/torch/util.py
def align_with_model(x: TorchTensorContainerType, model: torch.nn.Module):\n \"\"\"\n Aligns an input to the model's parameter structure, i.e. transforms it into a dict\n with the same keys as model.named_parameters() and matching tensor shapes\n\n Args:\n x: The input to be aligned. It can be a dictionary, tuple, or tensor.\n model: model to use for alignment\n\n Returns:\n The aligned version of `x`.\n\n Raises:\n ValueError: If `x` cannot be aligned to match the model's parameters .\n\n \"\"\"\n model_params = get_model_parameters(model, detach=False)\n return align_structure(model_params, x)\n
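A small usage sketch (toy model; the flat vector must have as many elements as the model has parameters):

```python
import torch

from pydvl.influence.torch.util import align_with_model

model = torch.nn.Linear(3, 2)             # 2*3 weights + 2 biases = 8 parameters
flat = torch.randn(8)

aligned = align_with_model(flat, model)
for name, tensor in aligned.items():
    print(name, tuple(tensor.shape))      # weight: (2, 3), bias: (2,)
```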
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.flatten_dimensions","title":"flatten_dimensions","text":"flatten_dimensions(\n tensors: Iterable[Tensor],\n shape: Optional[Tuple[int, ...]] = None,\n concat_at: int = -1,\n) -> Tensor\n
Flattens the dimensions of each tensor in the given iterable and concatenates them along a specified dimension.
This function takes an iterable of PyTorch tensors and flattens each tensor. Optionally, each tensor can be reshaped to a specified shape before concatenation. The concatenation is performed along the dimension specified by concat_at
.
PARAMETER DESCRIPTION tensors
An iterable containing PyTorch tensors to be flattened and concatenated.
TYPE: Iterable[Tensor]
shape
A tuple representing the desired shape to which each tensor is reshaped before concatenation. If None, tensors are flattened to 1D.
TYPE: Optional[Tuple[int, ...]]
DEFAULT: None
concat_at
The dimension along which to concatenate the tensors.
TYPE: int
DEFAULT: -1
RETURNS DESCRIPTION Tensor
A single tensor resulting from the concatenation of the input tensors,
Tensor
each either flattened or reshaped as specified.
Example >>> tensors = [torch.tensor([[1, 2], [3, 4]]), torch.tensor([[5, 6], [7, 8]])]\n>>> flatten_dimensions(tensors)\ntensor([1, 2, 3, 4, 5, 6, 7, 8])\n\n>>> flatten_dimensions(tensors, shape=(2, 2), concat_at=0)\ntensor([[1, 2],\n [3, 4],\n [5, 6],\n [7, 8]])\n
Source code in src/pydvl/influence/torch/util.py
def flatten_dimensions(\n tensors: Iterable[torch.Tensor],\n shape: Optional[Tuple[int, ...]] = None,\n concat_at: int = -1,\n) -> torch.Tensor:\n \"\"\"\n Flattens the dimensions of each tensor in the given iterable and concatenates them\n along a specified dimension.\n\n This function takes an iterable of PyTorch tensors and flattens each tensor.\n Optionally, each tensor can be reshaped to a specified shape before concatenation.\n The concatenation is performed along the dimension specified by `concat_at`.\n\n Args:\n tensors: An iterable containing PyTorch tensors to be flattened\n and concatenated.\n shape: A tuple representing the desired shape to which each tensor is reshaped\n before concatenation. If None, tensors are flattened to 1D.\n concat_at: The dimension along which to concatenate the tensors.\n\n Returns:\n A single tensor resulting from the concatenation of the input tensors,\n each either flattened or reshaped as specified.\n\n ??? Example\n ```pycon\n >>> tensors = [torch.tensor([[1, 2], [3, 4]]), torch.tensor([[5, 6], [7, 8]])]\n >>> flatten_dimensions(tensors)\n tensor([1, 2, 3, 4, 5, 6, 7, 8])\n\n >>> flatten_dimensions(tensors, shape=(2, 2), concat_at=0)\n tensor([[1, 2],\n [3, 4],\n [5, 6],\n [7, 8]])\n ```\n \"\"\"\n return torch.cat(\n [t.reshape(-1) if shape is None else t.reshape(*shape) for t in tensors],\n dim=concat_at,\n )\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.torch_dataset_to_dask_array","title":"torch_dataset_to_dask_array","text":"torch_dataset_to_dask_array(\n dataset: Dataset,\n chunk_size: int,\n total_size: Optional[int] = None,\n resulting_dtype: Type[number] = np.float32,\n) -> Tuple[Array, ...]\n
Construct tuple of dask arrays from a PyTorch dataset, using dask.delayed
PARAMETER DESCRIPTION dataset
A PyTorch dataset
TYPE: Dataset
chunk_size
The size of the chunks for the resulting Dask arrays.
TYPE: int
total_size
If the dataset does not implement len, provide the length via this parameter. If None, the length of the dataset is inferred by accessing the dataset once.
TYPE: Optional[int]
DEFAULT: None
resulting_dtype
The dtype of the resulting dask.array.Array
TYPE: Type[number]
DEFAULT: float32
Example import torch\nfrom torch.utils.data import TensorDataset\nx = torch.rand((20, 3))\ny = torch.rand((20, 1))\ndataset = TensorDataset(x, y)\nda_x, da_y = torch_dataset_to_dask_array(dataset, 4)\n
RETURNS DESCRIPTION Tuple[Array, ...]
Tuple of Dask arrays corresponding to each tensor in the dataset.
Source code in src/pydvl/influence/torch/util.py
def torch_dataset_to_dask_array(\n dataset: Dataset,\n chunk_size: int,\n total_size: Optional[int] = None,\n resulting_dtype: Type[np.number] = np.float32,\n) -> Tuple[da.Array, ...]:\n \"\"\"\n Construct tuple of dask arrays from a PyTorch dataset, using dask.delayed\n\n Args:\n dataset: A PyTorch [dataset][torch.utils.data.Dataset]\n chunk_size: The size of the chunks for the resulting Dask arrays.\n total_size: If the dataset does not implement len, provide the length\n via this parameter. If None\n the length of the dataset is inferred via accessing the dataset once.\n resulting_dtype: The dtype of the resulting [dask.array.Array][dask.array.Array]\n\n ??? Example\n ```python\n import torch\n from torch.utils.data import TensorDataset\n x = torch.rand((20, 3))\n y = torch.rand((20, 1))\n dataset = TensorDataset(x, y)\n da_x, da_y = torch_dataset_to_dask_array(dataset, 4)\n ```\n\n Returns:\n Tuple of Dask arrays corresponding to each tensor in the dataset.\n \"\"\"\n\n def _infer_data_len(d_set: Dataset):\n try:\n n_data = len(d_set)\n if total_size is not None and n_data != total_size:\n raise ValueError(\n f\"The number of samples in the dataset ({n_data}), derived \"\n f\"from calling \u00b4len\u00b4, does not match the provided \"\n f\"total number of samples ({total_size}). \"\n f\"Call the function without total_size.\"\n )\n return n_data\n except TypeError as e:\n err_msg = (\n f\"Could not infer the number of samples in the dataset from \"\n f\"calling \u00b4len\u00b4. Original error: {e}.\"\n )\n if total_size is not None:\n logger.warning(\n err_msg\n + f\" Using the provided total number of samples {total_size}.\"\n )\n return total_size\n else:\n logger.warning(\n err_msg + \" Infer the number of samples from the dataset, \"\n \"via iterating the dataset once. \"\n \"This might induce severe overhead, so consider\"\n \"providing total_size, if you know the number of samples \"\n \"beforehand.\"\n )\n idx = 0\n while True:\n try:\n t = d_set[idx]\n if all(_t.numel() == 0 for _t in t):\n return idx\n idx += 1\n\n except IndexError:\n return idx\n\n sample = dataset[0]\n if not isinstance(sample, tuple):\n sample = (sample,)\n\n def _get_chunk(\n start_idx: int, stop_idx: int, d_set: Dataset\n ) -> Tuple[torch.Tensor, ...]:\n try:\n t = d_set[start_idx:stop_idx]\n if not isinstance(t, tuple):\n t = (t,)\n return t # type:ignore\n except Exception:\n nested_tensor_list = [\n [d_set[idx][k] for idx in range(start_idx, stop_idx)]\n for k in range(len(sample))\n ]\n return tuple(map(torch.stack, nested_tensor_list))\n\n n_samples = _infer_data_len(dataset)\n chunk_indices = [\n (i, min(i + chunk_size, n_samples)) for i in range(0, n_samples, chunk_size)\n ]\n delayed_dataset = dask.delayed(dataset)\n delayed_chunks = [\n dask.delayed(partial(_get_chunk, start, stop))(delayed_dataset)\n for (start, stop) in chunk_indices\n ]\n\n delayed_arrays_dict: Dict[int, List[da.Array]] = {k: [] for k in range(len(sample))}\n\n for chunk, (start, stop) in zip(delayed_chunks, chunk_indices):\n for tensor_idx, sample_tensor in enumerate(sample):\n delayed_tensor = da.from_delayed(\n dask.delayed(lambda t: t.cpu().numpy())(chunk[tensor_idx]),\n shape=(stop - start, *sample_tensor.shape),\n dtype=resulting_dtype,\n )\n\n delayed_arrays_dict[tensor_idx].append(delayed_tensor)\n\n return tuple(\n da.concatenate(array_list) for array_list in delayed_arrays_dict.values()\n )\n
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.empirical_cross_entropy_loss_fn","title":"empirical_cross_entropy_loss_fn","text":"empirical_cross_entropy_loss_fn(\n model_output: Tensor, *args, **kwargs\n) -> Tensor\n
Computes the empirical cross entropy loss of the model output. This is the cross entropy loss of the model output without the labels. The function takes all the usual arguments and keyword arguments of the cross entropy loss function, so that it is compatible with the PyTorch cross entropy loss function. However, it ignores everything except the first argument, which is the model output.
PARAMETER DESCRIPTION model_output
The output of the model.
TYPE: Tensor
Source code in src/pydvl/influence/torch/util.py
def empirical_cross_entropy_loss_fn(\n model_output: torch.Tensor, *args, **kwargs\n) -> torch.Tensor:\n \"\"\"\n Computes the empirical cross entropy loss of the model output. This is the\n cross entropy loss of the model output without the labels. The function takes\n all the usual arguments and keyword arguments of the cross entropy loss\n function, so that it is compatible with the PyTorch cross entropy loss\n function. However, it ignores everything except the first argument, which is\n the model output.\n\n Args:\n model_output: The output of the model.\n \"\"\"\n probs_ = torch.softmax(model_output, dim=1)\n log_probs_ = torch.log(probs_)\n log_probs_ = torch.where(\n torch.isfinite(log_probs_), log_probs_, torch.zeros_like(log_probs_)\n )\n return torch.sum(log_probs_ * probs_.detach() ** 0.5)\n
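A small sketch of using this in place of a regular loss: only the first argument (the model output) matters, so a dummy target can be passed to keep the cross-entropy signature.

import torch
from pydvl.influence.torch.util import empirical_cross_entropy_loss_fn

logits = torch.randn(8, 3, requires_grad=True)        # model output: 8 samples, 3 classes
dummy_targets = torch.zeros(8, dtype=torch.long)       # ignored by the function
loss = empirical_cross_entropy_loss_fn(logits, dummy_targets)
loss.backward()                                        # gradients flow through the model output only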
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.safe_torch_linalg_eigh","title":"safe_torch_linalg_eigh","text":"safe_torch_linalg_eigh(*args, **kwargs)\n
A wrapper around torch.linalg.eigh
that safely handles potential runtime errors by raising a custom TorchLinalgEighException
with more context, especially related to the issues reported in https://github.com/pytorch/pytorch/issues/92141.
PARAMETER DESCRIPTION *args
Positional arguments passed to torch.linalg.eigh.
DEFAULT: ()
**kwargs
Keyword arguments passed to torch.linalg.eigh.
DEFAULT: {}
RETURNS DESCRIPTION The result of calling torch.linalg.eigh
with the provided arguments.
RAISES DESCRIPTION TorchLinalgEighException
If a RuntimeError occurs during the execution of torch.linalg.eigh.
Source code in src/pydvl/influence/torch/util.py
@catch_and_raise_exception(RuntimeError, lambda e: TorchLinalgEighException(e))\ndef safe_torch_linalg_eigh(*args, **kwargs):\n \"\"\"\n A wrapper around `torch.linalg.eigh` that safely handles potential runtime errors\n by raising a custom `TorchLinalgEighException` with more context,\n especially related to the issues reported in\n [https://github.com/pytorch/pytorch/issues/92141](\n https://github.com/pytorch/pytorch/issues/92141).\n\n Args:\n *args: Positional arguments passed to `torch.linalg.eigh`.\n **kwargs: Keyword arguments passed to `torch.linalg.eigh`.\n\n Returns:\n The result of calling `torch.linalg.eigh` with the provided arguments.\n\n Raises:\n TorchLinalgEighException: If a `RuntimeError` occurs during the execution of\n `torch.linalg.eigh`.\n \"\"\"\n return torch.linalg.eigh(*args, **kwargs)\n
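A minimal sketch of handling the wrapped error; the import path of TorchLinalgEighException from the same module is an assumption, adjust if it lives elsewhere.

import torch
from pydvl.influence.torch.util import TorchLinalgEighException, safe_torch_linalg_eigh

a = torch.rand(5, 5)
symmetric = a @ a.T                                    # symmetric positive semi-definite matrix
try:
    eigenvalues, eigenvectors = safe_torch_linalg_eigh(symmetric)
except TorchLinalgEighException as e:
    print(f"Eigendecomposition failed: {e}")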
"},{"location":"api/pydvl/influence/torch/util/#pydvl.influence.torch.util.get_model_parameters","title":"get_model_parameters","text":"get_model_parameters(\n model: Module, detach: bool = True, require_grad_only: bool = True\n) -> Dict[str, Tensor]\n
Returns a dictionary of model parameters, optionally restricted to parameters requiring gradients and optionally detaching them from the computation graph.
PARAMETER DESCRIPTION model
The neural network model.
TYPE: Module
detach
Whether to detach the parameters from the computation graph.
TYPE: bool
DEFAULT: True
require_grad_only
Whether to include only parameters that require gradients.
TYPE: bool
DEFAULT: True
RETURNS DESCRIPTION Dict[str, Tensor]
A dict of named model parameters.
Source code in src/pydvl/influence/torch/util.py
def get_model_parameters(\n model: torch.nn.Module, detach: bool = True, require_grad_only: bool = True\n) -> Dict[str, torch.Tensor]:\n \"\"\"\n Returns a dictionary of model parameters, optionally restricted to parameters\n requiring gradients and optionally detaching them from the computation\n graph.\n\n Args:\n model: The neural network model.\n detach: Whether to detach the parameters from the computation graph.\n require_grad_only: Whether to include only parameters that require gradients.\n\n Returns:\n A dict of named model parameters.\n \"\"\"\n\n parameter_dict = {}\n for k, p in model.named_parameters():\n if require_grad_only and not p.requires_grad:\n continue\n parameter_dict[k] = p.detach() if detach else p\n\n return parameter_dict\n
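A minimal sketch illustrating the two flags:

import torch
from pydvl.influence.torch.util import get_model_parameters

model = torch.nn.Linear(3, 2)
model.bias.requires_grad_(False)                       # freeze the bias

params = get_model_parameters(model)                   # detached, gradient-requiring parameters only
assert set(params) == {"weight"}

all_params = get_model_parameters(model, detach=False, require_grad_only=False)
assert set(all_params) == {"weight", "bias"}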
"},{"location":"api/pydvl/parallel/","title":"Parallel","text":""},{"location":"api/pydvl/parallel/#pydvl.parallel","title":"pydvl.parallel","text":"This module provides a common interface to parallelization backends. The list of supported backends is here. Backends should be instantiated directly and passed to the respective valuation method.
We use executors that implement the Executor interface to submit tasks in parallel. The basic high-level pattern is:
from pydvl.parallel import JoblibParallelBackend\n\nparallel_backend = JoblibParallelBackend()\nwith parallel_backend.executor(max_workers=2) as executor:\n future = executor.submit(lambda x: x + 1, 1)\n result = future.result()\nassert result == 2\n
Running a map-style job is also easy:
from pydvl.parallel import JoblibParallelBackend\n\nparallel_backend = JoblibParallelBackend()\nwith parallel_backend.executor(max_workers=2) as executor:\n results = list(executor.map(lambda x: x + 1, range(5)))\nassert results == [1, 2, 3, 4, 5]\n
Passing large objects
When running tasks which accept heavy inputs, it is important to first use put()
on the object and use the returned reference as argument to the callable within submit()
. For example:
u_ref = parallel_backend.put(u)\n...\nexecutor.submit(task, utility=u_ref)\n
Note that task()
does not need to be changed in any way: the backend will get()
the object and pass it to the function upon invocation. There is an alternative map-reduce implementation MapReduceJob which internally uses joblib's higher level API with Parallel()
which then indirectly also supports the use of Dask and Ray.
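A fuller sketch of the pattern above with the joblib backend; the large array and the summing task are made up for illustration:

import numpy as np
from pydvl.parallel import JoblibParallelBackend

parallel_backend = JoblibParallelBackend()
heavy_input = np.arange(1_000_000)                 # stand-in for a large object such as a Utility
input_ref = parallel_backend.put(heavy_input)      # store once, pass the reference to tasks

with parallel_backend.executor(max_workers=2) as executor:
    future = executor.submit(lambda a: a.sum(), input_ref)
    total = future.result()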
"},{"location":"api/pydvl/parallel/backend/","title":"Backend","text":""},{"location":"api/pydvl/parallel/backend/#pydvl.parallel.backend","title":"pydvl.parallel.backend","text":""},{"location":"api/pydvl/parallel/backend/#pydvl.parallel.backend.CancellationPolicy","title":"CancellationPolicy","text":" Bases: Flag
Policy to use when cancelling futures after exiting an Executor.
Note
Not all backends support all policies.
ATTRIBUTE DESCRIPTION NONE
Do not cancel any futures.
PENDING
Cancel all pending futures, but not running ones.
RUNNING
Cancel all running futures, but not pending ones.
ALL
Cancel all pending and running futures.
"},{"location":"api/pydvl/parallel/backend/#pydvl.parallel.backend.ParallelBackend","title":"ParallelBackend","text":"Abstract base class for all parallel backends.
"},{"location":"api/pydvl/parallel/backend/#pydvl.parallel.backend.ParallelBackend.executor","title":"executor abstractmethod
classmethod
","text":"executor(\n max_workers: int | None = None,\n *,\n config: ParallelConfig | None = None,\n cancel_futures: CancellationPolicy | bool = CancellationPolicy.PENDING\n) -> Executor\n
Returns a futures executor for the parallel backend.
Source code in src/pydvl/parallel/backend.py
@classmethod\n@abstractmethod\ndef executor(\n cls,\n max_workers: int | None = None,\n *,\n config: ParallelConfig | None = None,\n cancel_futures: CancellationPolicy | bool = CancellationPolicy.PENDING,\n) -> Executor:\n \"\"\"Returns a futures executor for the parallel backend.\"\"\"\n ...\n
"},{"location":"api/pydvl/parallel/backend/#pydvl.parallel.backend.init_parallel_backend","title":"init_parallel_backend","text":"init_parallel_backend(\n config: ParallelConfig | None = None, backend_name: str | None = None\n) -> ParallelBackend\n
Initializes the parallel backend and returns an instance of it.
The following example creates a parallel backend instance with the default configuration, which is a local joblib backend.
If you don't pass any arguments, then by default it will instantiate the JoblibParallelBackend:
Example parallel_backend = init_parallel_backend()\n
To create a parallel backend instance with, for example, ray as the backend, pass the backend name as a string:
Example parallel_backend = init_parallel_backend(backend_name=\"ray\")\n
The following is an example of the deprecated way for instantiating a parallel backend:
Example config = ParallelConfig()\nparallel_backend = init_parallel_backend(config)\n
PARAMETER DESCRIPTION backend_name
Name of the backend to instantiate.
TYPE: str | None
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: ParallelConfig | None
DEFAULT: None
Source code in src/pydvl/parallel/backend.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef init_parallel_backend(\n config: ParallelConfig | None = None, backend_name: str | None = None\n) -> ParallelBackend:\n \"\"\"Initializes the parallel backend and returns an instance of it.\n\n The following example creates a parallel backend instance with the default\n configuration, which is a local joblib backend.\n\n If you don't pass any arguments, then by default it will instantiate\n the JoblibParallelBackend:\n\n ??? Example\n ```python\n parallel_backend = init_parallel_backend()\n ```\n\n To create a parallel backend instance with for example `ray` as a backend,\n you can pass the backend name as a string:.\n\n ??? Example\n ```python\n parallel_backend = init_parallel_backend(backend_name=\"ray\")\n ```\n\n\n The following is an example of the deprecated\n way for instantiating a parallel backend:\n\n ??? Example\n ``` python\n config = ParallelConfig()\n parallel_backend = init_parallel_backend(config)\n ```\n\n Args:\n backend_name: Name of the backend to instantiate.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n\n\n \"\"\"\n if backend_name is None:\n if config is None:\n backend_name = \"joblib\"\n else:\n backend_name = config.backend\n\n try:\n parallel_backend_cls = ParallelBackend.BACKENDS[backend_name]\n except KeyError:\n raise NotImplementedError(f\"Unexpected parallel backend {backend_name}\")\n return parallel_backend_cls(config) # type: ignore\n
"},{"location":"api/pydvl/parallel/backend/#pydvl.parallel.backend.available_cpus","title":"available_cpus","text":"available_cpus() -> int\n
Platform-independent count of available cores.
FIXME: do we really need this or is os.cpu_count
enough? Is this portable?
RETURNS DESCRIPTION int
Number of cores, or 1 if it is not possible to determine.
Source code in src/pydvl/parallel/backend.py
def available_cpus() -> int:\n \"\"\"Platform-independent count of available cores.\n\n FIXME: do we really need this or is `os.cpu_count` enough? Is this portable?\n\n Returns:\n Number of cores, or 1 if it is not possible to determine.\n \"\"\"\n from platform import system\n\n if system() != \"Linux\":\n return os.cpu_count() or 1\n return len(os.sched_getaffinity(0)) # type: ignore\n
"},{"location":"api/pydvl/parallel/config/","title":"Config","text":""},{"location":"api/pydvl/parallel/config/#pydvl.parallel.config","title":"pydvl.parallel.config","text":""},{"location":"api/pydvl/parallel/config/#pydvl.parallel.config.ParallelConfig","title":"ParallelConfig dataclass
","text":"ParallelConfig(\n backend: Literal[\"joblib\", \"ray\"] = \"joblib\",\n address: Optional[Union[str, Tuple[str, int]]] = None,\n n_cpus_local: Optional[int] = None,\n logging_level: Optional[int] = None,\n wait_timeout: float = 1.0,\n)\n
Configuration for parallel computation backend.
PARAMETER DESCRIPTION backend
Type of backend to use. Defaults to 'joblib'
TYPE: Literal['joblib', 'ray']
DEFAULT: 'joblib'
address
(DEPRECATED) Address of existing remote or local cluster to use.
TYPE: Optional[Union[str, Tuple[str, int]]]
DEFAULT: None
n_cpus_local
(DEPRECATED) Number of CPUs to use when creating a local ray cluster. This has no effect when using an existing ray cluster.
TYPE: Optional[int]
DEFAULT: None
logging_level
(DEPRECATED) Logging level for the parallel backend's worker.
TYPE: Optional[int]
DEFAULT: None
wait_timeout
(DEPRECATED) Timeout in seconds for waiting on futures.
TYPE: float
DEFAULT: 1.0
"},{"location":"api/pydvl/parallel/map_reduce/","title":"Map reduce","text":""},{"location":"api/pydvl/parallel/map_reduce/#pydvl.parallel.map_reduce","title":"pydvl.parallel.map_reduce","text":"This module contains a wrapper around joblib's Parallel()
class that makes it easy to run map-reduce jobs.
Deprecation
This interface might be deprecated or changed in a future release before 1.0
"},{"location":"api/pydvl/parallel/map_reduce/#pydvl.parallel.map_reduce.MapReduceJob","title":"MapReduceJob","text":"MapReduceJob(\n inputs: Union[Collection[T], T],\n map_func: MapFunction[R],\n reduce_func: ReduceFunction[R] = identity,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n *,\n map_kwargs: Optional[Dict] = None,\n reduce_kwargs: Optional[Dict] = None,\n n_jobs: int = -1,\n timeout: Optional[float] = None\n)\n
Bases: Generic[T, R]
Takes an embarrassingly parallel fun and runs it in n_jobs
parallel jobs, splitting the data evenly into a number of chunks equal to the number of jobs.
Typing information for objects of this class requires the type of the inputs that are split for map_func
and the type of its output.
PARAMETER DESCRIPTION inputs
The input that will be split and passed to map_func
. If it is not a sequence object, it will be repeated n_jobs times.
TYPE: Union[Collection[T], T]
map_func
Function that will be applied to the input chunks in each job.
TYPE: MapFunction[R]
reduce_func
Function that will be applied to the results of map_func
to reduce them.
TYPE: ReduceFunction[R]
DEFAULT: identity
map_kwargs
Keyword arguments that will be passed to map_func
in each job. Alternatively, one can use functools.partial.
TYPE: Optional[Dict]
DEFAULT: None
reduce_kwargs
Keyword arguments that will be passed to reduce_func
in each job. Alternatively, one can use functools.partial.
TYPE: Optional[Dict]
DEFAULT: None
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
n_jobs
Number of parallel jobs to run. Does not accept 0
TYPE: int
DEFAULT: -1
Example A simple usage example with 2 jobs:
>>> from pydvl.parallel import MapReduceJob\n>>> import numpy as np\n>>> map_reduce_job: MapReduceJob[np.ndarray, np.ndarray] = MapReduceJob(\n... np.arange(5),\n... map_func=np.sum,\n... reduce_func=np.sum,\n... n_jobs=2,\n... )\n>>> map_reduce_job()\n10\n
When passed a single object as input, it will be repeated for each job:
>>> from pydvl.parallel import MapReduceJob\n>>> import numpy as np\n>>> map_reduce_job: MapReduceJob[int, np.ndarray] = MapReduceJob(\n... 5,\n... map_func=lambda x: np.array([x]),\n... reduce_func=np.sum,\n... n_jobs=2,\n... )\n>>> map_reduce_job()\n10\n
Source code in src/pydvl/parallel/map_reduce.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef __init__(\n self,\n inputs: Union[Collection[T], T],\n map_func: MapFunction[R],\n reduce_func: ReduceFunction[R] = identity,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n *,\n map_kwargs: Optional[Dict] = None,\n reduce_kwargs: Optional[Dict] = None,\n n_jobs: int = -1,\n timeout: Optional[float] = None,\n):\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n\n self.parallel_backend = parallel_backend\n\n self.timeout = timeout\n\n self._n_jobs = -1\n # This uses the setter defined below\n self.n_jobs = n_jobs\n\n self.inputs_ = inputs\n\n self.map_kwargs = map_kwargs if map_kwargs is not None else dict()\n self.reduce_kwargs = reduce_kwargs if reduce_kwargs is not None else dict()\n\n self._map_func = reduce(maybe_add_argument, [\"job_id\", \"seed\"], map_func)\n self._reduce_func = reduce_func\n
"},{"location":"api/pydvl/parallel/map_reduce/#pydvl.parallel.map_reduce.MapReduceJob.n_jobs","title":"n_jobs property
writable
","text":"n_jobs: int\n
Effective number of jobs according to the used ParallelBackend instance.
"},{"location":"api/pydvl/parallel/map_reduce/#pydvl.parallel.map_reduce.MapReduceJob.__call__","title":"__call__","text":"__call__(seed: Optional[Union[Seed, SeedSequence]] = None) -> R\n
Runs the map-reduce job.
PARAMETER DESCRIPTION seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Union[Seed, SeedSequence]]
DEFAULT: None
RETURNS DESCRIPTION R
The result of the reduce function.
Source code in src/pydvl/parallel/map_reduce.py
def __call__(\n self,\n seed: Optional[Union[Seed, SeedSequence]] = None,\n) -> R:\n \"\"\"\n Runs the map-reduce job.\n\n Args:\n seed: Either an instance of a numpy random number generator or a seed for\n it.\n\n Returns:\n The result of the reduce function.\n \"\"\"\n seed_seq = ensure_seed_sequence(seed)\n\n if hasattr(self.parallel_backend, \"_joblib_backend_name\"):\n backend = getattr(self.parallel_backend, \"_joblib_backend_name\")\n else:\n warnings.warn(\n \"Parallel backend \"\n f\"{self.parallel_backend.__class__.__name__}. \"\n \"should have a `_joblib_backend_name` attribute in order to work \"\n \"property with MapReduceJob. \"\n \"Defaulting to joblib loky backend\"\n )\n backend = \"loky\"\n\n with Parallel(backend=backend, prefer=\"processes\") as parallel:\n chunks = self._chunkify(self.inputs_, n_chunks=self.n_jobs)\n map_results: List[R] = parallel(\n delayed(self._map_func)(\n next_chunk, job_id=j, seed=seed, **self.map_kwargs\n )\n for j, (next_chunk, seed) in enumerate(\n zip(chunks, seed_seq.spawn(len(chunks)))\n )\n )\n\n reduce_results: R = self._reduce_func(map_results, **self.reduce_kwargs)\n return reduce_results\n
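A sketch of seeded, reproducible runs; noisy_sum is a made-up map function that uses the per-chunk seed passed in by MapReduceJob:

import numpy as np
from pydvl.parallel import MapReduceJob

def noisy_sum(chunk, job_id=None, seed=None):
    # made-up map function using the per-chunk seed provided by MapReduceJob
    rng = np.random.default_rng(seed)
    return float(chunk.sum() + rng.normal())

job: MapReduceJob[np.ndarray, float] = MapReduceJob(
    np.arange(10), map_func=noisy_sum, reduce_func=sum, n_jobs=2
)
assert job(seed=42) == job(seed=42)                # the same seed yields the same result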
"},{"location":"api/pydvl/parallel/backends/","title":"Backends","text":""},{"location":"api/pydvl/parallel/backends/#pydvl.parallel.backends","title":"pydvl.parallel.backends","text":""},{"location":"api/pydvl/parallel/backends/joblib/","title":"Joblib","text":""},{"location":"api/pydvl/parallel/backends/joblib/#pydvl.parallel.backends.joblib","title":"pydvl.parallel.backends.joblib","text":""},{"location":"api/pydvl/parallel/backends/joblib/#pydvl.parallel.backends.joblib.JoblibParallelBackend","title":"JoblibParallelBackend","text":"JoblibParallelBackend(config: ParallelConfig | None = None)\n
Bases: ParallelBackend
Class used to wrap joblib to make it transparent to algorithms.
Example
from pydvl.parallel import JoblibParallelBackend\nparallel_backend = JoblibParallelBackend()\n
Source code in src/pydvl/parallel/backends/joblib.py
@deprecated(\n target=True,\n args_mapping={\"config\": None},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef __init__(self, config: ParallelConfig | None = None) -> None:\n n_jobs: int | None = None\n if config is not None:\n n_jobs = config.n_cpus_local\n self.config = {\n \"n_jobs\": n_jobs,\n }\n
"},{"location":"api/pydvl/parallel/backends/joblib/#pydvl.parallel.backends.joblib.JoblibParallelBackend.executor","title":"executor classmethod
","text":"executor(\n max_workers: int | None = None,\n *,\n config: ParallelConfig | None = None,\n cancel_futures: CancellationPolicy | bool = CancellationPolicy.NONE\n) -> Executor\n
Returns a futures executor for the parallel backend.
Example
from pydvl.parallel import JoblibParallelBackend\nparallel_backend = JoblibParallelBackend()\nwith parallel_backend.executor() as executor:\n executor.submit(...)\n
PARAMETER DESCRIPTION max_workers
Maximum number of parallel workers.
TYPE: int | None
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: ParallelConfig | None
DEFAULT: None
cancel_futures
Policy to use when cancelling futures after exiting an Executor.
TYPE: CancellationPolicy | bool
DEFAULT: NONE
RETURNS DESCRIPTION Executor
Instance of _ReusablePoolExecutor.
Source code in src/pydvl/parallel/backends/joblib.py
@classmethod\ndef executor(\n cls,\n max_workers: int | None = None,\n *,\n config: ParallelConfig | None = None,\n cancel_futures: CancellationPolicy | bool = CancellationPolicy.NONE,\n) -> Executor:\n \"\"\"Returns a futures executor for the parallel backend.\n\n !!! Example\n ``` python\n from pydvl.parallel import JoblibParallelBackend\n parallel_backend = JoblibParallelBackend()\n with parallel_backend.executor() as executor:\n executor.submit(...)\n ```\n\n Args:\n max_workers: Maximum number of parallel workers.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n cancel_futures: Policy to use when cancelling futures\n after exiting an Executor.\n\n Returns:\n Instance of [_ReusablePoolExecutor][joblib.externals.loky.reusable_executor._ReusablePoolExecutor].\n \"\"\"\n if config is not None:\n warnings.warn(\n \"The `JoblibParallelBackend` uses deprecated arguments: \"\n \"`config`. They were deprecated since v0.9.0 \"\n \"and will be removed in v0.10.0.\",\n FutureWarning,\n )\n\n if cancel_futures not in (CancellationPolicy.NONE, False):\n warnings.warn(\n \"Cancellation of futures is not supported by the joblib backend\",\n )\n return cast(Executor, get_reusable_executor(max_workers=max_workers))\n
"},{"location":"api/pydvl/parallel/backends/joblib/#pydvl.parallel.backends.joblib.JoblibParallelBackend.wrap","title":"wrap","text":"wrap(fun: Callable, **kwargs) -> Callable\n
Wraps a function as a joblib delayed.
PARAMETER DESCRIPTION fun
the function to wrap
TYPE: Callable
RETURNS DESCRIPTION Callable
The delayed function.
Source code in src/pydvl/parallel/backends/joblib.py
def wrap(self, fun: Callable, **kwargs) -> Callable:\n \"\"\"Wraps a function as a joblib delayed.\n\n Args:\n fun: the function to wrap\n\n Returns:\n The delayed function.\n \"\"\"\n return delayed(fun) # type: ignore\n
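A minimal sketch using the wrapped function with joblib's Parallel directly:

from joblib import Parallel
from pydvl.parallel import JoblibParallelBackend

parallel_backend = JoblibParallelBackend()
square = parallel_backend.wrap(lambda x: x**2)     # a joblib delayed callable
results = Parallel(n_jobs=2)(square(i) for i in range(4))
assert results == [0, 1, 4, 9]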
"},{"location":"api/pydvl/parallel/backends/ray/","title":"Ray","text":""},{"location":"api/pydvl/parallel/backends/ray/#pydvl.parallel.backends.ray","title":"pydvl.parallel.backends.ray","text":""},{"location":"api/pydvl/parallel/backends/ray/#pydvl.parallel.backends.ray.RayParallelBackend","title":"RayParallelBackend","text":"RayParallelBackend(config: ParallelConfig | None = None)\n
Bases: ParallelBackend
Class used to wrap ray to make it transparent to algorithms.
Example
import ray\nfrom pydvl.parallel import RayParallelBackend\nray.init()\nparallel_backend = RayParallelBackend()\n
Source code in src/pydvl/parallel/backends/ray.py
@deprecated(\n target=True,\n args_mapping={\"config\": None},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef __init__(self, config: ParallelConfig | None = None) -> None:\n if not ray.is_initialized():\n raise RuntimeError(\n \"Starting from v0.9.0, ray is no longer automatically initialized. \"\n \"Please use `ray.init()` with the desired configuration \"\n \"before using this class.\"\n )\n # Register ray joblib backend\n register_ray()\n
"},{"location":"api/pydvl/parallel/backends/ray/#pydvl.parallel.backends.ray.RayParallelBackend.executor","title":"executor classmethod
","text":"executor(\n max_workers: int | None = None,\n *,\n config: ParallelConfig | None = None,\n cancel_futures: CancellationPolicy | bool = CancellationPolicy.PENDING\n) -> Executor\n
Returns a futures executor for the parallel backend.
Example
import ray\nfrom pydvl.parallel import RayParallelBackend\nray.init()\nparallel_backend = RayParallelBackend()\nwith parallel_backend.executor() as executor:\n executor.submit(...)\n
PARAMETER DESCRIPTION max_workers
Maximum number of parallel workers.
TYPE: int | None
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: ParallelConfig | None
DEFAULT: None
cancel_futures
Policy to use when cancelling futures after exiting an Executor.
TYPE: CancellationPolicy | bool
DEFAULT: PENDING
RETURNS DESCRIPTION Executor
Instance of RayExecutor.
Source code in src/pydvl/parallel/backends/ray.py
@classmethod\ndef executor(\n cls,\n max_workers: int | None = None,\n *,\n config: ParallelConfig | None = None,\n cancel_futures: CancellationPolicy | bool = CancellationPolicy.PENDING,\n) -> Executor:\n \"\"\"Returns a futures executor for the parallel backend.\n\n !!! Example\n ``` python\n import ray\n from pydvl.parallel import RayParallelBackend\n ray.init()\n parallel_backend = RayParallelBackend()\n with parallel_backend.executor() as executor:\n executor.submit(...)\n ```\n\n Args:\n max_workers: Maximum number of parallel workers.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n cancel_futures: Policy to use when cancelling futures\n after exiting an Executor.\n\n Returns:\n Instance of [RayExecutor][pydvl.parallel.futures.ray.RayExecutor].\n \"\"\"\n # Imported here to avoid circular import errors\n from pydvl.parallel.futures.ray import RayExecutor\n\n if config is not None:\n warnings.warn(\n \"The `RayParallelBackend` uses deprecated arguments: \"\n \"`config`. They were deprecated since v0.9.0 \"\n \"and will be removed in v0.10.0.\",\n FutureWarning,\n )\n\n return RayExecutor(max_workers, cancel_futures=cancel_futures) # type: ignore\n
"},{"location":"api/pydvl/parallel/backends/ray/#pydvl.parallel.backends.ray.RayParallelBackend.wrap","title":"wrap","text":"wrap(fun: Callable, **kwargs) -> Callable\n
Wraps a function as a ray remote.
PARAMETER DESCRIPTION fun
the function to wrap
TYPE: Callable
kwargs
keyword arguments to pass to @ray.remote
DEFAULT: {}
RETURNS DESCRIPTION Callable
The .remote
method of the ray RemoteFunction
.
Source code in src/pydvl/parallel/backends/ray.py
def wrap(self, fun: Callable, **kwargs) -> Callable:\n \"\"\"Wraps a function as a ray remote.\n\n Args:\n fun: the function to wrap\n kwargs: keyword arguments to pass to @ray.remote\n\n Returns:\n The `.remote` method of the ray `RemoteFunction`.\n \"\"\"\n if len(kwargs) > 0:\n return ray.remote(**kwargs)(fun).remote # type: ignore\n return ray.remote(fun).remote # type: ignore\n
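A minimal sketch; num_cpus is a standard ray.remote option, and the cluster is assumed to be initialized beforehand:

import ray
from pydvl.parallel import RayParallelBackend

ray.init()
parallel_backend = RayParallelBackend()
add_one = parallel_backend.wrap(lambda x: x + 1, num_cpus=1)   # the .remote method
object_ref = add_one(41)
assert ray.get(object_ref) == 42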
"},{"location":"api/pydvl/parallel/futures/","title":"Futures","text":""},{"location":"api/pydvl/parallel/futures/#pydvl.parallel.futures","title":"pydvl.parallel.futures","text":""},{"location":"api/pydvl/parallel/futures/#pydvl.parallel.futures.init_executor","title":"init_executor","text":"init_executor(\n max_workers: Optional[int] = None,\n config: Optional[ParallelConfig] = None,\n **kwargs\n) -> Generator[Executor, None, None]\n
Initializes a futures executor for the given parallel configuration.
PARAMETER DESCRIPTION max_workers
Maximum number of concurrent tasks.
TYPE: Optional[int]
DEFAULT: None
config
instance of ParallelConfig with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
kwargs
Other optional parameter that will be passed to the executor.
DEFAULT: {}
Examples from pydvl.parallel.futures import init_executor, ParallelConfig\n\nconfig = ParallelConfig(backend=\"ray\")\nwith init_executor(max_workers=1, config=config) as executor:\n future = executor.submit(lambda x: x + 1, 1)\n result = future.result()\nassert result == 2\n
from pydvl.parallel.futures import init_executor\nwith init_executor() as executor:\n results = list(executor.map(lambda x: x + 1, range(5)))\nassert results == [1, 2, 3, 4, 5]\n
Source code in src/pydvl/parallel/futures/__init__.py
@contextmanager\n@deprecated(\n target=None,\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef init_executor(\n max_workers: Optional[int] = None,\n config: Optional[ParallelConfig] = None,\n **kwargs,\n) -> Generator[Executor, None, None]:\n \"\"\"Initializes a futures executor for the given parallel configuration.\n\n Args:\n max_workers: Maximum number of concurrent tasks.\n config: instance of [ParallelConfig][pydvl.utils.config.ParallelConfig]\n with cluster address, number of cpus, etc.\n kwargs: Other optional parameter that will be passed to the executor.\n\n\n ??? Examples\n ``` python\n from pydvl.parallel.futures import init_executor, ParallelConfig\n\n config = ParallelConfig(backend=\"ray\")\n with init_executor(max_workers=1, config=config) as executor:\n future = executor.submit(lambda x: x + 1, 1)\n result = future.result()\n assert result == 2\n ```\n ``` python\n from pydvl.parallel.futures import init_executor\n with init_executor() as executor:\n results = list(executor.map(lambda x: x + 1, range(5)))\n assert results == [1, 2, 3, 4, 5]\n ```\n \"\"\"\n\n if config is None:\n config = ParallelConfig()\n\n try:\n cls = ParallelBackend.BACKENDS[config.backend]\n with cls.executor(max_workers=max_workers, config=config, **kwargs) as e:\n yield e\n except KeyError:\n raise NotImplementedError(f\"Unexpected parallel backend {config.backend}\")\n
"},{"location":"api/pydvl/parallel/futures/ray/","title":"Ray","text":""},{"location":"api/pydvl/parallel/futures/ray/#pydvl.parallel.futures.ray","title":"pydvl.parallel.futures.ray","text":""},{"location":"api/pydvl/parallel/futures/ray/#pydvl.parallel.futures.ray.RayExecutor","title":"RayExecutor","text":"RayExecutor(\n max_workers: Optional[int] = None,\n *,\n config: Optional[ParallelConfig] = None,\n cancel_futures: Union[CancellationPolicy, bool] = CancellationPolicy.ALL\n)\n
Bases: Executor
Asynchronous executor using Ray that implements the concurrent.futures API.
PARAMETER DESCRIPTION max_workers
Maximum number of concurrent tasks. Each task can itself request any number of vCPUs. You must ensure that the product of this value and the n_cpus_per_job parameter passed to submit() does not exceed available cluster resources. If set to None
, it will default to the total number of vCPUs in the ray cluster.
TYPE: Optional[int]
DEFAULT: None
cancel_futures
Select which futures will be cancelled when exiting this context manager. Pending
is the default, which will cancel all pending futures, but not running ones, as done by concurrent.futures.ProcessPoolExecutor. Additionally, All
cancels all pending and running futures, and None
doesn't cancel any. See CancellationPolicy
TYPE: Union[CancellationPolicy, bool]
DEFAULT: ALL
Source code in src/pydvl/parallel/futures/ray.py
@deprecated(\n target=True,\n args_mapping={\"config\": None},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef __init__(\n self,\n max_workers: Optional[int] = None,\n *,\n config: Optional[ParallelConfig] = None,\n cancel_futures: Union[CancellationPolicy, bool] = CancellationPolicy.ALL,\n):\n if max_workers is not None:\n if max_workers <= 0:\n raise ValueError(\"max_workers must be greater than 0\")\n max_workers = max_workers\n\n if isinstance(cancel_futures, CancellationPolicy):\n self._cancel_futures = cancel_futures\n else:\n self._cancel_futures = (\n CancellationPolicy.PENDING\n if cancel_futures\n else CancellationPolicy.NONE\n )\n\n if not ray.is_initialized():\n raise RuntimeError(\n \"Starting from v0.9.0, ray is no longer automatically initialized. \"\n \"Please use `ray.init()` with the desired configuration \"\n \"before using this class.\"\n )\n\n self._max_workers = max_workers\n if self._max_workers is None:\n self._max_workers = int(ray._private.state.cluster_resources()[\"CPU\"])\n\n self._shutdown = False\n self._shutdown_lock = threading.Lock()\n self._queue_lock = threading.Lock()\n self._work_queue: \"queue.Queue[Optional[_WorkItem]]\" = queue.Queue(\n maxsize=self._max_workers\n )\n self._pending_queue: \"queue.SimpleQueue[Optional[_WorkItem]]\" = (\n queue.SimpleQueue()\n )\n\n # Work Item Manager Thread\n self._work_item_manager_thread: Optional[_WorkItemManagerThread] = None\n
"},{"location":"api/pydvl/parallel/futures/ray/#pydvl.parallel.futures.ray.RayExecutor.submit","title":"submit","text":"submit(fn: Callable[..., T], *args, **kwargs) -> Future[T]\n
Submits a callable to be executed with the given arguments.
Schedules the callable to be executed as fn(*args, **kwargs) and returns a Future instance representing the execution of the callable.
PARAMETER DESCRIPTION fn
Callable.
TYPE: Callable[..., T]
args
Positional arguments that will be passed to fn
.
DEFAULT: ()
kwargs
Keyword arguments that will be passed to fn
. It can also optionally contain options for the ray remote function as a dictionary as the keyword argument remote_function_options
.
DEFAULT: {}
Returns: A Future representing the given call.
RAISES DESCRIPTION RuntimeError
If a task is submitted after the executor has been shut down.
Source code in src/pydvl/parallel/futures/ray.py
def submit(self, fn: Callable[..., T], *args, **kwargs) -> \"Future[T]\":\n r\"\"\"Submits a callable to be executed with the given arguments.\n\n Schedules the callable to be executed as fn(\\*args, \\**kwargs)\n and returns a Future instance representing the execution of the callable.\n\n Args:\n fn: Callable.\n args: Positional arguments that will be passed to `fn`.\n kwargs: Keyword arguments that will be passed to `fn`.\n It can also optionally contain options for the ray remote function\n as a dictionary as the keyword argument `remote_function_options`.\n Returns:\n A Future representing the given call.\n\n Raises:\n RuntimeError: If a task is submitted after the executor has been shut down.\n \"\"\"\n with self._shutdown_lock:\n logger.debug(\"executor acquired shutdown lock\")\n if self._shutdown:\n raise RuntimeError(\"cannot schedule new futures after shutdown\")\n\n logging.debug(\"Creating future and putting work item in work queue\")\n future: \"Future[T]\" = Future()\n remote_function_options = kwargs.pop(\"remote_function_options\", None)\n w = _WorkItem(\n future,\n fn,\n args,\n kwargs,\n remote_function_options=remote_function_options,\n )\n self._put_work_item_in_queue(w)\n # We delay starting the thread until the first call to submit\n self._start_work_item_manager_thread()\n return future\n
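A minimal sketch of submitting a task with per-task ray options via remote_function_options:

import ray
from pydvl.parallel.futures.ray import RayExecutor

ray.init()
with RayExecutor(max_workers=2) as executor:
    future = executor.submit(
        lambda x: x * 2,
        21,
        remote_function_options={"num_cpus": 1},   # forwarded to ray.remote
    )
    assert future.result() == 42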
"},{"location":"api/pydvl/parallel/futures/ray/#pydvl.parallel.futures.ray.RayExecutor.shutdown","title":"shutdown","text":"shutdown(wait: bool = True, *, cancel_futures: Optional[bool] = None) -> None\n
Clean up the resources associated with the Executor.
This method tries to mimic the behaviour of Executor.shutdown while allowing one more value for cancel_futures
which instructs it to use the CancellationPolicy defined upon construction.
PARAMETER DESCRIPTION wait
Whether to wait for pending futures to finish.
TYPE: bool
DEFAULT: True
cancel_futures
Overrides the executor's default policy for cancelling futures on exit. If True
, all pending futures are cancelled, and if False
, no futures are cancelled. If None
(default), the executor's policy set at initialization is used.
TYPE: Optional[bool]
DEFAULT: None
Source code in src/pydvl/parallel/futures/ray.py
def shutdown(\n self, wait: bool = True, *, cancel_futures: Optional[bool] = None\n) -> None:\n \"\"\"Clean up the resources associated with the Executor.\n\n This method tries to mimic the behaviour of\n [Executor.shutdown][concurrent.futures.Executor.shutdown]\n while allowing one more value for ``cancel_futures`` which instructs it\n to use the [CancellationPolicy][pydvl.parallel.backend.CancellationPolicy]\n defined upon construction.\n\n Args:\n wait: Whether to wait for pending futures to finish.\n cancel_futures: Overrides the executor's default policy for\n cancelling futures on exit. If ``True``, all pending futures are\n cancelled, and if ``False``, no futures are cancelled. If ``None``\n (default), the executor's policy set at initialization is used.\n \"\"\"\n logger.debug(\"executor shutting down\")\n with self._shutdown_lock:\n logger.debug(\"executor acquired shutdown lock\")\n self._shutdown = True\n self._cancel_futures = {\n None: self._cancel_futures,\n True: CancellationPolicy.PENDING,\n False: CancellationPolicy.NONE,\n }[cancel_futures]\n\n if wait:\n logger.debug(\"executor waiting for futures to finish\")\n if self._work_item_manager_thread is not None:\n # Putting None in the queue to signal\n # to work item manager thread that we are shutting down\n self._put_work_item_in_queue(None)\n logger.debug(\n \"executor waiting for work item manager thread to terminate\"\n )\n self._work_item_manager_thread.join()\n # To reduce the risk of opening too many files, remove references to\n # objects that use file descriptors.\n self._work_item_manager_thread = None\n del self._work_queue\n del self._pending_queue\n
"},{"location":"api/pydvl/parallel/futures/ray/#pydvl.parallel.futures.ray.RayExecutor.__exit__","title":"__exit__","text":"__exit__(exc_type, exc_val, exc_tb)\n
Exit the runtime context related to the RayExecutor object.
Source code in src/pydvl/parallel/futures/ray.py
def __exit__(self, exc_type, exc_val, exc_tb):\n \"\"\"Exit the runtime context related to the RayExecutor object.\"\"\"\n self.shutdown()\n return False\n
"},{"location":"api/pydvl/reporting/","title":"Reporting","text":""},{"location":"api/pydvl/reporting/#pydvl.reporting","title":"pydvl.reporting","text":""},{"location":"api/pydvl/reporting/plots/","title":"Plots","text":""},{"location":"api/pydvl/reporting/plots/#pydvl.reporting.plots","title":"pydvl.reporting.plots","text":""},{"location":"api/pydvl/reporting/plots/#pydvl.reporting.plots.shaded_mean_std","title":"shaded_mean_std","text":"shaded_mean_std(\n data: ndarray,\n abscissa: Optional[Sequence[Any]] = None,\n num_std: float = 1.0,\n mean_color: Optional[str] = \"dodgerblue\",\n shade_color: Optional[str] = \"lightblue\",\n title: Optional[str] = None,\n xlabel: Optional[str] = None,\n ylabel: Optional[str] = None,\n ax: Optional[Axes] = None,\n **kwargs\n) -> Axes\n
The usual mean \\(\\pm\\) std deviation plot to aggregate runs of experiments.
Deprecation notice
This function is bogus and will be removed in the future in favour of properly computed confidence intervals.
PARAMETER DESCRIPTION data
axis 0 is to be aggregated on (e.g. runs) and axis 1 is the data for each run.
TYPE: ndarray
abscissa
values for the x-axis. Leave empty to use increasing integers.
TYPE: Optional[Sequence[Any]]
DEFAULT: None
num_std
number of standard deviations to shade around the mean.
TYPE: float
DEFAULT: 1.0
mean_color
color for the mean
TYPE: Optional[str]
DEFAULT: 'dodgerblue'
shade_color
color for the shaded region
TYPE: Optional[str]
DEFAULT: 'lightblue'
title
Title text. To use mathematics, use LaTeX notation.
TYPE: Optional[str]
DEFAULT: None
xlabel
Text for the horizontal axis.
TYPE: Optional[str]
DEFAULT: None
ylabel
Text for the vertical axis
TYPE: Optional[str]
DEFAULT: None
ax
If passed, axes object into which to insert the figure. Otherwise, a new figure is created and returned
TYPE: Optional[Axes]
DEFAULT: None
kwargs
these are forwarded to the ax.plot() call for the mean.
DEFAULT: {}
RETURNS DESCRIPTION Axes
The axes used (or created)
Source code in src/pydvl/reporting/plots.py
@deprecated(target=None, deprecated_in=\"0.7.1\", remove_in=\"0.9.0\")\ndef shaded_mean_std(\n data: np.ndarray,\n abscissa: Optional[Sequence[Any]] = None,\n num_std: float = 1.0,\n mean_color: Optional[str] = \"dodgerblue\",\n shade_color: Optional[str] = \"lightblue\",\n title: Optional[str] = None,\n xlabel: Optional[str] = None,\n ylabel: Optional[str] = None,\n ax: Optional[Axes] = None,\n **kwargs,\n) -> Axes:\n r\"\"\"The usual mean \\(\\pm\\) std deviation plot to aggregate runs of\n experiments.\n\n !!! warning \"Deprecation notice\"\n This function is bogus and will be removed in the future in favour of\n properly computed confidence intervals.\n\n Args:\n data: axis 0 is to be aggregated on (e.g. runs) and axis 1 is the\n data for each run.\n abscissa: values for the x-axis. Leave empty to use increasing integers.\n num_std: number of standard deviations to shade around the mean.\n mean_color: color for the mean\n shade_color: color for the shaded region\n title: Title text. To use mathematics, use LaTeX notation.\n xlabel: Text for the horizontal axis.\n ylabel: Text for the vertical axis\n ax: If passed, axes object into which to insert the figure. Otherwise,\n a new figure is created and returned\n kwargs: these are forwarded to the ax.plot() call for the mean.\n\n Returns:\n The axes used (or created)\n \"\"\"\n assert len(data.shape) == 2\n mean = data.mean(axis=0)\n std = num_std * data.std(axis=0)\n\n if ax is None:\n fig, ax = plt.subplots()\n if abscissa is None:\n abscissa = list(range(data.shape[1]))\n\n ax.fill_between(abscissa, mean - std, mean + std, alpha=0.3, color=shade_color)\n ax.plot(abscissa, mean, color=mean_color, **kwargs)\n\n ax.set_title(title)\n ax.set_xlabel(xlabel)\n ax.set_ylabel(ylabel)\n\n return ax\n
"},{"location":"api/pydvl/reporting/plots/#pydvl.reporting.plots.plot_ci_array","title":"plot_ci_array","text":"plot_ci_array(\n data: NDArray,\n level: float,\n type: Literal[\"normal\", \"t\", \"auto\"] = \"normal\",\n abscissa: Optional[Sequence[str]] = None,\n mean_color: Optional[str] = \"dodgerblue\",\n shade_color: Optional[str] = \"lightblue\",\n ax: Optional[Axes] = None,\n **kwargs\n) -> Axes\n
Plot values and a confidence interval from a 2D array.
Supported intervals are based on the normal and the t distributions.
PARAMETER DESCRIPTION data
A 2D array with M different values for each of the N indices.
TYPE: NDArray
level
The confidence level.
TYPE: float
type
The type of confidence interval to use.
TYPE: Literal['normal', 't', 'auto']
DEFAULT: 'normal'
abscissa
The values for the x-axis. Leave empty to use increasing integers.
TYPE: Optional[Sequence[str]]
DEFAULT: None
mean_color
The color of the mean line.
TYPE: Optional[str]
DEFAULT: 'dodgerblue'
shade_color
The color of the confidence interval.
TYPE: Optional[str]
DEFAULT: 'lightblue'
ax
If passed, axes object into which to insert the figure. Otherwise, a new figure is created and the axes returned.
TYPE: Optional[Axes]
DEFAULT: None
**kwargs
Additional arguments to pass to the plot function.
DEFAULT: {}
RETURNS DESCRIPTION Axes
The matplotlib axes.
Source code in src/pydvl/reporting/plots.py
def plot_ci_array(\n data: NDArray,\n level: float,\n type: Literal[\"normal\", \"t\", \"auto\"] = \"normal\",\n abscissa: Optional[Sequence[str]] = None,\n mean_color: Optional[str] = \"dodgerblue\",\n shade_color: Optional[str] = \"lightblue\",\n ax: Optional[plt.Axes] = None,\n **kwargs,\n) -> plt.Axes:\n \"\"\"Plot values and a confidence interval from a 2D array.\n\n Supported intervals are based on the normal and the t distributions.\n\n Args:\n data: A 2D array with M different values for each of the N indices.\n level: The confidence level.\n type: The type of confidence interval to use.\n abscissa: The values for the x-axis. Leave empty to use increasing\n integers.\n mean_color: The color of the mean line.\n shade_color: The color of the confidence interval.\n ax: If passed, axes object into which to insert the figure. Otherwise,\n a new figure is created and the axes returned.\n **kwargs: Additional arguments to pass to the plot function.\n\n Returns:\n The matplotlib axes.\n \"\"\"\n\n m, n = data.shape\n\n means = np.mean(data, axis=0)\n variances = np.var(data, axis=0, ddof=1)\n\n dummy = ValuationResult[np.int_, np.object_](\n algorithm=\"dummy\",\n values=means,\n variances=variances,\n counts=np.ones_like(means, dtype=np.int_) * m,\n indices=np.arange(n),\n data_names=np.array(abscissa, dtype=str)\n if abscissa is not None\n else np.arange(n, dtype=str),\n )\n\n return plot_ci_values(\n dummy,\n level=level,\n type=type,\n mean_color=mean_color,\n shade_color=shade_color,\n ax=ax,\n **kwargs,\n )\n
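A minimal sketch with synthetic data (50 runs of 10 values each); the abscissa labels are made up:

import numpy as np
from pydvl.reporting.plots import plot_ci_array

rng = np.random.default_rng(0)
data = rng.normal(loc=0.5, scale=0.1, size=(50, 10))   # 50 runs for each of 10 indices
ax = plot_ci_array(
    data, level=0.05, type="t", abscissa=[str(i) for i in range(10)]
)
ax.set_title("Mean with t-based confidence interval")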
"},{"location":"api/pydvl/reporting/plots/#pydvl.reporting.plots.plot_ci_values","title":"plot_ci_values","text":"plot_ci_values(\n values: ValuationResult,\n level: float,\n type: Literal[\"normal\", \"t\", \"auto\"] = \"auto\",\n abscissa: Optional[Sequence[str]] = None,\n mean_color: Optional[str] = \"dodgerblue\",\n shade_color: Optional[str] = \"lightblue\",\n ax: Optional[Axes] = None,\n **kwargs\n)\n
Plot values and a confidence interval.
Uses values.data_names
for the x-axis.
Supported intervals are based on the normal and the t distributions.
PARAMETER DESCRIPTION values
The valuation result.
TYPE: ValuationResult
level
The confidence level.
TYPE: float
type
The type of confidence interval to use. If \"auto\", uses \"norm\" if the minimum number of updates for all indices is greater than 30, otherwise uses \"t\".
TYPE: Literal['normal', 't', 'auto']
DEFAULT: 'auto'
abscissa
The values for the x-axis. Leave empty to use increasing integers.
TYPE: Optional[Sequence[str]]
DEFAULT: None
mean_color
The color of the mean line.
TYPE: Optional[str]
DEFAULT: 'dodgerblue'
shade_color
The color of the confidence interval.
TYPE: Optional[str]
DEFAULT: 'lightblue'
ax
If passed, axes object into which to insert the figure. Otherwise, a new figure is created and the axes returned.
TYPE: Optional[Axes]
DEFAULT: None
**kwargs
Additional arguments to pass to the plot function.
DEFAULT: {}
RETURNS DESCRIPTION The matplotlib axes.
Source code in src/pydvl/reporting/plots.py
def plot_ci_values(\n values: ValuationResult,\n level: float,\n type: Literal[\"normal\", \"t\", \"auto\"] = \"auto\",\n abscissa: Optional[Sequence[str]] = None,\n mean_color: Optional[str] = \"dodgerblue\",\n shade_color: Optional[str] = \"lightblue\",\n ax: Optional[plt.Axes] = None,\n **kwargs,\n):\n \"\"\"Plot values and a confidence interval.\n\n Uses `values.data_names` for the x-axis.\n\n Supported intervals are based on the normal and the t distributions.\n\n Args:\n values: The valuation result.\n level: The confidence level.\n type: The type of confidence interval to use. If \"auto\", uses \"norm\" if\n the minimum number of updates for all indices is greater than 30,\n otherwise uses \"t\".\n abscissa: The values for the x-axis. Leave empty to use increasing\n integers.\n mean_color: The color of the mean line.\n shade_color: The color of the confidence interval.\n ax: If passed, axes object into which to insert the figure. Otherwise,\n a new figure is created and the axes returned.\n **kwargs: Additional arguments to pass to the plot function.\n\n Returns:\n The matplotlib axes.\n \"\"\"\n\n ppfs = {\n \"normal\": norm.ppf,\n \"t\": partial(t.ppf, df=values.counts - 1),\n \"auto\": norm.ppf\n if np.min(values.counts) > 30\n else partial(t.ppf, df=values.counts - 1),\n }\n\n try:\n score = ppfs[type](1 - level / 2)\n except KeyError:\n raise ValueError(\n f\"Unknown confidence interval type requested: {type}.\"\n ) from None\n\n if abscissa is None:\n abscissa = [str(i) for i, _ in enumerate(values)]\n bound = score * values.stderr\n\n if ax is None:\n fig, ax = plt.subplots()\n\n ax.fill_between(\n abscissa,\n values.values - bound,\n values.values + bound,\n alpha=0.3,\n color=shade_color,\n )\n ax.plot(abscissa, values.values, color=mean_color, **kwargs)\n return ax\n
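A minimal sketch; the ValuationResult is constructed by hand here (mirroring the dummy built in plot_ci_array above), whereas in practice it would come from a valuation method:

import numpy as np
from pydvl.reporting.plots import plot_ci_values
from pydvl.value.result import ValuationResult

values = ValuationResult(
    algorithm="dummy",
    values=np.array([0.1, 0.3, 0.2]),
    variances=np.array([0.01, 0.02, 0.01]),
    counts=np.array([100, 100, 100]),
    indices=np.arange(3),
    data_names=np.array(["a", "b", "c"], dtype=str),
)
ax = plot_ci_values(values, level=0.05, type="auto")
ax.set_ylabel("Data value")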
"},{"location":"api/pydvl/reporting/plots/#pydvl.reporting.plots.spearman_correlation","title":"spearman_correlation","text":"spearman_correlation(vv: List[OrderedDict], num_values: int, pvalue: float)\n
Simple matrix plots with spearman correlation for each pair in vv.
PARAMETER DESCRIPTION vv
list of OrderedDicts with index: value. Spearman correlation is computed for the keys.
TYPE: List[OrderedDict]
num_values
Use only this many values from the data (from the start of the OrderedDicts)
TYPE: int
pvalue
correlation coefficients for which the p-value is above the Bonferroni-corrected threshold pvalue/len(vv) will be discarded.
TYPE: float
Source code in src/pydvl/reporting/plots.py
def spearman_correlation(vv: List[OrderedDict], num_values: int, pvalue: float):\n \"\"\"Simple matrix plots with spearman correlation for each pair in vv.\n\n Args:\n vv: list of OrderedDicts with index: value. Spearman correlation\n is computed for the keys.\n num_values: Use only these many values from the data (from the start\n of the OrderedDicts)\n pvalue: correlation coefficients for which the p-value is below the\n threshold `pvalue/len(vv)` will be discarded.\n \"\"\"\n r: np.ndarray = np.ndarray((len(vv), len(vv)))\n p: np.ndarray = np.ndarray((len(vv), len(vv)))\n for i, a in enumerate(vv):\n for j, b in enumerate(vv):\n from scipy.stats._stats_py import SpearmanrResult\n\n spearman: SpearmanrResult = sp.stats.spearmanr(\n list(a.keys())[:num_values], list(b.keys())[:num_values]\n )\n r[i][j] = (\n spearman.correlation if spearman.pvalue < pvalue / len(vv) else np.nan\n ) # Bonferroni correction\n p[i][j] = spearman.pvalue\n fig, axs = plt.subplots(1, 2, figsize=(16, 7))\n plot1 = axs[0].matshow(r, vmin=-1, vmax=1)\n axs[0].set_title(f\"Spearman correlation (top {num_values} values)\")\n axs[0].set_xlabel(\"Runs\")\n axs[0].set_ylabel(\"Runs\")\n fig.colorbar(plot1, ax=axs[0])\n plot2 = axs[1].matshow(p, vmin=0, vmax=1)\n axs[1].set_title(\"p-value\")\n axs[1].set_xlabel(\"Runs\")\n axs[1].set_ylabel(\"Runs\")\n fig.colorbar(plot2, ax=axs[1])\n\n return fig\n
"},{"location":"api/pydvl/reporting/plots/#pydvl.reporting.plots.plot_shapley","title":"plot_shapley","text":"plot_shapley(\n df: DataFrame,\n *,\n level: float = 0.05,\n ax: Optional[Axes] = None,\n title: Optional[str] = None,\n xlabel: Optional[str] = None,\n ylabel: Optional[str] = None,\n prefix: Optional[str] = \"data_value\"\n) -> Axes\n
Plots the shapley values, as returned from compute_shapley_values, with error bars corresponding to an \\(\\alpha\\)-level Normal confidence interval.
PARAMETER DESCRIPTION df
dataframe with the shapley values
TYPE: DataFrame
level
confidence level for the error bars
TYPE: float
DEFAULT: 0.05
ax
axes to plot on or None if a new subplots should be created
TYPE: Optional[Axes]
DEFAULT: None
title
string, title of the plot
TYPE: Optional[str]
DEFAULT: None
xlabel
string, x label of the plot
TYPE: Optional[str]
DEFAULT: None
ylabel
string, y label of the plot
TYPE: Optional[str]
DEFAULT: None
RETURNS DESCRIPTION Axes
The axes created or used
Source code in src/pydvl/reporting/plots.py
def plot_shapley(\n df: pd.DataFrame,\n *,\n level: float = 0.05,\n ax: Optional[plt.Axes] = None,\n title: Optional[str] = None,\n xlabel: Optional[str] = None,\n ylabel: Optional[str] = None,\n prefix: Optional[str] = \"data_value\",\n) -> plt.Axes:\n r\"\"\"Plots the shapley values, as returned from\n [compute_shapley_values][pydvl.value.shapley.common.compute_shapley_values],\n with error bars corresponding to an $\\alpha$-level Normal confidence\n interval.\n\n Args:\n df: dataframe with the shapley values\n level: confidence level for the error bars\n ax: axes to plot on or None if a new subplots should be created\n title: string, title of the plot\n xlabel: string, x label of the plot\n ylabel: string, y label of the plot\n\n Returns:\n The axes created or used\n \"\"\"\n if ax is None:\n _, ax = plt.subplots()\n\n yerr = norm.ppf(1 - level / 2) * df[f\"{prefix}_stderr\"]\n\n ax.errorbar(x=df.index, y=df[prefix], yerr=yerr, fmt=\"o\", capsize=6)\n ax.set_xlabel(xlabel)\n ax.set_ylabel(ylabel)\n ax.set_title(title)\n plt.xticks(rotation=60)\n return ax\n
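A minimal sketch with a hypothetical dataframe carrying the columns plot_shapley expects: the value column named by prefix and a matching <prefix>_stderr column.

import pandas as pd
from pydvl.reporting.plots import plot_shapley

df = pd.DataFrame(
    {"data_value": [0.1, 0.3, 0.2], "data_value_stderr": [0.02, 0.05, 0.03]},
    index=["a", "b", "c"],
)
ax = plot_shapley(df, level=0.05, title="Shapley values", xlabel="Sample", ylabel="Value")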
"},{"location":"api/pydvl/reporting/plots/#pydvl.reporting.plots.plot_influence_distribution","title":"plot_influence_distribution","text":"plot_influence_distribution(\n influences: NDArray[float64], index: int, title_extra: str = \"\"\n) -> Axes\n
Plots the histogram of the influence that all samples in the training set have over a single sample index.
PARAMETER DESCRIPTION influences
array of influences (training samples x test samples)
TYPE: NDArray[float64]
index
Index of the test sample for which the influences will be plotted.
TYPE: int
title_extra
Additional text that will be appended to the title.
TYPE: str
DEFAULT: ''
Source code in src/pydvl/reporting/plots.py
def plot_influence_distribution(\n influences: NDArray[np.float64], index: int, title_extra: str = \"\"\n) -> plt.Axes:\n \"\"\"Plots the histogram of the influence that all samples in the training set\n have over a single sample index.\n\n Args:\n influences: array of influences (training samples x test samples)\n index: Index of the test sample for which the influences\n will be plotted.\n title_extra: Additional text that will be appended to the title.\n \"\"\"\n _, ax = plt.subplots()\n ax.hist(influences[:, index], alpha=0.7)\n ax.set_xlabel(\"Influence values\")\n ax.set_ylabel(\"Number of samples\")\n ax.set_title(f\"Distribution of influences {title_extra}\")\n return ax\n
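A short sketch with synthetic data, assuming influences of shape (training samples, test samples) as described above; index selects the test sample:
Example >>> import numpy as np
>>> from pydvl.reporting.plots import plot_influence_distribution
>>> influences = np.random.randn(500, 20)  # (training samples x test samples)
>>> ax = plot_influence_distribution(influences, index=3, title_extra="(test sample 3)")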
"},{"location":"api/pydvl/reporting/plots/#pydvl.reporting.plots.plot_influence_distribution_by_label","title":"plot_influence_distribution_by_label","text":"plot_influence_distribution_by_label(\n influences: NDArray[float64],\n labels: NDArray[float64],\n title_extra: str = \"\",\n)\n
Plots histograms of the influence that all samples in the training set have on a single test sample, separated by the labels of the training samples.
PARAMETER DESCRIPTION influences
array of influences (training samples x test samples)
TYPE: NDArray[float64]
labels
labels for the training set.
TYPE: NDArray[float64]
title_extra
Additional text that will be appended to the title.
TYPE: str
DEFAULT: ''
Source code in src/pydvl/reporting/plots.py
def plot_influence_distribution_by_label(\n influences: NDArray[np.float64], labels: NDArray[np.float64], title_extra: str = \"\"\n):\n \"\"\"Plots the histogram of the influence that all samples in the training set\n have over a single sample index, separated by labels.\n\n Args:\n influences: array of influences (training samples x test samples)\n labels: labels for the training set.\n title_extra: Additional text that will be appended to the title.\n \"\"\"\n _, ax = plt.subplots()\n unique_labels = np.unique(labels)\n for label in unique_labels:\n ax.hist(influences[labels == label], label=label, alpha=0.7)\n ax.set_xlabel(\"Influence values\")\n ax.set_ylabel(\"Number of samples\")\n ax.set_title(f\"Distribution of influences {title_extra}\")\n ax.legend()\n plt.show()\n
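A sketch with synthetic data. Note that a one-dimensional array of influences (one value per training sample) is passed here, matching how the function masks it with labels == label:
Example >>> import numpy as np
>>> from pydvl.reporting.plots import plot_influence_distribution_by_label
>>> influences = np.random.randn(500)  # one influence value per training sample
>>> labels = np.random.randint(0, 3, size=500).astype(np.float64)
>>> plot_influence_distribution_by_label(influences, labels, title_extra="(synthetic)")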
"},{"location":"api/pydvl/reporting/scores/","title":"Scores","text":""},{"location":"api/pydvl/reporting/scores/#pydvl.reporting.scores","title":"pydvl.reporting.scores","text":""},{"location":"api/pydvl/reporting/scores/#pydvl.reporting.scores.compute_removal_score","title":"compute_removal_score","text":"compute_removal_score(\n u: Utility,\n values: ValuationResult,\n percentages: Union[NDArray[float64], Iterable[float]],\n *,\n remove_best: bool = False,\n progress: bool = False\n) -> Dict[float, float]\n
Fits model and computes score on the test set after incrementally removing a percentage of data points from the training set, based on their values.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
values
Data values of data instances in the training set.
TYPE: ValuationResult
percentages
Sequence of removal percentages.
TYPE: Union[NDArray[float64], Iterable[float]]
remove_best
If True, removes data points in order of decreasing valuation.
TYPE: bool
DEFAULT: False
progress
If True, display a progress bar.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION Dict[float, float]
Dictionary that maps the percentages to their respective scores.
Source code in src/pydvl/reporting/scores.py
def compute_removal_score(\n u: Utility,\n values: ValuationResult,\n percentages: Union[NDArray[np.float64], Iterable[float]],\n *,\n remove_best: bool = False,\n progress: bool = False,\n) -> Dict[float, float]:\n r\"\"\"Fits model and computes score on the test set after incrementally removing\n a percentage of data points from the training set, based on their values.\n\n Args:\n u: Utility object with model, data, and scoring function.\n values: Data values of data instances in the training set.\n percentages: Sequence of removal percentages.\n remove_best: If True, removes data points in order of decreasing valuation.\n progress: If True, display a progress bar.\n\n Returns:\n Dictionary that maps the percentages to their respective scores.\n \"\"\"\n # Sanity checks\n if np.any([x >= 1.0 or x < 0.0 for x in percentages]):\n raise ValueError(\"All percentages should be in the range [0.0, 1.0)\")\n\n if len(values) != len(u.data.indices):\n raise ValueError(\n f\"The number of values, {len(values) }, should be equal to the number of data indices, {len(u.data.indices)}\"\n )\n\n scores = {}\n\n # We sort in descending order if we want to remove the best values\n values.sort(reverse=remove_best)\n\n for pct in tqdm(percentages, disable=not progress, desc=\"Removal Scores\"):\n n_removal = int(pct * len(u.data))\n indices = values.indices[n_removal:]\n score = u(indices)\n scores[pct] = score\n return scores\n
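A usage sketch, assuming u is a Utility and values a ValuationResult for its training data computed beforehand (e.g. with one of the Shapley methods):
Example >>> import numpy as np
>>> from pydvl.reporting.scores import compute_removal_score
>>> percentages = np.arange(0, 0.5, 0.05)
>>> scores = compute_removal_score(u, values, percentages, remove_best=True)
>>> # scores maps each removal percentage to the score of the model
>>> # retrained on the remaining data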
"},{"location":"api/pydvl/utils/","title":"Utils","text":""},{"location":"api/pydvl/utils/#pydvl.utils","title":"pydvl.utils","text":""},{"location":"api/pydvl/utils/config/","title":"Config","text":""},{"location":"api/pydvl/utils/config/#pydvl.utils.config","title":"pydvl.utils.config","text":""},{"location":"api/pydvl/utils/config/#pydvl.utils.config.ParallelConfig","title":"ParallelConfig dataclass
","text":"ParallelConfig(\n backend: Literal[\"joblib\", \"ray\"] = \"joblib\",\n address: Optional[Union[str, Tuple[str, int]]] = None,\n n_cpus_local: Optional[int] = None,\n logging_level: Optional[int] = None,\n wait_timeout: float = 1.0,\n)\n
Configuration for parallel computation backend.
PARAMETER DESCRIPTION backend
Type of backend to use. Defaults to 'joblib'
TYPE: Literal['joblib', 'ray']
DEFAULT: 'joblib'
address
(DEPRECATED) Address of existing remote or local cluster to use.
TYPE: Optional[Union[str, Tuple[str, int]]]
DEFAULT: None
n_cpus_local
(DEPRECATED) Number of CPUs to use when creating a local ray cluster. This has no effect when using an existing ray cluster.
TYPE: Optional[int]
DEFAULT: None
logging_level
(DEPRECATED) Logging level for the parallel backend's worker.
TYPE: Optional[int]
DEFAULT: None
wait_timeout
(DEPRECATED) Timeout in seconds for waiting on futures.
TYPE: float
DEFAULT: 1.0
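A minimal construction sketch; backend is the only field in the table above that is not deprecated:
Example >>> from pydvl.utils.config import ParallelConfig
>>> config = ParallelConfig(backend="joblib")
The resulting object is then typically passed to valuation methods that accept a config argument.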
"},{"location":"api/pydvl/utils/config/#pydvl.utils.config.CachedFuncConfig","title":"CachedFuncConfig dataclass
","text":"CachedFuncConfig(\n hash_prefix: Optional[str] = None,\n ignore_args: Collection[str] = list(),\n time_threshold: float = 0.3,\n allow_repeated_evaluations: bool = False,\n rtol_stderr: float = 0.1,\n min_repetitions: int = 3,\n)\n
Configuration for cached functions and methods, providing memoization of function calls.
Instances of this class are typically used as arguments for the construction of a Utility.
PARAMETER DESCRIPTION hash_prefix
Optional string prefix that will be prepended to the cache key. This can be provided in order to guarantee cache reuse across runs.
TYPE: Optional[str]
DEFAULT: None
ignore_args
Do not take these keyword arguments into account when hashing the wrapped function for usage as key. This allows sharing the cache among different jobs for the same experiment run if the callable happens to have \"nuisance\" parameters like job_id
which do not affect the result of the computation.
TYPE: Collection[str]
DEFAULT: list()
time_threshold
Computations taking less time than this many seconds are not cached. A value of 0 means that it will always cache results.
TYPE: float
DEFAULT: 0.3
allow_repeated_evaluations
If True
, repeated calls to a function with the same arguments will be allowed and outputs averaged until the running standard deviation of the mean stabilizes below rtol_stderr * mean
.
TYPE: bool
DEFAULT: False
rtol_stderr
relative tolerance for repeated evaluations. More precisely, memcached() will stop evaluating the function once the standard deviation of the mean is smaller than rtol_stderr * mean
.
TYPE: float
DEFAULT: 0.1
min_repetitions
minimum number of times that a function evaluation on the same arguments is repeated before returning cached values. Useful for stochastic functions only. If the model training is very noisy, set this number to higher values to reduce variance.
TYPE: int
DEFAULT: 3
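A construction sketch, e.g. to ignore a nuisance job_id argument and to cache even fast evaluations:
Example >>> from pydvl.utils.config import CachedFuncConfig
>>> cache_config = CachedFuncConfig(ignore_args=["job_id"], time_threshold=0.0)
As noted above, such an instance is typically passed when constructing a Utility.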
"},{"location":"api/pydvl/utils/dataset/","title":"Dataset","text":""},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset","title":"pydvl.utils.dataset","text":"This module contains convenience classes to handle data and groups thereof.
Shapley and Least Core value computations require evaluation of a scoring function (the utility). This is typically the performance of the model on a test set (as an approximation to its true expected performance). It is therefore convenient to keep both the training data and the test data together to be passed around to methods in shapley and least_core. This is done with Dataset.
This abstraction layer also allows seamlessly grouping data points together when one is interested in computing their value as a group; see GroupedDataset.
Objects of both types are used to construct a Utility object.
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.Dataset","title":"Dataset","text":"Dataset(\n x_train: Union[NDArray, DataFrame],\n y_train: Union[NDArray, DataFrame],\n x_test: Union[NDArray, DataFrame],\n y_test: Union[NDArray, DataFrame],\n feature_names: Optional[Sequence[str]] = None,\n target_names: Optional[Sequence[str]] = None,\n data_names: Optional[Sequence[str]] = None,\n description: Optional[str] = None,\n is_multi_output: bool = False,\n)\n
A convenience class to handle datasets.
It holds a dataset, split into training and test data, together with optional metadata: feature names, data point names and a description.
PARAMETER DESCRIPTION x_train
training data
TYPE: Union[NDArray, DataFrame]
y_train
labels for training data
TYPE: Union[NDArray, DataFrame]
x_test
test data
TYPE: Union[NDArray, DataFrame]
y_test
labels for test data
TYPE: Union[NDArray, DataFrame]
feature_names
name of the features of input data
TYPE: Optional[Sequence[str]]
DEFAULT: None
target_names
names of the features of target data
TYPE: Optional[Sequence[str]]
DEFAULT: None
data_names
names assigned to data points. For example, if the dataset is a time series, each entry can be a timestamp which can be referenced directly instead of using a row number.
TYPE: Optional[Sequence[str]]
DEFAULT: None
description
A textual description of the dataset.
TYPE: Optional[str]
DEFAULT: None
is_multi_output
set to False
if labels are scalars, or to True
if they are vectors of dimension > 1.
TYPE: bool
DEFAULT: False
Source code in src/pydvl/utils/dataset.py
def __init__(\n self,\n x_train: Union[NDArray, pd.DataFrame],\n y_train: Union[NDArray, pd.DataFrame],\n x_test: Union[NDArray, pd.DataFrame],\n y_test: Union[NDArray, pd.DataFrame],\n feature_names: Optional[Sequence[str]] = None,\n target_names: Optional[Sequence[str]] = None,\n data_names: Optional[Sequence[str]] = None,\n description: Optional[str] = None,\n # FIXME: use same parameter name as in check_X_y()\n is_multi_output: bool = False,\n):\n \"\"\"Constructs a Dataset from data and labels.\n\n Args:\n x_train: training data\n y_train: labels for training data\n x_test: test data\n y_test: labels for test data\n feature_names: name of the features of input data\n target_names: names of the features of target data\n data_names: names assigned to data points.\n For example, if the dataset is a time series, each entry can be a\n timestamp which can be referenced directly instead of using a row\n number.\n description: A textual description of the dataset.\n is_multi_output: set to `False` if labels are scalars, or to\n `True` if they are vectors of dimension > 1.\n \"\"\"\n self.x_train, self.y_train = check_X_y(\n x_train, y_train, multi_output=is_multi_output\n )\n self.x_test, self.y_test = check_X_y(\n x_test, y_test, multi_output=is_multi_output\n )\n\n if x_train.shape[-1] != x_test.shape[-1]:\n raise ValueError(\n f\"Mismatching number of features: \"\n f\"{x_train.shape[-1]} and {x_test.shape[-1]}\"\n )\n if x_train.shape[0] != y_train.shape[0]:\n raise ValueError(\n f\"Mismatching number of samples: \"\n f\"{x_train.shape[-1]} and {x_test.shape[-1]}\"\n )\n if x_test.shape[0] != y_test.shape[0]:\n raise ValueError(\n f\"Mismatching number of samples: \"\n f\"{x_test.shape[-1]} and {y_test.shape[-1]}\"\n )\n\n def make_names(s: str, a: np.ndarray) -> List[str]:\n n = a.shape[1] if len(a.shape) > 1 else 1\n return [f\"{s}{i:0{1 + int(np.log10(n))}d}\" for i in range(1, n + 1)]\n\n self.feature_names = feature_names\n self.target_names = target_names\n\n if self.feature_names is None:\n if isinstance(x_train, pd.DataFrame):\n self.feature_names = x_train.columns.tolist()\n else:\n self.feature_names = make_names(\"x\", x_train)\n\n if self.target_names is None:\n if isinstance(y_train, pd.DataFrame):\n self.target_names = y_train.columns.tolist()\n else:\n self.target_names = make_names(\"y\", y_train)\n\n if len(self.x_train.shape) > 1:\n if (\n len(self.feature_names) != self.x_train.shape[-1]\n or len(self.feature_names) != self.x_test.shape[-1]\n ):\n raise ValueError(\"Mismatching number of features and names\")\n if len(self.y_train.shape) > 1:\n if (\n len(self.target_names) != self.y_train.shape[-1]\n or len(self.target_names) != self.y_test.shape[-1]\n ):\n raise ValueError(\"Mismatching number of targets and names\")\n\n self.description = description or \"No description\"\n self._indices = np.arange(len(self.x_train), dtype=np.int_)\n self._data_names = (\n np.array(data_names, dtype=object)\n if data_names is not None\n else self._indices.astype(object)\n )\n
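A direct-construction sketch with synthetic data; in most cases the from_sklearn and from_arrays class methods documented below are more convenient:
Example >>> import numpy as np
>>> from pydvl.utils import Dataset
>>> x, y = np.random.rand(100, 3), np.random.randint(0, 2, size=100)
>>> dataset = Dataset(
...     x_train=x[:80], y_train=y[:80], x_test=x[80:], y_test=y[80:],
...     feature_names=["f1", "f2", "f3"], description="Synthetic toy data",
... )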
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.Dataset.indices","title":"indices property
","text":"indices: NDArray[int_]\n
Index of positions in data.x_train.
Contiguous integers from 0 to len(Dataset).
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.Dataset.data_names","title":"data_names property
","text":"data_names: NDArray[object_]\n
Names of each individual datapoint.
Used for reporting Shapley values.
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.Dataset.dim","title":"dim property
","text":"dim: int\n
Returns the number of dimensions of a sample.
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.Dataset.get_training_data","title":"get_training_data","text":"get_training_data(\n indices: Optional[Iterable[int]] = None,\n) -> Tuple[NDArray, NDArray]\n
Given a set of indices, returns the training data that refer to those indices.
This is used mainly by Utility to retrieve subsets of the data from indices. It is typically not needed in algorithms.
PARAMETER DESCRIPTION indices
Optional indices that will be used to select points from the training data. If None
, the entire training data will be returned.
TYPE: Optional[Iterable[int]]
DEFAULT: None
RETURNS DESCRIPTION Tuple[NDArray, NDArray]
If indices
is not None
, the selected x and y arrays from the training data. Otherwise, the entire dataset.
Source code in src/pydvl/utils/dataset.py
def get_training_data(\n self, indices: Optional[Iterable[int]] = None\n) -> Tuple[NDArray, NDArray]:\n \"\"\"Given a set of indices, returns the training data that refer to those\n indices.\n\n This is used mainly by [Utility][pydvl.utils.utility.Utility] to retrieve\n subsets of the data from indices. It is typically **not needed in\n algorithms**.\n\n Args:\n indices: Optional indices that will be used to select points from\n the training data. If `None`, the entire training data will be\n returned.\n\n Returns:\n If `indices` is not `None`, the selected x and y arrays from the\n training data. Otherwise, the entire dataset.\n \"\"\"\n if indices is None:\n return self.x_train, self.y_train\n x = self.x_train[indices]\n y = self.y_train[indices]\n return x, y\n
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.Dataset.get_test_data","title":"get_test_data","text":"get_test_data(\n indices: Optional[Iterable[int]] = None,\n) -> Tuple[NDArray, NDArray]\n
Returns the entire test set regardless of the passed indices.
The passed indices will not be used because for data valuation we generally want to score the trained model on the entire test data.
Additionally, the way this method is used in the Utility class, the passed indices will be those of the training data and would not work on the test data.
There may be cases where it is desired to use parts of the test data. In those cases, it is recommended to inherit from Dataset and override get_test_data().
For example, the following snippet shows how one could go about mapping the training data indices into test data indices inside get_test_data():
Example >>> from pydvl.utils import Dataset\n>>> import numpy as np\n>>> class DatasetWithTestDataIndices(Dataset):\n... def get_test_data(self, indices=None):\n... if indices is None:\n... return self.x_test, self.y_test\n... fraction = len(list(indices)) / len(self)\n... mapped_indices = len(self.x_test) / len(self) * np.asarray(indices)\n... mapped_indices = np.unique(mapped_indices.astype(int))\n... return self.x_test[mapped_indices], self.y_test[mapped_indices]\n...\n>>> X = np.random.rand(100, 10)\n>>> y = np.random.randint(0, 2, 100)\n>>> dataset = DatasetWithTestDataIndices.from_arrays(X, y)\n>>> indices = np.random.choice(dataset.indices, 30, replace=False)\n>>> _ = dataset.get_training_data(indices)\n>>> _ = dataset.get_test_data(indices)\n
PARAMETER DESCRIPTION indices
Optional indices into the test data. This argument is unused and is kept only for compatibility with get_training_data().
TYPE: Optional[Iterable[int]]
DEFAULT: None
RETURNS DESCRIPTION Tuple[NDArray, NDArray]
The entire test data.
Source code in src/pydvl/utils/dataset.py
def get_test_data(\n self, indices: Optional[Iterable[int]] = None\n) -> Tuple[NDArray, NDArray]:\n \"\"\"Returns the entire test set regardless of the passed indices.\n\n The passed indices will not be used because for data valuation\n we generally want to score the trained model on the entire test data.\n\n Additionally, the way this method is used in the\n [Utility][pydvl.utils.utility.Utility] class, the passed indices will\n be those of the training data and would not work on the test data.\n\n There may be cases where it is desired to use parts of the test data.\n In those cases, it is recommended to inherit from\n [Dataset][pydvl.utils.dataset.Dataset] and override\n [get_test_data()][pydvl.utils.dataset.Dataset.get_test_data].\n\n For example, the following snippet shows how one could go about\n mapping the training data indices into test data indices\n inside [get_test_data()][pydvl.utils.dataset.Dataset.get_test_data]:\n\n ??? Example\n ```pycon\n >>> from pydvl.utils import Dataset\n >>> import numpy as np\n >>> class DatasetWithTestDataIndices(Dataset):\n ... def get_test_data(self, indices=None):\n ... if indices is None:\n ... return self.x_test, self.y_test\n ... fraction = len(list(indices)) / len(self)\n ... mapped_indices = len(self.x_test) / len(self) * np.asarray(indices)\n ... mapped_indices = np.unique(mapped_indices.astype(int))\n ... return self.x_test[mapped_indices], self.y_test[mapped_indices]\n ...\n >>> X = np.random.rand(100, 10)\n >>> y = np.random.randint(0, 2, 100)\n >>> dataset = DatasetWithTestDataIndices.from_arrays(X, y)\n >>> indices = np.random.choice(dataset.indices, 30, replace=False)\n >>> _ = dataset.get_training_data(indices)\n >>> _ = dataset.get_test_data(indices)\n ```\n\n Args:\n indices: Optional indices into the test data. This argument is\n unused left for compatibility with\n [get_training_data()][pydvl.utils.dataset.Dataset.get_training_data].\n\n Returns:\n The entire test data.\n \"\"\"\n return self.x_test, self.y_test\n
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.Dataset.from_sklearn","title":"from_sklearn classmethod
","text":"from_sklearn(\n data: Bunch,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n **kwargs\n) -> Dataset\n
Constructs a Dataset object from a sklearn.utils.Bunch, as returned by the load_*
functions in scikit-learn toy datasets.
Example >>> from pydvl.utils import Dataset\n>>> from sklearn.datasets import load_boston\n>>> dataset = Dataset.from_sklearn(load_boston())\n
PARAMETER DESCRIPTION data
scikit-learn Bunch object. The following attributes are supported:
data
: covariates. target
: target variables (labels). feature_names
(optional): the feature names. target_names
(optional): the target names. DESCR
(optional): a description.
TYPE: Bunch
train_size
size of the training dataset. Used in train_test_split
TYPE: float
DEFAULT: 0.8
random_state
seed for train / test split
TYPE: Optional[int]
DEFAULT: None
stratify_by_target
If True
, data is split in a stratified fashion, using the target variable as labels. Read more in scikit-learn's user guide.
TYPE: bool
DEFAULT: False
kwargs
Additional keyword arguments to pass to the Dataset constructor. Use this to pass e.g. is_multi_output
.
DEFAULT: {}
RETURNS DESCRIPTION Dataset
Object with the sklearn dataset
Changed in version 0.6.0
Added kwargs to pass to the Dataset constructor.
Source code in src/pydvl/utils/dataset.py
@classmethod\ndef from_sklearn(\n cls,\n data: Bunch,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n **kwargs,\n) -> \"Dataset\":\n \"\"\"Constructs a [Dataset][pydvl.utils.Dataset] object from a\n [sklearn.utils.Bunch][], as returned by the `load_*`\n functions in [scikit-learn toy datasets](https://scikit-learn.org/stable/datasets/toy_dataset.html).\n\n ??? Example\n ```pycon\n >>> from pydvl.utils import Dataset\n >>> from sklearn.datasets import load_boston\n >>> dataset = Dataset.from_sklearn(load_boston())\n ```\n\n Args:\n data: scikit-learn Bunch object. The following attributes are supported:\n\n - `data`: covariates.\n - `target`: target variables (labels).\n - `feature_names` (**optional**): the feature names.\n - `target_names` (**optional**): the target names.\n - `DESCR` (**optional**): a description.\n train_size: size of the training dataset. Used in `train_test_split`\n random_state: seed for train / test split\n stratify_by_target: If `True`, data is split in a stratified\n fashion, using the target variable as labels. Read more in\n [scikit-learn's user guide](https://scikit-learn.org/stable/modules/cross_validation.html#stratification).\n kwargs: Additional keyword arguments to pass to the\n [Dataset][pydvl.utils.Dataset] constructor. Use this to pass e.g. `is_multi_output`.\n\n Returns:\n Object with the sklearn dataset\n\n !!! tip \"Changed in version 0.6.0\"\n Added kwargs to pass to the [Dataset][pydvl.utils.Dataset] constructor.\n \"\"\"\n x_train, x_test, y_train, y_test = train_test_split(\n data.data,\n data.target,\n train_size=train_size,\n random_state=random_state,\n stratify=data.target if stratify_by_target else None,\n )\n return cls(\n x_train,\n y_train,\n x_test,\n y_test,\n feature_names=data.get(\"feature_names\"),\n target_names=data.get(\"target_names\"),\n description=data.get(\"DESCR\"),\n **kwargs,\n )\n
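Note that load_boston, used in the example above, has been removed from scikit-learn (in version 1.2). With recent releases, another bundled dataset can be used instead, for instance:
Example >>> from pydvl.utils import Dataset
>>> from sklearn.datasets import load_diabetes
>>> dataset = Dataset.from_sklearn(load_diabetes(), train_size=0.8, random_state=16)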
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.Dataset.from_arrays","title":"from_arrays classmethod
","text":"from_arrays(\n X: NDArray,\n y: NDArray,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n **kwargs\n) -> Dataset\n
Constructs a Dataset object from X and y numpy arrays as returned by the make_*
functions in sklearn generated datasets.
Example >>> from pydvl.utils import Dataset\n>>> from sklearn.datasets import make_regression\n>>> X, y = make_regression()\n>>> dataset = Dataset.from_arrays(X, y)\n
PARAMETER DESCRIPTION X
numpy array of shape (n_samples, n_features)
TYPE: NDArray
y
numpy array of shape (n_samples,)
TYPE: NDArray
train_size
size of the training dataset. Used in train_test_split
TYPE: float
DEFAULT: 0.8
random_state
seed for train / test split
TYPE: Optional[int]
DEFAULT: None
stratify_by_target
If True
, data is split in a stratified fashion, using the y variable as labels. Read more in sklearn's user guide.
TYPE: bool
DEFAULT: False
kwargs
Additional keyword arguments to pass to the Dataset constructor. Use this to pass e.g. feature_names
or target_names
.
DEFAULT: {}
RETURNS DESCRIPTION Dataset
Object with the passed X and y arrays split across training and test sets.
New in version 0.4.0
Changed in version 0.6.0
Added kwargs to pass to the Dataset constructor.
Source code in src/pydvl/utils/dataset.py
@classmethod\ndef from_arrays(\n cls,\n X: NDArray,\n y: NDArray,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n **kwargs,\n) -> \"Dataset\":\n \"\"\"Constructs a [Dataset][pydvl.utils.Dataset] object from X and y numpy arrays as\n returned by the `make_*` functions in [sklearn generated datasets](https://scikit-learn.org/stable/datasets/sample_generators.html).\n\n ??? Example\n ```pycon\n >>> from pydvl.utils import Dataset\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression()\n >>> dataset = Dataset.from_arrays(X, y)\n ```\n\n Args:\n X: numpy array of shape (n_samples, n_features)\n y: numpy array of shape (n_samples,)\n train_size: size of the training dataset. Used in `train_test_split`\n random_state: seed for train / test split\n stratify_by_target: If `True`, data is split in a stratified fashion,\n using the y variable as labels. Read more in [sklearn's user\n guide](https://scikit-learn.org/stable/modules/cross_validation.html#stratification).\n kwargs: Additional keyword arguments to pass to the\n [Dataset][pydvl.utils.Dataset] constructor. Use this to pass e.g. `feature_names`\n or `target_names`.\n\n Returns:\n Object with the passed X and y arrays split across training and test sets.\n\n !!! tip \"New in version 0.4.0\"\n\n !!! tip \"Changed in version 0.6.0\"\n Added kwargs to pass to the [Dataset][pydvl.utils.Dataset] constructor.\n \"\"\"\n x_train, x_test, y_train, y_test = train_test_split(\n X,\n y,\n train_size=train_size,\n random_state=random_state,\n stratify=y if stratify_by_target else None,\n )\n return cls(x_train, y_train, x_test, y_test, **kwargs)\n
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset","title":"GroupedDataset","text":"GroupedDataset(\n x_train: NDArray,\n y_train: NDArray,\n x_test: NDArray,\n y_test: NDArray,\n data_groups: Sequence,\n feature_names: Optional[Sequence[str]] = None,\n target_names: Optional[Sequence[str]] = None,\n group_names: Optional[Sequence[str]] = None,\n description: Optional[str] = None,\n **kwargs\n)\n
Bases: Dataset
Used for calculating Shapley values of subsets of the data considered as logical units. For instance, one can group by value of a categorical feature, by bin into which a continuous feature falls, or by label.
PARAMETER DESCRIPTION x_train
training data
TYPE: NDArray
y_train
labels of training data
TYPE: NDArray
x_test
test data
TYPE: NDArray
y_test
labels of test data
TYPE: NDArray
data_groups
Iterable of the same length as x_train
containing a group label for each training data point. The label can be of any type, e.g. str
or int
. Data points with the same label will then be grouped by this object and considered as one for effects of valuation.
TYPE: Sequence
feature_names
names of the covariates' features.
TYPE: Optional[Sequence[str]]
DEFAULT: None
target_names
names of the labels or targets y
TYPE: Optional[Sequence[str]]
DEFAULT: None
group_names
names of the groups. If not provided, the labels from data_groups
will be used.
TYPE: Optional[Sequence[str]]
DEFAULT: None
description
A textual description of the dataset
TYPE: Optional[str]
DEFAULT: None
kwargs
Additional keyword arguments to pass to the Dataset constructor.
DEFAULT: {}
Changed in version 0.6.0
Added group_names
and forwarding of kwargs
Source code in src/pydvl/utils/dataset.py
def __init__(\n self,\n x_train: NDArray,\n y_train: NDArray,\n x_test: NDArray,\n y_test: NDArray,\n data_groups: Sequence,\n feature_names: Optional[Sequence[str]] = None,\n target_names: Optional[Sequence[str]] = None,\n group_names: Optional[Sequence[str]] = None,\n description: Optional[str] = None,\n **kwargs,\n):\n \"\"\"Class for grouping datasets.\n\n Used for calculating Shapley values of subsets of the data considered\n as logical units. For instance, one can group by value of a categorical\n feature, by bin into which a continuous feature falls, or by label.\n\n Args:\n x_train: training data\n y_train: labels of training data\n x_test: test data\n y_test: labels of test data\n data_groups: Iterable of the same length as `x_train` containing\n a group label for each training data point. The label can be of any\n type, e.g. `str` or `int`. Data points with the same label will\n then be grouped by this object and considered as one for effects of\n valuation.\n feature_names: names of the covariates' features.\n target_names: names of the labels or targets y\n group_names: names of the groups. If not provided, the labels\n from `data_groups` will be used.\n description: A textual description of the dataset\n kwargs: Additional keyword arguments to pass to the\n [Dataset][pydvl.utils.Dataset] constructor.\n\n !!! tip \"Changed in version 0.6.0\"\n Added `group_names` and forwarding of `kwargs`\n \"\"\"\n super().__init__(\n x_train=x_train,\n y_train=y_train,\n x_test=x_test,\n y_test=y_test,\n feature_names=feature_names,\n target_names=target_names,\n description=description,\n **kwargs,\n )\n\n if len(data_groups) != len(x_train):\n raise ValueError(\n f\"data_groups and x_train must have the same length.\"\n f\"Instead got {len(data_groups)=} and {len(x_train)=}\"\n )\n\n self.groups: OrderedDict[Any, List[int]] = OrderedDict(\n {k: [] for k in set(data_groups)}\n )\n for idx, group in enumerate(data_groups):\n self.groups[group].append(idx)\n self.group_items = list(self.groups.items())\n self._indices = np.arange(len(self.groups.keys()))\n self._data_names = (\n np.array(group_names, dtype=object)\n if group_names is not None\n else np.array(list(self.groups.keys()), dtype=object)\n )\n
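A sketch with synthetic data, grouping each training point by its own label so that valuation is per class rather than per point:
Example >>> import numpy as np
>>> from pydvl.utils import GroupedDataset
>>> x, y = np.random.rand(100, 3), np.random.randint(0, 2, size=100)
>>> grouped = GroupedDataset(
...     x_train=x[:80], y_train=y[:80], x_test=x[80:], y_test=y[80:],
...     data_groups=y[:80].tolist(),
... )
>>> # grouped.indices has one entry per group (here: per class), not per data point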
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset.dim","title":"dim property
","text":"dim: int\n
Returns the number of dimensions of a sample.
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset.indices","title":"indices property
","text":"indices\n
Indices of the groups.
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset.data_names","title":"data_names property
","text":"data_names\n
Names of the groups.
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset.get_test_data","title":"get_test_data","text":"get_test_data(\n indices: Optional[Iterable[int]] = None,\n) -> Tuple[NDArray, NDArray]\n
Returns the entire test set regardless of the passed indices.
The passed indices will not be used because for data valuation we generally want to score the trained model on the entire test data.
Additionally, the way this method is used in the Utility class, the passed indices will be those of the training data and would not work on the test data.
There may be cases where it is desired to use parts of the test data. In those cases, it is recommended to inherit from Dataset and override get_test_data().
For example, the following snippet shows how one could go about mapping the training data indices into test data indices inside get_test_data():
Example >>> from pydvl.utils import Dataset\n>>> import numpy as np\n>>> class DatasetWithTestDataIndices(Dataset):\n... def get_test_data(self, indices=None):\n... if indices is None:\n... return self.x_test, self.y_test\n... fraction = len(list(indices)) / len(self)\n... mapped_indices = len(self.x_test) / len(self) * np.asarray(indices)\n... mapped_indices = np.unique(mapped_indices.astype(int))\n... return self.x_test[mapped_indices], self.y_test[mapped_indices]\n...\n>>> X = np.random.rand(100, 10)\n>>> y = np.random.randint(0, 2, 100)\n>>> dataset = DatasetWithTestDataIndices.from_arrays(X, y)\n>>> indices = np.random.choice(dataset.indices, 30, replace=False)\n>>> _ = dataset.get_training_data(indices)\n>>> _ = dataset.get_test_data(indices)\n
PARAMETER DESCRIPTION indices
Optional indices into the test data. This argument is unused and is kept only for compatibility with get_training_data().
TYPE: Optional[Iterable[int]]
DEFAULT: None
RETURNS DESCRIPTION Tuple[NDArray, NDArray]
The entire test data.
Source code in src/pydvl/utils/dataset.py
def get_test_data(\n self, indices: Optional[Iterable[int]] = None\n) -> Tuple[NDArray, NDArray]:\n \"\"\"Returns the entire test set regardless of the passed indices.\n\n The passed indices will not be used because for data valuation\n we generally want to score the trained model on the entire test data.\n\n Additionally, the way this method is used in the\n [Utility][pydvl.utils.utility.Utility] class, the passed indices will\n be those of the training data and would not work on the test data.\n\n There may be cases where it is desired to use parts of the test data.\n In those cases, it is recommended to inherit from\n [Dataset][pydvl.utils.dataset.Dataset] and override\n [get_test_data()][pydvl.utils.dataset.Dataset.get_test_data].\n\n For example, the following snippet shows how one could go about\n mapping the training data indices into test data indices\n inside [get_test_data()][pydvl.utils.dataset.Dataset.get_test_data]:\n\n ??? Example\n ```pycon\n >>> from pydvl.utils import Dataset\n >>> import numpy as np\n >>> class DatasetWithTestDataIndices(Dataset):\n ... def get_test_data(self, indices=None):\n ... if indices is None:\n ... return self.x_test, self.y_test\n ... fraction = len(list(indices)) / len(self)\n ... mapped_indices = len(self.x_test) / len(self) * np.asarray(indices)\n ... mapped_indices = np.unique(mapped_indices.astype(int))\n ... return self.x_test[mapped_indices], self.y_test[mapped_indices]\n ...\n >>> X = np.random.rand(100, 10)\n >>> y = np.random.randint(0, 2, 100)\n >>> dataset = DatasetWithTestDataIndices.from_arrays(X, y)\n >>> indices = np.random.choice(dataset.indices, 30, replace=False)\n >>> _ = dataset.get_training_data(indices)\n >>> _ = dataset.get_test_data(indices)\n ```\n\n Args:\n indices: Optional indices into the test data. This argument is\n unused left for compatibility with\n [get_training_data()][pydvl.utils.dataset.Dataset.get_training_data].\n\n Returns:\n The entire test data.\n \"\"\"\n return self.x_test, self.y_test\n
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset.get_training_data","title":"get_training_data","text":"get_training_data(\n indices: Optional[Iterable[int]] = None,\n) -> Tuple[NDArray, NDArray]\n
Returns the data and labels of all samples in the given groups.
PARAMETER DESCRIPTION indices
group indices whose elements to return. If None
, all data from all groups are returned.
TYPE: Optional[Iterable[int]]
DEFAULT: None
RETURNS DESCRIPTION Tuple[NDArray, NDArray]
Tuple of training data x and labels y.
Source code in src/pydvl/utils/dataset.py
def get_training_data(\n self, indices: Optional[Iterable[int]] = None\n) -> Tuple[NDArray, NDArray]:\n \"\"\"Returns the data and labels of all samples in the given groups.\n\n Args:\n indices: group indices whose elements to return. If `None`,\n all data from all groups are returned.\n\n Returns:\n Tuple of training data x and labels y.\n \"\"\"\n if indices is None:\n indices = self.indices\n data_indices = [\n idx for group_id in indices for idx in self.group_items[group_id][1]\n ]\n return super().get_training_data(data_indices)\n
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset.from_sklearn","title":"from_sklearn classmethod
","text":"from_sklearn(\n data: Bunch,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n data_groups: Optional[Sequence] = None,\n **kwargs\n) -> GroupedDataset\n
Constructs a GroupedDataset object from a sklearn.utils.Bunch as returned by the load_*
functions in scikit-learn toy datasets and groups it.
Example >>> from sklearn.datasets import load_iris\n>>> from pydvl.utils import GroupedDataset\n>>> iris = load_iris()\n>>> data_groups = iris.data[:, 0] // 0.5\n>>> dataset = GroupedDataset.from_sklearn(iris, data_groups=data_groups)\n
PARAMETER DESCRIPTION data
scikit-learn Bunch object. The following attributes are supported:
data
: covariates. target
: target variables (labels). feature_names
(optional): the feature names. target_names
(optional): the target names. DESCR
(optional): a description.
TYPE: Bunch
train_size
size of the training dataset. Used in train_test_split
.
TYPE: float
DEFAULT: 0.8
random_state
seed for train / test split.
TYPE: Optional[int]
DEFAULT: None
stratify_by_target
If True
, data is split in a stratified fashion, using the target variable as labels. Read more in sklearn's user guide.
TYPE: bool
DEFAULT: False
data_groups
an array holding the group index or name for each data point. The length of this array must be equal to the number of data points in the dataset.
TYPE: Optional[Sequence]
DEFAULT: None
kwargs
Additional keyword arguments to pass to the Dataset constructor.
DEFAULT: {}
RETURNS DESCRIPTION GroupedDataset
Dataset with the selected sklearn data
Source code in src/pydvl/utils/dataset.py
@classmethod\ndef from_sklearn(\n cls,\n data: Bunch,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n data_groups: Optional[Sequence] = None,\n **kwargs,\n) -> \"GroupedDataset\":\n \"\"\"Constructs a [GroupedDataset][pydvl.utils.GroupedDataset] object from a\n [sklearn.utils.Bunch][sklearn.utils.Bunch] as returned by the `load_*` functions in\n [scikit-learn toy datasets](https://scikit-learn.org/stable/datasets/toy_dataset.html) and groups\n it.\n\n ??? Example\n ```pycon\n >>> from sklearn.datasets import load_iris\n >>> from pydvl.utils import GroupedDataset\n >>> iris = load_iris()\n >>> data_groups = iris.data[:, 0] // 0.5\n >>> dataset = GroupedDataset.from_sklearn(iris, data_groups=data_groups)\n ```\n\n Args:\n data: scikit-learn Bunch object. The following attributes are supported:\n\n - `data`: covariates.\n - `target`: target variables (labels).\n - `feature_names` (**optional**): the feature names.\n - `target_names` (**optional**): the target names.\n - `DESCR` (**optional**): a description.\n train_size: size of the training dataset. Used in `train_test_split`.\n random_state: seed for train / test split.\n stratify_by_target: If `True`, data is split in a stratified\n fashion, using the target variable as labels. Read more in\n [sklearn's user guide](https://scikit-learn.org/stable/modules/cross_validation.html#stratification).\n data_groups: an array holding the group index or name for each\n data point. The length of this array must be equal to the number of\n data points in the dataset.\n kwargs: Additional keyword arguments to pass to the\n [Dataset][pydvl.utils.Dataset] constructor.\n\n Returns:\n Dataset with the selected sklearn data\n \"\"\"\n if data_groups is None:\n raise ValueError(\n \"data_groups must be provided when constructing a GroupedDataset\"\n )\n\n x_train, x_test, y_train, y_test, data_groups_train, _ = train_test_split(\n data.data,\n data.target,\n data_groups,\n train_size=train_size,\n random_state=random_state,\n stratify=data.target if stratify_by_target else None,\n )\n\n dataset = Dataset(\n x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test, **kwargs\n )\n return cls.from_dataset(dataset, data_groups_train) # type: ignore\n
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset.from_arrays","title":"from_arrays classmethod
","text":"from_arrays(\n X: NDArray,\n y: NDArray,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n data_groups: Optional[Sequence] = None,\n **kwargs\n) -> Dataset\n
Constructs a GroupedDataset object from X and y numpy arrays as returned by the make_*
functions in scikit-learn generated datasets.
Example >>> from sklearn.datasets import make_classification\n>>> from pydvl.utils import GroupedDataset\n>>> X, y = make_classification(\n... n_samples=100,\n... n_features=4,\n... n_informative=2,\n... n_redundant=0,\n... random_state=0,\n... shuffle=False\n... )\n>>> data_groups = X[:, 0] // 0.5\n>>> dataset = GroupedDataset.from_arrays(X, y, data_groups=data_groups)\n
PARAMETER DESCRIPTION X
array of shape (n_samples, n_features)
TYPE: NDArray
y
array of shape (n_samples,)
TYPE: NDArray
train_size
size of the training dataset. Used in train_test_split
.
TYPE: float
DEFAULT: 0.8
random_state
seed for train / test split.
TYPE: Optional[int]
DEFAULT: None
stratify_by_target
If True
, data is split in a stratified fashion, using the y variable as labels. Read more in sklearn's user guide.
TYPE: bool
DEFAULT: False
data_groups
an array holding the group index or name for each data point. The length of this array must be equal to the number of data points in the dataset.
TYPE: Optional[Sequence]
DEFAULT: None
kwargs
Additional keyword arguments that will be passed to the Dataset constructor.
DEFAULT: {}
RETURNS DESCRIPTION Dataset
Dataset with the passed X and y arrays split across training and test sets.
New in version 0.4.0
Changed in version 0.6.0
Added kwargs to pass to the Dataset constructor.
Source code in src/pydvl/utils/dataset.py
@classmethod\ndef from_arrays(\n cls,\n X: NDArray,\n y: NDArray,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n data_groups: Optional[Sequence] = None,\n **kwargs,\n) -> \"Dataset\":\n \"\"\"Constructs a [GroupedDataset][pydvl.utils.GroupedDataset] object from X and y numpy arrays\n as returned by the `make_*` functions in\n [scikit-learn generated datasets](https://scikit-learn.org/stable/datasets/sample_generators.html).\n\n ??? Example\n ```pycon\n >>> from sklearn.datasets import make_classification\n >>> from pydvl.utils import GroupedDataset\n >>> X, y = make_classification(\n ... n_samples=100,\n ... n_features=4,\n ... n_informative=2,\n ... n_redundant=0,\n ... random_state=0,\n ... shuffle=False\n ... )\n >>> data_groups = X[:, 0] // 0.5\n >>> dataset = GroupedDataset.from_arrays(X, y, data_groups=data_groups)\n ```\n\n Args:\n X: array of shape (n_samples, n_features)\n y: array of shape (n_samples,)\n train_size: size of the training dataset. Used in `train_test_split`.\n random_state: seed for train / test split.\n stratify_by_target: If `True`, data is split in a stratified\n fashion, using the y variable as labels. Read more in\n [sklearn's user guide](https://scikit-learn.org/stable/modules/cross_validation.html#stratification).\n data_groups: an array holding the group index or name for each data\n point. The length of this array must be equal to the number of\n data points in the dataset.\n kwargs: Additional keyword arguments that will be passed to the\n [Dataset][pydvl.utils.Dataset] constructor.\n\n Returns:\n Dataset with the passed X and y arrays split across training and\n test sets.\n\n !!! tip \"New in version 0.4.0\"\n\n !!! tip \"Changed in version 0.6.0\"\n Added kwargs to pass to the [Dataset][pydvl.utils.Dataset] constructor.\n \"\"\"\n if data_groups is None:\n raise ValueError(\n \"data_groups must be provided when constructing a GroupedDataset\"\n )\n x_train, x_test, y_train, y_test, data_groups_train, _ = train_test_split(\n X,\n y,\n data_groups,\n train_size=train_size,\n random_state=random_state,\n stratify=y if stratify_by_target else None,\n )\n dataset = Dataset(\n x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test, **kwargs\n )\n return cls.from_dataset(dataset, data_groups_train)\n
"},{"location":"api/pydvl/utils/dataset/#pydvl.utils.dataset.GroupedDataset.from_dataset","title":"from_dataset classmethod
","text":"from_dataset(dataset: Dataset, data_groups: Sequence[Any]) -> GroupedDataset\n
Creates a GroupedDataset object from the data of a Dataset object and a mapping of data groups.
Example >>> import numpy as np\n>>> from pydvl.utils import Dataset, GroupedDataset\n>>> dataset = Dataset.from_arrays(\n... X=np.asarray([[1, 2], [3, 4], [5, 6], [7, 8]]),\n... y=np.asarray([0, 1, 0, 1]),\n... )\n>>> dataset = GroupedDataset.from_dataset(dataset, data_groups=[0, 0, 1, 1])\n
PARAMETER DESCRIPTION dataset
The original data.
TYPE: Dataset
data_groups
An array holding the group index or name for each data point. The length of this array must be equal to the number of data points in the dataset.
TYPE: Sequence[Any]
RETURNS DESCRIPTION GroupedDataset
A GroupedDataset with the initial Dataset grouped by data_groups.
Source code in src/pydvl/utils/dataset.py
@classmethod\ndef from_dataset(\n cls, dataset: Dataset, data_groups: Sequence[Any]\n) -> \"GroupedDataset\":\n \"\"\"Creates a [GroupedDataset][pydvl.utils.GroupedDataset] object from the data a\n [Dataset][pydvl.utils.Dataset] object and a mapping of data groups.\n\n ??? Example\n ```pycon\n >>> import numpy as np\n >>> from pydvl.utils import Dataset, GroupedDataset\n >>> dataset = Dataset.from_arrays(\n ... X=np.asarray([[1, 2], [3, 4], [5, 6], [7, 8]]),\n ... y=np.asarray([0, 1, 0, 1]),\n ... )\n >>> dataset = GroupedDataset.from_dataset(dataset, data_groups=[0, 0, 1, 1])\n ```\n\n Args:\n dataset: The original data.\n data_groups: An array holding the group index or name for each data\n point. The length of this array must be equal to the number of\n data points in the dataset.\n\n Returns:\n A [GroupedDataset][pydvl.utils.GroupedDataset] with the initial\n [Dataset][pydvl.utils.Dataset] grouped by data_groups.\n \"\"\"\n return cls(\n x_train=dataset.x_train,\n y_train=dataset.y_train,\n x_test=dataset.x_test,\n y_test=dataset.y_test,\n data_groups=data_groups,\n feature_names=dataset.feature_names,\n target_names=dataset.target_names,\n description=dataset.description,\n )\n
"},{"location":"api/pydvl/utils/exceptions/","title":"Exceptions","text":""},{"location":"api/pydvl/utils/exceptions/#pydvl.utils.exceptions","title":"pydvl.utils.exceptions","text":""},{"location":"api/pydvl/utils/exceptions/#pydvl.utils.exceptions.catch_and_raise_exception","title":"catch_and_raise_exception","text":"catch_and_raise_exception(\n catch_exception_type: Type[CatchExceptionType],\n raise_exception_factory: Callable[[CatchExceptionType], RaiseExceptionType],\n) -> Callable\n
A decorator that catches exceptions of a specified exception type and raises another specified exception.
PARAMETER DESCRIPTION catch_exception_type
The type of the exception to catch.
TYPE: Type[CatchExceptionType]
raise_exception_factory
A factory function that creates a new exception.
TYPE: Callable[[CatchExceptionType], RaiseExceptionType]
RETURNS DESCRIPTION Callable
A decorator function that wraps the target function.
Example @catch_and_raise_exception(RuntimeError, lambda e: TorchLinalgEighException(e))\ndef safe_torch_linalg_eigh(*args, **kwargs):\n '''\n A wrapper around `torch.linalg.eigh` that safely handles potential runtime errors\n by raising a custom `TorchLinalgEighException` with more context,\n especially related to the issues reported in\n https://github.com/pytorch/pytorch/issues/92141.\n\n Args:\n *args: Positional arguments passed to `torch.linalg.eigh`.\n **kwargs: Keyword arguments passed to `torch.linalg.eigh`.\n\n Returns:\n The result of calling `torch.linalg.eigh` with the provided arguments.\n\n Raises:\n TorchLinalgEighException: If a `RuntimeError` occurs during the execution of\n `torch.linalg.eigh`.\n '''\n return torch.linalg.eigh(*args, **kwargs)\n
Source code in src/pydvl/utils/exceptions.py
def catch_and_raise_exception(\n catch_exception_type: Type[CatchExceptionType],\n raise_exception_factory: Callable[[CatchExceptionType], RaiseExceptionType],\n) -> Callable:\n \"\"\"\n A decorator that catches exceptions of a specified exception type and raises\n another specified exception.\n\n Args:\n catch_exception_type: The type of the exception to catch.\n raise_exception_factory: A factory function that creates a new exception.\n\n Returns:\n A decorator function that wraps the target function.\n\n ??? Example\n\n ```python\n @catch_and_raise_exception(RuntimeError, lambda e: TorchLinalgEighException(e))\n def safe_torch_linalg_eigh(*args, **kwargs):\n '''\n A wrapper around `torch.linalg.eigh` that safely handles potential runtime errors\n by raising a custom `TorchLinalgEighException` with more context,\n especially related to the issues reported in\n https://github.com/pytorch/pytorch/issues/92141.\n\n Args:\n *args: Positional arguments passed to `torch.linalg.eigh`.\n **kwargs: Keyword arguments passed to `torch.linalg.eigh`.\n\n Returns:\n The result of calling `torch.linalg.eigh` with the provided arguments.\n\n Raises:\n TorchLinalgEighException: If a `RuntimeError` occurs during the execution of\n `torch.linalg.eigh`.\n '''\n return torch.linalg.eigh(*args, **kwargs)\n ```\n \"\"\"\n\n def decorator(func):\n @wraps(func)\n def wrapper(*args, **kwargs):\n try:\n return func(*args, **kwargs)\n except catch_exception_type as e:\n raise raise_exception_factory(e) from e\n\n return wrapper\n\n return decorator\n
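A smaller, self-contained sketch of the same pattern; ConfigurationError and read_setting are hypothetical names used only for illustration:
Example >>> from pydvl.utils.exceptions import catch_and_raise_exception
>>> class ConfigurationError(Exception):
...     pass
...
>>> @catch_and_raise_exception(KeyError, lambda e: ConfigurationError(f"Missing setting: {e}"))
... def read_setting(settings: dict, name: str):
...     return settings[name]  # a KeyError here is re-raised as ConfigurationError
...
>>> read_setting({"lr": 0.1}, "lr")
0.1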
"},{"location":"api/pydvl/utils/functional/","title":"Functional","text":""},{"location":"api/pydvl/utils/functional/#pydvl.utils.functional","title":"pydvl.utils.functional","text":"Supporting utilities for manipulating arguments of functions.
"},{"location":"api/pydvl/utils/functional/#pydvl.utils.functional.free_arguments","title":"free_arguments","text":"free_arguments(fun: Union[Callable, partial]) -> Set[str]\n
Computes the set of free arguments for a function or functools.partial object.
All arguments of a function are considered free unless they are set by a partial. For example, if f = partial(g, a=1)
, then a
is not a free argument of f
.
PARAMETER DESCRIPTION fun
A callable or a functools.partial object.
TYPE: Union[Callable, partial]
RETURNS DESCRIPTION Set[str]
The set of free arguments of fun
.
New in version 0.7.0
Source code in src/pydvl/utils/functional.py
def free_arguments(fun: Union[Callable, partial]) -> Set[str]:\n \"\"\"Computes the set of free arguments for a function or\n [functools.partial][] object.\n\n All arguments of a function are considered free unless they are set by a\n partial. For example, if `f = partial(g, a=1)`, then `a` is not a free\n argument of `f`.\n\n Args:\n fun: A callable or a [partial object][].\n\n Returns:\n The set of free arguments of `fun`.\n\n !!! tip \"New in version 0.7.0\"\n \"\"\"\n args_set_by_partial: Set[str] = set()\n\n def _rec_unroll_partial_function_args(g: Union[Callable, partial]) -> Callable:\n \"\"\"Stores arguments and recursively call itself if `g` is a\n [functools.partial][] object. In the end, returns the initially wrapped\n function.\n\n This handles the construct `partial(_accept_additional_argument, *args,\n **kwargs)` that is used by `maybe_add_argument`.\n\n Args:\n g: A partial or a function to unroll.\n\n Returns:\n Initial wrapped function.\n \"\"\"\n nonlocal args_set_by_partial\n\n if isinstance(g, partial) and g.func == _accept_additional_argument:\n arg = g.keywords[\"arg\"]\n if arg in args_set_by_partial:\n args_set_by_partial.remove(arg)\n return _rec_unroll_partial_function_args(g.keywords[\"fun\"])\n elif isinstance(g, partial):\n args_set_by_partial.update(g.keywords.keys())\n args_set_by_partial.update(g.args)\n return _rec_unroll_partial_function_args(g.func)\n else:\n return g\n\n wrapped_fn = _rec_unroll_partial_function_args(fun)\n sig = inspect.signature(wrapped_fn)\n return args_set_by_partial | set(sig.parameters.keys())\n
"},{"location":"api/pydvl/utils/functional/#pydvl.utils.functional.maybe_add_argument","title":"maybe_add_argument","text":"maybe_add_argument(fun: Callable, new_arg: str) -> Callable\n
Wraps a function to accept the given keyword parameter if it doesn't already.
If fun
already takes a keyword parameter of name new_arg
, then it is returned as is. Otherwise, a wrapper is returned which merely ignores the argument.
PARAMETER DESCRIPTION fun
The function to wrap
TYPE: Callable
new_arg
The name of the argument that the new function will accept (and ignore).
TYPE: str
RETURNS DESCRIPTION Callable
A new function accepting one more keyword argument.
Changed in version 0.7.0
Ability to work with partials.
Source code in src/pydvl/utils/functional.py
def maybe_add_argument(fun: Callable, new_arg: str) -> Callable:\n \"\"\"Wraps a function to accept the given keyword parameter if it doesn't\n already.\n\n If `fun` already takes a keyword parameter of name `new_arg`, then it is\n returned as is. Otherwise, a wrapper is returned which merely ignores the\n argument.\n\n Args:\n fun: The function to wrap\n new_arg: The name of the argument that the new function will accept\n (and ignore).\n\n Returns:\n A new function accepting one more keyword argument.\n\n !!! tip \"Changed in version 0.7.0\"\n Ability to work with partials.\n \"\"\"\n if new_arg in free_arguments(fun):\n return fun\n\n return partial(_accept_additional_argument, fun=fun, arg=new_arg)\n
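A usage sketch with a hypothetical scoring function; the wrapper accepts and discards the extra keyword, so callers can always pass it:
Example >>> from pydvl.utils.functional import maybe_add_argument
>>> def score(model, data):
...     return 0.0
...
>>> wrapped = maybe_add_argument(score, "seed")
>>> wrapped(None, None, seed=42)  # `seed` is accepted and ignored
0.0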
"},{"location":"api/pydvl/utils/numeric/","title":"Numeric","text":""},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric","title":"pydvl.utils.numeric","text":"This module contains routines for numerical computations used across the library.
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.powerset","title":"powerset","text":"powerset(s: NDArray[T]) -> Iterator[Collection[T]]\n
Returns an iterator for the power set of the argument.
Subsets are generated in sequence by growing size. See random_powerset() for random sampling.
Example >>> import numpy as np\n>>> from pydvl.utils.numeric import powerset\n>>> list(powerset(np.array((1,2))))\n[(), (1,), (2,), (1, 2)]\n
PARAMETER DESCRIPTION s
The set to use
TYPE: NDArray[T]
RETURNS DESCRIPTION Iterator[Collection[T]]
An iterator over all subsets of the set of indices s
.
Source code in src/pydvl/utils/numeric.py
def powerset(s: NDArray[T]) -> Iterator[Collection[T]]:\n \"\"\"Returns an iterator for the power set of the argument.\n\n Subsets are generated in sequence by growing size. See\n [random_powerset()][pydvl.utils.numeric.random_powerset] for random\n sampling.\n\n ??? Example\n ``` pycon\n >>> import numpy as np\n >>> from pydvl.utils.numeric import powerset\n >>> list(powerset(np.array((1,2))))\n [(), (1,), (2,), (1, 2)]\n ```\n\n Args:\n s: The set to use\n\n Returns:\n An iterator over all subsets of the set of indices `s`.\n \"\"\"\n return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))\n
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.num_samples_permutation_hoeffding","title":"num_samples_permutation_hoeffding","text":"num_samples_permutation_hoeffding(\n eps: float, delta: float, u_range: float\n) -> int\n
Lower bound on the number of samples required for MonteCarlo Shapley to obtain an (\u03b5,\u03b4)-approximation.
That is: with probability 1-\u03b4, the estimated value for one data point will be \u03b5-close to the true quantity, if at least this many permutations are sampled.
PARAMETER DESCRIPTION eps
\u03b5 > 0
TYPE: float
delta
0 < \u03b4 <= 1
TYPE: float
u_range
Range of the Utility function
TYPE: float
RETURNS DESCRIPTION int
Number of permutations required to guarantee \u03b5-correct Shapley values with probability 1-\u03b4
Source code in src/pydvl/utils/numeric.py
def num_samples_permutation_hoeffding(eps: float, delta: float, u_range: float) -> int:\n \"\"\"Lower bound on the number of samples required for MonteCarlo Shapley to\n obtain an (\u03b5,\u03b4)-approximation.\n\n That is: with probability 1-\u03b4, the estimated value for one data point will\n be \u03b5-close to the true quantity, if at least this many permutations are\n sampled.\n\n Args:\n eps: \u03b5 > 0\n delta: 0 < \u03b4 <= 1\n u_range: Range of the [Utility][pydvl.utils.utility.Utility] function\n\n Returns:\n Number of _permutations_ required to guarantee \u03b5-correct Shapley\n values with probability 1-\u03b4\n \"\"\"\n return int(np.ceil(np.log(2 / delta) * 2 * u_range**2 / eps**2))\n
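A worked example: for eps=0.1, delta=0.01 and a utility range of 1, the bound evaluates to ceil(2 * ln(2/0.01) / 0.1**2) = 1060 permutations:
Example >>> from pydvl.utils.numeric import num_samples_permutation_hoeffding
>>> num_samples_permutation_hoeffding(eps=0.1, delta=0.01, u_range=1.0)
1060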
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.random_subset","title":"random_subset","text":"random_subset(\n s: NDArray[T], q: float = 0.5, seed: Optional[Seed] = None\n) -> NDArray[T]\n
Returns one subset at random from s
.
PARAMETER DESCRIPTION s
set to sample from
TYPE: NDArray[T]
q
Sampling probability for elements. The default 0.5 yields a uniform distribution over the power set of s.
TYPE: float
DEFAULT: 0.5
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION NDArray[T]
The subset
Source code in src/pydvl/utils/numeric.py
def random_subset(\n s: NDArray[T], q: float = 0.5, seed: Optional[Seed] = None\n) -> NDArray[T]:\n \"\"\"Returns one subset at random from ``s``.\n\n Args:\n s: set to sample from\n q: Sampling probability for elements. The default 0.5 yields a\n uniform distribution over the power set of s.\n seed: Either an instance of a numpy random number generator or a seed\n for it.\n\n Returns:\n The subset\n \"\"\"\n rng = np.random.default_rng(seed)\n selection = rng.uniform(size=len(s)) > q\n return s[selection]\n
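An illustrative sketch (not from the original docs; the toy set is made up):
>>> import numpy as np\n>>> from pydvl.utils.numeric import random_subset\n>>> s = np.arange(10)\n>>> subset = random_subset(s, q=0.5, seed=42)\n>>> bool(np.isin(subset, s).all())  # every drawn element comes from s\nTrue\n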
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.random_powerset","title":"random_powerset","text":"random_powerset(\n s: NDArray[T],\n n_samples: Optional[int] = None,\n q: float = 0.5,\n seed: Optional[Seed] = None,\n) -> Generator[NDArray[T], None, None]\n
Samples subsets from the power set of the argument, without pre-generating all subsets and in no order.
See powerset if you wish to deterministically generate all subsets.
To generate subsets, len(s)
Bernoulli draws with probability q
are drawn. The default value of q = 0.5
provides a uniform distribution over the power set of s
. Other choices can be used e.g. to implement owen_sampling_shapley.
PARAMETER DESCRIPTION s
set to sample from
TYPE: NDArray[T]
n_samples
if set, stop the generator after this many steps. Defaults to np.iinfo(np.int32).max
TYPE: Optional[int]
DEFAULT: None
q
Sampling probability for elements. The default 0.5 yields a uniform distribution over the power set of s.
TYPE: float
DEFAULT: 0.5
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION Generator[NDArray[T], None, None]
Samples from the power set of s
.
RAISES DESCRIPTION ValueError
if the element sampling probability is not in [0,1]
Source code in src/pydvl/utils/numeric.py
def random_powerset(\n s: NDArray[T],\n n_samples: Optional[int] = None,\n q: float = 0.5,\n seed: Optional[Seed] = None,\n) -> Generator[NDArray[T], None, None]:\n \"\"\"Samples subsets from the power set of the argument, without\n pre-generating all subsets and in no order.\n\n See [powerset][pydvl.utils.numeric.powerset] if you wish to deterministically generate all subsets.\n\n To generate subsets, `len(s)` Bernoulli draws with probability `q` are\n drawn. The default value of `q = 0.5` provides a uniform distribution over\n the power set of `s`. Other choices can be used e.g. to implement\n [owen_sampling_shapley][pydvl.value.shapley.owen.owen_sampling_shapley].\n\n Args:\n s: set to sample from\n n_samples: if set, stop the generator after this many steps.\n Defaults to `np.iinfo(np.int32).max`\n q: Sampling probability for elements. The default 0.5 yields a\n uniform distribution over the power set of s.\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n Samples from the power set of `s`.\n\n Raises:\n ValueError: if the element sampling probability is not in [0,1]\n\n \"\"\"\n if q < 0 or q > 1:\n raise ValueError(\"Element sampling probability must be in [0,1]\")\n\n rng = np.random.default_rng(seed)\n total = 1\n if n_samples is None:\n n_samples = np.iinfo(np.int32).max\n while total <= n_samples:\n yield random_subset(s, q, seed=rng)\n total += 1\n
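A hedged sketch of drawing a fixed number of subsets (illustrative only):
>>> import numpy as np\n>>> from pydvl.utils.numeric import random_powerset\n>>> s = np.arange(5)\n>>> subsets = list(random_powerset(s, n_samples=3, seed=42))\n>>> len(subsets)\n3\n>>> all(np.isin(subset, s).all() for subset in subsets)\nTrue\n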
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.random_powerset_label_min","title":"random_powerset_label_min","text":"random_powerset_label_min(\n s: NDArray[T],\n labels: NDArray[int_],\n min_elements_per_label: int = 1,\n seed: Optional[Seed] = None,\n) -> Generator[NDArray[T], None, None]\n
Draws random subsets from s
, while ensuring that at least min_elements_per_label
elements per label are included in the draw. It can be used for classification problems to ensure that a set contains information for all labels (or not if min_elements_per_label=0
).
PARAMETER DESCRIPTION s
Set to sample from
TYPE: NDArray[T]
labels
Labels for the samples
TYPE: NDArray[int_]
min_elements_per_label
Minimum number of elements for each label.
TYPE: int
DEFAULT: 1
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION Generator[NDArray[T], None, None]
Generated draw from the powerset of s with min_elements_per_label for each label.
RAISES DESCRIPTION ValueError
If s
and labels
are of different length or min_elements_per_label
is smaller than 0.
Source code in src/pydvl/utils/numeric.py
def random_powerset_label_min(\n s: NDArray[T],\n labels: NDArray[np.int_],\n min_elements_per_label: int = 1,\n seed: Optional[Seed] = None,\n) -> Generator[NDArray[T], None, None]:\n \"\"\"Draws random subsets from `s`, while ensuring that at least\n `min_elements_per_label` elements per label are included in the draw. It can be used\n for classification problems to ensure that a set contains information for all labels\n (or not if `min_elements_per_label=0`).\n\n Args:\n s: Set to sample from\n labels: Labels for the samples\n min_elements_per_label: Minimum number of elements for each label.\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n Generated draw from the powerset of s with `min_elements_per_label` for each\n label.\n\n Raises:\n ValueError: If `s` and `labels` are of different length or\n `min_elements_per_label` is smaller than 0.\n \"\"\"\n if len(labels) != len(s):\n raise ValueError(\"Set and labels have to be of same size.\")\n\n if min_elements_per_label < 0:\n raise ValueError(\n f\"Parameter min_elements={min_elements_per_label} needs to be bigger or \"\n f\"equal to 0.\"\n )\n\n rng = np.random.default_rng(seed)\n unique_labels = np.unique(labels)\n\n while True:\n subsets: List[NDArray[T]] = []\n for label in unique_labels:\n label_indices = np.asarray(np.where(labels == label)[0])\n subset_size = int(\n rng.integers(\n min(min_elements_per_label, len(label_indices)),\n len(label_indices) + 1,\n )\n )\n if subset_size > 0:\n subsets.append(\n random_subset_of_size(s[label_indices], subset_size, seed=rng)\n )\n\n if len(subsets) > 0:\n subset = np.concatenate(tuple(subsets))\n rng.shuffle(subset)\n yield subset\n else:\n yield np.array([], dtype=s.dtype)\n
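A hypothetical example (not in the original docstring) showing that each label is represented in every draw:
>>> import numpy as np\n>>> from pydvl.utils.numeric import random_powerset_label_min\n>>> s = np.arange(6)\n>>> labels = np.array([0, 0, 0, 1, 1, 1])\n>>> subset = next(random_powerset_label_min(s, labels, min_elements_per_label=1, seed=42))\n>>> np.unique(labels[subset]).tolist()  # at least one element of each label\n[0, 1]\n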
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.random_subset_of_size","title":"random_subset_of_size","text":"random_subset_of_size(\n s: NDArray[T], size: int, seed: Optional[Seed] = None\n) -> NDArray[T]\n
Samples a random subset of given size uniformly from the powerset of s
.
PARAMETER DESCRIPTION s
Set to sample from
TYPE: NDArray[T]
size
Size of the subset to generate
TYPE: int
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION NDArray[T]
The subset
RAISES DESCRIPTION ValueError If size > len(s)
Source code in src/pydvl/utils/numeric.py
def random_subset_of_size(\n s: NDArray[T], size: int, seed: Optional[Seed] = None\n) -> NDArray[T]:\n \"\"\"Samples a random subset of given size uniformly from the powerset\n of `s`.\n\n Args:\n s: Set to sample from\n size: Size of the subset to generate\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n The subset\n\n Raises\n ValueError: If size > len(s)\n \"\"\"\n if size > len(s):\n raise ValueError(\"Cannot sample subset larger than set\")\n rng = np.random.default_rng(seed)\n return rng.choice(s, size=size, replace=False)\n
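A quick illustrative check (a sketch with made-up inputs, not part of the docs):
>>> import numpy as np\n>>> from pydvl.utils.numeric import random_subset_of_size\n>>> subset = random_subset_of_size(np.arange(10), size=3, seed=42)\n>>> len(subset)\n3\n>>> bool(np.isin(subset, np.arange(10)).all())\nTrue\n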
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.random_matrix_with_condition_number","title":"random_matrix_with_condition_number","text":"random_matrix_with_condition_number(\n n: int, condition_number: float, seed: Optional[Seed] = None\n) -> NDArray\n
Constructs a square matrix with a given condition number.
Taken from: https://gist.github.com/bstellato/23322fe5d87bb71da922fbc41d658079#file-random_mat_condition_number-py
Also see: https://math.stackexchange.com/questions/1351616/condition-number-of-ata.
PARAMETER DESCRIPTION n
size of the matrix
TYPE: int
condition_number
The desired condition number of the matrix
TYPE: float
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION NDArray
An (n,n) matrix with the requested condition number.
Source code in src/pydvl/utils/numeric.py
def random_matrix_with_condition_number(\n n: int, condition_number: float, seed: Optional[Seed] = None\n) -> NDArray:\n \"\"\"Constructs a square matrix with a given condition number.\n\n Taken from:\n [https://gist.github.com/bstellato/23322fe5d87bb71da922fbc41d658079#file-random_mat_condition_number-py](\n https://gist.github.com/bstellato/23322fe5d87bb71da922fbc41d658079#file-random_mat_condition_number-py)\n\n Also see:\n [https://math.stackexchange.com/questions/1351616/condition-number-of-ata](\n https://math.stackexchange.com/questions/1351616/condition-number-of-ata).\n\n Args:\n n: size of the matrix\n condition_number: duh\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n An (n,n) matrix with the requested condition number.\n \"\"\"\n if n < 2:\n raise ValueError(\"Matrix size must be at least 2\")\n\n if condition_number <= 1:\n raise ValueError(\"Condition number must be greater than 1\")\n\n rng = np.random.default_rng(seed)\n log_condition_number = np.log(condition_number)\n exp_vec = np.arange(\n -log_condition_number / 4.0,\n log_condition_number * (n + 1) / (4 * (n - 1)),\n log_condition_number / (2.0 * (n - 1)),\n )\n exp_vec = exp_vec[:n]\n s: np.ndarray = np.exp(exp_vec)\n S = np.diag(s)\n U, _ = np.linalg.qr((rng.uniform(size=(n, n)) - 5.0) * 200)\n V, _ = np.linalg.qr((rng.uniform(size=(n, n)) - 5.0) * 200)\n P: np.ndarray = U.dot(S).dot(V.T)\n P = P.dot(P.T)\n return P\n
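A hedged usage sketch checking the shape and, up to floating point error, the requested condition number (illustrative values):
>>> import numpy as np\n>>> from pydvl.utils.numeric import random_matrix_with_condition_number\n>>> m = random_matrix_with_condition_number(4, condition_number=10.0, seed=42)\n>>> m.shape\n(4, 4)\n>>> print(round(float(np.linalg.cond(m)), 1))  # approximately the requested value\n10.0\n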
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.running_moments","title":"running_moments","text":"running_moments(\n previous_avg: float | NDArray[float64],\n previous_variance: float | NDArray[float64],\n count: int,\n new_value: float | NDArray[float64],\n) -> Tuple[float | NDArray[float64], float | NDArray[float64]]\n
Uses Welford's algorithm to calculate the running average and variance of a set of numbers.
See Welford's algorithm in wikipedia
Warning
This is not really using Welford's correction for numerical stability for the variance. (FIXME)
Todo
This could be generalised to arbitrary moments. See this paper
PARAMETER DESCRIPTION previous_avg
average value at previous step
TYPE: float | NDArray[float64]
previous_variance
variance at previous step
TYPE: float | NDArray[float64]
count
number of points seen so far
TYPE: int
new_value
new value in the series of numbers
TYPE: float | NDArray[float64]
RETURNS DESCRIPTION Tuple[float | NDArray[float64], float | NDArray[float64]]
new_average, new_variance, calculated with the new count
Source code in src/pydvl/utils/numeric.py
def running_moments(\n previous_avg: float | NDArray[np.float64],\n previous_variance: float | NDArray[np.float64],\n count: int,\n new_value: float | NDArray[np.float64],\n) -> Tuple[float | NDArray[np.float64], float | NDArray[np.float64]]:\n \"\"\"Uses Welford's algorithm to calculate the running average and variance of\n a set of numbers.\n\n See [Welford's algorithm in wikipedia](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm)\n\n !!! Warning\n This is not really using Welford's correction for numerical stability\n for the variance. (FIXME)\n\n !!! Todo\n This could be generalised to arbitrary moments. See [this paper](https://www.osti.gov/biblio/1028931)\n\n Args:\n previous_avg: average value at previous step\n previous_variance: variance at previous step\n count: number of points seen so far\n new_value: new value in the series of numbers\n\n Returns:\n new_average, new_variance, calculated with the new count\n \"\"\"\n # broadcasted operations seem not to be supported by mypy, so we ignore the type\n new_average = (new_value + count * previous_avg) / (count + 1) # type: ignore\n new_variance = previous_variance + (\n (new_value - previous_avg) * (new_value - new_average) - previous_variance\n ) / (count + 1)\n return new_average, new_variance\n
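An illustrative sketch (assuming plain Python floats as inputs) that accumulates the mean and population variance of a short series:
>>> from pydvl.utils.numeric import running_moments\n>>> avg, var = 0.0, 0.0\n>>> for count, x in enumerate([1.0, 2.0, 3.0, 4.0]):\n...     avg, var = running_moments(avg, var, count, x)\n>>> round(avg, 6), round(var, 6)\n(2.5, 1.25)\n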
"},{"location":"api/pydvl/utils/numeric/#pydvl.utils.numeric.top_k_value_accuracy","title":"top_k_value_accuracy","text":"top_k_value_accuracy(\n y_true: NDArray[float64], y_pred: NDArray[float64], k: int = 3\n) -> float\n
Computes the top-k accuracy for the estimated values by comparing indices of the highest k values.
PARAMETER DESCRIPTION y_true
Exact/true value
TYPE: NDArray[float64]
y_pred
Predicted/estimated value
TYPE: NDArray[float64]
k
Number of the highest values taken into account
TYPE: int
DEFAULT: 3
RETURNS DESCRIPTION float
Accuracy
Source code in src/pydvl/utils/numeric.py
def top_k_value_accuracy(\n y_true: NDArray[np.float64], y_pred: NDArray[np.float64], k: int = 3\n) -> float:\n \"\"\"Computes the top-k accuracy for the estimated values by comparing indices\n of the highest k values.\n\n Args:\n y_true: Exact/true value\n y_pred: Predicted/estimated value\n k: Number of the highest values taken into account\n\n Returns:\n Accuracy\n \"\"\"\n top_k_exact_values = np.argsort(y_true)[-k:]\n top_k_pred_values = np.argsort(y_pred)[-k:]\n top_k_accuracy = len(np.intersect1d(top_k_exact_values, top_k_pred_values)) / k\n return top_k_accuracy\n
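A small illustrative example (the toy value arrays are made up):
>>> import numpy as np\n>>> from pydvl.utils.numeric import top_k_value_accuracy\n>>> y_true = np.array([0.1, 0.9, 0.3, 0.8, 0.2])\n>>> y_pred = np.array([0.2, 0.7, 0.4, 0.9, 0.1])\n>>> top_k_value_accuracy(y_true, y_pred, k=2)  # both agree on the top-2 indices\n1.0\n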
"},{"location":"api/pydvl/utils/progress/","title":"Progress","text":""},{"location":"api/pydvl/utils/progress/#pydvl.utils.progress","title":"pydvl.utils.progress","text":""},{"location":"api/pydvl/utils/progress/#pydvl.utils.progress.repeat_indices","title":"repeat_indices","text":"repeat_indices(\n indices: Collection[int],\n result: ValuationResult,\n done: StoppingCriterion,\n **kwargs\n) -> Iterator[int]\n
Helper function to cycle indefinitely over a collection of indices until the stopping criterion is satisfied while displaying progress.
PARAMETER DESCRIPTION indices
Collection of indices that will be cycled until done.
TYPE: Collection[int]
result
Object containing the current results.
TYPE: ValuationResult
done
Stopping criterion.
TYPE: StoppingCriterion
kwargs
Keyword arguments passed to tqdm.
DEFAULT: {}
Source code in src/pydvl/utils/progress.py
def repeat_indices(\n indices: Collection[int],\n result: \"ValuationResult\",\n done: \"StoppingCriterion\",\n **kwargs,\n) -> Iterator[int]:\n \"\"\"Helper function to cycle indefinitely over a collection of indices\n until the stopping criterion is satisfied while displaying progress.\n\n Args:\n indices: Collection of indices that will be cycled until done.\n result: Object containing the current results.\n done: Stopping criterion.\n kwargs: Keyword arguments passed to tqdm.\n \"\"\"\n with tqdm(total=100, unit=\"%\", **kwargs) as pbar:\n it = takewhile(lambda _: not done(result), cycle(indices))\n for i in it:\n yield i\n pbar.update(100 * done.completion() - pbar.n)\n pbar.refresh()\n
"},{"location":"api/pydvl/utils/progress/#pydvl.utils.progress.log_duration","title":"log_duration","text":"log_duration(_func=None, *, log_level=logging.DEBUG)\n
Decorator to log execution time of a function with a configurable logging level. It can be used with or without specifying a log level.
Source code in src/pydvl/utils/progress.py
def log_duration(_func=None, *, log_level=logging.DEBUG):\n \"\"\"\n Decorator to log execution time of a function with a configurable logging level.\n It can be used with or without specifying a log level.\n \"\"\"\n\n def decorator_log_duration(func):\n @wraps(func)\n def wrapper_log_duration(*args, **kwargs):\n func_name = func.__qualname__\n logger.log(log_level, f\"Function '{func_name}' is starting.\")\n start_time = time()\n result = func(*args, **kwargs)\n duration = time() - start_time\n logger.log(\n log_level,\n f\"Function '{func_name}' completed. \" f\"Duration: {duration:.2f} sec\",\n )\n return result\n\n return wrapper_log_duration\n\n if _func is None:\n # If log_duration was called without arguments, return decorator\n return decorator_log_duration\n else:\n # If log_duration was called with a function, apply decorator directly\n return decorator_log_duration(_func)\n
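A hedged usage sketch (the decorated function and its arguments are made up for illustration):
>>> import logging\n>>> from pydvl.utils.progress import log_duration\n>>> @log_duration(log_level=logging.INFO)\n... def train_and_score(a, b):  # stand-in for an expensive computation\n...     return a + b\n>>> train_and_score(1, 2)  # duration is logged at INFO level\n3\n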
"},{"location":"api/pydvl/utils/score/","title":"Score","text":""},{"location":"api/pydvl/utils/score/#pydvl.utils.score","title":"pydvl.utils.score","text":"This module provides a Scorer class that wraps scoring functions with additional information.
Scorers are the fundamental building block of many data valuation methods. They are typically used by the Utility class to evaluate the quality of a model when trained on subsets of the training data.
Scorers can be constructed in the same way as in scikit-learn: either from known strings or from a callable. Greater values must be better. If they are not, a negated version can be used, see scikit-learn's make_scorer().
Scorer provides additional information about the scoring function, like its range and default values, which can be used by some data valuation methods (like group_testing_shapley()) to estimate the number of samples required for a certain quality of approximation.
"},{"location":"api/pydvl/utils/score/#pydvl.utils.score.squashed_r2","title":"squashed_r2 module-attribute
","text":"squashed_r2 = compose_score(Scorer('r2'), _sigmoid, (0, 1), 'squashed r2')\n
A scorer that squashes the R\u00b2 score into the range [0, 1] using a sigmoid.
"},{"location":"api/pydvl/utils/score/#pydvl.utils.score.squashed_variance","title":"squashed_variance module-attribute
","text":"squashed_variance = compose_score(\n Scorer(\"explained_variance\"),\n _sigmoid,\n (0, 1),\n \"squashed explained variance\",\n)\n
A scorer that squashes the explained variance score into the range [0, 1] using a sigmoid.
"},{"location":"api/pydvl/utils/score/#pydvl.utils.score.ScorerCallable","title":"ScorerCallable","text":" Bases: Protocol
Signature for a scorer
"},{"location":"api/pydvl/utils/score/#pydvl.utils.score.Scorer","title":"Scorer","text":"Scorer(\n scoring: Union[str, ScorerCallable],\n default: float = np.nan,\n range: Tuple = (-np.inf, np.inf),\n name: Optional[str] = None,\n)\n
A scoring callable that takes a model, data, and labels and returns a scalar.
PARAMETER DESCRIPTION scoring
Either a string or callable that can be passed to get_scorer.
TYPE: Union[str, ScorerCallable]
default
score to be used when a model cannot be fit, e.g. when too little data is passed, or errors arise.
TYPE: float
DEFAULT: nan
range
numerical range of the score function. Some Monte Carlo methods can use this to estimate the number of samples required for a certain quality of approximation. If not provided, it can be read from the scoring
object if it provides it, for instance if it was constructed with compose_score().
TYPE: Tuple
DEFAULT: (-inf, inf)
name
The name of the scorer. If not provided, the name of the function passed will be used.
TYPE: Optional[str]
DEFAULT: None
New in version 0.5.0
Source code in src/pydvl/utils/score.py
def __init__(\n self,\n scoring: Union[str, ScorerCallable],\n default: float = np.nan,\n range: Tuple = (-np.inf, np.inf),\n name: Optional[str] = None,\n):\n if name is None and isinstance(scoring, str):\n name = scoring\n self._scorer = get_scorer(scoring)\n self.default = default\n # TODO: auto-fill from known scorers ?\n self.range = np.array(range)\n self._name = getattr(self._scorer, \"__name__\", name or \"scorer\")\n
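A brief illustrative sketch (the toy data is made up, and it assumes a Scorer instance is callable as scorer(model, X, y) as described above):
>>> import numpy as np\n>>> from sklearn.linear_model import LinearRegression\n>>> from pydvl.utils.score import Scorer\n>>> X, y = np.arange(10.0).reshape(-1, 1), 2 * np.arange(10.0)\n>>> model = LinearRegression().fit(X, y)\n>>> scorer = Scorer(\"r2\", default=0.0, range=(-np.inf, 1.0))\n>>> round(float(scorer(model, X, y)), 3)  # a perfect linear fit\n1.0\n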
"},{"location":"api/pydvl/utils/score/#pydvl.utils.score.compose_score","title":"compose_score","text":"compose_score(\n scorer: Scorer,\n transformation: Callable[[float], float],\n range: Tuple[float, float],\n name: str,\n) -> Scorer\n
Composes a scoring function with an arbitrary scalar transformation.
Useful to squash unbounded scores into ranges manageable by data valuation methods.
Example:
sigmoid = lambda x: 1/(1+np.exp(-x))\ncompose_score(Scorer(\"r2\"), sigmoid, range=(0,1), name=\"squashed r2\")\n
PARAMETER DESCRIPTION scorer
The object to be composed.
TYPE: Scorer
transformation
A scalar transformation
TYPE: Callable[[float], float]
range
The range of the transformation. This will be used e.g. by Utility for the range of the composed scorer.
TYPE: Tuple[float, float]
name
A string representation for the composition, for str()
.
TYPE: str
RETURNS DESCRIPTION Scorer
The composite Scorer.
Source code in src/pydvl/utils/score.py
def compose_score(\n scorer: Scorer,\n transformation: Callable[[float], float],\n range: Tuple[float, float],\n name: str,\n) -> Scorer:\n \"\"\"Composes a scoring function with an arbitrary scalar transformation.\n\n Useful to squash unbounded scores into ranges manageable by data valuation\n methods.\n\n Example:\n\n ```python\n sigmoid = lambda x: 1/(1+np.exp(-x))\n compose_score(Scorer(\"r2\"), sigmoid, range=(0,1), name=\"squashed r2\")\n ```\n\n Args:\n scorer: The object to be composed.\n transformation: A scalar transformation\n range: The range of the transformation. This will be used e.g. by\n [Utility][pydvl.utils.utility.Utility] for the range of the composed.\n name: A string representation for the composition, for `str()`.\n\n Returns:\n The composite [Scorer][pydvl.utils.score.Scorer].\n \"\"\"\n\n class CompositeScorer(Scorer):\n def __call__(self, model: SupervisedModel, X: NDArray, y: NDArray) -> float:\n score = self._scorer(model=model, X=X, y=y)\n return transformation(score)\n\n return CompositeScorer(scorer, range=range, name=name)\n
"},{"location":"api/pydvl/utils/status/","title":"Status","text":""},{"location":"api/pydvl/utils/status/#pydvl.utils.status","title":"pydvl.utils.status","text":""},{"location":"api/pydvl/utils/status/#pydvl.utils.status.Status","title":"Status","text":" Bases: Enum
Status of a computation.
Statuses can be combined using bitwise or (|
) and bitwise and (&
) to get the status of a combined computation. For example, if we have two computations, one that has converged and one that has failed, then the combined status is Status.Converged | Status.Failed == Status.Converged
, but Status.Converged & Status.Failed == Status.Failed
.
"},{"location":"api/pydvl/utils/status/#pydvl.utils.status.Status--or","title":"OR","text":"The result of bitwise or-ing two valuation statuses with |
is given by the following table:
P | P = P, P | C = C, P | F = P; C | P = C, C | C = C, C | F = C; F | P = P, F | C = C, F | F = F, where P = Pending, C = Converged, F = Failed.
"},{"location":"api/pydvl/utils/status/#pydvl.utils.status.Status--and","title":"AND","text":"The result of bitwise and-ing two valuation statuses with &
is given by the following table:
P & P = P, P & C = P, P & F = F; C & P = P, C & C = C, C & F = F; F & P = F, F & C = F, F & F = F, where P = Pending, C = Converged, F = Failed.
"},{"location":"api/pydvl/utils/status/#pydvl.utils.status.Status--not","title":"NOT","text":"The result of bitwise negation of a Status with ~
is Failed
if the status is Converged
, or Converged
otherwise:
~P == C, ~C == F, ~F == C\n
"},{"location":"api/pydvl/utils/status/#pydvl.utils.status.Status--boolean-casting","title":"Boolean casting","text":"A Status evaluates to True
iff it's Converged
or Failed
:
bool(Status.Pending) == False\nbool(Status.Converged) == True\nbool(Status.Failed) == True\n
Warning
These truth values are inconsistent with the usual boolean operations. In particular the XOR of two instances of Status
is not the same as the XOR of their boolean values.
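A short doctest-style sketch of the combination rules described above (illustrative only):
>>> from pydvl.utils.status import Status\n>>> (Status.Converged | Status.Failed) == Status.Converged\nTrue\n>>> (Status.Converged & Status.Failed) == Status.Failed\nTrue\n>>> bool(Status.Pending)\nFalse\n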
"},{"location":"api/pydvl/utils/types/","title":"Types","text":""},{"location":"api/pydvl/utils/types/#pydvl.utils.types","title":"pydvl.utils.types","text":"This module contains types, protocols, decorators and generic function transformations. Some of it probably belongs elsewhere.
"},{"location":"api/pydvl/utils/types/#pydvl.utils.types.SupervisedModel","title":"SupervisedModel","text":" Bases: Protocol
This is the minimal Protocol that valuation methods require from models in order to work.
All that is needed are the standard sklearn methods fit()
, predict()
and score()
.
"},{"location":"api/pydvl/utils/types/#pydvl.utils.types.SupervisedModel.fit","title":"fit","text":"fit(x: NDArray, y: NDArray)\n
Fit the model to the data
PARAMETER DESCRIPTION x
Independent variables
TYPE: NDArray
y
Dependent variable
TYPE: NDArray
Source code in src/pydvl/utils/types.py
def fit(self, x: NDArray, y: NDArray):\n \"\"\"Fit the model to the data\n\n Args:\n x: Independent variables\n y: Dependent variable\n \"\"\"\n pass\n
"},{"location":"api/pydvl/utils/types/#pydvl.utils.types.SupervisedModel.predict","title":"predict","text":"predict(x: NDArray) -> NDArray\n
Compute predictions for the input
PARAMETER DESCRIPTION x
Independent variables for which to compute predictions
TYPE: NDArray
RETURNS DESCRIPTION NDArray
Predictions for the input
Source code in src/pydvl/utils/types.py
def predict(self, x: NDArray) -> NDArray:\n \"\"\"Compute predictions for the input\n\n Args:\n x: Independent variables for which to compute predictions\n\n Returns:\n Predictions for the input\n \"\"\"\n pass\n
"},{"location":"api/pydvl/utils/types/#pydvl.utils.types.SupervisedModel.score","title":"score","text":"score(x: NDArray, y: NDArray) -> float\n
Compute the score of the model given test data
PARAMETER DESCRIPTION x
Independent variables
TYPE: NDArray
y
Dependent variable
TYPE: NDArray
RETURNS DESCRIPTION float
The score of the model on (x, y)
Source code in src/pydvl/utils/types.py
def score(self, x: NDArray, y: NDArray) -> float:\n \"\"\"Compute the score of the model given test data\n\n Args:\n x: Independent variables\n y: Dependent variable\n\n Returns:\n The score of the model on `(x, y)`\n \"\"\"\n pass\n
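A minimal sketch (illustrative, not part of pyDVL) of a custom class satisfying this protocol:
>>> import numpy as np\n>>> class MeanPredictor:  # predicts the training mean everywhere\n...     def fit(self, x, y):\n...         self.mean_ = float(np.mean(y))\n...     def predict(self, x):\n...         return np.full(len(x), self.mean_)\n...     def score(self, x, y):\n...         return -float(np.mean((self.predict(x) - y) ** 2))\n>>> m = MeanPredictor()\n>>> m.fit(np.zeros((3, 1)), np.array([1.0, 2.0, 3.0]))\n>>> round(m.score(np.zeros((3, 1)), np.array([1.0, 2.0, 3.0])), 3)\n-0.667\n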
"},{"location":"api/pydvl/utils/types/#pydvl.utils.types.ensure_seed_sequence","title":"ensure_seed_sequence","text":"ensure_seed_sequence(\n seed: Optional[Union[Seed, SeedSequence]] = None\n) -> SeedSequence\n
If the passed seed is a SeedSequence object, it is returned as is. If it is a Generator, the seed sequence is extracted from the generator's internal state. Otherwise, a new SeedSequence object is created from the passed (optional) seed.
PARAMETER DESCRIPTION seed
Either an int, a Generator object, a SeedSequence object, or None.
TYPE: Optional[Union[Seed, SeedSequence]]
DEFAULT: None
RETURNS DESCRIPTION SeedSequence
A SeedSequence object.
New in version 0.7.0
Source code in src/pydvl/utils/types.py
def ensure_seed_sequence(\n seed: Optional[Union[Seed, SeedSequence]] = None\n) -> SeedSequence:\n \"\"\"\n If the passed seed is a SeedSequence object then it is returned as is. If it is\n a Generator the internal protected seed sequence from the generator gets extracted.\n Otherwise, a new SeedSequence object is created from the passed (optional) seed.\n\n Args:\n seed: Either an int, a Generator object a SeedSequence object or None.\n\n Returns:\n A SeedSequence object.\n\n !!! tip \"New in version 0.7.0\"\n \"\"\"\n if isinstance(seed, SeedSequence):\n return seed\n elif isinstance(seed, Generator):\n return cast(SeedSequence, seed.bit_generator.seed_seq) # type: ignore\n else:\n return SeedSequence(seed)\n
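A quick illustrative check (not from the original docs):
>>> from pydvl.utils.types import ensure_seed_sequence\n>>> ss = ensure_seed_sequence(42)\n>>> type(ss).__name__\n'SeedSequence'\n>>> ensure_seed_sequence(ss) is ss  # SeedSequence objects pass through unchanged\nTrue\n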
"},{"location":"api/pydvl/utils/utility/","title":"Utility","text":""},{"location":"api/pydvl/utils/utility/#pydvl.utils.utility","title":"pydvl.utils.utility","text":"This module contains classes to manage and learn utility functions for the computation of values. Please see the documentation on Computing Data Values for more information.
Utility holds information about model, data and scoring function (the latter being what is usually meant by utility in the general definition of Shapley value). It is automatically cached across machines when the cache is configured and enabled upon construction.
DataUtilityLearning adds support for learning the scoring function to avoid repeated re-training of the model to compute the score.
This module also contains derived Utility
classes for toy games that are used for testing and for demonstration purposes.
"},{"location":"api/pydvl/utils/utility/#pydvl.utils.utility--references","title":"References","text":" -
Wang, T., Yang, Y. and Jia, R., 2021. Improving cooperative game theory-based data valuation via data utility learning. arXiv preprint arXiv:2107.06336.\u00a0\u21a9
"},{"location":"api/pydvl/utils/utility/#pydvl.utils.utility.Utility","title":"Utility","text":"Utility(\n model: SupervisedModel,\n data: Dataset,\n scorer: Optional[Union[str, Scorer]] = None,\n *,\n default_score: float = 0.0,\n score_range: Tuple[float, float] = (-np.inf, np.inf),\n catch_errors: bool = True,\n show_warnings: bool = False,\n cache_backend: Optional[CacheBackend] = None,\n cached_func_options: Optional[CachedFuncConfig] = None,\n clone_before_fit: bool = True\n)\n
Convenience wrapper with configurable memoization of the scoring function.
An instance of Utility
holds the triple of model, dataset and scoring function which determines the value of data points. This is used for the computation of all game-theoretic values like Shapley values and the Least Core.
The Utility expects the model to fulfill the SupervisedModel interface, i.e. to have fit()
, predict()
, and score()
methods.
When calling the utility, the model will be cloned if it is a scikit-learn model; otherwise a copy is created using copy.deepcopy.
Since evaluating the scoring function requires retraining the model and that can be time-consuming, this class wraps it and caches the results of each execution. Caching is available both locally and across nodes, but must always be enabled for your project first, see the documentation and the module documentation.
ATTRIBUTE DESCRIPTION model
The supervised model.
TYPE: SupervisedModel
data
An object containing the split data.
TYPE: Dataset
scorer
A scoring function. If None, the score()
method of the model will be used. See score for ways to create and compose scorers, in particular how to set default values and ranges.
TYPE: Scorer
PARAMETER DESCRIPTION model
Any supervised model. Typical choices can be found in the scikit-learn documentation (https://scikit-learn.org/stable/supervised_learning.html).
TYPE: SupervisedModel
data
Dataset or GroupedDataset instance.
TYPE: Dataset
scorer
A scoring object. If None, the score()
method of the model will be used. See score for ways to create and compose scorers, in particular how to set default values and ranges. For convenience, a string can be passed, which will be used to construct a Scorer.
TYPE: Optional[Union[str, Scorer]]
DEFAULT: None
default_score
As a convenience when no scorer
object is passed (where a default value can be provided), this argument also allows to set the default score for models that have not been fit, e.g. when too little data is passed, or errors arise.
TYPE: float
DEFAULT: 0.0
score_range
As with default_score
, this is a convenience argument for when no scorer
argument is provided, to set the numerical range of the score function. Some Monte Carlo methods can use this to estimate the number of samples required for a certain quality of approximation.
TYPE: Tuple[float, float]
DEFAULT: (-inf, inf)
catch_errors
set to True
to catch the errors when fit()
fails. This could happen in several steps of the pipeline, e.g. when too little training data is passed, which happens often during Shapley value calculations. When this happens, the default_score
is returned as a score and computation continues.
TYPE: bool
DEFAULT: True
show_warnings
Set to False
to suppress warnings thrown by fit()
.
TYPE: bool
DEFAULT: False
cache_backend
Optional instance of CacheBackend used to wrap the _utility method of the Utility instance. By default, this is set to None and that means that the utility evaluations will not be cached.
TYPE: Optional[CacheBackend]
DEFAULT: None
cached_func_options
Optional configuration object for cached utility evaluation.
TYPE: Optional[CachedFuncConfig]
DEFAULT: None
clone_before_fit
If True
, the model will be cloned before calling fit()
.
TYPE: bool
DEFAULT: True
Example >>> from pydvl.utils import Utility, DataUtilityLearning, Dataset\n>>> from sklearn.linear_model import LinearRegression, LogisticRegression\n>>> from sklearn.datasets import load_iris\n>>> dataset = Dataset.from_sklearn(load_iris(), random_state=16)\n>>> u = Utility(LogisticRegression(random_state=16), dataset)\n>>> u(dataset.indices)\n0.9\n
With caching enabled:
>>> from pydvl.utils import Utility, DataUtilityLearning, Dataset\n>>> from pydvl.utils.caching.memory import InMemoryCacheBackend\n>>> from sklearn.linear_model import LinearRegression, LogisticRegression\n>>> from sklearn.datasets import load_iris\n>>> dataset = Dataset.from_sklearn(load_iris(), random_state=16)\n>>> cache_backend = InMemoryCacheBackend()\n>>> u = Utility(LogisticRegression(random_state=16), dataset, cache_backend=cache_backend)\n>>> u(dataset.indices)\n0.9\n
Source code in src/pydvl/utils/utility.py
def __init__(\n self,\n model: SupervisedModel,\n data: Dataset,\n scorer: Optional[Union[str, Scorer]] = None,\n *,\n default_score: float = 0.0,\n score_range: Tuple[float, float] = (-np.inf, np.inf),\n catch_errors: bool = True,\n show_warnings: bool = False,\n cache_backend: Optional[CacheBackend] = None,\n cached_func_options: Optional[CachedFuncConfig] = None,\n clone_before_fit: bool = True,\n):\n self.model = self._clone_model(model)\n self.data = data\n if isinstance(scorer, str):\n scorer = Scorer(scorer, default=default_score, range=score_range)\n self.scorer = check_scoring(self.model, scorer)\n self.default_score = scorer.default if scorer is not None else default_score\n # TODO: auto-fill from known scorers ?\n self.score_range = scorer.range if scorer is not None else np.array(score_range)\n self.clone_before_fit = clone_before_fit\n self.catch_errors = catch_errors\n self.show_warnings = show_warnings\n self.cache = cache_backend\n if cached_func_options is None:\n cached_func_options = CachedFuncConfig()\n # TODO: Find a better way to do this.\n if cached_func_options.hash_prefix is None:\n # FIX: This does not handle reusing the same across runs.\n cached_func_options.hash_prefix = str(hash((model, data, scorer)))\n self.cached_func_options = cached_func_options\n self._initialize_utility_wrapper()\n
"},{"location":"api/pydvl/utils/utility/#pydvl.utils.utility.Utility.cache_stats","title":"cache_stats property
","text":"cache_stats: Optional[CacheStats]\n
Cache statistics are gathered when cache is enabled. See CacheStats for all fields returned.
"},{"location":"api/pydvl/utils/utility/#pydvl.utils.utility.Utility.__call__","title":"__call__","text":"__call__(indices: Iterable[int]) -> float\n
PARAMETER DESCRIPTION indices
a subset of valid indices for the x_train
attribute of Dataset.
TYPE: Iterable[int]
Source code in src/pydvl/utils/utility.py
def __call__(self, indices: Iterable[int]) -> float:\n \"\"\"\n Args:\n indices: a subset of valid indices for the\n `x_train` attribute of [Dataset][pydvl.utils.dataset.Dataset].\n \"\"\"\n utility: float = self._utility_wrapper(frozenset(indices))\n return utility\n
"},{"location":"api/pydvl/utils/utility/#pydvl.utils.utility.DataUtilityLearning","title":"DataUtilityLearning","text":"DataUtilityLearning(u: Utility, training_budget: int, model: SupervisedModel)\n
Implementation of Data Utility Learning (Wang et al., 2022)1.
This object wraps a Utility and delegates calls to it, up until a given budget (number of iterations). Every tuple of input and output (a so-called utility sample) is stored. Once the budget is exhausted, DataUtilityLearning
fits the given model to the utility samples. Subsequent calls will use the learned model to predict the utility instead of delegating.
PARAMETER DESCRIPTION u
The Utility to learn.
TYPE: Utility
training_budget
Number of utility samples to collect before fitting the given model.
TYPE: int
model
A supervised regression model
TYPE: SupervisedModel
Example >>> from pydvl.utils import Utility, DataUtilityLearning, Dataset\n>>> from sklearn.linear_model import LinearRegression, LogisticRegression\n>>> from sklearn.datasets import load_iris\n>>> dataset = Dataset.from_sklearn(load_iris())\n>>> u = Utility(LogisticRegression(), dataset)\n>>> wrapped_u = DataUtilityLearning(u, 3, LinearRegression())\n... # First 3 calls will be computed normally\n>>> for i in range(3):\n... _ = wrapped_u((i,))\n>>> wrapped_u((1, 2, 3)) # Subsequent calls will be computed using the fit model for DUL\n0.0\n
Source code in src/pydvl/utils/utility.py
def __init__(\n self, u: Utility, training_budget: int, model: SupervisedModel\n) -> None:\n self.utility = u\n self.training_budget = training_budget\n self.model = model\n self._current_iteration = 0\n self._is_model_fit = False\n self._utility_samples: Dict[FrozenSet, Tuple[NDArray[np.bool_], float]] = {}\n
"},{"location":"api/pydvl/utils/utility/#pydvl.utils.utility.DataUtilityLearning.data","title":"data property
","text":"data: Dataset\n
Returns the wrapped utility's Dataset.
"},{"location":"api/pydvl/utils/caching/","title":"Caching","text":""},{"location":"api/pydvl/utils/caching/#pydvl.utils.caching","title":"pydvl.utils.caching","text":"This module provides caching of functions.
PyDVL can cache (memoize) the computation of the utility function and speed up some computations for data valuation.
Warning
Function evaluations are cached with a key based on the function's signature and code. This can lead to undesired cache hits, see Cache reuse.
Remember not to reuse utility objects for different datasets.
"},{"location":"api/pydvl/utils/caching/#pydvl.utils.caching--configuration","title":"Configuration","text":"Caching is disabled by default but can be enabled easily, see Setting up the cache. When enabled, it will be added to any callable used to construct a Utility (done with the wrap method of CacheBackend). Depending on the nature of the utility you might want to enable the computation of a running average of function values, see Usage with stochastic functions. You can see all configuration options under CachedFuncConfig.
"},{"location":"api/pydvl/utils/caching/#pydvl.utils.caching--supported-backends","title":"Supported Backends","text":"pyDVL supports 3 different caching backends:
- InMemoryCacheBackend: an in-memory cache backend that uses a dictionary to store and retrieve cached values. This is used to share cached values between threads in a single process.
- DiskCacheBackend: a disk-based cache backend that uses pickled values written to and read from disk. This is used to share cached values between processes in a single machine.
-
MemcachedCacheBackend: a Memcached-based cache backend that uses pickled values written to and read from a Memcached server. This is used to share cached values between processes across multiple machines.
Info
This specific backend requires optional dependencies not installed by default. See Extra dependencies for more information.
"},{"location":"api/pydvl/utils/caching/#pydvl.utils.caching--usage-with-stochastic-functions","title":"Usage with stochastic functions","text":"In addition to standard memoization, the wrapped functions can compute running average and standard error of repeated evaluations for the same input. This can be useful for stochastic functions with high variance (e.g. model training for small sample sizes), but drastically reduces the speed benefits of memoization.
This behaviour can be activated with the option allow_repeated_evaluations.
"},{"location":"api/pydvl/utils/caching/#pydvl.utils.caching--cache-reuse","title":"Cache reuse","text":"When working directly with CachedFunc, it is essential to only cache pure functions. If they have any kind of state, either internal or external (e.g. a closure over some data that may change), then the cache will fail to notice this and the same value will be returned.
When a function is wrapped with CachedFunc for memoization, its signature (input and output names) and code are used as a key for the cache.
If you are running experiments with the same Utility but different datasets, this will lead to evaluations of the utility on new data returning old values because utilities only use sample indices as arguments (so there is no way to tell the difference between '1' for dataset A and '1' for dataset B from the point of view of the cache). One solution is to empty the cache between runs by calling the clear
method of the cache backend instance, but the preferred one is to use a different Utility object for each dataset.
"},{"location":"api/pydvl/utils/caching/#pydvl.utils.caching--unexpected-cache-misses","title":"Unexpected cache misses","text":"Because all arguments to a function are used as part of the key for the cache, sometimes one must exclude some of them. For example, If a function is going to run across multiple processes and some reporting arguments are added (like a job_id
for logging purposes), these will be part of the signature and make the functions distinct in the eyes of the cache. This can be avoided with the ignore_args option in the configuration.
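A hedged sketch of the idea (the function, its arguments and the chosen backend are made up for illustration; time_threshold=0.0 forces caching of this fast toy function):
>>> from pydvl.utils.caching.memory import InMemoryCacheBackend\n>>> from pydvl.utils.caching.config import CachedFuncConfig\n>>> backend = InMemoryCacheBackend()\n>>> def utility(indices, job_id=None):  # job_id is a nuisance argument\n...     return float(sum(indices))\n>>> config = CachedFuncConfig(ignore_args=[\"job_id\"], time_threshold=0.0)\n>>> cached = backend.wrap(utility, config=config)\n>>> cached((1, 2), job_id=0)\n3.0\n>>> cached((1, 2), job_id=1)  # same cache entry despite a different job_id\n3.0\n>>> cached.stats.hits\n1\n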
"},{"location":"api/pydvl/utils/caching/base/","title":"Base","text":""},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base","title":"pydvl.utils.caching.base","text":""},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CacheStats","title":"CacheStats dataclass
","text":"CacheStats(\n sets: int = 0,\n misses: int = 0,\n hits: int = 0,\n timeouts: int = 0,\n errors: int = 0,\n reconnects: int = 0,\n)\n
Class used to store statistics gathered by cached functions.
ATTRIBUTE DESCRIPTION sets
Number of times a value was set in the cache.
TYPE: int
misses
Number of times a value was not found in the cache.
TYPE: int
hits
Number of times a value was found in the cache.
TYPE: int
timeouts
Number of times a timeout occurred.
TYPE: int
errors
Number of times an error occurred.
TYPE: int
reconnects
Number of times the client reconnected to the server.
TYPE: int
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CacheResult","title":"CacheResult dataclass
","text":"CacheResult(value: float, count: int = 1, variance: float = 0.0)\n
A class used to store the cached result of a computation as well as count and variance when using repeated evaluation.
ATTRIBUTE DESCRIPTION value
Cached value.
TYPE: float
count
Number of times this value has been computed.
TYPE: int
variance
Variance associated with the cached value.
TYPE: float
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CacheBackend","title":"CacheBackend","text":"CacheBackend()\n
Bases: ABC
Abstract base class for cache backends.
Defines interface for cache access including wrapping callables, getting/setting results, clearing cache, and combining cache keys.
ATTRIBUTE DESCRIPTION stats
Cache statistics tracker.
Source code in src/pydvl/utils/caching/base.py
def __init__(self) -> None:\n self.stats = CacheStats()\n
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CacheBackend.wrap","title":"wrap","text":"wrap(\n func: Callable, *, config: Optional[CachedFuncConfig] = None\n) -> CachedFunc\n
Wraps a function to cache its results.
PARAMETER DESCRIPTION func
The function to wrap.
TYPE: Callable
config
Optional caching options for the wrapped function.
TYPE: Optional[CachedFuncConfig]
DEFAULT: None
RETURNS DESCRIPTION CachedFunc
The wrapped cached function.
Source code in src/pydvl/utils/caching/base.py
def wrap(\n self,\n func: Callable,\n *,\n config: Optional[CachedFuncConfig] = None,\n) -> \"CachedFunc\":\n \"\"\"Wraps a function to cache its results.\n\n Args:\n func: The function to wrap.\n config: Optional caching options for the wrapped function.\n\n Returns:\n The wrapped cached function.\n \"\"\"\n return CachedFunc(\n func,\n cache_backend=self,\n config=config,\n )\n
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CacheBackend.get","title":"get abstractmethod
","text":"get(key: str) -> Optional[CacheResult]\n
Abstract method to retrieve a cached result.
Implemented by subclasses.
PARAMETER DESCRIPTION key
The cache key.
TYPE: str
RETURNS DESCRIPTION Optional[CacheResult]
The cached result or None if not found.
Source code in src/pydvl/utils/caching/base.py
@abstractmethod\ndef get(self, key: str) -> Optional[CacheResult]:\n \"\"\"Abstract method to retrieve a cached result.\n\n Implemented by subclasses.\n\n Args:\n key: The cache key.\n\n Returns:\n The cached result or None if not found.\n \"\"\"\n pass\n
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CacheBackend.set","title":"set abstractmethod
","text":"set(key: str, value: CacheResult) -> None\n
Abstract method to set a cached result.
Implemented by subclasses.
PARAMETER DESCRIPTION key
The cache key.
TYPE: str
value
The result to cache.
TYPE: CacheResult
Source code in src/pydvl/utils/caching/base.py
@abstractmethod\ndef set(self, key: str, value: CacheResult) -> None:\n \"\"\"Abstract method to set a cached result.\n\n Implemented by subclasses.\n\n Args:\n key: The cache key.\n value: The result to cache.\n \"\"\"\n pass\n
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CacheBackend.clear","title":"clear abstractmethod
","text":"clear() -> None\n
Abstract method to clear the entire cache.
Source code in src/pydvl/utils/caching/base.py
@abstractmethod\ndef clear(self) -> None:\n \"\"\"Abstract method to clear the entire cache.\"\"\"\n pass\n
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CacheBackend.combine_hashes","title":"combine_hashes abstractmethod
","text":"combine_hashes(*args: str) -> str\n
Abstract method to combine cache keys.
Source code in src/pydvl/utils/caching/base.py
@abstractmethod\ndef combine_hashes(self, *args: str) -> str:\n \"\"\"Abstract method to combine cache keys.\"\"\"\n pass\n
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CachedFunc","title":"CachedFunc","text":"CachedFunc(\n func: Callable[..., float],\n *,\n cache_backend: CacheBackend,\n config: Optional[CachedFuncConfig] = None\n)\n
Caches callable function results with a provided cache backend.
Wraps a callable function to cache its results using a provided instance of a subclass of CacheBackend.
This class is heavily inspired by joblib.memory.MemorizedFunc.
This class caches calls to the wrapped callable by generating a hash key based on the wrapped callable's code, the arguments passed to it and the optional hash_prefix.
Warning
This class only works with hashable arguments to the wrapped callable.
PARAMETER DESCRIPTION func
Callable to wrap.
TYPE: Callable[..., float]
cache_backend
Instance of CacheBackend that handles setting and getting values.
TYPE: CacheBackend
config
Configuration for wrapped function.
TYPE: Optional[CachedFuncConfig]
DEFAULT: None
Source code in src/pydvl/utils/caching/base.py
def __init__(\n self,\n func: Callable[..., float],\n *,\n cache_backend: CacheBackend,\n config: Optional[CachedFuncConfig] = None,\n) -> None:\n self.func = func\n self.cache_backend = cache_backend\n if config is None:\n config = CachedFuncConfig()\n self.config = config\n\n self.__doc__ = f\"A wrapper around {func.__name__}() with caching enabled.\\n\" + (\n CachedFunc.__doc__ or \"\"\n )\n self.__name__ = f\"cached_{func.__name__}\"\n path = list(reversed(func.__qualname__.split(\".\")))\n patched = [f\"cached_{path[0]}\"] + path[1:]\n self.__qualname__ = \".\".join(reversed(patched))\n
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CachedFunc.stats","title":"stats property
","text":"stats: CacheStats\n
Cache backend statistics.
"},{"location":"api/pydvl/utils/caching/base/#pydvl.utils.caching.base.CachedFunc.__call__","title":"__call__","text":"__call__(*args, **kwargs) -> float\n
Call the wrapped cached function.
Executes the wrapped function, caching and returning the result.
Source code in src/pydvl/utils/caching/base.py
def __call__(self, *args, **kwargs) -> float:\n \"\"\"Call the wrapped cached function.\n\n Executes the wrapped function, caching and returning the result.\n \"\"\"\n return self._cached_call(args, kwargs)\n
"},{"location":"api/pydvl/utils/caching/config/","title":"Config","text":""},{"location":"api/pydvl/utils/caching/config/#pydvl.utils.caching.config","title":"pydvl.utils.caching.config","text":""},{"location":"api/pydvl/utils/caching/config/#pydvl.utils.caching.config.CachedFuncConfig","title":"CachedFuncConfig dataclass
","text":"CachedFuncConfig(\n hash_prefix: Optional[str] = None,\n ignore_args: Collection[str] = list(),\n time_threshold: float = 0.3,\n allow_repeated_evaluations: bool = False,\n rtol_stderr: float = 0.1,\n min_repetitions: int = 3,\n)\n
Configuration for cached functions and methods, providing memoization of function calls.
Instances of this class are typically used as arguments for the construction of a Utility.
PARAMETER DESCRIPTION hash_prefix
Optional string prefix that will be prepended to the cache key. This can be provided in order to guarantee cache reuse across runs.
TYPE: Optional[str]
DEFAULT: None
ignore_args
Do not take these keyword arguments into account when hashing the wrapped function for usage as key. This allows sharing the cache among different jobs for the same experiment run if the callable happens to have \"nuisance\" parameters like job_id
which do not affect the result of the computation.
TYPE: Collection[str]
DEFAULT: list()
time_threshold
Computations taking less time than this many seconds are not cached. A value of 0 means that it will always cache results.
TYPE: float
DEFAULT: 0.3
allow_repeated_evaluations
If True
, repeated calls to a function with the same arguments will be allowed and outputs averaged until the running standard deviation of the mean stabilizes below rtol_stderr * mean
.
TYPE: bool
DEFAULT: False
rtol_stderr
relative tolerance for repeated evaluations. More precisely, the cached function will stop being re-evaluated once the standard deviation of the mean is smaller than rtol_stderr * mean
.
TYPE: float
DEFAULT: 0.1
min_repetitions
minimum number of times that a function evaluation on the same arguments is repeated before returning cached values. Useful for stochastic functions only. If the model training is very noisy, set this number to higher values to reduce variance.
TYPE: int
DEFAULT: 3
"},{"location":"api/pydvl/utils/caching/disk/","title":"Disk","text":""},{"location":"api/pydvl/utils/caching/disk/#pydvl.utils.caching.disk","title":"pydvl.utils.caching.disk","text":""},{"location":"api/pydvl/utils/caching/disk/#pydvl.utils.caching.disk.DiskCacheBackend","title":"DiskCacheBackend","text":"DiskCacheBackend(cache_dir: Optional[Union[PathLike, str]] = None)\n
Bases: CacheBackend
Disk cache backend that stores results in files.
Implements the CacheBackend interface for a disk-based cache. Stores cache entries as pickled files on disk, keyed by cache key. This allows sharing evaluations across processes in a single node/computer.
PARAMETER DESCRIPTION cache_dir
Base directory for cache storage.
TYPE: Optional[Union[PathLike, str]]
DEFAULT: None
ATTRIBUTE DESCRIPTION cache_dir
Base directory for cache storage.
Example Basic usage:
>>> from pydvl.utils.caching.disk import DiskCacheBackend\n>>> cache_backend = DiskCacheBackend()\n>>> cache_backend.clear()\n>>> value = 42\n>>> cache_backend.set(\"key\", value)\n>>> cache_backend.get(\"key\")\n42\n
Callable wrapping:
>>> from pydvl.utils.caching.disk import DiskCacheBackend\n>>> cache_backend = DiskCacheBackend()\n>>> cache_backend.clear()\n>>> value = 42\n>>> def foo(x: int):\n... return x + 1\n...\n>>> wrapped_foo = cache_backend.wrap(foo)\n>>> wrapped_foo(value)\n43\n>>> wrapped_foo.stats.misses\n1\n>>> wrapped_foo.stats.hits\n0\n>>> wrapped_foo(value)\n43\n>>> wrapped_foo.stats.misses\n1\n>>> wrapped_foo.stats.hits\n1\n
PARAMETER DESCRIPTION cache_dir
Base directory for cache storage. If not provided, this defaults to a newly created temporary directory.
TYPE: Optional[Union[PathLike, str]]
DEFAULT: None
Source code in src/pydvl/utils/caching/disk.py
def __init__(\n self,\n cache_dir: Optional[Union[os.PathLike, str]] = None,\n) -> None:\n \"\"\"Initialize the disk cache backend.\n\n Args:\n cache_dir: Base directory for cache storage.\n If not provided, this defaults to a newly created\n temporary directory.\n \"\"\"\n super().__init__()\n if cache_dir is None:\n cache_dir = tempfile.mkdtemp(prefix=\"pydvl\")\n self.cache_dir = Path(cache_dir)\n self.cache_dir.mkdir(exist_ok=True, parents=True)\n
"},{"location":"api/pydvl/utils/caching/disk/#pydvl.utils.caching.disk.DiskCacheBackend.wrap","title":"wrap","text":"wrap(\n func: Callable, *, config: Optional[CachedFuncConfig] = None\n) -> CachedFunc\n
Wraps a function to cache its results.
PARAMETER DESCRIPTION func
The function to wrap.
TYPE: Callable
config
Optional caching options for the wrapped function.
TYPE: Optional[CachedFuncConfig]
DEFAULT: None
RETURNS DESCRIPTION CachedFunc
The wrapped cached function.
Source code in src/pydvl/utils/caching/base.py
def wrap(\n self,\n func: Callable,\n *,\n config: Optional[CachedFuncConfig] = None,\n) -> \"CachedFunc\":\n \"\"\"Wraps a function to cache its results.\n\n Args:\n func: The function to wrap.\n config: Optional caching options for the wrapped function.\n\n Returns:\n The wrapped cached function.\n \"\"\"\n return CachedFunc(\n func,\n cache_backend=self,\n config=config,\n )\n
"},{"location":"api/pydvl/utils/caching/disk/#pydvl.utils.caching.disk.DiskCacheBackend.get","title":"get","text":"get(key: str) -> Optional[Any]\n
Get a value from the cache.
PARAMETER DESCRIPTION key
Cache key.
TYPE: str
RETURNS DESCRIPTION Optional[Any]
Cached value or None if not found.
Source code in src/pydvl/utils/caching/disk.py
def get(self, key: str) -> Optional[Any]:\n \"\"\"Get a value from the cache.\n\n Args:\n key: Cache key.\n\n Returns:\n Cached value or None if not found.\n \"\"\"\n cache_file = self.cache_dir / key\n if not cache_file.exists():\n self.stats.misses += 1\n return None\n self.stats.hits += 1\n with cache_file.open(\"rb\") as f:\n return cloudpickle.load(f)\n
"},{"location":"api/pydvl/utils/caching/disk/#pydvl.utils.caching.disk.DiskCacheBackend.set","title":"set","text":"set(key: str, value: Any) -> None\n
Set a value in the cache.
PARAMETER DESCRIPTION key
Cache key.
TYPE: str
value
Value to cache.
TYPE: Any
Source code in src/pydvl/utils/caching/disk.py
def set(self, key: str, value: Any) -> None:\n \"\"\"Set a value in the cache.\n\n Args:\n key: Cache key.\n value: Value to cache.\n \"\"\"\n cache_file = self.cache_dir / key\n self.stats.sets += 1\n with cache_file.open(\"wb\") as f:\n cloudpickle.dump(value, f, protocol=PICKLE_VERSION)\n
"},{"location":"api/pydvl/utils/caching/disk/#pydvl.utils.caching.disk.DiskCacheBackend.clear","title":"clear","text":"clear() -> None\n
Deletes cache directory and recreates it.
Source code in src/pydvl/utils/caching/disk.py
def clear(self) -> None:\n \"\"\"Deletes cache directory and recreates it.\"\"\"\n shutil.rmtree(self.cache_dir)\n self.cache_dir.mkdir(exist_ok=True, parents=True)\n
"},{"location":"api/pydvl/utils/caching/disk/#pydvl.utils.caching.disk.DiskCacheBackend.combine_hashes","title":"combine_hashes","text":"combine_hashes(*args: str) -> str\n
Join cache key components.
Source code in src/pydvl/utils/caching/disk.py
def combine_hashes(self, *args: str) -> str:\n \"\"\"Join cache key components.\"\"\"\n return os.pathsep.join(args)\n
"},{"location":"api/pydvl/utils/caching/memcached/","title":"Memcached","text":""},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached","title":"pydvl.utils.caching.memcached","text":""},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedClientConfig","title":"MemcachedClientConfig dataclass
","text":"MemcachedClientConfig(\n server: Tuple[str, int] = (\"localhost\", 11211),\n connect_timeout: float = 1.0,\n timeout: float = 1.0,\n no_delay: bool = True,\n serde: PickleSerde = PickleSerde(pickle_version=PICKLE_VERSION),\n)\n
Configuration of the memcached client.
PARAMETER DESCRIPTION server
A tuple of (IP|domain name, port).
TYPE: Tuple[str, int]
DEFAULT: ('localhost', 11211)
connect_timeout
How many seconds to wait before raising ConnectionRefusedError
on failure to connect.
TYPE: float
DEFAULT: 1.0
timeout
Duration in seconds to wait for send or recv calls on the socket connected to memcached.
TYPE: float
DEFAULT: 1.0
no_delay
If True, set the TCP_NODELAY
flag, which may help with performance in some cases.
TYPE: bool
DEFAULT: True
serde
Serializer / Deserializer (\"serde\"). The default PickleSerde
should work in most cases. See pymemcache.client.base.Client for details.
TYPE: PickleSerde
DEFAULT: PickleSerde(pickle_version=PICKLE_VERSION)
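Example An illustrative configuration overriding the server address and timeouts (the address and values are placeholders, and a reachable memcached server is assumed):
>>> from pydvl.utils.caching.memcached import MemcachedCacheBackend, MemcachedClientConfig\n>>> config = MemcachedClientConfig(server=(\"127.0.0.1\", 11211), connect_timeout=2.0, timeout=2.0)\n>>> cache_backend = MemcachedCacheBackend(config)  # requires a reachable memcached server\n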
"},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedCacheBackend","title":"MemcachedCacheBackend","text":"MemcachedCacheBackend(config: MemcachedClientConfig = MemcachedClientConfig())\n
Bases: CacheBackend
Memcached cache backend for the distributed caching of functions.
Implements the CacheBackend interface for a memcached-based cache. This allows sharing evaluations across processes and nodes in a cluster. You can run memcached as a service, locally or remotely; see the caching documentation.
PARAMETER DESCRIPTION config
Memcached client configuration.
TYPE: MemcachedClientConfig
DEFAULT: MemcachedClientConfig()
ATTRIBUTE DESCRIPTION config
Memcached client configuration.
client
Memcached client instance.
Example Basic usage:
>>> from pydvl.utils.caching.memcached import MemcachedCacheBackend\n>>> cache_backend = MemcachedCacheBackend()\n>>> cache_backend.clear()\n>>> value = 42\n>>> cache_backend.set(\"key\", value)\n>>> cache_backend.get(\"key\")\n42\n
Callable wrapping:
>>> from pydvl.utils.caching.memcached import MemcachedCacheBackend\n>>> cache_backend = MemcachedCacheBackend()\n>>> cache_backend.clear()\n>>> value = 42\n>>> def foo(x: int):\n... return x + 1\n...\n>>> wrapped_foo = cache_backend.wrap(foo)\n>>> wrapped_foo(value)\n43\n>>> wrapped_foo.stats.misses\n1\n>>> wrapped_foo.stats.hits\n0\n>>> wrapped_foo(value)\n43\n>>> wrapped_foo.stats.misses\n1\n>>> wrapped_foo.stats.hits\n1\n
PARAMETER DESCRIPTION config
Memcached client configuration.
TYPE: MemcachedClientConfig
DEFAULT: MemcachedClientConfig()
Source code in src/pydvl/utils/caching/memcached.py
def __init__(self, config: MemcachedClientConfig = MemcachedClientConfig()) -> None:\n \"\"\"Initialize memcached cache backend.\n\n Args:\n config: Memcached client configuration.\n \"\"\"\n\n super().__init__()\n self.config = config\n self.client = self._connect(self.config)\n
"},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedCacheBackend.wrap","title":"wrap","text":"wrap(\n func: Callable, *, config: Optional[CachedFuncConfig] = None\n) -> CachedFunc\n
Wraps a function to cache its results.
PARAMETER DESCRIPTION func
The function to wrap.
TYPE: Callable
config
Optional caching options for the wrapped function.
TYPE: Optional[CachedFuncConfig]
DEFAULT: None
RETURNS DESCRIPTION CachedFunc
The wrapped cached function.
Source code in src/pydvl/utils/caching/base.py
def wrap(\n self,\n func: Callable,\n *,\n config: Optional[CachedFuncConfig] = None,\n) -> \"CachedFunc\":\n \"\"\"Wraps a function to cache its results.\n\n Args:\n func: The function to wrap.\n config: Optional caching options for the wrapped function.\n\n Returns:\n The wrapped cached function.\n \"\"\"\n return CachedFunc(\n func,\n cache_backend=self,\n config=config,\n )\n
"},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedCacheBackend.get","title":"get","text":"get(key: str) -> Optional[Any]\n
Get value from memcached.
PARAMETER DESCRIPTION key
Cache key.
TYPE: str
RETURNS DESCRIPTION Optional[Any]
Cached value or None if not found or client disconnected.
Source code in src/pydvl/utils/caching/memcached.py
def get(self, key: str) -> Optional[Any]:\n \"\"\"Get value from memcached.\n\n Args:\n key: Cache key.\n\n Returns:\n Cached value or None if not found or client disconnected.\n \"\"\"\n result = None\n try:\n result = self.client.get(key)\n except socket.timeout as e:\n self.stats.timeouts += 1\n warnings.warn(f\"{type(self).__name__}: {str(e)}\", RuntimeWarning)\n except OSError as e:\n self.stats.errors += 1\n warnings.warn(f\"{type(self).__name__}: {str(e)}\", RuntimeWarning)\n except AttributeError as e:\n # FIXME: this depends on _recv() failing on invalid sockets\n # See pymemcache.base.py,\n self.stats.reconnects += 1\n warnings.warn(f\"{type(self).__name__}: {str(e)}\", RuntimeWarning)\n self.client = self._connect(self.config)\n if result is None:\n self.stats.misses += 1\n else:\n self.stats.hits += 1\n return result\n
"},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedCacheBackend.set","title":"set","text":"set(key: str, value: Any) -> None\n
Set value in memcached.
PARAMETER DESCRIPTION key
Cache key.
TYPE: str
value
Value to cache.
TYPE: Any
Source code in src/pydvl/utils/caching/memcached.py
def set(self, key: str, value: Any) -> None:\n \"\"\"Set value in memcached.\n\n Args:\n key: Cache key.\n value: Value to cache.\n \"\"\"\n self.client.set(key, value, noreply=True)\n self.stats.sets += 1\n
"},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedCacheBackend.clear","title":"clear","text":"clear() -> None\n
Flush all values from memcached.
Source code in src/pydvl/utils/caching/memcached.py
def clear(self) -> None:\n \"\"\"Flush all values from memcached.\"\"\"\n self.client.flush_all(noreply=True)\n
"},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedCacheBackend.combine_hashes","title":"combine_hashes","text":"combine_hashes(*args: str) -> str\n
Join cache key components for Memcached.
Source code in src/pydvl/utils/caching/memcached.py
def combine_hashes(self, *args: str) -> str:\n \"\"\"Join cache key components for Memcached.\"\"\"\n return \":\".join(args)\n
"},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedCacheBackend.__getstate__","title":"__getstate__","text":"__getstate__() -> Dict\n
Enables pickling after a socket has been opened to the memcached server, by removing the client from the stored data.
Source code in src/pydvl/utils/caching/memcached.py
def __getstate__(self) -> Dict:\n \"\"\"Enables pickling after a socket has been opened to the\n memcached server, by removing the client from the stored\n data.\"\"\"\n odict = self.__dict__.copy()\n del odict[\"client\"]\n return odict\n
"},{"location":"api/pydvl/utils/caching/memcached/#pydvl.utils.caching.memcached.MemcachedCacheBackend.__setstate__","title":"__setstate__","text":"__setstate__(d: Dict)\n
Restores a client connection after loading from a pickle.
Source code in src/pydvl/utils/caching/memcached.py
def __setstate__(self, d: Dict):\n \"\"\"Restores a client connection after loading from a pickle.\"\"\"\n self.config = d[\"config\"]\n self.stats = d[\"stats\"]\n self.client = self._connect(self.config)\n
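Example A sketch of the pickling round trip enabled by these two methods (a running memcached server is assumed; the connection is re-established on unpickling):
>>> import pickle\n>>> from pydvl.utils.caching.memcached import MemcachedCacheBackend\n>>> cache_backend = MemcachedCacheBackend()  # assumes memcached on localhost:11211\n>>> restored = pickle.loads(pickle.dumps(cache_backend))  # client dropped, then reconnected\n>>> restored.config.server == cache_backend.config.server\nTrue\n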
"},{"location":"api/pydvl/utils/caching/memory/","title":"Memory","text":""},{"location":"api/pydvl/utils/caching/memory/#pydvl.utils.caching.memory","title":"pydvl.utils.caching.memory","text":""},{"location":"api/pydvl/utils/caching/memory/#pydvl.utils.caching.memory.InMemoryCacheBackend","title":"InMemoryCacheBackend","text":"InMemoryCacheBackend()\n
Bases: CacheBackend
In-memory cache backend that stores results in a dictionary.
Implements the CacheBackend interface for an in-memory cache. Cache entries are stored as values in a dictionary, keyed by cache key. This allows sharing evaluations across threads in a single process.
The implementation is not thread-safe.
ATTRIBUTE DESCRIPTION cached_values
Dictionary used to store cached values.
TYPE: Dict[str, Any]
Example Basic usage:
>>> from pydvl.utils.caching.memory import InMemoryCacheBackend\n>>> cache_backend = InMemoryCacheBackend()\n>>> cache_backend.clear()\n>>> value = 42\n>>> cache_backend.set(\"key\", value)\n>>> cache_backend.get(\"key\")\n42\n
Callable wrapping:
>>> from pydvl.utils.caching.memory import InMemoryCacheBackend\n>>> cache_backend = InMemoryCacheBackend()\n>>> cache_backend.clear()\n>>> value = 42\n>>> def foo(x: int):\n... return x + 1\n...\n>>> wrapped_foo = cache_backend.wrap(foo)\n>>> wrapped_foo(value)\n43\n>>> wrapped_foo.stats.misses\n1\n>>> wrapped_foo.stats.hits\n0\n>>> wrapped_foo(value)\n43\n>>> wrapped_foo.stats.misses\n1\n>>> wrapped_foo.stats.hits\n1\n
Source code in src/pydvl/utils/caching/memory.py
def __init__(self) -> None:\n \"\"\"Initialize the in-memory cache backend.\"\"\"\n super().__init__()\n self.cached_values: Dict[str, Any] = {}\n
"},{"location":"api/pydvl/utils/caching/memory/#pydvl.utils.caching.memory.InMemoryCacheBackend.wrap","title":"wrap","text":"wrap(\n func: Callable, *, config: Optional[CachedFuncConfig] = None\n) -> CachedFunc\n
Wraps a function to cache its results.
PARAMETER DESCRIPTION func
The function to wrap.
TYPE: Callable
config
Optional caching options for the wrapped function.
TYPE: Optional[CachedFuncConfig]
DEFAULT: None
RETURNS DESCRIPTION CachedFunc
The wrapped cached function.
Source code in src/pydvl/utils/caching/base.py
def wrap(\n self,\n func: Callable,\n *,\n config: Optional[CachedFuncConfig] = None,\n) -> \"CachedFunc\":\n \"\"\"Wraps a function to cache its results.\n\n Args:\n func: The function to wrap.\n config: Optional caching options for the wrapped function.\n\n Returns:\n The wrapped cached function.\n \"\"\"\n return CachedFunc(\n func,\n cache_backend=self,\n config=config,\n )\n
"},{"location":"api/pydvl/utils/caching/memory/#pydvl.utils.caching.memory.InMemoryCacheBackend.get","title":"get","text":"get(key: str) -> Optional[Any]\n
Get a value from the cache.
PARAMETER DESCRIPTION key
Cache key.
TYPE: str
RETURNS DESCRIPTION Optional[Any]
Cached value or None if not found.
Source code in src/pydvl/utils/caching/memory.py
def get(self, key: str) -> Optional[Any]:\n \"\"\"Get a value from the cache.\n\n Args:\n key: Cache key.\n\n Returns:\n Cached value or None if not found.\n \"\"\"\n value = self.cached_values.get(key, None)\n if value is not None:\n self.stats.hits += 1\n else:\n self.stats.misses += 1\n return value\n
"},{"location":"api/pydvl/utils/caching/memory/#pydvl.utils.caching.memory.InMemoryCacheBackend.set","title":"set","text":"set(key: str, value: Any) -> None\n
Set a value in the cache.
PARAMETER DESCRIPTION key
Cache key.
TYPE: str
value
Value to cache.
TYPE: Any
Source code in src/pydvl/utils/caching/memory.py
def set(self, key: str, value: Any) -> None:\n \"\"\"Set a value in the cache.\n\n Args:\n key: Cache key.\n value: Value to cache.\n \"\"\"\n self.cached_values[key] = value\n self.stats.sets += 1\n
"},{"location":"api/pydvl/utils/caching/memory/#pydvl.utils.caching.memory.InMemoryCacheBackend.clear","title":"clear","text":"clear() -> None\n
Deletes cache dictionary and recreates it.
Source code in src/pydvl/utils/caching/memory.py
def clear(self) -> None:\n \"\"\"Deletes cache dictionary and recreates it.\"\"\"\n del self.cached_values\n self.cached_values = {}\n
"},{"location":"api/pydvl/utils/caching/memory/#pydvl.utils.caching.memory.InMemoryCacheBackend.combine_hashes","title":"combine_hashes","text":"combine_hashes(*args: str) -> str\n
Join cache key components.
Source code in src/pydvl/utils/caching/memory.py
def combine_hashes(self, *args: str) -> str:\n \"\"\"Join cache key components.\"\"\"\n return os.pathsep.join(args)\n
"},{"location":"api/pydvl/value/","title":"Value","text":""},{"location":"api/pydvl/value/#pydvl.value","title":"pydvl.value","text":"This module implements algorithms for the exact and approximate computation of values and semi-values.
See Data valuation for an introduction to the concepts and methods implemented here.
"},{"location":"api/pydvl/value/games/","title":"Games","text":""},{"location":"api/pydvl/value/games/#pydvl.value.games","title":"pydvl.value.games","text":"This module provides several predefined games and, depending on the game, the corresponding Shapley values, Least Core values or both of them, for benchmarking purposes.
"},{"location":"api/pydvl/value/games/#pydvl.value.games--references","title":"References","text":" -
Castro, J., G\u00f3mez, D. and Tejada, J., 2009. Polynomial calculation of the Shapley value based on sampling. Computers & Operations Research, 36(5), pp.1726-1730.\u00a0\u21a9
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyGameDataset","title":"DummyGameDataset","text":"DummyGameDataset(n_players: int, description: Optional[str] = None)\n
Bases: Dataset
Dummy game dataset.
Initializes a dummy game dataset with n_players and an optional description.
This class is used internally inside the Game class.
PARAMETER DESCRIPTION n_players
Number of players that participate in the game.
TYPE: int
description
Optional description of the dataset.
TYPE: Optional[str]
DEFAULT: None
Source code in src/pydvl/value/games.py
def __init__(self, n_players: int, description: Optional[str] = None) -> None:\n x = np.arange(0, n_players, 1).reshape(-1, 1)\n nil = np.zeros_like(x)\n super().__init__(\n x,\n nil.copy(),\n nil.copy(),\n nil.copy(),\n feature_names=[\"x\"],\n target_names=[\"y\"],\n description=description,\n )\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyGameDataset.indices","title":"indices property
","text":"indices: NDArray[int_]\n
Index of positions in data.x_train.
Contiguous integers from 0 to len(Dataset).
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyGameDataset.data_names","title":"data_names property
","text":"data_names: NDArray[object_]\n
Names of each individual datapoint.
Used for reporting Shapley values.
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyGameDataset.dim","title":"dim property
","text":"dim: int\n
Returns the number of dimensions of a sample.
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyGameDataset.get_training_data","title":"get_training_data","text":"get_training_data(\n indices: Optional[Iterable[int]] = None,\n) -> Tuple[NDArray, NDArray]\n
Given a set of indices, returns the training data that refer to those indices.
This is used mainly by Utility to retrieve subsets of the data from indices. It is typically not needed in algorithms.
PARAMETER DESCRIPTION indices
Optional indices that will be used to select points from the training data. If None
, the entire training data will be returned.
TYPE: Optional[Iterable[int]]
DEFAULT: None
RETURNS DESCRIPTION Tuple[NDArray, NDArray]
If indices
is not None
, the selected x and y arrays from the training data. Otherwise, the entire dataset.
Source code in src/pydvl/utils/dataset.py
def get_training_data(\n self, indices: Optional[Iterable[int]] = None\n) -> Tuple[NDArray, NDArray]:\n \"\"\"Given a set of indices, returns the training data that refer to those\n indices.\n\n This is used mainly by [Utility][pydvl.utils.utility.Utility] to retrieve\n subsets of the data from indices. It is typically **not needed in\n algorithms**.\n\n Args:\n indices: Optional indices that will be used to select points from\n the training data. If `None`, the entire training data will be\n returned.\n\n Returns:\n If `indices` is not `None`, the selected x and y arrays from the\n training data. Otherwise, the entire dataset.\n \"\"\"\n if indices is None:\n return self.x_train, self.y_train\n x = self.x_train[indices]\n y = self.y_train[indices]\n return x, y\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyGameDataset.from_sklearn","title":"from_sklearn classmethod
","text":"from_sklearn(\n data: Bunch,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n **kwargs\n) -> Dataset\n
Constructs a Dataset object from a sklearn.utils.Bunch, as returned by the load_*
functions in scikit-learn toy datasets.
Example >>> from pydvl.utils import Dataset\n>>> from sklearn.datasets import load_boston\n>>> dataset = Dataset.from_sklearn(load_boston())\n
PARAMETER DESCRIPTION data
scikit-learn Bunch object. The following attributes are supported:
data
: covariates. target
: target variables (labels). feature_names
(optional): the feature names. target_names
(optional): the target names. DESCR
(optional): a description.
TYPE: Bunch
train_size
size of the training dataset. Used in train_test_split
TYPE: float
DEFAULT: 0.8
random_state
seed for train / test split
TYPE: Optional[int]
DEFAULT: None
stratify_by_target
If True
, data is split in a stratified fashion, using the target variable as labels. Read more in scikit-learn's user guide.
TYPE: bool
DEFAULT: False
kwargs
Additional keyword arguments to pass to the Dataset constructor. Use this to pass e.g. is_multi_output
.
DEFAULT: {}
RETURNS DESCRIPTION Dataset
Object with the sklearn dataset
Changed in version 0.6.0
Added kwargs to pass to the Dataset constructor.
Source code in src/pydvl/utils/dataset.py
@classmethod\ndef from_sklearn(\n cls,\n data: Bunch,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n **kwargs,\n) -> \"Dataset\":\n \"\"\"Constructs a [Dataset][pydvl.utils.Dataset] object from a\n [sklearn.utils.Bunch][], as returned by the `load_*`\n functions in [scikit-learn toy datasets](https://scikit-learn.org/stable/datasets/toy_dataset.html).\n\n ??? Example\n ```pycon\n >>> from pydvl.utils import Dataset\n >>> from sklearn.datasets import load_boston\n >>> dataset = Dataset.from_sklearn(load_boston())\n ```\n\n Args:\n data: scikit-learn Bunch object. The following attributes are supported:\n\n - `data`: covariates.\n - `target`: target variables (labels).\n - `feature_names` (**optional**): the feature names.\n - `target_names` (**optional**): the target names.\n - `DESCR` (**optional**): a description.\n train_size: size of the training dataset. Used in `train_test_split`\n random_state: seed for train / test split\n stratify_by_target: If `True`, data is split in a stratified\n fashion, using the target variable as labels. Read more in\n [scikit-learn's user guide](https://scikit-learn.org/stable/modules/cross_validation.html#stratification).\n kwargs: Additional keyword arguments to pass to the\n [Dataset][pydvl.utils.Dataset] constructor. Use this to pass e.g. `is_multi_output`.\n\n Returns:\n Object with the sklearn dataset\n\n !!! tip \"Changed in version 0.6.0\"\n Added kwargs to pass to the [Dataset][pydvl.utils.Dataset] constructor.\n \"\"\"\n x_train, x_test, y_train, y_test = train_test_split(\n data.data,\n data.target,\n train_size=train_size,\n random_state=random_state,\n stratify=data.target if stratify_by_target else None,\n )\n return cls(\n x_train,\n y_train,\n x_test,\n y_test,\n feature_names=data.get(\"feature_names\"),\n target_names=data.get(\"target_names\"),\n description=data.get(\"DESCR\"),\n **kwargs,\n )\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyGameDataset.from_arrays","title":"from_arrays classmethod
","text":"from_arrays(\n X: NDArray,\n y: NDArray,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n **kwargs\n) -> Dataset\n
Constructs a Dataset object from X and y numpy arrays as returned by the make_*
functions in sklearn generated datasets.
Example >>> from pydvl.utils import Dataset\n>>> from sklearn.datasets import make_regression\n>>> X, y = make_regression()\n>>> dataset = Dataset.from_arrays(X, y)\n
PARAMETER DESCRIPTION X
numpy array of shape (n_samples, n_features)
TYPE: NDArray
y
numpy array of shape (n_samples,)
TYPE: NDArray
train_size
size of the training dataset. Used in train_test_split
TYPE: float
DEFAULT: 0.8
random_state
seed for train / test split
TYPE: Optional[int]
DEFAULT: None
stratify_by_target
If True
, data is split in a stratified fashion, using the y variable as labels. Read more in sklearn's user guide.
TYPE: bool
DEFAULT: False
kwargs
Additional keyword arguments to pass to the Dataset constructor. Use this to pass e.g. feature_names
or target_names
.
DEFAULT: {}
RETURNS DESCRIPTION Dataset
Object with the passed X and y arrays split across training and test sets.
New in version 0.4.0
Changed in version 0.6.0
Added kwargs to pass to the Dataset constructor.
Source code in src/pydvl/utils/dataset.py
@classmethod\ndef from_arrays(\n cls,\n X: NDArray,\n y: NDArray,\n train_size: float = 0.8,\n random_state: Optional[int] = None,\n stratify_by_target: bool = False,\n **kwargs,\n) -> \"Dataset\":\n \"\"\"Constructs a [Dataset][pydvl.utils.Dataset] object from X and y numpy arrays as\n returned by the `make_*` functions in [sklearn generated datasets](https://scikit-learn.org/stable/datasets/sample_generators.html).\n\n ??? Example\n ```pycon\n >>> from pydvl.utils import Dataset\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression()\n >>> dataset = Dataset.from_arrays(X, y)\n ```\n\n Args:\n X: numpy array of shape (n_samples, n_features)\n y: numpy array of shape (n_samples,)\n train_size: size of the training dataset. Used in `train_test_split`\n random_state: seed for train / test split\n stratify_by_target: If `True`, data is split in a stratified fashion,\n using the y variable as labels. Read more in [sklearn's user\n guide](https://scikit-learn.org/stable/modules/cross_validation.html#stratification).\n kwargs: Additional keyword arguments to pass to the\n [Dataset][pydvl.utils.Dataset] constructor. Use this to pass e.g. `feature_names`\n or `target_names`.\n\n Returns:\n Object with the passed X and y arrays split across training and test sets.\n\n !!! tip \"New in version 0.4.0\"\n\n !!! tip \"Changed in version 0.6.0\"\n Added kwargs to pass to the [Dataset][pydvl.utils.Dataset] constructor.\n \"\"\"\n x_train, x_test, y_train, y_test = train_test_split(\n X,\n y,\n train_size=train_size,\n random_state=random_state,\n stratify=y if stratify_by_target else None,\n )\n return cls(x_train, y_train, x_test, y_test, **kwargs)\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyGameDataset.get_test_data","title":"get_test_data","text":"get_test_data(\n indices: Optional[Iterable[int]] = None,\n) -> Tuple[NDArray, NDArray]\n
Returns a subset of the training set instead of the test set.
PARAMETER DESCRIPTION indices
Indices into the training data.
TYPE: Optional[Iterable[int]]
DEFAULT: None
RETURNS DESCRIPTION Tuple[NDArray, NDArray]
Subset of the train data.
Source code in src/pydvl/value/games.py
def get_test_data(\n self, indices: Optional[Iterable[int]] = None\n) -> Tuple[NDArray, NDArray]:\n \"\"\"Returns the subsets of the train set instead of the test set.\n\n Args:\n indices: Indices into the training data.\n\n Returns:\n Subset of the train data.\n \"\"\"\n if indices is None:\n return self.x_train, self.y_train\n x = self.x_train[indices]\n y = self.y_train[indices]\n return x, y\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.DummyModel","title":"DummyModel","text":"DummyModel()\n
Bases: SupervisedModel
Dummy model class.
A dummy supervised model used for testing purposes only.
Source code in src/pydvl/value/games.py
def __init__(self) -> None:\n pass\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.Game","title":"Game","text":"Game(\n n_players: int,\n score_range: Tuple[float, float] = (-np.inf, np.inf),\n description: Optional[str] = None,\n)\n
Bases: ABC
Base class for games
Any Game subclass has to implement the abstract _score
method to assign a score to each coalition/subset and at least one of shapley_values
, least_core_values
.
PARAMETER DESCRIPTION n_players
Number of players that participate in the game.
TYPE: int
score_range
Minimum and maximum values of the _score
method.
TYPE: Tuple[float, float]
DEFAULT: (-inf, inf)
description
Optional string description of the dummy dataset that will be created.
TYPE: Optional[str]
DEFAULT: None
ATTRIBUTE DESCRIPTION n_players
Number of players that participate in the game.
data
Dummy dataset object.
u
Utility object with a dummy model and dataset.
Source code in src/pydvl/value/games.py
def __init__(\n self,\n n_players: int,\n score_range: Tuple[float, float] = (-np.inf, np.inf),\n description: Optional[str] = None,\n):\n self.n_players = n_players\n self.data = DummyGameDataset(self.n_players, description)\n self.u = Utility(\n DummyModel(),\n self.data,\n scorer=Scorer(self._score, range=score_range),\n catch_errors=False,\n show_warnings=True,\n )\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.SymmetricVotingGame","title":"SymmetricVotingGame","text":"SymmetricVotingGame(n_players: int)\n
Bases: Game
Toy game that is used for testing and demonstration purposes.
A symmetric voting game defined in (Castro et al., 2009)1, Section 4.1.
For this game the utility of a coalition is 1 if its cardinality is greater than n_players/2, and 0 otherwise.
\\[{ v(S) = \\left\\{\\begin{array}{ll} 1, & \\text{ if} \\quad \\mid S \\mid > \\frac{N}{2} \\\\ 0, & \\text{ otherwise} \\end{array}\\right. }\\] PARAMETER DESCRIPTION n_players
Number of players that participate in the game.
TYPE: int
Source code in src/pydvl/value/games.py
def __init__(self, n_players: int) -> None:\n if n_players % 2 != 0:\n raise ValueError(\"n_players must be an even number.\")\n description = \"Dummy data for the symmetric voting game in Castro et al. 2009\"\n super().__init__(\n n_players,\n score_range=(0, 1),\n description=description,\n )\n
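Example A plain-Python sketch of the coalition utility defined above (this mirrors the formula, not the library's internal _score implementation):
>>> n_players = 4\n>>> def v(coalition):  # 1 if the coalition holds a strict majority, else 0\n... return 1 if len(coalition) > n_players / 2 else 0\n...\n>>> v({0, 1, 2})\n1\n>>> v({0, 1})\n0\n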
"},{"location":"api/pydvl/value/games/#pydvl.value.games.AsymmetricVotingGame","title":"AsymmetricVotingGame","text":"AsymmetricVotingGame(n_players: int = 51)\n
Bases: Game
Toy game that is used for testing and demonstration purposes.
An asymmetric voting game defined in (Castro et al., 2009)1 Section 4.2.
For this game the player set is \\(N = \\{1,\\dots,51\\}\\) and the utility of a coalition is given by:
\\[{ v(S) = \\left\\{\\begin{array}{ll} 1, & \\text{ if} \\quad \\sum\\limits_{i \\in S} w_i > \\sum\\limits_{j \\in N}\\frac{w_j}{2} \\\\ 0, & \\text{ otherwise} \\end{array}\\right. }\\] where \\(w = [w_1,\\dots, w_{51}]\\) is a list of weights associated with each player.
PARAMETER DESCRIPTION n_players
Number of players that participate in the game.
TYPE: int
DEFAULT: 51
Source code in src/pydvl/value/games.py
def __init__(self, n_players: int = 51) -> None:\n if n_players != 51:\n raise ValueError(\n f\"{self.__class__.__name__} only supports n_players=51 but got {n_players=}.\"\n )\n description = \"Dummy data for the asymmetric voting game in Castro et al. 2009\"\n super().__init__(\n n_players,\n score_range=(0, 1),\n description=description,\n )\n\n ranges = [\n range(0, 1),\n range(1, 2),\n range(2, 3),\n range(3, 5),\n range(5, 6),\n range(6, 7),\n range(7, 9),\n range(9, 10),\n range(10, 12),\n range(12, 15),\n range(15, 16),\n range(16, 20),\n range(20, 24),\n range(24, 26),\n range(26, 30),\n range(30, 34),\n range(34, 35),\n range(35, 44),\n range(44, 51),\n ]\n\n ranges_weights = [\n 45,\n 41,\n 27,\n 26,\n 25,\n 21,\n 17,\n 14,\n 13,\n 12,\n 11,\n 10,\n 9,\n 8,\n 7,\n 6,\n 5,\n 4,\n 3,\n ]\n ranges_values = [\n \"0.08831\",\n \"0.07973\",\n \"0.05096\",\n \"0.04898\",\n \"0.047\",\n \"0.03917\",\n \"0.03147\",\n \"0.02577\",\n \"0.02388\",\n \"0.022\",\n \"0.02013\",\n \"0.01827\",\n \"0.01641\",\n \"0.01456\",\n \"0.01272\",\n \"0.01088\",\n \"0.009053\",\n \"0.00723\",\n \"0.005412\",\n ]\n\n self.weight_table = np.zeros(self.n_players)\n exact_values = np.zeros(self.n_players)\n for r, w, v in zip(ranges, ranges_weights, ranges_values):\n self.weight_table[r] = w\n exact_values[r] = v\n\n self.exact_values = exact_values\n self.threshold = np.sum(self.weight_table) / 2\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.ShoesGame","title":"ShoesGame","text":"ShoesGame(left: int, right: int)\n
Bases: Game
Toy game that is used for testing and demonstration purposes.
A shoes game defined in (Castro et al., 2009)1.
In this game, some players have a left shoe and others a right shoe. Single shoes have a worth of zero while pairs have a worth of 1.
The payoff of a coalition \\(S\\) is:
\\[{ v(S) = \\min( \\mid S \\cap L \\mid, \\mid S \\cap R \\mid ) }\\] Where \\(L\\), respectively \\(R\\), is the set of players with left shoes, respectively right shoes.
PARAMETER DESCRIPTION left
Number of players with a left shoe.
TYPE: int
right
Number of players with a right shoe.
TYPE: int
Source code in src/pydvl/value/games.py
def __init__(self, left: int, right: int) -> None:\n self.left = left\n self.right = right\n n_players = self.left + self.right\n description = \"Dummy data for the shoe game in Castro et al. 2009\"\n max_score = n_players // 2\n super().__init__(n_players, score_range=(0, max_score), description=description)\n
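Example A plain-Python sketch of the payoff above for two left and two right shoes (player indices are illustrative; this mirrors the formula, not the library's internal scorer):
>>> left, right = {0, 1}, {2, 3}\n>>> def v(coalition):  # number of complete pairs in the coalition\n... return min(len(coalition & left), len(coalition & right))\n...\n>>> v({0, 2})\n1\n>>> v({0, 1})\n0\n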
"},{"location":"api/pydvl/value/games/#pydvl.value.games.AirportGame","title":"AirportGame","text":"AirportGame(n_players: int = 100)\n
Bases: Game
Toy game that is used for testing and demonstration purposes.
An airport game defined in (Castro et al., 2009)1, Section 4.3.
PARAMETER DESCRIPTION n_players
Number of players that participate in the game.
TYPE: int
DEFAULT: 100
Source code in src/pydvl/value/games.py
def __init__(self, n_players: int = 100) -> None:\n if n_players != 100:\n raise ValueError(\n f\"{self.__class__.__name__} only supports n_players=100 but got {n_players=}.\"\n )\n description = \"A dummy dataset for the airport game in Castro et al. 2009\"\n super().__init__(n_players, score_range=(0, 100), description=description)\n ranges = [\n range(0, 8),\n range(8, 20),\n range(20, 26),\n range(26, 40),\n range(40, 48),\n range(48, 57),\n range(57, 70),\n range(70, 80),\n range(80, 90),\n range(90, 100),\n ]\n exact = [\n 0.01,\n 0.020869565,\n 0.033369565,\n 0.046883079,\n 0.063549745,\n 0.082780515,\n 0.106036329,\n 0.139369662,\n 0.189369662,\n 0.289369662,\n ]\n c = list(range(1, 10))\n score_table = np.zeros(100)\n exact_values = np.zeros(100)\n\n for r, v in zip(ranges, exact):\n score_table[r] = c\n exact_values[r] = v\n\n self.exact_values = exact_values\n self.score_table = score_table\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.MinimumSpanningTreeGame","title":"MinimumSpanningTreeGame","text":"MinimumSpanningTreeGame(n_players: int = 100)\n
Bases: Game
Toy game that is used for testing and demonstration purposes.
A minimum spanning tree game defined in (Castro et al., 2009)1.
Let \\(G = (N \\cup \\{0\\},E)\\) be a valued graph where \\(N = \\{1,\\dots,100\\}\\), and the cost associated to an edge \\((i, j)\\) is:
\\[{ c_{ij} = \\left\\{\\begin{array}{lll} 1, & \\text{ if} & i = j + 1 \\text{ or } i = j - 1 \\\\ & & \\text{ or } (i = 1 \\text{ and } j = 100) \\text{ or } (i = 100 \\text{ and } j = 1) \\\\ 101, & \\text{ if} & i = 0 \\text{ or } j = 0 \\\\ \\infty, & \\text{ otherwise} \\end{array}\\right. }\\] A minimum spanning tree game \\((N, c)\\) is a cost game, where for a given coalition \\(S \\subset N\\), \\(v(S)\\) is the sum of the edge cost of the minimum spanning tree, i.e. \\(v(S)\\) = Minimum Spanning Tree of the graph \\(G|_{S\\cup\\{0\\}}\\), which is the partial graph restricted to the players \\(S\\) and the source node \\(0\\).
PARAMETER DESCRIPTION n_players
Number of players that participate in the game.
TYPE: int
DEFAULT: 100
Source code in src/pydvl/value/games.py
def __init__(self, n_players: int = 100) -> None:\n if n_players != 100:\n raise ValueError(\n f\"{self.__class__.__name__} only supports n_players=100 but got {n_players=}.\"\n )\n description = (\n \"A dummy dataset for the minimum spanning tree game in Castro et al. 2009\"\n )\n super().__init__(n_players, score_range=(0, np.inf), description=description)\n\n graph = np.zeros(shape=(self.n_players, self.n_players))\n\n for i in range(self.n_players):\n for j in range(self.n_players):\n if (\n i == j + 1\n or i == j - 1\n or (i == 1 and j == self.n_players - 1)\n or (i == self.n_players - 1 and j == 1)\n ):\n graph[i, j] = 1\n elif i == 0 or j == 0:\n graph[i, j] = 0\n else:\n graph[i, j] = np.inf\n assert np.all(graph == graph.T)\n\n self.graph = graph\n
"},{"location":"api/pydvl/value/games/#pydvl.value.games.MinerGame","title":"MinerGame","text":"MinerGame(n_players: int)\n
Bases: Game
Toy game that is used for testing and demonstration purposes.
Consider a group of n miners, who have discovered large bars of gold.
If two miners can carry one piece of gold, then the payoff of a coalition \\(S\\) is:
\\[{ v(S) = \\left\\{\\begin{array}{lll} \\mid S \\mid / 2, & \\text{ if} & \\mid S \\mid \\text{ is even} \\\\ ( \\mid S \\mid - 1)/2, & \\text{ otherwise} \\end{array}\\right. }\\] If there are more than two miners and there is an even number of miners, then the core consists of the single payoff where each miner gets 1/2.
If there is an odd number of miners, then the core is empty.
Taken from Wikipedia
PARAMETER DESCRIPTION n_players
Number of miners that participate in the game.
TYPE: int
Source code in src/pydvl/value/games.py
def __init__(self, n_players: int) -> None:\n if n_players <= 2:\n raise ValueError(f\"n_players, {n_players}, should be > 2\")\n description = \"Dummy data for Miner Game taken from https://en.wikipedia.org/wiki/Core_(game_theory)\"\n super().__init__(\n n_players,\n score_range=(0, n_players // 2),\n description=description,\n )\n
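Example The case distinction above reduces to integer division, as this small sketch shows (it mirrors the formula, not the library's internal scorer):
>>> def v(coalition_size):  # |S| / 2 if even, (|S| - 1) / 2 if odd\n... return coalition_size // 2\n...\n>>> v(4), v(5)\n(2, 2)\n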
"},{"location":"api/pydvl/value/result/","title":"Result","text":""},{"location":"api/pydvl/value/result/#pydvl.value.result","title":"pydvl.value.result","text":"This module collects types and methods for the inspection of the results of valuation algorithms.
The most important class is ValuationResult, which provides access to raw values, as well as convenient behaviour as a Sequence
with extended indexing and updating abilities, and conversion to pandas DataFrames.
"},{"location":"api/pydvl/value/result/#pydvl.value.result--operating-on-results","title":"Operating on results","text":"Results can be added together with the standard +
operator. Because values are typically running averages of iterative algorithms, addition behaves like a weighted average of the two results, with the weights being the number of updates in each result: adding two results is the same as generating one result with the mean of the values of the two results as values. The variances are updated accordingly. See ValuationResult for details.
Results can also be sorted by value, variance or number of updates, see sort(). The arrays of ValuationResult.values, ValuationResult.variances, ValuationResult.counts, ValuationResult.indices, ValuationResult.names are sorted in the same way.
Indexing and slicing of results is supported and ValueItem objects are returned. These objects can be compared with the usual operators, which take only the ValueItem.value into account.
"},{"location":"api/pydvl/value/result/#pydvl.value.result--creating-result-objects","title":"Creating result objects","text":"The most commonly used factory method is ValuationResult.zeros(), which creates a result object with all values, variances and counts set to zero. ValuationResult.empty() creates an empty result object, which can be used as a starting point for adding results together. Empty results are discarded when added to other results. Finally, ValuationResult.from_random() samples random values uniformly.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValueItem","title":"ValueItem dataclass
","text":"ValueItem(\n index: IndexT,\n name: NameT,\n value: float,\n variance: Optional[float],\n count: Optional[int],\n)\n
Bases: Generic[IndexT, NameT]
The result of a value computation for one datum.
ValueItems
can be compared with the usual operators, forming a total order. Comparisons take only the value
into account.
Todo
Maybe have a mode of comparing similar to np.isclose
, or taking the variance
into account.
ATTRIBUTE DESCRIPTION index
Index of the sample with this value in the original Dataset
TYPE: IndexT
name
Name of the sample if it was provided. Otherwise, str(index)
TYPE: NameT
value
The value
TYPE: float
variance
Variance of the value if it was computed with an approximate method
TYPE: Optional[float]
count
Number of updates for this value
TYPE: Optional[int]
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValueItem.stderr","title":"stderr property
","text":"stderr: Optional[float]\n
Standard error of the value.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult","title":"ValuationResult","text":"ValuationResult(\n *,\n values: NDArray[float64],\n variances: Optional[NDArray[float64]] = None,\n counts: Optional[NDArray[int_]] = None,\n indices: Optional[NDArray[IndexT]] = None,\n data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,\n algorithm: str = \"\",\n status: Status = Status.Pending,\n sort: bool = False,\n **extra_values\n)\n
Bases: Sequence
, Iterable[ValueItem[IndexT, NameT]]
, Generic[IndexT, NameT]
Objects of this class hold the results of valuation algorithms.
These include indices in the original Dataset, any data names (e.g. group names in GroupedDataset), the values themselves, and variance of the computation in the case of Monte Carlo methods. ValuationResults
can be iterated over like any Sequence
: iter(valuation_result)
returns a generator of ValueItem in the order in which the object is sorted.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult--indexing","title":"Indexing","text":"Indexing can be position-based, when accessing any of the attributes values, variances, counts and indices, as well as when iterating over the object, or using the item access operator, both getter and setter. The \"position\" is either the original sequence in which the data was passed to the constructor, or the sequence in which the object is sorted, see below.
Alternatively, indexing can be data-based, i.e. using the indices in the original dataset. This is the case for the methods get() and update().
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult--sorting","title":"Sorting","text":"Results can be sorted in-place with sort(), or alternatively using python's standard sorted()
and reversed()
Note that sorting values affects how iterators and the object itself as Sequence
behave: values[0]
returns a ValueItem with the highest or lowest ranking point if this object is sorted by descending or ascending value, respectively. If unsorted, values[0]
returns the ValueItem
at position 0, which has data index indices[0]
in the Dataset.
The same applies to direct indexing of the ValuationResult
: the index is positional, according to the sorting. It does not refer to the \"data index\". To sort according to data index, use sort() with key=\"index\"
.
In order to access ValueItem objects by their data index, use get().
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult--operating-on-results","title":"Operating on results","text":"Results can be added to each other with the +
operator. Means and variances are correctly updated, using the counts
attribute.
Results can also be updated with new values using update(). Means and variances are updated accordingly using the Welford algorithm.
Empty objects behave in a special way, see empty().
PARAMETER DESCRIPTION values
An array of values. If omitted, defaults to an empty array or to an array of zeros if indices
are given.
TYPE: NDArray[float64]
indices
An optional array of indices in the original dataset. If omitted, defaults to np.arange(len(values))
. Warning: It is common to pass the indices of a Dataset here. Attention must be paid in a parallel context to copy them to the local process. Just do indices=np.copy(data.indices)
.
TYPE: Optional[NDArray[IndexT]]
DEFAULT: None
variances
An optional array of variances in the computation of each value.
TYPE: Optional[NDArray[float64]]
DEFAULT: None
counts
An optional array with the number of updates for each value. Defaults to an array of ones.
TYPE: Optional[NDArray[int_]]
DEFAULT: None
data_names
Names for the data points. Defaults to index numbers if not set.
TYPE: Optional[Sequence[NameT] | NDArray[NameT]]
DEFAULT: None
algorithm
The method used.
TYPE: str
DEFAULT: ''
status
The end status of the algorithm.
TYPE: Status
DEFAULT: Pending
sort
Whether to sort the indices by ascending value. See above how this affects usage as an iterable or sequence.
TYPE: bool
DEFAULT: False
extra_values
Additional values that can be passed as keyword arguments. This can contain, for example, the least core value.
DEFAULT: {}
RAISES DESCRIPTION ValueError
If input arrays have mismatching lengths.
Source code in src/pydvl/value/result.py
def __init__(\n self,\n *,\n values: NDArray[np.float64],\n variances: Optional[NDArray[np.float64]] = None,\n counts: Optional[NDArray[np.int_]] = None,\n indices: Optional[NDArray[IndexT]] = None,\n data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,\n algorithm: str = \"\",\n status: Status = Status.Pending,\n sort: bool = False,\n **extra_values,\n):\n if variances is not None and len(variances) != len(values):\n raise ValueError(\"Lengths of values and variances do not match\")\n if data_names is not None and len(data_names) != len(values):\n raise ValueError(\"Lengths of values and data_names do not match\")\n if indices is not None and len(indices) != len(values):\n raise ValueError(\"Lengths of values and indices do not match\")\n\n self._algorithm = algorithm\n self._status = Status(status) # Just in case we are given a string\n self._values = values\n self._variances = np.zeros_like(values) if variances is None else variances\n self._counts = np.ones_like(values) if counts is None else counts\n self._sort_order = None\n self._extra_values = extra_values or {}\n\n # Yuk...\n if data_names is None:\n if indices is not None:\n self._names = np.copy(indices)\n else:\n self._names = np.arange(len(self._values), dtype=np.int_)\n elif not isinstance(data_names, np.ndarray):\n self._names = np.array(data_names)\n else:\n self._names = data_names.copy()\n if len(np.unique(self._names)) != len(self._names):\n raise ValueError(\"Data names must be unique\")\n\n if indices is None:\n indices = np.arange(len(self._values), dtype=np.int_)\n self._indices = indices\n self._positions = {idx: pos for pos, idx in enumerate(indices)}\n\n self._sort_positions: NDArray[np.int_] = np.arange(\n len(self._values), dtype=np.int_\n )\n if sort:\n self.sort()\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.values","title":"values property
","text":"values: NDArray[float64]\n
The values, possibly sorted.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.variances","title":"variances property
","text":"variances: NDArray[float64]\n
The variances, possibly sorted.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.stderr","title":"stderr property
","text":"stderr: NDArray[float64]\n
The raw standard errors, possibly sorted.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.counts","title":"counts property
","text":"counts: NDArray[int_]\n
The raw counts, possibly sorted.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.indices","title":"indices property
","text":"indices: NDArray[IndexT]\n
The indices for the values, possibly sorted.
If the object is unsorted, then these are the same as declared at construction or np.arange(len(values))
if none were passed.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.names","title":"names property
","text":"names: NDArray[NameT]\n
The names for the values, possibly sorted. If the object is unsorted, then these are the same as declared at construction or np.arange(len(values))
if none were passed.
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.sort","title":"sort","text":"sort(\n reverse: bool = False,\n key: Literal[\"value\", \"variance\", \"index\", \"name\"] = \"value\",\n) -> None\n
Sorts the indices in place by key
.
Once sorted, iteration over the results, and indexing of all the properties ValuationResult.values, ValuationResult.variances, ValuationResult.counts, ValuationResult.indices and ValuationResult.names will follow the same order.
PARAMETER DESCRIPTION reverse
Whether to sort in descending order by value.
TYPE: bool
DEFAULT: False
key
The key to sort by. Defaults to ValueItem.value.
TYPE: Literal['value', 'variance', 'index', 'name']
DEFAULT: 'value'
Source code in src/pydvl/value/result.py
def sort(\n self,\n reverse: bool = False,\n # Need a \"Comparable\" type here\n key: Literal[\"value\", \"variance\", \"index\", \"name\"] = \"value\",\n) -> None:\n \"\"\"Sorts the indices in place by `key`.\n\n Once sorted, iteration over the results, and indexing of all the\n properties\n [ValuationResult.values][pydvl.value.result.ValuationResult.values],\n [ValuationResult.variances][pydvl.value.result.ValuationResult.variances],\n [ValuationResult.counts][pydvl.value.result.ValuationResult.counts],\n [ValuationResult.indices][pydvl.value.result.ValuationResult.indices]\n and [ValuationResult.names][pydvl.value.result.ValuationResult.names]\n will follow the same order.\n\n Args:\n reverse: Whether to sort in descending order by value.\n key: The key to sort by. Defaults to\n [ValueItem.value][pydvl.value.result.ValueItem].\n \"\"\"\n keymap = {\n \"index\": \"_indices\",\n \"value\": \"_values\",\n \"variance\": \"_variances\",\n \"name\": \"_names\",\n }\n self._sort_positions = np.argsort(getattr(self, keymap[key]))\n if reverse:\n self._sort_positions = self._sort_positions[::-1]\n self._sort_order = reverse\n
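Example A short sketch of sorting by descending value and checking the resulting order (random values are used only for illustration):
>>> from pydvl.value.result import ValuationResult\n>>> result = ValuationResult.from_random(size=5, seed=42)\n>>> result.sort(reverse=True)  # highest value first\n>>> values = result.values\n>>> all(values[i] >= values[i + 1] for i in range(len(values) - 1))\nTrue\n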
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.__getattr__","title":"__getattr__","text":"__getattr__(attr: str) -> Any\n
Allows access to extra values as if they were properties of the instance.
Source code in src/pydvl/value/result.py
def __getattr__(self, attr: str) -> Any:\n \"\"\"Allows access to extra values as if they were properties of the instance.\"\"\"\n # This is here to avoid a RecursionError when copying or pickling the object\n if attr == \"_extra_values\":\n raise AttributeError()\n try:\n return self._extra_values[attr]\n except KeyError as e:\n raise AttributeError(\n f\"{self.__class__.__name__} object has no attribute {attr}\"\n ) from e\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.__iter__","title":"__iter__","text":"__iter__() -> Iterator[ValueItem[IndexT, NameT]]\n
Iterate over the results returning ValueItem objects. To sort in place before iteration, use sort().
Source code in src/pydvl/value/result.py
def __iter__(self) -> Iterator[ValueItem[IndexT, NameT]]:\n \"\"\"Iterate over the results returning [ValueItem][pydvl.value.result.ValueItem] objects.\n To sort in place before iteration, use [sort()][pydvl.value.result.ValuationResult.sort].\n \"\"\"\n for pos in self._sort_positions:\n yield ValueItem(\n self._indices[pos],\n self._names[pos],\n self._values[pos],\n self._variances[pos],\n self._counts[pos],\n )\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.__add__","title":"__add__","text":"__add__(\n other: ValuationResult[IndexT, NameT]\n) -> ValuationResult[IndexT, NameT]\n
Adds two ValuationResults.
The values must have been computed with the same algorithm. An exception to this is if one argument has empty values, in which case the other argument is returned.
Warning
Abusing this will introduce numerical errors.
Means and standard errors are correctly handled. Statuses are added with bit-wise &
, see Status. data_names
are taken from the left summand, or, if unavailable, from the right one. The algorithm
string is carried over if both terms have the same one, and concatenated otherwise.
It is possible to add ValuationResults of different lengths, and with different or overlapping indices. The result will have the union of indices, with the values merged as described above.
Warning
FIXME: Arbitrary extra_values
aren't handled.
Source code in src/pydvl/value/result.py
def __add__(\n self, other: ValuationResult[IndexT, NameT]\n) -> ValuationResult[IndexT, NameT]:\n \"\"\"Adds two ValuationResults.\n\n The values must have been computed with the same algorithm. An exception\n to this is if one argument has empty values, in which case the other\n argument is returned.\n\n !!! Warning\n Abusing this will introduce numerical errors.\n\n Means and standard errors are correctly handled. Statuses are added with\n bit-wise `&`, see [Status][pydvl.value.result.Status].\n `data_names` are taken from the left summand, or if unavailable from\n the right one. The `algorithm` string is carried over if both terms\n have the same one or concatenated.\n\n It is possible to add ValuationResults of different lengths, and with\n different or overlapping indices. The result will have the union of\n indices, and the values.\n\n !!! Warning\n FIXME: Arbitrary `extra_values` aren't handled.\n\n \"\"\"\n # empty results\n if len(self.values) == 0:\n return other\n if len(other.values) == 0:\n return self\n\n self._check_compatible(other)\n\n indices = np.union1d(self._indices, other._indices).astype(self._indices.dtype)\n this_pos = np.searchsorted(indices, self._indices)\n other_pos = np.searchsorted(indices, other._indices)\n\n n: NDArray[np.int_] = np.zeros_like(indices, dtype=int)\n m: NDArray[np.int_] = np.zeros_like(indices, dtype=int)\n xn: NDArray[np.int_] = np.zeros_like(indices, dtype=float)\n xm: NDArray[np.int_] = np.zeros_like(indices, dtype=float)\n vn: NDArray[np.int_] = np.zeros_like(indices, dtype=float)\n vm: NDArray[np.int_] = np.zeros_like(indices, dtype=float)\n\n n[this_pos] = self._counts\n xn[this_pos] = self._values\n vn[this_pos] = self._variances\n m[other_pos] = other._counts\n xm[other_pos] = other._values\n vm[other_pos] = other._variances\n\n # np.maximum(1, n + m) covers case n = m = 0.\n n_m_sum = np.maximum(1, n + m)\n\n # Sample mean of n+m samples from two means of n and m samples\n xnm = (n * xn + m * xm) / n_m_sum\n\n # Sample variance of n+m samples from two sample variances of n and m samples\n vnm = (n * (vn + xn**2) + m * (vm + xm**2)) / n_m_sum - xnm**2\n\n if np.any(vnm < 0):\n if np.any(vnm < -1e-6):\n logger.warning(\n \"Numerical error in variance computation. 
\"\n f\"Negative sample variances clipped to 0 in {vnm}\"\n )\n vnm[np.where(vnm < 0)] = 0\n\n # Merging of names:\n # If an index has the same name in both results, it must be the same.\n # If an index has a name in one result but not the other, the name is\n # taken from the result with the name.\n if self._names.dtype != other._names.dtype:\n if np.can_cast(other._names.dtype, self._names.dtype, casting=\"safe\"):\n other._names = other._names.astype(self._names.dtype)\n logger.warning(\n f\"Casting ValuationResult.names from {other._names.dtype} to {self._names.dtype}\"\n )\n else:\n raise TypeError(\n f\"Cannot cast ValuationResult.names from \"\n f\"{other._names.dtype} to {self._names.dtype}\"\n )\n\n both_pos = np.intersect1d(this_pos, other_pos)\n\n if len(both_pos) > 0:\n this_names: NDArray = np.empty_like(indices, dtype=object)\n other_names: NDArray = np.empty_like(indices, dtype=object)\n this_names[this_pos] = self._names\n other_names[other_pos] = other._names\n\n this_shared_names = np.take(this_names, both_pos)\n other_shared_names = np.take(other_names, both_pos)\n\n if np.any(this_shared_names != other_shared_names):\n raise ValueError(f\"Mismatching names in ValuationResults\")\n\n names = np.empty_like(indices, dtype=self._names.dtype)\n names[this_pos] = self._names\n names[other_pos] = other._names\n\n return ValuationResult(\n algorithm=self.algorithm or other.algorithm or \"\",\n status=self.status & other.status,\n indices=indices,\n values=xnm,\n variances=vnm,\n counts=n + m,\n data_names=names,\n # FIXME: What to do with extra_values? This is not commutative:\n # extra_values=self._extra_values.update(other._extra_values),\n )\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.update","title":"update","text":"update(idx: int, new_value: float) -> ValuationResult[IndexT, NameT]\n
Updates the result in place with a new value, using running mean and variance.
PARAMETER DESCRIPTION idx
Data index of the value to update.
TYPE: int
new_value
New value to add to the result.
TYPE: float
RETURNS DESCRIPTION ValuationResult[IndexT, NameT]
A reference to the same, modified result.
RAISES DESCRIPTION IndexError
If the index is not found.
Source code in src/pydvl/value/result.py
def update(self, idx: int, new_value: float) -> ValuationResult[IndexT, NameT]:\n \"\"\"Updates the result in place with a new value, using running mean\n and variance.\n\n Args:\n idx: Data index of the value to update.\n new_value: New value to add to the result.\n\n Returns:\n A reference to the same, modified result.\n\n Raises:\n IndexError: If the index is not found.\n \"\"\"\n try:\n pos = self._positions[idx]\n except KeyError:\n raise IndexError(f\"Index {idx} not found in ValuationResult\")\n val, var = running_moments(\n self._values[pos], self._variances[pos], self._counts[pos], new_value\n )\n self[pos] = ValueItem(\n index=cast(IndexT, idx), # FIXME\n name=self._names[pos],\n value=val,\n variance=var,\n count=self._counts[pos] + 1,\n )\n return self\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.scale","title":"scale","text":"scale(factor: float, indices: Optional[NDArray[IndexT]] = None)\n
Scales the values and variances of the result by a coefficient.
PARAMETER DESCRIPTION factor
Factor to scale by.
TYPE: float
indices
Indices to scale. If None, all values are scaled.
TYPE: Optional[NDArray[IndexT]]
DEFAULT: None
Source code in src/pydvl/value/result.py
def scale(self, factor: float, indices: Optional[NDArray[IndexT]] = None):\n \"\"\"\n Scales the values and variances of the result by a coefficient.\n\n Args:\n factor: Factor to scale by.\n indices: Indices to scale. If None, all values are scaled.\n \"\"\"\n self._values[self._sort_positions[indices]] *= factor\n self._variances[self._sort_positions[indices]] *= factor**2\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.get","title":"get","text":"get(idx: Integral) -> ValueItem\n
Retrieves a ValueItem by data index, as opposed to sort index, like the indexing operator.
RAISES DESCRIPTION IndexError
If the index is not found.
Source code in src/pydvl/value/result.py
def get(self, idx: Integral) -> ValueItem:\n \"\"\"Retrieves a ValueItem by data index, as opposed to sort index, like\n the indexing operator.\n\n Raises:\n IndexError: If the index is not found.\n \"\"\"\n try:\n pos = self._positions[idx]\n except KeyError:\n raise IndexError(f\"Index {idx} not found in ValuationResult\")\n\n return ValueItem(\n self._indices[pos],\n self._names[pos],\n self._values[pos],\n self._variances[pos],\n self._counts[pos],\n )\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.to_dataframe","title":"to_dataframe","text":"to_dataframe(\n column: Optional[str] = None, use_names: bool = False\n) -> DataFrame\n
Returns values as a dataframe.
PARAMETER DESCRIPTION column
Name for the column holding the data value. Defaults to the name of the algorithm used.
TYPE: Optional[str]
DEFAULT: None
use_names
Whether to use data names instead of indices for the DataFrame's index.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION DataFrame
A dataframe with two columns, one for the values, with name given as explained in column
, and another with standard errors for approximate algorithms. The latter will be named column+'_stderr'
.
Source code in src/pydvl/value/result.py
def to_dataframe(\n self, column: Optional[str] = None, use_names: bool = False\n) -> pd.DataFrame:\n \"\"\"Returns values as a dataframe.\n\n Args:\n column: Name for the column holding the data value. Defaults to\n the name of the algorithm used.\n use_names: Whether to use data names instead of indices for the\n DataFrame's index.\n\n Returns:\n A dataframe with two columns, one for the values, with name\n given as explained in `column`, and another with standard errors for\n approximate algorithms. The latter will be named `column+'_stderr'`.\n \"\"\"\n column = column or self._algorithm\n df = pd.DataFrame(\n self._values[self._sort_positions],\n index=(\n self._names[self._sort_positions]\n if use_names\n else self._indices[self._sort_positions]\n ),\n columns=[column],\n )\n df[column + \"_stderr\"] = self.stderr[self._sort_positions]\n df[column + \"_updates\"] = self.counts[self._sort_positions]\n return df\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.from_random","title":"from_random classmethod
","text":"from_random(\n size: int,\n total: Optional[float] = None,\n seed: Optional[Seed] = None,\n **kwargs\n) -> \"ValuationResult\"\n
Creates a ValuationResult object and fills it with an array of random values from a uniform distribution in [-1,1]. The values can be made to sum up to a given total number (doing so will change their range).
PARAMETER DESCRIPTION size
Number of values to generate
TYPE: int
total
If set, the values are normalized to sum to this number (\"efficiency\" property of Shapley values).
TYPE: Optional[float]
DEFAULT: None
kwargs
Additional options to pass to the constructor of ValuationResult. Use to override status, names, etc.
DEFAULT: {}
RETURNS DESCRIPTION ValuationResult
A valuation result with its status set to Status.Converged by default.
RAISES DESCRIPTION ValueError
If size
is less than 1.
Changed in version 0.6.0
Added parameter total
. Check for zero size
Source code in src/pydvl/value/result.py
@classmethod\ndef from_random(\n cls,\n size: int,\n total: Optional[float] = None,\n seed: Optional[Seed] = None,\n **kwargs,\n) -> \"ValuationResult\":\n \"\"\"Creates a [ValuationResult][pydvl.value.result.ValuationResult] object and fills it with an array\n of random values from a uniform distribution in [-1,1]. The values can\n be made to sum up to a given total number (doing so will change their range).\n\n Args:\n size: Number of values to generate\n total: If set, the values are normalized to sum to this number\n (\"efficiency\" property of Shapley values).\n kwargs: Additional options to pass to the constructor of\n [ValuationResult][pydvl.value.result.ValuationResult]. Use to override status, names, etc.\n\n Returns:\n A valuation result with its status set to\n [Status.Converged][pydvl.utils.status.Status] by default.\n\n Raises:\n ValueError: If `size` is less than 1.\n\n !!! tip \"Changed in version 0.6.0\"\n Added parameter `total`. Check for zero size\n \"\"\"\n if size < 1:\n raise ValueError(\"Size must be a positive integer\")\n\n rng = np.random.default_rng(seed)\n values = rng.uniform(low=-1, high=1, size=size)\n if total is not None:\n values *= total / np.sum(values)\n\n options = dict(values=values, status=Status.Converged, algorithm=\"random\")\n options.update(kwargs)\n return cls(**options) # type: ignore\n
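As a usage sketch, a random result can serve as a baseline when benchmarking valuation methods (the values property is assumed to expose the array of values):

```python
import numpy as np
from pydvl.value.result import ValuationResult

# Random values normalized to sum to 1.0 ("efficiency"), reproducible via the seed.
baseline = ValuationResult.from_random(size=10, total=1.0, seed=42)
assert np.isclose(baseline.values.sum(), 1.0)
```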
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.empty","title":"empty classmethod
","text":"empty(\n algorithm: str = \"\",\n indices: Optional[Sequence[IndexT] | NDArray[IndexT]] = None,\n data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,\n n_samples: int = 0,\n) -> ValuationResult\n
Creates an empty ValuationResult object.
Empty results are characterised by having an empty array of values. When another result is added to an empty one, the empty one is discarded.
PARAMETER DESCRIPTION algorithm
Name of the algorithm used to compute the values
TYPE: str
DEFAULT: ''
indices
Optional sequence or array of indices.
TYPE: Optional[Sequence[IndexT] | NDArray[IndexT]]
DEFAULT: None
data_names
Optional sequences or array of names for the data points. Defaults to index numbers if not set.
TYPE: Optional[Sequence[NameT] | NDArray[NameT]]
DEFAULT: None
n_samples
Number of valuation result entries.
TYPE: int
DEFAULT: 0
RETURNS DESCRIPTION ValuationResult
Object with the results.
Source code in src/pydvl/value/result.py
@classmethod\ndef empty(\n cls,\n algorithm: str = \"\",\n indices: Optional[Sequence[IndexT] | NDArray[IndexT]] = None,\n data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,\n n_samples: int = 0,\n) -> ValuationResult:\n \"\"\"Creates an empty [ValuationResult][pydvl.value.result.ValuationResult] object.\n\n Empty results are characterised by having an empty array of values. When\n another result is added to an empty one, the empty one is discarded.\n\n Args:\n algorithm: Name of the algorithm used to compute the values\n indices: Optional sequence or array of indices.\n data_names: Optional sequences or array of names for the data points.\n Defaults to index numbers if not set.\n n_samples: Number of valuation result entries.\n\n Returns:\n Object with the results.\n \"\"\"\n if indices is not None or data_names is not None or n_samples != 0:\n return cls.zeros(\n algorithm=algorithm,\n indices=indices,\n data_names=data_names,\n n_samples=n_samples,\n )\n return cls(algorithm=algorithm, status=Status.Pending, values=np.array([]))\n
"},{"location":"api/pydvl/value/result/#pydvl.value.result.ValuationResult.zeros","title":"zeros classmethod
","text":"zeros(\n algorithm: str = \"\",\n indices: Optional[Sequence[IndexT] | NDArray[IndexT]] = None,\n data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,\n n_samples: int = 0,\n) -> ValuationResult\n
Creates an empty ValuationResult object.
Empty results are characterised by having an empty array of values. When another result is added to an empty one, the empty one is ignored.
PARAMETER DESCRIPTION algorithm
Name of the algorithm used to compute the values
TYPE: str
DEFAULT: ''
indices
Data indices to use. A copy will be made. If not given, the indices will be set to the range [0, n_samples)
.
TYPE: Optional[Sequence[IndexT] | NDArray[IndexT]]
DEFAULT: None
data_names
Data names to use. A copy will be made. If not given, the names will be set to the string representation of the indices.
TYPE: Optional[Sequence[NameT] | NDArray[NameT]]
DEFAULT: None
n_samples
Number of data points whose values are computed. If not given, the length of indices
will be used.
TYPE: int
DEFAULT: 0
RETURNS DESCRIPTION ValuationResult
Object with the results.
Source code in src/pydvl/value/result.py
@classmethod\ndef zeros(\n cls,\n algorithm: str = \"\",\n indices: Optional[Sequence[IndexT] | NDArray[IndexT]] = None,\n data_names: Optional[Sequence[NameT] | NDArray[NameT]] = None,\n n_samples: int = 0,\n) -> ValuationResult:\n \"\"\"Creates an empty [ValuationResult][pydvl.value.result.ValuationResult] object.\n\n Empty results are characterised by having an empty array of values. When\n another result is added to an empty one, the empty one is ignored.\n\n Args:\n algorithm: Name of the algorithm used to compute the values\n indices: Data indices to use. A copy will be made. If not given,\n the indices will be set to the range `[0, n_samples)`.\n data_names: Data names to use. A copy will be made. If not given,\n the names will be set to the string representation of the indices.\n n_samples: Number of data points whose values are computed. If\n not given, the length of `indices` will be used.\n\n Returns:\n Object with the results.\n \"\"\"\n if indices is None:\n indices = np.arange(n_samples, dtype=np.int_)\n else:\n indices = np.array(indices, dtype=np.int_)\n\n if data_names is None:\n data_names = np.array(indices)\n else:\n data_names = np.array(data_names)\n\n return cls(\n algorithm=algorithm,\n status=Status.Pending,\n indices=indices,\n data_names=data_names,\n values=np.zeros(len(indices)),\n variances=np.zeros(len(indices)),\n counts=np.zeros(len(indices), dtype=np.int_),\n )\n
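A short sketch of how a zero-initialized result is typically filled, using the same update mechanism that compute_generic_semivalues relies on (see its source further below):

```python
import numpy as np
from pydvl.value.result import ValuationResult

result = ValuationResult.zeros(algorithm="toy", indices=np.arange(5))
# Each update records one observed marginal for the given data index,
# keeping running means, variances and counts.
result.update(3, 0.5)
result.update(3, 0.7)
```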
"},{"location":"api/pydvl/value/sampler/","title":"Sampler","text":""},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler","title":"pydvl.value.sampler","text":"Samplers iterate over subsets of indices.
The classes in this module are used to iterate over an index set \(I\) as required for the computation of marginal utility for semi-values. The elements returned when iterating over any subclass of PowersetSampler
are tuples of the form \((i, S)\), where \(i\) is an index of interest, and \(S \subset I \setminus \{i\}\) is a subset of the complement of \(i\).
The iteration happens in two nested loops. An outer loop iterates over \\(I\\), and an inner loop iterates over the powerset of \\(I \\setminus \\{i\\}\\). The outer iteration can be either sequential or at random.
Note
This is the natural mode of iteration for the combinatorial definition of semi-values, in particular Shapley value. For the computation using permutations, adhering to this interface is not ideal, but we stick to it for consistency.
The samplers are used in the semivalues module to compute any semi-value, in particular Shapley and Beta values, and Banzhaf indices.
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler--slicing-of-samplers","title":"Slicing of samplers","text":"The samplers can be sliced for parallel computation. For those which are embarrassingly parallel, this is done by slicing the set of \"outer\" indices and returning new samplers over those slices. This includes all truly powerset-based samplers, such as DeterministicUniformSampler and UniformSampler. In contrast, slicing a PermutationSampler creates a new sampler which iterates over the same indices.
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler--references","title":"References","text":" -
Mitchell, Rory, Joshua Cooper, Eibe Frank, and Geoffrey Holmes. Sampling Permutations for Shapley Value Estimation. Journal of Machine Learning Research 23, no. 43 (2022): 1\u201346.\u00a0\u21a9
-
Wang, J.T. and Jia, R., 2023. Data Banzhaf: A Robust Data Valuation Framework for Machine Learning. In: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics, pp. 6388-6421.\u00a0\u21a9
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PowersetSampler","title":"PowersetSampler","text":"PowersetSampler(\n indices: NDArray[IndexT],\n index_iteration: IndexIteration = IndexIteration.Sequential,\n outer_indices: NDArray[IndexT] | None = None,\n **kwargs\n)\n
Bases: ABC
, Iterable[SampleT]
, Generic[IndexT]
Samplers are custom iterables over subsets of indices.
Calling iter()
on a sampler returns an iterator over tuples of the form \\((i, S)\\), where \\(i\\) is an index of interest, and \\(S \\subset I \\setminus \\{i\\}\\) is a subset of the complement of \\(i\\).
This is done in two nested loops, where the outer loop iterates over the set of indices, and the inner loop iterates over subsets of the complement of the current index. The outer iteration can be either sequential or at random.
Note
Samplers are not iterators themselves, so that each call to iter()
e.g. in a for loop creates a new iterator.
Example >>> for idx, s in DeterministicUniformSampler(np.arange(2)):\n>>> print(s, end=\"\")\n[][1,][][0,]\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PowersetSampler--methods-required-in-subclasses","title":"Methods required in subclasses","text":"Samplers must implement a weight() function to be used as a multiplier in Monte Carlo sums, so that the limit expectation coincides with the semi-value.
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PowersetSampler--slicing-of-samplers","title":"Slicing of samplers","text":"The samplers can be sliced for parallel computation. For those which are embarrassingly parallel, this is done by slicing the set of \"outer\" indices and returning new samplers over those slices.
PARAMETER DESCRIPTION indices
The set of items (indices) to sample from.
TYPE: NDArray[IndexT]
index_iteration
The order in which indices are iterated over.
TYPE: IndexIteration
DEFAULT: IndexIteration.Sequential
outer_indices
The set of items (indices) over which to iterate when sampling. Subsets are taken from the complement of each index in succession. For embarrassingly parallel computations, this set is sliced and the samplers are used to iterate over the slices.
TYPE: NDArray[IndexT] | None
DEFAULT: None
Source code in src/pydvl/value/sampler.py
def __init__(\n self,\n indices: NDArray[IndexT],\n index_iteration: IndexIteration = IndexIteration.Sequential,\n outer_indices: NDArray[IndexT] | None = None,\n **kwargs,\n):\n \"\"\"\n Args:\n indices: The set of items (indices) to sample from.\n index_iteration: the order in which indices are iterated over\n outer_indices: The set of items (indices) over which to iterate\n when sampling. Subsets are taken from the complement of each index\n in succession. For embarrassingly parallel computations, this set\n is sliced and the samplers are used to iterate over the slices.\n \"\"\"\n self._indices = indices\n self._index_iteration = index_iteration\n self._outer_indices = outer_indices if outer_indices is not None else indices\n self._n = len(indices)\n self._n_samples = 0\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PowersetSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better.
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PowersetSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PowersetSampler.weight","title":"weight abstractmethod
classmethod
","text":"weight(n: int, subset_len: int) -> float\n
Factor by which to multiply Monte Carlo samples, so that the mean converges to the desired expression.
By the Law of Large Numbers, the sample mean of \\(\\delta_i(S_j)\\) converges to the expectation under the distribution from which \\(S_j\\) is sampled.
\\[ \\frac{1}{m} \\sum_{j = 1}^m \\delta_i (S_j) c (S_j) \\longrightarrow \\underset{S \\sim \\mathcal{D}_{- i}}{\\mathbb{E}} [\\delta_i (S) c ( S)]\\] We add a factor \\(c(S_j)\\) in order to have this expectation coincide with the desired expression.
Source code in src/pydvl/value/sampler.py
@classmethod\n@abc.abstractmethod\ndef weight(cls, n: int, subset_len: int) -> float:\n r\"\"\"Factor by which to multiply Monte Carlo samples, so that the\n mean converges to the desired expression.\n\n By the Law of Large Numbers, the sample mean of $\\delta_i(S_j)$\n converges to the expectation under the distribution from which $S_j$ is\n sampled.\n\n $$ \\frac{1}{m} \\sum_{j = 1}^m \\delta_i (S_j) c (S_j) \\longrightarrow\n \\underset{S \\sim \\mathcal{D}_{- i}}{\\mathbb{E}} [\\delta_i (S) c (\n S)]$$\n\n We add a factor $c(S_j)$ in order to have this expectation coincide with\n the desired expression.\n \"\"\"\n ...\n
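To make the role of this correction concrete, here is a sketch with a toy, hypothetical utility (not part of pyDVL): averaging the weighted marginals drawn by UniformSampler approximates the plain sum of marginals over all subsets, which a semi-value coefficient \(w(k)\) would then reweight.

```python
import numpy as np
from pydvl.value.sampler import UniformSampler

def toy_utility(subset) -> float:  # hypothetical utility, for illustration only
    return float(len(subset)) ** 0.5

indices = np.arange(4)
n = len(indices)
sampler = UniformSampler(indices, seed=0)

sums = {i: 0.0 for i in indices}
counts = {i: 0 for i in indices}
for step, (idx, subset) in enumerate(sampler):
    marginal = toy_utility(set(subset) | {idx}) - toy_utility(set(subset))
    # The factor 2^(n-1) undoes the 2^-(n-1) probability of each sampled subset.
    sums[idx] += marginal * UniformSampler.weight(n, len(subset))
    counts[idx] += 1
    if step >= 8000:
        break

estimates = {i: sums[i] / counts[i] for i in indices}  # ≈ sum of marginals over all subsets
```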
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.StochasticSamplerMixin","title":"StochasticSamplerMixin","text":"StochasticSamplerMixin(*args, seed: Optional[Seed] = None, **kwargs)\n
Mixin class for samplers which use a random number generator.
Source code in src/pydvl/value/sampler.py
def __init__(self, *args, seed: Optional[Seed] = None, **kwargs):\n super().__init__(*args, **kwargs)\n self._rng = np.random.default_rng(seed)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.DeterministicUniformSampler","title":"DeterministicUniformSampler","text":"DeterministicUniformSampler(indices: NDArray[IndexT], *args, **kwargs)\n
Bases: PowersetSampler[IndexT]
For every index \\(i\\), each subset of the complement indices - {i}
is returned.
Note
Indices are always iterated over sequentially, irrespective of the value of index_iteration
upon construction.
Example >>> for idx, s in DeterministicUniformSampler(np.arange(2)):\n>>> print(f\"{idx} - {s}\", end=\", \")\n0 - [], 0 - [1], 1 - [], 1 - [0],\n
PARAMETER DESCRIPTION indices
The set of items (indices) to sample from.
TYPE: NDArray[IndexT]
Source code in src/pydvl/value/sampler.py
def __init__(self, indices: NDArray[IndexT], *args, **kwargs):\n \"\"\"An iterator to perform uniform deterministic sampling of subsets.\n\n For every index $i$, each subset of the complement `indices - {i}` is\n returned.\n\n !!! Note\n Indices are always iterated over sequentially, irrespective of\n the value of `index_iteration` upon construction.\n\n ??? Example\n ``` pycon\n >>> for idx, s in DeterministicUniformSampler(np.arange(2)):\n >>> print(f\"{idx} - {s}\", end=\", \")\n 1 - [], 1 - [2], 2 - [], 2 - [1],\n ```\n\n Args:\n indices: The set of items (indices) to sample from.\n \"\"\"\n # Force sequential iteration\n kwargs.update({\"index_iteration\": PowersetSampler.IndexIteration.Sequential})\n super().__init__(indices, *args, **kwargs)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.DeterministicUniformSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better.
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.DeterministicUniformSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.UniformSampler","title":"UniformSampler","text":"UniformSampler(*args, seed: Optional[Seed] = None, **kwargs)\n
Bases: StochasticSamplerMixin
, PowersetSampler[IndexT]
An iterator to perform uniform random sampling of subsets.
Iterating over every index \\(i\\), either in sequence or at random depending on the value of index_iteration
, one subset of the complement indices - {i}
is sampled with equal probability \(2^{-(n-1)}\). The iterator never ends.
Example The code
for idx, s in UniformSampler(np.arange(5)):\n    print(f\"{idx} - {s}\", end=\", \")\n
Produces the output: 0 - [1 4], 1 - [2 3], 2 - [0 1 3], 3 - [], 4 - [2], 0 - [1 3 4], 1 - [0 2]\n(...)\n
Source code in src/pydvl/value/sampler.py
def __init__(self, *args, seed: Optional[Seed] = None, **kwargs):\n super().__init__(*args, **kwargs)\n self._rng = np.random.default_rng(seed)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.UniformSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better.
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.UniformSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.UniformSampler.weight","title":"weight classmethod
","text":"weight(n: int, subset_len: int) -> float\n
Correction coming from Monte Carlo integration, so that the mean of the marginals converges to the value: the uniform distribution over the powerset of a set with n-1 elements has mass \(2^{-(n-1)}\) on each subset, hence the correction factor \(2^{n-1}\).
Source code in src/pydvl/value/sampler.py
@classmethod\ndef weight(cls, n: int, subset_len: int) -> float:\n \"\"\"Correction coming from Monte Carlo integration so that the mean of\n the marginals converges to the value: the uniform distribution over the\n powerset of a set with n-1 elements has mass 2^{n-1} over each subset.\"\"\"\n return float(2 ** (n - 1)) if n > 0 else 1.0\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.MSRSampler","title":"MSRSampler","text":"MSRSampler(*args, seed: Optional[Seed] = None, **kwargs)\n
Bases: StochasticSamplerMixin
, PowersetSampler[IndexT]
An iterator to perform sampling of random subsets.
This sampler does not return any index; it only returns subsets of the data. It is used in (Wang et al.)2.
Source code in src/pydvl/value/sampler.py
def __init__(self, *args, seed: Optional[Seed] = None, **kwargs):\n super().__init__(*args, **kwargs)\n self._rng = np.random.default_rng(seed)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.MSRSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better.
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.MSRSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.AntitheticSampler","title":"AntitheticSampler","text":"AntitheticSampler(*args, seed: Optional[Seed] = None, **kwargs)\n
Bases: StochasticSamplerMixin
, PowersetSampler[IndexT]
An iterator to perform uniform random sampling of subsets, and their complements.
Works as UniformSampler, but for every tuple \\((i,S)\\), it subsequently returns \\((i,S^c)\\), where \\(S^c\\) is the complement of the set \\(S\\) in the set of indices, excluding \\(i\\).
Source code in src/pydvl/value/sampler.py
def __init__(self, *args, seed: Optional[Seed] = None, **kwargs):\n super().__init__(*args, **kwargs)\n self._rng = np.random.default_rng(seed)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.AntitheticSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better.
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.AntitheticSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PermutationSampler","title":"PermutationSampler","text":"PermutationSampler(*args, seed: Optional[Seed] = None, **kwargs)\n
Bases: StochasticSamplerMixin
, PowersetSampler[IndexT]
Sample permutations of indices and iterate through each returning increasing subsets, as required for the permutation definition of semi-values.
This sampler does not implement the two loops described in PowersetSampler. Instead, for a permutation (3,1,4,2)
, it returns in sequence the tuples of index and sets: (3, {})
, (1, {3})
, (4, {3,1})
and (2, {3,1,4})
.
Note that the full index set is never returned.
Warning
This sampler requires caching to be enabled, or computation will be doubled with respect to a \"direct\" implementation of permutation MC.
Source code in src/pydvl/value/sampler.py
def __init__(self, *args, seed: Optional[Seed] = None, **kwargs):\n super().__init__(*args, **kwargs)\n self._rng = np.random.default_rng(seed)\n
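To see why caching matters here, this sketch unrolls one permutation into marginal contributions with a hypothetical utility: each prefix utility is the new term of one marginal and the old term of the next, so keeping the previous value (or a cache) roughly halves the number of utility evaluations.

```python
def u(subset) -> float:  # hypothetical utility, for illustration only
    return float(sum(subset))

permutation = (3, 1, 4, 2)  # one permutation, as drawn by the sampler
prefix = []                 # mirrors the growing subsets yielded as (idx, prefix)
previous = u(prefix)
for idx in permutation:
    current = u(prefix + [idx])
    marginal = current - previous  # U(S + {i}) - U(S), reusing the previous evaluation
    print(idx, marginal)
    prefix.append(idx)
    previous = current
```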
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PermutationSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better.
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PermutationSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.PermutationSampler.__getitem__","title":"__getitem__","text":"__getitem__(key: slice | list[int]) -> PowersetSampler[IndexT]\n
Permutation samplers cannot be split across indices, so we return a copy of the full sampler.
Source code in src/pydvl/value/sampler.py
def __getitem__(self, key: slice | list[int]) -> PowersetSampler[IndexT]:\n \"\"\"Permutation samplers cannot be split across indices, so we return\n a copy of the full sampler.\"\"\"\n return super().__getitem__(slice(None))\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.AntitheticPermutationSampler","title":"AntitheticPermutationSampler","text":"AntitheticPermutationSampler(*args, seed: Optional[Seed] = None, **kwargs)\n
Bases: PermutationSampler[IndexT]
Samples permutations like PermutationSampler, but after each permutation, it returns the same permutation in reverse order.
This sampler was suggested in (Mitchell et al. 2022)1
New in version 0.7.1
Source code in src/pydvl/value/sampler.py
def __init__(self, *args, seed: Optional[Seed] = None, **kwargs):\n super().__init__(*args, **kwargs)\n self._rng = np.random.default_rng(seed)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.AntitheticPermutationSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better.
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.AntitheticPermutationSampler.__getitem__","title":"__getitem__","text":"__getitem__(key: slice | list[int]) -> PowersetSampler[IndexT]\n
Permutation samplers cannot be split across indices, so we return a copy of the full sampler.
Source code in src/pydvl/value/sampler.py
def __getitem__(self, key: slice | list[int]) -> PowersetSampler[IndexT]:\n \"\"\"Permutation samplers cannot be split across indices, so we return\n a copy of the full sampler.\"\"\"\n return super().__getitem__(slice(None))\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.AntitheticPermutationSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.DeterministicPermutationSampler","title":"DeterministicPermutationSampler","text":"DeterministicPermutationSampler(*args, seed: Optional[Seed] = None, **kwargs)\n
Bases: PermutationSampler[IndexT]
Samples all n! permutations of the indices deterministically, and iterates through them, returning sets as required for the permutation-based definition of semi-values.
Warning
This sampler requires caching to be enabled, or computation will be doubled with respect to a \"direct\" implementation of permutation MC.
Warning
This sampler is not parallelizable, as it always iterates over the whole set of permutations in the same order. Different processes would always return the same values for all indices.
Source code in src/pydvl/value/sampler.py
def __init__(self, *args, seed: Optional[Seed] = None, **kwargs):\n super().__init__(*args, **kwargs)\n self._rng = np.random.default_rng(seed)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.DeterministicPermutationSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better.
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.DeterministicPermutationSampler.__getitem__","title":"__getitem__","text":"__getitem__(key: slice | list[int]) -> PowersetSampler[IndexT]\n
Permutation samplers cannot be split across indices, so we return a copy of the full sampler.
Source code in src/pydvl/value/sampler.py
def __getitem__(self, key: slice | list[int]) -> PowersetSampler[IndexT]:\n \"\"\"Permutation samplers cannot be split across indices, so we return\n a copy of the full sampler.\"\"\"\n return super().__getitem__(slice(None))\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.DeterministicPermutationSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.RandomHierarchicalSampler","title":"RandomHierarchicalSampler","text":"RandomHierarchicalSampler(*args, seed: Optional[Seed] = None, **kwargs)\n
Bases: StochasticSamplerMixin
, PowersetSampler[IndexT]
For every index, sample a set size, then a set of that size.
Todo
This is unnecessary, but a step towards proper stratified sampling.
Source code in src/pydvl/value/sampler.py
def __init__(self, *args, seed: Optional[Seed] = None, **kwargs):\n super().__init__(*args, **kwargs)\n self._rng = np.random.default_rng(seed)\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.RandomHierarchicalSampler.iterindices","title":"iterindices","text":"iterindices() -> Iterator[IndexT]\n
Iterates over indices in the order specified at construction.
FIXME: this is probably not very useful, but I couldn't decide which method is better.
Source code in src/pydvl/value/sampler.py
def iterindices(self) -> Iterator[IndexT]:\n \"\"\"Iterates over indices in the order specified at construction.\n\n FIXME: this is probably not very useful, but I couldn't decide\n which method is better\n \"\"\"\n if self._index_iteration is PowersetSampler.IndexIteration.Sequential:\n for idx in self._outer_indices:\n yield idx\n elif self._index_iteration is PowersetSampler.IndexIteration.Random:\n while True:\n yield np.random.choice(self._outer_indices, size=1).item()\n
"},{"location":"api/pydvl/value/sampler/#pydvl.value.sampler.RandomHierarchicalSampler.__len__","title":"__len__","text":"__len__() -> int\n
Returns the number of outer indices over which the sampler iterates.
Source code in src/pydvl/value/sampler.py
def __len__(self) -> int:\n \"\"\"Returns the number of outer indices over which the sampler iterates.\"\"\"\n return len(self._outer_indices)\n
"},{"location":"api/pydvl/value/semivalues/","title":"Semivalues","text":""},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues","title":"pydvl.value.semivalues","text":"This module provides the core functionality for the computation of generic semi-values. A semi-value is any valuation function with the form:
\\[v_\\text{semi}(i) = \\sum_{i=1}^n w(k) \\sum_{S \\subset D_{-i}^{(k)}} [U(S_{+i})-U(S)],\\] where the coefficients \\(w(k)\\) satisfy the property:
\\[\\sum_{k=1}^n w(k) = 1.\\] Note For implementation consistency, we slightly depart from the common definition of semi-values, which includes a factor \\(1/n\\) in the sum over subsets. Instead, we subsume this factor into the coefficient \\(w(k)\\).
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues--main-components","title":"Main components","text":"The computation of a semi-value requires two components:
- A subset sampler that generates subsets of the set \\(D\\) of interest.
- A coefficient \\(w(k)\\) that assigns a weight to each subset size \\(k\\).
Samplers can be found in sampler, and can be classified into two categories: powerset samplers and permutation samplers. Powerset samplers generate subsets of \\(D_{-i}\\), while the permutation sampler generates permutations of \\(D\\). The former conform to the above definition of semi-values, while the latter reformulates it as:
\\[ v(i) = \\frac{1}{n!} \\sum_{\\sigma \\in \\Pi(n)} \\tilde{w}( | \\sigma_{:i} | )[U(\\sigma_{:i} \\cup \\{i\\}) \u2212 U(\\sigma_{:i})], \\] where \\(\\sigma_{:i}\\) denotes the set of indices in permutation sigma before the position where \\(i\\) appears (see Data valuation for details), and
\\[ \\tilde{w} (k) = n \\binom{n - 1}{k} w (k) \\] is the weight correction due to the reformulation.
Warning
Both PermutationSampler and DeterministicPermutationSampler require caching to be enabled, or computation will be doubled with respect to a 'direct' implementation of permutation MC.
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues--computing-semi-values","title":"Computing semi-values","text":"Samplers and coefficients can be arbitrarily mixed by means of the main entry point of this module, compute_generic_semivalues. There are several pre-defined coefficients, including the Shapley value of (Ghorbani and Zou, 2019)1, the Banzhaf index of (Wang and Jia)3, and the Beta coefficient of (Kwon and Zou, 2022)2. For each of these methods, there is a convenience wrapper function. Respectively, these are: compute_shapley_semivalues, compute_banzhaf_semivalues, and compute_beta_shapley_semivalues. instead.
Parallelization and batching
In order to ensure reproducibility and fine-grained control of parallelization, samples are generated in the main process and then distributed to worker processes for evaluation. For small sample sizes, this can lead to a significant overhead. To avoid this, we temporarily provide an additional argument batch_size
to all methods, which can improve performance with small models by up to an order of magnitude. Note that this argument will be removed before version 1.0 in favour of a more general solution.
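A minimal usage sketch for one of these wrappers. The dataset, model and stopping criterion are placeholders; Dataset, Utility and their constructor arguments are assumptions based on the pyDVL API referenced throughout this page and should be adapted to the installed version.

```python
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from pydvl.utils import Dataset, Utility
from pydvl.value.semivalues import compute_shapley_semivalues
from pydvl.value.stopping import MaxUpdates

# Placeholder data and model; any sklearn-compatible estimator works.
dataset = Dataset.from_sklearn(load_breast_cancer(), train_size=0.7)
utility = Utility(LogisticRegression(max_iter=500), dataset)
values = compute_shapley_semivalues(
    utility, done=MaxUpdates(100), n_jobs=4, progress=True
)
print(values.to_dataframe(column="shapley").head())
```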
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues--references","title":"References","text":" -
Ghorbani, A., Zou, J., 2019. Data Shapley: Equitable Valuation of Data for Machine Learning. In: Proceedings of the 36th International Conference on Machine Learning, PMLR, pp. 2242\u20132251.\u00a0\u21a9
-
Kwon, Y. and Zou, J., 2022. Beta Shapley: A Unified and Noise-reduced Data Valuation Framework for Machine Learning. In: Proceedings of the 25th International Conference on Artificial Intelligence and Statistics (AISTATS) 2022, Vol. 151. PMLR, Valencia, Spain.\u00a0\u21a9
-
Wang, J.T. and Jia, R., 2023. Data Banzhaf: A Robust Data Valuation Framework for Machine Learning. In: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics, pp. 6388-6421.\u00a0\u21a9
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.SVCoefficient","title":"SVCoefficient","text":" Bases: Protocol
The protocol that coefficients for the computation of semi-values must fulfill.
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.SVCoefficient.__call__","title":"__call__","text":"__call__(n: int, k: int) -> float\n
Computes the coefficient for a given subset size.
PARAMETER DESCRIPTION n
Total number of elements in the set.
TYPE: int
k
Size of the subset for which the coefficient is being computed
TYPE: int
Source code in src/pydvl/value/semivalues.py
def __call__(self, n: int, k: int) -> float:\n \"\"\"Computes the coefficient for a given subset size.\n\n Args:\n n: Total number of elements in the set.\n k: Size of the subset for which the coefficient is being computed\n \"\"\"\n ...\n
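Any callable with this signature can act as a coefficient for compute_generic_semivalues. A sketch of a custom coefficient; the constant, Banzhaf-style weight is used purely as an illustration:

```python
def constant_coefficient(n: int, k: int) -> float:
    """Illustrative SVCoefficient: weighs every subset size equally (Banzhaf-style)."""
    return 1.0 / 2 ** (n - 1)
```

Such a function can be passed directly as the coefficient argument of compute_generic_semivalues, as sketched further below.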
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.DefaultMarginal","title":"DefaultMarginal","text":" Bases: MarginalFunction
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.DefaultMarginal.__call__","title":"__call__","text":"__call__(\n u: Utility, coefficient: SVCoefficient, samples: Iterable[SampleT]\n) -> Tuple[MarginalT, ...]\n
Computation of marginal utility. This is a helper function for compute_generic_semivalues.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
coefficient
The semivalue coefficient and sampler weight
TYPE: SVCoefficient
samples
A collection of samples. Each sample is a tuple of index and subset of indices to compute a marginal utility.
TYPE: Iterable[SampleT]
RETURNS DESCRIPTION Tuple[MarginalT, ...]
A collection of marginals. Each marginal is a tuple with index and its marginal utility.
Source code in src/pydvl/value/semivalues.py
def __call__(\n self, u: Utility, coefficient: SVCoefficient, samples: Iterable[SampleT]\n) -> Tuple[MarginalT, ...]:\n \"\"\"Computation of marginal utility. This is a helper function for\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues].\n\n Args:\n u: Utility object with model, data, and scoring function.\n coefficient: The semivalue coefficient and sampler weight\n samples: A collection of samples. Each sample is a tuple of index and subset of\n indices to compute a marginal utility.\n\n Returns:\n A collection of marginals. Each marginal is a tuple with index and its marginal\n utility.\n \"\"\"\n n = len(u.data)\n marginals: List[MarginalT] = []\n for idx, s in samples:\n marginal = (u({idx}.union(s)) - u(s)) * coefficient(n, len(s))\n marginals.append((idx, marginal))\n return tuple(marginals)\n
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.RawUtility","title":"RawUtility","text":" Bases: MarginalFunction
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.RawUtility.__call__","title":"__call__","text":"__call__(\n u: Utility, coefficient: SVCoefficient, samples: Iterable[SampleT]\n) -> Tuple[MarginalT, ...]\n
Computation of raw utility without marginalization. This is a helper function for compute_generic_semivalues.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
coefficient
The semivalue coefficient and sampler weight
TYPE: SVCoefficient
samples
A collection of samples. Each sample is a tuple of index and subset of indices to compute a marginal utility.
TYPE: Iterable[SampleT]
RETURNS DESCRIPTION Tuple[MarginalT, ...]
A collection of marginals. Each marginal is a tuple with index and its raw utility.
Source code in src/pydvl/value/semivalues.py
def __call__(\n self, u: Utility, coefficient: SVCoefficient, samples: Iterable[SampleT]\n) -> Tuple[MarginalT, ...]:\n \"\"\"Computation of raw utility without marginalization. This is a helper function for\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues].\n\n Args:\n u: Utility object with model, data, and scoring function.\n coefficient: The semivalue coefficient and sampler weight\n samples: A collection of samples. Each sample is a tuple of index and subset of\n indices to compute a marginal utility.\n\n Returns:\n A collection of marginals. Each marginal is a tuple with index and its raw utility.\n \"\"\"\n marginals: List[MarginalT] = []\n for idx, s in samples:\n marginals.append((s, u(s)))\n return tuple(marginals)\n
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.FutureProcessor","title":"FutureProcessor","text":"The FutureProcessor class used to process the results of the parallel marginal evaluations.
The marginals are evaluated in parallel by n_jobs
threads, but some algorithms require a central method to postprocess the marginal results. This can be achieved through the future processor. This base class does not perform any postprocessing; it is a no-op used in most data valuation algorithms.
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.MSRFutureProcessor","title":"MSRFutureProcessor","text":"MSRFutureProcessor(u: Utility)\n
Bases: FutureProcessor
This FutureProcessor processes the raw marginals in a way that MSR sampling requires.
MSR sampling evaluates the utility once, and then updates all data semivalues based on this one evaluation. In order to do this, the RawUtility value needs to be postprocessed through this class. For more details on MSR, please refer to the paper (Wang et al.)3. This processor keeps track of the current values and computes marginals for all data points, so that the values in the ValuationResult can be updated properly down the line.
Source code in src/pydvl/value/semivalues.py
def __init__(self, u: Utility):\n self.n = len(u.data)\n self.all_indices = u.data.indices.copy()\n self.point_in_subset = np.zeros((self.n,))\n self.positive_sums = np.zeros((self.n,))\n self.negative_sums = np.zeros((self.n,))\n self.total_evaluations = 0\n
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.MSRFutureProcessor.__call__","title":"__call__","text":"__call__(\n future_result: List[Tuple[List[IndexT], float]]\n) -> List[List[MarginalT]]\n
Computation of marginal utility using Maximum Sample Reuse.
This processor requires the Marginal Function to be set to RawUtility. Then, this processor computes marginals based on the utility value and the index set provided.
The final formula that gives the Banzhaf semivalue using MSR is:
\[ \hat{\phi}_{MSR}(i) = \frac{1}{|\mathbf{S}_{\ni i}|} \sum_{S \in \mathbf{S}_{\ni i}} U(S) - \frac{1}{|\mathbf{S}_{\not{\ni} i}|} \sum_{S \in \mathbf{S}_{\not{\ni} i}} U(S) \]
PARAMETER DESCRIPTION future_result
Result of the parallel computing jobs, comprised of a list of indices that were used to evaluate the utility, and the evaluation result (metric).
TYPE: List[Tuple[List[IndexT], float]]
RETURNS DESCRIPTION List[List[MarginalT]]
A collection of marginals. Each marginal is a tuple with index and its marginal utility.
Source code in src/pydvl/value/semivalues.py
def __call__(\n self, future_result: List[Tuple[List[IndexT], float]]\n) -> List[List[MarginalT]]:\n \"\"\"Computation of marginal utility using Maximum Sample Reuse.\n\n This processor requires the Marginal Function to be set to RawUtility.\n Then, this processor computes marginals based on the utility value and the index set provided.\n\n The final formula that gives the Banzhaf semivalue using MSR is:\n $$\\hat{\\phi}_{MSR}(i) = \\frac{1}{|\\mathbf{S}_{\\ni i}|} \\sum_{S \\in \\mathbf{S}_{\\ni i}} U(S)\n - \\frac{1}{|\\mathbf{S}_{\\not{\\ni} i}|} \\sum_{S \\in \\mathbf{S}_{\\not{\\ni} i}} U(S)$$\n\n Args:\n future_result: Result of the parallel computing jobs comprised of\n a list of indices that were used to evaluate the utility, and the evaluation result (metric).\n\n Returns:\n A collection of marginals. Each marginal is a tuple with index and its marginal\n utility.\n \"\"\"\n marginals: List[List[MarginalT]] = []\n for batch_id, (s, evaluation) in enumerate(future_result):\n previous_values = self.compute_values()\n self.total_evaluations += 1\n self.point_in_subset[s] += 1\n self.positive_sums[s] += evaluation\n not_s = np.setdiff1d(self.all_indices, s)\n self.negative_sums[not_s] += evaluation\n new_values = self.compute_values()\n # Hack to work around the update mechanic that does not work out of the box for MSR\n marginal_vals = (\n self.total_evaluations * new_values\n - (self.total_evaluations - 1) * previous_values\n )\n marginals.append([])\n for data_index in range(self.n):\n marginals[batch_id].append(\n (data_index, float(marginal_vals[data_index]))\n )\n return marginals\n
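Stripped of the incremental bookkeeping above, the MSR estimate for a point is simply the mean utility over sampled subsets containing it minus the mean over subsets not containing it. A toy sketch with made-up utilities:

```python
import numpy as np

# Hypothetical (subset, utility) pairs, as MSRSampler + RawUtility would produce.
samples = [({0, 1}, 0.8), ({1}, 0.5), ({0, 2}, 0.7), (set(), 0.1)]

def msr_banzhaf_estimate(i: int) -> float:
    with_i = [u for s, u in samples if i in s]
    without_i = [u for s, u in samples if i not in s]
    return float(np.mean(with_i) - np.mean(without_i))

print(msr_banzhaf_estimate(0))  # (0.8 + 0.7)/2 - (0.5 + 0.1)/2 = 0.45
```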
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.SemiValueMode","title":"SemiValueMode","text":" Bases: str
, Enum
Enumeration of semi-value modes.
Deprecation notice
This enum and the associated methods are deprecated and will be removed in 0.8.0.
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.compute_generic_semivalues","title":"compute_generic_semivalues","text":"compute_generic_semivalues(\n sampler: PowersetSampler[IndexT],\n u: Utility,\n coefficient: SVCoefficient,\n done: StoppingCriterion,\n *,\n marginal: MarginalFunction = DefaultMarginal(),\n future_processor: FutureProcessor = FutureProcessor(),\n batch_size: int = 1,\n skip_converged: bool = False,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False\n) -> ValuationResult\n
Computes semi-values for a given utility function and subset sampler.
PARAMETER DESCRIPTION sampler
The subset sampler to use for utility computations.
TYPE: PowersetSampler[IndexT]
u
Utility object with model, data, and scoring function.
TYPE: Utility
coefficient
The semi-value coefficient
TYPE: SVCoefficient
done
Stopping criterion.
TYPE: StoppingCriterion
marginal
Marginal function to be used for computing the semivalues
TYPE: MarginalFunction
DEFAULT: DefaultMarginal()
future_processor
Additional postprocessing steps required for some algorithms
TYPE: FutureProcessor
DEFAULT: FutureProcessor()
batch_size
Number of marginal evaluations per single parallel job.
TYPE: int
DEFAULT: 1
skip_converged
Whether to skip marginal evaluations for indices that have already converged. CAUTION: This is only entirely safe if the stopping criterion is MaxUpdates. For any other stopping criterion, the convergence status of indices may change during the computation, or they may be marked as having converged even though in fact the estimated values are far from the true values (e.g. for AbsoluteStandardError, you will probably have to carefully adjust the threshold).
TYPE: bool
DEFAULT: False
n_jobs
Number of parallel jobs to use.
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display a progress bar.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION ValuationResult
Object with the results.
Deprecation notice
Parameter batch_size
is for experimental use and will be removed in future versions.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/semivalues.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef compute_generic_semivalues(\n sampler: PowersetSampler[IndexT],\n u: Utility,\n coefficient: SVCoefficient,\n done: StoppingCriterion,\n *,\n marginal: MarginalFunction = DefaultMarginal(),\n future_processor: FutureProcessor = FutureProcessor(),\n batch_size: int = 1,\n skip_converged: bool = False,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n) -> ValuationResult:\n \"\"\"Computes semi-values for a given utility function and subset sampler.\n\n Args:\n sampler: The subset sampler to use for utility computations.\n u: Utility object with model, data, and scoring function.\n coefficient: The semi-value coefficient\n done: Stopping criterion.\n marginal: Marginal function to be used for computing the semivalues\n future_processor: Additional postprocessing steps required for some algorithms\n batch_size: Number of marginal evaluations per single parallel job.\n skip_converged: Whether to skip marginal evaluations for indices that\n have already converged. **CAUTION**: This is only entirely safe if\n the stopping criterion is [MaxUpdates][pydvl.value.stopping.MaxUpdates].\n For any other stopping criterion, the convergence status of indices\n may change during the computation, or they may be marked as having\n converged even though in fact the estimated values are far from the\n true values (e.g. for\n [AbsoluteStandardError][pydvl.value.stopping.AbsoluteStandardError],\n you will probably have to carefully adjust the threshold).\n n_jobs: Number of parallel jobs to use.\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display a progress bar.\n\n Returns:\n Object with the results.\n\n !!! warning \"Deprecation notice\"\n Parameter `batch_size` is for experimental use and will be removed in\n future versions.\n\n !!! tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n if isinstance(sampler, PermutationSampler) and u.cache is None:\n log.warning(\n \"PermutationSampler requires caching to be enabled or computation \"\n \"will be doubled wrt. 
a 'direct' implementation of permutation MC\"\n )\n\n if batch_size != 1:\n warnings.warn(\n \"Parameter `batch_size` is for experimental use and will be\"\n \" removed in future versions\",\n DeprecationWarning,\n )\n\n result = ValuationResult.zeros(\n algorithm=f\"semivalue-{str(sampler)}-{coefficient.__name__}\", # type: ignore\n indices=u.data.indices,\n data_names=u.data.data_names,\n )\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n u = parallel_backend.put(u)\n correction = parallel_backend.put(\n lambda n, k: coefficient(n, k) * sampler.weight(n, k)\n )\n\n max_workers = parallel_backend.effective_n_jobs(n_jobs)\n n_submitted_jobs = 2 * max_workers # number of jobs in the queue\n\n sampler_it = iter(sampler)\n pbar = tqdm(disable=not progress, total=100, unit=\"%\")\n\n with parallel_backend.executor(\n max_workers=max_workers, cancel_futures=True\n ) as executor:\n pending: set[Future] = set()\n while True:\n pbar.n = 100 * done.completion()\n pbar.refresh()\n\n completed, pending = wait(pending, timeout=1, return_when=FIRST_COMPLETED)\n for future in completed:\n processed_future = future_processor(\n future.result()\n ) # List of tuples or\n for batch_future in processed_future:\n if isinstance(batch_future, list): # Case when batch size is > 1\n for idx, marginal_val in batch_future:\n result.update(idx, marginal_val)\n else: # Batch size 1\n idx, marginal_val = batch_future\n result.update(idx, marginal_val)\n if done(result):\n return result\n\n # Ensure that we always have n_submitted_jobs running\n try:\n while len(pending) < n_submitted_jobs:\n samples = tuple(islice(sampler_it, batch_size))\n if len(samples) == 0:\n raise StopIteration\n\n # Filter out samples for indices that have already converged\n filtered_samples = samples\n if skip_converged and np.count_nonzero(done.converged) > 0:\n # TODO: cloudpickle can't pickle result of `filter` on python 3.8\n filtered_samples = tuple(\n filter(lambda t: not done.converged[t[0]], samples)\n )\n\n if filtered_samples:\n pending.add(\n executor.submit(\n marginal,\n u=u,\n coefficient=correction,\n samples=filtered_samples,\n )\n )\n except StopIteration:\n if len(pending) == 0:\n return result\n
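A hedged usage sketch of the generic entry point, mixing a sampler with a hand-written coefficient. The utility object is assumed to be a pyDVL Utility built elsewhere; the coefficient is the illustrative constant weight from the SVCoefficient example above.

```python
from pydvl.value.sampler import UniformSampler
from pydvl.value.semivalues import compute_generic_semivalues
from pydvl.value.stopping import MaxUpdates

def constant_coefficient(n: int, k: int) -> float:
    return 1.0 / 2 ** (n - 1)

# `utility` wraps a model, a Dataset and a scorer (constructed elsewhere).
result = compute_generic_semivalues(
    UniformSampler(utility.data.indices, seed=16),
    utility,
    constant_coefficient,
    done=MaxUpdates(500),
    n_jobs=4,
    progress=True,
)
```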
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.compute_shapley_semivalues","title":"compute_shapley_semivalues","text":"compute_shapley_semivalues(\n u: Utility,\n *,\n done: StoppingCriterion,\n sampler_t: Type[StochasticSampler] = PermutationSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Computes Shapley values for a given utility function.
This is a convenience wrapper for compute_generic_semivalues with the Shapley coefficient. Use compute_shapley_values for a more flexible interface and additional methods, including TMCS.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
done
Stopping criterion.
TYPE: StoppingCriterion
sampler_t
The sampler type to use. See the sampler module for a list.
TYPE: Type[StochasticSampler]
DEFAULT: PermutationSampler
batch_size
Number of marginal evaluations per single parallel job.
TYPE: int
DEFAULT: 1
n_jobs
Number of parallel jobs to use.
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
progress
Whether to display a progress bar.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION ValuationResult
Object with the results.
Deprecation notice
Parameter batch_size
is for experimental use and will be removed in future versions.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/semivalues.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef compute_shapley_semivalues(\n u: Utility,\n *,\n done: StoppingCriterion,\n sampler_t: Type[StochasticSampler] = PermutationSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> ValuationResult:\n \"\"\"Computes Shapley values for a given utility function.\n\n This is a convenience wrapper for\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues]\n with the Shapley coefficient. Use\n [compute_shapley_values][pydvl.value.shapley.common.compute_shapley_values]\n for a more flexible interface and additional methods, including TMCS.\n\n Args:\n u: Utility object with model, data, and scoring function.\n done: Stopping criterion.\n sampler_t: The sampler type to use. See the\n [sampler][pydvl.value.sampler] module for a list.\n batch_size: Number of marginal evaluations per single parallel job.\n n_jobs: Number of parallel jobs to use.\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n seed: Either an instance of a numpy random number generator or a seed\n for it.\n progress: Whether to display a progress bar.\n\n Returns:\n Object with the results.\n\n !!! warning \"Deprecation notice\"\n Parameter `batch_size` is for experimental use and will be removed in\n future versions.\n\n !!! tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n # HACK: cannot infer return type because of useless IndexT, NameT\n return compute_generic_semivalues( # type: ignore\n sampler_t(u.data.indices, seed=seed),\n u,\n shapley_coefficient,\n done,\n batch_size=batch_size,\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n config=config,\n progress=progress,\n )\n
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.compute_banzhaf_semivalues","title":"compute_banzhaf_semivalues","text":"compute_banzhaf_semivalues(\n u: Utility,\n *,\n done: StoppingCriterion,\n sampler_t: Type[StochasticSampler] = PermutationSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Computes Banzhaf values for a given utility function.
This is a convenience wrapper for compute_generic_semivalues with the Banzhaf coefficient.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
done
Stopping criterion.
TYPE: StoppingCriterion
sampler_t
The sampler type to use. See the sampler module for a list.
TYPE: Type[StochasticSampler]
DEFAULT: PermutationSampler
batch_size
Number of marginal evaluations per single parallel job.
TYPE: int
DEFAULT: 1
n_jobs
Number of parallel jobs to use.
TYPE: int
DEFAULT: 1
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display a progress bar.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION ValuationResult
Object with the results.
Deprecation notice
Parameter batch_size
is for experimental use and will be removed in future versions.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/semivalues.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef compute_banzhaf_semivalues(\n u: Utility,\n *,\n done: StoppingCriterion,\n sampler_t: Type[StochasticSampler] = PermutationSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> ValuationResult:\n \"\"\"Computes Banzhaf values for a given utility function.\n\n This is a convenience wrapper for\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues]\n with the Banzhaf coefficient.\n\n Args:\n u: Utility object with model, data, and scoring function.\n done: Stopping criterion.\n sampler_t: The sampler type to use. See the\n [sampler][pydvl.value.sampler] module for a list.\n batch_size: Number of marginal evaluations per single parallel job.\n n_jobs: Number of parallel jobs to use.\n seed: Either an instance of a numpy random number generator or a seed\n for it.\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display a progress bar.\n\n Returns:\n Object with the results.\n\n !!! warning \"Deprecation notice\"\n Parameter `batch_size` is for experimental use and will be removed in\n future versions.\n\n !!! tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n # HACK: cannot infer return type because of useless IndexT, NameT\n return compute_generic_semivalues( # type: ignore\n sampler_t(u.data.indices, seed=seed),\n u,\n banzhaf_coefficient,\n done,\n batch_size=batch_size,\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n config=config,\n progress=progress,\n )\n
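As a hypothetical sketch, one might swap the default PermutationSampler for another stochastic sampler from the sampler module; the dataset, model and budget below are placeholders.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Utility
from pydvl.value import MaxUpdates, compute_banzhaf_semivalues
from pydvl.value.sampler import AntitheticSampler

utility = Utility(LogisticRegression(max_iter=500), Dataset.from_sklearn(load_iris()))

# Banzhaf values using an antithetic powerset sampler instead of permutations.
result = compute_banzhaf_semivalues(
    utility,
    done=MaxUpdates(500),       # placeholder budget
    sampler_t=AntitheticSampler,
    seed=7,
)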
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.compute_msr_banzhaf_semivalues","title":"compute_msr_banzhaf_semivalues","text":"compute_msr_banzhaf_semivalues(\n u: Utility,\n *,\n done: StoppingCriterion,\n sampler_t: Type[StochasticSampler] = MSRSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Computes MSR sampled Banzhaf values for a given utility function.
This is a convenience wrapper for compute_generic_semivalues with the Banzhaf coefficient and MSR sampling.
This algorithm works by sampling random subsets and evaluating the utility on each subset only once. Based on the evaluation and the subset indices, the MSRFutureProcessor then computes the marginal updates as in the paper (Wang et al.)3. Their approach updates the semivalues for all data points every time a new evaluation is computed, which increases sample efficiency compared to standard Monte Carlo updates.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
done
Stopping criterion.
TYPE: StoppingCriterion
sampler_t
The sampler type to use. See the sampler module for a list.
TYPE: Type[StochasticSampler]
DEFAULT: MSRSampler
batch_size
Number of marginal evaluations per single parallel job.
TYPE: int
DEFAULT: 1
n_jobs
Number of parallel jobs to use.
TYPE: int
DEFAULT: 1
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
config
Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display a progress bar.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION ValuationResult
Object with the results.
Deprecation notice
Parameter batch_size
is for experimental use and will be removed in future versions.
Source code in src/pydvl/value/semivalues.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef compute_msr_banzhaf_semivalues(\n u: Utility,\n *,\n done: StoppingCriterion,\n sampler_t: Type[StochasticSampler] = MSRSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> ValuationResult:\n \"\"\"Computes MSR sampled Banzhaf values for a given utility function.\n\n This is a convenience wrapper for\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues]\n with the Banzhaf coefficient and MSR sampling.\n\n This algorithm works by sampling random subsets and then evaluating the utility\n on that subset only once. Based on the evaluation and the subset indices,\n the MSRFutureProcessor then computes the marginal updates like in the paper\n (Wang et. al.)<sup><a href=\"wang_data_2023\">3</a></sup>.\n Their approach updates the semivalues for all data points every time a new evaluation\n is computed. This increases sample efficiency compared to normal Monte Carlo updates.\n\n Args:\n u: Utility object with model, data, and scoring function.\n done: Stopping criterion.\n sampler_t: The sampler type to use. See the\n [sampler][pydvl.value.sampler] module for a list.\n batch_size: Number of marginal evaluations per single parallel job.\n n_jobs: Number of parallel jobs to use.\n seed: Either an instance of a numpy random number generator or a seed\n for it.\n config: Object configuring parallel computation, with cluster address,\n number of cpus, etc.\n progress: Whether to display a progress bar.\n\n Returns:\n Object with the results.\n\n !!! warning \"Deprecation notice\"\n Parameter `batch_size` is for experimental use and will be removed in\n future versions.\n \"\"\"\n # HACK: cannot infer return type because of useless IndexT, NameT\n return compute_generic_semivalues( # type: ignore\n sampler_t(u.data.indices, seed=seed),\n u,\n always_one_coefficient,\n done,\n marginal=RawUtility(),\n future_processor=MSRFutureProcessor(u),\n batch_size=batch_size,\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n config=config,\n progress=progress,\n )\n
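A minimal sketch of calling this wrapper with a small scikit-learn setup; all numbers are placeholders, and MSRSampler is already the default sampler here.

from sklearn.datasets import load_wine
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Utility
from pydvl.value import MaxChecks
from pydvl.value.semivalues import compute_msr_banzhaf_semivalues

utility = Utility(LogisticRegression(max_iter=500), Dataset.from_sklearn(load_wine()))

# Each sampled subset is evaluated once and the result is reused to update all indices.
result = compute_msr_banzhaf_semivalues(
    utility,
    done=MaxChecks(1000),  # placeholder budget
    seed=0,
)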
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.compute_beta_shapley_semivalues","title":"compute_beta_shapley_semivalues","text":"compute_beta_shapley_semivalues(\n u: Utility,\n *,\n alpha: float = 1,\n beta: float = 1,\n done: StoppingCriterion,\n sampler_t: Type[StochasticSampler] = PermutationSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Computes Beta Shapley values for a given utility function.
This is a convenience wrapper for compute_generic_semivalues with the Beta Shapley coefficient.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
alpha
Alpha parameter of the Beta distribution.
TYPE: float
DEFAULT: 1
beta
Beta parameter of the Beta distribution.
TYPE: float
DEFAULT: 1
done
Stopping criterion.
TYPE: StoppingCriterion
sampler_t
The sampler type to use. See the sampler module for a list.
TYPE: Type[StochasticSampler]
DEFAULT: PermutationSampler
batch_size
Number of marginal evaluations per (parallelized) task.
TYPE: int
DEFAULT: 1
n_jobs
Number of parallel jobs to use.
TYPE: int
DEFAULT: 1
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display a progress bar.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION ValuationResult
Object with the results.
Deprecation notice
Parameter batch_size
is for experimental use and will be removed in future versions.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/semivalues.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef compute_beta_shapley_semivalues(\n u: Utility,\n *,\n alpha: float = 1,\n beta: float = 1,\n done: StoppingCriterion,\n sampler_t: Type[StochasticSampler] = PermutationSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> ValuationResult:\n \"\"\"Computes Beta Shapley values for a given utility function.\n\n This is a convenience wrapper for\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues]\n with the Beta Shapley coefficient.\n\n Args:\n u: Utility object with model, data, and scoring function.\n alpha: Alpha parameter of the Beta distribution.\n beta: Beta parameter of the Beta distribution.\n done: Stopping criterion.\n sampler_t: The sampler type to use. See the\n [sampler][pydvl.value.sampler] module for a list.\n batch_size: Number of marginal evaluations per (parallelized) task.\n n_jobs: Number of parallel jobs to use.\n seed: Either an instance of a numpy random number generator or a seed for it.\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display a progress bar.\n\n Returns:\n Object with the results.\n\n !!! warning \"Deprecation notice\"\n Parameter `batch_size` is for experimental use and will be removed in\n future versions.\n\n !!! tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n # HACK: cannot infer return type because of useless IndexT, NameT\n return compute_generic_semivalues( # type: ignore\n sampler_t(u.data.indices, seed=seed),\n u,\n beta_coefficient(alpha, beta),\n done,\n batch_size=batch_size,\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n config=config,\n progress=progress,\n )\n
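A hypothetical sketch with placeholder parameters: alpha and beta skew the weighting over subset sizes, and setting both to 1 recovers the standard Shapley weighting.

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

from pydvl.utils import Dataset, Utility
from pydvl.value import AbsoluteStandardError, MaxUpdates, compute_beta_shapley_semivalues

utility = Utility(KNeighborsClassifier(n_neighbors=5), Dataset.from_sklearn(load_iris()))

# Beta(1, 16) weighting, stopping on low standard error or after 1000 updates per index.
result = compute_beta_shapley_semivalues(
    utility,
    alpha=1,
    beta=16,
    done=AbsoluteStandardError(threshold=1e-2, burn_in=16) | MaxUpdates(1000),
    seed=3,
)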
"},{"location":"api/pydvl/value/semivalues/#pydvl.value.semivalues.compute_semivalues","title":"compute_semivalues","text":"compute_semivalues(\n u: Utility,\n *,\n done: StoppingCriterion,\n mode: SemiValueMode = SemiValueMode.Shapley,\n sampler_t: Type[StochasticSampler] = PermutationSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n seed: Optional[Seed] = None,\n **kwargs\n) -> ValuationResult\n
Convenience entry point for most common semi-value computations.
Deprecation warning
This method is deprecated and will be replaced in 0.8.0 by the more general implementation of compute_generic_semivalues. Use compute_shapley_semivalues, compute_banzhaf_semivalues, or compute_beta_shapley_semivalues instead.
The modes supported with this interface are the following. For greater flexibility use compute_generic_semivalues directly.
- SemiValueMode.Shapley: Shapley values.
- SemiValueMode.BetaShapley: Implements the Beta Shapley semi-value as introduced in (Kwon and Zou, 2022)1. Pass additional keyword arguments alpha and beta to set the parameters of the Beta distribution (both default to 1).
- SemiValueMode.Banzhaf: Implements the Banzhaf semi-value as introduced in (Wang and Jia, 2022)1.
See Data valuation for an overview of valuation.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
done
Stopping criterion.
TYPE: StoppingCriterion
mode
The semi-value mode to use. See SemiValueMode for a list.
TYPE: SemiValueMode
DEFAULT: Shapley
sampler_t
The sampler type to use. See sampler for a list.
TYPE: Type[StochasticSampler]
DEFAULT: PermutationSampler
batch_size
Number of marginal evaluations per (parallelized) task.
TYPE: int
DEFAULT: 1
n_jobs
Number of parallel jobs to use.
TYPE: int
DEFAULT: 1
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
kwargs
Additional keyword arguments passed to compute_generic_semivalues.
DEFAULT: {}
RETURNS DESCRIPTION ValuationResult
Object with the results.
Deprecation notice
Parameter batch_size
is for experimental use and will be removed in future versions.
Source code in src/pydvl/value/semivalues.py
@deprecated(target=True, deprecated_in=\"0.7.0\", remove_in=\"0.8.0\")\ndef compute_semivalues(\n u: Utility,\n *,\n done: StoppingCriterion,\n mode: SemiValueMode = SemiValueMode.Shapley,\n sampler_t: Type[StochasticSampler] = PermutationSampler,\n batch_size: int = 1,\n n_jobs: int = 1,\n seed: Optional[Seed] = None,\n **kwargs,\n) -> ValuationResult:\n \"\"\"Convenience entry point for most common semi-value computations.\n\n !!! warning \"Deprecation warning\"\n This method is deprecated and will be replaced in 0.8.0 by the more\n general implementation of\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues].\n Use\n [compute_shapley_semivalues][pydvl.value.semivalues.compute_shapley_semivalues],\n [compute_banzhaf_semivalues][pydvl.value.semivalues.compute_banzhaf_semivalues],\n or\n [compute_beta_shapley_semivalues][pydvl.value.semivalues.compute_beta_shapley_semivalues]\n instead.\n\n The modes supported with this interface are the following. For greater\n flexibility use\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues]\n directly.\n\n - [SemiValueMode.Shapley][pydvl.value.semivalues.SemiValueMode]:\n Shapley values.\n - [SemiValueMode.BetaShapley][pydvl.value.semivalues.SemiValueMode]:\n Implements the Beta Shapley semi-value as introduced in\n (Kwon and Zou, 2022)<sup><a href=\"#kwon_beta_2022\">1</a></sup>.\n Pass additional keyword arguments `alpha` and `beta` to set the\n parameters of the Beta distribution (both default to 1).\n - [SemiValueMode.Banzhaf][pydvl.value.semivalues.SemiValueMode]: Implements\n the Banzhaf semi-value as introduced in (Wang and Jia, 2022)<sup><a\n href=\"#wang_data_2023\">1</a></sup>.\n\n See [Data valuation][data-valuation] for an overview of valuation.\n\n Args:\n u: Utility object with model, data, and scoring function.\n done: Stopping criterion.\n mode: The semi-value mode to use. See\n [SemiValueMode][pydvl.value.semivalues.SemiValueMode] for a list.\n sampler_t: The sampler type to use. See [sampler][pydvl.value.sampler]\n for a list.\n batch_size: Number of marginal evaluations per (parallelized) task.\n n_jobs: Number of parallel jobs to use.\n seed: Either an instance of a numpy random number generator or a seed for it.\n kwargs: Additional keyword arguments passed to\n [compute_generic_semivalues][pydvl.value.semivalues.compute_generic_semivalues].\n\n Returns:\n Object with the results.\n\n !!! warning \"Deprecation notice\"\n Parameter `batch_size` is for experimental use and will be removed in\n future versions.\n \"\"\"\n if mode == SemiValueMode.Shapley:\n coefficient = shapley_coefficient\n elif mode == SemiValueMode.BetaShapley:\n alpha = kwargs.pop(\"alpha\", 1)\n beta = kwargs.pop(\"beta\", 1)\n coefficient = beta_coefficient(alpha, beta)\n elif mode == SemiValueMode.Banzhaf:\n coefficient = banzhaf_coefficient\n else:\n raise ValueError(f\"Unknown mode {mode}\")\n coefficient = cast(SVCoefficient, coefficient)\n\n # HACK: cannot infer return type because of useless IndexT, NameT\n return compute_generic_semivalues( # type: ignore\n sampler_t(u.data.indices, seed=seed),\n u,\n coefficient,\n done,\n n_jobs=n_jobs,\n batch_size=batch_size,\n **kwargs,\n )\n
"},{"location":"api/pydvl/value/stopping/","title":"Stopping","text":""},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping","title":"pydvl.value.stopping","text":"Stopping criteria for value computations.
This module provides a basic set of stopping criteria, like MaxUpdates, MaxTime, or HistoryDeviation among others. These can behave in different ways depending on the context. For example, MaxUpdates limits the number of updates to values, which depending on the algorithm may mean a different number of utility evaluations or imply other computations like solving a linear or quadratic program.
Stopping criteria are callables that are evaluated on a ValuationResult and return a Status object. They can be combined using boolean operators.
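Concretely, a criterion is called on a ValuationResult and returns a Status; the sketch below uses a dummy, zero-valued result purely for illustration.

import numpy as np

from pydvl.utils.status import Status
from pydvl.value import MaxUpdates, MinUpdates, ValuationResult

result = ValuationResult(values=np.zeros(5), algorithm="dummy")  # placeholder result

done = MinUpdates(10) | MaxUpdates(1000)  # stop as soon as either criterion is met
status = done(result)                     # evaluates both operands on the result
print(status, done.completion())          # Status.Pending and a completion value in [0, 1]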
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping--how-convergence-is-determined","title":"How convergence is determined","text":"Most stopping criteria keep track of the convergence of each index separately but make global decisions based on the overall convergence of some fraction of all indices. For example, if we have a stopping criterion that checks whether the standard error of 90% of values is below a threshold, then methods will keep updating all indices until 90% of them have converged, irrespective of the quality of the individual estimates, and without freezing updates for indices along the way as values individually attain low standard error.
This has some practical implications, because some values do tend to converge sooner than others. For example, assume we use the criterion AbsoluteStandardError(0.02) | MaxUpdates(1000)
. Then values close to 0 might be marked as \"converged\" rather quickly because they fulfill the first criterion, say after 20 iterations, despite being poor estimates. Because other indices take much longer to reach a low standard error and the criterion is a global check, the \"converged\" ones keep being updated and end up being good estimates. In this case the extra updates are beneficial, but one might not wish for converged values to be updated if one is sure that the criterion is adequate for individual values.
Semi-value methods include a parameter skip_converged
that allows skipping the computation of values that have converged. The way to avoid doing this too early is to use a more stringent check, e.g. AbsoluteStandardError(1e-3) | MaxUpdates(1000)
. With skip_converged=True
this check can still take less time than the first one, despite requiring more iterations for some indices.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping--choosing-a-stopping-criterion","title":"Choosing a stopping criterion","text":"The choice of a stopping criterion greatly depends on the algorithm and the context. A safe bet is to combine a MaxUpdates or a MaxTime with a HistoryDeviation or an AbsoluteStandardError. The former will ensure that the computation does not run for too long, while the latter will try to achieve results that are stable enough. Note however that if the threshold is too strict, one will always end up running until a maximum number of iterations or time. Also keep in mind that different values converge at different times, so you might want to use tight thresholds and skip_converged
as described above for semi-values.
Example from pydvl.value import AbsoluteStandardError, MaxUpdates, compute_banzhaf_semivalues\n\nutility = ... # some utility object\ncriterion = AbsoluteStandardError(threshold=1e-3, burn_in=32) | MaxUpdates(1000)\nvalues = compute_banzhaf_semivalues(\n utility,\n criterion,\n skip_converged=True, # skip values that have converged (CAREFUL!)\n)\n
This will compute the Banzhaf semivalues for utility
until either the absolute standard error is below 1e-3
or 1000
updates have been performed. The burn_in
parameter is used to discard the first 32
updates from the computation of the standard error. The skip_converged
parameter is used to avoid computing more marginals for indices that have converged, which is useful if AbsoluteStandardError is met before MaxUpdates for some indices. Warning
Be careful not to reuse the same stopping criterion for different computations. The object has state and will not be reset between calls to value computation methods. If you need to reuse the same criterion, you should create a new instance.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping--creating-stopping-criteria","title":"Creating stopping criteria","text":"The easiest way is to declare a function implementing the interface StoppingCriterionCallable and wrap it with make_criterion(). This creates a StoppingCriterion object that can be composed with other stopping criteria.
Alternatively, and in particular if reporting of completion is required, one can inherit from this class and implement the abstract methods _check
and completion.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping--combining-stopping-criteria","title":"Combining stopping criteria","text":"Objects of type StoppingCriterion can be combined with the binary operators &
(and), and |
(or), following the truth tables of Status. The unary operator ~
(not) is also supported. See StoppingCriterion for details on how these operations affect the behavior of the stopping criteria.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping--references","title":"References","text":" -
Ghorbani, A., Zou, J., 2019. Data Shapley: Equitable Valuation of Data for Machine Learning. In: Proceedings of the 36th International Conference on Machine Learning, PMLR, pp. 2242\u20132251.\u00a0\u21a9
-
Wang, J.T. and Jia, R., 2023. Data Banzhaf: A Robust Data Valuation Framework for Machine Learning. In: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics, pp. 6388-6421.\u00a0\u21a9
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.StoppingCriterionCallable","title":"StoppingCriterionCallable","text":" Bases: Protocol
Signature for a stopping criterion
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.StoppingCriterion","title":"StoppingCriterion","text":"StoppingCriterion(modify_result: bool = True)\n
Bases: ABC
A composable callable object to determine whether a computation must stop.
A StoppingCriterion
is a callable taking a ValuationResult and returning a Status. It also keeps track of individual convergence of values with converged, and reports the overall completion of the computation with completion.
Instances of StoppingCriterion
can be composed with the binary operators &
(and), and |
(or), following the truth tables of Status. The unary operator ~
(not) is also supported. These boolean operations act according to the following rules:
- The results of
check()
are combined with the operator. See Status for the truth tables. - The results of converged are combined with the operator (returning another boolean array).
- The completion method returns the min, max, or the complement to 1 of the completions of the operands, for AND, OR and NOT respectively. This is required for cases where one of the criteria does not keep track of the convergence of single values, e.g. MaxUpdates, because completion by default returns the mean of the boolean convergence array.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.StoppingCriterion--subclassing","title":"Subclassing","text":"Subclassing this class requires implementing a check()
method that returns a Status object based on a given ValuationResult. This method should update the attribute _converged
, which is a boolean array indicating whether the value for each index has converged. When this does not make sense for a particular stopping criterion, completion should be overridden to provide an overall completion value, since its default implementation attempts to compute the mean of _converged
.
PARAMETER DESCRIPTION modify_result
If True
the status of the input ValuationResult is modified in place after the call.
TYPE: bool
DEFAULT: True
Source code in src/pydvl/value/stopping.py
def __init__(self, modify_result: bool = True):\n self.modify_result = modify_result\n self._converged = np.full(0, False)\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.StoppingCriterion.converged","title":"converged property
","text":"converged: NDArray[bool_]\n
Returns a boolean array indicating whether the values have converged for each data point.
Inheriting classes must set the _converged
attribute in their check()
.
RETURNS DESCRIPTION NDArray[bool_]
A boolean array indicating whether the values have converged for each data point.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.StoppingCriterion.completion","title":"completion","text":"completion() -> float\n
Returns a value between 0 and 1 indicating the completion of the computation.
Source code in src/pydvl/value/stopping.py
def completion(self) -> float:\n \"\"\"Returns a value between 0 and 1 indicating the completion of the\n computation.\n \"\"\"\n if self.converged.size == 0:\n return 0.0\n return float(np.mean(self.converged).item())\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.StoppingCriterion.__call__","title":"__call__","text":"__call__(result: ValuationResult) -> Status\n
Calls check()
, maybe updating the result.
Source code in src/pydvl/value/stopping.py
def __call__(self, result: ValuationResult) -> Status:\n \"\"\"Calls `check()`, maybe updating the result.\"\"\"\n if len(result) == 0:\n logger.warning(\n \"At least one iteration finished but no results where generated. \"\n \"Please check that your scorer and utility return valid numbers.\"\n )\n status = self._check(result)\n if self.modify_result: # FIXME: this is not nice\n result._status = status\n return status\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.AbsoluteStandardError","title":"AbsoluteStandardError","text":"AbsoluteStandardError(\n threshold: float,\n fraction: float = 1.0,\n burn_in: int = 4,\n modify_result: bool = True,\n)\n
Bases: StoppingCriterion
Determine convergence based on the standard error of the values.
If \\(s_i\\) is the standard error for datum \\(i\\), then this criterion returns Converged if \\(s_i < \\epsilon\\) for all \\(i\\) and a threshold value \\(\\epsilon \\gt 0\\).
PARAMETER DESCRIPTION threshold
A value is considered to have converged if the standard error is below this threshold. A way of choosing it is to pick some percentage of the range of the values. For Shapley values this is the difference between the maximum and minimum of the utility function (to see this substitute the maximum and minimum values of the utility into the marginal contribution formula).
TYPE: float
fraction
The fraction of values that must have converged for the criterion to return Converged.
TYPE: float
DEFAULT: 1.0
burn_in
The number of iterations to ignore before checking for convergence. This is required because computations typically start with zero variance, as a result of using zeros(). The default is set to an arbitrary minimum which is usually enough but may need to be increased.
TYPE: int
DEFAULT: 4
Source code in src/pydvl/value/stopping.py
def __init__(\n self,\n threshold: float,\n fraction: float = 1.0,\n burn_in: int = 4,\n modify_result: bool = True,\n):\n super().__init__(modify_result=modify_result)\n self.threshold = threshold\n self.fraction = fraction\n self.burn_in = burn_in\n
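For example, if utility values (such as accuracies) range roughly over [0, 1], a threshold of a few percent of that range is a reasonable, if arbitrary, starting point:

from pydvl.value import AbsoluteStandardError, MaxUpdates

# Converged when 95% of the indices have standard error below 0.02,
# but never run for more than 5000 updates per index.
done = AbsoluteStandardError(threshold=0.02, fraction=0.95, burn_in=16) | MaxUpdates(5000)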
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.AbsoluteStandardError.converged","title":"converged property
","text":"converged: NDArray[bool_]\n
Returns a boolean array indicating whether the values have converged for each data point.
Inheriting classes must set the _converged
attribute in their check()
.
RETURNS DESCRIPTION NDArray[bool_]
A boolean array indicating whether the values have converged for each data point.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.AbsoluteStandardError.completion","title":"completion","text":"completion() -> float\n
Returns a value between 0 and 1 indicating the completion of the computation.
Source code in src/pydvl/value/stopping.py
def completion(self) -> float:\n \"\"\"Returns a value between 0 and 1 indicating the completion of the\n computation.\n \"\"\"\n if self.converged.size == 0:\n return 0.0\n return float(np.mean(self.converged).item())\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.AbsoluteStandardError.__call__","title":"__call__","text":"__call__(result: ValuationResult) -> Status\n
Calls check()
, maybe updating the result.
Source code in src/pydvl/value/stopping.py
def __call__(self, result: ValuationResult) -> Status:\n \"\"\"Calls `check()`, maybe updating the result.\"\"\"\n if len(result) == 0:\n logger.warning(\n \"At least one iteration finished but no results where generated. \"\n \"Please check that your scorer and utility return valid numbers.\"\n )\n status = self._check(result)\n if self.modify_result: # FIXME: this is not nice\n result._status = status\n return status\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxChecks","title":"MaxChecks","text":"MaxChecks(n_checks: Optional[int], modify_result: bool = True)\n
Bases: StoppingCriterion
Terminate as soon as the number of checks exceeds the threshold.
A \"check\" is one call to the criterion.
PARAMETER DESCRIPTION n_checks
Threshold: if None
, no _check is performed, effectively creating a (never) stopping criterion that always returns Pending
.
TYPE: Optional[int]
Source code in src/pydvl/value/stopping.py
def __init__(self, n_checks: Optional[int], modify_result: bool = True):\n super().__init__(modify_result=modify_result)\n if n_checks is not None and n_checks < 1:\n raise ValueError(\"n_iterations must be at least 1 or None\")\n self.n_checks = n_checks\n self._count = 0\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxChecks.converged","title":"converged property
","text":"converged: NDArray[bool_]\n
Returns a boolean array indicating whether the values have converged for each data point.
Inheriting classes must set the _converged
attribute in their check()
.
RETURNS DESCRIPTION NDArray[bool_]
A boolean array indicating whether the values have converged for each data point.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxChecks.__call__","title":"__call__","text":"__call__(result: ValuationResult) -> Status\n
Calls check()
, maybe updating the result.
Source code in src/pydvl/value/stopping.py
def __call__(self, result: ValuationResult) -> Status:\n \"\"\"Calls `check()`, maybe updating the result.\"\"\"\n if len(result) == 0:\n logger.warning(\n \"At least one iteration finished but no results where generated. \"\n \"Please check that your scorer and utility return valid numbers.\"\n )\n status = self._check(result)\n if self.modify_result: # FIXME: this is not nice\n result._status = status\n return status\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxUpdates","title":"MaxUpdates","text":"MaxUpdates(n_updates: Optional[int], modify_result: bool = True)\n
Bases: StoppingCriterion
Terminate if any number of value updates exceeds or equals the given threshold.
Note
If you want to ensure that all values have been updated, you probably want MinUpdates instead.
This checks the counts
field of a ValuationResult, i.e. the number of times that each index has been updated. For powerset samplers, the maximum of this number coincides with the maximum number of subsets sampled. For permutation samplers, it coincides with the number of permutations sampled.
PARAMETER DESCRIPTION n_updates
Threshold: if None
, no _check is performed, effectively creating a (never) stopping criterion that always returns Pending
.
TYPE: Optional[int]
Source code in src/pydvl/value/stopping.py
def __init__(self, n_updates: Optional[int], modify_result: bool = True):\n super().__init__(modify_result=modify_result)\n if n_updates is not None and n_updates < 1:\n raise ValueError(\"n_updates must be at least 1 or None\")\n self.n_updates = n_updates\n self.last_max = 0\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxUpdates.converged","title":"converged property
","text":"converged: NDArray[bool_]\n
Returns a boolean array indicating whether the values have converged for each data point.
Inheriting classes must set the _converged
attribute in their check()
.
RETURNS DESCRIPTION NDArray[bool_]
A boolean array indicating whether the values have converged for each data point.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxUpdates.__call__","title":"__call__","text":"__call__(result: ValuationResult) -> Status\n
Calls check()
, maybe updating the result.
Source code in src/pydvl/value/stopping.py
def __call__(self, result: ValuationResult) -> Status:\n \"\"\"Calls `check()`, maybe updating the result.\"\"\"\n if len(result) == 0:\n logger.warning(\n \"At least one iteration finished but no results where generated. \"\n \"Please check that your scorer and utility return valid numbers.\"\n )\n status = self._check(result)\n if self.modify_result: # FIXME: this is not nice\n result._status = status\n return status\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MinUpdates","title":"MinUpdates","text":"MinUpdates(n_updates: Optional[int], modify_result: bool = True)\n
Bases: StoppingCriterion
Terminate as soon as the number of updates to every value exceeds or equals the given threshold.
This checks the counts
field of a ValuationResult, i.e. the number of times that each index has been updated. For powerset samplers, the minimum of this number is a lower bound for the number of subsets sampled. For permutation samplers, it lower-bounds the amount of permutations sampled.
PARAMETER DESCRIPTION n_updates
Threshold: if None
, no _check is performed, effectively creating a (never) stopping criterion that always returns Pending
.
TYPE: Optional[int]
Source code in src/pydvl/value/stopping.py
def __init__(self, n_updates: Optional[int], modify_result: bool = True):\n super().__init__(modify_result=modify_result)\n self.n_updates = n_updates\n self.last_min = 0\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MinUpdates.converged","title":"converged property
","text":"converged: NDArray[bool_]\n
Returns a boolean array indicating whether the values have converged for each data point.
Inheriting classes must set the _converged
attribute in their check()
.
RETURNS DESCRIPTION NDArray[bool_]
A boolean array indicating whether the values have converged for each data point.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MinUpdates.__call__","title":"__call__","text":"__call__(result: ValuationResult) -> Status\n
Calls check()
, maybe updating the result.
Source code in src/pydvl/value/stopping.py
def __call__(self, result: ValuationResult) -> Status:\n \"\"\"Calls `check()`, maybe updating the result.\"\"\"\n if len(result) == 0:\n logger.warning(\n \"At least one iteration finished but no results where generated. \"\n \"Please check that your scorer and utility return valid numbers.\"\n )\n status = self._check(result)\n if self.modify_result: # FIXME: this is not nice\n result._status = status\n return status\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxTime","title":"MaxTime","text":"MaxTime(seconds: Optional[float], modify_result: bool = True)\n
Bases: StoppingCriterion
Terminate if the computation time exceeds the given number of seconds.
Checks the elapsed time since construction
PARAMETER DESCRIPTION seconds
Threshold: The computation is terminated if the elapsed time between object construction and a _check exceeds this value. If None
, no _check is performed, effectively creating a (never) stopping criterion that always returns Pending
.
TYPE: Optional[float]
Source code in src/pydvl/value/stopping.py
def __init__(self, seconds: Optional[float], modify_result: bool = True):\n super().__init__(modify_result=modify_result)\n self.max_seconds = seconds or np.inf\n if self.max_seconds <= 0:\n raise ValueError(\"Number of seconds for MaxTime must be positive or None\")\n self.start = time()\n
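A common pattern, sketched below with placeholder numbers, is to demand a minimum amount of work per index but cap the wall-clock time:

from pydvl.value import MaxTime, MinUpdates

# Stop when every index has been updated at least 1000 times,
# or after 30 minutes, whichever happens first.
done = MinUpdates(1000) | MaxTime(1800)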
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxTime.converged","title":"converged property
","text":"converged: NDArray[bool_]\n
Returns a boolean array indicating whether the values have converged for each data point.
Inheriting classes must set the _converged
attribute in their check()
.
RETURNS DESCRIPTION NDArray[bool_]
A boolean array indicating whether the values have converged for each data point.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.MaxTime.__call__","title":"__call__","text":"__call__(result: ValuationResult) -> Status\n
Calls check()
, maybe updating the result.
Source code in src/pydvl/value/stopping.py
def __call__(self, result: ValuationResult) -> Status:\n \"\"\"Calls `check()`, maybe updating the result.\"\"\"\n if len(result) == 0:\n logger.warning(\n \"At least one iteration finished but no results where generated. \"\n \"Please check that your scorer and utility return valid numbers.\"\n )\n status = self._check(result)\n if self.modify_result: # FIXME: this is not nice\n result._status = status\n return status\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.HistoryDeviation","title":"HistoryDeviation","text":"HistoryDeviation(\n n_steps: int,\n rtol: float,\n pin_converged: bool = True,\n modify_result: bool = True,\n)\n
Bases: StoppingCriterion
A simple check for relative distance to a previous step in the computation.
The method used by (Ghorbani and Zou, 2019)1 computes the relative distances between the current values \\(v_i^t\\) and the values at the previous checkpoint \\(v_i^{t-\\tau}\\). If the sum is below a given threshold, the computation is terminated.
\\[\\sum_{i=1}^n \\frac{\\left| v_i^t - v_i^{t-\\tau} \\right|}{v_i^t} < \\epsilon.\\] When the denominator is zero, the summand is set to the value of \\(v_i^{ t-\\tau}\\).
This implementation is slightly generalised to allow for different numbers of updates to individual indices, as happens with powerset samplers instead of permutations. Every subset of indices that is found to converge can be pinned to that state. Once all indices have converged, the method has converged.
Warning
This criterion is meant for the reproduction of the results in the paper, but we do not recommend using it in practice.
PARAMETER DESCRIPTION n_steps
Checkpoint values every so many updates and use these saved values to compare.
TYPE: int
rtol
Relative tolerance for convergence (\\(\\epsilon\\) in the formula).
TYPE: float
pin_converged
If True
, once an index has converged, it is pinned
TYPE: bool
DEFAULT: True
Source code in src/pydvl/value/stopping.py
def __init__(\n self,\n n_steps: int,\n rtol: float,\n pin_converged: bool = True,\n modify_result: bool = True,\n):\n super().__init__(modify_result=modify_result)\n if n_steps < 1:\n raise ValueError(\"n_steps must be at least 1\")\n if rtol <= 0 or rtol >= 1:\n raise ValueError(\"rtol must be in (0, 1)\")\n\n self.n_steps = n_steps\n self.rtol = rtol\n self.update_op = np.logical_or if pin_converged else np.logical_and\n self._memory = None # type: ignore\n
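A minimal sketch; the checkpoint interval and tolerance below are placeholders rather than recommended settings.

from pydvl.value import HistoryDeviation, MaxUpdates

# Compare values to a checkpoint taken 100 updates earlier and stop when the
# summed relative change drops below 5%, with a hard cap on updates.
done = HistoryDeviation(n_steps=100, rtol=0.05) | MaxUpdates(10_000)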
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.HistoryDeviation.converged","title":"converged property
","text":"converged: NDArray[bool_]\n
Returns a boolean array indicating whether the values have converged for each data point.
Inheriting classes must set the _converged
attribute in their check()
.
RETURNS DESCRIPTION NDArray[bool_]
A boolean array indicating whether the values have converged for each data point.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.HistoryDeviation.completion","title":"completion","text":"completion() -> float\n
Returns a value between 0 and 1 indicating the completion of the computation.
Source code in src/pydvl/value/stopping.py
def completion(self) -> float:\n \"\"\"Returns a value between 0 and 1 indicating the completion of the\n computation.\n \"\"\"\n if self.converged.size == 0:\n return 0.0\n return float(np.mean(self.converged).item())\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.HistoryDeviation.__call__","title":"__call__","text":"__call__(result: ValuationResult) -> Status\n
Calls check()
, maybe updating the result.
Source code in src/pydvl/value/stopping.py
def __call__(self, result: ValuationResult) -> Status:\n \"\"\"Calls `check()`, maybe updating the result.\"\"\"\n if len(result) == 0:\n logger.warning(\n \"At least one iteration finished but no results where generated. \"\n \"Please check that your scorer and utility return valid numbers.\"\n )\n status = self._check(result)\n if self.modify_result: # FIXME: this is not nice\n result._status = status\n return status\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.RankCorrelation","title":"RankCorrelation","text":"RankCorrelation(rtol: float, burn_in: int, modify_result: bool = True)\n
Bases: StoppingCriterion
A check for stability of Spearman correlation between checks.
When the change in rank correlation between two successive iterations is below a given threshold, the computation is terminated. The criterion computes the Spearman correlation between two successive iterations. The Spearman correlation uses the ordering indices of the given values and correlates them. This means it focuses on the order of the elements instead of their exact values. If the order stops changing (meaning the Banzhaf semivalues estimates converge), the criterion stops the algorithm.
This criterion is used in (Wang et al.)2.
PARAMETER DESCRIPTION rtol
Relative tolerance for convergence (\\(\\epsilon\\) in the formula)
TYPE: float
modify_result
If True
, the status of the input ValuationResult is modified in place after the call.
TYPE: bool
DEFAULT: True
burn_in
The minimum number of iterations before checking for convergence. This is required because the first correlation is meaningless.
TYPE: int
Added in 0.9.0
Source code in src/pydvl/value/stopping.py
def __init__(\n self,\n rtol: float,\n burn_in: int,\n modify_result: bool = True,\n):\n super().__init__(modify_result=modify_result)\n if rtol <= 0 or rtol >= 1:\n raise ValueError(\"rtol must be in (0, 1)\")\n self.rtol = rtol\n self.burn_in = burn_in\n self._memory: NDArray[np.float64] | None = None\n self._corr = 0.0\n self._completion = 0.0\n self._iterations = 0\n
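A short sketch with placeholder values; note that burn_in has no default and must be given.

from pydvl.value import MaxUpdates
from pydvl.value.stopping import RankCorrelation

# Stop once the ranking of values is stable between checks (change in Spearman
# correlation below 1e-4), ignoring the first 32 checks, and cap the total updates.
done = RankCorrelation(rtol=1e-4, burn_in=32) | MaxUpdates(10_000)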
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.RankCorrelation.converged","title":"converged property
","text":"converged: NDArray[bool_]\n
Returns a boolean array indicating whether the values have converged for each data point.
Inheriting classes must set the _converged
attribute in their check()
.
RETURNS DESCRIPTION NDArray[bool_]
A boolean array indicating whether the values have converged for each data point.
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.RankCorrelation.__call__","title":"__call__","text":"__call__(result: ValuationResult) -> Status\n
Calls check()
, maybe updating the result.
Source code in src/pydvl/value/stopping.py
def __call__(self, result: ValuationResult) -> Status:\n \"\"\"Calls `check()`, maybe updating the result.\"\"\"\n if len(result) == 0:\n logger.warning(\n \"At least one iteration finished but no results where generated. \"\n \"Please check that your scorer and utility return valid numbers.\"\n )\n status = self._check(result)\n if self.modify_result: # FIXME: this is not nice\n result._status = status\n return status\n
"},{"location":"api/pydvl/value/stopping/#pydvl.value.stopping.make_criterion","title":"make_criterion","text":"make_criterion(\n fun: StoppingCriterionCallable,\n converged: Callable[[], NDArray[bool_]] | None = None,\n completion: Callable[[], float] | None = None,\n name: str | None = None,\n) -> Type[StoppingCriterion]\n
Create a new StoppingCriterion from a function. Use this to enable simple functions to be composed with bitwise operators.
PARAMETER DESCRIPTION fun
The callable to wrap.
TYPE: StoppingCriterionCallable
converged
A callable that returns a boolean array indicating what values have converged.
TYPE: Callable[[], NDArray[bool_]] | None
DEFAULT: None
completion
A callable that returns a value between 0 and 1 indicating the rate of completion of the computation. If not provided, the fraction of converged values is used.
TYPE: Callable[[], float] | None
DEFAULT: None
name
The name of the new criterion. If None
, the __name__
of the function is used.
TYPE: str | None
DEFAULT: None
RETURNS DESCRIPTION Type[StoppingCriterion]
A new subclass of StoppingCriterion.
Source code in src/pydvl/value/stopping.py
def make_criterion(\n fun: StoppingCriterionCallable,\n converged: Callable[[], NDArray[np.bool_]] | None = None,\n completion: Callable[[], float] | None = None,\n name: str | None = None,\n) -> Type[StoppingCriterion]:\n \"\"\"Create a new [StoppingCriterion][pydvl.value.stopping.StoppingCriterion] from a function.\n Use this to enable simpler functions to be composed with bitwise operators\n\n Args:\n fun: The callable to wrap.\n converged: A callable that returns a boolean array indicating what\n values have converged.\n completion: A callable that returns a value between 0 and 1 indicating\n the rate of completion of the computation. If not provided, the fraction\n of converged values is used.\n name: The name of the new criterion. If `None`, the `__name__` of\n the function is used.\n\n Returns:\n A new subclass of [StoppingCriterion][pydvl.value.stopping.StoppingCriterion].\n \"\"\"\n\n class WrappedCriterion(StoppingCriterion):\n def __init__(self, modify_result: bool = True):\n super().__init__(modify_result=modify_result)\n self._name = name or getattr(fun, \"__name__\", \"WrappedCriterion\")\n\n def _check(self, result: ValuationResult) -> Status:\n return fun(result)\n\n @property\n def converged(self) -> NDArray[np.bool_]:\n if converged is None:\n return super().converged\n return converged()\n\n def __str__(self):\n return self._name\n\n def completion(self) -> float:\n if completion is None:\n return super().completion()\n return completion()\n\n return WrappedCriterion\n
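As a sketch, wrapping a plain function (here a hypothetical check on the mean of the current estimates) yields a class that composes like the built-in criteria:

from pydvl.utils.status import Status
from pydvl.value import MaxUpdates, ValuationResult
from pydvl.value.stopping import make_criterion

def mean_is_positive(result: ValuationResult) -> Status:
    # Purely illustrative condition on the current estimates.
    return Status.Converged if result.values.mean() > 0 else Status.Pending

MeanIsPositive = make_criterion(mean_is_positive, name="MeanIsPositive")
done = MeanIsPositive() | MaxUpdates(1000)  # composable like any StoppingCriterion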
"},{"location":"api/pydvl/value/least_core/","title":"Least core","text":""},{"location":"api/pydvl/value/least_core/#pydvl.value.least_core","title":"pydvl.value.least_core","text":"New in version 0.4.0
This package holds all routines for the computation of Least Core data values.
Please refer to Data valuation for an overview.
In addition to the standard interface via compute_least_core_values(), because computing the Least Core values requires the solution of a linear and a quadratic problem after computing all the utility values, there is the possibility of performing each step separately. This is useful when running multiple experiments: use lc_prepare_problem() or mclc_prepare_problem() to prepare a list of problems to solve, then solve them in parallel with lc_solve_problems().
Note that mclc_prepare_problem() is parallelized itself, so preparing the problems should be done in sequence in this case. The solution of the linear systems can then be done in parallel.
"},{"location":"api/pydvl/value/least_core/#pydvl.value.least_core.LeastCoreMode","title":"LeastCoreMode","text":" Bases: Enum
Available Least Core algorithms.
"},{"location":"api/pydvl/value/least_core/#pydvl.value.least_core.compute_least_core_values","title":"compute_least_core_values","text":"compute_least_core_values(\n u: Utility,\n *,\n n_jobs: int = 1,\n n_iterations: Optional[int] = None,\n mode: LeastCoreMode = LeastCoreMode.MonteCarlo,\n non_negative_subsidy: bool = False,\n solver_options: Optional[dict] = None,\n progress: bool = False,\n **kwargs\n) -> ValuationResult\n
Umbrella method to compute Least Core values with any of the available algorithms.
See Data valuation for an overview.
The following algorithms are available. Note that the exact method can only work with very small datasets and is thus intended only for testing.
- exact: uses the complete powerset of the training set for the constraints. Implemented in exact_least_core().
- montecarlo: uses the approximate Monte Carlo Least Core algorithm. Implemented in montecarlo_least_core().
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
n_jobs
Number of jobs to run in parallel. Only used for Monte Carlo Least Core.
TYPE: int
DEFAULT: 1
n_iterations
Number of subsets to sample and evaluate the utility on. Only used for Monte Carlo Least Core.
TYPE: Optional[int]
DEFAULT: None
mode
Algorithm to use. See LeastCoreMode for available options.
TYPE: LeastCoreMode
DEFAULT: MonteCarlo
non_negative_subsidy
If True, the least core subsidy \\(e\\) is constrained to be non-negative.
TYPE: bool
DEFAULT: False
solver_options
Optional dictionary of options passed to the solvers.
TYPE: Optional[dict]
DEFAULT: None
RETURNS DESCRIPTION ValuationResult
Object with the computed values.
New in version 0.5.0
Source code in src/pydvl/value/least_core/__init__.py
def compute_least_core_values(\n u: Utility,\n *,\n n_jobs: int = 1,\n n_iterations: Optional[int] = None,\n mode: LeastCoreMode = LeastCoreMode.MonteCarlo,\n non_negative_subsidy: bool = False,\n solver_options: Optional[dict] = None,\n progress: bool = False,\n **kwargs,\n) -> ValuationResult:\n \"\"\"Umbrella method to compute Least Core values with any of the available\n algorithms.\n\n See [Data valuation][data-valuation] for an overview.\n\n The following algorithms are available. Note that the exact method can only\n work with very small datasets and is thus intended only for testing.\n\n - `exact`: uses the complete powerset of the training set for the constraints\n [combinatorial_exact_shapley()][pydvl.value.shapley.naive.combinatorial_exact_shapley].\n - `montecarlo`: uses the approximate Monte Carlo Least Core algorithm.\n Implemented in [montecarlo_least_core()][pydvl.value.least_core.montecarlo.montecarlo_least_core].\n\n Args:\n u: Utility object with model, data, and scoring function\n n_jobs: Number of jobs to run in parallel. Only used for Monte Carlo\n Least Core.\n n_iterations: Number of subsets to sample and evaluate the utility on.\n Only used for Monte Carlo Least Core.\n mode: Algorithm to use. See\n [LeastCoreMode][pydvl.value.least_core.LeastCoreMode] for available\n options.\n non_negative_subsidy: If True, the least core subsidy $e$ is constrained\n to be non-negative.\n solver_options: Optional dictionary of options passed to the solvers.\n\n Returns:\n Object with the computed values.\n\n !!! tip \"New in version 0.5.0\"\n \"\"\"\n\n if mode == LeastCoreMode.MonteCarlo:\n # TODO fix progress showing in remote case\n progress = False\n if n_iterations is None:\n raise ValueError(\"n_iterations cannot be None for Monte Carlo Least Core\")\n return montecarlo_least_core( # type: ignore\n u=u,\n n_iterations=n_iterations,\n n_jobs=n_jobs,\n progress=progress,\n non_negative_subsidy=non_negative_subsidy,\n solver_options=solver_options,\n **kwargs,\n )\n elif mode == LeastCoreMode.Exact:\n return exact_least_core(\n u=u,\n progress=progress,\n non_negative_subsidy=non_negative_subsidy,\n solver_options=solver_options,\n )\n\n raise ValueError(f\"Invalid value encountered in {mode=}\")\n
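A minimal, hypothetical usage sketch; the model, dataset and iteration budget are placeholders, and the quadratic program is solved with the default solver options.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Utility
from pydvl.value.least_core import LeastCoreMode, compute_least_core_values

utility = Utility(LogisticRegression(max_iter=500), Dataset.from_sklearn(load_iris()))

# Monte Carlo Least Core with a placeholder budget of utility evaluations.
values = compute_least_core_values(
    utility,
    mode=LeastCoreMode.MonteCarlo,
    n_iterations=5000,
    n_jobs=4,
)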
"},{"location":"api/pydvl/value/least_core/common/","title":"Common","text":""},{"location":"api/pydvl/value/least_core/common/#pydvl.value.least_core.common","title":"pydvl.value.least_core.common","text":""},{"location":"api/pydvl/value/least_core/common/#pydvl.value.least_core.common.lc_solve_problem","title":"lc_solve_problem","text":"lc_solve_problem(\n problem: LeastCoreProblem,\n *,\n u: Utility,\n algorithm: str,\n non_negative_subsidy: bool = False,\n solver_options: Optional[dict] = None\n) -> ValuationResult\n
Solves a linear problem as prepared by mclc_prepare_problem(). Useful for parallel execution of multiple experiments by running this as a remote task.
See exact_least_core() or montecarlo_least_core() for argument descriptions.
Source code in src/pydvl/value/least_core/common.py
def lc_solve_problem(\n problem: LeastCoreProblem,\n *,\n u: Utility,\n algorithm: str,\n non_negative_subsidy: bool = False,\n solver_options: Optional[dict] = None,\n) -> ValuationResult:\n \"\"\"Solves a linear problem as prepared by\n [mclc_prepare_problem()][pydvl.value.least_core.montecarlo.mclc_prepare_problem].\n Useful for parallel execution of multiple experiments by running this as a\n remote task.\n\n See [exact_least_core()][pydvl.value.least_core.naive.exact_least_core] or\n [montecarlo_least_core()][pydvl.value.least_core.montecarlo.montecarlo_least_core] for\n argument descriptions.\n \"\"\"\n n = len(u.data)\n\n if np.any(np.isnan(problem.utility_values)):\n warnings.warn(\n f\"Calculation returned \"\n f\"{np.sum(np.isnan(problem.utility_values))} NaN \"\n f\"values out of {problem.utility_values.size}\",\n RuntimeWarning,\n )\n\n if solver_options is None:\n solver_options = {}\n\n if \"solver\" not in solver_options:\n solver_options[\"solver\"] = cp.SCS\n\n if \"max_iters\" not in solver_options and solver_options[\"solver\"] == cp.SCS:\n solver_options[\"max_iters\"] = 10000\n\n logger.debug(\"Removing possible duplicate values in lower bound array\")\n b_lb = problem.utility_values\n A_lb, unique_indices = np.unique(problem.A_lb, return_index=True, axis=0)\n b_lb = b_lb[unique_indices]\n\n logger.debug(\"Building equality constraint\")\n A_eq = np.ones((1, n))\n # We might have already computed the total utility one or more times.\n # This is the index of the row(s) in A_lb with all ones.\n total_utility_indices = np.where(A_lb.sum(axis=1) == n)[0]\n if len(total_utility_indices) == 0:\n b_eq = np.array([u(u.data.indices)])\n else:\n b_eq = b_lb[total_utility_indices]\n # Remove the row(s) corresponding to the total utility\n # from the lower bound constraints\n # because given the equality constraint\n # it is the same as using the constraint e >= 0\n # (i.e. setting non_negative_subsidy = True).\n mask: NDArray[np.bool_] = np.ones_like(b_lb, dtype=bool)\n mask[total_utility_indices] = False\n b_lb = b_lb[mask]\n A_lb = A_lb[mask]\n\n # Remove the row(s) corresponding to the empty subset\n # because, given u(\u2205) = (which is almost always the case,\n # it is the same as using the constraint e >= 0\n # (i.e. setting non_negative_subsidy = True).\n emptyset_utility_indices = np.where(A_lb.sum(axis=1) == 0)[0]\n if len(emptyset_utility_indices) > 0:\n mask = np.ones_like(b_lb, dtype=bool)\n mask[emptyset_utility_indices] = False\n b_lb = b_lb[mask]\n A_lb = A_lb[mask]\n\n _, subsidy = _solve_least_core_linear_program(\n A_eq=A_eq,\n b_eq=b_eq,\n A_lb=A_lb,\n b_lb=b_lb,\n non_negative_subsidy=non_negative_subsidy,\n solver_options=solver_options,\n )\n\n values: Optional[NDArray[np.float64]]\n\n if subsidy is None:\n logger.debug(\"No values were found\")\n status = Status.Failed\n values = np.empty(n)\n values[:] = np.nan\n subsidy = np.nan\n else:\n values = _solve_egalitarian_least_core_quadratic_program(\n subsidy,\n A_eq=A_eq,\n b_eq=b_eq,\n A_lb=A_lb,\n b_lb=b_lb,\n solver_options=solver_options,\n )\n\n if values is None:\n logger.debug(\"No values were found\")\n status = Status.Failed\n values = np.empty(n)\n values[:] = np.nan\n subsidy = np.nan\n else:\n status = Status.Converged\n\n return ValuationResult(\n algorithm=algorithm,\n status=status,\n values=values,\n subsidy=subsidy,\n stderr=None,\n data_names=u.data.data_names,\n )\n
"},{"location":"api/pydvl/value/least_core/common/#pydvl.value.least_core.common.lc_solve_problems","title":"lc_solve_problems","text":"lc_solve_problems(\n problems: Sequence[LeastCoreProblem],\n u: Utility,\n algorithm: str,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n n_jobs: int = 1,\n non_negative_subsidy: bool = True,\n solver_options: Optional[dict] = None,\n **options\n) -> List[ValuationResult]\n
Solves a list of linear problems in parallel.
PARAMETER DESCRIPTION u
Utility.
TYPE: Utility
problems
Least Core problems to solve, as returned by mclc_prepare_problem().
TYPE: Sequence[LeastCoreProblem]
algorithm
Name of the valuation algorithm.
TYPE: str
parallel_backend
Parallel backend instance to use for parallelizing computations. If None, the JoblibParallelBackend is used. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
n_jobs
Number of parallel jobs to run.
TYPE: int
DEFAULT: 1
non_negative_subsidy
If True, the least core subsidy \\(e\\) is constrained to be non-negative.
TYPE: bool
DEFAULT: True
solver_options
Additional options to pass to the solver.
TYPE: Optional[dict]
DEFAULT: None
RETURNS DESCRIPTION List[ValuationResult]
List of solutions.
Source code in src/pydvl/value/least_core/common.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef lc_solve_problems(\n problems: Sequence[LeastCoreProblem],\n u: Utility,\n algorithm: str,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n n_jobs: int = 1,\n non_negative_subsidy: bool = True,\n solver_options: Optional[dict] = None,\n **options,\n) -> List[ValuationResult]:\n \"\"\"Solves a list of linear problems in parallel.\n\n Args:\n u: Utility.\n problems: Least Core problems to solve, as returned by\n [mclc_prepare_problem()][pydvl.value.least_core.montecarlo.mclc_prepare_problem].\n algorithm: Name of the valuation algorithm.\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n n_jobs: Number of parallel jobs to run.\n non_negative_subsidy: If True, the least core subsidy $e$ is constrained\n to be non-negative.\n solver_options: Additional options to pass to the solver.\n\n Returns:\n List of solutions.\n \"\"\"\n\n def _map_func(\n problems: List[LeastCoreProblem], *args, **kwargs\n ) -> List[ValuationResult]:\n return [lc_solve_problem(p, *args, **kwargs) for p in problems]\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n\n map_reduce_job: MapReduceJob[\n \"LeastCoreProblem\", \"List[ValuationResult]\"\n ] = MapReduceJob(\n inputs=problems,\n map_func=_map_func,\n map_kwargs=dict(\n u=u,\n algorithm=algorithm,\n non_negative_subsidy=non_negative_subsidy,\n solver_options=solver_options,\n **options,\n ),\n reduce_func=lambda x: list(itertools.chain(*x)),\n parallel_backend=parallel_backend,\n n_jobs=n_jobs,\n )\n solutions = map_reduce_job()\n\n return solutions\n
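The prepare/solve split is intended for running several Monte Carlo experiments and then solving the resulting linear programs in one batch. Below is a minimal sketch of that pattern; the dataset, model and all parameter values are illustrative assumptions, not prescribed by the library.

```python
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Utility
from pydvl.value.least_core.common import lc_solve_problems
from pydvl.value.least_core.montecarlo import mclc_prepare_problem

# Hypothetical toy utility, for illustration only: a small classification
# dataset and model. Adapt the construction to your own data and scorer.
data = Dataset.from_sklearn(load_iris(), train_size=0.8)
u = Utility(LogisticRegression(max_iter=500), data)

# Prepare several independent Monte Carlo least-core problems...
problems = [mclc_prepare_problem(u, 200, seed=s) for s in range(5)]

# ...and solve them in one parallel batch. Each entry is a ValuationResult.
results = lc_solve_problems(problems, u=u, algorithm="montecarlo_least_core", n_jobs=2)
for r in results:
    print(r.values)
```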
"},{"location":"api/pydvl/value/least_core/montecarlo/","title":"Montecarlo","text":""},{"location":"api/pydvl/value/least_core/montecarlo/#pydvl.value.least_core.montecarlo","title":"pydvl.value.least_core.montecarlo","text":""},{"location":"api/pydvl/value/least_core/montecarlo/#pydvl.value.least_core.montecarlo.montecarlo_least_core","title":"montecarlo_least_core","text":"montecarlo_least_core(\n u: Utility,\n n_iterations: int,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n non_negative_subsidy: bool = False,\n solver_options: Optional[dict] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Computes approximate Least Core values using a Monte Carlo approach.
\\[ \\begin{array}{lll} \\text{minimize} & \\displaystyle{e} & \\\\ \\text{subject to} & \\displaystyle\\sum_{i\\in N} x_{i} = v(N) & \\\\ & \\displaystyle\\sum_{i\\in S} x_{i} + e \\geq v(S) & , \\forall S \\in \\{S_1, S_2, \\dots, S_m \\overset{\\mathrm{iid}}{\\sim} U(2^N) \\} \\end{array} \\] Where:
- \\(U(2^N)\\) is the uniform distribution over the powerset of \\(N\\).
- \\(m\\) is the number of subsets that will be sampled and whose utility will be computed and used to compute the data values.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
n_iterations
total number of iterations to use
TYPE: int
n_jobs
number of jobs across which to distribute the computation
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None, the JoblibParallelBackend is used. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
non_negative_subsidy
If True, the least core subsidy \\(e\\) is constrained to be non-negative.
TYPE: bool
DEFAULT: False
solver_options
Dictionary of options that will be used to select a solver and to configure it. Refer to cvxpy's documentation for all possible options.
TYPE: Optional[dict]
DEFAULT: None
progress
If True, shows a tqdm progress bar
TYPE: bool
DEFAULT: False
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION ValuationResult
Object with the data values and the least core value.
Changed in version 0.9.0
Deprecated the config argument and added a parallel_backend argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/least_core/montecarlo.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef montecarlo_least_core(\n u: Utility,\n n_iterations: int,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n non_negative_subsidy: bool = False,\n solver_options: Optional[dict] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> ValuationResult:\n r\"\"\"Computes approximate Least Core values using a Monte Carlo approach.\n\n $$\n \\begin{array}{lll}\n \\text{minimize} & \\displaystyle{e} & \\\\\n \\text{subject to} & \\displaystyle\\sum_{i\\in N} x_{i} = v(N) & \\\\\n & \\displaystyle\\sum_{i\\in S} x_{i} + e \\geq v(S) & ,\n \\forall S \\in \\{S_1, S_2, \\dots, S_m \\overset{\\mathrm{iid}}{\\sim} U(2^N) \\}\n \\end{array}\n $$\n\n Where:\n\n * $U(2^N)$ is the uniform distribution over the powerset of $N$.\n * $m$ is the number of subsets that will be sampled and whose utility will\n be computed and used to compute the data values.\n\n Args:\n u: Utility object with model, data, and scoring function\n n_iterations: total number of iterations to use\n n_jobs: number of jobs across which to distribute the computation\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n non_negative_subsidy: If True, the least core subsidy $e$ is constrained\n to be non-negative.\n solver_options: Dictionary of options that will be used to select a solver\n and to configure it. Refer to [cvxpy's\n documentation](https://www.cvxpy.org/tutorial/advanced/index.html#setting-solver-options)\n for all possible options.\n progress: If True, shows a tqdm progress bar\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n Object with the data values and the least core value.\n\n !!! tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n problem = mclc_prepare_problem(\n u,\n n_iterations,\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n config=config,\n progress=progress,\n seed=seed,\n )\n return lc_solve_problem(\n problem,\n u=u,\n algorithm=\"montecarlo_least_core\",\n non_negative_subsidy=non_negative_subsidy,\n solver_options=solver_options,\n )\n
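For the common single-experiment case, a minimal usage sketch follows. The toy Utility and all parameter choices are assumptions for illustration; adapt the model, data and solver options to your setting.

```python
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Utility
from pydvl.value.least_core.montecarlo import montecarlo_least_core

# Illustrative toy utility.
data = Dataset.from_sklearn(load_iris(), train_size=0.8)
u = Utility(LogisticRegression(max_iter=500), data)

result = montecarlo_least_core(u, n_iterations=500, n_jobs=4, progress=True, seed=42)
print(result.values)  # one value per training point
print(result.status)  # Converged or Failed, depending on the solver
```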
"},{"location":"api/pydvl/value/least_core/montecarlo/#pydvl.value.least_core.montecarlo.mclc_prepare_problem","title":"mclc_prepare_problem","text":"mclc_prepare_problem(\n u: Utility,\n n_iterations: int,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> LeastCoreProblem\n
Prepares a linear problem by sampling subsets of the data. Use this to separate the problem preparation from the solving with lc_solve_problem(). Useful for parallel execution of multiple experiments.
See montecarlo_least_core for argument descriptions.
Changed in version 0.9.0
Deprecated the config argument and added a parallel_backend argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/least_core/montecarlo.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef mclc_prepare_problem(\n u: Utility,\n n_iterations: int,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> LeastCoreProblem:\n \"\"\"Prepares a linear problem by sampling subsets of the data. Use this to\n separate the problem preparation from the solving with\n [lc_solve_problem()][pydvl.value.least_core.common.lc_solve_problem]. Useful\n for parallel execution of multiple experiments.\n\n See\n [montecarlo_least_core][pydvl.value.least_core.montecarlo.montecarlo_least_core]\n for argument descriptions.\n\n !!! note \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n n = len(u.data)\n\n if n_iterations < n:\n warnings.warn(\n f\"Number of iterations '{n_iterations}' is smaller the size of the dataset '{n}'. \"\n f\"This is not optimal because in the worst case we need at least '{n}' constraints \"\n \"to satisfy the individual rationality condition.\"\n )\n\n if n_iterations > 2**n:\n warnings.warn(\n f\"Passed n_iterations is greater than the number subsets! \"\n f\"Setting it to 2^{n}\",\n RuntimeWarning,\n )\n n_iterations = 2**n\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n\n iterations_per_job = max(\n 1, n_iterations // parallel_backend.effective_n_jobs(n_jobs)\n )\n\n map_reduce_job: MapReduceJob[\"Utility\", \"LeastCoreProblem\"] = MapReduceJob(\n inputs=u,\n map_func=_montecarlo_least_core,\n reduce_func=_reduce_func,\n map_kwargs=dict(n_iterations=iterations_per_job, progress=progress),\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n )\n\n return map_reduce_job(seed=seed)\n
"},{"location":"api/pydvl/value/least_core/naive/","title":"Naive","text":""},{"location":"api/pydvl/value/least_core/naive/#pydvl.value.least_core.naive","title":"pydvl.value.least_core.naive","text":""},{"location":"api/pydvl/value/least_core/naive/#pydvl.value.least_core.naive.exact_least_core","title":"exact_least_core","text":"exact_least_core(\n u: Utility,\n *,\n non_negative_subsidy: bool = False,\n solver_options: Optional[dict] = None,\n progress: bool = True\n) -> ValuationResult\n
Computes the exact Least Core values.
Note
If the training set contains more than 20 instances, a warning is printed because the computation is very expensive. This method is mostly used for internal testing and simple use cases. Please refer to the Monte Carlo method for practical applications.
The least core is the solution to the following Linear Programming problem:
\\[ \\begin{array}{lll} \\text{minimize} & \\displaystyle{e} & \\\\ \\text{subject to} & \\displaystyle\\sum_{i\\in N} x_{i} = v(N) & \\\\ & \\displaystyle\\sum_{i\\in S} x_{i} + e \\geq v(S) &, \\forall S \\subseteq N \\\\ \\end{array} \\] Where \\(N = \\{1, 2, \\dots, n\\}\\) are the training set's indices.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
non_negative_subsidy
If True, the least core subsidy \\(e\\) is constrained to be non-negative.
TYPE: bool
DEFAULT: False
solver_options
Dictionary of options that will be used to select a solver and to configure it. Refer to cvxpy's documentation for all possible options.
TYPE: Optional[dict]
DEFAULT: None
progress
If True, shows a tqdm progress bar
TYPE: bool
DEFAULT: True
RETURNS DESCRIPTION ValuationResult
Object with the data values and the least core value.
Source code in src/pydvl/value/least_core/naive.py
def exact_least_core(\n u: Utility,\n *,\n non_negative_subsidy: bool = False,\n solver_options: Optional[dict] = None,\n progress: bool = True,\n) -> ValuationResult:\n r\"\"\"Computes the exact Least Core values.\n\n !!! Note\n If the training set contains more than 20 instances a warning is printed\n because the computation is very expensive. This method is mostly used for\n internal testing and simple use cases. Please refer to the\n [Monte Carlo method][pydvl.value.least_core.montecarlo.montecarlo_least_core]\n for practical applications.\n\n The least core is the solution to the following Linear Programming problem:\n\n $$\n \\begin{array}{lll}\n \\text{minimize} & \\displaystyle{e} & \\\\\n \\text{subject to} & \\displaystyle\\sum_{i\\in N} x_{i} = v(N) & \\\\\n & \\displaystyle\\sum_{i\\in S} x_{i} + e \\geq v(S) &, \\forall S \\subseteq N \\\\\n \\end{array}\n $$\n\n Where $N = \\{1, 2, \\dots, n\\}$ are the training set's indices.\n\n Args:\n u: Utility object with model, data, and scoring function\n non_negative_subsidy: If True, the least core subsidy $e$ is constrained\n to be non-negative.\n solver_options: Dictionary of options that will be used to select a solver\n and to configure it. Refer to the [cvxpy's\n documentation](https://www.cvxpy.org/tutorial/advanced/index.html#setting-solver-options)\n for all possible options.\n progress: If True, shows a tqdm progress bar\n\n Returns:\n Object with the data values and the least core value.\n \"\"\"\n n = len(u.data)\n if n > 20: # Arbitrary choice, will depend on time required, caching, etc.\n warnings.warn(f\"Large dataset! Computation requires 2^{n} calls to model.fit()\")\n\n problem = lc_prepare_problem(u, progress=progress)\n return lc_solve_problem(\n problem=problem,\n u=u,\n algorithm=\"exact_least_core\",\n non_negative_subsidy=non_negative_subsidy,\n solver_options=solver_options,\n )\n
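Since the exact method enumerates all 2^n subsets, it is only feasible for very small training sets. The following is a minimal sketch on a tiny synthetic dataset; the data, model and sizes are illustrative assumptions only.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Utility
from pydvl.value.least_core.naive import exact_least_core

# Tiny synthetic dataset: with 8 training points there are 2^8 = 256 subsets.
X = np.arange(16, dtype=float).reshape(-1, 1)
y = (X.ravel() > 7).astype(int)
data = Dataset.from_arrays(X, y, train_size=0.5)
u = Utility(LogisticRegression(), data)

result = exact_least_core(u, non_negative_subsidy=True)
print(result.values)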
"},{"location":"api/pydvl/value/least_core/naive/#pydvl.value.least_core.naive.lc_prepare_problem","title":"lc_prepare_problem","text":"lc_prepare_problem(u: Utility, progress: bool = False) -> LeastCoreProblem\n
Prepares a linear problem with all subsets of the data. Use this to separate the problem preparation from the solving with lc_solve_problem(). Useful for parallel execution of multiple experiments.
See exact_least_core() for argument descriptions.
Source code in src/pydvl/value/least_core/naive.py
def lc_prepare_problem(u: Utility, progress: bool = False) -> LeastCoreProblem:\n \"\"\"Prepares a linear problem with all subsets of the data\n Use this to separate the problem preparation from the solving with\n [lc_solve_problem()][pydvl.value.least_core.common.lc_solve_problem]. Useful for\n parallel execution of multiple experiments.\n\n See [exact_least_core()][pydvl.value.least_core.naive.exact_least_core] for argument\n descriptions.\n \"\"\"\n n = len(u.data)\n\n logger.debug(\"Building vectors and matrices for linear programming problem\")\n powerset_size = 2**n\n A_lb = np.zeros((powerset_size, n))\n\n logger.debug(\"Iterating over all subsets\")\n utility_values = np.zeros(powerset_size)\n for i, subset in enumerate( # type: ignore\n tqdm(\n powerset(u.data.indices),\n disable=not progress,\n total=powerset_size - 1,\n position=0,\n )\n ):\n indices: NDArray[np.bool_] = np.zeros(n, dtype=bool)\n indices[list(subset)] = True\n A_lb[i, indices] = 1\n utility_values[i] = u(subset) # type: ignore\n\n return LeastCoreProblem(utility_values, A_lb)\n
"},{"location":"api/pydvl/value/loo/","title":"Loo","text":""},{"location":"api/pydvl/value/loo/#pydvl.value.loo","title":"pydvl.value.loo","text":""},{"location":"api/pydvl/value/loo/loo/","title":"Loo","text":""},{"location":"api/pydvl/value/loo/loo/#pydvl.value.loo.loo","title":"pydvl.value.loo.loo","text":""},{"location":"api/pydvl/value/loo/loo/#pydvl.value.loo.loo.compute_loo","title":"compute_loo","text":"compute_loo(\n u: Utility,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = True\n) -> ValuationResult\n
Computes the leave-one-out value:
\\[v(i) = u(D) - u(D \\setminus \\{i\\}) \\] PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
progress
If True, display a progress bar
TYPE: bool
DEFAULT: True
n_jobs
Number of parallel jobs to use
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None, the JoblibParallelBackend is used. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
RETURNS DESCRIPTION ValuationResult
Object with the data values.
New in version 0.7.0
Renamed from naive_loo and added parallel computation.
Changed in version 0.9.0
Deprecated the config argument and added a parallel_backend argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/loo/loo.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef compute_loo(\n u: Utility,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = True,\n) -> ValuationResult:\n r\"\"\"Computes leave one out value:\n\n $$v(i) = u(D) - u(D \\setminus \\{i\\}) $$\n\n Args:\n u: Utility object with model, data, and scoring function\n progress: If True, display a progress bar\n n_jobs: Number of parallel jobs to use\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: If True, display a progress bar\n\n Returns:\n Object with the data values.\n\n !!! tip \"New in version 0.7.0\"\n Renamed from `naive_loo` and added parallel computation.\n\n !!! tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n if len(u.data) < 3:\n raise ValueError(\"Dataset must have at least 2 elements\")\n\n result = ValuationResult.zeros(\n algorithm=\"loo\",\n indices=u.data.indices,\n data_names=u.data.data_names,\n )\n\n all_indices = set(u.data.indices)\n total_utility = u(u.data.indices)\n\n def fun(idx: int) -> tuple[int, float]:\n return idx, total_utility - u(all_indices.difference({idx}))\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n max_workers = parallel_backend.effective_n_jobs(n_jobs)\n n_submitted_jobs = 2 * max_workers # number of jobs in the queue\n\n # NOTE: this could be done with a simple executor.map(), but we want to\n # display a progress bar\n\n with parallel_backend.executor(\n max_workers=max_workers, cancel_futures=True\n ) as executor:\n pending: set[Future] = set()\n index_it = iter(u.data.indices)\n\n pbar = tqdm(disable=not progress, total=100, unit=\"%\")\n while True:\n pbar.n = 100 * sum(result.counts) / len(u.data)\n pbar.refresh()\n completed, pending = wait(pending, timeout=0.1, return_when=FIRST_COMPLETED)\n for future in completed:\n idx, marginal = future.result()\n result.update(idx, marginal)\n\n # Ensure that we always have n_submitted_jobs running\n try:\n for _ in range(n_submitted_jobs - len(pending)):\n pending.add(executor.submit(fun, next(index_it)))\n except StopIteration:\n if len(pending) == 0:\n return result\n
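A minimal usage sketch follows; the toy Utility is an illustrative assumption. Note that every training point requires one refit of the model.

```python
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Utility
from pydvl.value.loo.loo import compute_loo

# Illustrative toy utility.
data = Dataset.from_sklearn(load_iris(), train_size=0.8)
u = Utility(LogisticRegression(max_iter=500), data)

result = compute_loo(u, n_jobs=4, progress=True)
print(result.values)
```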
"},{"location":"api/pydvl/value/oob/","title":"Oob","text":""},{"location":"api/pydvl/value/oob/#pydvl.value.oob","title":"pydvl.value.oob","text":""},{"location":"api/pydvl/value/oob/oob/","title":"Oob","text":""},{"location":"api/pydvl/value/oob/oob/#pydvl.value.oob.oob","title":"pydvl.value.oob.oob","text":""},{"location":"api/pydvl/value/oob/oob/#pydvl.value.oob.oob--references","title":"References","text":" -
Kwon, Yongchan, and James Zou. Data-OOB: Out-of-bag Estimate as a Simple and Efficient Data Value. Published at ICML 2023.
"},{"location":"api/pydvl/value/oob/oob/#pydvl.value.oob.oob.compute_data_oob","title":"compute_data_oob","text":"compute_data_oob(\n u: Utility,\n *,\n n_est: int = 10,\n max_samples: float = 0.8,\n loss: Optional[LossFunction] = None,\n n_jobs: Optional[int] = None,\n seed: Optional[Seed] = None,\n progress: bool = False\n) -> ValuationResult\n
Computes Data out-of-bag (Data-OOB) values.
This implements the method described in (Kwon and Zou, 2023)1. It fits several copies of u.model as base estimators in a bagging process. The value of a data point is the average loss of the estimators that were not fit on it.
\\(w_{bj}\\in Z\\) is the number of times the j-th datum \\((x_j, y_j)\\) is selected in the b-th bootstrap dataset.
\\[\\psi((x_i,y_i),\\Theta_B):=\\frac{\\sum_{b=1}^{B}\\mathbb{1}(w_{bi}=0)T(y_i, \\hat{f}_b(x_i))}{\\sum_{b=1}^{B} \\mathbb{1} (w_{bi}=0)}\\] With:
\\[ T: Y \\times Y \\rightarrow \\mathbb{R} \\] T is a score function that represents the goodness of a weak learner \\(\\hat{f}_b\\) at the i-th datum \\((x_i, y_i)\\).
n_est and max_samples must be tuned jointly so that every sample is out-of-bag at least once; otherwise the result can contain NaN values for the points that never end up out-of-bag.
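As a rough guide for this tuning (a back-of-the-envelope estimate assuming the default bootstrap sampling with replacement, not a guarantee from the paper): with sampling fraction \(p\) (max_samples) and \(B\) (n_est) estimators, a point is left out of a single bootstrap sample with probability roughly \(e^{-p}\), so the probability that it is never out-of-bag is approximately
\[ \Pr[\text{never OOB}] \approx (1 - e^{-p})^{B}. \]
For \(p = 0.8\) and \(B = 10\) this is about \(0.55^{10} \approx 0.003\) per point; increase n_est if the expected number of such points, roughly \(n (1 - e^{-p})^{B}\), is not negligible for your dataset size.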
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
n_est
Number of estimators used in the bagging procedure.
TYPE: int
DEFAULT: 10
max_samples
The fraction of samples to draw to train each base estimator.
TYPE: float
DEFAULT: 0.8
loss
A function taking the model predictions and the corresponding data labels (y_true, y_pred) as parameters and returning an array of point-wise errors.
TYPE: Optional[LossFunction]
DEFAULT: None
n_jobs
The number of parallel jobs used in the bagging procedure, for both fit and predict.
TYPE: Optional[int]
DEFAULT: None
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
progress
If True, display a progress bar.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION ValuationResult
Object with the data values.
Source code in src/pydvl/value/oob/oob.py
def compute_data_oob(\n u: Utility,\n *,\n n_est: int = 10,\n max_samples: float = 0.8,\n loss: Optional[LossFunction] = None,\n n_jobs: Optional[int] = None,\n seed: Optional[Seed] = None,\n progress: bool = False,\n) -> ValuationResult:\n r\"\"\"Computes Data out of bag values\n\n This implements the method described in\n (Kwon and Zou, 2023)<sup><a href=\"kwon_data_2023\">1</a></sup>.\n It fits several base estimators provided through u.model through a bagging\n process. The point value corresponds to the average loss of estimators which\n were not fit on it.\n\n $w_{bj}\\in Z$ is the number of times the j-th datum $(x_j, y_j)$ is selected\n in the b-th bootstrap dataset.\n\n $$\\psi((x_i,y_i),\\Theta_B):=\\frac{\\sum_{b=1}^{B}\\mathbb{1}(w_{bi}=0)T(y_i,\n \\hat{f}_b(x_i))}{\\sum_{b=1}^{B}\n \\mathbb{1}\n (w_{bi}=0)}$$\n\n With:\n\n $$\n T: Y \\times Y\n \\rightarrow \\mathbb{R}\n $$\n\n T is a score function that represents the goodness of a weak learner\n $\\hat{f}_b$ at the i-th datum $(x_i, y_i)$.\n\n `n_est` and `max_samples` must be tuned jointly to ensure that all samples\n are at least 1 time out-of-bag, otherwise the result could include a NaN\n value for that datum.\n\n Args:\n u: Utility object with model, data, and scoring function.\n n_est: Number of estimator used in the bagging procedure.\n max_samples: The fraction of samples to draw to train each base\n estimator.\n loss: A function taking as parameters model prediction and corresponding\n data labels(y_true, y_pred) and returning an array of point-wise errors.\n n_jobs: The number of jobs to run in parallel used in the bagging\n procedure for both fit and predict.\n seed: Either an instance of a numpy random number generator or a seed\n for it.\n progress: If True, display a progress bar.\n\n Returns:\n Object with the data values.\n \"\"\"\n rng = np.random.default_rng(seed)\n random_state = np.random.RandomState(rng.bit_generator)\n\n result: ValuationResult[np.int_, np.object_] = ValuationResult.empty(\n algorithm=\"data_oob\", indices=u.data.indices, data_names=u.data.data_names\n )\n\n if is_classifier(u.model):\n bag = BaggingClassifier(\n u.model,\n n_estimators=n_est,\n max_samples=max_samples,\n n_jobs=n_jobs,\n random_state=random_state,\n )\n if loss is None:\n loss = point_wise_accuracy\n elif is_regressor(u.model):\n bag = BaggingRegressor(\n u.model,\n n_estimators=n_est,\n max_samples=max_samples,\n n_jobs=n_jobs,\n random_state=random_state,\n )\n if loss is None:\n loss = neg_l2_distance\n else:\n raise Exception(\n \"Model has to be a classifier or a regressor in sklearn format.\"\n )\n\n bag.fit(u.data.x_train, u.data.y_train)\n\n for est, samples in tqdm(\n zip(bag.estimators_, bag.estimators_samples_), disable=not progress, total=n_est\n ): # The bottleneck is the bag fitting not this part so TQDM is not very useful here\n oob_idx = np.setxor1d(u.data.indices, np.unique(samples))\n array_loss = loss(\n y_true=u.data.y_train[oob_idx],\n y_pred=est.predict(u.data.x_train[oob_idx]),\n )\n result += ValuationResult(\n algorithm=\"data_oob\",\n indices=oob_idx,\n values=array_loss,\n counts=np.ones_like(array_loss, dtype=u.data.indices.dtype),\n )\n return result\n
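A minimal usage sketch follows; the dataset, model and parameter values are illustrative assumptions. Because Data-OOB fits a single bagging ensemble, it avoids the repeated refits required by other valuation methods.

```python
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier

from pydvl.utils import Dataset, Utility
from pydvl.value.oob.oob import compute_data_oob

# Illustrative toy utility with a classifier, so point_wise_accuracy is used
# as the default loss.
data = Dataset.from_sklearn(load_breast_cancer(), train_size=0.8)
u = Utility(DecisionTreeClassifier(), data)

result = compute_data_oob(u, n_est=50, max_samples=0.8, n_jobs=4, seed=42)
print(result.values)
```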
"},{"location":"api/pydvl/value/oob/oob/#pydvl.value.oob.oob.point_wise_accuracy","title":"point_wise_accuracy","text":"point_wise_accuracy(y_true: NDArray[T], y_pred: NDArray[T]) -> NDArray[T]\n
Point-wise 0-1 loss between two arrays
PARAMETER DESCRIPTION y_true
Array of true values (e.g. labels)
TYPE: NDArray[T]
y_pred
Array of estimated values (e.g. model predictions)
TYPE: NDArray[T]
RETURNS DESCRIPTION NDArray[T]
Array with point-wise 0-1 losses between labels and model predictions
Source code in src/pydvl/value/oob/oob.py
def point_wise_accuracy(y_true: NDArray[T], y_pred: NDArray[T]) -> NDArray[T]:\n r\"\"\"Point-wise 0-1 loss between two arrays\n\n Args:\n y_true: Array of true values (e.g. labels)\n y_pred: Array of estimated values (e.g. model predictions)\n\n Returns:\n Array with point-wise 0-1 losses between labels and model predictions\n \"\"\"\n return np.array(y_pred == y_true, dtype=y_pred.dtype)\n
"},{"location":"api/pydvl/value/oob/oob/#pydvl.value.oob.oob.neg_l2_distance","title":"neg_l2_distance","text":"neg_l2_distance(y_true: NDArray[T], y_pred: NDArray[T]) -> NDArray[T]\n
Point-wise negative \\(l_2\\) distance between two arrays
PARAMETER DESCRIPTION y_true
Array of true values (e.g. labels)
TYPE: NDArray[T]
y_pred
Array of estimated values (e.g. model predictions)
TYPE: NDArray[T]
RETURNS DESCRIPTION NDArray[T]
Array with point-wise negative \(l_2\) distances between labels and model predictions
Source code in src/pydvl/value/oob/oob.py
def neg_l2_distance(y_true: NDArray[T], y_pred: NDArray[T]) -> NDArray[T]:\n r\"\"\"Point-wise negative $l_2$ distance between two arrays\n\n Args:\n y_true: Array of true values (e.g. labels)\n y_pred: Array of estimated values (e.g. model predictions)\n\n Returns:\n Array with point-wise negative $l_2$ distances between labels and model\n predictions\n \"\"\"\n return -np.square(np.array(y_pred - y_true), dtype=y_pred.dtype)\n
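Any callable with the same (y_true, y_pred) signature returning point-wise scores can be passed as the loss argument of compute_data_oob. The following custom loss is a hypothetical illustration, not part of the library:

```python
import numpy as np
from numpy.typing import NDArray


def neg_l1_distance(y_true: NDArray, y_pred: NDArray) -> NDArray:
    """Point-wise negative l1 distance; higher is better, like the built-in losses."""
    return -np.abs(np.asarray(y_pred) - np.asarray(y_true))


# For a regression utility `u` (construction omitted):
# result = compute_data_oob(u, n_est=50, loss=neg_l1_distance)
```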
"},{"location":"api/pydvl/value/shapley/","title":"Shapley","text":""},{"location":"api/pydvl/value/shapley/#pydvl.value.shapley","title":"pydvl.value.shapley","text":"This package holds all routines for the computation of Shapley Data value. Users will want to use compute_shapley_values or compute_semivalues as interfaces to most methods defined in the modules.
Please refer to the guide on data valuation for an overview of all methods.
"},{"location":"api/pydvl/value/shapley/classwise/","title":"Classwise","text":""},{"location":"api/pydvl/value/shapley/classwise/#pydvl.value.shapley.classwise","title":"pydvl.value.shapley.classwise","text":"Class-wise Shapley (Schoch et al., 2022)1 offers a Shapley framework tailored for classification problems. Let \\(D\\) be a dataset, \\(D_{y_i}\\) be the subset of \\(D\\) with labels \\(y_i\\), and \\(D_{-y_i}\\) be the complement of \\(D_{y_i}\\) in \\(D\\). The key idea is that a sample \\((x_i, y_i)\\), might enhance the overall performance on \\(D\\), while being detrimental for the performance on \\(D_{y_i}\\). The Class-wise value is defined as:
\\[ v_u(i) = \\frac{1}{2^{|D_{-y_i}|}} \\sum_{S_{-y_i}} \\frac{1}{|D_{y_i}|!} \\sum_{S_{y_i}} \\binom{|D_{y_i}|-1}{|S_{y_i}|}^{-1} [u( S_{y_i} \\cup \\{i\\} | S_{-y_i} ) \u2212 u( S_{y_i} | S_{-y_i})], \\] where \\(S_{y_i} \\subseteq D_{y_i} \\setminus \\{i\\}\\) and \\(S_{-y_i} \\subseteq D_{-y_i}\\).
Analysis of Class-wise Shapley
For a detailed analysis of the method, with comparison to other valuation techniques, please refer to the main documentation.
In practice, the quantity above is estimated using Monte Carlo sampling of the powerset and the set of index permutations. This results in the estimator
\\[ v_u(i) = \\frac{1}{K} \\sum_k \\frac{1}{L} \\sum_l [u(\\sigma^{(l)}_{:i} \\cup \\{i\\} | S^{(k)} ) \u2212 u( \\sigma^{(l)}_{:i} | S^{(k)})], \\] with \\(S^{(1)}, \\dots, S^{(K)} \\subseteq T_{-y_i},\\) \\(\\sigma^{(1)}, \\dots, \\sigma^{(L)} \\in \\Pi(T_{y_i}\\setminus\\{i\\}),\\) and \\(\\sigma^{(l)}_{:i}\\) denoting the set of indices in permutation \\(\\sigma^{(l)}\\) before the position where \\(i\\) appears. The sets \\(T_{y_i}\\) and \\(T_{-y_i}\\) are the training sets for the labels \\(y_i\\) and \\(-y_i\\), respectively.
Notes for derivation of test cases: The unit tests include the following manually constructed data. Let \(D=\{(1,0),(2,0),(3,0),(4,1)\}\) be the test set and \(T=\{(1,0),(2,0),(3,1),(4,1)\}\) the train set. This specific dataset is chosen because it allows solving the model
\\[y = \\max(0, \\min(1, \\text{round}(\\beta^T x)))\\] in closed form \\(\\beta = \\frac{\\text{dot}(x, y)}{\\text{dot}(x, x)}\\). From the closed-form solution, the tables for in-class accuracy \\(a_S(D_{y_i})\\) and out-of-class accuracy \\(a_S(D_{-y_i})\\) can be calculated. By using these tables and setting \\(\\{S^{(1)}, \\dots, S^{(K)}\\} = 2^{T_{-y_i}}\\) and \\(\\{\\sigma^{(1)}, \\dots, \\sigma^{(L)}\\} = \\Pi(T_{y_i}\\setminus\\{i\\})\\), the Monte Carlo estimator can be evaluated (\\(2^M\\) is the powerset of \\(M\\)). The details of the derivation are left to the eager reader.
"},{"location":"api/pydvl/value/shapley/classwise/#pydvl.value.shapley.classwise--references","title":"References","text":" -
Schoch, Stephanie, Haifeng Xu, and Yangfeng Ji. CS-Shapley: Class-wise Shapley Values for Data Valuation in Classification. In Proc. of the Thirty-Sixth Conference on Neural Information Processing Systems (NeurIPS). New Orleans, Louisiana, USA, 2022.
"},{"location":"api/pydvl/value/shapley/classwise/#pydvl.value.shapley.classwise.ClasswiseScorer","title":"ClasswiseScorer","text":"ClasswiseScorer(\n scoring: Union[str, ScorerCallable] = \"accuracy\",\n default: float = 0.0,\n range: Tuple[float, float] = (0, 1),\n in_class_discount_fn: Callable[[float], float] = lambda x: x,\n out_of_class_discount_fn: Callable[[float], float] = np.exp,\n initial_label: Optional[int] = None,\n name: Optional[str] = None,\n)\n
Bases: Scorer
A Scorer designed for evaluation in classification problems. Its value is computed from an in-class and an out-of-class \"inner score\" (Schoch et al., 2022) 1. Let \\(S\\) be the training set and \\(D\\) be the valuation set. For each label \\(c\\), \\(D\\) is factorized into two disjoint sets: \\(D_c\\) for in-class instances and \\(D_{-c}\\) for out-of-class instances. The score combines an in-class metric of performance, adjusted by a discounted out-of-class metric. These inner scores must be provided upon construction or default to accuracy. They are combined into:
\\[ u(S_{y_i}) = f(a_S(D_{y_i}))\\ g(a_S(D_{-y_i})), \\] where \\(f\\) and \\(g\\) are continuous, monotonic functions. For a detailed explanation, refer to section four of (Schoch et al., 2022) 1.
Warning
Metrics must support multiple class labels if you intend to apply them to a multi-class problem. For instance, the metric 'accuracy' supports multiple classes, but the metric f1 does not. For a two-class classification problem, using f1_weighted is essentially equivalent to using accuracy.
PARAMETER DESCRIPTION scoring
Name of the scoring function or a callable that can be passed to Scorer.
TYPE: Union[str, ScorerCallable]
DEFAULT: 'accuracy'
default
Score to use when a model fails to provide a number, e.g. when too little data was used to train it, or errors arise.
TYPE: float
DEFAULT: 0.0
range
Numerical range of the score function. Some Monte Carlo methods can use this to estimate the number of samples required for a certain quality of approximation. If not provided, it can be read from the scoring
object if it provides it, for instance if it was constructed with compose_score.
TYPE: Tuple[float, float]
DEFAULT: (0, 1)
in_class_discount_fn
Continuous, monotonic increasing function used to discount the in-class score.
TYPE: Callable[[float], float]
DEFAULT: lambda x: x
out_of_class_discount_fn
Continuous, monotonic increasing function used to discount the out-of-class score.
TYPE: Callable[[float], float]
DEFAULT: exp
initial_label
Set initial label (for the first iteration)
TYPE: Optional[int]
DEFAULT: None
name
Name of the scorer. If not provided, the name of the inner scoring function will be prefixed by classwise.
TYPE: Optional[str]
DEFAULT: None
New in version 0.7.1
Source code in src/pydvl/value/shapley/classwise.py
def __init__(\n self,\n scoring: Union[str, ScorerCallable] = \"accuracy\",\n default: float = 0.0,\n range: Tuple[float, float] = (0, 1),\n in_class_discount_fn: Callable[[float], float] = lambda x: x,\n out_of_class_discount_fn: Callable[[float], float] = np.exp,\n initial_label: Optional[int] = None,\n name: Optional[str] = None,\n):\n disc_score_in_class = in_class_discount_fn(range[1])\n disc_score_out_of_class = out_of_class_discount_fn(range[1])\n transformed_range = (0, disc_score_in_class * disc_score_out_of_class)\n super().__init__(\n scoring=scoring,\n range=transformed_range,\n default=default,\n name=name or f\"classwise {str(scoring)}\",\n )\n self._in_class_discount_fn = in_class_discount_fn\n self._out_of_class_discount_fn = out_of_class_discount_fn\n self.label = initial_label\n
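A minimal construction sketch follows; the discount functions shown simply make the defaults explicit.

```python
import numpy as np
from pydvl.value.shapley.classwise import ClasswiseScorer

# The in-class score enters as-is, the out-of-class score is discounted
# exponentially, i.e. u(S_{y_i}) = a_S(D_{y_i}) * exp(a_S(D_{-y_i})).
scorer = ClasswiseScorer(
    "accuracy",
    in_class_discount_fn=lambda x: x,
    out_of_class_discount_fn=np.exp,
)
# Pass this scorer when constructing the Utility used with
# compute_classwise_shapley_values (see the scoring argument of Utility).
```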
"},{"location":"api/pydvl/value/shapley/classwise/#pydvl.value.shapley.classwise.ClasswiseScorer.estimate_in_class_and_out_of_class_score","title":"estimate_in_class_and_out_of_class_score","text":"estimate_in_class_and_out_of_class_score(\n model: SupervisedModel,\n x_test: NDArray[float64],\n y_test: NDArray[int_],\n rescale_scores: bool = True,\n) -> Tuple[float, float]\n
Computes in-class and out-of-class scores using the provided inner scoring function. The result is
\\[ a_S(D=\\{(x_1, y_1), \\dots, (x_K, y_K)\\}) = \\frac{1}{N} \\sum_k s(y(x_k), y_k). \\] In this context, for label \\(c\\) calculations are executed twice: once for \\(D_c\\) and once for \\(D_{-c}\\) to determine the in-class and out-of-class scores, respectively. By default, the raw scores are multiplied by \\(\\frac{|D_c|}{|D|}\\) and \\(\\frac{|D_{-c}|}{|D|}\\), respectively. This is done to ensure that both scores are of the same order of magnitude. This normalization is particularly useful when the inner score function \\(a_S\\) is calculated by an estimator of the form \\(\\frac{1}{N} \\sum_i x_i\\), e.g. the accuracy.
PARAMETER DESCRIPTION model
Model used for computing the score on the validation set.
TYPE: SupervisedModel
x_test
Array containing the features of the classification problem.
TYPE: NDArray[float64]
y_test
Array containing the labels of the classification problem.
TYPE: NDArray[int_]
rescale_scores
If set to True, the scores will be denormalized. This is particularly useful when the inner score function \\(a_S\\) is calculated by an estimator of the form \\(\\frac{1}{N} \\sum_i x_i\\).
TYPE: bool
DEFAULT: True
RETURNS DESCRIPTION Tuple[float, float]
Tuple containing the in-class and out-of-class scores.
Source code in src/pydvl/value/shapley/classwise.py
def estimate_in_class_and_out_of_class_score(\n self,\n model: SupervisedModel,\n x_test: NDArray[np.float64],\n y_test: NDArray[np.int_],\n rescale_scores: bool = True,\n) -> Tuple[float, float]:\n r\"\"\"\n Computes in-class and out-of-class scores using the provided inner\n scoring function. The result is\n\n $$\n a_S(D=\\{(x_1, y_1), \\dots, (x_K, y_K)\\}) = \\frac{1}{N} \\sum_k s(y(x_k), y_k).\n $$\n\n In this context, for label $c$ calculations are executed twice: once for $D_c$\n and once for $D_{-c}$ to determine the in-class and out-of-class scores,\n respectively. By default, the raw scores are multiplied by $\\frac{|D_c|}{|D|}$\n and $\\frac{|D_{-c}|}{|D|}$, respectively. This is done to ensure that both\n scores are of the same order of magnitude. This normalization is particularly\n useful when the inner score function $a_S$ is calculated by an estimator of the\n form $\\frac{1}{N} \\sum_i x_i$, e.g. the accuracy.\n\n Args:\n model: Model used for computing the score on the validation set.\n x_test: Array containing the features of the classification problem.\n y_test: Array containing the labels of the classification problem.\n rescale_scores: If set to True, the scores will be denormalized. This is\n particularly useful when the inner score function $a_S$ is calculated by\n an estimator of the form $\\frac{1}{N} \\sum_i x_i$.\n\n Returns:\n Tuple containing the in-class and out-of-class scores.\n \"\"\"\n scorer = self._scorer\n label_set_match = y_test == self.label\n label_set = np.where(label_set_match)[0]\n num_classes = len(np.unique(y_test))\n\n if len(label_set) == 0:\n return 0, 1 / (num_classes - 1)\n\n complement_label_set = np.where(~label_set_match)[0]\n in_class_score = scorer(model, x_test[label_set], y_test[label_set])\n out_of_class_score = scorer(\n model, x_test[complement_label_set], y_test[complement_label_set]\n )\n\n if rescale_scores:\n n_in_class = np.count_nonzero(y_test == self.label)\n n_out_of_class = len(y_test) - n_in_class\n in_class_score *= n_in_class / (n_in_class + n_out_of_class)\n out_of_class_score *= n_out_of_class / (n_in_class + n_out_of_class)\n\n return in_class_score, out_of_class_score\n
"},{"location":"api/pydvl/value/shapley/classwise/#pydvl.value.shapley.classwise.compute_classwise_shapley_values","title":"compute_classwise_shapley_values","text":"compute_classwise_shapley_values(\n u: Utility,\n *,\n done: StoppingCriterion,\n truncation: TruncationPolicy,\n done_sample_complements: Optional[StoppingCriterion] = None,\n normalize_values: bool = True,\n use_default_scorer_value: bool = True,\n min_elements_per_label: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Computes an approximate Class-wise Shapley value by sampling independent permutations of the index set for each label and index sets sampled from the powerset of the complement (with respect to the currently evaluated label), approximating the sum:
\\[ v_u(i) = \\frac{1}{K} \\sum_k \\frac{1}{L} \\sum_l [u(\\sigma^{(l)}_{:i} \\cup \\{i\\} | S^{(k)} ) \u2212 u( \\sigma^{(l)}_{:i} | S^{(k)})], \\] where \\(\\sigma_{:i}\\) denotes the set of indices in permutation sigma before the position where \\(i\\) appears and \\(S\\) is a subset of the index set of all other labels (see the main documentation for details).
PARAMETER DESCRIPTION u
Utility object containing model, data, and scoring function. The scorer must be of type ClasswiseScorer.
TYPE: Utility
done
Function that checks whether the computation needs to stop.
TYPE: StoppingCriterion
truncation
Callable function that decides whether to interrupt processing a permutation and set subsequent marginals to zero.
TYPE: TruncationPolicy
done_sample_complements
Function checking whether the resampling of the complement (conditional) sets for each permutation needs to stop; complement sets are resampled until this criterion is met.
TYPE: Optional[StoppingCriterion]
DEFAULT: None
normalize_values
Indicates whether to normalize the values by the variation in each class times their in-class accuracy.
TYPE: bool
DEFAULT: True
use_default_scorer_value
The first set of indices is the sampled complement set. Unless otherwise specified, the default scorer value is used for it. If set to False, the base score is instead calculated from the utility.
TYPE: bool
DEFAULT: True
min_elements_per_label
The minimum number of elements for each opposite label.
TYPE: int
DEFAULT: 1
n_jobs
Number of parallel jobs to run.
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None, the JoblibParallelBackend is used. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display a progress bar.
TYPE: bool
DEFAULT: False
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION ValuationResult
ValuationResult object containing computed data values.
New in version 0.7.1
Source code in src/pydvl/value/shapley/classwise.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef compute_classwise_shapley_values(\n u: Utility,\n *,\n done: StoppingCriterion,\n truncation: TruncationPolicy,\n done_sample_complements: Optional[StoppingCriterion] = None,\n normalize_values: bool = True,\n use_default_scorer_value: bool = True,\n min_elements_per_label: int = 1,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> ValuationResult:\n r\"\"\"\n Computes an approximate Class-wise Shapley value by sampling independent\n permutations of the index set for each label and index sets sampled from the\n powerset of the complement (with respect to the currently evaluated label),\n approximating the sum:\n\n $$\n v_u(i) = \\frac{1}{K} \\sum_k \\frac{1}{L} \\sum_l\n [u(\\sigma^{(l)}_{:i} \\cup \\{i\\} | S^{(k)} ) \u2212 u( \\sigma^{(l)}_{:i} | S^{(k)})],\n $$\n\n where $\\sigma_{:i}$ denotes the set of indices in permutation sigma before\n the position where $i$ appears and $S$ is a subset of the index set of all\n other labels (see [the main documentation][class-wise-shapley] for\n details).\n\n Args:\n u: Utility object containing model, data, and scoring function. The\n scorer must be of type\n [ClasswiseScorer][pydvl.value.shapley.classwise.ClasswiseScorer].\n done: Function that checks whether the computation needs to stop.\n truncation: Callable function that decides whether to interrupt processing a\n permutation and set subsequent marginals to zero.\n done_sample_complements: Function checking whether computation needs to stop.\n Otherwise, it will resample conditional sets until the stopping criterion is\n met.\n normalize_values: Indicates whether to normalize the values by the variation\n in each class times their in-class accuracy.\n done_sample_complements: Number of times to resample the complement set\n for each permutation.\n use_default_scorer_value: The first set of indices is the sampled complement\n set. Unless not otherwise specified, the default scorer value is used for\n this. If it is set to false, the base score is calculated from the utility.\n min_elements_per_label: The minimum number of elements for each opposite\n label.\n n_jobs: Number of parallel jobs to run.\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display a progress bar.\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n ValuationResult object containing computed data values.\n\n !!! tip \"New in version 0.7.1\"\n \"\"\"\n dim_correct = u.data.y_train.ndim == 1 and u.data.y_test.ndim == 1\n is_integral = all(\n map(\n lambda v: isinstance(v, numbers.Integral), (*u.data.y_train, *u.data.y_test)\n )\n )\n if not dim_correct or not is_integral:\n raise ValueError(\n \"The supplied dataset has to be a 1-dimensional classification dataset.\"\n )\n\n if not isinstance(u.scorer, ClasswiseScorer):\n raise ValueError(\n \"Please set a subclass of ClasswiseScorer object as scorer object of the\"\n \" utility. 
See scoring argument of Utility.\"\n )\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n u_ref = parallel_backend.put(u)\n n_jobs = parallel_backend.effective_n_jobs(n_jobs)\n n_submitted_jobs = 2 * n_jobs\n\n pbar = tqdm(disable=not progress, position=0, total=100, unit=\"%\")\n algorithm = \"classwise_shapley\"\n accumulated_result = ValuationResult.zeros(\n algorithm=algorithm, indices=u.data.indices, data_names=u.data.data_names\n )\n terminate_exec = False\n seed_sequence = ensure_seed_sequence(seed)\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n\n with parallel_backend.executor(max_workers=n_jobs) as executor:\n pending: Set[Future] = set()\n while True:\n completed_futures, pending = wait(\n pending, timeout=60, return_when=FIRST_COMPLETED\n )\n for future in completed_futures:\n accumulated_result += future.result()\n if done(accumulated_result):\n terminate_exec = True\n break\n\n pbar.n = 100 * done.completion()\n pbar.refresh()\n if terminate_exec:\n break\n\n n_remaining_slots = n_submitted_jobs - len(pending)\n seeds = seed_sequence.spawn(n_remaining_slots)\n for i in range(n_remaining_slots):\n future = executor.submit(\n _permutation_montecarlo_classwise_shapley_one_step,\n u_ref,\n truncation=truncation,\n done_sample_complements=done_sample_complements,\n use_default_scorer_value=use_default_scorer_value,\n min_elements_per_label=min_elements_per_label,\n algorithm_name=algorithm,\n seed=seeds[i],\n )\n pending.add(future)\n\n result = accumulated_result\n if normalize_values:\n result = _normalize_classwise_shapley_values(result, u)\n\n return result\n
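A minimal usage sketch follows. The dataset, model, stopping criterion and all parameter values are illustrative assumptions; the only hard requirement is that the Utility's scorer is a ClasswiseScorer.

```python
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Utility
from pydvl.value.shapley.classwise import (
    ClasswiseScorer,
    compute_classwise_shapley_values,
)
from pydvl.value.shapley.truncated import NoTruncation
from pydvl.value.stopping import MaxChecks

# Illustrative toy setup for a binary classification problem.
data = Dataset.from_sklearn(load_breast_cancer(), train_size=0.8)
u = Utility(LogisticRegression(max_iter=500), data, ClasswiseScorer("accuracy"))

result = compute_classwise_shapley_values(
    u,
    done=MaxChecks(1000),
    truncation=NoTruncation(),
    n_jobs=4,
    seed=42,
)
print(result.values)
```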
"},{"location":"api/pydvl/value/shapley/common/","title":"Common","text":""},{"location":"api/pydvl/value/shapley/common/#pydvl.value.shapley.common","title":"pydvl.value.shapley.common","text":""},{"location":"api/pydvl/value/shapley/common/#pydvl.value.shapley.common.compute_shapley_values","title":"compute_shapley_values","text":"compute_shapley_values(\n u: Utility,\n *,\n done: StoppingCriterion = MaxChecks(None),\n mode: ShapleyMode = ShapleyMode.TruncatedMontecarlo,\n n_jobs: int = 1,\n seed: Optional[Seed] = None,\n **kwargs\n) -> ValuationResult\n
Umbrella method to compute Shapley values with any of the available algorithms.
See Data valuation for an overview.
The following algorithms are available. Note that the exact methods can only work with very small datasets and are thus intended only for testing. Some algorithms also accept additional arguments; please refer to the documentation of each particular method.
- combinatorial_exact: uses the combinatorial implementation of data Shapley. Implemented in combinatorial_exact_shapley().
- combinatorial_montecarlo: uses the approximate Monte Carlo implementation of combinatorial data Shapley. Implemented in combinatorial_montecarlo_shapley().
- permutation_exact: uses the permutation-based implementation of data Shapley. Computation is not parallelized. Implemented in permutation_exact_shapley().
- permutation_montecarlo: uses the approximate Monte Carlo implementation of permutation data Shapley. Accepts a TruncationPolicy to stop computing marginals. Implemented in permutation_montecarlo_shapley().
- owen_sampling: uses the Owen continuous extension of the utility function to the unit cube. Implemented in owen_sampling_shapley(). This method does not take a StoppingCriterion but instead requires a parameter max_q for the number of subdivisions of the unit interval to use for integration, and another parameter n_samples for the number of subsets to sample for each \(q\).
- owen_halved: same as 'owen_sampling' but uses correlated samples in the expectation. Implemented in owen_sampling_shapley(). This method requires an additional parameter max_q for the number of subdivisions of the interval [0, 0.5] to use for integration, and another parameter n_samples for the number of subsets to sample for each \(q\).
- group_testing: estimates differences of Shapley values and solves a constraint satisfaction problem. High sample complexity, not recommended. Implemented in group_testing_shapley(). This method does not take a StoppingCriterion but instead requires a parameter n_samples for the number of iterations to run.
Additionally, one can use model-specific methods:
- knn: Exact method for K-Nearest neighbour models. Implemented in knn_shapley().
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
done
Object used to determine when to stop the computation for Monte Carlo methods. The default is to stop after 100 iterations. See the available criteria in stopping. It is possible to combine several of them using boolean operators. Some methods ignore this argument, others require specific subtypes.
TYPE: StoppingCriterion
DEFAULT: MaxChecks(None)
n_jobs
Number of parallel jobs (available only to some methods)
TYPE: int
DEFAULT: 1
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
mode
Choose which Shapley algorithm to use. See ShapleyMode for a list of allowed values.
TYPE: ShapleyMode
DEFAULT: TruncatedMontecarlo
RETURNS DESCRIPTION ValuationResult
Object with the results.
Source code in src/pydvl/value/shapley/common.py
def compute_shapley_values(\n u: Utility,\n *,\n done: StoppingCriterion = MaxChecks(None),\n mode: ShapleyMode = ShapleyMode.TruncatedMontecarlo,\n n_jobs: int = 1,\n seed: Optional[Seed] = None,\n **kwargs,\n) -> ValuationResult:\n \"\"\"Umbrella method to compute Shapley values with any of the available\n algorithms.\n\n See [Data valuation][data-valuation] for an overview.\n\n The following algorithms are available. Note that the exact methods can only\n work with very small datasets and are thus intended only for testing. Some\n algorithms also accept additional arguments, please refer to the\n documentation of each particular method.\n\n - `combinatorial_exact`: uses the combinatorial implementation of data\n Shapley. Implemented in\n [combinatorial_exact_shapley()][pydvl.value.shapley.naive.combinatorial_exact_shapley].\n - `combinatorial_montecarlo`: uses the approximate Monte Carlo\n implementation of combinatorial data Shapley. Implemented in\n [combinatorial_montecarlo_shapley()][pydvl.value.shapley.montecarlo.combinatorial_montecarlo_shapley].\n - `permutation_exact`: uses the permutation-based implementation of data\n Shapley. Computation is **not parallelized**. Implemented in\n [permutation_exact_shapley()][pydvl.value.shapley.naive.permutation_exact_shapley].\n - `permutation_montecarlo`: uses the approximate Monte Carlo\n implementation of permutation data Shapley. Accepts a\n [TruncationPolicy][pydvl.value.shapley.truncated.TruncationPolicy] to stop\n computing marginals. Implemented in\n [permutation_montecarlo_shapley()][pydvl.value.shapley.montecarlo.permutation_montecarlo_shapley].\n - `owen_sampling`: Uses the Owen continuous extension of the utility\n function to the unit cube. Implemented in\n [owen_sampling_shapley()][pydvl.value.shapley.owen.owen_sampling_shapley]. This\n method does not take a [StoppingCriterion][pydvl.value.stopping.StoppingCriterion]\n but instead requires a parameter `q_max` for the number of subdivisions\n of the unit interval to use for integration, and another parameter\n `n_samples` for the number of subsets to sample for each $q$.\n - `owen_halved`: Same as 'owen_sampling' but uses correlated samples in the\n expectation. Implemented in\n [owen_sampling_shapley()][pydvl.value.shapley.owen.owen_sampling_shapley].\n This method requires an additional parameter `q_max` for the number of\n subdivisions of the interval [0,0.5] to use for integration, and another\n parameter `n_samples` for the number of subsets to sample for each $q$.\n - `group_testing`: estimates differences of Shapley values and solves a\n constraint satisfaction problem. High sample complexity, not recommended.\n Implemented in [group_testing_shapley()][pydvl.value.shapley.gt.group_testing_shapley]. This\n method does not take a [StoppingCriterion][pydvl.value.stopping.StoppingCriterion]\n but instead requires a parameter `n_samples` for the number of\n iterations to run.\n\n Additionally, one can use model-specific methods:\n\n - `knn`: Exact method for K-Nearest neighbour models. Implemented in\n [knn_shapley()][pydvl.value.shapley.knn.knn_shapley].\n\n Args:\n u: [Utility][pydvl.utils.utility.Utility] object with model, data, and\n scoring function.\n done: Object used to determine when to stop the computation for Monte\n Carlo methods. The default is to stop after 100 iterations. See the\n available criteria in [stopping][pydvl.value.stopping]. It is\n possible to combine several of them using boolean operators. 
Some\n methods ignore this argument, others require specific subtypes.\n n_jobs: Number of parallel jobs (available only to some methods)\n seed: Either an instance of a numpy random number generator or a seed\n for it.\n mode: Choose which shapley algorithm to use. See\n [ShapleyMode][pydvl.value.shapley.ShapleyMode] for a list of allowed\n value.\n\n Returns:\n Object with the results.\n\n \"\"\"\n progress: bool = kwargs.pop(\"progress\", False)\n\n if mode not in list(ShapleyMode):\n raise ValueError(f\"Invalid value encountered in {mode=}\")\n\n if mode in (\n ShapleyMode.PermutationMontecarlo,\n ShapleyMode.ApproShapley,\n ShapleyMode.TruncatedMontecarlo,\n ):\n truncation = kwargs.pop(\"truncation\", NoTruncation())\n return permutation_montecarlo_shapley( # type: ignore\n u=u,\n done=done,\n truncation=truncation,\n n_jobs=n_jobs,\n seed=seed,\n progress=progress,\n **kwargs,\n )\n elif mode == ShapleyMode.CombinatorialMontecarlo:\n return combinatorial_montecarlo_shapley( # type: ignore\n u, done=done, n_jobs=n_jobs, seed=seed, progress=progress\n )\n elif mode == ShapleyMode.CombinatorialExact:\n return combinatorial_exact_shapley(u, n_jobs=n_jobs, progress=progress) # type: ignore\n elif mode == ShapleyMode.PermutationExact:\n return permutation_exact_shapley(u, progress=progress)\n elif mode == ShapleyMode.Owen or mode == ShapleyMode.OwenAntithetic:\n if kwargs.get(\"n_samples\") is None:\n raise ValueError(\"n_samples cannot be None for Owen methods\")\n if kwargs.get(\"max_q\") is None:\n raise ValueError(\"Owen Sampling requires max_q for the outer integral\")\n\n method = (\n OwenAlgorithm.Standard\n if mode == ShapleyMode.Owen\n else OwenAlgorithm.Antithetic\n )\n return owen_sampling_shapley( # type: ignore\n u,\n n_samples=int(kwargs.get(\"n_samples\", -1)),\n max_q=int(kwargs.get(\"max_q\", -1)),\n method=method,\n n_jobs=n_jobs,\n seed=seed,\n )\n elif mode == ShapleyMode.KNN:\n return knn_shapley(u, progress=progress)\n elif mode == ShapleyMode.GroupTesting:\n n_samples = kwargs.pop(\"n_samples\")\n if n_samples is None:\n raise ValueError(\"n_samples cannot be None for Group Testing\")\n epsilon = kwargs.pop(\"epsilon\")\n if epsilon is None:\n raise ValueError(\"Group Testing requires error bound epsilon\")\n delta = kwargs.pop(\"delta\", 0.05)\n return group_testing_shapley( # type: ignore\n u,\n epsilon=float(epsilon),\n delta=delta,\n n_samples=int(n_samples),\n n_jobs=n_jobs,\n progress=progress,\n seed=seed,\n **kwargs,\n )\n else:\n raise ValueError(f\"Invalid value encountered in {mode=}\")\n
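A minimal usage sketch of the umbrella interface follows; the toy Utility and all parameter values are assumptions for illustration.

```python
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Utility
from pydvl.value.shapley import ShapleyMode
from pydvl.value.shapley.common import compute_shapley_values
from pydvl.value.stopping import MaxChecks

# Illustrative toy utility.
data = Dataset.from_sklearn(load_iris(), train_size=0.8)
u = Utility(LogisticRegression(max_iter=500), data)

# Default mode: truncated Monte Carlo, stopping after 1000 checks.
result = compute_shapley_values(u, done=MaxChecks(1000), n_jobs=4, seed=42)

# Owen sampling needs its own parameters instead of a stopping criterion.
owen_result = compute_shapley_values(
    u, mode=ShapleyMode.Owen, n_samples=200, max_q=200, n_jobs=4, seed=42
)
print(result.values, owen_result.values)
```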
"},{"location":"api/pydvl/value/shapley/gt/","title":"Gt","text":""},{"location":"api/pydvl/value/shapley/gt/#pydvl.value.shapley.gt","title":"pydvl.value.shapley.gt","text":"This module implements Group Testing for the approximation of Shapley values, as introduced in (Jia, R. et al., 2019)1. The sampling of index subsets is done in such a way that an approximation to the true Shapley values can be computed with guarantees.
Warning
This method is very inefficient. Potential improvements to the implementation notwithstanding, convergence seems to be very slow (in terms of evaluations of the utility required). We recommend other Monte Carlo methods instead.
You can read more in the documentation.
New in version 0.4.0
"},{"location":"api/pydvl/value/shapley/gt/#pydvl.value.shapley.gt--references","title":"References","text":" -
Jia, R. et al., 2019. Towards Efficient Data Valuation Based on the Shapley Value. In: Proceedings of the 22nd International Conference on Artificial Intelligence and Statistics, pp. 1167\u20131176. PMLR.\u00a0\u21a9
"},{"location":"api/pydvl/value/shapley/gt/#pydvl.value.shapley.gt.num_samples_eps_delta","title":"num_samples_eps_delta","text":"num_samples_eps_delta(\n eps: float, delta: float, n: int, utility_range: float\n) -> int\n
Implements the formula in Theorem 3 of (Jia, R. et al., 2019)1, which gives a lower bound on the number of samples required to obtain an (\u03b5/\u221an, \u03b4/(N(N-1)))-approximation to all pair-wise differences of Shapley values, w.r.t. the \(\ell_2\) norm.
PARAMETER DESCRIPTION eps
\u03b5
TYPE: float
delta
\u03b4
TYPE: float
n
Number of data points
TYPE: int
utility_range
Range of the Utility function
TYPE: float
Returns: Number of samples from \\(2^{[n]}\\) guaranteeing \u03b5/\u221an-correct Shapley pair-wise differences of values with probability 1-\u03b4/(N(N-1)).
New in version 0.4.0
Source code in src/pydvl/value/shapley/gt.py
def num_samples_eps_delta(\n eps: float, delta: float, n: int, utility_range: float\n) -> int:\n r\"\"\"Implements the formula in Theorem 3 of (Jia, R. et al., 2019)<sup><a href=\"#jia_efficient_2019\">1</a></sup>\n which gives a lower bound on the number of samples required to obtain an\n (\u03b5/\u221an,\u03b4/(N(N-1))-approximation to all pair-wise differences of Shapley\n values, wrt. $\\ell_2$ norm.\n\n Args:\n eps: \u03b5\n delta: \u03b4\n n: Number of data points\n utility_range: Range of the [Utility][pydvl.utils.utility.Utility] function\n Returns:\n Number of samples from $2^{[n]}$ guaranteeing \u03b5/\u221an-correct Shapley\n pair-wise differences of values with probability 1-\u03b4/(N(N-1)).\n\n !!! tip \"New in version 0.4.0\"\n\n \"\"\"\n constants = _constants(n=n, epsilon=eps, delta=delta, utility_range=utility_range)\n return int(constants.T)\n
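As a small sketch of how the bound might be used, assuming a Utility u has already been constructed (for instance as in the examples elsewhere in this documentation) and using u.score_range to bound the utility, as group_testing_shapley itself does; the \u03b5 and \u03b4 values are arbitrary.

from pydvl.value.shapley.gt import num_samples_eps_delta

# u is a pydvl Utility built elsewhere; score_range bounds the attainable scores
utility_range = u.score_range.max() - u.score_range.min()
n_required = num_samples_eps_delta(
    eps=0.1, delta=0.05, n=len(u.data), utility_range=utility_range
)
print(f"Group testing needs at least {n_required} samples for these guarantees")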
"},{"location":"api/pydvl/value/shapley/gt/#pydvl.value.shapley.gt.group_testing_shapley","title":"group_testing_shapley","text":"group_testing_shapley(\n u: Utility,\n n_samples: int,\n epsilon: float,\n delta: float,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n **options: dict\n) -> ValuationResult\n
Implements group testing for approximation of Shapley values as described in (Jia, R. et al., 2019)1.
Warning
This method is very inefficient. It requires several orders of magnitude more evaluations of the utility than others in montecarlo. It also uses several intermediate objects like the results from the runners and the constraint matrices which can become rather large.
By picking a specific distribution over subsets, the differences in Shapley values can be approximated with a Monte Carlo sum. These are then used to solve for the individual values in a feasibility problem.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
n_samples
Number of tests to perform. Use num_samples_eps_delta to estimate this.
TYPE: int
epsilon
From the (\u03b5,\u03b4) sample bound. Use the same value as for the estimation of n_samples.
TYPE: float
delta
From the (\u03b5,\u03b4) sample bound. Use the same value as for the estimation of n_samples.
TYPE: float
n_jobs
Number of parallel jobs to use. Each worker performs a chunk of all tests (i.e. utility evaluations).
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display progress bars for each job.
TYPE: bool
DEFAULT: False
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
options
Additional options to pass to cvxpy.Problem.solve(). E.g. to change the solver (which defaults to cvxpy.SCS
) pass solver=cvxpy.CVXOPT
.
TYPE: dict
DEFAULT: {}
RETURNS DESCRIPTION ValuationResult
Object with the data values.
New in version 0.4.0
Changed in version 0.5.0
Changed the solver to cvxpy instead of scipy's linprog. Added the ability to pass arbitrary options to it.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/shapley/gt.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef group_testing_shapley(\n u: Utility,\n n_samples: int,\n epsilon: float,\n delta: float,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n **options: dict,\n) -> ValuationResult:\n \"\"\"Implements group testing for approximation of Shapley values as described\n in (Jia, R. et al., 2019)<sup><a href=\"#jia_efficient_2019\">1</a></sup>.\n\n !!! Warning\n This method is very inefficient. It requires several orders of magnitude\n more evaluations of the utility than others in\n [montecarlo][pydvl.value.shapley.montecarlo]. It also uses several intermediate\n objects like the results from the runners and the constraint matrices\n which can become rather large.\n\n By picking a specific distribution over subsets, the differences in Shapley\n values can be approximated with a Monte Carlo sum. These are then used to\n solve for the individual values in a feasibility problem.\n\n Args:\n u: Utility object with model, data, and scoring function\n n_samples: Number of tests to perform. Use\n [num_samples_eps_delta][pydvl.value.shapley.gt.num_samples_eps_delta]\n to estimate this.\n epsilon: From the (\u03b5,\u03b4) sample bound. Use the same as for the\n estimation of `n_iterations`.\n delta: From the (\u03b5,\u03b4) sample bound. Use the same as for the\n estimation of `n_iterations`.\n n_jobs: Number of parallel jobs to use. Each worker performs a chunk\n of all tests (i.e. utility evaluations).\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display progress bars for each job.\n seed: Either an instance of a numpy random number generator or a seed for it.\n options: Additional options to pass to\n [cvxpy.Problem.solve()](https://www.cvxpy.org/tutorial/advanced/index.html#solve-method-options).\n E.g. to change the solver (which defaults to `cvxpy.SCS`) pass\n `solver=cvxpy.CVXOPT`.\n\n Returns:\n Object with the data values.\n\n !!! tip \"New in version 0.4.0\"\n\n !!! tip \"Changed in version 0.5.0\"\n Changed the solver to cvxpy instead of scipy's linprog. Added the ability\n to pass arbitrary options to it.\n\n !!! 
tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n\n n = len(u.data.indices)\n\n const = _constants(\n n=n,\n epsilon=epsilon,\n delta=delta,\n utility_range=u.score_range.max() - u.score_range.min(),\n )\n T = n_samples\n if T < const.T:\n log.warning(\n f\"n_samples of {T} are below the required {const.T} for the \"\n f\"\u03b5={epsilon:.02f} guarantee at \u03b4={1 - delta:.02f} probability\"\n )\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n\n samples_per_job = max(1, n_samples // parallel_backend.effective_n_jobs(n_jobs))\n\n def reducer(\n results_it: Iterable[Tuple[NDArray, NDArray]]\n ) -> Tuple[NDArray, NDArray]:\n return np.concatenate(list(x[0] for x in results_it)).astype(\n np.float64\n ), np.concatenate(list(x[1] for x in results_it)).astype(np.int_)\n\n seed_sequence = ensure_seed_sequence(seed)\n map_reduce_seed_sequence, cvxpy_seed = tuple(seed_sequence.spawn(2))\n\n map_reduce_job: MapReduceJob[Utility, Tuple[NDArray, NDArray]] = MapReduceJob(\n u,\n map_func=_group_testing_shapley,\n reduce_func=reducer,\n map_kwargs=dict(n_samples=samples_per_job, progress=progress),\n parallel_backend=parallel_backend,\n n_jobs=n_jobs,\n )\n uu, betas = map_reduce_job(seed=map_reduce_seed_sequence)\n\n # Matrix of estimated differences. See Eqs. (3) and (4) in the paper.\n C = np.zeros(shape=(n, n))\n for i in range(n):\n for j in range(i + 1, n):\n C[i, j] = np.dot(uu, betas[:, i] - betas[:, j])\n C *= const.Z / T\n total_utility = u(u.data.indices)\n\n ###########################################################################\n # Solution of the constraint problem with cvxpy\n\n v = cp.Variable(n)\n constraints = [cp.sum(v) == total_utility]\n for i in range(n):\n for j in range(i + 1, n):\n constraints.append(v[i] - v[j] <= epsilon + C[i, j])\n constraints.append(v[j] - v[i] <= epsilon - C[i, j])\n\n problem = cp.Problem(cp.Minimize(0), constraints)\n solver = options.pop(\"solver\", cp.SCS)\n problem.solve(solver=solver, **options)\n\n if problem.status != \"optimal\":\n log.warning(f\"cvxpy returned status {problem.status}\")\n values = (\n np.nan * np.ones_like(u.data.indices)\n if not hasattr(v.value, \"__len__\")\n else v.value\n )\n status = Status.Failed\n else:\n values = v.value\n status = Status.Converged\n\n return ValuationResult(\n algorithm=\"group_testing_shapley\",\n status=status,\n values=values,\n data_names=u.data.data_names,\n solver_status=problem.status,\n )\n
"},{"location":"api/pydvl/value/shapley/knn/","title":"Knn","text":""},{"location":"api/pydvl/value/shapley/knn/#pydvl.value.shapley.knn","title":"pydvl.value.shapley.knn","text":"This module contains Shapley computations for K-Nearest Neighbours.
Todo
Implement approximate KNN computation for sublinear complexity
"},{"location":"api/pydvl/value/shapley/knn/#pydvl.value.shapley.knn--references","title":"References","text":" -
Jia, R. et al., 2019. Efficient Task-Specific Data Valuation for Nearest Neighbor Algorithms. In: Proceedings of the VLDB Endowment, Vol. 12, No. 11, pp. 1610\u20131623.\u00a0\u21a9
"},{"location":"api/pydvl/value/shapley/knn/#pydvl.value.shapley.knn.knn_shapley","title":"knn_shapley","text":"knn_shapley(u: Utility, *, progress: bool = True) -> ValuationResult\n
Computes exact Shapley values for a KNN classifier.
This implements the method described in (Jia, R. et al., 2019)1. It exploits the local structure of K-Nearest Neighbours to reduce the number of calls to the utility function to a constant number per index, thus reducing computation time to \\(O(n)\\).
PARAMETER DESCRIPTION u
Utility with a KNN model to extract parameters from. The object will not be modified nor used other than to call get_params()
TYPE: Utility
progress
Whether to display a progress bar.
TYPE: bool
DEFAULT: True
RETURNS DESCRIPTION ValuationResult
Object with the data values.
RAISES DESCRIPTION TypeError
If the model in the utility is not a sklearn.neighbors.KNeighborsClassifier.
New in version 0.1.0
Source code in src/pydvl/value/shapley/knn.py
def knn_shapley(u: Utility, *, progress: bool = True) -> ValuationResult:\n \"\"\"Computes exact Shapley values for a KNN classifier.\n\n This implements the method described in (Jia, R. et al., 2019)<sup><a href=\"#jia_efficient_2019a\">1</a></sup>.\n It exploits the local structure of K-Nearest Neighbours to reduce the number\n of calls to the utility function to a constant number per index, thus\n reducing computation time to $O(n)$.\n\n Args:\n u: Utility with a KNN model to extract parameters from. The object\n will not be modified nor used other than to call [get_params()](\n <https://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html#sklearn.base.BaseEstimator.get_params>)\n progress: Whether to display a progress bar.\n\n Returns:\n Object with the data values.\n\n Raises:\n TypeError: If the model in the utility is not a\n [sklearn.neighbors.KNeighborsClassifier][].\n\n !!! tip \"New in version 0.1.0\"\n\n \"\"\"\n if not isinstance(u.model, KNeighborsClassifier):\n raise TypeError(\"KNN Shapley requires a K-Nearest Neighbours model\")\n\n defaults: Dict[str, Union[int, str]] = {\n \"algorithm\": \"ball_tree\" if u.data.dim >= 20 else \"kd_tree\",\n \"metric\": \"minkowski\",\n \"p\": 2,\n }\n defaults.update(u.model.get_params())\n # HACK: NearestNeighbors doesn't support this. There will be more...\n del defaults[\"weights\"]\n n_neighbors: int = int(defaults[\"n_neighbors\"])\n defaults[\"n_neighbors\"] = len(u.data) # We want all training points sorted\n\n assert n_neighbors < len(u.data)\n # assert data.target_dim == 1\n\n nns = NearestNeighbors(**defaults).fit(u.data.x_train)\n # closest to farthest\n _, indices = nns.kneighbors(u.data.x_test)\n\n values: NDArray[np.float64] = np.zeros_like(u.data.indices, dtype=np.float64)\n n = len(u.data)\n yt = u.data.y_train\n iterator = enumerate(zip(u.data.y_test, indices), start=1)\n for j, (y, ii) in tqdm(iterator, disable=not progress):\n value_at_x = int(yt[ii[-1]] == y) / n\n values[ii[-1]] += (value_at_x - values[ii[-1]]) / j\n for i in range(n - 2, n_neighbors, -1): # farthest to closest\n value_at_x = (\n values[ii[i + 1]] + (int(yt[ii[i]] == y) - int(yt[ii[i + 1]] == y)) / i\n )\n values[ii[i]] += (value_at_x - values[ii[i]]) / j\n for i in range(n_neighbors, -1, -1): # farthest to closest\n value_at_x = (\n values[ii[i + 1]]\n + (int(yt[ii[i]] == y) - int(yt[ii[i + 1]] == y)) / n_neighbors\n )\n values[ii[i]] += (value_at_x - values[ii[i]]) / j\n\n return ValuationResult(\n algorithm=\"knn_shapley\",\n status=Status.Converged,\n values=values,\n data_names=u.data.data_names,\n )\n
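A minimal sketch of the call, using the iris dataset and a 5-neighbour classifier purely as illustrative assumptions; Dataset, Scorer and Utility are used as elsewhere in this documentation.

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

from pydvl.utils import Dataset, Scorer, Utility
from pydvl.value.shapley.knn import knn_shapley

# knn_shapley only reads the model's parameters; the scorer is passed for consistency
data = Dataset.from_sklearn(load_iris(), train_size=0.8, random_state=16)
utility = Utility(
    KNeighborsClassifier(n_neighbors=5), data, Scorer("accuracy", default=0.0)
)
values = knn_shapley(utility, progress=False)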
"},{"location":"api/pydvl/value/shapley/montecarlo/","title":"Montecarlo","text":""},{"location":"api/pydvl/value/shapley/montecarlo/#pydvl.value.shapley.montecarlo","title":"pydvl.value.shapley.montecarlo","text":"Monte Carlo approximations to Shapley Data values.
Warning
You probably want to use the common interface provided by compute_shapley_values() instead of directly using the functions in this module.
Because exact computation of Shapley values requires \\(\\mathcal{O}(2^n)\\) re-trainings of the model, several Monte Carlo approximations are available. The first two sample from the powerset of the training data directly: combinatorial_montecarlo_shapley() and owen_sampling_shapley(). The latter uses a reformulation in terms of a continuous extension of the utility.
Alternatively, employing another reformulation of the expression above as a sum over permutations, one has the implementation in permutation_montecarlo_shapley() with the option to pass an early stopping strategy to reduce computation as done in Truncated MonteCarlo Shapley (TMCS).
Also see
It is also possible to use group_testing_shapley() to reduce the number of evaluations of the utility. The method is however typically outperformed by others in this module.
Also see
Additionally, you can consider grouping your data points using GroupedDataset and computing the values of the groups instead. This is not to be confused with \"group testing\" as implemented in group_testing_shapley(): any of the algorithms mentioned above, including Group Testing, can work to valuate groups of samples as units.
"},{"location":"api/pydvl/value/shapley/montecarlo/#pydvl.value.shapley.montecarlo--references","title":"References","text":" -
Ghorbani, A., Zou, J., 2019. Data Shapley: Equitable Valuation of Data for Machine Learning. In: Proceedings of the 36th International Conference on Machine Learning, PMLR, pp. 2242\u20132251.\u00a0\u21a9
"},{"location":"api/pydvl/value/shapley/montecarlo/#pydvl.value.shapley.montecarlo.permutation_montecarlo_shapley","title":"permutation_montecarlo_shapley","text":"permutation_montecarlo_shapley(\n u: Utility,\n done: StoppingCriterion,\n *,\n truncation: TruncationPolicy = NoTruncation(),\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Computes an approximate Shapley value by sampling independent permutations of the index set, approximating the sum:
\[ v_u(x_i) = \frac{1}{n!} \sum_{\sigma \in \Pi(n)} \tilde{w}( | \sigma_{:i} | )[u(\sigma_{:i} \cup \{i\}) \u2212 u(\sigma_{:i})], \] where \(\sigma_{:i}\) denotes the set of indices in the permutation \(\sigma\) before the position where \(i\) appears (see Data valuation for details).
This implements the method described in (Ghorbani and Zou, 2019)1 with a double stopping criterion.
Todo
Think of how to add the Gelman-Rubin diagnostic or some other more principled stopping criterion.
Instead of naively implementing the expectation, we sequentially add points to coalitions from a permutation and incrementally compute marginal utilities. We stop computing marginals for a given permutation based on a TruncationPolicy. (Ghorbani and Zou, 2019)1 mention two policies: one that stops after a certain fraction of marginals are computed, implemented in FixedTruncation, and one that stops if the last computed utility (\"score\") is close to the total utility using the standard deviation of the utility as a measure of proximity, implemented in BootstrapTruncation.
We keep sampling permutations and updating all shapley values until the StoppingCriterion returns True
.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function.
TYPE: Utility
done
function checking whether computation must stop.
TYPE: StoppingCriterion
truncation
An optional callable which decides whether to interrupt processing a permutation and set all subsequent marginals to zero. Typically used to stop computation when the marginal is small.
TYPE: TruncationPolicy
DEFAULT: NoTruncation()
n_jobs
number of jobs across which to distribute the computation.
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display a progress bar.
TYPE: bool
DEFAULT: False
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION ValuationResult
Object with the data values.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/shapley/montecarlo.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef permutation_montecarlo_shapley(\n u: Utility,\n done: StoppingCriterion,\n *,\n truncation: TruncationPolicy = NoTruncation(),\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> ValuationResult:\n r\"\"\"Computes an approximate Shapley value by sampling independent\n permutations of the index set, approximating the sum:\n\n $$\n v_u(x_i) = \\frac{1}{n!} \\sum_{\\sigma \\in \\Pi(n)}\n \\tilde{w}( | \\sigma_{:i} | )[u(\\sigma_{:i} \\cup \\{i\\}) \u2212 u(\\sigma_{:i})],\n $$\n\n where $\\sigma_{:i}$ denotes the set of indices in permutation sigma before\n the position where $i$ appears (see [[data-valuation]] for details).\n\n This implements the method described in (Ghorbani and Zou, 2019)<sup><a\n href=\"#ghorbani_data_2019\">1</a></sup> with a double stopping criterion.\n\n !!! Todo\n Think of how to add Robin-Gelman or some other more principled stopping\n criterion.\n\n Instead of naively implementing the expectation, we sequentially add points\n to coalitions from a permutation and incrementally compute marginal utilities.\n We stop computing marginals for a given permutation based on a\n [TruncationPolicy][pydvl.value.shapley.truncated.TruncationPolicy].\n (Ghorbani and Zou, 2019)<sup><a href=\"#ghorbani_data_2019\">1</a></sup>\n mention two policies: one that stops after a certain\n fraction of marginals are computed, implemented in\n [FixedTruncation][pydvl.value.shapley.truncated.FixedTruncation],\n and one that stops if the last computed utility (\"score\") is close to the\n total utility using the standard deviation of the utility as a measure of\n proximity, implemented in\n [BootstrapTruncation][pydvl.value.shapley.truncated.BootstrapTruncation].\n\n We keep sampling permutations and updating all shapley values\n until the [StoppingCriterion][pydvl.value.stopping.StoppingCriterion] returns\n `True`.\n\n Args:\n u: Utility object with model, data, and scoring function.\n done: function checking whether computation must stop.\n truncation: An optional callable which decides whether to interrupt\n processing a permutation and set all subsequent marginals to\n zero. Typically used to stop computation when the marginal is small.\n n_jobs: number of jobs across which to distribute the computation.\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display a progress bar.\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n Object with the data values.\n\n !!! 
tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n algorithm = \"permutation_montecarlo_shapley\"\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n u = parallel_backend.put(u)\n max_workers = parallel_backend.effective_n_jobs(n_jobs)\n n_submitted_jobs = 2 * max_workers # number of jobs in the executor's queue\n\n seed_sequence = ensure_seed_sequence(seed)\n result = ValuationResult.zeros(\n algorithm=algorithm, indices=u.data.indices, data_names=u.data.data_names\n )\n\n pbar = tqdm(disable=not progress, total=100, unit=\"%\")\n\n with parallel_backend.executor(\n max_workers=max_workers, cancel_futures=CancellationPolicy.ALL\n ) as executor:\n pending: set[Future] = set()\n while True:\n pbar.n = 100 * done.completion()\n pbar.refresh()\n\n completed, pending = wait(pending, timeout=1.0, return_when=FIRST_COMPLETED)\n for future in completed:\n result += future.result()\n # we could check outside the loop, but that means more\n # submissions if the stopping criterion is unstable\n if done(result):\n return result\n\n # Ensure that we always have n_submitted_jobs in the queue or running\n n_remaining_slots = n_submitted_jobs - len(pending)\n seeds = seed_sequence.spawn(n_remaining_slots)\n for i in range(n_remaining_slots):\n future = executor.submit(\n _permutation_montecarlo_one_step,\n u,\n truncation,\n algorithm,\n seed=seeds[i],\n )\n pending.add(future)\n
"},{"location":"api/pydvl/value/shapley/montecarlo/#pydvl.value.shapley.montecarlo.combinatorial_montecarlo_shapley","title":"combinatorial_montecarlo_shapley","text":"combinatorial_montecarlo_shapley(\n u: Utility,\n done: StoppingCriterion,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Computes an approximate Shapley value using the combinatorial definition:
\\[v_u(i) = \\frac{1}{n} \\sum_{S \\subseteq N \\setminus \\{i\\}} \\binom{n-1}{ | S | }^{-1} [u(S \\cup \\{i\\}) \u2212 u(S)]\\] This consists of randomly sampling subsets of the power set of the training indices in u.data, and computing their marginal utilities. See Data valuation for details.
Note that because sampling is done with replacement, the approximation is poor even for \\(2^{m}\\) subsets with \\(m>n\\), even though there are \\(2^{n-1}\\) subsets for each \\(i\\). Prefer permutation_montecarlo_shapley().
Parallelization is done by splitting the set of indices across processes and computing the sum over subsets \\(S \\subseteq N \\setminus \\{i\\}\\) separately.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
done
Stopping criterion for the computation.
TYPE: StoppingCriterion
n_jobs
number of parallel jobs across which to distribute the computation. Each worker receives a chunk of indices
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display progress bars for each job.
TYPE: bool
DEFAULT: False
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION ValuationResult
Object with the data values.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/shapley/montecarlo.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef combinatorial_montecarlo_shapley(\n u: Utility,\n done: StoppingCriterion,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None,\n) -> ValuationResult:\n r\"\"\"Computes an approximate Shapley value using the combinatorial\n definition:\n\n $$v_u(i) = \\frac{1}{n} \\sum_{S \\subseteq N \\setminus \\{i\\}}\n \\binom{n-1}{ | S | }^{-1} [u(S \\cup \\{i\\}) \u2212 u(S)]$$\n\n This consists of randomly sampling subsets of the power set of the training\n indices in [u.data][pydvl.utils.utility.Utility], and computing their\n marginal utilities. See [Data valuation][data-valuation] for details.\n\n Note that because sampling is done with replacement, the approximation is\n poor even for $2^{m}$ subsets with $m>n$, even though there are $2^{n-1}$\n subsets for each $i$. Prefer\n [permutation_montecarlo_shapley()][pydvl.value.shapley.montecarlo.permutation_montecarlo_shapley].\n\n Parallelization is done by splitting the set of indices across processes and\n computing the sum over subsets $S \\subseteq N \\setminus \\{i\\}$ separately.\n\n Args:\n u: Utility object with model, data, and scoring function\n done: Stopping criterion for the computation.\n n_jobs: number of parallel jobs across which to distribute the\n computation. Each worker receives a chunk of\n [indices][pydvl.utils.dataset.Dataset.indices]\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display progress bars for each job.\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n Object with the data values.\n\n !!! tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n\n map_reduce_job: MapReduceJob[NDArray, ValuationResult] = MapReduceJob(\n u.data.indices,\n map_func=_combinatorial_montecarlo_shapley,\n reduce_func=lambda results: reduce(operator.add, results),\n map_kwargs=dict(u=u, done=done, progress=progress),\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n )\n return map_reduce_job(seed=seed)\n
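A short sketch of the call; stopping criteria can be combined with boolean operators, as mentioned for compute_shapley_values above. MaxTime is assumed to be available in pydvl.value.stopping, the numbers are illustrative, and u is a Utility built elsewhere.

from pydvl.value.shapley.montecarlo import combinatorial_montecarlo_shapley
from pydvl.value.stopping import MaxTime, MaxUpdates

# u is a pydvl Utility built elsewhere. Stop after 1000 updates or 60 seconds,
# whichever happens first.
values = combinatorial_montecarlo_shapley(
    u,
    done=MaxUpdates(1000) | MaxTime(seconds=60),
    n_jobs=4,
    seed=16,
)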
"},{"location":"api/pydvl/value/shapley/naive/","title":"Naive","text":""},{"location":"api/pydvl/value/shapley/naive/#pydvl.value.shapley.naive","title":"pydvl.value.shapley.naive","text":"This module implements exact Shapley values using either the combinatorial or permutation definition.
The exact computation of \\(n\\) values takes \\(\\mathcal{O}(2^n)\\) evaluations of the utility and is therefore only possible for small datasets. For larger datasets, consider using any of the approximations, such as Monte Carlo, or proxy models like kNN.
See Data valuation for details.
"},{"location":"api/pydvl/value/shapley/naive/#pydvl.value.shapley.naive.permutation_exact_shapley","title":"permutation_exact_shapley","text":"permutation_exact_shapley(\n u: Utility, *, progress: bool = True\n) -> ValuationResult\n
Computes the exact Shapley value using the formulation with permutations:
\\[v_u(x_i) = \\frac{1}{n!} \\sum_{\\sigma \\in \\Pi(n)} [u(\\sigma_{i-1} \\cup {i}) \u2212 u(\\sigma_{i})].\\] See Data valuation for details.
When the length of the training set is > 10 this prints a warning since the computation becomes too expensive. Used mostly for internal testing and simple use cases. Please refer to the Monte Carlo approximations for practical applications.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
progress
Whether to display progress bars for each job.
TYPE: bool
DEFAULT: True
RETURNS DESCRIPTION ValuationResult
Object with the data values.
Source code in src/pydvl/value/shapley/naive.py
def permutation_exact_shapley(u: Utility, *, progress: bool = True) -> ValuationResult:\n r\"\"\"Computes the exact Shapley value using the formulation with permutations:\n\n $$v_u(x_i) = \\frac{1}{n!} \\sum_{\\sigma \\in \\Pi(n)} [u(\\sigma_{i-1}\n \\cup {i}) \u2212 u(\\sigma_{i})].$$\n\n See [Data valuation][data-valuation] for details.\n\n When the length of the training set is > 10 this prints a warning since the\n computation becomes too expensive. Used mostly for internal testing and\n simple use cases. Please refer to the [Monte Carlo\n approximations][pydvl.value.shapley.montecarlo] for practical applications.\n\n Args:\n u: Utility object with model, data, and scoring function\n progress: Whether to display progress bars for each job.\n\n Returns:\n Object with the data values.\n \"\"\"\n\n n = len(u.data)\n # Note that the cache in utility saves most of the refitting because we\n # use frozenset for the input.\n if n > 10:\n warnings.warn(\n f\"Large dataset! Computation requires {n}! calls to utility()\",\n RuntimeWarning,\n )\n\n values = np.zeros(n)\n for p in tqdm(\n permutations(u.data.indices),\n disable=not progress,\n desc=\"Permutation\",\n total=math.factorial(n),\n ):\n for i, idx in enumerate(p):\n values[idx] += u(p[: i + 1]) - u(p[:i])\n values /= math.factorial(n)\n\n return ValuationResult(\n algorithm=\"permutation_exact_shapley\",\n status=Status.Converged,\n values=values,\n data_names=u.data.data_names,\n )\n
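A toy-sized sketch: the training set is kept very small because the exact method enumerates all n! permutations. The synthetic data and the logistic regression model are assumptions for illustration; Dataset.from_arrays, Scorer and Utility are used as elsewhere in this documentation.

import numpy as np
from sklearn.linear_model import LogisticRegression

from pydvl.utils import Dataset, Scorer, Utility
from pydvl.value.shapley.naive import permutation_exact_shapley

rng = np.random.default_rng(16)
X = rng.normal(size=(30, 3))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

# Keep only a handful of training points: n! permutations are enumerated
data = Dataset.from_arrays(X=X, y=y, train_size=0.2, random_state=16)
utility = Utility(LogisticRegression(), data, Scorer("accuracy", default=0.0))
values = permutation_exact_shapley(utility, progress=False)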
"},{"location":"api/pydvl/value/shapley/naive/#pydvl.value.shapley.naive.combinatorial_exact_shapley","title":"combinatorial_exact_shapley","text":"combinatorial_exact_shapley(\n u: Utility,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False\n) -> ValuationResult\n
Computes the exact Shapley value using the combinatorial definition.
\\[v_u(i) = \\frac{1}{n} \\sum_{S \\subseteq N \\setminus \\{i\\}} \\binom{n-1}{ | S | }^{-1} [u(S \\cup \\{i\\}) \u2212 u(S)].\\] See Data valuation for details.
Note
If the length of the training set is > n_jobs*20 this prints a warning because the computation is very expensive. Used mostly for internal testing and simple use cases. Please refer to the Monte Carlo approximations for practical applications.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
n_jobs
Number of parallel jobs to use
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display progress bars for each job.
TYPE: bool
DEFAULT: False
RETURNS DESCRIPTION ValuationResult
Object with the data values.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/shapley/naive.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef combinatorial_exact_shapley(\n u: Utility,\n *,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n) -> ValuationResult:\n r\"\"\"Computes the exact Shapley value using the combinatorial definition.\n\n $$v_u(i) = \\frac{1}{n} \\sum_{S \\subseteq N \\setminus \\{i\\}}\n \\binom{n-1}{ | S | }^{-1} [u(S \\cup \\{i\\}) \u2212 u(S)].$$\n\n See [Data valuation][data-valuation] for details.\n\n !!! Note\n If the length of the training set is > n_jobs*20 this prints a warning\n because the computation is very expensive. Used mostly for internal\n testing and simple use cases. Please refer to the\n [Monte Carlo][pydvl.value.shapley.montecarlo] approximations for\n practical applications.\n\n Args:\n u: Utility object with model, data, and scoring function\n n_jobs: Number of parallel jobs to use\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display progress bars for each job.\n\n Returns:\n Object with the data values.\n\n !!! tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n \"\"\"\n # Arbitrary choice, will depend on time required, caching, etc.\n if len(u.data) // n_jobs > 20:\n warnings.warn(\n f\"Large dataset! Computation requires 2^{len(u.data)} calls to model.fit()\"\n )\n\n def reduce_fun(results: List[NDArray]) -> NDArray:\n return np.array(results).sum(axis=0) # type: ignore\n\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n\n map_reduce_job: MapReduceJob[NDArray, NDArray] = MapReduceJob(\n u.data.indices,\n map_func=_combinatorial_exact_shapley,\n map_kwargs=dict(u=u, progress=progress),\n reduce_func=reduce_fun,\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n )\n values = map_reduce_job()\n return ValuationResult(\n algorithm=\"combinatorial_exact_shapley\",\n status=Status.Converged,\n values=values,\n data_names=u.data.data_names,\n )\n
"},{"location":"api/pydvl/value/shapley/owen/","title":"Owen","text":""},{"location":"api/pydvl/value/shapley/owen/#pydvl.value.shapley.owen","title":"pydvl.value.shapley.owen","text":""},{"location":"api/pydvl/value/shapley/owen/#pydvl.value.shapley.owen--references","title":"References","text":" -
Okhrati, R., Lipani, A., 2021. A Multilinear Sampling Algorithm to Estimate Shapley Values. In: 2020 25th International Conference on Pattern Recognition (ICPR), pp. 7992\u20137999. IEEE.\u00a0\u21a9
"},{"location":"api/pydvl/value/shapley/owen/#pydvl.value.shapley.owen.OwenAlgorithm","title":"OwenAlgorithm","text":" Bases: Enum
Choices for the Owen sampling method.
ATTRIBUTE DESCRIPTION Standard
Use q \u2208 [0, 1]
Antithetic
Use q \u2208 [0, 0.5] and correlated samples
"},{"location":"api/pydvl/value/shapley/owen/#pydvl.value.shapley.owen.owen_sampling_shapley","title":"owen_sampling_shapley","text":"owen_sampling_shapley(\n u: Utility,\n n_samples: int,\n max_q: int,\n *,\n method: OwenAlgorithm = OwenAlgorithm.Standard,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult\n
Owen sampling of Shapley values as described in (Okhrati and Lipani, 2021)1.
This function computes a Monte Carlo approximation to
\\[v_u(i) = \\int_0^1 \\mathbb{E}_{S \\sim P_q(D_{\\backslash \\{i\\}})} [u(S \\cup \\{i\\}) - u(S)]\\] using one of two methods. The first one, selected with the argument mode = OwenAlgorithm.Standard
, approximates the integral with:
\\[\\hat{v}_u(i) = \\frac{1}{Q M} \\sum_{j=0}^Q \\sum_{m=1}^M [u(S^{(q_j)}_m \\cup \\{i\\}) - u(S^{(q_j)}_m)],\\] where \\(q_j = \\frac{j}{Q} \\in [0,1]\\) and the sets \\(S^{(q_j)}\\) are such that a sample \\(x \\in S^{(q_j)}\\) if a draw from a \\(Ber(q_j)\\) distribution is 1.
The second method, selected with the argument mode = OwenAlgorithm.Antithetic
, uses correlated samples in the inner sum to reduce the variance:
\\[\\hat{v}_u(i) = \\frac{1}{2 Q M} \\sum_{j=0}^Q \\sum_{m=1}^M [u(S^{(q_j)}_m \\cup \\{i\\}) - u(S^{(q_j)}_m) + u((S^{(q_j)}_m)^c \\cup \\{i\\}) - u((S^{( q_j)}_m)^c)],\\] where now \\(q_j = \\frac{j}{2Q} \\in [0,\\frac{1}{2}]\\), and \\(S^c\\) is the complement of \\(S\\).
Note
The outer integration could be done instead with a quadrature rule.
PARAMETER DESCRIPTION u
Utility object holding data, model and scoring function.
TYPE: Utility
n_samples
Number of sets to sample for each value of q
TYPE: int
max_q
Number of subdivisions for q \u2208 [0,1] (the element sampling probability) used to approximate the outer integral.
TYPE: int
method
Selects the algorithm to use; see the description above. Either OwenAlgorithm.Standard for \(q \in [0,1]\) or OwenAlgorithm.Antithetic for \(q \in [0,0.5]\) with correlated samples
TYPE: OwenAlgorithm
DEFAULT: Standard
n_jobs
Number of parallel jobs to use. Each worker receives a chunk of the total of max_q
values for q.
TYPE: int
DEFAULT: 1
parallel_backend
Parallel backend instance to use for parallelizing computations. If None
, use JoblibParallelBackend backend. See the Parallel Backends package for available options.
TYPE: Optional[ParallelBackend]
DEFAULT: None
config
(DEPRECATED) Object configuring parallel computation, with cluster address, number of cpus, etc.
TYPE: Optional[ParallelConfig]
DEFAULT: None
progress
Whether to display progress bars for each job.
TYPE: bool
DEFAULT: False
seed
Either an instance of a numpy random number generator or a seed for it.
TYPE: Optional[Seed]
DEFAULT: None
RETURNS DESCRIPTION ValuationResult
Object with the data values.
New in version 0.3.0
Changed in version 0.5.0
Support for parallel computation and enable antithetic sampling.
Changed in version 0.9.0
Deprecated config
argument and added a parallel_backend
argument to allow users to pass the Parallel Backend instance directly.
Source code in src/pydvl/value/shapley/owen.py
@deprecated(\n target=True,\n args_mapping={\"config\": \"config\"},\n deprecated_in=\"0.9.0\",\n remove_in=\"0.10.0\",\n)\ndef owen_sampling_shapley(\n u: Utility,\n n_samples: int,\n max_q: int,\n *,\n method: OwenAlgorithm = OwenAlgorithm.Standard,\n n_jobs: int = 1,\n parallel_backend: Optional[ParallelBackend] = None,\n config: Optional[ParallelConfig] = None,\n progress: bool = False,\n seed: Optional[Seed] = None\n) -> ValuationResult:\n r\"\"\"Owen sampling of Shapley values as described in\n (Okhrati and Lipani, 2021)<sup><a href=\"#okhrati_multilinear_2021\">1</a></sup>.\n\n This function computes a Monte Carlo approximation to\n\n $$v_u(i) = \\int_0^1 \\mathbb{E}_{S \\sim P_q(D_{\\backslash \\{i\\}})}\n [u(S \\cup \\{i\\}) - u(S)]$$\n\n using one of two methods. The first one, selected with the argument ``mode =\n OwenAlgorithm.Standard``, approximates the integral with:\n\n $$\\hat{v}_u(i) = \\frac{1}{Q M} \\sum_{j=0}^Q \\sum_{m=1}^M [u(S^{(q_j)}_m\n \\cup \\{i\\}) - u(S^{(q_j)}_m)],$$\n\n where $q_j = \\frac{j}{Q} \\in [0,1]$ and the sets $S^{(q_j)}$ are such that a\n sample $x \\in S^{(q_j)}$ if a draw from a $Ber(q_j)$ distribution is 1.\n\n The second method, selected with the argument ``mode =\n OwenAlgorithm.Antithetic``, uses correlated samples in the inner sum to\n reduce the variance:\n\n $$\\hat{v}_u(i) = \\frac{1}{2 Q M} \\sum_{j=0}^Q \\sum_{m=1}^M [u(S^{(q_j)}_m\n \\cup \\{i\\}) - u(S^{(q_j)}_m) + u((S^{(q_j)}_m)^c \\cup \\{i\\}) - u((S^{(\n q_j)}_m)^c)],$$\n\n where now $q_j = \\frac{j}{2Q} \\in [0,\\frac{1}{2}]$, and $S^c$ is the\n complement of $S$.\n\n !!! Note\n The outer integration could be done instead with a quadrature rule.\n\n Args:\n u: [Utility][pydvl.utils.utility.Utility] object holding data, model\n and scoring function.\n n_samples: Numer of sets to sample for each value of q\n max_q: Number of subdivisions for q \u2208 [0,1] (the element sampling\n probability) used to approximate the outer integral.\n method: Selects the algorithm to use, see the description. Either\n [OwenAlgorithm.Full][pydvl.value.shapley.owen.OwenAlgorithm] for\n $q \\in [0,1]$ or\n [OwenAlgorithm.Halved][pydvl.value.shapley.owen.OwenAlgorithm] for\n $q \\in [0,0.5]$ and correlated samples\n n_jobs: Number of parallel jobs to use. Each worker receives a chunk\n of the total of `max_q` values for q.\n parallel_backend: Parallel backend instance to use\n for parallelizing computations. If `None`,\n use [JoblibParallelBackend][pydvl.parallel.backends.JoblibParallelBackend] backend.\n See the [Parallel Backends][pydvl.parallel.backends] package\n for available options.\n config: (**DEPRECATED**) Object configuring parallel computation,\n with cluster address, number of cpus, etc.\n progress: Whether to display progress bars for each job.\n seed: Either an instance of a numpy random number generator or a seed for it.\n\n Returns:\n Object with the data values.\n\n !!! tip \"New in version 0.3.0\"\n\n !!! tip \"Changed in version 0.5.0\"\n Support for parallel computation and enable antithetic sampling.\n\n !!! 
tip \"Changed in version 0.9.0\"\n Deprecated `config` argument and added a `parallel_backend`\n argument to allow users to pass the Parallel Backend instance\n directly.\n\n \"\"\"\n parallel_backend = _maybe_init_parallel_backend(parallel_backend, config)\n\n map_reduce_job: MapReduceJob[NDArray, ValuationResult] = MapReduceJob(\n u.data.indices,\n map_func=_owen_sampling_shapley,\n reduce_func=lambda results: reduce(operator.add, results),\n map_kwargs=dict(\n u=u,\n method=OwenAlgorithm(method),\n n_samples=n_samples,\n max_q=max_q,\n progress=progress,\n ),\n n_jobs=n_jobs,\n parallel_backend=parallel_backend,\n )\n\n return map_reduce_job(seed=seed)\n
"},{"location":"api/pydvl/value/shapley/truncated/","title":"Truncated","text":""},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated","title":"pydvl.value.shapley.truncated","text":""},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated--references","title":"References","text":" -
Ghorbani, A., Zou, J., 2019. Data Shapley: Equitable Valuation of Data for Machine Learning. In: Proceedings of the 36th International Conference on Machine Learning, PMLR, pp. 2242\u20132251.\u00a0\u21a9
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.TruncationPolicy","title":"TruncationPolicy","text":"TruncationPolicy()\n
Bases: ABC
A policy for deciding whether to stop computing marginals in a permutation.
Statistics are kept on the number of calls and truncations as n_calls
and n_truncations
respectively.
ATTRIBUTE DESCRIPTION n_calls
Number of calls to the policy.
TYPE: int
n_truncations
Number of truncations made by the policy.
TYPE: int
Todo
Because the policy objects are copied to the workers, the statistics are not accessible from the coordinating process. We need to add methods for this.
Source code in src/pydvl/value/shapley/truncated.py
def __init__(self) -> None:\n self.n_calls: int = 0\n self.n_truncations: int = 0\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.TruncationPolicy.reset","title":"reset abstractmethod
","text":"reset(u: Optional[Utility] = None)\n
Reset the policy to a state ready for a new permutation.
Source code in src/pydvl/value/shapley/truncated.py
@abc.abstractmethod\ndef reset(self, u: Optional[Utility] = None):\n \"\"\"Reset the policy to a state ready for a new permutation.\"\"\"\n ...\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.TruncationPolicy.__call__","title":"__call__","text":"__call__(idx: int, score: float) -> bool\n
Check whether the computation should be interrupted.
PARAMETER DESCRIPTION idx
Position in the permutation currently being computed.
TYPE: int
score
Last utility computed.
TYPE: float
RETURNS DESCRIPTION bool
True
if the computation should be interrupted.
Source code in src/pydvl/value/shapley/truncated.py
def __call__(self, idx: int, score: float) -> bool:\n \"\"\"Check whether the computation should be interrupted.\n\n Args:\n idx: Position in the permutation currently being computed.\n score: Last utility computed.\n\n Returns:\n `True` if the computation should be interrupted.\n \"\"\"\n ret = self._check(idx, score)\n self.n_calls += 1\n self.n_truncations += 1 if ret else 0\n return ret\n
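As a sketch of how a custom policy might look, the following hypothetical subclass implements reset together with the _check hook that the __call__ method shown above invokes. The class name, the atol parameter and the interruption rule are assumptions made only for illustration.

from typing import Optional

from pydvl.utils import Utility
from pydvl.value.shapley.truncated import TruncationPolicy


class AbsoluteTruncation(TruncationPolicy):
    """Hypothetical policy: interrupt a permutation once the last computed
    utility is within atol of the total utility."""

    def __init__(self, u: Utility, atol: float):
        super().__init__()
        self.atol = atol
        self.total_utility = u(u.data.indices)

    def reset(self, u: Optional[Utility] = None):
        # Recompute the total utility if a new Utility is supplied
        if u is not None:
            self.total_utility = u(u.data.indices)

    def _check(self, idx: int, score: float) -> bool:
        # True means: truncate the remainder of the current permutation
        return abs(self.total_utility - score) <= self.atol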
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.NoTruncation","title":"NoTruncation","text":"NoTruncation()\n
Bases: TruncationPolicy
A policy which never interrupts the computation.
Source code in src/pydvl/value/shapley/truncated.py
def __init__(self) -> None:\n self.n_calls: int = 0\n self.n_truncations: int = 0\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.NoTruncation.__call__","title":"__call__","text":"__call__(idx: int, score: float) -> bool\n
Check whether the computation should be interrupted.
PARAMETER DESCRIPTION idx
Position in the permutation currently being computed.
TYPE: int
score
Last utility computed.
TYPE: float
RETURNS DESCRIPTION bool
True
if the computation should be interrupted.
Source code in src/pydvl/value/shapley/truncated.py
def __call__(self, idx: int, score: float) -> bool:\n \"\"\"Check whether the computation should be interrupted.\n\n Args:\n idx: Position in the permutation currently being computed.\n score: Last utility computed.\n\n Returns:\n `True` if the computation should be interrupted.\n \"\"\"\n ret = self._check(idx, score)\n self.n_calls += 1\n self.n_truncations += 1 if ret else 0\n return ret\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.FixedTruncation","title":"FixedTruncation","text":"FixedTruncation(u: Utility, fraction: float)\n
Bases: TruncationPolicy
Break a permutation after computing a fixed number of marginals.
The experiments in Appendix B of (Ghorbani and Zou, 2019)1 show that when the training set size is large enough, one can simply truncate the iteration over permutations after a fixed number of steps. This happens because beyond a certain number of samples in a training set, the model becomes insensitive to new ones. Alas, this strongly depends on the data distribution and the model and there is no automatic way of estimating this number.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
fraction
Fraction of marginals in a permutation to compute before stopping (e.g. 0.5 to compute half of the marginals).
TYPE: float
Source code in src/pydvl/value/shapley/truncated.py
def __init__(self, u: Utility, fraction: float):\n super().__init__()\n if fraction <= 0 or fraction > 1:\n raise ValueError(\"fraction must be in (0, 1]\")\n self.max_marginals = len(u.data) * fraction\n self.count = 0\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.FixedTruncation.__call__","title":"__call__","text":"__call__(idx: int, score: float) -> bool\n
Check whether the computation should be interrupted.
PARAMETER DESCRIPTION idx
Position in the permutation currently being computed.
TYPE: int
score
Last utility computed.
TYPE: float
RETURNS DESCRIPTION bool
True
if the computation should be interrupted.
Source code in src/pydvl/value/shapley/truncated.py
def __call__(self, idx: int, score: float) -> bool:\n \"\"\"Check whether the computation should be interrupted.\n\n Args:\n idx: Position in the permutation currently being computed.\n score: Last utility computed.\n\n Returns:\n `True` if the computation should be interrupted.\n \"\"\"\n ret = self._check(idx, score)\n self.n_calls += 1\n self.n_truncations += 1 if ret else 0\n return ret\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.RelativeTruncation","title":"RelativeTruncation","text":"RelativeTruncation(u: Utility, rtol: float)\n
Bases: TruncationPolicy
Break a permutation if the marginal utility is too low.
This is called \"performance tolerance\" in (Ghorbani and Zou, 2019)1.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
rtol
Relative tolerance. The permutation is broken if the last computed utility is less than total_utility * rtol
.
TYPE: float
Source code in src/pydvl/value/shapley/truncated.py
def __init__(self, u: Utility, rtol: float):\n super().__init__()\n self.rtol = rtol\n logger.info(\"Computing total utility for permutation truncation.\")\n self.total_utility = self.reset(u)\n self._u = u\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.RelativeTruncation.__call__","title":"__call__","text":"__call__(idx: int, score: float) -> bool\n
Check whether the computation should be interrupted.
PARAMETER DESCRIPTION idx
Position in the permutation currently being computed.
TYPE: int
score
Last utility computed.
TYPE: float
RETURNS DESCRIPTION bool
True
if the computation should be interrupted.
Source code in src/pydvl/value/shapley/truncated.py
def __call__(self, idx: int, score: float) -> bool:\n \"\"\"Check whether the computation should be interrupted.\n\n Args:\n idx: Position in the permutation currently being computed.\n score: Last utility computed.\n\n Returns:\n `True` if the computation should be interrupted.\n \"\"\"\n ret = self._check(idx, score)\n self.n_calls += 1\n self.n_truncations += 1 if ret else 0\n return ret\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.BootstrapTruncation","title":"BootstrapTruncation","text":"BootstrapTruncation(u: Utility, n_samples: int, sigmas: float = 1)\n
Bases: TruncationPolicy
Break a permutation if the last computed utility is close to the total utility, measured as a multiple of the standard deviation of the utilities.
PARAMETER DESCRIPTION u
Utility object with model, data, and scoring function
TYPE: Utility
n_samples
Number of bootstrap samples to use to compute the variance of the utilities.
TYPE: int
sigmas
Number of standard deviations to use as a threshold.
TYPE: float
DEFAULT: 1
Source code in src/pydvl/value/shapley/truncated.py
def __init__(self, u: Utility, n_samples: int, sigmas: float = 1):\n super().__init__()\n self.n_samples = n_samples\n logger.info(\"Computing total utility for permutation truncation.\")\n self.total_utility = u(u.data.indices)\n self.count: int = 0\n self.variance: float = 0\n self.mean: float = 0\n self.sigmas: float = sigmas\n
"},{"location":"api/pydvl/value/shapley/truncated/#pydvl.value.shapley.truncated.BootstrapTruncation.__call__","title":"__call__","text":"__call__(idx: int, score: float) -> bool\n
Check whether the computation should be interrupted.
PARAMETER DESCRIPTION idx
Position in the permutation currently being computed.
TYPE: int
score
Last utility computed.
TYPE: float
RETURNS DESCRIPTION bool
True
if the computation should be interrupted.
Source code in src/pydvl/value/shapley/truncated.py
def __call__(self, idx: int, score: float) -> bool:\n \"\"\"Check whether the computation should be interrupted.\n\n Args:\n idx: Position in the permutation currently being computed.\n score: Last utility computed.\n\n Returns:\n `True` if the computation should be interrupted.\n \"\"\"\n ret = self._check(idx, score)\n self.n_calls += 1\n self.n_truncations += 1 if ret else 0\n return ret\n
"},{"location":"api/pydvl/value/shapley/types/","title":"Types","text":""},{"location":"api/pydvl/value/shapley/types/#pydvl.value.shapley.types","title":"pydvl.value.shapley.types","text":""},{"location":"api/pydvl/value/shapley/types/#pydvl.value.shapley.types.ShapleyMode","title":"ShapleyMode","text":" Bases: str
, Enum
Supported algorithms for the computation of Shapley values.
Todo
Make algorithms register themselves here.
"},{"location":"examples/","title":"Examples","text":""},{"location":"examples/#data-valuation","title":"Data valuation","text":" -
Shapley values
An introduction using the Spotify dataset, showcasing grouped datasets, and applying values to improve model performance and identify bogus data.
-
KNN Shapley
A showcase of a fast model-specific valuation method using the iris dataset.
-
Data utility learning
Learning a utility function from a few evaluations and using it to estimate the value of the remaining data.
-
Least Core
An alternative solution concept from game theory, illustrated on a classification problem.
-
Data OOB
A different and fast strategy for data valuation, using the out-of-bag error of a bagging model.
-
Faster Banzhaf values
Using Banzhaf values to estimate the value of data points in MNIST, and evaluating convergence speed of MSR.
"},{"location":"examples/#influence-functions","title":"Influence functions","text":" -
For CNNs
Detecting corrupted labels with influence functions on the ImageNet dataset.
-
For language models
Using the IMDB dataset for sentiment analysis and a fine-tuned BERT model.
-
For mislabeled data
Detecting corrupted labels using a synthetic dataset.
-
For outlier detection
Using the wine dataset
"},{"location":"examples/data_oob/","title":"Data OOB","text":" This notebook introduces the Data- OOB method, an implementation based on a publication from Kwon and Zou \" Data- OOB : Out-of-bag Estimate as a Simple and Efficient Data Value \" ICML 2023 , using pyDVL.
The objective of this paper is mainly to overcome the computational bottleneck of Shapley-based data valuation methods, which require fitting a significant number of models to accurately estimate marginal contributions. The algorithm computes data values from out-of-bag estimates using a bagging model.
The value can be interpreted as a partition of the OOB estimate, which was originally introduced to estimate the prediction error. This OOB estimate is given as:
\\[ \\sum_{i=1}^n\\frac{\\sum_{b=1}^{B}\\mathbb{1}(w_{bi}=0)T(y_i, \\hat{f}_b(x_i))}{\\sum_{b=1}^{B} \\mathbb{1} (w_{bi}=0)} \\] %autoreload\nfrom pydvl.utils import Dataset, Scorer, Seed, Utility, ensure_seed_sequence\nfrom pydvl.value import ValuationResult, compute_data_oob\n
We will work with the adult classification dataset from the UCI repository. The objective is to predict whether a person earns more than 50k a year based on a set of features such as age, education, occupation, etc.
With a helper function we download the data and obtain the following pandas dataframe, where the categorical features have been removed:
\nFound cached file: adult_data.pkl.\n
\n
data_adult.head()\n
age fnlwgt education-num capital-gain capital-loss hours-per-week income 0 39 77516 13 2174 0 40 <=50K 1 50 83311 13 0 0 13 <=50K 2 38 215646 9 0 0 40 <=50K 3 53 234721 7 0 0 40 <=50K 4 28 338409 13 0 0 40 <=50K data = Dataset.from_arrays(\n X=data_adult.drop(columns=[\"income\"]).values,\n y=data_adult.loc[:, \"income\"].cat.codes.values,\n random_state=random_state,\n)\n\nmodel = KNeighborsClassifier(n_neighbors=5)\n\nutility = Utility(model, data, Scorer(\"accuracy\", default=0.0))\n
n_estimators = [100, 500]\noob_values = [\n compute_data_oob(utility, n_est=n_est, max_samples=0.95, seed=random_state)\n for n_est in n_estimators\n]\n
The two results are stored in an array of ValuationResult objects. Here's their distribution. The left-hand side depicts value as it increases with rank and a 99% t-confidence interval. The right-hand side shows the histogram of values.
Observe how adding estimators reduces the variance of the values, but doesn't change their distribution much.
"},{"location":"examples/data_oob/#bagging-for-data-valuation","title":"Bagging for data valuation","text":""},{"location":"examples/data_oob/#setup","title":"Setup","text":" We begin by importing the main libraries and setting some defaults.
If you are reading this in the documentation, some boilerplate (including most plotting code) has been omitted for convenience."},{"location":"examples/data_oob/#computing-the-oob-values","title":"Computing the OOB values","text":" The main idea of Data-OOB is to take an existing classifier or regression model and compute a per-sample out-of-bag performance estimate via bagging.
For this example, we use a simple KNN classifier with \\(k=5\\) neighbours on the data and compute the data-oob values with two choices for the number of estimators in the bagging. For that we construct a Utility object using the Scorer class to specify the metric to use for the evaluation. Note how we pass a random seed to Dataset.from_arrays in order to ensure that we always get the same split when running this notebook multiple times. This will be particularly important when running the standard point removal experiments later.
We then use the compute_data_oob function to compute the data-oob values.
"},{"location":"examples/data_oob/#point-removal-experiments","title":"Point removal experiments","text":" The standard procedure for the evaluation of data valuation schemes is the point removal experiment. The objective is to measure the evolution of performance when the best/worst points are removed from the training set. This can be done with the function compute_removal_score , which takes precomputed values and computes the performance of the model as points are removed.
In order to test the true performance of Data-OOB, we repeat the whole procedure of computing the values and running the point removal experiment multiple times, including the splitting of the dataset into training and valuation sets. It is important to remember to pass the random state appropriately for full reproducibility.
"},{"location":"examples/influence_imagenet/","title":"For CNNs","text":"If you are reading this in the documentation, some boilerplate has been omitted for convenience. from pydvl.influence.torch import CgInfluence\nfrom pydvl.reporting.plots import plot_influence_distribution_by_label\nfrom sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score\n
label_names = {90: \"tables\", 100: \"boats\"}\ntrain_ds, val_ds, test_ds = load_preprocess_imagenet(\n train_size=0.8,\n test_size=0.1,\n keep_labels=label_names,\n downsampling_ratio=1,\n)\n\nprint(\"Normalised image dtype:\", train_ds[\"normalized_images\"][0].dtype)\nprint(\"Label type:\", type(train_ds[\"labels\"][0]))\nprint(\"Image type:\", type(train_ds[\"images\"][0]))\ntrain_ds.info()\n
Let's take a closer look at a few image samples
Let's now further pre-process the data and prepare for model training. The helper function process_io
converts the normalized images into tensors and the labels to the indices 0 and 1 to train the classifier.
def process_io(df: pd.DataFrame, labels: dict) -> Tuple[torch.Tensor, torch.Tensor]:\n x = df[\"normalized_images\"]\n y = df[\"labels\"]\n ds_label_to_model_label = {\n ds_label: idx for idx, ds_label in enumerate(labels.values())\n }\n x_nn = torch.stack(x.tolist()).to(DEVICE)\n y_nn = torch.tensor([ds_label_to_model_label[yi] for yi in y], device=DEVICE)\n return x_nn, y_nn\n\n\ntrain_x, train_y = process_io(train_ds, label_names)\nval_x, val_y = process_io(val_ds, label_names)\ntest_x, test_y = process_io(test_ds, label_names)\n\nbatch_size = 768\ntrain_data = DataLoader(TensorDataset(train_x, train_y), batch_size=batch_size)\ntest_data = DataLoader(TensorDataset(test_x, test_y), batch_size=batch_size)\nval_data = DataLoader(TensorDataset(val_x, val_y), batch_size=batch_size)\n
device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\nmodel_ft = new_resnet_model(output_size=len(label_names))\nmgr = TrainingManager(\n \"model_ft\",\n model_ft,\n nn.CrossEntropyLoss(),\n train_data,\n val_data,\n MODEL_PATH,\n device=device,\n)\n# Set use_cache=False to retrain the model\ntrain_loss, val_loss = mgr.train(n_epochs=50, use_cache=True)\n
plot_losses(Losses(train_loss, val_loss))\n
The confusion matrix and \(F_1\) score look good, especially considering the low resolution of the images and their complexity (they contain different objects).
pred_y_test = np.argmax(model_ft(test_x).cpu().detach(), axis=1).cpu()\nmodel_score = f1_score(test_y.cpu(), pred_y_test, average=\"weighted\")\n\ncm = confusion_matrix(test_y.cpu(), pred_y_test)\ndisp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names.values())\nprint(\"f1_score of model:\", model_score)\ndisp.plot();\n
\nf1_score of model: 0.9062805208898536\n
\n
influence_model = CgInfluence(mgr.model, mgr.loss, hessian_reg, progress=True)\ninfluence_model = influence_model.fit(train_data)\n
On the instantiated influence object, we can call the method influences , which takes some test data and some input dataset with labels (which typically is the training data, or a subset of it). The influence type will be up
. The other option, perturbation
, is beyond the scope of this notebook, but more info can be found in the notebook using the Wine dataset or in the documentation for pyDVL.
influences = influence_model.influences(test_x, test_y, train_x, train_y, mode=\"up\")\n
The output is a matrix of size test_set_length
x training_set_length
. Each row represents a test data point, and each column a training data point, so that entry \\((i,j)\\) represents the influence of training point \\(j\\) on test point \\(i\\) .
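For instance, here is a small sketch (ours, with an arbitrary test index) of how one might rank the training images by their influence on a single test image using this matrix:
import torch

test_idx = 0                          # hypothetical test image index
row = influences[test_idx]            # influence of every training point on this test point
order = torch.argsort(row)            # ascending: most negative influence first
print("Most negative influence:", order[:5].tolist())
print("Most positive influence:", order[-5:].tolist())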
Now we plot the histogram of the influence that all training images have on the image selected above, separated by their label.
Rather unsurprisingly, the training points with the highest influence have the same label. Now we can take the training images with the same label and show those with highest and lowest scores.
Looking at the images, it is difficult to explain why those on the right are more influential than those on the left. At first sight, the choice seems to be random (or at the very least noisy). Let's dig in a bit more by looking at average influences:
avg_influences = np.mean(influences.cpu().numpy(), axis=0)\n
Once again, let's plot the histogram of influence values by label.
Next, for each class (you can change the class by changing the label key) we can have a look at the top and bottom images by average influence, i.e. we can show the images that have the highest and lowest average influence over all test images.
Once again, it is not easy to explain why the images on the left have a lower influence than the ones on the right.
corrupted_model = new_resnet_model(output_size=len(label_names))\ncorrupted_dataset, corrupted_indices = corrupt_imagenet(\n dataset=train_ds,\n fraction_to_corrupt=0.1,\n avg_influences=avg_influences,\n)\n\ncorrupted_train_x, corrupted_train_y = process_io(corrupted_dataset, label_names)\ncorrupted_data = DataLoader(\n TensorDataset(corrupted_train_x, corrupted_train_y), batch_size=batch_size\n)\n\nmgr = TrainingManager(\n \"corrupted_model\",\n corrupted_model,\n nn.CrossEntropyLoss(),\n corrupted_data,\n val_data,\n MODEL_PATH,\n device=device,\n)\ntraining_loss, validation_loss = mgr.train(n_epochs=50, use_cache=True)\n
plot_losses(Losses(training_loss, validation_loss))\n
\nF1 score of model with corrupted data: 0.8541666666666666\n
\n
Interestingly, despite being trained on a corrupted dataset, the model has a fairly high \\(F_1\\) score. Let's now calculate the influence of the corrupted training data points over the test data points.
influence_model = CgInfluence(mgr.model, mgr.loss, hessian_reg, progress=True)\ninfluence_model = influence_model.fit(corrupted_data)\ninfluences = influence_model.influences(\n test_x, test_y, corrupted_train_x, corrupted_train_y\n)\n
As before, since we are interested in the average influence on the test dataset, we take the average of influences across rows, and then plot the highest and lowest influences for a chosen label
avg_corrupted_influences = np.mean(influences.cpu().numpy(), axis=0)\n
As expected, the samples with lowest (negative) influence for the label \"boats\" are those that have been corrupted: all the images on the left are tables! We can compare the average influence of corrupted data with non-corrupted ones
label avg_non_corrupted_infl avg_corrupted_infl score_diff 0 tables -0.405254 -12.999691 12.594438 1 boats -0.544211 -13.080050 12.535838 And indeed corrupted data have a more negative influence on average than clean ones!
Despite this being a useful property, influence functions are known to be unreliable for tasks of data valuation, especially in deep learning where the fundamental assumption of the theory (convexity) is grossly violated. A lot of factors (e.g. the size of the network, the training process or the Hessian regularization term) can interfere with the computation, to the point that often the results that we obtain cannot be trusted. This has been extensively studied in the recent paper:
Basu, S., P. Pope, and S. Feizi. Influence Functions in Deep Learning Are Fragile. International Conference on Learning Representations (ICLR). 2021 .
Nevertheless, influence functions offer a relatively quick and mathematically rigorous way to evaluate (at first order) the importance of a training point for a model's prediction.
"},{"location":"examples/influence_imagenet/#influence-functions-for-neural-networks","title":"Influence functions for neural networks","text":" This notebook explores the use of influence functions for convolutional neural networks. In the first part we will investigate the usefulness, or lack thereof, of influence functions for the interpretation of a classifier's outputs.
For our study we choose a pre-trained ResNet18, fine-tuned on the tiny-imagenet dataset . This dataset was created for a Stanford course on Deep Learning for Computer Vision , and is a subset of the famous ImageNet with 200 classes instead of 1000, and images down-sampled to a lower resolution of 64x64 pixels.
After tuning the last layers of the network, we will use pyDVL to find the most and the least influential training images for the test set. This can sometimes be used to explain inference errors, or to direct efforts during data collection, although we will face inconclusive results with our model and data. This illustrates well-known issues of influence functions for neural networks.
However, in the final part of the notebook we will see that influence functions are an effective tool for finding anomalous or corrupted data points.
We conclude with an appendix with some basic theoretical concepts used.
"},{"location":"examples/influence_imagenet/#imports-and-setup","title":"Imports and setup","text":""},{"location":"examples/influence_imagenet/#loading-and-preprocessing-the-dataset","title":"Loading and preprocessing the dataset","text":" We pick two classes arbitrarily to work with: 90 and 100, corresponding respectively to dining tables, and boats in Venice (you can of course select any other two classes, or more of them, although that would imply longer training times and some modifications in the notebook below). The dataset is loaded with load_preprocess_imagenet()
, which returns three pandas DataFrames
with training, validation and test sets respectively. Each dataframe has three columns: normalized images, labels and the original images. Note that you can load a subset of the data by decreasing downsampling_ratio.
"},{"location":"examples/influence_imagenet/#model-definition-and-training","title":"Model definition and training","text":" We use a ResNet18 from torchvision
with final layers modified for binary classification.
For training, we use the convenience class TrainingManager
which transparently handles persistence after training. It is not part of the main pyDVL package but just a way to reduce clutter in this notebook.
We train the model for 50 epochs and save the results. Then we plot the train and validation loss curves.
"},{"location":"examples/influence_imagenet/#influence-computation","title":"Influence computation","text":" Let's now calculate influences! The central interface for computing influences is InfluenceFunctionModel . Since Resnet18 is quite big, we pick the conjugate gradient implementation CgInfluence , which takes a trained torch.nn.Module , the training loss and the training data. Other important parameters are the Hessian regularization term, which should be chosen as small as possible for the computation to converge (further details on why this is important can be found in the Appendix ).
"},{"location":"examples/influence_imagenet/#analysing-influences","title":"Analysing influences","text":" With the computed influences we can study single images or all of them together:
"},{"location":"examples/influence_imagenet/#influence-on-a-single-test-image","title":"Influence on a single test image","text":" Let's take any image in the test set:
"},{"location":"examples/influence_imagenet/#analysing-the-average-influence-on-test-samples","title":"Analysing the average influence on test samples","text":" By averaging across the rows of the influence matrix, we obtain the average influence of each training sample on the whole test set:
"},{"location":"examples/influence_imagenet/#detecting-corrupted-data","title":"Detecting corrupted data","text":" After facing the shortcomings of influence functions for explaining decisions, we move to an application with clear-cut results. Influences can be successfully used to detect corrupted or mislabeled samples, making them an effective tool to \"debug\" training data.
We begin by training a new model (with the same architecture as before) on a dataset with some corrupted labels. The method get_corrupted_imagenet
will take the training dataset and corrupt a certain fraction of the labels by flipping them. We use the same number of epochs and optimizer as before.
"},{"location":"examples/influence_imagenet/#theory-of-influence-functions-for-neural-networks","title":"Theory of influence functions for neural networks","text":" In this appendix we will briefly go through the basic ideas of influence functions adapted for neural networks as introduced in Koh, Pang Wei, and Percy Liang. \"Understanding Black-box Predictions via Influence Functions\" International conference on machine learning. PMLR, 2017.
Note however that this paper departs from the standard and established theory and notation for influence functions. For a rigorous introduction to the topic we recommend classical texts like Hampel, Frank R., Elvezio M. Ronchetti, Peter J. Rousseeuw, and Werner A. Stahel. Robust Statistics: The Approach Based on Influence Functions. 1st edition. Wiley Series in Probability and Statistics. New York: Wiley-Interscience, 2005. https://doi.org/10.1002/9781118186435.
"},{"location":"examples/influence_imagenet/#upweighting-points","title":"Upweighting points","text":" Let's start by considering some input space \\(\\mathcal{X}\\) to a model (e.g. images) and an output space \\(\\mathcal{Y}\\) (e.g. labels). Let's take \\(z_i = (x_i, y_i)\\) to be the \\(i\\) -th training point, and \\(\\theta\\) to be the (potentially highly) multi-dimensional parameters of the neural network (i.e. \\(\\theta\\) is a big array with very many parameters). We will indicate with \\(L(z, \\theta)\\) the loss of the model for point \\(z\\) and parameters \\(\\theta\\) . When training the model we minimize the loss over all points, i.e. the optimal parameters are calculated through gradient descent on the following formula:
\\[ \\hat{\\theta} = \\arg \\min_\\theta \\frac{1}{n}\\sum_{i=1}^n L(z_i, \\theta) \\] where \\(n\\) is the total number of training data points.
For notational convenience, let's define
\\[ \\hat{\\theta}_{-z} = \\arg \\min_\\theta \\frac{1}{n}\\sum_{z_i \\ne z} L(z_i, \\theta) \\ , \\] i.e. \\(\\hat{\\theta}_{-z}\\) are the model parameters that minimize the total loss when \\(z\\) is not in the training dataset.
In order to check the impact of each training point on the model, we would need to calculate \\(\\hat{\\theta}_{-z}\\) for each \\(z\\) in the training dataset, thus re-training the model at least ~ \\(n\\) times (more if model training is noisy). This is computationally very expensive, especially for big neural networks. To circumvent this problem, we can just calculate a first order approximation of \\(\\hat{\\theta}\\) . This can be done through single backpropagation and without re-training the full model.
Let's define
\\[ \\hat{\\theta}_{\\epsilon, z} = \\arg \\min_\\theta \\frac{1}{n}\\sum_{i=1}^n L(z_i, \\theta) + \\epsilon L(z, \\theta) \\ , \\] which is the optimal \\(\\hat{\\theta}\\) if we were to up-weigh \\(z\\) by an amount \\(\\epsilon\\) .
From a classical result (a simple derivation is available in Appendix A of Koh and Liang's paper), we know that:
\\[ \\frac{d \\ \\hat{\\theta}_{\\epsilon, z}}{d \\epsilon} \\Big|_{\\epsilon=0} = -H_{\\hat{\\theta}}^{-1} \\nabla_\\theta L(z, \\hat{\\theta}) \\] where \\(H_{\\hat{\\theta}} = \\frac{1}{n} \\sum_{i=1}^n \\nabla_\\theta^2 L(z_i, \\hat{\\theta})\\) is the Hessian of \\(L\\) . Importantly, notice that this expression is only valid when \\(\\hat{\\theta}\\) is a minimum of \\(L\\) , or otherwise \\(H_{\\hat{\\theta}}\\) cannot be inverted!
"},{"location":"examples/influence_imagenet/#approximating-the-influence-of-a-point","title":"Approximating the influence of a point","text":" We will define the influence of training point \\(z\\) on test point \\(z_{\\text{test}}\\) as \\(\\mathcal{I}(z, z_{\\text{test}}) = L(z_{\\text{test}}, \\hat{\\theta}_{-z}) - L(z_{\\text{test}}, \\hat{\\theta})\\) (notice that it is higher for points \\(z\\) which positively impact the model score, since if they are excluded, the loss is higher). In practice, however, we will always use the infinitesimal approximation \\(\\mathcal{I}_{up}(z, z_{\\text{test}})\\) , defined as
\\[ \\mathcal{I}_{up}(z, z_{\\text{test}}) = - \\frac{d L(z_{\\text{test}}, \\hat{\\theta}_{\\epsilon, z})}{d \\epsilon} \\Big|_{\\epsilon=0} \\] Using the chain rule and the results calculated above, we thus have:
\\[ \\mathcal{I}_{up}(z, z_{\\text{test}}) = - \\nabla_\\theta L(z_{\\text{test}}, \\hat{\\theta})^\\top \\ \\frac{d \\hat{\\theta}_{\\epsilon, z}}{d \\epsilon} \\Big|_{\\epsilon=0} = \\nabla_\\theta L(z_{\\text{test}}, \\hat{\\theta})^\\top \\ H_{\\hat{\\theta}}^{-1} \\ \\nabla_\\theta L(z, \\hat{\\theta}) \\] In order to calculate this expression we need the gradient and the Hessian of the loss wrt. the model parameters \\(\\hat{\\theta}\\) . This can be easily done through a single backpropagation pass.
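To make this concrete, here is a tiny self-contained sketch (ours, not pyDVL code, with purely illustrative names) that evaluates this expression exactly for a small logistic regression problem:
import torch

torch.manual_seed(0)
n, d = 32, 3
X = torch.randn(n, d)
y = (X @ torch.tensor([1.0, -1.0, 0.5]) > 0).float()
theta = torch.zeros(d, requires_grad=True)   # pretend these are the trained parameters

def loss(t, x, targets):
    return torch.nn.functional.binary_cross_entropy_with_logits(x @ t, targets)

H = torch.autograd.functional.hessian(lambda t: loss(t, X, y), theta)    # d x d Hessian of the mean loss
grad_test = torch.autograd.grad(loss(theta, X[:1], y[:1]), theta)[0]     # gradient at a "test" point
grad_train = torch.autograd.grad(loss(theta, X[1:2], y[1:2]), theta)[0]  # gradient at a training point

i_up = grad_test @ torch.linalg.solve(H, grad_train)   # grad_test^T H^{-1} grad_train
print(i_up.item())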
"},{"location":"examples/influence_imagenet/#regularizing-the-hessian","title":"Regularizing the Hessian","text":" One very important assumption that we make when approximating influence is that \\(\\hat{\\theta}\\) is at least a local minimum of the loss. However, we clearly cannot guarantee this except for convex models, and despite good apparent convergence, \\(\\hat{\\theta}\\) might be located in a region with flat curvature or close to a saddle point. In particular, the Hessian might have vanishing eigenvalues making its direct inversion impossible.
To circumvent this problem, instead of inverting the true Hessian \\(H_{\\hat{\\theta}}\\) , one can invert a small perturbation thereof: \\(H_{\\hat{\\theta}} + \\lambda \\mathbb{I}\\) , with \\(\\mathbb{I}\\) being the identity matrix. This standard trick ensures that the eigenvalues of \\(H_{\\hat{\\theta}}\\) are bounded away from zero and therefore the matrix is invertible. In order for this regularization not to corrupt the outcome too much, the parameter \\(\\lambda\\) should be as small as possible while still allowing a reliable inversion of \\(H_{\\hat{\\theta}} + \\lambda \\mathbb{I}\\) .
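The effect of the regularizer is easy to see numerically. In this toy sketch (ours), the smallest eigenvalue of the perturbed matrix is bounded below by \(\lambda\) whenever the Hessian is positive semi-definite:
import torch

H = torch.tensor([[1.0, 0.0], [0.0, 1e-9]])   # nearly singular "Hessian"
lam = 1e-3
H_reg = H + lam * torch.eye(2)
print(torch.linalg.eigvalsh(H).min().item())      # ~1e-9: inversion is ill-conditioned
print(torch.linalg.eigvalsh(H_reg).min().item())  # ~1e-3: safely bounded away from zero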
"},{"location":"examples/influence_sentiment_analysis/","title":"For language models","text":" This notebooks showcases the use of influence functions for large language models. In particular, it focuses on sentiment analysis using the IMDB dataset and a fine-tuned BERT model.
Not all the methods for influence function calculation can scale to large models and datasets. In this notebook we will use the Kronecker-Factored Approximate Curvature method, which is the only one that can scale to current state-of-the-art language models.
The notebook is structured as follows:
- Setup imports the required libraries and downloads the dataset and the model.
- Sentiment analysis loads the model and the dataset and goes through a few examples of sentiment analysis.
- Model and data preparation prepares the model and the dataset for influence function calculation. In particular, it assigns all the linear layers to require gradients and wraps the model so that only logits are returned (and not the loss or attention masks).
- Influence function computation : shows how to calculate the influence function for a few test and train examples.
- Analysis of influence values : analyses the influence values, trying to extract general information about the model and how it is affected by corruption in the training data.
- Influence functions by layer : since ekfac is based on a block diagonal approximation of the Fisher information matrix, we can compute the influence function separately for each layer of the neural network. This section shows how to do that and how to analyse the results.
Finally, the Appendix shows how to select the Hessian regularization parameter to obtain the best influence function approximation.
If you are reading this in the documentation, some boilerplate has been omitted for convenience. Let's start by importing the required libraries. If not already installed, you can install them with pip install -r requirements-notebooks.txt
.
import os\nfrom copy import deepcopy\nfrom typing import Sequence\n\nimport matplotlib.pyplot as plt\nimport torch\nimport torch.nn.functional as F\nfrom datasets import load_dataset\nfrom IPython.display import HTML, display\nfrom sklearn.metrics import f1_score\nfrom transformers import AutoModelForSequenceClassification, AutoTokenizer\n\nfrom pydvl.influence.torch import EkfacInfluence\nfrom support.torch import ImdbDataset, ModelLogitsWrapper\n
Sentiment analysis is the task of classifying a sentence as having a positive or negative sentiment. For example, the sentence \"I love this movie\" has a positive sentiment, while \"I hate this movie\" has a negative sentiment. In this notebook we will use the IMDB dataset, which contains 50,000 movie reviews with corresponding labels. The dataset is split into 25,000 reviews for training and 25,000 reviews for testing. The dataset is balanced, meaning that there are the same number of positive and negative reviews in the training and test set.
imdb = load_dataset(\"imdb\")\n
Let's print an example of review and its label
sample_review = imdb[\"train\"].select([24])\n\nprint(f\"Here is a sample review with label {sample_review['label'][0]}: \\n\")\n\ndisplay(HTML(sample_review[\"text\"][0].split(\"<br/>\")[0]))\ndisplay(HTML(sample_review[\"text\"][0].split(\"<br/>\")[-1]))\n
\nHere is a sample review with label 0: \n\n
\n
Without wishing to be a killjoy, Brad Sykes is responsible for at least two of the most dull and clich\u00e9d films i've ever seen - this being one of them, and Camp Blood being another. I bought this for \u00a31, but remember, you can't put a price on 71 minutes of your life. You'd do well to avoid this turkey, even at a bargain basement price. The review is negative, and so label 0 is associated to negative sentiment.
The model is a BERT model fine-tuned on the IMDB dataset. BERT is a large language model that has been pre-trained on a large corpus of text. The model was fine-tuned on the IMDB dataset by AssemblyAI and is available on the HuggingFace model hub. We also load its tokenizer, which is used to convert sentences into numeric tokens.
tokenizer = AutoTokenizer.from_pretrained(\"assemblyai/distilbert-base-uncased-sst2\")\nmodel = AutoModelForSequenceClassification.from_pretrained(\n \"assemblyai/distilbert-base-uncased-sst2\"\n)\n
Even if the model is trained on movie reviews, it can be used to classify any sentence as positive or negative. Let's try it on a simple sentence created by us.
example_phrase = (\n \"Pydvl is the best data valuation library, and it is fully open-source!\"\n)\n\ntokenized_example = tokenizer(\n [example_phrase],\n return_tensors=\"pt\",\n truncation=True,\n)\n\nmodel_output = model(\n input_ids=tokenized_example.input_ids,\n)\n
The model output is a SequenceClassificationOutput
object, which contains the logits and other information.
\nModel Output:\n SequenceClassifierOutput(loss=None, logits=tensor([[-2.6237, 2.8350]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)\n
\n
For calculating probabilities and for the influence functions we only need the logits. Then the softmax function converts the logits into probabilities.
model_predictions = F.softmax(model_output.logits, dim=1)\n
The model is quite confident that the sentence has a positive sentiment, which is correct.
\nPositive probability: 99.6%\nNegative probability: 0.4%\n
\n
Let's examine the model's f1 score on a small subset of the test set.
sample_test_set = imdb[\"test\"].shuffle(seed=seed).select(range(50 if not is_CI else 5))\nsample_test_set = sample_test_set.map(\n lambda example: tokenizer(example[\"text\"], truncation=True, padding=\"max_length\"),\n batched=True,\n)\nsample_test_set.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"label\"])\nmodel.eval()\nwith torch.no_grad():\n logits = model(\n input_ids=sample_test_set[\"input_ids\"],\n attention_mask=sample_test_set[\"attention_mask\"],\n ).logits\n predictions = torch.argmax(logits, dim=1)\n
f1_score_value = f1_score(sample_test_set[\"label\"], predictions)\nprint(f\"F1 Score: {round(f1_score_value, 3)}\")\n
\nF1 Score: 0.955\n
\n
In this section we will define two helper functions that will be used in the rest of the notebook.
def print_sentiment_preds(\n model: ModelLogitsWrapper, model_input: torch.Tensor, true_label: int\n):\n \"\"\"\n Prints the sentiment predictions in a human-readable format given a model and an\n input. It also prints the true label.\n \"\"\"\n model_predictions = F.softmax(model(model_input.unsqueeze(0)), dim=1)\n print(\n \"Positive probability: \"\n + str(round(model_predictions[0][1].item(), 3) * 100)\n + \"%\"\n )\n print(\n \"Negative probability: \"\n + str(round(model_predictions[0][0].item(), 3) * 100)\n + \"%\"\n )\n\n true_label = \"Positive\" if true_label == 1 else \"Negative\"\n print(f\"True label: {true_label} \\n\")\n\n\ndef strip_layer_names(param_names: Sequence[str]):\n \"\"\"\n Helper function that strips the parameter names of the model and the transformer,\n so that they can be printed and compared more easily.\n \"\"\"\n stripped_param_names = []\n for name in param_names:\n name = name.replace(\"model.\", \"\")\n if name.startswith(\"distilbert.transformer.\"):\n name = name.replace(\"distilbert.transformer.\", \"\")\n stripped_param_names.append(name)\n return stripped_param_names\n
Importantly, we will need to assign all the linear layers to require gradients, so that we can compute the influence function with respect to them. Keep in mind that the current implementation of Ekfac only supports linear layers, so if any other type of layer in the model requires gradients the initialisation of the influence function class will fail.
for param in model.named_parameters():\n param[1].requires_grad = False\n\nfor m_name, module in model.named_modules():\n if len(list(module.children())) == 0 and len(list(module.parameters())) > 0:\n if isinstance(module, torch.nn.Linear):\n for p_name, param in module.named_parameters():\n if (\n (\"ffn\" in m_name and not is_CI)\n or \"pre_classifier\" in m_name\n or \"classifier\" in m_name\n ):\n param.requires_grad = True\n
Although this is restrictive, linear layers constitute a large fraction of the parameters of most large language models, and so our analysis still captures a lot of information about the full neural network.
\nTotal parameters: 66.96 millions\nParameters requiring gradients: 28.93 millions\nRatio of Linear over other layer types: 43.20%\n
\n
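Counts like the ones above can be obtained directly from the model defined earlier; a minimal sketch (ours):
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total / 1e6:.2f} millions")
print(f"Parameters requiring gradients: {trainable / 1e6:.2f} millions")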
We are now ready to compute the influence function for a few testing and training examples. Let's start by selecting a subset of the full training and testing dataset and wrapping them in a DataLoader
object, so that we can easily do batching.
NUM_TRAIN_EXAMPLES = 100 if not is_CI else 7\nNUM_TEST_EXAMPLES = 100 if not is_CI else 5\n\nsmall_train_dataset = (\n imdb[\"train\"]\n .shuffle(seed=seed)\n .select([i for i in list(range(NUM_TRAIN_EXAMPLES))])\n)\nsmall_test_dataset = (\n imdb[\"test\"].shuffle(seed=seed).select([i for i in list(range(NUM_TEST_EXAMPLES))])\n)\n\ntrain_dataset = ImdbDataset(small_train_dataset, tokenizer=tokenizer)\ntest_dataset = ImdbDataset(small_test_dataset, tokenizer=tokenizer)\n\ntrain_dataloader = torch.utils.data.DataLoader(\n train_dataset, batch_size=7, shuffle=True\n)\ntest_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=5, shuffle=True)\n
For influence computation we need to put the model in evaluation mode, so that no dropout or batch normalization is applied. Then, we can fit the EKFAC representation.
wrapped_model = ModelLogitsWrapper(model)\nwrapped_model.eval()\n\nekfac_influence_model = EkfacInfluence(\n wrapped_model,\n progress=True,\n)\nekfac_influence_model = ekfac_influence_model.fit(train_dataloader)\n
\nK-FAC blocks - batch progress: 0%| | 0/15 [00:00<?, ?it/s]
\n
And the approximate Hessian is thus obtained. Considering that the model has almost 30 million parameters requiring gradients, this was very fast! Of course, this Hessian is computed using only a very small fraction (~0.4%) of the training data, and for a better approximation we should use a larger subset.
Before continuing, we need to set the Hessian regularization parameter to an appropriate value. A way to decide which is better can be found in the Appendix . Here, we will just set it to 1e-5.
ekfac_influence_model.hessian_regularization = 1e-5\n
We calculate the influence of the first batch of training data over the first batch of test data. This is because influence functions are very expensive to compute, and so to keep the runtime of this notebook within a few minutes we need to restrict ourselves to a small number of examples.
test_input, test_labels, test_text = next(iter(test_dataloader))\ntrain_input, train_labels, train_text = next(iter(train_dataloader))\n
And let's finally compute the influence function values
ekfac_train_influences = ekfac_influence_model.influences(\n test_input,\n test_labels,\n train_input,\n train_labels,\n)\n
\n/home/jakob/Documents/pyDVL/venv/lib/python3.10/site-packages/transformers/models/distilbert/modeling_distilbert.py:222: UserWarning: There is a performance drop because we have not yet implemented the batching rule for aten::masked_fill.Tensor. Please file us an issue on GitHub so that we can prioritize its implementation. (Triggered internally at ../aten/src/ATen/functorch/BatchedFallback.cpp:82.)\n scores = scores.masked_fill(\n
\n
Now that we have calculated the influences for a few examples, let's analyse some of the extreme values.
Let's plot the influence values as a heatmap for easily spotting patterns.
Most of the test and training examples have similar influence, close to zero. However, there is one test and one training samples that stand out. In particular, their cross influence is very large and negative. Let's examine them more closely.
\nTraining example with idx 3: \n\nPositive probability: 18.099999999999998%\nNegative probability: 81.89999999999999%\nTrue label: Positive \n\nSentence:\n
\n
In the process of trying to establish the audiences' empathy with Jake Roedel (Tobey Maguire) the filmmakers slander the North and the Jayhawkers. Missouri never withdrew from the Union and the Union Army was not an invading force. The Southerners fought for State's Rights: the right to own slaves, elect crooked legislatures and judges, and employ a political spoils system. There's nothing noble in that. The Missourians could have easily traveled east and joined the Confederate Army. It seems to me that the story has nothing to do with ambiguity. When Jake leaves the Bushwhackers, it's not because he saw error in his way, he certainly doesn't give himself over to the virtue of the cause of abolition. We can see that, despite being positive, this review is quite hard to classify. Its language is overall negative, mostly associated to the facts narrated rather than the movie itself. Notice how several terms are related to war and invasion.
\nTest example with idx 4: \n\nPositive probability: 39.6%\nNegative probability: 60.4%\nTrue label: Negative \n\nSentence:\n
\n
\"An astronaut (Michael Emmet) dies while returning from a mission and his body is recovered by the military. The base where the dead astronaut is taken to becomes the scene of a bizarre invasion plan from outer space. Alien embryos inside the dead astronaut resurrect the corpse and begin a terrifying assault on the military staff in the hopes of conquering the world,\" according to the DVD sleeve's synopsis. A Roger Corman \"American International\" production. The man who fell to Earth impregnated, Mr. Emmet (as John Corcoran), does all right. Angela Greene is his pretty conflicted fianc\u00e9e. And, Ed Nelson (as Dave Randall) is featured as prominently. With a bigger budget, better opening, and a re-write for crisper characterizations, this could have been something approaching classic 1950s science fiction. *** Night of the Blood Beast (1958) Bernard L. Kowalski, Roger Corman ~ Michael Emmet, Angela Greene, Ed Nelson This review is also quite hard to classify. This time it has a negative sentiment towards the movie, but it also contains several words with positive connotation. The parallel with the previous review is quite interesting since both talk about an invasion.
As is often the case when analysing influence functions, it is hard to understand why these examples have such a large influence. We have seen some interesting patterns, mostly related to similarities in the language and words used, but it is hard to say with certainty whether these are the reasons for such a large influence.
A recent paper has explored this topic in great detail, even for much larger language models than BERT (up to ~50 billion parameters!). Among the most interesting findings is that smaller models tend to rely a lot on word-to-word correspondences, while larger models are more capable of extracting higher-level concepts, drawing connections between words across multiple phrases.
For more info, you can visit our blog on influence functions for large language models
In this section we want to get an idea of how influence functions change when training examples are corrupted. In the next cell we will flip the labels of all the training examples and compute the influences on the same test batch as before.
modified_train_labels = deepcopy(train_labels)\nmodified_train_labels = 1 - train_labels\n\ncorrupted_ekfac_train_influences = ekfac_influence_model.influences(\n test_input,\n test_labels,\n train_input,\n modified_train_labels,\n)\n
Overall, when the labels are corrupted the influences tend to become negative, as expected. Nevertheless, there are cases where values go from slightly negative to positive, mostly isolated to the second and last test samples. Single values can be quite noisy, so it is difficult to generalise this result, but it would be interesting to see how common these cases are in the full test dataset.
Since ekfac is based on a block diagonal approximation of the Fisher information matrix, we can compute the influence functions separately for each layer of the neural network. In this section we show how to do that and we briefly analyse the results.
influences_by_layer = ekfac_influence_model.influences_by_layer(\n test_input,\n test_labels,\n train_input,\n train_labels,\n)\n
The method influences_by_layer
returns a dictionary containing the influence function values for each layer of the neural network as a tensor. To recover the full influence values as returned by the influences
(as done in the previous section), we need to sum each layer's values.
influences = torch.zeros_like(ekfac_train_influences)\nfor layer_id, value in influences_by_layer.items():\n influences += value.detach()\n
And if we plot the result as a heatmap we can see that the results are the same as in Negative influence training examples
Let's analyse how the influence values change across different layers for given test and train examples.
The plot above shows the influences for test idx 0 and all train indices apart from idx=3 (excluded for clarity since it has a very large absolute value). We can see that the scores tend to keep their sign across layers, but in almost all cases they tend to decrease when approaching the output layer. This is not always the case, and in fact other test examples show different patterns. Understanding why this happens is an interesting research direction.
Ekfac is a powerful approximate method for computing the influence function of models that use a cross-entropy loss. In this notebook we applied it to sentiment analysis with BERT on the IMDB dataset. However, this method can be applied to much larger models and problems, e.g. to analyse the influence of entire sentences generated by GPT, Llama or Claude. For more info, you can visit our paper pill on influence functions for large language models
The Hessian regularization value has a large impact on the quality of the influence function approximation. In general, the value should be chosen as small as possible so that the results are finite. In practice, even when finite, the influence values can be too large and lead to numerical instabilities. In this section we show how to efficiently analyse the impact of the Hessian regularization value with the EKFAC method.
Let's start with a few additional imports.
import pandas as pd\nfrom scipy.stats import pearsonr, spearmanr\n
The method explore_hessian_regularization
will calculate the influence values of the training examples with each other for a range of Hessian regularization values. The method optimises gradient calculation and Hessian inversion to minimise the computation time.
influences_by_reg_value = ekfac_influence_model.explore_hessian_regularization(\n train_input,\n train_labels,\n regularization_values=[1e-15, 1e-9, 1e-5, 1],\n)\n
\n/home/jakob/Documents/pyDVL/venv/lib/python3.10/site-packages/transformers/models/distilbert/modeling_distilbert.py:222: UserWarning: There is a performance drop because we have not yet implemented the batching rule for aten::masked_fill.Tensor. Please file us an issue on GitHub so that we can prioritize its implementation. (Triggered internally at ../aten/src/ATen/functorch/BatchedFallback.cpp:82.)\n scores = scores.masked_fill(\n
\n
The resulting object, influences_by_reg_value
is a dictionary that associates to each regularization value the influences for each layer of the neural network. This is a lot of data, so we will first organise it in a pandas dataframe and take the average across training examples.
cols = [\"reg_value\", \"layer_id\", \"mean_infl\"]\ninfl_df = pd.DataFrame(influences_by_reg_value, columns=cols)\nfor reg_value in influences_by_reg_value:\n for layer_id, layer_influences in influences_by_reg_value[reg_value].items():\n mean_infl = torch.mean(layer_influences, dim=0).detach().numpy()\n infl_df = pd.concat(\n [infl_df, pd.DataFrame([[reg_value, layer_id, mean_infl]], columns=cols)]\n )\n
\n/tmp/ipykernel_8503/1081261490.py:6: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n infl_df = pd.concat(\n
\n
With this dataframe, we can take contiguous values of regularization and, for each layer, calculate the Pearson and Spearman correlation coefficients. This will give us an idea of how the influence values change with the regularization value.
result_corr = {}\nfor layer_id, group_df in infl_df.groupby(\"layer_id\"):\n result_corr[layer_id + \"_pearson\"] = {}\n result_corr[layer_id + \"_spearman\"] = {}\n for idx, mean_infl in enumerate(group_df[\"mean_infl\"]):\n if idx == 0:\n continue\n reg_value_diff = f\"Reg: {group_df['reg_value'].iloc[idx-1]} -> {group_df['reg_value'].iloc[idx]}\"\n pearson = pearsonr(mean_infl, group_df[\"mean_infl\"].iloc[idx - 1]).statistic\n spearman = spearmanr(mean_infl, group_df[\"mean_infl\"].iloc[idx - 1]).statistic\n result_corr[layer_id + \"_pearson\"].update({f\"{reg_value_diff}\": pearson})\n result_corr[layer_id + \"_spearman\"].update({f\"{reg_value_diff}\": spearman})\nresult_df = pd.DataFrame(result_corr).T\n
Let's plot the correlations heatmap. The y-axis reports Spearman and Pearson correlations for each layer, while the x-axis reports pairs of regularization values. High correlations mean that influences are stable across regularization values.
In our case, we can see that for regularization = 1 the Spearman correlation degrades sharply. However, for a large range of regularization values smaller than 1 the sample rankings are stable. This is a good indicator that the model is not too sensitive to the regularization value. We therefore chose the value 1e-5 for our analysis.
"},{"location":"examples/influence_sentiment_analysis/#influence-functions-for-large-language-models","title":"Influence functions for Large Language Models","text":""},{"location":"examples/influence_sentiment_analysis/#setup","title":"Setup","text":""},{"location":"examples/influence_sentiment_analysis/#sentiment-analysis","title":"Sentiment Analysis","text":""},{"location":"examples/influence_sentiment_analysis/#model-and-data-preparation","title":"Model and Data Preparation","text":""},{"location":"examples/influence_sentiment_analysis/#influence-function-computation","title":"Influence function computation","text":""},{"location":"examples/influence_sentiment_analysis/#analysis-of-influence-values","title":"Analysis of influence values","text":""},{"location":"examples/influence_sentiment_analysis/#negative-influence-training-examples","title":"Negative influence training examples","text":""},{"location":"examples/influence_sentiment_analysis/#influence-of-corrupted-training-examples","title":"Influence of corrupted training examples","text":""},{"location":"examples/influence_sentiment_analysis/#influence-functions-by-layer","title":"Influence functions by layer","text":""},{"location":"examples/influence_sentiment_analysis/#conclusion","title":"Conclusion","text":""},{"location":"examples/influence_sentiment_analysis/#appendix-choosing-the-hessian-regularization-value","title":"Appendix: Choosing the Hessian regularization value","text":""},{"location":"examples/influence_synthetic/","title":"For mislabeled data","text":"If you are reading this in the documentation, some boilerplate has been omitted for convenience. %autoreload\n%matplotlib inline\n\nimport os\nimport random\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nimport matplotlib.pyplot as plt\nfrom pydvl.influence.torch import DirectInfluence, CgInfluence\nfrom support.shapley import (\n synthetic_classification_dataset,\n decision_boundary_fixed_variance_2d,\n)\nfrom support.common import (\n plot_gaussian_blobs,\n plot_losses,\n plot_influences,\n)\nfrom support.torch import (\n fit_torch_model,\n TorchLogisticRegression,\n)\nfrom sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\nfrom torch.optim import AdamW, lr_scheduler\nfrom torch.utils.data import DataLoader, TensorDataset\n
The following code snippet generates the aforementioned dataset.
train_data, val_data, test_data = synthetic_classification_dataset(\n means, sigma, num_samples, train_size=0.7, test_size=0.2\n)\n
Given the simplicity of the dataset, we can calculate the optimal decision boundary exactly (the one that maximizes our accuracy). The following code maps a continuous line of z values to 2-dimensional vectors in feature space (more details are in the appendix to this notebook).
decision_boundary_fn = decision_boundary_fixed_variance_2d(means[0], means[1])\ndecision_boundary = decision_boundary_fn(np.linspace(-1.5, 1.5, 100))\n
plot_gaussian_blobs(\n train_data,\n test_data,\n xlabel=\"$x_0$\",\n ylabel=\"$x_1$\",\n legend_title=\"$y - labels$\",\n line=decision_boundary,\n s=10,\n suptitle=\"Plot of train-test data\",\n)\n
Note that there are samples which lie across the optimal decision boundary and will be wrongly classified. The optimal decision boundary cannot discriminate these, since the overlap is a consequence of the random noise in the data.
model = TorchLogisticRegression(num_features)\ndevice = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\nmodel.to(device)\n\nnum_epochs = 50\nlr = 0.05\nweight_decay = 0.05\nbatch_size = 256\n\ntrain_data_loader = DataLoader(\n TensorDataset(\n torch.as_tensor(train_data[0]),\n torch.as_tensor(train_data[1], dtype=torch.float64).unsqueeze(-1),\n ),\n batch_size=batch_size,\n shuffle=True,\n)\n\nval_data_loader = DataLoader(\n TensorDataset(\n torch.as_tensor(val_data[0]),\n torch.as_tensor(val_data[1], dtype=torch.float64).unsqueeze(-1),\n ),\n batch_size=batch_size,\n shuffle=True,\n)\n\noptimizer = AdamW(params=model.parameters(), lr=lr, weight_decay=weight_decay)\nscheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)\nlosses = fit_torch_model(\n model=model,\n training_data=train_data_loader,\n val_data=val_data_loader,\n loss=F.binary_cross_entropy,\n optimizer=optimizer,\n scheduler=scheduler,\n num_epochs=num_epochs,\n device=device,\n)\n
And let's check that the model is not overfitting
plot_losses(losses)\n
A look at the confusion matrix also shows good results
It is important that the model converges to a point near the optimum, since the influence values assume that we are at a minimum (or close) in the loss landscape. The function
\[I(x_1, y_1, x_2, y_2) \colon \mathbb{R}^d \times \mathbb{R}^d \to \mathbb{R}\] measures the influence of the data point \(x_1\) on \(x_2\) , conditioned on the training targets \(y_1\) and \(y_2\) , through some model parameters \(\theta\) . If the loss function \(L\) is differentiable, we can take \(I\) to be
$$ I(x_1, y_1, x_2, y_2) = \nabla_\theta\; L(x_1, y_1) ^\mathsf{T} \; H_\theta^{-1} \; \nabla_\theta \; L(x_2, y_2) $$ See \"Understanding Black-box Predictions via Influence Functions\" for a detailed derivation of this formula.
Let's take a subset of the training data points, which we will calculate the influence values of.
x = train_data[0][:100]\ny = train_data[1][:100]\n
In pyDVL, the influence of the training points on the test points can be calculated with the following
train_x = torch.as_tensor(x)\ntrain_y = torch.as_tensor(y, dtype=torch.float64).unsqueeze(-1)\ntest_x = torch.as_tensor(test_data[0])\ntest_y = torch.as_tensor(test_data[1], dtype=torch.float64).unsqueeze(-1)\n\ntrain_data_loader = DataLoader(\n TensorDataset(train_x, train_y),\n batch_size=batch_size,\n)\n\ninfluence_model = DirectInfluence(\n model,\n F.binary_cross_entropy,\n regularization=0.0,\n)\ninfluence_model = influence_model.fit(train_data_loader)\n\ninfluence_values = influence_model.influences(\n test_x, test_y, train_x, train_y, mode=\"up\"\n)\n
The above explicitly constructs the Hessian. This can often be computationally expensive, and the approximate conjugate gradient calculation should be used for bigger models.
With the influence type 'up', training influences have shape [NxM], where N is the number of test samples and M is the number of training samples. They therefore associate to each training sample its influence on each test sample. Influence type 'perturbation', instead, returns an array of shape [NxMxF], where F is the number of input features, i.e. the length of x.
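A quick sanity check of these shapes on the values computed above (a small sketch of ours):
n_test, n_train = test_x.shape[0], train_x.shape[0]
assert influence_values.shape == (n_test, n_train)
# With mode="perturbation" the result would instead carry one extra trailing
# dimension of size num_features.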
In our case, in order to have a value of the total average influence of a point we can just average across training samples.
mean_train_influences = np.mean(influence_values.cpu().numpy(), axis=0)\n
Let's plot the results (adjust colorbar_limits for better color gradient)
plot_influences(\n x,\n mean_train_influences,\n line=decision_boundary,\n xlabel=\"$x_0$\",\n ylabel=\"$x_1$\",\n suptitle=\"Influences of input points\",\n legend_title=\"influence values\",\n # colorbar_limits=(-0.3,),\n);\n
We can see that, as we approach the separation line, the influences tend to move away from zero, i.e. the points become more decisive for model training, some in a positive way, some negative.
As a further test, let's introduce some labelling errors into \\(y\\) and see how the distribution of the influences changes. Let's flip the first 10 labels and calculate influences
y_corrupted = np.copy(y)\ny_corrupted[:10] = [1 - yi for yi in y[:10]]\ntrain_y_corrupted = torch.as_tensor(y_corrupted, dtype=torch.float64).unsqueeze(-1)\ntrain_corrupted_data_loader = DataLoader(\n TensorDataset(\n train_x,\n train_y_corrupted,\n ),\n batch_size=batch_size,\n)\n\ninfluence_model = DirectInfluence(\n model,\n F.binary_cross_entropy,\n regularization=0.0,\n)\ninfluence_model = influence_model.fit(train_corrupted_data_loader)\ninfluence_values = influence_model.influences(\n test_x, test_y, train_x, train_y_corrupted, mode=\"up\"\n)\n\nmean_train_influences = np.mean(influence_values.cpu().numpy(), axis=0)\n
\nAverage mislabelled data influence: -0.8618301488627411\nAverage correct data influence: 0.011604730452803018\n
\n
Red circles indicate the points which have been corrupted. We can see that the mislabelled data have a more negative average influence on the model, especially those that are farther away from the decision boundary.
The \"direct\" method that we have used above involves the inversion of the Hessian matrix of the model. If a model has \\(n\\) training points and \\(\\theta \\in \\mathbb{R}^p\\) parameters, this requires \\(O(n \\ p^2 + p^3)\\) operations, which for larger models, like neural networks, becomes quickly unfeasible. Conjugate gradient avoids the explicit computation of the Hessian via a technique called implicit Hessian-vector products (HVPs), which typically takes \\(O(n \\ p)\\) operations.
In the next cell we will use conjugate gradient to compute the influence factors. Since logistic regression is a very simple model, \"cg\" actually slows computation with respect to the direct method, which in this case is a much better choice. Nevertheless, we are able to verify that the influences calculated with \"cg\" are the same (to a minor error) as those calculated directly.
influence_model = CgInfluence(\n model,\n F.binary_cross_entropy,\n hessian_regularization=0.0,\n)\ninfluence_model = influence_model.fit(train_corrupted_data_loader)\ninfluence_values = influence_model.influences(\n test_x, test_y, train_x, train_y_corrupted\n)\nmean_train_influences = np.mean(influence_values.cpu().numpy(), axis=0)\n\nprint(\"Average mislabelled data influence:\", np.mean(mean_train_influences[:10]))\nprint(\"Average correct data influence:\", np.mean(mean_train_influences[10:]))\n
\nAverage mislabelled data influence: -0.30172696155741363\nAverage correct data influence: 0.0\n
\n
The averages are very similar to the ones calculated with the direct method. The same is true for the plot.
"},{"location":"examples/influence_synthetic/#influence-functions-for-data-mislabeling","title":"Influence functions for data mislabeling","text":" In this notebook, we will take a closer look at the theory of influence functions with the help of a synthetic dataset. Data mislabeling occurs whenever some examples from a usually big dataset are wrongly-labeled. In real-life this happens fairly often, e.g. as a consequence of human error, or noise in the data.
Let's consider a classification problem with the following notation:
\[ \begin{align*} x_i &\in \mathbb{R}^d \\ y_i &\in \{0, 1\} \\ \forall i &\in [ N ] \end{align*} \] In other words, we have a dataset containing \(N\) samples, each with label 1 or 0. As a typical example, you can think of \(y\) indicating whether a patient has a disease, based on some feature representation \(x\) .
Let's now introduce a toy model that will help us delve into the theory and practical utility of influence functions. We will assume that \(y\) is a Bernoulli binary random variable, while the input \(x\) follows a d-dimensional Gaussian distribution that depends on the label \(y\) . More precisely:
\\[ y_i \\sim \\text{Ber}\\left (0.5 \\right) \\\\ x_i \\sim \\mathcal{N}\\left ((1 - y_i) \\mu_1 + y_i \\mu_2, \\sigma^2 I \\right), \\] with fixed means and diagonal covariance. Implementing the sampling scheme in python is straightforward and can be achieved by first sampling \\(y\\) and afterward \\(x\\) .
"},{"location":"examples/influence_synthetic/#imports","title":"Imports","text":""},{"location":"examples/influence_synthetic/#dataset","title":"Dataset","text":""},{"location":"examples/influence_synthetic/#plotting-the-dataset","title":"Plotting the dataset","text":" Let's plot the dataset is plotted with their respective labels and the optimal decision line
"},{"location":"examples/influence_synthetic/#training-the-model","title":"Training the model","text":" We will now train a logistic regression model on the training data. This can be done with the following
"},{"location":"examples/influence_synthetic/#calculating-influences","title":"Calculating influences","text":""},{"location":"examples/influence_synthetic/#inversion-through-conjugate-gradient","title":"Inversion through conjugate gradient","text":""},{"location":"examples/influence_synthetic/#appendix-calculating-the-decision-boundary","title":"Appendix: Calculating the decision boundary","text":" For obtaining the optimal discriminator one has to solve the equation
\\[p(x|y=0)=p(x|y=1)\\] and determine the solution set \\(X\\) . Let's take the following probabilities
\\[ \\begin{align*} p(x|y=0)&=\\mathcal{N}\\left (\\mu_1, \\sigma^2 I \\right) \\\\ p(x|y=1)&=\\mathcal{N}\\left (\\mu_2, \\sigma^2 I \\right) \\end{align*} \\] For a single fixed diagonal variance parameterized by \\(\\sigma\\) , the optimal discriminator lays at points which are equidistant from the means of the two distributions, i.e.
\\[ \\begin{align*} \\| x - \\mu_1 \\|^2 &= \\| x - \\mu_2 \\|^2 \\\\ \\| \\mu_1 \\|^2 -2 x^\\mathsf{T} \\mu_1 &= \\| \\mu_2 \\|^2 -2 x^\\mathsf{T} \\mu_2 \\\\ \\implies 0 &= 2 (\\mu_2 - \\mu_1)^\\mathsf{T} x + \\| \\mu_1 \\|^2 - \\| \\mu_2 \\|^2 \\\\ 0 &= \\mu_1^\\mathsf{T}x - \\mu_2^\\mathsf{T}x - \\frac{1}{2} \\mu_1^\\mathsf{T} \\mu_1 + \\frac{1}{2} \\mu_2^\\mathsf{T} \\mu_2 \\end{align*} \\] This is just the implicit description of the line. Solving for the explicit form can be achieved by enforcing a functional form \\(f(z) = x = a z + b\\) with \\(z \\in \\mathbb{R}\\) onto \\(x\\) . After the term is inserted in the previous equation
\\[ 0 = (\\mu_2 - \\mu_1)^\\mathsf{T} (az + b) + \\frac{1}{2} \\left( \\| \\mu_1 \\|^2 - \\| \\mu_2 \\|^2 \\right) \\] By symmetry, the direction \\(a\\) of the boundary is orthogonal to \\(\\mu_2 - \\mu_1\\) , so we can write it explicitly as a rotation of \\(\\mu_2 - \\mu_1\\) by 90 degrees. Then, solving for \\(b\\) , the solution is found to be
\\[ f(z) = \\underbrace{\\begin{bmatrix} 0 & 1 \\\\ -1 & 0 \\end{bmatrix} (\\mu_2 - \\mu_1)}_a z + \\underbrace{\\frac{\\mu_1 + \\mu_2}{2}}_b \\]"},{"location":"examples/influence_wine/","title":"For outlier detection","text":"If you are reading this in the documentation, some boilerplate has been omitted for convenience. Let's start by loading the imports, the dataset and splitting it into train, validation and test sets. We will use a large test set to have a less noisy estimate of the average influence.
%autoreload\n%matplotlib inline\n\nimport os\nimport random\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nfrom support.common import plot_losses\nfrom support.torch import TorchMLP, fit_torch_model\nfrom pydvl.influence.torch import (\n DirectInfluence,\n CgInfluence,\n ArnoldiInfluence,\n EkfacInfluence,\n NystroemSketchInfluence,\n LissaInfluence,\n)\nfrom pydvl.influence import InfluenceMode\nfrom support.shapley import load_wine_dataset\nfrom sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score\nfrom torch.optim import Adam, lr_scheduler\nfrom torch.utils.data import DataLoader, TensorDataset\nfrom scipy.stats import pearsonr, spearmanr\n
training_data, val_data, test_data, feature_names = load_wine_dataset(\n train_size=0.6, test_size=0.3\n)\n
We will corrupt some of the training points by flipping their labels
num_corrupted_idxs = 10\ntraining_data[1][:num_corrupted_idxs] = torch.tensor(\n [(val + 1) % 3 for val in training_data[1][:num_corrupted_idxs]]\n)\n
and wrap the training, validation and test sets in PyTorch data loaders
training_data_loader = DataLoader(\n TensorDataset(*training_data), batch_size=32, shuffle=False\n)\nval_data_loader = DataLoader(TensorDataset(*val_data), batch_size=32, shuffle=False)\ntest_data_loader = DataLoader(TensorDataset(*test_data), batch_size=32, shuffle=False)\n
feature_dimension = 13\nnum_classes = 3\nnetwork_size = [16, 16]\nlayers_size = [feature_dimension, *network_size, num_classes]\nnum_epochs = 300\nlr = 0.005\nweight_decay = 0.01\n\nnn_model = TorchMLP(layers_size)\nnn_model.to(device)\n\noptimizer = Adam(params=nn_model.parameters(), lr=lr, weight_decay=weight_decay)\nscheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)\n\nlosses = fit_torch_model(\n model=nn_model,\n training_data=training_data_loader,\n val_data=val_data_loader,\n loss=F.cross_entropy,\n optimizer=optimizer,\n scheduler=scheduler,\n num_epochs=num_epochs,\n device=device,\n)\n
Let's check that the training has found a stable minimum by plotting the training and validation loss
plot_losses(losses)\n
Since it is a classification problem, let's also take a look at the confusion matrix on the test set
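The prediction cell is omitted on this page; a minimal sketch follows, assuming nn_model and test_data from above live on the same device. The name pred_y_test matches the one used in the f1-score cell below.

# Predict test labels and show the confusion matrix.
with torch.no_grad():
    logits = nn_model(test_data[0])
pred_y_test = torch.argmax(logits, dim=1).cpu().numpy()
cm = confusion_matrix(test_data[1], pred_y_test)
ConfusionMatrixDisplay(cm).plot()
plt.show()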
And let's compute the f1 score of the model
f1_score(test_data[1], pred_y_test, average=\"weighted\")\n
\n0.943730275125624
\n
Let's now move to calculating influences of each point on the total score.
influence_model = DirectInfluence(\n nn_model,\n F.cross_entropy,\n regularization=0.1,\n)\ninfluence_model = influence_model.fit(training_data_loader)\ntrain_influences = influence_model.influences(*test_data, *training_data, mode=\"up\")\n
The returned matrix, train_influences, has as many columns as there are points in the training set and as many rows as there are points in the test set. Each element \\(a_{i,j}\\) stores the influence that training point \\(j\\) has on the classification of test point \\(i\\) .
If we take the average across every column of the influences matrix, we obtain an estimate of the overall influence of a training point on the total accuracy of the network.
mean_train_influences = np.mean(train_influences.cpu().numpy(), axis=0)\nmean_train_influences.shape\n
\n(106,)
\n
The following histogram shows that there are big differences in score within the training set (notice the log-scale on the y axis).
We can see that the corrupted points tend to have a negative effect on the model, as expected
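A minimal sketch producing the averages printed below, assuming mean_train_influences and num_corrupted_idxs from the cells above.

# Compare the average influence of corrupted vs. untouched training points.
avg_corrupted = np.mean(mean_train_influences[:num_corrupted_idxs])
avg_other = np.mean(mean_train_influences[num_corrupted_idxs:])
print(f'Average influence of corrupted points: {avg_corrupted}')
print(f'Average influence of other points: {avg_other}')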
\nAverage influence of corrupted points: -1.0840776\nAverage influence of other points: 0.11192768\n
\n
We have seen how to calculate the influence of single training points on each test point using mode 'up'. Using mode 'perturbation' we can also calculate the influence of the input features of each point. In the next cell we will calculate the average influence of each feature on training and test points, and ultimately assess which are the most relevant to model performance.
influence_model.regularization = 1.0\nfeature_influences = influence_model.influences(\n *test_data, *training_data, mode=\"perturbation\"\n)\n
The explicit calculation of the Hessian matrix is numerically challenging and, due to its high memory requirements, infeasible for larger models. pyDVL offers several approximation methods for the action of the inverse Hessian matrix to overcome this bottleneck:
- Iteration-based:
- Conjugate Gradients (Cg)
- Linear time Stochastic Second-Order Approximation ( LiSSA )
- Low-rank Approximations:
- Arnoldi
- Nystr\u00f6m Sketch-and-Solve (Nystr\u00f6m)
- Factorization-based:
- Eigenvalue-corrected Kronecker Factorization ( EKFAC )
In the following, we show the usage of these approximation methods and investigate their performance.
Since the Hessian is symmetric and positive definite (at least after applying a sufficient regularization), we can utilize the Conjugate Gradients Algorithm to approximately solve the equations
\\[ (H + \\lambda \\operatorname{I}) x = b\\] Most importantly, the algorithm does not require the computation of the full Hessian matrix, only the implementation of Hessian-vector products. pyDVL implements a stable block variant of the preconditioned conjugate gradient algorithm.
from pydvl.influence.torch.pre_conditioner import NystroemPreConditioner\n\nnn_model.to(\"cpu\")\ncg_influence_model = CgInfluence(\n nn_model,\n F.cross_entropy,\n hessian_regularization=0.1,\n progress=True,\n use_block_cg=True,\n pre_conditioner=NystroemPreConditioner(rank=5),\n)\ncg_influence_model = cg_influence_model.fit(training_data_loader)\ncg_train_influences = cg_influence_model.influences(\n *test_data, *training_data, mode=\"up\"\n)\nmean_cg_train_influences = np.mean(cg_train_influences.numpy(), axis=0)\n
Let's compare the results obtained through conjugate gradient with those from the direct method
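The comparison cell is omitted on this page; a minimal sketch of such a comparison follows, assuming mean_train_influences (direct) and mean_cg_train_influences from above. The percentage-error metric (relative norm of the difference) is an assumption of this sketch, not necessarily the one used in the notebook.

# Relative error and rank correlations between CG and direct influences.
percentage_error = (
    np.linalg.norm(mean_cg_train_influences - mean_train_influences)
    / np.linalg.norm(mean_train_influences)
    * 100
)
print(f'Percentage error of Cg over direct method:{percentage_error} %')
print('Pearson Correlation Cg vs direct', pearsonr(mean_cg_train_influences, mean_train_influences)[0])
print('Spearman Correlation Cg vs direct', spearmanr(mean_cg_train_influences, mean_train_influences)[0])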
\nPercentage error of Cg over direct method:38.18922936916351 %\n
\n
\nPearson Correlation Cg vs direct 0.9978821390094573\nSpearman Correlation Cg vs direct 0.9946595460614153\n
\n
The LiSSA method is a stochastic approximation of the inverse Hessian vector product. Compared to conjugate gradient it is faster but less accurate and typically suffers from instability.
In order to find the solution of the HVP, LiSSA iteratively approximates the inverse of the Hessian matrix with the following update:
\\[H^{-1}_{j+1} b = b + (I - d)\\, H^{-1}_j b - \\frac{H\\, H^{-1}_j b}{s},\\] where \\(d\\) is a dampening and \\(s\\) a scaling factor.
lissa_influence_model = LissaInfluence(\n nn_model,\n F.cross_entropy,\n regularization=0.1,\n progress=True,\n)\nlissa_influence_model = lissa_influence_model.fit(training_data_loader)\nlissa_train_influences = lissa_influence_model.influences(\n *test_data, *training_data, mode=\"up\"\n)\nmean_lissa_train_influences = np.mean(lissa_train_influences.numpy(), axis=0)\n
\nPercentage error of Lissa over direct method:119.32581663131714 %\n
\n
\nPearson Correlation Lissa vs direct 0.9875324674899437\nSpearman Correlation Lissa vs direct 0.9758067360253924\n
\n
The Arnoldi method leverages a low rank approximation of the Hessian matrix to reduce the memory requirements. It is generally much faster than the conjugate gradient method and can achieve similar accuracy.
arnoldi_influence_model = ArnoldiInfluence(\n nn_model,\n F.cross_entropy,\n rank_estimate=30,\n hessian_regularization=0.1,\n)\narnoldi_influence_model = arnoldi_influence_model.fit(training_data_loader)\narnoldi_train_influences = arnoldi_influence_model.influences(\n *test_data, *training_data, mode=\"up\"\n)\nmean_arnoldi_train_influences = np.mean(arnoldi_train_influences.numpy(), axis=0)\n
\nPercentage error of Arnoldi over direct method:40.1591956615448 %\n
\n
\nPearson Correlation Arnoldi vs direct 0.9913769850406638\nSpearman Correlation Arnoldi vs direct 0.9818122276242538\n
\n
Similar to the Arnoldi method, the Nystr\u00f6m method uses a low-rank approximation, which is computed from random projections of the Hessian matrix. In general the approximation is expected to be worse than the Arnoldi approximation, but it is cheaper to compute.
nystroem_influence_model = NystroemSketchInfluence(\n nn_model,\n F.cross_entropy,\n rank=30,\n regularization=0.1,\n)\nnystroem_influence_model = nystroem_influence_model.fit(training_data_loader)\nnystroem_train_influences = nystroem_influence_model.influences(\n *test_data, *training_data, mode=\"up\"\n)\nmean_nystroem_train_influences = np.mean(nystroem_train_influences.numpy(), axis=0)\n
\nPercentage error of Nystr\u00f6m over direct method:106.66680335998535 %\n
\n
\nPearson Correlation Nystr\u00f6m vs direct 0.9951186619181842\nSpearman Correlation Nystr\u00f6m vs direct 0.9858830642114014\n
\n
The EKFAC method is a more recent technique that leverages the Kronecker product structure of the Hessian matrix to reduce the memory requirements. It is generally much faster than iterative methods like conjugate gradient and Arnoldi and it allows for an easier handling of memory. Therefore, it is the only technique that can scale to very large models (e.g. billions of parameters). Its accuracy is however much worse. Let's see how it performs on our example.
ekfac_influence_model = EkfacInfluence(\n nn_model,\n update_diagonal=True,\n hessian_regularization=0.1,\n)\nekfac_influence_model = ekfac_influence_model.fit(training_data_loader)\nekfac_train_influences = ekfac_influence_model.influences(\n *test_data, *training_data, mode=\"up\"\n)\nmean_ekfac_train_influences = np.mean(ekfac_train_influences.numpy(), axis=0)\n
\nPercentage error of EK-FAC over direct method:1995.9354400634766 %\n
\n
The accuracy is not good, and it is not recommended to use this method for small models. Nevertheless, a look at the actual influence values reveals that the EK-FAC estimates are not completely off.
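The comparison plot discussed below is omitted on this page; here is a minimal sketch of such a plot, assuming mean_train_influences, mean_ekfac_train_influences and num_corrupted_idxs from the cells above.

# Scatter EK-FAC vs. direct influences and circle the corrupted points in red.
fig, ax = plt.subplots()
ax.scatter(mean_train_influences, mean_ekfac_train_influences, label='training points')
ax.scatter(
    mean_train_influences[:num_corrupted_idxs],
    mean_ekfac_train_influences[:num_corrupted_idxs],
    facecolors='none',
    edgecolors='r',
    label='corrupted points',
)
ax.set_xlabel('direct influence')
ax.set_ylabel('EK-FAC influence')
ax.legend()
plt.show()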
The above plot shows a good correlation between the EK-FAC and the direct method. Corrupted points have been circled in red, and in both the direct and approximate cases they are correctly identified as having negative influence on the model's accuracy. This is confirmed by explicit calculation of the Pearson and Spearman correlation coefficients.
\nPearson Correlation EK-FAC vs direct 0.9595030844711058\nSpearman Correlation EK-FAC vs direct 0.8974028264100562\n
\n
The correlation between the EK-FAC and the direct method is quite good, and it improves significantly if we keep only the top 20 highest absolute influences.
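A minimal sketch of the top-20 comparison printed below, assuming the mean influence arrays from above; selecting the 20 largest absolute direct influences is an assumption of this sketch.

# Restrict the comparison to the 20 largest absolute direct influences.
top_idx = np.argsort(np.abs(mean_train_influences))[-20:]
print(
    'Pearson Correlation EK-FAC vs direct - top-20 influences',
    pearsonr(mean_ekfac_train_influences[top_idx], mean_train_influences[top_idx])[0],
)
print(
    'Spearman Correlation EK-FAC vs direct - top-20 influences',
    spearmanr(mean_ekfac_train_influences[top_idx], mean_train_influences[top_idx])[0],
)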
\nPearson Correlation EK-FAC vs direct - top-20 influences 0.9891339337484283\nSpearman Correlation EK-FAC vs direct - top-20 influences 0.9593984962406013\n
\n
When we calculate influence scores, typically we are more interested in assessing which training points have the highest or lowest impact on the model rather than having a precise estimate of the influence value. EK-FAC then provides a fast and memory-efficient way to calculate a coarse influence ranking of the training points which scales very well even to the largest neural networks.
This was a quick introduction to the pyDVL interface for influence functions. Despite their speed and simplicity, influence functions are known to be a very noisy estimator of data quality, as pointed out in the paper \"Influence functions in deep learning are fragile\" . The size of the network, the weight decay, the inversion method used for calculating influences, the size of the test set: they all add to the total amount of noise. Experiments may therefore give quantitatively and qualitatively different results if not averaged across several realisations. Shapley values, on the contrary, have been shown to be more robust, but this comes at the cost of high computational requirements. PyDVL employs several parallelization and caching techniques to optimize such calculations.
"},{"location":"examples/influence_wine/#influence-functions-for-outlier-detection","title":"Influence functions for outlier detection","text":" This notebook shows how to calculate influences on a NN model using pyDVL for an arbitrary dataset, and how this can be used to find anomalous or corrupted data points.
It uses the wine dataset from sklearn: given a set of 13 different input parameters regarding a particular bottle, each related to some physical property (e.g. concentration of magnesium, malic acidity, alcoholic percentage, etc.), the model needs to predict to which of 3 classes the wine belongs. For more details, please refer to the sklearn documentation .
"},{"location":"examples/influence_wine/#imports","title":"Imports","text":""},{"location":"examples/influence_wine/#dataset","title":"Dataset","text":""},{"location":"examples/influence_wine/#fit-a-neural-network-to-the-data","title":"Fit a neural network to the data","text":" We will train a 2-layer neural network. PyDVL has some convenience wrappers to initialize a pytorch NN. If you already have a model loaded and trained, you can skip this section.
"},{"location":"examples/influence_wine/#calculating-influences-for-small-neural-networks","title":"Calculating influences for small neural networks","text":" The following cell calculates the influences of each training data point on the neural network. Neural networks have typically a very bumpy parameter space, which, during training, is explored until the configuration that minimises the loss is found. There is an important assumption in influence functions that the model lays at a (at least local) minimum of such loss, and if this is not fulfilled many issues can arise. In order to avoid this scenario, a regularisation term should be used whenever dealing with big and noisy models.
"},{"location":"examples/influence_wine/#influence-of-training-features","title":"Influence of training features","text":""},{"location":"examples/influence_wine/#speeding-up-influences-for-big-models","title":"Speeding up influences for big models","text":""},{"location":"examples/influence_wine/#cg","title":"Cg","text":""},{"location":"examples/influence_wine/#lissa","title":"Lissa","text":""},{"location":"examples/influence_wine/#arnoldi","title":"Arnoldi","text":""},{"location":"examples/influence_wine/#nystrom","title":"Nystr\u00f6m","text":""},{"location":"examples/influence_wine/#ekfac","title":"EKFAC","text":""},{"location":"examples/influence_wine/#conclusions","title":"Conclusions","text":""},{"location":"examples/least_core_basic/","title":"Least Core","text":" We will be using the following functions and classes from pyDVL.
%autoreload\nfrom pydvl.utils import (\n Dataset,\n Utility,\n)\nfrom pydvl.value import compute_least_core_values, LeastCoreMode, ValuationResult\nfrom pydvl.reporting.plots import shaded_mean_std\nfrom pydvl.reporting.scores import compute_removal_score\n
X, y = make_classification(\n n_samples=dataset_size,\n n_features=50,\n n_informative=25,\n n_classes=3,\n random_state=random_state,\n)\n
full_dataset = Dataset.from_arrays(\n X, y, stratify_by_target=True, random_state=random_state\n)\nsmall_dataset = Dataset.from_arrays(\n X,\n y,\n stratify_by_target=True,\n train_size=train_size,\n random_state=random_state,\n)\n
model = LogisticRegression(max_iter=500, solver=\"liblinear\")\n
model.fit(full_dataset.x_train, full_dataset.y_train)\nprint(\n f\"Training accuracy: {100 * model.score(full_dataset.x_train, full_dataset.y_train):0.2f}%\"\n)\nprint(\n f\"Testing accuracy: {100 * model.score(full_dataset.x_test, full_dataset.y_test):0.2f}%\"\n)\n
\nTraining accuracy: 86.25%\nTesting accuracy: 70.00%\n
\n
model.fit(small_dataset.x_train, small_dataset.y_train)\nprint(\n f\"Training accuracy: {100 * model.score(small_dataset.x_train, small_dataset.y_train):0.2f}%\"\n)\nprint(\n f\"Testing accuracy: {100 * model.score(small_dataset.x_test, small_dataset.y_test):0.2f}%\"\n)\n
\nTraining accuracy: 100.00%\nTesting accuracy: 47.89%\n
\n
utility = Utility(model=model, data=small_dataset)\n
exact_values = compute_least_core_values(\n u=utility,\n mode=LeastCoreMode.Exact,\n progress=True,\n)\n
exact_values_df = exact_values.to_dataframe(column=\"exact_value\").T\nexact_values_df = exact_values_df[sorted(exact_values_df.columns)]\n
budget_array = np.linspace(200, 2 ** len(small_dataset), num=10, dtype=int)\n\nall_estimated_values_df = []\nall_errors = {budget: [] for budget in budget_array}\n\nfor budget in tqdm(budget_array):\n dfs = []\n errors = []\n column_name = f\"estimated_value_{budget}\"\n for i in range(20):\n values = compute_least_core_values(\n u=utility,\n mode=LeastCoreMode.MonteCarlo,\n n_iterations=budget,\n n_jobs=n_jobs,\n )\n df = (\n values.to_dataframe(column=column_name)\n .drop(columns=[f\"{column_name}_stderr\", f\"{column_name}_updates\"])\n .T\n )\n df = df[sorted(df.columns)]\n error = mean_squared_error(\n exact_values_df.loc[\"exact_value\"].values, np.nan_to_num(df.values.ravel())\n )\n all_errors[budget].append(error)\n df[\"budget\"] = budget\n dfs.append(df)\n estimated_values_df = pd.concat(dfs)\n all_estimated_values_df.append(estimated_values_df)\n\nvalues_df = pd.concat(all_estimated_values_df)\nerrors_df = pd.DataFrame(all_errors)\n
We can see that the approximation error decreases, on average, as we increase the budget.
Still, the decrease is not guaranteed as the number of iterations grows, because the Monte Carlo method samples the subsets with replacement, i.e. there may be repeated subsets.
utility = Utility(model=model, data=full_dataset)\n
method_names = [\"Random\", \"Least Core\"]\nremoval_percentages = np.arange(0, 0.41, 0.05)\n
all_scores = []\n\nfor i in trange(5):\n for method_name in method_names:\n if method_name == \"Random\":\n values = ValuationResult.from_random(size=len(utility.data))\n else:\n values = compute_least_core_values(\n u=utility,\n mode=LeastCoreMode.MonteCarlo,\n n_iterations=n_iterations,\n n_jobs=n_jobs,\n )\n scores = compute_removal_score(\n u=utility,\n values=values,\n percentages=removal_percentages,\n remove_best=True,\n )\n scores[\"method_name\"] = method_name\n all_scores.append(scores)\n\nscores_df = pd.DataFrame(all_scores)\n
We can clearly see that removing the most valuable data points, as given by the Least Core method, leads to, on average, a decrease in the model's performance and that the method outperforms random removal of data points.
all_scores = []\n\nfor i in trange(5):\n for method_name in method_names:\n if method_name == \"Random\":\n values = ValuationResult.from_random(size=len(utility.data))\n else:\n values = compute_least_core_values(\n u=utility,\n mode=LeastCoreMode.MonteCarlo,\n n_iterations=n_iterations,\n n_jobs=n_jobs,\n )\n scores = compute_removal_score(\n u=utility,\n values=values,\n percentages=removal_percentages,\n )\n scores[\"method_name\"] = method_name\n all_scores.append(scores)\n\nscores_df = pd.DataFrame(all_scores)\n
We can clearly see that removing the least valuable data points, as given by the Least Core method, leads to, on average, an increase in the model's performance and that the method outperforms the random removal of data points.
"},{"location":"examples/least_core_basic/#least-core-for-data-valuation","title":"Least Core for Data Valuation","text":" This notebook introduces Least Core methods for the computation of data values using pyDVL.
Shapley values define a fair way of distributing the worth of the whole training set when every data point is part of it. But they do not consider the question of stability of subsets: could some data points obtain a higher payoff if they formed smaller subsets? It is argued that this might be relevant if data providers are paid based on data value, since Shapley values can incentivise them not to contribute their data to the \"grand coalition\", but instead to try to form smaller ones. Whether this is of actual practical relevance is debatable, but in any case, the least core is an alternative tool available for any task of data valuation.
The Core is another approach to compute data values originating in cooperative game theory that attempts to answer those questions. It is the set of feasible payoffs that cannot be improved upon by a coalition of the participants.
Its use for Data Valuation was first described in the paper If You Like Shapley Then You\u2019ll Love the Core by Tom Yan and Ariel D. Procaccia.
The Least Core value \\(v\\) of the \\(i\\) -th sample in dataset \\(D\\) wrt. utility \\(u\\) is computed by solving the following Linear Program:
\\[ \\begin{array}{lll} \\text{minimize} & \\displaystyle{e} & \\\\ \\text{subject to} & \\displaystyle\\sum_{x_i\\in D} v_u(x_i) = u(D) & \\\\ & \\displaystyle\\sum_{x_i\\in S} v_u(x_i) + e \\geq u(S) &, \\forall S \\subset D, S \\neq \\emptyset \\\\ \\end{array} \\] To illustrate this method we will use a synthetic dataset. We will first use a subset of 10 data point to compute the exact values and use them to assess the Monte Carlo approximation. Afterwards, we will conduct the data removal experiments as described by Ghorbani and Zou in their paper Data Shapley: Equitable Valuation of Data for Machine Learning : We compute the data valuation given different computation budgets and incrementally remove a percentage of the best, respectively worst, data points and observe how that affects the utility.
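To make the linear program above concrete, here is a toy sketch using scipy's linprog for a made-up 3-player game. It only illustrates the constraints; it is not pyDVL's implementation, and the utility values are invented for the example.

from itertools import combinations

import numpy as np
from scipy.optimize import linprog

# Made-up utilities for a 3-player game with indices 0, 1, 2.
u = {(0,): 0.1, (1,): 0.2, (2,): 0.1,
     (0, 1): 0.5, (0, 2): 0.4, (1, 2): 0.5, (0, 1, 2): 1.0}
n = 3

# Variables are [v_0, ..., v_{n-1}, e]; we minimize e.
c = np.zeros(n + 1)
c[-1] = 1.0

# Efficiency constraint: sum_i v_i = u(D).
A_eq = np.array([[1.0] * n + [0.0]])
b_eq = np.array([u[(0, 1, 2)]])

# For every non-empty proper subset S: sum_{i in S} v_i + e >= u(S),
# rewritten as -(sum_{i in S} v_i) - e <= -u(S) for linprog.
A_ub, b_ub = [], []
for size in range(1, n):
    for S in combinations(range(n), size):
        row = np.zeros(n + 1)
        row[list(S)] = -1.0
        row[-1] = -1.0
        A_ub.append(row)
        b_ub.append(-u[S])

result = linprog(c, A_ub=np.array(A_ub), b_ub=np.array(b_ub),
                 A_eq=A_eq, b_eq=b_eq, bounds=(None, None))
print('least core values:', result.x[:-1], 'subsidy e:', result.x[-1])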
"},{"location":"examples/least_core_basic/#setup","title":"Setup","text":" We begin by importing the main libraries and setting some defaults.
If you are reading this in the documentation, some boilerplate (including most plotting code) has been omitted for convenience."},{"location":"examples/least_core_basic/#dataset","title":"Dataset","text":" We generate a synthetic dataset using the make_classification
function from scikit-learn.
We sample 200 data points from a 50-dimensional Gaussian distribution with 25 informative features and 25 non-informative features (generated as random linear combinations of the informative features).
The 200 samples are uniformly distributed across 3 classes with a small percentage of noise added to the labels to make the task a bit more difficult.
"},{"location":"examples/least_core_basic/#estimating-least-core-values","title":"Estimating Least Core Values","text":" In this first section we will use a smaller subset of the dataset containing 10 samples in order to be able to compute exact values in a reasonable amount of time. Afterwards, we will use the Monte Carlo method with a limited budget (maximum number of subsets) to approximate these values.
"},{"location":"examples/least_core_basic/#data-removal","title":"Data Removal","text":" We now move on to the data removal experiments using the full dataset.
In these experiments, we first rank the data points from most valuable to least valuable using the values estimated by the Monte Carlo Least Core method. Then, we gradually remove from 5 to 40 percent, by increments of 5 percentage points, of the most valuable/least valuable ones, train the model on this subset and compute its accuracy.
"},{"location":"examples/least_core_basic/#remove-best","title":"Remove Best","text":" We start by removing the best data points and seeing how the model's accuracy evolves.
"},{"location":"examples/least_core_basic/#remove-worst","title":"Remove Worst","text":" We then proceed to removing the worst data points and seeing how the model's accuracy evolves.
"},{"location":"examples/msr_banzhaf_digits/","title":"Banzhaf Semivalues","text":" We will be using the following functions from pyDVL. The main entry point is the function compute_banzhaf_semivalues()
. In order to use it we need the classes Dataset , Utility and Scorer .
%autoreload\nfrom pydvl.reporting.plots import plot_shapley\nfrom support.banzhaf import load_digits_dataset\nfrom pydvl.value import *\n
training_data, _, test_data = load_digits_dataset(\n test_size=0.3, random_state=random_state\n)\n
Training and test data are then used to instantiate a Dataset object:
dataset = Dataset(*training_data, *test_data)\n
import torch\nfrom support.banzhaf import TorchCNNModel\n\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\nmodel = TorchCNNModel(lr=0.001, epochs=40, batch_size=32, device=device)\nmodel.fit(x=training_data[0], y=training_data[1])\n
\nTrain Accuracy: 0.705\nTest Accuracy: 0.630\n
\n
The final component is the scoring function. It can be anything like accuracy or \\(R^2\\) , and is set with a string from the standard sklearn scoring methods . Please refer to that documentation on information on how to define your own scoring function.
We group dataset, model and scoring function into an instance of Utility and compute the Banzhaf semi-values. We take all defaults, and choose to stop computation using the MaxChecks stopping criterion, which terminates after a fixed number of calls to it. With the default batch_size
of 1 this means that we will retrain the model once per check.
Note how we enable caching using memcached (assuming memcached runs with the default configuration for localhost). This is necessary in the current preliminary implementation of permutation sampling , which is the default for compute_banzhaf_semivalues .
from pydvl.utils import MemcachedCacheBackend, MemcachedClientConfig\n\n# Compute regular Banzhaf semivalue\nutility = Utility(\n model=model,\n data=dataset,\n scorer=Scorer(\"accuracy\", default=0.0, range=(0, 1)),\n cache_backend=MemcachedCacheBackend(MemcachedClientConfig()),\n)\nvalues = compute_banzhaf_semivalues(\n utility, done=MaxChecks(max_checks), n_jobs=n_jobs, progress=True\n)\nvalues.sort(key=\"value\")\ndf = values.to_dataframe(column=\"banzhaf_value\", use_names=True)\n
The returned dataframe contains the mean and variance of the Monte Carlo estimates for the values:
     banzhaf_value  banzhaf_value_stderr  banzhaf_value_updates
156      -1.097920          6.662418e-02                      5
21       -0.925489          1.230752e-01                      5
152      -0.913313          3.358054e-02                      5
73       -0.778884          3.668419e-05                      5
85       -0.644435          3.454322e-08                      5

Let us plot the results. In the next cell we will take the 30 images with the lowest score and plot their values with 95% Normal confidence intervals. Keep in mind that Permutation Monte Carlo Banzhaf is typically very noisy, and it can take many steps to arrive at a clean estimate.
\nAverage value of first 10 data points: 0.650003277874342\nExact values:\n39 0.432836\n45 0.455392\n158 0.533221\n144 0.571260\n36 0.633091\n161 0.697940\n77 0.698507\n28 0.752367\n35 0.838752\n175 0.886668\nName: banzhaf_value, dtype: float64\n
\n
For the first 5 images, we will falsify their label, for images 6-10, we will add some noise.
x_train_anomalous = training_data[0].copy()\ny_train_anomalous = training_data[1].copy()\nanomalous_indices = high_dvl.index.map(int).values[:10]\n\n# Set label of first 5 images to 0\ny_train_anomalous[high_dvl.index.map(int).values[:5]] = 0\n\n# Add noise to images 6-10\nindices = high_dvl.index.values[5:10].astype(int)\ncurrent_images = x_train_anomalous[indices]\nnoisy_images = current_images + 0.5 * np.random.randn(*current_images.shape)\nnoisy_images[noisy_images < 0] = 0.0\nnoisy_images[noisy_images > 1] = 1.0\nx_train_anomalous[indices] = noisy_images\n
anomalous_dataset = Dataset(\n x_train=x_train_anomalous,\n y_train=y_train_anomalous,\n x_test=test_data[0],\n y_test=test_data[1],\n)\n\nanomalous_utility = Utility(\n model=TorchCNNModel(),\n data=anomalous_dataset,\n scorer=Scorer(\"accuracy\", default=0.0, range=(0, 1)),\n cache_backend=MemcachedCacheBackend(MemcachedClientConfig()),\n)\nanomalous_values = compute_banzhaf_semivalues(\n anomalous_utility, done=MaxChecks(max_checks), n_jobs=n_jobs, progress=True\n)\nanomalous_values.sort(key=\"value\")\nanomalous_df = anomalous_values.to_dataframe(column=\"banzhaf_value\", use_names=True)\n
Let us now take a look at the low-value images and check how many of our anomalous images are part of it.
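A minimal sketch of such an inspection, assuming anomalous_df (sorted in ascending order of value), x_train_anomalous and anomalous_indices from the cells above, and matplotlib.pyplot imported as plt in the omitted boilerplate; showing the 15 lowest-valued images is an arbitrary choice for the example.

# Plot the lowest-valued images; anomalous ones get a red title.
lowest = anomalous_df.index.map(int).values[:15]
fig, axes = plt.subplots(3, 5, figsize=(8, 5))
for ax, idx in zip(axes.ravel(), lowest):
    ax.imshow(x_train_anomalous[idx].reshape(8, 8), cmap='gray_r')
    ax.set_title(str(idx), color='red' if idx in anomalous_indices else 'black')
    ax.axis('off')
plt.show()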
As can be seen in this figure, the valuation of the data points has decreased significantly by adding noise or falsifying their labels. This shows the potential of using Banzhaf values or other data valuation methods to detect mislabeled data points or noisy input data.
\nAverage value of original data points: 0.650003277874342\nAverage value of modified, anomalous data points: -0.02501543656281746\nFor reference, these are the average data values of all data points used for training (anomalous):\nbanzhaf_value 0.006044\nbanzhaf_value_stderr 0.103098\nbanzhaf_value_updates 5.000000\ndtype: float64\nThese are the average data values of all points (original data):\nbanzhaf_value 0.005047\nbanzhaf_value_stderr 0.115262\nbanzhaf_value_updates 5.000000\ndtype: float64\n
\n
utility = Utility(\n model=TorchCNNModel(),\n data=dataset,\n scorer=Scorer(\"accuracy\", default=0.0, range=(0, 1)),\n cache_backend=MemcachedCacheBackend(MemcachedClientConfig()),\n)\n
Computing the values is the same, but we now use a better stopping criterion. Instead of fixing the number of utility evaluations with MaxChecks , we use RankCorrelation to stop when the change in Spearman correlation between the ranking of two successive iterations is below a threshold.
values = compute_msr_banzhaf_semivalues(\n utility,\n done=RankCorrelation(rtol=0.0001, burn_in=10),\n n_jobs=n_jobs,\n progress=True,\n)\nvalues.sort(key=\"value\")\nmsr_df = values.to_dataframe(column=\"banzhaf_value\", use_names=True)\n
Inspection of the values reveals (generally) much lower variances. Notice the number of updates to each value as well.
     banzhaf_value  banzhaf_value_stderr  banzhaf_value_updates
137      -0.264918              0.093597                     11
20       -0.217394              0.127022                     11
19       -0.210309              0.087179                     11
41       -0.210119              0.071534                     11
192      -0.191667              0.130774                     11

from sklearn.linear_model import SGDClassifier\n\nif is_CI:\n    utility = Utility(\n        model=SGDClassifier(max_iter=2),\n        data=dataset,\n        scorer=Scorer(\"accuracy\", default=0.0, range=(0, 1)),\n    )\nelse:\n    utility = Utility(\n        model=TorchCNNModel(),\n        data=dataset,\n        scorer=Scorer(\"accuracy\", default=0.0, range=(0, 1)),\n    )\n
def get_semivalues_and_history(\n sampler_t, max_checks=max_checks, n_jobs=n_jobs, progress=True\n):\n _history = HistoryDeviation(n_steps=max_checks, rtol=1e-9)\n if sampler_t == MSRSampler:\n semivalue_function = compute_msr_banzhaf_semivalues\n else:\n semivalue_function = compute_banzhaf_semivalues\n _values = semivalue_function(\n utility,\n sampler_t=sampler_t,\n done=MaxChecks(max_checks + 2) | _history,\n n_jobs=n_jobs,\n progress=progress,\n )\n return _history, _values\n
# Monte Carlo Permutation Sampling Banzhaf semivalues\nhistory_permutation, permutation_values = get_semivalues_and_history(PermutationSampler)\n
# MSR Banzhaf values\nhistory_msr, msr_values = get_semivalues_and_history(MSRSampler)\n
# UniformSampler\nhistory_uniform, uniform_values = get_semivalues_and_history(UniformSampler)\n
# AntitheticSampler\nhistory_antithetic, antithetic_values = get_semivalues_and_history(AntitheticSampler)\n
# RandomHierarchicalSampler\nhistory_random, random_values = get_semivalues_and_history(RandomHierarchicalSampler)\n
The plot above visualizes the convergence speed of different samplers used for Banzhaf semivalue calculation. It shows the average magnitude by which the semivalues are updated in every step of the algorithm.
As you can see, MSR Banzhaf stabilizes much faster. After 1000 iterations (subsets sampled and evaluated with the utility), Permutation Monte Carlo Banzhaf has evaluated the marginal function about 5 times per data point (we are using 200 data points). For MSR , the semivalue of each data point was updated 1000 times. Due to this, the values converge much faster wrt. the number of utility evaluations, which is the key advantage of MSR sampling.
MSR sampling does come at a cost, however, which is that the updates to the semivalues are more noisy than in other methods. We will analyze the impact of this tradeoff in the next sections. First, let us look at how similar all the computed semivalues are. They are all Banzhaf values, so in a perfect world, all samplers should result in the exact same semivalues. However, due to randomness in the utility (recall that we use a neural network) and randomness in the samplers, the resulting values are likely never exactly the same. Another quality measure is that a good sampler would lead to very consistent values, a bad one to less consistent values. Let us first examine how similar the results are, then we'll look at consistency.
This plot shows that the samplers lead to quite different Banzhaf semivalues, however, all of them have some points in common. The MSR Sampler does not seem to be significantly worse than any others.
In an ideal setting without randomness, the overlap of points would be higher. However, the stochastic nature of the CNN model we use, together with the fact that we train on only 200 data points, might overshadow these results. As a matter of fact we obtain the following rather discouraging result:
\nTotal number of top 20 points that all samplers have in common: 0\n
\n
"},{"location":"examples/msr_banzhaf_digits/#banzhaf-semi-values-for-data-valuation","title":"Banzhaf Semi-values for data valuation","text":" This notebook showcases Data Banzhaf: A Robust Data Valuation Framework for Machine Learning by Wang, and Jia.
Computing Banzhaf semi-values using pyDVL follows basically the same procedure as all other semi-value-based methods like Shapley values. However, Data-Banzhaf tends to be more robust to stochasticity in the training process than other semi-values, a property that we study here.
Additionally, we compare two sampling techniques: the standard permutation-based Monte Carlo sampling, and the so-called MSR (Maximum Sample Reuse) principle.
In order to highlight the strengths of Data-Banzhaf, we require a stochastic model. For this reason, we use a CNN to classify handwritten digits from the scikit-learn toy datasets .
"},{"location":"examples/msr_banzhaf_digits/#setup","title":"Setup","text":"If you are reading this in the documentation, some boilerplate (including most plotting code) has been omitted for convenience."},{"location":"examples/msr_banzhaf_digits/#loading-the-dataset","title":"Loading the dataset","text":" We use a support function, load_digits_dataset()
, which downloads the data and prepares it for usage. It returns four arrays that we then use to construct a Dataset . The data consists of grayscale images of shape 8x8 pixels with 16 shades of gray. These images contain handwritten digits from 0 to 9.
"},{"location":"examples/msr_banzhaf_digits/#creating-the-utility-and-computing-banzhaf-semivalues","title":"Creating the utility and computing Banzhaf semivalues","text":" Now we can calculate the contribution of each training sample to the model performance. First we need a model and a Scorer .
As a model, we use a simple CNN written torch, and wrapped into an object to convert numpy arrays into tensors (as of v0.9.0 valuation methods in pyDVL work only with numpy arrays). Note that any model that implements the protocol pydvl.utils.types.SupervisedModel , which is just the standard sklearn interface of fit()
, predict()
and score()
can be used to construct the utility.
"},{"location":"examples/msr_banzhaf_digits/#evaluation-on-anomalous-data","title":"Evaluation on anomalous data","text":" An interesting use-case for data valuation is finding anomalous data. Maybe some of the data is really noisy or has been mislabeled. To simulate this, we will change some of the labels of our dataset and add noise to some others. Intuitively, these anomalous data points should then have a lower value.
To evaluate this, let us first check the average value of the first 10 data points, as these will be the ones that we modify. Currently, these are the 10 data points with the highest values:
"},{"location":"examples/msr_banzhaf_digits/#maximum-sample-reuse-banzhaf","title":"Maximum Sample Reuse Banzhaf","text":" Despite the previous results already being useful, we had to retrain the model a number of times and yet the variance of the value estimates was high. This has consequences for the stability of the top-k ranking of points, which decreases the applicability of the method. We now introduce a different sampling method called Maximum Sample Reuse ( MSR ) which reuses every sample for updating the Banzhaf values. The method was introduced by the authors of Data-Banzhaf and is much more sample-efficient, as we will show.
We next construct a new utility. Note how this time we don't use a cache: the chance of hitting twice the same subset of the training set is low enough that one can dispense with it (nevertheless it can still be useful, e.g. when running many experiments).
"},{"location":"examples/msr_banzhaf_digits/#compare-convergence-speed-of-banzhaf-and-msr-banzhaf-values","title":"Compare convergence speed of Banzhaf and MSR Banzhaf Values","text":" Conventional margin-based samplers produce require evaluating the utility twice to do one update of the value, and permutation samplers do instead \\(n+1\\) evaluations for \\(n\\) updates. Maximum Sample Reuse ( MSR ) updates instead all indices in every sample that the utility evaluates. We compare the convergence rates of these methods.
In order to do so, we will compute the semi-values using different samplers and use a high number of iterations to make sure that the values have converged.
"},{"location":"examples/msr_banzhaf_digits/#similarity-of-the-semivalues-computed-using-different-samplers","title":"Similarity of the semivalues computed using different samplers","text":""},{"location":"examples/msr_banzhaf_digits/#consistency-of-the-semivalues","title":"Consistency of the semivalues","text":" Finally, we want to analyze how consistent the semivalues returned by the different samplers are. In order to do this, we compute semivalues multiple times and check how many of the data points in the top and lowest 20% of valuation of the data overlap.
"},{"location":"examples/msr_banzhaf_digits/#conclusion","title":"Conclusion","text":" MSR sampling updates the semivalue estimates for every index in the sample, much more frequently than any other sampler available, which leads to much faster convergence . Additionally, the sampler is more consistent with its value estimates than the other samplers, which might be caused by the higher number of value updates.
There is alas no general recommendation. It is best to try different samplers when computing semivalues and test which one is best suited for your use case. Nevertheless, the MSR sampler seems like a more efficient sampler which may bring fast results and is well-suited for stochastic models.
"},{"location":"examples/shapley_basic_spotify/","title":"Shapley values","text":" This notebook introduces Shapley methods for the computation of data value using pyDVL.
In order to illustrate the practical advantages, we will predict the popularity of songs in the dataset Top Hits Spotify from 2000-2019 , and highlight how data valuation can help investigate and boost the performance of the models. In doing so, we will describe the basic usage patterns of pyDVL.
Recall that data value is a function of three things:
- The dataset.
- The model.
- The performance metric or scoring function.
Below we will describe how to instantiate each one of these objects and how to use them for data valuation. Please also see the documentation on data valuation .
We will be using the following functions from pyDVL. The main entry point is the function compute_shapley_values() , which provides a facade to all Shapley methods. In order to use it we need the classes Dataset , Utility and Scorer .
%autoreload\nfrom pydvl.reporting.plots import plot_shapley\nfrom pydvl.utils.dataset import GroupedDataset\nfrom support.shapley import load_spotify_dataset\nfrom pydvl.value import *\n
training_data, val_data, test_data = load_spotify_dataset(\n val_size=0.3, test_size=0.3, target_column=\"popularity\", random_state=random_state\n)\n
training_data[0].head()\n
artist song duration_ms explicit year danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo genre 1561 Fetty Wap 679 (feat. Remy Boyz) 196693 True 2015 0.618 0.717 7 -5.738 1 0.3180 0.00256 0.000000 0.6250 0.603 190.050 8 1410 Meghan Trainor All About That Bass 187920 True 2015 0.807 0.887 9 -3.726 1 0.0503 0.05730 0.000003 0.1240 0.961 134.052 14 1772 Katy Perry Chained To The Rhythm 237733 False 2017 0.562 0.800 0 -5.404 1 0.1120 0.08140 0.000000 0.1990 0.471 95.029 14 1670 Sigala Sweet Lovin' - Radio Edit 202149 False 2015 0.683 0.910 10 -1.231 1 0.0515 0.05530 0.000005 0.3360 0.674 124.977 15 1780 Liam Payne Strip That Down 204502 False 2017 0.869 0.485 6 -5.595 1 0.0545 0.24600 0.000000 0.0765 0.527 106.028 14 The dataset has many high-level features, some quite intuitive ('duration_ms' or 'tempo'), while others are a bit more cryptic ('valence'?). For information on each feature, please consult the dataset's website .
In our analysis, we will use all the columns, except for 'artist' and 'song', to predict the 'popularity' of each song. We will nonetheless keep the information on song and artist in a separate object for future reference.
song_name = training_data[0][\"song\"]\nartist = training_data[0][\"artist\"]\ntraining_data[0] = training_data[0].drop([\"song\", \"artist\"], axis=1)\ntest_data[0] = test_data[0].drop([\"song\", \"artist\"], axis=1)\nval_data[0] = val_data[0].drop([\"song\", \"artist\"], axis=1)\n
Input and label data are then used to instantiate a Dataset object:
dataset = Dataset(*training_data, *val_data)\n
The calculation of exact Shapley values is computationally very expensive (exponentially so!) because it requires training the model on every possible subset of the training set. For this reason, PyDVL implements techniques to speed up the calculation, such as Monte Carlo approximations , surrogate models or caching of intermediate results and grouping of data to calculate group Shapley values instead of single data points.
In our case, we will group songs by artist and calculate the Shapley value for the artists. Given the pandas Series for 'artist', to group the dataset by it, one does the following:
grouped_dataset = GroupedDataset.from_dataset(dataset=dataset, data_groups=artist)\n
utility = Utility(\n model=GradientBoostingRegressor(n_estimators=3),\n data=grouped_dataset,\n scorer=Scorer(\"neg_mean_absolute_error\", default=0.0),\n)\nvalues = compute_shapley_values(\n utility,\n mode=ShapleyMode.TruncatedMontecarlo,\n # Stop if the standard error is below 1% of the range of the values (which is ~2),\n # or if the number of updates exceeds 1000\n done=AbsoluteStandardError(threshold=0.2, fraction=0.9) | MaxUpdates(1000),\n truncation=RelativeTruncation(utility, rtol=0.01),\n n_jobs=-1,\n)\nvalues.sort(key=\"value\")\ndf = values.to_dataframe(column=\"data_value\", use_names=True)\n
\nCancellation of futures is not supported by the joblib backend\n
\n
The function compute_shapley_values() serves as a common access point to all Shapley methods. For most of them, we must choose a StoppingCriterion
with the argument done=
. In this case we choose to stop when the ratio of standard error to value is below 0.2 for at least 90% of the training points, or if the number of updates of any index exceeds 1000. The mode
argument specifies the Shapley method to use. In this case, we use the Truncated Monte Carlo approximation , which is the fastest of the Monte Carlo methods, owing both to using the permutation definition of Shapley values and the ability to truncate the iteration over a given permutation. We configure this to happen when the contribution of the remaining elements is below 1% of the total utility with the parameter truncation=
and the policy RelativeTruncation .
Let's take a look at the returned dataframe:
df.head()\n
               data_value  data_value_stderr
Years & Years   -1.150663           0.195376
Reik            -1.123071           0.126558
Astrid S        -0.945702           0.331619
Liam Payne      -0.886687           0.112654
DB Boulevard    -0.847957           0.057503

The first thing to notice is that we sorted the results in ascending order of Shapley value. The index holds the labels for each data group: in this case, artist names. The column data_value
is just that: the Shapley Data value, and data_value_stderr
is its estimated standard error because we are using a Monte Carlo approximation.
Let us plot the results. In the next cell we will take the 30 artists with the lowest score and plot their values with 95% Normal confidence intervals. Keep in mind that Monte Carlo Shapley is typically very noisy, and it can take many steps to arrive at a clean estimate.
We can immediately see that many artists (groups of samples) have very low, even negative value, which means that they tend to decrease the total score of the model when present in the training set! What happens if we remove them?
In the next cell we create a new training set excluding the artists with the lowest scores:
low_dvl_artists = df.iloc[: int(0.2 * len(df))].index.to_list()\nartist_filter = ~artist.isin(low_dvl_artists)\nX_train_good_dvl = training_data[0][artist_filter]\ny_train_good_dvl = training_data[1][artist_filter]\n
Now we will use this \"cleaned\" dataset to retrain the same model and compare its mean absolute error to the one trained on the full dataset. Notice that the score now is calculated using the test set, while in the calculation of the Shapley values we were using the validation set.
model_good_data = GradientBoostingRegressor(n_estimators=3).fit(\n X_train_good_dvl, y_train_good_dvl\n)\nerror_good_data = mean_absolute_error(\n model_good_data.predict(test_data[0]), test_data[1]\n)\n\nmodel_all_data = GradientBoostingRegressor(n_estimators=3).fit(\n training_data[0], training_data[1]\n)\nerror_all_data = mean_absolute_error(model_all_data.predict(test_data[0]), test_data[1])\n\nprint(f\"Improvement: {100*(error_all_data - error_good_data)/error_all_data:02f}%\")\n
\nImprovement: 15.314214%\n
\n
The score has improved by about 15%! This is quite an important result, as it shows a consistent process to improve the performance of a model by excluding data points from its training set.
One must however proceed with caution instead of simply throwing away data. For one, `mean_absolute_error` is an estimate of generalization error on unseen data, so the improvement we see on the test set might not be as large upon deployment. It would be advisable to cross-validate this whole process to obtain more conservative estimates. It is also advisable to manually inspect the artists with low value and to try to understand the reason why the model behaves like it does. Finally, remember that **the value depends on the model chosen**! Artists that are detrimental to the Gradient Boosting Regressor might be informative for a different model (although it is likely that the worst ones share some characteristic making them \"bad\" for other regressors). Let us take all the songs by Billie Eilish, set their score to 0 and re-calculate the Shapley values.
y_train_anomalous = training_data[1].copy(deep=True)\ny_train_anomalous[artist == \"Billie Eilish\"] = 0\nanomalous_dataset = Dataset(\n x_train=training_data[0],\n y_train=y_train_anomalous,\n x_test=val_data[0],\n y_test=val_data[1],\n)\ngrouped_anomalous_dataset = GroupedDataset.from_dataset(anomalous_dataset, artist)\nanomalous_utility = Utility(\n model=GradientBoostingRegressor(n_estimators=3),\n data=grouped_anomalous_dataset,\n scorer=Scorer(\"neg_mean_absolute_error\", default=0.0),\n)\nvalues = compute_shapley_values(\n anomalous_utility,\n mode=ShapleyMode.TruncatedMontecarlo,\n done=AbsoluteStandardError(threshold=0.2, fraction=0.9) | MaxUpdates(1000),\n n_jobs=-1,\n)\nvalues.sort(key=\"value\")\ndf = values.to_dataframe(column=\"data_value\", use_names=True)\n
\nCancellation of futures is not supported by the joblib backend\n
\n
Let us now consider the low-value artists (at least for predictive purposes, no claims are made about their artistic value!) and plot the results
And Billie Eilish (our anomalous data group) has moved from top contributor to having negative impact on the performance of the model, as expected!
What is going on? A popularity of 0 for Billie Eilish's songs is inconsistent with listening patterns for other artists. In artificially setting this, we degrade the predictive power of the model.
By dropping low-value groups or samples, one can often increase model performance, but by inspecting them, it is possible to identify bogus data sources or acquisition methods.
"},{"location":"examples/shapley_basic_spotify/#shapley-for-data-valuation","title":"Shapley for data valuation","text":""},{"location":"examples/shapley_basic_spotify/#setup","title":"Setup","text":" We begin by importing the main libraries and setting some defaults.
If you are reading this in the documentation, some boilerplate (including most plotting code) has been omitted for convenience."},{"location":"examples/shapley_basic_spotify/#loading-and-grouping-the-dataset","title":"Loading and grouping the dataset","text":" pyDVL provides a support function for this notebook, load_spotify_dataset()
, which downloads data on songs published after 2014, and splits 30% of data for testing, and 30% of the remaining data for validation. The return value is a triple of training, validation and test data as lists of the form [X_input, Y_label]
.
"},{"location":"examples/shapley_basic_spotify/#creating-the-utility-and-computing-values","title":"Creating the utility and computing values","text":" Now we can calculate the contribution of each group to the model performance.
As a model, we use scikit-learn's GradientBoostingRegressor , but pyDVL can work with any model from sklearn, xgboost or lightgbm. More precisely, any model that implements the protocol pydvl.utils.types.SupervisedModel , which is just the standard sklearn interface of fit()
, predict()
and score()
can be used to construct the utility.
The third and final component is the scoring function. It can be anything like accuracy or \\(R^2\\) , and is set with a string from the standard sklearn scoring methods . Please refer to that documentation on information on how to define your own scoring function.
We group dataset, model and scoring function into an instance of Utility .
"},{"location":"examples/shapley_basic_spotify/#evaluation-on-anomalous-data","title":"Evaluation on anomalous data","text":" One interesting test is to corrupt some data and to monitor how their value changes. To do this, we will take one of the artists with the highest value and set the popularity of all their songs to 0.
"},{"location":"examples/shapley_knn_flowers/","title":"KNN Shapley","text":" This notebook shows how to calculate Shapley values for the K-Nearest Neighbours algorithm. By making use of the local structure of KNN, it is possible to compute an exact value in almost linear time, as opposed to exponential complexity of exact, model-agnostic Shapley.
The main idea is to exploit the fact that adding or removing points beyond the k-ball doesn't influence the score. Because the algorithm then essentially only needs to do a search, it runs in \\(\\mathcal{O}(N \\log N)\\) time.
By further using approximate nearest neighbours, it is possible to achieve \\((\\epsilon,\\delta)\\) -approximations in sublinear time. However, this is not implemented in pyDVL yet.
We refer to the original paper that pyDVL implements for details: Jia, Ruoxi, David Dao, Boxin Wang, Frances Ann Hubis, Nezihe Merve Gurel, Bo Li, Ce Zhang, Costas Spanos, and Dawn Song. Efficient Task-Specific Data Valuation for Nearest Neighbor Algorithms . Proceedings of the VLDB Endowment 12, no. 11 (1 July 2019): 1610\u201323.
The main entry point is the function compute_shapley_values() , which provides a facade to all Shapley methods. In order to use it we need the classes Dataset , Utility and Scorer , all of which can be imported from pydvl.value
:
from pydvl.value import *\n
sklearn_dataset = datasets.load_iris()\ndata = Dataset.from_sklearn(sklearn_dataset)\nknn = sk.neighbors.KNeighborsClassifier(n_neighbors=5)\nutility = Utility(knn, data)\n
shapley_values = compute_shapley_values(utility, mode=ShapleyMode.KNN, progress=True)\nshapley_values.sort(key=\"value\")\nvalues = shapley_values.values\n
\n0it [00:00, ?it/s]
\n
If we now look at the distribution of Shapley values for each class, we see that each has samples with both high and low scores. This is expected, because an accurate model uses information of all classes.
corrupted_data = deepcopy(data)\nn_corrupted = 10\ncorrupted_data.y_train[:n_corrupted] = (corrupted_data.y_train[:n_corrupted] + 1) % 3\nknn = sk.neighbors.KNeighborsClassifier(n_neighbors=5)\ncontaminated_values = compute_shapley_values(\n Utility(knn, corrupted_data), mode=ShapleyMode.KNN\n)\n
Taking the average corrupted value and comparing it to non-corrupted ones, we notice that on average anomalous points have a much lower score, i.e. they tend to be much less valuable to the model.
To do this, first we make sure that we access the results by data index with a call to ValuationResult.sort()
, then we split the values into two groups: corrupted and non-corrupted. Note how we access property values
of the ValuationResult
object. This is a numpy array of values, sorted however the object was sorted. Finally, we compute the quantiles of the two groups and compare them. We see that the corrupted mean is in the lowest percentile of the value distribution, while the correct mean is in the 70th percentile.
contaminated_values.sort(\n key=\"index\"\n) # This is redundant, but illustrates sorting, which is in-place\n\ncorrupted_shapley_values = contaminated_values.values[:n_corrupted]\ncorrect_shapley_values = contaminated_values.values[n_corrupted:]\n\nmean_corrupted = np.mean(corrupted_shapley_values)\nmean_correct = np.mean(correct_shapley_values)\npercentile_corrupted = np.round(100 * np.mean(values < mean_corrupted), 0)\npercentile_correct = np.round(100 * np.mean(values < mean_correct), 0)\n\nprint(\n f\"The corrupted mean is at percentile {percentile_corrupted:.0f} of the value distribution.\"\n)\nprint(\n f\"The correct mean is percentile {percentile_correct:.0f} of the value distribution.\"\n)\n
\nThe corrupted mean is at percentile 2 of the value distribution.\nThe correct mean is percentile 71 of the value distribution.\n
\n
This is confirmed if we plot the distribution of Shapley values and circle corrupt points in red. They all tend to have low Shapley scores, regardless of their position in space and assigned label:
"},{"location":"examples/shapley_knn_flowers/#knn-shapley","title":"KNN Shapley","text":""},{"location":"examples/shapley_knn_flowers/#setup","title":"Setup","text":" We begin by importing the main libraries and setting some defaults.
If you are reading this in the documentation, some boilerplate (including most plotting code) has been omitted for convenience."},{"location":"examples/shapley_knn_flowers/#building-a-dataset-and-a-utility","title":"Building a Dataset and a Utility","text":" We use the sklearn iris dataset and wrap it into a pydvl.utils.dataset.Dataset calling the factory pydvl.utils.dataset.Dataset.from_sklearn() . This automatically creates a train/test split for us which will be used to compute the utility.
We then create a model and instantiate a Utility using data and model. The model needs to implement the protocol pydvl.utils.types.SupervisedModel , which is just the standard sklearn interface of fit()
, predict()
and score()
. In constructing the Utility
one can also choose a scoring function, but we pick the default which is just the model's knn.score()
.
"},{"location":"examples/shapley_knn_flowers/#computing-values","title":"Computing values","text":" Calculating the Shapley values is straightforward. We just call compute_shapley_values() with the utility object we created above. The function returns a ValuationResult . This object contains the values themselves, data indices and labels.
"},{"location":"examples/shapley_knn_flowers/#inspecting-the-results","title":"Inspecting the results","text":" Let us first look at the labels' distribution as a function of petal and sepal length:
"},{"location":"examples/shapley_knn_flowers/#corrupting-labels","title":"Corrupting labels","text":" To test how informative values are, we can corrupt some training labels and see how their Shapley values change with respect to the non-corrupted points.
"},{"location":"examples/shapley_utility_learning/","title":"Data utility learning","text":" This notebook introduces Data Utility Learning , a method of approximating Data Shapley values by learning to estimate the utility function.
The idea is to employ a model to learn the performance of the learning algorithm of interest on unseen data combinations (i.e. subsets of the dataset). The method was originally described in Wang, Tianhao, Yu Yang, and Ruoxi Jia. Improving Cooperative Game Theory-Based Data Valuation via Data Utility Learning . arXiv, 2022 .
Warning: Work on Data Utility Learning is preliminary. It remains to be seen when or whether it can be put effectively into application; further testing and benchmarking are required.
Recall the definition of Shapley value \\(v_u(i)\\) for data point \\(i\\) :
\\[\\begin{equation} v_u(i) = \\frac{1}{n} \\sum_{S \\subseteq N \\setminus \\{i\\}} \\binom{n-1}{|S|}^{-1} [u(S \\cup \\{i\\}) \u2212 u(S)] , \\tag{1} \\label{eq:shapley-def} \\end{equation}\\] where \\(N\\) is the set of all indices in the training set and \\(u\\) is the utility.
In Data Utility Learning, to avoid the exponential cost of computing this sum, one learns a surrogate model for \\(u\\) . We start by sampling so-called utility samples to form a training set \\(S_\\mathrm{train}\\) for our utility model. Each utility sample is a tuple consisting of a subset of indices \\(S_j\\) in the dataset and its utility \\(u(S_j)\\) :
\\[\\mathcal{S}_\\mathrm{train} = \\{(S_j, u(S_j)): j = 1, \\ldots, m_\\mathrm{train}\\}\\] where \\(m_\\mathrm{train}\\) denotes the training budget for the learned utility function.
The subsets are then transformed into boolean vectors \\(\\phi\\) in which a \\(1\\) at index \\(k\\) means that the \\(k\\) -th sample of the dataset is present in the subset:
\\[S_j \\mapsto \\phi_j \\in \\{ 0, 1 \\}^{N}\\] We fit a regression model \\(\\tilde{u}\\), called data utility model, on the transformed utility samples \\(\\phi(\\mathcal{S}_\\mathrm{train}) := \\{(\\phi(S_j), u(S_j)): j = 1, \\ldots, m_\\mathrm{train}\\}\\) and use it to predict the utility, instead of computing it, for any \\(S_j \\notin \\mathcal{S}_\\mathrm{train}\\). We abuse notation and identify \\(\\tilde{u}\\) with the composition \\(\\tilde{u} \\circ \\phi : 2^{N} \\rightarrow \\mathbb{R}\\).
The main assumption is that it is much faster to fit and use \\(\\tilde{u}\\) than it is to compute \\(u\\) and that for most \\(i\\) , \\(v_\\tilde{u}(i) \\approx v_u(i)\\) in some sense.
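As a small concrete illustration of the encoding \\(\\phi\\) described above (plain numpy, not pyDVL code):
import numpy as np\n\nn = 10  # size of the training set\nS_j = [0, 3, 7]  # one sampled subset of indices\nphi_j = np.zeros(n, dtype=int)\nphi_j[S_j] = 1  # 1 at position k iff sample k is in the subset\n# phi_j is now array([1, 0, 0, 1, 0, 0, 0, 1, 0, 0])\n# The utility model is fit on pairs (phi_j, u(S_j))\n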
As is the case with all other Shapley methods, the main entry point is the function compute_shapley_values() , which provides a facade to all algorithms in this family. We use it with the usual classes Dataset and Utility . In addition, we must import the core class for learning a utility, DataUtilityLearning .
%autoreload\nfrom pydvl.utils import DataUtilityLearning, top_k_value_accuracy\nfrom pydvl.reporting.plots import shaded_mean_std\nfrom pydvl.value import *\n
dataset = Dataset.from_sklearn(\n load_iris(),\n train_size=train_size,\n random_state=random_state,\n stratify_by_target=True,\n)\n
We verify that, as in the paper, if we fit a Support-Vector Classifier to the training data, we obtain an accuracy of around 92%:
model = LinearSVC()\nmodel.fit(dataset.x_train, dataset.y_train)\nprint(f\"Mean accuracy: {100 * model.score(dataset.x_test, dataset.y_test):0.2f}%\")\n
\nMean accuracy: 92.59%\n
\n
computation_times = {}\n
utility = Utility(model=model, data=dataset)\n
start_time = time.monotonic()\n\nresult = compute_shapley_values(\n u=utility,\n mode=ShapleyMode.CombinatorialExact,\n n_jobs=-1,\n progress=False,\n)\n\ncomputation_time = time.monotonic() - start_time\ncomputation_times[\"exact\"] = computation_time\n\ndf = result.to_dataframe(column=\"exact\").drop(columns=[\"exact_stderr\"])\n
We now estimate the Data Shapley values using the DataUtilityLearning wrapper. This class wraps a Utility and delegates calls to it, up until a given budget. Every call yields a utility sample which is saved under the hood for training of the given utility model. Once the budget is exhausted, DataUtilityLearning
fits the model to the utility samples and all subsequent calls use the learned model to predict the wrapped utility instead of delegating to it.
For the utility model we follow the paper and use a fully connected neural network. To train it we use a total of training_budget
utility samples. We repeat this multiple times for each training budget.
Note how we use a Monte Carlo approximation instead of `combinatorial_exact` as before. This is because the exact computation samples subsets in a particular order, from the smallest size to the largest. Because the training budget for the model learning the utility is around a quarter of the total number of subsets, we would never see utility samples for the larger sizes and the model would be biased (try it!).
mlp_kwargs = dict(\n hidden_layer_sizes=(20, 10),\n activation=\"relu\",\n solver=\"adam\",\n learning_rate_init=0.001,\n batch_size=batch_size,\n max_iter=800,\n)\n\nprint(\n f\"Doing {n_runs} runs for each of {len(training_budget_values)} different training budgets.\"\n)\n\npbar = tqdm(\n product(range(n_runs), training_budget_values),\n total=n_runs * len(training_budget_values),\n)\nfor idx, budget in pbar:\n pbar.set_postfix_str(f\"Run {idx} for training budget: {budget}\")\n dul_utility = DataUtilityLearning(\n u=utility, training_budget=budget, model=MLPRegressor(**mlp_kwargs)\n )\n\n start_time = time.monotonic()\n\n # DUL will kick in after training_budget calls to utility\n result = compute_shapley_values(\n u=dul_utility,\n mode=ShapleyMode.PermutationMontecarlo,\n done=MaxUpdates(300),\n n_jobs=-1,\n )\n\n computation_time = time.monotonic() - start_time\n if budget in computation_times:\n computation_times[budget].append(computation_time)\n else:\n computation_times[budget] = [computation_time]\n\n dul_df = result.to_dataframe(column=f\"{budget}_{idx}\").drop(\n columns=[f\"{budget}_{idx}_stderr\"]\n )\n df = pd.concat([df, dul_df], axis=1)\n\ncomputation_times_df = pd.DataFrame(computation_times)\n
\nDoing 10 runs for each of 10 different training budgets.\n
\n
\n 0%| | 0/100 [00:00<?, ?it/s]
\n
Next we compute the \\(l_2\\) error for the different training budgets across all runs and plot mean and standard deviation. We obtain results analogous to Figure 1 of the paper, verifying that the method indeed works for estimating the Data Shapley values (at least in this context).
In the plot we also display the mean and standard deviation of the computation time taken for each training budget.
errors = np.zeros((len(training_budget_values), n_runs), dtype=float)\naccuracies = np.zeros((len(training_budget_values), n_runs), dtype=float)\n\ntop_k = 3\n\nfor i, budget in enumerate(training_budget_values):\n for j in range(n_runs):\n y_true = df[\"exact\"].values\n y_estimated = df[f\"{budget}_{j}\"].values\n errors[i, j] = np.linalg.norm(y_true - y_estimated, ord=2)\n accuracies[i, j] = top_k_value_accuracy(y_true, y_estimated, k=top_k)\n\nerror_from_mean = np.linalg.norm(df[\"exact\"].values - df[\"exact\"].values.mean(), ord=2)\n
Let us next look at how well the ranking of values resulting from using the surrogate \\(\\tilde{u}\\) matches the ranking by the exact values. For this we fix \\(k=3\\) and consider the \\(k\\) samples with the highest value according to \\(\\tilde{u}\\) and \\(u\\) :
Finally, for each sample, we look at the distance of the estimates to the exact value across runs. Boxes are centered at the 50th percentile with whiskers at the 25th and 75th. We plot relative distances, as a percentage. We observe a general tendency to underestimate the value:
highest_value_index = df.index[df[\"exact\"].argmax()]\ny_train_corrupted = dataset.y_train.copy()\ny_train_corrupted[highest_value_index] = (\n y_train_corrupted[highest_value_index] + 1\n) % 3\n\ncorrupted_dataset = Dataset(\n x_train=dataset.x_train,\n y_train=y_train_corrupted,\n x_test=dataset.x_test,\n y_test=dataset.y_test,\n)\n
We retrain the model on the new dataset and verify that the accuracy decreases:
model = LinearSVC()\nmodel.fit(dataset.x_train, y_train_corrupted)\nprint(f\"Mean accuracy: {100 * model.score(dataset.x_test, dataset.y_test):0.2f}%\")\n
\nMean accuracy: 82.96%\n
\n
Finally, we recompute the values of all samples using the exact method and the best training budget previously obtained and then plot the resulting scores.
best_training_budget = training_budget_values[errors.mean(axis=1).argmin()]\n\nutility = Utility(\n model=LinearSVC(),\n data=corrupted_dataset,\n)\n\nresult = compute_shapley_values(\n u=utility,\n mode=ShapleyMode.CombinatorialExact,\n n_jobs=-1,\n progress=False,\n)\ndf_corrupted = result.to_dataframe(column=\"exact\").drop(columns=[\"exact_stderr\"])\n\ndul_utility = DataUtilityLearning(\n u=utility, training_budget=best_training_budget, model=MLPRegressor(**mlp_kwargs)\n)\n\nresult = compute_shapley_values(\n u=dul_utility,\n mode=ShapleyMode.PermutationMontecarlo,\n done=MaxUpdates(300),\n n_jobs=-1,\n)\ndul_df = result.to_dataframe(column=\"estimated\").drop(columns=[\"estimated_stderr\"])\ndf_corrupted = pd.concat([df_corrupted, dul_df], axis=1)\n
We can see in the figure that both methods assign the lowest value to the sample with the corrupted label.
As mentioned above, despite the previous results, this work is preliminary and the usefulness of Data Utility Learning remains to be tested in practice."},{"location":"examples/shapley_utility_learning/#data-utility-learning","title":"Data Utility Learning","text":""},{"location":"examples/shapley_utility_learning/#setup","title":"Setup","text":" We begin by importing the main libraries and setting some defaults.
If you are reading this in the documentation, some boilerplate (including most plotting code) has been omitted for convenience."},{"location":"examples/shapley_utility_learning/#dataset","title":"Dataset","text":" Following the paper, we take 15 samples (10%) from the Iris dataset and compute their Data Shapley values by using all the remaining samples as test set for computing the utility, which in this case is accuracy.
"},{"location":"examples/shapley_utility_learning/#data-shapley","title":"Data Shapley","text":" We start by defining the utility using the model and computing the exact Data Shapley values by definition \\(\\ref{eq:shapley-def}\\) .
"},{"location":"examples/shapley_utility_learning/#evaluation-on-anomalous-data","title":"Evaluation on anomalous data","text":" One interesting way to assess the Data Utility Learning approach is to corrupt some data and monitor how the value changes. To do this, we will take the sample with the highest score and change its label.
"},{"location":"getting-started/","title":"Getting started","text":"If you want to jump straight in, install pyDVL and then check out the examples. You will probably want to install with support for influence function computation.
We have introductions to the ideas behind Data valuation and Influence functions, as well as a short overview of common applications.
"},{"location":"getting-started/#","title":"Installing pyDVL","text":"To install the latest release use:
pip install pyDVL\n
See Extras for optional dependencies, in particular if you are interested in influence functions. You can also install the latest development version from TestPyPI:
pip install pyDVL --index-url https://test.pypi.org/simple/\n
In order to check the installation you can use:
python -c \"import pydvl; print(pydvl.__version__)\"\n
"},{"location":"getting-started/#dependencies","title":"Dependencies","text":"pyDVL requires Python >= 3.8, numpy, scikit-learn, scipy, cvxpy for the core methods, and joblib for parallelization locally. Additionally,the Influence functions module requires PyTorch (see Extras below).
"},{"location":"getting-started/#installation-extras","title":"Extras","text":"pyDVL has a few extra dependencies that can be optionally installed:
"},{"location":"getting-started/#installation-influences","title":"Influence functions","text":"To use the module on influence functions, pydvl.influence, run:
pip install pyDVL[influence]\n
This includes a dependency on PyTorch (Version 2.0 and above) and thus is left out by default.
"},{"location":"getting-started/#cupy","title":"CuPy","text":"In case that you have a supported version of CUDA installed (v11.2 to 11.8 as of this writing), you can enable eigenvalue computations for low-rank approximations with CuPy on the GPU by using:
pip install pyDVL[cupy]\n
This installs cupy-cuda11x.
If you use a different version of CUDA, please install CuPy manually.
"},{"location":"getting-started/#ray","title":"Ray","text":"If you want to use Ray to distribute data valuation workloads across nodes in a cluster (it can be used locally as well, but for this we recommend joblib instead) install pyDVL using:
pip install pyDVL[ray]\n
See the intro to parallelization for more details on how to use it.
"},{"location":"getting-started/#memcached","title":"Memcached","text":"If you want to use Memcached for caching utility evaluations, use:
pip install pyDVL[memcached]\n
This installs pymemcache additionally. Be aware that you still have to start a memcached server manually. See Setting up the Memcached cache.
"},{"location":"getting-started/advanced-usage/","title":"Advanced usage","text":"Besides the dos and don'ts of data valuation itself, which are the subject of the examples and the documentation of each method, there are two main things to keep in mind when using pyDVL namely Parallelization and Caching.
"},{"location":"getting-started/advanced-usage/#setting-up-parallelization","title":"Parallelization","text":"pyDVL uses parallelization to scale and speed up computations. It does so using one of Dask, Ray or Joblib. The first is used in the influence package whereas the other two are used in the value package.
"},{"location":"getting-started/advanced-usage/#data-valuation","title":"Data valuation","text":"For data valuation, pyDVL uses joblib for local parallelization (within one machine) and supports using Ray for distributed parallelization (across multiple machines).
The former works out of the box but for the latter you will need to install additional dependencies (see Extras) and to provide a running cluster (or run ray in local mode).
Info
As of v0.9.0 pyDVL does not allow requesting resources per task sent to the cluster, so you will need to make sure that each worker has enough resources to handle the tasks it receives. A data valuation task using game-theoretic methods will typically make a copy of the whole model and dataset to each worker, even if the re-training only happens on a subset of the data. This means that you should make sure that each worker has enough memory to handle the whole dataset.
We use backend classes for both joblib and ray as well as two types of executors for the different algorithms: the first uses a map reduce pattern as seen in the MapReduceJob class and the second implements the futures executor interface from concurrent.futures.
As a convenience, you can also instantiate a parallel backend class by using the init_parallel_backend function:
from pydvl.parallel import init_parallel_backend\nparallel_backend = init_parallel_backend(backend_name=\"joblib\")\n
Info
The executor classes are not meant to be instantiated and used by users of pyDVL. They are used internally as part of the computations of the different methods.
Deprecation notice
We are currently planning to deprecate MapReduceJob in favour of the futures executor interface because it allows for more diverse computation patterns with interruptions.
"},{"location":"getting-started/advanced-usage/#joblib","title":"Joblib","text":"Please follow the instructions in Joblib's documentation for all possible configuration options that you can pass to the parallel_config context manager.
To use the joblib parallel backend with the loky
backend and verbosity set to 100
to compute exact shapley values you would use:
import joblib\nfrom pydvl.parallel import JoblibParallelBackend\nfrom pydvl.value.shapley import combinatorial_exact_shapley\nfrom pydvl.utils.utility import Utility\n\nparallel_backend = JoblibParallelBackend() \nu = Utility(...)\n\nwith joblib.parallel_config(backend=\"loky\", verbose=100):\n values = combinatorial_exact_shapley(u, parallel_backend=parallel_backend)\n
"},{"location":"getting-started/advanced-usage/#ray","title":"Ray","text":"Additional dependencies
The Ray parallel backend requires optional dependencies. See Extras for more information.
Please follow the instructions in Ray's documentation to set up a remote cluster. You could alternatively use a local cluster and in that case you don't have to set anything up.
Before starting a computation, you should initialize ray by calling ray.init
with the appropriate parameters:
To set up and start a local ray cluster with 4 CPUs you would use:
import ray\n\nray.init(num_cpus=4)\n
Whereas for a remote ray cluster you would use:
import ray\n\naddress = \"<Hypothetical Ray Cluster IP Address>\"\nray.init(address)\n
To use the ray parallel backend to compute exact shapley values you would use:
import ray\nfrom pydvl.parallel import RayParallelBackend\nfrom pydvl.value.shapley import combinatorial_exact_shapley\nfrom pydvl.utils.utility import Utility\n\nray.init()\nparallel_backend = RayParallelBackend()\nu = Utility(...)\nvalues = combinatorial_exact_shapley(u, parallel_backend=parallel_backend)\n
"},{"location":"getting-started/advanced-usage/#futures-executor","title":"Futures executor","text":"For the futures executor interface, we have implemented an executor class for ray in RayExecutor and rely on joblib's loky get_reusable_executor function to instantiate an executor for local parallelization.
They are both compatible with the built-in ThreadPoolExecutor and ProcessPoolExecutor classes.
>>> from joblib.externals.loky import _ReusablePoolExecutor\n>>> from pydvl.parallel import JoblibParallelBackend\n>>> parallel_backend = JoblibParallelBackend() \n>>> with parallel_backend.executor() as executor:\n... results = list(executor.map(lambda x: x + 1, range(3)))\n...\n>>> results\n[1, 2, 3]\n
"},{"location":"getting-started/advanced-usage/#map-reduce","title":"Map-reduce","text":"The map-reduce interface is older and more limited in the patterns it allows us to use.
To reproduce the previous example using MapReduceJob, we would use:
>>> from pydvl.parallel import JoblibParallelBackend, MapReduceJob\n>>> parallel_backend = JoblibParallelBackend() \n>>> map_reduce_job = MapReduceJob(\n... list(range(3)),\n... map_func=lambda x: x[0] + 1,\n... parallel_backend=parallel_backend,\n... )\n>>> results = map_reduce_job()\n>>> results\n[1, 2, 3]\n
"},{"location":"getting-started/advanced-usage/#influence-functions","title":"Influence functions","text":"Refer to Scaling influence computation for explanations about parallelization for Influence Functions.
"},{"location":"getting-started/advanced-usage/#getting-started-cache","title":"Caching","text":"PyDVL can cache (memoize) the computation of the utility function and speed up some computations for data valuation. It is however disabled by default. When it is enabled it takes into account the data indices passed as argument and the utility function wrapped into the Utility object. This means that care must be taken when reusing the same utility function with different data, see the documentation for the caching package for more information.
In general, caching won't play a major role in the computation of Shapley values because the probability of sampling the same subset twice, and hence needing the same utility function computation, is very low. However, it can be very useful when comparing methods that use the same utility function, or when running multiple experiments with the same data.
pyDVL supports 3 different caching backends:
-
InMemoryCacheBackend: an in-memory cache backend that uses a dictionary to store and retrieve cached values. This is used to share cached values between threads in a single process.
-
DiskCacheBackend: a disk-based cache backend that uses pickled values written to and read from disk. This is used to share cached values between processes in a single machine.
-
MemcachedCacheBackend: a Memcached-based cache backend that uses pickled values written to and read from a Memcached server. This is used to share cached values between processes across multiple machines.
Memcached extras
The Memcached backend requires optional dependencies. See Extras for more information.
As an example, here's how one would use the disk-based cached backend with a utility:
from pydvl.utils.caching.disk import DiskCacheBackend\nfrom pydvl.utils.utility import Utility\n\ncache_backend = DiskCacheBackend()\nu = Utility(..., cache_backend=cache_backend)\n
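Analogously, a sketch using the in-memory backend listed above, which shares cached values between threads of a single process:
from pydvl.utils.caching.memory import InMemoryCacheBackend\nfrom pydvl.utils.utility import Utility\n\ncache_backend = InMemoryCacheBackend()\nu = Utility(..., cache_backend=cache_backend)\n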
Please refer to the documentation and examples of each backend class for more details.
When is the cache really necessary?
Crucially, semi-value computations with the PermutationSampler require caching to be enabled, or they will take twice as long as the direct implementation in compute_shapley_values.
Using the cache
Continue reading about the cache in the documentation for the caching package.
"},{"location":"getting-started/advanced-usage/#setting-up-memcached","title":"Setting up the Memcached cache","text":"Memcached is an in-memory key-value store accessible over the network. pyDVL can use it to cache the computation of the utility function and speed up some computations (in particular, semi-value computations with the PermutationSampler but other methods may benefit as well).
You can either install it as a package or run it inside a docker container (the simplest). For installation instructions, refer to the Getting started section in memcached's wiki. Then you can run it with:
memcached -u user\n
To run memcached inside a container in daemon mode instead, use:
docker container run -d --rm -p 11211:11211 memcached:latest\n
"},{"location":"getting-started/applications/","title":"Applications of data valuation","text":"Data valuation methods can improve various aspects of data engineering and machine learning workflows. When applied judiciously, these methods can enhance data quality, model performance, and cost-effectiveness.
However, the results can be inconsistent. Values have a strong dependency on the training procedure and the performance metric used. For instance, accuracy is a poor metric for imbalanced sets and this has a stark effect on data values. Some models exhibit great variance in some regimes and this again has a detrimental effect on values. See Problems of data values for more on this.
Here we quickly enumerate the most common uses of data valuation. For a comprehensive overview, along with concrete examples, please refer to the Transferlab blog post on this topic.
"},{"location":"getting-started/applications/#data-engineering","title":"Data engineering","text":"Some of the promising applications in data engineering include:
- Removing low-value data points to increase model performance.
- Pruning redundant samples enables more efficient training of large models.
- Active learning. Points predicted to have high value can be prioritized for labeling, reducing the cost of data collection.
- Analyzing high- and low-value data to guide data collection and improve upstream data processes. Low-value points may reveal data issues to address.
- Identifying irrelevant or duplicated data when evaluating offerings from data providers.
"},{"location":"getting-started/applications/#model-development","title":"Model development","text":"Some of the useful applications include:
- Data attribution for interpretation and debugging: Analyzing the most or least valuable samples for a class can reveal cases where the model relies on confounding features instead of true signal. Investigating influential points for misclassified examples highlights limitations to address.
- Sensitivity / robustness analysis: (Broderick et al., 2021)1 shows that removing a small fraction of highly influential data can completely flip model conclusions. This can reveal potential issues with the modeling approach, data collection process, or intrinsic difficulties of the problem that require further inspection.
- Continual learning: in order to avoid forgetting when training on new data, a subset of previously seen data is presented again. Data valuation can help in the selection of the most valuable samples to retain.
"},{"location":"getting-started/applications/#attacks","title":"Attacks","text":"Data valuation techniques have applications in detecting data manipulation and contamination, although the feasibility of such attacks is limited.
- Watermark removal: Points with low value on a correct validation set may be part of a watermarking mechanism.
- Poisoning attacks: Influential points can be shifted to induce large changes in model estimators.
"},{"location":"getting-started/applications/#data-markets","title":"Data markets","text":"Additionally, one of the motivating applications for the whole field is that of data markets, where data valuation can be the key component to determine the price of data.
Game-theoretic valuation methods like Shapley values can help assign fair prices, but have limitations around handling duplicates or adversarial data. Model-free methods like LAVA (Just et al., 2023)2 and CRAIG are particularly well suited for this, as they use the Wasserstein distance between a vendor's data and the buyer's to determine the value of the former.
However, this is a complex problem which faces practical obstacles, such as data owners being unwilling to disclose their data for valuation, even to a broker.
-
Broderick, T., Giordano, R., Meager, R., 2021. An Automatic Finite-Sample Robustness Metric: When Can Dropping a Little Data Make a Big Difference? \u21a9
-
Just, H.A., Kang, F., Wang, T., Zeng, Y., Ko, M., Jin, M., Jia, R., 2023. LAVA: Data Valuation without Pre-Specified Learning Algorithms. Presented at the The Eleventh International Conference on Learning Representations (ICLR 2023).\u00a0\u21a9
"},{"location":"getting-started/benchmarking/","title":"Benchmarking tasks","text":"Because the magnitudes of values or influences from different algorithms, or datasets, are not comparable to each other, evaluation of the methods is typically done with downstream tasks.
"},{"location":"getting-started/benchmarking/#benchmarking-valuation-methods","title":"Benchmarking valuation methods","text":"Data valuation is particularly useful for data selection, pruning and inspection in general. For this reason, the most common benchmarks are data removal and noisy label detection.
"},{"location":"getting-started/benchmarking/#high-value-point-removal","title":"High-value point removal","text":"After computing the values for all data in \\(T = \\{ \\mathbf{z}_i : i = 1, \\ldots, n \\}\\), the set is sorted by decreasing value. We denote by \\(T_{[i :]}\\) the sorted sequence of points \\((\\mathbf{z}_i, \\mathbf{z}_{i + 1}, \\ldots, \\mathbf{z}_n)\\) for \\(1 \\leqslant i \\leqslant n\\). Now train successively \\(f_{T [i :]}\\) and compute its accuracy \\(a_{T_{[i :]}} (D_{\\operatorname{test}})\\) on the held-out test set, then plot all numbers. By using \\(D_{\\operatorname{test}}\\) one approximates the expected accuracy drop on unseen data. Because the points removed have a high value, one expects performance to drop visibly wrt. a random baseline.
"},{"location":"getting-started/benchmarking/#low-value-point-removal","title":"Low-value point removal","text":"The complementary experiment removes data in increasing order, with the lowest valued points first. Here one expects performance to increase relatively to randomly removing points before training. Additionally, every real dataset will include slightly out-of-distribution points, so one should also expect an absolute increase in performance when some of the lowest valued points are removed.
"},{"location":"getting-started/benchmarking/#value-transfer","title":"Value transfer","text":"This experiment explores the extent to which data values computed with one (cheap) model can be transferred to another (potentially more complex) one. Different classifiers are used as a source to calculate data values. These values are then used in the point removal tasks described above, but using a different (target) model for evaluation of the accuracies \\(a_{T [i :]}\\). A multi-layer perceptron is added for evaluation as well.
"},{"location":"getting-started/benchmarking/#noisy-label-detection","title":"Noisy label detection","text":"This experiment tests the ability of a method to detect mislabeled instances in the data. A fixed fraction \\(\\alpha\\) of the training data are picked at random and their labels flipped. Data values are computed, then the \\(\\alpha\\)-fraction of lowest-valued points are selected, and the overlap with the subset of flipped points is computed. This synthetic experiment is however hard to put into practical use, since the fraction \\(\\alpha\\) is of course unknown in practice.
"},{"location":"getting-started/benchmarking/#rank-stability","title":"Rank stability","text":"Introduced in [@wang_data_2022], one can look at how stable the top \\(k\\)% of the values is across runs. Rank stability of a method is necessary but not sufficient for good results. Ideally one wants to identify high-value points reliably (good precision and recall) and consistently (good rank stability).
"},{"location":"getting-started/benchmarking/#benchmarking-influence-function-methods","title":"Benchmarking Influence function methods","text":"Todo
This section is basically a stub
Although in principle one can compute the average influence over the test set and run the same tasks as above, because influences are computed for each pair of training and test sample, they typically require different experiments to compare their efficacy.
"},{"location":"getting-started/benchmarking/#approximation-quality","title":"Approximation quality","text":"The biggest difficulty when computing influences is the approximation of the inverse Hessian-vector product. For this reason one often sees in the literature the quality of the approximation to LOO as an indicator of performance, the exact Influence Function being a first order approximation to it. However, as shown by (Bae et al., 2022)1, the different approximation errors ensuing for lack of convexity, approximate Hessian-vector products and so on, lead to this being a poor benchmark overall.
"},{"location":"getting-started/benchmarking/#data-re-labelling","title":"Data re-labelling","text":"(Kong et al., 2022)2 introduce a method using IFs to re-label harmful training samples in order to improve accuracy. One can then take the obtained improvement as a measure of the quality of the IF method.
"},{"location":"getting-started/benchmarking/#post-hoc-fairness-adjustment","title":"Post-hoc fairness adjustment","text":"Introduced in [@...], the idea is to compute influences over a carefully selected fair set, and using them to re-weight the training data.
-
Bae, J., Ng, N., Lo, A., Ghassemi, M., Grosse, R.B., 2022. If Influence Functions are the Answer, Then What is the Question?, in: Advances in Neural Information Processing Systems. Presented at the NeurIPS 2022, pp. 17953\u201317967.\u00a0\u21a9
-
Kong, S., Shen, Y., Huang, L., 2022. Resolving Training Biases via Influence-based Data Relabeling. Presented at the International Conference on Learning Representations (ICLR 2022).\u00a0\u21a9
"},{"location":"getting-started/first-steps/","title":"First steps","text":"Warning
Make sure you have read Getting started before using the library. In particular read about which extra dependencies you may need.
"},{"location":"getting-started/first-steps/#main-concepts","title":"Main concepts","text":"pyDVL aims to be a repository of production-ready, reference implementations of algorithms for data valuation and influence functions. Even though we only briefly introduce key concepts in the documentation, the following sections should be enough to get you started.
- Basics of data valuation for key objects and usage patterns for Shapley value computation and related methods.
- Computing Influence Values for instructions on how to compute influence functions.
"},{"location":"getting-started/first-steps/#running-the-examples","title":"Running the examples","text":"If you are somewhat familiar with the concepts of data valuation, you can start by browsing our worked-out examples illustrating pyDVL's capabilities either:
- In the examples under Basics of data valuation and Computing Influence Values.
- Using binder notebooks, deployed from each example's page.
- Locally, by starting a jupyter server at the root of the project. You will have to install jupyter first manually since it's not a dependency of the library.
"},{"location":"getting-started/first-steps/#advanced-usage","title":"Advanced usage","text":"Refer to the Advanced usage page for explanations on how to enable and use parallelization and caching.
"},{"location":"getting-started/glossary/","title":"Glossary","text":"This glossary is meant to provide only brief explanations of each term, helping to clarify the concepts and techniques used in the library. For more detailed information, please refer to the relevant literature or resources.
Warning
This glossary is still a work in progress. Pull requests are welcome!
Terms in data valuation and influence functions:
"},{"location":"getting-started/glossary/#arnoldi-method","title":"Arnoldi Method","text":"The Arnoldi method approximately computes eigenvalue, eigenvector pairs of a symmetric matrix. For influence functions, it is used to approximate the iHVP. Introduced by (Schioppa et al., 2022)1 in the context of influence functions.
- Implementation (torch)
- Documentation (torch)
"},{"location":"getting-started/glossary/#block-conjugate-gradient","title":"Block Conjugate Gradient","text":"A blocked version of CG, which solves several linear systems simultaneously. For Influence Functions, it is used to approximate the iHVP.
- Implementation (torch)
- Documentation (torch)
"},{"location":"getting-started/glossary/#class-wise-shapley","title":"Class-wise Shapley","text":"Class-wise Shapley is a Shapley valuation method which introduces a utility function that balances in-class, and out-of-class accuracy, with the goal of favoring points that improve the model's performance on the class they belong to. It is estimated to be particularly useful in imbalanced datasets, but more research is needed to confirm this. Introduced by (Schoch et al., 2022)2.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#conjugate-gradient","title":"Conjugate Gradient","text":"CG is an algorithm for solving linear systems with a symmetric and positive-definite coefficient matrix. For Influence Functions, it is used to approximate the iHVP.
- Implementation (torch)
- Documentation (torch)
"},{"location":"getting-started/glossary/#data-utility-learning","title":"Data Utility Learning","text":"Data Utility Learning is a method that uses an ML model to learn the utility function. Essentially, it learns to predict the performance of a model when trained on a given set of indices from the dataset. The cost of training this model is quickly amortized by avoiding costly re-evaluations of the original utility. Introduced by (Wang et al., 2022)3.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#eigenvalue-corrected-kronecker-factored-approximate-curvature","title":"Eigenvalue-corrected Kronecker-Factored Approximate Curvature","text":"EKFAC builds on K-FAC by correcting for the approximation errors in the eigenvalues of the blocks of the Kronecker-factored approximate curvature matrix. This correction aims to refine the accuracy of natural gradient approximations, thus potentially offering better training efficiency and stability in neural networks.
- Implementation (torch)
- Documentation (torch)
"},{"location":"getting-started/glossary/#group-testing","title":"Group Testing","text":"Group Testing is a strategy for identifying characteristics within groups of items efficiently, by testing groups rather than individuals to quickly narrow down the search for items with specific properties. Introduced into data valuation by (Jia et al., 2019)4.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#influence-function","title":"Influence Function","text":"The Influence Function measures the impact of a single data point on a statistical estimator. In machine learning, it's used to understand how much a particular data point affects the model's prediction. Introduced into data valuation by (Koh and Liang, 2017)5.
- Documentation
"},{"location":"getting-started/glossary/#inverse-hessian-vector-product","title":"Inverse Hessian-vector product","text":"iHVP is the operation of calculating the product of the inverse Hessian matrix of a function and a vector, without explicitly constructing nor inverting the full Hessian matrix first. This is essential for influence function computation.
"},{"location":"getting-started/glossary/#kronecker-factored-approximate-curvature","title":"Kronecker-Factored Approximate Curvature","text":"K-FAC is an optimization technique that approximates the Fisher Information matrix's inverse efficiently. It uses the Kronecker product to factor the matrix, significantly speeding up the computation of natural gradient updates and potentially improving training efficiency.
"},{"location":"getting-started/glossary/#least-core","title":"Least Core","text":"The Least Core is a solution concept in cooperative game theory, referring to the smallest set of payoffs to players that cannot be improved upon by any coalition, ensuring stability in the allocation of value. In data valuation, it implies solving a linear and a quadratic system whose constraints are determined by the evaluations of the utility function on every subset of the training data. Introduced as data valuation method by (Yan and Procaccia, 2021)6.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#linear-time-stochastic-second-order-algorithm","title":"Linear-time Stochastic Second-order Algorithm","text":"LiSSA is an efficient algorithm for approximating the inverse Hessian-vector product, enabling faster computations in large-scale machine learning problems, particularly for second-order optimization. For Influence Functions, it is used to approximate the iHVP. Introduced by (Agarwal et al., 2017)7.
- Implementation (torch)
- Documentation (torch)
"},{"location":"getting-started/glossary/#leave-one-out","title":"Leave-One-Out","text":"LOO in the context of data valuation refers to the process of evaluating the impact of removing individual data points on the model's performance. The value of a training point is defined as the marginal change in the model's performance when that point is removed from the training set.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#maximum-sample-reuse","title":"Maximum Sample Reuse","text":"MSR is a sampling method for data valuation that updates the value of every data point in one sample. This method can achieve much faster convergence. Introduced by (Wang and Jia, 2023)8
- Implementation
"},{"location":"getting-started/glossary/#monte-carlo-least-core","title":"Monte Carlo Least Core","text":"MCLC is a variation of the Least Core that uses a reduced amount of constraints, sampled randomly from the powerset of the training data. Introduced by (Yan and Procaccia, 2021)6.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#monte-carlo-shapley","title":"Monte Carlo Shapley","text":"MCS estimates the Shapley Value using a Monte Carlo approximation to the sum over subsets of the training set. This reduces computation to polynomial time at the cost of accuracy, but this loss is typically irrelevant for downstream applications in ML. Introduced into data valuation by (Ghorbani and Zou, 2019)9.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#nystrom-low-rank-approximation","title":"Nystr\u00f6m Low-Rank Approximation","text":"The Nystr\u00f6m approximation computes a low-rank approximation to a symmetric positive-definite matrix via random projections. For influence functions, it is used to approximate the iHVP. Introduced as sketch and solve algorithm in (Hataya and Yamada, 2023)10, and as preconditioner for PCG in (Frangella et al., 2023)11.
- Implementation Sketch-and-Solve (torch)
- Documentation Sketch-and-Solve (torch)
- Implementation Preconditioner (torch)
"},{"location":"getting-started/glossary/#point-removal-task","title":"Point removal task","text":"A task in data valuation where the quality of a valuation method is measured through the impact of incrementally removing data points on the model's performance, where the points are removed in order of their value. See
- Benchmarking tasks
"},{"location":"getting-started/glossary/#preconditioned-block-conjugate-gradient","title":"Preconditioned Block Conjugate Gradient","text":"A blocked version of PCG, which solves several linear systems simultaneously. For Influence Functions, it is used to approximate the iHVP.
- Implementation CG (torch)
- Implementation Preconditioner (torch)
- Documentation (torch)
"},{"location":"getting-started/glossary/#preconditioned-conjugate-gradient","title":"Preconditioned Conjugate Gradient","text":"A preconditioned version of CG for improved convergence, depending on the characteristics of the matrix and the preconditioner. For Influence Functions, it is used to approximate the iHVP.
- Implementation CG (torch)
- Implementation Preconditioner (torch)
- Documentation (torch)
"},{"location":"getting-started/glossary/#shapley-value","title":"Shapley Value","text":"Shapley Value is a concept from cooperative game theory that allocates payouts to players based on their contribution to the total payoff. In data valuation, players are data points. The method assigns a value to each data point based on a weighted average of its marginal contributions to the model's performance when trained on each subset of the training set. This requires \\(\\mathcal{O}(2^{n-1})\\) re-trainings of the model, which is infeasible for even trivial data set sizes, so one resorts to approximations like TMCS. Introduced into data valuation by (Ghorbani and Zou, 2019)9.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#truncated-monte-carlo-shapley","title":"Truncated Monte Carlo Shapley","text":"TMCS is an efficient approach to estimating the Shapley Value using a truncated version of the Monte Carlo method, reducing computation time while maintaining accuracy in large datasets. Introduced by (Ghorbani and Zou, 2019)9.
- Implementation
- Documentation
"},{"location":"getting-started/glossary/#weighted-accuracy-drop","title":"Weighted Accuracy Drop","text":"WAD is a metric to evaluate the impact of sequentially removing data points on the performance of a machine learning model, weighted by their rank, i.e. by the time at which they were removed. Introduced by (Schoch et al., 2022)2.
"},{"location":"getting-started/glossary/#other-terms","title":"Other terms","text":""},{"location":"getting-started/glossary/#coefficient-of-variation","title":"Coefficient of Variation","text":"CV is a statistical measure of the dispersion of data points in a data series around the mean, expressed as a percentage. It's used to compare the degree of variation from one data series to another, even if the means are drastically different.
"},{"location":"getting-started/glossary/#constraint-satisfaction-problem","title":"Constraint Satisfaction Problem","text":"A CSP involves finding values for variables within specified constraints or conditions, commonly used in scheduling, planning, and design problems where solutions must satisfy a set of restrictions.
"},{"location":"getting-started/glossary/#out-of-bag","title":"Out-of-Bag","text":"OOB refers to data samples in an ensemble learning context (like random forests) that are not selected for training a specific model within the ensemble. These OOB samples are used as a validation set to estimate the model's accuracy, providing a convenient internal cross-validation mechanism.
"},{"location":"getting-started/glossary/#machine-learning-reproducibility-challenge","title":"Machine Learning Reproducibility Challenge","text":"The MLRC is an initiative that encourages the verification and replication of machine learning research findings, promoting transparency and reliability in the field. Papers are published in Transactions on Machine Learning Research (TMLR).
-
Schioppa, A., Zablotskaia, P., Vilar, D., Sokolov, A., 2022. Scaling Up Influence Functions. Proc. AAAI Conf. Artif. Intell. 36, 8179\u20138186. https://doi.org/10.1609/aaai.v36i8.20791 \u21a9
-
Schoch, S., Xu, H., Ji, Y., 2022. CS-Shapley: Class-wise Shapley Values for Data Valuation in Classification, in: Proc. Of the Thirty-Sixth Conference on Neural Information Processing Systems (NeurIPS). Presented at the Advances in Neural Information Processing Systems (NeurIPS 2022).\u00a0\u21a9\u21a9
-
Wang, T., Yang, Y., Jia, R., 2022. Improving Cooperative Game Theory-based Data Valuation via Data Utility Learning. Presented at the International Conference on Learning Representations (ICLR 2022). Workshop on Socially Responsible Machine Learning, arXiv. https://doi.org/10.48550/arXiv.2107.06336 \u21a9
-
Jia, R., Dao, D., Wang, B., Hubis, F.A., Gurel, N.M., Li, B., Zhang, C., Spanos, C., Song, D., 2019. Efficient task-specific data valuation for nearest neighbor algorithms. Proc. VLDB Endow. 12, 1610\u20131623. https://doi.org/10.14778/3342263.3342637 \u21a9
-
Koh, P.W., Liang, P., 2017. Understanding Black-box Predictions via Influence Functions, in: Proceedings of the 34th International Conference on Machine Learning. Presented at the International Conference on Machine Learning, PMLR, pp. 1885\u20131894.\u00a0\u21a9
-
Yan, T., Procaccia, A.D., 2021. If You Like Shapley Then You\u2019ll Love the Core, in: Proceedings of the 35th AAAI Conference on Artificial Intelligence, 2021. Presented at the AAAI Conference on Artificial Intelligence, Association for the Advancement of Artificial Intelligence, pp. 5751\u20135759. https://doi.org/10.1609/aaai.v35i6.16721 \u21a9\u21a9
-
Agarwal, N., Bullins, B., Hazan, E., 2017. Second-Order Stochastic Optimization for Machine Learning in Linear Time. JMLR 18, 1\u201340.\u00a0\u21a9
-
Wang, J.T., Jia, R., 2023. Data Banzhaf: A Robust Data Valuation Framework for Machine Learning, in: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics. Presented at the International Conference on Artificial Intelligence and Statistics, PMLR, pp. 6388\u20136421.\u00a0\u21a9
-
Ghorbani, A., Zou, J., 2019. Data Shapley: Equitable Valuation of Data for Machine Learning, in: Proceedings of the 36th International Conference on Machine Learning, PMLR. Presented at the International Conference on Machine Learning (ICML 2019), PMLR, pp. 2242\u20132251.\u00a0\u21a9\u21a9\u21a9
-
Hataya, R., Yamada, M., 2023. Nystr\u00f6m Method for Accurate and Scalable Implicit Differentiation, in: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics. Presented at the International Conference on Artificial Intelligence and Statistics, PMLR, pp. 4643\u20134654.\u00a0\u21a9
-
Frangella, Z., Tropp, J.A., Udell, M., 2023. Randomized Nystr\u00f6m Preconditioning. SIAM J. Matrix Anal. Appl. 44, 718\u2013752. https://doi.org/10.1137/21M1466244 \u21a9
"},{"location":"getting-started/methods/","title":"Methods","text":"We currently implement the following methods:
"},{"location":"getting-started/methods/#data-valuation","title":"Data valuation","text":" -
LOO.
-
Permutation Shapley (also called ApproxShapley) (Castro et al., 2009)1.
-
TMCS (Ghorbani and Zou, 2019)2.
-
Data Banzhaf [@wang_data_2022].
-
Beta Shapley (Kwon and Zou, 2022)3.
-
CS-Shapley (Schoch et al., 2022)4.
-
Least Core (Yan and Procaccia, 2021)5.
-
Owen Sampling (Okhrati and Lipani, 2021)6.
-
Data Utility Learning (Wang et al., 2022)7.
-
kNN-Shapley (Jia et al., 2019)8.
-
Group Testing (Jia et al., 2019)9
-
Data-OOB (Kwon and Zou, 2023)10.
"},{"location":"getting-started/methods/#influence-functions","title":"Influence functions","text":" -
CG Influence. (Koh and Liang, 2017)11.
-
Direct Influence (Koh and Liang, 2017)11.
-
LiSSA (Agarwal et al., 2017)12.
-
Arnoldi Influence (Schioppa et al., 2022)13.
-
EKFAC Influence (George et al., 2018; Martens and Grosse, 2015)1415.
-
Nystr\u00f6m Influence, based on the ideas in (Hataya and Yamada, 2023)16 for bi-level optimization.
-
Inverse-harmonic-mean Influence (Kwon et al., 2023)17.
-
Castro, J., G\u00f3mez, D., Tejada, J., 2009. Polynomial calculation of the Shapley value based on sampling. Computers & Operations Research, Selected papers presented at the Tenth International Symposium on Locational Decisions (ISOLDE X) 36, 1726\u20131730. https://doi.org/10.1016/j.cor.2008.04.004 \u21a9
-
Ghorbani, A., Zou, J., 2019. Data Shapley: Equitable Valuation of Data for Machine Learning, in: Proceedings of the 36th International Conference on Machine Learning, PMLR. Presented at the International Conference on Machine Learning (ICML 2019), PMLR, pp. 2242\u20132251.\u00a0\u21a9
-
Kwon, Y., Zou, J., 2022. Beta Shapley: A Unified and Noise-reduced Data Valuation Framework for Machine Learning, in: Proceedings of the 25th International Conference on Artificial Intelligence and Statistics (AISTATS) 2022,. Presented at the AISTATS 2022, PMLR.\u00a0\u21a9
-
Schoch, S., Xu, H., Ji, Y., 2022. CS-Shapley: Class-wise Shapley Values for Data Valuation in Classification, in: Proc. Of the Thirty-Sixth Conference on Neural Information Processing Systems (NeurIPS). Presented at the Advances in Neural Information Processing Systems (NeurIPS 2022).\u00a0\u21a9
-
Yan, T., Procaccia, A.D., 2021. If You Like Shapley Then You\u2019ll Love the Core, in: Proceedings of the 35th AAAI Conference on Artificial Intelligence, 2021. Presented at the AAAI Conference on Artificial Intelligence, Association for the Advancement of Artificial Intelligence, pp. 5751\u20135759. https://doi.org/10.1609/aaai.v35i6.16721 \u21a9
-
Okhrati, R., Lipani, A., 2021. A Multilinear Sampling Algorithm to Estimate Shapley Values, in: 2020 25th International Conference on Pattern Recognition (ICPR). Presented at the 2020 25th International Conference on Pattern Recognition (ICPR), IEEE, pp. 7992\u20137999. https://doi.org/10.1109/ICPR48806.2021.9412511 \u21a9
-
Wang, T., Yang, Y., Jia, R., 2022. Improving Cooperative Game Theory-based Data Valuation via Data Utility Learning. Presented at the International Conference on Learning Representations (ICLR 2022). Workshop on Socially Responsible Machine Learning, arXiv. https://doi.org/10.48550/arXiv.2107.06336 \u21a9
-
Jia, R., Dao, D., Wang, B., Hubis, F.A., Gurel, N.M., Li, B., Zhang, C., Spanos, C., Song, D., 2019. Efficient task-specific data valuation for nearest neighbor algorithms. Proc. VLDB Endow. 12, 1610\u20131623. https://doi.org/10.14778/3342263.3342637 \u21a9
-
Jia, R., Dao, D., Wang, B., Hubis, F.A., Hynes, N., G\u00fcrel, N.M., Li, B., Zhang, C., Song, D., Spanos, C.J., 2019. Towards Efficient Data Valuation Based on the Shapley Value, in: Proceedings of the 22nd International Conference on Artificial Intelligence and Statistics. Presented at the International Conference on Artificial Intelligence and Statistics (AISTATS), PMLR, pp. 1167\u20131176.\u00a0\u21a9
-
Kwon, Y., Zou, J., 2023. Data-OOB: Out-of-bag Estimate as a Simple and Efficient Data Value, in: Proceedings of the 40th International Conference on Machine Learning. Presented at the International Conference on Machine Learning, PMLR, pp. 18135\u201318152.\u00a0\u21a9
-
Koh, P.W., Liang, P., 2017. Understanding Black-box Predictions via Influence Functions, in: Proceedings of the 34th International Conference on Machine Learning. Presented at the International Conference on Machine Learning, PMLR, pp. 1885\u20131894.\u00a0\u21a9\u21a9
-
Agarwal, N., Bullins, B., Hazan, E., 2017. Second-Order Stochastic Optimization for Machine Learning in Linear Time. JMLR 18, 1\u201340.\u00a0\u21a9
-
Schioppa, A., Zablotskaia, P., Vilar, D., Sokolov, A., 2022. Scaling Up Influence Functions. Proc. AAAI Conf. Artif. Intell. 36, 8179\u20138186. https://doi.org/10.1609/aaai.v36i8.20791 \u21a9
-
George, T., Laurent, C., Bouthillier, X., Ballas, N., Vincent, P., 2018. Fast Approximate Natural Gradient Descent in a Kronecker Factored Eigenbasis, in: Advances in Neural Information Processing Systems. Curran Associates, Inc.\u00a0\u21a9
-
Martens, J., Grosse, R., 2015. Optimizing Neural Networks with Kronecker-factored Approximate Curvature, in: Proceedings of the 32nd International Conference on Machine Learning. Presented at the International Conference on Machine Learning, PMLR, pp. 2408\u20132417.\u00a0\u21a9
-
Hataya, R., Yamada, M., 2023. Nystr\u00f6m Method for Accurate and Scalable Implicit Differentiation, in: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics. Presented at the International Conference on Artificial Intelligence and Statistics, PMLR, pp. 4643\u20134654.\u00a0\u21a9
-
Kwon, Y., Wu, E., Wu, K., Zou, J., 2023. DataInf: Efficiently Estimating Data Influence in LoRA-tuned LLMs and Diffusion Models. Presented at the The Twelfth International Conference on Learning Representations. https://doi.org/10.48550/arXiv.2310.00902 \u21a9
"},{"location":"influence/","title":"The influence function","text":""},{"location":"influence/#the-influence-function","title":"The influence function","text":"Warning
The code in the package pydvl.influence is experimental. Package structure and basic API are bound to change before v1.0.0
The influence function (IF) is a method to quantify the effect (influence) that each training point has on the parameters of a model, and by extension on any function thereof. In particular, it allows one to estimate how much each training sample affects the error on a test point, making the IF useful for understanding and debugging models.
Alas, the influence function relies on some assumptions that can make its application difficult. Another drawback is that it requires the computation of the inverse of the Hessian of the model wrt. its parameters, which is intractable for large models like deep neural networks. Much of the recent research tackles this issue using approximations, like a Neumann series (Agarwal et al., 2017)1, with the most successful solution using a low-rank approximation that iteratively finds increasing eigenspaces of the Hessian (Schioppa et al., 2022)2.
pyDVL implements several methods for the efficient computation of the IF for machine learning. In the examples we document some of the difficulties that can arise when using the IF.
"},{"location":"influence/#construction","title":"Construction","text":"First introduced in the context of robust statistics in (Hampel, 1974)3, the IF was popularized in the context of machine learning in (Koh and Liang, 2017)4.
Following their formulation, consider an input space \\(\\mathcal{X}\\) (e.g. images) and an output space \\(\\mathcal{Y}\\) (e.g. labels). Let's take \\(z_i = (x_i, y_i)\\), for \\(i \\in \\{1,...,n\\}\\) to be the \\(i\\)-th training point, and \\(\\theta\\) to be the (potentially highly) multi-dimensional parameters of a model (e.g. \\(\\theta\\) is a big array with all of a neural network's parameters, including biases and/or dropout rates). We will denote with \\(L(z, \\theta)\\) the loss of the model for point \\(z\\) when the parameters are \\(\\theta.\\)
To train a model, we typically minimize the loss over all \\(z_i\\), i.e. the optimal parameters are
\\[\\hat{\\theta} = \\arg \\min_\\theta \\sum_{i=1}^n L(z_i, \\theta).\\] In practice, lack of convexity means that one doesn't really obtain the minimizer of the loss, and the training is stopped when the validation loss stops decreasing.
For notational convenience, let's define
\\[\\hat{\\theta}_{-z} = \\arg \\min_\\theta \\sum_{z_i \\ne z} L(z_i, \\theta), \\] i.e. \\(\\hat{\\theta}_{-z}\\) are the model parameters that minimize the total loss when \\(z\\) is not in the training dataset.
In order to compute the impact of each training point on the model, we would need to calculate \\(\\hat{\\theta}_{-z}\\) for each \\(z\\) in the training dataset, thus re-training the model at least ~\\(n\\) times (more if model training is stochastic). This is computationally very expensive, especially for big neural networks. To circumvent this problem, we can instead calculate a first-order approximation of \\(\\hat{\\theta}\\). This can be done with a single backpropagation pass and without re-training the full model.
pyDVL supports two ways of computing the empirical influence function, namely up-weighting of samples and perturbation influences.
"},{"location":"influence/#approximating-the-influence-of-a-point","title":"Approximating the influence of a point","text":"Let's define
\\[\\hat{\\theta}_{\\epsilon, z} = \\arg \\min_\\theta \\frac{1}{n}\\sum_{i=1}^n L(z_i, \\theta) + \\epsilon L(z, \\theta), \\] which is the optimal \\(\\hat{\\theta}\\) when we up-weight \\(z\\) by an amount \\(\\epsilon \\gt 0\\).
From a classical result (a simple derivation is available in Appendix A of (Koh and Liang, 2017)4), we know that:
\\[\\frac{d \\ \\hat{\\theta}_{\\epsilon, z}}{d \\epsilon} \\Big|_{\\epsilon=0} = -H_{\\hat{\\theta}}^{-1} \\nabla_\\theta L(z, \\hat{\\theta}), \\] where \\(H_{\\hat{\\theta}} = \\frac{1}{n} \\sum_{i=1}^n \\nabla_\\theta^2 L(z_i, \\hat{\\theta})\\) is the Hessian of \\(L\\). These quantities are also known as influence factors.
Importantly, notice that this expression is only valid when \\(\\hat{\\theta}\\) is a minimum of \\(L\\), or otherwise \\(H_{\\hat{\\theta}}\\) cannot be inverted! At the same time, in machine learning full convergence is rarely achieved, so direct Hessian inversion is not possible. Approximations need to be developed that circumvent the problem of inverting the Hessian of the model in all those (frequent) cases where it is not positive definite.
The influence of training point \\(z\\) on test point \\(z_{\\text{test}}\\) is defined as:
\\[\\mathcal{I}(z, z_{\\text{test}}) = L(z_{\\text{test}}, \\hat{\\theta}_{-z}) - L(z_{\\text{test}}, \\hat{\\theta}). \\] Notice that \\(\\mathcal{I}\\) is higher for points \\(z\\) which positively impact the model score, since the loss is higher when they are excluded from training. In practice, one needs to rely on the following infinitesimal approximation:
\\[\\mathcal{I}_{up}(z, z_{\\text{test}}) = - \\frac{d L(z_{\\text{test}}, \\hat{\\theta}_{\\epsilon, z})}{d \\epsilon} \\Big|_{\\epsilon=0} \\] Using the chain rule and the results calculated above, we get:
\\[\\mathcal{I}_{up}(z, z_{\\text{test}}) = - \\nabla_\\theta L(z_{\\text{test}}, \\hat{\\theta})^\\top \\ \\frac{d \\hat{\\theta}_{\\epsilon, z}}{d \\epsilon} \\Big|_{\\epsilon=0} = \\nabla_\\theta L(z_{\\text{test}}, \\hat{\\theta})^\\top \\ H_{\\hat{\\theta}}^{-1} \\ \\nabla_\\theta L(z, \\hat{\\theta}) \\] All the resulting factors are gradients of the loss wrt. the model parameters \\(\\hat{\\theta}\\). This can be easily computed through one or more backpropagation passes.
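As an illustration of how these expressions reduce to gradient and Hessian computations, the following is a minimal sketch in plain torch (not pyDVL, and assuming a hypothetical tiny model whose full Hessian fits in memory) computing \\(\\mathcal{I}_{up}\\) for a single pair of points:
import torch\n\n# Hypothetical tiny model: a linear regressor with squared loss\ntorch.manual_seed(0)\nn, d = 20, 3\nX, y = torch.randn(n, d), torch.randn(n)\ntheta = torch.zeros(d)\n\ndef loss(theta, x, y):\n    return 0.5 * (x @ theta - y) ** 2\n\n# Hessian of the mean training loss at theta\nmean_loss = lambda t: torch.stack([loss(t, X[i], y[i]) for i in range(n)]).mean()\nH = torch.autograd.functional.hessian(mean_loss, theta)\n\n# Gradients of the loss at a training point z and a test point z_test\ng_train = torch.autograd.functional.jacobian(lambda t: loss(t, X[0], y[0]), theta)\ng_test = torch.autograd.functional.jacobian(lambda t: loss(t, X[1], y[1]), theta)\n\n# I_up(z, z_test) = grad_test^T H^{-1} grad_train, using a solve instead of an explicit inverse\ninfluence = g_test @ torch.linalg.solve(H, g_train)\nprint(influence.item())\n
DirectInfluence and the approximate methods described below automate this kind of computation for arbitrary torch models and whole batches of points.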
"},{"location":"influence/#perturbation-definition-of-the-influence-score","title":"Perturbation definition of the influence score","text":"How would the loss of the model change if, instead of up-weighting an individual point \\(z\\), we were to up-weight only a single feature of that point? Given \\(z = (x, y)\\), we can define \\(z_{\\delta} = (x+\\delta, y)\\), where \\(\\delta\\) is a vector of zeros except for a 1 in the position of the feature we want to up-weight. In order to approximate the effect of modifying a single feature of a single point on the model score we can define
\\[\\hat{\\theta}_{\\epsilon, z_{\\delta} ,-z} = \\arg \\min_\\theta \\frac{1}{n}\\sum_{i=1}^n L(z_{i}, \\theta) + \\epsilon L(z_{\\delta}, \\theta) - \\epsilon L(z, \\theta), \\] Similarly to what was done above, we up-weight point \\(z_{\\delta}\\), but then we also remove the up-weighting for all the features that are not modified by \\(\\delta\\). From the calculations in the previous section, it is then easy to see that
\\[\\frac{d \\ \\hat{\\theta}_{\\epsilon, z_{\\delta} ,-z}}{d \\epsilon} \\Big|_{\\epsilon=0} = -H_{\\hat{\\theta}}^{-1} \\nabla_\\theta \\Big( L(z_{\\delta}, \\hat{\\theta}) - L(z, \\hat{\\theta}) \\Big) \\] and if the feature space is continuous and as \\(\\delta \\to 0\\) we can write
\\[\\frac{d \\ \\hat{\\theta}_{\\epsilon, z_{\\delta} ,-z}}{d \\epsilon} \\Big|_{\\epsilon=0} = -H_{\\hat{\\theta}}^{-1} \\ \\nabla_x \\nabla_\\theta L(z, \\hat{\\theta}) \\delta + \\mathcal{o}(\\delta) \\] The influence of each feature of \\(z\\) on the loss of the model can therefore be estimated through the following quantity:
\\[\\mathcal{I}_{pert}(z, z_{\\text{test}}) = - \\lim_{\\delta \\to 0} \\ \\frac{1}{\\delta} \\frac{d L(z_{\\text{test}}, \\hat{\\theta}_{\\epsilon, \\ z_{\\delta}, \\ -z})}{d \\epsilon} \\Big|_{\\epsilon=0} \\] which, using the chain rule and the results calculated above, is equal to
\\[\\mathcal{I}_{pert}(z, z_{\\text{test}}) = - \\nabla_\\theta L(z_{\\text{test}}, \\hat{\\theta})^\\top \\ \\frac{d \\hat{\\theta}_{\\epsilon, z_{\\delta} ,-z}}{d \\epsilon} \\Big|_{\\epsilon=0} = \\nabla_\\theta L(z_{\\text{test}}, \\hat{\\theta})^\\top \\ H_{\\hat{\\theta}}^{-1} \\ \\nabla_x \\nabla_\\theta L(z, \\hat{\\theta}) \\] The perturbation definition of the influence score is not straightforward to understand, but it has a simple interpretation: it tells how much the loss of the model changes when a certain feature of point z is up-weighted. A positive perturbation influence score indicates that the feature might have a positive effect on the accuracy of the model.
It is worth noting that the perturbation influence score is a very rough estimate of the impact of a point on the model's loss and is subject to large approximation errors. It can nonetheless be used to build training-set attacks, as done in (Koh and Liang, 2017)4.
"},{"location":"influence/#computation","title":"Computation","text":"The main abstraction of the library for influence calculation is InfluenceFunctionModel. On implementations of this abstraction, you can call the method influences
to compute influences.
pyDVL provides implementations for use with pytorch models in pydvl.influence.torch. For detailed information on available implementations see the documentation in InfluenceFunctionModel.
Given a pre-trained pytorch model and a loss, a basic example would look like
from torch.utils.data import DataLoader\nfrom pydvl.influence.torch import DirectInfluence\n\ntraining_data_loader = DataLoader(...)\ninfl_model = DirectInfluence(model, loss)\ninfl_model = infl_model.fit(training_data_loader)\n\ninfluences = infl_model.influences(x_test, y_test, x, y)\n
for batches \\(z_{\\text{test}} = (x_{\\text{test}}, y_{\\text{test}})\\) and \\(z = (x, y)\\) of data. The result is a tensor with one row per test point in \\(z_{\\text{test}}\\) and one column per point in \\(z\\). Thus, each entry \\((i, j)\\) represents the influence of training point \\(z[j]\\) on test point \\(z_{\\text{test}}[i]\\). Warning
Compared to the mathematical definitions above, we switch the ordering of \\(z\\) and \\(z_{\\text{test}}\\), in order to make the input ordering consistent with the dimensions of the resulting tensor. More concretely, if the first dimension of \\(z_{\\text{test}}\\) is \\(N\\) and that of \\(z\\) is \\(M\\), the resulting tensor is of shape \\(N \\times M\\).
A large positive influence indicates that training point \\(j\\) tends to improve the performance of the model on test point \\(i\\), and vice versa, a large negative influence indicates that training point \\(j\\) tends to worsen the performance of the model on test point \\(i\\).
"},{"location":"influence/#hessian-regularization","title":"Hessian regularization","text":"Additionally, and as discussed in the introduction, in machine learning training rarely converges to a global minimum of the loss. Despite good apparent convergence, \\(\\hat{\\theta}\\) might be located in a region with flat curvature or close to a saddle point. In particular, the Hessian might have vanishing eigenvalues making its direct inversion impossible. Certain methods, such as the Arnoldi method are robust against these problems, but most are not.
To circumvent this problem, many approximate methods can be used. The simplest adds a small Hessian perturbation term, i.e. \\(H_{\\hat{\\theta}} + \\lambda \\mathbb{I}\\), with \\(\\mathbb{I}\\) being the identity matrix.
from torch.utils.data import DataLoader\nfrom pydvl.influence.torch import DirectInfluence\n\ntraining_data_loader = DataLoader(...)\ninfl_model = DirectInfluence(model, loss, regularization=0.01)\ninfl_model = infl_model.fit(training_data_loader)\n
This standard trick ensures that the eigenvalues of \\(H_{\\hat{\\theta}}\\) are bounded away from zero and therefore the matrix is invertible. In order for this regularization not to corrupt the outcome too much, the parameter \\(\\lambda\\) should be as small as possible while still allowing a reliable inversion of \\(H_{\\hat{\\theta}} + \\lambda \\mathbb{I}\\).
"},{"location":"influence/#block-diagonal-approximation","title":"Block-diagonal approximation","text":"This implementation is capable of using a block-diagonal approximation. The full matrix is approximated by a block-diagonal version, which reduces both the time and memory consumption. The blocking structure can be specified via the block_structure
parameter. The block_structure
parameter can either be a BlockMode enum (which provides layer-wise or parameter-wise blocking) or a custom block structure defined by an ordered dictionary with the keys being the block identifiers (arbitrary strings) and the values being lists of parameter names contained in the block.
from collections import OrderedDict\n\nfrom torch.utils.data import DataLoader\nfrom pydvl.influence.torch import DirectInfluence, BlockMode, SecondOrderMode\n\ntraining_data_loader = DataLoader(...)\n# layer-wise block-diagonal approximation\ninfl_model = DirectInfluence(model, loss,\n regularization=0.1,\n block_structure=BlockMode.LAYER_WISE)\n\nblock_structure = OrderedDict((\n (\"custom_block1\", [\"0.weight\", \"1.bias\"]), \n (\"custom_block2\", [\"1.weight\", \"0.bias\"]),\n))\n# custom block-diagonal structure\ninfl_model = DirectInfluence(model, loss,\n regularization=0.1,\n block_structure=block_structure)\ninfl_model = infl_model.fit(training_data_loader)\n
If you would like to apply a block-specific regularization, you can provide a dictionary with the block names as keys and the regularization values as values. If no value is provided for a specific key, no regularization is applied for the corresponding block. regularization = {\n\"custom_block1\": 0.1,\n\"custom_block2\": 0.2,\n}\ninfl_model = DirectInfluence(model, loss,\n regularization=regularization,\n block_structure=block_structure)\ninfl_model = infl_model.fit(training_data_loader)\n
Accordingly, if you choose a layer-wise or parameter-wise structure (by providing BlockMode.LAYER_WISE
or BlockMode.PARAMETER_WISE
for block_structure
) the keys must be the layer names or parameter names, respectively. You can retrieve the block-wise influence information from the methods with suffix _by_block
. By default, block_structure
is set to BlockMode.FULL
and in this case these methods will return a dictionary with the empty string being the only key."},{"location":"influence/#gauss-newton-approximation","title":"Gauss-Newton approximation","text":"In the computation of the influence values, the inversion of the Hessian can be replaced by the inversion of the Gauss-Newton matrix
\\[ G_{\\hat{\\theta}}=n^{-1} \\sum_{i=1}^n \\nabla_{\\theta}L(z_i, \\hat{\\theta}) \\nabla_{\\theta}L(z_i, \\hat{\\theta})^T \\] so the computed values are of the form
\\[\\nabla_\\theta L(z_{\\text{test}}, \\hat{\\theta})^\\top \\ G_{\\hat{\\theta}}^{-1} \\ \\nabla_\\theta L(z, \\hat{\\theta}). \\] The parameter second_order_mode
is used to configure this approximation.
from torch.utils.data import DataLoader\nfrom pydvl.influence.torch import DirectInfluence, BlockMode, SecondOrderMode\n\ntraining_data_loader = DataLoader(...)\ninfl_model = DirectInfluence(model, loss,\n regularization={\"layer_1\": 0.1, \"layer_2\": 0.2},\n block_structure=BlockMode.LAYER_WISE,\n second_order_mode=SecondOrderMode.GAUSS_NEWTON)\ninfl_model = infl_model.fit(training_data_loader)\n
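For intuition, the following minimal sketch (plain torch, hypothetical tiny model, not pyDVL's implementation) assembles the Gauss-Newton matrix from per-sample gradients and uses it in place of the Hessian:
import torch\n\ntorch.manual_seed(0)\nn, d = 20, 3\nX, y = torch.randn(n, d), torch.randn(n)\ntheta = torch.zeros(d)\n\ndef loss(theta, x, y):\n    return 0.5 * (x @ theta - y) ** 2\n\n# Per-sample gradients of the loss wrt. the parameters\ngrads = [torch.autograd.functional.jacobian(lambda t: loss(t, X[i], y[i]), theta)\n         for i in range(n)]\n# Gauss-Newton matrix G = (1/n) sum_i g_i g_i^T\nG = sum(torch.outer(g, g) for g in grads) / n\n\n# Influence-like quantity with G (plus a small regularization) in place of the Hessian\ng_test, g_train = grads[1], grads[0]\nvalue = g_test @ torch.linalg.solve(G + 1e-3 * torch.eye(d), g_train)\nprint(value.item())\n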
"},{"location":"influence/#perturbation-influences","title":"Perturbation influences","text":"The method of empirical influence computation can be selected with the parameter mode
:
from pydvl.influence import InfluenceMode\n\ninfluences = infl_model.influences(x_test, y_test, x, y,\n mode=InfluenceMode.Perturbation)\n
The result is a tensor with at least three dimensions. The first two dimensions are the same as in the mode=InfluenceMode.Up
case, i.e. one row per test point and one column per training point. The remaining dimensions correspond to the input features of the data. Therefore, each entry in the tensor represents the influence of each feature of each training point on each test point."},{"location":"influence/#influence-factors","title":"Influence factors","text":"The influence factors (refer to the previous section for a definition) are typically the most computationally demanding part of influence calculation. They can be obtained by calling the influence_factors
method, saved, and later used for influence calculation on different subsets of the training dataset.
influence_factors = infl_model.influence_factors(x_test, y_test)\ninfluences = infl_model.influences_from_factors(influence_factors, x, y)\n
-
Agarwal, N., Bullins, B., Hazan, E., 2017. Second-Order Stochastic Optimization for Machine Learning in Linear Time. JMLR 18, 1\u201340.\u00a0\u21a9
-
Schioppa, A., Zablotskaia, P., Vilar, D., Sokolov, A., 2022. Scaling Up Influence Functions. Proc. AAAI Conf. Artif. Intell. 36, 8179\u20138186. https://doi.org/10.1609/aaai.v36i8.20791 \u21a9
-
Hampel, F.R., 1974. The Influence Curve and Its Role in Robust Estimation. J. Am. Stat. Assoc. 69, 383\u2013393. https://doi.org/10.2307/2285666 \u21a9
-
Koh, P.W., Liang, P., 2017. Understanding Black-box Predictions via Influence Functions, in: Proceedings of the 34th International Conference on Machine Learning. Presented at the International Conference on Machine Learning, PMLR, pp. 1885\u20131894.\u00a0\u21a9\u21a9\u21a9
"},{"location":"influence/influence_function_model/","title":"Influence Function Model","text":"In almost every practical application it is not possible to construct, even less invert the complete Hessian in memory. pyDVL offers several implementations of the interface InfluenceFunctionModel , which do not compute the full Hessian (in contrast to DirectInfluence ).
"},{"location":"influence/influence_function_model/#conjugate-gradient","title":"Conjugate Gradient","text":"This classical procedure for solving linear systems of equations is an iterative method that does not require the explicit inversion of the Hessian. Instead, it only requires the calculation of Hessian-vector products, making it a good choice for large datasets or models with many parameters. It is nevertheless much slower to converge than the direct inversion method and not as accurate.
More info on the theory of conjugate gradient can be found on Wikipedia, or in text books such as (Trefethen and Bau, 1997, Lecture 38)1.
pyDVL also implements a stable block variant of the conjugate gradient method, defined in (Ji and Li, 2017)2, which solves several right hand sides simultaneously.
Optionally, the user can provide a pre-conditioner to improve convergence, such as a Jacobi pre-conditioner, which is a simple diagonal pre-conditioner based on Hutchinson's diagonal estimator (Bekas et al., 2007)3, or a Nystr\u00f6m approximation based pre-conditioner, described in (Frangella et al., 2023)4.
from pydvl.influence.torch import CgInfluence\nfrom pydvl.influence.torch.pre_conditioner import NystroemPreConditioner\n\nif_model = CgInfluence(\n model,\n loss,\n hessian_regularization=0.0,\n rtol=1e-7,\n atol=1e-7,\n maxiter=None,\n use_block_cg=True,\n pre_conditioner=NystroemPreConditioner(rank=10)\n)\nif_model.fit(train_loader)\n
The additional optional parameters rtol
, atol
, maxiter
, use_block_cg
and pre_conditioner
are, respectively, the relative tolerance, the absolute tolerance, the maximum number of iterations, a flag indicating whether to use the block variant of CG, and an optional pre-conditioner.
"},{"location":"influence/influence_function_model/#linear-time-stochastic-second-order-approximation-lissa","title":"Linear time Stochastic Second-Order Approximation (LiSSA)","text":"The LiSSA method is a stochastic approximation of the inverse Hessian vector product. Compared to conjugate gradient it is faster but less accurate and typically suffers from instability.
In order to find the solution of the HVP, LiSSA iteratively approximates the inverse of the Hessian matrix with the following update:
\\[H^{-1}_{j+1} b = b + (I - d) \\ H^{-1}_j b - \\frac{H \\, H^{-1}_j b}{s},\\] where \\(d\\) and \\(s\\) are a dampening and a scaling factor, which are essential for the convergence of the method and need to be chosen carefully, and \\(I\\) is the identity matrix. More info on the theory of LiSSA can be found in the original paper (Agarwal et al., 2017)5.
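To make the role of the dampening and scaling factors concrete, here is a minimal sketch of the update rule on a small explicit matrix (for illustration only; pyDVL's implementation never materializes \\(H\\) and only uses Hessian-vector products):
import torch\n\ntorch.manual_seed(0)\nd = 5\nA = torch.randn(d, d)\nH = A @ A.T / d + torch.eye(d)  # a small SPD stand-in for the Hessian\nb = torch.randn(d)\n\ndampen, scale, maxiter = 0.0, 10.0, 1000\nh = b.clone()  # running estimate\nfor _ in range(maxiter):\n    h = b + (1 - dampen) * h - (H @ h) / scale\n\n# the fixed point of this recursion is scale * H^{-1} b, hence the final rescaling\nprint(torch.allclose(h / scale, torch.linalg.solve(H, b), atol=1e-5))\n
In pyDVL the method is provided by LissaInfluence: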
from pydvl.influence.torch import LissaInfluence, BlockMode, SecondOrderMode\nif_model = LissaInfluence(\n model,\n loss,\n regularization=0.0,\n maxiter=1000,\n dampen=0.0,\n scale=10.0,\n rtol=1e-4,\n block_structure=BlockMode.FULL,\n second_order_mode=SecondOrderMode.GAUSS_NEWTON\n)\nif_model.fit(train_loader)\n
with the additional optional parameters maxiter
, dampen
, scale
, and rtol
, being the maximum number of iterations, the dampening factor, the scaling factor and the relative tolerance, respectively. This implementation is capable of using a block-matrix approximation, see Block-diagonal approximation, and can handle Gauss-Newton approximation.
"},{"location":"influence/influence_function_model/#arnoldi","title":"Arnoldi","text":"The Arnoldi method is a Krylov subspace method for approximating dominating eigenvalues and eigenvectors. Under a low rank assumption on the Hessian at a minimizer (which is typically observed for deep neural networks), this approximation captures the essential action of the Hessian. More concretely, for \\(Hx=b\\) the solution is approximated by
\\[x \\approx V D^{-1} V^T b\\] where \\(D\\) is a diagonal matrix with the top (in absolute value) eigenvalues of the Hessian and \\(V\\) contains the corresponding eigenvectors. See also (Schioppa et al., 2022)6.
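The following minimal sketch illustrates the low-rank solve \\(x \\approx V D^{-1} V^T b\\) on a small explicit matrix, using a full eigendecomposition in place of the actual Arnoldi iteration (which only needs Hessian-vector products). If the Hessian is approximately low-rank, the projection of \\(b\\) onto the kept eigenspace is close to \\(b\\) itself and the approximation is accurate:
import torch\n\ntorch.manual_seed(0)\ntorch.set_default_dtype(torch.float64)\nd, rank = 50, 10\nA = torch.randn(d, d)\nH = A @ A.T / d  # stand-in for the Hessian\nb = torch.randn(d)\n\n# Top eigenpairs by absolute value (eigh returns eigenvalues in ascending order)\neigvals, eigvecs = torch.linalg.eigh(H)\nidx = eigvals.abs().argsort(descending=True)[:rank]\nD, V = eigvals[idx], eigvecs[:, idx]\n\nx_approx = V @ ((V.T @ b) / D)  # V D^{-1} V^T b\n# the approximate solve is exact on the span of the kept eigenvectors\nprint(torch.allclose(H @ x_approx, V @ (V.T @ b)))\n
In pyDVL this method is provided by ArnoldiInfluence: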
from pydvl.influence.torch import ArnoldiInfluence\nif_model = ArnoldiInfluence(\n model,\n loss,\n hessian_regularization=0.0,\n rank_estimate=10,\n tol=1e-6,\n)\nif_model.fit(train_loader)\n
"},{"location":"influence/influence_function_model/#eigenvalue-corrected-k-fac","title":"Eigenvalue Corrected K-FAC","text":"K-FAC, short for Kronecker-Factored Approximate Curvature, is a method that approximates the Fisher information matrix FIM of a model. It is possible to show that for classification models with appropriate loss functions the FIM is equal to the Hessian of the model\u2019s loss over the dataset. In this restricted but nonetheless important context K-FAC offers an efficient way to approximate the Hessian and hence the influence scores. For more info and details refer to the original paper (Martens and Grosse, 2015)7.
The K-FAC method is implemented in the class EkfacInfluence . The following code snippet shows how to use the K-FAC method to calculate the influence function of a model. Note that, in contrast to the other methods for influence function calculation, K-FAC does not require the loss function as an input. This is because the current implementation is only applicable to classification models with a cross entropy loss function.
from pydvl.influence.torch import EkfacInfluence\nif_model = EkfacInfluence(\n model,\n hessian_regularization=0.0,\n)\nif_model.fit(train_loader)\n
Upon initialization, the K-FAC method will parse the model and extract which layers require grad and which do not. Then it will only calculate the influence scores for the layers that require grad. The current implementation of the K-FAC method is only available for linear layers, and therefore, if the model contains non-linear layers that require gradient, the K-FAC method will raise a NotImplementedLayerRepresentationException. A further improvement of the K-FAC method is the Eigenvalue Corrected K-FAC (EKFAC) method (George et al., 2018)8, which additionally re-fits the eigenvalues of the Hessian, thus providing a more accurate approximation. On top of the K-FAC method, the EKFAC method is implemented by setting update_diagonal=True
when initialising EkfacInfluence . The following code snippet shows how to use the EKFAC method to calculate the influence function of a model.
from pydvl.influence.torch import EkfacInfluence\nif_model = EkfacInfluence(\n model,\n update_diagonal=True,\n hessian_regularization=0.0,\n)\nif_model.fit(train_loader)\n
"},{"location":"influence/influence_function_model/#nystrom-sketch-and-solve","title":"Nystr\u00f6m Sketch-and-Solve","text":"This approximation is based on a Nystr\u00f6m low-rank approximation of the form
\\[\\begin{align*} H_{\\text{nys}} &= (H\\Omega)(\\Omega^TH\\Omega)^{\\dagger}(H\\Omega)^T \\\\\\ &= U \\Lambda U^T, \\end{align*}\\] where \\((\\cdot)^{\\dagger}\\) denotes the Moore-Penrose inverse, in combination with the Sherman\u2013Morrison\u2013Woodbury formula to calculate the action of its inverse:
\\[\\begin{equation*} (H_{\\text{nys}} + \\lambda I)^{-1}x = U(\\Lambda+\\lambda I)^{-1}U^Tx + \\frac{1}{\\lambda}(I-UU^T)x, \\end{equation*}\\] see also (Hataya and Yamada, 2023)9 and (Frangella et al., 2023)4. The essential parameter is the rank of the approximation.
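A minimal sketch of this approximation on a small explicit matrix (for illustration only; pyDVL's implementation works with Hessian-vector products) might look as follows:
import torch\n\ntorch.manual_seed(0)\ntorch.set_default_dtype(torch.float64)\nd, rank, lam = 50, 10, 1e-2\nA = torch.randn(d, d)\nH = A @ A.T / d  # stand-in for the Hessian\nx = torch.randn(d)\n\n# Nystroem approximation H_nys = (H Omega)(Omega^T H Omega)^+ (H Omega)^T\nOmega = torch.randn(d, rank)\nY = H @ Omega\nH_nys = Y @ torch.linalg.pinv(Omega.T @ Y) @ Y.T\n\n# Keep the top eigenpairs, H_nys = U Lambda U^T with U of shape (d, rank)\neigvals, eigvecs = torch.linalg.eigh(H_nys)\nLam, U = eigvals[-rank:], eigvecs[:, -rank:]\n\n# Woodbury-style application of (H_nys + lam * I)^{-1}\ninv_x = U @ ((U.T @ x) / (Lam + lam)) + (x - U @ (U.T @ x)) / lam\nexact = torch.linalg.solve(H_nys + lam * torch.eye(d), x)\nprint(torch.allclose(inv_x, exact))\n
In pyDVL this approximation is provided by NystroemSketchInfluence: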
from pydvl.influence.torch import NystroemSketchInfluence, BlockMode, SecondOrderMode\nif_model = NystroemSketchInfluence(\n model,\n loss,\n rank=10,\n regularization=0.0,\n block_structure=BlockMode.FULL,\n second_order_mode=SecondOrderMode.HESSIAN\n)\nif_model.fit(train_loader)\n
This implementation is capable of using a block-matrix approximation, see Block-diagonal approximation, and can handle Gauss-Newton approximation."},{"location":"influence/influence_function_model/#inverse-harmonic-mean","title":"Inverse Harmonic Mean","text":"This implementation replaces the inverse Hessian matrix in the influence computation with an approximation of the inverse Gauss-Newton vector product and was proposed in (Kwon et al., 2023)10.
The approximation method comprises the following steps:
-
Replace the Hessian \\(H(\\theta)\\) with the Gauss-Newton matrix \\(G(\\theta)\\):
\\[\\begin{equation*} G(\\theta)=n^{-1} \\sum_{i=1}^n \\nabla_{\\theta}\\ell_i\\nabla_{\\theta}\\ell_i^T \\end{equation*}\\] which results in
\\[\\begin{equation*} \\mathcal{I}(z_{t}, z) \\approx \\nabla_{\\theta} \\ell(z_{t}, \\theta)^T (G(\\theta) + \\lambda I_d)^{-1} \\nabla_{\\theta} \\ell(z, \\theta) \\end{equation*}\\] -
Simplify the problem by breaking it down into a block diagonal structure, where each block \\(G_l(\\theta)\\) corresponds to the l-th block:
\\[\\begin{equation*} G_{l}(\\theta) = n^{-1} \\sum_{i=1}^n \\nabla_{\\theta_l} \\ell_i \\nabla_{\\theta_l} \\ell_i^{T} + \\lambda_l I_{d_l}, \\end{equation*}\\] which leads to
\\[\\begin{equation*} \\mathcal{I}(z_{t}, z) \\approx \\nabla_{\\theta} \\ell(z_{t}, \\theta)^T \\operatorname{diag}(G_1(\\theta)^{-1}, \\dots, G_L(\\theta)^{-1}) \\nabla_{\\theta} \\ell(z, \\theta) \\end{equation*}\\] -
Substitute the arithmetic mean of the rank-\\(1\\) updates in \\(G_l(\\theta)\\), with the inverse harmonic mean \\(R_l(\\theta)\\) of the rank-1 updates:
\\[\\begin{align*} G_l(\\theta)^{-1} &= \\left( n^{-1} \\sum_{i=1}^n \\nabla_{\\theta_l} \\ell(z_i, \\theta) \\nabla_{\\theta_l} \\ell(z_i, \\theta)^{T} + \\lambda_l I_{d_l}\\right)^{-1} \\\\\\ R_{l}(\\theta)&= n^{-1} \\sum_{i=1}^n \\left( \\nabla_{\\theta_l} \\ell(z_i, \\theta) \\nabla_{\\theta_l} \\ell(z_i, \\theta)^{T} + \\lambda_l I_{d_l} \\right)^{-1} \\end{align*}\\] -
Use the Sherman\u2013Morrison formula to get an explicit representation of the inverses in the definition of \\(R_l(\\theta):\\)
\\[\\begin{align*} R_l(\\theta) &= n^{-1} \\sum_{i=1}^n \\left( \\nabla_{\\theta_l} \\ell_i \\nabla_{\\theta_l} \\ell_i^{T} + \\lambda_l I_{d_l}\\right)^{-1} \\\\\\ &= n^{-1} \\sum_{i=1}^n \\lambda_l^{-1} \\left(I_{d_l} - \\frac{\\nabla_{\\theta_l} \\ell_i \\nabla_{\\theta_l} \\ell_i^{T}}{\\lambda_l + \\\\|\\nabla_{\\theta_l} \\ell_i\\\\|_2^2}\\right) , \\end{align*}\\] which means application of \\(R_l(\\theta)\\) boils down to computing \\(n\\) rank-\\(1\\) updates.
from pydvl.influence.torch import InverseHarmonicMeanInfluence, BlockMode\n\nif_model = InverseHarmonicMeanInfluence(\n model,\n loss,\n regularization=1e-1,\n block_structure=BlockMode.LAYER_WISE\n)\nif_model.fit(train_loader)\n
This implementation is capable of using a block-matrix approximation, see Block-diagonal approximation. These implementations represent the calculation logic on in-memory tensors. To scale up to large collections of data, we map these influence function models over such collections. For a detailed discussion see the documentation page Scaling Computation.
-
Trefethen, L.N., Bau, D., Iii, 1997. Numerical Linear Algebra. Society for Industrial and Applied Mathematics. https://doi.org/10.1137/1.9780898719574 \u21a9
-
Ji, H., Li, Y., 2017. A breakdown-free block conjugate gradient method. Bit Numer Math 57, 379\u2013403. https://doi.org/10.1007/s10543-016-0631-z \u21a9
-
Bekas, C., Kokiopoulou, E., Saad, Y., 2007. An estimator for the diagonal of a matrix. Applied Numerical Mathematics, Numerical Algorithms, Parallelism and Applications (2) 57, 1214\u20131229. https://doi.org/10.1016/j.apnum.2007.01.003 \u21a9
-
Frangella, Z., Tropp, J.A., Udell, M., 2023. Randomized Nystr\u00f6m Preconditioning. SIAM J. Matrix Anal. Appl. 44, 718\u2013752. https://doi.org/10.1137/21M1466244 \u21a9\u21a9
-
Agarwal, N., Bullins, B., Hazan, E., 2017. Second-Order Stochastic Optimization for Machine Learning in Linear Time. JMLR 18, 1\u201340.\u00a0\u21a9
-
Schioppa, A., Zablotskaia, P., Vilar, D., Sokolov, A., 2022. Scaling Up Influence Functions. Proc. AAAI Conf. Artif. Intell. 36, 8179\u20138186. https://doi.org/10.1609/aaai.v36i8.20791 \u21a9
-
Martens, J., Grosse, R., 2015. Optimizing Neural Networks with Kronecker-factored Approximate Curvature, in: Proceedings of the 32nd International Conference on Machine Learning. Presented at the International Conference on Machine Learning, PMLR, pp. 2408\u20132417.\u00a0\u21a9
-
George, T., Laurent, C., Bouthillier, X., Ballas, N., Vincent, P., 2018. Fast Approximate Natural Gradient Descent in a Kronecker Factored Eigenbasis, in: Advances in Neural Information Processing Systems. Curran Associates, Inc.\u00a0\u21a9
-
Hataya, R., Yamada, M., 2023. Nystr\u00f6m Method for Accurate and Scalable Implicit Differentiation, in: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics. Presented at the International Conference on Artificial Intelligence and Statistics, PMLR, pp. 4643\u20134654.\u00a0\u21a9
-
Kwon, Y., Wu, E., Wu, K., Zou, J., 2023. DataInf: Efficiently Estimating Data Influence in LoRA-tuned LLMs and Diffusion Models. Presented at the The Twelfth International Conference on Learning Representations. https://doi.org/10.48550/arXiv.2310.00902 \u21a9
"},{"location":"influence/scaling_computation/","title":"Scaling Influence Computation","text":"The implementations of InfluenceFunctionModel provide a convenient way to calculate influences for in memory tensors.
Nevertheless, there is a need for computing the influences on batches of data. This might happen, if your input data does not fit into memory (e.g. it is very high-dimensional) or for large models the derivative computations exceed your memory or any combinations of these. For this scenario, we want to map our influence function model over collections of batches (or chunks) of data.
"},{"location":"influence/scaling_computation/#sequential","title":"Sequential","text":"The simplest way is to use a double for-loop to iterate over the batches sequentially and collect them. pyDVL provides the simple convenience class SequentialInfluenceCalculator to do this. The batch size should be chosen as large as possible, such that the corresponding batches fit into memory.
from torch.utils.data import DataLoader\nfrom pydvl.influence import SequentialInfluenceCalculator\nfrom pydvl.influence.torch.util import (\n TorchNumpyConverter, NestedTorchCatAggregator,\n)\nfrom pydvl.influence.torch import CgInfluence\n\nbatch_size = 10\ntrain_dataloader = DataLoader(..., batch_size=batch_size)\ntest_dataloader = DataLoader(..., batch_size=batch_size)\n\ninfl_model = CgInfluence(model, loss, hessian_regularization=0.01)\ninfl_model = infl_model.fit(train_dataloader)\n\ninfl_calc = SequentialInfluenceCalculator(infl_model)\n\n# this does not trigger the computation\nlazy_influences = infl_calc.influences(test_dataloader, train_dataloader)\n\n# trigger computation and pull the result into main memory, \n# result is the full tensor for all combinations of the two loaders\ninfluences = lazy_influences.compute(aggregator=NestedTorchCatAggregator())\n# or\n# trigger computation and write results chunk-wise to disk using zarr \n# in a sequential manner\nlazy_influences.to_zarr(\"local_path/or/url\", TorchNumpyConverter())\n
When invoking the compute
method, you have the option to specify a custom aggregator by implementing NestedSequenceAggregator. This allows for the aggregation of computed chunks. Such an approach is particularly beneficial for straightforward aggregation tasks, commonly seen in sequential computation models. Examples include operations like concatenation, as implemented in NestedTorchCatAggregator, or basic min and max operations. For more intricate aggregations, such as an argmax operation, it's advisable to use the DaskInfluenceCalculator (refer to Parallel for more details). This is because it returns data structures in the form of dask.array.Array objects, which offer an API almost fully compatible with NumPy arrays.
"},{"location":"influence/scaling_computation/#parallel","title":"Parallel","text":"While the sequential calculation helps in the case the resulting tensors are too large to fit into memory, the batches are computed one after another. Because the influence computation itself is completely data parallel, you may want to use a parallel processing framework.
pyDVL provides an implementation of a parallel computation model using dask. The wrapper class DaskInfluenceCalculator has convenience methods to map the influence function computation over chunks of data in a parallel manner.
Again, choosing an appropriate chunk size can be crucial. For a better understanding see the official dask best practice documentation and the following blog entry.
Warning
Make sure to set threads_per_worker=1
, when using the distributed scheduler for computing, if your implementation of InfluenceFunctionModel is not thread-safe.
client = Client(threads_per_worker=1)\n
For details on dask schedulers see the official documentation. import torch\nfrom torch.utils.data import Dataset, DataLoader\nfrom pydvl.influence import DaskInfluenceCalculator\nfrom pydvl.influence.torch import CgInfluence\nfrom pydvl.influence.torch.util import (\n torch_dataset_to_dask_array,\n TorchNumpyConverter,\n)\nfrom distributed import Client\n\ntrain_data_set: Dataset = LargeDataSet(\n ...) # Possibly a large Dataset that does not fit into memory\ntest_data_set: Dataset = LargeDataSet(\n ...) # Possibly a large Dataset that does not fit into memory\n\ntrain_dataloader = DataLoader(train_data_set)\ninfl_model = CgInfluence(model, loss, hessian_regularization=0.01)\ninfl_model = infl_model.fit(train_dataloader)\n\n# wrap your input data into dask arrays\nchunk_size = 10\nda_x, da_y = torch_dataset_to_dask_array(train_data_set, chunk_size=chunk_size)\nda_x_test, da_y_test = torch_dataset_to_dask_array(test_data_set,\n chunk_size=chunk_size)\n\n# use only one thread for scheduling, \n# due to non-thread safety of some torch operations\nclient = Client(n_workers=4, threads_per_worker=1)\n\ninfl_calc = DaskInfluenceCalculator(infl_model, \n converter=TorchNumpyConverter(\n device=torch.device(\"cpu\")\n ),\n client=client)\nda_influences = infl_calc.influences(da_x_test, da_y_test, da_x, da_y)\n# da_influences is a dask.array.Array\n# trigger computation and write chunks to disk in parallel\nda_influences.to_zarr(\"path/or/url\")\n
During initialization of the DaskInfluenceCalculator, the system verifies if all workers are operating in single-threaded mode when the provided influence_function_model is designated as not thread-safe (indicated by the is_thread_safe
property). If this condition is not met, the initialization will raise a specific error, signaling a potential thread-safety conflict. To intentionally skip this safety check (e.g., for debugging purposes using the single machine synchronous scheduler), you can supply the DisableClientSingleThreadCheck type.
from pydvl.influence import DisableClientSingleThreadCheck\n\ninfl_calc = DaskInfluenceCalculator(infl_model,\n TorchNumpyConverter(device=torch.device(\"cpu\")),\n DisableClientSingleThreadCheck)\nda_influences = infl_calc.influences(da_x_test, da_y_test, da_x, da_y)\nda_influences.compute(scheduler=\"synchronous\")\n
"},{"location":"value/","title":"Data valuation","text":"Info
If you want to jump right into it, skip ahead to Computing data values. If you want a quick list of applications, see Applications of data valuation. For a list of all algorithms implemented in pyDVL, see Methods.
Data valuation is the task of assigning a number to each element of a training set which reflects its contribution to the final performance of some model trained on it. Some methods attempt to be model-agnostic, but in most cases the model is an integral part of the method. In these cases, this number is not an intrinsic property of the element of interest, but typically a function of three factors:
-
The dataset \\(D\\), or more generally, the distribution it was sampled from: In some cases one only cares about values wrt. a given data set, in others value would ideally be the (expected) contribution of a data point to any random set \\(D\\) sampled from the same distribution. pyDVL implements methods of the first kind.
-
The algorithm \\(\\mathcal{A}\\) mapping the data \\(D\\) to some estimator \\(f\\) in a model class \\(\\mathcal{F}\\). E.g. MSE minimization to find the parameters of a linear model.
-
The performance metric of interest \\(u\\) for the problem. When value depends on a model, it must be measured in some way which uses it. E.g. the \\(R^2\\) score or the negative MSE over a test set. This metric will be computed over a held-out valuation set.
pyDVL collects algorithms for the computation of data values in this sense, mostly those derived from cooperative game theory. The methods can be found in the package [[pydvl.value]], with support from modules pydvl.utils.dataset and pydvl.utils.utility, as detailed below.
Warning
Be sure to read the section on the difficulties using data values.
There are three main families of methods for data valuation: game-theoretic, influence-based and intrinsic. As of v0.8.1 pyDVL supports the first two. Here, we focus on game-theoretic concepts and refer to the main documentation on the influence function for the second.
"},{"location":"value/#game-theoretical-methods","title":"Game theoretical methods","text":"The main contenders in game-theoretic approaches are Shapley values (Ghorbani and Zou, 2019)1, (Kwon et al., 2021)2, (Schoch et al., 2022)3, their generalization to so-called semi-values by (Kwon and Zou, 2022)4 and [@wang_data_2022], and the Core (Yan and Procaccia, 2021)5. All of these are implemented in pyDVL. For a full list see Methods
In these methods, data points are considered players in a cooperative game whose outcome is the performance of the model when trained on subsets (coalitions) of the data, measured on a held-out valuation set. This outcome, or utility, must typically be computed for every subset of the training set, so that an exact computation is \\(\\mathcal{O} (2^n)\\) in the number of samples \\(n\\), with each iteration requiring a full re-fitting of the model using a coalition as training set. Consequently, most methods involve Monte Carlo approximations, and sometimes approximate utilities which are faster to compute, e.g. proxy models (Wang et al., 2022)6 or constant-cost approximations like Neural Tangent Kernels (Wu et al., 2022)7.
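To make the combinatorial cost explicit, here is a minimal sketch (pure Python, with a toy utility standing in for model re-training and scoring) of the exact Shapley value computed by enumerating, for each point, all coalitions that do not contain it:
from itertools import combinations\nfrom math import comb\n\ndef utility(subset: frozenset) -> float:\n    # Toy utility; in practice this re-trains the model on the subset and scores it\n    return len(subset) ** 0.5\n\ndef exact_shapley(n: int) -> list:\n    values = []\n    for i in range(n):\n        others = [j for j in range(n) if j != i]\n        total = 0.0\n        for k in range(n):  # coalition sizes\n            for s in combinations(others, k):\n                marginal = utility(frozenset(s) | {i}) - utility(frozenset(s))\n                total += marginal / (comb(n - 1, k) * n)\n        values.append(total)\n    return values\n\nprint(exact_shapley(5))  # O(2^n) utility evaluations per data point\n
The Monte Carlo methods in pyDVL replace this exhaustive enumeration with sampling of coalitions or permutations.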
The reasoning behind using game theory is that, in order to be useful, an assignment of value, dubbed valuation function, is usually required to fulfil certain requirements of consistency and \"fairness\". For instance, in some applications value should not depend on the order in which data are considered, or it should be equal for samples that contribute equally to any subset of the data (of equal size). When considering aggregated value for (sub-)sets of data there are additional desiderata, like having a value function that does not increase with repeated samples. Game-theoretic methods are all rooted in axioms that by construction ensure different desiderata, but despite their practical usefulness, none of them are either necessary or sufficient for all applications. For instance, SV methods try to equitably distribute all value among all samples, failing to identify repeated ones as unnecessary, with e.g. a zero value.
"},{"location":"value/#computing-data-values","title":"Computing data values","text":"Using pyDVL to compute data values is a simple process that can be broken down into three steps:
- Creating a Dataset object from your data.
- Creating a Utility which ties your model to the dataset and a scoring function.
- Computing values with a method of your choice, e.g. via compute_shapley_values.
"},{"location":"value/#creating-a-dataset","title":"Creating a Dataset","text":"The first item in the tuple \\((D, \\mathcal{A}, u)\\) characterising data value is the dataset. The class Dataset is a simple convenience wrapper for the train and test splits that is used throughout pyDVL. The test set will be used to evaluate a scoring function for the model.
It can be used as follows:
import numpy as np\nfrom pydvl.utils import Dataset\nfrom sklearn.model_selection import train_test_split\nX, y = np.arange(100).reshape((50, 2)), np.arange(50)\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.5, random_state=16\n)\ndataset = Dataset(X_train, X_test, y_train, y_test)\n
It is also possible to construct Datasets from sklearn toy datasets for illustrative purposes using from_sklearn.
"},{"location":"value/#grouping-data","title":"Grouping data","text":"Be it because data valuation methods are computationally very expensive, or because we are interested in the groups themselves, it can be often useful or necessary to group samples to valuate them together. GroupedDataset provides an alternative to Dataset with the same interface which allows this.
You can see an example in action in the Spotify notebook, but here's a simple example grouping a pre-existing Dataset
. First we construct an array mapping each index in the dataset to a group, then use from_dataset:
import numpy as np\nfrom pydvl.utils import GroupedDataset\n\nnum_groups = 5  # any number of groups\n# Randomly assign elements to any one of num_groups:\ndata_groups = np.random.randint(0, num_groups, len(dataset))\ngrouped_dataset = GroupedDataset.from_dataset(dataset, data_groups)\n
"},{"location":"value/#creating-a-utility","title":"Creating a Utility","text":"In pyDVL we have slightly overloaded the name \"utility\" and use it to refer to an object that keeps track of all three items in \\((D, \\mathcal{A}, u)\\). This will be an instance of Utility which, as mentioned, is a convenient wrapper for the dataset, model and scoring function used for valuation methods.
Here's a minimal example:
import sklearn as sk\nfrom pydvl.utils import Dataset, Utility\n\ndataset = Dataset.from_sklearn(sk.datasets.load_iris())\nmodel = sk.svm.SVC()\nutility = Utility(model, dataset)\n
The object utility
is a callable that data valuation methods will execute with different subsets of training data. Each call will retrain the model on a subset and evaluate it on the test data using a scoring function. By default, Utility will use model.score()
, but it is possible to use any scoring function (greater values must be better). In particular, the constructor accepts the same types as argument as sklearn.model_selection.cross_validate: a string, a scorer callable or None for the default.
utility = Utility(model, dataset, \"explained_variance\")\n
Utility
will wrap the fit()
method of the model to cache its results. This greatly reduces computation times of Monte Carlo methods. Because of how caching is implemented, it is important not to reuse Utility
objects for different datasets. You can read more about setting up the cache in the installation guide, and in the documentation of the caching module.
"},{"location":"value/#using-custom-scorers","title":"Using custom scorers","text":"The scoring
argument of Utility can be used to specify a custom Scorer object. This is a simple wrapper for a callable that takes a model and test data, and returns a score.
More importantly, the object provides information about the range of the score, which is used by some methods to estimate the number of samples necessary, and about what default value to use when the model fails to train.
Note
The most important property of a Scorer
is its default value. Because many models will fail to fit on small subsets of the data, it is important to provide a sensible default value for the score.
It is possible to skip the construction of the Scorer when constructing the Utility
object. The two following calls are equivalent:
import numpy as np\n\nfrom pydvl.utils import Utility, Scorer\n\nutility = Utility(\n model, dataset, \"explained_variance\", score_range=(-np.inf, 1), default_score=0.0\n)\nutility = Utility(\n model, dataset, Scorer(\"explained_variance\", range=(-np.inf, 1), default=0.0)\n)\n
"},{"location":"value/#learning-the-utility","title":"Learning the utility","text":"Because each evaluation of the utility entails a full retrain of the model with a new subset of the training set, it is natural to try to learn this mapping from subsets to scores. This is the idea behind Data Utility Learning (DUL) (Wang et al., 2022)6 and in pyDVL it's as simple as wrapping the Utility
inside DataUtilityLearning:
from pydvl.utils import Utility, DataUtilityLearning, Dataset\nfrom sklearn.linear_model import LinearRegression, LogisticRegression\nfrom sklearn.datasets import load_iris\n\ndataset = Dataset.from_sklearn(load_iris())\nu = Utility(LogisticRegression(), dataset)\ntraining_budget = 3\nwrapped_u = DataUtilityLearning(u, training_budget, LinearRegression())\n\n# First 3 calls will be computed normally\nfor i in range(training_budget):\n _ = wrapped_u((i,))\n# Subsequent calls will be computed using the fit model for DUL\nwrapped_u((1, 2, 3))\n
As you can see, all that is required is a model to learn the utility itself; the fitting and use of the learned model happen behind the scenes.
There is a longer example with an investigation of the results achieved by DUL in a dedicated notebook.
"},{"location":"value/#leave-one-out-values","title":"Leave-One-Out values","text":"LOO is the simplest approach to valuation. It assigns to each sample its marginal utility as value:
\\[v_u(i) = u(D) - u(D_{-i}).\\] For notational simplicity, we consider the valuation function as defined over the indices of the dataset \\(D\\), and \\(i \\in D\\) is the index of the sample, \\(D_{-i}\\) is the training set without the sample \\(x_i\\), and \\(u\\) is the utility function. See the section on notation for more.
For the purposes of data valuation, this is rarely useful beyond serving as a baseline for benchmarking, although in some benchmarks it can perform astonishingly well on occasion. One particular weakness is that it does not necessarily correlate with an intrinsic value of a sample: since it is a marginal utility, it is affected by diminishing returns. Often, the training set is large enough for a single sample not to have any significant effect on training performance, despite any qualities it may possess. Whether this is indicative of low value or not depends on each one's goals and definitions, but other methods are typically preferable.
from pydvl.value.loo import compute_loo\n\nvalues = compute_loo(utility, n_jobs=-1)\n
The return value of all valuation functions is an object of type ValuationResult. This can be iterated over, indexed with integers, slices and Iterables, as well as converted to a pandas.DataFrame.
"},{"location":"value/#problems-of-data-values","title":"Problems of data values","text":"There are a number of factors that affect how useful values can be for your project. In particular, regression can be especially tricky, but the particular nature of every (non-trivial) ML problem can have an effect:
-
Variance of the utility: Classical applications of game theoretic value concepts operate with deterministic utilities, as do many of the bounds in the literature. But in ML we use an evaluation of the model on a validation set as a proxy for the true risk. Even if the utility is bounded, its variance will affect final values, and even more so any Monte Carlo estimates. Several works have tried to cope with variance. [@wang_data_2022] show that by relaxing one of the Shapley axioms and considering the general class of semi-values, of which Shapley is an instance, a choice of constant weights is the best one can do in a utility-agnostic setting. This method, dubbed Data Banzhaf, is available in pyDVL as compute_banzhaf_semivalues.
Averaging repeated utility evaluations: One workaround in pyDVL is to configure the caching system to allow multiple evaluations of the utility for every index set. A moving average is computed and returned once the standard error is small, see CachedFuncConfig. Note however that in practice, the likelihood of cache hits is low, so one would have to force recomputation manually somehow.
-
Unbounded utility: Choosing a scorer for a classifier is simple: accuracy or some F-score provides a bounded number with a clear interpretation. However, in regression problems most scores, like \\(R^2\\), are not bounded because regressors can be arbitrarily bad. This leads to great variability in the utility for low sample sizes, and hence unreliable Monte Carlo approximations to the values. Nevertheless, in practice it is only the ranking of samples that matters, and this tends to be accurate (wrt. the true ranking) despite inaccurate values.
Squashing scores: pyDVL offers a dedicated function composition for scorer functions which can be used to squash a score. The following is defined in module score:
import numpy as np\nfrom pydvl.utils import compose_score\n\ndef sigmoid(x: float) -> float:\n return float(1 / (1 + np.exp(-x)))\n\nsquashed_r2 = compose_score(\"r2\", sigmoid, \"squashed r2\")\n\nsquashed_variance = compose_score(\n \"explained_variance\", sigmoid, \"squashed explained variance\"\n)\n
These squashed scores can prove useful in regression problems, but they can also introduce issues in the low-value regime. -
Data set size: Computing exact Shapley values is NP-hard, and Monte Carlo approximations can converge slowly. Massive datasets are thus impractical, at least with game-theoretical methods. A workaround is to group samples and investigate their value together. You can do this using GroupedDataset. There is a fully worked-out example here. Some algorithms also provide different sampling strategies to reduce the variance, but due to a no-free-lunch-type theorem, no single strategy can be optimal for all utilities. Finally, model specific methods like kNN-Shapley (Jia et al., 2019)8, or altogether different and typically faster approaches like Data-OOB (Kwon and Zou, 2023)9 can also be used.
-
Model size: Since every evaluation of the utility entails retraining the whole model on a subset of the data, large models require great amounts of computation. But also, they will effortlessly interpolate small to medium datasets, leading to great variance in the evaluation of performance on the dedicated validation set. One mitigation for this problem is cross-validation, but this would incur massive computational cost. As of v0.8.1 there are no facilities in pyDVL for cross-validating the utility (note that this would require cross-validating the whole value computation).
"},{"location":"value/#notation-and-nomenclature","title":"Notation and nomenclature","text":"Todo
Organize this section better and use its content consistently throughout the documentation.
The following notation is used throughout the documentation:
Let \\(D = \\{x_1, \\ldots, x_n\\}\\) be a training set of \\(n\\) samples.
The utility function \\(u:\\mathcal{D} \\rightarrow \\mathbb{R}\\) maps subsets of \\(D\\) to real numbers. In pyDVL, we typically call this mapping a score for consistency with sklearn, and reserve the term utility for the triple of dataset \\(D\\), model \\(f\\) and score \\(u\\), since they are used together to compute the value.
The value \\(v\\) of the \\(i\\)-th sample in dataset \\(D\\) wrt. utility \\(u\\) is denoted as \\(v_u(x_i)\\) or simply \\(v(i)\\).
For any \\(S \\subseteq D\\), we denote by \\(S_{-i}\\) the set of samples in \\(D\\) excluding \\(x_i\\), and \\(S_{+i}\\) denotes the set \\(S\\) with \\(x_i\\) added.
The marginal utility of adding sample \\(x_i\\) to a subset \\(S\\) is denoted as \\(\\delta(i) := u(S_{+i}) - u(S)\\).
The set \\(D_{-i}^{(k)}\\) contains all subsets of \\(D\\) of size \\(k\\) that do not include sample \\(x_i\\).
-
Ghorbani, A., Zou, J., 2019. Data Shapley: Equitable Valuation of Data for Machine Learning, in: Proceedings of the 36th International Conference on Machine Learning, PMLR. Presented at the International Conference on Machine Learning (ICML 2019), PMLR, pp. 2242\u20132251.\u00a0\u21a9
-
Kwon, Y., Rivas, M.A., Zou, J., 2021. Efficient Computation and Analysis of Distributional Shapley Values, in: Proceedings of the 24th International Conference on Artificial Intelligence and Statistics. Presented at the International Conference on Artificial Intelligence and Statistics, PMLR, pp. 793\u2013801.\u00a0\u21a9
-
Schoch, S., Xu, H., Ji, Y., 2022. CS-Shapley: Class-wise Shapley Values for Data Valuation in Classification, in: Proc. Of the Thirty-Sixth Conference on Neural Information Processing Systems (NeurIPS). Presented at the Advances in Neural Information Processing Systems (NeurIPS 2022).\u00a0\u21a9
-
Kwon, Y., Zou, J., 2022. Beta Shapley: A Unified and Noise-reduced Data Valuation Framework for Machine Learning, in: Proceedings of the 25th International Conference on Artificial Intelligence and Statistics (AISTATS) 2022,. Presented at the AISTATS 2022, PMLR.\u00a0\u21a9
-
Yan, T., Procaccia, A.D., 2021. If You Like Shapley Then You\u2019ll Love the Core, in: Proceedings of the 35th AAAI Conference on Artificial Intelligence, 2021. Presented at the AAAI Conference on Artificial Intelligence, Association for the Advancement of Artificial Intelligence, pp. 5751\u20135759. https://doi.org/10.1609/aaai.v35i6.16721 \u21a9
-
Wang, T., Yang, Y., Jia, R., 2022. Improving Cooperative Game Theory-based Data Valuation via Data Utility Learning. Presented at the International Conference on Learning Representations (ICLR 2022). Workshop on Socially Responsible Machine Learning, arXiv. https://doi.org/10.48550/arXiv.2107.06336 \u21a9\u21a9
-
Wu, Z., Shu, Y., Low, B.K.H., 2022. DAVINZ: Data Valuation using Deep Neural Networks at Initialization, in: Proceedings of the 39th International Conference on Machine Learning. Presented at the International Conference on Machine Learning, PMLR, pp. 24150\u201324176.\u00a0\u21a9
-
Jia, R., Dao, D., Wang, B., Hubis, F.A., Gurel, N.M., Li, B., Zhang, C., Spanos, C., Song, D., 2019. Efficient task-specific data valuation for nearest neighbor algorithms. Proc. VLDB Endow. 12, 1610\u20131623. https://doi.org/10.14778/3342263.3342637 \u21a9
-
Kwon, Y., Zou, J., 2023. Data-OOB: Out-of-bag Estimate as a Simple and Efficient Data Value, in: Proceedings of the 40th International Conference on Machine Learning. Presented at the International Conference on Machine Learning, PMLR, pp. 18135\u201318152.\u00a0\u21a9
"},{"location":"value/classwise-shapley/","title":"Class-wise Shapley","text":"Class-wise Shapley (CWS) (Schoch et al., 2022)1 offers a Shapley framework tailored for classification problems. Given a sample \\(x_i\\) with label \\(y_i \\in \\mathbb{N}\\), let \\(D_{y_i}\\) be the subset of \\(D\\) with labels \\(y_i\\), and \\(D_{-y_i}\\) be the complement of \\(D_{y_i}\\) in \\(D\\). The key idea is that the sample \\((x_i, y_i)\\) might improve the overall model performance on \\(D\\), while being detrimental for the performance on \\(D_{y_i},\\) e.g. because of a wrong label. To address this issue, the authors introduced
\\[ v_u(i) = \\frac{1}{2^{|D_{-y_i}|}} \\sum_{S_{-y_i}} \\left [ \\frac{1}{|D_{y_i}|}\\sum_{S_{y_i}} \\binom{|D_{y_i}|-1}{|S_{y_i}|}^{-1} \\delta(S_{y_i} | S_{-y_i}) \\right ], \\] where \\(S_{y_i} \\subseteq D_{y_i} \\setminus \\{i\\}\\) and \\(S_{-y_i} \\subseteq D_{-y_i}\\) is arbitrary (in particular, not the complement of \\(S_{y_i}\\)). The function \\(\\delta\\) is called set-conditional marginal Shapley value and is defined as
\\[ \\delta(S | C) = u( S_{+i} | C ) - u(S | C), \\] for any set \\(S\\) such that \\(i \\notin S, C\\) and \\(S \\cap C = \\emptyset\\).
In practical applications, estimating this quantity is done both with Monte Carlo sampling of the powerset, and the set of index permutations (Castro et al., 2009)2. Typically, this requires fewer samples than the original Shapley value, although the actual speed-up depends on the model and the dataset.
Computing classwise Shapley values
Like all other game-theoretic valuation methods, CWS requires a Utility object constructed with model and dataset, with the peculiarity of requiring a specific ClasswiseScorer. The entry point is the function compute_classwise_shapley_values:
from pydvl.value import *\n\nmodel = ...\ndata = Dataset(...)\nscorer = ClasswiseScorer(...)\nutility = Utility(model, data, scorer)\nvalues = compute_classwise_shapley_values(\n utility,\n done=HistoryDeviation(n_steps=500, rtol=5e-2) | MaxUpdates(5000),\n truncation=RelativeTruncation(utility, rtol=0.01),\n done_sample_complements=MaxChecks(1),\n normalize_values=True\n)\n
"},{"location":"value/classwise-shapley/#the-class-wise-scorer","title":"The class-wise scorer","text":"In order to use the classwise Shapley value, one needs to define a ClasswiseScorer. This scorer is defined as
\\[ u(S) = f(a_S(D_{y_i})) g(a_S(D_{-y_i})), \\] where \\(f\\) and \\(g\\) are monotonically increasing functions, \\(a_S(D_{y_i})\\) is the in-class accuracy, and \\(a_S(D_{-y_i})\\) is the out-of-class accuracy (the names originate from a choice by the authors to use accuracy, but in principle any other score, like \\(F_1\\) can be used).
The authors show that \\(f(x)=x\\) and \\(g(x)=e^x\\) have favorable properties and are therefore the defaults, but we leave the option to set different functions \\(f\\) and \\(g\\) for an exploration with different base scores.
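For example, with the default choices and hypothetical accuracies \\(a_S(D_{y_i}) = 0.8\\) in class and \\(a_S(D_{-y_i}) = 0.5\\) out of class, the discounted utility is \\(u(S) = 0.8 \\cdot e^{0.5} \\approx 1.32\\): in-class accuracy enters linearly, while out-of-class accuracy enters only through the exponential factor.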
The default class-wise scorer
Constructing the CWS scorer requires choosing a metric and the functions \\(f\\) and \\(g\\):
import numpy as np\nfrom pydvl.value.shapley.classwise import ClasswiseScorer\n\n# These are the defaults\nidentity = lambda x: x\nscorer = ClasswiseScorer(\n \"accuracy\",\n in_class_discount_fn=identity,\n out_of_class_discount_fn=np.exp\n)\n
Surface of the discounted utility function: the level curves for \\(f(x)=x\\) and \\(g(x)=e^x\\) are shown below, annotated with their respective gradients. [Figure: Level curves of the class-wise utility]
"},{"location":"value/classwise-shapley/#evaluation","title":"Evaluation","text":"We illustrate the method with two experiments: point removal and noise removal, as well as an analysis of the distribution of the values. For this we employ the nine datasets used in (Schoch et al., 2022)1, using the same pre-processing. For images, PCA is used to reduce down to 32 the features found by a pre-trained Resnet18
model. Standard loc-scale normalization is performed for all models except gradient boosting, since the latter is not sensitive to the scale of the features.
Datasets used for evaluation:

| Dataset | Data Type | Classes | Input Dims | OpenML ID |
|---|---|---|---|---|
| Diabetes | Tabular | 2 | 8 | 37 |
| Click | Tabular | 2 | 11 | 1216 |
| CPU | Tabular | 2 | 21 | 197 |
| Covertype | Tabular | 7 | 54 | 1596 |
| Phoneme | Tabular | 2 | 5 | 1489 |
| FMNIST | Image | 2 | 32 | 40996 |
| CIFAR10 | Image | 2 | 32 | 40927 |
| MNIST (binary) | Image | 2 | 32 | 554 |
| MNIST (multi) | Image | 10 | 32 | 554 |

We show the mean and the coefficient of variation (CV) \\(\\frac{\\sigma}{\\mu}\\) of an \"inner metric\". The former shows the performance of the method, whereas the latter displays its stability: we normalize by the mean to see the relative effect of the standard deviation. Ideally the mean value is maximal and the CV minimal.
Finally, we note that for all sampling-based valuation methods the same number of evaluations of the marginal utility was used. This is important to make the algorithms comparable, but in practice one should consider using a more sophisticated stopping criterion.
"},{"location":"value/classwise-shapley/#dataset-pruning-for-logistic-regression-point-removal","title":"Dataset pruning for logistic regression (point removal)","text":"In (best-)point removal, one first computes values for the training set and then removes in sequence the points with the highest values. After each removal, the remaining points are used to train the model from scratch and performance is measured on a test set. This produces a curve of performance vs. number of points removed which we show below.
As a scalar summary of this curve, (Schoch et al., 2022)1 define Weighted Accuracy Drop (WAD) as:
\\[ \\text{WAD} = \\sum_{j=1}^{n} \\left ( \\frac{1}{j} \\sum_{i=1}^{j} a_{T_{-\\{1 \\colon i-1 \\}}}(D) - a_{T_{-\\{1 \\colon i \\}}}(D) \\right) = a_T(D) - \\sum_{j=1}^{n} \\frac{a_{T_{-\\{1 \\colon j \\}}}(D)}{j} , \\] where \\(a_T(D)\\) is the accuracy of the model (trained on \\(T\\)) evaluated on \\(D\\) and \\(T_{-\\{1 \\colon j \\}}\\) is the set \\(T\\) without elements from \\(\\{1, \\dots , j \\}\\).
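As a quick illustration, the simplified right-hand expression can be computed directly from the accuracy curve produced by the removal experiment. This is only a sketch assuming an array accuracies where accuracies[j] holds \\(a_{T_{-\\{1 \\colon j \\}}}(D)\\) and accuracies[0] is \\(a_T(D)\\); it is not part of pyDVL:

import numpy as np

def weighted_accuracy_drop(accuracies: np.ndarray) -> float:
    # accuracies[0] is the accuracy with no points removed; accuracies[j] is the
    # accuracy after removing the j highest-valued points and retraining.
    n = len(accuracies) - 1
    return float(accuracies[0] - np.sum(accuracies[1:] / np.arange(1, n + 1)))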
We run the point removal experiment for a logistic regression model five times and compute WAD for each run, then report the mean \\(\\mu_\\text{WAD}\\) and standard deviation \\(\\sigma_\\text{WAD}\\).
Mean WAD for best-point removal on logistic regression. Values computed using LOO, CWS, Beta Shapley, and TMCS
We see that CWS is competitive with all three other methods. It outperforms TMCS in all problems except MNIST (multi), where TMCS has a slight advantage.
In order to understand the variability of WAD we look at its coefficient of variation (lower is better):
Coefficient of Variation of WAD for best-point removal on logistic regression. Values computed using LOO, CWS, Beta Shapley, and TMCS
CWS is not the best method in terms of CV. For CIFAR10, Click, CPU and MNIST (binary), Beta Shapley has the lowest CV. For Diabetes, MNIST (multi) and Phoneme, CWS is the winner, and for FMNIST and Covertype, TMCS takes the lead. Besides LOO, TMCS has the highest relative standard deviation.
The following plot shows accuracy vs number of samples removed. Random values serve as a baseline. The shaded area represents the 95% bootstrap confidence interval of the mean across 5 runs.
Accuracy after best-sample removal using values from logistic regression
Because samples are removed from high to low valuation order, we expect a steep decrease in the curve.
Overall we conclude that in terms of mean WAD, CWS and TMCS perform best, with CWS's CV on par with Beta Shapley's, making CWS a competitive method.
"},{"location":"value/classwise-shapley/#dataset-pruning-for-a-neural-network-by-value-transfer","title":"Dataset pruning for a neural network by value transfer","text":"Transfer of values from one model to another is probably of greater practical relevance: values are computed using a cheap model and used to prune the dataset before training a more expensive one.
The following plot shows accuracy vs number of samples removed for transfer from logistic regression to a neural network. The shaded area represents the 95% bootstrap confidence interval of the mean across 5 runs.
Accuracy after sample removal using values transferred from logistic regression to an MLP
As in the previous experiment, samples are removed from high to low valuation order and hence we expect a steep decrease in the curve. CWS is competitive with the other methods, especially in very unbalanced datasets like Click. In other datasets, like Covertype, Diabetes and MNIST (multi), the performance is on par with TMCS.
"},{"location":"value/classwise-shapley/#detection-of-mis-labeled-data-points","title":"Detection of mis-labeled data points","text":"The next experiment tries to detect mis-labeled data points in binary classification tasks. 20% of the indices is flipped at random (we don't consider multi-class datasets because there isn't a unique flipping strategy). The following table shows the mean of the area under the curve (AUC) for five runs.
Mean AUC for mis-labeled data point detection. Values computed using LOO, CWS, Beta Shapley, and TMCS
In the majority of cases TMCS has a slight advantage over CWS, except for Click, where CWS has a slight edge, most probably due to the unbalanced nature of the dataset. The following plot shows the CV for the AUC of the five runs.
Coefficient of variation of AUC for mis-labeled data point detection. Values computed using LOO, CWS, Beta Shapley, and TMCS
In terms of CV, CWS has a clear edge over TMCS and Beta Shapley.
Finally, we look at the ROC curves when training the classifier on the first \\(n\\) samples in increasing order of valuation (i.e. starting with the worst):
Mean ROC across 5 runs with 95% bootstrap CI
Although at first sight TMCS seems to be the winner, CWS stays competitive after factoring in running time. For a perfectly balanced dataset, CWS needs on average fewer samples than TMCS.
"},{"location":"value/classwise-shapley/#value-distribution","title":"Value distribution","text":"For illustration, we compare the distribution of values computed by TMCS and CWS.
Histogram and estimated density of the values computed by TMCS and CWS on all nine datasets
For Click, TMCS has a multi-modal distribution of values. We hypothesize that this is due to the highly unbalanced nature of the dataset, and notice that CWS has a single mode, which may explain its better performance on this dataset.
"},{"location":"value/classwise-shapley/#conclusion","title":"Conclusion","text":"CWS is an effective way to handle classification problems, in particular for unbalanced datasets. It reduces the computing requirements by considering in-class and out-of-class points separately.
-
Schoch, S., Xu, H., Ji, Y., 2022. CS-Shapley: Class-wise Shapley Values for Data Valuation in Classification, in: Proc. Of the Thirty-Sixth Conference on Neural Information Processing Systems (NeurIPS). Presented at the Advances in Neural Information Processing Systems (NeurIPS 2022).\u00a0\u21a9\u21a9\u21a9
-
Castro, J., G\u00f3mez, D., Tejada, J., 2009. Polynomial calculation of the Shapley value based on sampling. Computers & Operations Research, Selected papers presented at the Tenth International Symposium on Locational Decisions (ISOLDE X) 36, 1726\u20131730. https://doi.org/10.1016/j.cor.2008.04.004 \u21a9
"},{"location":"value/semi-values/","title":"Semi-values","text":"SV is a particular case of a more general concept called semi-value, which is a generalization to different weighting schemes. A semi-value is any valuation function with the form:
\\[ v_\\text{semi}(i) = \\sum_{k=1}^n w(k) \\sum_{S \\subset D_{-i}^{(k)}} [u(S_{+i}) - u(S)], \\] where the coefficients \\(w(k)\\) satisfy the property:
\\[\\sum_{k=1}^n w(k) = 1,\\] the set \\(D_{-i}^{(k)}\\) contains all subsets of \\(D\\) of size \\(k\\) that do not include sample \\(x_i\\), \\(S_{+i}\\) is the set \\(S\\) with \\(x_i\\) added, and \\(u\\) is the utility function.
Two instances of this are Banzhaf indices (Wang and Jia, 2023)1 and Beta Shapley (Kwon and Zou, 2022)2, both of which offer better numerical and rank stability than the Shapley value in certain situations.
Note
Shapley values are a particular case of semi-values and can therefore also be computed with the methods described here. However, as of version 0.8.1, we recommend using compute_shapley_values instead, in particular because it implements truncation policies for TMCS.
"},{"location":"value/semi-values/#beta-shapley","title":"Beta Shapley","text":"For some machine learning applications, where the utility is typically the performance when trained on a set \\(S \\subset D\\), diminishing returns are often observed when computing the marginal utility of adding a new data point.
Beta Shapley is a weighting scheme that uses the Beta function to place more weight on subsets deemed to be more informative. The weights are defined as:
\\[ w(k) := \\frac{B(k+\\beta, n-k+1+\\alpha)}{B(\\alpha, \\beta)}, \\] where \\(B\\) is the Beta function, and \\(\\alpha\\) and \\(\\beta\\) are parameters that control the weighting of the subsets. Setting both to 1 recovers Shapley values, and setting \\(\\alpha = 1\\) and \\(\\beta = 16\\) is reported in (Kwon and Zou, 2022)2 to be a good choice for some applications. Beta Shapley values are available in pyDVL through compute_beta_shapley_semivalues:
from pydvl.value import *\n\nutility = Utility(model, data)\nvalues = compute_beta_shapley_semivalues(\n u=utility, done=AbsoluteStandardError(threshold=1e-4), alpha=1, beta=16\n)\n
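For intuition about how \\(\\alpha\\) and \\(\\beta\\) shape the weights, the displayed formula can be evaluated directly with scipy's Beta function. This is only an illustrative sketch of the expression above, not pyDVL's beta_coefficient implementation:

from scipy.special import beta as beta_fn

def beta_shapley_weight(k: int, n: int, alpha: float = 1.0, beta: float = 16.0) -> float:
    # w(k) = B(k + beta, n - k + 1 + alpha) / B(alpha, beta), as displayed above.
    return beta_fn(k + beta, n - k + 1 + alpha) / beta_fn(alpha, beta)

# Weights over all coalition sizes for a dataset of 100 points, with the defaults:
weights = [beta_shapley_weight(k, n=100) for k in range(1, 101)]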
See however the Banzhaf indices section for an alternative choice of weights which is reported to work better.
"},{"location":"value/semi-values/#banzhaf-indices","title":"Banzhaf indices","text":"As noted in the section Problems of Data Values, the Shapley value can be very sensitive to variance in the utility function. For machine learning applications, where the utility is typically the performance when trained on a set \\(S \\subset D\\), this variance is often largest for smaller subsets \\(S\\). It is therefore reasonable to try reducing the relative contribution of these subsets with adequate weights.
One such choice of weights is the Banzhaf index, which is defined as the constant:
\\[w(k) := 2^{n-1},\\] for all set sizes \\(k\\). The intuition for picking a constant weight is that for any choice of weight function \\(w\\), one can always construct a utility with higher variance where \\(w\\) is greater. Therefore, in a worst-case sense, the best one can do is to pick a constant weight.
The authors of (Wang and Jia, 2023)1 show that Banzhaf indices are more robust to variance in the utility function than Shapley and Beta Shapley values. They are available in pyDVL through compute_banzhaf_semivalues:
from pydvl.value import *\n\nutility = Utility(model, data)\nvalues = compute_banzhaf_semivalues(\n u=utility, done=AbsoluteStandardError(threshold=1e-4)\n)\n
"},{"location":"value/semi-values/#banzhaf-semi-values-with-msr-sampling","title":"Banzhaf semi-values with MSR sampling","text":"Wang et. al. propose a more sample-efficient method for computing Banzhaf semivalues in their paper Data Banzhaf: A Robust Data Valuation Framework for Machine Learning (Wang and Jia, 2023)1. This method updates all semivalues per evaluation of the utility (i.e. per model trained) based on whether a specific data point was included in the data subset or not. The expression for computing the semivalues is
\\[\\hat{\\phi}_{MSR}(i) = \\frac{1}{|\\mathbf{S}_{\\ni i}|} \\sum_{S \\in \\mathbf{S}_{\\ni i}} U(S) - \\frac{1}{|\\mathbf{S}_{\\not{\\ni} i}|} \\sum_{S \\in \\mathbf{S}_{\\not{\\ni} i}} U(S)\\] where \\(\\mathbf{S}_{\\ni i}\\) are the subsets that contain the index \\(i\\) and \\(\\mathbf{S}_{\\not{\\ni} i}\\) are the subsets not containing the index \\(i\\).
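As an illustration of this estimator, given a record of sampled subsets and their utilities one can compute the MSR estimate for every index with two running averages. The sketch below is independent of pyDVL's implementation and all names are illustrative:

import numpy as np

def msr_banzhaf_estimates(samples, n):
    # samples: list of (subset, utility) pairs, where each subset is a set of indices in range(n).
    values = np.zeros(n)
    for i in range(n):
        with_i = [u for subset, u in samples if i in subset]
        without_i = [u for subset, u in samples if i not in subset]
        if with_i and without_i:
            # Difference of the mean utility over subsets containing i and not containing i.
            values[i] = np.mean(with_i) - np.mean(without_i)
    return values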
The function implementing this method is compute_msr_banzhaf_semivalues.
from pydvl.value import compute_msr_banzhaf_semivalues, RankCorrelation, Utility\n\nutility = Utility(model, data)\nvalues = compute_msr_banzhaf_semivalues(\n u=utility, done=RankCorrelation(rtol=0.001),\n )\n
For further details on how to use this method and a comparison of the sample efficiency, we suggest taking a look at the example notebook msr_banzhaf_spotify."},{"location":"value/semi-values/#general-semi-values","title":"General semi-values","text":"As explained above, both Beta Shapley and Banzhaf indices are special cases of semi-values. In pyDVL we provide a general method for computing these with any combination of the three ingredients that define a semi-value:
- A utility function \\(u\\).
- A sampling method
- A weighting scheme \\(w\\).
You can construct any combination of these three ingredients with compute_generic_semivalues. The utility function is the same as for Shapley values, and the sampling method can be any of the types defined in the samplers module. For instance, the following snippet is equivalent to the above:
from pydvl.value import *\n\ndata = Dataset(...)\nutility = Utility(model, data)\nvalues = compute_generic_semivalues(\n sampler=PermutationSampler(data.indices),\n u=utility,\n coefficient=beta_coefficient(alpha=1, beta=16),\n done=AbsoluteStandardError(threshold=1e-4),\n)\n
Allowing any coefficient can help when experimenting with models which are more sensitive to changes in training set size. However, Data Banzhaf indices are proven to be the most robust to variance in the utility function, in the sense of rank stability, across a range of models and datasets (Wang and Jia, 2023)1.
Careful with permutation sampling
This generic implementation of semi-values allowing for any combination of sampling and weighting schemes is very flexible and, in principle, it recovers the original Shapley value, so that compute_shapley_values is no longer necessary. However, it loses the optimization in permutation sampling that reuses the utility computation from the last iteration when iterating over a permutation. This doubles the computation requirements (and slightly increases variance) when using permutation sampling, unless the cache is enabled. In addition, as mentioned above, truncation policies are not supported by this generic implementation (as of v0.8.1). For these reasons it is preferable to use compute_shapley_values whenever not computing other semi-values.
-
Wang, J.T., Jia, R., 2023. Data Banzhaf: A Robust Data Valuation Framework for Machine Learning, in: Proceedings of The 26th International Conference on Artificial Intelligence and Statistics. Presented at the International Conference on Artificial Intelligence and Statistics, PMLR, pp. 6388\u20136421.\u00a0\u21a9\u21a9\u21a9\u21a9
-
Kwon, Y., Zou, J., 2022. Beta Shapley: A Unified and Noise-reduced Data Valuation Framework for Machine Learning, in: Proceedings of the 25th International Conference on Artificial Intelligence and Statistics (AISTATS) 2022,. Presented at the AISTATS 2022, PMLR.\u00a0\u21a9\u21a9
"},{"location":"value/shapley/","title":"Shapley value","text":""},{"location":"value/shapley/#shapley-value","title":"Shapley value","text":"The Shapley method is an approach to compute data values originating in cooperative game theory. Shapley values are a common way of assigning payoffs to each participant in a cooperative game (i.e. one in which players can form coalitions) in a way that ensures that certain axioms are fulfilled.
pyDVL implements several methods for the computation and approximation of Shapley values. They can all be accessed via the facade function compute_shapley_values. The supported methods are enumerated in ShapleyMode.
Empirically, the most useful method is the so-called Truncated Monte Carlo Shapley (Ghorbani and Zou, 2019)1, which is a Monte Carlo approximation of the permutation Shapley value.
"},{"location":"value/shapley/#combinatorial-shapley","title":"Combinatorial Shapley","text":"The first algorithm is just a verbatim implementation of the definition. As such it returns as exact a value as the utility function allows (see what this means in Problems of Data Values).
The value \\(v\\) of the \\(i\\)-th sample in dataset \\(D\\) wrt. utility \\(u\\) is computed as a weighted sum of its marginal utility wrt. every possible coalition of training samples within the training set:
\\[ v(i) = \\frac{1}{n} \\sum_{S \\subseteq D_{-i}} \\binom{n-1}{ | S | }^{-1} [u(S_{+i}) - u(S)] ,\\] where \\(D_{-i}\\) denotes the set of samples in \\(D\\) excluding \\(x_i\\), and \\(S_{+i}\\) denotes the set \\(S\\) with \\(x_i\\) added.
from pydvl.value import compute_shapley_values\n\nvalues = compute_shapley_values(utility, mode=\"combinatorial_exact\")\ndf = values.to_dataframe(column='value')\n
We can convert the return value to a pandas.DataFrame and name the column with the results as value. Please refer to the documentation in shapley and ValuationResult for more information.
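To make the combinatorial definition above concrete, here is a brute-force sketch for a generic utility given as a callable over index tuples. It enumerates all \\(2^{|D|-1}\\) subsets per point and is only meant to illustrate the formula, not pyDVL's implementation:

from itertools import combinations
from math import comb

def exact_shapley(u, n):
    # u: callable taking a tuple of indices and returning the utility of that coalition.
    values = []
    for i in range(n):
        rest = [j for j in range(n) if j != i]
        total = 0.0
        for k in range(n):  # subset sizes 0 .. n-1
            for S in combinations(rest, k):
                total += (u(S + (i,)) - u(S)) / comb(n - 1, k)
        values.append(total / n)
    return values

For a symmetric toy utility like u = lambda S: len(S), every point receives value 1 and the values sum to u(D) = n, as required by the efficiency axiom.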
"},{"location":"value/shapley/#monte-carlo-combinatorial-shapley","title":"Monte Carlo Combinatorial Shapley","text":"Because the number of subsets \\(S \\subseteq D_{-i}\\) is \\(2^{ | D | - 1 }\\), one typically must resort to approximations. The simplest one is done via Monte Carlo sampling of the powerset \\(\\mathcal{P}(D)\\). In pyDVL this simple technique is called \"Monte Carlo Combinatorial\". The method has very poor converge rate and others are preferred, but if desired, usage follows the same pattern:
from pydvl.value import compute_shapley_values, MaxUpdates\n\nvalues = compute_shapley_values(\n utility, mode=\"combinatorial_montecarlo\", done=MaxUpdates(1000)\n)\ndf = values.to_dataframe(column='cmc')\n
The DataFrames returned by most Monte Carlo methods will contain approximate standard errors as an additional column, in this case named cmc_stderr.
Note the usage of the object MaxUpdates as the stop condition. This is an instance of a StoppingCriterion. Other examples are MaxTime and AbsoluteStandardError.
"},{"location":"value/shapley/#owen-sampling","title":"Owen sampling","text":"Owen Sampling (Okhrati and Lipani, 2021)2 is a practical algorithm based on the combinatorial definition. It uses a continuous extension of the utility from \\(\\{0,1\\}^n\\), where a 1 in position \\(i\\) means that sample \\(x_i\\) is used to train the model, to \\([0,1]^n\\). The ensuing expression for Shapley value uses integration instead of discrete weights:
\\[ v_u(i) = \\int_0^1 \\mathbb{E}_{S \\sim P_q(D_{-i})} [u(S_{+i}) - u(S)] \\, dq. \\] Using Owen sampling follows the same pattern as every other method for Shapley values in pyDVL. First construct the dataset and utility, then call compute_shapley_values:
from pydvl.value import compute_shapley_values\n\nvalues = compute_shapley_values(\n u=utility, mode=\"owen\", n_iterations=4, max_q=200\n)\n
There are more details on Owen sampling, and its variant Antithetic Owen Sampling in the documentation for the function doing the work behind the scenes: owen_sampling_shapley.
Note that in this case we do not pass a StoppingCriterion to the function, but instead the number of iterations and the maximum number of samples to use in the integration.
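To give an idea of what the integral expresses, the inner expectation can be approximated by sampling subsets in which each point of \\(D_{-i}\\) is included independently with probability \\(q\\), and the outer integral by a grid over \\(q\\). The following is a conceptual sketch under those assumptions, not pyDVL's implementation, and all names are illustrative:

import numpy as np

def owen_estimate(u, n, i, n_q=200, samples_per_q=4, seed=None):
    # u: callable over tuples of indices; i: the index being valued.
    rng = np.random.default_rng(seed)
    rest = np.array([j for j in range(n) if j != i])
    qs = np.linspace(0, 1, n_q)
    total = 0.0
    for q in qs:
        for _ in range(samples_per_q):
            S = tuple(rest[rng.random(len(rest)) < q])  # each point kept with probability q
            total += u(S + (i,)) - u(S)
    return total / (n_q * samples_per_q)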
"},{"location":"value/shapley/#permutation-shapley","title":"Permutation Shapley","text":"An equivalent way of computing Shapley values (ApproShapley
) appeared in (Castro et al., 2009)3 and is the basis for the method most often used in practice. It uses permutations over indices instead of subsets:
\\[ v_u(x_i) = \\frac{1}{n!} \\sum_{\\sigma \\in \\Pi(n)} [u(\\sigma_{:i} \\cup \\{x_i\\}) - u(\\sigma_{:i})], \\] where \\(\\sigma_{:i}\\) denotes the set of indices in permutation \\(\\sigma\\) before the position where \\(i\\) appears. To approximate this sum (which has \\(\\mathcal{O}(n!)\\) terms!) one uses Monte Carlo sampling of permutations, something which has surprisingly low sample complexity. One notable difference wrt. the combinatorial approach above is that the approximations always fulfill the efficiency axiom of Shapley, namely \\(\\sum_{i=1}^n \\hat{v}_i = u(D)\\) (see (Castro et al., 2009)3, Proposition 3.2).
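A minimal Monte Carlo sketch of this expression samples permutations, walks through each one and accumulates marginal utilities, reusing the previous utility evaluation at every step. This only illustrates the estimator (without truncation) for a generic callable utility; it is not pyDVL's implementation:

import numpy as np

def permutation_shapley(u, n, n_permutations=100, seed=None):
    # u: callable over tuples of indices.
    rng = np.random.default_rng(seed)
    values = np.zeros(n)
    for _ in range(n_permutations):
        sigma = rng.permutation(n)
        prev = u(())
        for pos, i in enumerate(sigma):
            curr = u(tuple(sigma[: pos + 1]))
            values[i] += curr - prev  # marginal utility of adding i after sigma[:pos]
            prev = curr               # reuse the evaluation for the next step
    return values / n_permutations

Note how each utility evaluation is reused for the next marginal within a permutation, halving the number of utility calls per permutation.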
By adding two types of early stopping, the result is the so-called Truncated Monte Carlo Shapley (Ghorbani and Zou, 2019)1, which is efficient enough to be useful in applications. The first is simply a convergence criterion, of which there are several to choose from. The second is a criterion to truncate the iteration over single permutations. RelativeTruncation chooses to stop iterating over samples in a permutation when the marginal utility becomes too small.
from pydvl.value import compute_shapley_values, MaxUpdates, RelativeTruncation\n\nvalues = compute_shapley_values(\n u=utility,\n mode=\"permutation_montecarlo\",\n done=MaxUpdates(1000),\n truncation=RelativeTruncation(utility, rtol=0.01)\n)\n
You can see this method in action in this example using the Spotify dataset.
"},{"location":"value/shapley/#exact-shapley-for-knn","title":"Exact Shapley for KNN","text":"It is possible to exploit the local structure of K-Nearest Neighbours to reduce the amount of subsets to consider: because no sample besides the K closest affects the score, most are irrelevant and it is possible to compute a value in linear time. This method was introduced by (Jia et al., 2019)4, and can be used in pyDVL with:
from pydvl.utils import Dataset, Utility\nfrom pydvl.value import compute_shapley_values\nfrom sklearn.neighbors import KNeighborsClassifier\n\nmodel = KNeighborsClassifier(n_neighbors=5)\ndata = Dataset(...)\nutility = Utility(model, data)\nvalues = compute_shapley_values(u=utility, mode=\"knn\")\n
"},{"location":"value/shapley/#group-testing","title":"Group testing","text":"An alternative method for the approximation of Shapley values introduced in (Jia et al., 2019)4 first estimates the differences of values with a Monte Carlo sum. With
\\[\\hat{\\Delta}_{i j} \\approx v_i - v_j,\\] one then solves the following linear constraint satisfaction problem (CSP) to infer the final values:
\\[ \\begin{array}{lll} \\sum_{i = 1}^N v_i & = & U (D)\\\\ | v_i - v_j - \\hat{\\Delta}_{i j} | & \\leqslant & \\frac{\\varepsilon}{2 \\sqrt{N}} \\end{array} \\] Warning
We have reproduced this method in pyDVL for completeness and benchmarking, but we don't advocate its use because of the speed and memory cost. Despite our best efforts, the number of samples required in practice for convergence can be several orders of magnitude worse than with e.g. TMCS. Additionally, the CSP can sometimes turn out to be infeasible.
Usage follows the same pattern as every other Shapley method, but with the addition of an epsilon parameter required for the solution of the CSP. It should be the same value used to compute the minimum number of samples required. This can be done with num_samples_eps_delta, but note that the number returned will be huge! In practice, fewer samples can be enough, but the actual number will strongly depend on the utility, in particular its variance.
from pydvl.utils import Dataset, Utility\nfrom pydvl.value import compute_shapley_values\nfrom pydvl.value.shapley.gt import num_samples_eps_delta\n\nmodel = ...\ndata = Dataset(...)\nutility = Utility(model, data, score_range=(_min, _max))\nmin_iterations = num_samples_eps_delta(epsilon, delta, len(data), utility.score_range)\nvalues = compute_shapley_values(\n u=utility, mode=\"group_testing\", n_iterations=min_iterations, eps=epsilon\n)\n
-
Ghorbani, A., Zou, J., 2019. Data Shapley: Equitable Valuation of Data for Machine Learning, in: Proceedings of the 36th International Conference on Machine Learning, PMLR. Presented at the International Conference on Machine Learning (ICML 2019), PMLR, pp. 2242\u20132251.\u00a0\u21a9\u21a9
-
Okhrati, R., Lipani, A., 2021. A Multilinear Sampling Algorithm to Estimate Shapley Values, in: 2020 25th International Conference on Pattern Recognition (ICPR). Presented at the 2020 25th International Conference on Pattern Recognition (ICPR), IEEE, pp. 7992\u20137999. https://doi.org/10.1109/ICPR48806.2021.9412511 \u21a9
-
Castro, J., G\u00f3mez, D., Tejada, J., 2009. Polynomial calculation of the Shapley value based on sampling. Computers & Operations Research, Selected papers presented at the Tenth International Symposium on Locational Decisions (ISOLDE X) 36, 1726\u20131730. https://doi.org/10.1016/j.cor.2008.04.004 \u21a9\u21a9
-
Jia, R., Dao, D., Wang, B., Hubis, F.A., Gurel, N.M., Li, B., Zhang, C., Spanos, C., Song, D., 2019. Efficient task-specific data valuation for nearest neighbor algorithms. Proc. VLDB Endow. 12, 1610\u20131623. https://doi.org/10.14778/3342263.3342637 \u21a9\u21a9
"},{"location":"value/the-core/","title":"Core values","text":"Shapley values define a fair way to distribute payoffs amongst all participants (training points) when they form a grand coalition, i.e. when the model is trained on the whole dataset. But they do not consider the question of stability: under which conditions do all participants in a game form the grand coalition? Are the payoffs distributed in such a way that prioritizes its formation?
The Core is another solution concept in cooperative game theory that attempts to ensure stability in the sense that it provides the set of feasible payoffs that cannot be improved upon by a sub-coalition. This can be interesting for some applications of data valuation because it yields values consistent with training on the whole dataset, avoiding the spurious selection of subsets.
It satisfies the following 2 properties:
-
Efficiency: The payoffs are distributed such that it is not possible to make any participant better off without making another one worse off. \\(\\sum_{i \\in D} v(i) = u(D).\\)
-
Coalitional rationality: The sum of payoffs to the agents in any coalition \\(S\\) is at least as large as the amount that these agents could earn by forming a coalition on their own. \\(\\sum_{i \\in S} v(i) \\geq u(S), \\forall S \\subset D.\\)
The Core was first introduced into data valuation by (Yan and Procaccia, 2021)1, in the following form.
"},{"location":"value/the-core/#least-core-values","title":"Least Core values","text":"Unfortunately, for many cooperative games the Core may be empty. By relaxing the coalitional rationality property by a subsidy \\(e \\gt 0\\), we are then able to find approximate payoffs:
\\[ \\sum_{i\\in S} v(i) + e \\geq u(S), \\forall S \\subset D, S \\neq \\emptyset \\ ,\\] The Least Core (LC) values \\(\\{v\\}\\) for utility \\(u\\) are computed by solving the following linear program:
\\[ \\begin{array}{lll} \\text{minimize} & e & \\\\ \\text{subject to} & \\sum_{i\\in D} v(i) = u(D) & \\\\ & \\sum_{i\\in S} v(i) + e \\geq u(S) &, \\forall S \\subset D, S \\neq \\emptyset \\\\ \\end{array} \\] Note that solving this program yields a set of solutions \\(\\{v_j:N \\rightarrow \\mathbb{R}\\}\\), whereas the Shapley value is a single function \\(v\\). In order to obtain a single valuation to use, one breaks ties by solving a quadratic program to select the \\(v\\) in the LC with the smallest \\(\\ell_2\\) norm. This is called the egalitarian least core.
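To make the two optimization problems concrete, here is a sketch of the exact egalitarian least core for a small toy utility using cvxpy. It only illustrates the linear and quadratic programs above; it is not how pyDVL solves them, and all names are illustrative:

import itertools
import cvxpy as cp

def egalitarian_least_core(u, n):
    # u: callable mapping a tuple of indices to the utility of that coalition.
    v = cp.Variable(n)
    e = cp.Variable(nonneg=True)
    proper_subsets = [
        S for k in range(1, n) for S in itertools.combinations(range(n), k)
    ]
    efficiency = [cp.sum(v) == u(tuple(range(n)))]
    core = [cp.sum(v[list(S)]) + e >= u(S) for S in proper_subsets]
    # Linear program: find the smallest subsidy e*.
    cp.Problem(cp.Minimize(e), efficiency + core).solve()
    e_star = e.value
    # Quadratic program: among the payoffs feasible with e*, pick the one with
    # the smallest l2 norm (the egalitarian least core).
    relaxed = [cp.sum(v[list(S)]) + e_star >= u(S) for S in proper_subsets]
    cp.Problem(cp.Minimize(cp.sum_squares(v)), efficiency + relaxed).solve()
    return v.value, float(e_star)

Note that the number of constraints grows exponentially with the number of points, which is why the Monte Carlo variant described below only uses a fraction of them.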
"},{"location":"value/the-core/#exact-least-core","title":"Exact Least Core","text":"This first algorithm is just a verbatim implementation of the definition, in compute_least_core_values. It computes all constraints for the linear problem by evaluating the utility on every subset of the training data, and returns as exact a value as the utility function allows (see what this means in Problems of Data Values).
from pydvl.value import compute_least_core_values\n\nvalues = compute_least_core_values(utility, mode=\"exact\")\n
"},{"location":"value/the-core/#monte-carlo-least-core","title":"Monte Carlo Least Core","text":"Because the number of subsets \\(S \\subseteq D \\setminus \\{i\\}\\) is \\(2^{ | D | - 1 }\\), one typically must resort to approximations.
The simplest one consists in using a fraction of all subsets for the constraints. (Yan and Procaccia, 2021)1 show that a quantity of order \\(\\mathcal{O}((n - \\log \\Delta ) / \\delta^2)\\) is enough to obtain a so-called \\(\\delta\\)-approximate least core with high probability. I.e. the following property holds with probability \\(1-\\Delta\\) over the choice of subsets:
\\[ \\mathbb{P}_{S\\sim D}\\left[\\sum_{i\\in S} v(i) + e^{*} \\geq u(S)\\right] \\geq 1 - \\delta, \\] where \\(e^{*}\\) is the optimal least core subsidy. This approximation is also implemented in compute_least_core_values:
from pydvl.value import compute_least_core_values\n\nvalues = compute_least_core_values(\n utility, mode=\"montecarlo\", n_iterations=n_iterations\n)\n
Note
Although any number is supported, it is best to choose n_iterations to be at least equal to the number of data points.
Because computing the Least Core values requires the solution of a linear and a quadratic problem after computing all the utility values, we offer the possibility of splitting the latter from the former. This is useful when running multiple experiments: use mclc_prepare_problem to prepare a list of problems to solve, then solve them in parallel with lc_solve_problems.
from pydvl.value.least_core import mclc_prepare_problem, lc_solve_problems\n\nn_experiments = 10\nproblems = [mclc_prepare_problem(utility, n_iterations=n_iterations)\n for _ in range(n_experiments)]\nvalues = lc_solve_problems(problems)\n
"},{"location":"value/the-core/#method-comparison","title":"Method comparison","text":"The TransferLab team reproduced the results of the original paper in a publication for the 2022 MLRC (Benmerzoug and Benito Delgado, 2023)2.
Best sample removal on binary image classification
Roughly speaking, MCLC performs better in identifying high value points, as measured by best-sample removal tasks. In all other aspects, it performs worse or similarly to TMCS at comparable sample budgets. But using an equal number of subsets is more computationally expensive because of the need to solve large linear and quadratic optimization problems.
Worst sample removal on binary image classification
For these reasons we recommend some variation of SV like TMCS for outlier detection, data cleaning and pruning, and perhaps MCLC for the selection of interesting points to be inspected for the improvement of data collection or model design.
-
Yan, T., Procaccia, A.D., 2021. If You Like Shapley Then You\u2019ll Love the Core, in: Proceedings of the 35th AAAI Conference on Artificial Intelligence, 2021. Presented at the AAAI Conference on Artificial Intelligence, Association for the Advancement of Artificial Intelligence, pp. 5751\u20135759. https://doi.org/10.1609/aaai.v35i6.16721 \u21a9\u21a9
-
Benmerzoug, A., Benito Delgado, M. de, 2023. [Re] If you like Shapley, then you\u2019ll love the core. ReScience C 9. https://doi.org/10.5281/zenodo.8173733 \u21a9
"}]}
\ No newline at end of file
diff --git a/devel/sitemap.xml b/devel/sitemap.xml
index 563908454..457242b87 100644
--- a/devel/sitemap.xml
+++ b/devel/sitemap.xml
@@ -2,482 +2,482 @@
https://pydvl.org/stable/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/CHANGELOG/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/CONTRIBUTING/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/influence/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/influence/array/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/influence/base_influence_function_model/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/influence/influence_calculator/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/influence/types/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/influence/torch/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/influence/torch/base/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/influence/torch/batch_operation/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/influence/torch/functional/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/influence/torch/influence_function_model/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/influence/torch/operator/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/influence/torch/pre_conditioner/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/influence/torch/util/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/parallel/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/parallel/backend/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/parallel/config/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/parallel/map_reduce/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/parallel/backends/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/parallel/backends/joblib/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/parallel/backends/ray/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/parallel/futures/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/parallel/futures/ray/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/reporting/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/reporting/plots/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/reporting/scores/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/config/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/dataset/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/exceptions/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/functional/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/numeric/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/progress/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/score/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/status/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/types/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/utility/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/caching/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/caching/base/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/caching/config/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/caching/disk/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/caching/memcached/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/utils/caching/memory/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/games/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/result/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/sampler/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/semivalues/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/stopping/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/least_core/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/least_core/common/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/least_core/montecarlo/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/least_core/naive/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/loo/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/loo/loo/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/oob/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/oob/oob/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/shapley/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/shapley/classwise/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/shapley/common/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/shapley/gt/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/shapley/knn/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/shapley/montecarlo/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/shapley/naive/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/shapley/owen/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/shapley/truncated/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/api/pydvl/value/shapley/types/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/examples/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/examples/data_oob/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/examples/influence_imagenet/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/examples/influence_sentiment_analysis/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/examples/influence_synthetic/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/examples/influence_wine/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/examples/least_core_basic/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/examples/msr_banzhaf_digits/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/examples/shapley_basic_spotify/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/examples/shapley_knn_flowers/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/examples/shapley_utility_learning/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/getting-started/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/getting-started/advanced-usage/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/getting-started/applications/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/getting-started/benchmarking/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/getting-started/first-steps/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/getting-started/glossary/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/getting-started/methods/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/influence/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/influence/influence_function_model/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/influence/scaling_computation/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/value/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/value/classwise-shapley/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/value/semi-values/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/value/shapley/
- 2024-06-14
+ 2024-06-20
daily
https://pydvl.org/stable/value/the-core/
- 2024-06-14
+ 2024-06-20
daily
\ No newline at end of file
diff --git a/devel/sitemap.xml.gz b/devel/sitemap.xml.gz
index 9c9bcab7795d0a951f5e12ca31ed10d3a5bddb4a..1c2a8d5d1dd3a30f37b9a568727d583a8d4f4ccf 100644
GIT binary patch
delta 869
zcmV-r1DgDW2ZRR)ABzYGnh$i52Ob>ND1x?-6|_Z?1}Ge$aL&YNIV88!>i+7th>mfTrz9}aMx?l9n>6fb7
z@AtQ489gMrjk&AH^ks*tgk(p7Q9sf!6`yanP
ze0=}s=Z_L?pC2Co`t;kE$4?Iud64XUP_)g4)(s`_QHc_AhEa)i3xxG~w{C}Exb!x4
zL7|qYxbO`)GdSRwmC$)iNLs8hn_iW53T|a!HhmovW|;m9RoPEn2RK!1<$d4$N6dc_
zqo>z9pi?d#b_|W38Xa}n2ez^!BtZt}PziDi<_SZi16yLYwVe)Y8R*p+n?ZQ(d>sg+
zO#64-I=fYdmOe;1i`#*RjHC-Iy>&CDnAwI-i88s!q{K=lN{`p6fYBgSkZfmY16)R%
ztU!v-2Gdk?Yxw+?j4Eer7(8Zc6i9zlj$P@ZG(^CmGu7Z^;6X!!%Aa$bozuJJQA3gf
z(@U?UzLkof8U2>4rV2QL8%s}$xE}qqiH$CE%(8oj9k>A%2s`k?Gs5gF70cjQ0hGWe
zCwxl#C`E4%TN*xp;+T^3L_1R*mfv2BFI~UGn&`dog~#vA19rAS3&a&)gnbpg+5Sks7?yFf2RFawoa7xY^kb2L_HFsN*L+>F!
z=?K`?X5zAkNkijyE)s#rg}ZC2NA9D@dyyqS5&7D4dEy_fxuD5_D?v3-sxF@fmiic$
v)RpDgb)Mc_%AK0MEPpP=u766O-kyDB#JBu^7CH}K2zm1_&`!I@8bJU6eYmg1
delta 870
zcmV-s1DX7U2ZaX*ABzYG;rVNk2Ob5r<~
z?RM8>89gMrj=8JI^ks*t!@=t2`uD~9#Z`Gt7nry9ua%2tJza&$WzyJe-Tda?0V@lhc1p523?0<3H;LtQ%L=lOr>uIywH6=)j0;Ut-SA>|A>D%
zV)XQSdvwaB!;Yb`Q=_9U`@mLqge1t|94bL>z&v1RbYM%&wzkt@Ed#weV>1Y^oi77{
zlxe@mjk6nNXz7ELleist$Vj@d(pxuUikWTblqi#nOiHX|qV#y33K$JS1<7`XHo#@H
z$qJ
zFun9j>Sw9=nbDuQYN~)ExUux0i0jc$o7m_w$1J;d*n%5Sfv^KFJR{7`Qn3t<6+j7m
za>A#y4^s4Yzop^xCypsePqZ`DVfpQ~_|o+ktcl(WUwHh*T)>P5vWvRh0qlQPMR>%h
zM5#j><3@>6hm?9K*>+SUHAO)?OQ^}LB{Lks9V@~<1Wiq=D&ro}TGH*&8+WKFCO8kl
z01KZ(xG)Qy3dP6$08%tQf`Q|?Bp~E0@eeM}8VPNaaTMYr+NS2hi$G|Z98fJmpZX-T`3K+
ziuOIM;ngLQbC5Z6*C9_g?JUb~mRY@A3ZdCT>AsrfKqaYZ0H@@v0jc+#Tyi%hIrJX#
zgN}f0Z6+>zm^3tQ=OPh^T)4ZWdgMNeycb#W6Oqq7mnZ(wnlqXVxDqx5rRwr&V5yH`
wNnKfFwE9MtsZvXQ6Zdg^<@j010*Yu^K@D0Oeb`4FCWD