diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 0336abca1..f27d47b90 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.9.0
+current_version = 0.9.1
commit = False
tag = False
allow_dirty = False
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d194f74ad..0092f6045 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,14 @@
# Changelog
-## 0.9.0 🆕 New methods, better docs and bugfixes 📚🐞
+## Unreleased
+
+### Fixed
+
+- `FutureWarning` for `ParallelConfig` was raised even without actually
+  instantiating the object
+  [PR #562](https://github.com/aai-institute/pyDVL/pull/562)
+
+## 0.9.0 - 🆕 New methods, better docs and bugfixes 📚🐞
### Added
diff --git a/CITATION.cff b/CITATION.cff
index 7ca971f99..241536c7d 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -27,6 +27,6 @@ keywords:
- Banzhaf index
license: LGPL-3.0
commit: 0e929ae121820b0014bf245da1b21032186768cb
-version: v0.7.0
-doi: 10.5281/zenodo.8311583
-date-released: '2023-09-02'
+version: v0.9.0
+doi: 10.5281/zenodo.10966754
+date-released: '2024-04-12'
diff --git a/README.md b/README.md
index 2f7296018..97cf5f22d 100644
--- a/README.md
+++ b/README.md
@@ -16,10 +16,8 @@
-**pyDVL** collects algorithms for **Data Valuation** and **Influence Function** computation.
-
-Refer to the [Methods](https://pydvl.org/devel/getting-started/methods/)
-page of our documentation for a list of all implemented methods.
+**pyDVL** collects algorithms for **Data Valuation** and **Influence Function**
+computation. Here is the list of [all methods implemented](https://pydvl.org/devel/getting-started/methods/).
**Data Valuation** for machine learning is the task of assigning a scalar
to each element of a training set which reflects its contribution to the final
@@ -29,7 +27,7 @@ pyDVL focuses on model-dependent methods.
+> **Note** pyDVL currently only supports PyTorch for Influence Functions.
+> We plan to add support for Jax next.
+
+```python
+import torch
+from torch import nn
+from torch.utils.data import DataLoader, TensorDataset
+
+from pydvl.influence import SequentialInfluenceCalculator
+from pydvl.influence.torch import DirectInfluence
+from pydvl.influence.torch.util import (
+    NestedTorchCatAggregator,
+    TorchNumpyConverter,
+)
- ```
-
-4. Define your loss:
-
- ```python
- loss = nn.MSELoss()
- ```
-
-5. Instantiate an `InfluenceFunctionModel` and fit it to the training data
- ```python
- infl_model = DirectInfluence(nn_architecture, loss, hessian_regularization=0.01)
- infl_model = infl_model.fit(train_data_loader)
- ```
+input_dim = (5, 5, 5)
+output_dim = 3
+train_x, train_y = torch.rand((10, *input_dim)), torch.rand((10, output_dim))
+test_x, test_y = torch.rand((5, *input_dim)), torch.rand((5, output_dim))
+train_data_loader = DataLoader(TensorDataset(train_x, train_y), batch_size=2)
+test_data_loader = DataLoader(TensorDataset(test_x, test_y), batch_size=1)
+model = nn.Sequential(
+    nn.Conv2d(in_channels=5, out_channels=3, kernel_size=3),
+    nn.Flatten(),
+    nn.Linear(27, 3),
+)
+loss = nn.MSELoss()
-6. For small input data call influence method on the fitted instance.
-
- ```python
- influences = infl_model.influences(test_x, test_y, train_x, train_y)
- ```
- The result is a tensor of shape `(training samples x test samples)`
- that contains at index `(i, j`) the influence of training sample `i` on
- test sample `j`.
+infl_model = DirectInfluence(model, loss, hessian_regularization=0.01)
+infl_model = infl_model.fit(train_data_loader)
-7. For larger data, wrap the model into a
- calculator and call methods on the calculator.
- ```python
- infl_calc = SequentialInfluenceCalculator(infl_model)
-
- # Lazy object providing arrays batch-wise in a sequential manner
- lazy_influences = infl_calc.influences(test_data_loader, train_data_loader)
+# For small datasets, instantiate the full influence matrix:
+influences = infl_model.influences(test_x, test_y, train_x, train_y)
- # Trigger computation and pull results to memory
- influences = lazy_influences.compute(aggregator=NestedTorchCatAggregator())
+# For larger datasets, use the Influence calculators:
+infl_calc = SequentialInfluenceCalculator(infl_model)
- # Trigger computation and write results batch-wise to disk
- lazy_influences.to_zarr("influences_result", TorchNumpyConverter())
- ```
-
+# Lazy object providing arrays batch-wise in a sequential manner
+lazy_influences = infl_calc.influences(test_data_loader, train_data_loader)
- The higher the absolute value of the influence of a training sample
- on a test sample, the more influential it is for the chosen test sample, model
- and data loaders. The sign of the influence determines whether it is
- useful (positive) or harmful (negative).
+# Trigger computation and pull results to memory
+influences = lazy_influences.compute(aggregator=NestedTorchCatAggregator())
-> **Note** pyDVL currently only support PyTorch for Influence Functions.
-> We are planning to add support for Jax and perhaps TensorFlow or even Keras.
+# Trigger computation and write results batch-wise to disk
+lazy_influences.to_zarr("influences_result", TorchNumpyConverter())
+```
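+
+The higher the absolute value of the influence of a training sample
+on a test sample, the more influential it is for the chosen test sample, model
+and data loaders. The sign of the influence determines whether it is
+useful (positive) or harmful (negative).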
## Data Valuation
The steps required to compute data values for your samples are:
-1. Import the necessary packages (The exact packages depend on your specific use case).
-
- ```python
- import matplotlib.pyplot as plt
- from sklearn.datasets import load_breast_cancer
- from sklearn.linear_model import LogisticRegression
- from pydvl.utils import Dataset, Scorer, Utility
- from pydvl.value import (
- compute_shapley_values,
- ShapleyMode,
- MaxUpdates,
- )
- ```
-
+1. Import the necessary packages (the exact ones will depend on your specific
+   use case).
2. Create a `Dataset` object with your train and test splits.
-
- ```python
- data = Dataset.from_sklearn(
- load_breast_cancer(),
- train_size=10,
- stratify_by_target=True,
- random_state=16,
- )
- ```
-
3. Create an instance of a `SupervisedModel` (basically any sklearn compatible
- predictor).
-
- ```python
- model = LogisticRegression()
- ```
-
-4. Create a `Utility` object to wrap the Dataset, the model and a scoring
- function.
-
- ```python
- u = Utility(
- model,
- data,
- Scorer("accuracy", default=0.0)
- )
- ```
-
-5. Use one of the methods defined in the library to compute the values.
- In our example, we will use *Permutation Montecarlo Shapley*,
- an approximate method for computing Data Shapley values.
-
- ```python
- values = compute_shapley_values(
- u,
- mode=ShapleyMode.PermutationMontecarlo,
- done=MaxUpdates(100),
- seed=16,
- progress=True
- )
- ```
- The result is a variable of type `ValuationResult` that contains
- the indices and their values as well as other attributes.
-
- The higher the value for an index, the more important it is for the chosen
- model, dataset and scorer.
-
-6. (Optional) Convert the valuation result to a dataframe and analyze and visualize the values.
-
- ```python
- df = values.to_dataframe(column="data_value")
- ```
+ predictor), and wrap it in a `Utility` object together with the data and a
+ scoring function.
+4. Use one of the methods defined in the library to compute the values. In the
+   example below, we will use *Permutation Monte Carlo Shapley*, an
+   approximate method for computing Data Shapley values. The result is a
+   variable of type `ValuationResult` that contains the indices and their
+   values as well as other attributes.
+5. Convert the valuation result to a dataframe, and analyze and visualize the
+   values.
+
+The higher the value for an index, the more important it is for the chosen
+model, dataset and scorer. Conversely, low-value points could be mislabelled
+or out-of-distribution, and dropping them can improve the model's performance.
+
+```python
+from sklearn.datasets import load_breast_cancer
+from sklearn.linear_model import LogisticRegression
+
+from pydvl.utils import Dataset, Scorer, Utility
+from pydvl.value import (
+    MaxUpdates,
+    RelativeTruncation,
+    permutation_montecarlo_shapley,
+)
+
+data = Dataset.from_sklearn(
+    load_breast_cancer(),
+    train_size=10,
+    stratify_by_target=True,
+    random_state=16,
+)
+model = LogisticRegression()
+u = Utility(
+    model,
+    data,
+    Scorer("accuracy", default=0.0),
+)
+values = permutation_montecarlo_shapley(
+    u,
+    truncation=RelativeTruncation(u, 0.05),
+    done=MaxUpdates(1000),
+    seed=16,
+    progress=True,
+)
+df = values.to_dataframe(column="data_value")
+```
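+
+As a quick illustration (assuming matplotlib is installed, which pandas uses
+as its plotting backend), one might then rank and plot the values:
+
+```python
+df.sort_values("data_value").plot.bar(y="data_value", legend=False)
+```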
# Contributing
diff --git a/requirements-notebooks.txt b/requirements-notebooks.txt
index 04c7cf350..23c03b420 100644
--- a/requirements-notebooks.txt
+++ b/requirements-notebooks.txt
@@ -3,5 +3,5 @@ distributed==2023.4.0
pillow==10.3.0
torch==2.0.1
torchvision==0.15.2
-transformers==4.36.0
+transformers==4.38.0
zarr==2.16.1
diff --git a/setup.py b/setup.py
index c06657f1d..805a93f22 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@
package_data={"pydvl": ["py.typed"]},
packages=find_packages(where="src"),
include_package_data=True,
- version="0.9.0",
+ version="0.9.1",
description="The Python Data Valuation Library",
install_requires=[
line
diff --git a/src/pydvl/__init__.py b/src/pydvl/__init__.py
index a0bef3141..ba22fa08d 100644
--- a/src/pydvl/__init__.py
+++ b/src/pydvl/__init__.py
@@ -7,4 +7,4 @@
The two main modules you will want to look at are [value][pydvl.value] and
[influence][pydvl.influence].
"""
-__version__ = "0.9.0"
+__version__ = "0.9.1"
diff --git a/src/pydvl/influence/array.py b/src/pydvl/influence/array.py
index a82b380b8..d549eee9d 100644
--- a/src/pydvl/influence/array.py
+++ b/src/pydvl/influence/array.py
@@ -7,6 +7,7 @@
using the Zarr library.
"""
+import logging
from abc import ABC, abstractmethod
from typing import Callable, Generator, Generic, List, Optional, Tuple, Union
@@ -14,6 +15,7 @@
from numpy.typing import NDArray
from zarr.storage import StoreLike
+from ..utils import log_duration
from .base_influence_function_model import TensorType
@@ -119,6 +121,7 @@ def __init__(
):
self.generator_factory = generator_factory
+ @log_duration(log_level=logging.INFO)
def compute(self, aggregator: Optional[SequenceAggregator] = None):
"""
Computes and optionally aggregates the chunks of the array using the provided
@@ -139,6 +142,7 @@ def compute(self, aggregator: Optional[SequenceAggregator] = None):
aggregator = ListAggregator()
return aggregator(self.generator_factory())
+ @log_duration(log_level=logging.INFO)
def to_zarr(
self,
path_or_url: Union[str, StoreLike],
@@ -223,6 +227,7 @@ def __init__(
):
self.generator_factory = generator_factory
+ @log_duration(log_level=logging.INFO)
def compute(self, aggregator: Optional[NestedSequenceAggregator] = None):
"""
Computes and optionally aggregates the chunks of the array using the provided
@@ -244,6 +249,7 @@ def compute(self, aggregator: Optional[NestedSequenceAggregator] = None):
aggregator = NestedListAggregator()
return aggregator(self.generator_factory())
+ @log_duration(log_level=logging.INFO)
def to_zarr(
self,
path_or_url: Union[str, StoreLike],
diff --git a/src/pydvl/influence/torch/influence_function_model.py b/src/pydvl/influence/torch/influence_function_model.py
index f85c0d4f0..46a5fa16e 100644
--- a/src/pydvl/influence/torch/influence_function_model.py
+++ b/src/pydvl/influence/torch/influence_function_model.py
@@ -363,6 +363,7 @@ def is_fitted(self):
except AttributeError:
return False
+ @log_duration(log_level=logging.INFO)
def fit(self, data: DataLoader) -> DirectInfluence:
"""
Compute the hessian matrix based on a provided dataloader.
@@ -500,6 +501,7 @@ def is_fitted(self):
except AttributeError:
return False
+ @log_duration(log_level=logging.INFO)
def fit(self, data: DataLoader) -> CgInfluence:
self.train_dataloader = data
if self.pre_conditioner is not None:
@@ -816,6 +818,7 @@ def is_fitted(self):
except AttributeError:
return False
+ @log_duration(log_level=logging.INFO)
def fit(self, data: DataLoader) -> LissaInfluence:
self.train_dataloader = data
return self
@@ -948,6 +951,7 @@ def is_fitted(self):
except AttributeError:
return False
+ @log_duration(log_level=logging.INFO)
def fit(self, data: DataLoader) -> ArnoldiInfluence:
r"""
Fitting corresponds to the computation of the low rank decomposition
@@ -1204,6 +1208,7 @@ def _get_kfac_blocks(
return forward_x, grad_y
+ @log_duration(log_level=logging.INFO)
def fit(self, data: DataLoader) -> EkfacInfluence:
"""
Compute the KFAC blocks for each layer of the model, using the provided data.
@@ -1712,6 +1717,7 @@ def is_fitted(self):
except AttributeError:
return False
+ @log_duration(log_level=logging.INFO)
def fit(self, data: DataLoader):
self.low_rank_representation = model_hessian_nystroem_approximation(
self.model, self.loss, data, self.rank
diff --git a/src/pydvl/parallel/futures/__init__.py b/src/pydvl/parallel/futures/__init__.py
index c75d04299..0659137ca 100644
--- a/src/pydvl/parallel/futures/__init__.py
+++ b/src/pydvl/parallel/futures/__init__.py
@@ -21,7 +21,7 @@
)
def init_executor(
max_workers: Optional[int] = None,
- config: ParallelConfig = ParallelConfig(),
+ config: Optional[ParallelConfig] = None,
**kwargs,
) -> Generator[Executor, None, None]:
"""Initializes a futures executor for the given parallel configuration.
@@ -50,6 +50,10 @@ def init_executor(
assert results == [1, 2, 3, 4, 5]
```
"""
+
+ if config is None:
+ config = ParallelConfig()
+
try:
cls = ParallelBackend.BACKENDS[config.backend]
with cls.executor(max_workers=max_workers, config=config, **kwargs) as e:
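This hunk is the fix for the `FutureWarning` noted in the changelog: a default
argument is evaluated once, at function definition time, so the deprecated
`ParallelConfig` was constructed (and its warning emitted) as soon as the
module was imported, even if the function was never called. A minimal sketch of
the pattern, with illustrative names:

```python
import warnings
from typing import Optional


class Config:  # stand-in for a class whose constructor warns
    def __init__(self):
        warnings.warn("Config is deprecated", FutureWarning)


# Problematic: the default is built when the `def` statement runs, so the
# warning fires on import even if the function is never called.
def init_eager(config: Config = Config()):
    ...


# Fixed: use None as a sentinel and defer construction to call time.
def init_lazy(config: Optional[Config] = None):
    if config is None:
        config = Config()
    ...
```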
diff --git a/src/pydvl/utils/progress.py b/src/pydvl/utils/progress.py
index 8a49f08aa..9ad931938 100644
--- a/src/pydvl/utils/progress.py
+++ b/src/pydvl/utils/progress.py
@@ -39,19 +39,33 @@ def repeat_indices(
pbar.refresh()
-def log_duration(func):
+def log_duration(_func=None, *, log_level=logging.DEBUG):
"""
- Decorator to log execution time of a function
+ Decorator to log execution time of a function with a configurable logging level.
+ It can be used with or without specifying a log level.
"""
- @wraps(func)
- def wrapper_log_duration(*args, **kwargs):
- func_name = func.__qualname__
- logger.info(f"Function '{func_name}' is starting.")
- start_time = time()
- result = func(*args, **kwargs)
- duration = time() - start_time
- logger.info(f"Function '{func_name}' completed. Duration: {duration:.2f} sec")
- return result
-
- return wrapper_log_duration
+ def decorator_log_duration(func):
+ @wraps(func)
+ def wrapper_log_duration(*args, **kwargs):
+ func_name = func.__qualname__
+ duration_logger = logging.getLogger(func_name)
+ duration_logger.setLevel(log_level)
+ duration_logger.log(log_level, f"Function '{func_name}' is starting.")
+ start_time = time()
+ result = func(*args, **kwargs)
+ duration = time() - start_time
+ duration_logger.log(
+ log_level,
+ f"Function '{func_name}' completed. " f"Duration: {duration:.2f} sec",
+ )
+ return result
+
+ return wrapper_log_duration
+
+ if _func is None:
+ # If log_duration was called without arguments, return decorator
+ return decorator_log_duration
+ else:
+ # If log_duration was called with a function, apply decorator directly
+ return decorator_log_duration(_func)
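
For reference, a minimal usage sketch of the updated decorator (the decorated
functions are illustrative):

```python
import logging

from pydvl.utils import log_duration

logging.basicConfig(level=logging.DEBUG)


@log_duration  # bare form: logs at the default DEBUG level
def quick_step():
    ...


@log_duration(log_level=logging.INFO)  # parametrized form
def expensive_step():
    ...


expensive_step()  # logs "is starting" and the measured duration at INFO
```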