diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 0336abca1..f27d47b90 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.9.0
+current_version = 0.9.1
 commit = False
 tag = False
 allow_dirty = False
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d194f74ad..0092f6045 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,14 @@
 # Changelog
 
-## 0.9.0 🆕 New methods, better docs and bugfixes 📚🐞
+## Unreleased
+
+### Fixed
+
+- `FutureWarning` for `ParallelConfig` raised even when the object was
+  never instantiated by the user
+  [PR #562](https://github.com/aai-institute/pyDVL/pull/562)
+
+## 0.9.0 - 🆕 New methods, better docs and bugfixes 📚🐞
 
 ### Added
diff --git a/CITATION.cff b/CITATION.cff
index 7ca971f99..241536c7d 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -27,6 +27,6 @@ keywords:
   - Banzhaf index
 license: LGPL-3.0
 commit: 0e929ae121820b0014bf245da1b21032186768cb
-version: v0.7.0
-doi: 10.5281/zenodo.8311583
-date-released: '2023-09-02'
+version: v0.9.0
+doi: 10.5281/zenodo.10966754
+date-released: '2024-04-12'
diff --git a/README.md b/README.md
index 2f7296018..97cf5f22d 100644
--- a/README.md
+++ b/README.md
@@ -16,10 +16,8 @@ DOI

-**pyDVL** collects algorithms for **Data Valuation** and **Influence Function** computation.
-
-Refer to the [Methods](https://pydvl.org/devel/getting-started/methods/)
-page of our documentation for a list of all implemented methods.
+**pyDVL** collects algorithms for **Data Valuation** and **Influence Function**
+computation. Here is the list of [all implemented methods](https://pydvl.org/devel/getting-started/methods/).
 
 **Data Valuation** for machine learning is the task of assigning a scalar to
 each element of a training set which reflects its contribution to the final
@@ -29,7 +27,7 @@ pyDVL focuses on model-dependent methods.
+> **Note** pyDVL currently only supports PyTorch for Influence Functions. We plan
+> to add support for Jax next.
+
+```python
+import torch
+from torch import nn
+from torch.utils.data import DataLoader, TensorDataset
+
+from pydvl.influence import SequentialInfluenceCalculator
+from pydvl.influence.torch import DirectInfluence
+from pydvl.influence.torch.util import (
+    NestedTorchCatAggregator,
+    TorchNumpyConverter,
 )
-   ```
-
-4. Define your loss:
-
-   ```python
-   loss = nn.MSELoss()
-   ```
-
-5. Instantiate an `InfluenceFunctionModel` and fit it to the training data
-   ```python
-   infl_model = DirectInfluence(nn_architecture, loss, hessian_regularization=0.01)
-   infl_model = infl_model.fit(train_data_loader)
-   ```
+input_dim = (5, 5, 5)
+output_dim = 3
+train_x, train_y = torch.rand((10, *input_dim)), torch.rand((10, output_dim))
+test_x, test_y = torch.rand((5, *input_dim)), torch.rand((5, output_dim))
+train_data_loader = DataLoader(TensorDataset(train_x, train_y), batch_size=2)
+test_data_loader = DataLoader(TensorDataset(test_x, test_y), batch_size=1)
+model = nn.Sequential(
+    nn.Conv2d(in_channels=5, out_channels=3, kernel_size=3),
+    nn.Flatten(),
+    nn.Linear(27, 3),
+)
+loss = nn.MSELoss()
 
-6. For small input data call influence method on the fitted instance.
-
-   ```python
-   influences = infl_model.influences(test_x, test_y, train_x, train_y)
-   ```
-   The result is a tensor of shape `(training samples x test samples)`
-   that contains at index `(i, j)` the influence of training sample `i` on
-   test sample `j`.
+infl_model = DirectInfluence(model, loss, hessian_regularization=0.01)
+infl_model = infl_model.fit(train_data_loader)
 
-7. For larger data, wrap the model into a
-   calculator and call methods on the calculator.
-   ```python
-   infl_calc = SequentialInfluenceCalculator(infl_model)
-
-   # Lazy object providing arrays batch-wise in a sequential manner
-   lazy_influences = infl_calc.influences(test_data_loader, train_data_loader)
+# For small datasets, instantiate the full influence matrix:
+influences = infl_model.influences(test_x, test_y, train_x, train_y)
 
-   # Trigger computation and pull results to memory
-   influences = lazy_influences.compute(aggregator=NestedTorchCatAggregator())
+# For larger datasets, use the Influence calculators:
+infl_calc = SequentialInfluenceCalculator(infl_model)
 
-   # Trigger computation and write results batch-wise to disk
-   lazy_influences.to_zarr("influences_result", TorchNumpyConverter())
-   ```
-
+# Lazy object providing arrays batch-wise in a sequential manner
+lazy_influences = infl_calc.influences(test_data_loader, train_data_loader)
 
-   The higher the absolute value of the influence of a training sample
-   on a test sample, the more influential it is for the chosen test sample, model
-   and data loaders. The sign of the influence determines whether it is
-   useful (positive) or harmful (negative).
+# Trigger computation and pull results to memory
+influences = lazy_influences.compute(aggregator=NestedTorchCatAggregator())
 
-> **Note** pyDVL currently only support PyTorch for Influence Functions.
-> We are planning to add support for Jax and perhaps TensorFlow or even Keras.
+# Trigger computation and write results batch-wise to disk
+lazy_influences.to_zarr("influences_result", TorchNumpyConverter())
+```
 
 ## Data Valuation
 
 The steps required to compute data values for your samples are:
 
-1. Import the necessary packages (The exact packages depend on your specific use case).
-
-   ```python
-   import matplotlib.pyplot as plt
-   from sklearn.datasets import load_breast_cancer
-   from sklearn.linear_model import LogisticRegression
-   from pydvl.utils import Dataset, Scorer, Utility
-   from pydvl.value import (
-      compute_shapley_values,
-      ShapleyMode,
-      MaxUpdates,
-   )
-   ```
-
+1. Import the necessary packages (the exact ones will depend on your specific
+   use case).
 2. Create a `Dataset` object with your train and test splits.
-
-   ```python
-   data = Dataset.from_sklearn(
-       load_breast_cancer(),
-       train_size=10,
-       stratify_by_target=True,
-       random_state=16,
-   )
-   ```
-
 3. Create an instance of a `SupervisedModel` (basically any sklearn compatible
-   predictor).
-
-   ```python
-   model = LogisticRegression()
-   ```
-
-4. Create a `Utility` object to wrap the Dataset, the model and a scoring
-   function.
-
-   ```python
-   u = Utility(
-       model,
-       data,
-       Scorer("accuracy", default=0.0)
-   )
-   ```
-
-5. Use one of the methods defined in the library to compute the values.
-   In our example, we will use *Permutation Montecarlo Shapley*,
-   an approximate method for computing Data Shapley values.
-
-   ```python
-   values = compute_shapley_values(
-       u,
-       mode=ShapleyMode.PermutationMontecarlo,
-       done=MaxUpdates(100),
-       seed=16,
-       progress=True
-   )
-   ```
-   The result is a variable of type `ValuationResult` that contains
-   the indices and their values as well as other attributes.
-
-   The higher the value for an index, the more important it is for the chosen
-   model, dataset and scorer.
-
-6. (Optional) Convert the valuation result to a dataframe and analyze and visualize the values.
-
-   ```python
-   df = values.to_dataframe(column="data_value")
-   ```
+   predictor), and wrap it in a `Utility` object together with the data and a
+   scoring function.
+4. Use one of the methods defined in the library to compute the values. In the
+   example below, we will use *Permutation Montecarlo Shapley*, an approximate
+   method for computing Data Shapley values. The result is a variable of type
+   `ValuationResult` that contains the indices and their values as well as other
+   attributes.
+5. Convert the valuation result to a dataframe, and analyze and visualize the
+   values.
+
+The higher the value for an index, the more important it is for the chosen
+model, dataset and scorer. Conversely, low-value points could be mislabelled
+or out-of-distribution, and dropping them can improve the model's performance.
+
+```python
+from sklearn.datasets import load_breast_cancer
+from sklearn.linear_model import LogisticRegression
+
+from pydvl.utils import Dataset, Scorer, Utility
+from pydvl.value import (MaxUpdates, RelativeTruncation,
+                         permutation_montecarlo_shapley)
+
+data = Dataset.from_sklearn(
+    load_breast_cancer(),
+    train_size=10,
+    stratify_by_target=True,
+    random_state=16,
+)
+model = LogisticRegression()
+u = Utility(
+    model,
+    data,
+    Scorer("accuracy", default=0.0)
+)
+values = permutation_montecarlo_shapley(
+    u,
+    truncation=RelativeTruncation(u, 0.05),
+    done=MaxUpdates(1000),
+    seed=16,
+    progress=True
+)
+df = values.to_dataframe(column="data_value")
+```
 
 # Contributing
diff --git a/requirements-notebooks.txt b/requirements-notebooks.txt
index 04c7cf350..23c03b420 100644
--- a/requirements-notebooks.txt
+++ b/requirements-notebooks.txt
@@ -3,5 +3,5 @@ distributed==2023.4.0
 pillow==10.3.0
 torch==2.0.1
 torchvision==0.15.2
-transformers==4.36.0
+transformers==4.38.0
 zarr==2.16.1
diff --git a/setup.py b/setup.py
index c06657f1d..805a93f22 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@
     package_data={"pydvl": ["py.typed"]},
     packages=find_packages(where="src"),
     include_package_data=True,
-    version="0.9.0",
+    version="0.9.1",
     description="The Python Data Valuation Library",
     install_requires=[
         line
diff --git a/src/pydvl/__init__.py b/src/pydvl/__init__.py
index a0bef3141..ba22fa08d 100644
--- a/src/pydvl/__init__.py
+++ b/src/pydvl/__init__.py
@@ -7,4 +7,4 @@
 The two main modules you will want to look at are
 [value][pydvl.value] and [influence][pydvl.influence].
 """
-__version__ = "0.9.0"
+__version__ = "0.9.1"
diff --git a/src/pydvl/influence/array.py b/src/pydvl/influence/array.py
index a82b380b8..d549eee9d 100644
--- a/src/pydvl/influence/array.py
+++ b/src/pydvl/influence/array.py
@@ -7,6 +7,7 @@ using the Zarr library.
""" +import logging from abc import ABC, abstractmethod from typing import Callable, Generator, Generic, List, Optional, Tuple, Union @@ -14,6 +15,7 @@ from numpy.typing import NDArray from zarr.storage import StoreLike +from ..utils import log_duration from .base_influence_function_model import TensorType @@ -119,6 +121,7 @@ def __init__( ): self.generator_factory = generator_factory + @log_duration(log_level=logging.INFO) def compute(self, aggregator: Optional[SequenceAggregator] = None): """ Computes and optionally aggregates the chunks of the array using the provided @@ -139,6 +142,7 @@ def compute(self, aggregator: Optional[SequenceAggregator] = None): aggregator = ListAggregator() return aggregator(self.generator_factory()) + @log_duration(log_level=logging.INFO) def to_zarr( self, path_or_url: Union[str, StoreLike], @@ -223,6 +227,7 @@ def __init__( ): self.generator_factory = generator_factory + @log_duration(log_level=logging.INFO) def compute(self, aggregator: Optional[NestedSequenceAggregator] = None): """ Computes and optionally aggregates the chunks of the array using the provided @@ -244,6 +249,7 @@ def compute(self, aggregator: Optional[NestedSequenceAggregator] = None): aggregator = NestedListAggregator() return aggregator(self.generator_factory()) + @log_duration(log_level=logging.INFO) def to_zarr( self, path_or_url: Union[str, StoreLike], diff --git a/src/pydvl/influence/torch/influence_function_model.py b/src/pydvl/influence/torch/influence_function_model.py index f85c0d4f0..46a5fa16e 100644 --- a/src/pydvl/influence/torch/influence_function_model.py +++ b/src/pydvl/influence/torch/influence_function_model.py @@ -363,6 +363,7 @@ def is_fitted(self): except AttributeError: return False + @log_duration(log_level=logging.INFO) def fit(self, data: DataLoader) -> DirectInfluence: """ Compute the hessian matrix based on a provided dataloader. @@ -500,6 +501,7 @@ def is_fitted(self): except AttributeError: return False + @log_duration(log_level=logging.INFO) def fit(self, data: DataLoader) -> CgInfluence: self.train_dataloader = data if self.pre_conditioner is not None: @@ -816,6 +818,7 @@ def is_fitted(self): except AttributeError: return False + @log_duration(log_level=logging.INFO) def fit(self, data: DataLoader) -> LissaInfluence: self.train_dataloader = data return self @@ -948,6 +951,7 @@ def is_fitted(self): except AttributeError: return False + @log_duration(log_level=logging.INFO) def fit(self, data: DataLoader) -> ArnoldiInfluence: r""" Fitting corresponds to the computation of the low rank decomposition @@ -1204,6 +1208,7 @@ def _get_kfac_blocks( return forward_x, grad_y + @log_duration(log_level=logging.INFO) def fit(self, data: DataLoader) -> EkfacInfluence: """ Compute the KFAC blocks for each layer of the model, using the provided data. 
@@ -1712,6 +1717,7 @@ def is_fitted(self):
         except AttributeError:
             return False
 
+    @log_duration(log_level=logging.INFO)
     def fit(self, data: DataLoader):
         self.low_rank_representation = model_hessian_nystroem_approximation(
             self.model, self.loss, data, self.rank
diff --git a/src/pydvl/parallel/futures/__init__.py b/src/pydvl/parallel/futures/__init__.py
index c75d04299..0659137ca 100644
--- a/src/pydvl/parallel/futures/__init__.py
+++ b/src/pydvl/parallel/futures/__init__.py
@@ -21,7 +21,7 @@
 )
 def init_executor(
     max_workers: Optional[int] = None,
-    config: ParallelConfig = ParallelConfig(),
+    config: Optional[ParallelConfig] = None,
     **kwargs,
 ) -> Generator[Executor, None, None]:
     """Initializes a futures executor for the given parallel configuration.
@@ -50,6 +50,10 @@
         assert results == [1, 2, 3, 4, 5]
         ```
     """
+
+    if config is None:
+        config = ParallelConfig()
+
     try:
         cls = ParallelBackend.BACKENDS[config.backend]
         with cls.executor(max_workers=max_workers, config=config, **kwargs) as e:
diff --git a/src/pydvl/utils/progress.py b/src/pydvl/utils/progress.py
index 8a49f08aa..9ad931938 100644
--- a/src/pydvl/utils/progress.py
+++ b/src/pydvl/utils/progress.py
@@ -39,19 +39,33 @@ def repeat_indices(
         pbar.refresh()
 
 
-def log_duration(func):
+def log_duration(_func=None, *, log_level=logging.DEBUG):
     """
-    Decorator to log execution time of a function
+    Decorator to log execution time of a function with a configurable logging level.
+    It can be used with or without specifying a log level.
     """
 
-    @wraps(func)
-    def wrapper_log_duration(*args, **kwargs):
-        func_name = func.__qualname__
-        logger.info(f"Function '{func_name}' is starting.")
-        start_time = time()
-        result = func(*args, **kwargs)
-        duration = time() - start_time
-        logger.info(f"Function '{func_name}' completed. Duration: {duration:.2f} sec")
-        return result
-
-    return wrapper_log_duration
+    def decorator_log_duration(func):
+        @wraps(func)
+        def wrapper_log_duration(*args, **kwargs):
+            func_name = func.__qualname__
+            duration_logger = logging.getLogger(func_name)
+            duration_logger.setLevel(log_level)
+            duration_logger.log(log_level, f"Function '{func_name}' is starting.")
+            start_time = time()
+            result = func(*args, **kwargs)
+            duration = time() - start_time
+            duration_logger.log(
+                log_level,
+                f"Function '{func_name}' completed. " f"Duration: {duration:.2f} sec",
+            )
+            return result
+
+        return wrapper_log_duration
+
+    if _func is None:
+        # If log_duration was called without arguments, return decorator
+        return decorator_log_duration
+    else:
+        # If log_duration was called with a function, apply decorator directly
+        return decorator_log_duration(_func)
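Two of the library changes above deserve a usage note. The reworked `log_duration` keeps the bare decorator form working while allowing an explicit level, which is how the `fit`, `compute` and `to_zarr` methods now log at `INFO` while the default remains `DEBUG`. A minimal sketch (the toy functions and the logging setup are illustrative, not part of the patch):

```python
import logging
import time

from pydvl.utils.progress import log_duration

logging.basicConfig(level=logging.DEBUG)


@log_duration  # bare form: logs at the default DEBUG level
def slow_step():
    time.sleep(0.1)


@log_duration(log_level=logging.INFO)  # parametrized form, as used in the patch
def slower_step():
    time.sleep(0.2)


slow_step()    # logs "Function 'slow_step' is starting." and then the duration
slower_step()  # emits the same two messages, at INFO
```

The `init_executor` change is the fix announced in the changelog: a default of `config: ParallelConfig = ParallelConfig()` is evaluated once, when the function is defined at import time, so the `FutureWarning` that `ParallelConfig` emits when instantiated fired for anyone importing `pydvl.parallel.futures`, whether or not they ever passed a config. Defaulting to `None` and constructing the object inside the body defers the warning until the deprecated default is actually used. A self-contained sketch of the pattern, with a hypothetical stand-in class rather than pyDVL's real one:

```python
import warnings


class Config:  # stand-in for the deprecated ParallelConfig
    def __init__(self):
        warnings.warn("Config is deprecated", FutureWarning)


def run_old(config=Config()):  # the default runs here, at import: warning fires
    ...


def run_new(config=None):  # nothing fires until the first call without a config
    if config is None:
        config = Config()
```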