Merge pull request #582 from aai-institute/feature/499-datainf
Feature/499 datainf
schroedk authored Jun 6, 2024
2 parents 17b1ee5 + 5b34406 commit 347b55a
Showing 21 changed files with 3,669 additions and 46 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,18 @@
# Changelog

## Unreleased

### Added

- New method `InverseHarmonicMeanInfluence`, an implementation of the paper
`DataInf: Efficiently Estimating Data Influence in LoRA-tuned LLMs and
Diffusion Models`
[PR #582](https://github.com/aai-institute/pyDVL/pull/582)
- Add new backend implementations for influence computation
that support block-diagonal approximations
[PR #582](https://github.com/aai-institute/pyDVL/pull/582)


## 0.9.2 - 🏗 Bug fixes, logging improvement

### Added
20 changes: 17 additions & 3 deletions docs/assets/pydvl.bib
@@ -122,7 +122,8 @@ @inproceedings{george_fast_2018
publisher = {Curran Associates, Inc.},
url = {https://proceedings.neurips.cc/paper/2018/hash/48000647b315f6f00f913caa757a70b3-Abstract.html},
urldate = {2024-01-12},
abstract = {Optimization algorithms that leverage gradient covariance information, such as variants of natural gradient descent (Amari, 1998), offer the prospect of yielding more effective descent directions. For models with many parameters, the covariance matrix they are based on becomes gigantic, making them inapplicable in their original form. This has motivated research into both simple diagonal approximations and more sophisticated factored approximations such as KFAC (Heskes, 2000; Martens \& Grosse, 2015; Grosse \& Martens, 2016). In the present work we draw inspiration from both to propose a novel approximation that is provably better than KFAC and amenable to cheap partial updates. It consists in tracking a diagonal variance, not in parameter coordinates, but in a Kronecker-factored eigenbasis, in which the diagonal approximation is likely to be more effective. Experiments show improvements over KFAC in optimization speed for several deep network architectures.}
abstract = {Optimization algorithms that leverage gradient covariance information, such as variants of natural gradient descent (Amari, 1998), offer the prospect of yielding more effective descent directions. For models with many parameters, the covariance matrix they are based on becomes gigantic, making them inapplicable in their original form. This has motivated research into both simple diagonal approximations and more sophisticated factored approximations such as KFAC (Heskes, 2000; Martens \& Grosse, 2015; Grosse \& Martens, 2016). In the present work we draw inspiration from both to propose a novel approximation that is provably better than KFAC and amenable to cheap partial updates. It consists in tracking a diagonal variance, not in parameter coordinates, but in a Kronecker-factored eigenbasis, in which the diagonal approximation is likely to be more effective. Experiments show improvements over KFAC in optimization speed for several deep network architectures.},
keywords = {notion}
}

@inproceedings{ghorbani_data_2019,
@@ -175,7 +176,8 @@ @inproceedings{hataya_nystrom_2023
urldate = {2024-02-26},
abstract = {The essential difficulty of gradient-based bilevel optimization using implicit differentiation is to estimate the inverse Hessian vector product with respect to neural network parameters. This paper proposes to tackle this problem by the Nyström method and the Woodbury matrix identity, exploiting the low-rankness of the Hessian. Compared to existing methods using iterative approximation, such as conjugate gradient and the Neumann series approximation, the proposed method avoids numerical instability and can be efficiently computed in matrix operations without iterations. As a result, the proposed method works stably in various tasks and is faster than iterative approximations. Throughout experiments including large-scale hyperparameter optimization and meta learning, we demonstrate that the Nyström method consistently achieves comparable or even superior performance to other approaches. The source code is available from https://github.com/moskomule/hypergrad.},
eventtitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}}},
langid = {english}
langid = {english},
keywords = {notion}
}

@article{ji_breakdownfree_2017,
@@ -292,6 +294,18 @@ @inproceedings{kwon_beta_2022
keywords = {notion}
}

@inproceedings{kwon_datainf_2023,
title = {{{DataInf}}: {{Efficiently Estimating Data Influence}} in {{LoRA-tuned LLMs}} and {{Diffusion Models}}},
shorttitle = {{{DataInf}}},
author = {Kwon, Yongchan and Wu, Eric and Wu, Kevin and Zou, James},
date = {2023-10-13},
doi = {10.48550/arXiv.2310.00902},
url = {https://openreview.net/forum?id=9m02ib92Wz},
urldate = {2023-10-27},
abstract = {Quantifying the impact of training data points is crucial for understanding the outputs of machine learning models and for improving the transparency of the AI pipeline. The influence function is a principled and popular data attribution method, but its computational cost often makes it challenging to use. This issue becomes more pronounced in the setting of large language models and text-to-image models. In this work, we propose DataInf, an efficient influence approximation method that is practical for large-scale generative AI models. Leveraging an easy-to-compute closed-form expression, DataInf outperforms existing influence computation algorithms in terms of computational and memory efficiency. Our theoretical analysis shows that DataInf is particularly well-suited for parameter-efficient fine-tuning techniques such as LoRA. Through systematic empirical evaluations, we show that DataInf accurately approximates influence scores and is orders of magnitude faster than existing methods. In applications to RoBERTa-large, Llama-2-13B-chat, and stable-diffusion-v1.5 models, DataInf effectively identifies the most influential fine-tuning examples better than other approximate influence scores. Moreover, it can help to identify which data points are mislabeled.},
eventtitle = {The {{Twelfth International Conference}} on {{Learning Representations}}}
}

@inproceedings{kwon_dataoob_2023,
title = {Data-{{OOB}}: {{Out-of-bag Estimate}} as a {{Simple}} and {{Efficient Data Value}}},
shorttitle = {Data-{{OOB}}},
@@ -303,7 +317,7 @@ @inproceedings{kwon_dataoob_2023
issn = {2640-3498},
url = {https://proceedings.mlr.press/v202/kwon23e.html},
urldate = {2023-09-06},
abstract = {Data valuation is a powerful framework for providing statistical insights into which data are beneficial or detrimental to model training. Many Shapley-based data valuation methods have shown promising results in various downstream tasks, however, they are well known to be computationally challenging as it requires training a large number of models. As a result, it has been recognized as infeasible to apply to large datasets. To address this issue, we propose Data-OOB, a new data valuation method for a bagging model that utilizes the out-of-bag estimate. The proposed method is computationally efficient and can scale to millions of data by reusing trained weak learners. Specifically, Data-OOB takes less than 2.25 hours on a single CPU processor when there are \$10\^{}6\$ samples to evaluate and the input dimension is 100. Furthermore, Data-OOB has solid theoretical interpretations in that it identifies the same important data point as the infinitesimal jackknife influence function when two different points are compared. We conduct comprehensive experiments using 12 classification datasets, each with thousands of sample sizes. We demonstrate that the proposed method significantly outperforms existing state-of-the-art data valuation methods in identifying mislabeled data and finding a set of helpful (or harmful) data points, highlighting the potential for applying data values in real-world applications.},
abstract = {Data valuation is a powerful framework for providing statistical insights into which data are beneficial or detrimental to model training. Many Shapley-based data valuation methods have shown promising results in various downstream tasks, however, they are well known to be computationally challenging as it requires training a large number of models. As a result, it has been recognized as infeasible to apply to large datasets. To address this issue, we propose Data-OOB, a new data valuation method for a bagging model that utilizes the out-of-bag estimate. The proposed method is computationally efficient and can scale to millions of data by reusing trained weak learners. Specifically, Data-OOB takes less than 2.25 hours on a single CPU processor when there are \$10\textasciicircum 6\$ samples to evaluate and the input dimension is 100. Furthermore, Data-OOB has solid theoretical interpretations in that it identifies the same important data point as the infinitesimal jackknife influence function when two different points are compared. We conduct comprehensive experiments using 12 classification datasets, each with thousands of sample sizes. We demonstrate that the proposed method significantly outperforms existing state-of-the-art data valuation methods in identifying mislabeled data and finding a set of helpful (or harmful) data points, highlighting the potential for applying data values in real-world applications.},
eventtitle = {International {{Conference}} on {{Machine Learning}}},
langid = {english},
keywords = {notion}
125 changes: 125 additions & 0 deletions docs/influence/influence_function_model.md
@@ -207,7 +207,132 @@ if_model = NystroemSketchInfluence(
if_model.fit(train_loader)
```

### Inverse Harmonic Mean

This implementation, proposed in [@kwon_datainf_2023], replaces the inverse
Hessian matrix in the influence computation with an approximation of the
inverse Gauss-Newton vector product.

The approximation method comprises
the following steps:

1. Replace the Hessian $H(\theta)$ with the Gauss-Newton matrix
$G(\theta)$:

\begin{equation*}
G(\theta)=n^{-1} \sum_{i=1}^n \nabla_{\theta}\ell_i\nabla_{\theta}\ell_i^T
\end{equation*}

which results in

\begin{equation*}
\mathcal{I}(z_{t}, z) \approx \nabla_{\theta} \ell(z_{t}, \theta)^T
(G(\theta) + \lambda I_d)^{-1}
\nabla_{\theta} \ell(z, \theta)
\end{equation*}

2. Simplify the problem by imposing a block-diagonal structure, where each
   block $G_l(\theta)$ is built from the gradients with respect to the
   parameters $\theta_l$ of the $l$-th parameter group (e.g. a layer):

\begin{equation*}
G_{l}(\theta) = n^{-1} \sum_{i=1}^n \nabla_{\theta_l} \ell_i
\nabla_{\theta_l} \ell_i^{T} + \lambda_l I_{d_l},
\end{equation*}

which leads to

\begin{equation*}
\mathcal{I}(z_{t}, z) \approx \nabla_{\theta} \ell(z_{t}, \theta)^T
\operatorname{diag}(G_1(\theta)^{-1},
\dots, G_L(\theta)^{-1})
\nabla_{\theta} \ell(z, \theta)
\end{equation*}

3. Substitute $G_l(\theta)^{-1}$, the inverse of the arithmetic mean of the
   regularized rank-$1$ updates, with the inverse harmonic mean $R_l(\theta)$
   of those updates, i.e. the arithmetic mean of their inverses:

\begin{align*}
G_l(\theta)^{-1} &= \left( n^{-1} \sum_{i=1}^n \nabla_{\theta_l}
\ell(z_i, \theta) \nabla_{\theta_l}
\ell(z_i, \theta)^{T} +
\lambda_l I_{d_l}\right)^{-1} \\\
R_{l}(\theta)&= n^{-1} \sum_{i=1}^n \left( \nabla_{\theta_l}
\ell(z_i, \theta) \nabla_{\theta_l} \ell(z_i, \theta)^{T}
+ \lambda_l I_{d_l} \right)^{-1}
\end{align*}

4. Use the
<a href="https://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula">
Sherman–Morrison formula
</a>
to get an explicit representation of the inverses in the definition of
$R_l(\theta):$

\begin{align*}
R_l(\theta) &= n^{-1} \sum_{i=1}^n \left( \nabla_{\theta_l} \ell_i
\nabla_{\theta_l} \ell_i^{T}
+ \lambda_l I_{d_l}\right)^{-1} \\\
&= n^{-1} \sum_{i=1}^n \lambda_l^{-1} \left(I_{d_l}
- \frac{\nabla_{\theta_l} \ell_i \nabla_{\theta_l}
\ell_i^{T}}{\lambda_l
+ \\|\nabla_{\theta_l} \ell_i\\|_2^2}\right)
,
\end{align*}

which means that applying $R_l(\theta)$ to a vector boils down to $n$
rank-$1$ updates of that vector (see the sketch below).
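
For a single block, applying $R_l(\theta)$ to a query gradient therefore only
requires the per-sample gradients of that block. The following is a minimal
sketch of this rank-$1$ scheme for one block; it illustrates the formula
above and is not pyDVL's internal implementation. The per-sample gradients
`grads` are assumed to be precomputed.

```python
import torch


def apply_inverse_harmonic_mean(
    grads: torch.Tensor, v: torch.Tensor, reg: float
) -> torch.Tensor:
    """Apply R_l(theta) to a vector v for a single parameter block.

    grads: per-sample gradients of the block, shape (n, d_l).
    v: query gradient for the block, shape (d_l,).
    reg: the regularization parameter lambda_l (must be positive).

    Uses the Sherman-Morrison identity, so no d_l x d_l matrix is formed.
    """
    dots = grads @ v                         # <grad_i, v>, shape (n,)
    denom = reg + (grads**2).sum(dim=1)      # lambda_l + ||grad_i||^2, shape (n,)
    # Mean over the n rank-1 corrections, then scale by 1/lambda_l.
    correction = (grads * (dots / denom).unsqueeze(1)).mean(dim=0)
    return (v - correction) / reg


# Toy usage with random per-sample gradients for one block.
n, d_l = 128, 64
grads = torch.randn(n, d_l)   # rows play the role of the gradients of ell_i
v = torch.randn(d_l)          # gradient of the query point for this block
ihvp = apply_inverse_harmonic_mean(grads, v, reg=0.1)
```

pyDVL packages this approximation in `InverseHarmonicMeanInfluence`: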

```python
from pydvl.influence.torch import InverseHarmonicMeanInfluence, BlockMode

if_model = InverseHarmonicMeanInfluence(
model,
loss,
regularization=1e-1,
block_structure=BlockMode.LAYER_WISE
)
if_model.fit(train_loader)
```
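
Once fitted, the model is queried like any other pyDVL influence function
model. A short sketch using that generic interface, where the test and
training batches `x_test`, `y_test`, `x_train`, `y_train` are placeholder
tensors:

```python
# Pairwise influences of the training points on the test points,
# a tensor of shape (n_test, n_train).
scores = if_model.influences(x_test, y_test, x_train, y_train)

# Alternatively, split the computation into factors and contraction.
factors = if_model.influence_factors(x_test, y_test)
scores = if_model.influences_from_factors(factors, x_train, y_train)
```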

!!! Info
This implementation supports a block-matrix approximation. The blocking
structure is specified via the `block_structure` parameter, which is either a
[BlockMode][pydvl.influence.torch.util.BlockMode] enum (providing layer-wise
or parameter-wise blocking) or a custom block structure given as an ordered
dictionary, whose keys are block identifiers (arbitrary strings) and whose
values are lists of the parameter names contained in each block.
```python
from collections import OrderedDict

block_structure = OrderedDict(
    (
        ("custom_block1", ["0.weight", "1.bias"]),
        ("custom_block2", ["1.weight", "0.bias"]),
    )
)
```
If you would like to apply a block-specific regularization, you can provide a
dictionary with the block names as keys and the regularization values as values.
In this case, the specification must be complete, i.e. every block must have
a positive regularization value.
```python
regularization = {
"custom_block1": 0.1,
"custom_block2": 0.2,
}
```
Accordingly, if you choose a layer-wise or parameter-wise structure
(by providing `BlockMode.LAYER_WISE` or `BlockMode.PARAMETER_WISE` for
`block_structure`), the keys must be the layer names or parameter names,
respectively.
You can retrieve block-wise influence information from the methods with the
suffix `_by_block`. By default, `block_structure` is set to `BlockMode.FULL`,
in which case these methods return a dictionary with the empty string as its
only key.
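
Putting these pieces together, a fit with a custom block structure and
block-specific regularization might look as follows. This is a sketch:
`influences_by_block` stands in for any of the `_by_block` methods mentioned
above, and `model`, `loss`, `train_loader` and the data batches are
placeholders.

```python
from collections import OrderedDict

from pydvl.influence.torch import InverseHarmonicMeanInfluence

block_structure = OrderedDict(
    (
        ("custom_block1", ["0.weight", "1.bias"]),
        ("custom_block2", ["1.weight", "0.bias"]),
    )
)
regularization = {"custom_block1": 0.1, "custom_block2": 0.2}

if_model = InverseHarmonicMeanInfluence(
    model,
    loss,
    regularization=regularization,
    block_structure=block_structure,
)
if_model.fit(train_loader)

# Dictionary mapping each block name to its influence tensor.
block_scores = if_model.influences_by_block(x_test, y_test, x_train, y_train)
```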

These implementations express the computation on in-memory tensors. To scale
up to large collections of data, we map these influence function models over
such collections. For a detailed discussion, see the documentation page
[Scaling Computation](scaling_computation.md).


3 changes: 1 addition & 2 deletions docs/influence/scaling_computation.md
@@ -24,8 +24,7 @@ into memory.
```python
from pydvl.influence import SequentialInfluenceCalculator
from pydvl.influence.torch.util import (
NestedTorchCatAggregator,
TorchNumpyConverter,
TorchNumpyConverter, NestedTorchCatAggregator,
)
from pydvl.influence.torch import CgInfluence

2 changes: 1 addition & 1 deletion src/pydvl/influence/__init__.py
Expand Up @@ -10,9 +10,9 @@
probably change.
"""
from .base_influence_function_model import InfluenceMode
from .influence_calculator import (
DaskInfluenceCalculator,
DisableClientSingleThreadCheck,
SequentialInfluenceCalculator,
)
from .types import InfluenceMode
2 changes: 1 addition & 1 deletion src/pydvl/influence/array.py
Expand Up @@ -28,7 +28,7 @@
from zarr.storage import StoreLike

from ..utils import log_duration
from .base_influence_function_model import TensorType
from .types import TensorType


class NumpyConverter(Generic[TensorType], ABC):
