diff --git a/docs/assets/pydvl.bib b/docs/assets/pydvl.bib
index 724e75f20..ed4dc30d3 100644
--- a/docs/assets/pydvl.bib
+++ b/docs/assets/pydvl.bib
@@ -122,7 +122,8 @@ @inproceedings{george_fast_2018
   publisher = {Curran Associates, Inc.},
   url = {https://proceedings.neurips.cc/paper/2018/hash/48000647b315f6f00f913caa757a70b3-Abstract.html},
   urldate = {2024-01-12},
-  abstract = {Optimization algorithms that leverage gradient covariance information, such as variants of natural gradient descent (Amari, 1998), offer the prospect of yielding more effective descent directions. For models with many parameters, the covari- ance matrix they are based on becomes gigantic, making them inapplicable in their original form. This has motivated research into both simple diagonal approxima- tions and more sophisticated factored approximations such as KFAC (Heskes, 2000; Martens \& Grosse, 2015; Grosse \& Martens, 2016). In the present work we draw inspiration from both to propose a novel approximation that is provably better than KFAC and amendable to cheap partial updates. It consists in tracking a diagonal variance, not in parameter coordinates, but in a Kronecker-factored eigenbasis, in which the diagonal approximation is likely to be more effective. Experiments show improvements over KFAC in optimization speed for several deep network architectures.}
+  abstract = {Optimization algorithms that leverage gradient covariance information, such as variants of natural gradient descent (Amari, 1998), offer the prospect of yielding more effective descent directions. For models with many parameters, the covari- ance matrix they are based on becomes gigantic, making them inapplicable in their original form. This has motivated research into both simple diagonal approxima- tions and more sophisticated factored approximations such as KFAC (Heskes, 2000; Martens \& Grosse, 2015; Grosse \& Martens, 2016). In the present work we draw inspiration from both to propose a novel approximation that is provably better than KFAC and amendable to cheap partial updates. It consists in tracking a diagonal variance, not in parameter coordinates, but in a Kronecker-factored eigenbasis, in which the diagonal approximation is likely to be more effective. Experiments show improvements over KFAC in optimization speed for several deep network architectures.},
+  keywords = {notion}
 }
 
 @inproceedings{ghorbani_data_2019,
@@ -175,7 +176,8 @@ @inproceedings{hataya_nystrom_2023
   urldate = {2024-02-26},
   abstract = {The essential difficulty of gradient-based bilevel optimization using implicit differentiation is to estimate the inverse Hessian vector product with respect to neural network parameters. This paper proposes to tackle this problem by the Nyström method and the Woodbury matrix identity, exploiting the low-rankness of the Hessian. Compared to existing methods using iterative approximation, such as conjugate gradient and the Neumann series approximation, the proposed method avoids numerical instability and can be efficiently computed in matrix operations without iterations. As a result, the proposed method works stably in various tasks and is faster than iterative approximations. Throughout experiments including large-scale hyperparameter optimization and meta learning, we demonstrate that the Nyström method consistently achieves comparable or even superior performance to other approaches. The source code is available from https://github.com/moskomule/hypergrad.},
   eventtitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}}},
-  langid = {english}
+  langid = {english},
+  keywords = {notion}
 }
 
 @article{ji_breakdownfree_2017,
@@ -292,6 +294,18 @@ @inproceedings{kwon_beta_2022
   keywords = {notion}
 }
 
+@inproceedings{kwon_datainf_2023,
+  title = {{{DataInf}}: {{Efficiently Estimating Data Influence}} in {{LoRA-tuned LLMs}} and {{Diffusion Models}}},
+  shorttitle = {{{DataInf}}},
+  author = {Kwon, Yongchan and Wu, Eric and Wu, Kevin and Zou, James},
+  date = {2023-10-13},
+  doi = {10.48550/arXiv.2310.00902},
+  url = {https://openreview.net/forum?id=9m02ib92Wz},
+  urldate = {2023-10-27},
+  abstract = {Quantifying the impact of training data points is crucial for understanding the outputs of machine learning models and for improving the transparency of the AI pipeline. The influence function is a principled and popular data attribution method, but its computational cost often makes it challenging to use. This issue becomes more pronounced in the setting of large language models and text-to-image models. In this work, we propose DataInf, an efficient influence approximation method that is practical for large-scale generative AI models. Leveraging an easy-to-compute closed-form expression, DataInf outperforms existing influence computation algorithms in terms of computational and memory efficiency. Our theoretical analysis shows that DataInf is particularly well-suited for parameter-efficient fine-tuning techniques such as LoRA. Through systematic empirical evaluations, we show that DataInf accurately approximates influence scores and is orders of magnitude faster than existing methods. In applications to RoBERTa-large, Llama-2-13B-chat, and stable-diffusion-v1.5 models, DataInf effectively identifies the most influential fine-tuning examples better than other approximate influence scores. Moreover, it can help to identify which data points are mislabeled.},
+  eventtitle = {The {{Twelfth International Conference}} on {{Learning Representations}}}
+}
+
 @inproceedings{kwon_dataoob_2023,
   title = {Data-{{OOB}}: {{Out-of-bag Estimate}} as a {{Simple}} and {{Efficient Data Value}}},
   shorttitle = {Data-{{OOB}}},
@@ -303,7 +317,7 @@ @inproceedings{kwon_dataoob_2023
   issn = {2640-3498},
   url = {https://proceedings.mlr.press/v202/kwon23e.html},
   urldate = {2023-09-06},
-  abstract = {Data valuation is a powerful framework for providing statistical insights into which data are beneficial or detrimental to model training. Many Shapley-based data valuation methods have shown promising results in various downstream tasks, however, they are well known to be computationally challenging as it requires training a large number of models. As a result, it has been recognized as infeasible to apply to large datasets. To address this issue, we propose Data-OOB, a new data valuation method for a bagging model that utilizes the out-of-bag estimate. The proposed method is computationally efficient and can scale to millions of data by reusing trained weak learners. Specifically, Data-OOB takes less than 2.25 hours on a single CPU processor when there are \$10\^{}6\$ samples to evaluate and the input dimension is 100. Furthermore, Data-OOB has solid theoretical interpretations in that it identifies the same important data point as the infinitesimal jackknife influence function when two different points are compared. We conduct comprehensive experiments using 12 classification datasets, each with thousands of sample sizes. We demonstrate that the proposed method significantly outperforms existing state-of-the-art data valuation methods in identifying mislabeled data and finding a set of helpful (or harmful) data points, highlighting the potential for applying data values in real-world applications.},
+  abstract = {Data valuation is a powerful framework for providing statistical insights into which data are beneficial or detrimental to model training. Many Shapley-based data valuation methods have shown promising results in various downstream tasks, however, they are well known to be computationally challenging as it requires training a large number of models. As a result, it has been recognized as infeasible to apply to large datasets. To address this issue, we propose Data-OOB, a new data valuation method for a bagging model that utilizes the out-of-bag estimate. The proposed method is computationally efficient and can scale to millions of data by reusing trained weak learners. Specifically, Data-OOB takes less than 2.25 hours on a single CPU processor when there are \$10\textasciicircum 6\$ samples to evaluate and the input dimension is 100. Furthermore, Data-OOB has solid theoretical interpretations in that it identifies the same important data point as the infinitesimal jackknife influence function when two different points are compared. We conduct comprehensive experiments using 12 classification datasets, each with thousands of sample sizes. We demonstrate that the proposed method significantly outperforms existing state-of-the-art data valuation methods in identifying mislabeled data and finding a set of helpful (or harmful) data points, highlighting the potential for applying data values in real-world applications.},
   eventtitle = {International {{Conference}} on {{Machine Learning}}},
   langid = {english},
   keywords = {notion}