Skip to content

Commit

Permalink
Fix dependency table loading from cache (#417)
Browse files Browse the repository at this point in the history
* Fix dependency table cache for pandas versions

* Fix import sort order

* Improve text
  • Loading branch information
hagenw authored Jun 4, 2024
1 parent 13c3019 commit fc4afca
Show file tree
Hide file tree
Showing 9 changed files with 142 additions and 1 deletion.
2 changes: 1 addition & 1 deletion audb/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def dependencies(
try:
deps = Dependencies()
deps.load(cached_deps_file)
except (AttributeError, EOFError, FileNotFoundError, KeyError, ValueError):
except Exception: # does not catch KeyboardInterrupt
# If loading cached file fails, load again from backend
backend_interface = utils.lookup_backend(name, version)
deps = download_dependencies(backend_interface, name, version, verbose)
Expand Down
2 changes: 2 additions & 0 deletions tests/assests/dependency-table-pandas/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
venv/
db.csv
29 changes: 29 additions & 0 deletions tests/assests/dependency-table-pandas/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Dependency table pandas compatibility

Since version 1.7.0 of `audb`,
we use `pyarrow` dtypes
inside the dependency table
(`audb.Dependencies._df`).
The dependency table
is still stored in cache
as a pickle file.
When loading the pickle file
with a different `pandas` version
than the one used to store the file,
an error related to the `pyarrow` dtypes
might be raised.

To test this,
we store an example dependency table
from the `emodb` dataset
as pickle file
using different `pandas` versions
as test assets.

The pickle files,
stored in this folder,
were created by running:

```bash
$ bash store_dependency_tables.sh
```
Binary file not shown.
Binary file not shown.
Binary file not shown.
41 changes: 41 additions & 0 deletions tests/assests/dependency-table-pandas/store_dependency_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import argparse

import audb


def main(pandas_version):
    """Create a pickled emodb dependency table for one ``pandas`` version.

    The ``db.zip`` archive of emodb version 1.4.1
    is fetched from the public repository
    into the current directory.
    ``db.csv`` is then loaded as dependency table
    and saved again as ``emodb-pandas-<pandas_version>.pkl``.

    Args:
        pandas_version: version of installed ``pandas`` package

    """
    # Version 1.4.1 of emodb is requested,
    # as its dependency table is still stored as CSV file
    interface = audb.Repository(
        "data-public",
        "https://audeering.jfrog.io/artifactory",
        "artifactory",
    ).create_backend_interface()
    archive = interface.join("/", "emodb", "db.zip")
    with interface.backend:
        interface.get_archive(archive, ".", "1.4.1", verbose=False)

    table = audb.Dependencies()
    table.load("db.csv")
    table.save(f"emodb-pandas-{pandas_version}.pkl")


if __name__ == "__main__":
    # Usage:
    #
    #     $ python store_dependency_table.py 2.2.2
    #
    # The single positional argument (here 2.2.2)
    # is the version of the installed pandas package.
    cli = argparse.ArgumentParser()
    cli.add_argument("pandas_version")
    main(cli.parse_args().pandas_version)
17 changes: 17 additions & 0 deletions tests/assests/dependency-table-pandas/store_dependency_tables.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
#
# Store the dependency table of emodb
# as pickle files
# for different versions of pandas
# to test compatibility.
#
# For every pandas version
# a fresh virtual environment is created in ./venv,
# audb and the requested pandas version are installed,
# and store_dependency_table.py is executed inside of it.

# Abort on the first failing command.
# Otherwise a failed environment setup or installation
# would still store a pickle file,
# created with a pandas version
# different from the one given in its file name.
set -e

audb_version="1.7.2"
python_version="3.10"
for pandas_version in 2.0.3 2.1.4 2.2.2; do
    rm -rf venv
    virtualenv -p "python${python_version}" venv
    source venv/bin/activate
    pip install "audb==${audb_version}"
    pip install "pandas==${pandas_version}"
    python store_dependency_table.py "${pandas_version}"
    deactivate
done
52 changes: 52 additions & 0 deletions tests/test_dependencies.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import re

import pandas as pd
Expand All @@ -8,6 +9,7 @@
import audb


# Directory containing this test file (symlinks resolved),
# used as root when building paths to test assets
CURRENT_DIR = os.path.dirname(os.path.realpath(__file__))
ROWS = [
{
"file": "db.files.csv",
Expand Down Expand Up @@ -326,6 +328,56 @@ def test_load_save_backward_compatibility(tmpdir, deps):
assert deps == deps2


def _release_tuple(version):
    """Convert a version string to a tuple of three integers.

    Only the leading numeric release part is considered,
    e.g. ``"2.1.0rc0"`` results in ``(2, 1, 0)``.
    Missing parts are filled with zeros,
    e.g. ``"2.1"`` results in ``(2, 1, 0)``.

    Args:
        version: version string starting with a digit

    Returns:
        tuple containing major, minor, and patch version

    """
    release = re.match(r"\d+(?:\.\d+)*", version).group()
    parts = [int(part) for part in release.split(".")][:3]
    return tuple(parts + [0] * (3 - len(parts)))


@pytest.mark.parametrize("pandas_version", ["2.0.3", "2.1.4", "2.2.2"])
def test_load_save_pandas_compatibility(pandas_version):
    """Test pandas backward compatibility of pickle cache files.

    Dataframes using pyarrow dtypes,
    and stored as pickle files
    might fail to load
    if the used ``pandas`` version
    does not match.
    Test which ``pandas`` versions raise errors
    when loading deps from pickle files.
    We have to catch those errors inside
    ``audb.dependencies()``.

    See Also:
        https://github.com/audeering/audb/issues/418

    Args:
        pandas_version: the version of ``pandas``
            used to store the dependency table in cache

    """
    deps_file = audeer.path(
        CURRENT_DIR,
        "assests",
        "dependency-table-pandas",
        f"emodb-pandas-{pandas_version}.pkl",
    )
    deps = audb.Dependencies()

    # Compare versions as integer tuples,
    # as comparing version strings is lexicographic
    # and gives wrong results,
    # e.g. "10.0.0" < "2.1.0"
    installed = _release_tuple(pd.__version__)
    cached = _release_tuple(pandas_version)

    # Dependency table cached with pandas==2.0.3.
    # Loading with pandas>=2.1.0 leads to a ModuleNotFoundError
    if installed >= (2, 1, 0) and cached == (2, 0, 3):
        error_msg = "No module named 'pandas.core.arrays.arrow.dtype'"
        with pytest.raises(ModuleNotFoundError, match=error_msg):
            deps.load(deps_file)

    # Dependency table cached with pandas>=2.1.4.
    # Loading with pandas==2.0.3 leads to a KeyError
    elif installed == (2, 0, 3) and cached >= (2, 1, 4):
        error_msg = "'_data'"
        with pytest.raises(KeyError, match=error_msg):
            deps.load(deps_file)

    else:
        deps.load(deps_file)
        assert deps._df.index.dtype == audb.core.define.DEPEND_INDEX_DTYPE


def test_load_save_errors(deps):
"""Test possible errors when loading/saving."""
# Wrong file extension
Expand Down

0 comments on commit fc4afca

Please sign in to comment.