diff --git a/audformat/core/table.py b/audformat/core/table.py
index 9ebf4d90..4848d317 100644
--- a/audformat/core/table.py
+++ b/audformat/core/table.py
@@ -1,7 +1,6 @@
 from __future__ import annotations  # allow typing without string

 import copy
-import hashlib
 import os
 import pickle
 import typing
@@ -1198,13 +1197,11 @@ def _save_parquet(self, path: str):
         table = pa.Table.from_pandas(self.df.reset_index(), preserve_index=False)

         # Create hash of table
-        table_hash = hashlib.md5()
-        table_hash.update(_schema_hash(table))
-        table_hash.update(_dataframe_hash(self.df))
+        table_hash = utils.hash(self.df, include_order_and_names=True)

         # Store in metadata of file,
         # see https://stackoverflow.com/a/58978449
-        metadata = {"hash": table_hash.hexdigest()}
+        metadata = {"hash": table_hash}

         table = table.replace_schema_metadata({**metadata, **table.schema.metadata})
         parquet.write_table(table, path, compression="snappy")
@@ -1905,40 +1902,6 @@ def _assert_table_index(
         )


-def _dataframe_hash(df: pd.DataFrame) -> bytes:
-    """Hash a dataframe.
-
-    The hash value takes into account:
-
-    * index of dataframe
-    * values of the dataframe
-    * order of dataframe rows
-
-    It does not consider:
-
-    * column names of dataframe
-    * dtypes of dataframe
-
-    Args:
-        df: dataframe
-
-    Returns:
-        MD5 hash in bytes
-
-    """
-    md5 = hashlib.md5()
-    for _, y in df.reset_index().items():
-        # Convert every column to a numpy array,
-        # and hash its string representation
-        if y.dtype == "Int64":
-            # Enforce consistent conversion to numpy.array
-            # for integers across different pandas versions
-            # (since pandas 2.2.x, Int64 is converted to float if it contains <NA>)
-            y = y.astype("float")
-        md5.update(bytes(str(y.to_numpy()), "utf-8"))
-    return md5.digest()
-
-
 def _maybe_convert_dtype_to_string(
     index: pd.Index,
 ) -> pd.Index:
@@ -1961,23 +1924,3 @@ def _maybe_update_scheme(
     for scheme in table.db.schemes.values():
         if table._id == scheme.labels:
             scheme.replace_labels(table._id)
-
-
-def _schema_hash(table: pa.Table) -> bytes:
-    r"""Hash pyarrow table schema.
-
-    Args:
-        table: pyarrow table
-
-    Returns:
-        MD5 hash in bytes
-
-    """
-    schema_str = table.schema.to_string(
-        # schema.metadata contains pandas related information,
-        # and the used pyarrow and pandas version,
-        # and needs to be excluded
-        show_field_metadata=False,
-        show_schema_metadata=False,
-    )
-    return hashlib.md5(schema_str.encode()).digest()
diff --git a/audformat/core/utils.py b/audformat/core/utils.py
index 8945357d..0fe6271e 100644
--- a/audformat/core/utils.py
+++ b/audformat/core/utils.py
@@ -1,5 +1,6 @@
 import collections
 import errno
+import hashlib
 import os
 import platform
 import re
@@ -10,6 +11,7 @@
 import iso3166
 import numpy as np
 import pandas as pd
+import pyarrow as pa

 import audeer
 import audiofile
@@ -664,10 +666,12 @@ def expand_file_path(

 def hash(
     obj: typing.Union[pd.Index, pd.Series, pd.DataFrame],
+    include_order_and_names: bool = False,
 ) -> str:
     r"""Create hash from object.

-    Objects with the same elements
+    If ``include_order_and_names`` is ``False``,
+    objects with the same elements
     produce the same hash string
     independent of the ordering of the elements,
     and level or column names.
@@ -676,29 +680,75 @@ def hash(
     If ``obj`` is a dataframe or series
     with data type ``"Int64"``,
+    and ``include_order_and_names`` is ``False``,
     the returned hash value changes with ``pandas>=2.2.0``.

     Args:
         obj: object
+        include_order_and_names: if ``True``,
+            the hash takes into account
+            the order of rows
+            and column/level names

     Returns:
-        hash string
+        hash string with 19 characters,
+        or 32 characters if ``include_order_and_names`` is ``True``

     Examples:
         >>> index = filewise_index(["f1", "f2"])
         >>> hash(index)
         '-4231615416436839963'
+        >>> hash(index[::-1])  # reversed index
+        '-4231615416436839963'
         >>> y = pd.Series(0, index)
         >>> hash(y)
         '5251663970176285425'
+        >>> hash(index, include_order_and_names=True)
+        '0741235e2250e0fcd9ab7b64972f5047'
+        >>> hash(index[::-1], include_order_and_names=True)
+        'c6639d377897dd9353dc3e8b2968170d'

     """
-    # Convert to int64
-    # to enforce same behavior
-    # across different pandas versions,
-    # see
-    # https://github.com/pandas-dev/pandas/issues/55452
-    return str(pd.util.hash_pandas_object(obj).astype("int64").sum())
+    if include_order_and_names:
+        if isinstance(obj, pd.Index):
+            df = obj.to_frame()
+        elif isinstance(obj, pd.Series):
+            df = obj.to_frame().reset_index()
+        else:
+            df = obj.reset_index()
+        # Handle column names and dtypes
+        table = pa.Table.from_pandas(df, preserve_index=False)
+        schema_str = table.schema.to_string(
+            # schema.metadata contains pandas related information,
+            # and the used pyarrow and pandas version,
+            # and needs to be excluded
+            show_field_metadata=False,
+            show_schema_metadata=False,
+        )
+        schema_md5 = hashlib.md5(schema_str.encode())
+        # Handle index, values, and row order
+        data_md5 = hashlib.md5()
+        for _, y in df.items():
+            # Convert every column to a numpy array,
+            # and hash its string representation
+            if y.dtype == "Int64":
+                # Enforce consistent conversion to numpy.array
+                # for integers across different pandas versions
+                # (since pandas 2.2.x, Int64 is converted to float if it contains <NA>)
+                y = y.astype("float")
+            data_md5.update(bytes(str(y.to_numpy()), "utf-8"))
+        md5 = hashlib.md5()
+        md5.update(schema_md5.digest())
+        md5.update(data_md5.digest())
+        md5 = md5.hexdigest()
+    else:
+        # Convert to int64
+        # to enforce same behavior
+        # across different pandas versions,
+        # see
+        # https://github.com/pandas-dev/pandas/issues/55452
+        md5 = str(pd.util.hash_pandas_object(obj).astype("int64").sum())
+    return md5


 def index_has_overlap(
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 79cd6028..61e3bbf9 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -440,80 +440,219 @@ def test_expand_file_path(tmpdir, index, root, expected):
 @pytest.mark.parametrize(
-    "obj, expected",
+    "obj, include_order_and_names, mutable, expected",
     [
         (
             audformat.filewise_index(),
+            False,
+            True,
             "0",
         ),
+        (
+            audformat.filewise_index(),
+            True,
+            True,
+            "890fa7e5864779b7c3bd85c9ede31657",
+        ),
         (
             audformat.segmented_index(),
+            False,
+            True,
             "0",
         ),
+        (
+            audformat.segmented_index(),
+            True,
+            True,
+            "10d5e40fca4e40c6c70ff64495916059",
+        ),
         (
             audformat.filewise_index(["f1", "f2"]),
+            False,
+            True,
             "-4231615416436839963",
         ),
+        (
+            audformat.filewise_index(["f1", "f2"]),
+            True,
+            False,
+            "0741235e2250e0fcd9ab7b64972f5047",
+        ),
         (
             audformat.segmented_index(["f1", "f2"]),
+            False,
+            True,
             "-2363261461673824215",
         ),
+        (
+            audformat.segmented_index(["f1", "f2"]),
+            True,
+            False,
+            "0e99d54165e6cc2dad2737982853f8c7",
+        ),
+        (
+            audformat.segmented_index(["f1", "f2"]),
+            False,
+            True,
+            "-2363261461673824215",
+        ),
         (
             audformat.segmented_index(["f1", "f2"], [0, 0], [1, 1]),
+            False,
+            True,
             "-3831446135233514455",
         ),
+        (
+            audformat.segmented_index(["f1", "f2"], [0, 0], [1, 1]),
+            True,
+            False,
"396fda484a46686b2b5c41b0ae9c94bd", + ), ( pd.Series([0, 1], audformat.filewise_index(["f1", "f2"])), + False, + True, "-8245754232361677810", ), + ( + pd.Series([0, 1], audformat.filewise_index(["f1", "f2"])), + True, + False, + "28c5f6feb0682079b127d8ce8debebe9", + ), ( pd.DataFrame( {"a": [0, 1], "b": [2, 3]}, audformat.segmented_index(["f1", "f2"], [0, 0], [1, 1]), ), + False, + True, "-103439349488189352", ), - (pd.Index([0, 1], name="idx"), "6238072747940578789"), - (pd.Index([0, 1], name="name"), "6238072747940578789"), + ( + pd.DataFrame( + {"a": [0, 1], "b": [2, 3]}, + audformat.segmented_index(["f1", "f2"], [0, 0], [1, 1]), + ), + True, + False, + "69785e94447fab79f2b65b1dcb4a2122", + ), + ( + pd.Index([0, 1], name="idx"), + False, + True, + "6238072747940578789", + ), + ( + pd.Index([0, 1], name="idx"), + True, + False, + "a02406270880cde74e66c07278b765a0", + ), + ( + pd.Index([0, 1], name="name"), + False, + True, + "6238072747940578789", + ), + ( + pd.Index([0, 1], name="name"), + True, + False, + "7a8303866a35ececb4ae76a4aa050209", + ), ( pd.MultiIndex.from_arrays( [[0, 1], ["a", "b"]], names=["idx1", "idx2"], ), + False, + True, "8378370490910668918", ), + ( + pd.MultiIndex.from_arrays( + [[0, 1], ["a", "b"]], + names=["idx1", "idx2"], + ), + True, + False, + "be5373f6d8f801b902d8cf0e2f2a1914", + ), ( pd.MultiIndex.from_arrays( [[0, 1], ["a", "b"]], names=["name1", "name2"], ), + False, + True, "8378370490910668918", ), + ( + pd.MultiIndex.from_arrays( + [[0, 1], ["a", "b"]], + names=["name1", "name2"], + ), + True, + False, + "3726b3c39fc2c1453bb45a0460630ff7", + ), ( pd.Series([0, 1], name="series"), + False, + True, "-7179254265801896228", ), + ( + pd.Series([0, 1], name="series"), + True, + False, + "ad0f9900c0e2f3954bde3abb6f4a9b61", + ), ( pd.Series([0, 1], name="name"), + False, + True, "-7179254265801896228", ), + ( + pd.Series([0, 1], name="name"), + True, + False, + "648314808f0b27e5c04479ba8509fc25", + ), ( pd.DataFrame([0, 1], columns=["frame"]), + False, + True, "-7179254265801896228", ), + ( + pd.DataFrame([0, 1], columns=["frame"]), + True, + False, + "36d9779e257319ff69515af6b4ade8ad", + ), ( pd.DataFrame([0, 1], columns=["name"]), + False, + True, "-7179254265801896228", ), + ( + pd.DataFrame([0, 1], columns=["name"]), + True, + False, + "648314808f0b27e5c04479ba8509fc25", + ), pytest.param( pd.DataFrame( [0, 1, 2], pd.Index([0, 1, 2], dtype="Int64"), ), + False, + True, "5440931770055407318", marks=pytest.mark.skipif( pd.__version__ >= "2.2.0", @@ -525,6 +664,8 @@ def test_expand_file_path(tmpdir, index, root, expected): [0, 1, 2], pd.Index([0, 1, 2], dtype="Int64"), ), + False, + True, "-5491649331962632325", marks=pytest.mark.skipif( pd.__version__ < "2.2.0", @@ -536,6 +677,8 @@ def test_expand_file_path(tmpdir, index, root, expected): [0, 1, 2], pd.Index([0, 1, 2], dtype="Int64"), ), + False, + True, "5440931770055407318", marks=pytest.mark.skipif( pd.__version__ >= "2.2.0", @@ -547,6 +690,8 @@ def test_expand_file_path(tmpdir, index, root, expected): [0, 1, 2], pd.Index([0, 1, 2], dtype="Int64"), ), + False, + True, "-5491649331962632325", marks=pytest.mark.skipif( pd.__version__ < "2.2.0", @@ -555,9 +700,14 @@ def test_expand_file_path(tmpdir, index, root, expected): ), ], ) -def test_hash(obj, expected): - assert utils.hash(obj) == expected - assert utils.hash(obj[::-1]) == expected +def test_hash(obj, include_order_and_names, mutable, expected): + md5 = utils.hash(obj, include_order_and_names=include_order_and_names) + reverse_md5 = 
+    reverse_md5 = utils.hash(obj[::-1], include_order_and_names=include_order_and_names)
+    assert md5 == expected
+    if mutable:
+        assert reverse_md5 == md5
+    else:
+        assert reverse_md5 != md5


 @pytest.mark.parametrize(
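
Reviewer note (not part of the patch): a minimal sketch of the new `utils.hash` behavior, using only calls and hash literals taken from the updated docstring above; it assumes this branch of `audformat` is installed.

```python
# Sketch of the new include_order_and_names flag of audformat.utils.hash,
# mirroring the doctest in the updated docstring (assumes this branch).
import audformat
from audformat import utils

index = audformat.filewise_index(["f1", "f2"])

# Default behavior is unchanged: the hash ignores row order and
# level/column names, so the reversed index yields the same string.
assert utils.hash(index) == utils.hash(index[::-1]) == "-4231615416436839963"

# With include_order_and_names=True, a 32-character MD5 hex digest is
# returned, and reversing the rows changes it.
ordered = utils.hash(index, include_order_and_names=True)
reordered = utils.hash(index[::-1], include_order_and_names=True)
assert ordered == "0741235e2250e0fcd9ab7b64972f5047"
assert reordered == "c6639d377897dd9353dc3e8b2968170d"
assert ordered != reordered and len(ordered) == 32
```

This order- and name-sensitive digest is exactly what `Table._save_parquet` needs for its file metadata, which is why the patch replaces the private `_schema_hash`/`_dataframe_hash` helpers with a single call to `utils.hash(self.df, include_order_and_names=True)`.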