Skip to content

Commit

Permalink
Add include_order_and_names argument to hash()
Browse files Browse the repository at this point in the history
  • Loading branch information
hagenw committed Jul 15, 2024
1 parent a8cd511 commit 7245d06
Show file tree
Hide file tree
Showing 3 changed files with 216 additions and 73 deletions.
61 changes: 2 additions & 59 deletions audformat/core/table.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations # allow typing without string

import copy
import hashlib
import os
import pickle
import typing
Expand Down Expand Up @@ -1198,13 +1197,11 @@ def _save_parquet(self, path: str):
table = pa.Table.from_pandas(self.df.reset_index(), preserve_index=False)

# Create hash of table
table_hash = hashlib.md5()
table_hash.update(_schema_hash(table))
table_hash.update(_dataframe_hash(self.df))
table_hash = utils.hash(self.df, include_order_and_names=True)

# Store in metadata of file,
# see https://stackoverflow.com/a/58978449
metadata = {"hash": table_hash.hexdigest()}
metadata = {"hash": table_hash}
table = table.replace_schema_metadata({**metadata, **table.schema.metadata})

parquet.write_table(table, path, compression="snappy")
Expand Down Expand Up @@ -1905,40 +1902,6 @@ def _assert_table_index(
)


def _dataframe_hash(df: pd.DataFrame) -> bytes:
"""Hash a dataframe.
The hash value takes into account:
* index of dataframe
* values of the dataframe
* order of dataframe rows
It does not consider:
* column names of dataframe
* dtypes of dataframe
Args:
df: dataframe
Returns:
MD5 hash in bytes
"""
md5 = hashlib.md5()
for _, y in df.reset_index().items():
# Convert every column to a numpy array,
# and hash its string representation
if y.dtype == "Int64":
# Enforce consistent conversion to numpy.array
# for integers across different pandas versions
# (since pandas 2.2.x, Int64 is converted to float if it contains <NA>)
y = y.astype("float")
md5.update(bytes(str(y.to_numpy()), "utf-8"))
return md5.digest()


def _maybe_convert_dtype_to_string(
index: pd.Index,
) -> pd.Index:
Expand All @@ -1961,23 +1924,3 @@ def _maybe_update_scheme(
for scheme in table.db.schemes.values():
if table._id == scheme.labels:
scheme.replace_labels(table._id)


def _schema_hash(table: pa.Table) -> bytes:
    r"""Hash pyarrow table schema.

    Args:
        table: pyarrow table

    Returns:
        MD5 hash in bytes

    """
    # Serialize the schema without any metadata:
    # schema.metadata stores pandas related information
    # and the used pyarrow and pandas versions,
    # which must not influence the hash
    text = table.schema.to_string(
        show_field_metadata=False,
        show_schema_metadata=False,
    )
    return hashlib.md5(text.encode()).digest()
66 changes: 58 additions & 8 deletions audformat/core/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import collections
import errno
import hashlib
import os
import platform
import re
Expand All @@ -10,6 +11,7 @@
import iso3166
import numpy as np
import pandas as pd
import pyarrow as pa

import audeer
import audiofile
Expand Down Expand Up @@ -664,10 +666,12 @@ def expand_file_path(

def hash(
obj: typing.Union[pd.Index, pd.Series, pd.DataFrame],
include_order_and_names: bool = False,
) -> str:
r"""Create hash from object.
Objects with the same elements
If ``include_order_and_names`` is ``False``,
objects with the same elements
produce the same hash string
independent of the ordering of the elements,
and level or column names.
Expand All @@ -676,29 +680,75 @@ def hash(
If ``obj`` is a dataframe or series
with data type ``"Int64"``,
and ``include_order_and_names`` is ``False``,
the returned hash value changes with ``pandas>=2.2.0``.
Args:
obj: object
include_order_and_names: if ``True``,
the hash takes into account
the order of rows
and column/level names
Returns:
hash string
hash string with 19 characters,
or 32 characters if ``include_order_and_names`` is ``True``
Examples:
>>> index = filewise_index(["f1", "f2"])
>>> hash(index)
'-4231615416436839963'
>>> hash(index[::-1]) # reversed index
'-4231615416436839963'
>>> y = pd.Series(0, index)
>>> hash(y)
'5251663970176285425'
>>> hash(index, include_order_and_names=True)
'0741235e2250e0fcd9ab7b64972f5047'
>>> hash(index[::-1], include_order_and_names=True)
'c6639d377897dd9353dc3e8b2968170d'
"""
# Convert to int64
# to enforce same behavior
# across different pandas versions,
# see
# https://github.com/pandas-dev/pandas/issues/55452
return str(pd.util.hash_pandas_object(obj).astype("int64").sum())
if include_order_and_names:
if isinstance(obj, pd.Index):
df = obj.to_frame()
elif isinstance(obj, pd.Series):
df = obj.to_frame().reset_index()
else:
df = obj.reset_index()
# Handle column names and dtypes
table = pa.Table.from_pandas(df, preserve_index=False)
schema_str = table.schema.to_string(
# schema.metadata contains pandas related information,
# and the used pyarrow and pandas version,
# and needs to be excluded
show_field_metadata=False,
show_schema_metadata=False,
)
schema_md5 = hashlib.md5(schema_str.encode())
# Handle index, values, and row order
data_md5 = hashlib.md5()
for _, y in df.items():
# Convert every column to a numpy array,
# and hash its string representation
if y.dtype == "Int64":
# Enforce consistent conversion to numpy.array
# for integers across different pandas versions
# (since pandas 2.2.x, Int64 is converted to float if it contains <NA>)
y = y.astype("float")
data_md5.update(bytes(str(y.to_numpy()), "utf-8"))
md5 = hashlib.md5()
md5.update(schema_md5.digest())
md5.update(data_md5.digest())
md5 = md5.hexdigest()
else:
# Convert to int64
# to enforce same behavior
# across different pandas versions,
# see
# https://github.com/pandas-dev/pandas/issues/55452
md5 = str(pd.util.hash_pandas_object(obj).astype("int64").sum())
return md5


def index_has_overlap(
Expand Down
Loading

0 comments on commit 7245d06

Please sign in to comment.