Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 2 additions & 59 deletions audformat/core/table.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations # allow typing without string

import copy
import hashlib
import os
import pickle
import typing
Expand Down Expand Up @@ -1198,13 +1197,11 @@ def _save_parquet(self, path: str):
table = pa.Table.from_pandas(self.df.reset_index(), preserve_index=False)

# Create hash of table
table_hash = hashlib.md5()
table_hash.update(_schema_hash(table))
table_hash.update(_dataframe_hash(self.df))
table_hash = utils.hash(self.df, strict=True)

# Store in metadata of file,
# see https://stackoverflow.com/a/58978449
metadata = {"hash": table_hash.hexdigest()}
metadata = {"hash": table_hash}
table = table.replace_schema_metadata({**metadata, **table.schema.metadata})

parquet.write_table(table, path, compression="snappy")
Expand Down Expand Up @@ -1905,40 +1902,6 @@ def _assert_table_index(
)


def _dataframe_hash(df: pd.DataFrame) -> bytes:
"""Hash a dataframe.

The hash value takes into account:

* index of dataframe
* values of the dataframe
* order of dataframe rows

It does not consider:

* column names of dataframe
* dtypes of dataframe

Args:
df: dataframe

Returns:
MD5 hash in bytes

"""
md5 = hashlib.md5()
for _, y in df.reset_index().items():
# Convert every column to a numpy array,
# and hash its string representation
if y.dtype == "Int64":
# Enforce consistent conversion to numpy.array
# for integers across different pandas versions
# (since pandas 2.2.x, Int64 is converted to float if it contains <NA>)
y = y.astype("float")
md5.update(bytes(str(y.to_numpy()), "utf-8"))
return md5.digest()


def _maybe_convert_dtype_to_string(
index: pd.Index,
) -> pd.Index:
Expand All @@ -1961,23 +1924,3 @@ def _maybe_update_scheme(
for scheme in table.db.schemes.values():
if table._id == scheme.labels:
scheme.replace_labels(table._id)


def _schema_hash(table: pa.Table) -> bytes:
    r"""Hash pyarrow table schema.

    Args:
        table: pyarrow table

    Returns:
        MD5 hash in bytes

    """
    # schema.metadata contains pandas related information,
    # and the used pyarrow and pandas version,
    # and needs to be excluded
    rendered = table.schema.to_string(
        show_field_metadata=False,
        show_schema_metadata=False,
    )
    return hashlib.md5(rendered.encode()).digest()
66 changes: 58 additions & 8 deletions audformat/core/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import collections
import errno
import hashlib
import os
import platform
import re
Expand All @@ -10,6 +11,7 @@
import iso3166
import numpy as np
import pandas as pd
import pyarrow as pa

import audeer
import audiofile
Expand Down Expand Up @@ -664,10 +666,12 @@ def expand_file_path(

def hash(
obj: typing.Union[pd.Index, pd.Series, pd.DataFrame],
strict: bool = False,
) -> str:
r"""Create hash from object.

Objects with the same elements
If ``strict`` is ``False``,
objects with the same elements
produce the same hash string
independent of the ordering of the elements,
and level or column names.
Expand All @@ -676,29 +680,75 @@ def hash(

If ``obj`` is a dataframe or series
with data type ``"Int64"``,
and ``strict`` is ``False``,
the returned hash value changes with ``pandas>=2.2.0``.

Args:
obj: object
strict: if ``True``,
the hash takes into account
the order of rows
and column/level names

Returns:
hash string
hash string with 19 characters,
or 32 characters if ``strict`` is ``True``

Examples:
>>> index = filewise_index(["f1", "f2"])
>>> hash(index)
'-4231615416436839963'
>>> hash(index[::-1]) # reversed index
'-4231615416436839963'
>>> y = pd.Series(0, index)
>>> hash(y)
'5251663970176285425'
>>> hash(index, strict=True)
'0741235e2250e0fcd9ab7b64972f5047'
>>> hash(index[::-1], strict=True) # reversed index
'c6639d377897dd9353dc3e8b2968170d'

"""
# Convert to int64
# to enforce same behavior
# across different pandas versions,
# see
# https://github.com/pandas-dev/pandas/issues/55452
return str(pd.util.hash_pandas_object(obj).astype("int64").sum())
if strict:
if isinstance(obj, pd.Index):
df = obj.to_frame()
elif isinstance(obj, pd.Series):
df = obj.to_frame().reset_index()
else:
df = obj.reset_index()
# Handle column names and dtypes
table = pa.Table.from_pandas(df, preserve_index=False)
schema_str = table.schema.to_string(
# schema.metadata contains pandas related information,
# and the used pyarrow and pandas version,
# and needs to be excluded
show_field_metadata=False,
show_schema_metadata=False,
)
schema_md5 = hashlib.md5(schema_str.encode())
# Handle index, values, and row order
data_md5 = hashlib.md5()
for _, y in df.items():
# Convert every column to a numpy array,
# and hash its string representation
if y.dtype == "Int64":
# Enforce consistent conversion to numpy.array
# for integers across different pandas versions
# (since pandas 2.2.x, Int64 is converted to float if it contains <NA>)
y = y.astype("float")
data_md5.update(bytes(str(y.to_numpy()), "utf-8"))
md5 = hashlib.md5()
md5.update(schema_md5.digest())
md5.update(data_md5.digest())
md5 = md5.hexdigest()
else:
# Convert to int64
# to enforce same behavior
# across different pandas versions,
# see
# https://github.com/pandas-dev/pandas/issues/55452
md5 = str(pd.util.hash_pandas_object(obj).astype("int64").sum())
return md5


def index_has_overlap(
Expand Down
Loading