audeering · hagenw · Jan 23, 2026 · Jan 23, 2026 · Jan 23, 2026 · Jan 23, 2026
diff --git a/audformat/core/utils.py b/audformat/core/utils.py
@@ -725,8 +725,47 @@ def hash(
             df = obj.to_frame().reset_index()
         else:
             df = obj.reset_index()
-        # Handle column names and dtypes
-        table = pa.Table.from_pandas(df, preserve_index=False)
+
+        # Normalize string columns to object dtype for consistent hashing
+        # (pandas 3.0 uses "string" dtype which maps to pyarrow "large_string",
+        # while "object" dtype maps to pyarrow "string")
+        # For empty DataFrames, we also need to specify an explicit schema
+        # because pyarrow infers "null" type for empty object columns
+        schema_fields = []
+        for col in df.columns:
+            if pd.api.types.is_string_dtype(df[col].dtype):
+                df[col] = df[col].astype("object")
+                schema_fields.append((col, pa.string()))
+            elif isinstance(df[col].dtype, pd.CategoricalDtype):
+                # Normalize categorical with string categories to object
+                cat_dtype = df[col].dtype.categories.dtype
+                if pd.api.types.is_string_dtype(cat_dtype):
+                    new_categories = df[col].dtype.categories.astype("object")
+                    ordered = df[col].dtype.ordered
+                    df[col] = df[col].astype(
+                        pd.CategoricalDtype(new_categories, ordered=ordered)
+                    )
+                schema_fields.append((col, None))
+            else:
+                # Let pyarrow infer
+                schema_fields.append((col, None))
+        # Build schema for columns that need explicit types
+        if len(df) == 0 and any(f[1] is not None for f in schema_fields):
+            # For empty DataFrames with at least one string column (index
+            # levels are columns after reset_index), specify schema explicitly
+            schema = pa.schema(
+                [
+                    (
+                        name,
+                        typ if typ is not None else pa.from_numpy_dtype(df[name].dtype),
+                    )
+                    for name, typ in schema_fields
+                ]
+            )
+            table = pa.Table.from_pandas(df, preserve_index=False, schema=schema)
+        else:
+            # Handle column names and dtypes
+            table = pa.Table.from_pandas(df, preserve_index=False)
         schema_str = table.schema.to_string(
             # schema.metadata contains pandas related information,
             # and the used pyarrow and pandas version,

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -520,6 +520,26 @@ def test_expand_file_path(tmpdir, index, root, expected):
             False,
             "28c5f6feb0682079b127d8ce8debebe9",
         ),
+        (
+            pd.Series(
+                ["a", "b"],
+                audformat.filewise_index(["f1", "f2"]),
+                dtype="object",
+            ),
+            True,
+            False,
+            "74280b0163f6ae31438c26459901adfe",
+        ),
+        (
+            pd.Series(
+                ["a", "b"],
+                audformat.filewise_index(["f1", "f2"]),
+                dtype="string",
+            ),
+            True,
+            False,
+            "74280b0163f6ae31438c26459901adfe",
+        ),
         (
             pd.DataFrame(
                 {"a": [0, 1], "b": [2, 3]},
@@ -698,6 +718,67 @@ def test_expand_file_path(tmpdir, index, root, expected):
                 reason="Changed in pandas 2.2.0",
             ),
         ),
+        # Categorical data
+        (
+            pd.DataFrame(
+                {
+                    "winner": ["w1", "w1", "w2"],
+                },
+                index=audformat.filewise_index(["f1.wav", "f2.wav", "f3.wav"]),
+                dtype=pd.CategoricalDtype(
+                    ["w1", "w2", "w3"],
+                    ordered=False,
+                ),
+            ),
+            True,
+            False,
+            "e845db6fbe98b17f24dd71d3a991094f",
+        ),
+        (
+            pd.DataFrame(
+                {
+                    "winner": ["w1", "w1", "w2"],
+                },
+                index=audformat.filewise_index(["f1.wav", "f2.wav", "f3.wav"]),
+                dtype=pd.CategoricalDtype(
+                    pd.Index(["w1", "w2", "w3"], dtype="object"),
+                    ordered=False,
+                ),
+            ),
+            True,
+            False,
+            "e845db6fbe98b17f24dd71d3a991094f",
+        ),
+        (
+            pd.DataFrame(
+                {
+                    "winner": ["w1", "w1", "w2"],
+                },
+                index=audformat.filewise_index(["f1.wav", "f2.wav", "f3.wav"]),
+                dtype=pd.CategoricalDtype(
+                    pd.Index(["w1", "w2", "w3"], dtype="string"),
+                    ordered=False,
+                ),
+            ),
+            True,
+            False,
+            "e845db6fbe98b17f24dd71d3a991094f",
+        ),
+        (
+            pd.DataFrame(
+                {
+                    "winner": [0, 0, 1],
+                },
+                index=audformat.filewise_index(["f1.wav", "f2.wav", "f3.wav"]),
+                dtype=pd.CategoricalDtype(
+                    [0, 1],
+                    ordered=True,
+                ),
+            ),
+            True,
+            False,
+            "1f6fec46a3c28c5ec3c07ef8f13fb258",
+        ),
     ],
 )
 def test_hash(obj, strict, mutable, expected):