25 changes: 18 additions & 7 deletions audformat/core/database.py
@@ -696,12 +696,16 @@ def append_series(ys, y, column_id):
ys.append(y)

def dtypes_of_categories(objs):
dtypes = [
obj.dtype.categories.dtype
for obj in objs
if isinstance(obj.dtype, pd.CategoricalDtype)
]
return sorted(list(set(dtypes)))
dtypes = []
for obj in objs:
if isinstance(obj.dtype, pd.CategoricalDtype):
dtype = obj.dtype.categories.dtype
# Normalize string dtypes: treat 'str' and 'object' as equivalent
# for string categories (pandas 3.0 compatibility)
if str(dtype) == "str":
dtype = pd.Series(dtype="object").dtype
dtypes.append(dtype)
return sorted(list(set(dtypes)), key=str)

def empty_frame(name):
return pd.DataFrame(
@@ -823,7 +827,7 @@ def scheme_in_column(scheme_id, column, column_id):
raise TypeError( # pragma: nocover
f"Cannot join labels for scheme '{requested_scheme}' "
"with different data types: "
f"{', '.join(dtypes)}"
f"{', '.join(str(d) for d in dtypes)}"
)
dtype = dtypes[0]
# Convert everything to categorical data
@@ -832,6 +836,13 @@ def scheme_in_column(scheme_id, column, column_id):
ys[n] = y.astype(
pd.CategoricalDtype(y.array.dropna().unique().astype(dtype))
)
# Normalize all categorical dtypes to "object" for consistency
# (pandas 3.0 may infer "str" dtype for string categories)
for n, y in enumerate(ys):
cat_dtype = y.dtype.categories.dtype
if str(cat_dtype) == "str":
new_categories = y.dtype.categories.astype("object")
ys[n] = y.astype(pd.CategoricalDtype(new_categories))
# Find union of categorical data
data = [y.array for y in ys]
try:
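A minimal sketch of the mismatch the normalization above guards against; on pandas < 3.0 both dtypes already report `object`, while under pandas 3.0 the inferred one may report the new `str` dtype:

```python
import pandas as pd

# Categories inferred from Python strings may report the new "str"
# dtype under pandas 3.0, while an explicitly object-backed categorical
# reports "object" -- the two then no longer sort as a single dtype.
a = pd.Series(["x", "y"], dtype="category")
b = pd.Series(
    ["x", "z"],
    dtype=pd.CategoricalDtype(pd.Index(["x", "z"], dtype="object")),
)
print(a.dtype.categories.dtype, b.dtype.categories.dtype)
```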
13 changes: 12 additions & 1 deletion audformat/core/index.py
@@ -361,7 +361,18 @@ def segmented_index(
define.IndexField.END,
],
)
index = utils.set_index_dtypes(index, {define.IndexField.FILE: "string"})
# Starting with pandas 3.0.0,
# the default precision of timedelta is seconds.
# To ensure consistent behavior across pandas versions,
# we always use nanoseconds for timedelta dtypes.
index = utils.set_index_dtypes(
index,
{
define.IndexField.FILE: "string",
define.IndexField.START: "timedelta64[ns]",
define.IndexField.END: "timedelta64[ns]",
},
)
assert_index(index)

return index
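A sketch of the behavior this pins down (it mirrors the new test added in `tests/test_index.py` below):

```python
import audformat

index = audformat.segmented_index(["f1.wav"], starts=[0.001], ends=[0.002])
# Both timedelta levels are forced to nanosecond precision,
# independent of the pandas version in use:
print(index.get_level_values("start").dtype)  # timedelta64[ns]
print(index.get_level_values("end").dtype)    # timedelta64[ns]
```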
2 changes: 1 addition & 1 deletion audformat/core/testing.py
@@ -143,7 +143,7 @@ def add_table(

for file in files:
times = [
pd.to_timedelta(random.random() * file_duration, unit="s")
random.random() * file_duration
for _ in range(num_segments_per_file * 2)
]
times.sort()
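If I read this right, the helper can now hand over plain float seconds because `segmented_index` performs the timedelta conversion itself at nanosecond precision (see the `index.py` hunk above); a sketch under that assumption:

```python
import random

import audformat

random.seed(0)
file_duration = 5.0
times = sorted(random.random() * file_duration for _ in range(4))
index = audformat.segmented_index(
    ["f.wav"] * 2,
    starts=times[::2],  # every other value as start
    ends=times[1::2],   # the remaining values as end
)
print(index.get_level_values("start").dtype)  # timedelta64[ns]
```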
104 changes: 92 additions & 12 deletions audformat/core/utils.py
@@ -725,8 +725,43 @@ def hash(
df = obj.to_frame().reset_index()
else:
df = obj.reset_index()
# Handle column names and dtypes
table = pa.Table.from_pandas(df, preserve_index=False)
# Normalize string columns to object dtype for consistent hashing
# (pandas 3.0 uses "string" dtype which maps to pyarrow "large_string",
# while "object" dtype maps to pyarrow "string")
# For empty DataFrames, we also need to specify an explicit schema
# because pyarrow infers "null" type for empty object columns
schema_fields = []
for col in df.columns:
if pd.api.types.is_string_dtype(df[col].dtype):
df[col] = df[col].astype("object")
schema_fields.append((col, pa.string()))
elif isinstance(df[col].dtype, pd.CategoricalDtype):
# Normalize categoricals with string categories to object
cat_dtype = df[col].dtype.categories.dtype
if str(cat_dtype) == "str" or pd.api.types.is_string_dtype(cat_dtype):
new_categories = df[col].dtype.categories.astype("object")
df[col] = df[col].astype(pd.CategoricalDtype(new_categories))
schema_fields.append((col, None))
elif pd.api.types.is_timedelta64_dtype(df[col].dtype):
schema_fields.append((col, pa.duration("ns")))
else:
schema_fields.append((col, None)) # Let pyarrow infer
# Build schema for columns that need explicit types
if len(df) == 0 and any(f[1] is not None for f in schema_fields):
# For empty DataFrames, specify schema explicitly
schema = pa.schema(
[
(
name,
typ if typ is not None else pa.from_numpy_dtype(df[name].dtype),
)
for name, typ in schema_fields
]
)
table = pa.Table.from_pandas(df, preserve_index=False, schema=schema)
else:
# Handle column names and dtypes
table = pa.Table.from_pandas(df, preserve_index=False)
schema_str = table.schema.to_string(
# schema.metadata contains pandas related information,
# and the used pyarrow and pandas version,
@@ -745,7 +780,12 @@ def hash(
# for integers across different pandas versions
# (since pandas 2.2.x, Int64 is converted to float if it contains <NA>)
y = y.astype("float")
data_md5.update(bytes(str(y.to_numpy()), "utf-8"))
if pd.api.types.is_string_dtype(y.dtype):
# Enforce object dtype for string columns
# to ensure consistent hashing across pandas versions
data_md5.update(bytes(str(y.to_numpy(dtype=object)), "utf-8"))
else:
data_md5.update(bytes(str(y.to_numpy()), "utf-8"))
md5 = hashlib.md5()
md5.update(schema_md5.digest())
md5.update(data_md5.digest())
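For context, a sketch of the pyarrow mapping the comments above describe; the exact Arrow types depend on the installed pandas/pyarrow versions, so treat the printed values as illustrative:

```python
import pandas as pd
import pyarrow as pa

df_obj = pd.DataFrame({"a": pd.Series(["x"], dtype="object")})
df_str = pd.DataFrame({"a": pd.Series(["x"], dtype="string")})
# "object" strings map to pyarrow string; "string"/"str" columns
# may map to large_string, which would change the schema hash:
print(pa.Table.from_pandas(df_obj).schema.field("a").type)
print(pa.Table.from_pandas(df_str).schema.field("a").type)
```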
@@ -941,7 +981,13 @@ def intersect(
# Ensure we have order of first object
index = objs[0].intersection(index)
if isinstance(index, pd.MultiIndex):
index = set_index_dtypes(index, objs[0].dtypes.to_dict())
dtypes = objs[0].dtypes.to_dict()
# Always use timedelta64[ns] for timedelta dtypes
# to ensure consistent precision across pandas versions
for name, dtype in dtypes.items():
if pd.api.types.is_timedelta64_dtype(dtype):
dtypes[name] = "timedelta64[ns]"
index = set_index_dtypes(index, dtypes)

return index
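A sketch of the precision drift this protects against (assumes pandas >= 2.0, where non-nanosecond timedeltas exist):

```python
import pandas as pd

i1 = pd.timedelta_range(0, periods=3, freq="s")  # timedelta64[ns]
i2 = i1.astype("timedelta64[s]")                 # second precision
# Equal values, different dtypes -- pinning intersect() results
# to timedelta64[ns] keeps downstream comparisons stable:
print(i1.dtype, i2.dtype, list(i1) == list(i2))
```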

@@ -1030,7 +1076,7 @@ def iter_by_file(
('f1', MultiIndex([('f1', '0 days 00:00:00', '0 days 00:00:02'),
('f1', '0 days 00:00:01', '0 days 00:00:03')],
names=['file', 'start', 'end']))
>>> obj = pd.Series(["a", "b", "b"], index)
>>> obj = pd.Series(["a", "b", "b"], index, dtype="object")
>>> next(iter_by_file(obj))
('f1', file start end
f1 0 days 00:00:00 0 days 00:00:02 a
@@ -1479,14 +1525,14 @@ def set_index_dtypes(
index with new dtypes

Examples:
>>> index1 = pd.Index(["a", "b"])
>>> index1 = pd.Index(["a", "b"], dtype="object")
>>> index1
Index(['a', 'b'], dtype='object')
>>> index2 = set_index_dtypes(index1, "string")
>>> index2
Index(['a', 'b'], dtype='string')
>>> index3 = pd.MultiIndex.from_arrays(
... [["a", "b"], [1, 2]],
... [pd.Index(["a", "b"], dtype="object"), [1, 2]],
... names=["level1", "level2"],
... )
>>> index3.dtypes
@@ -1500,8 +1546,8 @@
dtype: object
>>> index5 = set_index_dtypes(index3, "string")
>>> index5.dtypes
level1 string[python]
level2 string[python]
level1 string
level2 string
dtype: object

"""
@@ -1533,7 +1579,7 @@
if pd.api.types.is_timedelta64_dtype(dtype):
# avoid: TypeError: Cannot cast DatetimeArray
# to dtype timedelta64[ns]
df[level] = pd.to_timedelta(list(df[level]))
df[level] = pd.to_timedelta(list(df[level])).astype(dtype)
else:
df[level] = df[level].astype(dtype)
index = pd.MultiIndex.from_frame(df)
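The added `.astype(dtype)` matters because `pd.to_timedelta` always yields nanosecond precision, regardless of the dtype the caller asked for; a minimal sketch (assumes pandas >= 2.0):

```python
import pandas as pd

values = [pd.Timedelta("1s"), pd.Timedelta("2s")]
col = pd.to_timedelta(list(values))
print(col.dtype)                           # timedelta64[ns]
print(col.astype("timedelta64[s]").dtype)  # the dtype actually requested
```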
@@ -1827,9 +1873,15 @@ def job(file: str) -> pd.Timedelta:
# Replace all NaT entries in end
# by the collected duration values.
# We have to convert ends to a series first
# in order to preserve precision of duration values
# in order to preserve precision of duration values.
# Starting with pandas 3.0.0,
# the default precision of timedelta is seconds,
# so we need to convert to nanoseconds
# to ensure sub-second precision is preserved
# when assigning duration values.

ends = ends.to_series()
ends = ends.astype("timedelta64[ns]")
ends.iloc[idx_nat] = durs

# Create a new index
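A sketch of the truncation the extra cast avoids, assuming the incoming series carries a coarser timedelta dtype under pandas 3.0 (requires pandas >= 2.0 to construct):

```python
import pandas as pd

ends = pd.Series([pd.Timedelta("1s"), pd.NaT], dtype="timedelta64[s]")
ends = ends.astype("timedelta64[ns]")  # the added cast
ends.iloc[1] = pd.Timedelta("0.123s")  # sub-second value survives
print(ends)
```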
@@ -1972,11 +2024,31 @@ def union(
if max_num_seg > UNION_MAX_INDEX_LEN_THRES:
df = pd.concat([o.to_frame() for o in objs])
index = df.index
# Starting with pandas 3.0.0,
# the default precision of timedelta is seconds.
# To avoid precision loss when combining indices
# with different timedelta precisions,
# we always use nanoseconds for timedelta dtypes.
if isinstance(index, pd.MultiIndex):
dtypes = {}
for name, dtype in zip(index.names, index.dtypes):
if pd.api.types.is_timedelta64_dtype(dtype):
dtypes[name] = "timedelta64[ns]"
if dtypes:
index = set_index_dtypes(index, dtypes)

elif isinstance(objs[0], pd.MultiIndex):
names = objs[0].names
num_levels = len(names)
dtypes = {name: dtype for name, dtype in zip(names, objs[0].dtypes)}
# Starting with pandas 3.0.0,
# the default precision of timedelta is seconds.
# To avoid precision loss when combining indices
# with different timedelta precisions,
# we always use nanoseconds for timedelta dtypes.
for name, dtype in dtypes.items():
if pd.api.types.is_timedelta64_dtype(dtype):
dtypes[name] = "timedelta64[ns]"
values = [[] for _ in range(num_levels)]

for obj in objs:
@@ -1991,13 +2063,21 @@

else:
name = objs[0].name
dtype = objs[0].dtype
# Starting with pandas 3.0.0,
# the default precision of timedelta is seconds.
# To avoid precision loss when combining indices
# with different timedelta precisions,
# we always use nanoseconds for timedelta dtypes.
if pd.api.types.is_timedelta64_dtype(dtype):
dtype = "timedelta64[ns]"
values = []

for obj in objs:
values.extend(obj.to_list())

index = pd.Index(values, name=name)
index = set_index_dtypes(index, objs[0].dtype)
index = set_index_dtypes(index, dtype)

index = index.drop_duplicates()

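End to end, all three `union()` branches should now agree on nanosecond precision; a small sketch against the public API:

```python
import audformat

i1 = audformat.segmented_index(["f1.wav"], starts=[0], ends=[0.5])
i2 = audformat.segmented_index(["f2.wav"], starts=[0], ends=[1.5])
index = audformat.utils.union([i1, i2])
print(index.get_level_values("end").dtype)  # timedelta64[ns]
```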
6 changes: 3 additions & 3 deletions tests/test_database_get.py
@@ -511,7 +511,7 @@ def wrong_scheme_labels_db(tmpdir):
]
),
dtype=pd.CategoricalDtype(
["w1", "w2", "w3"],
pd.Index(["w1", "w2", "w3"], dtype="object"),
ordered=False,
),
),
@@ -603,7 +603,7 @@ def wrong_scheme_labels_db(tmpdir):
[0.2, 0.2, 0.5, 0.7],
),
dtype=pd.CategoricalDtype(
["s1", "s2", "s3"],
pd.Index(["s1", "s2", "s3"], dtype="object"),
ordered=False,
),
name="speaker",
@@ -1253,7 +1253,7 @@ def test_database_get_aggregate_and_modify_function(
["s1"],
index=audformat.filewise_index(["f1.wav"]),
dtype=pd.CategoricalDtype(
["s1", "s2", "s3"],
pd.Index(["s1", "s2", "s3"], dtype="object"),
ordered=False,
),
name="speaker",
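The fixtures pin the category dtype explicitly because a plain list may otherwise be inferred as the new `str` dtype under pandas 3.0; a sketch:

```python
import pandas as pd

dtype = pd.CategoricalDtype(
    pd.Index(["s1", "s2", "s3"], dtype="object"),
    ordered=False,
)
print(dtype.categories.dtype)  # object, on every pandas version
```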
20 changes: 20 additions & 0 deletions tests/test_index.py
@@ -262,6 +262,26 @@ def test_create_segmented_index(files, starts, ends):
] * len(files)


@pytest.mark.parametrize(
"files, starts, ends",
[
# normal case with sub-second values
(["f1.wav"], [0.001], [0.002]),
# NaT in ends
(["f1.wav"], [0], [pd.NaT]),
# NaT in starts and ends
(["f1.wav"], [pd.NaT], [pd.NaT]),
# empty index
(None, None, None),
],
)
def test_segmented_index_timedelta_dtype(files, starts, ends):
"""Ensure segmented_index always returns timedelta64[ns]."""
index = audformat.segmented_index(files, starts=starts, ends=ends)
assert index.get_level_values("start").dtype == "timedelta64[ns]"
assert index.get_level_values("end").dtype == "timedelta64[ns]"


@pytest.mark.parametrize(
"index, index_type",
[