Skip to content

Commit 11f6680

Browse files
moradologydcheriandstansby
authored andcommitted
Fix fill_value serialization issues (zarr-developers#2802)
* Fix fill_value serialization of NaN * Round trip serialization for array metadata v2/v3 * Unify metadata v2 fill value parsing * Test structured fill_value parsing * Remove redundancies, fix integral handling * Reorganize structured fill parsing * Bump up hypothesis deadline * Remove hypothesis deadline * Update tests/test_v2.py Co-authored-by: David Stansby <dstansby@gmail.com> --------- Co-authored-by: Deepak Cherian <deepak@cherian.net> Co-authored-by: David Stansby <dstansby@gmail.com>
1 parent c4b6e77 commit 11f6680

File tree

6 files changed

+330
-46
lines changed

6 files changed

+330
-46
lines changed

changes/2802.fix.rst

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix `fill_value` serialization for `NaN` in `ArrayV2Metadata` and add property-based testing of round-trip serialization

src/zarr/core/group.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3456,7 +3456,7 @@ def _build_metadata_v3(zarr_json: dict[str, JSON]) -> ArrayV3Metadata | GroupMet
34563456

34573457

34583458
def _build_metadata_v2(
3459-
zarr_json: dict[str, object], attrs_json: dict[str, JSON]
3459+
zarr_json: dict[str, JSON], attrs_json: dict[str, JSON]
34603460
) -> ArrayV2Metadata | GroupMetadata:
34613461
"""
34623462
Convert a dict representation of Zarr V2 metadata into the corresponding metadata class.

src/zarr/core/metadata/v2.py

+82-30
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,25 @@
22

33
import base64
44
import warnings
5-
from collections.abc import Iterable
5+
from collections.abc import Iterable, Sequence
66
from enum import Enum
77
from functools import cached_property
8-
from typing import TYPE_CHECKING, TypedDict, cast
8+
from typing import TYPE_CHECKING, Any, TypedDict, cast
99

1010
import numcodecs.abc
1111

1212
from zarr.abc.metadata import Metadata
1313

1414
if TYPE_CHECKING:
15-
from typing import Any, Literal, Self
15+
from typing import Literal, Self
1616

1717
import numpy.typing as npt
1818

1919
from zarr.core.buffer import Buffer, BufferPrototype
2020
from zarr.core.common import ChunkCoords
2121

2222
import json
23+
import numbers
2324
from dataclasses import dataclass, field, fields, replace
2425

2526
import numcodecs
@@ -146,41 +147,39 @@ def _json_convert(
146147
raise TypeError
147148

148149
zarray_dict = self.to_dict()
150+
zarray_dict["fill_value"] = _serialize_fill_value(self.fill_value, self.dtype)
149151
zattrs_dict = zarray_dict.pop("attributes", {})
150152
json_indent = config.get("json_indent")
151153
return {
152154
ZARRAY_JSON: prototype.buffer.from_bytes(
153-
json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode()
155+
json.dumps(
156+
zarray_dict, default=_json_convert, indent=json_indent, allow_nan=False
157+
).encode()
154158
),
155159
ZATTRS_JSON: prototype.buffer.from_bytes(
156-
json.dumps(zattrs_dict, indent=json_indent).encode()
160+
json.dumps(zattrs_dict, indent=json_indent, allow_nan=False).encode()
157161
),
158162
}
159163

160164
@classmethod
161165
def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
162-
# make a copy to protect the original from modification
166+
# Make a copy to protect the original from modification.
163167
_data = data.copy()
164-
# check that the zarr_format attribute is correct
168+
# Check that the zarr_format attribute is correct.
165169
_ = parse_zarr_format(_data.pop("zarr_format"))
166-
dtype = parse_dtype(_data["dtype"])
167170

168-
if dtype.kind in "SV":
169-
fill_value_encoded = _data.get("fill_value")
170-
if fill_value_encoded is not None:
171-
fill_value = base64.standard_b64decode(fill_value_encoded)
172-
_data["fill_value"] = fill_value
173-
174-
# zarr v2 allowed arbitrary keys here.
175-
# We don't want the ArrayV2Metadata constructor to fail just because someone put an
176-
# extra key in the metadata.
171+
# zarr v2 allowed arbitrary keys in the metadata.
172+
# Filter the keys to only those expected by the constructor.
177173
expected = {x.name for x in fields(cls)}
178-
# https://github.com/zarr-developers/zarr-python/issues/2269
179-
# handle the renames
180174
expected |= {"dtype", "chunks"}
181175

182176
# check if `filters` is an empty sequence; if so use None instead and raise a warning
183-
if _data["filters"] is not None and len(_data["filters"]) == 0:
177+
filters = _data.get("filters")
178+
if (
179+
isinstance(filters, Sequence)
180+
and not isinstance(filters, (str, bytes))
181+
and len(filters) == 0
182+
):
184183
msg = (
185184
"Found an empty list of filters in the array metadata document. "
186185
"This is contrary to the Zarr V2 specification, and will cause an error in the future. "
@@ -196,13 +195,6 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
196195
def to_dict(self) -> dict[str, JSON]:
197196
zarray_dict = super().to_dict()
198197

199-
if self.dtype.kind in "SV" and self.fill_value is not None:
200-
# There's a relationship between self.dtype and self.fill_value
201-
# that mypy isn't aware of. The fact that we have S or V dtype here
202-
# means we should have a bytes-type fill_value.
203-
fill_value = base64.standard_b64encode(cast(bytes, self.fill_value)).decode("ascii")
204-
zarray_dict["fill_value"] = fill_value
205-
206198
_ = zarray_dict.pop("dtype")
207199
dtype_json: JSON
208200
# In the case of zarr v2, the simplest i.e., '|VXX' dtype is represented as a string
@@ -300,7 +292,26 @@ def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata:
300292
return data
301293

302294

303-
def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
295+
def _parse_structured_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
296+
"""Handle structured dtype/fill value pairs"""
297+
print("FILL VALUE", fill_value, "DT", dtype)
298+
try:
299+
if isinstance(fill_value, list):
300+
return np.array([tuple(fill_value)], dtype=dtype)[0]
301+
elif isinstance(fill_value, tuple):
302+
return np.array([fill_value], dtype=dtype)[0]
303+
elif isinstance(fill_value, bytes):
304+
return np.frombuffer(fill_value, dtype=dtype)[0]
305+
elif isinstance(fill_value, str):
306+
decoded = base64.standard_b64decode(fill_value)
307+
return np.frombuffer(decoded, dtype=dtype)[0]
308+
else:
309+
return np.array(fill_value, dtype=dtype)[()]
310+
except Exception as e:
311+
raise ValueError(f"Fill_value {fill_value} is not valid for dtype {dtype}.") from e
312+
313+
314+
def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
304315
"""
305316
Parse a potential fill value into a value that is compatible with the provided dtype.
306317
@@ -317,13 +328,16 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
317328
"""
318329

319330
if fill_value is None or dtype.hasobject:
320-
# no fill value
321331
pass
332+
elif dtype.fields is not None:
333+
# the dtype is structured (has multiple fields), so the fill_value might be a
334+
# compound value (e.g., a tuple or dict) that needs field-wise processing.
335+
# We use parse_structured_fill_value to correctly convert each component.
336+
fill_value = _parse_structured_fill_value(fill_value, dtype)
322337
elif not isinstance(fill_value, np.void) and fill_value == 0:
323338
# this should be compatible across numpy versions for any array type, including
324339
# structured arrays
325340
fill_value = np.zeros((), dtype=dtype)[()]
326-
327341
elif dtype.kind == "U":
328342
# special case unicode because of encoding issues on Windows if passed through numpy
329343
# https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713
@@ -332,6 +346,11 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
332346
raise ValueError(
333347
f"fill_value {fill_value!r} is not valid for dtype {dtype}; must be a unicode string"
334348
)
349+
elif dtype.kind in "SV" and isinstance(fill_value, str):
350+
fill_value = base64.standard_b64decode(fill_value)
351+
elif dtype.kind == "c" and isinstance(fill_value, list) and len(fill_value) == 2:
352+
complex_val = complex(float(fill_value[0]), float(fill_value[1]))
353+
fill_value = np.array(complex_val, dtype=dtype)[()]
335354
else:
336355
try:
337356
if isinstance(fill_value, bytes) and dtype.kind == "V":
@@ -347,6 +366,39 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
347366
return fill_value
348367

349368

369+
def _serialize_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> JSON:
370+
serialized: JSON
371+
372+
if fill_value is None:
373+
serialized = None
374+
elif dtype.kind in "SV":
375+
# There's a relationship between dtype and fill_value
376+
# that mypy isn't aware of. The fact that we have S or V dtype here
377+
# means we should have a bytes-type fill_value.
378+
serialized = base64.standard_b64encode(cast(bytes, fill_value)).decode("ascii")
379+
elif isinstance(fill_value, np.datetime64):
380+
serialized = np.datetime_as_string(fill_value)
381+
elif isinstance(fill_value, numbers.Integral):
382+
serialized = int(fill_value)
383+
elif isinstance(fill_value, numbers.Real):
384+
float_fv = float(fill_value)
385+
if np.isnan(float_fv):
386+
serialized = "NaN"
387+
elif np.isinf(float_fv):
388+
serialized = "Infinity" if float_fv > 0 else "-Infinity"
389+
else:
390+
serialized = float_fv
391+
elif isinstance(fill_value, numbers.Complex):
392+
serialized = [
393+
_serialize_fill_value(fill_value.real, dtype),
394+
_serialize_fill_value(fill_value.imag, dtype),
395+
]
396+
else:
397+
serialized = fill_value
398+
399+
return serialized
400+
401+
350402
def _default_fill_value(dtype: np.dtype[Any]) -> Any:
351403
"""
352404
Get the default fill value for a type.

src/zarr/testing/strategies.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import hypothesis.extra.numpy as npst
66
import hypothesis.strategies as st
77
import numpy as np
8-
from hypothesis import event, given, settings # noqa: F401
8+
from hypothesis import event
99
from hypothesis.strategies import SearchStrategy
1010

1111
import zarr

0 commit comments

Comments
 (0)