Skip to content

Commit 5cb6dd8

Browse files
committed
default codec config now uses the full config dict
1 parent ae1832d commit 5cb6dd8

File tree

6 files changed

+168
-84
lines changed

6 files changed

+168
-84
lines changed

src/zarr/api/asynchronous.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,14 @@
1818
ChunkCoords,
1919
MemoryOrder,
2020
ZarrFormat,
21+
_default_zarr_version,
2122
_warn_order_kwarg,
2223
_warn_write_empty_chunks_kwarg,
2324
parse_dtype,
2425
)
25-
from zarr.core.common import _default_zarr_version
2626
from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata
2727
from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata
28-
from zarr.core.metadata.v2 import _default_filters_and_compressor
28+
from zarr.core.metadata.v2 import _default_compressor, _default_filters
2929
from zarr.errors import NodeTypeValidationError
3030
from zarr.storage import (
3131
StoreLike,
@@ -886,8 +886,8 @@ async def create(
886886
If no codecs are provided, default codecs will be used:
887887
888888
- For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
889-
- For Unicode strings, the default is ``VLenUTF8Codec``.
890-
- For bytes or objects, the default is ``VLenBytesCodec``.
889+
- For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
890+
- For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
891891
892892
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
893893
compressor : Codec, optional
@@ -900,7 +900,8 @@ async def create(
900900
- For Unicode strings, the default is ``ZstdCodec``; ``VLenUTF8Codec`` is applied as a default filter instead.
901901
- For bytes or objects, the default is ``ZstdCodec``; ``VLenBytesCodec`` is applied as a default filter instead.
902902
903-
These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. fill_value : object
903+
These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
904+
fill_value : object
904905
Default value to use for uninitialized portions of the array.
905906
order : {'C', 'F'}, optional
906907
Deprecated in favor of the ``config`` keyword argument.
@@ -921,8 +922,8 @@ async def create(
921922
for storage of both chunks and metadata.
922923
filters : sequence of Codecs, optional
923924
Sequence of filters to use to encode chunk data prior to compression.
924-
V2 only. If neither ``compressor`` nor ``filters`` are provided, a default
925-
compressor will be used. (see ``compressor`` for details).
925+
V2 only. If no ``filters`` are provided, a default set of filters will be used.
926+
These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`.
926927
cache_metadata : bool, optional
927928
If True, array configuration metadata will be cached for the
928929
lifetime of the object. If False, array metadata will be reloaded
@@ -975,8 +976,10 @@ async def create(
975976
if chunks is None:
976977
chunks = shape
977978
dtype = parse_dtype(dtype, zarr_format)
978-
if not filters and not compressor:
979-
filters, compressor = _default_filters_and_compressor(dtype)
979+
if not filters:
980+
filters = _default_filters(dtype)
981+
if not compressor:
982+
compressor = _default_compressor(dtype)
980983
elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr]
981984
if chunks is not None:
982985
chunk_shape = chunks

src/zarr/core/array.py

Lines changed: 55 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec
1919
from zarr.abc.store import Store, set_or_delete
2020
from zarr.codecs._v2 import V2Codec
21-
from zarr.codecs.zstd import ZstdCodec
2221
from zarr.core._info import ArrayInfo
2322
from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, parse_array_config
2423
from zarr.core.attributes import Attributes
@@ -87,7 +86,10 @@
8786
ArrayV3MetadataDict,
8887
T_ArrayMetadata,
8988
)
90-
from zarr.core.metadata.v2 import _default_filters_and_compressor
89+
from zarr.core.metadata.v2 import (
90+
_default_compressor,
91+
_default_filters,
92+
)
9193
from zarr.core.metadata.v3 import DataType, parse_node_type_array
9294
from zarr.core.sync import sync
9395
from zarr.errors import MetadataValidationError
@@ -438,8 +440,8 @@ async def create(
438440
If no codecs are provided, default codecs will be used:
439441
440442
- For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
441-
- For Unicode strings, the default is ``VLenUTF8Codec``.
442-
- For bytes or objects, the default is ``VLenBytesCodec``.
443+
- For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
444+
- For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
443445
444446
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
445447
dimension_names : Iterable[str], optional
@@ -460,14 +462,14 @@ async def create(
460462
order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
461463
filters : list[dict[str, JSON]], optional
462464
Sequence of filters to use to encode chunk data prior to compression.
463-
V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
464-
nor ``filters`` are provided, a default compressor will be used. (see
465-
``compressor`` for details)
465+
V2 only. V3 arrays should use ``codecs`` instead. If no ``filters``
466+
are provided, a default set of filters will be used.
467+
These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`.
466468
compressor : dict[str, JSON], optional
467469
The compressor used to compress the data (default is None).
468470
V2 only. V3 arrays should use ``codecs`` instead.
469471
470-
If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
472+
If no ``compressor`` is provided, a default compressor will be used:
471473
472474
- For numeric arrays, the default is ``ZstdCodec``.
473475
- For Unicode strings, the default is ``ZstdCodec``; ``VLenUTF8Codec`` is applied as a default filter instead.
@@ -677,8 +679,10 @@ async def _create_v2(
677679
dimension_separator = "."
678680

679681
dtype = parse_dtype(dtype, zarr_format=2)
680-
if not filters and not compressor:
681-
filters, compressor = _default_filters_and_compressor(dtype)
682+
if not filters:
683+
filters = _default_filters(dtype)
684+
if not compressor:
685+
compressor = _default_compressor(dtype)
682686
if np.issubdtype(dtype, np.str_):
683687
filters = filters or []
684688
if not any(x["id"] == "vlen-utf8" for x in filters):
@@ -1572,8 +1576,8 @@ def create(
15721576
If no codecs are provided, default codecs will be used:
15731577
15741578
- For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
1575-
- For Unicode strings, the default is ``VLenUTF8Codec``.
1576-
- For bytes or objects, the default is ``VLenBytesCodec``.
1579+
- For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
1580+
- For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
15771581
15781582
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
15791583
dimension_names : Iterable[str], optional
@@ -1594,14 +1598,14 @@ def create(
15941598
order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
15951599
filters : list[dict[str, JSON]], optional
15961600
Sequence of filters to use to encode chunk data prior to compression.
1597-
V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
1598-
nor ``filters`` are provided, a default compressor will be used. (see
1599-
``compressor`` for details)
1601+
V2 only. V3 arrays should use ``codecs`` instead. If no ``filters``
1602+
are provided, a default set of filters will be used.
1603+
These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`.
16001604
compressor : dict[str, JSON], optional
16011605
Primary compressor to compress chunk data.
16021606
V2 only. V3 arrays should use ``codecs`` instead.
16031607
1604-
If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
1608+
If no ``compressor`` is provided, a default compressor will be used:
16051609
16061610
- For numeric arrays, the default is ``ZstdCodec``.
16071611
- For Unicode strings, the default is ``ZstdCodec``; ``VLenUTF8Codec`` is applied as a default filter instead.
@@ -3455,7 +3459,7 @@ def _get_default_codecs(
34553459
else:
34563460
dtype_key = "numeric"
34573461

3458-
return [{"name": codec_id, "configuration": {}} for codec_id in default_codecs[dtype_key]]
3462+
return default_codecs[dtype_key]
34593463

34603464

34613465
FiltersParam: TypeAlias = (
@@ -3672,49 +3676,56 @@ def _get_default_encoding_v3(
36723676
else:
36733677
dtype_key = "numeric"
36743678

3675-
codec_names = default_codecs[dtype_key]
3676-
array_bytes_cls, *rest = tuple(get_codec_class(codec_name) for codec_name in codec_names)
3677-
array_bytes: ArrayBytesCodec = cast(ArrayBytesCodec, array_bytes_cls())
3678-
# TODO: we should compress bytes and strings by default!
3679-
# The current default codecs only lists names, and strings / bytes are not compressed at all,
3680-
# so we insert the ZstdCodec at the end of the list as a default
3681-
bytes_bytes: tuple[BytesBytesCodec, ...]
3682-
array_array: tuple[ArrayArrayCodec, ...] = ()
3683-
if len(rest) == 0:
3684-
bytes_bytes = (ZstdCodec(),)
3685-
else:
3686-
bytes_bytes = cast(tuple[BytesBytesCodec, ...], tuple(r() for r in rest))
3679+
codec_dicts = default_codecs[dtype_key]
3680+
codecs = tuple(get_codec_class(c["name"]).from_dict(c) for c in codec_dicts)
3681+
array_bytes_maybe = None
3682+
array_array: list[ArrayArrayCodec] = []
3683+
bytes_bytes: list[BytesBytesCodec] = []
3684+
3685+
for codec in codecs:
3686+
if isinstance(codec, ArrayBytesCodec):
3687+
if array_bytes_maybe is not None:
3688+
raise ValueError(
3689+
f"Got two instances of ArrayBytesCodec: {array_bytes_maybe} and {codec}. "
3690+
"Only one array-to-bytes codec is allowed."
3691+
)
3692+
array_bytes_maybe = codec
3693+
elif isinstance(codec, ArrayArrayCodec):
3694+
array_array.append(codec)
3695+
elif isinstance(codec, BytesBytesCodec):
3696+
bytes_bytes.append(codec)
3697+
else:
3698+
raise TypeError(f"Unexpected codec type: {type(codec)}")
36873699

3688-
return array_array, array_bytes, bytes_bytes
3700+
if array_bytes_maybe is None:
3701+
raise ValueError("Required ArrayBytesCodec was not found.")
3702+
3703+
return tuple(array_array), array_bytes_maybe, tuple(bytes_bytes)
36893704

36903705

36913706
def _get_default_chunk_encoding_v2(
36923707
dtype: np.dtype[Any],
3693-
) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec]:
3708+
) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec | None]:
36943709
"""
36953710
Get the default chunk encoding for zarr v2 arrays, given a dtype
36963711
"""
3697-
codec_id_dict = zarr_config.get("array.v2_default_compressor")
3698-
36993712
if dtype.kind in "biufcmM":
37003713
dtype_key = "numeric"
3701-
codec_type = "compressor"
37023714
elif dtype.kind in "U":
37033715
dtype_key = "string"
3704-
codec_type = "filter"
37053716
elif dtype.kind in "OSV":
37063717
dtype_key = "bytes"
3707-
codec_type = "filter"
37083718
else:
37093719
raise ValueError(f"Unsupported dtype kind {dtype.kind}")
3710-
codec_id = codec_id_dict[dtype_key]
3711-
codec_instance = numcodecs.get_codec({"id": codec_id})
3712-
if codec_type == "compressor":
3713-
return (), codec_instance
3714-
elif codec_type == "filter":
3715-
return codec_instance, numcodecs.Zstd()
3716-
else:
3717-
raise ValueError(f"Unsupported codec type {codec_type}")
3720+
3721+
compressor_dict = zarr_config.get("array.v2_default_compressor").get(dtype_key, None)
3722+
filter_dicts = zarr_config.get("array.v2_default_filters").get(dtype_key, [])
3723+
3724+
compressor = None
3725+
if compressor_dict is not None:
3726+
compressor = numcodecs.get_codec(compressor_dict)
3727+
filters = tuple(numcodecs.get_codec(f) for f in filter_dicts)
3728+
return filters, compressor
37183729

37193730

37203731
def _parse_chunk_encoding_v2(

src/zarr/core/config.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,14 +67,28 @@ def reset(self) -> None:
6767
"order": "C",
6868
"write_empty_chunks": False,
6969
"v2_default_compressor": {
70-
"numeric": "zstd",
71-
"string": "vlen-utf8",
72-
"bytes": "vlen-bytes",
70+
"numeric": {"id": "zstd", "level": 0, "checksum": True},
71+
"string": {"id": "zstd", "level": 0, "checksum": True},
72+
"bytes": {"id": "zstd", "level": 0, "checksum": True},
73+
},
74+
"v2_default_filters": {
75+
"numeric": [],
76+
"string": [{"id": "vlen-utf8"}],
77+
"bytes": [{"id": "vlen-bytes"}],
7378
},
7479
"v3_default_codecs": {
75-
"numeric": ["bytes", "zstd"],
76-
"string": ["vlen-utf8"],
77-
"bytes": ["vlen-bytes"],
80+
"numeric": [
81+
{"name": "bytes", "configuration": {"endian": "little"}},
82+
{"name": "zstd", "configuration": {"level": 0, "checksum": True}},
83+
],
84+
"string": [
85+
{"name": "vlen-utf8"},
86+
{"name": "zstd", "configuration": {"level": 0, "checksum": True}},
87+
],
88+
"bytes": [
89+
{"name": "vlen-bytes"},
90+
{"name": "zstd", "configuration": {"level": 0, "checksum": True}},
91+
],
7892
},
7993
},
8094
"async": {"concurrency": 10, "timeout": None},

src/zarr/core/metadata/v2.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -331,9 +331,9 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any:
331331
return dtype.type(0)
332332

333333

334-
def _default_filters_and_compressor(
334+
def _default_compressor(
335335
dtype: np.dtype[Any],
336-
) -> tuple[list[dict[str, JSON]], dict[str, JSON] | None]:
336+
) -> dict[str, JSON] | None:
337337
"""Get the default compressor for a dtype.
338338
339339
https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html
@@ -348,4 +348,24 @@ def _default_filters_and_compressor(
348348
else:
349349
raise ValueError(f"Unsupported dtype kind {dtype.kind}")
350350

351-
return [{"id": default_compressor[dtype_key]}], None
351+
return default_compressor.get(dtype_key, None)
352+
353+
354+
def _default_filters(
355+
dtype: np.dtype[Any],
356+
) -> list[dict[str, JSON]]:
357+
"""Get the default filters for a dtype.
358+
359+
https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html
360+
"""
361+
default_filters = config.get("array.v2_default_filters")
362+
if dtype.kind in "biufcmM":
363+
dtype_key = "numeric"
364+
elif dtype.kind in "U":
365+
dtype_key = "string"
366+
elif dtype.kind in "OSV":
367+
dtype_key = "bytes"
368+
else:
369+
raise ValueError(f"Unsupported dtype kind {dtype.kind}")
370+
371+
return default_filters.get(dtype_key, [])

tests/test_config.py

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,28 @@ def test_config_defaults_set() -> None:
5454
"order": "C",
5555
"write_empty_chunks": False,
5656
"v2_default_compressor": {
57-
"numeric": "zstd",
58-
"string": "vlen-utf8",
59-
"bytes": "vlen-bytes",
57+
"numeric": {"id": "zstd", "level": 0, "checksum": True},
58+
"string": {"id": "zstd", "level": 0, "checksum": True},
59+
"bytes": {"id": "zstd", "level": 0, "checksum": True},
60+
},
61+
"v2_default_filters": {
62+
"numeric": [],
63+
"string": [{"id": "vlen-utf8"}],
64+
"bytes": [{"id": "vlen-bytes"}],
6065
},
6166
"v3_default_codecs": {
62-
"bytes": ["vlen-bytes"],
63-
"numeric": ["bytes", "zstd"],
64-
"string": ["vlen-utf8"],
67+
"bytes": [
68+
{"name": "vlen-bytes"},
69+
{"name": "zstd", "configuration": {"level": 0, "checksum": True}},
70+
],
71+
"numeric": [
72+
{"name": "bytes", "configuration": {"endian": "little"}},
73+
{"name": "zstd", "configuration": {"level": 0, "checksum": True}},
74+
],
75+
"string": [
76+
{"name": "vlen-utf8"},
77+
{"name": "zstd", "configuration": {"level": 0, "checksum": True}},
78+
],
6579
},
6680
},
6781
"async": {"concurrency": 10, "timeout": None},
@@ -291,17 +305,26 @@ class NewCodec2(BytesCodec):
291305
("dtype", "expected_codecs"),
292306
[
293307
("int", [BytesCodec(), GzipCodec()]),
294-
("bytes", [VLenBytesCodec()]),
295-
("str", [VLenUTF8Codec()]),
308+
("bytes", [VLenBytesCodec(), GzipCodec()]),
309+
("str", [VLenUTF8Codec(), GzipCodec()]),
296310
],
297311
)
298312
async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None:
299313
with config.set(
300314
{
301-
"array.v3_default_codecs": {
302-
"numeric": ["bytes", "gzip"], # test setting non-standard codecs
303-
"string": ["vlen-utf8"],
304-
"bytes": ["vlen-bytes"],
315+
"array.v3_default_codecs": { # test setting non-standard codecs
316+
"numeric": [
317+
{"name": "bytes", "configuration": {"endian": "little"}},
318+
{"name": "gzip", "configuration": {"level": 5}},
319+
],
320+
"string": [
321+
{"name": "vlen-utf8"},
322+
{"name": "gzip", "configuration": {"level": 5}},
323+
],
324+
"bytes": [
325+
{"name": "vlen-bytes"},
326+
{"name": "gzip", "configuration": {"level": 5}},
327+
],
305328
}
306329
}
307330
):

0 commit comments

Comments
 (0)