Skip to content

Commit 22634ea

Browse files
authored
Separate defaults for filters, serializers and compressors in v3 (#2653)
1 parent 4d252a2 commit 22634ea

File tree

7 files changed

+108
-147
lines changed

7 files changed

+108
-147
lines changed

docs/user-guide/config.rst

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ Configuration options include the following:
2828

2929
- Default Zarr format ``default_zarr_version``
3030
- Default array order in memory ``array.order``
31-
- Default codecs ``array.v3_default_codecs`` and ``array.v2_default_compressor``
31+
- Default filters, serializers and compressors, e.g. ``array.v3_default_filters``, ``array.v3_default_serializer``, ``array.v3_default_compressors``, ``array.v2_default_filters`` and ``array.v2_default_compressor``
3232
- Whether empty chunks are written to storage ``array.write_empty_chunks``
3333
- Async and threading options, e.g. ``async.concurrency`` and ``threading.max_workers``
3434
- Selections of implementations of codecs, codec pipelines and buffers
@@ -54,19 +54,20 @@ This is the current default configuration::
5454
'v2_default_filters': {'bytes': [{'id': 'vlen-bytes'}],
5555
'numeric': None,
5656
'string': [{'id': 'vlen-utf8'}]},
57-
'v3_default_codecs': {'bytes': [{'name': 'vlen-bytes'},
58-
{'configuration': {'checksum': False,
59-
'level': 0},
60-
'name': 'zstd'}],
61-
'numeric': [{'configuration': {'endian': 'little'},
62-
'name': 'bytes'},
63-
{'configuration': {'checksum': False,
64-
'level': 0},
65-
'name': 'zstd'}],
66-
'string': [{'name': 'vlen-utf8'},
67-
{'configuration': {'checksum': False,
68-
'level': 0},
69-
'name': 'zstd'}]},
57+
'v3_default_compressors': {'bytes': [{'configuration': {'checksum': False,
58+
'level': 0},
59+
'name': 'zstd'}],
60+
'numeric': [{'configuration': {'checksum': False,
61+
'level': 0},
62+
'name': 'zstd'}],
63+
'string': [{'configuration': {'checksum': False,
64+
'level': 0},
65+
'name': 'zstd'}]},
66+
'v3_default_filters': {'bytes': [], 'numeric': [], 'string': []},
67+
'v3_default_serializer': {'bytes': {'name': 'vlen-bytes'},
68+
'numeric': {'configuration': {'endian': 'little'},
69+
'name': 'bytes'},
70+
'string': {'name': 'vlen-utf8'}},
7071
'write_empty_chunks': False},
7172
'async': {'concurrency': 10, 'timeout': None},
7273
'buffer': 'zarr.core.buffer.cpu.Buffer',

src/zarr/api/asynchronous.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -892,7 +892,8 @@ async def create(
892892
- For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
893893
- For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
894894
895-
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
895+
These defaults can be changed by modifying the value of ``array.v3_default_filters``,
896+
``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`.
896897
compressor : Codec, optional
897898
Primary compressor to compress chunk data.
898899
Zarr format 2 only. Zarr format 3 arrays should use ``codecs`` instead.

src/zarr/api/synchronous.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -788,9 +788,8 @@ def create_array(
788788
For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
789789
and these values must be instances of ``ArrayArrayCodec``, or dict representations
790790
of ``ArrayArrayCodec``.
791-
If ``filters`` and ``compressors`` are not specified, then the default codecs for
792-
Zarr format 3 will be used.
793-
These defaults can be changed by modifying the value of ``array.v3_default_codecs``
791+
If no ``filters`` are provided, a default set of filters will be used.
792+
These defaults can be changed by modifying the value of ``array.v3_default_filters``
794793
in :mod:`zarr.core.config`.
795794
Use ``None`` to omit default filters.
796795
@@ -806,22 +805,22 @@ def create_array(
806805
807806
For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
808807
returns another bytestream. Multiple compressors may be provided for Zarr format 3.
809-
If ``filters`` and ``compressors`` are not specified, then the default codecs for
810-
Zarr format 3 will be used.
811-
These defaults can be changed by modifying the value of ``array.v3_default_codecs``
808+
If no ``compressors`` are provided, a default set of compressors will be used.
809+
These defaults can be changed by modifying the value of ``array.v3_default_compressors``
812810
in :mod:`zarr.core.config`.
813811
Use ``None`` to omit default compressors.
814812
815813
For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may
816814
be provided for Zarr format 2.
817-
If no ``compressors`` are provided, a default compressor will be used.
818-
These defaults can be changed by modifying the value of ``array.v2_default_compressor``
815+
If no ``compressor`` is provided, a default compressor will be used. These defaults can be changed by modifying the value of ``array.v2_default_compressor``
819816
in :mod:`zarr.core.config`.
820817
Use ``None`` to omit the default compressor.
821818
serializer : dict[str, JSON] | ArrayBytesCodec, optional
822819
Array-to-bytes codec to use for encoding the array data.
823820
Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion.
824-
If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
821+
If no ``serializer`` is provided, a default serializer will be used.
822+
These defaults can be changed by modifying the value of ``array.v3_default_serializer``
823+
in :mod:`zarr.core.config`.
825824
fill_value : Any, optional
826825
Fill value for the array.
827826
order : {"C", "F"}, optional

src/zarr/core/array.py

Lines changed: 37 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,6 @@
110110
_parse_array_array_codec,
111111
_parse_array_bytes_codec,
112112
_parse_bytes_bytes_codec,
113-
_resolve_codec,
114113
get_pipeline_class,
115114
)
116115
from zarr.storage import StoreLike, make_store_path
@@ -469,7 +468,8 @@ async def create(
469468
- For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
470469
- For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
471470
472-
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
471+
These defaults can be changed by modifying the value of ``array.v3_default_filters``,
472+
``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`.
473473
dimension_names : Iterable[str], optional
474474
The names of the dimensions (default is None).
475475
Zarr format 3 only. Zarr format 2 arrays should not use this parameter.
@@ -1715,7 +1715,8 @@ def create(
17151715
- For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
17161716
- For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
17171717
1718-
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
1718+
These defaults can be changed by modifying the value of ``array.v3_default_filters``,
1719+
``array.v3_default_serializer`` and ``array.v3_default_compressors`` in :mod:`zarr.core.config`.
17191720
dimension_names : Iterable[str], optional
17201721
The names of the dimensions (default is None).
17211722
Zarr format 3 only. Zarr format 2 arrays should not use this parameter.
@@ -3698,17 +3699,9 @@ def _build_parents(
36983699

36993700
def _get_default_codecs(
37003701
np_dtype: np.dtype[Any],
3701-
) -> list[dict[str, JSON]]:
3702-
default_codecs = zarr_config.get("array.v3_default_codecs")
3703-
dtype = DataType.from_numpy(np_dtype)
3704-
if dtype == DataType.string:
3705-
dtype_key = "string"
3706-
elif dtype == DataType.bytes:
3707-
dtype_key = "bytes"
3708-
else:
3709-
dtype_key = "numeric"
3710-
3711-
return cast(list[dict[str, JSON]], default_codecs[dtype_key])
3702+
) -> tuple[Codec, ...]:
3703+
filters, serializer, compressors = _get_default_chunk_encoding_v3(np_dtype)
3704+
return filters + (serializer,) + compressors
37123705

37133706

37143707
FiltersLike: TypeAlias = (
@@ -3785,9 +3778,8 @@ async def create_array(
37853778
For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
37863779
and these values must be instances of ``ArrayArrayCodec``, or dict representations
37873780
of ``ArrayArrayCodec``.
3788-
If ``filters`` and ``compressors`` are not specified, then the default codecs for
3789-
Zarr format 3 will be used.
3790-
These defaults can be changed by modifying the value of ``array.v3_default_codecs``
3781+
If no ``filters`` are provided, a default set of filters will be used.
3782+
These defaults can be changed by modifying the value of ``array.v3_default_filters``
37913783
in :mod:`zarr.core.config`.
37923784
Use ``None`` to omit default filters.
37933785
@@ -3803,22 +3795,22 @@ async def create_array(
38033795
38043796
For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
38053797
returns another bytestream. Multiple compressors may be provided for Zarr format 3.
3806-
If ``filters`` and ``compressors`` are not specified, then the default codecs for
3807-
Zarr format 3 will be used.
3808-
These defaults can be changed by modifying the value of ``array.v3_default_codecs``
3798+
If no ``compressors`` are provided, a default set of compressors will be used.
3799+
These defaults can be changed by modifying the value of ``array.v3_default_compressors``
38093800
in :mod:`zarr.core.config`.
38103801
Use ``None`` to omit default compressors.
38113802
38123803
For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may
38133804
be provided for Zarr format 2.
3814-
If no ``compressors`` are provided, a default compressor will be used.
3815-
These defaults can be changed by modifying the value of ``array.v2_default_compressor``
3805+
If no ``compressor`` is provided, a default compressor will be used. These defaults can be changed by modifying the value of ``array.v2_default_compressor``
38163806
in :mod:`zarr.core.config`.
38173807
Use ``None`` to omit the default compressor.
38183808
serializer : dict[str, JSON] | ArrayBytesCodec, optional
38193809
Array-to-bytes codec to use for encoding the array data.
38203810
Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion.
3821-
If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
3811+
If no ``serializer`` is provided, a default serializer will be used.
3812+
These defaults can be changed by modifying the value of ``array.v3_default_serializer``
3813+
in :mod:`zarr.core.config`.
38223814
fill_value : Any, optional
38233815
Fill value for the array.
38243816
order : {"C", "F"}, optional
@@ -3997,7 +3989,6 @@ def _get_default_chunk_encoding_v3(
39973989
"""
39983990
Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype.
39993991
"""
4000-
default_codecs = zarr_config.get("array.v3_default_codecs")
40013992
dtype = DataType.from_numpy(np_dtype)
40023993
if dtype == DataType.string:
40033994
dtype_key = "string"
@@ -4006,31 +3997,15 @@ def _get_default_chunk_encoding_v3(
40063997
else:
40073998
dtype_key = "numeric"
40083999

4009-
codec_dicts = default_codecs[dtype_key]
4010-
codecs = tuple(_resolve_codec(c) for c in codec_dicts)
4011-
array_bytes_maybe = None
4012-
array_array: list[ArrayArrayCodec] = []
4013-
bytes_bytes: list[BytesBytesCodec] = []
4014-
4015-
for codec in codecs:
4016-
if isinstance(codec, ArrayBytesCodec):
4017-
if array_bytes_maybe is not None:
4018-
raise ValueError(
4019-
f"Got two instances of ArrayBytesCodec: {array_bytes_maybe} and {codec}. "
4020-
"Only one array-to-bytes codec is allowed."
4021-
)
4022-
array_bytes_maybe = codec
4023-
elif isinstance(codec, ArrayArrayCodec):
4024-
array_array.append(codec)
4025-
elif isinstance(codec, BytesBytesCodec):
4026-
bytes_bytes.append(codec)
4027-
else:
4028-
raise TypeError(f"Unexpected codec type: {type(codec)}")
4000+
default_filters = zarr_config.get("array.v3_default_filters").get(dtype_key)
4001+
default_serializer = zarr_config.get("array.v3_default_serializer").get(dtype_key)
4002+
default_compressors = zarr_config.get("array.v3_default_compressors").get(dtype_key)
40294003

4030-
if array_bytes_maybe is None:
4031-
raise ValueError("Required ArrayBytesCodec was not found.")
4004+
filters = tuple(_parse_array_array_codec(codec_dict) for codec_dict in default_filters)
4005+
serializer = _parse_array_bytes_codec(default_serializer)
4006+
compressors = tuple(_parse_bytes_bytes_codec(codec_dict) for codec_dict in default_compressors)
40324007

4033-
return tuple(array_array), array_bytes_maybe, tuple(bytes_bytes)
4008+
return filters, serializer, compressors
40344009

40354010

40364011
def _get_default_chunk_encoding_v2(
@@ -4111,34 +4086,15 @@ def _parse_chunk_encoding_v3(
41114086
default_array_array, default_array_bytes, default_bytes_bytes = _get_default_chunk_encoding_v3(
41124087
dtype
41134088
)
4114-
maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]]
4115-
maybe_array_array: Iterable[Codec | dict[str, JSON]]
4116-
out_bytes_bytes: tuple[BytesBytesCodec, ...]
4117-
if compressors is None:
4118-
out_bytes_bytes = ()
4119-
4120-
elif compressors == "auto":
4121-
out_bytes_bytes = default_bytes_bytes
41224089

4123-
else:
4124-
if isinstance(compressors, dict | Codec):
4125-
maybe_bytes_bytes = (compressors,)
4126-
elif compressors is None:
4127-
maybe_bytes_bytes = ()
4128-
else:
4129-
maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors)
4130-
4131-
out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes)
4132-
out_array_array: tuple[ArrayArrayCodec, ...]
41334090
if filters is None:
4134-
out_array_array = ()
4091+
out_array_array: tuple[ArrayArrayCodec, ...] = ()
41354092
elif filters == "auto":
41364093
out_array_array = default_array_array
41374094
else:
4095+
maybe_array_array: Iterable[Codec | dict[str, JSON]]
41384096
if isinstance(filters, dict | Codec):
41394097
maybe_array_array = (filters,)
4140-
elif filters is None:
4141-
maybe_array_array = ()
41424098
else:
41434099
maybe_array_array = cast(Iterable[Codec | dict[str, JSON]], filters)
41444100
out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array)
@@ -4148,6 +4104,19 @@ def _parse_chunk_encoding_v3(
41484104
else:
41494105
out_array_bytes = _parse_array_bytes_codec(serializer)
41504106

4107+
if compressors is None:
4108+
out_bytes_bytes: tuple[BytesBytesCodec, ...] = ()
4109+
elif compressors == "auto":
4110+
out_bytes_bytes = default_bytes_bytes
4111+
else:
4112+
maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]]
4113+
if isinstance(compressors, dict | Codec):
4114+
maybe_bytes_bytes = (compressors,)
4115+
else:
4116+
maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors)
4117+
4118+
out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes)
4119+
41514120
return out_array_array, out_array_bytes, out_bytes_bytes
41524121

41534122

src/zarr/core/config.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,17 +76,20 @@ def reset(self) -> None:
7676
"string": [{"id": "vlen-utf8"}],
7777
"bytes": [{"id": "vlen-bytes"}],
7878
},
79-
"v3_default_codecs": {
79+
"v3_default_filters": {"numeric": [], "string": [], "bytes": []},
80+
"v3_default_serializer": {
81+
"numeric": {"name": "bytes", "configuration": {"endian": "little"}},
82+
"string": {"name": "vlen-utf8"},
83+
"bytes": {"name": "vlen-bytes"},
84+
},
85+
"v3_default_compressors": {
8086
"numeric": [
81-
{"name": "bytes", "configuration": {"endian": "little"}},
8287
{"name": "zstd", "configuration": {"level": 0, "checksum": False}},
8388
],
8489
"string": [
85-
{"name": "vlen-utf8"},
8690
{"name": "zstd", "configuration": {"level": 0, "checksum": False}},
8791
],
8892
"bytes": [
89-
{"name": "vlen-bytes"},
9093
{"name": "zstd", "configuration": {"level": 0, "checksum": False}},
9194
],
9295
},

0 commit comments

Comments
 (0)