From 5cb6dd8f62ad6ed5391a08535dc05ef9ac50bbad Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 22 Dec 2024 20:31:44 +0100 Subject: [PATCH] default codec config now uses the full config dict --- src/zarr/api/asynchronous.py | 21 ++++---- src/zarr/core/array.py | 99 ++++++++++++++++++++---------------- src/zarr/core/config.py | 26 +++++++--- src/zarr/core/metadata/v2.py | 26 ++++++++-- tests/test_config.py | 47 ++++++++++++----- tests/test_v2.py | 33 ++++++++---- 6 files changed, 168 insertions(+), 84 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index a55d24555..c8125a964 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -18,14 +18,14 @@ ChunkCoords, MemoryOrder, ZarrFormat, + _default_zarr_version, _warn_order_kwarg, _warn_write_empty_chunks_kwarg, parse_dtype, ) -from zarr.core.common import _default_zarr_version from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata -from zarr.core.metadata.v2 import _default_filters_and_compressor +from zarr.core.metadata.v2 import _default_compressor, _default_filters from zarr.errors import NodeTypeValidationError from zarr.storage import ( StoreLike, @@ -886,8 +886,8 @@ async def create( If no codecs are provided, default codecs will be used: - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``. + - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. compressor : Codec, optional @@ -900,7 +900,8 @@ async def create( - For Unicode strings, the default is ``VLenUTF8Codec``. 
- For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. fill_value : object + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. + fill_value : object Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional Deprecated in favor of the ``config`` keyword argument. @@ -921,8 +922,8 @@ async def create( for storage of both chunks and metadata. filters : sequence of Codecs, optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. If neither ``compressor`` nor ``filters`` are provided, a default - compressor will be used. (see ``compressor`` for details). + V2 only. If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. cache_metadata : bool, optional If True, array configuration metadata will be cached for the lifetime of the object. 
If False, array metadata will be reloaded @@ -975,8 +976,10 @@ async def create( if chunks is None: chunks = shape dtype = parse_dtype(dtype, zarr_format) - if not filters and not compressor: - filters, compressor = _default_filters_and_compressor(dtype) + if not filters: + filters = _default_filters(dtype) + if not compressor: + compressor = _default_compressor(dtype) elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr] if chunks is not None: chunk_shape = chunks diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index fd3886a60..429fa4f74 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -18,7 +18,6 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete from zarr.codecs._v2 import V2Codec -from zarr.codecs.zstd import ZstdCodec from zarr.core._info import ArrayInfo from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, parse_array_config from zarr.core.attributes import Attributes @@ -87,7 +86,10 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.v2 import _default_filters_and_compressor +from zarr.core.metadata.v2 import ( + _default_compressor, + _default_filters, +) from zarr.core.metadata.v3 import DataType, parse_node_type_array from zarr.core.sync import sync from zarr.errors import MetadataValidationError @@ -438,8 +440,8 @@ async def create( If no codecs are provided, default codecs will be used: - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``. + - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. 
dimension_names : Iterable[str], optional @@ -460,14 +462,14 @@ async def create( order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` - nor ``filters`` are provided, a default compressor will be used. (see - ``compressor`` for details) + V2 only. V3 arrays should use ``codecs`` instead. If no ``filters`` + are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. compressor : dict[str, JSON], optional The compressor used to compress the data (default is None). V2 only. V3 arrays should use ``codecs`` instead. - If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + If no ``compressor`` is provided, a default compressor will be used: - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. @@ -677,8 +679,10 @@ async def _create_v2( dimension_separator = "." dtype = parse_dtype(dtype, zarr_format=2) - if not filters and not compressor: - filters, compressor = _default_filters_and_compressor(dtype) + if not filters: + filters = _default_filters(dtype) + if not compressor: + compressor = _default_compressor(dtype) if np.issubdtype(dtype, np.str_): filters = filters or [] if not any(x["id"] == "vlen-utf8" for x in filters): @@ -1572,8 +1576,8 @@ def create( If no codecs are provided, default codecs will be used: - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``. + - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``. 
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional @@ -1594,14 +1598,14 @@ def create( order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` - nor ``filters`` are provided, a default compressor will be used. (see - ``compressor`` for details) + V2 only. V3 arrays should use ``codecs`` instead. If no ``filters`` + are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. compressor : dict[str, JSON], optional Primary compressor to compress chunk data. V2 only. V3 arrays should use ``codecs`` instead. - If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + If no ``compressor`` is provided, a default compressor will be used: - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. @@ -3455,7 +3459,7 @@ def _get_default_codecs( else: dtype_key = "numeric" - return [{"name": codec_id, "configuration": {}} for codec_id in default_codecs[dtype_key]] + return default_codecs[dtype_key] FiltersParam: TypeAlias = ( @@ -3672,49 +3676,56 @@ def _get_default_encoding_v3( else: dtype_key = "numeric" - codec_names = default_codecs[dtype_key] - array_bytes_cls, *rest = tuple(get_codec_class(codec_name) for codec_name in codec_names) - array_bytes: ArrayBytesCodec = cast(ArrayBytesCodec, array_bytes_cls()) - # TODO: we should compress bytes and strings by default! - # The current default codecs only lists names, and strings / bytes are not compressed at all, - # so we insert the ZstdCodec at the end of the list as a default - bytes_bytes: tuple[BytesBytesCodec, ...] 
- array_array: tuple[ArrayArrayCodec, ...] = () - if len(rest) == 0: - bytes_bytes = (ZstdCodec(),) - else: - bytes_bytes = cast(tuple[BytesBytesCodec, ...], tuple(r() for r in rest)) + codec_dicts = default_codecs[dtype_key] + codecs = tuple(get_codec_class(c["name"]).from_dict(c) for c in codec_dicts) + array_bytes_maybe = None + array_array: list[ArrayArrayCodec] = [] + bytes_bytes: list[BytesBytesCodec] = [] + + for codec in codecs: + if isinstance(codec, ArrayBytesCodec): + if array_bytes_maybe is not None: + raise ValueError( + f"Got two instances of ArrayBytesCodec: {array_bytes_maybe} and {codec}. " + "Only one array-to-bytes codec is allowed." + ) + array_bytes_maybe = codec + elif isinstance(codec, ArrayArrayCodec): + array_array.append(codec) + elif isinstance(codec, BytesBytesCodec): + bytes_bytes.append(codec) + else: + raise TypeError(f"Unexpected codec type: {type(codec)}") - return array_array, array_bytes, bytes_bytes + if array_bytes_maybe is None: + raise ValueError("Required ArrayBytesCodec was not found.") + + return tuple(array_array), array_bytes_maybe, tuple(bytes_bytes) def _get_default_chunk_encoding_v2( dtype: np.dtype[Any], -) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec]: +) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec | None]: """ Get the default chunk encoding for zarr v2 arrays, given a dtype """ - codec_id_dict = zarr_config.get("array.v2_default_compressor") - if dtype.kind in "biufcmM": dtype_key = "numeric" - codec_type = "compressor" elif dtype.kind in "U": dtype_key = "string" - codec_type = "filter" elif dtype.kind in "OSV": dtype_key = "bytes" - codec_type = "filter" else: raise ValueError(f"Unsupported dtype kind {dtype.kind}") - codec_id = codec_id_dict[dtype_key] - codec_instance = numcodecs.get_codec({"id": codec_id}) - if codec_type == "compressor": - return (), codec_instance - elif codec_type == "filter": - return codec_instance, numcodecs.Zstd() - else: - raise 
ValueError(f"Unsupported codec type {codec_type}") + + compressor_dict = zarr_config.get("array.v2_default_compressor").get(dtype_key, None) + filter_dicts = zarr_config.get("array.v2_default_filters").get(dtype_key, []) + + compressor = None + if compressor_dict is not None: + compressor = numcodecs.get_codec(compressor_dict) + filters = tuple(numcodecs.get_codec(f) for f in filter_dicts) + return filters, compressor def _parse_chunk_encoding_v2( diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index a14305aef..739529a3f 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -67,14 +67,28 @@ def reset(self) -> None: "order": "C", "write_empty_chunks": False, "v2_default_compressor": { - "numeric": "zstd", - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "numeric": {"id": "zstd", "level": 0, "checksum": True}, + "string": {"id": "zstd", "level": 0, "checksum": True}, + "bytes": {"id": "zstd", "level": 0, "checksum": True}, + }, + "v2_default_filters": { + "numeric": [], + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, "v3_default_codecs": { - "numeric": ["bytes", "zstd"], - "string": ["vlen-utf8"], - "bytes": ["vlen-bytes"], + "numeric": [ + {"name": "bytes", "configuration": {"endian": "little"}}, + {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + ], + "string": [ + {"name": "vlen-utf8"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + ], + "bytes": [ + {"name": "vlen-bytes"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + ], }, }, "async": {"concurrency": 10, "timeout": None}, diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index b19683981..ddfc85a61 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -331,9 +331,9 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any: return dtype.type(0) -def _default_filters_and_compressor( +def _default_compressor( dtype: np.dtype[Any], -) -> 
tuple[list[dict[str, JSON]], dict[str, JSON] | None]: +) -> dict[str, JSON] | None: """Get the default filters and compressor for a dtype. https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html @@ -348,4 +348,24 @@ def _default_filters_and_compressor( else: raise ValueError(f"Unsupported dtype kind {dtype.kind}") - return [{"id": default_compressor[dtype_key]}], None + return default_compressor.get(dtype_key, None) + + +def _default_filters( + dtype: np.dtype[Any], +) -> list[dict[str, JSON]]: + """Get the default filters for a dtype. + + https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html + """ + default_filters = config.get("array.v2_default_filters") + if dtype.kind in "biufcmM": + dtype_key = "numeric" + elif dtype.kind in "U": + dtype_key = "string" + elif dtype.kind in "OSV": + dtype_key = "bytes" + else: + raise ValueError(f"Unsupported dtype kind {dtype.kind}") + + return default_filters.get(dtype_key, []) diff --git a/tests/test_config.py b/tests/test_config.py index ea8e70a99..d5a364dd1 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -54,14 +54,28 @@ def test_config_defaults_set() -> None: "order": "C", "write_empty_chunks": False, "v2_default_compressor": { - "numeric": "zstd", - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "numeric": {"id": "zstd", "level": 0, "checksum": True}, + "string": {"id": "zstd", "level": 0, "checksum": True}, + "bytes": {"id": "zstd", "level": 0, "checksum": True}, + }, + "v2_default_filters": { + "numeric": [], + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, "v3_default_codecs": { - "bytes": ["vlen-bytes"], - "numeric": ["bytes", "zstd"], - "string": ["vlen-utf8"], + "bytes": [ + {"name": "vlen-bytes"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + ], + "numeric": [ + {"name": "bytes", "configuration": {"endian": "little"}}, + {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + ], + "string": [ + 
{"name": "vlen-utf8"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + ], }, }, "async": {"concurrency": 10, "timeout": None}, @@ -291,17 +305,26 @@ class NewCodec2(BytesCodec): ("dtype", "expected_codecs"), [ ("int", [BytesCodec(), GzipCodec()]), - ("bytes", [VLenBytesCodec()]), - ("str", [VLenUTF8Codec()]), + ("bytes", [VLenBytesCodec(), GzipCodec()]), + ("str", [VLenUTF8Codec(), GzipCodec()]), ], ) async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None: with config.set( { - "array.v3_default_codecs": { - "numeric": ["bytes", "gzip"], # test setting non-standard codecs - "string": ["vlen-utf8"], - "bytes": ["vlen-bytes"], + "array.v3_default_codecs": { # test setting non-standard codecs + "numeric": [ + {"name": "bytes", "configuration": {"endian": "little"}}, + {"name": "gzip", "configuration": {"level": 5}}, + ], + "string": [ + {"name": "vlen-utf8"}, + {"name": "gzip", "configuration": {"level": 5}}, + ], + "bytes": [ + {"name": "vlen-bytes"}, + {"name": "gzip", "configuration": {"level": 5}}, + ], } } ): diff --git a/tests/test_v2.py b/tests/test_v2.py index e77edf56c..3cf4fecc7 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -82,7 +82,7 @@ def test_codec_pipeline() -> None: @pytest.mark.parametrize("dtype", ["|S", "|V"]) async def test_v2_encode_decode(dtype): - with config.set({"array.v2_default_compressor.bytes": "vlen-bytes"}): + with config.set({"array.v2_default_compressor.bytes": {"id": "vlen-bytes"}}): store = zarr.storage.MemoryStore() g = zarr.group(store=store, zarr_format=2) g.create_array( @@ -120,9 +120,9 @@ def test_v2_encode_decode_with_data(dtype_value): dtype, value = dtype_value with config.set( { - "array.v2_default_compressor": { - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "array.v2_default_filters": { + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, } ): @@ -210,18 +210,31 @@ def test_default_compressor_deprecation_warning(): @pytest.mark.parametrize( 
"dtype_expected", - [["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-bytes"], ["|U1", "vlen-utf8"]], + [ + ["b", "zstd", None], + ["i", "zstd", None], + ["f", "zstd", None], + ["|S1", "zstd", "vlen-bytes"], + ["|U1", "zstd", "vlen-utf8"], + ], ) def test_default_filters_and_compressor(dtype_expected: Any) -> None: with config.set( { "array.v2_default_compressor": { - "numeric": "zstd", - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "numeric": {"id": "zstd", "level": "0"}, + "string": {"id": "zstd", "level": "0"}, + "bytes": {"id": "zstd", "level": "0"}, + }, + "array.v2_default_filters": { + "numeric": [], + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, } ): - dtype, expected = dtype_expected + dtype, expected_compressor, expected_filter = dtype_expected arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype) - assert arr.metadata.filters[0].codec_id == expected + assert arr.metadata.compressor.codec_id == expected_compressor + if expected_filter is not None: + assert arr.metadata.filters[0].codec_id == expected_filter