Skip to content

Commit

Permalink
default codec config now uses the full config dict
Browse files Browse the repository at this point in the history
  • Loading branch information
normanrz committed Dec 22, 2024
1 parent ae1832d commit 5cb6dd8
Show file tree
Hide file tree
Showing 6 changed files with 168 additions and 84 deletions.
21 changes: 12 additions & 9 deletions src/zarr/api/asynchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@
ChunkCoords,
MemoryOrder,
ZarrFormat,
_default_zarr_version,
_warn_order_kwarg,
_warn_write_empty_chunks_kwarg,
parse_dtype,
)
from zarr.core.common import _default_zarr_version
from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata
from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata
from zarr.core.metadata.v2 import _default_filters_and_compressor
from zarr.core.metadata.v2 import _default_compressor, _default_filters
from zarr.errors import NodeTypeValidationError
from zarr.storage import (
StoreLike,
Expand Down Expand Up @@ -886,8 +886,8 @@ async def create(
If no codecs are provided, default codecs will be used:
- For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
- For Unicode strings, the default is ``VLenUTF8Codec``.
- For bytes or objects, the default is ``VLenBytesCodec``.
- For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
- For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
compressor : Codec, optional
Expand All @@ -900,7 +900,8 @@ async def create(
- For Unicode strings, the default is ``VLenUTF8Codec``.
- For bytes or objects, the default is ``VLenBytesCodec``.
These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. fill_value : object
These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
fill_value : object
Default value to use for uninitialized portions of the array.
order : {'C', 'F'}, optional
Deprecated in favor of the ``config`` keyword argument.
Expand All @@ -921,8 +922,8 @@ async def create(
for storage of both chunks and metadata.
filters : sequence of Codecs, optional
Sequence of filters to use to encode chunk data prior to compression.
V2 only. If neither ``compressor`` nor ``filters`` are provided, a default
compressor will be used. (see ``compressor`` for details).
V2 only. If no ``filters`` are provided, a default set of filters will be used.
These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`.
cache_metadata : bool, optional
If True, array configuration metadata will be cached for the
lifetime of the object. If False, array metadata will be reloaded
Expand Down Expand Up @@ -975,8 +976,10 @@ async def create(
if chunks is None:
chunks = shape
dtype = parse_dtype(dtype, zarr_format)
if not filters and not compressor:
filters, compressor = _default_filters_and_compressor(dtype)
if not filters:
filters = _default_filters(dtype)
if not compressor:
compressor = _default_compressor(dtype)
elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr]
if chunks is not None:
chunk_shape = chunks
Expand Down
99 changes: 55 additions & 44 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec
from zarr.abc.store import Store, set_or_delete
from zarr.codecs._v2 import V2Codec
from zarr.codecs.zstd import ZstdCodec
from zarr.core._info import ArrayInfo
from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, parse_array_config
from zarr.core.attributes import Attributes
Expand Down Expand Up @@ -87,7 +86,10 @@
ArrayV3MetadataDict,
T_ArrayMetadata,
)
from zarr.core.metadata.v2 import _default_filters_and_compressor
from zarr.core.metadata.v2 import (
_default_compressor,
_default_filters,
)
from zarr.core.metadata.v3 import DataType, parse_node_type_array
from zarr.core.sync import sync
from zarr.errors import MetadataValidationError
Expand Down Expand Up @@ -438,8 +440,8 @@ async def create(
If no codecs are provided, default codecs will be used:
- For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
- For Unicode strings, the default is ``VLenUTF8Codec``.
- For bytes or objects, the default is ``VLenBytesCodec``.
- For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
- For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
dimension_names : Iterable[str], optional
Expand All @@ -460,14 +462,14 @@ async def create(
order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
filters : list[dict[str, JSON]], optional
Sequence of filters to use to encode chunk data prior to compression.
V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
nor ``filters`` are provided, a default compressor will be used. (see
``compressor`` for details)
V2 only. V3 arrays should use ``codecs`` instead. If no ``filters``
are provided, a default set of filters will be used.
These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`.
compressor : dict[str, JSON], optional
The compressor used to compress the data (default is None).
V2 only. V3 arrays should use ``codecs`` instead.
If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
If no ``compressor`` is provided, a default compressor will be used:
- For numeric arrays, the default is ``ZstdCodec``.
- For Unicode strings, the default is ``VLenUTF8Codec``.
Expand Down Expand Up @@ -677,8 +679,10 @@ async def _create_v2(
dimension_separator = "."

dtype = parse_dtype(dtype, zarr_format=2)
if not filters and not compressor:
filters, compressor = _default_filters_and_compressor(dtype)
if not filters:
filters = _default_filters(dtype)
if not compressor:
compressor = _default_compressor(dtype)
if np.issubdtype(dtype, np.str_):
filters = filters or []
if not any(x["id"] == "vlen-utf8" for x in filters):
Expand Down Expand Up @@ -1572,8 +1576,8 @@ def create(
If no codecs are provided, default codecs will be used:
- For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
- For Unicode strings, the default is ``VLenUTF8Codec``.
- For bytes or objects, the default is ``VLenBytesCodec``.
- For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
- For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
dimension_names : Iterable[str], optional
Expand All @@ -1594,14 +1598,14 @@ def create(
order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
filters : list[dict[str, JSON]], optional
Sequence of filters to use to encode chunk data prior to compression.
V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
nor ``filters`` are provided, a default compressor will be used. (see
``compressor`` for details)
V2 only. V3 arrays should use ``codecs`` instead. If no ``filters``
are provided, a default set of filters will be used.
These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`.
compressor : dict[str, JSON], optional
Primary compressor to compress chunk data.
V2 only. V3 arrays should use ``codecs`` instead.
If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
If no ``compressor`` is provided, a default compressor will be used:
- For numeric arrays, the default is ``ZstdCodec``.
- For Unicode strings, the default is ``VLenUTF8Codec``.
Expand Down Expand Up @@ -3455,7 +3459,7 @@ def _get_default_codecs(
else:
dtype_key = "numeric"

return [{"name": codec_id, "configuration": {}} for codec_id in default_codecs[dtype_key]]
return default_codecs[dtype_key]


FiltersParam: TypeAlias = (
Expand Down Expand Up @@ -3672,49 +3676,56 @@ def _get_default_encoding_v3(
else:
dtype_key = "numeric"

codec_names = default_codecs[dtype_key]
array_bytes_cls, *rest = tuple(get_codec_class(codec_name) for codec_name in codec_names)
array_bytes: ArrayBytesCodec = cast(ArrayBytesCodec, array_bytes_cls())
# TODO: we should compress bytes and strings by default!
# The current default codecs only lists names, and strings / bytes are not compressed at all,
# so we insert the ZstdCodec at the end of the list as a default
bytes_bytes: tuple[BytesBytesCodec, ...]
array_array: tuple[ArrayArrayCodec, ...] = ()
if len(rest) == 0:
bytes_bytes = (ZstdCodec(),)
else:
bytes_bytes = cast(tuple[BytesBytesCodec, ...], tuple(r() for r in rest))
codec_dicts = default_codecs[dtype_key]
codecs = tuple(get_codec_class(c["name"]).from_dict(c) for c in codec_dicts)
array_bytes_maybe = None
array_array: list[ArrayArrayCodec] = []
bytes_bytes: list[BytesBytesCodec] = []

for codec in codecs:
if isinstance(codec, ArrayBytesCodec):
if array_bytes_maybe is not None:
raise ValueError(
f"Got two instances of ArrayBytesCodec: {array_bytes_maybe} and {codec}. "
"Only one array-to-bytes codec is allowed."
)
array_bytes_maybe = codec
elif isinstance(codec, ArrayArrayCodec):
array_array.append(codec)
elif isinstance(codec, BytesBytesCodec):
bytes_bytes.append(codec)
else:
raise TypeError(f"Unexpected codec type: {type(codec)}")

return array_array, array_bytes, bytes_bytes
if array_bytes_maybe is None:
raise ValueError("Required ArrayBytesCodec was not found.")

return tuple(array_array), array_bytes_maybe, tuple(bytes_bytes)


def _get_default_chunk_encoding_v2(
dtype: np.dtype[Any],
) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec]:
) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec | None]:
"""
Get the default chunk encoding for zarr v2 arrays, given a dtype
"""
codec_id_dict = zarr_config.get("array.v2_default_compressor")

if dtype.kind in "biufcmM":
dtype_key = "numeric"
codec_type = "compressor"
elif dtype.kind in "U":
dtype_key = "string"
codec_type = "filter"
elif dtype.kind in "OSV":
dtype_key = "bytes"
codec_type = "filter"
else:
raise ValueError(f"Unsupported dtype kind {dtype.kind}")
codec_id = codec_id_dict[dtype_key]
codec_instance = numcodecs.get_codec({"id": codec_id})
if codec_type == "compressor":
return (), codec_instance
elif codec_type == "filter":
return codec_instance, numcodecs.Zstd()
else:
raise ValueError(f"Unsupported codec type {codec_type}")

compressor_dict = zarr_config.get("array.v2_default_compressor").get(dtype_key, None)
filter_dicts = zarr_config.get("array.v2_default_filters").get(dtype_key, [])

compressor = None
if compressor_dict is not None:
compressor = numcodecs.get_codec(compressor_dict)
filters = tuple(numcodecs.get_codec(f) for f in filter_dicts)
return filters, compressor


def _parse_chunk_encoding_v2(
Expand Down
26 changes: 20 additions & 6 deletions src/zarr/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,28 @@ def reset(self) -> None:
"order": "C",
"write_empty_chunks": False,
"v2_default_compressor": {
"numeric": "zstd",
"string": "vlen-utf8",
"bytes": "vlen-bytes",
"numeric": {"id": "zstd", "level": 0, "checksum": True},
"string": {"id": "zstd", "level": 0, "checksum": True},
"bytes": {"id": "zstd", "level": 0, "checksum": True},
},
"v2_default_filters": {
"numeric": [],
"string": [{"id": "vlen-utf8"}],
"bytes": [{"id": "vlen-bytes"}],
},
"v3_default_codecs": {
"numeric": ["bytes", "zstd"],
"string": ["vlen-utf8"],
"bytes": ["vlen-bytes"],
"numeric": [
{"name": "bytes", "configuration": {"endian": "little"}},
{"name": "zstd", "configuration": {"level": 0, "checksum": True}},
],
"string": [
{"name": "vlen-utf8"},
{"name": "zstd", "configuration": {"level": 0, "checksum": True}},
],
"bytes": [
{"name": "vlen-bytes"},
{"name": "zstd", "configuration": {"level": 0, "checksum": True}},
],
},
},
"async": {"concurrency": 10, "timeout": None},
Expand Down
26 changes: 23 additions & 3 deletions src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,9 +331,9 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any:
return dtype.type(0)


def _default_filters_and_compressor(
def _default_compressor(
dtype: np.dtype[Any],
) -> tuple[list[dict[str, JSON]], dict[str, JSON] | None]:
) -> dict[str, JSON] | None:
"""Get the default filters and compressor for a dtype.
https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html
Expand All @@ -348,4 +348,24 @@ def _default_filters_and_compressor(
else:
raise ValueError(f"Unsupported dtype kind {dtype.kind}")

return [{"id": default_compressor[dtype_key]}], None
return default_compressor.get(dtype_key, None)


def _default_filters(
    dtype: np.dtype[Any],
) -> list[dict[str, JSON]]:
    """Look up the default Zarr v2 filters configured for a dtype.

    The dtype's NumPy ``kind`` character is mapped to one of the
    ``numeric``, ``string`` or ``bytes`` entries of the
    ``array.v2_default_filters`` config value. An empty list is returned
    when the selected entry is missing from the config.

    https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html

    Raises
    ------
    ValueError
        If ``dtype.kind`` does not belong to any supported group.
    """
    # Read the config first so a missing config entry surfaces before
    # any dtype validation error.
    configured = config.get("array.v2_default_filters")

    # Each pair is (accepted kind characters, config key); first match wins.
    kind_groups = (
        ("biufcmM", "numeric"),
        ("U", "string"),
        ("OSV", "bytes"),
    )
    dtype_key = next((key for kinds, key in kind_groups if dtype.kind in kinds), None)
    if dtype_key is None:
        raise ValueError(f"Unsupported dtype kind {dtype.kind}")

    return configured.get(dtype_key, [])
47 changes: 35 additions & 12 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,28 @@ def test_config_defaults_set() -> None:
"order": "C",
"write_empty_chunks": False,
"v2_default_compressor": {
"numeric": "zstd",
"string": "vlen-utf8",
"bytes": "vlen-bytes",
"numeric": {"id": "zstd", "level": 0, "checksum": True},
"string": {"id": "zstd", "level": 0, "checksum": True},
"bytes": {"id": "zstd", "level": 0, "checksum": True},
},
"v2_default_filters": {
"numeric": [],
"string": [{"id": "vlen-utf8"}],
"bytes": [{"id": "vlen-bytes"}],
},
"v3_default_codecs": {
"bytes": ["vlen-bytes"],
"numeric": ["bytes", "zstd"],
"string": ["vlen-utf8"],
"bytes": [
{"name": "vlen-bytes"},
{"name": "zstd", "configuration": {"level": 0, "checksum": True}},
],
"numeric": [
{"name": "bytes", "configuration": {"endian": "little"}},
{"name": "zstd", "configuration": {"level": 0, "checksum": True}},
],
"string": [
{"name": "vlen-utf8"},
{"name": "zstd", "configuration": {"level": 0, "checksum": True}},
],
},
},
"async": {"concurrency": 10, "timeout": None},
Expand Down Expand Up @@ -291,17 +305,26 @@ class NewCodec2(BytesCodec):
("dtype", "expected_codecs"),
[
("int", [BytesCodec(), GzipCodec()]),
("bytes", [VLenBytesCodec()]),
("str", [VLenUTF8Codec()]),
("bytes", [VLenBytesCodec(), GzipCodec()]),
("str", [VLenUTF8Codec(), GzipCodec()]),
],
)
async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None:
with config.set(
{
"array.v3_default_codecs": {
"numeric": ["bytes", "gzip"], # test setting non-standard codecs
"string": ["vlen-utf8"],
"bytes": ["vlen-bytes"],
"array.v3_default_codecs": { # test setting non-standard codecs
"numeric": [
{"name": "bytes", "configuration": {"endian": "little"}},
{"name": "gzip", "configuration": {"level": 5}},
],
"string": [
{"name": "vlen-utf8"},
{"name": "gzip", "configuration": {"level": 5}},
],
"bytes": [
{"name": "vlen-bytes"},
{"name": "gzip", "configuration": {"level": 5}},
],
}
}
):
Expand Down
Loading

0 comments on commit 5cb6dd8

Please sign in to comment.