18
18
from zarr .abc .codec import ArrayArrayCodec , ArrayBytesCodec , BytesBytesCodec , Codec
19
19
from zarr .abc .store import Store , set_or_delete
20
20
from zarr .codecs ._v2 import V2Codec
21
- from zarr .codecs .zstd import ZstdCodec
22
21
from zarr .core ._info import ArrayInfo
23
22
from zarr .core .array_spec import ArrayConfig , ArrayConfigParams , parse_array_config
24
23
from zarr .core .attributes import Attributes
87
86
ArrayV3MetadataDict ,
88
87
T_ArrayMetadata ,
89
88
)
90
- from zarr .core .metadata .v2 import _default_filters_and_compressor
89
+ from zarr .core .metadata .v2 import (
90
+ _default_compressor ,
91
+ _default_filters ,
92
+ )
91
93
from zarr .core .metadata .v3 import DataType , parse_node_type_array
92
94
from zarr .core .sync import sync
93
95
from zarr .errors import MetadataValidationError
@@ -438,8 +440,8 @@ async def create(
438
440
If no codecs are provided, default codecs will be used:
439
441
440
442
- For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
441
- - For Unicode strings, the default is ``VLenUTF8Codec``.
442
- - For bytes or objects, the default is ``VLenBytesCodec``.
443
+ - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
444
+ - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
443
445
444
446
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
445
447
dimension_names : Iterable[str], optional
@@ -460,14 +462,14 @@ async def create(
460
462
order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
461
463
filters : list[dict[str, JSON]], optional
462
464
Sequence of filters to use to encode chunk data prior to compression.
463
- V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor ``
464
- nor ``filters`` are provided, a default compressor will be used. (see
465
- ``compressor `` for details)
465
+ V2 only. V3 arrays should use ``codecs`` instead. If no ``filters``
466
+ are provided, a default set of filters will be used.
467
+ These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`.
466
468
compressor : dict[str, JSON], optional
467
469
The compressor used to compress the data (default is None).
468
470
V2 only. V3 arrays should use ``codecs`` instead.
469
471
470
- If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
472
+ If no ``compressor`` is provided, a default compressor will be used:
471
473
472
474
- For numeric arrays, the default is ``ZstdCodec``.
473
475
- For Unicode strings, the default is ``VLenUTF8Codec``.
@@ -677,8 +679,10 @@ async def _create_v2(
677
679
dimension_separator = "."
678
680
679
681
dtype = parse_dtype (dtype , zarr_format = 2 )
680
- if not filters and not compressor :
681
- filters , compressor = _default_filters_and_compressor (dtype )
682
+ if not filters :
683
+ filters = _default_filters (dtype )
684
+ if not compressor :
685
+ compressor = _default_compressor (dtype )
682
686
if np .issubdtype (dtype , np .str_ ):
683
687
filters = filters or []
684
688
if not any (x ["id" ] == "vlen-utf8" for x in filters ):
@@ -1572,8 +1576,8 @@ def create(
1572
1576
If no codecs are provided, default codecs will be used:
1573
1577
1574
1578
- For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
1575
- - For Unicode strings, the default is ``VLenUTF8Codec``.
1576
- - For bytes or objects, the default is ``VLenBytesCodec``.
1579
+ - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
1580
+ - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
1577
1581
1578
1582
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
1579
1583
dimension_names : Iterable[str], optional
@@ -1594,14 +1598,14 @@ def create(
1594
1598
order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
1595
1599
filters : list[dict[str, JSON]], optional
1596
1600
Sequence of filters to use to encode chunk data prior to compression.
1597
- V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor ``
1598
- nor ``filters`` are provided, a default compressor will be used. (see
1599
- ``compressor `` for details)
1601
+ V2 only. V3 arrays should use ``codecs`` instead. If no ``filters``
1602
+ are provided, a default set of filters will be used.
1603
+ These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`.
1600
1604
compressor : dict[str, JSON], optional
1601
1605
Primary compressor to compress chunk data.
1602
1606
V2 only. V3 arrays should use ``codecs`` instead.
1603
1607
1604
- If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
1608
+ If no ``compressor`` is provided, a default compressor will be used:
1605
1609
1606
1610
- For numeric arrays, the default is ``ZstdCodec``.
1607
1611
- For Unicode strings, the default is ``VLenUTF8Codec``.
@@ -3455,7 +3459,7 @@ def _get_default_codecs(
3455
3459
else :
3456
3460
dtype_key = "numeric"
3457
3461
3458
- return [{ "name" : codec_id , "configuration" : {}} for codec_id in default_codecs [dtype_key ] ]
3462
+ return default_codecs [dtype_key ]
3459
3463
3460
3464
3461
3465
FiltersParam : TypeAlias = (
@@ -3672,49 +3676,56 @@ def _get_default_encoding_v3(
3672
3676
else :
3673
3677
dtype_key = "numeric"
3674
3678
3675
- codec_names = default_codecs [dtype_key ]
3676
- array_bytes_cls , * rest = tuple (get_codec_class (codec_name ) for codec_name in codec_names )
3677
- array_bytes : ArrayBytesCodec = cast (ArrayBytesCodec , array_bytes_cls ())
3678
- # TODO: we should compress bytes and strings by default!
3679
- # The current default codecs only lists names, and strings / bytes are not compressed at all,
3680
- # so we insert the ZstdCodec at the end of the list as a default
3681
- bytes_bytes : tuple [BytesBytesCodec , ...]
3682
- array_array : tuple [ArrayArrayCodec , ...] = ()
3683
- if len (rest ) == 0 :
3684
- bytes_bytes = (ZstdCodec (),)
3685
- else :
3686
- bytes_bytes = cast (tuple [BytesBytesCodec , ...], tuple (r () for r in rest ))
3679
+ codec_dicts = default_codecs [dtype_key ]
3680
+ codecs = tuple (get_codec_class (c ["name" ]).from_dict (c ) for c in codec_dicts )
3681
+ array_bytes_maybe = None
3682
+ array_array : list [ArrayArrayCodec ] = []
3683
+ bytes_bytes : list [BytesBytesCodec ] = []
3684
+
3685
+ for codec in codecs :
3686
+ if isinstance (codec , ArrayBytesCodec ):
3687
+ if array_bytes_maybe is not None :
3688
+ raise ValueError (
3689
+ f"Got two instances of ArrayBytesCodec: { array_bytes_maybe } and { codec } . "
3690
+ "Only one array-to-bytes codec is allowed."
3691
+ )
3692
+ array_bytes_maybe = codec
3693
+ elif isinstance (codec , ArrayArrayCodec ):
3694
+ array_array .append (codec )
3695
+ elif isinstance (codec , BytesBytesCodec ):
3696
+ bytes_bytes .append (codec )
3697
+ else :
3698
+ raise TypeError (f"Unexpected codec type: { type (codec )} " )
3687
3699
3688
- return array_array , array_bytes , bytes_bytes
3700
+ if array_bytes_maybe is None :
3701
+ raise ValueError ("Required ArrayBytesCodec was not found." )
3702
+
3703
+ return tuple (array_array ), array_bytes_maybe , tuple (bytes_bytes )
3689
3704
3690
3705
3691
3706
def _get_default_chunk_encoding_v2 (
3692
3707
dtype : np .dtype [Any ],
3693
- ) -> tuple [tuple [numcodecs .abc .Codec , ...], numcodecs .abc .Codec ]:
3708
+ ) -> tuple [tuple [numcodecs .abc .Codec , ...], numcodecs .abc .Codec | None ]:
3694
3709
"""
3695
3710
Get the default chunk encoding for zarr v2 arrays, given a dtype
3696
3711
"""
3697
- codec_id_dict = zarr_config .get ("array.v2_default_compressor" )
3698
-
3699
3712
if dtype .kind in "biufcmM" :
3700
3713
dtype_key = "numeric"
3701
- codec_type = "compressor"
3702
3714
elif dtype .kind in "U" :
3703
3715
dtype_key = "string"
3704
- codec_type = "filter"
3705
3716
elif dtype .kind in "OSV" :
3706
3717
dtype_key = "bytes"
3707
- codec_type = "filter"
3708
3718
else :
3709
3719
raise ValueError (f"Unsupported dtype kind { dtype .kind } " )
3710
- codec_id = codec_id_dict [dtype_key ]
3711
- codec_instance = numcodecs .get_codec ({"id" : codec_id })
3712
- if codec_type == "compressor" :
3713
- return (), codec_instance
3714
- elif codec_type == "filter" :
3715
- return codec_instance , numcodecs .Zstd ()
3716
- else :
3717
- raise ValueError (f"Unsupported codec type { codec_type } " )
3720
+
3721
+ compressor_dict = zarr_config .get ("array.v2_default_compressor" ).get (dtype_key , None )
3722
+ filter_dicts = zarr_config .get ("array.v2_default_filters" ).get (dtype_key , [])
3723
+
3724
+ compressor = None
3725
+ if compressor_dict is not None :
3726
+ compressor = numcodecs .get_codec (compressor_dict )
3727
+ filters = tuple (numcodecs .get_codec (f ) for f in filter_dicts )
3728
+ return filters , compressor
3718
3729
3719
3730
3720
3731
def _parse_chunk_encoding_v2 (
0 commit comments