|
13 | 13 |
|
14 | 14 | from zarr._compat import _deprecate_positional_args
|
15 | 15 | from zarr.abc.store import Store, set_or_delete
|
16 |
| -from zarr.codecs import _get_default_array_bytes_codec |
17 | 16 | from zarr.codecs._v2 import V2Codec
|
18 | 17 | from zarr.core._info import ArrayInfo
|
19 | 18 | from zarr.core.attributes import Attributes
|
|
78 | 77 | ArrayV3MetadataDict,
|
79 | 78 | T_ArrayMetadata,
|
80 | 79 | )
|
81 |
| -from zarr.core.metadata.v3 import parse_node_type_array |
| 80 | +from zarr.core.metadata.v2 import _default_filters_and_compressor |
| 81 | +from zarr.core.metadata.v3 import DataType, parse_node_type_array |
82 | 82 | from zarr.core.sync import sync
|
83 | 83 | from zarr.errors import MetadataValidationError
|
84 | 84 | from zarr.registry import get_pipeline_class
|
@@ -409,27 +409,53 @@ async def create(
|
409 | 409 | attributes : dict[str, JSON], optional
|
410 | 410 | The attributes of the array (default is None).
|
411 | 411 | chunk_shape : ChunkCoords, optional
|
412 |
| - The shape of the array's chunks (default is None). |
| 412 | + The shape of the array's chunks. |
| 413 | + V3 only. V2 arrays should use ``chunks`` instead. |
| 414 | + If not specified, defaults are guessed based on the shape and dtype. |
413 | 415 | chunk_key_encoding : ChunkKeyEncoding, optional
|
414 |
| - The chunk key encoding (default is None). |
415 |
| - codecs : Iterable[Codec | dict[str, JSON]], optional |
416 |
| - The codecs used to encode the data (default is None). |
| 416 | + A specification of how the chunk keys are represented in storage. |
| 417 | + V3 only. V2 arrays should use ``dimension_separator`` instead. |
| 418 | + Default is ``("default", "/")``. |
| 419 | + codecs : Sequence of Codecs or dicts, optional |
| 420 | + An iterable of Codec or dict serializations of Codecs. The elements of |
| 421 | + this collection specify the transformation from array values to stored bytes. |
| 422 | + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. |
| 423 | +
|
| 424 | + If no codecs are provided, default codecs will be used: |
| 425 | +
|
| 426 | + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. |
| 427 | + - For Unicode strings, the default is ``VLenUTF8Codec``. |
| 428 | + - For bytes or objects, the default is ``VLenBytesCodec``. |
| 429 | +
|
| 430 | + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. |
417 | 431 | dimension_names : Iterable[str], optional
|
418 | 432 | The names of the dimensions (default is None).
|
| 433 | + V3 only. V2 arrays should not use this parameter. |
419 | 434 | chunks : ShapeLike, optional
|
420 |
| - The shape of the array's chunks (default is None). |
421 |
| - V2 only. V3 arrays should not have 'chunks' parameter. |
| 435 | + The shape of the array's chunks. |
| 436 | + V2 only. V3 arrays should use ``chunk_shape`` instead. |
| 437 | + If not specified, defaults are guessed based on the shape and dtype. |
422 | 438 | dimension_separator : Literal[".", "/"], optional
|
423 |
| - The dimension separator (default is None). |
424 |
| - V2 only. V3 arrays cannot have a dimension separator. |
| 439 | + The dimension separator (default is "."). |
| 440 | + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. |
425 | 441 | order : Literal["C", "F"], optional
|
426 |
| - The order of the array (default is None). |
| 442 | + The order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). |
427 | 443 | filters : list[dict[str, JSON]], optional
|
428 |
| - The filters used to compress the data (default is None). |
429 |
| - V2 only. V3 arrays should not have 'filters' parameter. |
| 444 | + Sequence of filters to use to encode chunk data prior to compression. |
| 445 | + V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` |
| 446 | + nor ``filters`` are provided, a default compressor will be used. (see |
| 447 | + ``compressor`` for details) |
430 | 448 | compressor : dict[str, JSON], optional
|
431 | 449 | The compressor used to compress the data (default is None).
|
432 |
| - V2 only. V3 arrays should not have 'compressor' parameter. |
| 450 | + V2 only. V3 arrays should use ``codecs`` instead. |
| 451 | +
|
| 452 | + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: |
| 453 | +
|
| 454 | + - For numeric arrays, the default is ``ZstdCodec``. |
| 455 | + - For Unicode strings, the default is ``VLenUTF8Codec``. |
| 456 | + - For bytes or objects, the default is ``VLenBytesCodec``. |
| 457 | +
|
| 458 | + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. |
433 | 459 | overwrite : bool, optional
|
434 | 460 | Whether to raise an error if the store already exists (default is False).
|
435 | 461 | data : npt.ArrayLike, optional
|
@@ -494,14 +520,6 @@ async def create(
|
494 | 520 | order=order,
|
495 | 521 | )
|
496 | 522 | elif zarr_format == 2:
|
497 |
| - if dtype is str or dtype == "str": |
498 |
| - # another special case: zarr v2 added the vlen-utf8 codec |
499 |
| - vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"} |
500 |
| - if filters and not any(x["id"] == "vlen-utf8" for x in filters): |
501 |
| - filters = list(filters) + [vlen_codec] |
502 |
| - else: |
503 |
| - filters = [vlen_codec] |
504 |
| - |
505 | 523 | if codecs is not None:
|
506 | 524 | raise ValueError(
|
507 | 525 | "codecs cannot be used for arrays with version 2. Use filters and compressor instead."
|
@@ -564,11 +582,7 @@ async def _create_v3(
|
564 | 582 | await ensure_no_existing_node(store_path, zarr_format=3)
|
565 | 583 |
|
566 | 584 | shape = parse_shapelike(shape)
|
567 |
| - codecs = ( |
568 |
| - list(codecs) |
569 |
| - if codecs is not None |
570 |
| - else [_get_default_array_bytes_codec(np.dtype(dtype))] |
571 |
| - ) |
| 585 | + codecs = list(codecs) if codecs is not None else _get_default_codecs(np.dtype(dtype)) |
572 | 586 |
|
573 | 587 | if chunk_key_encoding is None:
|
574 | 588 | chunk_key_encoding = ("default", "/")
|
@@ -634,6 +648,14 @@ async def _create_v2(
|
634 | 648 | if dimension_separator is None:
|
635 | 649 | dimension_separator = "."
|
636 | 650 |
|
| 651 | + dtype = parse_dtype(dtype, zarr_format=2) |
| 652 | + if not filters and not compressor: |
| 653 | + filters, compressor = _default_filters_and_compressor(dtype) |
| 654 | + if np.issubdtype(dtype, np.str_): |
| 655 | + filters = filters or [] |
| 656 | + if not any(x["id"] == "vlen-utf8" for x in filters): |
| 657 | + filters = list(filters) + [{"id": "vlen-utf8"}] |
| 658 | + |
637 | 659 | metadata = ArrayV2Metadata(
|
638 | 660 | shape=shape,
|
639 | 661 | dtype=np.dtype(dtype),
|
@@ -1493,23 +1515,53 @@ def create(
|
1493 | 1515 | dtype : npt.DTypeLike
|
1494 | 1516 | The data type of the array.
|
1495 | 1517 | chunk_shape : ChunkCoords, optional
|
1496 |
| - The shape of the Array's chunks (default is None). |
| 1518 | + The shape of the Array's chunks. |
| 1519 | + V3 only. V2 arrays should use ``chunks`` instead. |
| 1520 | + If not specified, defaults are guessed based on the shape and dtype. |
1497 | 1521 | chunk_key_encoding : ChunkKeyEncoding, optional
|
1498 |
| - The chunk key encoding (default is None). |
1499 |
| - codecs : Iterable[Codec | dict[str, JSON]], optional |
1500 |
| - The codecs used to encode the data (default is None). |
| 1522 | + A specification of how the chunk keys are represented in storage. |
| 1523 | + V3 only. V2 arrays should use ``dimension_separator`` instead. |
| 1524 | + Default is ``("default", "/")``. |
| 1525 | + codecs : Sequence of Codecs or dicts, optional |
| 1526 | + An iterable of Codec or dict serializations of Codecs. The elements of |
| 1527 | + this collection specify the transformation from array values to stored bytes. |
| 1528 | + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. |
| 1529 | +
|
| 1530 | + If no codecs are provided, default codecs will be used: |
| 1531 | +
|
| 1532 | + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. |
| 1533 | + - For Unicode strings, the default is ``VLenUTF8Codec``. |
| 1534 | + - For bytes or objects, the default is ``VLenBytesCodec``. |
| 1535 | +
|
| 1536 | + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. |
1501 | 1537 | dimension_names : Iterable[str], optional
|
1502 | 1538 | The names of the dimensions (default is None).
|
| 1539 | + V3 only. V2 arrays should not use this parameter. |
1503 | 1540 | chunks : ChunkCoords, optional
|
1504 |
| - The shape of the Array's chunks (default is None). |
| 1541 | + The shape of the array's chunks. |
| 1542 | + V2 only. V3 arrays should use ``chunk_shape`` instead. |
| 1543 | + If not specified, defaults are guessed based on the shape and dtype. |
1505 | 1544 | dimension_separator : Literal[".", "/"], optional
|
1506 |
| - The dimension separator (default is None). |
| 1545 | + The dimension separator (default is "."). |
| 1546 | + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. |
1507 | 1547 | order : Literal["C", "F"], optional
|
1508 |
| - The order of the array (default is None). |
| 1548 | + The order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). |
1509 | 1549 | filters : list[dict[str, JSON]], optional
|
1510 |
| - The filters used to compress the data (default is None). |
| 1550 | + Sequence of filters to use to encode chunk data prior to compression. |
| 1551 | + V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` |
| 1552 | + nor ``filters`` are provided, a default compressor will be used. (see |
| 1553 | + ``compressor`` for details) |
1511 | 1554 | compressor : dict[str, JSON], optional
|
1512 |
| - The compressor used to compress the data (default is None). |
| 1555 | + Primary compressor to compress chunk data. |
| 1556 | + V2 only. V3 arrays should use ``codecs`` instead. |
| 1557 | +
|
| 1558 | + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: |
| 1559 | +
|
| 1560 | + - For numeric arrays, the default is ``ZstdCodec``. |
| 1561 | + - For Unicode strings, the default is ``VLenUTF8Codec``. |
| 1562 | + - For bytes or objects, the default is ``VLenBytesCodec``. |
| 1563 | +
|
| 1564 | + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. |
1513 | 1565 | overwrite : bool, optional
|
1514 | 1566 | Whether to raise an error if the store already exists (default is False).
|
1515 | 1567 |
|
@@ -3342,3 +3394,18 @@ def _build_parents(
|
3342 | 3394 | )
|
3343 | 3395 |
|
3344 | 3396 | return parents
|
| 3397 | + |
| 3398 | + |
def _get_default_codecs(
    np_dtype: np.dtype[Any],
) -> list[dict[str, JSON]]:
    """
    Build the default V3 codec specifications for a numpy dtype.

    The dtype is mapped to one of the categories ``"string"``, ``"bytes"`` or
    ``"numeric"``, and the codec names stored for that category under the
    ``array.v3_default_codecs`` config entry are turned into codec dicts with
    an empty ``configuration``.

    Parameters
    ----------
    np_dtype : np.dtype
        The numpy data type to pick default codecs for.

    Returns
    -------
    list[dict[str, JSON]]
        One ``{"name": ..., "configuration": {}}`` dict per configured codec
        name, in the order the configured names are iterated.
    """
    codecs_by_category = config.get("array.v3_default_codecs")
    data_type = DataType.from_numpy(np_dtype)

    # Anything that is neither a string nor a bytes type uses the numeric defaults.
    category = "numeric"
    if data_type == DataType.string:
        category = "string"
    elif data_type == DataType.bytes:
        category = "bytes"

    codec_specs: list[dict[str, JSON]] = []
    for codec_name in codecs_by_category[category]:
        codec_specs.append({"name": codec_name, "configuration": {}})
    return codec_specs
0 commit comments