Skip to content

Feat/write empty chunks #2429

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 55 additions & 27 deletions src/zarr/api/asynchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,16 @@
from typing_extensions import deprecated

from zarr.core.array import Array, AsyncArray, get_array_metadata
from zarr.core.array_spec import ArrayConfig, ArrayConfigParams
from zarr.core.buffer import NDArrayLike
from zarr.core.common import (
JSON,
AccessModeLiteral,
ChunkCoords,
MemoryOrder,
ZarrFormat,
_warn_order_kwarg,
_warn_write_empty_chunks_kwarg,
parse_dtype,
)
from zarr.core.config import config
Expand Down Expand Up @@ -794,7 +797,7 @@ async def create(
read_only: bool | None = None,
object_codec: Codec | None = None, # TODO: type has changed
dimension_separator: Literal[".", "/"] | None = None,
write_empty_chunks: bool = False, # TODO: default has changed
write_empty_chunks: bool | None = None,
zarr_version: ZarrFormat | None = None, # deprecated
zarr_format: ZarrFormat | None = None,
meta_array: Any | None = None, # TODO: need type
Expand All @@ -810,6 +813,7 @@ async def create(
codecs: Iterable[Codec | dict[str, JSON]] | None = None,
dimension_names: Iterable[str] | None = None,
storage_options: dict[str, Any] | None = None,
config: ArrayConfig | ArrayConfigParams | None = None,
**kwargs: Any,
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
"""Create an array.
Expand Down Expand Up @@ -856,8 +860,10 @@ async def create(
These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
fill_value : object
Default value to use for uninitialized portions of the array.
order : {'C', 'F'}, optional
Deprecated in favor of the ``config`` keyword argument.
Pass ``{'order': <value>}`` to ``create`` instead of using this parameter.
Memory layout to be used within each chunk.
If not specified, default is taken from the Zarr config ```array.order```.
If not specified, the ``array.order`` parameter in the global config will be used.
store : Store or str
Store or path to directory in file system or name of zip file.
synchronizer : object, optional
Expand Down Expand Up @@ -891,30 +897,26 @@ async def create(
Separator placed between the dimensions of a chunk.
V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
Default is ".".
.. versionadded:: 2.8

write_empty_chunks : bool, optional
If True (default), all chunks will be stored regardless of their
Deprecated in favor of the ``config`` keyword argument.
Pass ``{'write_empty_chunks': <value>}`` to ``create`` instead of using this parameter.
If True, all chunks will be stored regardless of their
contents. If False, each chunk is compared to the array's fill value
prior to storing. If a chunk is uniformly equal to the fill value, then
that chunk is not stored, and the store entry for that chunk's key
is deleted. This setting enables sparser storage, as only chunks with
non-fill-value data are stored, at the expense of overhead associated
with checking the data of each chunk.

.. versionadded:: 2.11

is deleted.
zarr_format : {2, 3, None}, optional
The zarr format to use when saving.
Default is 3.
meta_array : array-like, optional
An array instance to use for determining arrays to create and return
to users. Use `numpy.empty(())` by default.

.. versionadded:: 2.13
storage_options : dict
If using an fsspec URL to create the store, these will be passed to
the backend implementation. Ignored otherwise.
config : ArrayConfig or ArrayConfigParams, optional
Runtime configuration of the array. If provided, will override the
default values from `zarr.config.array`.

Returns
-------
Expand Down Expand Up @@ -951,26 +953,47 @@ async def create(
warnings.warn("object_codec is not yet implemented", RuntimeWarning, stacklevel=2)
if read_only is not None:
warnings.warn("read_only is not yet implemented", RuntimeWarning, stacklevel=2)
if dimension_separator is not None:
if zarr_format == 3:
raise ValueError(
"dimension_separator is not supported for zarr format 3, use chunk_key_encoding instead"
)
else:
warnings.warn(
"dimension_separator is not yet implemented",
RuntimeWarning,
stacklevel=2,
)
if write_empty_chunks:
warnings.warn("write_empty_chunks is not yet implemented", RuntimeWarning, stacklevel=2)
if dimension_separator is not None and zarr_format == 3:
raise ValueError(
"dimension_separator is not supported for zarr format 3, use chunk_key_encoding instead"
)

if order is not None:
_warn_order_kwarg()
if write_empty_chunks is not None:
_warn_write_empty_chunks_kwarg()

if meta_array is not None:
warnings.warn("meta_array is not yet implemented", RuntimeWarning, stacklevel=2)

mode = kwargs.pop("mode", None)
if mode is None:
mode = "a"
store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options)

config_dict: ArrayConfigParams = {}

if write_empty_chunks is not None:
if config is not None:
msg = (
"Both write_empty_chunks and config keyword arguments are set. "
"This is redundant. When both are set, write_empty_chunks will be ignored and "
"config will be used."
)
warnings.warn(UserWarning(msg), stacklevel=1)
config_dict["write_empty_chunks"] = write_empty_chunks
if order is not None:
if config is not None:
msg = (
"Both order and config keyword arguments are set. "
"This is redundant. When both are set, order will be ignored and "
"config will be used."
)
warnings.warn(UserWarning(msg), stacklevel=1)
config_dict["order"] = order

config_parsed = ArrayConfig.from_dict(config_dict)

return await AsyncArray.create(
store_path,
shape=shape,
Expand All @@ -987,7 +1010,7 @@ async def create(
codecs=codecs,
dimension_names=dimension_names,
attributes=attributes,
order=order,
config=config_parsed,
**kwargs,
)

Expand Down Expand Up @@ -1163,6 +1186,11 @@ async def open_array(

zarr_format = _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format)

if "order" in kwargs:
_warn_order_kwarg()
if "write_empty_chunks" in kwargs:
_warn_write_empty_chunks_kwarg()

try:
return await AsyncArray.open(store_path, zarr_format=zarr_format)
except FileNotFoundError:
Expand Down
28 changes: 14 additions & 14 deletions src/zarr/api/synchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from zarr.abc.codec import Codec
from zarr.api.asynchronous import ArrayLike, PathLike
from zarr.core.array_spec import ArrayConfig, ArrayConfigParams
from zarr.core.buffer import NDArrayLike
from zarr.core.chunk_key_encodings import ChunkKeyEncoding
from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat
Expand Down Expand Up @@ -542,7 +543,7 @@ def create(
read_only: bool | None = None,
object_codec: Codec | None = None, # TODO: type has changed
dimension_separator: Literal[".", "/"] | None = None,
write_empty_chunks: bool = False, # TODO: default has changed
write_empty_chunks: bool | None = None, # TODO: default has changed
zarr_version: ZarrFormat | None = None, # deprecated
zarr_format: ZarrFormat | None = None,
meta_array: Any | None = None, # TODO: need type
Expand All @@ -558,6 +559,7 @@ def create(
codecs: Iterable[Codec | dict[str, JSON]] | None = None,
dimension_names: Iterable[str] | None = None,
storage_options: dict[str, Any] | None = None,
config: ArrayConfig | ArrayConfigParams | None = None,
**kwargs: Any,
) -> Array:
"""Create an array.
Expand All @@ -578,8 +580,10 @@ def create(
fill_value : object
Default value to use for uninitialized portions of the array.
order : {'C', 'F'}, optional
Deprecated in favor of the ``config`` keyword argument.
Pass ``{'order': <value>}`` to ``create`` instead of using this parameter.
Memory layout to be used within each chunk.
Default is set in Zarr's config (`array.order`).
If not specified, the ``array.order`` parameter in the global config will be used.
store : Store or str
Store or path to directory in file system or name of zip file.
synchronizer : object, optional
Expand Down Expand Up @@ -609,30 +613,25 @@ def create(
A codec to encode object arrays, only needed if dtype=object.
dimension_separator : {'.', '/'}, optional
Separator placed between the dimensions of a chunk.

.. versionadded:: 2.8

write_empty_chunks : bool, optional
If True (default), all chunks will be stored regardless of their
Deprecated in favor of the ``config`` keyword argument.
Pass ``{'write_empty_chunks': <value>}`` to ``create`` instead of using this parameter.
If True, all chunks will be stored regardless of their
contents. If False, each chunk is compared to the array's fill value
prior to storing. If a chunk is uniformly equal to the fill value, then
that chunk is not stored, and the store entry for that chunk's key
is deleted. This setting enables sparser storage, as only chunks with
non-fill-value data are stored, at the expense of overhead associated
with checking the data of each chunk.

.. versionadded:: 2.11

is deleted.
zarr_format : {2, 3, None}, optional
The zarr format to use when saving.
meta_array : array-like, optional
An array instance to use for determining arrays to create and return
to users. Use `numpy.empty(())` by default.

.. versionadded:: 2.13
storage_options : dict
If using an fsspec URL to create the store, these will be passed to
the backend implementation. Ignored otherwise.
config : ArrayConfig or ArrayConfigParams, optional
Runtime configuration of the array. If provided, will override the
default values from `zarr.config.array`.

Returns
-------
Expand Down Expand Up @@ -669,6 +668,7 @@ def create(
codecs=codecs,
dimension_names=dimension_names,
storage_options=storage_options,
config=config,
**kwargs,
)
)
Expand Down
8 changes: 5 additions & 3 deletions src/zarr/codecs/sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from zarr.abc.store import ByteGetter, ByteRangeRequest, ByteSetter
from zarr.codecs.bytes import BytesCodec
from zarr.codecs.crc32c_ import Crc32cCodec
from zarr.core.array_spec import ArraySpec
from zarr.core.array_spec import ArrayConfig, ArraySpec
from zarr.core.buffer import (
Buffer,
BufferPrototype,
Expand Down Expand Up @@ -665,7 +665,9 @@ def _get_index_chunk_spec(self, chunks_per_shard: ChunkCoords) -> ArraySpec:
shape=chunks_per_shard + (2,),
dtype=np.dtype("<u8"),
fill_value=MAX_UINT_64,
order="C", # Note: this is hard-coded for simplicity -- it is not surfaced into user code
config=ArrayConfig(
order="C", write_empty_chunks=False
), # Note: this is hard-coded for simplicity -- it is not surfaced into user code,
prototype=numpy_buffer_prototype(),
)

Expand All @@ -674,7 +676,7 @@ def _get_chunk_spec(self, shard_spec: ArraySpec) -> ArraySpec:
shape=self.chunk_shape,
dtype=shard_spec.dtype,
fill_value=shard_spec.fill_value,
order=shard_spec.order,
config=shard_spec.config,
prototype=shard_spec.prototype,
)

Expand Down
2 changes: 1 addition & 1 deletion src/zarr/codecs/transpose.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
shape=tuple(chunk_spec.shape[self.order[i]] for i in range(chunk_spec.ndim)),
dtype=chunk_spec.dtype,
fill_value=chunk_spec.fill_value,
order=chunk_spec.order,
config=chunk_spec.config,
prototype=chunk_spec.prototype,
)

Expand Down
Loading
Loading