Skip to content

Commit f3a2e0a

Browse files
authored
[v3] fix: zarr v2 compatibility fixes for Dask (#2186)
* fix: zarr v2 compatability fixes - port normalize_chunks from v2 - add array.store property - default to append in create * move zarr.store to zarr.storage also fix failing ci * make chunks a tuple * Apply suggestions from code review * more merge conflict resolution * fixups * fixup zipstore * Apply suggestions from code review * Apply suggestions from code review * add test * extend test * clean up parents * debug race condition * more debug * Update src/zarr/core/array.py
1 parent 2761845 commit f3a2e0a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+255
-133
lines changed

src/zarr/api/asynchronous.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,16 @@
77
import numpy as np
88
import numpy.typing as npt
99

10+
from zarr.abc.store import Store
1011
from zarr.core.array import Array, AsyncArray, get_array_metadata
1112
from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat
1213
from zarr.core.config import config
1314
from zarr.core.group import AsyncGroup
1415
from zarr.core.metadata.v2 import ArrayV2Metadata
1516
from zarr.core.metadata.v3 import ArrayV3Metadata
16-
from zarr.store import (
17+
from zarr.storage import (
1718
StoreLike,
19+
StorePath,
1820
make_store_path,
1921
)
2022

@@ -225,6 +227,7 @@ async def open(
225227
Return type depends on what exists in the given store.
226228
"""
227229
zarr_format = _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format)
230+
228231
store_path = await make_store_path(store, mode=mode, storage_options=storage_options)
229232

230233
if path is not None:
@@ -243,9 +246,9 @@ async def open(
243246
return await open_group(store=store_path, zarr_format=zarr_format, mode=mode, **kwargs)
244247

245248
try:
246-
return await open_array(store=store_path, zarr_format=zarr_format, mode=mode, **kwargs)
249+
return await open_array(store=store_path, zarr_format=zarr_format, **kwargs)
247250
except KeyError:
248-
return await open_group(store=store_path, zarr_format=zarr_format, mode=mode, **kwargs)
251+
return await open_group(store=store_path, zarr_format=zarr_format, **kwargs)
249252

250253

251254
async def open_consolidated(*args: Any, **kwargs: Any) -> AsyncGroup:
@@ -319,7 +322,8 @@ async def save_array(
319322
or _default_zarr_version()
320323
)
321324

322-
store_path = await make_store_path(store, mode="w", storage_options=storage_options)
325+
mode = kwargs.pop("mode", None)
326+
store_path = await make_store_path(store, mode=mode, storage_options=storage_options)
323327
if path is not None:
324328
store_path = store_path / path
325329
new = await AsyncArray.create(
@@ -496,7 +500,9 @@ async def group(
496500

497501
zarr_format = _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format)
498502

499-
store_path = await make_store_path(store, storage_options=storage_options)
503+
mode = None if isinstance(store, Store) else cast(AccessModeLiteral, "a")
504+
505+
store_path = await make_store_path(store, mode=mode, storage_options=storage_options)
500506
if path is not None:
501507
store_path = store_path / path
502508

@@ -769,7 +775,11 @@ async def create(
769775
if meta_array is not None:
770776
warnings.warn("meta_array is not yet implemented", RuntimeWarning, stacklevel=2)
771777

772-
mode = kwargs.pop("mode", cast(AccessModeLiteral, "r" if read_only else "w"))
778+
mode = kwargs.pop("mode", None)
779+
if mode is None:
780+
if not isinstance(store, Store | StorePath):
781+
mode = "a"
782+
773783
store_path = await make_store_path(store, mode=mode, storage_options=storage_options)
774784
if path is not None:
775785
store_path = store_path / path
@@ -945,7 +955,8 @@ async def open_array(
945955
The opened array.
946956
"""
947957

948-
store_path = await make_store_path(store, storage_options=storage_options)
958+
mode = kwargs.pop("mode", None)
959+
store_path = await make_store_path(store, mode=mode)
949960
if path is not None:
950961
store_path = store_path / path
951962

src/zarr/api/synchronous.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
if TYPE_CHECKING:
1212
from zarr.core.buffer import NDArrayLike
1313
from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, ZarrFormat
14-
from zarr.store import StoreLike
14+
from zarr.storage import StoreLike
1515

1616
__all__ = [
1717
"array",

src/zarr/core/array.py

Lines changed: 39 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
import json
44
from asyncio import gather
55
from dataclasses import dataclass, field, replace
6+
from logging import getLogger
67
from typing import TYPE_CHECKING, Any, Literal, cast
78

89
import numpy as np
910
import numpy.typing as npt
1011

1112
from zarr._compat import _deprecate_positional_args
12-
from zarr.abc.store import set_or_delete
13+
from zarr.abc.store import Store, set_or_delete
1314
from zarr.codecs import BytesCodec
1415
from zarr.codecs._v2 import V2Compressor, V2Filters
1516
from zarr.core.attributes import Attributes
@@ -19,7 +20,7 @@
1920
NDBuffer,
2021
default_buffer_prototype,
2122
)
22-
from zarr.core.chunk_grids import RegularChunkGrid, _guess_chunks
23+
from zarr.core.chunk_grids import RegularChunkGrid, normalize_chunks
2324
from zarr.core.chunk_key_encodings import (
2425
ChunkKeyEncoding,
2526
DefaultChunkKeyEncoding,
@@ -67,10 +68,8 @@
6768
from zarr.core.metadata.v3 import ArrayV3Metadata
6869
from zarr.core.sync import collect_aiterator, sync
6970
from zarr.registry import get_pipeline_class
70-
from zarr.store import StoreLike, StorePath, make_store_path
71-
from zarr.store.common import (
72-
ensure_no_existing_node,
73-
)
71+
from zarr.storage import StoreLike, make_store_path
72+
from zarr.storage.common import StorePath, ensure_no_existing_node
7473

7574
if TYPE_CHECKING:
7675
from collections.abc import Iterable, Iterator, Sequence
@@ -82,6 +81,8 @@
8281
# Array and AsyncArray are defined in the base ``zarr`` namespace
8382
__all__ = ["create_codec_pipeline", "parse_array_metadata"]
8483

84+
logger = getLogger(__name__)
85+
8586

8687
def parse_array_metadata(data: Any) -> ArrayV2Metadata | ArrayV3Metadata:
8788
if isinstance(data, ArrayV2Metadata | ArrayV3Metadata):
@@ -222,15 +223,14 @@ async def create(
222223

223224
shape = parse_shapelike(shape)
224225

225-
if chunk_shape is None:
226-
if chunks is None:
227-
chunk_shape = chunks = _guess_chunks(shape=shape, typesize=np.dtype(dtype).itemsize)
228-
else:
229-
chunks = parse_shapelike(chunks)
226+
if chunks is not None and chunk_shape is not None:
227+
raise ValueError("Only one of chunk_shape or chunks can be provided.")
230228

231-
chunk_shape = chunks
232-
elif chunks is not None:
233-
raise ValueError("Only one of chunk_shape or chunks must be provided.")
229+
dtype = np.dtype(dtype)
230+
if chunks:
231+
_chunks = normalize_chunks(chunks, shape, dtype.itemsize)
232+
else:
233+
_chunks = normalize_chunks(chunk_shape, shape, dtype.itemsize)
234234

235235
if zarr_format == 3:
236236
if dimension_separator is not None:
@@ -253,7 +253,7 @@ async def create(
253253
store_path,
254254
shape=shape,
255255
dtype=dtype,
256-
chunk_shape=chunk_shape,
256+
chunk_shape=_chunks,
257257
fill_value=fill_value,
258258
chunk_key_encoding=chunk_key_encoding,
259259
codecs=codecs,
@@ -276,7 +276,7 @@ async def create(
276276
store_path,
277277
shape=shape,
278278
dtype=dtype,
279-
chunks=chunk_shape,
279+
chunks=_chunks,
280280
dimension_separator=dimension_separator,
281281
fill_value=fill_value,
282282
order=order,
@@ -404,6 +404,10 @@ async def open(
404404
metadata_dict = await get_array_metadata(store_path, zarr_format=zarr_format)
405405
return cls(store_path=store_path, metadata=metadata_dict)
406406

407+
@property
408+
def store(self) -> Store:
409+
return self.store_path.store
410+
407411
@property
408412
def ndim(self) -> int:
409413
return len(self.metadata.shape)
@@ -831,6 +835,10 @@ def open(
831835
async_array = sync(AsyncArray.open(store))
832836
return cls(async_array)
833837

838+
@property
839+
def store(self) -> Store:
840+
return self._async_array.store
841+
834842
@property
835843
def ndim(self) -> int:
836844
return self._async_array.ndim
@@ -2380,15 +2388,26 @@ def chunks_initialized(array: Array | AsyncArray) -> tuple[str, ...]:
23802388
def _build_parents(node: AsyncArray | AsyncGroup) -> list[AsyncGroup]:
23812389
from zarr.core.group import AsyncGroup, GroupMetadata
23822390

2383-
required_parts = node.store_path.path.split("/")[:-1]
2384-
parents = []
2391+
store = node.store_path.store
2392+
path = node.store_path.path
2393+
if not path:
2394+
return []
2395+
2396+
required_parts = path.split("/")[:-1]
2397+
parents = [
2398+
# the root group
2399+
AsyncGroup(
2400+
metadata=GroupMetadata(zarr_format=node.metadata.zarr_format),
2401+
store_path=StorePath(store=store, path=""),
2402+
)
2403+
]
23852404

23862405
for i, part in enumerate(required_parts):
2387-
path = "/".join(required_parts[:i] + [part])
2406+
p = "/".join(required_parts[:i] + [part])
23882407
parents.append(
23892408
AsyncGroup(
23902409
metadata=GroupMetadata(zarr_format=node.metadata.zarr_format),
2391-
store_path=StorePath(store=node.store_path.store, path=path),
2410+
store_path=StorePath(store=store, path=p),
23922411
)
23932412
)
23942413

src/zarr/core/chunk_grids.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22

33
import itertools
44
import math
5+
import numbers
56
import operator
67
from abc import abstractmethod
78
from dataclasses import dataclass
89
from functools import reduce
9-
from typing import TYPE_CHECKING
10+
from typing import TYPE_CHECKING, Any
1011

1112
import numpy as np
1213

@@ -97,6 +98,49 @@ def _guess_chunks(
9798
return tuple(int(x) for x in chunks)
9899

99100

101+
def normalize_chunks(chunks: Any, shape: tuple[int, ...], typesize: int) -> tuple[int, ...]:
102+
"""Convenience function to normalize the `chunks` argument for an array
103+
with the given `shape`."""
104+
105+
# N.B., expect shape already normalized
106+
107+
# handle auto-chunking
108+
if chunks is None or chunks is True:
109+
return _guess_chunks(shape, typesize)
110+
111+
# handle no chunking
112+
if chunks is False:
113+
return shape
114+
115+
# handle 1D convenience form
116+
if isinstance(chunks, numbers.Integral):
117+
chunks = tuple(int(chunks) for _ in shape)
118+
119+
# handle dask-style chunks (iterable of iterables)
120+
if all(isinstance(c, (tuple | list)) for c in chunks):
121+
# take first chunk size for each dimension
122+
chunks = tuple(
123+
c[0] for c in chunks
124+
) # TODO: check/error/warn for irregular chunks (e.g. if c[0] != c[1:-1])
125+
126+
# handle bad dimensionality
127+
if len(chunks) > len(shape):
128+
raise ValueError("too many dimensions in chunks")
129+
130+
# handle underspecified chunks
131+
if len(chunks) < len(shape):
132+
# assume chunks across remaining dimensions
133+
chunks += shape[len(chunks) :]
134+
135+
# handle None or -1 in chunks
136+
if -1 in chunks or None in chunks:
137+
chunks = tuple(
138+
s if c == -1 or c is None else int(c) for s, c in zip(shape, chunks, strict=False)
139+
)
140+
141+
return tuple(int(c) for c in chunks)
142+
143+
100144
@dataclass(frozen=True)
101145
class ChunkGrid(Metadata):
102146
@classmethod

src/zarr/core/group.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@
2929
)
3030
from zarr.core.config import config
3131
from zarr.core.sync import SyncMixin, sync
32-
from zarr.store import StoreLike, StorePath, make_store_path
33-
from zarr.store.common import ensure_no_existing_node
32+
from zarr.storage import StoreLike, make_store_path
33+
from zarr.storage.common import StorePath, ensure_no_existing_node
3434

3535
if TYPE_CHECKING:
3636
from collections.abc import AsyncGenerator, Generator, Iterable, Iterator
@@ -176,7 +176,9 @@ async def open(
176176
# alternatively, we could warn and favor v3
177177
raise ValueError("Both zarr.json and .zgroup objects exist")
178178
if zarr_json_bytes is None and zgroup_bytes is None:
179-
raise FileNotFoundError(store_path)
179+
raise FileNotFoundError(
180+
f"could not find zarr.json or .zgroup objects in {store_path}"
181+
)
180182
# set zarr_format based on which keys were found
181183
if zarr_json_bytes is not None:
182184
zarr_format = 3
@@ -698,6 +700,10 @@ async def _members(
698700
"Object at %s is not recognized as a component of a Zarr hierarchy.", key
699701
)
700702

703+
async def keys(self) -> AsyncGenerator[str, None]:
704+
async for key, _ in self.members():
705+
yield key
706+
701707
async def contains(self, member: str) -> bool:
702708
# TODO: this can be made more efficient.
703709
try:
@@ -821,15 +827,18 @@ def __delitem__(self, key: str) -> None:
821827
self._sync(self._async_group.delitem(key))
822828

823829
def __iter__(self) -> Iterator[str]:
824-
raise NotImplementedError
830+
yield from self.keys()
825831

826832
def __len__(self) -> int:
827-
raise NotImplementedError
833+
return self.nmembers()
828834

829835
def __setitem__(self, key: str, value: Any) -> None:
830836
"""__setitem__ is not supported in v3"""
831837
raise NotImplementedError
832838

839+
def __repr__(self) -> str:
840+
return f"<Group {self.store_path}>"
841+
833842
async def update_attributes_async(self, new_attributes: dict[str, Any]) -> Group:
834843
new_metadata = replace(self.metadata, attributes=new_attributes)
835844

@@ -904,6 +913,9 @@ def members(self, max_depth: int | None = 0) -> tuple[tuple[str, Array | Group],
904913

905914
return tuple((kv[0], _parse_async_node(kv[1])) for kv in _members)
906915

916+
def keys(self) -> Generator[str, None]:
917+
yield from self._sync_iter(self._async_group.keys())
918+
907919
def __contains__(self, member: str) -> bool:
908920
return self._sync(self._async_group.contains(member))
909921

src/zarr/storage/__init__.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from zarr.storage.common import StoreLike, StorePath, make_store_path
2+
from zarr.storage.local import LocalStore
3+
from zarr.storage.memory import MemoryStore
4+
from zarr.storage.remote import RemoteStore
5+
from zarr.storage.zip import ZipStore
6+
7+
__all__ = [
8+
"LocalStore",
9+
"MemoryStore",
10+
"RemoteStore",
11+
"StoreLike",
12+
"StorePath",
13+
"ZipStore",
14+
"make_store_path",
15+
]
File renamed without changes.

0 commit comments

Comments
 (0)