Skip to content

Commit aef47ac

Browse files
authored
[v2 / v3 compat] add Group.array and data kwarg to array creation (#2042)
* add deprecated Group.array method, and start filling in group unit tests * add errors module * memory store listdir fix, and a type annotation for a method on the test class * Use ContainsArrayError when a path contains an array; restore auto-chunking; restore data kwarg to array creation * use ContainsGroupError and contains_group for group routines * style changes to store tests * add a lot of tests, remove redundant decorators * add data kwarg to group.array * add newlines to end of docstrings * docstrings * put long type annotation on one line * port guess chunks to v3 * harden semantics for existing arrays / groups * fix exception name in docs
1 parent df4c25f commit aef47ac

File tree

11 files changed

+948
-103
lines changed

11 files changed

+948
-103
lines changed

src/zarr/array.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from zarr.abc.store import set_or_delete
2222
from zarr.attributes import Attributes
2323
from zarr.buffer import BufferPrototype, NDArrayLike, NDBuffer, default_buffer_prototype
24-
from zarr.chunk_grids import RegularChunkGrid
24+
from zarr.chunk_grids import RegularChunkGrid, _guess_chunks
2525
from zarr.chunk_key_encodings import ChunkKeyEncoding, DefaultChunkKeyEncoding, V2ChunkKeyEncoding
2626
from zarr.codecs import BytesCodec
2727
from zarr.codecs._v2 import V2Compressor, V2Filters
@@ -62,6 +62,9 @@
6262
)
6363
from zarr.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata
6464
from zarr.store import StoreLike, StorePath, make_store_path
65+
from zarr.store.core import (
66+
ensure_no_existing_node,
67+
)
6568
from zarr.sync import sync
6669

6770

@@ -137,12 +140,13 @@ async def create(
137140
compressor: dict[str, JSON] | None = None,
138141
# runtime
139142
exists_ok: bool = False,
143+
data: npt.ArrayLike | None = None,
140144
) -> AsyncArray:
141145
store_path = make_store_path(store)
142146

143147
if chunk_shape is None:
144148
if chunks is None:
145-
raise ValueError("Either chunk_shape or chunks needs to be provided.")
149+
chunk_shape = chunks = _guess_chunks(shape=shape, typesize=np.dtype(dtype).itemsize)
146150
chunk_shape = chunks
147151
elif chunks is not None:
148152
raise ValueError("Only one of chunk_shape or chunks must be provided.")
@@ -164,7 +168,7 @@ async def create(
164168
raise ValueError(
165169
"compressor cannot be used for arrays with version 3. Use bytes-to-bytes codecs instead."
166170
)
167-
return await cls._create_v3(
171+
result = await cls._create_v3(
168172
store_path,
169173
shape=shape,
170174
dtype=dtype,
@@ -187,7 +191,7 @@ async def create(
187191
)
188192
if dimension_names is not None:
189193
raise ValueError("dimension_names cannot be used for arrays with version 2.")
190-
return await cls._create_v2(
194+
result = await cls._create_v2(
191195
store_path,
192196
shape=shape,
193197
dtype=dtype,
@@ -203,6 +207,12 @@ async def create(
203207
else:
204208
raise ValueError(f"Unsupported zarr_format. Got: {zarr_format}")
205209

210+
if data is not None:
211+
# insert user-provided data
212+
await result.setitem(..., data)
213+
214+
return result
215+
206216
@classmethod
207217
async def _create_v3(
208218
cls,
@@ -224,7 +234,7 @@ async def _create_v3(
224234
exists_ok: bool = False,
225235
) -> AsyncArray:
226236
if not exists_ok:
227-
assert not await (store_path / ZARR_JSON).exists()
237+
await ensure_no_existing_node(store_path, zarr_format=3)
228238

229239
codecs = list(codecs) if codecs is not None else [BytesCodec()]
230240

@@ -280,8 +290,7 @@ async def _create_v2(
280290
import numcodecs
281291

282292
if not exists_ok:
283-
assert not await (store_path / ZARRAY_JSON).exists()
284-
293+
await ensure_no_existing_node(store_path, zarr_format=2)
285294
if order is None:
286295
order = "C"
287296

src/zarr/chunk_grids.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
from __future__ import annotations
22

33
import itertools
4+
import math
45
import operator
56
from abc import abstractmethod
67
from collections.abc import Iterator
78
from dataclasses import dataclass
89
from functools import reduce
910
from typing import TYPE_CHECKING
1011

12+
import numpy as np
13+
1114
from zarr.abc.metadata import Metadata
1215
from zarr.common import (
1316
JSON,
@@ -22,6 +25,75 @@
2225
from typing_extensions import Self
2326

2427

28+
def _guess_chunks(
29+
shape: ChunkCoords,
30+
typesize: int,
31+
*,
32+
increment_bytes: int = 256 * 1024,
33+
min_bytes: int = 128 * 1024,
34+
max_bytes: int = 64 * 1024 * 1024,
35+
) -> ChunkCoords:
36+
"""
37+
Iteratively guess an appropriate chunk layout for an array, given its shape and
38+
the size of each element in bytes, and size constraints expressed in bytes. This logic is
39+
adapted from h5py.
40+
41+
Parameters
42+
----------
43+
shape: ChunkCoords
44+
The chunk shape.
45+
typesize: int
46+
The size, in bytes, of each element of the chunk.
47+
increment_bytes: int = 256 * 1024
48+
The number of bytes used to increment or decrement the target chunk size in bytes.
49+
min_bytes: int = 128 * 1024
50+
The soft lower bound on the final chunk size in bytes.
51+
max_bytes: int = 64 * 1024 * 1024
52+
The hard upper bound on the final chunk size in bytes.
53+
54+
Returns
55+
-------
56+
ChunkCoords
57+
58+
"""
59+
60+
ndims = len(shape)
61+
# require chunks to have non-zero length for all dimensions
62+
chunks = np.maximum(np.array(shape, dtype="=f8"), 1)
63+
64+
# Determine the optimal chunk size in bytes using a PyTables expression.
65+
# This is kept as a float.
66+
dset_size = np.prod(chunks) * typesize
67+
target_size = increment_bytes * (2 ** np.log10(dset_size / (1024.0 * 1024)))
68+
69+
if target_size > max_bytes:
70+
target_size = max_bytes
71+
elif target_size < min_bytes:
72+
target_size = min_bytes
73+
74+
idx = 0
75+
while True:
76+
# Repeatedly loop over the axes, dividing them by 2. Stop when:
77+
# 1a. We're smaller than the target chunk size, OR
78+
# 1b. We're within 50% of the target chunk size, AND
79+
# 2. The chunk is smaller than the maximum chunk size
80+
81+
chunk_bytes = np.prod(chunks) * typesize
82+
83+
if (
84+
chunk_bytes < target_size or abs(chunk_bytes - target_size) / target_size < 0.5
85+
) and chunk_bytes < max_bytes:
86+
break
87+
88+
if np.prod(chunks) == 1:
89+
break # Element size larger than max_bytes
90+
91+
chunks[idx % ndims] = math.ceil(chunks[idx % ndims] / 2.0)
92+
idx += 1
93+
94+
return tuple(int(x) for x in chunks)
95+
96+
2597
@dataclass(frozen=True)
2698
class ChunkGrid(Metadata):
2799
@classmethod

src/zarr/errors.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
from typing import Any


class _BaseZarrError(ValueError):
    """
    Base class for Zarr errors raised when a store path already contains a node.

    Subclasses define ``_msg``, a ``str.format`` template; the positional
    arguments passed to the constructor fill the template.
    """

    _msg = ""

    def __init__(self, *args: Any) -> None:
        super().__init__(self._msg.format(*args))


class ContainsGroupError(_BaseZarrError):
    """Raised when a group exists at the path where a node would be created."""

    _msg = "A group exists in store {0!r} at path {1!r}."


class ContainsArrayError(_BaseZarrError):
    """Raised when an array exists at the path where a node would be created."""

    _msg = "An array exists in store {0!r} at path {1!r}."


class ContainsArrayAndGroupError(_BaseZarrError):
    """Raised when both array and group metadata exist at the same path."""

    # Note the trailing space after "{1!r}. " — without it the concatenated
    # message ran the two sentences together ("...path 'p'.Only one...").
    _msg = (
        "Array and group metadata documents (.zarray and .zgroup) were both found in store "
        "{0!r} at path {1!r}. "
        "Only one of these files may be present in a given directory / prefix. "
        "Remove the .zarray file, or the .zgroup file, or both."
    )

0 commit comments

Comments
 (0)