Skip to content

Commit ceee364

Browse files
madsbk and normanrz
authored
[v3] First step to generalize ndarray and bytes (#1826)
* use Buffer * use memoryview as the underlying memory * use NDBuffer * convert to Buffer for the v2 tests * clean up * spilling * remove return_as_bytes_wrapper * remove as_ndarray * doc * clean up * as_buffer(): handle bytes like * removed sync.py again * separate Buffer and NNBuffer * impl. NDBuffer.from_numpy_array() * remove as_buffer() * remove Buffer.as_numpy_array() * impl. NDBuffer.as_buffer() * reduce the use of as_numpy_array() * impl. and use NDBuffer.all_equal * as_numpy_array(): doc * remove as_bytearray() * impl. Buffer.from_numpy_array() * NDArrayLike * Factory.Create * Factory.FromNumpy * doc * doc * remove the buffer factories again * NDBuffer.create(): take fill_value * getitem and setitem now use factory * doc * test * check_item_key_is_1d_contiguous * Buffer.create_zero_length() * Buffer.__add__(): use concat * Buffer.as_ndarray_like * Buffer.as_numpy_array * crc32c: use as_numpy_array * as_numpy_array_wrapper * fix import * use from __future__ import annotations * doc and clean up * doc * Apply suggestions from code review Co-authored-by: Norman Rzepka <code@normanrz.com> * Buffer is now backed by ArrayLike --------- Co-authored-by: Norman Rzepka <code@normanrz.com>
1 parent 67b07fb commit ceee364

23 files changed

+794
-246
lines changed

src/zarr/abc/codec.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,16 @@
33
from abc import abstractmethod
44
from typing import TYPE_CHECKING, Optional
55

6-
import numpy as np
76
from zarr.abc.metadata import Metadata
87

8+
from zarr.buffer import Buffer, NDBuffer
99
from zarr.common import ArraySpec
1010
from zarr.store import StorePath
1111

1212

1313
if TYPE_CHECKING:
1414
from typing_extensions import Self
15-
from zarr.common import BytesLike, SliceSelection
15+
from zarr.common import SliceSelection
1616
from zarr.metadata import ArrayMetadata
1717

1818

@@ -37,35 +37,35 @@ class ArrayArrayCodec(Codec):
3737
@abstractmethod
3838
async def decode(
3939
self,
40-
chunk_array: np.ndarray,
40+
chunk_array: NDBuffer,
4141
chunk_spec: ArraySpec,
42-
) -> np.ndarray:
42+
) -> NDBuffer:
4343
pass
4444

4545
@abstractmethod
4646
async def encode(
4747
self,
48-
chunk_array: np.ndarray,
48+
chunk_array: NDBuffer,
4949
chunk_spec: ArraySpec,
50-
) -> Optional[np.ndarray]:
50+
) -> Optional[NDBuffer]:
5151
pass
5252

5353

5454
class ArrayBytesCodec(Codec):
5555
@abstractmethod
5656
async def decode(
5757
self,
58-
chunk_array: BytesLike,
58+
chunk_array: Buffer,
5959
chunk_spec: ArraySpec,
60-
) -> np.ndarray:
60+
) -> NDBuffer:
6161
pass
6262

6363
@abstractmethod
6464
async def encode(
6565
self,
66-
chunk_array: np.ndarray,
66+
chunk_array: NDBuffer,
6767
chunk_spec: ArraySpec,
68-
) -> Optional[BytesLike]:
68+
) -> Optional[Buffer]:
6969
pass
7070

7171

@@ -76,7 +76,7 @@ async def decode_partial(
7676
store_path: StorePath,
7777
selection: SliceSelection,
7878
chunk_spec: ArraySpec,
79-
) -> Optional[np.ndarray]:
79+
) -> Optional[NDBuffer]:
8080
pass
8181

8282

@@ -85,7 +85,7 @@ class ArrayBytesCodecPartialEncodeMixin:
8585
async def encode_partial(
8686
self,
8787
store_path: StorePath,
88-
chunk_array: np.ndarray,
88+
chunk_array: NDBuffer,
8989
selection: SliceSelection,
9090
chunk_spec: ArraySpec,
9191
) -> None:
@@ -96,15 +96,15 @@ class BytesBytesCodec(Codec):
9696
@abstractmethod
9797
async def decode(
9898
self,
99-
chunk_array: BytesLike,
99+
chunk_array: Buffer,
100100
chunk_spec: ArraySpec,
101-
) -> BytesLike:
101+
) -> Buffer:
102102
pass
103103

104104
@abstractmethod
105105
async def encode(
106106
self,
107-
chunk_array: BytesLike,
107+
chunk_array: Buffer,
108108
chunk_spec: ArraySpec,
109-
) -> Optional[BytesLike]:
109+
) -> Optional[Buffer]:
110110
pass

src/zarr/abc/store.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,14 @@
33

44
from typing import List, Tuple, Optional
55

6+
from zarr.buffer import Buffer
7+
68

79
class Store(ABC):
810
@abstractmethod
911
async def get(
1012
self, key: str, byte_range: Optional[Tuple[int, Optional[int]]] = None
11-
) -> Optional[bytes]:
13+
) -> Optional[Buffer]:
1214
"""Retrieve the value associated with a given key.
1315
1416
Parameters
@@ -18,14 +20,14 @@ async def get(
1820
1921
Returns
2022
-------
21-
bytes
23+
Buffer
2224
"""
2325
...
2426

2527
@abstractmethod
2628
async def get_partial_values(
2729
self, key_ranges: List[Tuple[str, Tuple[int, int]]]
28-
) -> List[Optional[bytes]]:
30+
) -> List[Optional[Buffer]]:
2931
"""Retrieve possibly partial values from given key_ranges.
3032
3133
Parameters
@@ -35,8 +37,7 @@ async def get_partial_values(
3537
3638
Returns
3739
-------
38-
list[bytes]
39-
list of values, in the order of the key_ranges, may contain null/none for missing keys
40+
list of values, in the order of the key_ranges, may contain null/none for missing keys
4041
"""
4142
...
4243

@@ -61,7 +62,7 @@ def supports_writes(self) -> bool:
6162
...
6263

6364
@abstractmethod
64-
async def set(self, key: str, value: bytes) -> None:
65+
async def set(self, key: str, value: Buffer) -> None:
6566
"""Store a (key, value) pair.
6667
6768
Parameters

src/zarr/array.py

Lines changed: 33 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121

2222
# from zarr.array_v2 import ArrayV2
23+
from zarr.buffer import Buffer, Factory, NDArrayLike, NDBuffer
2324
from zarr.codecs import BytesCodec
2425
from zarr.codecs.pipeline import CodecPipeline
2526
from zarr.common import (
@@ -147,7 +148,7 @@ async def open(
147148
assert zarr_json_bytes is not None
148149
return cls.from_dict(
149150
store_path,
150-
json.loads(zarr_json_bytes),
151+
json.loads(zarr_json_bytes.to_bytes()),
151152
)
152153

153154
@classmethod
@@ -160,7 +161,7 @@ async def open_auto(
160161
if v3_metadata_bytes is not None:
161162
return cls.from_dict(
162163
store_path,
163-
json.loads(v3_metadata_bytes),
164+
json.loads(v3_metadata_bytes.to_bytes()),
164165
)
165166
else:
166167
raise ValueError("no v2 support yet")
@@ -186,7 +187,9 @@ def dtype(self) -> np.dtype[Any]:
186187
def attrs(self) -> dict[str, Any]:
187188
return self.metadata.attributes
188189

189-
async def getitem(self, selection: Selection) -> npt.NDArray[Any]:
190+
async def getitem(
191+
self, selection: Selection, *, factory: Factory.Create = NDBuffer.create
192+
) -> NDArrayLike:
190193
assert isinstance(self.metadata.chunk_grid, RegularChunkGrid)
191194
indexer = BasicIndexer(
192195
selection,
@@ -195,10 +198,8 @@ async def getitem(self, selection: Selection) -> npt.NDArray[Any]:
195198
)
196199

197200
# setup output array
198-
out = np.zeros(
199-
indexer.shape,
200-
dtype=self.metadata.dtype,
201-
order=self.order,
201+
out = factory(
202+
shape=indexer.shape, dtype=self.metadata.dtype, order=self.order, fill_value=0
202203
)
203204

204205
# reading chunks and decoding them
@@ -210,21 +211,17 @@ async def getitem(self, selection: Selection) -> npt.NDArray[Any]:
210211
self._read_chunk,
211212
config.get("async.concurrency"),
212213
)
213-
214-
if out.shape:
215-
return out
216-
else:
217-
return out[()]
214+
return out.as_ndarray_like()
218215

219216
async def _save_metadata(self) -> None:
220-
await (self.store_path / ZARR_JSON).set(self.metadata.to_bytes())
217+
await (self.store_path / ZARR_JSON).set(Buffer.from_bytes(self.metadata.to_bytes()))
221218

222219
async def _read_chunk(
223220
self,
224221
chunk_coords: ChunkCoords,
225222
chunk_selection: SliceSelection,
226223
out_selection: SliceSelection,
227-
out: npt.NDArray[Any],
224+
out: NDBuffer,
228225
) -> None:
229226
chunk_spec = self.metadata.get_chunk_spec(chunk_coords, self.order)
230227
chunk_key_encoding = self.metadata.chunk_key_encoding
@@ -246,7 +243,12 @@ async def _read_chunk(
246243
else:
247244
out[out_selection] = self.metadata.fill_value
248245

249-
async def setitem(self, selection: Selection, value: npt.NDArray[Any]) -> None:
246+
async def setitem(
247+
self,
248+
selection: Selection,
249+
value: NDArrayLike,
250+
factory: Factory.NDArrayLike = NDBuffer.from_ndarray_like,
251+
) -> None:
250252
assert isinstance(self.metadata.chunk_grid, RegularChunkGrid)
251253
chunk_shape = self.metadata.chunk_grid.chunk_shape
252254
indexer = BasicIndexer(
@@ -259,15 +261,19 @@ async def setitem(self, selection: Selection, value: npt.NDArray[Any]) -> None:
259261

260262
# check value shape
261263
if np.isscalar(value):
262-
# setting a scalar value
263-
pass
264+
value = np.asanyarray(value)
264265
else:
265266
if not hasattr(value, "shape"):
266267
value = np.asarray(value, self.metadata.dtype)
267268
assert value.shape == sel_shape
268269
if value.dtype.name != self.metadata.dtype.name:
269270
value = value.astype(self.metadata.dtype, order="A")
270271

272+
# We accept any ndarray like object from the user and convert it
273+
# to a NDBuffer (or subclass). From this point onwards, we only pass
274+
# Buffer and NDBuffer between components.
275+
value = factory(value)
276+
271277
# merging with existing data and encoding chunks
272278
await concurrent_map(
273279
[
@@ -286,7 +292,7 @@ async def setitem(self, selection: Selection, value: npt.NDArray[Any]) -> None:
286292

287293
async def _write_chunk(
288294
self,
289-
value: npt.NDArray[Any],
295+
value: NDBuffer,
290296
chunk_shape: ChunkCoords,
291297
chunk_coords: ChunkCoords,
292298
chunk_selection: SliceSelection,
@@ -300,11 +306,9 @@ async def _write_chunk(
300306
if is_total_slice(chunk_selection, chunk_shape):
301307
# write entire chunks
302308
if np.isscalar(value):
303-
chunk_array = np.empty(
304-
chunk_shape,
305-
dtype=self.metadata.dtype,
309+
chunk_array = NDBuffer.create(
310+
shape=chunk_shape, dtype=self.metadata.dtype, fill_value=value
306311
)
307-
chunk_array.fill(value)
308312
else:
309313
chunk_array = value[out_selection]
310314
await self._write_chunk_to_store(store_path, chunk_array, chunk_spec)
@@ -324,11 +328,11 @@ async def _write_chunk(
324328

325329
# merge new value
326330
if chunk_bytes is None:
327-
chunk_array = np.empty(
328-
chunk_shape,
331+
chunk_array = NDBuffer.create(
332+
shape=chunk_shape,
329333
dtype=self.metadata.dtype,
334+
fill_value=self.metadata.fill_value,
330335
)
331-
chunk_array.fill(self.metadata.fill_value)
332336
else:
333337
chunk_array = (
334338
await self.codecs.decode(chunk_bytes, chunk_spec)
@@ -338,9 +342,9 @@ async def _write_chunk(
338342
await self._write_chunk_to_store(store_path, chunk_array, chunk_spec)
339343

340344
async def _write_chunk_to_store(
341-
self, store_path: StorePath, chunk_array: npt.NDArray[Any], chunk_spec: ArraySpec
345+
self, store_path: StorePath, chunk_array: NDBuffer, chunk_spec: ArraySpec
342346
) -> None:
343-
if np.all(chunk_array == self.metadata.fill_value):
347+
if chunk_array.all_equal(self.metadata.fill_value):
344348
# chunks that only contain fill_value will be removed
345349
await store_path.delete()
346350
else:
@@ -379,14 +383,14 @@ async def _delete_key(key: str) -> None:
379383
)
380384

381385
# Write new metadata
382-
await (self.store_path / ZARR_JSON).set(new_metadata.to_bytes())
386+
await (self.store_path / ZARR_JSON).set(Buffer.from_bytes(new_metadata.to_bytes()))
383387
return replace(self, metadata=new_metadata)
384388

385389
async def update_attributes(self, new_attributes: Dict[str, Any]) -> AsyncArray:
386390
new_metadata = replace(self.metadata, attributes=new_attributes)
387391

388392
# Write new metadata
389-
await (self.store_path / ZARR_JSON).set(new_metadata.to_bytes())
393+
await (self.store_path / ZARR_JSON).set(Buffer.from_bytes(new_metadata.to_bytes()))
390394
return replace(self, metadata=new_metadata)
391395

392396
def __repr__(self) -> str:

0 commit comments

Comments
 (0)