Skip to content

Commit b6a6d41

Browse files
committed
merge
1 parent 960c885 commit b6a6d41

37 files changed

+581
-5020
lines changed

src/zarr/v3/abc/codec.py

Lines changed: 36 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,3 @@
1-
# Notes:
2-
# 1. These are missing methods described in the spec. I expected to see these method definitions:
3-
# def compute_encoded_representation_type(self, decoded_representation_type):
4-
# def encode(self, decoded_value):
5-
# def decode(self, encoded_value, decoded_representation_type):
6-
# def partial_decode(self, input_handle, decoded_representation_type, decoded_regions):
7-
# def compute_encoded_size(self, input_size):
8-
# 2. Understand why array metadata is included on all codecs
9-
10-
111
from __future__ import annotations
122

133
from abc import abstractmethod, ABC
@@ -20,30 +10,39 @@
2010

2111

2212
if TYPE_CHECKING:
23-
from zarr.v3.metadata import CoreArrayMetadata, CodecMetadata
13+
from zarr.v3.metadata import (
14+
ArraySpec,
15+
ArrayMetadata,
16+
DataType,
17+
CodecMetadata,
18+
RuntimeConfiguration,
19+
)
2420

2521

2622
class Codec(ABC):
2723
is_fixed_size: bool
28-
array_metadata: CoreArrayMetadata
2924

25+
@classmethod
3026
@abstractmethod
31-
def compute_encoded_size(self, input_byte_length: int) -> int:
27+
def get_metadata_class(cls) -> Type[CodecMetadata]:
3228
pass
3329

34-
def resolve_metadata(self) -> CoreArrayMetadata:
35-
return self.array_metadata
36-
3730
@classmethod
3831
@abstractmethod
39-
def from_metadata(
40-
cls, codec_metadata: "CodecMetadata", array_metadata: CoreArrayMetadata
41-
) -> Codec:
32+
def from_metadata(cls, codec_metadata: CodecMetadata) -> Codec:
4233
pass
4334

44-
@classmethod
4535
@abstractmethod
46-
def get_metadata_class(cls) -> "Type[CodecMetadata]":
36+
def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int:
37+
pass
38+
39+
def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
40+
return chunk_spec
41+
42+
def evolve(self, *, ndim: int, data_type: DataType) -> Codec:
43+
return self
44+
45+
def validate(self, array_metadata: ArrayMetadata) -> None:
4746
pass
4847

4948

@@ -52,13 +51,17 @@ class ArrayArrayCodec(Codec):
5251
async def decode(
5352
self,
5453
chunk_array: np.ndarray,
54+
chunk_spec: ArraySpec,
55+
runtime_configuration: RuntimeConfiguration,
5556
) -> np.ndarray:
5657
pass
5758

5859
@abstractmethod
5960
async def encode(
6061
self,
6162
chunk_array: np.ndarray,
63+
chunk_spec: ArraySpec,
64+
runtime_configuration: RuntimeConfiguration,
6265
) -> Optional[np.ndarray]:
6366
pass
6467

@@ -68,13 +71,17 @@ class ArrayBytesCodec(Codec):
6871
async def decode(
6972
self,
7073
chunk_array: BytesLike,
74+
chunk_spec: ArraySpec,
75+
runtime_configuration: RuntimeConfiguration,
7176
) -> np.ndarray:
7277
pass
7378

7479
@abstractmethod
7580
async def encode(
7681
self,
7782
chunk_array: np.ndarray,
83+
chunk_spec: ArraySpec,
84+
runtime_configuration: RuntimeConfiguration,
7885
) -> Optional[BytesLike]:
7986
pass
8087

@@ -85,6 +92,8 @@ async def decode_partial(
8592
self,
8693
store_path: StorePath,
8794
selection: SliceSelection,
95+
chunk_spec: ArraySpec,
96+
runtime_configuration: RuntimeConfiguration,
8897
) -> Optional[np.ndarray]:
8998
pass
9099

@@ -96,6 +105,8 @@ async def encode_partial(
96105
store_path: StorePath,
97106
chunk_array: np.ndarray,
98107
selection: SliceSelection,
108+
chunk_spec: ArraySpec,
109+
runtime_configuration: RuntimeConfiguration,
99110
) -> None:
100111
pass
101112

@@ -105,12 +116,16 @@ class BytesBytesCodec(Codec):
105116
async def decode(
106117
self,
107118
chunk_array: BytesLike,
119+
chunk_spec: ArraySpec,
120+
runtime_configuration: RuntimeConfiguration,
108121
) -> BytesLike:
109122
pass
110123

111124
@abstractmethod
112125
async def encode(
113126
self,
114127
chunk_array: BytesLike,
128+
chunk_spec: ArraySpec,
129+
runtime_configuration: RuntimeConfiguration,
115130
) -> Optional[BytesLike]:
116131
pass

src/zarr/v3/array.py

Lines changed: 38 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,10 @@
1616
import numpy as np
1717
from attr import evolve, frozen
1818

19-
from zarr.v3.abc.codec import ArrayBytesCodecPartialDecodeMixin
20-
2119

2220
# from zarr.v3.array_v2 import ArrayV2
2321
from zarr.v3.codecs import CodecMetadata, CodecPipeline, bytes_codec
22+
from zarr.v3.codecs.registry import get_codec_from_metadata
2423
from zarr.v3.common import (
2524
ZARR_JSON,
2625
ChunkCoords,
@@ -31,6 +30,7 @@
3130
from zarr.v3.indexing import BasicIndexer, all_chunk_coords, is_total_slice
3231
from zarr.v3.metadata import (
3332
ArrayMetadata,
33+
ArraySpec,
3434
DataType,
3535
DefaultChunkKeyEncodingConfigurationMetadata,
3636
DefaultChunkKeyEncodingMetadata,
@@ -41,7 +41,6 @@
4141
V2ChunkKeyEncodingMetadata,
4242
dtype_to_data_type,
4343
)
44-
from zarr.v3.codecs.sharding import ShardingCodec
4544
from zarr.v3.store import StoreLike, StorePath, make_store_path
4645
from zarr.v3.sync import sync
4746

@@ -118,8 +117,11 @@ async def create(
118117
metadata=metadata,
119118
store_path=store_path,
120119
runtime_configuration=runtime_configuration,
121-
codec_pipeline=CodecPipeline.from_metadata(
122-
metadata.codecs, metadata.get_core_metadata(runtime_configuration)
120+
codec_pipeline=CodecPipeline.create(
121+
[
122+
get_codec_from_metadata(codec).evolve(ndim=len(shape), data_type=data_type)
123+
for codec in codecs
124+
]
123125
),
124126
)
125127

@@ -134,13 +136,17 @@ def from_json(
134136
runtime_configuration: RuntimeConfiguration,
135137
) -> AsyncArray:
136138
metadata = ArrayMetadata.from_json(zarr_json)
139+
codecs = [
140+
get_codec_from_metadata(codec).evolve(
141+
ndim=len(metadata.shape), data_type=metadata.data_type
142+
)
143+
for codec in metadata.codecs
144+
]
137145
async_array = cls(
138146
metadata=metadata,
139147
store_path=store_path,
140148
runtime_configuration=runtime_configuration,
141-
codec_pipeline=CodecPipeline.from_metadata(
142-
metadata.codecs, metadata.get_core_metadata(runtime_configuration)
143-
),
149+
codec_pipeline=CodecPipeline.create(codecs),
144150
)
145151
async_array._validate_metadata()
146152
return async_array
@@ -240,6 +246,7 @@ def _validate_metadata(self) -> None:
240246
self.metadata.dimension_names
241247
), "`dimension_names` and `shape` need to have the same number of dimensions."
242248
assert self.metadata.fill_value is not None, "`fill_value` is required."
249+
self.codec_pipeline.validate(self.metadata)
243250

244251
async def _read_chunk(
245252
self,
@@ -248,15 +255,14 @@ async def _read_chunk(
248255
out_selection: SliceSelection,
249256
out: np.ndarray,
250257
):
258+
chunk_spec = self.metadata.get_chunk_spec(chunk_coords)
251259
chunk_key_encoding = self.metadata.chunk_key_encoding
252260
chunk_key = chunk_key_encoding.encode_chunk_key(chunk_coords)
253261
store_path = self.store_path / chunk_key
254262

255-
if len(self.codec_pipeline.codecs) == 1 and isinstance(
256-
self.codec_pipeline.codecs[0], ArrayBytesCodecPartialDecodeMixin
257-
):
258-
chunk_array = await self.codec_pipeline.codecs[0].decode_partial(
259-
store_path, chunk_selection
263+
if self.codec_pipeline.supports_partial_decode:
264+
chunk_array = await self.codec_pipeline.decode_partial(
265+
store_path, chunk_selection, chunk_spec, self.runtime_configuration
260266
)
261267
if chunk_array is not None:
262268
out[out_selection] = chunk_array
@@ -265,7 +271,9 @@ async def _read_chunk(
265271
else:
266272
chunk_bytes = await store_path.get()
267273
if chunk_bytes is not None:
268-
chunk_array = await self.codec_pipeline.decode(chunk_bytes)
274+
chunk_array = await self.codec_pipeline.decode(
275+
chunk_bytes, chunk_spec, self.runtime_configuration
276+
)
269277
tmp = chunk_array[chunk_selection]
270278
out[out_selection] = tmp
271279
else:
@@ -316,6 +324,7 @@ async def _write_chunk(
316324
chunk_selection: SliceSelection,
317325
out_selection: SliceSelection,
318326
):
327+
chunk_spec = self.metadata.get_chunk_spec(chunk_coords)
319328
chunk_key_encoding = self.metadata.chunk_key_encoding
320329
chunk_key = chunk_key_encoding.encode_chunk_key(chunk_coords)
321330
store_path = self.store_path / chunk_key
@@ -330,17 +339,16 @@ async def _write_chunk(
330339
chunk_array.fill(value)
331340
else:
332341
chunk_array = value[out_selection]
333-
await self._write_chunk_to_store(store_path, chunk_array)
342+
await self._write_chunk_to_store(store_path, chunk_array, chunk_spec)
334343

335-
elif len(self.codec_pipeline.codecs) == 1 and isinstance(
336-
self.codec_pipeline.codecs[0], ShardingCodec
337-
):
338-
sharding_codec = self.codec_pipeline.codecs[0]
344+
elif self.codec_pipeline.supports_partial_encode:
339345
# print("encode_partial", chunk_coords, chunk_selection, repr(self))
340-
await sharding_codec.encode_partial(
346+
await self.codec_pipeline.encode_partial(
341347
store_path,
342348
value[out_selection],
343349
chunk_selection,
350+
chunk_spec,
351+
self.runtime_configuration,
344352
)
345353
else:
346354
# writing partial chunks
@@ -356,18 +364,24 @@ async def _write_chunk(
356364
chunk_array.fill(self.metadata.fill_value)
357365
else:
358366
chunk_array = (
359-
await self.codec_pipeline.decode(chunk_bytes)
367+
await self.codec_pipeline.decode(
368+
chunk_bytes, chunk_spec, self.runtime_configuration
369+
)
360370
).copy() # make a writable copy
361371
chunk_array[chunk_selection] = value[out_selection]
362372

363-
await self._write_chunk_to_store(store_path, chunk_array)
373+
await self._write_chunk_to_store(store_path, chunk_array, chunk_spec)
364374

365-
async def _write_chunk_to_store(self, store_path: StorePath, chunk_array: np.ndarray):
375+
async def _write_chunk_to_store(
376+
self, store_path: StorePath, chunk_array: np.ndarray, chunk_spec: ArraySpec
377+
):
366378
if np.all(chunk_array == self.metadata.fill_value):
367379
# chunks that only contain fill_value will be removed
368380
await store_path.delete()
369381
else:
370-
chunk_bytes = await self.codec_pipeline.encode(chunk_array)
382+
chunk_bytes = await self.codec_pipeline.encode(
383+
chunk_array, chunk_spec, self.runtime_configuration
384+
)
371385
if chunk_bytes is None:
372386
await store_path.delete()
373387
else:

src/zarr/v3/array_v2.py

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
to_thread,
2121
)
2222
from zarr.v3.indexing import BasicIndexer, all_chunk_coords, is_total_slice
23-
from zarr.v3.metadata import ArrayV2Metadata, RuntimeConfiguration
23+
from zarr.v3.metadata import ArrayV2Metadata, CodecMetadata, RuntimeConfiguration
2424
from zarr.v3.store import StoreLike, StorePath, make_store_path
2525
from zarr.v3.sync import sync
2626

@@ -83,12 +83,14 @@ async def create_async(
8383
order=order,
8484
dimension_separator=dimension_separator,
8585
fill_value=0 if fill_value is None else fill_value,
86-
compressor=numcodecs.get_codec(compressor).get_config()
87-
if compressor is not None
88-
else None,
89-
filters=[numcodecs.get_codec(filter).get_config() for filter in filters]
90-
if filters is not None
91-
else None,
86+
compressor=(
87+
numcodecs.get_codec(compressor).get_config() if compressor is not None else None
88+
),
89+
filters=(
90+
[numcodecs.get_codec(filter).get_config() for filter in filters]
91+
if filters is not None
92+
else None
93+
),
9294
)
9395
array = cls(
9496
metadata=metadata,
@@ -441,22 +443,29 @@ async def convert_to_v3_async(self) -> Array:
441443
from zarr.v3.common import ZARR_JSON
442444
from zarr.v3.metadata import (
443445
ArrayMetadata,
446+
DataType,
447+
RegularChunkGridConfigurationMetadata,
448+
RegularChunkGridMetadata,
449+
V2ChunkKeyEncodingConfigurationMetadata,
450+
V2ChunkKeyEncodingMetadata,
451+
dtype_to_data_type,
452+
)
453+
from zarr.v3.codecs.blosc import (
444454
BloscCodecConfigurationMetadata,
445455
BloscCodecMetadata,
456+
blosc_shuffle_int_to_str,
457+
)
458+
from zarr.v3.codecs.bytes import (
446459
BytesCodecConfigurationMetadata,
447460
BytesCodecMetadata,
448-
CodecMetadata,
449-
DataType,
461+
)
462+
from zarr.v3.codecs.gzip import (
450463
GzipCodecConfigurationMetadata,
451464
GzipCodecMetadata,
452-
RegularChunkGridConfigurationMetadata,
453-
RegularChunkGridMetadata,
465+
)
466+
from zarr.v3.codecs.transpose import (
454467
TransposeCodecConfigurationMetadata,
455468
TransposeCodecMetadata,
456-
V2ChunkKeyEncodingConfigurationMetadata,
457-
V2ChunkKeyEncodingMetadata,
458-
blosc_shuffle_int_to_str,
459-
dtype_to_data_type,
460469
)
461470

462471
data_type = DataType[dtype_to_data_type[self.metadata.dtype.str]]
@@ -476,7 +485,11 @@ async def convert_to_v3_async(self) -> Array:
476485

477486
if self.metadata.order == "F":
478487
codecs.append(
479-
TransposeCodecMetadata(configuration=TransposeCodecConfigurationMetadata(order="F"))
488+
TransposeCodecMetadata(
489+
configuration=TransposeCodecConfigurationMetadata(
490+
order=tuple(reversed(range(self.metadata.ndim)))
491+
)
492+
)
480493
)
481494
codecs.append(
482495
BytesCodecMetadata(configuration=BytesCodecConfigurationMetadata(endian=endian))

0 commit comments

Comments
 (0)