Skip to content

Commit 8d483c9

Browse files
authored
Remove implicit groups (#1827)
* wip: more group members work * remove implicit groups refactor Group.getitem and Group.open to better handle loading members for v2/v3 * tidy
1 parent 4f5ca4b commit 8d483c9

File tree

3 files changed

+65
-62
lines changed

3 files changed

+65
-62
lines changed

src/zarr/codecs/bytes.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ async def decode(
8282
dtype = np.dtype(f"{prefix}{chunk_spec.dtype.str[1:]}")
8383
else:
8484
dtype = np.dtype(f"|{chunk_spec.dtype.str[1:]}")
85-
print(dtype)
8685
chunk_array = np.frombuffer(chunk_bytes, dtype)
8786

8887
# ensure correct chunk shape

src/zarr/group.py

Lines changed: 61 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -109,37 +109,52 @@ async def open(
109109
cls,
110110
store: StoreLike,
111111
runtime_configuration: RuntimeConfiguration = RuntimeConfiguration(),
112-
zarr_format: Literal[2, 3] = 3,
112+
zarr_format: Literal[2, 3, None] = 3,
113113
) -> AsyncGroup:
114114
store_path = make_store_path(store)
115-
zarr_json_bytes = await (store_path / ZARR_JSON).get()
116-
assert zarr_json_bytes is not None
117-
118-
# TODO: consider trying to autodiscover the zarr-format here
119-
if zarr_format == 3:
120-
# V3 groups are comprised of a zarr.json object
121-
# (it is optional in the case of implicit groups)
122-
zarr_json_bytes = await (store_path / ZARR_JSON).get()
123-
zarr_json = (
124-
json.loads(zarr_json_bytes) if zarr_json_bytes is not None else {"zarr_format": 3}
125-
)
126115

127-
elif zarr_format == 2:
128-
# V2 groups are comprised of a .zgroup and .zattrs objects
129-
# (both are optional in the case of implicit groups)
116+
if zarr_format == 2:
130117
zgroup_bytes, zattrs_bytes = await asyncio.gather(
131118
(store_path / ZGROUP_JSON).get(), (store_path / ZATTRS_JSON).get()
132119
)
133-
zgroup = (
134-
json.loads(json.loads(zgroup_bytes))
135-
if zgroup_bytes is not None
136-
else {"zarr_format": 2}
120+
if zgroup_bytes is None:
121+
raise KeyError(store_path) # filenotfounderror?
122+
elif zarr_format == 3:
123+
zarr_json_bytes = await (store_path / ZARR_JSON).get()
124+
if zarr_json_bytes is None:
125+
raise KeyError(store_path) # filenotfounderror?
126+
elif zarr_format is None:
127+
zarr_json_bytes, zgroup_bytes, zattrs_bytes = await asyncio.gather(
128+
(store_path / ZARR_JSON).get(),
129+
(store_path / ZGROUP_JSON).get(),
130+
(store_path / ZATTRS_JSON).get(),
137131
)
138-
zattrs = json.loads(json.loads(zattrs_bytes)) if zattrs_bytes is not None else {}
139-
zarr_json = {**zgroup, "attributes": zattrs}
132+
if zarr_json_bytes is not None and zgroup_bytes is not None:
133+
# TODO: revisit this exception type
134+
# alternatively, we could warn and favor v3
135+
raise ValueError("Both zarr.json and .zgroup objects exist")
136+
if zarr_json_bytes is None and zgroup_bytes is None:
137+
raise KeyError(store_path) # filenotfounderror?
138+
# set zarr_format based on which keys were found
139+
if zarr_json_bytes is not None:
140+
zarr_format = 3
141+
else:
142+
zarr_format = 2
140143
else:
141144
raise ValueError(f"unexpected zarr_format: {zarr_format}")
142-
return cls.from_dict(store_path, zarr_json, runtime_configuration)
145+
146+
if zarr_format == 2:
147+
# V2 groups are comprised of a .zgroup and .zattrs objects
148+
assert zgroup_bytes is not None
149+
zgroup = json.loads(zgroup_bytes)
150+
zattrs = json.loads(zattrs_bytes) if zattrs_bytes is not None else {}
151+
group_metadata = {**zgroup, "attributes": zattrs}
152+
else:
153+
# V3 groups are comprised of a zarr.json object
154+
assert zarr_json_bytes is not None
155+
group_metadata = json.loads(zarr_json_bytes)
156+
157+
return cls.from_dict(store_path, group_metadata, runtime_configuration)
143158

144159
@classmethod
145160
def from_dict(
@@ -174,13 +189,7 @@ async def getitem(
174189
if self.metadata.zarr_format == 3:
175190
zarr_json_bytes = await (store_path / ZARR_JSON).get()
176191
if zarr_json_bytes is None:
177-
# implicit group?
178-
logger.warning("group at %s is an implicit group", store_path)
179-
zarr_json = {
180-
"zarr_format": self.metadata.zarr_format,
181-
"node_type": "group",
182-
"attributes": {},
183-
}
192+
raise KeyError(key)
184193
else:
185194
zarr_json = json.loads(zarr_json_bytes)
186195
if zarr_json["node_type"] == "group":
@@ -200,6 +209,9 @@ async def getitem(
200209
(store_path / ZATTRS_JSON).get(),
201210
)
202211

212+
if zgroup_bytes is None and zarray_bytes is None:
213+
raise KeyError(key)
214+
203215
# unpack the zarray, if this is None then we must be opening a group
204216
zarray = json.loads(zarray_bytes) if zarray_bytes else None
205217
# unpack the zattrs, this can be None if no attrs were written
@@ -212,9 +224,6 @@ async def getitem(
212224
store_path, zarray, runtime_configuration=self.runtime_configuration
213225
)
214226
else:
215-
if zgroup_bytes is None:
216-
# implicit group?
217-
logger.warning("group at %s is an implicit group", store_path)
218227
zgroup = (
219228
json.loads(zgroup_bytes)
220229
if zgroup_bytes is not None
@@ -288,7 +297,12 @@ def __repr__(self):
288297
return f"<AsyncGroup {self.store_path}>"
289298

290299
async def nmembers(self) -> int:
291-
raise NotImplementedError
300+
# TODO: consider using aioitertools.builtins.sum for this
301+
# return await aioitertools.builtins.sum((1 async for _ in self.members()), start=0)
302+
n = 0
303+
async for _ in self.members():
304+
n += 1
305+
return n
292306

293307
async def members(self) -> AsyncGenerator[tuple[str, AsyncArray | AsyncGroup], None]:
294308
"""
@@ -321,10 +335,14 @@ async def members(self) -> AsyncGenerator[tuple[str, AsyncArray | AsyncGroup], N
321335
logger.warning(
322336
"Object at %s is not recognized as a component of a Zarr hierarchy.", subkey
323337
)
324-
pass
325338

326339
async def contains(self, member: str) -> bool:
327-
raise NotImplementedError
340+
# TODO: this can be made more efficient.
341+
try:
342+
await self.getitem(member)
343+
return True
344+
except KeyError:
345+
return False
328346

329347
# todo: decide if this method should be separate from `groups`
330348
async def group_keys(self) -> AsyncGenerator[str, None]:
@@ -493,26 +511,18 @@ def members(self) -> tuple[tuple[str, Array | Group], ...]:
493511
def __contains__(self, member) -> bool:
494512
return self._sync(self._async_group.contains(member))
495513

496-
def group_keys(self) -> list[str]:
497-
# uncomment with AsyncGroup implements this method
498-
# return self._sync_iter(self._async_group.group_keys())
499-
raise NotImplementedError
514+
def group_keys(self) -> tuple[str, ...]:
515+
return tuple(self._sync_iter(self._async_group.group_keys()))
500516

501-
def groups(self) -> list[Group]:
517+
def groups(self) -> tuple[Group, ...]:
502518
# TODO: in v2 this was a generator that return key: Group
503-
# uncomment with AsyncGroup implements this method
504-
# return [Group(obj) for obj in self._sync_iter(self._async_group.groups())]
505-
raise NotImplementedError
519+
return tuple(Group(obj) for obj in self._sync_iter(self._async_group.groups()))
506520

507-
def array_keys(self) -> list[str]:
508-
# uncomment with AsyncGroup implements this method
509-
# return self._sync_iter(self._async_group.array_keys)
510-
raise NotImplementedError
521+
def array_keys(self) -> tuple[str, ...]:
522+
return tuple(self._sync_iter(self._async_group.array_keys()))
511523

512-
def arrays(self) -> list[Array]:
513-
# uncomment with AsyncGroup implements this method
514-
# return [Array(obj) for obj in self._sync_iter(self._async_group.arrays)]
515-
raise NotImplementedError
524+
def arrays(self) -> tuple[Array, ...]:
525+
return tuple(Array(obj) for obj in self._sync_iter(self._async_group.arrays()))
516526

517527
def tree(self, expand=False, level=None) -> Any:
518528
return self._sync(self._async_group.tree(expand=expand, level=level))

tests/v3/test_group.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -41,17 +41,11 @@ def test_group_members(store_type, request):
4141

4242
# add an extra object to the domain of the group.
4343
# the list of children should ignore this object.
44-
sync(store.set(f"{path}/extra_object", b"000000"))
44+
sync(store.set(f"{path}/extra_object-1", b"000000"))
4545
# add an extra object under a directory-like prefix in the domain of the group.
46-
# this creates an implicit group called implicit_subgroup
47-
sync(store.set(f"{path}/implicit_subgroup/extra_object", b"000000"))
48-
# make the implicit subgroup
49-
members_expected["implicit_subgroup"] = Group(
50-
AsyncGroup(
51-
metadata=GroupMetadata(),
52-
store_path=StorePath(store=store, path=f"{path}/implicit_subgroup"),
53-
)
54-
)
46+
# this creates a directory with a random key in it
47+
# this should not show up as a member
48+
sync(store.set(f"{path}/extra_directory/extra_object-2", b"000000"))
5549
members_observed = group.members
5650
# members are not guaranteed to be ordered, so sort before comparing
5751
assert sorted(dict(members_observed)) == sorted(members_expected)

0 commit comments

Comments
 (0)