-
-
Notifications
You must be signed in to change notification settings - Fork 331
Optimize setitem with chunk equal to fill_value, round 2 #738
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8153810
c84839e
750d696
6ac2349
eb36713
053ad4c
3375bf0
30c3a30
d2fc396
e4e4012
bd27b9a
814d009
769f5a6
cd56b35
bcbaac4
9096f2c
044a9b8
74e0852
b2ec5ad
62a55ab
dbc32fd
cd28aff
160c2dc
b7fe1fe
af715fe
e17993a
7c9a041
59328f0
72488a8
7489ae9
10199b3
88b4811
3c69719
8aa93fa
0dae9da
40c3f14
7dde846
7a45fd2
99f59ef
a6ba3c7
6f8b6c4
054399e
7025d19
bbabe5c
3abcbc3
c3b4455
ea3356c
b81c14a
05716a3
23bfc1e
b921b34
48c38c7
7e4fbad
b063f52
020475b
a81ac83
f8d8415
1c29fe8
710b875
8a06884
7f859c3
0a7a3cc
1a0f41c
94d5d0a
a918f1d
3dd1afd
2165164
4a4adb1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,6 +32,7 @@ | |
from zarr.meta import decode_array_metadata, encode_array_metadata | ||
from zarr.storage import array_meta_key, attrs_key, getsize, listdir | ||
from zarr.util import ( | ||
all_equal, | ||
InfoReporter, | ||
check_array_shape, | ||
human_readable_size, | ||
|
@@ -75,6 +76,14 @@ class Array: | |
If True and while the chunk_store is a FSStore and the compression used | ||
is Blosc, when getting data from the array chunks will be partially | ||
read and decompressed when possible. | ||
write_empty_chunks : bool, optional | ||
If True (default), all chunks will be stored regardless of their | ||
contents. If False, each chunk is compared to the array's fill | ||
value prior to storing. If a chunk is uniformly equal to the fill | ||
value, then that chunk is not stored, and the store entry for | ||
that chunk's key is deleted. This setting enables sparser storage, | ||
as only chunks with non-fill-value data are stored, at the expense | ||
of overhead associated with checking the data of each chunk. | ||
|
||
.. versionadded:: 2.7 | ||
|
||
|
@@ -107,6 +116,7 @@ class Array: | |
info | ||
vindex | ||
oindex | ||
write_empty_chunks | ||
|
||
Methods | ||
------- | ||
|
@@ -139,6 +149,7 @@ def __init__( | |
cache_metadata=True, | ||
cache_attrs=True, | ||
partial_decompress=False, | ||
write_empty_chunks=True, | ||
): | ||
# N.B., expect at this point store is fully initialized with all | ||
# configuration metadata fully specified and normalized | ||
|
@@ -155,6 +166,7 @@ def __init__( | |
self._cache_metadata = cache_metadata | ||
self._is_view = False | ||
self._partial_decompress = partial_decompress | ||
self._write_empty_chunks = write_empty_chunks | ||
|
||
# initialize metadata | ||
self._load_metadata() | ||
|
@@ -455,6 +467,13 @@ def vindex(self): | |
:func:`set_mask_selection` for documentation and examples.""" | ||
return self._vindex | ||
|
||
@property | ||
def write_empty_chunks(self) -> bool: | ||
"""A Boolean, True if chunks composed of the array's fill value | ||
will be stored. If False, such chunks will not be stored. | ||
""" | ||
return self._write_empty_chunks | ||
|
||
def __eq__(self, other): | ||
return ( | ||
isinstance(other, Array) and | ||
|
@@ -1626,9 +1645,18 @@ def _set_basic_selection_zd(self, selection, value, fields=None): | |
else: | ||
chunk[selection] = value | ||
|
||
# encode and store | ||
cdata = self._encode_chunk(chunk) | ||
self.chunk_store[ckey] = cdata | ||
# remove chunk if write_empty_chunks is false and it only contains the fill value | ||
if (not self.write_empty_chunks) and all_equal(self.fill_value, chunk): | ||
try: | ||
del self.chunk_store[ckey] | ||
return | ||
except Exception: # pragma: no cover | ||
# deleting failed, fallback to overwriting | ||
pass | ||
else: | ||
# encode and store | ||
cdata = self._encode_chunk(chunk) | ||
self.chunk_store[ckey] = cdata | ||
|
||
def _set_basic_selection_nd(self, selection, value, fields=None): | ||
# implementation of __setitem__ for array with at least one dimension | ||
|
@@ -1896,11 +1924,38 @@ def _chunk_getitems(self, lchunk_coords, lchunk_selection, out, lout_selection, | |
out[out_select] = fill_value | ||
|
||
def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): | ||
ckeys = [self._chunk_key(co) for co in lchunk_coords] | ||
cdatas = [self._process_for_setitem(key, sel, val, fields=fields) | ||
for key, sel, val in zip(ckeys, lchunk_selection, values)] | ||
values = {k: v for k, v in zip(ckeys, cdatas)} | ||
self.chunk_store.setitems(values) | ||
ckeys = map(self._chunk_key, lchunk_coords) | ||
cdatas = {key: self._process_for_setitem(key, sel, val, fields=fields) | ||
for key, sel, val in zip(ckeys, lchunk_selection, values)} | ||
to_store = {} | ||
if not self.write_empty_chunks: | ||
empty_chunks = {k: v for k, v in cdatas.items() if all_equal(self.fill_value, v)} | ||
self._chunk_delitems(empty_chunks.keys()) | ||
nonempty_keys = cdatas.keys() - empty_chunks.keys() | ||
to_store = {k: self._encode_chunk(cdatas[k]) for k in nonempty_keys} | ||
else: | ||
to_store = {k: self._encode_chunk(v) for k, v in cdatas.items()} | ||
self.chunk_store.setitems(to_store) | ||
|
||
def _chunk_delitems(self, ckeys): | ||
if hasattr(self.store, "delitems"): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @grlee77: does this work with the |
||
self.store.delitems(ckeys) | ||
else: # pragma: no cover | ||
# exempting this branch from coverage as there are no extant stores | ||
# that will trigger this condition, but it's possible that they | ||
# will be developed in the future. | ||
tuple(map(self._chunk_delitem, ckeys)) | ||
joshmoore marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return None | ||
|
||
def _chunk_delitem(self, ckey): | ||
""" | ||
Attempt to delete the value associated with ckey. | ||
""" | ||
try: | ||
del self.chunk_store[ckey] | ||
return | ||
except KeyError: | ||
return | ||
|
||
def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): | ||
"""Replace part or whole of a chunk. | ||
|
@@ -1931,8 +1986,12 @@ def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): | |
def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None): | ||
ckey = self._chunk_key(chunk_coords) | ||
cdata = self._process_for_setitem(ckey, chunk_selection, value, fields=fields) | ||
# store | ||
self.chunk_store[ckey] = cdata | ||
|
||
# attempt to delete chunk if it only contains the fill value | ||
if (not self.write_empty_chunks) and all_equal(self.fill_value, cdata): | ||
self._chunk_delitem(ckey) | ||
else: | ||
self.chunk_store[ckey] = self._encode_chunk(cdata) | ||
|
||
def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): | ||
if is_total_slice(chunk_selection, self._chunks) and not fields: | ||
|
@@ -1988,8 +2047,7 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): | |
else: | ||
chunk[chunk_selection] = value | ||
|
||
# encode chunk | ||
return self._encode_chunk(chunk) | ||
return chunk | ||
|
||
def _chunk_key(self, chunk_coords): | ||
return self._key_prefix + self._dimension_separator.join(map(str, chunk_coords)) | ||
|
@@ -2209,7 +2267,8 @@ def hexdigest(self, hashname="sha1"): | |
|
||
def __getstate__(self): | ||
return (self._store, self._path, self._read_only, self._chunk_store, | ||
self._synchronizer, self._cache_metadata, self._attrs.cache) | ||
self._synchronizer, self._cache_metadata, self._attrs.cache, | ||
self._partial_decompress, self._write_empty_chunks) | ||
|
||
def __setstate__(self, state): | ||
self.__init__(*state) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1154,6 +1154,15 @@ def __delitem__(self, key): | |
else: | ||
del self.map[key] | ||
|
||
def delitems(self, keys): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What's the implication that this is defined on FSStore? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You mean, what if a store doesn't have this method? For stores without |
||
if self.mode == 'r': | ||
raise ReadOnlyError() | ||
# only remove the keys that exist in the store | ||
nkeys = [self._normalize_key(key) for key in keys if key in self] | ||
# rm errors if you pass an empty collection | ||
if len(nkeys) > 0: | ||
self.map.delitems(nkeys) | ||
|
||
def __contains__(self, key): | ||
key = self._normalize_key(key) | ||
return key in self.map | ||
|
Uh oh!
There was an error while loading. Please reload this page.