Skip to content

Commit 6fcd873

Browse files
committed
Zarr: optimize appending
1 parent 7489aba commit 6fcd873

File tree

2 files changed

+35
-40
lines changed

2 files changed

+35
-40
lines changed

xarray/backends/api.py

Lines changed: 3 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1521,42 +1521,6 @@ def save_mfdataset(
15211521
)
15221522

15231523

1524-
def _validate_datatypes_for_zarr_append(zstore, dataset):
    """Check that data variables to append match the dtypes already in the store.

    Parameters
    ----------
    zstore
        Store object; its ``get_variables()`` result is used as a mapping from
        variable name to the existing variable (only ``dtype`` is read).
    dataset
        Dataset to append; ``dataset.data_vars`` is iterated as a mapping from
        name to variable (only ``dtype`` is read).

    Raises
    ------
    ValueError
        If a data variable already exists in the store, its dtype is not one
        of the easy-to-append kinds, and it differs from the stored dtype.
    """
    existing_vars = zstore.get_variables()

    for vname, var in dataset.data_vars.items():
        # A variable that is new to the store has nothing to be compared with.
        if vname not in existing_vars:
            continue

        new_dtype = var.dtype
        # Numeric, datetime64, boolean and object dtypes are known to be
        # easy-to-append, so we can be confident appending won't cause
        # problems and the equality check is skipped.
        if (
            np.issubdtype(new_dtype, np.number)
            or np.issubdtype(new_dtype, np.datetime64)
            or np.issubdtype(new_dtype, np.bool_)
            or new_dtype == object
        ):
            continue

        # Remaining dtypes include length-specified strings of type `|S*` or
        # `<U*` (where * is a positive integer character length). Appending
        # dissimilar lengths can truncate the appended data, so exact dtype
        # equality is required.
        existing_dtype = existing_vars[vname].dtype
        if new_dtype != existing_dtype:
            raise ValueError(
                f"Mismatched dtypes for variable {vname} between Zarr store on disk "
                f"and dataset to append. Store has dtype {existing_dtype} but "
                f"dataset to append has dtype {new_dtype}."
            )
1558-
1559-
15601524
# compute=True returns ZarrStore
15611525
@overload
15621526
def to_zarr(
@@ -1721,17 +1685,16 @@ def to_zarr(
17211685
)
17221686

17231687
if mode in ["a", "a-", "r+"]:
1724-
_validate_datatypes_for_zarr_append(zstore, dataset)
1725-
if append_dim is not None:
1688+
existing_var_names = set(zstore.zarr_group.array_keys())
1689+
if append_dim is not None and append_dim not in existing_var_names:
17261690
existing_dims = zstore.get_dimensions()
17271691
if append_dim not in existing_dims:
17281692
raise ValueError(
17291693
f"append_dim={append_dim!r} does not match any existing "
17301694
f"dataset dimensions {existing_dims}"
17311695
)
1732-
existing_var_names = set(zstore.zarr_group.array_keys())
17331696
for var_name in existing_var_names:
1734-
if var_name in encoding.keys():
1697+
if var_name in encoding:
17351698
raise ValueError(
17361699
f"variable {var_name!r} already exists, but encoding was provided"
17371700
)

xarray/backends/zarr.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,34 @@ def encode_zarr_variable(var, needs_copy=True, name=None):
324324
return var
325325

326326

327+
def _validate_datatypes_for_zarr_append(vname, existing_var, new_var):
    """Confirm the dtype of data to append is compatible with the existing dtype.

    Parameters
    ----------
    vname
        Name of the variable; used only in the error message.
    existing_var
        The variable as it already exists in the store (only ``dtype`` is read).
    new_var
        The variable holding the data to append (only ``dtype`` is read).

    Raises
    ------
    ValueError
        If the dtype of ``new_var`` is not one of the easy-to-append kinds and
        it differs from the dtype of ``existing_var``.
    """
    new_dtype = new_var.dtype

    # Numeric, datetime64, boolean and object dtypes are known to be
    # easy-to-append, so we can be confident appending won't cause problems
    # and the dtype equality check is skipped.
    if (
        np.issubdtype(new_dtype, np.number)
        or np.issubdtype(new_dtype, np.datetime64)
        or np.issubdtype(new_dtype, np.bool_)
        or new_dtype == object
    ):
        return

    # Dtypes which are not easy-to-append include length-specified strings of
    # type `|S*` or `<U*` (where * is a positive integer character length).
    # For these dtypes, appending dissimilar lengths can result in truncation
    # of appended data, so exact dtype equality is required.
    existing_dtype = existing_var.dtype
    if new_dtype != existing_dtype:
        raise ValueError(
            f"Mismatched dtypes for variable {vname} between Zarr store on disk "
            f"and dataset to append. Store has dtype {existing_dtype} but "
            f"dataset to append has dtype {new_dtype}."
        )
353+
354+
327355
def _validate_and_transpose_existing_dims(
328356
var_name, new_var, existing_var, region, append_dim
329357
):
@@ -632,6 +660,10 @@ def store(
632660
# Modified variables must use the same encoding as the store.
633661
vars_with_encoding = {}
634662
for vn in existing_variable_names:
663+
if self._mode in ["a", "a-", "r+"]:
664+
_validate_datatypes_for_zarr_append(
665+
vn, existing_vars[vn], variables[vn]
666+
)
635667
vars_with_encoding[vn] = variables[vn].copy(deep=False)
636668
vars_with_encoding[vn].encoding = existing_vars[vn].encoding
637669
vars_with_encoding, _ = self.encode(vars_with_encoding, {})

0 commit comments

Comments
 (0)