Labels: upstream issue, usage example (Real world use case examples), zarr-specs (Requires adoption of a new ZEP)
Description
I am trying to make a virtual Zarr store for some daily gridMET data. The data are stored in yearly NetCDF files, which results in some data files having an additional day due to leap years. I can read them in as virtual datasets, but when I go to concatenate them, an error is thrown saying the arrays have inconsistent chunk shapes.
ValueError: Cannot concatenate arrays with inconsistent chunk shapes: (366,) vs (365,). Requires ZEP003 (Variable-length Chunks).
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[28], line 1
----> 1 xr.combine_nested(virtual_datasets[80:90], concat_dim=['day'], coords='minimal', compat='override')
File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/combine.py:592, in combine_nested(datasets, concat_dim, compat, data_vars, coords, fill_value, join, combine_attrs)
589 concat_dim = [concat_dim]
591 # The IDs argument tells _nested_combine that datasets aren't yet sorted
--> 592 return _nested_combine(
593 datasets,
594 concat_dims=concat_dim,
595 compat=compat,
596 data_vars=data_vars,
597 coords=coords,
598 ids=False,
599 fill_value=fill_value,
600 join=join,
601 combine_attrs=combine_attrs,
602 )
File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/combine.py:371, in _nested_combine(datasets, concat_dims, compat, data_vars, coords, ids, fill_value, join, combine_attrs)
368 _check_shape_tile_ids(combined_ids)
370 # Apply series of concatenate or merge operations along each dimension
--> 371 combined = _combine_nd(
372 combined_ids,
373 concat_dims,
374 compat=compat,
375 data_vars=data_vars,
376 coords=coords,
377 fill_value=fill_value,
378 join=join,
379 combine_attrs=combine_attrs,
380 )
381 return combined
File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/combine.py:247, in _combine_nd(combined_ids, concat_dims, data_vars, coords, compat, fill_value, join, combine_attrs)
243 # Each iteration of this loop reduces the length of the tile_ids tuples
244 # by one. It always combines along the first dimension, removing the first
245 # element of the tuple
246 for concat_dim in concat_dims:
--> 247 combined_ids = _combine_all_along_first_dim(
248 combined_ids,
249 dim=concat_dim,
250 data_vars=data_vars,
251 coords=coords,
252 compat=compat,
253 fill_value=fill_value,
254 join=join,
255 combine_attrs=combine_attrs,
256 )
257 (combined_ds,) = combined_ids.values()
258 return combined_ds
File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/combine.py:282, in _combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat, fill_value, join, combine_attrs)
280 combined_ids = dict(sorted(group))
281 datasets = combined_ids.values()
--> 282 new_combined_ids[new_id] = _combine_1d(
283 datasets, dim, compat, data_vars, coords, fill_value, join, combine_attrs
284 )
285 return new_combined_ids
File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/combine.py:305, in _combine_1d(datasets, concat_dim, compat, data_vars, coords, fill_value, join, combine_attrs)
303 if concat_dim is not None:
304 try:
--> 305 combined = concat(
306 datasets,
307 dim=concat_dim,
308 data_vars=data_vars,
309 coords=coords,
310 compat=compat,
311 fill_value=fill_value,
312 join=join,
313 combine_attrs=combine_attrs,
314 )
315 except ValueError as err:
316 if "encountered unexpected variable" in str(err):
File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/concat.py:277, in concat(objs, dim, data_vars, coords, compat, positions, fill_value, join, combine_attrs, create_index_for_new_dim)
264 return _dataarray_concat(
265 objs,
266 dim=dim,
(...)
274 create_index_for_new_dim=create_index_for_new_dim,
275 )
276 elif isinstance(first_obj, Dataset):
--> 277 return _dataset_concat(
278 objs,
279 dim=dim,
280 data_vars=data_vars,
281 coords=coords,
282 compat=compat,
283 positions=positions,
284 fill_value=fill_value,
285 join=join,
286 combine_attrs=combine_attrs,
287 create_index_for_new_dim=create_index_for_new_dim,
288 )
289 else:
290 raise TypeError(
291 "can only concatenate xarray Dataset and DataArray "
292 f"objects, got {type(first_obj)}"
293 )
File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/concat.py:669, in _dataset_concat(datasets, dim, data_vars, coords, compat, positions, fill_value, join, combine_attrs, create_index_for_new_dim)
667 result_vars[k] = v
668 else:
--> 669 combined_var = concat_vars(
670 vars, dim_name, positions, combine_attrs=combine_attrs
671 )
672 # reindex if variable is not present in all datasets
673 if len(variable_index) < concat_index_size:
File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/variable.py:3050, in concat(variables, dim, positions, shortcut, combine_attrs)
3048 return IndexVariable.concat(variables, dim, positions, shortcut, combine_attrs)
3049 else:
-> 3050 return Variable.concat(variables, dim, positions, shortcut, combine_attrs)
File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/variable.py:1782, in Variable.concat(cls, variables, dim, positions, shortcut, combine_attrs)
1780 axis = first_var.get_axis_num(dim)
1781 dims = first_var_dims
-> 1782 data = duck_array_ops.concatenate(arrays, axis=axis)
1783 if positions is not None:
1784 # TODO: deprecate this option -- we don't need it for groupby
1785 # any more.
1786 indices = nputils.inverse_permutation(np.concatenate(positions))
File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/duck_array_ops.py:391, in concatenate(arrays, axis)
389 xp = get_array_namespace(arrays[0])
390 return xp.concat(as_shared_dtype(arrays, xp=xp), axis=axis)
--> 391 return _concatenate(as_shared_dtype(arrays), axis=axis)
File ~/.conda/envs/chunking/lib/python3.13/site-packages/virtualizarr/manifests/array.py:130, in ManifestArray.__array_function__(self, func, types, args, kwargs)
127 if not all(issubclass(t, ManifestArray) for t in types):
128 return NotImplemented
--> 130 return MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS[func](*args, **kwargs)
File ~/.conda/envs/chunking/lib/python3.13/site-packages/virtualizarr/manifests/array_api.py:109, in concatenate(arrays, axis)
106 raise TypeError()
108 # ensure dtypes, shapes, codecs etc. are consistent
--> 109 _check_combineable_zarr_arrays(arrays)
111 _check_same_ndims([arr.ndim for arr in arrays])
113 # Ensure we handle axis being passed as a negative integer
File ~/.conda/envs/chunking/lib/python3.13/site-packages/virtualizarr/manifests/array_api.py:40, in _check_combineable_zarr_arrays(arrays)
37 _check_same_codecs([arr.zarray.codec for arr in arrays])
39 # Would require variable-length chunks ZEP
---> 40 _check_same_chunk_shapes([arr.chunks for arr in arrays])
File ~/.conda/envs/chunking/lib/python3.13/site-packages/virtualizarr/manifests/array_api.py:71, in _check_same_chunk_shapes(chunks_list)
69 for other_chunks in other_chunks_list:
70 if other_chunks != first_chunks:
---> 71 raise ValueError(
72 f"Cannot concatenate arrays with inconsistent chunk shapes: {other_chunks} vs {first_chunks} ."
73 "Requires ZEP003 (Variable-length Chunks)."
74 )
ValueError: Cannot concatenate arrays with inconsistent chunk shapes: (366,) vs (365,) .Requires ZEP003 (Variable-length Chunks).
I have checked and confirmed that all of the NetCDF files have the same chunking, with a chunk shape of 61 along the day dimension. Is this error actually due to the chunk shapes, or is it due to the inconsistent data shapes between files? Any help or insight would be appreciated!
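One way to narrow this down is to print the chunk grid of every virtual variable and see which one actually carries the (365,)/(366,) chunks. Below is a diagnostic sketch that reuses the virtual_datasets list built in the reproducer below; my (unconfirmed) expectation is that the data variable shows the on-disk chunk shape of 61 along day, while a contiguously stored 1-D coordinate such as day itself is a single chunk whose shape equals the year length:
# Diagnostic sketch: print the chunk shape of each virtual variable.
# ManifestArray exposes .chunks; eagerly loaded arrays may not, hence getattr.
for vds in virtual_datasets[:2]:  # compare the first two yearly files
    for name, var in vds.variables.items():
        print(name, getattr(var.data, 'chunks', None))
    print('---')
If the (365,)/(366,) pair shows up on the day coordinate rather than on the data variable, the failure is about how the coordinate is stored (one whole-year chunk per file) rather than about the 61-day chunking of the data itself.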
Here is a minimal reproducible example:
import fsspec
import xarray as xr
from virtualizarr import open_virtual_dataset

# Anonymous access to the USGS OSN endpoint hosting the gridMET NetCDF files
reader_options = {
    'storage_options': {
        'anon': True,
        'client_kwargs': {
            'endpoint_url': 'https://usgs.osn.mghpcc.org/'
        }
    }
}
fs = fsspec.filesystem(
    protocol='s3',
    **reader_options['storage_options']
)

# Open each yearly precipitation file as a virtual dataset
virtual_datasets = [
    open_virtual_dataset(f's3://{file}', indexes={}, reader_options=reader_options)
    for file in fs.glob('s3://mdmf/gdp/netcdf/gridmet/gridmet/pr_198[!0-5]*.nc')
]

xr.concat(virtual_datasets, dim='day', coords='minimal', compat='override')
# or equivalently
# xr.combine_nested(virtual_datasets, concat_dim='day', coords='minimal', compat='override')
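If the diagnostic above shows the mismatch coming from the 1-D coordinate variables, one possible workaround is to load those coordinates into memory with the loadable_variables option of open_virtual_dataset, so they are concatenated as ordinary in-memory arrays instead of virtual ManifestArrays. This is a sketch under that assumption, not a confirmed fix:
# Hedged sketch: eagerly load the 'day' coordinate (assumption: it is the
# variable with the single whole-year chunk) so only the consistently
# chunked data variable goes through the virtual concatenation checks.
virtual_datasets = [
    open_virtual_dataset(
        f's3://{file}',
        loadable_variables=['day'],
        indexes={},
        reader_options=reader_options,
    )
    for file in fs.glob('s3://mdmf/gdp/netcdf/gridmet/gridmet/pr_198[!0-5]*.nc')
]
xr.concat(virtual_datasets, dim='day', coords='minimal', compat='override')
Note that even then the data variable may not tile evenly: 366 = 6 × 61, but 365 = 5 × 61 + 60, so non-leap-year files end in a partial chunk, and virtually stitching those together along day would genuinely require ZEP003's variable-length chunks.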