Skip to content

xarray.concat fails due to inconsistent leap year shapes #330

@kjdoore

Description

@kjdoore

I am trying to make a virtual Zarr store for some daily gridMET data. The data are stored in yearly NetCDF files, which results in some data files having an additional day due to leap years. I can read them in as virtual datasets, but when I go to concatenate them, an error is thrown saying the arrays have inconsistent chunk shapes.

ValueError: Cannot concatenate arrays with inconsistent chunk shapes: (366,) vs (365,) .Requires ZEP003 (Variable-length Chunks).
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[28], line 1
----> 1 xr.combine_nested(virtual_datasets[80:90], concat_dim=['day'], coords='minimal', compat='override')

File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/combine.py:592, in combine_nested(datasets, concat_dim, compat, data_vars, coords, fill_value, join, combine_attrs)
    589     concat_dim = [concat_dim]
    591 # The IDs argument tells _nested_combine that datasets aren't yet sorted
--> 592 return _nested_combine(
    593     datasets,
    594     concat_dims=concat_dim,
    595     compat=compat,
    596     data_vars=data_vars,
    597     coords=coords,
    598     ids=False,
    599     fill_value=fill_value,
    600     join=join,
    601     combine_attrs=combine_attrs,
    602 )

File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/combine.py:371, in _nested_combine(datasets, concat_dims, compat, data_vars, coords, ids, fill_value, join, combine_attrs)
    368 _check_shape_tile_ids(combined_ids)
    370 # Apply series of concatenate or merge operations along each dimension
--> 371 combined = _combine_nd(
    372     combined_ids,
    373     concat_dims,
    374     compat=compat,
    375     data_vars=data_vars,
    376     coords=coords,
    377     fill_value=fill_value,
    378     join=join,
    379     combine_attrs=combine_attrs,
    380 )
    381 return combined

File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/combine.py:247, in _combine_nd(combined_ids, concat_dims, data_vars, coords, compat, fill_value, join, combine_attrs)
    243 # Each iteration of this loop reduces the length of the tile_ids tuples
    244 # by one. It always combines along the first dimension, removing the first
    245 # element of the tuple
    246 for concat_dim in concat_dims:
--> 247     combined_ids = _combine_all_along_first_dim(
    248         combined_ids,
    249         dim=concat_dim,
    250         data_vars=data_vars,
    251         coords=coords,
    252         compat=compat,
    253         fill_value=fill_value,
    254         join=join,
    255         combine_attrs=combine_attrs,
    256     )
    257 (combined_ds,) = combined_ids.values()
    258 return combined_ds

File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/combine.py:282, in _combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat, fill_value, join, combine_attrs)
    280     combined_ids = dict(sorted(group))
    281     datasets = combined_ids.values()
--> 282     new_combined_ids[new_id] = _combine_1d(
    283         datasets, dim, compat, data_vars, coords, fill_value, join, combine_attrs
    284     )
    285 return new_combined_ids

File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/combine.py:305, in _combine_1d(datasets, concat_dim, compat, data_vars, coords, fill_value, join, combine_attrs)
    303 if concat_dim is not None:
    304     try:
--> 305         combined = concat(
    306             datasets,
    307             dim=concat_dim,
    308             data_vars=data_vars,
    309             coords=coords,
    310             compat=compat,
    311             fill_value=fill_value,
    312             join=join,
    313             combine_attrs=combine_attrs,
    314         )
    315     except ValueError as err:
    316         if "encountered unexpected variable" in str(err):

File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/concat.py:277, in concat(objs, dim, data_vars, coords, compat, positions, fill_value, join, combine_attrs, create_index_for_new_dim)
    264     return _dataarray_concat(
    265         objs,
    266         dim=dim,
   (...)
    274         create_index_for_new_dim=create_index_for_new_dim,
    275     )
    276 elif isinstance(first_obj, Dataset):
--> 277     return _dataset_concat(
    278         objs,
    279         dim=dim,
    280         data_vars=data_vars,
    281         coords=coords,
    282         compat=compat,
    283         positions=positions,
    284         fill_value=fill_value,
    285         join=join,
    286         combine_attrs=combine_attrs,
    287         create_index_for_new_dim=create_index_for_new_dim,
    288     )
    289 else:
    290     raise TypeError(
    291         "can only concatenate xarray Dataset and DataArray "
    292         f"objects, got {type(first_obj)}"
    293     )

File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/concat.py:669, in _dataset_concat(datasets, dim, data_vars, coords, compat, positions, fill_value, join, combine_attrs, create_index_for_new_dim)
    667         result_vars[k] = v
    668 else:
--> 669     combined_var = concat_vars(
    670         vars, dim_name, positions, combine_attrs=combine_attrs
    671     )
    672     # reindex if variable is not present in all datasets
    673     if len(variable_index) < concat_index_size:

File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/variable.py:3050, in concat(variables, dim, positions, shortcut, combine_attrs)
   3048     return IndexVariable.concat(variables, dim, positions, shortcut, combine_attrs)
   3049 else:
-> 3050     return Variable.concat(variables, dim, positions, shortcut, combine_attrs)

File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/variable.py:1782, in Variable.concat(cls, variables, dim, positions, shortcut, combine_attrs)
   1780 axis = first_var.get_axis_num(dim)
   1781 dims = first_var_dims
-> 1782 data = duck_array_ops.concatenate(arrays, axis=axis)
   1783 if positions is not None:
   1784     # TODO: deprecate this option -- we don't need it for groupby
   1785     # any more.
   1786     indices = nputils.inverse_permutation(np.concatenate(positions))

File ~/.conda/envs/chunking/lib/python3.13/site-packages/xarray/core/duck_array_ops.py:391, in concatenate(arrays, axis)
    389     xp = get_array_namespace(arrays[0])
    390     return xp.concat(as_shared_dtype(arrays, xp=xp), axis=axis)
--> 391 return _concatenate(as_shared_dtype(arrays), axis=axis)

File ~/.conda/envs/chunking/lib/python3.13/site-packages/virtualizarr/manifests/array.py:130, in ManifestArray.__array_function__(self, func, types, args, kwargs)
    127 if not all(issubclass(t, ManifestArray) for t in types):
    128     return NotImplemented
--> 130 return MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS[func](*args, **kwargs)

File ~/.conda/envs/chunking/lib/python3.13/site-packages/virtualizarr/manifests/array_api.py:109, in concatenate(arrays, axis)
    106     raise TypeError()
    108 # ensure dtypes, shapes, codecs etc. are consistent
--> 109 _check_combineable_zarr_arrays(arrays)
    111 _check_same_ndims([arr.ndim for arr in arrays])
    113 # Ensure we handle axis being passed as a negative integer

File ~/.conda/envs/chunking/lib/python3.13/site-packages/virtualizarr/manifests/array_api.py:40, in _check_combineable_zarr_arrays(arrays)
     37 _check_same_codecs([arr.zarray.codec for arr in arrays])
     39 # Would require variable-length chunks ZEP
---> 40 _check_same_chunk_shapes([arr.chunks for arr in arrays])

File ~/.conda/envs/chunking/lib/python3.13/site-packages/virtualizarr/manifests/array_api.py:71, in _check_same_chunk_shapes(chunks_list)
     69 for other_chunks in other_chunks_list:
     70     if other_chunks != first_chunks:
---> 71         raise ValueError(
     72             f"Cannot concatenate arrays with inconsistent chunk shapes: {other_chunks} vs {first_chunks} ."
     73             "Requires ZEP003 (Variable-length Chunks)."
     74         )

ValueError: Cannot concatenate arrays with inconsistent chunk shapes: (366,) vs (365,) .Requires ZEP003 (Variable-length Chunks).

I have checked and confirmed all NetCDF files have the same chunking and have a chunk shape along the day dimension of 61. Is this error actually due to the chunk shapes, or is it due to the inconsistent data shape between files? Any help or insight would be appreciated!

Here is a minimal reproducible example:

# Minimal reproduction: build virtual (reference-only) Zarr datasets from yearly
# gridMET NetCDF files hosted on OSN, then concatenate them along 'day'.
# Concatenation raises ValueError("inconsistent chunk shapes: (366,) vs (365,)")
# because leap-year files have one extra day along the concat dimension.
import fsspec
import xarray as xr
from virtualizarr import open_virtual_dataset

# Anonymous S3 access against the USGS OSN endpoint that mirrors the data.
reader_options = {
    'storage_options': {
        'anon': True, 
        'client_kwargs': {
            'endpoint_url': 'https://usgs.osn.mghpcc.org/'
        }
    }
}

fs = fsspec.filesystem(
    protocol='s3',
    **reader_options['storage_options']
)

# One virtual dataset per yearly file; indexes={} skips building in-memory
# pandas indexes. The glob pattern pr_198[!0-5]*.nc presumably selects the
# 1986-1989 files (excludes years whose last digit is 0-5) — network access
# to the OSN bucket is required for this listing to succeed.
virtual_datasets = [
    open_virtual_dataset(f's3://{file}', indexes={}, reader_options=reader_options)
    for file in fs.glob('s3://mdmf/gdp/netcdf/gridmet/gridmet/pr_198[!0-5]*.nc')
]

# This is the failing call: a 366-day (leap year) virtual array cannot be
# concatenated with 365-day arrays under virtualizarr's uniform-chunk check.
xr.concat(virtual_datasets , dim='day', coords='minimal', compat='override')
# or equivalently
# xr.combine_nested(virtual_datasets , concat_dim='day', coords='minimal', compat='override')

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions