Support extension array indexes (#9671)

ilan-gold · shoyer · dcherian · web-flow · commit 2f1751df7fb1 · 2025-04-26T17:51:40.000Z
Co-authored-by: Stephan Hoyer &lt;shoyer@google.com&gt;
Co-authored-by: Deepak Cherian &lt;dcherian@users.noreply.github.com&gt;
Co-authored-by: Spencer Clark &lt;spencerkclark@gmail.com&gt;
Co-authored-by: Kai Mühlbauer &lt;kmuehlbauer@wradlib.org&gt;
Co-authored-by: Benoit Bovy &lt;benbovy@gmail.com&gt;
Co-authored-by: Kai Muehlbauer &lt;kai.muehlbauer@uni-bonn.de&gt;
Co-authored-by: pre-commit-ci[bot] &lt;66853113+pre-commit-ci[bot]@users.noreply.github.com&gt;
Co-authored-by: Deepak Cherian &lt;deepak@cherian.net&gt;
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -29,6 +29,9 @@ New Features
 - Improved compatibility with OPeNDAP DAP4 data model for backend engine ``pydap``. This
   includes ``datatree`` support, and removing slashes from dimension names. By
   `Miguel Jimenez-Urias <https://github.com/Mikejmnez>`_.
+- Improved support pandas Extension Arrays. (:issue:`9661`, :pull:`9671`)
+  By `Ilan Gold <https://github.com/ilan-gold>`_.
+
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
@@ -41,6 +44,12 @@ Breaking changes
     pydap                    3.4     3.5.0
   ===================== =========  =======
 
+
+- Reductions with ``groupby_bins`` or those that involve :py:class:`xarray.groupers.BinGrouper`
+  now return objects indexed by :py:meth:`pandas.IntervalArray` objects,
+  instead of numpy object arrays containing tuples. This change enables interval-aware indexing of
+  such Xarray objects. (:pull:`9671`). By `Ilan Gold <https://github.com/ilan-gold>`_.
+
 Deprecations
 ~~~~~~~~~~~~
 
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -6917,7 +6917,7 @@ def groupby(
                [[nan, nan, nan],
                 [ 3.,  4.,  5.]]])
         Coordinates:
-          * x_bins   (x_bins) object 16B (5, 15] (15, 25]
+          * x_bins   (x_bins) interval[int64, right] 32B (5, 15] (15, 25]
           * letters  (letters) object 16B 'a' 'b'
         Dimensions without coordinates: y
 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -7059,6 +7059,8 @@ def to_pandas(self) -> pd.Series | pd.DataFrame:
         )
 
     def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
+        from xarray.core.extension_array import PandasExtensionArray
+
         columns_in_order = [k for k in self.variables if k not in self.dims]
         non_extension_array_columns = [
             k
@@ -7070,20 +7072,41 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
             for k in columns_in_order
             if is_extension_array_dtype(self.variables[k].data)
         ]
+        extension_array_columns_different_index = [
+            k
+            for k in extension_array_columns
+            if set(self.variables[k].dims) != set(ordered_dims.keys())
+        ]
+        extension_array_columns_same_index = [
+            k
+            for k in extension_array_columns
+            if k not in extension_array_columns_different_index
+        ]
         data = [
             self._variables[k].set_dims(ordered_dims).values.reshape(-1)
             for k in non_extension_array_columns
         ]
         index = self.coords.to_index([*ordered_dims])
         broadcasted_df = pd.DataFrame(
-            dict(zip(non_extension_array_columns, data, strict=True)), index=index
+            {
+                **dict(zip(non_extension_array_columns, data, strict=True)),
+                **{
+                    c: self.variables[c].data.array
+                    for c in extension_array_columns_same_index
+                },
+            },
+            index=index,
         )
-        for extension_array_column in extension_array_columns:
+        for extension_array_column in extension_array_columns_different_index:
             extension_array = self.variables[extension_array_column].data.array
-            index = self[self.variables[extension_array_column].dims[0]].data
+            index = self[
+                self.variables[extension_array_column].dims[0]
+            ].coords.to_index()
             extension_array_df = pd.DataFrame(
                 {extension_array_column: extension_array},
-                index=self[self.variables[extension_array_column].dims[0]].data,
+                index=pd.Index(index.array)
+                if isinstance(index, PandasExtensionArray)
+                else index,
             )
             extension_array_df.index.name = self.variables[extension_array_column].dims[
                 0
@@ -9892,10 +9915,10 @@ def groupby(
         >>> from xarray.groupers import BinGrouper, UniqueGrouper
         >>>
         >>> ds.groupby(x=BinGrouper(bins=[5, 15, 25]), letters=UniqueGrouper()).sum()
-        <xarray.Dataset> Size: 128B
+        <xarray.Dataset> Size: 144B
         Dimensions:  (y: 3, x_bins: 2, letters: 2)
         Coordinates:
-          * x_bins   (x_bins) object 16B (5, 15] (15, 25]
+          * x_bins   (x_bins) interval[int64, right] 32B (5, 15] (15, 25]
           * letters  (letters) object 16B 'a' 'b'
         Dimensions without coordinates: y
         Data variables:
diff --git a/xarray/core/extension_array.py b/xarray/core/extension_array.py
@@ -102,7 +102,7 @@ def replace_duck_with_extension_array(args) -> list:
             return type(self)[type(res)](res)
         return res
 
-    def __array_ufunc__(ufunc, method, *inputs, **kwargs):
+    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         return ufunc(*inputs, **kwargs)
 
     def __repr__(self):
diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py
@@ -11,6 +11,7 @@
 
 from xarray.core import formatting, nputils, utils
 from xarray.core.coordinate_transform import CoordinateTransform
+from xarray.core.extension_array import PandasExtensionArray
 from xarray.core.indexing import (
     CoordinateTransformIndexingAdapter,
     IndexSelResult,
@@ -444,6 +445,8 @@ def safe_cast_to_index(array: Any) -> pd.Index:
     from xarray.core.variable import Variable
     from xarray.namedarray.pycompat import to_numpy
 
+    if isinstance(array, PandasExtensionArray):
+        array = pd.Index(array.array)
     if isinstance(array, pd.Index):
         index = array
     elif isinstance(array, DataArray | Variable):
@@ -602,7 +605,11 @@ def __init__(
         self.dim = dim
 
         if coord_dtype is None:
-            coord_dtype = get_valid_numpy_dtype(index)
+            if pd.api.types.is_extension_array_dtype(index.dtype):
+                cast(pd.api.extensions.ExtensionDtype, index.dtype)
+                coord_dtype = index.dtype
+            else:
+                coord_dtype = get_valid_numpy_dtype(index)
         self.coord_dtype = coord_dtype
 
     def _replace(self, index, dim=None, coord_dtype=None):
@@ -698,6 +705,8 @@ def concat(
 
         if not indexes:
             coord_dtype = None
+        elif len(set(idx.coord_dtype for idx in indexes)) == 1:
+            coord_dtype = indexes[0].coord_dtype
         else:
             coord_dtype = np.result_type(*[idx.coord_dtype for idx in indexes])
 
diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py
@@ -10,14 +10,16 @@
 from dataclasses import dataclass, field
 from datetime import timedelta
 from html import escape
-from typing import TYPE_CHECKING, Any, overload
+from typing import TYPE_CHECKING, Any, cast, overload
 
 import numpy as np
 import pandas as pd
+from numpy.typing import DTypeLike
 from packaging.version import Version
 
 from xarray.core import duck_array_ops
 from xarray.core.coordinate_transform import CoordinateTransform
+from xarray.core.extension_array import PandasExtensionArray
 from xarray.core.nputils import NumpyVIndexAdapter
 from xarray.core.options import OPTIONS
 from xarray.core.types import T_Xarray
@@ -28,14 +30,13 @@
     is_duck_array,
     is_duck_dask_array,
     is_scalar,
+    is_valid_numpy_dtype,
     to_0d_array,
 )
 from xarray.namedarray.parallelcompat import get_chunked_array_type
 from xarray.namedarray.pycompat import array_type, integer_types, is_chunked_array
 
 if TYPE_CHECKING:
-    from numpy.typing import DTypeLike
-
     from xarray.core.indexes import Index
     from xarray.core.types import Self
     from xarray.core.variable import Variable
@@ -1744,27 +1745,43 @@ class PandasIndexingAdapter(ExplicitlyIndexedNDArrayMixin):
     __slots__ = ("_dtype", "array")
 
     array: pd.Index
-    _dtype: np.dtype
+    _dtype: np.dtype | pd.api.extensions.ExtensionDtype
 
-    def __init__(self, array: pd.Index, dtype: DTypeLike = None):
+    def __init__(
+        self,
+        array: pd.Index,
+        dtype: DTypeLike | pd.api.extensions.ExtensionDtype | None = None,
+    ):
         from xarray.core.indexes import safe_cast_to_index
 
         self.array = safe_cast_to_index(array)
 
         if dtype is None:
-            self._dtype = get_valid_numpy_dtype(array)
+            if pd.api.types.is_extension_array_dtype(array.dtype):
+                cast(pd.api.extensions.ExtensionDtype, array.dtype)
+                self._dtype = array.dtype
+            else:
+                self._dtype = get_valid_numpy_dtype(array)
+        elif pd.api.types.is_extension_array_dtype(dtype):
+            self._dtype = cast(pd.api.extensions.ExtensionDtype, dtype)
         else:
-            self._dtype = np.dtype(dtype)
+            self._dtype = np.dtype(cast(DTypeLike, dtype))
 
     @property
-    def dtype(self) -> np.dtype:
+    def dtype(self) -> np.dtype | pd.api.extensions.ExtensionDtype:  # type: ignore[override]
         return self._dtype
 
     def __array__(
-        self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None
+        self,
+        dtype: np.typing.DTypeLike | None = None,
+        /,
+        *,
+        copy: bool | None = None,
     ) -> np.ndarray:
-        if dtype is None:
-            dtype = self.dtype
+        if dtype is None and is_valid_numpy_dtype(self.dtype):
+            dtype = cast(np.dtype, self.dtype)
+        else:
+            dtype = get_valid_numpy_dtype(self.array)
         array = self.array
         if isinstance(array, pd.PeriodIndex):
             with suppress(AttributeError):
@@ -1776,14 +1793,18 @@ def __array__(
         else:
             return np.asarray(array.values, dtype=dtype)
 
-    def get_duck_array(self) -> np.ndarray:
+    def get_duck_array(self) -> np.ndarray | PandasExtensionArray:
+        # We return an PandasExtensionArray wrapper type that satisfies
+        # duck array protocols. This is what's needed for tests to pass.
+        if pd.api.types.is_extension_array_dtype(self.array):
+            return PandasExtensionArray(self.array.array)
         return np.asarray(self)
 
     @property
     def shape(self) -> _Shape:
         return (len(self.array),)
 
-    def _convert_scalar(self, item):
+    def _convert_scalar(self, item) -> np.ndarray:
         if item is pd.NaT:
             # work around the impossibility of casting NaT with asarray
             # note: it probably would be better in general to return
@@ -1799,7 +1820,10 @@ def _convert_scalar(self, item):
             # numpy fails to convert pd.Timestamp to np.datetime64[ns]
             item = np.asarray(item.to_datetime64())
         elif self.dtype != object:
-            item = np.asarray(item, dtype=self.dtype)
+            dtype = self.dtype
+            if pd.api.types.is_extension_array_dtype(dtype):
+                dtype = get_valid_numpy_dtype(self.array)
+            item = np.asarray(item, dtype=cast(np.dtype, dtype))
 
         # as for numpy.ndarray indexing, we always want the result to be
         # a NumPy array.
@@ -1902,6 +1926,12 @@ def copy(self, deep: bool = True) -> Self:
         array = self.array.copy(deep=True) if deep else self.array
         return type(self)(array, self._dtype)
 
+    @property
+    def nbytes(self) -> int:
+        if pd.api.types.is_extension_array_dtype(self.dtype):
+            return self.array.nbytes
+        return cast(np.dtype, self.dtype).itemsize * len(self.array)
+
 
 class PandasMultiIndexingAdapter(PandasIndexingAdapter):
     """Handles explicit indexing for a pandas.MultiIndex.
@@ -1914,23 +1944,27 @@ class PandasMultiIndexingAdapter(PandasIndexingAdapter):
     __slots__ = ("_dtype", "adapter", "array", "level")
 
     array: pd.MultiIndex
-    _dtype: np.dtype
+    _dtype: np.dtype | pd.api.extensions.ExtensionDtype
     level: str | None
 
     def __init__(
         self,
         array: pd.MultiIndex,
-        dtype: DTypeLike = None,
+        dtype: DTypeLike | pd.api.extensions.ExtensionDtype | None = None,
         level: str | None = None,
     ):
         super().__init__(array, dtype)
         self.level = level
 
     def __array__(
-        self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None
+        self,
+        dtype: DTypeLike | None = None,
+        /,
+        *,
+        copy: bool | None = None,
     ) -> np.ndarray:
         if dtype is None:
-            dtype = self.dtype
+            dtype = cast(np.dtype, self.dtype)
         if self.level is not None:
             return np.asarray(
                 self.array.get_level_values(self.level).values, dtype=dtype
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -13,7 +13,6 @@
 import numpy as np
 import pandas as pd
 from numpy.typing import ArrayLike
-from pandas.api.types import is_extension_array_dtype
 
 import xarray as xr  # only for Dataset and DataArray
 from xarray.compat.array_api_compat import to_like_array
@@ -60,6 +59,7 @@
     indexing.ExplicitlyIndexed,
     pd.Index,
     pd.api.extensions.ExtensionArray,
+    PandasExtensionArray,
 )
 # https://github.com/python/mypy/issues/224
 BASIC_INDEXING_TYPES = integer_types + (slice,)
@@ -192,7 +192,7 @@ def _maybe_wrap_data(data):
     if isinstance(data, pd.Index):
         return PandasIndexingAdapter(data)
     if isinstance(data, pd.api.extensions.ExtensionArray):
-        return PandasExtensionArray[type(data)](data)
+        return PandasExtensionArray(data)
     return data
 
 
@@ -2593,11 +2593,6 @@ def chunk(  # type: ignore[override]
         dask.array.from_array
         """
 
-        if is_extension_array_dtype(self):
-            raise ValueError(
-                f"{self} was found to be a Pandas ExtensionArray.  Please convert to numpy first."
-            )
-
         if from_array_kwargs is None:
             from_array_kwargs = {}
 
diff --git a/xarray/namedarray/core.py b/xarray/namedarray/core.py
@@ -834,6 +834,7 @@ def chunk(
         if chunkmanager.is_chunked_array(data_old):
             data_chunked = chunkmanager.rechunk(data_old, chunks)  # type: ignore[arg-type]
         else:
+            ndata: duckarray[Any, Any]
             if not isinstance(data_old, ExplicitlyIndexed):
                 ndata = data_old
             else:
diff --git a/xarray/plot/dataarray_plot.py b/xarray/plot/dataarray_plot.py
@@ -524,7 +524,7 @@ def line(
         assert hueplt is not None
         ax.legend(handles=primitive, labels=list(hueplt.to_numpy()), title=hue_label)
 
-    if np.issubdtype(xplt.dtype, np.datetime64):
+    if isinstance(xplt.dtype, np.dtype) and np.issubdtype(xplt.dtype, np.datetime64):  # type: ignore[redundant-expr]
         _set_concise_date(ax, axis="x")
 
     _update_axes(ax, xincrease, yincrease, xscale, yscale, xticks, yticks, xlim, ylim)
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
@@ -4381,7 +4381,6 @@ def test_setitem_pandas(self) -> None:
         ds["x"] = np.arange(3)
         ds_copy = ds.copy()
         ds_copy["bar"] = ds["bar"].to_pandas()
-
         assert_equal(ds, ds_copy)
 
     def test_setitem_auto_align(self) -> None:
@@ -4972,6 +4971,16 @@ def test_to_and_from_dataframe(self) -> None:
         expected = pd.DataFrame([[]], index=idx)
         assert expected.equals(actual), (expected, actual)
 
+    def test_from_dataframe_categorical_dtype_index(self) -> None:
+        cat = pd.CategoricalIndex(list("abcd"))
+        df = pd.DataFrame({"f": [0, 1, 2, 3]}, index=cat)
+        ds = df.to_xarray()
+        restored = ds.to_dataframe()
+        df.index.name = (
+            "index"  # restored gets the name because it has the coord with the name
+        )
+        pd.testing.assert_frame_equal(df, restored)
+
     def test_from_dataframe_categorical_index(self) -> None:
         cat = pd.CategoricalDtype(
             categories=["foo", "bar", "baz", "qux", "quux", "corge"]
@@ -4996,7 +5005,7 @@ def test_from_dataframe_categorical_index_string_categories(self) -> None:
         )
         ser = pd.Series(1, index=cat)
         ds = ser.to_xarray()
-        assert ds.coords.dtypes["index"] == np.dtype("O")
+        assert ds.coords.dtypes["index"] == ser.index.dtype
 
     @requires_sparse
     def test_from_dataframe_sparse(self) -> None:
diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py
diff --git a/xarray/tests/test_pandas_to_xarray.py b/xarray/tests/test_pandas_to_xarray.py
diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py