Skip to content

Dataset.from_dataframe: optionally keep multi-index unexpanded #8170

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 61 additions & 25 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7128,7 +7128,7 @@ def _set_sparse_data_from_dataframe(
def _set_numpy_data_from_dataframe(
self, idx: pd.Index, arrays: list[tuple[Hashable, np.ndarray]], dims: tuple
) -> None:
if not isinstance(idx, pd.MultiIndex):
if len(dims) == 1:
for name, values in arrays:
self[name] = (dims, values)
return
Expand Down Expand Up @@ -7164,17 +7164,18 @@ def _set_numpy_data_from_dataframe(

@classmethod
def from_dataframe(
cls: type[T_Dataset], dataframe: pd.DataFrame, sparse: bool = False
cls: type[T_Dataset],
dataframe: pd.DataFrame,
sparse: bool = False,
unstack: bool = True,
dim: Hashable | None = None,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree re #8166 (comment)

We could just use "dim_X" if Index.name is None, and have the user manually rename to a name they like.

) -> T_Dataset:
"""Convert a pandas.DataFrame into an xarray.Dataset

Each column will be converted into an independent variable in the
Dataset. If the dataframe's index is a MultiIndex, it will be expanded
into a tensor product of one-dimensional indices (filling in missing
values with NaN). This method will produce a Dataset very similar to
that on which the 'to_dataframe' method was called, except with
possibly redundant dimensions (since all dataset variables will have
the same dimensionality)
Dataset.

DEPRECATED: if the dataframe's index is a MultiIndex and ``dim=None``,

Parameters
----------
Expand All @@ -7183,7 +7184,21 @@ def from_dataframe(
sparse : bool, default: False
If true, create sparse arrays instead of dense numpy arrays. This
can potentially save a large amount of memory if the DataFrame has
a MultiIndex. Requires the sparse package (sparse.pydata.org).
a MultiIndex and ``unstack=True``.
Requires the sparse package (sparse.pydata.org).
unstack : bool, default: True
If True (default) and if the dataframe's index is a MultiIndex,
the index will be expanded into a tensor product of one-dimensional
indices (filling in missing values with NaN). This method will produce a
Dataset very similar to that on which the 'to_dataframe' method was
called, except with possibly redundant dimensions (since all dataset
variables will have the same dimensionality).
dim : str, optional
Name of the dimension to assign to all variables and coordinates.
If None (default), the dimension name is set from the name of the
DataFrame index. If the index has no defined name, "index" is used
as a fallback. This argument is ignored if the dataframe's index is
a MultiIndex and ``unstack=True``.

Returns
-------
Expand All @@ -7192,7 +7207,9 @@ def from_dataframe(
See Also
--------
xarray.DataArray.from_series
xarray.Dataset.to_dataframe
pandas.DataFrame.to_xarray

"""
# TODO: Add an option to remove dimensions along which the variables
# are constant, to enable consistent serialization to/from a dataframe,
Expand All @@ -7203,10 +7220,15 @@ def from_dataframe(

idx = remove_unused_levels_categories(dataframe.index)

if isinstance(idx, pd.MultiIndex) and not idx.is_unique:
raise ValueError(
"cannot convert a DataFrame with a non-unique MultiIndex into xarray"
)
if isinstance(idx, pd.MultiIndex):
if not idx.is_unique:
raise ValueError(
"cannot convert a DataFrame with a non-unique MultiIndex into xarray"
)
if sparse and not unstack:
raise ValueError(
"conversion to sparse arrays is no supported when unstack=False"
)

# Cast to a NumPy array first, in case the Series is a pandas Extension
# array (which doesn't have a valid NumPy dtype)
Expand All @@ -7216,21 +7238,35 @@ def from_dataframe(

indexes: dict[Hashable, Index] = {}
index_vars: dict[Hashable, Variable] = {}
dims: tuple[Hashable]

def get_dims(index) -> tuple[Hashable]:
if dim is not None:
return (dim,)
elif index.name is not None:
return (index.name,)
else:
return ("index",)

if isinstance(idx, pd.MultiIndex):
dims = tuple(
name if name is not None else "level_%i" % n
for n, name in enumerate(idx.names)
)
for dim, lev in zip(dims, idx.levels):
xr_idx = PandasIndex(lev, dim)
indexes[dim] = xr_idx
index_vars.update(xr_idx.create_variables())
if unstack:
dims = tuple(
name if name is not None else "level_%i" % n
for n, name in enumerate(idx.names)
)
for dim, lev in zip(dims, idx.levels):
xr_idx = PandasIndex(lev, dim)
indexes[dim] = xr_idx
index_vars.update(xr_idx.create_variables())
else:
dims = get_dims(idx)
coords = Coordinates.from_pandas_multiindex(idx, dim=dims[0])
indexes.update(coords.xindexes)
index_vars.update(coords.variables)
else:
index_name = idx.name if idx.name is not None else "index"
dims = (index_name,)
xr_idx = PandasIndex(idx, index_name)
indexes[index_name] = xr_idx
dims = get_dims(idx)
xr_idx = PandasIndex(idx, dims[0])
indexes[dims[0]] = xr_idx
index_vars.update(xr_idx.create_variables())

obj = cls._construct_direct(index_vars, set(index_vars), indexes=indexes)
Expand Down