Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the speed of from_dataframe with a MultiIndex (by 40x!) #4184

Merged
merged 13 commits into from
Jul 2, 2020
24 changes: 24 additions & 0 deletions asv_bench/benchmarks/pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import numpy as np
import pandas as pd

from . import parameterized


class MultiIndexSeries:
    """Benchmark converting a pandas Series with a MultiIndex to xarray."""

    def setup(self, dtype, subset):
        # 10 letters x 10 letters x 1000 business days -> 100000 entries,
        # matching the size of the random data below.  (Note: the letter
        # string deliberately skips "g" to keep exactly 10 levels.)
        letters = list("abcdefhijk")
        dates = pd.date_range(start="2000-01-01", periods=1000, freq="B")
        index = pd.MultiIndex.from_product([letters, letters, dates])
        values = np.random.rand(100000).astype(dtype)
        full_series = pd.Series(values, index)
        # With subset=True keep only every third entry, so the MultiIndex
        # no longer covers the full product of its levels.
        self.series = full_series[::3] if subset else full_series

    @parameterized(["dtype", "subset"], ([int, float], [True, False]))
    def time_to_xarray(self, dtype, subset):
        self.series.to_xarray()
5 changes: 4 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,10 @@ Enhancements
For orthogonal linear- and nearest-neighbor interpolation, we do 1d-interpolation sequentially
rather than interpolating in multidimensional space. (:issue:`2223`)
By `Keisuke Fujii <https://github.com/fujiisoup>`_.
- :py:meth:`DataArray.reset_index` and :py:meth:`Dataset.reset_index` now keep
- Major performance improvement for :py:meth:`Dataset.from_dataframe` when the
dataframe has a MultiIndex (:pull:`4184`).
By `Stephan Hoyer <https://github.com/shoyer>`_.
- :py:meth:`DataArray.reset_index` and :py:meth:`Dataset.reset_index` now keep
coordinate attributes (:pull:`4103`). By `Oriol Abril <https://github.com/OriolAbril>`_.

New Features
Expand Down
40 changes: 31 additions & 9 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4582,17 +4582,39 @@ def _set_sparse_data_from_dataframe(
def _set_numpy_data_from_dataframe(
self, dataframe: pd.DataFrame, dims: tuple
) -> None:

idx = dataframe.index
if isinstance(idx, pd.MultiIndex):
# expand the DataFrame to include the product of all levels
full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names)
dataframe = dataframe.reindex(full_idx)
shape = tuple(lev.size for lev in idx.levels)
else:
shape = (idx.size,)

if not isinstance(idx, pd.MultiIndex):
for name, series in dataframe.items():
self[name] = (dims, np.asarray(series))
return

if not idx.is_unique:
raise ValueError(
"cannot convert a DataFrame with a non-unique MultiIndex into xarray"
)

shape = tuple(lev.size for lev in idx.levels)
full_indexer = tuple(idx.codes)

# We already verified that the MultiIndex has all unique values, so
# there are missing values if and only if the size of output arrays is
# larger that the index.
missing_values = np.prod(shape) > idx.shape[0]

for name, series in dataframe.items():
data = np.asarray(series).reshape(shape)
self[name] = (dims, data)
data = np.asarray(series)
# NumPy indexing is much faster than using DataFrame.reindex to
# fill in missing values:
# https://stackoverflow.com/a/35049899/809705
if missing_values:
dtype, fill_value = dtypes.maybe_promote(data.dtype)
new_data = np.full(shape, fill_value, dtype)
else:
new_data = np.zeros(shape, data.dtype)
new_data[full_indexer] = data
self[name] = (dims, new_data)

@classmethod
def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> "Dataset":
Expand Down
27 changes: 27 additions & 0 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4013,6 +4013,33 @@ def test_to_and_from_empty_dataframe(self):
assert len(actual) == 0
assert expected.equals(actual)

def test_from_dataframe_multiindex(self):
    # Round-trip a DataFrame whose index is the full 2x3 product.
    midx = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]], names=["x", "y"])
    frame = pd.DataFrame({"z": np.arange(6)}, index=midx)

    expected = Dataset(
        {"z": (("x", "y"), [[0, 1, 2], [3, 4, 5]])},
        coords={"x": ["a", "b"], "y": [1, 2, 3]},
    )
    assert_identical(Dataset.from_dataframe(frame), expected)

    # Row order in the DataFrame should not affect the result.
    shuffled = frame.iloc[[3, 2, 1, 0, 4, 5], :]
    assert_identical(Dataset.from_dataframe(shuffled), expected)

    # A partial index is filled out with NaN for the missing entries.
    truncated = frame.iloc[:4, :]
    expected_missing = Dataset(
        {"z": (("x", "y"), [[0, 1, 2], [3, np.nan, np.nan]])},
        coords={"x": ["a", "b"], "y": [1, 2, 3]},
    )
    assert_identical(Dataset.from_dataframe(truncated), expected_missing)

    # Duplicate MultiIndex entries cannot be placed uniquely and must raise.
    duplicated = frame.iloc[[0, 0], :]
    with raises_regex(ValueError, "non-unique MultiIndex"):
        Dataset.from_dataframe(duplicated)

def test_from_dataframe_non_unique_columns(self):
# regression test for GH449
df = pd.DataFrame(np.zeros((2, 2)))
Expand Down