Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the speed of from_dataframe with a MultiIndex (by 40x!) #4184

Merged
merged 13 commits into from
Jul 2, 2020
24 changes: 24 additions & 0 deletions asv_bench/benchmarks/pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import numpy as np
import pandas as pd

from . import parameterized


class MultiIndexSeries:
    """Benchmark converting a pandas Series with a MultiIndex to xarray."""

    def setup(self, dtype, subset):
        # 10 letters x 10 letters x 1000 business days -> 100000 entries,
        # matching the size of the random data below.  (Note: the letter
        # string deliberately skips "g" to keep exactly 10 levels.)
        letters = list("abcdefhijk")
        dates = pd.date_range(start="2000-01-01", periods=1000, freq="B")
        index = pd.MultiIndex.from_product([letters, letters, dates])
        values = np.random.rand(100000).astype(dtype)
        full_series = pd.Series(values, index)
        # With subset=True keep only every third entry, so the MultiIndex
        # no longer covers the full product of its levels.
        self.series = full_series[::3] if subset else full_series

    @parameterized(["dtype", "subset"], ([int, float], [True, False]))
    def time_to_xarray(self, dtype, subset):
        self.series.to_xarray()
5 changes: 4 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,10 @@ Enhancements
For orthogonal linear- and nearest-neighbor interpolation, we do 1d-interpolation sequentially
rather than interpolating in multidimensional space. (:issue:`2223`)
By `Keisuke Fujii <https://github.com/fujiisoup>`_.
- :py:meth:`DataArray.reset_index` and :py:meth:`Dataset.reset_index` now keep
- Major performance improvement for :py:meth:`Dataset.from_dataframe` when the
dataframe has a MultiIndex (:pull:`4184`).
By `Stephan Hoyer <https://github.com/shoyer>`_.
- :py:meth:`DataArray.reset_index` and :py:meth:`Dataset.reset_index` now keep
coordinate attributes (:pull:`4103`). By `Oriol Abril <https://github.com/OriolAbril>`_.

New Features
Expand Down
40 changes: 31 additions & 9 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4582,17 +4582,39 @@ def _set_sparse_data_from_dataframe(
def _set_numpy_data_from_dataframe(
self, dataframe: pd.DataFrame, dims: tuple
) -> None:

idx = dataframe.index
if isinstance(idx, pd.MultiIndex):
# expand the DataFrame to include the product of all levels
full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names)
dataframe = dataframe.reindex(full_idx)
shape = tuple(lev.size for lev in idx.levels)
else:
shape = (idx.size,)

if not isinstance(idx, pd.MultiIndex):
for name, series in dataframe.items():
self[name] = (dims, np.asarray(series))
return

if not idx.is_unique:
raise ValueError(
"cannot convert a DataFrame with a non-unique MultiIndex into xarray"
)

shape = tuple(lev.size for lev in idx.levels)
full_indexer = tuple(idx.codes)

# We already verified that the MultiIndex has all unique values, so
# there are missing values if and only if the size of output arrays is
# larger that the index.
missing_values = np.prod(shape) > idx.shape[0]

for name, series in dataframe.items():
data = np.asarray(series).reshape(shape)
self[name] = (dims, data)
data = np.asarray(series)
# NumPy indexing is much faster than using DataFrame.reindex to
# fill in missing values:
# https://stackoverflow.com/a/35049899/809705
if missing_values:
dtype, fill_value = dtypes.maybe_promote(data.dtype)
new_data = np.full(shape, fill_value, dtype)
else:
new_data = np.zeros(shape, data.dtype)
new_data[full_indexer] = data
self[name] = (dims, new_data)

@classmethod
def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> "Dataset":
Expand Down
27 changes: 27 additions & 0 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4013,6 +4013,33 @@ def test_to_and_from_empty_dataframe(self):
assert len(actual) == 0
assert expected.equals(actual)

def test_from_dataframe_multiindex(self):
    # Round-trip a DataFrame whose index is the full 2x3 product.
    midx = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]], names=["x", "y"])
    frame = pd.DataFrame({"z": np.arange(6)}, index=midx)

    expected = Dataset(
        {"z": (("x", "y"), [[0, 1, 2], [3, 4, 5]])},
        coords={"x": ["a", "b"], "y": [1, 2, 3]},
    )
    assert_identical(Dataset.from_dataframe(frame), expected)

    # Row order in the DataFrame should not affect the result.
    shuffled = frame.iloc[[3, 2, 1, 0, 4, 5], :]
    assert_identical(Dataset.from_dataframe(shuffled), expected)

    # A partial index is filled out with NaN for the missing entries.
    truncated = frame.iloc[:4, :]
    expected_missing = Dataset(
        {"z": (("x", "y"), [[0, 1, 2], [3, np.nan, np.nan]])},
        coords={"x": ["a", "b"], "y": [1, 2, 3]},
    )
    assert_identical(Dataset.from_dataframe(truncated), expected_missing)

    # Duplicate MultiIndex entries cannot be placed uniquely and must raise.
    duplicated = frame.iloc[[0, 0], :]
    with raises_regex(ValueError, "non-unique MultiIndex"):
        Dataset.from_dataframe(duplicated)

def test_from_dataframe_non_unique_columns(self):
# regression test for GH449
df = pd.DataFrame(np.zeros((2, 2)))
Expand Down