FEAT-#1189: Add DataFrame.stack (#1673)

Signed-off-by: Alexey Prutskov <alexey.prutskov@intel.com>
modin-project · Aug 31, 2020 · 3ddd5c0 · 3ddd5c0
1 parent 633a8b0
commit 3ddd5c0
Show file tree

Hide file tree

Showing 6 changed files with 128 additions and 8 deletions.
diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst
@@ -352,7 +352,7 @@ default to pandas.
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+
 | ``squeeze``                | `squeeze`_                | Y                      |                                                    |
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+
-| ``stack``                  | `stack`_                  | D                      |                                                    |
+| ``stack``                  | `stack`_                  | Y                      |                                                    |
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+
 | ``std``                    | `std`_                    | Y                      |                                                    |
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+

diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py
@@ -542,6 +542,9 @@ def unique(self, **kwargs):
     def value_counts(self, **kwargs):
         pass
 
+    def stack(self, level, dropna):  # NOTE(review): no @abc.abstractmethod here, unlike astype below - confirm intent
+        pass
+
     # Abstract map partitions across select indices
     @abc.abstractmethod
     def astype(self, col_dtypes, **kwargs):

diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py
@@ -1148,6 +1148,23 @@ def map_func(df):
             result = result.reindex(0, new_index)
         return result
 
+    def stack(self, level, dropna):
+        """Stack column level(s) onto the index, as ``pandas.DataFrame.stack``."""
+        cols = self.columns
+        # Single-level columns, or `level` naming every level: one "__reduced__" column.
+        collapses = not isinstance(cols, pandas.MultiIndex) or (
+            is_list_like(level) and len(level) == cols.nlevels
+        )
+        new_columns = ["__reduced__"] if collapses else None
+
+        def stack_partition(df):
+            return pandas.DataFrame(df.stack(level=level, dropna=dropna))
+
+        stacked_frame = self._modin_frame._apply_full_axis(
+            1, stack_partition, new_columns=new_columns
+        )
+        return self.__constructor__(stacked_frame)
+
     # Map partitions operations
     # These operations are operations that apply a function to every partition.
     abs = MapFunction.register(pandas.DataFrame.abs, dtypes="copy")

diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
@@ -1688,12 +1688,14 @@ def unstack(self, level=-1, fill_value=None):
         If the index is not a MultiIndex, the output will be a Series
         (the analogue of stack when the columns are not a MultiIndex).
         The level involved will automatically get sorted.
+
         Parameters
         ----------
         level : int, str, or list of these, default -1 (last level)
             Level(s) of index to unstack, can pass level name.
         fill_value : int, str or dict
             Replace NaN with this value if the unstack produces missing values.
+
         Returns
         -------
         Series or DataFrame
@@ -2162,9 +2164,45 @@ def squeeze(self, axis=None):
             return self.copy()
 
     def stack(self, level=-1, dropna=True):
-        return self._default_to_pandas(
-            pandas.DataFrame.stack, level=level, dropna=dropna
-        )
+        """
+        Stack the prescribed level(s) from columns to index.
+
+        The stacked column level(s) become the new inner-most level(s)
+        of the row index of the result:
+          - if the columns have a single level, the output is a Series;
+          - if the columns have multiple levels, the new index
+            level(s) is (are) taken from the prescribed level(s) and
+            the output is a DataFrame.
+
+        Parameters
+        ----------
+        level : int, str, list, default -1
+            Level(s) to stack from the column axis onto the index
+            axis, defined as one index or label, or a list of indices
+            or labels.
+        dropna : bool, default True
+            Whether to drop rows in the resulting Frame/Series with
+            missing values. Stacking a column level onto the index
+            axis can create combinations of index and column values
+            that are missing from the original dataframe.
+
+        Returns
+        -------
+        DataFrame or Series
+            Stacked dataframe or series.
+        """
+        columns = self.columns
+        # Single-level columns, or `level` listing as many levels as the
+        # columns have: the result is passed through `_reduce_dimension`.
+        to_series = not isinstance(columns, pandas.MultiIndex) or (
+            is_list_like(level) and len(level) == columns.nlevels
+        )
+        # The query compiler reshapes the data in both cases; only the
+        # wrapper around its result differs.
+        stacked = self._query_compiler.stack(level, dropna)
+        if to_series:
+            return self._reduce_dimension(query_compiler=stacked)
+        return DataFrame(query_compiler=stacked)
 
     def sub(self, other, axis="columns", level=None, fill_value=None):
         return self._binary_op(

diff --git a/modin/pandas/series.py b/modin/pandas/series.py
@@ -1088,12 +1088,14 @@ def unstack(self, level=-1, fill_value=None):
         """
         Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.
         The level involved will automatically get sorted.
+
         Parameters
         ----------
         level : int, str, or list of these, default last level
             Level(s) to unstack, can pass level name.
         fill_value : scalar value, default None
             Value to use when replacing NaN values.
+
         Returns
         -------
         DataFrame

diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py
@@ -2736,10 +2736,70 @@ def test_slice_shift(self, data, index, axis, periods):
             pandas_df.slice_shift(periods=periods, axis=axis),
         )
 
-    def test_stack(self):
-        data = test_data_values[0]
-        with pytest.warns(UserWarning):
-            pd.DataFrame(data).stack()
+    @pytest.mark.parametrize(
+        "is_multi_idx", [True, False], ids=["idx_multi", "idx_index"]
+    )
+    @pytest.mark.parametrize(
+        "is_multi_col", [True, False], ids=["col_multi", "col_index"]
+    )
+    @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+    def test_stack(self, data, is_multi_idx, is_multi_col):  # modin stack() must match pandas stack()
+        pandas_df = pandas.DataFrame(data)
+        modin_df = pd.DataFrame(data)
+
+        if is_multi_idx:  # NOTE(review): `index` stays unbound for any other row count
+            if len(pandas_df.index) == 256:  # 4 * 4 * 4 * 4 product entries
+                index = pd.MultiIndex.from_product(
+                    [
+                        ["a", "b", "c", "d"],
+                        ["x", "y", "z", "last"],
+                        ["i", "j", "k", "index"],
+                        [1, 2, 3, 4],
+                    ]
+                )
+            elif len(pandas_df.index) == 100:  # 4 * 5 * 5 product entries
+                index = pd.MultiIndex.from_product(
+                    [
+                        ["x", "y", "z", "last"],
+                        ["a", "b", "c", "d", "f"],
+                        ["i", "j", "k", "l", "index"],
+                    ]
+                )
+        else:
+            index = pandas_df.index
+
+        if is_multi_col:  # NOTE(review): `columns` stays unbound for any other column count
+            if len(pandas_df.columns) == 64:  # 4 * 4 * 4 product entries
+                columns = pd.MultiIndex.from_product(
+                    [
+                        ["A", "B", "C", "D"],
+                        ["xx", "yy", "zz", "LAST"],
+                        [10, 20, 30, 40],
+                    ]
+                )
+            elif len(pandas_df.columns) == 100:  # 4 * 5 * 5 product entries
+                columns = pd.MultiIndex.from_product(
+                    [
+                        ["xx", "yy", "zz", "LAST"],
+                        ["A", "B", "C", "D", "F"],
+                        ["I", "J", "K", "L", "INDEX"],
+                    ]
+                )
+        else:
+            columns = pandas_df.columns
+
+        pandas_df.columns = columns
+        pandas_df.index = index
+
+        modin_df.columns = columns
+        modin_df.index = index
+
+        df_equals(modin_df.stack(), pandas_df.stack())  # default arguments
+
+        if is_multi_col:  # both MultiIndex column layouts above have three levels
+            df_equals(modin_df.stack(level=0), pandas_df.stack(level=0))
+            df_equals(modin_df.stack(level=[0, 1]), pandas_df.stack(level=[0, 1]))
+            df_equals(modin_df.stack(level=[0, 1, 2]), pandas_df.stack(level=[0, 1, 2]))  # every level
 
     def test_style(self):
         data = test_data_values[0]