Skip to content

Commit

Permalink
FEAT-#1189: Add DataFrame.stack (#1673)
Browse files Browse the repository at this point in the history
Signed-off-by: Alexey Prutskov <alexey.prutskov@intel.com>
  • Loading branch information
prutskov authored Aug 31, 2020
1 parent 633a8b0 commit 3ddd5c0
Show file tree
Hide file tree
Showing 6 changed files with 128 additions and 8 deletions.
2 changes: 1 addition & 1 deletion docs/supported_apis/dataframe_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ default to pandas.
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``squeeze`` | `squeeze`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``stack`` | `stack`_ | D | |
| ``stack`` | `stack`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``std`` | `std`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
Expand Down
3 changes: 3 additions & 0 deletions modin/backends/base/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,9 @@ def unique(self, **kwargs):
def value_counts(self, **kwargs):
pass

def stack(self, level, dropna):
    """
    Stack the prescribed column level(s) onto the index.

    Placeholder in the base query compiler: it performs no work and
    returns ``None``; concrete backends supply the implementation.

    Parameters
    ----------
    level : int, str or list of these
        Column level(s) to stack.
    dropna : bool
        Whether rows with missing values are dropped from the result.
    """

# Abstract map partitions across select indices
@abc.abstractmethod
def astype(self, col_dtypes, **kwargs):
Expand Down
17 changes: 17 additions & 0 deletions modin/backends/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1148,6 +1148,23 @@ def map_func(df):
result = result.reindex(0, new_index)
return result

def stack(self, level, dropna):
    """
    Stack the prescribed column level(s) onto the index.

    Parameters
    ----------
    level : int, str or list of these
        Column level(s) to stack; forwarded to ``pandas.DataFrame.stack``.
    dropna : bool
        Forwarded to ``pandas.DataFrame.stack``.

    Returns
    -------
    The object produced by ``self.__constructor__`` for the stacked frame.
    """
    # All column levels are consumed when the columns are a flat Index, or
    # when `level` lists as many levels as the MultiIndex has. In that case
    # the single output column is labelled "__reduced__"; otherwise no
    # column labels are supplied to `_apply_full_axis`.
    stacks_all_levels = not isinstance(self.columns, pandas.MultiIndex) or (
        is_list_like(level) and len(level) == self.columns.nlevels
    )
    new_columns = ["__reduced__"] if stacks_all_levels else None

    def stack_partition(df):
        # Wrap in a DataFrame so the result is a frame even when pandas
        # hands back a Series.
        return pandas.DataFrame(df.stack(level=level, dropna=dropna))

    new_modin_frame = self._modin_frame._apply_full_axis(
        1,
        stack_partition,
        new_columns=new_columns,
    )
    return self.__constructor__(new_modin_frame)

# Map partitions operations
# These operations apply a function to every partition.
abs = MapFunction.register(pandas.DataFrame.abs, dtypes="copy")
Expand Down
44 changes: 41 additions & 3 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1688,12 +1688,14 @@ def unstack(self, level=-1, fill_value=None):
If the index is not a MultiIndex, the output will be a Series
(the analogue of stack when the columns are not a MultiIndex).
The level involved will automatically get sorted.
Parameters
----------
level : int, str, or list of these, default -1 (last level)
Level(s) of index to unstack, can pass level name.
fill_value : int, str or dict
Replace NaN with this value if the unstack produces missing values.
Returns
-------
Series or DataFrame
Expand Down Expand Up @@ -2162,9 +2164,45 @@ def squeeze(self, axis=None):
return self.copy()

def stack(self, level=-1, dropna=True):
    """
    Stack the prescribed level(s) from columns to index.

    Return a reshaped DataFrame or Series having a multi-level
    index with one or more new inner-most levels compared to the current
    DataFrame. The new inner-most levels are created by pivoting the
    columns of the current dataframe:

    - if the columns have a single level, the output is a Series;
    - if the columns have multiple levels, the new index
      level(s) is (are) taken from the prescribed level(s) and
      the output is a DataFrame.

    Parameters
    ----------
    level : int, str, list, default -1
        Level(s) to stack from the column axis onto the index
        axis, defined as one index or label, or a list of indices
        or labels.
    dropna : bool, default True
        Whether to drop rows in the resulting Frame/Series with
        missing values. Stacking a column level onto the index
        axis can create combinations of index and column values
        that are missing from the original dataframe.

    Returns
    -------
    DataFrame or Series
        Stacked dataframe or series.
    """
    # Decide the result type before running the operation: every column
    # level is consumed when the columns are a flat Index, or when `level`
    # lists as many levels as the MultiIndex has.
    stacks_all_levels = not isinstance(self.columns, pandas.MultiIndex) or (
        is_list_like(level) and len(level) == self.columns.nlevels
    )
    stacked_compiler = self._query_compiler.stack(level, dropna)
    if stacks_all_levels:
        return self._reduce_dimension(query_compiler=stacked_compiler)
    return DataFrame(query_compiler=stacked_compiler)

def sub(self, other, axis="columns", level=None, fill_value=None):
return self._binary_op(
Expand Down
2 changes: 2 additions & 0 deletions modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1088,12 +1088,14 @@ def unstack(self, level=-1, fill_value=None):
"""
Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.
The level involved will automatically get sorted.
Parameters
----------
level : int, str, or list of these, default last level
Level(s) to unstack, can pass level name.
fill_value : scalar value, default None
Value to use when replacing NaN values.
Returns
-------
DataFrame
Expand Down
68 changes: 64 additions & 4 deletions modin/pandas/test/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2736,10 +2736,70 @@ def test_slice_shift(self, data, index, axis, periods):
pandas_df.slice_shift(periods=periods, axis=axis),
)

@pytest.mark.parametrize(
    "is_multi_idx", [True, False], ids=["idx_multi", "idx_index"]
)
@pytest.mark.parametrize(
    "is_multi_col", [True, False], ids=["col_multi", "col_index"]
)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_stack(self, data, is_multi_idx, is_multi_col):
    """
    Compare ``stack`` between Modin and pandas.

    Covers flat and multi-level row labels, flat and multi-level column
    labels, and (for multi-level columns) stacking one, two and all three
    column levels.
    """
    pandas_df = pandas.DataFrame(data)
    modin_df = pd.DataFrame(data)

    # The multi-level layouts are hand-written for specific axis lengths.
    # Skip explicitly for any other length instead of failing later on an
    # unbound local name.
    if is_multi_idx:
        n_rows = len(pandas_df.index)
        if n_rows == 256:
            index = pd.MultiIndex.from_product(
                [
                    ["a", "b", "c", "d"],
                    ["x", "y", "z", "last"],
                    ["i", "j", "k", "index"],
                    [1, 2, 3, 4],
                ]
            )
        elif n_rows == 100:
            index = pd.MultiIndex.from_product(
                [
                    ["x", "y", "z", "last"],
                    ["a", "b", "c", "d", "f"],
                    ["i", "j", "k", "l", "index"],
                ]
            )
        else:
            pytest.skip("no MultiIndex layout defined for {} rows".format(n_rows))
    else:
        index = pandas_df.index

    if is_multi_col:
        n_cols = len(pandas_df.columns)
        if n_cols == 64:
            columns = pd.MultiIndex.from_product(
                [
                    ["A", "B", "C", "D"],
                    ["xx", "yy", "zz", "LAST"],
                    [10, 20, 30, 40],
                ]
            )
        elif n_cols == 100:
            columns = pd.MultiIndex.from_product(
                [
                    ["xx", "yy", "zz", "LAST"],
                    ["A", "B", "C", "D", "F"],
                    ["I", "J", "K", "L", "INDEX"],
                ]
            )
        else:
            pytest.skip("no MultiIndex layout defined for {} columns".format(n_cols))
    else:
        columns = pandas_df.columns

    pandas_df.columns = columns
    pandas_df.index = index

    modin_df.columns = columns
    modin_df.index = index

    df_equals(modin_df.stack(), pandas_df.stack())

    # Explicit level selections only make sense for multi-level columns.
    if is_multi_col:
        df_equals(modin_df.stack(level=0), pandas_df.stack(level=0))
        df_equals(modin_df.stack(level=[0, 1]), pandas_df.stack(level=[0, 1]))
        df_equals(
            modin_df.stack(level=[0, 1, 2]), pandas_df.stack(level=[0, 1, 2])
        )

def test_style(self):
data = test_data_values[0]
Expand Down

0 comments on commit 3ddd5c0

Please sign in to comment.