FEAT-modin-project#7117: Support building range-partitioning from an index level (modin-project#7120)

Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
dchigarev authored Apr 2, 2024
1 parent 3bf5d8f commit 9afc049
Showing 5 changed files with 168 additions and 56 deletions.
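In effect, this commit lets Modin's range-partitioning groupby handle grouping on index levels. A minimal usage sketch (hypothetical driver code, not part of the commit; the `RangePartitioning` config flag is an assumption and its name may differ between Modin versions):

import modin.pandas as pd
import modin.config as cfg

cfg.RangePartitioning.put(True)  # assumption: routes groupby through the range-partitioning implementation

df = pd.DataFrame({"x": [1, 2, 3, 4]}, index=pd.Index([0, 0, 1, 1], name="key"))
print(df.groupby(level="key").sum())  # grouping on an index level, previously NotImplementedError on this path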
39 changes: 32 additions & 7 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -2446,6 +2446,7 @@ def _apply_func_to_range_partitioning(
preserve_columns=False,
data=None,
data_key_columns=None,
+ level=None,
**kwargs,
):
"""
@@ -2454,7 +2455,7 @@
Parameters
----------
key_columns : list of hashables
- Columns to build the range partitioning for.
+ Columns to build the range partitioning for. Can't be specified along with `level`.
func : callable(pandas.DataFrame) -> pandas.DataFrame
Function to apply against partitions.
ascending : bool, default: True
@@ -2467,6 +2468,8 @@
``df["grouper"] # self`` and ``df["data"] # data``.
data_key_columns : list of hashables, optional
Additional key columns from `data`. Will be combined with `key_columns`.
+ level : list of ints or labels, optional
+ Index level(s) to build the range partitioning for. Can't be specified along with `key_columns`.
**kwargs : dict
Additional arguments to forward to the range builder function.
@@ -2575,14 +2578,21 @@ def _apply_func_to_range_partitioning(
key_columns,
ascending[0] if is_list_like(ascending) else ascending,
ideal_num_new_partitions,
+ level=level,
**kwargs,
)

- # here we want to get indices of those partitions that hold the key columns
- key_indices = grouper.columns.get_indexer_for(key_columns)
- partition_indices = np.unique(
- np.digitize(key_indices, np.cumsum(grouper.column_widths))
- )
+ if key_columns:
+ # here we want to get indices of those partitions that hold the key columns
+ key_indices = grouper.columns.get_indexer_for(key_columns)
+ partition_indices = np.unique(
+ np.digitize(key_indices, np.cumsum(grouper.column_widths))
+ )
+ elif level is not None:
+ # each partition contains an index, so taking the first one
+ partition_indices = [0]
+ else:
+ raise ValueError("Must specify either 'level' or 'key_columns'")

new_partitions = grouper._partition_mgr_cls.shuffle_partitions(
new_partitions,
@@ -4038,6 +4048,10 @@ def groupby(
duplicated_suffix = "__duplicated_suffix__"
duplicated_pattern = r"_[\d]*__duplicated_suffix__"
kwargs["observed"] = True
+ level = kwargs.get("level")
+
+ if level is not None and not isinstance(level, list):
+ level = [level]

def apply_func(df): # pragma: no cover
if has_external_grouper:
@@ -4081,6 +4095,12 @@ def apply_func(df): # pragma: no cover

if series_groupby:
df = df.squeeze(axis=1)

+ if kwargs.get("level") is not None:
+ assert len(by) == 0
+ # passing an empty list triggers an error
+ by = None

result = operator(df.groupby(by, **kwargs))

if align_result_columns and df.empty and result.empty:
@@ -4135,6 +4155,7 @@ def apply_func(df): # pragma: no cover
func=apply_func,
data=data,
data_key_columns=data_key_columns,
+ level=level,
)
# no need aligning columns if there's only one row partition
if add_missing_cats or align_result_columns and result._partitions.shape[0] > 1:
@@ -4293,7 +4314,11 @@ def join_cols(df, *cols):
row_lengths=result._row_lengths_cache,
)

- if not result.has_materialized_index and not has_external_grouper:
+ if (
+ not result.has_materialized_index
+ and not has_external_grouper
+ and level is None
+ ):
by_dtypes = ModinDtypes(self._dtypes).lazy_get(internal_by)
if by_dtypes.is_materialized:
new_index = ModinIndex(value=result, axis=0, dtypes=by_dtypes)
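As a standalone sketch of the partition-index computation in the `key_columns` branch above (values invented for illustration):

import numpy as np

column_widths = [3, 2, 4]       # widths of three column partitions
key_indices = np.array([1, 4])  # positions of the key columns in the whole frame
# digitizing against cumulative widths maps each key column to the partition holding it
print(np.unique(np.digitize(key_indices, np.cumsum(column_widths))))  # [0 1]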
74 changes: 63 additions & 11 deletions modin/core/dataframe/pandas/dataframe/utils.py
@@ -114,34 +114,42 @@ class ShuffleSortFunctions(ShuffleFunctions):
----------
modin_frame : PandasDataframe
The frame to build the range-partitioning for.
- columns : str or list of strings
- The column/columns to use as a key.
+ columns : str, list of strings or None
+ The column/columns to use as a key. Can't be specified along with `level`.
ascending : bool
Whether the ranges should be in ascending or descending order.
ideal_num_new_partitions : int
The ideal number of new partitions.
+ level : list of strings or ints, or None
+ Index level(s) to use as a key. Can't be specified along with `columns`.
**kwargs : dict
Additional keyword arguments.
"""

def __init__(
self,
modin_frame: "PandasDataframe",
- columns: Union[str, list],
+ columns: Optional[Union[str, list]],
ascending: Union[list, bool],
ideal_num_new_partitions: int,
+ level: Optional[list[Union[str, int]]] = None,
**kwargs: dict,
):
self.frame_len = len(modin_frame)
self.ideal_num_new_partitions = ideal_num_new_partitions
self.columns = columns if is_list_like(columns) else [columns]
self.ascending = ascending
self.kwargs = kwargs.copy()
+ self.level = level
self.columns_info = None

def sample_fn(self, partition: pandas.DataFrame) -> pandas.DataFrame:
+ if self.level is not None:
+ partition = self._index_to_df_zero_copy(partition, self.level)
+ else:
+ partition = partition[self.columns]
return self.pick_samples_for_quantiles(
- partition[self.columns], self.ideal_num_new_partitions, self.frame_len
+ partition, self.ideal_num_new_partitions, self.frame_len
)

def pivot_fn(self, samples: "list[pandas.DataFrame]") -> int:
@@ -181,6 +189,7 @@ def split_fn(
partition,
self.columns_info,
self.ascending,
+ keys_are_index_levels=self.level is not None,
**self.kwargs,
)

@@ -312,6 +321,7 @@ def split_partitions_using_pivots_for_sort(
df: pandas.DataFrame,
columns_info: "list[ColumnInfo]",
ascending: bool,
+ keys_are_index_levels: bool = False,
**kwargs: dict,
) -> "tuple[pandas.DataFrame, ...]":
"""
@@ -330,6 +340,8 @@
Information regarding keys and pivots for range partitioning.
ascending : bool
The ascending flag.
+ keys_are_index_levels : bool, default: False
+ Whether `columns_info` describes index levels or actual columns from `df`.
**kwargs : dict
Additional keyword arguments.
@@ -342,9 +354,14 @@
# We can return the dataframe with zero changes if there were no pivots passed
return (df,)

- na_index = (
- df[[col_info.name for col_info in columns_info]].isna().squeeze(axis=1)
+ key_data = (
+ ShuffleSortFunctions._index_to_df_zero_copy(
+ df, [col_info.name for col_info in columns_info]
+ )
+ if keys_are_index_levels
+ else df[[col_info.name for col_info in columns_info]]
)
+ na_index = key_data.isna().squeeze(axis=1)
if na_index.ndim == 2:
na_index = na_index.any(axis=1)
na_rows = df[na_index]
@@ -373,22 +390,27 @@ def get_group(grp, key, df):
pivots = pivots[::-1]
group_keys.append(range(len(pivots) + 1))
key = kwargs.pop("key", None)
- cols_to_digitize = non_na_rows[col_info.name]
+ cols_to_digitize = (
+ non_na_rows.index.get_level_values(col_info.name)
+ if keys_are_index_levels
+ else non_na_rows[col_info.name]
+ )
if key is not None:
cols_to_digitize = key(cols_to_digitize)

+ if cols_to_digitize.ndim == 2:
+ cols_to_digitize = cols_to_digitize.squeeze()
+
if col_info.is_numeric:
- groupby_col = np.digitize(cols_to_digitize.squeeze(), pivots)
+ groupby_col = np.digitize(cols_to_digitize, pivots)
# `np.digitize` returns results based off of the sort order of the pivots it is passed.
# When we only have one unique value in our pivots, `np.digitize` assumes that the pivots
# are sorted in ascending order, and gives us results based off of that assumption - so if
# we actually want to sort in descending order, we need to swap the new indices.
if not ascending and len(np.unique(pivots)) == 1:
groupby_col = len(pivots) - groupby_col
else:
- groupby_col = np.searchsorted(
- pivots, cols_to_digitize.squeeze(), side="right"
- )
+ groupby_col = np.searchsorted(pivots, cols_to_digitize, side="right")
# Since np.searchsorted requires the pivots to be in ascending order, if we want to sort
# in descending order, we need to swap the new indices.
if not ascending:
@@ -426,6 +448,36 @@ def get_group(grp, key, df):
).astype(df.dtypes)
return tuple(groups)
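For reference, the two bucketing branches above agree on already-sorted pivots: `np.digitize` (numeric keys) and `np.searchsorted(..., side="right")` (the non-numeric fallback) assign the same range partitions. A toy check with invented values:

import numpy as np

values = np.array([3, 7, 1, 9, 5])
pivots = np.array([4, 8])  # two pivots split the keys into three range partitions
print(np.digitize(values, pivots))                    # [0 1 0 2 1]
print(np.searchsorted(pivots, values, side="right"))  # [0 1 0 2 1]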

+ @staticmethod
+ def _index_to_df_zero_copy(
+ df: pandas.DataFrame, levels: list[Union[str, int]]
+ ) -> pandas.DataFrame:
+ """
+ Convert index `levels` of `df` to a ``pandas.DataFrame``.
+
+ Parameters
+ ----------
+ df : pandas.DataFrame
+ levels : list of labels or ints
+ Index levels to convert to a dataframe.
+
+ Returns
+ -------
+ pandas.DataFrame
+ The columns in the resulting dataframe use the same data arrays as the index levels
+ in the original `df`, so no copies are made.
+ """
+ # calling 'df.index.to_frame()' creates a copy of the index, so doing the conversion manually
+ # to avoid the copy
+ data = {
+ (
+ df.index.names[lvl] if isinstance(lvl, int) else lvl
+ ): df.index.get_level_values(lvl)
+ for lvl in levels
+ }
+ index_data = pandas.DataFrame(data, index=df.index, copy=False)
+ return index_data
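A quick illustration of the helper above (hypothetical driver code, not part of the commit): building the frame from `get_level_values` with `copy=False` skips the extra copy that `df.index.to_frame()` would make:

import pandas

df = pandas.DataFrame(
    {"x": range(4)},
    index=pandas.MultiIndex.from_arrays(
        [[0, 0, 1, 1], list("abab")], names=["i", "j"]
    ),
)
data = {name: df.index.get_level_values(name) for name in ["i"]}
frame = pandas.DataFrame(data, index=df.index, copy=False)
print(frame)  # single column "i" backed by the index level's values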


def lazy_metadata_decorator(apply_axis=None, axis_arg=-1, transpose=False):
"""
12 changes: 5 additions & 7 deletions modin/core/storage_formats/pandas/query_compiler.py
@@ -3701,12 +3701,7 @@ def _groupby_shuffle(
by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, how, drop
)

- if groupby_kwargs.get("level") is not None:
- raise NotImplementedError(
- "Grouping on an index level is not yet supported by range-partitioning groupby implementation: "
- + "https://github.com/modin-project/modin/issues/5926"
- )
-
+ grouping_on_level = groupby_kwargs.get("level") is not None
if any(
isinstance(obj, pandas.Grouper)
for obj in (by if isinstance(by, list) else [by])
@@ -3716,7 +3711,10 @@
+ "https://github.com/modin-project/modin/issues/5926"
)

- external_by, internal_by, by_positions = self._groupby_separate_by(by, drop)
+ if grouping_on_level:
+ external_by, internal_by, by_positions = [], [], []
+ else:
+ external_by, internal_by, by_positions = self._groupby_separate_by(by, drop)

all_external_are_qcs = all(isinstance(obj, type(self)) for obj in external_by)
if not all_external_are_qcs:
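The `by = None` substitution in `apply_func` (first file) mirrors plain pandas behavior: when grouping purely on an index level, `by` must be omitted or None, since, per the comment in that hunk, passing an empty list triggers an error. A toy check:

import pandas

df = pandas.DataFrame({"x": [1, 2, 3, 4]}, index=pandas.Index([0, 0, 1, 1], name="k"))
print(df.groupby(level="k").sum())  # fine: by defaults to None
# df.groupby([], level="k") would raise, matching the guarded case in apply_func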