Commit ebd07dd

dchigarev and YarShev authored

FIX-#1294: fixed 'value_counts' implementation (#2730)

Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
Co-authored-by: Yaroslav Igoshev <Poolliver868@mail.ru>

1 parent dda6ebb commit ebd07dd

File tree

11 files changed: +176 −208 lines

modin/backends/base/query_compiler.py

Lines changed: 0 additions & 24 deletions
@@ -1293,30 +1293,6 @@ def searchsorted(self, **kwargs): # noqa: PR02

     # END Abstract map partitions operations

-    @doc_utils.add_one_column_warning
-    @doc_utils.add_refer_to("Series.value_counts")
-    def value_counts(self, **kwargs): # noqa: PR02
-        """
-        Count unique values of one-column `self`.
-
-        Parameters
-        ----------
-        normalize : bool
-        sort : bool
-        ascending : bool
-        bins : int, optional
-        dropna : bool
-        **kwargs : dict
-            Serves the compatibility purpose. Does not affect the result.
-
-        Returns
-        -------
-        BaseQueryCompiler
-            One-column QueryCompiler which index labels is a unique elements of `self`
-            and each row contains the number of times corresponding value was met in the `self`.
-        """
-        return SeriesDefault.register(pandas.Series.value_counts)(self, **kwargs)
-
     @doc_utils.add_refer_to("DataFrame.stack")
     def stack(self, level, dropna):
         """

modin/backends/pandas/query_compiler.py

Lines changed: 0 additions & 6 deletions
@@ -766,12 +766,6 @@ def reduce_fn(df, **kwargs):
             reduce_fn,
         )(self, axis=axis, **kwargs)

-    def value_counts(self, **kwargs):
-        def value_counts(df):
-            return df.squeeze(axis=1).value_counts(**kwargs).to_frame()
-
-        return self.default_to_pandas(value_counts)
-
     # END MapReduce operations

     # Reduction operations

modin/experimental/backends/omnisci/query_compiler.py

Lines changed: 0 additions & 18 deletions
@@ -415,24 +415,6 @@ def _agg(self, agg, axis=0, level=None, **kwargs):
         )
         return self.__constructor__(new_frame, shape_hint="row")

-    def value_counts(self, **kwargs):
-        subset = kwargs.get("subset", None)
-        normalize = kwargs.get("normalize", False)
-        sort = kwargs.get("sort", True)
-        ascending = kwargs.get("ascending", False)
-        bins = kwargs.get("bins", False)
-        dropna = kwargs.get("dropna", True)
-
-        if bins or normalize:
-            raise NotImplementedError(
-                "OmniSci's 'value_counts' does not support 'bins' and 'normalize' parameters."
-            )
-
-        new_frame = self._modin_frame.value_counts(
-            columns=subset, dropna=dropna, sort=sort, ascending=ascending
-        )
-        return self.__constructor__(new_frame, shape_hint="column")
-
     def _get_index(self):
         """
         Return frame's index.

modin/experimental/engines/omnisci_on_native/frame/data.py

Lines changed: 0 additions & 83 deletions
@@ -568,89 +568,6 @@ def agg(self, agg):
             force_execution_mode=self._force_execution_mode,
         )

-    def value_counts(self, dropna, columns, sort, ascending):
-        """
-        Count unique rows operation.
-
-        Parameters
-        ----------
-        dropna : bool
-            True when rows with NULLs should be ignored.
-        columns : list-like of str or None
-            Columns to use for unique combinations count. Use all
-            columns when None.
-        sort : bool
-            Sort by frequencies.
-        ascending : bool
-            Sort order.
-
-        Returns
-        -------
-        OmnisciOnNativeFrame
-            The new frame.
-        """
-        by = [col for col in self.columns if columns is None or col in columns]
-
-        if not by:
-            raise ValueError("invalid columns subset is specified")
-
-        base = self
-        if dropna:
-            checks = [base.ref(col).is_not_null() for col in by]
-            condition = (
-                checks[0]
-                if len(checks) == 1
-                else OpExpr("AND", [checks], np.dtype("bool"))
-            )
-            base = self.__constructor__(
-                columns=Index.__new__(Index, data=by, dtype="O"),
-                dtypes=base._dtypes[by],
-                op=FilterNode(base, condition),
-                index_cols=None,
-                force_execution_mode=base._force_execution_mode,
-            )
-
-        agg_exprs = OrderedDict()
-        agg_exprs[""] = AggregateExpr("size", None)
-        dtypes = base._dtypes[by].tolist()
-        dtypes.append(np.dtype("int64"))
-
-        new_columns = Index.__new__(Index, data=[""], dtype="O")
-
-        res = self.__constructor__(
-            columns=new_columns,
-            dtypes=dtypes,
-            op=GroupbyAggNode(base, by, agg_exprs, {"sort": False}),
-            index_cols=by.copy(),
-            force_execution_mode=base._force_execution_mode,
-        )
-
-        if sort or ascending:
-            res = self.__constructor__(
-                columns=res.columns,
-                dtypes=res._dtypes,
-                op=SortNode(res, [""], [ascending], "last"),
-                index_cols=res._index_cols,
-                force_execution_mode=res._force_execution_mode,
-            )
-
-        # If a single column is used then it keeps its name.
-        # TODO: move it to upper levels when index renaming is in place.
-        if len(by) == 1:
-            exprs = OrderedDict()
-            exprs["__index__"] = res.ref(by[0])
-            exprs[by[0]] = res.ref("")
-
-            res = self.__constructor__(
-                columns=Index.__new__(Index, data=by, dtype="O"),
-                dtypes=self._dtypes_for_exprs(exprs),
-                op=TransformNode(res, exprs),
-                index_cols=["__index__"],
-                force_execution_mode=res._force_execution_mode,
-            )
-
-        return res
-
     def fillna(self, value=None, method=None, axis=None, limit=None, downcast=None):
         """
         Replace NULLs operation.

modin/experimental/engines/omnisci_on_native/test/test_dataframe.py

Lines changed: 21 additions & 4 deletions
@@ -20,6 +20,7 @@

 from modin.config import IsExperimental, Engine, Backend
 from modin.pandas.test.utils import io_ops_bad_exc
+from pandas.core.dtypes.common import is_list_like

 IsExperimental.put(True)
 Engine.put("native")
@@ -35,6 +36,7 @@
     generate_multiindex,
     eval_general,
     eval_io,
+    df_equals_with_non_stable_indices,
 )

 from modin.experimental.engines.omnisci_on_native.frame.partition_manager import (
@@ -58,6 +60,7 @@ def run_and_compare(
     force_lazy=True,
     force_arrow_execute=False,
     allow_subqueries=False,
+    comparator=df_equals,
     **kwargs,
 ):
     def run_modin(
@@ -120,7 +123,7 @@ def run_modin(
         constructor_kwargs=constructor_kwargs,
         **kwargs,
     )
-    df_equals(ref_res, exp_res)
+    comparator(ref_res, exp_res)


 @pytest.mark.usefixtures("TestReadCSVFixture")
@@ -1164,21 +1167,35 @@ def apply(df, **kwargs):

         run_and_compare(apply, data=self.data, force_lazy=False)

-    @pytest.mark.parametrize("cols", ["a", "d"])
+    @pytest.mark.parametrize("data", [data, int_data], ids=["nan_data", "int_data"])
+    @pytest.mark.parametrize("cols", ["a", "d", ["a", "d"]])
     @pytest.mark.parametrize("dropna", [True, False])
     @pytest.mark.parametrize("sort", [True])
     @pytest.mark.parametrize("ascending", [True, False])
-    def test_value_counts(self, cols, dropna, sort, ascending):
+    def test_value_counts(self, data, cols, dropna, sort, ascending):
         def value_counts(df, cols, dropna, sort, ascending, **kwargs):
             return df[cols].value_counts(dropna=dropna, sort=sort, ascending=ascending)

+        if dropna and pandas.DataFrame(
+            data, columns=cols if is_list_like(cols) else [cols]
+        ).isna().any(axis=None):
+            pytest.xfail(
+                reason="'dropna' parameter is forcibly disabled in OmniSci's GroupBy"
+                "due to performance issues, you can track this problem at:"
+                "https://github.com/modin-project/modin/issues/2896"
+            )
+
+        # Custom comparator is required because pandas is inconsistent about
+        # the order of equal values, we can't match this behaviour. For more details:
+        # https://github.com/modin-project/modin/issues/1650
         run_and_compare(
             value_counts,
-            data=self.data,
+            data=data,
             cols=cols,
             dropna=dropna,
             sort=sort,
             ascending=ascending,
+            comparator=df_equals_with_non_stable_indices,
         )

     @pytest.mark.parametrize(
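
The comment added in the test refers to equal counts being emitted in a backend-dependent order. The real comparator is df_equals_with_non_stable_indices from modin.pandas.test.utils; the sketch below only illustrates the idea against plain pandas objects and is not that implementation (the helper name here is made up):

import pandas

def compare_ignoring_tie_order(expected: pandas.Series, actual: pandas.Series):
    # Align both results by index so rows whose counts tie can be matched up
    # regardless of which order either library produced them in.
    expected, actual = expected.sort_index(), actual.sort_index()
    assert expected.index.equals(actual.index)
    assert (expected.to_numpy() == actual.to_numpy()).all()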

modin/pandas/base.py

Lines changed: 33 additions & 1 deletion
@@ -41,7 +41,7 @@
     TimestampConvertibleTypes,
 )
 import re
-from typing import Optional, Union
+from typing import Optional, Union, Sequence, Hashable
 import warnings
 import pickle as pkl

@@ -2863,6 +2863,38 @@ def tz_localize(
         )
         return self.set_axis(labels=new_labels, axis=axis, inplace=not copy)

+    # TODO: uncomment the following lines when #3331 issue will be closed
+    # @prepend_to_notes(
+    #     """
+    #     In comparison with pandas, Modin's ``value_counts`` returns Series with ``MultiIndex``
+    #     only if multiple columns were passed via the `subset` parameter, otherwise, the resulted
+    #     Series's index will be a regular single dimensional ``Index``.
+    #     """
+    # )
+    # @_inherit_docstrings(pandas.DataFrame.value_counts, apilink="pandas.DataFrame.value_counts")
+    def value_counts(
+        self,
+        subset: Sequence[Hashable] = None,
+        normalize: bool = False,
+        sort: bool = True,
+        ascending: bool = False,
+        dropna: bool = True,
+    ):
+        if subset is None:
+            subset = self._query_compiler.columns
+        counted_values = self.groupby(by=subset, sort=False, dropna=dropna).size()
+        if sort:
+            counted_values.sort_values(ascending=ascending, inplace=True)
+        if normalize:
+            counted_values = counted_values / counted_values.sum()
+        # TODO: uncomment when strict compability mode will be implemented:
+        # https://github.com/modin-project/modin/issues/3411
+        # if STRICT_COMPABILITY and not isinstance(counted_values.index, MultiIndex):
+        #     counted_values.index = pandas.MultiIndex.from_arrays(
+        #         [counted_values.index], names=counted_values.index.names
+        #     )
+        return counted_values
+
     def var(
         self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
     ):
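
The new BasePandasDataset.value_counts counts unique rows through groupby().size() instead of defaulting to pandas. A minimal standalone sketch of the same logic written against plain pandas (the helper name is hypothetical; assumes pandas >= 1.1 for the groupby dropna argument):

import pandas

def value_counts_via_groupby(
    df: pandas.DataFrame,
    subset=None,
    normalize=False,
    sort=True,
    ascending=False,
    dropna=True,
):
    # Count unique rows with groupby().size() rather than a full default-to-pandas
    # value_counts, mirroring the approach the added method takes.
    if subset is None:
        subset = list(df.columns)
    counted = df.groupby(by=subset, sort=False, dropna=dropna).size()
    if sort:
        counted = counted.sort_values(ascending=ascending)
    if normalize:
        counted = counted / counted.sum()
    return counted

print(value_counts_via_groupby(pandas.DataFrame({"a": [1, 1, 2], "b": [3, 3, 4]})))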

modin/pandas/dataframe.py

Lines changed: 1 addition & 21 deletions
@@ -31,7 +31,7 @@
 import functools
 import numpy as np
 import sys
-from typing import IO, Optional, Sequence, Tuple, Union, Mapping, Iterator, Hashable
+from typing import IO, Optional, Tuple, Union, Mapping, Iterator
 import warnings

 from modin.error_message import ErrorMessage
@@ -2365,26 +2365,6 @@ def update(
         )
         self._update_inplace(new_query_compiler=query_compiler)

-    def value_counts(
-        self,
-        subset: Sequence[Hashable] = None,
-        normalize: bool = False,
-        sort: bool = True,
-        ascending: bool = False,
-        dropna: bool = True,
-    ): # noqa: PR01, RT01, D200
-        """
-        Return a ``Series`` containing counts of unique rows in the ``DataFrame``.
-        """
-        return self._default_to_pandas(
-            "value_counts",
-            subset=subset,
-            normalize=normalize,
-            sort=sort,
-            ascending=ascending,
-            dropna=dropna,
-        )
-
     def where(
         self,
         cond,

modin/pandas/series.py

Lines changed: 15 additions & 2 deletions
@@ -1982,15 +1982,28 @@ def value_counts(
         """
         Return a Series containing counts of unique values.
         """
-        return self.__constructor__(
-            query_compiler=self._query_compiler.value_counts(
+        if bins is not None:
+            # Potentially we could implement `cut` function from pandas API, which
+            # bins values into intervals, and then we can just count them as regular values.
+            # TODO #1333: new_self = Series(pd.cut(self, bins, include_lowest=True), dtype="interval")
+            return self._default_to_pandas(
+                pandas.Series.value_counts,
                 normalize=normalize,
                 sort=sort,
                 ascending=ascending,
                 bins=bins,
                 dropna=dropna,
             )
+        counted_values = super(Series, self).value_counts(
+            subset=self,
+            normalize=normalize,
+            sort=sort,
+            ascending=ascending,
+            dropna=dropna,
         )
+        # pandas sets output index names to None because the Series name already contains it
+        counted_values._query_compiler.set_index_name(None)
+        return counted_values

     def view(self, dtype=None): # noqa: PR01, RT01, D200
         """

0 commit comments