Commit ebd07dd

dchigarev and YarShev authored

FIX-#1294: fixed 'value_counts' implementation (#2730)

Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
Co-authored-by: Yaroslav Igoshev <Poolliver868@mail.ru>

1 parent dda6ebb commit ebd07dd

File tree

11 files changed: +176 −208 lines

modin/backends/base/query_compiler.py

Lines changed: 0 additions & 24 deletions
@@ -1293,30 +1293,6 @@ def searchsorted(self, **kwargs): # noqa: PR02

     # END Abstract map partitions operations

-    @doc_utils.add_one_column_warning
-    @doc_utils.add_refer_to("Series.value_counts")
-    def value_counts(self, **kwargs): # noqa: PR02
-        """
-        Count unique values of one-column `self`.
-
-        Parameters
-        ----------
-        normalize : bool
-        sort : bool
-        ascending : bool
-        bins : int, optional
-        dropna : bool
-        **kwargs : dict
-            Serves the compatibility purpose. Does not affect the result.
-
-        Returns
-        -------
-        BaseQueryCompiler
-            One-column QueryCompiler which index labels is a unique elements of `self`
-            and each row contains the number of times corresponding value was met in the `self`.
-        """
-        return SeriesDefault.register(pandas.Series.value_counts)(self, **kwargs)
-
     @doc_utils.add_refer_to("DataFrame.stack")
     def stack(self, level, dropna):
         """

modin/backends/pandas/query_compiler.py

Lines changed: 0 additions & 6 deletions
@@ -766,12 +766,6 @@ def reduce_fn(df, **kwargs):
             reduce_fn,
         )(self, axis=axis, **kwargs)

-    def value_counts(self, **kwargs):
-        def value_counts(df):
-            return df.squeeze(axis=1).value_counts(**kwargs).to_frame()
-
-        return self.default_to_pandas(value_counts)
-
     # END MapReduce operations

     # Reduction operations

modin/experimental/backends/omnisci/query_compiler.py

Lines changed: 0 additions & 18 deletions
@@ -415,24 +415,6 @@ def _agg(self, agg, axis=0, level=None, **kwargs):
         )
         return self.__constructor__(new_frame, shape_hint="row")

-    def value_counts(self, **kwargs):
-        subset = kwargs.get("subset", None)
-        normalize = kwargs.get("normalize", False)
-        sort = kwargs.get("sort", True)
-        ascending = kwargs.get("ascending", False)
-        bins = kwargs.get("bins", False)
-        dropna = kwargs.get("dropna", True)
-
-        if bins or normalize:
-            raise NotImplementedError(
-                "OmniSci's 'value_counts' does not support 'bins' and 'normalize' parameters."
-            )
-
-        new_frame = self._modin_frame.value_counts(
-            columns=subset, dropna=dropna, sort=sort, ascending=ascending
-        )
-        return self.__constructor__(new_frame, shape_hint="column")
-
     def _get_index(self):
         """
         Return frame's index.

modin/experimental/engines/omnisci_on_native/frame/data.py

Lines changed: 0 additions & 83 deletions
@@ -568,89 +568,6 @@ def agg(self, agg):
             force_execution_mode=self._force_execution_mode,
         )

-    def value_counts(self, dropna, columns, sort, ascending):
-        """
-        Count unique rows operation.
-
-        Parameters
-        ----------
-        dropna : bool
-            True when rows with NULLs should be ignored.
-        columns : list-like of str or None
-            Columns to use for unique combinations count. Use all
-            columns when None.
-        sort : bool
-            Sort by frequencies.
-        ascending : bool
-            Sort order.
-
-        Returns
-        -------
-        OmnisciOnNativeFrame
-            The new frame.
-        """
-        by = [col for col in self.columns if columns is None or col in columns]
-
-        if not by:
-            raise ValueError("invalid columns subset is specified")
-
-        base = self
-        if dropna:
-            checks = [base.ref(col).is_not_null() for col in by]
-            condition = (
-                checks[0]
-                if len(checks) == 1
-                else OpExpr("AND", [checks], np.dtype("bool"))
-            )
-            base = self.__constructor__(
-                columns=Index.__new__(Index, data=by, dtype="O"),
-                dtypes=base._dtypes[by],
-                op=FilterNode(base, condition),
-                index_cols=None,
-                force_execution_mode=base._force_execution_mode,
-            )
-
-        agg_exprs = OrderedDict()
-        agg_exprs[""] = AggregateExpr("size", None)
-        dtypes = base._dtypes[by].tolist()
-        dtypes.append(np.dtype("int64"))
-
-        new_columns = Index.__new__(Index, data=[""], dtype="O")
-
-        res = self.__constructor__(
-            columns=new_columns,
-            dtypes=dtypes,
-            op=GroupbyAggNode(base, by, agg_exprs, {"sort": False}),
-            index_cols=by.copy(),
-            force_execution_mode=base._force_execution_mode,
-        )
-
-        if sort or ascending:
-            res = self.__constructor__(
-                columns=res.columns,
-                dtypes=res._dtypes,
-                op=SortNode(res, [""], [ascending], "last"),
-                index_cols=res._index_cols,
-                force_execution_mode=res._force_execution_mode,
-            )
-
-        # If a single column is used then it keeps its name.
-        # TODO: move it to upper levels when index renaming is in place.
-        if len(by) == 1:
-            exprs = OrderedDict()
-            exprs["__index__"] = res.ref(by[0])
-            exprs[by[0]] = res.ref("")
-
-            res = self.__constructor__(
-                columns=Index.__new__(Index, data=by, dtype="O"),
-                dtypes=self._dtypes_for_exprs(exprs),
-                op=TransformNode(res, exprs),
-                index_cols=["__index__"],
-                force_execution_mode=res._force_execution_mode,
-            )
-
-        return res
-
     def fillna(self, value=None, method=None, axis=None, limit=None, downcast=None):
         """
         Replace NULLs operation.

modin/experimental/engines/omnisci_on_native/test/test_dataframe.py

Lines changed: 21 additions & 4 deletions
@@ -20,6 +20,7 @@

 from modin.config import IsExperimental, Engine, Backend
 from modin.pandas.test.utils import io_ops_bad_exc
+from pandas.core.dtypes.common import is_list_like

 IsExperimental.put(True)
 Engine.put("native")
@@ -35,6 +36,7 @@
     generate_multiindex,
     eval_general,
     eval_io,
+    df_equals_with_non_stable_indices,
 )

 from modin.experimental.engines.omnisci_on_native.frame.partition_manager import (
@@ -58,6 +60,7 @@ def run_and_compare(
     force_lazy=True,
     force_arrow_execute=False,
     allow_subqueries=False,
+    comparator=df_equals,
     **kwargs,
 ):
     def run_modin(
@@ -120,7 +123,7 @@ def run_modin(
         constructor_kwargs=constructor_kwargs,
         **kwargs,
     )
-    df_equals(ref_res, exp_res)
+    comparator(ref_res, exp_res)


 @pytest.mark.usefixtures("TestReadCSVFixture")
@@ -1164,21 +1167,35 @@ def apply(df, **kwargs):

         run_and_compare(apply, data=self.data, force_lazy=False)

-    @pytest.mark.parametrize("cols", ["a", "d"])
+    @pytest.mark.parametrize("data", [data, int_data], ids=["nan_data", "int_data"])
+    @pytest.mark.parametrize("cols", ["a", "d", ["a", "d"]])
     @pytest.mark.parametrize("dropna", [True, False])
     @pytest.mark.parametrize("sort", [True])
     @pytest.mark.parametrize("ascending", [True, False])
-    def test_value_counts(self, cols, dropna, sort, ascending):
+    def test_value_counts(self, data, cols, dropna, sort, ascending):
         def value_counts(df, cols, dropna, sort, ascending, **kwargs):
             return df[cols].value_counts(dropna=dropna, sort=sort, ascending=ascending)

+        if dropna and pandas.DataFrame(
+            data, columns=cols if is_list_like(cols) else [cols]
+        ).isna().any(axis=None):
+            pytest.xfail(
+                reason="'dropna' parameter is forcibly disabled in OmniSci's GroupBy"
+                "due to performance issues, you can track this problem at:"
+                "https://github.com/modin-project/modin/issues/2896"
+            )
+
+        # Custom comparator is required because pandas is inconsistent about
+        # the order of equal values, we can't match this behaviour. For more details:
+        # https://github.com/modin-project/modin/issues/1650
         run_and_compare(
             value_counts,
-            data=self.data,
+            data=data,
             cols=cols,
             dropna=dropna,
             sort=sort,
             ascending=ascending,
+            comparator=df_equals_with_non_stable_indices,
         )

     @pytest.mark.parametrize(
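
The comment added in the test refers to equal counts being emitted in a backend-dependent order. The real comparator is df_equals_with_non_stable_indices from modin.pandas.test.utils; the sketch below only illustrates the idea against plain pandas objects and is not that implementation (the helper name here is made up):

import pandas

def compare_ignoring_tie_order(expected: pandas.Series, actual: pandas.Series):
    # Align both results by index so rows whose counts tie can be matched up
    # regardless of which order either library produced them in.
    expected, actual = expected.sort_index(), actual.sort_index()
    assert expected.index.equals(actual.index)
    assert (expected.to_numpy() == actual.to_numpy()).all()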

modin/pandas/base.py

Lines changed: 33 additions & 1 deletion
@@ -41,7 +41,7 @@
     TimestampConvertibleTypes,
 )
 import re
-from typing import Optional, Union
+from typing import Optional, Union, Sequence, Hashable
 import warnings
 import pickle as pkl

@@ -2863,6 +2863,38 @@ def tz_localize(
         )
         return self.set_axis(labels=new_labels, axis=axis, inplace=not copy)

+    # TODO: uncomment the following lines when #3331 issue will be closed
+    # @prepend_to_notes(
+    #     """
+    #     In comparison with pandas, Modin's ``value_counts`` returns Series with ``MultiIndex``
+    #     only if multiple columns were passed via the `subset` parameter, otherwise, the resulted
+    #     Series's index will be a regular single dimensional ``Index``.
+    #     """
+    # )
+    # @_inherit_docstrings(pandas.DataFrame.value_counts, apilink="pandas.DataFrame.value_counts")
+    def value_counts(
+        self,
+        subset: Sequence[Hashable] = None,
+        normalize: bool = False,
+        sort: bool = True,
+        ascending: bool = False,
+        dropna: bool = True,
+    ):
+        if subset is None:
+            subset = self._query_compiler.columns
+        counted_values = self.groupby(by=subset, sort=False, dropna=dropna).size()
+        if sort:
+            counted_values.sort_values(ascending=ascending, inplace=True)
+        if normalize:
+            counted_values = counted_values / counted_values.sum()
+        # TODO: uncomment when strict compability mode will be implemented:
+        # https://github.com/modin-project/modin/issues/3411
+        # if STRICT_COMPABILITY and not isinstance(counted_values.index, MultiIndex):
+        #     counted_values.index = pandas.MultiIndex.from_arrays(
+        #         [counted_values.index], names=counted_values.index.names
+        #     )
+        return counted_values
+
     def var(
         self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
     ):
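
The new BasePandasDataset.value_counts counts unique rows through groupby().size() instead of defaulting to pandas. A minimal standalone sketch of the same logic written against plain pandas (the helper name is hypothetical; assumes pandas >= 1.1 for the groupby dropna argument):

import pandas

def value_counts_via_groupby(
    df: pandas.DataFrame,
    subset=None,
    normalize=False,
    sort=True,
    ascending=False,
    dropna=True,
):
    # Count unique rows with groupby().size() rather than a full default-to-pandas
    # value_counts, mirroring the approach the added method takes.
    if subset is None:
        subset = list(df.columns)
    counted = df.groupby(by=subset, sort=False, dropna=dropna).size()
    if sort:
        counted = counted.sort_values(ascending=ascending)
    if normalize:
        counted = counted / counted.sum()
    return counted

print(value_counts_via_groupby(pandas.DataFrame({"a": [1, 1, 2], "b": [3, 3, 4]})))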

modin/pandas/dataframe.py

Lines changed: 1 addition & 21 deletions
@@ -31,7 +31,7 @@
 import functools
 import numpy as np
 import sys
-from typing import IO, Optional, Sequence, Tuple, Union, Mapping, Iterator, Hashable
+from typing import IO, Optional, Tuple, Union, Mapping, Iterator
 import warnings

 from modin.error_message import ErrorMessage
@@ -2365,26 +2365,6 @@ def update(
         )
         self._update_inplace(new_query_compiler=query_compiler)

-    def value_counts(
-        self,
-        subset: Sequence[Hashable] = None,
-        normalize: bool = False,
-        sort: bool = True,
-        ascending: bool = False,
-        dropna: bool = True,
-    ): # noqa: PR01, RT01, D200
-        """
-        Return a ``Series`` containing counts of unique rows in the ``DataFrame``.
-        """
-        return self._default_to_pandas(
-            "value_counts",
-            subset=subset,
-            normalize=normalize,
-            sort=sort,
-            ascending=ascending,
-            dropna=dropna,
-        )
-
     def where(
         self,
         cond,

modin/pandas/series.py

Lines changed: 15 additions & 2 deletions
@@ -1982,15 +1982,28 @@ def value_counts(
         """
         Return a Series containing counts of unique values.
         """
-        return self.__constructor__(
-            query_compiler=self._query_compiler.value_counts(
+        if bins is not None:
+            # Potentially we could implement `cut` function from pandas API, which
+            # bins values into intervals, and then we can just count them as regular values.
+            # TODO #1333: new_self = Series(pd.cut(self, bins, include_lowest=True), dtype="interval")
+            return self._default_to_pandas(
+                pandas.Series.value_counts,
                 normalize=normalize,
                 sort=sort,
                 ascending=ascending,
                 bins=bins,
                 dropna=dropna,
             )
+        counted_values = super(Series, self).value_counts(
+            subset=self,
+            normalize=normalize,
+            sort=sort,
+            ascending=ascending,
+            dropna=dropna,
         )
+        # pandas sets output index names to None because the Series name already contains it
+        counted_values._query_compiler.set_index_name(None)
+        return counted_values

     def view(self, dtype=None): # noqa: PR01, RT01, D200
         """

0 commit comments