Skip to content

Commit

Permalink
FEAT 5460: Make DataFrameGroupBy.value_counts more distributed (#3)
Browse files Browse the repository at this point in the history
* FIX-modin-project#4154: add value_counts method for SeriesGroupBy and DataFrameGroupBy (modin-project#5453)

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>

* WIP GroupBy

* Seemingly working w/ Ponder and Modin

* Maintain proper value ordering for single group-by operations

* linter updates from black on the files that have changed

* When normalize is used, or the groupby was performed with as_index=False, default to pandas.
With Ponder this currently results in a NotImplementedError.

* Remove _to_pandas() by implementing sort_index on Series on the service side

* | Object | call | Pushdown? | Compatibility |
| --- | --- | --- | --- |
| DataFrameGroupBy | value_counts() | Full | PASS |
| DataFrameGroupBy | value_counts(ascending=True) | Full | PASS |
| DataFrameGroupBy | value_counts(ascending=False) | Full | PASS |
| DataFrameGroupBy | value_counts(sort=False) | Full | PASS |
| DataFrameGroupBy | value_counts(sort=True) | Full | PASS |
| DataFrameGroupBy | value_counts(normalize=False) | Full | PASS |
| DataFrameGroupBy | value_counts(normalize=True) | | FAIL |
| DataFrame | groupby(as_index=False) | | FAIL |
| DataFrameGroupBy | value_counts(dropna=False) | Full | PASS |
| DataFrameGroupBy | value_counts(dropna=True) | | FAIL |

$\color{red}{\text{NOTE: For MultiIndex GroupBys the n+1 level index is ignored in the sorting.}}$

---------

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
Co-authored-by: Anatoly Myachev <anatoly.myachev@intel.com>
  • Loading branch information
2 people authored and vnlitvinov committed Mar 16, 2023
1 parent 1f29227 commit 60df3fa
Showing 1 changed file with 44 additions and 5 deletions.
49 changes: 44 additions & 5 deletions modin/pandas/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def skew(self, *args, **kwargs):
)

def ffill(self, limit=None):
return self.fillna(limit=limit, method='ffill')
return self.fillna(limit=limit, method="ffill")

def sem(self, ddof=1):
return self._wrap_aggregation(
Expand All @@ -235,14 +235,35 @@ def value_counts(
ascending: bool = False,
dropna: bool = True,
):
return self._default_to_pandas(
lambda df: df.value_counts(
# Compatibility Notes:
# dfGroupBy.value_counts is nearly semantically equivalent to
# df.value_counts([<by>, <other...>]).sort_index().
# It returns a MultiIndex Series which needs to be converted to
# pandas for sort_index.
#
# Semantic Exceptions:
# normalize does not work; it will return the normalized results
# across the entire dataframe, not within the sub levels.
# DataFrame.groupby(as_index=False) does not work. The default is True;
# calling this function will always result in a Series rather
# than a DataFrame.
#
if is_list_like(self._by):
subset = self._by
elif isinstance(self._by, type(self._query_compiler)):
subset = self._by.columns.values.tolist()
for c in self._columns.values.tolist():
if c not in subset:
subset.append(c)
return (
self._df.value_counts(
subset=subset,
normalize=normalize,
sort=sort,
ascending=ascending,
dropna=dropna,
)
.sort_index(level=0, sort_remaining=False)
)

def mean(self, numeric_only=None):
Expand Down Expand Up @@ -618,7 +639,7 @@ def cummin(self, axis=0, **kwargs):
)

def bfill(self, limit=None):
return self.fillna(limit=limit, method='bfill')
return self.fillna(limit=limit, method="bfill")

def idxmin(self):
    """
    Compute ``idxmin`` by defaulting to pandas.

    Returns
    -------
    object
        Whatever ``self._default_to_pandas`` returns after applying a
        callable that invokes ``idxmin()`` on the object it is given.
    """
    # No distributed implementation here: the whole operation is delegated
    # through the default-to-pandas path.
    return self._default_to_pandas(lambda df: df.idxmin())
Expand Down Expand Up @@ -787,7 +808,7 @@ def corrwith(self):
return self._default_to_pandas(lambda df: df.corrwith)

def pad(self, limit=None):
return self.fillna(limit=limit, method='pad')
return self.fillna(limit=limit, method="pad")

def max(self, numeric_only=False, min_count=-1):
return self._wrap_aggregation(
Expand Down Expand Up @@ -1453,6 +1474,24 @@ def _iter(self):
for k in (sorted(group_ids) if self._sort else group_ids)
)

def value_counts(
    self,
    normalize: bool = False,
    sort: bool = True,
    ascending: bool = False,
    bins=None,
    dropna: bool = True,
):
    """
    Count value occurrences by defaulting to pandas.

    Every argument is forwarded unchanged, by keyword, to the
    ``value_counts`` method of the object that ``_default_to_pandas``
    hands to the callable; see the pandas documentation for their exact
    semantics.

    Parameters
    ----------
    normalize : bool, default: False
        Forwarded as ``normalize``.
    sort : bool, default: True
        Forwarded as ``sort``.
    ascending : bool, default: False
        Forwarded as ``ascending``.
    bins : optional
        Forwarded as ``bins``.
    dropna : bool, default: True
        Forwarded as ``dropna``.

    Returns
    -------
    object
        Whatever ``self._default_to_pandas`` returns for the forwarded call.
    """
    # No distributed implementation for this variant: delegate the complete
    # call, with all keyword arguments intact, to the pandas fallback.
    return self._default_to_pandas(
        lambda ser: ser.value_counts(
            normalize=normalize,
            sort=sort,
            ascending=ascending,
            bins=bins,
            dropna=dropna,
        )
    )

def unique(self):
return self._check_index(
self._wrap_aggregation(
Expand Down

0 comments on commit 60df3fa

Please sign in to comment.