Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: GroupBy.value_counts sorting order #56016

Merged
merged 5 commits into from
Nov 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,9 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`)
- Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_count` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`56007`)
- Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_count` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`56007`)
- Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_count` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`56007`)

Reshaping
^^^^^^^^^
Expand Down
27 changes: 18 additions & 9 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2821,11 +2821,27 @@ def _value_counts(
for grouping in groupings
):
levels_list = [ping.result_index for ping in groupings]
multi_index, _ = MultiIndex.from_product(
multi_index = MultiIndex.from_product(
levels_list, names=[ping.name for ping in groupings]
).sortlevel()
)
result_series = result_series.reindex(multi_index, fill_value=0)

if sort:
# Sort by the values
result_series = result_series.sort_values(
ascending=ascending, kind="stable"
)
if self.sort:
# Sort by the groupings
names = result_series.index.names
# GH#56007 - Temporarily replace names in case they are integers
result_series.index.names = range(len(names))
index_level = list(range(len(self.grouper.groupings)))
result_series = result_series.sort_index(
level=index_level, sort_remaining=False
)
result_series.index.names = names

if normalize:
# Normalize the results by dividing by the original group sizes.
# We are guaranteed to have the first N levels be the
Expand All @@ -2845,13 +2861,6 @@ def _value_counts(
# Handle groups of non-observed categories
result_series = result_series.fillna(0.0)

if sort:
# Sort the values and then resort by the main grouping
index_level = range(len(self.grouper.groupings))
result_series = result_series.sort_values(ascending=ascending).sort_index(
level=index_level, sort_remaining=False
)

result: Series | DataFrame
if self.as_index:
result = result_series
Expand Down
82 changes: 72 additions & 10 deletions pandas/tests/groupby/methods/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,8 +385,8 @@ def test_against_frame_and_seriesgroupby(
"sort, ascending, expected_rows, expected_count, expected_group_size",
[
(False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]),
(True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]),
(True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]),
(True, False, [3, 0, 1, 2, 4], [2, 1, 1, 1, 1], [3, 1, 3, 1, 1]),
(True, True, [0, 1, 2, 4, 3], [1, 1, 1, 1, 2], [1, 3, 1, 1, 3]),
],
)
def test_compound(
Expand Down Expand Up @@ -811,19 +811,19 @@ def test_categorical_single_grouper_observed_false(
("FR", "female", "high"),
("FR", "male", "medium"),
("FR", "female", "low"),
("FR", "male", "high"),
("FR", "female", "medium"),
("FR", "male", "high"),
("US", "female", "high"),
("US", "male", "low"),
("US", "male", "medium"),
("US", "male", "high"),
("US", "female", "medium"),
("US", "female", "low"),
("ASIA", "male", "low"),
("ASIA", "male", "high"),
("ASIA", "female", "medium"),
("ASIA", "female", "low"),
("US", "female", "medium"),
("US", "male", "high"),
("US", "male", "medium"),
("ASIA", "female", "high"),
("ASIA", "female", "low"),
("ASIA", "female", "medium"),
("ASIA", "male", "high"),
("ASIA", "male", "low"),
("ASIA", "male", "medium"),
]

Expand Down Expand Up @@ -1177,3 +1177,65 @@ def test_value_counts_integer_columns():
{1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"], "count": 1}
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("vc_sort", [True, False])
@pytest.mark.parametrize("normalize", [True, False])
def test_value_counts_sort(sort, vc_sort, normalize):
# GH#55951
df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]})
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you also tests with Categorical?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks - good call. I added a test and found a bug calling .sortlevel() in the categorical section. But the categorical behavior is still buggy here; namely

multi_index, _ = MultiIndex.from_product(
levels_list, names=[ping.name for ping in groupings]
).sortlevel()

is constructing the index by taking the cartesian product across all groupings; it should only be doing this for the categorical variables. That's not an easy fix, but will be handled in #55738 where we can just remove that whole block of code.

gb = df.groupby("a", sort=sort)
result = gb.value_counts(sort=vc_sort, normalize=normalize)

if normalize:
values = [2 / 3, 1 / 3, 1.0]
else:
values = [2, 1, 1]
index = MultiIndex(
levels=[[1, 2], [3, 4]], codes=[[0, 0, 1], [0, 1, 0]], names=["a", 0]
)
expected = Series(values, index=index, name="proportion" if normalize else "count")
if sort and vc_sort:
taker = [0, 1, 2]
elif sort and not vc_sort:
taker = [0, 1, 2]
elif not sort and vc_sort:
taker = [0, 2, 1]
else:
taker = [2, 1, 0]
expected = expected.take(taker)

tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("vc_sort", [True, False])
@pytest.mark.parametrize("normalize", [True, False])
def test_value_counts_sort_categorical(sort, vc_sort, normalize):
# GH#55951
df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}, dtype="category")
gb = df.groupby("a", sort=sort, observed=True)
result = gb.value_counts(sort=vc_sort, normalize=normalize)

if normalize:
values = [2 / 3, 1 / 3, 1.0, 0.0]
else:
values = [2, 1, 1, 0]
name = "proportion" if normalize else "count"
expected = DataFrame(
{
"a": Categorical([1, 1, 2, 2]),
0: Categorical([3, 4, 3, 4]),
name: values,
}
).set_index(["a", 0])[name]
if sort and vc_sort:
taker = [0, 1, 2, 3]
elif sort and not vc_sort:
taker = [0, 1, 2, 3]
elif not sort and vc_sort:
taker = [0, 2, 1, 3]
else:
taker = [2, 3, 0, 1]
expected = expected.take(taker)

tm.assert_series_equal(result, expected)
Loading