Skip to content

BUG: groupby.describe on a frame with duplicate column names #50846

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 34 commits into from
Feb 3, 2023
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
aa9c9e1
REF: groupby Series selection with as_index=False
rhshadrach Dec 28, 2022
7d00d07
GH#
rhshadrach Jan 14, 2023
fd62b4e
Merge branch 'main' of https://github.com/pandas-dev/pandas into seri…
rhshadrach Jan 14, 2023
c0891db
Merge branch 'main' into series_as_index_false
rhshadrach Jan 14, 2023
6bcfb12
Merge branch 'main' of https://github.com/pandas-dev/pandas into seri…
rhshadrach Jan 16, 2023
41399ad
type-hinting fixes
rhshadrach Jan 16, 2023
c26957d
WIP
rhshadrach Jan 17, 2023
f2b538e
Merge branch 'main' of https://github.com/pandas-dev/pandas into owe_…
rhshadrach Jan 17, 2023
1860c4d
WIP
rhshadrach Jan 18, 2023
e42e222
WIP
rhshadrach Jan 18, 2023
0bdf009
BUG: groupby.describe on a frame with duplicate column names
rhshadrach Dec 28, 2022
185e4f8
cleanup
rhshadrach Jan 18, 2023
d2b965f
test fixup
rhshadrach Jan 19, 2023
932e3c8
Fix type-hint for _group_selection
rhshadrach Jan 19, 2023
5139df8
Merge branch 'main' of https://github.com/pandas-dev/pandas into grou…
rhshadrach Jan 19, 2023
8f132cd
Merge branch 'groupby_select_obj_dup_cols' of https://github.com/rhsh…
rhshadrach Jan 20, 2023
eeea6fc
Merge branch 'groupby_select_obj_dup_cols' of https://github.com/rhsh…
rhshadrach Jan 20, 2023
feb6661
Merge branch 'main' of https://github.com/pandas-dev/pandas into grou…
rhshadrach Jan 20, 2023
83f12b7
Speedup
rhshadrach Jan 20, 2023
c37a1ab
refinement
rhshadrach Jan 20, 2023
973b893
Merge branch 'main' into groupby_select_obj_dup_cols
rhshadrach Jan 24, 2023
78a3d5f
Merge branch 'main' of https://github.com/pandas-dev/pandas into grou…
rhshadrach Jan 25, 2023
4dafe5a
cleanup, faster implementation
rhshadrach Jan 25, 2023
0959c1b
Merge branch 'main' into groupby_select_obj_dup_cols
rhshadrach Jan 29, 2023
2fc97b2
Merge branch 'main' into groupby_select_obj_dup_cols
rhshadrach Jan 30, 2023
d5df78c
Make group_selection a Boolean flag
rhshadrach Jan 31, 2023
62bb1fb
Merge branch 'groupby_select_obj_dup_cols' of https://github.com/rhsh…
rhshadrach Jan 31, 2023
8d6df54
Avoid resetting cache
rhshadrach Jan 31, 2023
62540af
Improve test
rhshadrach Feb 1, 2023
f7a6973
Merge branch 'main' of https://github.com/pandas-dev/pandas into grou…
rhshadrach Feb 1, 2023
615d9c6
Merge branch 'main' of https://github.com/pandas-dev/pandas into grou…
rhshadrach Feb 1, 2023
88a9ec9
Merge branch 'main' of https://github.com/pandas-dev/pandas into grou…
rhshadrach Feb 3, 2023
359d7ff
Rework test
rhshadrach Feb 3, 2023
d1d2610
Merge branch 'groupby_select_obj_dup_cols' of https://github.com/rhsh…
rhshadrach Feb 3, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1141,6 +1141,7 @@ Groupby/resample/rolling
- Bug in :meth:`.SeriesGroupBy.value_counts` did not respect ``sort=False`` (:issue:`50482`)
- Bug in :meth:`.DataFrameGroupBy.resample` raises ``KeyError`` when getting the result from a key list when resampling on time index (:issue:`50840`)
- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` would raise incorrectly when grouper had ``axis=1`` for ``"ngroup"`` argument (:issue:`45986`)
- Bug in :meth:`.DataFrameGroupBy.describe` produced incorrect results when data had duplicate columns (:issue:`50806`)
-

Reshaping
Expand Down
27 changes: 8 additions & 19 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -610,7 +610,7 @@ def f(self):


class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin):
_group_selection: IndexLabel | None = None
_group_selection: bool = False
_hidden_attrs = PandasObject._hidden_attrs | {
"as_index",
"axis",
Expand Down Expand Up @@ -725,8 +725,8 @@ def _selected_obj(self):
# Note: _selected_obj is always just `self.obj` for SeriesGroupBy

if self._selection is None or isinstance(self.obj, Series):
if self._group_selection is not None:
return self.obj[self._group_selection]
if self._group_selection:
return self._obj_with_exclusions
return self.obj
else:
return self.obj[self._selection]
Expand Down Expand Up @@ -1009,22 +1009,11 @@ def _set_group_selection(self) -> None:

NOTE: this should be paired with a call to _reset_group_selection
"""
# This is a no-op for SeriesGroupBy
grp = self.grouper
if (
grp.groupings is None
or self.obj.ndim == 1
or self._group_selection is not None
):
if grp.groupings is None or self.obj.ndim == 1 or self._group_selection:
return

groupers = self.exclusions

if len(groupers):
# GH12839 clear selected obj cache when group selection changes
ax = self.obj._info_axis
self._group_selection = ax.difference(Index(groupers), sort=False).tolist()
self._reset_cache("_selected_obj")
self._group_selection = True
self._reset_cache("_selected_obj")

@final
def _reset_group_selection(self) -> None:
Expand All @@ -1034,9 +1023,9 @@ def _reset_group_selection(self) -> None:
Used for methods needing to return info on each group regardless of
whether a group selection was previously set.
"""
if self._group_selection is not None:
if self._group_selection:
# GH12839 clear cached selection too when changing group selection
self._group_selection = None
self._group_selection = False
self._reset_cache("_selected_obj")

@contextmanager
Expand Down
47 changes: 47 additions & 0 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -1254,6 +1254,27 @@ def test_describe_with_duplicate_output_column_names(as_index, keys):
tm.assert_frame_equal(result, expected)


def test_describe_duplicate_columns():
# GH#50806
# Regression test: groupby.describe on a frame whose column labels contain
# a duplicate (label 0 is used for both the first and the last column).
df = DataFrame([[0, 1, 2, 3]])
df.columns = [0, 1, 2, 0]
# Group by the Series taken from the column labelled 1 (passed as a Series,
# not as a column label).
gb = df.groupby(df[1])
# No extra percentiles are requested; the expected columns below still
# include "50%".
result = gb.describe(percentiles=[])

columns = ["count", "mean", "std", "min", "50%", "max"]
# One single-row block of statistics per remaining column value
# (0.0, 2.0, 3.0): count is 1.0, std is NaN, and mean/min/50%/max all equal
# the value itself. The row is indexed by the group key 1.
frames = [
DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns)
for val in (0.0, 2.0, 3.0)
]
expected = pd.concat(frames, axis=1)
# Outer-level codes 0, 1, 0 (six entries each) map to labels 0, 2, 0, so the
# duplicated label 0 appears twice in the expected columns; the inner level
# cycles through the six statistic names for each block.
expected.columns = MultiIndex(
levels=[[0, 2], columns],
codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))],
)
# The expected index is named 1.
expected.index.names = [1]
tm.assert_frame_equal(result, expected)


def test_groupby_mean_no_overflow():
# Regression test for (#22487)
df = DataFrame(
Expand Down Expand Up @@ -1594,3 +1615,29 @@ def test_multiindex_group_all_columns_when_empty(groupby_func):
result = method(*args).index
expected = df.index
tm.assert_index_equal(result, expected)


def test_duplicate_columns(request, groupby_func, as_index):
# GH#50806
if groupby_func == "corrwith":
msg = "GH#50845 - corrwith fails when there are duplicate columns"
request.node.add_marker(pytest.mark.xfail(reason=msg))
df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb"))
args = get_groupby_method_args(groupby_func, df)
gb = df.groupby("a", as_index=as_index)
result = getattr(gb, groupby_func)(*args)

if groupby_func in ("size", "ngroup", "cumcount"):
expected = getattr(
df.take([0, 1], axis=1).groupby("a", as_index=as_index), groupby_func
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick: can you avoid chaining take/groupby/getattr here (and in L1639)? It's easier to grok if something goes wrong.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

)(*args)
tm.assert_equal(result, expected)
else:
expected_df = df.copy()
expected_df.columns = ["a", "b", "c"]
expected_args = get_groupby_method_args(groupby_func, expected_df)
expected = getattr(expected_df.groupby("a", as_index=as_index), groupby_func)(
*expected_args
)
expected = expected.rename(columns={"c": "b"})
tm.assert_frame_equal(result, expected)
11 changes: 11 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2898,3 +2898,14 @@ def test_groupby_reduce_period():
expected = ser[:10]
expected.index = Index(range(10), dtype=np.int_)
tm.assert_series_equal(res, expected)


def test_selected_obj_duplicate_columns():
# GH#50806
# Checks _selected_obj inside the group-selection context when the frame
# has a duplicated column label (0 appears at positions 0 and 3).
df = DataFrame([[0, 1, 2, 3]])
df.columns = [0, 1, 2, 0]
# Group by the Series taken from the column labelled 1.
gb = df.groupby(df[1])
with gb._group_selection_context():
result = gb._selected_obj
# Expected: the columns at positions 0, 2 and 3, i.e. every column except
# the one at position 1, with both columns labelled 0 retained.
expected = df.take([0, 2, 3], axis=1)
tm.assert_frame_equal(result, expected)