pandas-dev · mroeschke · Feb 3, 2023 · Dec 28, 2022 · Jan 14, 2023 · Jan 14, 2023
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -1141,6 +1141,7 @@ Groupby/resample/rolling
 - Bug in :meth:`.SeriesGroupBy.value_counts` did not respect ``sort=False`` (:issue:`50482`)
 - Bug in :meth:`.DataFrameGroupBy.resample` raises ``KeyError`` when getting the result from a key list when resampling on time index (:issue:`50840`)
 - Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` would raise incorrectly when grouper had ``axis=1`` for ``"ngroup"`` argument (:issue:`45986`)
+- Bug in :meth:`.DataFrameGroupBy.describe` produced incorrect results when data had duplicate columns (:issue:`50806`)
 -
 
 Reshaping

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -610,7 +610,7 @@ def f(self):
 
 
 class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin):
-    _group_selection: IndexLabel | None = None
+    _group_selection: bool = False
     _hidden_attrs = PandasObject._hidden_attrs | {
         "as_index",
         "axis",
@@ -725,8 +725,8 @@ def _selected_obj(self):
         # Note: _selected_obj is always just `self.obj` for SeriesGroupBy
 
         if self._selection is None or isinstance(self.obj, Series):
-            if self._group_selection is not None:
-                return self.obj[self._group_selection]
+            if self._group_selection:
+                return self._obj_with_exclusions
             return self.obj
         else:
             return self.obj[self._selection]
@@ -1009,22 +1009,11 @@ def _set_group_selection(self) -> None:
 
         NOTE: this should be paired with a call to _reset_group_selection
         """
-        # This is a no-op for SeriesGroupBy
         grp = self.grouper
-        if (
-            grp.groupings is None
-            or self.obj.ndim == 1
-            or self._group_selection is not None
-        ):
+        if grp.groupings is None or self.obj.ndim == 1 or self._group_selection:
             return
-
-        groupers = self.exclusions
-
-        if len(groupers):
-            # GH12839 clear selected obj cache when group selection changes
-            ax = self.obj._info_axis
-            self._group_selection = ax.difference(Index(groupers), sort=False).tolist()
-            self._reset_cache("_selected_obj")
+        self._group_selection = True
+        self._reset_cache("_selected_obj")
 
     @final
     def _reset_group_selection(self) -> None:
@@ -1034,9 +1023,9 @@ def _reset_group_selection(self) -> None:
         Used for methods needing to return info on each group regardless of
         whether a group selection was previously set.
         """
-        if self._group_selection is not None:
+        if self._group_selection:
             # GH12839 clear cached selection too when changing group selection
-            self._group_selection = None
+            self._group_selection = False
             self._reset_cache("_selected_obj")
 
     @contextmanager

diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -1254,6 +1254,27 @@ def test_describe_with_duplicate_output_column_names(as_index, keys):
     tm.assert_frame_equal(result, expected)
 
 
+def test_describe_duplicate_columns():
+    # GH#50806 - describe gave incorrect results with duplicate column labels
+    df = DataFrame([[0, 1, 2, 3]])
+    df.columns = [0, 1, 2, 0]
+    gb = df.groupby(df[1])
+    result = gb.describe(percentiles=[])
+
+    columns = ["count", "mean", "std", "min", "50%", "max"]
+    frames = [
+        DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns)
+        for val in (0.0, 2.0, 3.0)
+    ]
+    expected = pd.concat(frames, axis=1)
+    expected.columns = MultiIndex(
+        levels=[[0, 2], columns],
+        codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))],
+    )
+    expected.index.names = [1]
+    tm.assert_frame_equal(result, expected)
+
+
 def test_groupby_mean_no_overflow():
     # Regression test for (#22487)
     df = DataFrame(
@@ -1594,3 +1615,29 @@ def test_multiindex_group_all_columns_when_empty(groupby_func):
     result = method(*args).index
     expected = df.index
     tm.assert_index_equal(result, expected)
+
+
+def test_duplicate_columns(request, groupby_func, as_index):
+    # GH#50806 - results with duplicate column labels should match unique ones
+    if groupby_func == "corrwith":
+        msg = "GH#50845 - corrwith fails when there are duplicate columns"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb"))
+    args = get_groupby_method_args(groupby_func, df)
+    gb = df.groupby("a", as_index=as_index)
+    result = getattr(gb, groupby_func)(*args)
+
+    if groupby_func in ("size", "ngroup", "cumcount"):
+        expected = getattr(
+            df.take([0, 1], axis=1).groupby("a", as_index=as_index), groupby_func
+        )(*args)
+        tm.assert_equal(result, expected)
+    else:
+        expected_df = df.copy()
+        expected_df.columns = ["a", "b", "c"]
+        expected_args = get_groupby_method_args(groupby_func, expected_df)
+        expected = getattr(expected_df.groupby("a", as_index=as_index), groupby_func)(
+            *expected_args
+        )
+        expected = expected.rename(columns={"c": "b"})
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -2898,3 +2898,14 @@ def test_groupby_reduce_period():
     expected = ser[:10]
     expected.index = Index(range(10), dtype=np.int_)
     tm.assert_series_equal(res, expected)
+
+
+def test_selected_obj_duplicate_columns():
+    # GH#50806 - group selection must keep every column sharing a duplicate label
+    df = DataFrame([[0, 1, 2, 3]])
+    df.columns = [0, 1, 2, 0]
+    gb = df.groupby(df[1])
+    with gb._group_selection_context():
+        result = gb._selected_obj
+    expected = df.take([0, 2, 3], axis=1)
+    tm.assert_frame_equal(result, expected)