From f598670353311a6fff4e6e1e96074ccf0737e6b7 Mon Sep 17 00:00:00 2001 From: Petroncini <59212480+Petroncini@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:33:42 -0300 Subject: [PATCH] BUG: groupby().any() returns true for groups with timedelta all NaT (#59782) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/groupby/ops.py | 8 +++++--- pandas/tests/groupby/test_grouping.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 41ba80989a0ce..6ebb51cd3ef89 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -652,6 +652,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`.DataFrameGroupBy.__len__` and :meth:`.SeriesGroupBy.__len__` would raise when the grouping contained NA values and ``dropna=False`` (:issue:`58644`) +- Bug in :meth:`.DataFrameGroupBy.any` and :meth:`.SeriesGroupBy.any` that returned ``True`` for groups where all Timedelta values are ``NaT`` (:issue:`59712`) - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`) - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. 
(:issue:`57926`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index da80969b613cd..0e99178642715 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -371,6 +371,10 @@ def _call_cython_op( is_datetimelike = dtype.kind in "mM" + if self.how in ["any", "all"]: + if mask is None: + mask = isna(values) + if is_datetimelike: values = values.view("int64") is_numeric = True @@ -380,12 +384,10 @@ def _call_cython_op( values = values.astype(np.float32) if self.how in ["any", "all"]: - if mask is None: - mask = isna(values) if dtype == object: if kwargs["skipna"]: # GH#37501: don't raise on pd.NA when skipna=True - if mask.any(): + if mask is not None and mask.any(): # mask on original values computed separately values = values.copy() values[mask] = True diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index fc2a8a970010a..6bb2eaf89b5d7 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -1180,3 +1180,15 @@ def test_grouping_by_key_is_in_axis(): result = gb.sum() expected = DataFrame({"a": [1, 2], "b": [1, 2], "c": [7, 5]}) tm.assert_frame_equal(result, expected) + + +def test_groupby_any_with_timedelta(): + # GH#59712 + df = DataFrame({"value": [pd.Timedelta(1), pd.NaT]}) + + result = df.groupby(np.array([0, 1], dtype=np.int64))["value"].any() + + expected = Series({0: True, 1: False}, name="value", dtype=bool) + expected.index = expected.index.astype(np.int64) + + tm.assert_series_equal(result, expected)