From 448e5c20462b1c27177dc7b8bf701bf59eb512df Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 25 May 2023 09:13:25 -0700 Subject: [PATCH] DEPR: support axis=None in DataFrame reductions (#52042) * DEPR: support axis=None in DataFrame reductions * test, whatsnew * catch in apply(sum) * Fix defaults * catch warnings * dont check stacklevel * mypy fixup * catch warning --- doc/source/whatsnew/v0.15.1.rst | 1 + doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/frame.py | 10 ++--- pandas/core/generic.py | 39 +++++++++++++++---- pandas/tests/frame/test_npfuncs.py | 17 ++++++++ .../tests/groupby/aggregate/test_aggregate.py | 6 ++- pandas/tests/groupby/test_apply.py | 7 +++- pandas/tests/groupby/test_function.py | 6 ++- pandas/tests/groupby/test_groupby.py | 8 +++- pandas/tests/window/test_expanding.py | 7 +++- 10 files changed, 84 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.15.1.rst b/doc/source/whatsnew/v0.15.1.rst index 07139ebad8737b..09b59f35972cde 100644 --- a/doc/source/whatsnew/v0.15.1.rst +++ b/doc/source/whatsnew/v0.15.1.rst @@ -102,6 +102,7 @@ API changes current behavior: .. ipython:: python + :okwarning: gr.apply(sum) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 70fc0a2f23bb10..7db266d60df985 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -284,6 +284,7 @@ Deprecations - Deprecated :meth:`DataFrame.applymap`. Use the new :meth:`DataFrame.map` method instead (:issue:`52353`) - Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`) - Deprecated ``freq`` parameter in :class:`PeriodArray` constructor, pass ``dtype`` instead (:issue:`52462`) +- Deprecated behavior of :class:`DataFrame` reductions ``sum``, ``prod``, ``std``, ``var``, ``sem`` with ``axis=None``, in a future version this will operate over both axes returning a scalar instead of behaving like ``axis=0``; note this also affects numpy functions e.g. ``np.sum(df)`` (:issue:`21597`) - Deprecated behavior of :func:`concat` when :class:`DataFrame` has columns that are all-NA, in a future version these will not be discarded when determining the resulting dtype (:issue:`40893`) - Deprecated behavior of :meth:`Series.dt.to_pydatetime`, in a future version this will return a :class:`Series` containing python ``datetime`` objects instead of an ``ndarray`` of datetimes; this matches the behavior of other :meth:`Series.dt` properties (:issue:`20306`) - Deprecated logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``), wrap a sequence in a :class:`Series` or numpy array before operating instead (:issue:`51521`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c8486de9b2cb25..9894ca0da4ec1c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10981,7 +10981,7 @@ def max( @doc(make_doc("sum", ndim=2)) def sum( self, - axis: Axis | None = None, + axis: Axis | None = 0, skipna: bool = True, numeric_only: bool = False, min_count: int = 0, @@ -10993,7 +10993,7 @@ def sum( @doc(make_doc("prod", ndim=2)) def prod( self, - axis: Axis | None = None, + axis: Axis | None = 0, skipna: bool = True, numeric_only: bool = False, min_count: int = 0, @@ -11024,7 +11024,7 @@ def median( @doc(make_doc("sem", ndim=2)) def sem( self, - axis: Axis | None = None, + axis: Axis | None = 0, skipna: bool = True, ddof: int = 1, numeric_only: bool = False, @@ -11035,7 +11035,7 @@ def sem( @doc(make_doc("var", ndim=2)) def var( self, - axis: Axis | None = None, + axis: Axis | None = 0, skipna: bool = True, ddof: int = 1, numeric_only: bool = False, @@ -11046,7 +11046,7 @@ def var( @doc(make_doc("std", ndim=2)) def std( self, - axis: Axis | None = None, + axis: Axis | None = 0, skipna: bool = True, ddof: int = 1, numeric_only: bool = False, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 05b60cd278758a..bcfbfa1a2b7133 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11290,6 +11290,8 @@ def _logical_func( name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs ) return res._logical_func(name, func, skipna=skipna, **kwargs) + elif axis is None: + axis = 0 if ( self.ndim > 1 @@ -11394,7 +11396,7 @@ def _stat_function_ddof( self, name: str, func, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, ddof: int = 1, numeric_only: bool_t = False, @@ -11402,7 +11404,19 @@ def _stat_function_ddof( ) -> Series | float: nv.validate_stat_ddof_func((), kwargs, fname=name) validate_bool_kwarg(skipna, "skipna", none_allowed=False) + if axis is None: + if self.ndim > 1: + warnings.warn( + f"The behavior of {type(self).__name__}.{name} with axis=None " + "is deprecated, in a future version this will reduce over both " + "axes and return a scalar. To retain the old behavior, pass " + "axis=0 (or do not pass axis)", + FutureWarning, + stacklevel=find_stack_level(), + ) + axis = 0 + elif axis is lib.no_default: axis = 0 return self._reduce( @@ -11411,7 +11425,7 @@ def _stat_function_ddof( def sem( self, - axis: Axis | None = None, + axis: Axis | None = 0, skipna: bool_t = True, ddof: int = 1, numeric_only: bool_t = False, @@ -11423,7 +11437,7 @@ def sem( def var( self, - axis: Axis | None = None, + axis: Axis | None = 0, skipna: bool_t = True, ddof: int = 1, numeric_only: bool_t = False, @@ -11435,7 +11449,7 @@ def var( def std( self, - axis: Axis | None = None, + axis: Axis | None = 0, skipna: bool_t = True, ddof: int = 1, numeric_only: bool_t = False, @@ -11547,7 +11561,7 @@ def _min_count_stat_function( self, name: str, func, - axis: Axis | None = None, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, numeric_only: bool_t = False, min_count: int = 0, @@ -11559,6 +11573,17 @@ def _min_count_stat_function( validate_bool_kwarg(skipna, "skipna", none_allowed=False) if axis is None: + if self.ndim > 1: + warnings.warn( + f"The behavior of {type(self).__name__}.{name} with axis=None " + "is deprecated, in a future version this will reduce over both " + "axes and return a scalar. To retain the old behavior, pass " + "axis=0 (or do not pass axis)", + FutureWarning, + stacklevel=find_stack_level(), + ) + axis = 0 + elif axis is lib.no_default: axis = 0 return self._reduce( @@ -11572,7 +11597,7 @@ def _min_count_stat_function( def sum( self, - axis: Axis | None = None, + axis: Axis | None = 0, skipna: bool_t = True, numeric_only: bool_t = False, min_count: int = 0, @@ -11584,7 +11609,7 @@ def sum( def prod( self, - axis: Axis | None = None, + axis: Axis | None = 0, skipna: bool_t = True, numeric_only: bool_t = False, min_count: int = 0, diff --git a/pandas/tests/frame/test_npfuncs.py b/pandas/tests/frame/test_npfuncs.py index f671ee9d04d7fa..b734dafb6c31b6 100644 --- a/pandas/tests/frame/test_npfuncs.py +++ b/pandas/tests/frame/test_npfuncs.py @@ -27,6 +27,23 @@ def test_np_sqrt(self, float_frame): tm.assert_frame_equal(result, float_frame.apply(np.sqrt)) + def test_sum_deprecated_axis_behavior(self): + # GH#52042 deprecated behavior of df.sum(axis=None), which gets + # called when we do np.sum(df) + + arr = np.random.randn(4, 3) + df = DataFrame(arr) + + msg = "The behavior of DataFrame.sum with axis=None is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): + res = np.sum(df) + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.sum(axis=None) + tm.assert_series_equal(res, expected) + def test_np_ravel(self): # GH26247 arr = np.array( diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 185b50e9d7833b..3558377907931d 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -153,7 +153,11 @@ def test_agg_apply_corner(ts, tsframe): ) tm.assert_frame_equal(grouped.sum(), exp_df) tm.assert_frame_equal(grouped.agg(np.sum), exp_df) - tm.assert_frame_equal(grouped.apply(np.sum), exp_df) + + msg = "The behavior of DataFrame.sum with axis=None is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): + res = grouped.apply(np.sum) + tm.assert_frame_equal(res, exp_df) def test_agg_grouping_is_list_tuple(ts): diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index a077bd62927e68..a9912d75c89780 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1071,14 +1071,17 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): # Check output when no other methods are called before .apply() grp = df.groupby(by="a") - result = grp.apply(sum) + msg = "The behavior of DataFrame.sum with axis=None is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): + result = grp.apply(sum) tm.assert_frame_equal(result, expected) # Check output when another method is called before .apply() grp = df.groupby(by="a") args = get_groupby_method_args(reduction_func, df) _ = getattr(grp, reduction_func)(*args) - result = grp.apply(sum) + with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): + result = grp.apply(sum) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index cf4d8a9c879b65..98fce9d668e445 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -73,7 +73,11 @@ def test_builtins_apply(keys, f): gb = df.groupby(keys) fname = f.__name__ - result = gb.apply(f) + + warn = None if f is not sum else FutureWarning + msg = "The behavior of DataFrame.sum with axis=None is deprecated" + with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): + result = gb.apply(f) ngroups = len(df.drop_duplicates(subset=keys)) assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))" diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 583b66a99740d9..0c6661b49d9175 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -752,7 +752,13 @@ def test_groupby_as_index_agg(df): gr = df.groupby(ts) gr.nth(0) # invokes set_selection_from_grouper internally - tm.assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum)) + + msg = "The behavior of DataFrame.sum with axis=None is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): + res = gr.apply(sum) + with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): + alt = df.groupby(ts).apply(sum) + tm.assert_frame_equal(res, alt) for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]: gr = df.groupby(ts, as_index=False) diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 03af2e63da165b..bbcc260aa779ed 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -333,7 +333,12 @@ def test_expanding_func(func, static_comp, frame_or_series): result = getattr(obj, func)() assert isinstance(result, frame_or_series) - expected = static_comp(data[:11]) + msg = "The behavior of DataFrame.sum with axis=None is deprecated" + warn = None + if frame_or_series is DataFrame and static_comp is np.sum: + warn = FutureWarning + with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): + expected = static_comp(data[:11]) if frame_or_series is Series: tm.assert_almost_equal(result[10], expected) else: