From 1186ee0080a43a08c3431a5bfa86898f6ec3b54d Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sat, 1 Jul 2023 08:36:10 +0200
Subject: [PATCH 01/35] DOC/WEB: update Google Analytics id (#53954)

---
 doc/source/conf.py                |  2 +-
 web/pandas/_templates/layout.html | 13 ++++++-------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index 66fca61c2c6e5..31893bdf929d8 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -240,7 +240,7 @@
     "footer_start": ["pandas_footer", "sphinx-version"],
     "github_url": "https://github.com/pandas-dev/pandas",
     "twitter_url": "https://twitter.com/pandas_dev",
-    "analytics": {"google_analytics_id": "UA-27880019-2"},
+    "analytics": {"google_analytics_id": "G-5RE31C1RNW"},
     "logo": {"image_dark": "https://pandas.pydata.org/static/img/pandas_white.svg"},
     "navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"],
     "switcher": {
diff --git a/web/pandas/_templates/layout.html b/web/pandas/_templates/layout.html
index 94c0493234cf2..d9824f4641667 100644
--- a/web/pandas/_templates/layout.html
+++ b/web/pandas/_templates/layout.html
@@ -1,13 +1,12 @@
 [hunk body lost to HTML stripping during extraction; of the markup changed
 here only the page title "pandas - Python Data Analysis Library" survives]

From c4c6ae97a91af11e6744991cee4e22f588337c84 Mon Sep 17 00:00:00 2001
From: Marco Edward Gorelli
Date: Sun, 2 Jul 2023 21:24:27 +0100
Subject: [PATCH 02/35] DOC: fix pandas-coverage link (#53971)

fix pandas-coverage link
---
 doc/source/development/contributing_codebase.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
index 4631e5dc8d0ca..f3ff5b70d4aac 100644
--- a/doc/source/development/contributing_codebase.rst
+++ b/doc/source/development/contributing_codebase.rst
@@ -764,7 +764,7 @@ install pandas) by typing::
 your installation is probably fine and you can start contributing!

 Often it is worth running only a subset of tests first around your changes before running the
-entire suite (tip: you can use the [pandas-coverage app](https://pandas-coverage.herokuapp.com/))
+entire suite (tip: you can use the [pandas-coverage app](https://pandas-coverage-12d2130077bc.herokuapp.com/))
 to find out which tests hit the lines of code you've modified, and then run only those).
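A minimal, illustrative sketch of the targeted-test workflow this tip describes; the test path and ``-k`` expression below are assumptions for illustration, not values taken from the patch::

    # Hypothetical focused run: substitute whatever test files the
    # coverage app reports for your change before launching the full suite.
    import pytest

    pytest.main(["pandas/tests/frame/methods/test_fillna.py", "-k", "fillna", "-q"])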
The easiest way to do this is with:: From 4da9cb69f84641cbcad837cccfe7102233dd6463 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 2 Jul 2023 14:27:54 -0700 Subject: [PATCH 03/35] PERF: ffill/bfill with non-numpy dtypes (#53950) --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/_libs/algos.pyi | 4 +++ pandas/_libs/algos.pyx | 36 +++++++++++++++++++++++++++ pandas/core/arrays/arrow/array.py | 3 --- pandas/core/arrays/base.py | 19 ++++++++++---- pandas/tests/extension/test_arrow.py | 7 ------ pandas/tests/extension/test_string.py | 11 -------- 7 files changed, 55 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index ebbdbcb0f61f5..6390fbeed8548 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -342,6 +342,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) - Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) +- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with pyarrow dtypes (:issue:`53950`) - Performance improvement in :meth:`Series.str.get_dummies` for pyarrow-backed strings (:issue:`53655`) - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) - Performance improvement in :meth:`Series.str.split` with ``expand=True`` for pyarrow-backed strings (:issue:`53585`) diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index 20a805533e8cc..cbbe418c8ab48 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -60,6 +60,10 @@ def nancorr_spearman( # ---------------------------------------------------------------------- def validate_limit(nobs: int | None, limit=...) -> int: ... +def get_fill_indexer( + mask: npt.NDArray[np.bool_], + limit: int | None = None, +) -> npt.NDArray[np.intp]: ... def pad( old: np.ndarray, # ndarray[numeric_object_t] new: np.ndarray, # ndarray[numeric_object_t] diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 854eacc5e1df5..0b6ea58f987d4 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -525,6 +525,42 @@ def validate_limit(nobs: int | None, limit=None) -> int: return lim +# TODO: overlap with libgroupby.group_fillna_indexer? +@cython.boundscheck(False) +@cython.wraparound(False) +def get_fill_indexer(const uint8_t[:] mask, limit=None): + """ + Find an indexer to use for ffill to `take` on the array being filled. + """ + cdef: + ndarray[intp_t, ndim=1] indexer + Py_ssize_t i, N = len(mask), last_valid + int lim + + # fill_count is the number of consecutive NAs we have seen. + # If it exceeds the given limit, we stop padding. 
+ int fill_count = 0 + + lim = validate_limit(N, limit) + indexer = np.empty(N, dtype=np.intp) + + last_valid = -1 # haven't yet seen anything non-NA + + for i in range(N): + if not mask[i]: + indexer[i] = i + last_valid = i + fill_count = 0 + else: + if fill_count < lim: + indexer[i] = last_valid + else: + indexer[i] = -1 + fill_count += 1 + + return indexer + + @cython.boundscheck(False) @cython.wraparound(False) def pad( diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 85a75fff25ebd..17120d0de5c5f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -67,8 +67,6 @@ from pandas.core.dtypes.dtypes import ArrowDtype - from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning - ARROW_CMP_FUNCS = { "eq": pc.equal, "ne": pc.not_equal, @@ -918,7 +916,6 @@ def fillna( return super().fillna(value=value, method=method, limit=limit) if method is not None: - fallback_performancewarning() return super().fillna(value=value, method=method, limit=limit) if isinstance(value, (np.ndarray, ExtensionArray)): diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index ceac8e22426d9..64f917a419391 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -23,7 +23,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + algos as libalgos, + lib, +) from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -824,10 +827,16 @@ def fillna( if mask.any(): if method is not None: - func = missing.get_fill_func(method) - npvalues = self.astype(object) - func(npvalues, limit=limit, mask=mask) - new_values = self._from_sequence(npvalues, dtype=self.dtype) + meth = missing.clean_fill_method(method) + + npmask = np.asarray(mask) + if meth == "pad": + indexer = libalgos.get_fill_indexer(npmask, limit=limit) + return self.take(indexer, allow_fill=True) + else: + # i.e. 
meth == "backfill" + indexer = libalgos.get_fill_indexer(npmask[::-1], limit=limit)[::-1] + return self[::-1].take(indexer, allow_fill=True) else: # fill with value new_values = self.copy() diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index abcca16340365..56e35d30ad83c 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -38,7 +38,6 @@ pa_version_under9p0, pa_version_under11p0, ) -from pandas.errors import PerformanceWarning from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -698,12 +697,6 @@ def test_fillna_no_op_returns_copy(self, data): assert result is not data self.assert_extension_array_equal(result, data) - def test_fillna_series_method(self, data_missing, fillna_method): - with tm.maybe_produces_warning( - PerformanceWarning, fillna_method is not None, check_stacklevel=False - ): - super().test_fillna_series_method(data_missing, fillna_method) - class TestBasePrinting(base.BasePrintingTests): pass diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 1f39e8e9b450e..eb166691d3314 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -18,10 +18,7 @@ import numpy as np import pytest -from pandas.errors import PerformanceWarning - import pandas as pd -import pandas._testing as tm from pandas.api.types import is_string_dtype from pandas.core.arrays import ArrowStringArray from pandas.core.arrays.string_ import StringDtype @@ -169,14 +166,6 @@ def test_fillna_no_op_returns_copy(self, data): assert result is not data self.assert_extension_array_equal(result, data) - def test_fillna_series_method(self, data_missing, fillna_method): - with tm.maybe_produces_warning( - PerformanceWarning, - fillna_method is not None and data_missing.dtype.storage == "pyarrow", - check_stacklevel=False, - ): - super().test_fillna_series_method(data_missing, fillna_method) - class TestNoReduce(base.BaseNoReduceTests): @pytest.mark.parametrize("skipna", [True, False]) From 866a388412067f39e5be9ba75d0fbeff10ae5fd7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 3 Jul 2023 22:07:38 +0200 Subject: [PATCH 04/35] ENH: Don't fragment manager if convert is no-op (#53977) --- pandas/core/internals/blocks.py | 10 +++++++++- pandas/tests/frame/methods/test_replace.py | 7 +++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1d572dbfd5386..8923faf444953 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -480,7 +480,15 @@ def convert( return [self.copy()] if copy else [self] if self.ndim != 1 and self.shape[0] != 1: - return self.split_and_operate(Block.convert, copy=copy, using_cow=using_cow) + blocks = self.split_and_operate( + Block.convert, copy=copy, using_cow=using_cow + ) + if all(blk.dtype.kind == "O" for blk in blocks): + # Avoid fragmenting the block if convert is a no-op + if using_cow: + return [self.copy(deep=False)] + return [self.copy()] if copy else [self] + return blocks values = self.values if values.ndim == 2: diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 9256df72cdf7b..1846ac24e9cc5 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1592,3 +1592,10 @@ def test_replace_categorical_no_replacement(self): result = df.replace(to_replace=[".", 
"def"], value=["_", None]) tm.assert_frame_equal(result, expected) + + def test_replace_object_splitting(self): + # GH#53977 + df = DataFrame({"a": ["a"], "b": "b"}) + assert len(df._mgr.blocks) == 1 + df.replace(to_replace=r"^\s*$", value="", inplace=True, regex=True) + assert len(df._mgr.blocks) == 1 From ecc46c382a2bb1a7dfb783674064fc0c9d269ea2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 4 Jul 2023 20:53:09 +0200 Subject: [PATCH 05/35] DEPR: start with Deprecation instead of FutureWarning for NDFrame._data (#53994) --- pandas/core/generic.py | 2 +- pandas/tests/frame/test_api.py | 2 +- pandas/tests/generic/test_generic.py | 2 +- pandas/tests/series/test_api.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b806ddbaa89ba..f049e9d479b26 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -506,7 +506,7 @@ def _data(self): warnings.warn( f"{type(self).__name__}._data is deprecated and will be removed in " "a future version. Use public APIs instead.", - FutureWarning, + DeprecationWarning, stacklevel=find_stack_level(), ) return self._mgr diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index e0d9d6c281fd5..7cf1c56d9342e 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -379,6 +379,6 @@ def test_inspect_getmembers(self): df = DataFrame() msg = "DataFrame._data is deprecated" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + DeprecationWarning, match=msg, check_stacklevel=False ): inspect.getmembers(df) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index acc1a8c2e1d05..6226f97c73f92 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -307,7 +307,7 @@ def test_copy_and_deepcopy(self, frame_or_series, shape, func): def test_data_deprecated(self, frame_or_series): obj = frame_or_series() msg = "(Series|DataFrame)._data is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(DeprecationWarning, match=msg): mgr = obj._data assert mgr is obj._mgr diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index e4e276af121f9..7d70206585be4 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -172,7 +172,7 @@ def test_inspect_getmembers(self): ser = Series(dtype=object) msg = "Series._data is deprecated" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + DeprecationWarning, match=msg, check_stacklevel=False ): inspect.getmembers(ser) From 1d7cab61629e507eb28c42b48a4729d7c47321ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?= Date: Wed, 5 Jul 2023 09:36:31 +0200 Subject: [PATCH 06/35] DOC: Fixing EX01 - Added examples (#53948) * Examples Rolling.max, cov, skew, apply * correct skew * Changing format skew * Trying to fix skew * Removed replace() * Remove replace() correct doc.py links --- ci/code_checks.sh | 4 --- pandas/core/window/rolling.py | 63 ++++++++++++++++++++++++++++++++--- 2 files changed, 59 insertions(+), 8 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a67dc66b26d34..7a5dbb45636d2 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -110,10 +110,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas_object \ pandas.api.interchange.from_dataframe \ pandas.DatetimeIndex.snap \ - 
pandas.core.window.rolling.Rolling.max \ - pandas.core.window.rolling.Rolling.cov \ - pandas.core.window.rolling.Rolling.skew \ - pandas.core.window.rolling.Rolling.apply \ pandas.core.window.rolling.Window.mean \ pandas.core.window.rolling.Window.sum \ pandas.core.window.rolling.Window.var \ diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f4d733423b3ae..9778651814b23 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1900,7 +1900,19 @@ def count(self, numeric_only: bool = False): create_section_header("Returns"), template_returns, create_section_header("See Also"), - template_see_also[:-1], + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 6, 5, 4]) + >>> ser.rolling(2).apply(lambda s: s.sum() - s.min()) + 0 NaN + 1 6.0 + 2 6.0 + 3 5.0 + dtype: float64 + """ + ), window_method="rolling", aggregation_description="custom aggregation function", agg_method="apply", @@ -2008,7 +2020,19 @@ def sum( create_section_header("See Also"), template_see_also, create_section_header("Notes"), - numba_notes[:-1], + numba_notes, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4]) + >>> ser.rolling(2).max() + 0 NaN + 1 2.0 + 2 3.0 + 3 4.0 + dtype: float64 + """ + ), window_method="rolling", aggregation_description="maximum", agg_method="max", @@ -2288,7 +2312,25 @@ def var( "scipy.stats.skew : Third moment of a probability density.\n", template_see_also, create_section_header("Notes"), - "A minimum of three periods is required for the rolling calculation.\n", + dedent( + """ + A minimum of three periods is required for the rolling calculation.\n + """ + ), + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 5, 2, 7, 12, 6]) + >>> ser.rolling(3).skew().round(6) + 0 NaN + 1 NaN + 2 1.293343 + 3 -0.585583 + 4 0.000000 + 5 1.545393 + dtype: float64 + """ + ), window_method="rolling", aggregation_description="unbiased skewness", agg_method="skew", @@ -2538,7 +2580,20 @@ def rank( create_section_header("Returns"), template_returns, create_section_header("See Also"), - template_see_also[:-1], + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser1 = pd.Series([1, 2, 3, 4]) + >>> ser2 = pd.Series([1, 4, 5, 8]) + >>> ser1.rolling(2).cov(ser2) + 0 NaN + 1 1.5 + 2 0.5 + 3 1.5 + dtype: float64 + """ + ), window_method="rolling", aggregation_description="sample covariance", agg_method="cov", From bcb3fa8e5c91059e9035ac0f4acb2fc11e1b28bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?= Date: Wed, 5 Jul 2023 09:37:37 +0200 Subject: [PATCH 07/35] DOC: Fixing EX01 - Added examples (#53985) * Expanding examples * change list for examples max, min * Remove replace() on Expanding examples --- ci/code_checks.sh | 11 --- pandas/core/window/expanding.py | 161 +++++++++++++++++++++++++++++--- 2 files changed, 149 insertions(+), 23 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7a5dbb45636d2..f9020a192e5b7 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -114,17 +114,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.core.window.rolling.Window.sum \ pandas.core.window.rolling.Window.var \ pandas.core.window.rolling.Window.std \ - pandas.core.window.expanding.Expanding.count \ - pandas.core.window.expanding.Expanding.sum \ - pandas.core.window.expanding.Expanding.mean \ - pandas.core.window.expanding.Expanding.median \ - 
pandas.core.window.expanding.Expanding.min \ - pandas.core.window.expanding.Expanding.max \ - pandas.core.window.expanding.Expanding.corr \ - pandas.core.window.expanding.Expanding.cov \ - pandas.core.window.expanding.Expanding.skew \ - pandas.core.window.expanding.Expanding.apply \ - pandas.core.window.expanding.Expanding.quantile \ pandas.core.window.ewm.ExponentialMovingWindow.mean \ pandas.core.window.ewm.ExponentialMovingWindow.sum \ pandas.core.window.ewm.ExponentialMovingWindow.std \ diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 19dd98851611f..ec4c23bfc5e49 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -183,7 +183,19 @@ def aggregate(self, func, *args, **kwargs): create_section_header("Returns"), template_returns, create_section_header("See Also"), - template_see_also[:-1], + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + >>> ser.expanding().count() + a 1.0 + b 2.0 + c 3.0 + d 4.0 + dtype: float64 + """ + ), window_method="expanding", aggregation_description="count of non NaN observations", agg_method="count", @@ -198,7 +210,19 @@ def count(self, numeric_only: bool = False): create_section_header("Returns"), template_returns, create_section_header("See Also"), - template_see_also[:-1], + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + >>> ser.expanding().apply(lambda s: s.max() - 2 * s.min()) + a -1.0 + b 0.0 + c 1.0 + d 2.0 + dtype: float64 + """ + ), window_method="expanding", aggregation_description="custom aggregation function", agg_method="apply", @@ -231,7 +255,19 @@ def apply( create_section_header("See Also"), template_see_also, create_section_header("Notes"), - numba_notes[:-1], + numba_notes, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + >>> ser.expanding().sum() + a 1.0 + b 3.0 + c 6.0 + d 10.0 + dtype: float64 + """ + ), window_method="expanding", aggregation_description="sum", agg_method="sum", @@ -258,7 +294,19 @@ def sum( create_section_header("See Also"), template_see_also, create_section_header("Notes"), - numba_notes[:-1], + numba_notes, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([3, 2, 1, 4], index=['a', 'b', 'c', 'd']) + >>> ser.expanding().max() + a 3.0 + b 3.0 + c 3.0 + d 4.0 + dtype: float64 + """ + ), window_method="expanding", aggregation_description="maximum", agg_method="max", @@ -285,7 +333,19 @@ def max( create_section_header("See Also"), template_see_also, create_section_header("Notes"), - numba_notes[:-1], + numba_notes, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([2, 3, 4, 1], index=['a', 'b', 'c', 'd']) + >>> ser.expanding().min() + a 2.0 + b 2.0 + c 2.0 + d 1.0 + dtype: float64 + """ + ), window_method="expanding", aggregation_description="minimum", agg_method="min", @@ -312,7 +372,19 @@ def min( create_section_header("See Also"), template_see_also, create_section_header("Notes"), - numba_notes[:-1], + numba_notes, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + >>> ser.expanding().mean() + a 1.0 + b 1.5 + c 2.0 + d 2.5 + dtype: float64 + """ + ), window_method="expanding", aggregation_description="mean", agg_method="mean", @@ -339,7 +411,19 @@ def mean( create_section_header("See 
Also"), template_see_also, create_section_header("Notes"), - numba_notes[:-1], + numba_notes, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + >>> ser.expanding().median() + a 1.0 + b 1.5 + c 2.0 + d 2.5 + dtype: float64 + """ + ), window_method="expanding", aggregation_description="median", agg_method="median", @@ -523,7 +607,20 @@ def sem(self, ddof: int = 1, numeric_only: bool = False): "scipy.stats.skew : Third moment of a probability density.\n", template_see_also, create_section_header("Notes"), - "A minimum of three periods is required for the rolling calculation.\n", + "A minimum of three periods is required for the rolling calculation.\n\n", + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([-1, 0, 2, -1, 2], index=['a', 'b', 'c', 'd', 'e']) + >>> ser.expanding().skew() + a NaN + b NaN + c 0.935220 + d 1.414214 + e 0.315356 + dtype: float64 + """ + ), window_method="expanding", aggregation_description="unbiased skewness", agg_method="skew", @@ -597,7 +694,21 @@ def kurt(self, numeric_only: bool = False): create_section_header("Returns"), template_returns, create_section_header("See Also"), - template_see_also[:-1], + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4, 5, 6], index=['a', 'b', 'c', 'd', 'e', 'f']) + >>> ser.expanding(min_periods=4).quantile(.25) + a NaN + b NaN + c NaN + d 1.75 + e 2.00 + f 2.25 + dtype: float64 + """ + ), window_method="expanding", aggregation_description="quantile", agg_method="quantile", @@ -714,7 +825,20 @@ def rank( create_section_header("Returns"), template_returns, create_section_header("See Also"), - template_see_also[:-1], + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + >>> ser2 = pd.Series([10, 11, 13, 16], index=['a', 'b', 'c', 'd']) + >>> ser1.expanding().cov(ser2) + a NaN + b 0.500000 + c 1.500000 + d 3.333333 + dtype: float64 + """ + ), window_method="expanding", aggregation_description="sample covariance", agg_method="cov", @@ -782,9 +906,22 @@ def cov( columns on the second level. In the case of missing elements, only complete pairwise observations - will be used. 
+        will be used.\n
         """
-    ).replace("\n", "", 1),
+    ),
+    create_section_header("Examples"),
+    dedent(
+        """\
+        >>> ser1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
+        >>> ser2 = pd.Series([10, 11, 13, 16], index=['a', 'b', 'c', 'd'])
+        >>> ser1.expanding().corr(ser2)
+        a         NaN
+        b    1.000000
+        c    0.981981
+        d    0.975900
+        dtype: float64
+        """
+    ),
     window_method="expanding",
     aggregation_description="correlation",
     agg_method="corr",

From 172b7c116a66c63bed605c633683778f828d2e57 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?=
Date: Wed, 5 Jul 2023 10:07:14 +0200
Subject: [PATCH 08/35] DOC: Fixing EX01 - Added examples (#53982)

* Examples Window.mean, sum, var, std
* Corrections to docstrings
* Removed replace()
* Correct link take 2
* remove replace() to right method

---
 ci/code_checks.sh             |   4 --
 pandas/core/window/rolling.py | 105 ++++++++++++++++++++++++++++++++--
 2 files changed, 101 insertions(+), 8 deletions(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index f9020a192e5b7..bf0711dcc0581 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -110,10 +110,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas_object \
         pandas.api.interchange.from_dataframe \
         pandas.DatetimeIndex.snap \
-        pandas.core.window.rolling.Window.mean \
-        pandas.core.window.rolling.Window.sum \
-        pandas.core.window.rolling.Window.var \
-        pandas.core.window.rolling.Window.std \
         pandas.api.indexers.BaseIndexer \
         pandas.api.indexers.VariableOffsetWindowIndexer \
         pandas.io.formats.style.Styler \
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
index 9778651814b23..5fd9930da4463 100644
--- a/pandas/core/window/rolling.py
+++ b/pandas/core/window/rolling.py
@@ -1270,7 +1270,32 @@ def aggregate(self, func, *args, **kwargs):
         create_section_header("Returns"),
         template_returns,
         create_section_header("See Also"),
-        template_see_also[:-1],
+        template_see_also,
+        create_section_header("Examples"),
+        dedent(
+            """\
+            >>> ser = pd.Series([0, 1, 5, 2, 8])
+
+            To get an instance of :class:`~pandas.core.window.rolling.Window` we need
+            to pass the parameter `win_type`.
+
+            >>> type(ser.rolling(2, win_type='gaussian'))
+            <class 'pandas.core.window.rolling.Window'>
+
+            In order to use the `SciPy` Gaussian window we need to provide the parameters
+            `M` and `std`. The parameter `M` corresponds to 2 in our example.
+            We pass the second parameter `std` as a parameter of the following method
+            (`sum` in this case):
+
+            >>> ser.rolling(2, win_type='gaussian').sum(std=3)
+            0         NaN
+            1    0.986207
+            2    5.917243
+            3    6.903450
+            4    9.862071
+            dtype: float64
+            """
+        ),
         window_method="rolling",
         aggregation_description="weighted window sum",
         agg_method="sum",
@@ -1295,7 +1320,31 @@ def sum(self, numeric_only: bool = False, **kwargs):
         create_section_header("Returns"),
         template_returns,
         create_section_header("See Also"),
-        template_see_also[:-1],
+        template_see_also,
+        create_section_header("Examples"),
+        dedent(
+            """\
+            >>> ser = pd.Series([0, 1, 5, 2, 8])
+
+            To get an instance of :class:`~pandas.core.window.rolling.Window` we need
+            to pass the parameter `win_type`.
+
+            >>> type(ser.rolling(2, win_type='gaussian'))
+            <class 'pandas.core.window.rolling.Window'>
+
+            In order to use the `SciPy` Gaussian window we need to provide the parameters
+            `M` and `std`. The parameter `M` corresponds to 2 in our example.
+            We pass the second parameter `std` as a parameter of the following method:
+
+            >>> ser.rolling(2, win_type='gaussian').mean(std=3)
+            0    NaN
+            1    0.5
+            2    3.0
+            3    3.5
+            4    5.0
+            dtype: float64
+            """
+        ),
         window_method="rolling",
         aggregation_description="weighted window mean",
         agg_method="mean",
@@ -1320,7 +1369,31 @@ def mean(self, numeric_only: bool = False, **kwargs):
         create_section_header("Returns"),
         template_returns,
         create_section_header("See Also"),
-        template_see_also[:-1],
+        template_see_also,
+        create_section_header("Examples"),
+        dedent(
+            """\
+            >>> ser = pd.Series([0, 1, 5, 2, 8])
+
+            To get an instance of :class:`~pandas.core.window.rolling.Window` we need
+            to pass the parameter `win_type`.
+
+            >>> type(ser.rolling(2, win_type='gaussian'))
+            <class 'pandas.core.window.rolling.Window'>
+
+            In order to use the `SciPy` Gaussian window we need to provide the parameters
+            `M` and `std`. The parameter `M` corresponds to 2 in our example.
+            We pass the second parameter `std` as a parameter of the following method:
+
+            >>> ser.rolling(2, win_type='gaussian').var(std=3)
+            0     NaN
+            1     0.5
+            2     8.0
+            3     4.5
+            4    18.0
+            dtype: float64
+            """
+        ),
         window_method="rolling",
         aggregation_description="weighted window variance",
         agg_method="var",
@@ -1338,7 +1411,31 @@ def var(self, ddof: int = 1, numeric_only: bool = False, **kwargs):
         create_section_header("Returns"),
         template_returns,
         create_section_header("See Also"),
-        template_see_also[:-1],
+        template_see_also,
+        create_section_header("Examples"),
+        dedent(
+            """\
+            >>> ser = pd.Series([0, 1, 5, 2, 8])
+
+            To get an instance of :class:`~pandas.core.window.rolling.Window` we need
+            to pass the parameter `win_type`.
+
+            >>> type(ser.rolling(2, win_type='gaussian'))
+            <class 'pandas.core.window.rolling.Window'>
+
+            In order to use the `SciPy` Gaussian window we need to provide the parameters
+            `M` and `std`. The parameter `M` corresponds to 2 in our example.
+ We pass the second parameter `std` as a parameter of the following method: + + >>> ser.rolling(2, win_type='gaussian').std(std=3) + 0 NaN + 1 0.707107 + 2 2.828427 + 3 2.121320 + 4 4.242641 + dtype: float64 + """ + ), window_method="rolling", aggregation_description="weighted window standard deviation", agg_method="std", From 6cb37b6e050c374151a23e790a46ad8b822a0564 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 5 Jul 2023 07:26:12 -0700 Subject: [PATCH 09/35] DEPR: GroupBy.quantile with bool dtype (#53975) --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/groupby/groupby.py | 11 +++++++++++ pandas/tests/groupby/test_function.py | 7 +++++++ 3 files changed, 19 insertions(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 6390fbeed8548..198a7155e1a1e 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -296,6 +296,7 @@ Deprecations - Deprecated :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` for object-dtype (:issue:`53631`) - Deprecated :meth:`Series.last` and :meth:`DataFrame.last` (please create a mask and filter using ``.loc`` instead) (:issue:`53692`) - Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`, in a future version the ``fill_value`` will need to be compatible with the ``dtype.subtype``, either a scalar that can be held by that subtype or ``NaN`` for integer or bool subtypes (:issue:`23124`) +- Deprecated allowing bool dtype in :meth:`DataFrameGroupBy.quantile` and :meth:`SeriesGroupBy.quantile`, consistent with the :meth:`Series.quantile` and :meth:`DataFrame.quantile` behavior (:issue:`51424`) - Deprecated behavior of :func:`assert_series_equal` and :func:`assert_frame_equal` considering NA-like values (e.g. ``NaN`` vs ``None`` as equivalent) (:issue:`52081`) - Deprecated bytes input to :func:`read_excel`. To read a file path, use a string or path-like object. (:issue:`53767`) - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 678ab7444bb58..2e3415f9a4474 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4166,6 +4166,17 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: inference = np.dtype(np.int64) elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray): out = vals.to_numpy(dtype=float, na_value=np.nan) + elif is_bool_dtype(vals.dtype): + # GH#51424 deprecate to match Series/DataFrame behavior + warnings.warn( + f"Allowing bool dtype in {type(self).__name__}.quantile is " + "deprecated and will raise in a future version, matching " + "the Series/DataFrame behavior. 
Cast to uint8 dtype before "
+                "calling quantile instead.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
+            out = np.asarray(vals)
         elif needs_i8_conversion(vals.dtype):
             inference = vals.dtype
             # In this case we need to delay the casting until after the
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index b97afe8ae9524..090ed37d7d1b2 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -1589,6 +1589,13 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request):
         )
         with pytest.raises(TypeError, match=msg):
             method(*args, numeric_only=True)
+    elif dtype == bool and groupby_func == "quantile":
+        msg = "Allowing bool dtype in SeriesGroupBy.quantile"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            # GH#51424
+            result = method(*args, numeric_only=True)
+            expected = method(*args, numeric_only=False)
+            tm.assert_series_equal(result, expected)
     else:
         result = method(*args, numeric_only=True)
         expected = method(*args, numeric_only=False)

From 7856c0248d9d76c6f5689a9718da270eac38c4df Mon Sep 17 00:00:00 2001
From: Rajat Subhra Mukherjee
Date: Wed, 5 Jul 2023 22:51:00 +0530
Subject: [PATCH 10/35] DOC: Added note for `corr` (#53972)

* Added note for corr
* Removed additional linebreak
* Added example and note for Series.corr
* Update frame.py
* Added operation in example
* Fixed indentation error for note
* Relocated notes

---
 pandas/core/series.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/pandas/core/series.py b/pandas/core/series.py
index e59a4cfc3fcc1..164b1a61b006c 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -2783,6 +2783,9 @@ def corr(
         * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
         * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_

+        Automatic data alignment: as with all pandas operations, automatic data alignment is performed for this method.
+        ``corr()`` automatically considers values with matching indices.
+
         Examples
         --------
         >>> def histogram_intersection(a, b):
@@ -2792,6 +2795,13 @@ def corr(
         >>> s1 = pd.Series([.2, .0, .6, .2])
         >>> s2 = pd.Series([.3, .6, .0, .1])
         >>> s1.corr(s2, method=histogram_intersection)
         0.3
+
+        Pandas auto-aligns the values with matching indices
+
+        >>> s1 = pd.Series([1, 2, 3], index=[0, 1, 2])
+        >>> s2 = pd.Series([1, 2, 3], index=[2, 1, 0])
+        >>> s1.corr(s2)
+        -1.0
         """  # noqa: E501
         this, other = self.align(other, join="inner", copy=False)
         if len(this) == 0:

From 1e90f9e3c5371bf65347bc1ea5e9b3cef9dfa189 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Wed, 5 Jul 2023 20:46:54 +0200
Subject: [PATCH 11/35] Test CoW for multiple Python versions (#53981)

---
 .github/workflows/unit-tests.yml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index d608654b510d1..600986d3297a9 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -57,7 +57,15 @@ jobs:
           # Also install zh_CN (its encoding is gb2312) but do not activate it.
# It will be temporarily activated during tests with locale.setlocale extra_loc: "zh_CN" - - name: "Copy-on-Write" + - name: "Copy-on-Write 3.9" + env_file: actions-39.yaml + pattern: "not slow and not network and not single_cpu" + pandas_copy_on_write: "1" + - name: "Copy-on-Write 3.10" + env_file: actions-310.yaml + pattern: "not slow and not network and not single_cpu" + pandas_copy_on_write: "1" + - name: "Copy-on-Write 3.11" env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "1" From 64085dd41ad190e154b0fe12c00f00fff27bbc86 Mon Sep 17 00:00:00 2001 From: Sayed Qaiser Ali <66676360+sqali@users.noreply.github.com> Date: Thu, 6 Jul 2023 18:39:41 +0530 Subject: [PATCH 12/35] Update ecosystem.md (#53980) * Update ecosystem.md replaced qgrid with modin-spreadsheet in docs * Update ecosystem.md removed the trailing spaces --- web/pandas/community/ecosystem.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 957a8d38b204c..fba50faac3e58 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -210,10 +210,11 @@ or may not be compatible with non-HTML Jupyter output formats.) See [Options and Settings](https://pandas.pydata.org/docs/user_guide/options.html) for pandas `display.` settings. -### [quantopian/qgrid](https://github.com/quantopian/qgrid) +### [modin-project/modin-spreadsheet](https://github.com/modin-project/modin-spreadsheet) -qgrid is "an interactive grid for sorting and filtering DataFrames in -IPython Notebook" built with SlickGrid. +modin-spreadsheet is an interactive grid for sorting and filtering DataFrames in IPython Notebook. +It is a fork of qgrid and is actively maintained by the modin project. +modin-spreadsheet provides similar functionality to qgrid and allows for easy data exploration and manipulation in a tabular format. ### [Spyder](https://www.spyder-ide.org/) From 1f7463681943d5b912ce47d627e6a7aea2014e3e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 6 Jul 2023 18:12:29 +0200 Subject: [PATCH 13/35] CoW: Add warning for chained assignment with fillna (#53779) --- doc/source/whatsnew/v2.1.0.rst | 8 ++++++++ pandas/core/generic.py | 17 +++++++++++++++++ pandas/errors/__init__.py | 11 +++++++++++ pandas/tests/copy_view/test_interp_fillna.py | 13 +++++++++++++ pandas/tests/frame/methods/test_fillna.py | 10 +++++++--- pandas/tests/frame/test_block_internals.py | 11 +++++++++-- .../multiindex/test_chaining_and_caching.py | 8 ++++++-- scripts/validate_unwanted_patterns.py | 1 + 8 files changed, 72 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 198a7155e1a1e..1119117c411d3 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -27,6 +27,14 @@ Copy-on-Write improvements of those Index objects for the columns of the DataFrame (:issue:`52947`) - Add lazy copy mechanism to :meth:`DataFrame.eval` (:issue:`53746`) +- Trying to operate inplace on a temporary column selection + (for example, ``df["a"].fillna(100, inplace=True)``) + will now always raise a warning when Copy-on-Write is enabled. In this mode, + operating inplace like this will never work, since the selection behaves + as a temporary copy. This holds true for: + + - DataFrame.fillna / Series.fillna + .. 
_whatsnew_210.enhancements.enhancement2: ``map(func, na_action="ignore")`` now works for all array types diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f049e9d479b26..68e5fbd696ab9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9,6 +9,7 @@ import operator import pickle import re +import sys from typing import ( TYPE_CHECKING, Any, @@ -89,13 +90,19 @@ WriteExcelBuffer, npt, ) +from pandas.compat import ( + PY311, + PYPY, +) from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.errors import ( AbstractMethodError, + ChainedAssignmentError, InvalidIndexError, SettingWithCopyError, SettingWithCopyWarning, + _chained_assignment_method_msg, ) from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level @@ -7083,6 +7090,16 @@ def fillna( Note that column D is not affected since it is not present in df2. """ inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + if not PYPY and using_copy_on_write(): + refcount = 2 if PY311 else 3 + if sys.getrefcount(self) <= refcount: + warnings.warn( + _chained_assignment_method_msg, + ChainedAssignmentError, + stacklevel=2, + ) + value, method = validate_fillna_kwargs(value, method) if method is not None: warnings.warn( diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 438f504968b2d..0c5b8caeaba6e 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -393,6 +393,17 @@ class ChainedAssignmentError(Warning): ) +_chained_assignment_method_msg = ( + "A value is trying to be set on a copy of a DataFrame or Series " + "through chained assignment.\n" + "When using the Copy-on-Write mode, such chained assignment never works " + "to update the original DataFrame or Series, because the intermediate " + "object on which we are setting values always behaves as a copy.\n\n" + "Try using 'df.method({col: value}, inplace=True)' instead, to perform " + "the operation inplace.\n\n" +) + + class NumExprClobberingError(NameError): """ Exception raised when trying to use a built-in numexpr name as a variable name. 
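A minimal sketch of the behavior this patch introduces and the test files below exercise, assuming pandas 2.1+ with Copy-on-Write enabled::

    import pandas as pd

    pd.set_option("mode.copy_on_write", True)

    df = pd.DataFrame({"a": [1.0, float("nan"), 2.0], "b": 1})

    # df["a"] behaves as a temporary copy under CoW, so chained inplace
    # fillna emits ChainedAssignmentError (a Warning) and leaves df intact:
    df["a"].fillna(100, inplace=True)

    # The supported spelling operates on df itself:
    df.fillna({"a": 100}, inplace=True)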
diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index 5a4b958b2148d..eb10e3b4e716a 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -344,3 +344,16 @@ def test_fillna_inplace_ea_noop_shares_memory( assert not np.shares_memory(get_array(df, "b"), get_array(view, "b")) df.iloc[0, 1] = 100 tm.assert_frame_equal(df_orig, view) + + +def test_fillna_chained_assignment(using_copy_on_write): + df = DataFrame({"a": [1, np.nan, 2], "b": 1}) + df_orig = df.copy() + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["a"].fillna(100, inplace=True) + tm.assert_frame_equal(df, df_orig) + + with tm.raises_chained_assignment_error(): + df[["a"]].fillna(100, inplace=True) + tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 109520859af4d..40fe7d2ce9af5 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.errors import ChainedAssignmentError import pandas.util._test_decorators as td from pandas import ( @@ -49,11 +50,12 @@ def test_fillna_on_column_view(self, using_copy_on_write): arr = np.full((40, 50), np.nan) df = DataFrame(arr, copy=False) - # TODO(CoW): This should raise a chained assignment error - df[0].fillna(-1, inplace=True) if using_copy_on_write: + with tm.assert_produces_warning(ChainedAssignmentError): + df[0].fillna(-1, inplace=True) assert np.isnan(arr[:, 0]).all() else: + df[0].fillna(-1, inplace=True) assert (arr[:, 0] == -1).all() # i.e. we didn't create a new 49-column block @@ -105,7 +107,9 @@ def test_fillna_mixed_float(self, mixed_float_frame): result = mf.fillna(method="pad") _check_mixed_float(result, dtype={"C": None}) - def test_fillna_empty(self): + def test_fillna_empty(self, using_copy_on_write): + if using_copy_on_write: + pytest.skip("condition is unnecessary complex and is deprecated anyway") # empty frame (GH#2778) df = DataFrame(columns=["x"]) for m in ["pad", "backfill"]: diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 6b6c1f6f64ff7..335901c457240 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -7,7 +7,10 @@ import numpy as np import pytest -from pandas.errors import PerformanceWarning +from pandas.errors import ( + ChainedAssignmentError, + PerformanceWarning, +) import pandas.util._test_decorators as td import pandas as pd @@ -410,7 +413,11 @@ def test_update_inplace_sets_valid_block_values(using_copy_on_write): df = DataFrame({"a": Series([1, 2, None], dtype="category")}) # inplace update of a single column - df["a"].fillna(1, inplace=True) + if using_copy_on_write: + with tm.assert_produces_warning(ChainedAssignmentError): + df["a"].fillna(1, inplace=True) + else: + df["a"].fillna(1, inplace=True) # check we haven't put a Series into any block.values assert isinstance(df._mgr.blocks[0].values, Categorical) diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index e0868745a480a..d27a2cde9417e 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -1,7 +1,10 @@ import numpy as np import pytest -from pandas.errors import SettingWithCopyError 
+from pandas.errors import ( + ChainedAssignmentError, + SettingWithCopyError, +) import pandas.util._test_decorators as td from pandas import ( @@ -30,7 +33,8 @@ def test_detect_chained_assignment(using_copy_on_write): zed = DataFrame(events, index=["a", "b"], columns=multiind) if using_copy_on_write: - zed["eyes"]["right"].fillna(value=555, inplace=True) + with tm.assert_produces_warning(ChainedAssignmentError): + zed["eyes"]["right"].fillna(value=555, inplace=True) else: msg = "A value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(SettingWithCopyError, match=msg): diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index cffae7d18bee1..466419bf5093e 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -51,6 +51,7 @@ "_arrow_dtype_mapping", "_global_config", "_chained_assignment_msg", + "_chained_assignment_method_msg", } From 0751cf2c04d69e5a144c622d35a7952a47e4924c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?= Date: Thu, 6 Jul 2023 20:17:43 +0200 Subject: [PATCH 14/35] DOC: Fixing EX01 - Added examples (#54004) Added examples ewm --- ci/code_checks.sh | 6 --- pandas/core/window/ewm.py | 102 ++++++++++++++++++++++++++++++++------ 2 files changed, 88 insertions(+), 20 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index bf0711dcc0581..756096a7fe345 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -110,12 +110,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas_object \ pandas.api.interchange.from_dataframe \ pandas.DatetimeIndex.snap \ - pandas.core.window.ewm.ExponentialMovingWindow.mean \ - pandas.core.window.ewm.ExponentialMovingWindow.sum \ - pandas.core.window.ewm.ExponentialMovingWindow.std \ - pandas.core.window.ewm.ExponentialMovingWindow.var \ - pandas.core.window.ewm.ExponentialMovingWindow.corr \ - pandas.core.window.ewm.ExponentialMovingWindow.cov \ pandas.api.indexers.BaseIndexer \ pandas.api.indexers.VariableOffsetWindowIndexer \ pandas.io.formats.style.Styler \ diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 42123fafd62aa..775f3cd428677 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -502,7 +502,19 @@ def aggregate(self, func, *args, **kwargs): create_section_header("See Also"), template_see_also, create_section_header("Notes"), - numba_notes.replace("\n", "", 1), + numba_notes, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4]) + >>> ser.ewm(alpha=.2).mean() + 0 1.000000 + 1 1.555556 + 2 2.147541 + 3 2.775068 + dtype: float64 + """ + ), window_method="ewm", aggregation_description="(exponential weighted moment) mean", agg_method="mean", @@ -554,7 +566,19 @@ def mean( create_section_header("See Also"), template_see_also, create_section_header("Notes"), - numba_notes.replace("\n", "", 1), + numba_notes, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4]) + >>> ser.ewm(alpha=.2).sum() + 0 1.000 + 1 2.800 + 2 5.240 + 3 8.192 + dtype: float64 + """ + ), window_method="ewm", aggregation_description="(exponential weighted moment) sum", agg_method="sum", @@ -602,16 +626,28 @@ def sum( template_header, create_section_header("Parameters"), dedent( - """ + """\ bias : bool, default False Use a standard estimation bias correction. 
""" - ).replace("\n", "", 1), + ), kwargs_numeric_only, create_section_header("Returns"), template_returns, create_section_header("See Also"), - template_see_also[:-1], + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4]) + >>> ser.ewm(alpha=.2).std() + 0 NaN + 1 0.707107 + 2 0.995893 + 3 1.277320 + dtype: float64 + """ + ), window_method="ewm", aggregation_description="(exponential weighted moment) standard deviation", agg_method="std", @@ -632,16 +668,28 @@ def std(self, bias: bool = False, numeric_only: bool = False): template_header, create_section_header("Parameters"), dedent( - """ + """\ bias : bool, default False Use a standard estimation bias correction. """ - ).replace("\n", "", 1), + ), kwargs_numeric_only, create_section_header("Returns"), template_returns, create_section_header("See Also"), - template_see_also[:-1], + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser = pd.Series([1, 2, 3, 4]) + >>> ser.ewm(alpha=.2).var() + 0 NaN + 1 0.500000 + 2 0.991803 + 3 1.631547 + dtype: float64 + """ + ), window_method="ewm", aggregation_description="(exponential weighted moment) variance", agg_method="var", @@ -665,7 +713,7 @@ def var_func(values, begin, end, min_periods): template_header, create_section_header("Parameters"), dedent( - """ + """\ other : Series or DataFrame , optional If not supplied then will default to self and produce pairwise output. @@ -679,12 +727,25 @@ def var_func(values, begin, end, min_periods): bias : bool, default False Use a standard estimation bias correction. """ - ).replace("\n", "", 1), + ), kwargs_numeric_only, create_section_header("Returns"), template_returns, create_section_header("See Also"), - template_see_also[:-1], + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser1 = pd.Series([1, 2, 3, 4]) + >>> ser2 = pd.Series([10, 11, 13, 16]) + >>> ser1.ewm(alpha=.2).cov(ser2) + 0 NaN + 1 0.500000 + 2 1.524590 + 3 3.408836 + dtype: float64 + """ + ), window_method="ewm", aggregation_description="(exponential weighted moment) sample covariance", agg_method="cov", @@ -739,7 +800,7 @@ def cov_func(x, y): template_header, create_section_header("Parameters"), dedent( - """ + """\ other : Series or DataFrame, optional If not supplied then will default to self and produce pairwise output. @@ -751,12 +812,25 @@ def cov_func(x, y): inputs. In the case of missing elements, only complete pairwise observations will be used. 
""" - ).replace("\n", "", 1), + ), kwargs_numeric_only, create_section_header("Returns"), template_returns, create_section_header("See Also"), - template_see_also[:-1], + template_see_also, + create_section_header("Examples"), + dedent( + """\ + >>> ser1 = pd.Series([1, 2, 3, 4]) + >>> ser2 = pd.Series([10, 11, 13, 16]) + >>> ser1.ewm(alpha=.2).corr(ser2) + 0 NaN + 1 1.000000 + 2 0.982821 + 3 0.977802 + dtype: float64 + """ + ), window_method="ewm", aggregation_description="(exponential weighted moment) sample correlation", agg_method="corr", From 03fd49c66a79ef337c2eee31b5a9d7f49ab0e218 Mon Sep 17 00:00:00 2001 From: liang3zy22 <35164941+liang3zy22@users.noreply.github.com> Date: Fri, 7 Jul 2023 03:48:34 +0800 Subject: [PATCH 15/35] BUG: Fix mamba can't create environment issue (#54017) --- environment.yml | 1 - requirements-dev.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/environment.yml b/environment.yml index 8fd97e6fcc0e1..6178fe896760f 100644 --- a/environment.yml +++ b/environment.yml @@ -17,7 +17,6 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 - - pytest-localserver>=0.7.1 - coverage # required dependencies diff --git a/requirements-dev.txt b/requirements-dev.txt index b1d8ce1cf2143..38a2ce7f66aa3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,7 +10,6 @@ pytest>=7.3.2 pytest-cov pytest-xdist>=2.2.0 pytest-asyncio>=0.17.0 -pytest-localserver>=0.7.1 coverage python-dateutil numpy From ce9ad65c4930ce53dd5e51d468ec24cd9ba6998e Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 6 Jul 2023 15:52:57 -0400 Subject: [PATCH 16/35] CLN: tests.groupby.test_any_all (#53998) --- pandas/tests/groupby/conftest.py | 5 +++++ pandas/tests/groupby/test_any_all.py | 13 ++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index c5e30513f69de..b1b1d455d5027 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -24,6 +24,11 @@ def dropna(request): return request.param +@pytest.fixture(params=[True, False]) +def skipna(request): + return request.param + + @pytest.fixture(params=[True, False]) def observed(request): return request.param diff --git a/pandas/tests/groupby/test_any_all.py b/pandas/tests/groupby/test_any_all.py index 4e6631cb763fe..57a83335be849 100644 --- a/pandas/tests/groupby/test_any_all.py +++ b/pandas/tests/groupby/test_any_all.py @@ -14,7 +14,6 @@ @pytest.mark.parametrize("agg_func", ["any", "all"]) -@pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize( "vals", [ @@ -33,7 +32,7 @@ [np.nan, np.nan, np.nan], ], ) -def test_groupby_bool_aggs(agg_func, skipna, vals): +def test_groupby_bool_aggs(skipna, agg_func, vals): df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, "val": vals * 2}) # Figure out expectation using Python builtin @@ -43,9 +42,11 @@ def test_groupby_bool_aggs(agg_func, skipna, vals): if skipna and all(isna(vals)) and agg_func == "any": exp = False - exp_df = DataFrame([exp] * 2, columns=["val"], index=Index(["a", "b"], name="key")) + expected = DataFrame( + [exp] * 2, columns=["val"], index=Index(["a", "b"], name="key") + ) result = getattr(df.groupby("key"), agg_func)(skipna=skipna) - tm.assert_frame_equal(result, exp_df) + tm.assert_frame_equal(result, expected) def test_any(): @@ -63,7 +64,7 @@ def test_any(): @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) def 
test_bool_aggs_dup_column_labels(bool_agg_func): - # 21668 + # GH#21668 df = DataFrame([[True, True]], columns=["a", "a"]) grp_by = df.groupby([0]) result = getattr(grp_by, bool_agg_func)() @@ -73,7 +74,6 @@ def test_bool_aggs_dup_column_labels(bool_agg_func): @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -@pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize( "data", [ @@ -141,7 +141,6 @@ def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2): @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) -@pytest.mark.parametrize("skipna", [True, False]) def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series): # GH#40585 obj = frame_or_series([pd.NA, 1], dtype=dtype) From c1c3f14b9e4e6df800a3f6bd9b4e56f5ced16cc8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 6 Jul 2023 12:56:43 -0700 Subject: [PATCH 17/35] BUG: missing fstring (#53990) --- pandas/_libs/tslibs/parsing.pyx | 2 +- pandas/tests/tslibs/test_parsing.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 536ae7ee4673b..9173b7e8b1449 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -704,7 +704,7 @@ cdef datetime dateutil_parse( # we get tzlocal, once the deprecation is enforced will get # timezone.utc, not raise. warnings.warn( - "Parsing '{res.tzname}' as tzlocal (dependent on system timezone) " + f"Parsing '{res.tzname}' as tzlocal (dependent on system timezone) " "is deprecated and will raise in a future version. Pass the 'tz' " "keyword or call tz_localize after construction instead", FutureWarning, diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 8408c9df5962b..2c8a6827a3bf1 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -29,7 +29,10 @@ ) def test_parsing_tzlocal_deprecated(): # GH#50791 - msg = "Pass the 'tz' keyword or call tz_localize after construction instead" + msg = ( + "Parsing 'EST' as tzlocal.*" + "Pass the 'tz' keyword or call tz_localize after construction instead" + ) dtstr = "Jan 15 2004 03:00 EST" with tm.set_timezone("US/Eastern"): From 609047b5acb084e3227bcf5d8c8b005f18734802 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 6 Jul 2023 12:57:33 -0700 Subject: [PATCH 18/35] BUG: ignoring sort in DTA.factorize (#53992) --- pandas/core/arrays/datetimelike.py | 10 +++++++++- pandas/tests/arrays/test_datetimes.py | 13 +++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 86849aa41e3e1..40cd59340f942 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2211,7 +2211,15 @@ def factorize( codes = codes[::-1] uniques = uniques[::-1] return codes, uniques - # FIXME: shouldn't get here; we are ignoring sort + + if sort: + # algorithms.factorize only passes sort=True here when freq is + # not None, so this should not be reached. + raise NotImplementedError( + f"The 'sort' keyword in {type(self).__name__}.factorize is " + "ignored unless arr.freq is not None. To factorize with sort, " + "call pd.factorize(obj, sort=True) instead." 
+ ) return super().factorize(use_na_sentinel=use_na_sentinel) @classmethod diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 2acc7bdc0d902..1fe1d4efbefd7 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -745,3 +745,16 @@ def test_iter_zoneinfo_fold(self, tz): right2 = dta.astype(object)[2] assert str(left) == str(right2) assert left.utcoffset() == right2.utcoffset() + + +def test_factorize_sort_without_freq(): + dta = DatetimeArray._from_sequence([0, 2, 1]) + + msg = r"call pd.factorize\(obj, sort=True\) instead" + with pytest.raises(NotImplementedError, match=msg): + dta.factorize(sort=True) + + # Do TimedeltaArray while we're here + tda = dta - dta[0] + with pytest.raises(NotImplementedError, match=msg): + tda.factorize(sort=True) From 15cef2cc8a554b7d7d864f88170938479ab154a7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 6 Jul 2023 12:58:37 -0700 Subject: [PATCH 19/35] CLN: remove unreachable, unnecessary axis kwd (#53991) --- pandas/core/generic.py | 2 -- pandas/core/internals/blocks.py | 18 +----------------- 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 68e5fbd696ab9..2c3fb6201295c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8052,10 +8052,8 @@ def interpolate( ) else: index = missing.get_interp_index(method, obj.index) - axis = self._info_axis_number new_data = obj._mgr.interpolate( method=method, - axis=axis, index=index, limit=limit, limit_direction=limit_direction, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8923faf444953..067544636ccbf 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1396,7 +1396,6 @@ def interpolate( self, *, method: InterpolateOptions, - axis: AxisInt, index: Index, inplace: bool = False, limit: int | None = None, @@ -1427,27 +1426,12 @@ def interpolate( return [self.copy(deep=False)] return [self] if inplace else [self.copy()] - if self.is_object and self.ndim == 2 and self.shape[0] != 1 and axis == 0: - # split improves performance in ndarray.copy() - return self.split_and_operate( - type(self).interpolate, - method=method, - axis=axis, - index=index, - inplace=inplace, - limit=limit, - limit_direction=limit_direction, - limit_area=limit_area, - downcast=downcast, - **kwargs, - ) - copy, refs = self._get_refs_and_copy(using_cow, inplace) # Dispatch to the EA method. new_values = self.array_values.interpolate( method=method, - axis=axis, + axis=self.ndim - 1, index=index, limit=limit, limit_direction=limit_direction, From 2cc6bfad25003c7bafd8a0bdcadb465d1258db00 Mon Sep 17 00:00:00 2001 From: William Andrea Date: Thu, 6 Jul 2023 16:03:50 -0400 Subject: [PATCH 20/35] DOC: Clean up CSV sniffing and chunking examples (#53987) --- doc/source/user_guide/io.rst | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 0084e885db2b5..ec0e7d0636b07 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1568,8 +1568,7 @@ class of the csv module. For this, you have to specify ``sep=None``. .. ipython:: python df = pd.DataFrame(np.random.randn(10, 4)) - df.to_csv("tmp.csv", sep="|") - df.to_csv("tmp2.csv", sep=":") + df.to_csv("tmp2.csv", sep=":", index=False) pd.read_csv("tmp2.csv", sep=None, engine="python") .. 
ipython:: python @@ -1597,8 +1596,8 @@ rather than reading the entire file into memory, such as the following: .. ipython:: python df = pd.DataFrame(np.random.randn(10, 4)) - df.to_csv("tmp.csv", sep="|") - table = pd.read_csv("tmp.csv", sep="|") + df.to_csv("tmp.csv", index=False) + table = pd.read_csv("tmp.csv") table @@ -1607,8 +1606,8 @@ value will be an iterable object of type ``TextFileReader``: .. ipython:: python - with pd.read_csv("tmp.csv", sep="|", chunksize=4) as reader: - reader + with pd.read_csv("tmp.csv", chunksize=4) as reader: + print(reader) for chunk in reader: print(chunk) @@ -1620,8 +1619,8 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object: .. ipython:: python - with pd.read_csv("tmp.csv", sep="|", iterator=True) as reader: - reader.get_chunk(5) + with pd.read_csv("tmp.csv", iterator=True) as reader: + print(reader.get_chunk(5)) .. ipython:: python :suppress: From 031f8c8f23f1b253aa96148a97eb92377d787d3c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 6 Jul 2023 13:06:44 -0700 Subject: [PATCH 21/35] REF: helper for merge casting (#53976) --- pandas/core/reshape/merge.py | 106 ++++++++++++++++------------------- 1 file changed, 47 insertions(+), 59 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e68277c38063e..cfaa5a1fdad64 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2067,6 +2067,32 @@ def _validate_tolerance(self, left_join_keys: list[ArrayLike]) -> None: else: raise MergeError("key must be integer, timestamp or float") + def _convert_values_for_libjoin( + self, values: AnyArrayLike, side: str + ) -> np.ndarray: + # we require sortedness and non-null values in the join keys + if not Index(values).is_monotonic_increasing: + if isna(values).any(): + raise ValueError(f"Merge keys contain null values on {side} side") + raise ValueError(f"{side} keys must be sorted") + + if isinstance(values, ArrowExtensionArray): + values = values._maybe_convert_datelike_array() + + if needs_i8_conversion(values.dtype): + values = values.view("i8") + + elif isinstance(values, BaseMaskedArray): + # we've verified above that no nulls exist + values = values._data + elif isinstance(values, ExtensionArray): + values = values.to_numpy() + + # error: Incompatible return value type (got "Union[ExtensionArray, + # Any, ndarray[Any, Any], ndarray[Any, dtype[Any]], Index, Series]", + # expected "ndarray[Any, Any]") + return values # type: ignore[return-value] + def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: """return the join indexers""" @@ -2110,31 +2136,11 @@ def injection(obj: ArrayLike): assert left_values.dtype == right_values.dtype tolerance = self.tolerance - - # we require sortedness and non-null values in the join keys - if not Index(left_values).is_monotonic_increasing: - side = "left" - if isna(left_values).any(): - raise ValueError(f"Merge keys contain null values on {side} side") - raise ValueError(f"{side} keys must be sorted") - - if not Index(right_values).is_monotonic_increasing: - side = "right" - if isna(right_values).any(): - raise ValueError(f"Merge keys contain null values on {side} side") - raise ValueError(f"{side} keys must be sorted") - - if isinstance(left_values, ArrowExtensionArray): - left_values = left_values._maybe_convert_datelike_array() - - if isinstance(right_values, ArrowExtensionArray): - right_values = right_values._maybe_convert_datelike_array() - - # initial type conversion as needed - if 
needs_i8_conversion(getattr(left_values, "dtype", None)): - if tolerance is not None: + if tolerance is not None: + # TODO: can we reuse a tolerance-conversion function from + # e.g. TimedeltaIndex? + if needs_i8_conversion(left_values.dtype): tolerance = Timedelta(tolerance) - # TODO: we have no test cases with PeriodDtype here; probably # need to adjust tolerance for that case. if left_values.dtype.kind in "mM": @@ -2145,22 +2151,9 @@ def injection(obj: ArrayLike): tolerance = tolerance._value - # TODO: require left_values.dtype == right_values.dtype, or at least - # comparable for e.g. dt64tz - left_values = left_values.view("i8") - right_values = right_values.view("i8") - - if isinstance(left_values, BaseMaskedArray): - # we've verified above that no nulls exist - left_values = left_values._data - elif isinstance(left_values, ExtensionArray): - left_values = left_values.to_numpy() - - if isinstance(right_values, BaseMaskedArray): - # we've verified above that no nulls exist - right_values = right_values._data - elif isinstance(right_values, ExtensionArray): - right_values = right_values.to_numpy() + # initial type conversion as needed + left_values = self._convert_values_for_libjoin(left_values, "left") + right_values = self._convert_values_for_libjoin(right_values, "right") # a "by" parameter requires special handling if self.left_by is not None: @@ -2259,19 +2252,7 @@ def _get_multiindex_indexer( # get flat i8 join keys lkey, rkey = _get_join_keys(lcodes, rcodes, tuple(shape), sort) - - # factorize keys to a dense i8 space - lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) - - return libjoin.left_outer_join(lkey, rkey, count, sort=sort) - - -def _get_single_indexer( - join_key: ArrayLike, index: Index, sort: bool = False -) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: - left_key, right_key, count = _factorize_keys(join_key, index._values, sort=sort) - - return libjoin.left_outer_join(left_key, right_key, count, sort=sort) + return lkey, rkey def _get_empty_indexer() -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: @@ -2315,13 +2296,20 @@ def _left_join_on_index( left_ax: Index, right_ax: Index, join_keys: list[ArrayLike], sort: bool = False ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]: if isinstance(right_ax, MultiIndex): - left_indexer, right_indexer = _get_multiindex_indexer( - join_keys, right_ax, sort=sort - ) + lkey, rkey = _get_multiindex_indexer(join_keys, right_ax, sort=sort) else: - left_indexer, right_indexer = _get_single_indexer( - join_keys[0], right_ax, sort=sort - ) + # error: Incompatible types in assignment (expression has type + # "Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]", + # variable has type "ndarray[Any, dtype[signedinteger[Any]]]") + lkey = join_keys[0] # type: ignore[assignment] + # error: Incompatible types in assignment (expression has type "Index", + # variable has type "ndarray[Any, dtype[signedinteger[Any]]]") + rkey = right_ax._values # type: ignore[assignment] + + left_key, right_key, count = _factorize_keys(lkey, rkey, sort=sort) + left_indexer, right_indexer = libjoin.left_outer_join( + left_key, right_key, count, sort=sort + ) if sort or len(left_ax) != len(left_indexer): # if asked to sort or there are 1-to-many matches From de5afb66dee294d012eee7b4997b5f7e29ccd651 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 6 Jul 2023 10:19:17 -1000 Subject: [PATCH 22/35] CLN: Consolidate Dependencies (#53863) * CLN: Dependencies * 
Move back toggleprompt * Add tokenize-rt * Use hauntsaninja/black-pre-commit-mirror --- .pre-commit-config.yaml | 22 +++++----------------- environment.yml | 14 +++++--------- pyproject.toml | 2 -- requirements-dev.txt | 6 +----- 4 files changed, 11 insertions(+), 33 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4379220c33687..c9cd7528bcd2f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,18 +15,11 @@ default_stages: [ ci: autofix_prs: false repos: -- repo: local +- repo: https://github.com/hauntsaninja/black-pre-commit-mirror + # black compiled with mypyc + rev: 23.3.0 hooks: - # NOTE: we make `black` a local hook because if it's installed from - # PyPI (rather than from source) then it'll run twice as fast thanks to mypyc - - id: black - name: black - description: "Black: The uncompromising Python code formatter" - entry: black - language: python - require_serial: true - types_or: [python, pyi] - additional_dependencies: [black==23.3.0] + - id: black - repo: https://github.com/charliermarsh/ruff-pre-commit rev: v0.0.270 hooks: @@ -74,7 +67,7 @@ repos: --linelength=88, '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size' ] -- repo: https://github.com/pycqa/pylint +- repo: https://github.com/pylint-dev/pylint rev: v3.0.0a6 hooks: - id: pylint @@ -93,11 +86,6 @@ repos: |^pandas/conftest\.py # keep excluded args: [--disable=all, --enable=redefined-outer-name] stages: [manual] - - id: pylint - alias: unspecified-encoding - name: Using open without explicitly specifying an encoding - args: [--disable=all, --enable=unspecified-encoding] - stages: [manual] - repo: https://github.com/PyCQA/isort rev: 5.12.0 hooks: diff --git a/environment.yml b/environment.yml index 6178fe896760f..8e3c3a26ffc0f 100644 --- a/environment.yml +++ b/environment.yml @@ -39,7 +39,7 @@ dependencies: - lxml>=4.8.0 - matplotlib>=3.6.1 - numba>=0.55.2 - - numexpr>=2.8.0 # pin for "Run checks on imported code" job + - numexpr>=2.8.0 - openpyxl>=3.0.10 - odfpy>=1.4.1 - py @@ -75,14 +75,10 @@ dependencies: - cxx-compiler # code checks - - black=23.3.0 - - cpplint - - flake8=6.0.0 - - isort>=5.2.1 # check that imports are in the right order - - mypy=1.2 + - flake8=6.0.0 # run in subprocess over docstring examples + - mypy=1.2 # pre-commit uses locally installed mypy + - tokenize-rt # scripts/check_for_inconsistent_pandas_namespace.py - pre-commit>=2.15.0 - - pyupgrade - - ruff=0.0.215 # documentation - gitpython # obtain contributors from git for whatsnew @@ -118,6 +114,6 @@ dependencies: - pygments # Code highlighting - pip: - - sphinx-toggleprompt + - sphinx-toggleprompt # conda-forge version has stricter pins on jinja2 - typing_extensions; python_version<"3.11" - tzdata>=2022.1 diff --git a/pyproject.toml b/pyproject.toml index 1ca0aaa33e179..a2ae269c26667 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -331,7 +331,6 @@ exclude = [ ".eggs/*.py", # vendored files "pandas/util/version/*", - "versioneer.py", # exclude asv benchmark environments from linting "env", ] @@ -445,7 +444,6 @@ disable = [ "super-init-not-called", "try-except-raise", "unnecessary-lambda", - "unspecified-encoding", "unused-argument", "unused-variable", "using-constant-test" ] diff --git a/requirements-dev.txt b/requirements-dev.txt index 38a2ce7f66aa3..7576b2d49614f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -52,14 +52,10 @@ seaborn moto flask asv>=0.5.1 -black==23.3.0 -cpplint flake8==6.0.0 -isort>=5.2.1 mypy==1.2 +tokenize-rt 
pre-commit>=2.15.0 -pyupgrade -ruff==0.0.215 gitpython gitdb natsort From 291c5e3d2663e01a68019fd4eb2cb24a1e89ab71 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 6 Jul 2023 22:25:50 +0200 Subject: [PATCH 23/35] CLN: Remove unnecessary pa version check (#54012) --- pandas/core/arrays/arrow/array.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 17120d0de5c5f..284044dfadfef 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2026,8 +2026,6 @@ def _str_repeat(self, repeats: int | Sequence[int]): raise NotImplementedError( f"repeat is not implemented when repeats is {type(repeats).__name__}" ) - elif pa_version_under7p0: - raise NotImplementedError("repeat is not implemented for pyarrow < 7") else: return type(self)(pc.binary_repeat(self._pa_array, repeats)) From 29ee8323503ddbc2de8126829752bfba25c4f0dd Mon Sep 17 00:00:00 2001 From: Rajat Subhra Mukherjee Date: Fri, 7 Jul 2023 02:01:31 +0530 Subject: [PATCH 24/35] DOC: Added to 10mins guide (#54010) added basic intro --- doc/source/user_guide/10min.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 7c98c99fecd5b..cb3c4ab3de658 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -16,6 +16,16 @@ Customarily, we import as follows: import numpy as np import pandas as pd +Basic data structures in pandas +------------------------------- + +pandas provides two types of classes for handling data: + +1. :class:`Series`: a one-dimensional labeled array holding data of any type + such as integers, strings, Python objects etc. +2. :class:`DataFrame`: a two-dimensional data structure that holds data like + a two-dimensional array or a table with rows and columns. + Object creation --------------- From 1a3f522397507e4cd14046da475b7775bb6170fe Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 6 Jul 2023 13:33:54 -0700 Subject: [PATCH 25/35] CLN: is_mixed_type, is_homogeneous_type (#54008) CLN: is_mixed_type --- pandas/core/frame.py | 29 ++++---------------------- pandas/core/generic.py | 11 +++++----- pandas/core/internals/array_manager.py | 4 ---- pandas/core/series.py | 5 ----- 4 files changed, 9 insertions(+), 40 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ae43a44d68f1c..f90b5c0eedbe8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -961,13 +961,6 @@ def _is_homogeneous_type(self) -> bool: ------- bool - See Also - -------- - Index._is_homogeneous_type : Whether the object has a single - dtype. - MultiIndex._is_homogeneous_type : Whether all the levels of a - MultiIndex have the same dtype. - Examples -------- >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type @@ -983,12 +976,8 @@ def _is_homogeneous_type(self) -> bool: ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type False """ - if isinstance(self._mgr, ArrayManager): - return len({arr.dtype for arr in self._mgr.arrays}) == 1 - if self._mgr.any_extension_types: - return len({block.dtype for block in self._mgr.blocks}) == 1 - else: - return not self._is_mixed_type + # The "<" part of "<=" here is for empty DataFrame cases + return len({arr.dtype for arr in self._mgr.arrays}) <= 1 @property def _can_fast_transpose(self) -> bool: @@ -4958,7 +4947,7 @@ def _reindex_multi( if row_indexer is not None and col_indexer is not None: # Fastpath. 
By doing two 'take's at once we avoid making an # unnecessary copy. - # We only get here with `not self._is_mixed_type`, which (almost) + # We only get here with `self._can_fast_transpose`, which (almost) # ensures that self.values is cheap. It may be worth making this # condition more specific. indexer = row_indexer, col_indexer @@ -10849,17 +10838,7 @@ def count(self, axis: Axis = 0, numeric_only: bool = False): if len(frame._get_axis(axis)) == 0: result = self._constructor_sliced(0, index=frame._get_agg_axis(axis)) else: - if frame._is_mixed_type or frame._mgr.any_extension_types: - # the or any_extension_types is really only hit for single- - # column frames with an extension array - result = notna(frame).sum(axis=axis) - else: - # GH13407 - series_counts = notna(frame).sum(axis=axis) - counts = series_counts._values - result = self._constructor_sliced( - counts, index=frame._get_agg_axis(axis), copy=False - ) + result = notna(frame).sum(axis=axis) return result.astype("int64").__finalize__(self, method="count") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2c3fb6201295c..c7c26b924913a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5543,12 +5543,9 @@ def _needs_reindex_multi(self, axes, method, level: Level | None) -> bool_t: (common.count_not_none(*axes.values()) == self._AXIS_LEN) and method is None and level is None - and not self._is_mixed_type - and not ( - self.ndim == 2 - and len(self.dtypes) == 1 - and isinstance(self.dtypes.iloc[0], ExtensionDtype) - ) + # reindex_multi calls self.values, so we only want to go + # down that path when doing so is cheap. + and self._can_fast_transpose ) def _reindex_multi(self, axes, copy, fill_value): @@ -6273,9 +6270,11 @@ def _consolidate(self): self ) + @final @property def _is_mixed_type(self) -> bool_t: if self._mgr.is_single_block: + # Includes all Series cases return False if self._mgr.any_extension_types: diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index f402c9ced0e19..431de70a25392 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -347,10 +347,6 @@ def _convert(arr): def to_native_types(self, **kwargs) -> Self: return self.apply(to_native_types, **kwargs) - @property - def is_mixed_type(self) -> bool: - return True - @property def any_extension_types(self) -> bool: """Whether any of the blocks in this manager are extension blocks""" diff --git a/pandas/core/series.py b/pandas/core/series.py index 164b1a61b006c..2fc926d7e43d1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1383,7 +1383,6 @@ def _maybe_update_cacher( return cacher = getattr(self, "_cacher", None) if cacher is not None: - assert self.ndim == 1 ref: DataFrame = cacher[1]() # we are trying to reference a dead referent, hence @@ -1407,10 +1406,6 @@ def _maybe_update_cacher( # ---------------------------------------------------------------------- # Unsorted - @property - def _is_mixed_type(self) -> bool: - return False - def repeat(self, repeats: int | Sequence[int], axis: None = None) -> Series: """ Repeat elements of a Series. 
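As a quick illustration of why the mixed-type special cases above could be dropped, here is a minimal standalone sketch (not part of the patch; the frame below is a made-up example): counting non-NA cells is just summing a ``notna`` mask, for numpy and extension dtypes alike, and the reworked ``_is_homogeneous_type`` is a plain dtype-set size check.

import numpy as np
import pandas as pd

# One numpy-dtype column and one masked extension-dtype column; the removed
# branch in DataFrame.count special-cased frames like this one.
df = pd.DataFrame(
    {
        "a": [1.0, np.nan, 3.0],
        "b": pd.array([1, pd.NA, pd.NA], dtype="Int64"),
    }
)

counts = df.count()                            # a -> 2, b -> 1
mask_sum = pd.notna(df).sum().astype("int64")  # the single remaining code path
assert counts.equals(mask_sum)

# The "<=" in the new _is_homogeneous_type means an empty frame
# (zero distinct dtypes) counts as homogeneous.
assert pd.DataFrame()._is_homogeneous_type

Because the mask-sum identity holds for every dtype, no dtype-based dispatch is needed and ``count`` no longer depends on ``_is_mixed_type``.
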
From 8a413c11f760a56f35dc9b8289829a959a4f19ad Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 6 Jul 2023 13:53:32 -0700 Subject: [PATCH 26/35] REF: swap axis before calling Manager.pad_or_backfill (#53989) --- pandas/core/arrays/numpy_.py | 2 +- pandas/core/generic.py | 4 ++-- pandas/core/internals/blocks.py | 4 ++-- pandas/tests/extension/base/dim2.py | 18 +++++++++++++++++- 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 570e48344c961..5f02053a454ed 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -247,7 +247,7 @@ def pad_or_backfill( meth = missing.clean_fill_method(method) missing.pad_or_backfill_inplace( - out_data, + out_data.T, method=meth, axis=0, limit=limit, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c7c26b924913a..8f213a1b7a1e2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6917,7 +6917,7 @@ def _pad_or_backfill( new_mgr = self._mgr.pad_or_backfill( method=method, - axis=axis, + axis=self._get_block_manager_axis(axis), limit=limit, inplace=inplace, downcast=downcast, @@ -8043,7 +8043,7 @@ def interpolate( new_data = obj._mgr.pad_or_backfill( method=method, - axis=axis, + axis=self._get_block_manager_axis(axis), limit=limit, limit_area=limit_area, inplace=inplace, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 067544636ccbf..4480a1a0c6746 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1885,8 +1885,8 @@ def pad_or_backfill( using_cow: bool = False, ) -> list[Block]: values = self.values - if values.ndim == 2 and axis == 0: - # NDArrayBackedExtensionArray.fillna assumes axis=1 + if values.ndim == 2 and axis == 1: + # NDArrayBackedExtensionArray.fillna assumes axis=0 new_values = values.T.fillna(method=method, limit=limit).T else: new_values = values.fillna(method=method, limit=limit) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index 85f01b1ee5d5e..6847c5c183267 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -159,11 +159,27 @@ def test_fillna_2d_method(self, data_missing, method): assert arr[0].isna().all() assert not arr[1].isna().any() - result = arr.fillna(method=method) + try: + result = arr.pad_or_backfill(method=method, limit=None) + except AttributeError: + result = arr.fillna(method=method, limit=None) expected = data_missing.fillna(method=method).repeat(2).reshape(2, 2) self.assert_extension_array_equal(result, expected) + # Reverse so that backfill is not a no-op. 
+ arr2 = arr[::-1] + assert not arr2[0].isna().any() + assert arr2[1].isna().all() + + try: + result2 = arr2.pad_or_backfill(method=method, limit=None) + except AttributeError: + result2 = arr2.fillna(method=method, limit=None) + + expected2 = data_missing[::-1].fillna(method=method).repeat(2).reshape(2, 2) + self.assert_extension_array_equal(result2, expected2) + @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) def test_reductions_2d_axis_none(self, data, method): arr2d = data.reshape(1, -1) From ea1e0c85c3a736865a931d8cef85cd3890144cdb Mon Sep 17 00:00:00 2001 From: Rajat Subhra Mukherjee Date: Fri, 7 Jul 2023 02:39:58 +0530 Subject: [PATCH 27/35] DOC: Updated pandas extension list (#53960) * Updated extension list * Fixed typo --- web/pandas/community/ecosystem.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index fba50faac3e58..e9f5c8643b493 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -526,6 +526,13 @@ Pandas-Genomics provides an extension type and extension array for working with genomics data. It also includes `genomics` accessors for many useful properties and methods related to QC and analysis of genomics data. +### [Physipandas](https://github.com/mocquin/physipandas) + +Physipandas provides an extension for manipulating physical quantities + (like scalars and numpy.ndarray) in association with a physical unit + (like meter or joule) and additional features for integration of + `physipy` accessors with pandas Series and DataFrame. + ### [Pint-Pandas](https://github.com/hgrecco/pint-pandas) Pint-Pandas provides an extension type for storing numeric arrays with units. @@ -551,6 +558,7 @@ authors to coordinate on the namespace. | [pandas-genomics](https://pandas-genomics.readthedocs.io/en/latest/) | `genomics` | `Series`, `DataFrame` | | [pandas_path](https://github.com/drivendataorg/pandas-path/) | `path` | `Index`, `Series` | | [pint-pandas](https://github.com/hgrecco/pint-pandas) | `pint` | `Series`, `DataFrame` | + | [physipandas](https://github.com/mocquin/physipandas) | `physipy` | `Series`, `DataFrame` | | [composeml](https://github.com/alteryx/compose) | `slice` | `DataFrame` | | [datatest](https://datatest.readthedocs.io/en/stable/) | `validate` | `Series`, `DataFrame` | | [composeml](https://github.com/alteryx/compose) | `slice` | `DataFrame` | From e997175638bae0a8365367a98319f2c174b51505 Mon Sep 17 00:00:00 2001 From: Yao Xiao <108576690+Charlie-XIAO@users.noreply.github.com> Date: Fri, 7 Jul 2023 05:11:18 +0800 Subject: [PATCH 28/35] API: add `NaTType` and `NAType` to `pandas.api.typing` (#53958) * API: add NaTType and NAType to pandas.api.typing * updated test --- pandas/api/typing/__init__.py | 5 +++++ pandas/tests/api/test_api.py | 2 ++ 2 files changed, 7 insertions(+) diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py index 4c535bf81d3b6..9b5d2cb06b523 100644 --- a/pandas/api/typing/__init__.py +++ b/pandas/api/typing/__init__.py @@ -2,6 +2,9 @@ Public API classes that store intermediate results useful for type-hinting. 
""" +from pandas._libs import NaTType +from pandas._libs.missing import NAType + from pandas.core.groupby import ( DataFrameGroupBy, SeriesGroupBy, @@ -36,6 +39,8 @@ "ExponentialMovingWindow", "ExponentialMovingWindowGroupby", "JsonReader", + "NaTType", + "NAType", "PeriodIndexResamplerGroupby", "Resampler", "Rolling", diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index ffed6a0935c8d..924a6db4b901b 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -259,6 +259,8 @@ class TestApi(Base): "ExponentialMovingWindow", "ExponentialMovingWindowGroupby", "JsonReader", + "NaTType", + "NAType", "PeriodIndexResamplerGroupby", "Resampler", "Rolling", From eceaffbe3f8e6387ccc1970455e1d828f7f5702e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 6 Jul 2023 14:12:59 -0700 Subject: [PATCH 29/35] DEPR: downcast keyword in Index.fillna (#53956) * DEPR: downcast keyword in Index.fillna * GH ref * docstring fixup --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/indexes/base.py | 14 +++++++++++++- pandas/tests/indexes/test_old_base.py | 8 +++++--- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 1119117c411d3..dc306471dbd3f 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -249,6 +249,7 @@ Other API changes Deprecations ~~~~~~~~~~~~ - Deprecated 'broadcast_axis' keyword in :meth:`Series.align` and :meth:`DataFrame.align`, upcast before calling ``align`` with ``left = DataFrame({col: left for col in right.columns}, index=right.index)`` (:issue:`51856`) +- Deprecated 'downcast' keyword in :meth:`Index.fillna` (:issue:`53956`) - Deprecated 'fill_method' and 'limit' keywords in :meth:`DataFrame.pct_change`, :meth:`Series.pct_change`, :meth:`DataFrameGroupBy.pct_change`, and :meth:`SeriesGroupBy.pct_change`, explicitly call ``ffill`` or ``bfill`` before calling ``pct_change`` instead (:issue:`53491`) - Deprecated 'method', 'limit', and 'fill_axis' keywords in :meth:`DataFrame.align` and :meth:`Series.align`, explicitly call ``fillna`` on the alignment results instead (:issue:`51856`) - Deprecated 'quantile' keyword in :meth:`Rolling.quantile` and :meth:`Expanding.quantile`, renamed as 'q' instead (:issue:`52550`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f03af387151b2..5f19f6d06a194 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2918,7 +2918,7 @@ def notna(self) -> npt.NDArray[np.bool_]: notnull = notna - def fillna(self, value=None, downcast=None): + def fillna(self, value=None, downcast=lib.no_default): """ Fill NA/NaN values with the specified value. @@ -2932,6 +2932,8 @@ def fillna(self, value=None, downcast=None): or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). + .. deprecated:: 2.1.0 + Returns ------- Index @@ -2949,6 +2951,16 @@ def fillna(self, value=None, downcast=None): """ if not is_scalar(value): raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}") + if downcast is not lib.no_default: + warnings.warn( + f"The 'downcast' keyword in {type(self).__name__}.fillna is " + "deprecated and will be removed in a future version. 
" + "It was previously silently ignored.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + downcast = None if self.hasnans: result = self.putmask(self._isnan, value) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index ff23c8a8ba5a4..588aa458c8b04 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -593,9 +593,11 @@ def test_fillna(self, index): idx = type(index)(values) msg = "does not support 'downcast'" - with pytest.raises(NotImplementedError, match=msg): - # For now at least, we only raise if there are NAs present - idx.fillna(idx[0], downcast="infer") + msg2 = r"The 'downcast' keyword in .*Index\.fillna is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg2): + with pytest.raises(NotImplementedError, match=msg): + # For now at least, we only raise if there are NAs present + idx.fillna(idx[0], downcast="infer") expected = np.array([False] * len(idx), dtype=bool) expected[1] = True From 431dd6fce8795e69431e9d565fd6150c3a90afe0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 7 Jul 2023 01:55:09 +0200 Subject: [PATCH 30/35] ENH: Use explicit methods instead of regex pattern in arrow strings (#54006) * ENH: Use explicit methods instead of regex pattern in arrow strings * Fixup * Fix --- pandas/core/arrays/string_arrow.py | 29 +++++++++++++---------- pandas/tests/strings/test_find_replace.py | 26 +++++--------------- 2 files changed, 22 insertions(+), 33 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index fa56571d7b0b0..12f4b5486b6b9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -307,28 +307,31 @@ def _str_contains( return super()._str_contains(pat, case, flags, na, regex) if regex: - if case is False: - fallback_performancewarning() - return super()._str_contains(pat, case, flags, na, regex) - else: - result = pc.match_substring_regex(self._pa_array, pat) + result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) else: - if case: - result = pc.match_substring(self._pa_array, pat) - else: - result = pc.match_substring(pc.utf8_upper(self._pa_array), pat.upper()) + result = pc.match_substring(self._pa_array, pat, ignore_case=not case) result = BooleanDtype().__from_arrow__(result) if not isna(na): result[isna(result)] = bool(na) return result def _str_startswith(self, pat: str, na=None): - pat = f"^{re.escape(pat)}" - return self._str_contains(pat, na=na, regex=True) + result = pc.starts_with(self._pa_array, pattern=pat) + if not isna(na): + result = result.fill_null(na) + result = BooleanDtype().__from_arrow__(result) + if not isna(na): + result[isna(result)] = bool(na) + return result def _str_endswith(self, pat: str, na=None): - pat = f"{re.escape(pat)}$" - return self._str_contains(pat, na=na, regex=True) + result = pc.ends_with(self._pa_array, pattern=pat) + if not isna(na): + result = result.fill_null(na) + result = BooleanDtype().__from_arrow__(result) + if not isna(na): + result[isna(result)] = bool(na) + return result def _str_replace( self, diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 89718b1b35f12..c3cc8b3643ed2 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -53,10 +53,8 @@ def test_contains(any_string_dtype): np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object), 
dtype=any_string_dtype, ) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): - result = values.str.contains("FOO|mmm", case=False) + + result = values.str.contains("FOO|mmm", case=False) expected = Series(np.array([True, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -172,10 +170,7 @@ def test_contains_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): - result = s.str.contains("a", case=False) + result = s.str.contains("a", case=False) expected = Series( [True, False, False, True, True, False, np.nan, True, False, True], dtype=expected_dtype, @@ -196,10 +191,7 @@ def test_contains_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): - result = s.str.contains("ba", case=False) + result = s.str.contains("ba", case=False) expected = Series( [False, False, False, True, True, False, np.nan, True, False, False], dtype=expected_dtype, @@ -723,10 +715,7 @@ def test_match_na_kwarg(any_string_dtype): def test_match_case_kwarg(any_string_dtype): values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): - result = values.str.match("ab", case=False) + result = values.str.match("ab", case=False) expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -769,10 +758,7 @@ def test_fullmatch_case_kwarg(any_string_dtype): expected = Series([True, True, False, False], dtype=expected_dtype) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): - result = ser.str.fullmatch("ab", case=False) + result = ser.str.fullmatch("ab", case=False) tm.assert_series_equal(result, expected) with tm.maybe_produces_warning( From 6b4254efc4421aa90ea0a6a30f620f0535d926b9 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Fri, 7 Jul 2023 13:05:12 -0400 Subject: [PATCH 31/35] DEPR: Replacing builtin and NumPy funcs in agg/apply/transform (#53974) * DEPR: Replacing builtin and NumPy funcs in agg/apply/transform * mypy fixup --- .../comparison/comparison_with_r.rst | 6 +- .../comparison/comparison_with_sql.rst | 4 +- doc/source/user_guide/basics.rst | 6 +- doc/source/user_guide/cookbook.rst | 6 +- doc/source/user_guide/reshaping.rst | 10 +- doc/source/user_guide/timeseries.rst | 6 +- doc/source/user_guide/window.rst | 2 +- doc/source/whatsnew/v0.14.0.rst | 2 +- doc/source/whatsnew/v0.20.0.rst | 8 +- doc/source/whatsnew/v0.25.0.rst | 4 +- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/apply.py | 22 +++++ pandas/core/common.py | 7 ++ pandas/core/frame.py | 14 +-- pandas/core/groupby/generic.py | 12 ++- pandas/core/groupby/groupby.py | 8 ++ pandas/core/resample.py | 15 ++- pandas/tests/apply/test_frame_apply.py | 6 +- .../apply/test_frame_apply_relabeling.py | 24 +++-- pandas/tests/apply/test_invalid_arg.py | 8 +- pandas/tests/apply/test_series_apply.py | 10 +- .../apply/test_series_apply_relabeling.py | 12 ++- pandas/tests/apply/test_str.py | 18 +++- .../tests/groupby/aggregate/test_aggregate.py | 50 +++++----- pandas/tests/groupby/aggregate/test_cython.py | 17 +++- 
pandas/tests/groupby/aggregate/test_other.py | 25 +++-- pandas/tests/groupby/test_categorical.py | 55 ++++++++--- pandas/tests/groupby/test_function.py | 28 ++++-- pandas/tests/groupby/test_groupby.py | 92 ++++++++++-------- pandas/tests/groupby/test_groupby_dropna.py | 4 +- pandas/tests/groupby/test_grouping.py | 8 +- pandas/tests/groupby/test_raises.py | 21 +++- pandas/tests/groupby/test_timegrouper.py | 6 +- .../tests/groupby/transform/test_transform.py | 64 ++++++++---- pandas/tests/resample/test_base.py | 2 +- pandas/tests/resample/test_datetime_index.py | 8 +- pandas/tests/resample/test_period_index.py | 2 +- pandas/tests/resample/test_resample_api.py | 30 ++++-- pandas/tests/resample/test_time_grouper.py | 4 +- pandas/tests/reshape/merge/test_join.py | 2 +- pandas/tests/reshape/test_crosstab.py | 31 +++--- pandas/tests/reshape/test_pivot.py | 97 ++++++++++--------- pandas/tests/test_multilevel.py | 6 +- pandas/tests/window/test_api.py | 18 ++-- 44 files changed, 510 insertions(+), 272 deletions(-) diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index 767779b0f58a8..25ba237e8caf3 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -246,7 +246,7 @@ In pandas we may use :meth:`~pandas.pivot_table` method to handle this: } ) - baseball.pivot_table(values="batting avg", columns="team", aggfunc=np.max) + baseball.pivot_table(values="batting avg", columns="team", aggfunc="max") For more details and examples see :ref:`the reshaping documentation `. @@ -359,7 +359,7 @@ In pandas the equivalent expression, using the ) grouped = df.groupby(["month", "week"]) - grouped["x"].agg([np.mean, np.std]) + grouped["x"].agg(["mean", "std"]) For more details and examples see :ref:`the groupby documentation @@ -482,7 +482,7 @@ In Python the best way is to make use of :meth:`~pandas.pivot_table`: values="value", index=["variable", "week"], columns=["month"], - aggfunc=np.mean, + aggfunc="mean", ) Similarly for ``dcast`` which uses a data.frame called ``df`` in R to diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index a6d9d65e85645..7a83d50416186 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -198,7 +198,7 @@ to your grouped DataFrame, indicating which functions to apply to specific colum .. ipython:: python - tips.groupby("day").agg({"tip": np.mean, "day": np.size}) + tips.groupby("day").agg({"tip": "mean", "day": "size"}) Grouping by more than one column is done by passing a list of columns to the :meth:`~pandas.DataFrame.groupby` method. @@ -222,7 +222,7 @@ Grouping by more than one column is done by passing a list of columns to the .. ipython:: python - tips.groupby(["smoker", "day"]).agg({"tip": [np.size, np.mean]}) + tips.groupby(["smoker", "day"]).agg({"tip": ["size", "mean"]}) .. _compare_with_sql.join: diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 65892f01326e4..389a2d23c466d 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -881,8 +881,8 @@ statistics methods, takes an optional ``axis`` argument: .. 
ipython:: python - df.apply(np.mean) - df.apply(np.mean, axis=1) + df.apply(lambda x: np.mean(x)) + df.apply(lambda x: np.mean(x), axis=1) df.apply(lambda x: x.max() - x.min()) df.apply(np.cumsum) df.apply(np.exp) @@ -986,7 +986,7 @@ output: .. ipython:: python - tsdf.agg(np.sum) + tsdf.agg(lambda x: np.sum(x)) tsdf.agg("sum") diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index c7278c604ca02..fd4f7cd1b83fe 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -530,7 +530,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to code_groups = df.groupby("code") - agg_n_sort_order = code_groups[["data"]].transform(sum).sort_values(by="data") + agg_n_sort_order = code_groups[["data"]].transform("sum").sort_values(by="data") sorted_df = df.loc[agg_n_sort_order.index] @@ -549,7 +549,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to return x.iloc[1] * 1.234 return pd.NaT - mhc = {"Mean": np.mean, "Max": np.max, "Custom": MyCust} + mhc = {"Mean": "mean", "Max": "max", "Custom": MyCust} ts.resample("5min").apply(mhc) ts @@ -685,7 +685,7 @@ The :ref:`Pivot ` docs. values=["Sales"], index=["Province"], columns=["City"], - aggfunc=np.sum, + aggfunc="sum", margins=True, ) table.stack("City") diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 8d0f1048f6e77..4df6996c4f66b 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -402,12 +402,12 @@ We can produce pivot tables from this data very easily: .. ipython:: python pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"]) - pd.pivot_table(df, values="D", index=["B"], columns=["A", "C"], aggfunc=np.sum) + pd.pivot_table(df, values="D", index=["B"], columns=["A", "C"], aggfunc="sum") pd.pivot_table( df, values=["D", "E"], index=["B"], columns=["A", "C"], - aggfunc=np.sum, + aggfunc="sum", ) The result object is a :class:`DataFrame` having potentially hierarchical indexes on the @@ -451,7 +451,7 @@ rows and columns: columns="C", values=["D", "E"], margins=True, - aggfunc=np.std + aggfunc="std" ) table @@ -552,7 +552,7 @@ each group defined by the first two :class:`Series`: .. ipython:: python - pd.crosstab(df["A"], df["B"], values=df["C"], aggfunc=np.sum) + pd.crosstab(df["A"], df["B"], values=df["C"], aggfunc="sum") Adding margins ~~~~~~~~~~~~~~ @@ -562,7 +562,7 @@ Finally, one can also add margins or normalize this output. .. ipython:: python pd.crosstab( - df["A"], df["B"], values=df["C"], aggfunc=np.sum, normalize=True, margins=True + df["A"], df["B"], values=df["C"], aggfunc="sum", normalize=True, margins=True ) .. _reshaping.tile: diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index fb1c37c1b9073..a0754ba0d2995 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1801,14 +1801,14 @@ You can pass a list or dict of functions to do aggregation with, outputting a `` .. ipython:: python - r["A"].agg([np.sum, np.mean, np.std]) + r["A"].agg(["sum", "mean", "std"]) On a resampled ``DataFrame``, you can pass a list of functions to apply to each column, which produces an aggregated result with a hierarchical index: .. ipython:: python - r.agg([np.sum, np.mean]) + r.agg(["sum", "mean"]) By passing a dict to ``aggregate`` you can apply a different aggregation to the columns of a ``DataFrame``: @@ -1816,7 +1816,7 @@ columns of a ``DataFrame``: .. 
ipython:: python :okexcept: - r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) + r.agg({"A": "sum", "B": lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be implemented on the resampled object: diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst index ea80a2804256c..85e8762570736 100644 --- a/doc/source/user_guide/window.rst +++ b/doc/source/user_guide/window.rst @@ -140,7 +140,7 @@ of multiple aggregations applied to a window. .. ipython:: python df = pd.DataFrame({"A": range(5), "B": range(10, 15)}) - df.expanding().agg([np.sum, np.mean, np.std]) + df.expanding().agg(["sum", "mean", "std"]) .. _window.generic: diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index ef24f2645d992..f33ab3911f231 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -846,7 +846,7 @@ Enhancements df.pivot_table(values='Quantity', index=pd.Grouper(freq='M', key='Date'), columns=pd.Grouper(freq='M', key='PayDay'), - aggfunc=np.sum) + aggfunc="sum") - Arrays of strings can be wrapped to a specified width (``str.wrap``) (:issue:`6999`) - Add :meth:`~Series.nsmallest` and :meth:`Series.nlargest` methods to Series, See :ref:`the docs ` (:issue:`3960`) diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 34a875f59e808..b4224785988e6 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -984,7 +984,7 @@ Previous behavior: 75% 3.750000 max 4.000000 - In [3]: df.groupby('A').agg([np.mean, np.std, np.min, np.max]) + In [3]: df.groupby('A').agg(["mean", "std", "min", "max"]) Out[3]: B mean std amin amax @@ -1000,7 +1000,7 @@ New behavior: df.groupby('A').describe() - df.groupby('A').agg([np.mean, np.std, np.min, np.max]) + df.groupby('A').agg(["mean", "std", "min", "max"]) .. _whatsnew_0200.api_breaking.rolling_pairwise: @@ -1163,7 +1163,7 @@ Previous behavior: .. code-block:: ipython - In [2]: df.pivot_table('col1', index=['col3', 'col2'], aggfunc=np.sum) + In [2]: df.pivot_table('col1', index=['col3', 'col2'], aggfunc="sum") Out[2]: col3 col2 1 C 3 @@ -1175,7 +1175,7 @@ New behavior: .. ipython:: python - df.pivot_table('col1', index=['col3', 'col2'], aggfunc=np.sum) + df.pivot_table('col1', index=['col3', 'col2'], aggfunc="sum") .. _whatsnew_0200.api: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 7965d335d0aac..c0f169aa6251f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -48,7 +48,7 @@ output columns when applying multiple aggregation functions to specific columns animals.groupby("kind").agg( min_height=pd.NamedAgg(column='height', aggfunc='min'), max_height=pd.NamedAgg(column='height', aggfunc='max'), - average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean), + average_weight=pd.NamedAgg(column='weight', aggfunc="mean"), ) Pass the desired columns names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs`` @@ -61,7 +61,7 @@ what the arguments to the function are, but plain tuples are accepted as well. 
animals.groupby("kind").agg( min_height=('height', 'min'), max_height=('height', 'max'), - average_weight=('weight', np.mean), + average_weight=('weight', 'mean'), ) Named aggregation is the recommended replacement for the deprecated "dict-of-dicts" diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index dc306471dbd3f..eb49c69ad7567 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -315,11 +315,11 @@ Deprecations - Deprecated option "mode.use_inf_as_na", convert inf entries to ``NaN`` before instead (:issue:`51684`) - Deprecated parameter ``obj`` in :meth:`GroupBy.get_group` (:issue:`53545`) - Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`) +- Deprecated replacing builtin and NumPy functions in ``.agg``, ``.apply``, and ``.transform``; use the corresponding string alias (e.g. ``"sum"`` for ``sum`` or ``np.sum``) instead (:issue:`53425`) - Deprecated strings ``T``, ``t``, ``L`` and ``l`` denoting units in :func:`to_timedelta` (:issue:`52536`) - Deprecated the "method" and "limit" keywords on :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`SeriesGroupBy.fillna`, :meth:`DataFrameGroupBy.fillna`, and :meth:`Resampler.fillna`, use ``obj.bfill()`` or ``obj.ffill()`` instead (:issue:`53394`) - Deprecated the ``method`` and ``limit`` keywords in :meth:`DataFrame.replace` and :meth:`Series.replace` (:issue:`33302`) - Deprecated values "pad", "ffill", "bfill", "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`) -- .. --------------------------------------------------------------------------- .. _whatsnew_210.performance: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 83a3b29bfd7f0..6af4557897a0d 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -170,6 +170,7 @@ def agg(self) -> DataFrame | Series | None: if callable(func): f = com.get_cython_func(func) if f and not args and not kwargs: + warn_alias_replacement(obj, func, f) return getattr(obj, f)() # caller can react @@ -280,6 +281,7 @@ def transform_str_or_callable(self, func) -> DataFrame | Series: if not args and not kwargs: f = com.get_cython_func(func) if f: + warn_alias_replacement(obj, func, f) return getattr(obj, f)() # Two possible ways to use a UDF - apply or call directly @@ -1695,3 +1697,23 @@ def validate_func_kwargs( no_arg_message = "Must provide 'func' or named aggregation **kwargs." raise TypeError(no_arg_message) return columns, func + + +def warn_alias_replacement( + obj: AggObjType, + func: Callable, + alias: str, +) -> None: + if alias.startswith("np."): + full_alias = alias + else: + full_alias = f"{type(obj).__name__}.{alias}" + alias = f"'{alias}'" + warnings.warn( + f"The provided callable {func} is currently using " + f"{full_alias}. In a future version of pandas, " + f"the provided callable will be used directly. 
To keep current " + f"behavior pass {alias} instead.", + category=FutureWarning, + stacklevel=find_stack_level(), + ) diff --git a/pandas/core/common.py b/pandas/core/common.py index ee8fe220698b5..9db03ac3ae571 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -565,6 +565,13 @@ def require_length_match(data, index: Index) -> None: builtins.min: np.minimum.reduce, } +# GH#53425: Only for deprecation +_builtin_table_alias = { + builtins.sum: "np.sum", + builtins.max: "np.maximum.reduce", + builtins.min: "np.minimum.reduce", +} + _cython_table = { builtins.sum: "sum", builtins.max: "max", diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f90b5c0eedbe8..6fdd6cb2a639e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8851,7 +8851,7 @@ def pivot( it can contain any of the other types (except list). If an array is passed, it must be the same length as the data and will be used in the same manner as column values. - aggfunc : function, list of functions, dict, default numpy.mean + aggfunc : function, list of functions, dict, default "mean" If a list of functions is passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves). @@ -8926,7 +8926,7 @@ def pivot( This first example aggregates values by taking the sum. >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], - ... columns=['C'], aggfunc=np.sum) + ... columns=['C'], aggfunc="sum") >>> table C large small A B @@ -8938,7 +8938,7 @@ def pivot( We can also fill missing values using the `fill_value` parameter. >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], - ... columns=['C'], aggfunc=np.sum, fill_value=0) + ... columns=['C'], aggfunc="sum", fill_value=0) >>> table C large small A B @@ -8950,7 +8950,7 @@ def pivot( The next example aggregates by taking the mean across multiple columns. >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], - ... aggfunc={'D': np.mean, 'E': np.mean}) + ... aggfunc={'D': "mean", 'E': "mean"}) >>> table D E A C @@ -8963,8 +8963,8 @@ def pivot( value column. >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], - ... aggfunc={'D': np.mean, - ... 'E': [min, max, np.mean]}) + ... aggfunc={'D': "mean", + ... 'E': ["min", "max", "mean"]}) >>> table D E mean max mean min @@ -9565,7 +9565,7 @@ def _gotitem( Aggregate different functions over the columns and rename the index of the resulting DataFrame. 
- >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean)) + >>> df.agg(x=('A', 'max'), y=('B', 'min'), z=('C', 'mean')) A B C x 7.0 NaN NaN y NaN 2.0 NaN diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e3aa97b448fe1..3bedcb935b6ba 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -64,6 +64,7 @@ maybe_mangle_lambdas, reconstruct_func, validate_func_kwargs, + warn_alias_replacement, ) import pandas.core.common as com from pandas.core.frame import DataFrame @@ -133,7 +134,7 @@ class NamedAgg(NamedTuple): -------- >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) >>> agg_a = pd.NamedAgg(column="a", aggfunc="min") - >>> agg_1 = pd.NamedAgg(column=1, aggfunc=np.mean) + >>> agg_1 = pd.NamedAgg(column=1, aggfunc=lambda x: np.mean(x)) >>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1) result_a result_1 key @@ -257,6 +258,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) else: cyfunc = com.get_cython_func(func) if cyfunc and not args and not kwargs: + warn_alias_replacement(self, func, cyfunc) return getattr(self, cyfunc)() if maybe_use_numba(engine): @@ -306,7 +308,11 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) agg = aggregate def _python_agg_general(self, func, *args, **kwargs): + orig_func = func func = com.is_builtin_func(func) + if orig_func != func: + alias = com._builtin_table_alias[func] + warn_alias_replacement(self, orig_func, alias) f = lambda x: func(x, *args, **kwargs) obj = self._obj_with_exclusions @@ -1511,7 +1517,11 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) agg = aggregate def _python_agg_general(self, func, *args, **kwargs): + orig_func = func func = com.is_builtin_func(func) + if orig_func != func: + alias = com._builtin_table_alias[func] + warn_alias_replacement(self, orig_func, alias) f = lambda x: func(x, *args, **kwargs) if self.ngroups == 0: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2e3415f9a4474..ff9c1cf757f37 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -96,6 +96,7 @@ class providing the base-class of operations. 
sample, ) from pandas.core._numba import executor +from pandas.core.apply import warn_alias_replacement from pandas.core.arrays import ( BaseMaskedArray, Categorical, @@ -1677,7 +1678,11 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): ) ) def apply(self, func, *args, **kwargs) -> NDFrameT: + orig_func = func func = com.is_builtin_func(func) + if orig_func != func: + alias = com._builtin_table_alias[orig_func] + warn_alias_replacement(self, orig_func, alias) if isinstance(func, str): if hasattr(self, func): @@ -1880,7 +1885,10 @@ def _cython_transform( @final def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): # optimized transforms + orig_func = func func = com.get_cython_func(func) or func + if orig_func != func: + warn_alias_replacement(self, orig_func, func) if not isinstance(func, str): return self._transform_general(func, engine, engine_kwargs, *args, **kwargs) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 3916f90e1f0b2..c0a6587d527e1 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -41,7 +41,10 @@ ) import pandas.core.algorithms as algos -from pandas.core.apply import ResamplerWindowApply +from pandas.core.apply import ( + ResamplerWindowApply, + warn_alias_replacement, +) from pandas.core.base import ( PandasObject, SelectionMixin, @@ -295,7 +298,7 @@ def pipe( >>> r = s.resample('2s') - >>> r.agg(np.sum) + >>> r.agg("sum") 2013-01-01 00:00:00 3 2013-01-01 00:00:02 7 2013-01-01 00:00:04 5 @@ -308,7 +311,7 @@ def pipe( 2013-01-01 00:00:04 5 5.0 5 >>> r.agg({'result': lambda x: x.mean() / x.std(), - ... 'total': np.sum}) + ... 'total': "sum"}) result total 2013-01-01 00:00:00 2.121320 3 2013-01-01 00:00:02 4.949747 7 @@ -1673,7 +1676,10 @@ def _downsample(self, how, **kwargs): how : string / cython mapped function **kwargs : kw args passed to how function """ + orig_how = how how = com.get_cython_func(how) or how + if orig_how != how: + warn_alias_replacement(self, orig_how, how) ax = self.ax # Excludes `on` column when provided @@ -1827,7 +1833,10 @@ def _downsample(self, how, **kwargs): if self.kind == "timestamp": return super()._downsample(how, **kwargs) + orig_how = how how = com.get_cython_func(how) or how + if orig_how != how: + warn_alias_replacement(self, orig_how, how) ax = self.ax if is_subperiod(ax.freq, self.freq): diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 5681167cd54f9..43f903f99d0d7 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1608,11 +1608,13 @@ def foo2(x, b=2, c=0): def test_agg_std(): df = DataFrame(np.arange(6).reshape(3, 2), columns=["A", "B"]) - result = df.agg(np.std) + with tm.assert_produces_warning(FutureWarning, match="using DataFrame.std"): + result = df.agg(np.std) expected = Series({"A": 2.0, "B": 2.0}, dtype=float) tm.assert_series_equal(result, expected) - result = df.agg([np.std]) + with tm.assert_produces_warning(FutureWarning, match="using Series.std"): + result = df.agg([np.std]) expected = DataFrame({"A": 2.0, "B": 2.0}, index=["std"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/apply/test_frame_apply_relabeling.py b/pandas/tests/apply/test_frame_apply_relabeling.py index 2652d43fd42ec..723bdd349c0cb 100644 --- a/pandas/tests/apply/test_frame_apply_relabeling.py +++ b/pandas/tests/apply/test_frame_apply_relabeling.py @@ -49,20 +49,24 @@ def test_agg_relabel_multi_columns_multi_methods(): def 
test_agg_relabel_partial_functions(): # GH 26513, test on partial, functools or more complex cases df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) - result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) + msg = "using Series.[mean|min]" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) expected = pd.DataFrame( {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"]) ) tm.assert_frame_equal(result, expected) - result = df.agg( - foo=("A", min), - bar=("A", np.min), - cat=("B", max), - dat=("C", "min"), - f=("B", np.sum), - kk=("B", lambda x: min(x)), - ) + msg = "using Series.[mean|min|max|sum]" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.agg( + foo=("A", min), + bar=("A", np.min), + cat=("B", max), + dat=("C", "min"), + f=("B", np.sum), + kk=("B", lambda x: min(x)), + ) expected = pd.DataFrame( { "A": [1.0, 1.0, np.nan, np.nan, np.nan, np.nan], @@ -79,7 +83,7 @@ def test_agg_namedtuple(): df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) result = df.agg( foo=pd.NamedAgg("B", "sum"), - bar=pd.NamedAgg("B", min), + bar=pd.NamedAgg("B", "min"), cat=pd.NamedAgg(column="B", aggfunc="count"), fft=pd.NamedAgg("B", aggfunc="max"), ) diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 21b5c803d0e76..e0d52f094515b 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -222,8 +222,10 @@ def transform2(row): def test_agg_cython_table_raises_frame(df, func, expected, axis): # GH 21224 msg = "can't multiply sequence by non-int of type 'str'" + warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): - df.agg(func, axis=axis) + with tm.assert_produces_warning(warn, match="using DataFrame.cumprod"): + df.agg(func, axis=axis) @pytest.mark.parametrize( @@ -247,10 +249,12 @@ def test_agg_cython_table_raises_series(series, func, expected): msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type" if func == "median" or func is np.nanmedian or func is np.median: msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" + warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): # e.g. 
Series('a b'.split()).cumprod() will raise - series.agg(func) + with tm.assert_produces_warning(warn, match="is currently using Series.*"): + series.agg(func) def test_agg_none_to_type(): diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 9002a5f85cba6..79954eeed8e95 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -561,7 +561,10 @@ def test_apply_listlike_reducer(string_series, ops, names, how, kwargs): # GH 39140 expected = Series({name: op(string_series) for name, op in zip(names, ops)}) expected.name = "series" - result = getattr(string_series, how)(ops, **kwargs) + warn = FutureWarning if how == "agg" else None + msg = f"using Series.[{'|'.join(names)}]" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(string_series, how)(ops, **kwargs) tm.assert_series_equal(result, expected) @@ -582,7 +585,10 @@ def test_apply_dictlike_reducer(string_series, ops, how, kwargs, by_row): # GH 39140 expected = Series({name: op(string_series) for name, op in ops.items()}) expected.name = string_series.name - result = getattr(string_series, how)(ops, **kwargs) + warn = FutureWarning if how == "agg" else None + msg = "using Series.[sum|mean]" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(string_series, how)(ops, **kwargs) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/apply/test_series_apply_relabeling.py b/pandas/tests/apply/test_series_apply_relabeling.py index c0a285e6eb38c..cdfa054f91c9b 100644 --- a/pandas/tests/apply/test_series_apply_relabeling.py +++ b/pandas/tests/apply/test_series_apply_relabeling.py @@ -14,8 +14,12 @@ def test_relabel_no_duplicated_method(): expected = df["B"].agg({"foo": "min", "bar": "max"}) tm.assert_series_equal(result, expected) - result = df["B"].agg(foo=sum, bar=min, cat="max") - expected = df["B"].agg({"foo": sum, "bar": min, "cat": "max"}) + msg = "using Series.[sum|min|max]" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df["B"].agg(foo=sum, bar=min, cat="max") + msg = "using Series.[sum|min|max]" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df["B"].agg({"foo": sum, "bar": min, "cat": "max"}) tm.assert_series_equal(result, expected) @@ -28,6 +32,8 @@ def test_relabel_duplicated_method(): expected = pd.Series([6, 6], index=["foo", "bar"], name="A") tm.assert_series_equal(result, expected) - result = df["B"].agg(foo=min, bar="min") + msg = "using Series.min" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df["B"].agg(foo=min, bar="min") expected = pd.Series([1, 1], index=["foo", "bar"], name="B") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 64189fae5f578..363d0285cabbc 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -135,7 +135,9 @@ def test_agg_cython_table_series(series, func, expected): # GH21224 # test reducing functions in # pandas.core.base.SelectionMixin._cython_table - result = series.agg(func) + warn = None if isinstance(func, str) else FutureWarning + with tm.assert_produces_warning(warn, match="is currently using Series.*"): + result = series.agg(func) if is_number(expected): assert np.isclose(result, expected, equal_nan=True) else: @@ -168,7 +170,9 @@ def test_agg_cython_table_transform_series(series, func, expected): # GH21224 # test transforming functions in # 
pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - result = series.agg(func) + warn = None if isinstance(func, str) else FutureWarning + with tm.assert_produces_warning(warn, match="is currently using Series.*"): + result = series.agg(func) tm.assert_series_equal(result, expected) @@ -211,7 +215,10 @@ def test_agg_cython_table_frame(df, func, expected, axis): # GH 21224 # test reducing functions in # pandas.core.base.SelectionMixin._cython_table - result = df.agg(func, axis=axis) + warn = None if isinstance(func, str) else FutureWarning + with tm.assert_produces_warning(warn, match="is currently using DataFrame.*"): + # GH#53425 + result = df.agg(func, axis=axis) tm.assert_series_equal(result, expected) @@ -238,7 +245,10 @@ def test_agg_cython_table_transform_frame(df, func, expected, axis): # operating blockwise doesn't let us preserve dtypes expected = expected.astype("float64") - result = df.agg(func, axis=axis) + warn = None if isinstance(func, str) else FutureWarning + with tm.assert_produces_warning(warn, match="is currently using DataFrame.*"): + # GH#53425 + result = df.agg(func, axis=axis) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 3558377907931..2875e1ae80501 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -40,7 +40,7 @@ def dummy_func(x): def test_agg_regression1(tsframe): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.agg(np.mean) + result = grouped.agg("mean") expected = grouped.mean() tm.assert_frame_equal(result, expected) @@ -141,8 +141,8 @@ def test_agg_apply_corner(ts, tsframe): # groupby float64 values results in a float64 Index exp = Series([], dtype=np.float64, index=Index([], dtype=np.float64)) tm.assert_series_equal(grouped.sum(), exp) - tm.assert_series_equal(grouped.agg(np.sum), exp) - tm.assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) + tm.assert_series_equal(grouped.agg("sum"), exp) + tm.assert_series_equal(grouped.apply("sum"), exp, check_index_type=False) # DataFrame grouped = tsframe.groupby(tsframe["A"] * np.nan, group_keys=False) @@ -152,7 +152,7 @@ def test_agg_apply_corner(ts, tsframe): index=Index([], name="A", dtype=np.float64), ) tm.assert_frame_equal(grouped.sum(), exp_df) - tm.assert_frame_equal(grouped.agg(np.sum), exp_df) + tm.assert_frame_equal(grouped.agg("sum"), exp_df) msg = "The behavior of DataFrame.sum with axis=None is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): @@ -167,13 +167,13 @@ def test_agg_grouping_is_list_tuple(ts): grouper = grouped.grouper.groupings[0].grouping_vector grouped.grouper.groupings[0] = Grouping(ts.index, list(grouper)) - result = grouped.agg(np.mean) + result = grouped.agg("mean") expected = grouped.mean() tm.assert_frame_equal(result, expected) grouped.grouper.groupings[0] = Grouping(ts.index, tuple(grouper)) - result = grouped.agg(np.mean) + result = grouped.agg("mean") expected = grouped.mean() tm.assert_frame_equal(result, expected) @@ -181,7 +181,7 @@ def test_agg_grouping_is_list_tuple(ts): def test_agg_python_multiindex(mframe): grouped = mframe.groupby(["A", "B"]) - result = grouped.agg(np.mean) + result = grouped.agg("mean") expected = grouped.mean() tm.assert_frame_equal(result, expected) @@ -348,7 +348,9 @@ def func(ser): def test_agg_multiple_functions_maintain_order(df): # GH #610 funcs = 
[("mean", np.mean), ("max", np.max), ("min", np.min)] - result = df.groupby("A")["C"].agg(funcs) + msg = "is currently using SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A")["C"].agg(funcs) exp_cols = Index(["mean", "max", "min"]) tm.assert_index_equal(result.columns, exp_cols) @@ -428,20 +430,20 @@ def test_multiple_functions_tuples_and_non_tuples(df): def test_more_flexible_frame_multi_function(df): grouped = df.groupby("A") - exmean = grouped.agg({"C": np.mean, "D": np.mean}) - exstd = grouped.agg({"C": np.std, "D": np.std}) + exmean = grouped.agg({"C": "mean", "D": "mean"}) + exstd = grouped.agg({"C": "std", "D": "std"}) expected = concat([exmean, exstd], keys=["mean", "std"], axis=1) expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1) - d = {"C": [np.mean, np.std], "D": [np.mean, np.std]} + d = {"C": ["mean", "std"], "D": ["mean", "std"]} result = grouped.aggregate(d) tm.assert_frame_equal(result, expected) # be careful - result = grouped.aggregate({"C": np.mean, "D": [np.mean, np.std]}) - expected = grouped.aggregate({"C": np.mean, "D": [np.mean, np.std]}) + result = grouped.aggregate({"C": "mean", "D": ["mean", "std"]}) + expected = grouped.aggregate({"C": "mean", "D": ["mean", "std"]}) tm.assert_frame_equal(result, expected) def numpymean(x): @@ -453,11 +455,11 @@ def numpystd(x): # this uses column selection & renaming msg = r"nested renamer is not supported" with pytest.raises(SpecificationError, match=msg): - d = {"C": np.mean, "D": {"foo": np.mean, "bar": np.std}} + d = {"C": "mean", "D": {"foo": "mean", "bar": "std"}} grouped.aggregate(d) # But without renaming, these functions are OK - d = {"C": [np.mean], "D": [numpymean, numpystd]} + d = {"C": ["mean"], "D": [numpymean, numpystd]} grouped.aggregate(d) @@ -774,8 +776,8 @@ def test_agg_relabel(self): p98 = functools.partial(np.percentile, q=98) result = df.groupby("group").agg( b_min=("B", "min"), - a_min=("A", min), - a_mean=("A", np.mean), + a_min=("A", "min"), + a_mean=("A", "mean"), a_max=("A", "max"), b_max=("B", "max"), a_98=("A", p98), @@ -880,16 +882,16 @@ def test_mangled(self): [ ( (("y", "A"), "max"), - (("y", "A"), np.min), + (("y", "A"), np.mean), (("y", "B"), "mean"), [1, 3], - [0, 2], + [0.5, 2.5], [5.5, 7.5], ), ( (("y", "A"), lambda x: max(x)), (("y", "A"), lambda x: 1), - (("y", "B"), "mean"), + (("y", "B"), np.mean), [1, 3], [1, 1], [5.5, 7.5], @@ -918,9 +920,11 @@ def test_agg_relabel_multiindex_column( expected = DataFrame({"a_max": [1, 3]}, index=idx) tm.assert_frame_equal(result, expected) - result = df.groupby(("x", "group")).agg( - col_1=agg_col1, col_2=agg_col2, col_3=agg_col3 - ) + msg = "is currently using SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(("x", "group")).agg( + col_1=agg_col1, col_2=agg_col2, col_3=agg_col3 + ) expected = DataFrame( {"col_1": agg_result1, "col_2": agg_result2, "col_3": agg_result3}, index=idx ) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 2fb7c8eb03bb0..873e3e73c7cf5 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -21,6 +21,7 @@ bdate_range, ) import pandas._testing as tm +import pandas.core.common as com @pytest.mark.parametrize( @@ -84,7 +85,10 @@ def test_cython_agg_boolean(): } ) result = frame.groupby("a")["b"].mean() - expected = frame.groupby("a")["b"].agg(np.mean) + msg = "using 
SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + expected = frame.groupby("a")["b"].agg(np.mean) tm.assert_series_equal(result, expected) @@ -159,7 +163,10 @@ def test_cython_fail_agg(): grouped = ts.groupby(lambda x: x.month) summed = grouped.sum() - expected = grouped.agg(np.sum) + msg = "using SeriesGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + expected = grouped.agg(np.sum) tm.assert_series_equal(summed, expected) @@ -182,7 +189,11 @@ def test__cython_agg_general(op, targop): labels = np.random.randint(0, 50, size=1000).astype(float) result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True) - expected = df.groupby(labels).agg(targop) + warn = FutureWarning if targop in com._cython_table else None + msg = f"using DataFrameGroupBy.{op}" + with tm.assert_produces_warning(warn, match=msg): + # GH#53425 + expected = df.groupby(labels).agg(targop) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index aad1218190a84..8772e3cfb45f4 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -77,10 +77,10 @@ def test_agg_datetimes_mixed(): ) df1["weights"] = df1["value"] / df1["value"].sum() - gb1 = df1.groupby("date").aggregate(np.sum) + gb1 = df1.groupby("date").aggregate("sum") df2["weights"] = df1["value"] / df1["value"].sum() - gb2 = df2.groupby("date").aggregate(np.sum) + gb2 = df2.groupby("date").aggregate("sum") assert len(gb1) == len(gb2) @@ -191,12 +191,12 @@ def test_aggregate_api_consistency(): expected.columns = ["sum", "mean"] tm.assert_frame_equal(result, expected, check_like=True) - result = grouped.agg([np.sum, np.mean]) + result = grouped.agg(["sum", "mean"]) expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]]) tm.assert_frame_equal(result, expected, check_like=True) - result = grouped[["D", "C"]].agg([np.sum, np.mean]) + result = grouped[["D", "C"]].agg(["sum", "mean"]) expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]]) tm.assert_frame_equal(result, expected, check_like=True) @@ -211,7 +211,7 @@ def test_aggregate_api_consistency(): msg = r"Column\(s\) \['r', 'r2'\] do not exist" with pytest.raises(KeyError, match=msg): - grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean}) + grouped[["D", "C"]].agg({"r": "sum", "r2": "mean"}) def test_agg_dict_renaming_deprecation(): @@ -299,7 +299,7 @@ def test_series_agg_multikey(): ts = tm.makeTimeSeries() grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.agg(np.sum) + result = grouped.agg("sum") expected = grouped.sum() tm.assert_series_equal(result, expected) @@ -406,9 +406,12 @@ def __call__(self, x): fn_class(), ] - expected = df.groupby("foo").agg(sum) + expected = df.groupby("foo").agg("sum") for ecall in equiv_callables: - result = df.groupby("foo").agg(ecall) + warn = FutureWarning if ecall is sum or ecall is np.sum else None + msg = "using DataFrameGroupBy.sum" + with tm.assert_produces_warning(warn, match=msg): + result = df.groupby("foo").agg(ecall) tm.assert_frame_equal(result, expected) @@ -476,7 +479,7 @@ def test_agg_timezone_round_trip(): ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific") df = DataFrame({"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in 
range(10)]}) - result1 = df.groupby("a")["b"].agg(np.min).iloc[0] + result1 = df.groupby("a")["b"].agg("min").iloc[0] result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0] result3 = df.groupby("a")["b"].min().iloc[0] @@ -580,7 +583,9 @@ def test_agg_category_nansum(observed): df = DataFrame( {"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]} ) - result = df.groupby("A", observed=observed).B.agg(np.nansum) + msg = "using SeriesGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A", observed=observed).B.agg(np.nansum) expected = Series( [3, 3, 0], index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"), diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index b25950192018d..3ab62bb7656b7 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -135,23 +135,33 @@ def f(x): df = DataFrame({"a": [5, 15, 25]}) c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) - result = df.a.groupby(c, observed=False).transform(sum) + msg = "using SeriesGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df["a"]) tm.assert_series_equal( df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"] ) - tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]]) + msg = "using DataFrameGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + result = df.groupby(c, observed=False).transform(sum) + expected = df[["a"]] + tm.assert_frame_equal(result, expected) gbc = df.groupby(c, observed=False) result = gbc.transform(lambda xs: np.max(xs, axis=0)) tm.assert_frame_equal(result, df[["a"]]) - with tm.assert_produces_warning(None): - result2 = gbc.transform(lambda xs: np.max(xs, axis=0)) + result2 = gbc.transform(lambda xs: np.max(xs, axis=0)) + msg = "using DataFrameGroupBy.max" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 result3 = gbc.transform(max) - result4 = gbc.transform(np.maximum.reduce) - result5 = gbc.transform(lambda xs: np.maximum.reduce(xs)) + result4 = gbc.transform(np.maximum.reduce) + result5 = gbc.transform(lambda xs: np.maximum.reduce(xs)) tm.assert_frame_equal(result2, df[["a"]], check_dtype=False) tm.assert_frame_equal(result3, df[["a"]], check_dtype=False) tm.assert_frame_equal(result4, df[["a"]]) @@ -165,13 +175,22 @@ def f(x): df = DataFrame({"a": [5, 15, 25, -5]}) c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) - result = df.a.groupby(c, observed=False).transform(sum) + msg = "using SeriesGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df["a"]) tm.assert_series_equal( df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"] ) - tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]]) + msg = "using DataFrameGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + result = df.groupby(c, observed=False).transform(sum) + expected = df[["a"]] + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal( df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]] ) @@ -294,7 +313,10 @@ def test_apply(ordered): result = grouped.mean() tm.assert_frame_equal(result, expected) - result = 
grouped.agg(np.mean) + msg = "using DataFrameGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + result = grouped.agg(np.mean) tm.assert_frame_equal(result, expected) # but for transform we should still get back the original index @@ -1216,7 +1238,10 @@ def test_seriesgroupby_observed_true(df_cat, operation): expected = Series(data=[2, 4, 1, 3], index=index, name="C").sort_index() grouped = df_cat.groupby(["A", "B"], observed=True)["C"] - result = getattr(grouped, operation)(sum) + msg = "using np.sum" if operation == "apply" else "using SeriesGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + result = getattr(grouped, operation)(sum) tm.assert_series_equal(result, expected) @@ -1239,7 +1264,10 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): with tm.assert_produces_warning(FutureWarning, match=msg): expected = expected.fillna(0, downcast="infer") grouped = df_cat.groupby(["A", "B"], observed=observed)["C"] - result = getattr(grouped, operation)(sum) + msg = "using SeriesGroupBy.sum" if operation == "agg" else "using np.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + result = getattr(grouped, operation)(sum) tm.assert_series_equal(result, expected) @@ -1664,7 +1692,10 @@ def test_categorical_transform(): categories=["Waiting", "OnTheWay", "Delivered"], ordered=True ) df["status"] = df["status"].astype(delivery_status_type) - df["last_status"] = df.groupby("package_id")["status"].transform(max) + msg = "using SeriesGroupBy.max" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + df["last_status"] = df.groupby("package_id")["status"].transform(max) result = df.copy() expected = DataFrame( diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 090ed37d7d1b2..e3a5d308c4346 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -57,8 +57,14 @@ def test_intercept_builtin_sum(): s = Series([1.0, 2.0, np.nan, 3.0]) grouped = s.groupby([0, 1, 2, 2]) - result = grouped.agg(builtins.sum) - result2 = grouped.apply(builtins.sum) + msg = "using SeriesGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + result = grouped.agg(builtins.sum) + msg = "using np.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + result2 = grouped.apply(builtins.sum) expected = grouped.sum() tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) @@ -78,7 +84,10 @@ def test_builtins_apply(keys, f): warn = None if f is not sum else FutureWarning msg = "The behavior of DataFrame.sum with axis=None is deprecated" - with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): + with tm.assert_produces_warning( + warn, match=msg, check_stacklevel=False, raise_on_extra_warnings=False + ): + # Also warns on deprecation GH#53425 result = gb.apply(f) ngroups = len(df.drop_duplicates(subset=keys)) @@ -370,11 +379,15 @@ def test_cython_median(): labels[::17] = np.nan result = df.groupby(labels).median() - exp = df.groupby(labels).agg(np.nanmedian) + msg = "using DataFrameGroupBy.median" + with tm.assert_produces_warning(FutureWarning, match=msg): + exp = df.groupby(labels).agg(np.nanmedian) tm.assert_frame_equal(result, exp) df = DataFrame(np.random.randn(1000, 5)) - rs = df.groupby(labels).agg(np.median) + msg = "using DataFrameGroupBy.median" + with tm.assert_produces_warning(FutureWarning, 
match=msg): + rs = df.groupby(labels).agg(np.median) xp = df.groupby(labels).median() tm.assert_frame_equal(rs, xp) @@ -682,7 +695,10 @@ def test_ops_general(op, targop): labels = np.random.randint(0, 50, size=1000).astype(float) result = getattr(df.groupby(labels), op)() - expected = df.groupby(labels).agg(targop) + warn = None if op in ("first", "last", "count", "sem") else FutureWarning + msg = f"using DataFrameGroupBy.{op}" + with tm.assert_produces_warning(warn, match=msg): + expected = df.groupby(labels).agg(targop) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 79fc631fff87c..ca3fec8a99555 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -78,12 +78,21 @@ def test_basic_aggregations(dtype): for k, v in grouped: assert len(v) == 3 - agged = grouped.aggregate(np.mean) + msg = "using SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + agged = grouped.aggregate(np.mean) assert agged[1] == 1 - tm.assert_series_equal(agged, grouped.agg(np.mean)) # shorthand + msg = "using SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = grouped.agg(np.mean) + tm.assert_series_equal(agged, expected) # shorthand tm.assert_series_equal(agged, grouped.mean()) - tm.assert_series_equal(grouped.agg(np.sum), grouped.sum()) + result = grouped.sum() + msg = "using SeriesGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = grouped.agg(np.sum) + tm.assert_series_equal(result, expected) expected = grouped.apply(lambda x: x * x.sum()) transformed = grouped.transform(lambda x: x * x.sum()) @@ -91,12 +100,15 @@ def test_basic_aggregations(dtype): tm.assert_series_equal(transformed, expected) value_grouped = data.groupby(data) - tm.assert_series_equal( - value_grouped.aggregate(np.mean), agged, check_index_type=False - ) + msg = "using SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = value_grouped.aggregate(np.mean) + tm.assert_series_equal(result, agged, check_index_type=False) # complex agg - agged = grouped.aggregate([np.mean, np.std]) + msg = "using SeriesGroupBy.[mean|std]" + with tm.assert_produces_warning(FutureWarning, match=msg): + agged = grouped.aggregate([np.mean, np.std]) msg = r"nested renamer is not supported" with pytest.raises(SpecificationError, match=msg): @@ -422,14 +434,14 @@ def test_frame_groupby(tsframe): grouped = tsframe.groupby(lambda x: x.weekday()) # aggregate - aggregated = grouped.aggregate(np.mean) + aggregated = grouped.aggregate("mean") assert len(aggregated) == 5 assert len(aggregated.columns) == 4 # by string tscopy = tsframe.copy() tscopy["weekday"] = [x.weekday() for x in tscopy.index] - stragged = tscopy.groupby("weekday").aggregate(np.mean) + stragged = tscopy.groupby("weekday").aggregate("mean") tm.assert_frame_equal(stragged, aggregated, check_names=False) # transform @@ -465,7 +477,7 @@ def test_frame_groupby_columns(tsframe): grouped = tsframe.groupby(mapping, axis=1) # aggregate - aggregated = grouped.aggregate(np.mean) + aggregated = grouped.aggregate("mean") assert len(aggregated) == len(tsframe) assert len(aggregated.columns) == 2 @@ -490,22 +502,22 @@ def test_frame_set_name_single(df): result = df.groupby("A", as_index=False).mean(numeric_only=True) assert result.index.name != "A" - result = grouped[["C", "D"]].agg(np.mean) + result = grouped[["C", "D"]].agg("mean") assert result.index.name 
== "A" - result = grouped.agg({"C": np.mean, "D": np.std}) + result = grouped.agg({"C": "mean", "D": "std"}) assert result.index.name == "A" result = grouped["C"].mean() assert result.index.name == "A" - result = grouped["C"].agg(np.mean) + result = grouped["C"].agg("mean") assert result.index.name == "A" - result = grouped["C"].agg([np.mean, np.std]) + result = grouped["C"].agg(["mean", "std"]) assert result.index.name == "A" msg = r"nested renamer is not supported" with pytest.raises(SpecificationError, match=msg): - grouped["C"].agg({"foo": np.mean, "bar": np.std}) + grouped["C"].agg({"foo": "mean", "bar": "std"}) def test_multi_func(df): @@ -533,14 +545,14 @@ def test_multi_func(df): ) # only verify that it works for now grouped = df.groupby(["k1", "k2"]) - grouped.agg(np.sum) + grouped.agg("sum") def test_multi_key_multiple_functions(df): grouped = df.groupby(["A", "B"])["C"] - agged = grouped.agg([np.mean, np.std]) - expected = DataFrame({"mean": grouped.agg(np.mean), "std": grouped.agg(np.std)}) + agged = grouped.agg(["mean", "std"]) + expected = DataFrame({"mean": grouped.agg("mean"), "std": grouped.agg("std")}) tm.assert_frame_equal(agged, expected) @@ -580,7 +592,7 @@ def test_frame_multi_key_function_list(): ) grouped = data.groupby(["A", "B"]) - funcs = [np.mean, np.std] + funcs = ["mean", "std"] agged = grouped.agg(funcs) expected = pd.concat( [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)], @@ -641,7 +653,7 @@ def test_frame_multi_key_function_list_partial_failure(): ) grouped = data.groupby(["A", "B"]) - funcs = [np.mean, np.std] + funcs = ["mean", "std"] msg = re.escape("agg function failed [how->mean,dtype->object]") with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -722,11 +734,11 @@ def test_groupby_as_index_agg(df): # single-key - result = grouped[["C", "D"]].agg(np.mean) + result = grouped[["C", "D"]].agg("mean") expected = grouped.mean(numeric_only=True) tm.assert_frame_equal(result, expected) - result2 = grouped.agg({"C": np.mean, "D": np.sum}) + result2 = grouped.agg({"C": "mean", "D": "sum"}) expected2 = grouped.mean(numeric_only=True) expected2["D"] = grouped.sum()["D"] tm.assert_frame_equal(result2, expected2) @@ -735,17 +747,17 @@ def test_groupby_as_index_agg(df): msg = r"nested renamer is not supported" with pytest.raises(SpecificationError, match=msg): - grouped["C"].agg({"Q": np.sum}) + grouped["C"].agg({"Q": "sum"}) # multi-key grouped = df.groupby(["A", "B"], as_index=False) - result = grouped.agg(np.mean) + result = grouped.agg("mean") expected = grouped.mean() tm.assert_frame_equal(result, expected) - result2 = grouped.agg({"C": np.mean, "D": np.sum}) + result2 = grouped.agg({"C": "mean", "D": "sum"}) expected2 = grouped.mean() expected2["D"] = grouped.sum()["D"] tm.assert_frame_equal(result2, expected2) @@ -754,7 +766,7 @@ def test_groupby_as_index_agg(df): expected3 = DataFrame(expected3).rename(columns={"C": "Q"}) msg = "Passing a dictionary to SeriesGroupBy.agg is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): - result3 = grouped["C"].agg({"Q": np.sum}) + result3 = grouped["C"].agg({"Q": "sum"}) tm.assert_frame_equal(result3, expected3) # GH7115 & GH8112 & GH8582 @@ -817,13 +829,13 @@ def test_as_index_series_return_frame(df): grouped = df.groupby("A", as_index=False) grouped2 = df.groupby(["A", "B"], as_index=False) - result = grouped["C"].agg(np.sum) - expected = grouped.agg(np.sum).loc[:, ["A", "C"]] + result = grouped["C"].agg("sum") + expected = grouped.agg("sum").loc[:, ["A", 
"C"]] assert isinstance(result, DataFrame) tm.assert_frame_equal(result, expected) - result2 = grouped2["C"].agg(np.sum) - expected2 = grouped2.agg(np.sum).loc[:, ["A", "B", "C"]] + result2 = grouped2["C"].agg("sum") + expected2 = grouped2.agg("sum").loc[:, ["A", "B", "C"]] assert isinstance(result2, DataFrame) tm.assert_frame_equal(result2, expected2) @@ -928,7 +940,7 @@ def test_raises_on_nuisance(df): grouped = df.groupby("A") msg = re.escape("agg function failed [how->mean,dtype->object]") with pytest.raises(TypeError, match=msg): - grouped.agg(np.mean) + grouped.agg("mean") with pytest.raises(TypeError, match=msg): grouped.mean() @@ -937,7 +949,7 @@ def test_raises_on_nuisance(df): grouped = df.groupby("A") msg = "datetime64 type does not support sum operations" with pytest.raises(TypeError, match=msg): - grouped.agg(np.sum) + grouped.agg("sum") with pytest.raises(TypeError, match=msg): grouped.sum() @@ -1009,7 +1021,7 @@ def test_raise_on_nuisance_python_multiple(three_group): grouped = three_group.groupby(["A", "B"]) msg = re.escape("agg function failed [how->mean,dtype->object]") with pytest.raises(TypeError, match=msg): - grouped.agg(np.mean) + grouped.agg("mean") with pytest.raises(TypeError, match=msg): grouped.mean() @@ -1027,13 +1039,13 @@ def test_empty_groups_corner(mframe): ) grouped = df.groupby(["k1", "k2"]) - result = grouped[["v1", "v2"]].agg(np.mean) + result = grouped[["v1", "v2"]].agg("mean") expected = grouped.mean(numeric_only=True) tm.assert_frame_equal(result, expected) grouped = mframe[3:5].groupby(level=0) agged = grouped.apply(lambda x: x.mean()) - agged_A = grouped["A"].apply(np.mean) + agged_A = grouped["A"].apply("mean") tm.assert_series_equal(agged["A"], agged_A) assert agged.index.name == "first" @@ -1052,8 +1064,8 @@ def test_wrap_aggregated_output_multindex(mframe): keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] msg = re.escape("agg function failed [how->mean,dtype->object]") with pytest.raises(TypeError, match=msg): - df.groupby(keys).agg(np.mean) - agged = df.drop(columns=("baz", "two")).groupby(keys).agg(np.mean) + df.groupby(keys).agg("mean") + agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean") assert isinstance(agged.columns, MultiIndex) def aggfun(ser): @@ -1201,7 +1213,7 @@ def test_groupby_with_hier_columns(): result = gb.mean() tm.assert_index_equal(result.index, df.index) - result = df.groupby(level=0).agg(np.mean) + result = df.groupby(level=0).agg("mean") tm.assert_index_equal(result.columns, columns) result = df.groupby(level=0).apply(lambda x: x.mean()) @@ -1242,7 +1254,7 @@ def test_groupby_wrong_multi_labels(): grouped = data.groupby(["foo", "bar", "baz", "spam"]) - result = grouped.agg(np.mean) + result = grouped.agg("mean") expected = grouped.mean() tm.assert_frame_equal(result, expected) @@ -1602,7 +1614,7 @@ def test_no_nonsense_name(float_frame): s = float_frame["C"].copy() s.name = None - result = s.groupby(float_frame["A"]).agg(np.sum) + result = s.groupby(float_frame["A"]).agg("sum") assert result.name is None diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index ab268a1d94b96..03e3086b8c847 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -231,7 +231,7 @@ def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs): ["A", "B", 1, 1, 1.0], ] df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) - agg_dict = {"c": sum, "d": max, "e": "min"} + agg_dict = {"c": "sum", "d": "max", 
"e": "min"} grouped = df.groupby(["a", "b"], dropna=dropna).agg(agg_dict) mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) @@ -278,7 +278,7 @@ def test_groupby_dropna_datetime_like_data( else: indexes = [datetime1, datetime2, np.nan] - grouped = df.groupby("dt", dropna=dropna).agg({"values": sum}) + grouped = df.groupby("dt", dropna=dropna).agg({"values": "sum"}) expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt")) tm.assert_frame_equal(grouped, expected) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 41b7dde4bf631..1e9c4b446c4d0 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -63,7 +63,7 @@ def test_column_select_via_attr(self, df): df["mean"] = 1.5 result = df.groupby("A").mean(numeric_only=True) - expected = df.groupby("A")[["C", "D", "mean"]].agg(np.mean) + expected = df.groupby("A")[["C", "D", "mean"]].agg("mean") tm.assert_frame_equal(result, expected) def test_getitem_list_of_columns(self): @@ -399,15 +399,15 @@ def test_groupby_grouper(self, df): def test_groupby_dict_mapping(self): # GH #679 s = Series({"T1": 5}) - result = s.groupby({"T1": "T2"}).agg(sum) - expected = s.groupby(["T2"]).agg(sum) + result = s.groupby({"T1": "T2"}).agg("sum") + expected = s.groupby(["T2"]).agg("sum") tm.assert_series_equal(result, expected) s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd")) mapping = {"a": 0, "b": 0, "c": 1, "d": 1} result = s.groupby(mapping).mean() - result2 = s.groupby(mapping).agg(np.mean) + result2 = s.groupby(mapping).agg("mean") exp_key = np.array([0, 0, 1, 1], dtype=np.int64) expected = s.groupby(exp_key).mean() expected2 = s.groupby(exp_key).mean() diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 180755c1dca12..a3fa5bf794030 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -229,7 +229,12 @@ def test_groupby_raises_string_np( ), }[groupby_func_np] - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + if groupby_series: + warn_msg = "using SeriesGroupBy.[sum|mean]" + else: + warn_msg = "using DataFrameGroupBy.[sum|mean]" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + _call_and_check(klass, msg, how, gb, groupby_func_np, ()) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -333,7 +338,12 @@ def test_groupby_raises_datetime_np( np.mean: (None, ""), }[groupby_func_np] - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + if groupby_series: + warn_msg = "using SeriesGroupBy.[sum|mean]" + else: + warn_msg = "using DataFrameGroupBy.[sum|mean]" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + _call_and_check(klass, msg, how, gb, groupby_func_np, ()) @pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "var"]) @@ -526,7 +536,12 @@ def test_groupby_raises_category_np( ), }[groupby_func_np] - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + if groupby_series: + warn_msg = "using SeriesGroupBy.[sum|mean]" + else: + warn_msg = "using DataFrameGroupBy.[sum|mean]" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + _call_and_check(klass, msg, how, gb, groupby_func_np, ()) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 04b99939514e6..60c35064d9aa7 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ 
b/pandas/tests/groupby/test_timegrouper.py @@ -725,7 +725,7 @@ def test_groupby_datetime64_32_bit(self): # 32-bit under 1.9-dev indexing issue df = DataFrame({"A": range(2), "B": [Timestamp("2000-01-1")] * 2}) - result = df.groupby("A")["B"].transform(min) + result = df.groupby("A")["B"].transform("min") expected = Series([Timestamp("2000-01-1")] * 2, name="B") tm.assert_series_equal(result, expected) @@ -918,11 +918,11 @@ def test_groupby_agg_numba_timegrouper_with_nat( lambda values, index: np.nanmean(values), engine="numba" ) - expected = gb["Quantity"].aggregate(np.nanmean) + expected = gb["Quantity"].aggregate("mean") tm.assert_series_equal(result, expected) result_df = gb[["Quantity"]].aggregate( lambda values, index: np.nanmean(values), engine="numba" ) - expected_df = gb[["Quantity"]].aggregate(np.nanmean) + expected_df = gb[["Quantity"]].aggregate("mean") tm.assert_frame_equal(result_df, expected_df) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index cf41b4ff57331..c84ee3114b71f 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -75,7 +75,9 @@ def demean(arr): # GH 9700 df = DataFrame({"a": range(5, 10), "b": range(5)}) - result = df.groupby("a").transform(max) + msg = "using DataFrameGroupBy.max" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("a").transform(max) expected = DataFrame({"b": range(5)}) tm.assert_frame_equal(result, expected) @@ -88,7 +90,9 @@ def test_transform_fast(): values = np.repeat(grp.mean().values, ensure_platform_int(grp.count().values)) expected = Series(values, index=df.index, name="val") - result = grp.transform(np.mean) + msg = "using SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grp.transform(np.mean) tm.assert_series_equal(result, expected) result = grp.transform("mean") @@ -132,14 +136,18 @@ def test_transform_fast(): def test_transform_broadcast(tsframe, ts): grouped = ts.groupby(lambda x: x.month) - result = grouped.transform(np.mean) + msg = "using SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.transform(np.mean) tm.assert_index_equal(result.index, ts.index) for _, gp in grouped: assert_fp_equal(result.reindex(gp.index), gp.mean()) grouped = tsframe.groupby(lambda x: x.month) - result = grouped.transform(np.mean) + msg = "using DataFrameGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.transform(np.mean) tm.assert_index_equal(result.index, tsframe.index) for _, gp in grouped: agged = gp.mean(axis=0) @@ -151,7 +159,9 @@ def test_transform_broadcast(tsframe, ts): msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): grouped = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) - result = grouped.transform(np.mean) + msg = "using DataFrameGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.transform(np.mean) tm.assert_index_equal(result.index, tsframe.index) tm.assert_index_equal(result.columns, tsframe.columns) for _, gp in grouped: @@ -348,7 +358,10 @@ def test_transform_multiple(ts): grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) grouped.transform(lambda x: x * 2) - grouped.transform(np.mean) + + msg = "using SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped.transform(np.mean) def 
test_dispatch_transform(tsframe): @@ -464,11 +477,15 @@ def test_transform_nuisance_raises(df): def test_transform_function_aliases(df): result = df.groupby("A").transform("mean", numeric_only=True) - expected = df.groupby("A")[["C", "D"]].transform(np.mean) + msg = "using DataFrameGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("A")[["C", "D"]].transform(np.mean) tm.assert_frame_equal(result, expected) result = df.groupby("A")["C"].transform("mean") - expected = df.groupby("A")["C"].transform(np.mean) + msg = "using SeriesGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("A")["C"].transform(np.mean) tm.assert_series_equal(result, expected) @@ -496,12 +513,14 @@ def test_transform_length(): def nsum(x): return np.nansum(x) - results = [ - df.groupby("col1").transform(sum)["col2"], - df.groupby("col1")["col2"].transform(sum), - df.groupby("col1").transform(nsum)["col2"], - df.groupby("col1")["col2"].transform(nsum), - ] + msg = "using DataFrameGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + results = [ + df.groupby("col1").transform(sum)["col2"], + df.groupby("col1")["col2"].transform(sum), + df.groupby("col1").transform(nsum)["col2"], + df.groupby("col1")["col2"].transform(nsum), + ] for result in results: tm.assert_series_equal(result, expected, check_names=False) @@ -513,7 +532,9 @@ def test_transform_coercion(): df = DataFrame({"A": ["a", "a", "b", "b"], "B": [0, 1, 3, 4]}) g = df.groupby("A") - expected = g.transform(np.mean) + msg = "using DataFrameGroupBy.mean" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.transform(np.mean) result = g.transform(lambda x: np.mean(x, axis=0)) tm.assert_frame_equal(result, expected) @@ -584,7 +605,9 @@ def test_groupby_transform_with_int(): def test_groupby_transform_with_nan_group(): # GH 9941 df = DataFrame({"a": range(10), "b": [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]}) - result = df.groupby(df.b)["a"].transform(max) + msg = "using SeriesGroupBy.max" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(df.b)["a"].transform(max) expected = Series([1.0, 1.0, 2.0, 3.0, np.nan, 6.0, 6.0, 9.0, 9.0, 9.0], name="a") tm.assert_series_equal(result, expected) @@ -1085,7 +1108,9 @@ def test_any_all_np_func(func): exp = Series([True, np.nan, True], name="val") - res = df.groupby("key")["val"].transform(func) + msg = "using SeriesGroupBy.[any|all]" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.groupby("key")["val"].transform(func) tm.assert_series_equal(res, exp) @@ -1115,7 +1140,10 @@ def test_groupby_transform_timezone_column(func): # GH 24198 ts = pd.to_datetime("now", utc=True).tz_convert("Asia/Singapore") result = DataFrame({"end_time": [ts], "id": [1]}) - result["max_end_time"] = result.groupby("id").end_time.transform(func) + warn = FutureWarning if not isinstance(func, str) else None + msg = "using SeriesGroupBy.[min|max]" + with tm.assert_produces_warning(warn, match=msg): + result["max_end_time"] = result.groupby("id").end_time.transform(func) expected = DataFrame([[ts, 1, ts]], columns=["end_time", "id", "max_end_time"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index b82afab49954d..e57d938f060df 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -299,7 +299,7 @@ def test_apply_to_empty_series(empty_series_dti, freq): return 
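# A minimal sketch of the behavior these resample tests migrate to
# (illustrative only; `ser` and the exact warning wording are assumptions,
# not taken from this patch). Passing a numpy reduction where a pandas
# method of the same name exists now emits a FutureWarning and dispatches
# to that method; the string alias is the quiet spelling:
#
#     ser = pd.Series([1, 2, 3, 4],
#                     index=pd.date_range("2013-01-01", periods=4, freq="s"))
#     ser.resample("2s").apply(np.sum)   # FutureWarning, suggests the alias
#     ser.resample("2s").apply("sum")    # same result, no warning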
result = ser.resample(freq, group_keys=False).apply(lambda x: 1) - expected = ser.resample(freq).apply(np.sum) + expected = ser.resample(freq).apply("sum") tm.assert_series_equal(result, expected, check_dtype=False) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 0f52f2d1c65ee..3c6f75bdcfc46 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -87,7 +87,7 @@ def test_custom_grouper(index, unit): expect = Series(arr, index=idx) # GH2763 - return input dtype if we can - result = g.agg(np.sum) + result = g.agg("sum") tm.assert_series_equal(result, expect) @@ -95,7 +95,7 @@ def test_custom_grouper_df(index, unit): b = Grouper(freq=Minute(5), closed="right", label="right") dti = index.as_unit(unit) df = DataFrame(np.random.rand(len(dti), 10), index=dti, dtype="float64") - r = df.groupby(b).agg(np.sum) + r = df.groupby(b).agg("sum") assert len(r.columns) == 10 assert len(r.index) == 2593 @@ -1847,7 +1847,9 @@ def test_resample_apply_product(duplicates, unit): if duplicates: df.columns = ["A", "A"] - result = df.resample("Q").apply(np.prod) + msg = "using DatetimeIndexResampler.prod" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.resample("Q").apply(np.prod) expected = DataFrame( np.array([[0, 24], [60, 210], [336, 720], [990, 1716]], dtype=np.int64), index=DatetimeIndex( diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 9c3ccd96a8d59..20b997bdca873 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -524,7 +524,7 @@ def test_resample_tz_localized(self): ) result = ( ts.resample("A") - .agg({"first": np.sum, "second": np.mean}) + .agg({"first": "sum", "second": "mean"}) .reindex(columns=["first", "second"]) ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index dbd28868b81b1..6aa59d8b3d164 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -401,6 +401,7 @@ def test_agg(): expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) + msg = "using SeriesGroupBy.[mean|std]" for t in cases: # In case 2, "date" is an index and a column, so get included in the agg if t == cases[2]: @@ -410,21 +411,26 @@ def test_agg(): exp.columns = pd.MultiIndex.from_product( [["date", "A", "B"], ["mean", "std"]] ) - result = t.aggregate([np.mean, np.std]) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = t.aggregate([np.mean, np.std]) tm.assert_frame_equal(result, exp) else: - result = t.aggregate([np.mean, np.std]) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = t.aggregate([np.mean, np.std]) tm.assert_frame_equal(result, expected) expected = pd.concat([a_mean, b_std], axis=1) for t in cases: - result = t.aggregate({"A": np.mean, "B": np.std}) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = t.aggregate({"A": np.mean, "B": np.std}) tm.assert_frame_equal(result, expected, check_like=True) - result = t.aggregate(A=("A", np.mean), B=("B", np.std)) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = t.aggregate(A=("A", np.mean), B=("B", np.std)) tm.assert_frame_equal(result, expected, check_like=True) - result = t.aggregate(A=NamedAgg("A", np.mean), 
B=NamedAgg("B", np.std)) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = t.aggregate(A=NamedAgg("A", np.mean), B=NamedAgg("B", np.std)) tm.assert_frame_equal(result, expected, check_like=True) expected = pd.concat([a_mean, a_std], axis=1) @@ -501,18 +507,22 @@ def test_agg_misc(): ] # passed lambda + msg = "using SeriesGroupBy.sum" for t in cases: - result = t.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = t.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) rcustom = t["B"].apply(lambda x: np.std(x, ddof=1)) expected = pd.concat([r["A"].sum(), rcustom], axis=1) tm.assert_frame_equal(result, expected, check_like=True) - result = t.agg(A=("A", np.sum), B=("B", lambda x: np.std(x, ddof=1))) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = t.agg(A=("A", np.sum), B=("B", lambda x: np.std(x, ddof=1))) tm.assert_frame_equal(result, expected, check_like=True) - result = t.agg( - A=NamedAgg("A", np.sum), B=NamedAgg("B", lambda x: np.std(x, ddof=1)) - ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = t.agg( + A=NamedAgg("A", np.sum), B=NamedAgg("B", lambda x: np.std(x, ddof=1)) + ) tm.assert_frame_equal(result, expected, check_like=True) # agg with renamers diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index a5fb48f801522..2cd47296d5cab 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -54,7 +54,9 @@ def test_count(test_series): def test_numpy_reduction(test_series): result = test_series.resample("A", closed="right").prod() - expected = test_series.groupby(lambda x: x.year).agg(np.prod) + msg = "using SeriesGroupBy.prod" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = test_series.groupby(lambda x: x.year).agg(np.prod) expected.index = result.index tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index ffdff75e53cf7..179748f0506b5 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -425,7 +425,7 @@ def test_join_hierarchical_mixed_raises(self): # GH 2024 # GH 40993: For raising, enforced in 2.0 df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"]) - new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]}) + new_df = df.groupby(["a"]).agg({"b": ["mean", "sum"]}) other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"]) other_df.set_index("a", inplace=True) # GH 9455, 12219 diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 1bcc86c4908a2..382c102f1194f 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -212,13 +212,13 @@ def test_crosstab_pass_values(self): values = np.random.randn(100) table = crosstab( - [a, b], c, values, aggfunc=np.sum, rownames=["foo", "bar"], colnames=["baz"] + [a, b], c, values, aggfunc="sum", rownames=["foo", "bar"], colnames=["baz"] ) df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values}) expected = df.pivot_table( - "values", index=["foo", "bar"], columns="baz", aggfunc=np.sum + "values", index=["foo", "bar"], columns="baz", aggfunc="sum" ) tm.assert_frame_equal(table, expected) @@ -452,9 +452,11 @@ def test_crosstab_normalize_arrays(self): index=Index([1, 2, "All"], name="a", dtype="object"), columns=Index([3, 4, "All"], name="b", 
dtype="object"), ) - test_case = crosstab( - df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True - ) + msg = "using DataFrameGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + test_case = crosstab( + df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True + ) tm.assert_frame_equal(test_case, norm_sum) def test_crosstab_with_empties(self, using_array_manager): @@ -655,14 +657,17 @@ def test_crosstab_normalize_multiple_columns(self): "E": [0] * 24, } ) - result = crosstab( - [df.A, df.B], - df.C, - values=df.D, - aggfunc=np.sum, - normalize=True, - margins=True, - ) + + msg = "using DataFrameGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = crosstab( + [df.A, df.B], + df.C, + values=df.D, + aggfunc=np.sum, + normalize=True, + margins=True, + ) expected = DataFrame( np.array([0] * 29 + [1], dtype=float).reshape(10, 3), columns=Index(["bar", "foo", "All"], dtype="object", name="C"), diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index b6fcb27faf146..1e122442cd40c 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -114,7 +114,7 @@ def test_pivot_table(self, observed, data): else: assert table.columns.name == columns[0] - expected = data.groupby(index + [columns])["D"].agg(np.mean).unstack() + expected = data.groupby(index + [columns])["D"].agg("mean").unstack() tm.assert_frame_equal(table, expected) def test_pivot_table_categorical_observed_equal(self, observed): @@ -124,7 +124,7 @@ def test_pivot_table_categorical_observed_equal(self, observed): ) expected = df.pivot_table( - index="col1", values="col3", columns="col2", aggfunc=np.sum, fill_value=0 + index="col1", values="col3", columns="col2", aggfunc="sum", fill_value=0 ) expected.index = expected.index.astype("category") @@ -137,7 +137,7 @@ def test_pivot_table_categorical_observed_equal(self, observed): index="col1", values="col3", columns="col2", - aggfunc=np.sum, + aggfunc="sum", fill_value=0, observed=observed, ) @@ -148,8 +148,8 @@ def test_pivot_table_nocols(self): df = DataFrame( {"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]} ) - rs = df.pivot_table(columns="cols", aggfunc=np.sum) - xp = df.pivot_table(index="cols", aggfunc=np.sum).T + rs = df.pivot_table(columns="cols", aggfunc="sum") + xp = df.pivot_table(index="cols", aggfunc="sum").T tm.assert_frame_equal(rs, xp) rs = df.pivot_table(columns="cols", aggfunc={"values": "mean"}) @@ -345,7 +345,7 @@ def test_pivot_table_multiple(self, data): index = ["A", "B"] columns = "C" table = pivot_table(data, index=index, columns=columns) - expected = data.groupby(index + [columns]).agg(np.mean).unstack() + expected = data.groupby(index + [columns]).agg("mean").unstack() tm.assert_frame_equal(table, expected) def test_pivot_dtypes(self): @@ -360,7 +360,7 @@ def test_pivot_dtypes(self): assert f.dtypes["v"] == "int64" z = pivot_table( - f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.sum + f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc="sum" ) result = z.dtypes expected = Series([np.dtype("int64")] * 2, index=Index(list("ab"), name="i")) @@ -377,7 +377,7 @@ def test_pivot_dtypes(self): assert f.dtypes["v"] == "float64" z = pivot_table( - f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.mean + f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc="mean" ) result = z.dtypes expected = Series([np.dtype("float64")] * 2, index=Index(list("ab"), 
name="i")) @@ -461,9 +461,9 @@ def test_pivot_multi_functions(self, data): f = lambda func: pivot_table( data, values=["D", "E"], index=["A", "B"], columns="C", aggfunc=func ) - result = f([np.mean, np.std]) - means = f(np.mean) - stds = f(np.std) + result = f(["mean", "std"]) + means = f("mean") + stds = f("std") expected = concat([means, stds], keys=["mean", "std"], axis=1) tm.assert_frame_equal(result, expected) @@ -476,9 +476,9 @@ def test_pivot_multi_functions(self, data): aggfunc=func, margins=True, ) - result = f([np.mean, np.std]) - means = f(np.mean) - stds = f(np.std) + result = f(["mean", "std"]) + means = f("mean") + stds = f("std") expected = concat([means, stds], keys=["mean", "std"], axis=1) tm.assert_frame_equal(result, expected) @@ -633,7 +633,7 @@ def test_pivot_tz_in_values(self): values="ts", index=["uid"], columns=[mins], - aggfunc=np.min, + aggfunc="min", ) expected = DataFrame( [ @@ -897,7 +897,7 @@ def _check_output( def test_margins(self, data): # column specified result = data.pivot_table( - values="D", index=["A", "B"], columns="C", margins=True, aggfunc=np.mean + values="D", index=["A", "B"], columns="C", margins=True, aggfunc="mean" ) self._check_output(result, "D", data) @@ -907,14 +907,14 @@ def test_margins(self, data): index=["A", "B"], columns="C", margins=True, - aggfunc=np.mean, + aggfunc="mean", margins_name="Totals", ) self._check_output(result, "D", data, margins_col="Totals") # no column specified table = data.pivot_table( - index=["A", "B"], columns="C", margins=True, aggfunc=np.mean + index=["A", "B"], columns="C", margins=True, aggfunc="mean" ) for value_col in table.columns.levels[0]: self._check_output(table[value_col], value_col, data) @@ -926,9 +926,9 @@ def test_no_col(self, data): data.columns = [k * 2 for k in data.columns] msg = re.escape("agg function failed [how->mean,dtype->object]") with pytest.raises(TypeError, match=msg): - data.pivot_table(index=["AA", "BB"], margins=True, aggfunc=np.mean) + data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") table = data.drop(columns="CC").pivot_table( - index=["AA", "BB"], margins=True, aggfunc=np.mean + index=["AA", "BB"], margins=True, aggfunc="mean" ) for value_col in table.columns: totals = table.loc[("All", ""), value_col] @@ -948,7 +948,7 @@ def test_no_col(self, data): [ ( "A", - np.mean, + "mean", [[5.5, 5.5, 2.2, 2.2], [8.0, 8.0, 4.4, 4.4]], Index(["bar", "All", "foo", "All"], name="A"), ), @@ -1027,7 +1027,7 @@ def test_margins_dtype(self, data): index=["A", "B"], columns="C", margins=True, - aggfunc=np.sum, + aggfunc="sum", fill_value=0, ) @@ -1274,7 +1274,7 @@ def test_pivot_timegrouper(self, using_array_manager): index=Grouper(freq="A"), columns="Buyer", values="Quantity", - aggfunc=np.sum, + aggfunc="sum", ) tm.assert_frame_equal(result, expected) @@ -1283,7 +1283,7 @@ def test_pivot_timegrouper(self, using_array_manager): index="Buyer", columns=Grouper(freq="A"), values="Quantity", - aggfunc=np.sum, + aggfunc="sum", ) tm.assert_frame_equal(result, expected.T) @@ -1305,7 +1305,7 @@ def test_pivot_timegrouper(self, using_array_manager): index=Grouper(freq="6MS"), columns="Buyer", values="Quantity", - aggfunc=np.sum, + aggfunc="sum", ) tm.assert_frame_equal(result, expected) @@ -1314,7 +1314,7 @@ def test_pivot_timegrouper(self, using_array_manager): index="Buyer", columns=Grouper(freq="6MS"), values="Quantity", - aggfunc=np.sum, + aggfunc="sum", ) tm.assert_frame_equal(result, expected.T) @@ -1325,7 +1325,7 @@ def test_pivot_timegrouper(self, using_array_manager): 
index=Grouper(freq="6MS", key="Date"), columns="Buyer", values="Quantity", - aggfunc=np.sum, + aggfunc="sum", ) tm.assert_frame_equal(result, expected) @@ -1334,7 +1334,7 @@ def test_pivot_timegrouper(self, using_array_manager): index="Buyer", columns=Grouper(freq="6MS", key="Date"), values="Quantity", - aggfunc=np.sum, + aggfunc="sum", ) tm.assert_frame_equal(result, expected.T) @@ -1345,7 +1345,7 @@ def test_pivot_timegrouper(self, using_array_manager): index=Grouper(freq="6MS", key="foo"), columns="Buyer", values="Quantity", - aggfunc=np.sum, + aggfunc="sum", ) with pytest.raises(KeyError, match=msg): pivot_table( @@ -1353,7 +1353,7 @@ def test_pivot_timegrouper(self, using_array_manager): index="Buyer", columns=Grouper(freq="6MS", key="foo"), values="Quantity", - aggfunc=np.sum, + aggfunc="sum", ) # passing the level @@ -1363,7 +1363,7 @@ def test_pivot_timegrouper(self, using_array_manager): index=Grouper(freq="6MS", level="Date"), columns="Buyer", values="Quantity", - aggfunc=np.sum, + aggfunc="sum", ) tm.assert_frame_equal(result, expected) @@ -1372,7 +1372,7 @@ def test_pivot_timegrouper(self, using_array_manager): index="Buyer", columns=Grouper(freq="6MS", level="Date"), values="Quantity", - aggfunc=np.sum, + aggfunc="sum", ) tm.assert_frame_equal(result, expected.T) @@ -1383,7 +1383,7 @@ def test_pivot_timegrouper(self, using_array_manager): index=Grouper(freq="6MS", level="foo"), columns="Buyer", values="Quantity", - aggfunc=np.sum, + aggfunc="sum", ) with pytest.raises(ValueError, match=msg): pivot_table( @@ -1391,7 +1391,7 @@ def test_pivot_timegrouper(self, using_array_manager): index="Buyer", columns=Grouper(freq="6MS", level="foo"), values="Quantity", - aggfunc=np.sum, + aggfunc="sum", ) def test_pivot_timegrouper_double(self): @@ -1429,7 +1429,7 @@ def test_pivot_timegrouper_double(self): index=Grouper(freq="M", key="Date"), columns=Grouper(freq="M", key="PayDay"), values="Quantity", - aggfunc=np.sum, + aggfunc="sum", ) expected = DataFrame( np.array( @@ -1481,7 +1481,7 @@ def test_pivot_timegrouper_double(self): index=Grouper(freq="M", key="PayDay"), columns=Grouper(freq="M", key="Date"), values="Quantity", - aggfunc=np.sum, + aggfunc="sum", ) tm.assert_frame_equal(result, expected.T) @@ -1508,7 +1508,7 @@ def test_pivot_timegrouper_double(self): index=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], columns=["Branch"], values="Quantity", - aggfunc=np.sum, + aggfunc="sum", ) tm.assert_frame_equal(result, expected) @@ -1517,7 +1517,7 @@ def test_pivot_timegrouper_double(self): index=["Branch"], columns=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], values="Quantity", - aggfunc=np.sum, + aggfunc="sum", ) tm.assert_frame_equal(result, expected.T) @@ -1588,7 +1588,7 @@ def test_pivot_datetime_tz(self): index=["dt1"], columns=["dt2"], values=["value1", "value2"], - aggfunc=[np.sum, np.mean], + aggfunc=["sum", "mean"], ) tm.assert_frame_equal(result, expected) @@ -1749,7 +1749,7 @@ def test_pivot_table_margins_name_with_aggfunc_list(self): columns="day", margins=True, margins_name=margins_name, - aggfunc=[np.mean, max], + aggfunc=["mean", "max"], ) ix = Index(["bacon", "cheese", margins_name], dtype="object", name="item") tups = [ @@ -1927,13 +1927,13 @@ def test_pivot_table_not_series(self): # and aggfunc is not instance of list df = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"], "col3": [1, 3, 9]}) - result = df.pivot_table("col1", index=["col3", "col2"], aggfunc=np.sum) + result = df.pivot_table("col1", index=["col3", "col2"], 
aggfunc="sum") m = MultiIndex.from_arrays([[1, 3, 9], ["C", "D", "E"]], names=["col3", "col2"]) expected = DataFrame([3, 4, 5], index=m, columns=["col1"]) tm.assert_frame_equal(result, expected) - result = df.pivot_table("col1", index="col3", columns="col2", aggfunc=np.sum) + result = df.pivot_table("col1", index="col3", columns="col2", aggfunc="sum") expected = DataFrame( [[3, np.NaN, np.NaN], [np.NaN, 4, np.NaN], [np.NaN, np.NaN, 5]], index=Index([1, 3, 9], name="col3"), @@ -1942,7 +1942,7 @@ def test_pivot_table_not_series(self): tm.assert_frame_equal(result, expected) - result = df.pivot_table("col1", index="col3", aggfunc=[np.sum]) + result = df.pivot_table("col1", index="col3", aggfunc=["sum"]) m = MultiIndex.from_arrays([["sum"], ["col1"]]) expected = DataFrame([3, 4, 5], index=Index([1, 3, 9], name="col3"), columns=m) @@ -2037,7 +2037,10 @@ def test_pivot_string_func_vs_func(self, f, f_numpy, data): # for consistency purposes data = data.drop(columns="C") result = pivot_table(data, index="A", columns="B", aggfunc=f) - expected = pivot_table(data, index="A", columns="B", aggfunc=f_numpy) + ops = "|".join(f) if isinstance(f, list) else f + msg = f"using DataFrameGroupBy.[{ops}]" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = pivot_table(data, index="A", columns="B", aggfunc=f_numpy) tm.assert_frame_equal(result, expected) @pytest.mark.slow @@ -2104,7 +2107,7 @@ def test_pivot_table_aggfunc_scalar_dropna(self, dropna): {"A": ["one", "two", "one"], "x": [3, np.nan, 2], "y": [1, np.nan, np.nan]} ) - result = pivot_table(df, columns="A", aggfunc=np.mean, dropna=dropna) + result = pivot_table(df, columns="A", aggfunc="mean", dropna=dropna) data = [[2.5, np.nan], [1, np.nan]] col = Index(["one", "two"], name="A") @@ -2172,7 +2175,7 @@ def test_pivot_table_multiindex_columns_doctest_case(self): df, values=["D", "E"], index=["A", "C"], - aggfunc={"D": np.mean, "E": [min, max, np.mean]}, + aggfunc={"D": "mean", "E": ["min", "max", "mean"]}, ) cols = MultiIndex.from_tuples( [("D", "mean"), ("E", "max"), ("E", "mean"), ("E", "min")] @@ -2374,7 +2377,7 @@ def test_pivot_table_with_mixed_nested_tuples(self, using_array_manager): } ) result = pivot_table( - df, values="D", index=["A", "B"], columns=[(7, "seven")], aggfunc=np.sum + df, values="D", index=["A", "B"], columns=[(7, "seven")], aggfunc="sum" ) expected = DataFrame( [[4.0, 5.0], [7.0, 6.0], [4.0, 1.0], [np.nan, 6.0]], diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 8c5f9a894f2f7..43f1f5527c8e2 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -19,13 +19,13 @@ def test_reindex_level(self, multiindex_year_month_day_dataframe_random_data): month_sums = ymd.groupby("month").sum() result = month_sums.reindex(ymd.index, level=1) - expected = ymd.groupby(level="month").transform(np.sum) + expected = ymd.groupby(level="month").transform("sum") tm.assert_frame_equal(result, expected) # Series result = month_sums["A"].reindex(ymd.index, level=1) - expected = ymd["A"].groupby(level="month").transform(np.sum) + expected = ymd["A"].groupby(level="month").transform("sum") tm.assert_series_equal(result, expected, check_names=False) # axis=1 @@ -35,7 +35,7 @@ def test_reindex_level(self, multiindex_year_month_day_dataframe_random_data): month_sums = gb.sum() result = month_sums.reindex(columns=ymd.index, level=1) - expected = ymd.groupby(level="month").transform(np.sum).T + expected = ymd.groupby(level="month").transform("sum").T 
diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py
index d6cca5061671b..6b7093b4e4c3c 100644
--- a/pandas/tests/window/test_api.py
+++ b/pandas/tests/window/test_api.py
@@ -85,12 +85,14 @@ def test_agg(step):
     b_mean = r["B"].mean()
     b_std = r["B"].std()
 
-    result = r.aggregate([np.mean, np.std])
+    with tm.assert_produces_warning(FutureWarning, match="using Rolling.[mean|std]"):
+        result = r.aggregate([np.mean, np.std])
     expected = concat([a_mean, a_std, b_mean, b_std], axis=1)
     expected.columns = MultiIndex.from_product([["A", "B"], ["mean", "std"]])
     tm.assert_frame_equal(result, expected)
 
-    result = r.aggregate({"A": np.mean, "B": np.std})
+    with tm.assert_produces_warning(FutureWarning, match="using Rolling.[mean|std]"):
+        result = r.aggregate({"A": np.mean, "B": np.std})
     expected = concat([a_mean, b_std], axis=1)
     tm.assert_frame_equal(result, expected, check_like=True)
 
@@ -143,7 +145,8 @@ def test_agg_apply(raw):
     r = df.rolling(window=3)
     a_sum = r["A"].sum()
 
-    result = r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)})
+    with tm.assert_produces_warning(FutureWarning, match="using Rolling.[sum|std]"):
+        result = r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)})
     rcustom = r["B"].apply(lambda x: np.std(x, ddof=1), raw=raw)
     expected = concat([a_sum, rcustom], axis=1)
     tm.assert_frame_equal(result, expected, check_like=True)
@@ -153,15 +156,18 @@ def test_agg_consistency(step):
     df = DataFrame({"A": range(5), "B": range(0, 10, 2)})
     r = df.rolling(window=3, step=step)
 
-    result = r.agg([np.sum, np.mean]).columns
+    with tm.assert_produces_warning(FutureWarning, match="using Rolling.[sum|mean]"):
+        result = r.agg([np.sum, np.mean]).columns
     expected = MultiIndex.from_product([list("AB"), ["sum", "mean"]])
     tm.assert_index_equal(result, expected)
 
-    result = r["A"].agg([np.sum, np.mean]).columns
+    with tm.assert_produces_warning(FutureWarning, match="using Rolling.[sum|mean]"):
+        result = r["A"].agg([np.sum, np.mean]).columns
     expected = Index(["sum", "mean"])
     tm.assert_index_equal(result, expected)
 
-    result = r.agg({"A": [np.sum, np.mean]}).columns
+    with tm.assert_produces_warning(FutureWarning, match="using Rolling.[sum|mean]"):
+        result = r.agg({"A": [np.sum, np.mean]}).columns
     expected = MultiIndex.from_tuples([("A", "sum"), ("A", "mean")])
     tm.assert_index_equal(result, expected)
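
The rolling API follows the same deprecation: NumPy callables passed to
``Rolling.aggregate`` are mapped to the named window methods only after a
``FutureWarning``, hence the new ``assert_produces_warning`` wrappers. A minimal
sketch, assuming pandas >= 2.1::

    import warnings

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"A": range(5), "B": range(0, 10, 2)})
    r = df.rolling(window=3)

    # String aliases are the quiet spelling.
    expected = r.aggregate(["sum", "mean"])

    # NumPy callables still work, but warn first, e.g.
    # "FutureWarning: ... is currently using Rolling.sum".
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)
        result = r.aggregate([np.sum, np.mean])

    pd.testing.assert_frame_equal(result, expected)
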
From 011d1d0143f139bc1d5bade750218dd2fc9df646 Mon Sep 17 00:00:00 2001
From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com>
Date: Fri, 7 Jul 2023 19:06:54 +0200
Subject: [PATCH 32/35] TST: add test to check dtype after replacing values in
 categorical Series inplace (#53993)

* TST: add test to check dtype after replacing categorical Series inplace

* replace tm.assert_index_equal with tm.assert_series_equal
---
 pandas/tests/series/methods/test_replace.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py
index d3cdae63d26f3..50b9714082054 100644
--- a/pandas/tests/series/methods/test_replace.py
+++ b/pandas/tests/series/methods/test_replace.py
@@ -387,6 +387,16 @@ def test_replace_categorical(self, categorical, numeric):
             expected = expected.cat.add_categories(2)
         tm.assert_series_equal(expected, result)
 
+    @pytest.mark.parametrize(
+        "data, data_exp", [(["a", "b", "c"], ["b", "b", "c"]), (["a"], ["b"])]
+    )
+    def test_replace_categorical_inplace(self, data, data_exp):
+        # GH 53358
+        result = pd.Series(data, dtype="category")
+        result.replace(to_replace="a", value="b", inplace=True)
+        expected = pd.Series(data_exp, dtype="category")
+        tm.assert_series_equal(result, expected)
+
     def test_replace_categorical_single(self):
         # GH 26988
         dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")

From d73e9e5428c8c17a0593d12cf451d9e10a1df136 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 7 Jul 2023 07:07:41 -1000
Subject: [PATCH 33/35] TST: Mark subprocess tests as single_cpu (#54026)

---
 pandas/tests/io/test_compression.py     | 2 ++
 pandas/tests/plotting/test_converter.py | 2 ++
 pandas/tests/test_common.py             | 1 +
 pandas/tests/test_downstream.py         | 3 +++
 4 files changed, 8 insertions(+)

diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py
index 9bdfbad347481..af83ec4a55fa5 100644
--- a/pandas/tests/io/test_compression.py
+++ b/pandas/tests/io/test_compression.py
@@ -202,6 +202,7 @@ def test_gzip_reproducibility_file_object():
     assert output == buffer.getvalue()
 
 
+@pytest.mark.single_cpu
 def test_with_missing_lzma():
     """Tests if import pandas works when lzma is not present."""
     # https://github.com/pandas-dev/pandas/issues/27575
@@ -215,6 +216,7 @@ def test_with_missing_lzma():
     subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE)
 
 
+@pytest.mark.single_cpu
 def test_with_missing_lzma_runtime():
     """Tests if RuntimeError is hit when calling lzma without
     having the module available.
diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py
index 6caeb3a5d7445..cadd4c4589964 100644
--- a/pandas/tests/plotting/test_converter.py
+++ b/pandas/tests/plotting/test_converter.py
@@ -43,6 +43,7 @@
 dates = pytest.importorskip("matplotlib.dates")
 
 
+@pytest.mark.single_cpu
 def test_registry_mpl_resets():
     # Check that Matplotlib converters are properly reset (see issue #27481)
     code = (
@@ -63,6 +64,7 @@ def test_timtetonum_accepts_unicode():
 
 
 class TestRegistration:
+    @pytest.mark.single_cpu
     def test_dont_register_by_default(self):
         # Run in subprocess to ensure a clean state
         code = (
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index 4860ee235c03d..fa7750397369b 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -233,6 +233,7 @@ def test_temp_setattr(with_exception):
     assert ser.name == "first"
 
 
+@pytest.mark.single_cpu
 def test_str_size():
     # GH#21758
     a = "a"
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index 7354e313e24f4..09594588be81c 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -119,11 +119,13 @@ def test_xarray_cftimeindex_nearest():
     assert result == expected
 
 
+@pytest.mark.single_cpu
 def test_oo_optimizable():
     # GH 21071
     subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"])
 
 
+@pytest.mark.single_cpu
 def test_oo_optimized_datetime_index_unpickle():
     # GH 42866
     subprocess.check_call(
@@ -200,6 +202,7 @@ def test_yaml_dump(df):
     tm.assert_frame_equal(df, loaded2)
 
 
+@pytest.mark.single_cpu
 def test_missing_required_dependency():
     # GH 23868
     # To ensure proper isolation, we pass these flags
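
The ``single_cpu`` mark keeps tests that spawn a fresh interpreter off the
pytest-xdist workers, where the extra processes would oversubscribe the CPUs.
The shape of the tests being marked, as a hedged sketch (hypothetical test
name; assumes pandas' registration of the ``single_cpu`` marker)::

    import subprocess
    import sys

    import pytest


    @pytest.mark.single_cpu
    def test_import_in_clean_interpreter():
        # Each run starts its own Python process, so the test must not share
        # a machine slice with other xdist workers launching subprocesses.
        subprocess.check_call([sys.executable, "-c", "import pandas"])
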
From fad7abcd6f2460512a1d8867a26960975f75512e Mon Sep 17 00:00:00 2001
From: penelopeysm
Date: Fri, 7 Jul 2023 18:08:23 +0100
Subject: [PATCH 34/35] DOC: Fix typos in groupby user guide (#54041)

Fix typos in groupby user guide
---
 doc/source/user_guide/groupby.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
index 6f4008853f161..7ddce18d8a259 100644
--- a/doc/source/user_guide/groupby.rst
+++ b/doc/source/user_guide/groupby.rst
@@ -878,7 +878,7 @@ will be broadcast across the group.
     grouped.transform("sum")
 
 In addition to string aliases, the :meth:`~.DataFrameGroupBy.transform` method can
-also except User-Defined functions (UDFs). The UDF must:
+also accept User-Defined Functions (UDFs). The UDF must:
 
 * Return a result that is either the same size as the group chunk or
   broadcastable to the size of the group chunk (e.g., a scalar,
@@ -1363,7 +1363,7 @@ implementation headache).
 Grouping with ordered factors
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Categorical variables represented as instance of pandas's ``Categorical`` class
+Categorical variables represented as instances of pandas's ``Categorical`` class
 can be used as group keys. If so, the order of the levels will be preserved:
 
 .. ipython:: python
@@ -1496,7 +1496,7 @@ You can also select multiple rows from each group by specifying multiple nth val
     # get the first, 4th, and last date index for each month
     df.groupby([df.index.year, df.index.month]).nth([0, 3, -1])
 
-You may also use a slices or lists of slices.
+You may also use slices or lists of slices.
 
 .. ipython:: python

From d32d02527c6b04ff1ccd9f5a33e3029097ae1f35 Mon Sep 17 00:00:00 2001
From: wcgonzal
Date: Fri, 7 Jul 2023 12:12:14 -0500
Subject: [PATCH 35/35] DOC: add backticks to docstrings (#54030)

add backticks to docstrings
---
 pandas/io/xml.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 2aec361d46b99..62bbb410dacc1 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -1,5 +1,5 @@
 """
-:mod:`pandas.io.xml` is a module for reading XML.
+:mod:``pandas.io.xml`` is a module for reading XML.
 """
 
 from __future__ import annotations
@@ -66,26 +66,26 @@ class _XMLFrameParser:
 
     Parameters
     ----------
-    path_or_buffer : a valid JSON str, path object or file-like object
+    path_or_buffer : a valid JSON ``str``, path object or file-like object
         Any valid string path is acceptable. The string could be a URL. Valid
         URL schemes include http, ftp, s3, and file.
 
     xpath : str or regex
         The XPath expression to parse required set of nodes for
-        migration to `Data Frame`. `etree` supports limited XPath.
+        migration to :class:`~pandas.DataFrame`. `etree` supports limited XPath.
 
     namespaces : dict
-        The namespaces defined in XML document (`xmlns:namespace='URI')
+        The namespaces defined in XML document (``xmlns:namespace='URI'``)
        as dicts with key being namespace and value the URI.
 
     elems_only : bool
-        Parse only the child elements at the specified `xpath`.
+        Parse only the child elements at the specified ``xpath``.
 
     attrs_only : bool
-        Parse only the attributes at the specified `xpath`.
+        Parse only the attributes at the specified ``xpath``.
 
     names : list
-        Column names for Data Frame of parsed XML data.
+        Column names for :class:`~pandas.DataFrame` of parsed XML data.
 
     dtype : dict
         Data type for data or columns. E.g. {{'a': np.float64,