From d59eb7f8b3d69a83535bc3cc2035db7ac942ecbf Mon Sep 17 00:00:00 2001 From: KeiOshima Date: Sun, 21 Apr 2024 13:35:03 -0400 Subject: [PATCH 001/100] DOC: fixing SA01 error for DatetimeIndex: second, nanosecond, and microsecond (#58342) * DOC: fixing SA01 error for DatetimeIndex: second, nanosecond, microsecond * fixing EXPECTED TO FAIL, BUT NOT FAILING error --- ci/code_checks.sh | 6 ------ pandas/core/arrays/datetimes.py | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d1cdff8f7f56b..ad12458ad6b0d 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -109,7 +109,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.var PR01,RT03,SA01" \ -i "pandas.DatetimeIndex.ceil SA01" \ -i "pandas.DatetimeIndex.date SA01" \ - -i "pandas.DatetimeIndex.day SA01" \ -i "pandas.DatetimeIndex.day_of_year SA01" \ -i "pandas.DatetimeIndex.dayofyear SA01" \ -i "pandas.DatetimeIndex.floor SA01" \ @@ -118,8 +117,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DatetimeIndex.indexer_between_time RT03" \ -i "pandas.DatetimeIndex.inferred_freq SA01" \ -i "pandas.DatetimeIndex.is_leap_year SA01" \ - -i "pandas.DatetimeIndex.microsecond SA01" \ - -i "pandas.DatetimeIndex.nanosecond SA01" \ -i "pandas.DatetimeIndex.quarter SA01" \ -i "pandas.DatetimeIndex.round SA01" \ -i "pandas.DatetimeIndex.snap PR01,RT03,SA01" \ @@ -296,7 +293,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.ceil PR01,PR02,SA01" \ -i "pandas.Series.dt.components SA01" \ -i "pandas.Series.dt.date SA01" \ - -i "pandas.Series.dt.day SA01" \ -i "pandas.Series.dt.day_name PR01,PR02" \ -i "pandas.Series.dt.day_of_year SA01" \ -i "pandas.Series.dt.dayofyear SA01" \ @@ -306,10 +302,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.floor PR01,PR02,SA01" \ -i "pandas.Series.dt.freq GL08" \ -i "pandas.Series.dt.is_leap_year SA01" \ - -i 
"pandas.Series.dt.microsecond SA01" \ -i "pandas.Series.dt.microseconds SA01" \ -i "pandas.Series.dt.month_name PR01,PR02" \ - -i "pandas.Series.dt.nanosecond SA01" \ -i "pandas.Series.dt.nanoseconds SA01" \ -i "pandas.Series.dt.normalize PR01" \ -i "pandas.Series.dt.quarter SA01" \ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 5d0dfc67bd90a..7704c99141fc2 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1597,6 +1597,12 @@ def isocalendar(self) -> DataFrame: """ The day of the datetime. + See Also + -------- + DatetimeIndex.year: The year of the datetime. + DatetimeIndex.month: The month as January=1, December=12. + DatetimeIndex.hour: The hours of the datetime. + Examples -------- >>> datetime_series = pd.Series( @@ -1706,6 +1712,11 @@ def isocalendar(self) -> DataFrame: """ The microseconds of the datetime. + See Also + -------- + DatetimeIndex.second: The seconds of the datetime. + DatetimeIndex.nanosecond: The nanoseconds of the datetime. + Examples -------- >>> datetime_series = pd.Series( @@ -1729,6 +1740,11 @@ def isocalendar(self) -> DataFrame: """ The nanoseconds of the datetime. + See Also + -------- + DatetimeIndex.second: The seconds of the datetime. + DatetimeIndex.microsecond: The microseconds of the datetime. 
+ Examples -------- >>> datetime_series = pd.Series( From 99f1df6cb87e9b73dd8e71dbee686a7b555c285a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= <6618166+twoertwein@users.noreply.github.com> Date: Sun, 21 Apr 2024 13:41:20 -0400 Subject: [PATCH 002/100] TYP: export SASReader in pandas.api.typing (#58349) * TYP: export SASReader in pandas.api.typing * fix test --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/api/typing/__init__.py | 2 ++ pandas/io/sas/sas7bdat.py | 5 ++--- pandas/io/sas/sas_xport.py | 5 ++--- pandas/io/sas/sasreader.py | 13 +++++++------ pandas/tests/api/test_api.py | 1 + 6 files changed, 15 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 8618d7d525771..c817e09b3b360 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -29,6 +29,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) +- :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. 
(:issue:`56981`) diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py index df6392bf692a2..c58fa0f085266 100644 --- a/pandas/api/typing/__init__.py +++ b/pandas/api/typing/__init__.py @@ -30,6 +30,7 @@ # TODO: Can't import Styler without importing jinja2 # from pandas.io.formats.style import Styler from pandas.io.json._json import JsonReader +from pandas.io.sas.sasreader import SASReader from pandas.io.stata import StataReader __all__ = [ @@ -49,6 +50,7 @@ "RollingGroupby", "SeriesGroupBy", "StataReader", + "SASReader", # See TODO above # "Styler", "TimedeltaIndexResamplerGroupby", diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 6a392a0f02caf..25257d5fcc192 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -16,7 +16,6 @@ from __future__ import annotations -from collections import abc from datetime import datetime import sys from typing import TYPE_CHECKING @@ -45,7 +44,7 @@ from pandas.io.common import get_handle import pandas.io.sas.sas_constants as const -from pandas.io.sas.sasreader import ReaderBase +from pandas.io.sas.sasreader import SASReader if TYPE_CHECKING: from pandas._typing import ( @@ -116,7 +115,7 @@ def __init__( # SAS7BDAT represents a SAS data file in SAS7BDAT format. -class SAS7BDATReader(ReaderBase, abc.Iterator): +class SAS7BDATReader(SASReader): """ Read SAS files in SAS7BDAT format. 
diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index adba9bf117a8e..89dbdab64c23c 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -10,7 +10,6 @@ from __future__ import annotations -from collections import abc from datetime import datetime import struct from typing import TYPE_CHECKING @@ -24,7 +23,7 @@ import pandas as pd from pandas.io.common import get_handle -from pandas.io.sas.sasreader import ReaderBase +from pandas.io.sas.sasreader import SASReader if TYPE_CHECKING: from pandas._typing import ( @@ -252,7 +251,7 @@ def _parse_float_vec(vec): return ieee -class XportReader(ReaderBase, abc.Iterator): +class XportReader(SASReader): __doc__ = _xport_reader_doc def __init__( diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 69d911863338f..12d698a4f76a8 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -8,6 +8,7 @@ ABC, abstractmethod, ) +from collections.abc import Iterator from typing import ( TYPE_CHECKING, overload, @@ -33,9 +34,9 @@ from pandas import DataFrame -class ReaderBase(ABC): +class SASReader(Iterator["DataFrame"], ABC): """ - Protocol for XportReader and SAS7BDATReader classes. + Abstract class for XportReader and SAS7BDATReader. """ @abstractmethod @@ -66,7 +67,7 @@ def read_sas( chunksize: int = ..., iterator: bool = ..., compression: CompressionOptions = ..., -) -> ReaderBase: ... +) -> SASReader: ... @overload @@ -79,7 +80,7 @@ def read_sas( chunksize: None = ..., iterator: bool = ..., compression: CompressionOptions = ..., -) -> DataFrame | ReaderBase: ... +) -> DataFrame | SASReader: ... @doc(decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer") @@ -92,7 +93,7 @@ def read_sas( chunksize: int | None = None, iterator: bool = False, compression: CompressionOptions = "infer", -) -> DataFrame | ReaderBase: +) -> DataFrame | SASReader: """ Read SAS files stored as either XPORT or SAS7BDAT format files. 
@@ -145,7 +146,7 @@ def read_sas( f"unable to infer format of SAS file from filename: {fname!r}" ) - reader: ReaderBase + reader: SASReader if format.lower() == "xport": from pandas.io.sas.sas_xport import XportReader diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 0f2a641d13b11..b23876d9280f7 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -267,6 +267,7 @@ class TestApi(Base): "RollingGroupby", "SeriesGroupBy", "StataReader", + "SASReader", "TimedeltaIndexResamplerGroupby", "TimeGrouper", "Window", From b111ac671e9eb8119e53ca57be54d24c47f672f8 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 22 Apr 2024 00:14:30 +0530 Subject: [PATCH 003/100] DOC: Enforce Numpy Docstring Validation for pandas.HDFStore.groups (#58357) * DOC: added SA01 to HDFStore.groups * DOC: removed HDFStore.groups --- ci/code_checks.sh | 1 - pandas/io/pytables.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ad12458ad6b0d..cabc25b5e0ba5 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -131,7 +131,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DatetimeTZDtype.tz SA01" \ -i "pandas.DatetimeTZDtype.unit SA01" \ -i "pandas.Grouper PR02" \ - -i "pandas.HDFStore.groups SA01" \ -i "pandas.HDFStore.info RT03,SA01" \ -i "pandas.HDFStore.keys SA01" \ -i "pandas.HDFStore.put PR01,SA01" \ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 25808f5b4a132..d7fc71d037f2d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1504,6 +1504,10 @@ def groups(self) -> list: list List of objects. + See Also + -------- + HDFStore.get_node : Returns the node with the key. 
+ Examples -------- >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) From 3b0824e92e9932588f5d0e58e1b0aa59df2e76fa Mon Sep 17 00:00:00 2001 From: shriyakalakata <87483933+shriyakalakata@users.noreply.github.com> Date: Mon, 22 Apr 2024 12:29:47 -0400 Subject: [PATCH 004/100] DOC: Fix SA01 errors for Index.hasnans, Index.map, Index.nbytes (#58343) * Shorten sentence length * Remove Series.nbytes from ci/code_checks.sh * Update see also method names * Update see also method names * Update see also methods for Index.map * Update method descriptions --- ci/code_checks.sh | 4 ---- pandas/core/base.py | 5 +++++ pandas/core/indexes/base.py | 10 ++++++++++ 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index cabc25b5e0ba5..4debc2eb91449 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -156,18 +156,15 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.get_indexer_non_unique PR07,SA01" \ -i "pandas.Index.get_loc PR07,RT03,SA01" \ -i "pandas.Index.get_slice_bound PR07" \ - -i "pandas.Index.hasnans SA01" \ -i "pandas.Index.identical PR01,SA01" \ -i "pandas.Index.inferred_type SA01" \ -i "pandas.Index.insert PR07,RT03,SA01" \ -i "pandas.Index.intersection PR07,RT03,SA01" \ -i "pandas.Index.item SA01" \ -i "pandas.Index.join PR07,RT03,SA01" \ - -i "pandas.Index.map SA01" \ -i "pandas.Index.memory_usage RT03" \ -i "pandas.Index.name SA01" \ -i "pandas.Index.names GL08" \ - -i "pandas.Index.nbytes SA01" \ -i "pandas.Index.nunique RT03" \ -i "pandas.Index.putmask PR01,RT03" \ -i "pandas.Index.ravel PR01,RT03" \ @@ -344,7 +341,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.mod PR07" \ -i "pandas.Series.mode SA01" \ -i "pandas.Series.mul PR07" \ - -i "pandas.Series.nbytes SA01" \ -i "pandas.Series.ne PR07,SA01" \ -i "pandas.Series.nunique RT03" \ -i "pandas.Series.pad PR01,SA01" \ diff --git a/pandas/core/base.py b/pandas/core/base.py index 
9b1251a4ef5d8..424f0609dd485 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -419,6 +419,11 @@ def nbytes(self) -> int: """ Return the number of bytes in the underlying data. + See Also + -------- + Series.ndim : Number of dimensions of the underlying data. + Series.size : Return the number of elements in the underlying data. + Examples -------- For Series: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8ede401f37184..d1d1c5ea3171f 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2423,6 +2423,12 @@ def hasnans(self) -> bool: ------- bool + See Also + -------- + Index.isna : Detect missing values. + Index.dropna : Return Index without NA/NaN values. + Index.fillna : Fill NA/NaN values with the specified value. + Examples -------- >>> s = pd.Series([1, 2, 3], index=["a", "b", None]) @@ -6067,6 +6073,10 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): If the function returns a tuple with more than one element a MultiIndex will be returned. + See Also + -------- + Index.where : Replace values where the condition is False. 
+ Examples -------- >>> idx = pd.Index([1, 2, 3]) From 2768a22d3b6bb70029f406968bc366faf2c7267f Mon Sep 17 00:00:00 2001 From: gboeker <68177766+gboeker@users.noreply.github.com> Date: Mon, 22 Apr 2024 12:31:05 -0400 Subject: [PATCH 005/100] DOC: Enforce Numpy Docstring Validation for DatetimeIndex (#58353) * fix line too long * add return for snap * undo return * fix docstring issues for DatetimeIndex.quarter * remove pandas.Series.dt.quarter from codechecks * fix docstring issues for DatetimeIndex.round * fix docstring issues for DatetimeIndex.time * fix docstring issues for DatetimeIndex.timetz * add see also for timetz * fix code check errors * delete round from code_checks * fix code check errors --- ci/code_checks.sh | 20 ++++---------------- pandas/core/arrays/datetimelike.py | 5 +++++ pandas/core/arrays/datetimes.py | 20 ++++++++++++++++++++ pandas/core/indexes/datetimes.py | 7 +++++++ 4 files changed, 36 insertions(+), 16 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 4debc2eb91449..443fa4b4005d3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -107,22 +107,16 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.to_markdown SA01" \ -i "pandas.DataFrame.to_parquet RT03" \ -i "pandas.DataFrame.var PR01,RT03,SA01" \ - -i "pandas.DatetimeIndex.ceil SA01" \ -i "pandas.DatetimeIndex.date SA01" \ -i "pandas.DatetimeIndex.day_of_year SA01" \ -i "pandas.DatetimeIndex.dayofyear SA01" \ - -i "pandas.DatetimeIndex.floor SA01" \ -i "pandas.DatetimeIndex.freqstr SA01" \ -i "pandas.DatetimeIndex.indexer_at_time PR01,RT03" \ -i "pandas.DatetimeIndex.indexer_between_time RT03" \ -i "pandas.DatetimeIndex.inferred_freq SA01" \ -i "pandas.DatetimeIndex.is_leap_year SA01" \ - -i "pandas.DatetimeIndex.quarter SA01" \ - -i "pandas.DatetimeIndex.round SA01" \ - -i "pandas.DatetimeIndex.snap PR01,RT03,SA01" \ + -i "pandas.DatetimeIndex.snap PR01,RT03" \ -i "pandas.DatetimeIndex.std PR01,RT03" \ - -i 
"pandas.DatetimeIndex.time SA01" \ - -i "pandas.DatetimeIndex.timetz SA01" \ -i "pandas.DatetimeIndex.to_period RT03" \ -i "pandas.DatetimeIndex.to_pydatetime RT03,SA01" \ -i "pandas.DatetimeIndex.tz SA01" \ @@ -286,7 +280,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.div PR07" \ -i "pandas.Series.droplevel SA01" \ -i "pandas.Series.dt.as_unit PR01,PR02" \ - -i "pandas.Series.dt.ceil PR01,PR02,SA01" \ + -i "pandas.Series.dt.ceil PR01,PR02" \ -i "pandas.Series.dt.components SA01" \ -i "pandas.Series.dt.date SA01" \ -i "pandas.Series.dt.day_name PR01,PR02" \ @@ -295,20 +289,17 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.days SA01" \ -i "pandas.Series.dt.days_in_month SA01" \ -i "pandas.Series.dt.daysinmonth SA01" \ - -i "pandas.Series.dt.floor PR01,PR02,SA01" \ + -i "pandas.Series.dt.floor PR01,PR02" \ -i "pandas.Series.dt.freq GL08" \ -i "pandas.Series.dt.is_leap_year SA01" \ -i "pandas.Series.dt.microseconds SA01" \ -i "pandas.Series.dt.month_name PR01,PR02" \ -i "pandas.Series.dt.nanoseconds SA01" \ -i "pandas.Series.dt.normalize PR01" \ - -i "pandas.Series.dt.quarter SA01" \ -i "pandas.Series.dt.qyear GL08" \ - -i "pandas.Series.dt.round PR01,PR02,SA01" \ + -i "pandas.Series.dt.round PR01,PR02" \ -i "pandas.Series.dt.seconds SA01" \ -i "pandas.Series.dt.strftime PR01,PR02" \ - -i "pandas.Series.dt.time SA01" \ - -i "pandas.Series.dt.timetz SA01" \ -i "pandas.Series.dt.to_period PR01,PR02,RT03" \ -i "pandas.Series.dt.total_seconds PR01" \ -i "pandas.Series.dt.tz SA01" \ @@ -428,14 +419,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timedelta.total_seconds SA01" \ -i "pandas.Timedelta.view SA01" \ -i "pandas.TimedeltaIndex.as_unit RT03,SA01" \ - -i "pandas.TimedeltaIndex.ceil SA01" \ -i "pandas.TimedeltaIndex.components SA01" \ -i "pandas.TimedeltaIndex.days SA01" \ - -i "pandas.TimedeltaIndex.floor SA01" \ -i "pandas.TimedeltaIndex.inferred_freq SA01" \ -i 
"pandas.TimedeltaIndex.microseconds SA01" \ -i "pandas.TimedeltaIndex.nanoseconds SA01" \ - -i "pandas.TimedeltaIndex.round SA01" \ -i "pandas.TimedeltaIndex.seconds SA01" \ -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \ -i "pandas.Timestamp PR07,SA01" \ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 8ada9d88e08bc..974289160b145 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1825,6 +1825,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: ------ ValueError if the `freq` cannot be converted. + See Also + -------- + DatetimeIndex.floor : Perform floor operation on the data to the specified `freq`. + DatetimeIndex.snap : Snap time stamps to nearest occurring frequency. + Notes ----- If the timestamps have a timezone, {op}ing will take place relative to the diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 7704c99141fc2..fb9f047d432a1 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1391,6 +1391,14 @@ def time(self) -> npt.NDArray[np.object_]: The time part of the Timestamps. + See Also + -------- + DatetimeIndex.timetz : Returns numpy array of :class:`datetime.time` + objects with timezones. The time part of the Timestamps. + DatetimeIndex.date : Returns numpy array of python :class:`datetime.date` + objects. Namely, the date part of Timestamps without time and timezone + information. + Examples -------- For Series: @@ -1428,6 +1436,12 @@ def timetz(self) -> npt.NDArray[np.object_]: The time part of the Timestamps. + See Also + -------- + DatetimeIndex.time : Returns numpy array of :class:`datetime.time` objects. + The time part of the Timestamps. + DatetimeIndex.tz : Return the timezone. + Examples -------- For Series: @@ -1836,6 +1850,12 @@ def isocalendar(self) -> DataFrame: """ The quarter of the date. 
+ See Also + -------- + DatetimeIndex.snap : Snap time stamps to nearest occurring frequency. + DatetimeIndex.time : Returns numpy array of datetime.time objects. + The time part of the Timestamps. + Examples -------- For Series: diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index cefdc14145d1f..7122de745e13b 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -455,6 +455,13 @@ def snap(self, freq: Frequency = "S") -> DatetimeIndex: ------- DatetimeIndex + See Also + -------- + DatetimeIndex.round : Perform round operation on the data to the + specified `freq`. + DatetimeIndex.floor : Perform floor operation on the data to the + specified `freq`. + Examples -------- >>> idx = pd.DatetimeIndex( From 09c7201d6db4be265aafda8feb7577c02145f2eb Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 22 Apr 2024 22:34:44 +0530 Subject: [PATCH 006/100] DOC: Enforce Numpy Docstring Validation for pandas.HDFStore.info (#58368) * DOC: add return description and see also section to pandas.HDFStore.info * DOC: add 2 df in the examples for pandas.HDFStore.info * DOC: remove pandas.HDFStore.info --- ci/code_checks.sh | 1 - pandas/io/pytables.py | 15 ++++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 443fa4b4005d3..fdcbcbe31c47f 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -125,7 +125,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DatetimeTZDtype.tz SA01" \ -i "pandas.DatetimeTZDtype.unit SA01" \ -i "pandas.Grouper PR02" \ - -i "pandas.HDFStore.info RT03,SA01" \ -i "pandas.HDFStore.keys SA01" \ -i "pandas.HDFStore.put PR01,SA01" \ -i "pandas.HDFStore.select SA01" \ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d7fc71d037f2d..89c6ac9a58382 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1688,17 +1688,26 @@ def info(self) -> str: Returns ------- str + A String 
containing the python pandas class name, filepath to the HDF5 + file and all the object keys along with their respective dataframe shapes. + + See Also + -------- + HDFStore.get_storer : Returns the storer object for a key. Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=["C", "D"]) >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP - >>> store.put("data", df) # doctest: +SKIP + >>> store.put("data1", df1) # doctest: +SKIP + >>> store.put("data2", df2) # doctest: +SKIP >>> print(store.info()) # doctest: +SKIP >>> store.close() # doctest: +SKIP File path: store.h5 - /data frame (shape->[2,2]) + /data1 frame (shape->[2,2]) + /data2 frame (shape->[2,2]) """ path = pprint_thing(self._path) output = f"{type(self)}\nFile path: {path}\n" From 5db3196e8a5779a2548ba5f48ed8f4ebfb2cf31b Mon Sep 17 00:00:00 2001 From: gboeker <68177766+gboeker@users.noreply.github.com> Date: Mon, 22 Apr 2024 13:16:42 -0400 Subject: [PATCH 007/100] DOC: Fix SA01 Docstring Errors for DataFrame (#58364) * DataFrame.__iter__ fix SA01 * add See Also for DataFrame.column * DataFrame.droplevel SA01 fixed * remove pandas.Series.droplevel from code_checks.sh --- ci/code_checks.sh | 4 ---- pandas/core/frame.py | 4 ++++ pandas/core/generic.py | 11 +++++++++++ 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index fdcbcbe31c47f..d2ba06902096e 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -80,10 +80,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.CategoricalIndex.codes SA01" \ -i "pandas.CategoricalIndex.ordered SA01" \ -i "pandas.DataFrame.__dataframe__ SA01" \ - -i "pandas.DataFrame.__iter__ SA01" \ -i "pandas.DataFrame.at_time PR01" \ - -i "pandas.DataFrame.columns SA01" \ - -i "pandas.DataFrame.droplevel SA01" \ -i "pandas.DataFrame.hist RT03" \ -i 
"pandas.DataFrame.infer_objects RT03" \ -i "pandas.DataFrame.kurt RT03,SA01" \ @@ -277,7 +274,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.cat.reorder_categories PR01,PR02" \ -i "pandas.Series.cat.set_categories PR01,PR02" \ -i "pandas.Series.div PR07" \ - -i "pandas.Series.droplevel SA01" \ -i "pandas.Series.dt.as_unit PR01,PR02" \ -i "pandas.Series.dt.ceil PR01,PR02" \ -i "pandas.Series.dt.components SA01" \ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0185ca8241617..50dc514e7181f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -12893,6 +12893,10 @@ def isin_(x): """ The column labels of the DataFrame. + See Also + -------- + DataFrame.index: The index (row labels) of the DataFrame. + Examples -------- >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index dbe2006642484..a7f155ec93524 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -783,6 +783,12 @@ def droplevel(self, level: IndexLabel, axis: Axis = 0) -> Self: {klass} {klass} with requested index / column level(s) removed. + See Also + -------- + DataFrame.replace : Replace values given in `to_replace` with `value`. + DataFrame.pivot : Return reshaped DataFrame organized by given + index / column values. + Examples -------- >>> df = ( @@ -1862,6 +1868,11 @@ def __iter__(self) -> Iterator: iterator Info axis as iterator. + See Also + -------- + DataFrame.items : Iterate over (column name, Series) pairs. + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples. 
+ Examples -------- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) From f1297fae4561c1cdf1c0eab1ec6fa2247ef73f07 Mon Sep 17 00:00:00 2001 From: William Andrea <22385371+wjandrea@users.noreply.github.com> Date: Mon, 22 Apr 2024 13:30:27 -0400 Subject: [PATCH 008/100] More idiomatic example code in BaseIndexer (#58356) No need to loop when NumPy supports range and array addition --- pandas/core/indexers/objects.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 2e6bcda520aba..d108f840a1b4f 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -53,11 +53,8 @@ class BaseIndexer: >>> from pandas.api.indexers import BaseIndexer >>> class CustomIndexer(BaseIndexer): ... def get_window_bounds(self, num_values, min_periods, center, closed, step): - ... start = np.empty(num_values, dtype=np.int64) - ... end = np.empty(num_values, dtype=np.int64) - ... for i in range(num_values): - ... start[i] = i - ... end[i] = i + self.window_size + ... start = np.arange(num_values, dtype=np.int64) + ... end = np.arange(num_values, dtype=np.int64) + self.window_size ... 
return start, end >>> df = pd.DataFrame({"values": range(5)}) >>> indexer = CustomIndexer(window_size=2) From e714aca6f2ed594c95a9681dac3d4858f23552a2 Mon Sep 17 00:00:00 2001 From: KeiOshima Date: Mon, 22 Apr 2024 13:43:48 -0400 Subject: [PATCH 009/100] DOC: fixing SA01 errors for Index: name, dtype, and equals (#58355) * DOC: fixing SA01 errors for Index: name, dtype, and equals * fixing Blank line contains whitespace error --- ci/code_checks.sh | 3 --- pandas/core/indexes/base.py | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d2ba06902096e..d595162fd84e9 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -136,10 +136,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.drop_duplicates RT03" \ -i "pandas.Index.droplevel RT03,SA01" \ -i "pandas.Index.dropna RT03,SA01" \ - -i "pandas.Index.dtype SA01" \ -i "pandas.Index.duplicated RT03" \ -i "pandas.Index.empty GL08" \ - -i "pandas.Index.equals SA01" \ -i "pandas.Index.fillna RT03" \ -i "pandas.Index.get_indexer PR07,SA01" \ -i "pandas.Index.get_indexer_for PR01,SA01" \ @@ -153,7 +151,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.item SA01" \ -i "pandas.Index.join PR07,RT03,SA01" \ -i "pandas.Index.memory_usage RT03" \ - -i "pandas.Index.name SA01" \ -i "pandas.Index.names GL08" \ -i "pandas.Index.nunique RT03" \ -i "pandas.Index.putmask PR01,RT03" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d1d1c5ea3171f..424126132656c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -976,6 +976,10 @@ def dtype(self) -> DtypeObj: """ Return the dtype object of the underlying data. + See Also + -------- + Index.inferred_type: Return a string of the type inferred from the values. + Examples -------- >>> idx = pd.Index([1, 2, 3]) @@ -1638,6 +1642,11 @@ def name(self) -> Hashable: """ Return Index or MultiIndex name. 
+ See Also + -------- + Index.set_names: Able to set new names partially and by level. + Index.rename: Able to set new names partially and by level. + Examples -------- >>> idx = pd.Index([1, 2, 3], name="x") @@ -5181,6 +5190,12 @@ def equals(self, other: Any) -> bool: True if "other" is an Index and it has the same elements and order as the calling index; False otherwise. + See Also + -------- + Index.identical: Checks that object attributes and types are also equal. + Index.has_duplicates: Check if the Index has duplicate values. + Index.is_unique: Return if the index has unique values. + Examples -------- >>> idx1 = pd.Index([1, 2, 3]) From 22e524799de6189e93e5d4f1907f3e6ea282a28a Mon Sep 17 00:00:00 2001 From: Nrezhang <102526155+Nrezhang@users.noreply.github.com> Date: Mon, 22 Apr 2024 13:45:18 -0400 Subject: [PATCH 010/100] DOC: Fix SA01 errors for pandas.Index.astype (#58352) * pandas.Index.astype * check fixes * series to index --- ci/code_checks.sh | 1 - pandas/core/indexes/base.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d595162fd84e9..f03ea65866031 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -129,7 +129,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index PR07" \ -i "pandas.Index.T SA01" \ -i "pandas.Index.append PR07,RT03,SA01" \ - -i "pandas.Index.astype SA01" \ -i "pandas.Index.copy PR07,SA01" \ -i "pandas.Index.difference PR07,RT03,SA01" \ -i "pandas.Index.drop PR07,SA01" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 424126132656c..63facb61ed498 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1060,6 +1060,12 @@ def astype(self, dtype, copy: bool = True): Index Index with values cast to specified dtype. + See Also + -------- + Index.dtype: Return the dtype object of the underlying data. + Index.dtypes: Return the dtype object of the underlying data. 
+ Index.convert_dtypes: Convert columns to the best possible dtypes. + Examples -------- >>> idx = pd.Index([1, 2, 3]) From 3461db5656b2ea2b90368f521c63fbcccb48d68d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Apr 2024 07:56:29 -1000 Subject: [PATCH 011/100] CLN: Use more memoryviews (#58330) * Add memoryviews in reshape.pyx * Use more const memoryviews --- pandas/_libs/lib.pyx | 6 +++--- pandas/_libs/reshape.pyx | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7aa1cb715521e..24afbe3a07bf1 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -477,7 +477,7 @@ def has_infs(const floating[:] arr) -> bool: @cython.boundscheck(False) @cython.wraparound(False) -def has_only_ints_or_nan(floating[:] arr) -> bool: +def has_only_ints_or_nan(const floating[:] arr) -> bool: cdef: floating val intp_t i @@ -631,7 +631,7 @@ ctypedef fused int6432_t: @cython.wraparound(False) @cython.boundscheck(False) -def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n) -> bool: +def is_range_indexer(const int6432_t[:] left, Py_ssize_t n) -> bool: """ Perform an element by element comparison on 1-d integer arrays, meant for indexer comparisons @@ -652,7 +652,7 @@ def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n) -> bool: @cython.wraparound(False) @cython.boundscheck(False) -def is_sequence_range(ndarray[int6432_t, ndim=1] sequence, int64_t step) -> bool: +def is_sequence_range(const int6432_t[:] sequence, int64_t step) -> bool: """ Check if sequence is equivalent to a range with the specified step. 
""" diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 21d1405328da6..28ea06739e0c8 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -19,7 +19,7 @@ from pandas._libs.lib cimport c_is_list_like @cython.wraparound(False) @cython.boundscheck(False) -def unstack(numeric_object_t[:, :] values, const uint8_t[:] mask, +def unstack(const numeric_object_t[:, :] values, const uint8_t[:] mask, Py_ssize_t stride, Py_ssize_t length, Py_ssize_t width, numeric_object_t[:, :] new_values, uint8_t[:, :] new_mask) -> None: """ @@ -80,7 +80,7 @@ def unstack(numeric_object_t[:, :] values, const uint8_t[:] mask, @cython.wraparound(False) @cython.boundscheck(False) -def explode(ndarray[object] values): +def explode(object[:] values): """ transform array list-likes to long form preserve non-list entries From 454e2e1d9d7b118953ecfb4edc6f9fe7f5cb07b8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Apr 2024 08:01:24 -1000 Subject: [PATCH 012/100] CLN: Use generators when objects are re-iterated over in core/internals (#58319) * Make _split generator * More iterators * Remove typing --- pandas/core/frame.py | 4 +-- pandas/core/internals/blocks.py | 23 ++++++------- pandas/core/internals/managers.py | 41 +++++++++--------------- pandas/tests/internals/test_internals.py | 2 +- 4 files changed, 29 insertions(+), 41 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 50dc514e7181f..567fcb1ef7c05 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -12925,12 +12925,12 @@ def _to_dict_of_blocks(self): Return a dict of dtype -> Constructor Types that each is a homogeneous dtype. - Internal ONLY - only works for BlockManager + Internal ONLY. 
""" mgr = self._mgr return { k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self) - for k, v in mgr.to_dict().items() + for k, v in mgr.to_iter_dict() } @property diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7be1d5d95ffdf..1b72c164f7945 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -118,6 +118,7 @@ if TYPE_CHECKING: from collections.abc import ( + Generator, Iterable, Sequence, ) @@ -385,20 +386,18 @@ def _split_op_result(self, result: ArrayLike) -> list[Block]: return [nb] @final - def _split(self) -> list[Block]: + def _split(self) -> Generator[Block, None, None]: """ Split a block into a list of single-column blocks. """ assert self.ndim == 2 - new_blocks = [] for i, ref_loc in enumerate(self._mgr_locs): vals = self.values[slice(i, i + 1)] bp = BlockPlacement(ref_loc) nb = type(self)(vals, placement=bp, ndim=2, refs=self.refs) - new_blocks.append(nb) - return new_blocks + yield nb @final def split_and_operate(self, func, *args, **kwargs) -> list[Block]: @@ -537,7 +536,9 @@ def convert_dtypes( rbs = [] for blk in blks: # Determine dtype column by column - sub_blks = [blk] if blk.ndim == 1 or self.shape[0] == 1 else blk._split() + sub_blks = ( + [blk] if blk.ndim == 1 or self.shape[0] == 1 else list(blk._split()) + ) dtypes = [ convert_dtypes( b.values, @@ -1190,8 +1191,7 @@ def putmask(self, mask, new) -> list[Block]: is_array = isinstance(new, np.ndarray) res_blocks = [] - nbs = self._split() - for i, nb in enumerate(nbs): + for i, nb in enumerate(self._split()): n = new if is_array: # we have a different value per-column @@ -1255,8 +1255,7 @@ def where(self, other, cond) -> list[Block]: is_array = isinstance(other, (np.ndarray, ExtensionArray)) res_blocks = [] - nbs = self._split() - for i, nb in enumerate(nbs): + for i, nb in enumerate(self._split()): oth = other if is_array: # we have a different value per-column @@ -1698,8 +1697,7 @@ def where(self, other, cond) -> 
list[Block]: is_array = isinstance(orig_other, (np.ndarray, ExtensionArray)) res_blocks = [] - nbs = self._split() - for i, nb in enumerate(nbs): + for i, nb in enumerate(self._split()): n = orig_other if is_array: # we have a different value per-column @@ -1760,8 +1758,7 @@ def putmask(self, mask, new) -> list[Block]: is_array = isinstance(orig_new, (np.ndarray, ExtensionArray)) res_blocks = [] - nbs = self._split() - for i, nb in enumerate(nbs): + for i, nb in enumerate(self._split()): n = orig_new if is_array: # we have a different value per-column diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 8fda9cd23b508..7c1bcbec1d3f2 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -92,6 +92,8 @@ ) if TYPE_CHECKING: + from collections.abc import Generator + from pandas._typing import ( ArrayLike, AxisInt, @@ -645,8 +647,7 @@ def get_bool_data(self) -> Self: new_blocks.append(blk) elif blk.is_object: - nbs = blk._split() - new_blocks.extend(nb for nb in nbs if nb.is_bool) + new_blocks.extend(nb for nb in blk._split() if nb.is_bool) return self._combine(new_blocks) @@ -1525,7 +1526,9 @@ def _insert_update_mgr_locs(self, loc) -> None: When inserting a new Block at location 'loc', we increment all of the mgr_locs of blocks above that by one. 
""" - for blkno, count in _fast_count_smallints(self.blknos[loc:]): + # Faster version of set(arr) for sequences of small numbers + blknos = np.bincount(self.blknos[loc:]).nonzero()[0] + for blkno in blknos: # .620 this way, .326 of which is in increment_above blk = self.blocks[blkno] blk._mgr_locs = blk._mgr_locs.increment_above(loc) @@ -1597,7 +1600,7 @@ def grouped_reduce(self, func: Callable) -> Self: nrows = 0 else: nrows = result_blocks[0].values.shape[-1] - index = Index(range(nrows)) + index = default_index(nrows) return type(self).from_blocks(result_blocks, [self.axes[0], index]) @@ -1735,21 +1738,18 @@ def unstack(self, unstacker, fill_value) -> BlockManager: bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False) return bm - def to_dict(self) -> dict[str, Self]: + def to_iter_dict(self) -> Generator[tuple[str, Self], None, None]: """ - Return a dict of str(dtype) -> BlockManager + Yield a tuple of (str(dtype), BlockManager) Returns ------- - values : a dict of dtype -> BlockManager + values : a tuple of (str(dtype), BlockManager) """ - - bd: dict[str, list[Block]] = {} - for b in self.blocks: - bd.setdefault(str(b.dtype), []).append(b) - - # TODO(EA2D): the combine will be unnecessary with 2D EAs - return {dtype: self._combine(blocks) for dtype, blocks in bd.items()} + key = lambda block: str(block.dtype) + for dtype, blocks in itertools.groupby(sorted(self.blocks, key=key), key=key): + # TODO(EA2D): the combine will be unnecessary with 2D EAs + yield dtype, self._combine(list(blocks)) def as_array( self, @@ -2330,7 +2330,7 @@ def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, DtypeObj]: def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]: - tuples = list(enumerate(arrays)) + tuples = enumerate(arrays) if not consolidate: return _tuples_to_blocks_no_consolidate(tuples, refs) @@ -2351,7 +2351,7 @@ def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list if 
issubclass(dtype.type, (str, bytes)): dtype = np.dtype(object) - values, placement = _stack_arrays(list(tup_block), dtype) + values, placement = _stack_arrays(tup_block, dtype) if is_dtlike: values = ensure_wrapped_if_datetimelike(values) blk = block_type(values, placement=BlockPlacement(placement), ndim=2) @@ -2450,15 +2450,6 @@ def _merge_blocks( return blocks, False -def _fast_count_smallints(arr: npt.NDArray[np.intp]): - """Faster version of set(arr) for sequences of small numbers.""" - counts = np.bincount(arr) - nz = counts.nonzero()[0] - # Note: list(zip(...) outperforms list(np.c_[nz, counts[nz]]) here, - # in one benchmark by a factor of 11 - return zip(nz, counts[nz]) - - def _preprocess_slice_or_indexer( slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool ): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 92addeb29252a..43bcf84f901b1 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -347,7 +347,7 @@ def test_split(self): # GH#37799 values = np.random.default_rng(2).standard_normal((3, 4)) blk = new_block(values, placement=BlockPlacement([3, 1, 6]), ndim=2) - result = blk._split() + result = list(blk._split()) # check that we get views, not copies values[:] = -9999 From cf953dac795e49a530df33d1f1c012bd7346a555 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20Smr=C5=BE?= Date: Mon, 22 Apr 2024 20:55:37 +0200 Subject: [PATCH 013/100] Allow `tan` to be used in `df.eval`. (#58334) * Allow `tan` to be used in `df.eval`. * Whatsnew: Link issue for fixing `tan` in `eval`. 
--- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/computation/ops.py | 1 + 2 files changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c817e09b3b360..7823f74b7a153 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -458,6 +458,7 @@ Other - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) +- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow to use ``tan`` function. (:issue:`55091`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. 
(:issue:`57069`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 7d8e23abf43b6..b7a1cb173f659 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -45,6 +45,7 @@ _unary_math_ops = ( "sin", "cos", + "tan", "exp", "log", "expm1", From 281d4a8d62b2397225822b3a4f0ba4c4df6cff07 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 23 Apr 2024 00:26:22 +0530 Subject: [PATCH 014/100] DOC: Enforce Numpy Docstring Validation for pandas.HDFStore.keys (#58371) * DOC: add SA01 to HDFStore.keys * DOC: remove HDFStore.keys * DOC: fix typo in See Also for HDFStore.keys --- ci/code_checks.sh | 1 - pandas/io/pytables.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f03ea65866031..17316c80f86ba 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -122,7 +122,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DatetimeTZDtype.tz SA01" \ -i "pandas.DatetimeTZDtype.unit SA01" \ -i "pandas.Grouper PR02" \ - -i "pandas.HDFStore.keys SA01" \ -i "pandas.HDFStore.put PR01,SA01" \ -i "pandas.HDFStore.select SA01" \ -i "pandas.HDFStore.walk SA01" \ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 89c6ac9a58382..5c04342b9eb55 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -656,6 +656,12 @@ def keys(self, include: str = "pandas") -> list[str]: ------ raises ValueError if kind has an illegal value + See Also + -------- + HDFStore.info : Prints detailed information on the store. + HDFStore.get_node : Returns the node with the key. + HDFStore.get_storer : Returns the storer object for a key. 
+ Examples -------- >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) From 19c4769f7d793d715b008675a4f94b2e5570b025 Mon Sep 17 00:00:00 2001 From: KeiOshima Date: Mon, 22 Apr 2024 14:56:51 -0400 Subject: [PATCH 015/100] Doc: Fixing SA01 error for DataFrame: pop and columns (#58359) Doc: Fixinf SA01 error for DataFrame: pop and columns --- ci/code_checks.sh | 1 - pandas/core/frame.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 17316c80f86ba..a7a4bcf165f2a 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -90,7 +90,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.median RT03,SA01" \ -i "pandas.DataFrame.min RT03" \ -i "pandas.DataFrame.plot PR02,SA01" \ - -i "pandas.DataFrame.pop SA01" \ -i "pandas.DataFrame.prod RT03" \ -i "pandas.DataFrame.product RT03" \ -i "pandas.DataFrame.reorder_levels SA01" \ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 567fcb1ef7c05..3bcf41893b6c8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5535,6 +5535,11 @@ def pop(self, item: Hashable) -> Series: Series Series representing the item that is dropped. + See Also + -------- + DataFrame.drop: Drop specified labels from rows or columns. + DataFrame.drop_duplicates: Return DataFrame with duplicate rows removed. + Examples -------- >>> df = pd.DataFrame( @@ -12896,6 +12901,7 @@ def isin_(x): See Also -------- DataFrame.index: The index (row labels) of the DataFrame. + DataFrame.axes: Return a list representing the axes of the DataFrame. 
Examples -------- From 963ce7a594b4346d70a2b39a6fc81af0bb463809 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 23 Apr 2024 01:52:57 +0530 Subject: [PATCH 016/100] DOC: Enforce Numpy Docstring Validation for pandas.HDFStore.select (#58374) * DOC: add SA01 to HDFStore.select * DOC: remove HDFStore.select --- ci/code_checks.sh | 1 - pandas/io/pytables.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a7a4bcf165f2a..599d4d65b9101 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -122,7 +122,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DatetimeTZDtype.unit SA01" \ -i "pandas.Grouper PR02" \ -i "pandas.HDFStore.put PR01,SA01" \ - -i "pandas.HDFStore.select SA01" \ -i "pandas.HDFStore.walk SA01" \ -i "pandas.Index PR07" \ -i "pandas.Index.T SA01" \ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5c04342b9eb55..0af5c753977bd 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -859,6 +859,12 @@ def select( object Retrieved object from file. + See Also + -------- + HDFStore.select_as_coordinates : Returns the selection as an index. + HDFStore.select_column : Returns a single column from the table. + HDFStore.select_as_multiple : Retrieves pandas objects from multiple tables. 
+ Examples -------- >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) From 0cafd1007640b9c6f3542eddd10ffffbaee49c88 Mon Sep 17 00:00:00 2001 From: KeiOshima Date: Mon, 22 Apr 2024 18:11:44 -0400 Subject: [PATCH 017/100] DOC: Fixing SA01 issues for DatetimeIndex: date and tz (#58377) * DOC: Fixing SA01 issues for DatetimeIndex: date and tz * fixing: XPECTED TO FAIL, BUT NOT FAILING error --- ci/code_checks.sh | 4 ---- pandas/core/arrays/datetimes.py | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 599d4d65b9101..801fe7eccd1ed 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -103,7 +103,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.to_markdown SA01" \ -i "pandas.DataFrame.to_parquet RT03" \ -i "pandas.DataFrame.var PR01,RT03,SA01" \ - -i "pandas.DatetimeIndex.date SA01" \ -i "pandas.DatetimeIndex.day_of_year SA01" \ -i "pandas.DatetimeIndex.dayofyear SA01" \ -i "pandas.DatetimeIndex.freqstr SA01" \ @@ -115,7 +114,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DatetimeIndex.std PR01,RT03" \ -i "pandas.DatetimeIndex.to_period RT03" \ -i "pandas.DatetimeIndex.to_pydatetime RT03,SA01" \ - -i "pandas.DatetimeIndex.tz SA01" \ -i "pandas.DatetimeIndex.tz_convert RT03" \ -i "pandas.DatetimeTZDtype SA01" \ -i "pandas.DatetimeTZDtype.tz SA01" \ @@ -270,7 +268,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.as_unit PR01,PR02" \ -i "pandas.Series.dt.ceil PR01,PR02" \ -i "pandas.Series.dt.components SA01" \ - -i "pandas.Series.dt.date SA01" \ -i "pandas.Series.dt.day_name PR01,PR02" \ -i "pandas.Series.dt.day_of_year SA01" \ -i "pandas.Series.dt.dayofyear SA01" \ @@ -290,7 +287,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.strftime PR01,PR02" \ -i "pandas.Series.dt.to_period PR01,PR02,RT03" \ -i "pandas.Series.dt.total_seconds PR01" \ - -i 
"pandas.Series.dt.tz SA01" \ -i "pandas.Series.dt.tz_convert PR01,PR02,RT03" \ -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index fb9f047d432a1..203308b4f0dee 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -593,6 +593,13 @@ def tz(self) -> tzinfo | None: datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None Returns None when the array is tz-naive. + See Also + -------- + DatetimeIndex.tz_localize : Localize tz-naive DatetimeIndex to a + given time zone, or remove timezone from a tz-aware DatetimeIndex. + DatetimeIndex.tz_convert : Convert tz-aware DatetimeIndex from + one time zone to another. + Examples -------- For Series: @@ -1476,6 +1483,14 @@ def date(self) -> npt.NDArray[np.object_]: Namely, the date part of Timestamps without time and timezone information. + See Also + -------- + DatetimeIndex.time : Returns numpy array of :class:`datetime.time` objects. + The time part of the Timestamps. + DatetimeIndex.year : The year of the datetime. + DatetimeIndex.month : The month as January=1, December=12. + DatetimeIndex.day : The day of the datetime. 
+ Examples -------- For Series: From bfe5be01fef4eaecf4ab033e74139b0a3cac4a39 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Apr 2024 15:32:37 -1000 Subject: [PATCH 018/100] REF: Defer creating Index._engine until needed (#58370) --- pandas/core/frame.py | 3 +-- pandas/core/indexes/base.py | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3bcf41893b6c8..4d89272013a52 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4012,7 +4012,6 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: return series._values[index] series = self._get_item(col) - engine = self.index._engine if not isinstance(self.index, MultiIndex): # CategoricalIndex: Trying to use the engine fastpath may give incorrect @@ -4023,7 +4022,7 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: # For MultiIndex going through engine effectively restricts us to # same-length tuples; see test_get_set_value_no_partial_indexing - loc = engine.get_loc(index) + loc = self.index._engine.get_loc(index) return series._values[loc] def isetitem(self, loc, value) -> None: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 63facb61ed498..d2129c54fabc4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -832,7 +832,8 @@ def _reset_identity(self) -> None: @final def _cleanup(self) -> None: - self._engine.clear_mapping() + if "_engine" in self._cache: + self._engine.clear_mapping() @cache_readonly def _engine( From ec1dff9ff3289ab2a456d293e232cffcd4abb90d Mon Sep 17 00:00:00 2001 From: shriyakalakata <87483933+shriyakalakata@users.noreply.github.com> Date: Tue, 23 Apr 2024 12:51:09 -0400 Subject: [PATCH 019/100] Add mailing list link (#58358) * Add mailing list link * Update mailing list link --- doc/source/development/community.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/doc/source/development/community.rst b/doc/source/development/community.rst index ccf7be8e47748..ab8294b8f135a 100644 --- a/doc/source/development/community.rst +++ b/doc/source/development/community.rst @@ -100,6 +100,8 @@ The pandas mailing list `pandas-dev@python.org `_. + .. _community.slack: Community slack From 903cd53911a3e1dd79b51c28db9cfbed95fb4fc1 Mon Sep 17 00:00:00 2001 From: KeiOshima Date: Tue, 23 Apr 2024 12:58:49 -0400 Subject: [PATCH 020/100] DOC: fixinf SA01 issue for DataFrame.to_feather (#58378) --- ci/code_checks.sh | 1 - pandas/core/frame.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 801fe7eccd1ed..cf21ae92496ac 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -99,7 +99,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.std PR01,RT03,SA01" \ -i "pandas.DataFrame.sum RT03" \ -i "pandas.DataFrame.swaplevel SA01" \ - -i "pandas.DataFrame.to_feather SA01" \ -i "pandas.DataFrame.to_markdown SA01" \ -i "pandas.DataFrame.to_parquet RT03" \ -i "pandas.DataFrame.var PR01,RT03,SA01" \ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4d89272013a52..e8a0e37b70145 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2685,6 +2685,16 @@ def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: This includes the `compression`, `compression_level`, `chunksize` and `version` keywords. + See Also + -------- + DataFrame.to_parquet : Write a DataFrame to the binary parquet format. + DataFrame.to_excel : Write object to an Excel sheet. + DataFrame.to_sql : Write to a sql table. + DataFrame.to_csv : Write a csv file. + DataFrame.to_json : Convert the object to a JSON string. + DataFrame.to_html : Render a DataFrame as an HTML table. + DataFrame.to_string : Convert DataFrame to a string. 
+ Notes ----- This function writes the dataframe as a `feather file From ff2727147d367b5b81659931e9804733711e8f6c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 23 Apr 2024 10:04:00 -0700 Subject: [PATCH 021/100] BUG: setitem with mixed-resolution dt64s (#56419) * BUG: setitem with mixed-resolution dt64s * Move whatsnew to 3.0 * de-xfail * improve exception message --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/datetimes.py | 2 +- pandas/core/arrays/timedeltas.py | 2 +- pandas/core/indexes/datetimes.py | 2 ++ pandas/core/internals/blocks.py | 17 ++++++++-- pandas/tests/series/indexing/test_setitem.py | 33 ++++++++++++++++++++ pandas/tests/series/methods/test_clip.py | 28 ++++++++++++++--- pandas/tests/series/methods/test_fillna.py | 14 ++------- 8 files changed, 79 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7823f74b7a153..4213cc8e6cfcf 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -360,6 +360,7 @@ Datetimelike - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`) - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`) - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) +- Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 203308b4f0dee..be087e19ce7b6 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -539,7 +539,7 @@ def _unbox_scalar(self, value) -> np.datetime64: if value is NaT: return np.datetime64(value._value, self.unit) else: - return 
value.as_unit(self.unit).asm8 + return value.as_unit(self.unit, round_ok=False).asm8 def _scalar_from_string(self, value) -> Timestamp | NaTType: return Timestamp(value, tz=self.tz) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 6eb4d234b349d..ff43f97161136 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -322,7 +322,7 @@ def _unbox_scalar(self, value) -> np.timedelta64: if value is NaT: return np.timedelta64(value._value, self.unit) else: - return value.as_unit(self.unit).asm8 + return value.as_unit(self.unit, round_ok=False).asm8 def _scalar_from_string(self, value) -> Timedelta | NaTType: return Timedelta(value) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 7122de745e13b..6d5f32774f485 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -515,6 +515,8 @@ def _parsed_string_to_bounds( freq = OFFSET_TO_PERIOD_FREQSTR.get(reso.attr_abbrev, reso.attr_abbrev) per = Period(parsed, freq=freq) start, end = per.start_time, per.end_time + start = start.as_unit(self.unit) + end = end.as_unit(self.unit) # GH 24076 # If an incoming date string contained a UTC offset, need to localize diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1b72c164f7945..28d3292a1c65b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -38,7 +38,10 @@ Shape, npt, ) -from pandas.errors import AbstractMethodError +from pandas.errors import ( + AbstractMethodError, + OutOfBoundsDatetime, +) from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg @@ -478,7 +481,17 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: f"{self.values.dtype}. Please report a bug at " "https://github.com/pandas-dev/pandas/issues." 
) - return self.astype(new_dtype) + try: + return self.astype(new_dtype) + except OutOfBoundsDatetime as err: + # e.g. GH#56419 if self.dtype is a low-resolution dt64 and we try to + # upcast to a higher-resolution dt64, we may have entries that are + # out of bounds for the higher resolution. + # Re-raise with a more informative message. + raise OutOfBoundsDatetime( + f"Incompatible (high-resolution) value for dtype='{self.dtype}'. " + "Explicitly cast before operating." + ) from err @final def convert(self) -> list[Block]: diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 99535f273075c..7a2a4892f61fb 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -1467,6 +1467,39 @@ def test_slice_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace) raise AssertionError("xfail not relevant for this test.") +@pytest.mark.parametrize( + "exp_dtype", + [ + "M8[ms]", + "M8[ms, UTC]", + "m8[ms]", + ], +) +class TestCoercionDatetime64HigherReso(CoercionTest): + @pytest.fixture + def obj(self, exp_dtype): + idx = date_range("2011-01-01", freq="D", periods=4, unit="s") + if exp_dtype == "m8[ms]": + idx = idx - Timestamp("1970-01-01") + assert idx.dtype == "m8[s]" + elif exp_dtype == "M8[ms, UTC]": + idx = idx.tz_localize("UTC") + return Series(idx) + + @pytest.fixture + def val(self, exp_dtype): + ts = Timestamp("2011-01-02 03:04:05.678").as_unit("ms") + if exp_dtype == "m8[ms]": + return ts - Timestamp("1970-01-01") + elif exp_dtype == "M8[ms, UTC]": + return ts.tz_localize("UTC") + return ts + + @pytest.fixture + def warn(self): + return FutureWarning + + @pytest.mark.parametrize( "val,exp_dtype,warn", [ diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index 75b4050c18afe..8ed422fc118dc 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -3,6 +3,8 @@ 
import numpy as np import pytest +from pandas.errors import OutOfBoundsDatetime + import pandas as pd from pandas import ( Series, @@ -131,12 +133,30 @@ def test_clip_with_datetimes(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("dtype", [object, "M8[us]"]) - def test_clip_with_timestamps_and_oob_datetimes(self, dtype): + def test_clip_with_timestamps_and_oob_datetimes_object(self): # GH-42794 - ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)], dtype=dtype) + ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)], dtype=object) result = ser.clip(lower=Timestamp.min, upper=Timestamp.max) - expected = Series([Timestamp.min, Timestamp.max], dtype=dtype) + expected = Series([Timestamp.min, Timestamp.max], dtype=object) + + tm.assert_series_equal(result, expected) + + def test_clip_with_timestamps_and_oob_datetimes_non_nano(self): + # GH#56410 + dtype = "M8[us]" + ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)], dtype=dtype) + + msg = ( + r"Incompatible \(high-resolution\) value for dtype='datetime64\[us\]'. 
" + "Explicitly cast before operating" + ) + with pytest.raises(OutOfBoundsDatetime, match=msg): + ser.clip(lower=Timestamp.min, upper=Timestamp.max) + + lower = Timestamp.min.as_unit("us") + upper = Timestamp.max.as_unit("us") + result = ser.clip(lower=lower, upper=upper) + expected = Series([lower, upper], dtype=dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 0965d36e4827d..592dba253532d 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -308,12 +308,7 @@ def test_datetime64_fillna(self): "scalar", [ False, - pytest.param( - True, - marks=pytest.mark.xfail( - reason="GH#56410 scalar case not yet addressed" - ), - ), + True, ], ) @pytest.mark.parametrize("tz", [None, "UTC"]) @@ -342,12 +337,7 @@ def test_datetime64_fillna_mismatched_reso_no_rounding(self, tz, scalar): "scalar", [ False, - pytest.param( - True, - marks=pytest.mark.xfail( - reason="GH#56410 scalar case not yet addressed" - ), - ), + True, ], ) def test_timedelta64_fillna_mismatched_reso_no_rounding(self, scalar): From 191a56c32578be7ae7d231108abbe4ce1c4378e9 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 23 Apr 2024 22:50:15 +0530 Subject: [PATCH 022/100] DOC: Enforce Numpy Docstring Validation for pandas.HDFStore.put (#58384) * DOC: add SA01 and PR01 to HDFStore.put * DOC: remove SA01 and PR01 of HDFStore.put --- ci/code_checks.sh | 1 - pandas/io/pytables.py | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index cf21ae92496ac..5993fabfc9d6c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -118,7 +118,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DatetimeTZDtype.tz SA01" \ -i "pandas.DatetimeTZDtype.unit SA01" \ -i "pandas.Grouper PR02" \ - -i "pandas.HDFStore.put PR01,SA01" \ -i "pandas.HDFStore.walk SA01" \ -i 
"pandas.Index PR07" \ -i "pandas.Index.T SA01" \ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 0af5c753977bd..75e9b779e5094 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1144,12 +1144,27 @@ def put( Write DataFrame index as a column. append : bool, default False This will force Table format, append the input data to the existing. + complib : default None + This parameter is currently not accepted. + complevel : int, 0-9, default None + Specifies a compression level for data. + A value of 0 or None disables compression. + min_itemsize : int, dict, or None + Dict of columns that specify minimum str sizes. + nan_rep : str + Str to use as str nan representation. data_columns : list of columns or True, default None List of columns to create as data columns, or True to use all columns. See `here `__. encoding : str, default None Provide an encoding for strings. + errors : str, default 'strict' + The error handling scheme to use for encoding errors. + The default is 'strict' meaning that encoding errors raise a + UnicodeEncodeError. Other possible values are 'ignore', 'replace' and + 'xmlcharrefreplace' as well as any other name registered with + codecs.register_error that can handle UnicodeEncodeErrors. track_times : bool, default True Parameter is propagated to 'create_table' method of 'PyTables'. If set to False it enables to have the same h5 files (same hashes) @@ -1157,6 +1172,11 @@ def put( dropna : bool, default False, optional Remove missing values. + See Also + -------- + HDFStore.info : Prints detailed information on the store. + HDFStore.get_storer : Returns the storer object for a key. 
+ Examples -------- >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) From bd9c09b4331f890fc9fb4698deaf2d168060941b Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Tue, 23 Apr 2024 20:21:58 +0300 Subject: [PATCH 023/100] DEPR: to_pytimedelta return Index[object] (#58383) * DEPR: to_pytimedelta return Index[object] * ignore doctest warning --------- Co-authored-by: Abdulaziz Aloqeely <52792999+DAzVise@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/conftest.py | 1 + pandas/core/indexes/accessors.py | 20 +++++++++++++++++++ pandas/tests/extension/test_arrow.py | 8 ++++++-- .../series/accessors/test_cat_accessor.py | 3 +++ .../series/accessors/test_dt_accessor.py | 4 +++- 6 files changed, 34 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4213cc8e6cfcf..02e4aba667408 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -199,6 +199,7 @@ Other Deprecations - Deprecated allowing non-keyword arguments in :meth:`DataFrame.all`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.var`, :meth:`DataFrame.std`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt`, :meth:`Series.all`, :meth:`Series.min`, :meth:`Series.max`, :meth:`Series.sum`, :meth:`Series.prod`, :meth:`Series.mean`, :meth:`Series.median`, :meth:`Series.sem`, :meth:`Series.var`, :meth:`Series.std`, :meth:`Series.skew`, and :meth:`Series.kurt`. (:issue:`57087`) - Deprecated allowing non-keyword arguments in :meth:`Series.to_markdown` except ``buf``. (:issue:`57280`) - Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf``. 
(:issue:`57280`) +- Deprecated behavior of :meth:`Series.dt.to_pytimedelta`, in a future version this will return a :class:`Series` containing python ``datetime.timedelta`` objects instead of an ``ndarray`` of timedelta; this matches the behavior of other :meth:`Series.dt` properties. (:issue:`57463`) - Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`) - diff --git a/pandas/conftest.py b/pandas/conftest.py index 34489bb70575a..21100178262c8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -157,6 +157,7 @@ def pytest_collection_modifyitems(items, config) -> None: ("SeriesGroupBy.fillna", "SeriesGroupBy.fillna is deprecated"), ("SeriesGroupBy.idxmin", "The behavior of Series.idxmin"), ("SeriesGroupBy.idxmax", "The behavior of Series.idxmax"), + ("to_pytimedelta", "The behavior of TimedeltaProperties.to_pytimedelta"), # Docstring divides by zero to show behavior difference ("missing.mask_zero_div_zero", "divide by zero encountered"), ( diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 2bb234e174563..3dcd1fedc8d64 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -9,10 +9,12 @@ NoReturn, cast, ) +import warnings import numpy as np from pandas._libs import lib +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_integer_dtype, @@ -210,6 +212,15 @@ def _delegate_method(self, name: str, *args, **kwargs): return result def to_pytimedelta(self): + # GH 57463 + warnings.warn( + f"The behavior of {type(self).__name__}.to_pytimedelta is deprecated, " + "in a future version this will return a Series containing python " + "datetime.timedelta objects instead of an ndarray. 
To retain the " + "old behavior, call `np.array` on the result", + FutureWarning, + stacklevel=find_stack_level(), + ) return cast(ArrowExtensionArray, self._parent.array)._dt_to_pytimedelta() def to_pydatetime(self) -> Series: @@ -462,6 +473,15 @@ def to_pytimedelta(self) -> np.ndarray: datetime.timedelta(days=2), datetime.timedelta(days=3), datetime.timedelta(days=4)], dtype=object) """ + # GH 57463 + warnings.warn( + f"The behavior of {type(self).__name__}.to_pytimedelta is deprecated, " + "in a future version this will return a Series containing python " + "datetime.timedelta objects instead of an ndarray. To retain the " + "old behavior, call `np.array` on the result", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._get_values().to_pytimedelta() @property diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9b2251d0b7d4a..79440b55dd5dd 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2861,12 +2861,16 @@ def test_dt_to_pytimedelta(): data = [timedelta(1, 2, 3), timedelta(1, 2, 4)] ser = pd.Series(data, dtype=ArrowDtype(pa.duration("ns"))) - result = ser.dt.to_pytimedelta() + msg = "The behavior of ArrowTemporalProperties.to_pytimedelta is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.dt.to_pytimedelta() expected = np.array(data, dtype=object) tm.assert_numpy_array_equal(result, expected) assert all(type(res) is timedelta for res in result) - expected = ser.astype("timedelta64[ns]").dt.to_pytimedelta() + msg = "The behavior of TimedeltaProperties.to_pytimedelta is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = ser.astype("timedelta64[ns]").dt.to_pytimedelta() tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/series/accessors/test_cat_accessor.py b/pandas/tests/series/accessors/test_cat_accessor.py index ca2768efd5c68..ce8ea27ea1fa2 100644 --- 
a/pandas/tests/series/accessors/test_cat_accessor.py +++ b/pandas/tests/series/accessors/test_cat_accessor.py @@ -200,6 +200,9 @@ def test_dt_accessor_api_for_categorical(self, idx): if func == "to_period" and getattr(idx, "tz", None) is not None: # dropping TZ warn_cls.append(UserWarning) + elif func == "to_pytimedelta": + # GH 57463 + warn_cls.append(FutureWarning) if warn_cls: warn_cls = tuple(warn_cls) else: diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 5f0057ac50b47..8c60f7beb317d 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -192,7 +192,9 @@ def test_dt_namespace_accessor_timedelta(self): assert isinstance(result, DataFrame) tm.assert_index_equal(result.index, ser.index) - result = ser.dt.to_pytimedelta() + msg = "The behavior of TimedeltaProperties.to_pytimedelta is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.dt.to_pytimedelta() assert isinstance(result, np.ndarray) assert result.dtype == object From 9b7d09d69e252e6afff4d991728713a541e03045 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Tue, 23 Apr 2024 20:23:16 +0300 Subject: [PATCH 024/100] TST: No longer produce test_stata.dta file after running test suite (#58381) Use tmp_path fixture Co-authored-by: Abdulaziz Aloqeely <52792999+DAzVise@users.noreply.github.com> --- pandas/tests/io/test_stata.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 43c62237c6786..2650f351e2203 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1962,7 +1962,7 @@ def test_writer_118_exceptions(self, temp_file): "dtype_backend", ["numpy_nullable", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))], ) - def test_read_write_ea_dtypes(self, dtype_backend, 
temp_file): + def test_read_write_ea_dtypes(self, dtype_backend, temp_file, tmp_path): df = DataFrame( { "a": [1, 2, None], @@ -1974,7 +1974,8 @@ def test_read_write_ea_dtypes(self, dtype_backend, temp_file): index=pd.Index([0, 1, 2], name="index"), ) df = df.convert_dtypes(dtype_backend=dtype_backend) - df.to_stata("test_stata.dta", version=118) + stata_path = tmp_path / "test_stata.dta" + df.to_stata(stata_path, version=118) df.to_stata(temp_file) written_and_read_again = self.read_dta(temp_file) From 23dd1f12aea8bfd503ea86ce1850de817cf0fe43 Mon Sep 17 00:00:00 2001 From: Nrezhang <102526155+Nrezhang@users.noreply.github.com> Date: Tue, 23 Apr 2024 13:25:32 -0400 Subject: [PATCH 025/100] #58324 (#58379) --- doc/source/user_guide/style.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index f831723f44931..43da43a983429 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -1908,7 +1908,7 @@ "- Provide an API that is pleasing to use interactively and is \"good enough\" for many tasks\n", "- Provide the foundations for dedicated libraries to build on\n", "\n", - "If you build a great library on top of this, let us know and we'll [link](https://pandas.pydata.org/pandas-docs/stable/ecosystem.html) to it.\n", + "If you build a great library on top of this, let us know and we'll [link](https://pandas.pydata.org/community/ecosystem.html) to it.\n", "\n", "### Subclassing\n", "\n", From ffca68426fe32c61428aaec02e2283063148ed47 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 24 Apr 2024 00:04:15 +0530 Subject: [PATCH 026/100] DOC: Enforce Numpy Docstring Validation for pandas.HDFStore.walk (#58386) * DOC: add SA01 to HDFStore.walk * DOC: remove SA01 of HDFStore.walk --- ci/code_checks.sh | 1 - pandas/io/pytables.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 
5993fabfc9d6c..24dacd6b48a42 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -118,7 +118,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DatetimeTZDtype.tz SA01" \ -i "pandas.DatetimeTZDtype.unit SA01" \ -i "pandas.Grouper PR02" \ - -i "pandas.HDFStore.walk SA01" \ -i "pandas.Index PR07" \ -i "pandas.Index.T SA01" \ -i "pandas.Index.append PR07,RT03,SA01" \ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 75e9b779e5094..d585c59dd5581 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1595,6 +1595,10 @@ def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]: leaves : list Names (strings) of the pandas objects contained in `path`. + See Also + -------- + HDFStore.info : Prints detailed information on the store. + Examples -------- >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) From 9d5c88e52ac1a652e8392003a8aa4cdb52bc29f6 Mon Sep 17 00:00:00 2001 From: bdwzhangumich <112042021+bdwzhangumich@users.noreply.github.com> Date: Tue, 23 Apr 2024 14:07:42 -0600 Subject: [PATCH 027/100] ENH: Implement cummax and cummin in _accumulate() for ordered Categorical arrays (#58360) * Added tests with and without np.nan * Added tests for cummin and cummax * Fixed series tests expected series, rewrote categorical arrays to use pd.Categorical * Fixed cat not defined error and misspelling * Implement _accumulate for Categorical * fixed misspellings in tests * fixed expected categories on tests * Updated whatsnew * Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Removed testing for _accumulate. 
* Moved categorical_accumulations.py logic to categorical.py * Assigned expected results to expected variable; Added pytest.mark.parametrize to test_cummax_cummin_ordered_categorical_nan with skipna and expected data --------- Co-authored-by: Christopher Xiang Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: Chris Xiang <124408670+xiangchris@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/categorical.py | 23 ++++++++++++ pandas/tests/series/test_cumulative.py | 52 ++++++++++++++++++++++++++ 3 files changed, 76 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 02e4aba667408..9a432e03e9cf4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -41,6 +41,7 @@ Other enhancements - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) +- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`) .. --------------------------------------------------------------------------- ..
_whatsnew_300.notable_bug_fixes: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 8d6880fc2acb3..6a3cf4590568c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -6,6 +6,7 @@ from shutil import get_terminal_size from typing import ( TYPE_CHECKING, + Callable, Literal, cast, overload, @@ -2508,6 +2509,28 @@ def equals(self, other: object) -> bool: return np.array_equal(self._codes, other._codes) return False + def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> Self: + func: Callable + if name == "cummin": + func = np.minimum.accumulate + elif name == "cummax": + func = np.maximum.accumulate + else: + raise TypeError(f"Accumulation {name} not supported for {type(self)}") + self.check_for_ordered(name) + + codes = self.codes.copy() + mask = self.isna() + if func == np.minimum.accumulate: + codes[mask] = np.iinfo(codes.dtype.type).max + # no need to change codes for maximum because codes[mask] is already -1 + if not skipna: + mask = np.maximum.accumulate(mask) + + codes = func(codes) + codes[mask] = -1 + return self._simple_new(codes, dtype=self._dtype) + @classmethod def _concat_same_type(cls, to_concat: Sequence[Self], axis: AxisInt = 0) -> Self: from pandas.core.dtypes.concat import union_categoricals diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 9b7b08127a550..a9d5486139b46 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -170,6 +170,58 @@ def test_cummethods_bool_in_object_dtype(self, method, expected): result = getattr(ser, method)() tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "method, order", + [ + ["cummax", "abc"], + ["cummin", "cba"], + ], + ) + def test_cummax_cummin_on_ordered_categorical(self, method, order): + # GH#52335 + cat = pd.CategoricalDtype(list(order), ordered=True) + ser = pd.Series( + list("ababcab"), + dtype=cat, + ) + 
result = getattr(ser, method)() + expected = pd.Series( + list("abbbccc"), + dtype=cat, + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "skip, exp", + [ + [True, ["a", np.nan, "b", "b", "c"]], + [False, ["a", np.nan, np.nan, np.nan, np.nan]], + ], + ) + @pytest.mark.parametrize( + "method, order", + [ + ["cummax", "abc"], + ["cummin", "cba"], + ], + ) + def test_cummax_cummin_ordered_categorical_nan(self, skip, exp, method, order): + # GH#52335 + cat = pd.CategoricalDtype(list(order), ordered=True) + ser = pd.Series( + ["a", np.nan, "b", "a", "c"], + dtype=cat, + ) + result = getattr(ser, method)(skipna=skip) + expected = pd.Series( + exp, + dtype=cat, + ) + tm.assert_series_equal( + result, + expected, + ) + def test_cumprod_timedelta(self): # GH#48111 ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)]) From 8aa4f0eb5a7a456f9476ff4b1bd6743ca25c949b Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 24 Apr 2024 01:38:30 +0530 Subject: [PATCH 028/100] DOC: Enforce Numpy Docstring Validation for pandas.DatetimeTZDtype.unit (#58387) * DOC: add SA01 to DatetimeTZDtype.tz * DOC: remove SA01 of DatetimeTZDtype.unit --- ci/code_checks.sh | 1 - pandas/core/dtypes/dtypes.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 24dacd6b48a42..066c7176fcc34 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -116,7 +116,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DatetimeIndex.tz_convert RT03" \ -i "pandas.DatetimeTZDtype SA01" \ -i "pandas.DatetimeTZDtype.tz SA01" \ - -i "pandas.DatetimeTZDtype.unit SA01" \ -i "pandas.Grouper PR02" \ -i "pandas.Index PR07" \ -i "pandas.Index.T SA01" \ diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 98e689528744e..0a97a0d03c22a 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -793,6 +793,10 @@ def unit(self) -> str_type: """ The precision of the 
datetime data. + See Also + -------- + DatetimeTZDtype.tz : Retrieves the timezone. + Examples -------- >>> from zoneinfo import ZoneInfo From e9b0a3c914088ce1f89cde16c61f61807ccc6730 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Wed, 24 Apr 2024 02:30:51 +0300 Subject: [PATCH 029/100] CLN: Enforce empty bool indexer deprecation (#58390) * CLN: Enforce empty bool indexer deprecation * Add whatsnew entry --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/indexes/base.py | 9 +++------ pandas/tests/indexes/test_base.py | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9a432e03e9cf4..781b3b2282a87 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -222,6 +222,7 @@ Removal of prior version deprecations/changes - Disallow automatic casting to object in :class:`Series` logical operations (``&``, ``^``, ``||``) between series with mismatched indexes and dtypes other than ``object`` or ``bool`` (:issue:`52538`) - Disallow calling :meth:`Series.replace` or :meth:`DataFrame.replace` without a ``value`` and with non-dict-like ``to_replace`` (:issue:`33302`) - Disallow constructing a :class:`arrays.SparseArray` with scalar data (:issue:`53039`) +- Disallow indexing an :class:`Index` with a boolean indexer of length zero, it now raises ``ValueError`` (:issue:`55820`) - Disallow non-standard (``np.ndarray``, :class:`Index`, :class:`ExtensionArray`, or :class:`Series`) to :func:`isin`, :func:`unique`, :func:`factorize` (:issue:`52986`) - Disallow passing a pandas type to :meth:`Index.view` (:issue:`55709`) - Disallow units other than "s", "ms", "us", "ns" for datetime64 and timedelta64 dtypes in :func:`array` (:issue:`53817`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d2129c54fabc4..5654111132b5e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py 
@@ -5033,12 +5033,9 @@ def __getitem__(self, key): if not isinstance(self.dtype, ExtensionDtype): if len(key) == 0 and len(key) != len(self): - warnings.warn( - "Using a boolean indexer with length 0 on an Index with " - "length greater than 0 is deprecated and will raise in a " - "future version.", - FutureWarning, - stacklevel=find_stack_level(), + raise ValueError( + "The length of the boolean indexer cannot be 0 " + "when the Index has length greater than 0." ) result = getitem(key) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 3a2d04d3ffdc2..301c4794be4ef 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -481,7 +481,7 @@ def test_empty_fancy(self, index, dtype, request, using_infer_string): assert index[[]].identical(empty_index) if dtype == np.bool_: - with tm.assert_produces_warning(FutureWarning, match="is deprecated"): + with pytest.raises(ValueError, match="length of the boolean indexer"): assert index[empty_arr].identical(empty_index) else: assert index[empty_arr].identical(empty_index) From b6c15ea2cb8b50035be5b111cd656d6983d00788 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Wed, 24 Apr 2024 19:18:02 +0300 Subject: [PATCH 030/100] BUG: Let check_exact_index default to True for integers (#58189) * Default check_exact_index to True for integers * Fix pyright issue * fix logic for multiindex * Pre-commit stuff * Address review comments --------- Co-authored-by: Abdulaziz Aloqeely <52792999+DAzVise@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_testing/asserters.py | 22 +++++++++- pandas/tests/util/test_assert_series_equal.py | 41 +++++++++++++++++-- 3 files changed, 59 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 781b3b2282a87..027c692c6c89e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ 
-159,6 +159,7 @@ Other API changes - Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`) - pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`) - pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`) +- when comparing the indexes in :func:`testing.assert_series_equal`, check_exact defaults to True if an :class:`Index` is of integer dtype. (:issue:`57386`) .. --------------------------------------------------------------------------- .. _whatsnew_300.deprecations: diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 3aacd3099c334..543d7944e4c5d 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -861,12 +861,19 @@ def assert_series_equal( check_names : bool, default True Whether to check the Series and Index names attribute. check_exact : bool, default False - Whether to compare number exactly. + Whether to compare numbers exactly. This also applies when checking + Index equivalence. .. versionchanged:: 2.2.0 Defaults to True for integer dtypes if none of ``check_exact``, ``rtol`` and ``atol`` are specified. + + .. versionchanged:: 3.0.0 + + check_exact for comparing the Indexes defaults to True by + checking if an Index is of integer dtype. + check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype.
check_categorical : bool, default True @@ -902,7 +909,6 @@ def assert_series_equal( >>> tm.assert_series_equal(a, b) """ __tracebackhide__ = True - check_exact_index = False if check_exact is lib.no_default else check_exact if ( check_exact is lib.no_default and rtol is lib.no_default @@ -914,8 +920,20 @@ def assert_series_equal( or is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype) ) + left_index_dtypes = ( + [left.index.dtype] if left.index.nlevels == 1 else left.index.dtypes + ) + right_index_dtypes = ( + [right.index.dtype] if right.index.nlevels == 1 else right.index.dtypes + ) + check_exact_index = all( + dtype.kind in "iu" for dtype in left_index_dtypes + ) or all(dtype.kind in "iu" for dtype in right_index_dtypes) elif check_exact is lib.no_default: check_exact = False + check_exact_index = False + else: + check_exact_index = check_exact rtol = rtol if rtol is not lib.no_default else 1.0e-5 atol = atol if atol is not lib.no_default else 1.0e-8 diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 0b3bc07c17452..f75f48157aad2 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -475,9 +475,44 @@ def test_assert_series_equal_int_tol(): ) -def test_assert_series_equal_index_exact_default(): +@pytest.mark.parametrize( + "left_idx, right_idx", + [ + ( + pd.Index([0, 0.2, 0.4, 0.6, 0.8, 1]), + pd.Index(np.linspace(0, 1, 6)), + ), + ( + pd.MultiIndex.from_arrays([[0, 0, 0, 0, 1, 1], [0, 0.2, 0.4, 0.6, 0.8, 1]]), + pd.MultiIndex.from_arrays([[0, 0, 0, 0, 1, 1], np.linspace(0, 1, 6)]), + ), + ( + pd.MultiIndex.from_arrays( + [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 4, 5, 10000000000001]] + ), + pd.MultiIndex.from_arrays( + [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 4, 5, 10000000000002]] + ), + ), + pytest.param( + pd.Index([1, 2, 3, 4, 5, 10000000000001]), + pd.Index([1, 2, 3, 4, 5, 10000000000002]), + 
marks=pytest.mark.xfail(reason="check_exact_index defaults to True"), + ), + pytest.param( + pd.MultiIndex.from_arrays( + [[0, 0, 0, 0, 1, 1], [1, 2, 3, 4, 5, 10000000000001]] + ), + pd.MultiIndex.from_arrays( + [[0, 0, 0, 0, 1, 1], [1, 2, 3, 4, 5, 10000000000002]] + ), + marks=pytest.mark.xfail(reason="check_exact_index defaults to True"), + ), + ], +) +def test_assert_series_equal_check_exact_index_default(left_idx, right_idx): # GH#57067 - ser1 = Series(np.zeros(6, dtype=int), [0, 0.2, 0.4, 0.6, 0.8, 1]) - ser2 = Series(np.zeros(6, dtype=int), np.linspace(0, 1, 6)) + ser1 = Series(np.zeros(6, dtype=int), left_idx) + ser2 = Series(np.zeros(6, dtype=int), right_idx) tm.assert_series_equal(ser1, ser2) tm.assert_frame_equal(ser1.to_frame(), ser2.to_frame()) From c342e9f0be5bae1895f60e6afc9435d0afb087ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20Smr=C5=BE?= Date: Wed, 24 Apr 2024 18:19:00 +0200 Subject: [PATCH 031/100] Extend eval test of standard functions to cover python engine. (#58393) Extend eval test of ops to cover pandas engine. 
--- pandas/tests/computation/test_eval.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 8f14c562fa7c3..f7d1fcfa3e469 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1609,22 +1609,20 @@ def eval(self, *args, **kwargs): kwargs["level"] = kwargs.pop("level", 0) + 1 return pd.eval(*args, **kwargs) - @pytest.mark.skipif( - not NUMEXPR_INSTALLED, reason="Unary ops only implemented for numexpr" - ) + @pytest.mark.filterwarnings("ignore::RuntimeWarning") @pytest.mark.parametrize("fn", _unary_math_ops) - def test_unary_functions(self, fn): + def test_unary_functions(self, fn, engine, parser): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10)}) a = df.a expr = f"{fn}(a)" - got = self.eval(expr) + got = self.eval(expr, engine=engine, parser=parser) with np.errstate(all="ignore"): expect = getattr(np, fn)(a) tm.assert_series_equal(got, expect, check_names=False) @pytest.mark.parametrize("fn", _binary_math_ops) - def test_binary_functions(self, fn): + def test_binary_functions(self, fn, engine, parser): df = DataFrame( { "a": np.random.default_rng(2).standard_normal(10), @@ -1635,7 +1633,7 @@ def test_binary_functions(self, fn): b = df.b expr = f"{fn}(a, b)" - got = self.eval(expr) + got = self.eval(expr, engine=engine, parser=parser) with np.errstate(all="ignore"): expect = getattr(np, fn)(a, b) tm.assert_almost_equal(got, expect, check_names=False) From ea2f857be39fe2b6c360178a5d63b8ea7173a5ed Mon Sep 17 00:00:00 2001 From: Gianluca Ficarelli <26835404+GianlucaFicarelli@users.noreply.github.com> Date: Wed, 24 Apr 2024 18:19:40 +0200 Subject: [PATCH 032/100] PERF: MultiIndex.memory_usage shouldn't trigger the index engine (#58385) * PERF: MultiIndex.memory_usage shouldn't trigger the index engine Ignore the index engine when it isn't already cached. 
* Move test, sort whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/indexes/base.py | 5 +++-- pandas/core/indexes/multi.py | 5 +++-- pandas/tests/indexes/test_old_base.py | 24 ++++++++++++++++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 027c692c6c89e..ca97e2b6ffb6b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -333,6 +333,7 @@ Performance improvements - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) - Performance improvement in :meth:`Index.to_frame` returning a :class:`RangeIndex` columns of a :class:`Index` when possible. (:issue:`58018`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) +- Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`) - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. 
(:issue:`57588`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) - Performance improvement in :meth:`RangeIndex.argmin` and :meth:`RangeIndex.argmax` (:issue:`57823`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5654111132b5e..e08b585920779 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4863,8 +4863,9 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike: def memory_usage(self, deep: bool = False) -> int: result = self._memory_usage(deep=deep) - # include our engine hashtable - result += self._engine.sizeof(deep=deep) + # include our engine hashtable, only if it's already cached + if "_engine" in self._cache: + result += self._engine.sizeof(deep=deep) return result @final diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 21ce9b759f2df..c8e16fad00d5b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1391,8 +1391,9 @@ def _nbytes(self, deep: bool = False) -> int: names_nbytes = sum(getsizeof(i, objsize) for i in self.names) result = level_nbytes + label_nbytes + names_nbytes - # include our engine hashtable - result += self._engine.sizeof(deep=deep) + # include our engine hashtable, only if it's already cached + if "_engine" in self._cache: + result += self._engine.sizeof(deep=deep) return result # -------------------------------------------------------------------- diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 9b4470021cc1d..b929616c814ee 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -326,6 +326,30 @@ def test_memory_usage(self, index): if index.inferred_type == "object": assert result3 > result2 + def test_memory_usage_doesnt_trigger_engine(self, index): + index._cache.clear() + assert "_engine" not in index._cache + + res_without_engine = index.memory_usage() + assert "_engine" 
not in index._cache + + # explicitly load and cache the engine + _ = index._engine + assert "_engine" in index._cache + + res_with_engine = index.memory_usage() + + # the empty engine doesn't affect the result even when initialized with values, + # because engine.sizeof() doesn't consider the content of engine.values + assert res_with_engine == res_without_engine + + if len(index) == 0: + assert res_without_engine == 0 + assert res_with_engine == 0 + else: + assert res_without_engine > 0 + assert res_with_engine > 0 + def test_argsort(self, index): if isinstance(index, CategoricalIndex): pytest.skip(f"{type(self).__name__} separately tested") From 9e7565ac0e1886f7ae27981ef67561563326ddd6 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 24 Apr 2024 21:50:25 +0530 Subject: [PATCH 033/100] DOC: Enforce Numpy Docstring Validation for pandas.Index.unique (#58399) * DOC: add RT03 to pandas.Index.unique * DOC: remove pandas.Index.unique --- ci/code_checks.sh | 1 - pandas/core/indexes/base.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 066c7176fcc34..101d650a0e768 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -153,7 +153,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.take PR01,PR07" \ -i "pandas.Index.to_list RT03" \ -i "pandas.Index.union PR07,RT03,SA01" \ - -i "pandas.Index.unique RT03" \ -i "pandas.Index.view GL08" \ -i "pandas.Int16Dtype SA01" \ -i "pandas.Int32Dtype SA01" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e08b585920779..2bb0aedb8bd84 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2647,6 +2647,7 @@ def unique(self, level: Hashable | None = None) -> Self: Returns ------- Index + Unique values in the index. 
See Also -------- From ba60432eda7f7ea0479eb63aae43ac680a2b8678 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Wed, 24 Apr 2024 19:25:35 +0300 Subject: [PATCH 034/100] TST: Added match argument for most uses of tm.assert_produces_warning (#58396) * Fix for all FutureWarnings * Add match for most warnings * Cleaner code --------- Co-authored-by: Abdulaziz Aloqeely <52792999+DAzVise@users.noreply.github.com> --- .../development/contributing_codebase.rst | 5 +-- .../tests/arrays/sparse/test_constructors.py | 6 ++-- pandas/tests/arrays/test_datetimelike.py | 2 +- pandas/tests/computation/test_eval.py | 5 +-- pandas/tests/dtypes/test_common.py | 2 +- pandas/tests/dtypes/test_generic.py | 2 +- pandas/tests/frame/indexing/test_indexing.py | 2 +- pandas/tests/frame/indexing/test_setitem.py | 5 ++- pandas/tests/frame/methods/test_to_dict.py | 2 +- pandas/tests/frame/test_arithmetic.py | 7 ++-- pandas/tests/frame/test_reductions.py | 2 +- .../tests/indexes/base_class/test_setops.py | 4 +-- .../datetimes/methods/test_to_period.py | 20 ++++------- pandas/tests/indexes/multi/test_setops.py | 2 +- pandas/tests/indexes/test_base.py | 4 +-- pandas/tests/indexes/test_index_new.py | 11 ++---- pandas/tests/indexes/test_setops.py | 2 +- pandas/tests/internals/test_internals.py | 12 ++++--- pandas/tests/io/formats/test_css.py | 29 +++++++-------- pandas/tests/io/formats/test_to_excel.py | 2 +- .../tests/io/json/test_json_table_schema.py | 2 +- pandas/tests/io/test_clipboard.py | 2 +- pandas/tests/io/test_common.py | 2 +- pandas/tests/io/test_compression.py | 2 +- pandas/tests/io/test_sql.py | 2 +- pandas/tests/io/test_stata.py | 35 ++++++++++++------- pandas/tests/plotting/frame/test_frame.py | 10 +++--- .../plotting/frame/test_frame_subplots.py | 4 +-- pandas/tests/plotting/test_boxplot_method.py | 11 +++--- pandas/tests/reductions/test_reductions.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 5 +-- 
.../timestamp/methods/test_to_pydatetime.py | 3 +- .../tests/scalar/timestamp/test_timestamp.py | 3 +- pandas/tests/series/test_arithmetic.py | 5 +-- pandas/tests/test_expressions.py | 13 +++---- pandas/tests/test_optional_dependency.py | 6 ++-- pandas/tests/tools/test_to_datetime.py | 5 ++- pandas/tests/window/test_expanding.py | 4 ++- pandas/tests/window/test_rolling_quantile.py | 4 ++- 39 files changed, 130 insertions(+), 116 deletions(-) diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 39e279fd5c917..28129440b86d7 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -557,11 +557,12 @@ is being raised, using ``pytest.raises`` instead. Testing a warning ^^^^^^^^^^^^^^^^^ -Use ``tm.assert_produces_warning`` as a context manager to check that a block of code raises a warning. +Use ``tm.assert_produces_warning`` as a context manager to check that a block of code raises a warning +and specify the warning message using the ``match`` argument. .. code-block:: python - with tm.assert_produces_warning(DeprecationWarning): + with tm.assert_produces_warning(DeprecationWarning, match="the warning message"): pd.deprecated_function() If a warning should specifically not happen in a block of code, pass ``False`` into the context manager. 
diff --git a/pandas/tests/arrays/sparse/test_constructors.py b/pandas/tests/arrays/sparse/test_constructors.py index 012ff1da0d431..0bf3ab77e9eed 100644 --- a/pandas/tests/arrays/sparse/test_constructors.py +++ b/pandas/tests/arrays/sparse/test_constructors.py @@ -90,13 +90,13 @@ def test_constructor_warns_when_losing_timezone(self): dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") expected = SparseArray(np.asarray(dti, dtype="datetime64[ns]")) - - with tm.assert_produces_warning(UserWarning): + msg = "loses timezone information" + with tm.assert_produces_warning(UserWarning, match=msg): result = SparseArray(dti) tm.assert_sp_array_equal(result, expected) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=msg): result = SparseArray(pd.Series(dti)) tm.assert_sp_array_equal(result, expected) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index cfc04b5c91354..22c63af59a47c 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -778,7 +778,7 @@ def test_to_period_2d(self, arr1d): arr2d = arr1d.reshape(1, -1) warn = None if arr1d.tz is None else UserWarning - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(warn, match="will drop timezone information"): result = arr2d.to_period("D") expected = arr1d.to_period("D").reshape(1, -1) tm.assert_period_array_equal(result, expected) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index f7d1fcfa3e469..ebbb31205e264 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1014,7 +1014,8 @@ def test_performance_warning_for_poor_alignment( else: seen = False - with tm.assert_produces_warning(seen): + msg = "Alignment difference on axis 1 is larger than an order of magnitude" + with tm.assert_produces_warning(seen, match=msg): pd.eval("df + s", engine=engine, 
parser=parser) s = Series(np.random.default_rng(2).standard_normal(1000)) @@ -1036,7 +1037,7 @@ def test_performance_warning_for_poor_alignment( else: wrn = False - with tm.assert_produces_warning(wrn) as w: + with tm.assert_produces_warning(wrn, match=msg) as w: pd.eval("df + s", engine=engine, parser=parser) if not is_python_engine and performance_warning: diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index c34c97b6e4f04..f47815ee059af 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -797,5 +797,5 @@ def test_pandas_dtype_numpy_warning(): def test_pandas_dtype_ea_not_instance(): # GH 31356 GH 54592 - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="without any arguments"): assert pandas_dtype(CategoricalDtype) == CategoricalDtype() diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py index 02c827853b29d..261f86bfb0326 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -124,7 +124,7 @@ def test_setattr_warnings(): # this should not raise a warning df.two.not_an_index = [1, 2] - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="doesn't allow columns"): # warn when setting column to nonexistent name df.four = df.two + 2 assert df.four.sum() > df.two.sum() diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 5a6fe07aa007b..69e6228d6efde 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -145,7 +145,7 @@ def test_getitem_boolean(self, mixed_float_frame, mixed_int_frame, datetime_fram # we are producing a warning that since the passed boolean # key is not the same as the given index, we will reindex # not sure this is really necessary - with tm.assert_produces_warning(UserWarning): + with 
tm.assert_produces_warning(UserWarning, match="will be reindexed"): indexer_obj = indexer_obj.reindex(datetime_frame.index[::-1]) subframe_obj = datetime_frame[indexer_obj] tm.assert_frame_equal(subframe_obj, subframe) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 3f98f49cd1877..ed81e8c8b8129 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -711,7 +711,10 @@ def test_setitem_npmatrix_2d(self): df["np-array"] = a # Instantiation of `np.matrix` gives PendingDeprecationWarning - with tm.assert_produces_warning(PendingDeprecationWarning): + with tm.assert_produces_warning( + PendingDeprecationWarning, + match="matrix subclass is not the recommended way to represent matrices", + ): df["np-matrix"] = np.matrix(a) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index b8631d95a6399..11adc9f6179ce 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -166,7 +166,7 @@ def test_to_dict_not_unique_warning(self): # GH#16927: When converting to a dict, if a column has a non-unique name # it will be dropped, throwing a warning. df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"]) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="columns will be omitted"): df.to_dict() @pytest.mark.filterwarnings("ignore::UserWarning") diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index f463b3f94fa55..91b5f905ada22 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1097,7 +1097,7 @@ def test_binop_other(self, op, value, dtype, switch_numexpr_min_elements): and expr.USE_NUMEXPR and switch_numexpr_min_elements == 0 ): - warn = UserWarning # "evaluating in Python space because ..." 
+ warn = UserWarning else: msg = ( f"cannot perform __{op.__name__}__ with this " @@ -1105,17 +1105,16 @@ def test_binop_other(self, op, value, dtype, switch_numexpr_min_elements): ) with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(warn, match="evaluating in Python"): op(df, elem.value) elif (op, dtype) in skip: if op in [operator.add, operator.mul]: if expr.USE_NUMEXPR and switch_numexpr_min_elements == 0: - # "evaluating in Python space because ..." warn = UserWarning else: warn = None - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(warn, match="evaluating in Python"): op(df, elem.value) else: diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 8ccd7b2ca83ba..5118561f67338 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -699,7 +699,7 @@ def test_mode_sortwarning(self, using_infer_string): expected = DataFrame({"A": ["a", np.nan]}) warning = None if using_infer_string else UserWarning - with tm.assert_produces_warning(warning): + with tm.assert_produces_warning(warning, match="Unable to sort modes"): result = df.mode(dropna=False) result = result.sort_values(by="A").reset_index(drop=True) diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 49c6a91236db7..d57df82b2358c 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -84,13 +84,13 @@ def test_union_sort_other_incomparable(self): # https://github.com/pandas-dev/pandas/issues/24959 idx = Index([1, pd.Timestamp("2000")]) # default (sort=None) - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="not supported between"): result = idx.union(idx[:1]) tm.assert_index_equal(result, idx) # sort=None - with tm.assert_produces_warning(RuntimeWarning): + with 
tm.assert_produces_warning(RuntimeWarning, match="not supported between"): result = idx.union(idx[:1], sort=None) tm.assert_index_equal(result, idx) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 05e9a294d74a6..5b2cc55d6dc56 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -117,10 +117,10 @@ def test_to_period_infer(self): freq="5min", ) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): pi1 = rng.to_period("5min") - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): pi2 = rng.to_period() tm.assert_index_equal(pi1, pi2) @@ -143,8 +143,7 @@ def test_to_period_millisecond(self): ] ) - with tm.assert_produces_warning(UserWarning): - # warning that timezone info will be lost + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): period = index.to_period(freq="ms") assert 2 == len(period) assert period[0] == Period("2007-01-01 10:11:12.123Z", "ms") @@ -158,8 +157,7 @@ def test_to_period_microsecond(self): ] ) - with tm.assert_produces_warning(UserWarning): - # warning that timezone info will be lost + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): period = index.to_period(freq="us") assert 2 == len(period) assert period[0] == Period("2007-01-01 10:11:12.123456Z", "us") @@ -172,10 +170,7 @@ def test_to_period_microsecond(self): def test_to_period_tz(self, tz): ts = date_range("1/1/2000", "2/1/2000", tz=tz) - with tm.assert_produces_warning(UserWarning): - # GH#21333 warning that timezone info will be lost - # filter warning about freq deprecation - + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): result = ts.to_period()[0] expected = ts[0].to_period(ts.freq) @@ -183,8 +178,7 @@ def 
test_to_period_tz(self, tz): expected = date_range("1/1/2000", "2/1/2000").to_period() - with tm.assert_produces_warning(UserWarning): - # GH#21333 warning that timezone info will be lost + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): result = ts.to_period(ts.freq) tm.assert_index_equal(result, expected) @@ -193,7 +187,7 @@ def test_to_period_tz(self, tz): def test_to_period_tz_utc_offset_consistency(self, tz): # GH#22905 ts = date_range("1/1/2000", "2/1/2000", tz="Etc/GMT-1") - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): result = ts.to_period()[0] expected = ts[0].to_period(ts.freq) assert result == expected diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 9354984538c58..47f21cc7f8182 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -382,7 +382,7 @@ def test_union_sort_other_incomparable(): idx = MultiIndex.from_product([[1, pd.Timestamp("2000")], ["a", "b"]]) # default, sort=None - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="are unorderable"): result = idx.union(idx[:1]) tm.assert_index_equal(result, idx) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 301c4794be4ef..04858643d97b1 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1065,10 +1065,10 @@ def test_outer_join_sort(self): left_index = Index(np.random.default_rng(2).permutation(15)) right_index = date_range("2020-01-01", periods=10) - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="not supported between"): result = left_index.join(right_index, how="outer") - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="not supported between"): 
expected = left_index.astype(object).union(right_index.astype(object)) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index 21cb0b8723d59..b544ebac43ece 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -142,25 +142,18 @@ def test_constructor_infer_nat_dt_like( data = [ctor] data.insert(pos, nulls_fixture) - warn = None if nulls_fixture is NA: expected = Index([NA, NaT]) mark = pytest.mark.xfail(reason="Broken with np.NaT ctor; see GH 31884") request.applymarker(mark) - # GH#35942 numpy will emit a DeprecationWarning within the - # assert_index_equal calls. Since we can't do anything - # about it until GH#31884 is fixed, we suppress that warning. - warn = DeprecationWarning result = Index(data) - with tm.assert_produces_warning(warn): - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) result = Index(np.array(data, dtype=object)) - with tm.assert_produces_warning(warn): - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("swap_objs", [True, False]) def test_constructor_mixed_nat_objs_infers_object(self, swap_objs): diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 9a3471fe526c1..8fd349dacf9e9 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -882,7 +882,7 @@ def test_difference_incomparable(self, opname): b = Index([2, Timestamp("1999"), 1]) op = operator.methodcaller(opname, b) - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="not supported between"): # sort=None, the default result = op(a) expected = Index([3, Timestamp("2000"), 2, Timestamp("1999")]) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 43bcf84f901b1..749e2c4a86b55 100644 --- 
a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1280,19 +1280,20 @@ def test_interval_can_hold_element(self, dtype, element): # `elem` to not have the same length as `arr` ii2 = IntervalIndex.from_breaks(arr[:-1], closed="neither") elem = element(ii2) - with tm.assert_produces_warning(FutureWarning): + msg = "Setting an item of incompatible dtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) ii3 = IntervalIndex.from_breaks([Timestamp(1), Timestamp(3), Timestamp(4)]) elem = element(ii3) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, match=msg): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) ii4 = IntervalIndex.from_breaks([Timedelta(1), Timedelta(3), Timedelta(4)]) elem = element(ii4) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, match=msg): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) @@ -1312,12 +1313,13 @@ def test_period_can_hold_element(self, element): # `elem` to not have the same length as `arr` pi2 = pi.asfreq("D")[:-1] elem = element(pi2) - with tm.assert_produces_warning(FutureWarning): + msg = "Setting an item of incompatible dtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): self.check_series_setitem(elem, pi, False) dti = pi.to_timestamp("s")[:-1] elem = element(dti) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, match=msg): self.check_series_setitem(elem, pi, False) def check_can_hold_element(self, obj, elem, inplace: bool): diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py index 8bf9aa4ac04d3..c4ecb48006cb1 100644 --- a/pandas/tests/io/formats/test_css.py +++ b/pandas/tests/io/formats/test_css.py @@ 
-38,30 +38,31 @@ def test_css_parse_normalisation(name, norm, abnorm): @pytest.mark.parametrize( - "invalid_css,remainder", + "invalid_css,remainder,msg", [ # No colon - ("hello-world", ""), - ("border-style: solid; hello-world", "border-style: solid"), + ("hello-world", "", "expected a colon"), + ("border-style: solid; hello-world", "border-style: solid", "expected a colon"), ( "border-style: solid; hello-world; font-weight: bold", "border-style: solid; font-weight: bold", + "expected a colon", ), # Unclosed string fail # Invalid size - ("font-size: blah", "font-size: 1em"), - ("font-size: 1a2b", "font-size: 1em"), - ("font-size: 1e5pt", "font-size: 1em"), - ("font-size: 1+6pt", "font-size: 1em"), - ("font-size: 1unknownunit", "font-size: 1em"), - ("font-size: 10", "font-size: 1em"), - ("font-size: 10 pt", "font-size: 1em"), + ("font-size: blah", "font-size: 1em", "Unhandled size"), + ("font-size: 1a2b", "font-size: 1em", "Unhandled size"), + ("font-size: 1e5pt", "font-size: 1em", "Unhandled size"), + ("font-size: 1+6pt", "font-size: 1em", "Unhandled size"), + ("font-size: 1unknownunit", "font-size: 1em", "Unhandled size"), + ("font-size: 10", "font-size: 1em", "Unhandled size"), + ("font-size: 10 pt", "font-size: 1em", "Unhandled size"), # Too many args - ("border-top: 1pt solid red green", "border-top: 1pt solid green"), + ("border-top: 1pt solid red green", "border-top: 1pt solid green", "Too many"), ], ) -def test_css_parse_invalid(invalid_css, remainder): - with tm.assert_produces_warning(CSSWarning): +def test_css_parse_invalid(invalid_css, remainder, msg): + with tm.assert_produces_warning(CSSWarning, match=msg): assert_same_resolution(invalid_css, remainder) @@ -120,7 +121,7 @@ def test_css_side_shorthands(shorthand, expansions): {top: "1pt", right: "4pt", bottom: "2pt", left: "0pt"}, ) - with tm.assert_produces_warning(CSSWarning): + with tm.assert_produces_warning(CSSWarning, match="Could not expand"): assert_resolves(f"{shorthand}: 1pt 1pt 1pt 1pt 1pt", 
{}) diff --git a/pandas/tests/io/formats/test_to_excel.py b/pandas/tests/io/formats/test_to_excel.py index 3b782713eed6c..b40201b9ba1e6 100644 --- a/pandas/tests/io/formats/test_to_excel.py +++ b/pandas/tests/io/formats/test_to_excel.py @@ -325,7 +325,7 @@ def test_css_to_excel_bad_colors(input_color): if input_color is not None: expected["fill"] = {"patternType": "solid"} - with tm.assert_produces_warning(CSSWarning): + with tm.assert_produces_warning(CSSWarning, match="Unhandled color format"): convert = CSSToExcelConverter() assert expected == convert(css) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index ec49b7644ea0e..a0d5b3a741aaf 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -639,7 +639,7 @@ def test_warns_non_roundtrippable_names(self, idx): # GH 19130 df = DataFrame(index=idx) df.index.name = "index" - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match="not round-trippable"): set_default_names(df) def test_timestamp_in_columns(self): diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 5f19c15817ce7..babbddafa3b49 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -222,7 +222,7 @@ def test_excel_sep_warning(self, df): # Separator is ignored when excel=False and should produce a warning def test_copy_delim_warning(self, df): - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match="ignores the sep argument"): df.to_clipboard(excel=False, sep="\t") # Tests that the default behavior of to_clipboard is tab diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index f5880d8a894f8..ad729d2346a3b 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -463,7 +463,7 @@ def test_warning_missing_utf_bom(self, encoding, compression_): 
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), ) with tm.ensure_clean() as path: - with tm.assert_produces_warning(UnicodeWarning): + with tm.assert_produces_warning(UnicodeWarning, match="byte order mark"): df.to_csv(path, compression=compression_, encoding=encoding) # reading should fail (otherwise we wouldn't need the warning) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 3a58dda9e8dc4..00082be7e07e8 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -133,7 +133,7 @@ def test_compression_warning(compression_only): ) with tm.ensure_clean() as path: with icom.get_handle(path, "w", compression=compression_only) as handles: - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="has no effect"): df.to_csv(handles.handle, compression=compression_only) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 3083fa24ba8b5..af77972d9fd26 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2602,7 +2602,7 @@ def close(self): self.conn.close() with contextlib.closing(MockSqliteConnection(":memory:")) as conn: - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="only supports SQLAlchemy"): sql.read_sql("SELECT 1", conn) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 2650f351e2203..d7fb3c0049965 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -189,11 +189,12 @@ def test_read_dta2(self, datapath): path2 = datapath("io", "data", "stata", "stata2_115.dta") path3 = datapath("io", "data", "stata", "stata2_117.dta") - with tm.assert_produces_warning(UserWarning): + msg = "Leaving in Stata Internal Format" + with tm.assert_produces_warning(UserWarning, match=msg): parsed_114 = self.read_dta(path1) - with tm.assert_produces_warning(UserWarning): + with 
tm.assert_produces_warning(UserWarning, match=msg): parsed_115 = self.read_dta(path2) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=msg): parsed_117 = self.read_dta(path3) # FIXME: don't leave commented-out # 113 is buggy due to limits of date format support in Stata @@ -478,7 +479,8 @@ def test_read_write_dta11(self, temp_file): formatted = formatted.astype(np.int32) path = temp_file - with tm.assert_produces_warning(InvalidColumnName): + msg = "Not all pandas column names were valid Stata variable names" + with tm.assert_produces_warning(InvalidColumnName, match=msg): original.to_stata(path, convert_dates=None) written_and_read_again = self.read_dta(path) @@ -515,7 +517,8 @@ def test_read_write_dta12(self, version, temp_file): formatted = formatted.astype(np.int32) path = temp_file - with tm.assert_produces_warning(InvalidColumnName): + msg = "Not all pandas column names were valid Stata variable names" + with tm.assert_produces_warning(InvalidColumnName, match=msg): original.to_stata(path, convert_dates=None, version=version) # should get a warning for that format. @@ -612,7 +615,8 @@ def test_numeric_column_names(self, temp_file): original.index.name = "index" path = temp_file # should get a warning for that format. 
- with tm.assert_produces_warning(InvalidColumnName): + msg = "Not all pandas column names were valid Stata variable names" + with tm.assert_produces_warning(InvalidColumnName, match=msg): original.to_stata(path) written_and_read_again = self.read_dta(path) @@ -672,7 +676,7 @@ def test_large_value_conversion(self, temp_file): original = DataFrame({"s0": s0, "s1": s1, "s2": s2, "s3": s3}) original.index.name = "index" path = temp_file - with tm.assert_produces_warning(PossiblePrecisionLoss): + with tm.assert_produces_warning(PossiblePrecisionLoss, match="from int64 to"): original.to_stata(path) written_and_read_again = self.read_dta(path) @@ -687,7 +691,8 @@ def test_dates_invalid_column(self, temp_file): original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)]) original.index.name = "index" path = temp_file - with tm.assert_produces_warning(InvalidColumnName): + msg = "Not all pandas column names were valid Stata variable names" + with tm.assert_produces_warning(InvalidColumnName, match=msg): original.to_stata(path, convert_dates={0: "tc"}) written_and_read_again = self.read_dta(path) @@ -1111,7 +1116,8 @@ def test_categorical_warnings_and_errors(self, temp_file): [["a"], ["b"], ["c"], ["d"], [1]], columns=["Too_long"] ).astype("category") - with tm.assert_produces_warning(ValueLabelTypeMismatch): + msg = "data file created has not lost information due to duplicate labels" + with tm.assert_produces_warning(ValueLabelTypeMismatch, match=msg): original.to_stata(path) # should get a warning for mixed content @@ -1732,7 +1738,8 @@ def test_convert_strl_name_swap(self, temp_file): ) original.index.name = "index" - with tm.assert_produces_warning(InvalidColumnName): + msg = "Not all pandas column names were valid Stata variable names" + with tm.assert_produces_warning(InvalidColumnName, match=msg): path = temp_file original.to_stata(path, convert_strl=["long", 1], version=117) reread = self.read_dta(path) @@ -2139,8 +2146,9 @@ def test_chunked_categorical(version, 
temp_file): def test_chunked_categorical_partial(datapath): dta_file = datapath("io", "data", "stata", "stata-dta-partially-labeled.dta") values = ["a", "b", "a", "b", 3.0] + msg = "series with value labels are not fully labeled" with StataReader(dta_file, chunksize=2) as reader: - with tm.assert_produces_warning(CategoricalConversionWarning): + with tm.assert_produces_warning(CategoricalConversionWarning, match=msg): for i, block in enumerate(reader): assert list(block.cats) == values[2 * i : 2 * (i + 1)] if i < 2: @@ -2148,7 +2156,7 @@ def test_chunked_categorical_partial(datapath): else: idx = pd.Index([3.0], dtype="float64") tm.assert_index_equal(block.cats.cat.categories, idx) - with tm.assert_produces_warning(CategoricalConversionWarning): + with tm.assert_produces_warning(CategoricalConversionWarning, match=msg): with StataReader(dta_file, chunksize=5) as reader: large_chunk = reader.__next__() direct = read_stata(dta_file) @@ -2304,7 +2312,8 @@ def test_non_categorical_value_label_name_conversion(temp_file): "_1__2_": {3: "three"}, } - with tm.assert_produces_warning(InvalidColumnName): + msg = "Not all pandas column names were valid Stata variable names" + with tm.assert_produces_warning(InvalidColumnName, match=msg): data.to_stata(temp_file, value_labels=value_labels) with StataReader(temp_file) as reader: diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 65c9083d9fe2b..c30cb96fef252 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -2001,7 +2001,7 @@ def _check(axes): plt.close("all") gs, axes = _generate_4_axes_via_gridspec() - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=axes, sharex=True) _check(axes) @@ -2065,7 +2065,7 @@ def _check(axes): plt.close("all") gs, axes = _generate_4_axes_via_gridspec() - with 
tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=axes, sharey=True) gs.tight_layout(plt.gcf()) @@ -2186,7 +2186,7 @@ def _get_horizontal_grid(): # vertical / subplots / sharex=True / sharey=True ax1, ax2 = _get_vertical_grid() - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, sharey=True) assert len(axes[0].lines) == 1 assert len(axes[1].lines) == 1 @@ -2202,7 +2202,7 @@ def _get_horizontal_grid(): # horizontal / subplots / sharex=True / sharey=True ax1, ax2 = _get_horizontal_grid() - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, sharey=True) assert len(axes[0].lines) == 1 assert len(axes[1].lines) == 1 @@ -2252,7 +2252,7 @@ def _get_boxed_grid(): # subplots / sharex=True / sharey=True axes = _get_boxed_grid() - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=axes, sharex=True, sharey=True) for ax in axes: assert len(ax.lines) == 1 diff --git a/pandas/tests/plotting/frame/test_frame_subplots.py b/pandas/tests/plotting/frame/test_frame_subplots.py index 511266d5786c5..a98f4b56ebf4d 100644 --- a/pandas/tests/plotting/frame/test_frame_subplots.py +++ b/pandas/tests/plotting/frame/test_frame_subplots.py @@ -335,7 +335,7 @@ def test_subplots_multiple_axes_2_dim(self, layout, exp_layout): np.random.default_rng(2).random((10, 4)), index=list(string.ascii_letters[:10]), ) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="layout keyword is ignored"): returned = df.plot( subplots=True, ax=axes, layout=layout, sharex=False, sharey=False ) @@ -501,7 +501,7 @@ def 
test_df_subplots_patterns_minorticks_1st_ax_hidden(self): columns=list("AB"), ) _, axes = plt.subplots(2, 1) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=axes, sharex=True) for ax in axes: assert len(ax.lines) == 1 diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index f8029a1c1ee40..573f95eed15ef 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -129,7 +129,8 @@ def test_boxplot_legacy2_with_multi_col(self): df["Y"] = Series(["A"] * 10) # Multiple columns with an ax argument should use same figure fig, ax = mpl.pyplot.subplots() - with tm.assert_produces_warning(UserWarning): + msg = "the figure containing the passed axes is being cleared" + with tm.assert_produces_warning(UserWarning, match=msg): axes = df.boxplot( column=["Col1", "Col2"], by="X", ax=ax, return_type="axes" ) @@ -607,7 +608,7 @@ def test_grouped_box_multiple_axes(self, hist_df): # passes multiple axes to plot, hist or boxplot # location should be changed if other test is added # which has earlier alphabetical order - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): _, axes = mpl.pyplot.subplots(2, 2) df.groupby("category").boxplot(column="height", return_type="axes", ax=axes) _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(2, 2)) @@ -617,7 +618,7 @@ def test_grouped_box_multiple_axes_on_fig(self, hist_df): # GH 6970, GH 7069 df = hist_df fig, axes = mpl.pyplot.subplots(2, 3) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): returned = df.boxplot( column=["height", "weight", "category"], by="gender", @@ -630,7 +631,7 @@ def test_grouped_box_multiple_axes_on_fig(self, hist_df): assert returned[0].figure is fig # draw on 
second row - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): returned = df.groupby("classroom").boxplot( column=["height", "weight", "category"], return_type="axes", ax=axes[1] ) @@ -647,7 +648,7 @@ def test_grouped_box_multiple_axes_ax_error(self, hist_df): _, axes = mpl.pyplot.subplots(2, 3) with pytest.raises(ValueError, match=msg): # pass different number of axes from required - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.groupby("classroom").boxplot(ax=axes) def test_fontsize(self): diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 46753b668a8b0..422ed8d4f3d2b 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1558,7 +1558,7 @@ def test_mode_sortwarning(self): expected = Series(["foo", np.nan]) s = Series([1, "foo", "foo", np.nan, np.nan]) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="Unable to sort modes"): result = s.mode(dropna=False) result = result.sort_values().reset_index(drop=True) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 7ab8ee24bd194..5c5c06dea0008 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1565,11 +1565,12 @@ def test_merge_on_ints_floats_warning(self): B = DataFrame({"Y": [1.1, 2.5, 3.0]}) expected = DataFrame({"X": [3], "Y": [3.0]}) - with tm.assert_produces_warning(UserWarning): + msg = "the float values are not equal to their int representation" + with tm.assert_produces_warning(UserWarning, match=msg): result = A.merge(B, left_on="X", right_on="Y") tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=msg): 
result = B.merge(A, left_on="Y", right_on="X") tm.assert_frame_equal(result, expected[["Y", "X"]]) diff --git a/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py b/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py index 57f57e56201c8..be6ec7dbc24c7 100644 --- a/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py +++ b/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py @@ -24,7 +24,8 @@ def test_to_pydatetime_nonzero_nano(self): ts = Timestamp("2011-01-01 9:00:00.123456789") # Warn the user of data loss (nanoseconds). - with tm.assert_produces_warning(UserWarning): + msg = "Discarding nonzero nanoseconds in conversion" + with tm.assert_produces_warning(UserWarning, match=msg): expected = datetime(2011, 1, 1, 9, 0, 0, 123456) result = ts.to_pydatetime() assert result == expected diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index ea970433464fc..79fd285073983 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -501,8 +501,7 @@ def test_to_period_tz_warning(self): # GH#21333 make sure a warning is issued when timezone # info is lost ts = Timestamp("2009-04-15 16:17:18", tz="US/Eastern") - with tm.assert_produces_warning(UserWarning): - # warning that timezone info will be lost + with tm.assert_produces_warning(UserWarning, match="drop timezone information"): ts.to_period("D") def test_to_numpy_alias(self): diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 44bf3475b85a6..f0930a831e98d 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -359,12 +359,13 @@ def test_add_list_to_masked_array_boolean(self, request): else None ) ser = Series([True, None, False], dtype="boolean") - with tm.assert_produces_warning(warning): + msg = "operator is not supported by numexpr for the bool dtype" + with 
tm.assert_produces_warning(warning, match=msg): result = ser + [True, None, True] expected = Series([True, None, True], dtype="boolean") tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(warning): + with tm.assert_produces_warning(warning, match=msg): result = [True, None, True] + ser tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 68dcc1a18eda7..8f275345a7819 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -339,35 +339,36 @@ def test_bool_ops_warn_on_arithmetic(self, op_str, opname, monkeypatch): # raises TypeError return + msg = "operator is not supported by numexpr" with monkeypatch.context() as m: m.setattr(expr, "_MIN_ELEMENTS", 5) with option_context("compute.use_numexpr", True): - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match=msg): r = f(df, df) e = fe(df, df) tm.assert_frame_equal(r, e) - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match=msg): r = f(df.a, df.b) e = fe(df.a, df.b) tm.assert_series_equal(r, e) - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match=msg): r = f(df.a, True) e = fe(df.a, True) tm.assert_series_equal(r, e) - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match=msg): r = f(False, df.a) e = fe(False, df.a) tm.assert_series_equal(r, e) - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match=msg): r = f(False, df) e = fe(False, df) tm.assert_frame_equal(r, e) - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match=msg): r = f(df, True) e = fe(df, True) tm.assert_frame_equal(r, e) diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index 52b5f636b1254..9127981d1845d 100644 --- a/pandas/tests/test_optional_dependency.py +++ 
b/pandas/tests/test_optional_dependency.py @@ -42,7 +42,7 @@ def test_bad_version(monkeypatch): result = import_optional_dependency("fakemodule", min_version="0.8") assert result is module - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=match): result = import_optional_dependency("fakemodule", errors="warn") assert result is None @@ -53,7 +53,7 @@ def test_bad_version(monkeypatch): with pytest.raises(ImportError, match="Pandas requires version '1.1.0'"): import_optional_dependency("fakemodule", min_version="1.1.0") - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="Pandas requires version"): result = import_optional_dependency( "fakemodule", errors="warn", min_version="1.1.0" ) @@ -81,7 +81,7 @@ def test_submodule(monkeypatch): with pytest.raises(ImportError, match=match): import_optional_dependency("fakemodule.submodule") - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=match): result = import_optional_dependency("fakemodule.submodule", errors="warn") assert result is None diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index b59dd194cac27..7ce02c12ac1ca 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1738,7 +1738,10 @@ def test_unit(self, cache): def test_unit_str(self, cache): # GH 57051 # Test that strs aren't dropping precision to 32-bit accidentally. 
- with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning( + FutureWarning, + match="'to_datetime' with 'unit' when parsing strings is deprecated", + ): res = to_datetime(["1704660000"], unit="s", origin="unix") expected = to_datetime([1704660000], unit="s", origin="unix") tm.assert_index_equal(res, expected) diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index d375010aff3cc..510a69a2ff3e4 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -696,5 +696,7 @@ def test_numeric_only_corr_cov_series(kernel, use_arg, numeric_only, dtype): def test_keyword_quantile_deprecated(): # GH #52550 ser = Series([1, 2, 3, 4]) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning( + FutureWarning, match="the 'quantile' keyword is deprecated, use 'q' instead" + ): ser.expanding().quantile(quantile=0.5) diff --git a/pandas/tests/window/test_rolling_quantile.py b/pandas/tests/window/test_rolling_quantile.py index d5a7010923563..1604d72d4f9b1 100644 --- a/pandas/tests/window/test_rolling_quantile.py +++ b/pandas/tests/window/test_rolling_quantile.py @@ -178,5 +178,7 @@ def test_center_reindex_frame(frame, q): def test_keyword_quantile_deprecated(): # GH #52550 s = Series([1, 2, 3, 4]) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning( + FutureWarning, match="the 'quantile' keyword is deprecated, use 'q' instead" + ): s.rolling(2).quantile(quantile=0.4) From 41014db0e802bd9d2ae6326d6314c65ecad9b28d Mon Sep 17 00:00:00 2001 From: Zhengbo Wang <2736230899@qq.com> Date: Thu, 25 Apr 2024 00:30:24 +0800 Subject: [PATCH 035/100] BUG: Ignore warning for duplicate columns in `to_dict` when orient='tight' (#58335) * Ignore warning for duplicate columns in to_dict when orient='tight' * Add whatsnew * Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> * 
Update whatsnew and redefine duplicate columns * Use assert instead * assert not raise and equal --------- Co-authored-by: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/methods/to_dict.py | 2 +- pandas/tests/frame/methods/test_to_dict.py | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ca97e2b6ffb6b..59cc709359a8d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -416,6 +416,7 @@ MultiIndex I/O ^^^ - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. (:issue:`57915`) +- Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index 57e03dedc384d..84202a4fcc840 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -148,7 +148,7 @@ def to_dict( Return a collections.abc.MutableMapping object representing the DataFrame. The resulting transformation depends on the `orient` parameter. 
""" - if not df.columns.is_unique: + if orient != "tight" and not df.columns.is_unique: warnings.warn( "DataFrame columns are not unique, some columns will be omitted.", UserWarning, diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index 11adc9f6179ce..0272b679e85a2 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -513,6 +513,20 @@ def test_to_dict_masked_native_python(self): result = df.to_dict(orient="records") assert isinstance(result[0]["a"], int) + def test_to_dict_tight_no_warning_with_duplicate_column(self): + # GH#58281 + df = DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "A"]) + with tm.assert_produces_warning(None): + result = df.to_dict(orient="tight") + expected = { + "index": [0, 1, 2], + "columns": ["A", "A"], + "data": [[1, 2], [3, 4], [5, 6]], + "index_names": [None], + "column_names": [None], + } + assert result == expected + @pytest.mark.parametrize( "val", [Timestamp(2020, 1, 1), Timedelta(1), Period("2020"), Interval(1, 2)] From 2536d3a736eea96b9da8b774e671516eb8f25f4a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 24 Apr 2024 07:26:56 -1000 Subject: [PATCH 036/100] CI: Fix npdev failures (#58389) * CI: Fix npdev failures * Use unique index, make array writable * Update pandas/_libs/hashtable_class_helper.pxi.in * Update pandas/tests/arrays/test_datetimelike.py * Update pandas/tests/arrays/test_datetimelike.py --- pandas/tests/arrays/test_datetimelike.py | 8 ++++++-- pandas/tests/extension/base/missing.py | 2 ++ pandas/tests/indexes/test_base.py | 4 ++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 22c63af59a47c..3d8f8d791b763 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -661,7 +661,9 @@ def 
test_array_interface(self, datetime_index): assert result is expected tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype="datetime64[ns]") - assert result is not expected + if not np_version_gt2: + # TODO: GH 57739 + assert result is not expected tm.assert_numpy_array_equal(result, expected) # to object dtype @@ -976,7 +978,9 @@ def test_array_interface(self, timedelta_index): assert result is expected tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype="timedelta64[ns]") - assert result is not expected + if not np_version_gt2: + # TODO: GH 57739 + assert result is not expected tm.assert_numpy_array_equal(result, expected) # to object dtype diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 4b9234a9904a2..cee565d4f7c1e 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -27,7 +27,9 @@ def test_isna_returns_copy(self, data_missing, na_func): expected = result.copy() mask = getattr(result, na_func)() if isinstance(mask.dtype, pd.SparseDtype): + # TODO: GH 57739 mask = np.array(mask) + mask.flags.writeable = True mask[:] = True tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 04858643d97b1..2e94961b673f8 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -71,8 +71,8 @@ def test_constructor_casting(self, index): tm.assert_contains_all(arr, new_index) tm.assert_index_equal(index, new_index) - @pytest.mark.parametrize("index", ["string"], indirect=True) - def test_constructor_copy(self, index, using_infer_string): + def test_constructor_copy(self, using_infer_string): + index = Index(list("abc"), name="name") arr = np.array(index) new_index = Index(arr, copy=True, name="name") assert isinstance(new_index, Index) From a52728a87a91d45f8352ee588ce32b32aac774de Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 
24 Apr 2024 12:39:03 -0700 Subject: [PATCH 037/100] DEPR: to_datetime string behavior with unit (#58407) * DEPR: to_datetime string behavior with unit * remove outdated test --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/tslib.pyi | 6 +- pandas/_libs/tslib.pyx | 127 +++---------------------- pandas/core/tools/datetimes.py | 9 +- pandas/tests/tools/test_to_datetime.py | 33 +++---- 5 files changed, 35 insertions(+), 141 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 59cc709359a8d..dee793f5ef002 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -214,6 +214,7 @@ Removal of prior version deprecations/changes - :func:`concat` no longer ignores empty objects when determining output dtypes (:issue:`39122`) - :func:`concat` with all-NA entries no longer ignores the dtype of those entries when determining the result dtype (:issue:`40893`) - :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. 
That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`) +- :func:`to_datetime` with a ``unit`` specified no longer parses strings into floats, instead parses them the same way as without ``unit`` (:issue:`50735`) - :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will no longer exclude from the result the groupings that do not arise from the input (:issue:`49519`) - :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`) - :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`) diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 5a340c1d88bc4..7e3372a80db9d 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -11,11 +11,6 @@ def format_array_from_datetime( na_rep: str | float = ..., reso: int = ..., # NPY_DATETIMEUNIT ) -> npt.NDArray[np.object_]: ... -def array_with_unit_to_datetime( - values: npt.NDArray[np.object_], - unit: str, - errors: str = ..., -) -> tuple[np.ndarray, tzinfo | None]: ... def first_non_null(values: np.ndarray) -> int: ... def array_to_datetime( values: npt.NDArray[np.object_], @@ -24,6 +19,7 @@ def array_to_datetime( yearfirst: bool = ..., utc: bool = ..., creso: int = ..., + unit_for_numerics: str | None = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... 
# returned ndarray may be object dtype or datetime64[ns] diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index aecf9f2e46bd4..dca3ba0ce49b3 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1,7 +1,3 @@ -import warnings - -from pandas.util._exceptions import find_stack_level - cimport cython from datetime import timezone @@ -234,117 +230,6 @@ def format_array_from_datetime( return result -def array_with_unit_to_datetime( - ndarray[object] values, - str unit, - str errors="coerce" -): - """ - Convert the ndarray to datetime according to the time unit. - - This function converts an array of objects into a numpy array of - datetime64[ns]. It returns the converted array - and also returns the timezone offset - - if errors: - - raise: return converted values or raise OutOfBoundsDatetime - if out of range on the conversion or - ValueError for other conversions (e.g. a string) - - ignore: return non-convertible values as the same unit - - coerce: NaT for non-convertibles - - Parameters - ---------- - values : ndarray - Date-like objects to convert. - unit : str - Time unit to use during conversion. - errors : str, default 'raise' - Error behavior when parsing. 
- - Returns - ------- - result : ndarray of m8 values - tz : parsed timezone offset or None - """ - cdef: - Py_ssize_t i, n=len(values) - bint is_coerce = errors == "coerce" - bint is_raise = errors == "raise" - ndarray[int64_t] iresult - tzinfo tz = None - double fval - - assert is_coerce or is_raise - - if unit == "ns": - result, tz = array_to_datetime( - values.astype(object, copy=False), - errors=errors, - creso=NPY_FR_ns, - ) - return result, tz - - result = np.empty(n, dtype="M8[ns]") - iresult = result.view("i8") - - for i in range(n): - val = values[i] - - try: - if checknull_with_nat_and_na(val): - iresult[i] = NPY_NAT - - elif is_integer_object(val) or is_float_object(val): - - if val != val or val == NPY_NAT: - iresult[i] = NPY_NAT - else: - iresult[i] = cast_from_unit(val, unit) - - elif isinstance(val, str): - if len(val) == 0 or val in nat_strings: - iresult[i] = NPY_NAT - - else: - - try: - fval = float(val) - except ValueError: - raise ValueError( - f"non convertible value {val} with the unit '{unit}'" - ) - warnings.warn( - "The behavior of 'to_datetime' with 'unit' when parsing " - "strings is deprecated. In a future version, strings will " - "be parsed as datetime strings, matching the behavior " - "without a 'unit'. To retain the old behavior, explicitly " - "cast ints or floats to numeric type before calling " - "to_datetime.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - iresult[i] = cast_from_unit(fval, unit) - - else: - # TODO: makes more sense as TypeError, but that would be an - # API change. 
- raise ValueError( - f"unit='{unit}' not valid with non-numerical val='{val}'" - ) - - except (ValueError, TypeError) as err: - if is_raise: - err.args = (f"{err}, at position {i}",) - raise - else: - # is_coerce - iresult[i] = NPY_NAT - - return result, tz - - @cython.wraparound(False) @cython.boundscheck(False) def first_non_null(values: ndarray) -> int: @@ -376,6 +261,7 @@ cpdef array_to_datetime( bint yearfirst=False, bint utc=False, NPY_DATETIMEUNIT creso=NPY_FR_ns, + str unit_for_numerics=None, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -404,6 +290,7 @@ cpdef array_to_datetime( indicator whether the dates should be UTC creso : NPY_DATETIMEUNIT, default NPY_FR_ns Set to NPY_FR_GENERIC to infer a resolution. + unit_for_numerics : str, default "ns" Returns ------- @@ -434,6 +321,13 @@ cpdef array_to_datetime( abbrev = "ns" else: abbrev = npy_unit_to_abbrev(creso) + + if unit_for_numerics is not None: + # either creso or unit_for_numerics should be passed, not both + assert creso == NPY_FR_ns + else: + unit_for_numerics = abbrev + result = np.empty((values).shape, dtype=f"M8[{abbrev}]") iresult = result.view("i8").ravel() @@ -485,7 +379,8 @@ cpdef array_to_datetime( creso = state.creso # we now need to parse this as if unit=abbrev - iresult[i] = cast_from_unit(val, abbrev, out_reso=creso) + iresult[i] = cast_from_unit(val, unit_for_numerics, out_reso=creso) + state.found_other = True elif isinstance(val, str): diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index df7a6cdb1ea52..b01cdb335ec46 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -481,7 +481,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: """ arg = extract_array(arg, extract_numpy=True) - # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime + # GH#30050 pass an ndarray to tslib.array_to_datetime # because it expects an ndarray argument if isinstance(arg, 
IntegerArray): arr = arg.astype(f"datetime64[{unit}]") @@ -519,7 +519,12 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: tz_parsed = None else: arg = arg.astype(object, copy=False) - arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) + arr, tz_parsed = tslib.array_to_datetime( + arg, + utc=utc, + errors=errors, + unit_for_numerics=unit, + ) result = DatetimeIndex(arr, name=name) if not isinstance(result, DatetimeIndex): diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 7ce02c12ac1ca..f4042acd05dc3 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1705,22 +1705,24 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): # GH#50301 # Match Timestamp behavior in disallowing non-round floats with # Y or M unit - warn_msg = "strings will be parsed as datetime strings" msg = f"Conversion of non-round float with unit={unit} is ambiguous" with pytest.raises(ValueError, match=msg): to_datetime([1.5], unit=unit, errors="raise") with pytest.raises(ValueError, match=msg): to_datetime(np.array([1.5]), unit=unit, errors="raise") + + msg = r"Given date string \"1.5\" not likely a datetime, at position 0" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - to_datetime(["1.5"], unit=unit, errors="raise") + to_datetime(["1.5"], unit=unit, errors="raise") res = to_datetime([1.5], unit=unit, errors="coerce") expected = Index([NaT], dtype="M8[ns]") tm.assert_index_equal(res, expected) - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - res = to_datetime(["1.5"], unit=unit, errors="coerce") + # In 3.0, the string "1.5" is parsed as it would be without unit, + # which fails. With errors="coerce" this becomes NaT.
+ res = to_datetime(["1.5"], unit=unit, errors="coerce") + expected = to_datetime([NaT]) tm.assert_index_equal(res, expected) # round floats are OK @@ -1735,17 +1737,6 @@ def test_unit(self, cache): with pytest.raises(ValueError, match=msg): to_datetime([1], unit="D", format="%Y%m%d", cache=cache) - def test_unit_str(self, cache): - # GH 57051 - # Test that strs aren't dropping precision to 32-bit accidentally. - with tm.assert_produces_warning( - FutureWarning, - match="'to_datetime' with 'unit' when parsing strings is deprecated", - ): - res = to_datetime(["1704660000"], unit="s", origin="unix") - expected = to_datetime([1704660000], unit="s", origin="unix") - tm.assert_index_equal(res, expected) - def test_unit_array_mixed_nans(self, cache): values = [11111111111111111, 1, 1.0, iNaT, NaT, np.nan, "NaT", ""] @@ -1774,7 +1765,7 @@ def test_unit_array_mixed_nans_large_int(self, cache): def test_to_datetime_invalid_str_not_out_of_bounds_valuerror(self, cache): # if we have a string, then we raise a ValueError # and NOT an OutOfBoundsDatetime - msg = "non convertible value foo with the unit 's'" + msg = "Unknown datetime string format, unable to parse: foo, at position 0" with pytest.raises(ValueError, match=msg): to_datetime("foo", errors="raise", unit="s", cache=cache) @@ -1909,7 +1900,13 @@ def test_to_datetime_unit_na_values(self): @pytest.mark.parametrize("bad_val", ["foo", 111111111]) def test_to_datetime_unit_invalid(self, bad_val): - msg = f"{bad_val} with the unit 'D'" + if bad_val == "foo": + msg = ( + "Unknown datetime string format, unable to parse: " + f"{bad_val}, at position 2" + ) + else: + msg = "cannot convert input 111111111 with the unit 'D', at position 2" with pytest.raises(ValueError, match=msg): to_datetime([1, 2, bad_val], unit="D") From 53609a79be3b5ef378d9cc2efe167e09714a953e Mon Sep 17 00:00:00 2001 From: KeiOshima Date: Wed, 24 Apr 2024 15:39:48 -0400 Subject: [PATCH 038/100] DOC: fix SA01 error for DatetimeIndex: day_of_year, 
is_leap_year, inferred_freq (#58406) * DOC: fix SA01 error for DatetimeIndex: day_of_year, is_leap_year, inferred_freq * fixing line too long error * Fixing: EXPECTED TO FAIL, BUT NOT FAILING errors --- ci/code_checks.sh | 8 -------- pandas/core/arrays/datetimelike.py | 5 +++++ pandas/core/arrays/datetimes.py | 12 ++++++++++++ 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 101d650a0e768..7c97408cee559 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -102,13 +102,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.to_markdown SA01" \ -i "pandas.DataFrame.to_parquet RT03" \ -i "pandas.DataFrame.var PR01,RT03,SA01" \ - -i "pandas.DatetimeIndex.day_of_year SA01" \ - -i "pandas.DatetimeIndex.dayofyear SA01" \ -i "pandas.DatetimeIndex.freqstr SA01" \ -i "pandas.DatetimeIndex.indexer_at_time PR01,RT03" \ -i "pandas.DatetimeIndex.indexer_between_time RT03" \ - -i "pandas.DatetimeIndex.inferred_freq SA01" \ - -i "pandas.DatetimeIndex.is_leap_year SA01" \ -i "pandas.DatetimeIndex.snap PR01,RT03" \ -i "pandas.DatetimeIndex.std PR01,RT03" \ -i "pandas.DatetimeIndex.to_period RT03" \ @@ -264,14 +260,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.ceil PR01,PR02" \ -i "pandas.Series.dt.components SA01" \ -i "pandas.Series.dt.day_name PR01,PR02" \ - -i "pandas.Series.dt.day_of_year SA01" \ - -i "pandas.Series.dt.dayofyear SA01" \ -i "pandas.Series.dt.days SA01" \ -i "pandas.Series.dt.days_in_month SA01" \ -i "pandas.Series.dt.daysinmonth SA01" \ -i "pandas.Series.dt.floor PR01,PR02" \ -i "pandas.Series.dt.freq GL08" \ - -i "pandas.Series.dt.is_leap_year SA01" \ -i "pandas.Series.dt.microseconds SA01" \ -i "pandas.Series.dt.month_name PR01,PR02" \ -i "pandas.Series.dt.nanoseconds SA01" \ @@ -400,7 +393,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.TimedeltaIndex.as_unit RT03,SA01" \ -i "pandas.TimedeltaIndex.components SA01" \
-i "pandas.TimedeltaIndex.days SA01" \ - -i "pandas.TimedeltaIndex.inferred_freq SA01" \ -i "pandas.TimedeltaIndex.microseconds SA01" \ -i "pandas.TimedeltaIndex.nanoseconds SA01" \ -i "pandas.TimedeltaIndex.seconds SA01" \ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 974289160b145..ff8b16b3361ee 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -908,6 +908,11 @@ def inferred_freq(self) -> str | None: Returns None if it can't autodetect the frequency. + See Also + -------- + DatetimeIndex.freqstr : Return the frequency object as a string if it's set, + otherwise None. + Examples -------- For DatetimeIndex: diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index be087e19ce7b6..25c7f926d19a8 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1835,6 +1835,11 @@ def isocalendar(self) -> DataFrame: """ The ordinal day of the year. + See Also + -------- + DatetimeIndex.dayofweek : The day of the week with Monday=0, Sunday=6. + DatetimeIndex.day : The day of the datetime. + Examples -------- For Series: @@ -2155,6 +2160,13 @@ def isocalendar(self) -> DataFrame: Series or ndarray Booleans indicating if dates belong to a leap year. + See Also + -------- + DatetimeIndex.is_year_end : Indicate whether the date is the + last day of the year. + DatetimeIndex.is_year_start : Indicate whether the date is the first + day of a year. 
+ Examples -------- This method is available on Series with datetime values under From 661d7f044bb09da2f963707b25aabee485dd0bc8 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 25 Apr 2024 01:10:54 +0530 Subject: [PATCH 039/100] DOC: Enforce Numpy Docstring Validation for pandas.Index.to_list (#58398) * DOC: added RT03 to pandas.Index.to_list * DOC: remove pandas.Index.to_list * DOC: remove pandas.Series.tolist --- ci/code_checks.sh | 2 -- pandas/core/base.py | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7c97408cee559..bf7423dfe5825 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -147,7 +147,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.str PR01,SA01" \ -i "pandas.Index.symmetric_difference PR07,RT03,SA01" \ -i "pandas.Index.take PR01,PR07" \ - -i "pandas.Index.to_list RT03" \ -i "pandas.Index.union PR07,RT03,SA01" \ -i "pandas.Index.view GL08" \ -i "pandas.Int16Dtype SA01" \ @@ -368,7 +367,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.swaplevel SA01" \ -i "pandas.Series.to_dict SA01" \ -i "pandas.Series.to_frame SA01" \ - -i "pandas.Series.to_list RT03" \ -i "pandas.Series.to_markdown SA01" \ -i "pandas.Series.to_string SA01" \ -i "pandas.Series.truediv PR07" \ diff --git a/pandas/core/base.py b/pandas/core/base.py index 424f0609dd485..d716a9ffb7bcc 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -789,6 +789,7 @@ def tolist(self) -> list: Returns ------- list + List containing the values as Python or pandas scalars.
See Also -------- From 4f7cb743533d21d3025f9b4fd2f4f1854977cc63 Mon Sep 17 00:00:00 2001 From: Carlo Barth Date: Wed, 24 Apr 2024 21:41:58 +0200 Subject: [PATCH 040/100] Fix/time series interpolation is wrong 21351 (#56515) * fix: Fixes wrong doctest output in `pandas.core.resample.Resampler.interpolate` and the related explanation about consideration of anchor points when interpolating downsampled series with non-aligned result index. * Resolved merge conflicts * fix: Fixes wrong test case assumption for interpolation Fixes assumption in `test_interp_basic_with_non_range_index`. If the index is [1, 2, 3, 5] and values are [1, 2, np.nan, 4], it is wrong to expect that interpolation will result in 3 for the missing value in case of linear interpolation. It will rather be 2.666... * fix: Make sure frequency indexes are preserved with new interpolation approach * fix: Fixes new-style up-sampling interpolation for MultiIndexes resulting from groupby-operations * fix: Fixes wrong test case assumption when using linear interpolation on series with datetime index using business days only (test case `pandas.tests.series.methods.test_interpolate.TestSeriesInterpolateData.test_interpolate`). * fix: Fixes wrong test case assumption when using linear interpolation on irregular index (test case `pandas.tests.series.methods.test_interpolate.TestSeriesInterpolateData.test_nan_irregular_index`). * fix: Adds test skips for interpolation methods that require scipy if scipy is not installed * fix: Makes sure keyword arguments "downcast" is not passed to scipy interpolation methods that are not using `interp1d` or spline. * fix: Adjusted expected warning type in `test_groupby_resample_interpolate_off_grid`. * fix: Fixes failing interpolation on groupby if the index has `name`=None. Adds this check to an existing test case. 
* Trigger Actions * feat: Raise error on attempt to interpolate a MultiIndex data frame, providing a useful error message that describes a working alternative syntax. Fixed related test cases and added test that makes sure the error is raised. * Apply suggestions from code review Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * refactor: Adjusted error type assertion in test case * refactor: Removed unused parametrization definitions and switched to direct parametrization for interpolation methods in tests. * fix: Adds forgotten "@" before pytest.mark.parametrize * refactor: Apply suggestions from code review * refactor: Switched to fixture params syntax for test case parametrization * Update pandas/tests/resample/test_time_grouper.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/tests/resample/test_base.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * refactor: Fixes too long line * tests: Fixes test that fails due to unimportant index name comparison * docs: Added entry in whatsnew * Empty-Commit * Empty-Commit * Empty-Commit * docs: Sorted whatsnew * docs: Adjusted bug fix note and moved it to the right section --------- Co-authored-by: Marco Edward Gorelli Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/missing.py | 14 +- pandas/core/resample.py | 68 ++++++++-- .../tests/frame/methods/test_interpolate.py | 2 +- pandas/tests/resample/test_base.py | 73 +++++++++++ pandas/tests/resample/test_time_grouper.py | 120 +++++++++++++----- .../tests/series/methods/test_interpolate.py | 9 +- 7 files changed, 239 insertions(+), 48 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index dee793f5ef002..c77348b365370 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -438,6 +438,7 @@ 
Groupby/resample/rolling 
- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`) - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) +- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 9fef78d9f8c3d..039d868bccd16 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -314,7 +314,16 @@ def get_interp_index(method, index: Index) -> Index: # prior default from pandas import Index - index = Index(np.arange(len(index))) + if isinstance(index.dtype, DatetimeTZDtype) or lib.is_np_dtype( + index.dtype, "mM" + ): + # Convert datetime-like indexes to int64 + index = Index(index.view("i8")) + + elif not is_numeric_dtype(index.dtype): + # We keep behavior consistent with prior versions of pandas for + # non-numeric, non-datetime indexes + index = Index(range(len(index))) else: methods = {"index", "values", "nearest", "time"} is_numeric_or_datetime = ( @@ -616,6 +625,9 @@ def _interpolate_scipy_wrapper( terp = alt_methods.get(method, None) if terp is None: raise ValueError(f"Can not interpolate with method={method}.") + + # Make sure downcast is not in kwargs for alt methods + kwargs.pop("downcast", None) new_y = terp(x, y, new_x, 
**kwargs) return new_y diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 86d1f55f38c05..ccbe25fdae841 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -80,6 +80,7 @@ TimedeltaIndex, timedelta_range, ) +from pandas.core.reshape.concat import concat from pandas.tseries.frequencies import ( is_subperiod, @@ -885,30 +886,59 @@ def interpolate( Freq: 500ms, dtype: float64 Internal reindexing with ``asfreq()`` prior to interpolation leads to - an interpolated timeseries on the basis the reindexed timestamps (anchors). - Since not all datapoints from original series become anchors, - it can lead to misleading interpolation results as in the following example: + an interpolated timeseries on the basis of the reindexed timestamps + (anchors). It is assured that all available datapoints from original + series become anchors, so it also works for resampling-cases that lead + to non-aligned timestamps, as in the following example: >>> series.resample("400ms").interpolate("linear") 2023-03-01 07:00:00.000 1.0 - 2023-03-01 07:00:00.400 1.2 - 2023-03-01 07:00:00.800 1.4 - 2023-03-01 07:00:01.200 1.6 - 2023-03-01 07:00:01.600 1.8 + 2023-03-01 07:00:00.400 0.2 + 2023-03-01 07:00:00.800 -0.6 + 2023-03-01 07:00:01.200 -0.4 + 2023-03-01 07:00:01.600 0.8 2023-03-01 07:00:02.000 2.0 - 2023-03-01 07:00:02.400 2.2 - 2023-03-01 07:00:02.800 2.4 - 2023-03-01 07:00:03.200 2.6 - 2023-03-01 07:00:03.600 2.8 + 2023-03-01 07:00:02.400 1.6 + 2023-03-01 07:00:02.800 1.2 + 2023-03-01 07:00:03.200 1.4 + 2023-03-01 07:00:03.600 2.2 2023-03-01 07:00:04.000 3.0 Freq: 400ms, dtype: float64 - Note that the series erroneously increases between two anchors + Note that the series correctly decreases between two anchors ``07:00:00`` and ``07:00:02``. 
""" assert downcast is lib.no_default # just checking coverage result = self._upsample("asfreq") - return result.interpolate( + + # If the original data has timestamps which are not aligned with the + # target timestamps, we need to add those points back to the data frame + # that is supposed to be interpolated. This does not work with + # PeriodIndex, so we skip this case. GH#21351 + obj = self._selected_obj + is_period_index = isinstance(obj.index, PeriodIndex) + + # Skip this step for PeriodIndex + if not is_period_index: + final_index = result.index + if isinstance(final_index, MultiIndex): + raise NotImplementedError( + "Direct interpolation of MultiIndex data frames is not " + "supported. If you tried to resample and interpolate on a " + "grouped data frame, please use:\n" + "`df.groupby(...).apply(lambda x: x.resample(...)." + "interpolate(...), include_groups=False)`" + "\ninstead, as resampling and interpolation has to be " + "performed for each group independently." + ) + + missing_data_points_index = obj.index.difference(final_index) + if len(missing_data_points_index) > 0: + result = concat( + [result, obj.loc[missing_data_points_index]] + ).sort_index() + + result_interpolated = result.interpolate( method=method, axis=axis, limit=limit, @@ -919,6 +949,18 @@ def interpolate( **kwargs, ) + # No further steps if the original data has a PeriodIndex + if is_period_index: + return result_interpolated + + # Make sure that original data points which do not align with the + # resampled index are removed + result_interpolated = result_interpolated.loc[final_index] + + # Make sure frequency indexes are preserved + result_interpolated.index = final_index + return result_interpolated + @final def asfreq(self, fill_value=None): """ diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 0a9d059736e6f..cdb9ff8a67b6b 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ 
b/pandas/tests/frame/methods/test_interpolate.py @@ -109,7 +109,7 @@ def test_interp_basic_with_non_range_index(self, using_infer_string): else: result = df.set_index("C").interpolate() expected = df.set_index("C") - expected.loc[3, "A"] = 3 + expected.loc[3, "A"] = 2.66667 expected.loc[5, "B"] = 9 tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 9cd51b95d6efd..3428abacd509e 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -25,6 +25,29 @@ from pandas.core.resample import _asfreq_compat +@pytest.fixture( + params=[ + "linear", + "time", + "index", + "values", + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "barycentric", + "krogh", + "from_derivatives", + "piecewise_polynomial", + "pchip", + "akima", + ], +) +def all_1d_no_arg_interpolation_methods(request): + return request.param + + @pytest.mark.parametrize("freq", ["2D", "1h"]) @pytest.mark.parametrize( "index", @@ -91,6 +114,56 @@ def test_resample_interpolate(index): tm.assert_frame_equal(result, expected) +def test_resample_interpolate_regular_sampling_off_grid( + all_1d_no_arg_interpolation_methods, +): + pytest.importorskip("scipy") + # GH#21351 + index = date_range("2000-01-01 00:01:00", periods=5, freq="2h") + ser = Series(np.arange(5.0), index) + + method = all_1d_no_arg_interpolation_methods + # Resample to 1 hour sampling and interpolate with the given method + ser_resampled = ser.resample("1h").interpolate(method) + + # Check that none of the resampled values are NaN, except the first one + # which lies 1 minute before the first actual data point + assert np.isnan(ser_resampled.iloc[0]) + assert not ser_resampled.iloc[1:].isna().any() + + if method not in ["nearest", "zero"]: + # Check that the resampled values are close to the expected values + # except for methods with known inaccuracies + assert np.all( + np.isclose(ser_resampled.values[1:], np.arange(0.5, 4.5, 
0.5), rtol=1.0e-1) + ) + + +def test_resample_interpolate_irregular_sampling(all_1d_no_arg_interpolation_methods): + pytest.importorskip("scipy") + # GH#21351 + ser = Series( + np.linspace(0.0, 1.0, 5), + index=DatetimeIndex( + [ + "2000-01-01 00:00:03", + "2000-01-01 00:00:22", + "2000-01-01 00:00:24", + "2000-01-01 00:00:31", + "2000-01-01 00:00:39", + ] + ), + ) + + # Resample to 5 second sampling and interpolate with the given method + ser_resampled = ser.resample("5s").interpolate(all_1d_no_arg_interpolation_methods) + + # Check that none of the resampled values are NaN, except the first one + # which lies 3 seconds before the first actual data point + assert np.isnan(ser_resampled.iloc[0]) + assert not ser_resampled.iloc[1:].isna().any() + + def test_raises_on_non_datetimelike_index(): # this is a non datetimelike index xp = DataFrame() diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 11ad9240527d5..5f5a54c4d92a3 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -333,26 +333,98 @@ def test_upsample_sum(method, method_args, expected_values): tm.assert_series_equal(result, expected) -def test_groupby_resample_interpolate(): +@pytest.fixture +def groupy_test_df(): + return DataFrame( + {"price": [10, 11, 9], "volume": [50, 60, 50]}, + index=date_range("01/01/2018", periods=3, freq="W"), + ) + + +def test_groupby_resample_interpolate_raises(groupy_test_df): + # GH 35325 + + # Make a copy of the test data frame that has index.name=None + groupy_test_df_without_index_name = groupy_test_df.copy() + groupy_test_df_without_index_name.index.name = None + + dfs = [groupy_test_df, groupy_test_df_without_index_name] + + for df in dfs: + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + with pytest.raises( + NotImplementedError, + match="Direct interpolation of MultiIndex 
data frames is " + "not supported", + ): + df.groupby("volume").resample("1D").interpolate(method="linear") + + +def test_groupby_resample_interpolate_with_apply_syntax(groupy_test_df): # GH 35325 - d = {"price": [10, 11, 9], "volume": [50, 60, 50]} - df = DataFrame(d) + # Make a copy of the test data frame that has index.name=None + groupy_test_df_without_index_name = groupy_test_df.copy() + groupy_test_df_without_index_name.index.name = None - df["week_starting"] = date_range("01/01/2018", periods=3, freq="W") + dfs = [groupy_test_df, groupy_test_df_without_index_name] - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = ( - df.set_index("week_starting") - .groupby("volume") - .resample("1D") - .interpolate(method="linear") + for df in dfs: + result = df.groupby("volume").apply( + lambda x: x.resample("1d").interpolate(method="linear"), + include_groups=False, ) - volume = [50] * 15 + [60] - week_starting = list(date_range("2018-01-07", "2018-01-21")) + [ - Timestamp("2018-01-14") + volume = [50] * 15 + [60] + week_starting = list(date_range("2018-01-07", "2018-01-21")) + [ + Timestamp("2018-01-14") + ] + expected_ind = pd.MultiIndex.from_arrays( + [volume, week_starting], + names=["volume", df.index.name], + ) + + expected = DataFrame( + data={ + "price": [ + 10.0, + 9.928571428571429, + 9.857142857142858, + 9.785714285714286, + 9.714285714285714, + 9.642857142857142, + 9.571428571428571, + 9.5, + 9.428571428571429, + 9.357142857142858, + 9.285714285714286, + 9.214285714285714, + 9.142857142857142, + 9.071428571428571, + 9.0, + 11.0, + ] + }, + index=expected_ind, + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_resample_interpolate_with_apply_syntax_off_grid(groupy_test_df): + """Similar test as test_groupby_resample_interpolate_with_apply_syntax but + with resampling that results in missing anchor points when interpolating. 
+ See GH#21351.""" + # GH#21351 + result = groupy_test_df.groupby("volume").apply( + lambda x: x.resample("265h").interpolate(method="linear"), include_groups=False + ) + + volume = [50, 50, 60] + week_starting = [ + Timestamp("2018-01-07"), + Timestamp("2018-01-18 01:00:00"), + Timestamp("2018-01-14"), ] expected_ind = pd.MultiIndex.from_arrays( [volume, week_starting], @@ -363,24 +435,10 @@ def test_groupby_resample_interpolate(): data={ "price": [ 10.0, - 9.928571428571429, - 9.857142857142858, - 9.785714285714286, - 9.714285714285714, - 9.642857142857142, - 9.571428571428571, - 9.5, - 9.428571428571429, - 9.357142857142858, - 9.285714285714286, - 9.214285714285714, - 9.142857142857142, - 9.071428571428571, - 9.0, + 9.21131, 11.0, - ], - "volume": [50.0] * 15 + [60], + ] }, index=expected_ind, ) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_names=False) diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index 1008c2c87dc9e..ff7f8d0b7fa72 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -94,7 +94,12 @@ def test_interpolate(self, datetime_series): ts = Series(np.arange(len(datetime_series), dtype=float), datetime_series.index) ts_copy = ts.copy() - ts_copy[5:10] = np.nan + + # Set data between Tuesday and Thursday to NaN for 2 consecutive weeks. + # Linear interpolation should fill in the missing values correctly, + # as the index is equally-spaced within each week. 
+ ts_copy[1:4] = np.nan + ts_copy[6:9] = np.nan linear_interp = ts_copy.interpolate(method="linear") tm.assert_series_equal(linear_interp, ts) @@ -265,7 +270,7 @@ def test_nan_interpolate(self, kwargs): def test_nan_irregular_index(self): s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9]) result = s.interpolate() - expected = Series([1.0, 2.0, 3.0, 4.0], index=[1, 3, 5, 9]) + expected = Series([1.0, 2.0, 2.6666666666666665, 4.0], index=[1, 3, 5, 9]) tm.assert_series_equal(result, expected) def test_nan_str_index(self): From 6320c8bb3287fd603dc7e014daf8d695a510024b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 24 Apr 2024 16:26:28 -0700 Subject: [PATCH 041/100] REF: use maybe_convert_objects in pd.array (#56484) * REF: use maybe_convert_objects in pd.array * lint fixups * Update pandas/_libs/lib.pyx Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/_libs/lib.pyx | 43 ++++++++--- pandas/core/construction.py | 100 +++++++++++++++----------- pandas/tests/arrays/test_array.py | 16 +++++ pandas/tests/dtypes/test_inference.py | 4 +- 4 files changed, 109 insertions(+), 54 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 24afbe3a07bf1..5b6d83ba8e9ee 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2628,7 +2628,11 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True break elif val is C_NA: - seen.object_ = True + if convert_to_nullable_dtype: + seen.null_ = True + mask[i] = True + else: + seen.object_ = True continue else: seen.object_ = True @@ -2691,6 +2695,12 @@ def maybe_convert_objects(ndarray[object] objects, dtype = StringDtype(storage="pyarrow_numpy") return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): + from pandas.core.arrays.string_ import StringDtype + + dtype = 
StringDtype() + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + seen.object_ = True elif seen.interval_: if is_interval_array(objects): @@ -2734,12 +2744,12 @@ def maybe_convert_objects(ndarray[object] objects, return objects if seen.bool_: - if seen.is_bool: - # is_bool property rules out everything else - return bools.view(np.bool_) - elif convert_to_nullable_dtype and seen.is_bool_or_na: + if convert_to_nullable_dtype and seen.is_bool_or_na: from pandas.core.arrays import BooleanArray return BooleanArray(bools.view(np.bool_), mask) + elif seen.is_bool: + # is_bool property rules out everything else + return bools.view(np.bool_) seen.object_ = True if not seen.object_: @@ -2752,11 +2762,11 @@ def maybe_convert_objects(ndarray[object] objects, result = floats elif seen.int_ or seen.uint_: if convert_to_nullable_dtype: - from pandas.core.arrays import IntegerArray + # Below we will wrap in IntegerArray if seen.uint_: - result = IntegerArray(uints, mask) + result = uints else: - result = IntegerArray(ints, mask) + result = ints else: result = floats elif seen.nan_: @@ -2771,7 +2781,6 @@ def maybe_convert_objects(ndarray[object] objects, result = uints else: result = ints - else: # don't cast int to float, etc. if seen.null_: @@ -2794,6 +2803,22 @@ def maybe_convert_objects(ndarray[object] objects, else: result = ints + # TODO: do these after the itemsize check? 
+ if (result is ints or result is uints) and convert_to_nullable_dtype: + from pandas.core.arrays import IntegerArray + + # Set these values to 1 to be deterministic, match + # IntegerArray._internal_fill_value + result[mask] = 1 + result = IntegerArray(result, mask) + elif result is floats and convert_to_nullable_dtype: + from pandas.core.arrays import FloatingArray + + # Set these values to 1.0 to be deterministic, match + # FloatingArray._internal_fill_value + result[mask] = 1.0 + result = FloatingArray(result, mask) + if result is uints or result is ints or result is floats or result is complexes: # cast to the largest itemsize when all values are NumPy scalars if itemsize_max > 0 and itemsize_max != result.dtype.itemsize: diff --git a/pandas/core/construction.py b/pandas/core/construction.py index ec49340e9a516..2718e9819cdf8 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -7,11 +7,8 @@ from __future__ import annotations -from collections.abc import Sequence from typing import ( TYPE_CHECKING, - Optional, - Union, cast, overload, ) @@ -23,17 +20,9 @@ from pandas._libs import lib from pandas._libs.tslibs import ( - Period, get_supported_dtype, is_supported_dtype, ) -from pandas._typing import ( - AnyArrayLike, - ArrayLike, - Dtype, - DtypeObj, - T, -) from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import ( @@ -46,6 +35,7 @@ maybe_promote, ) from pandas.core.dtypes.common import ( + ensure_object, is_list_like, is_object_dtype, is_string_dtype, @@ -63,11 +53,25 @@ import pandas.core.common as com if TYPE_CHECKING: + from collections.abc import Sequence + + from pandas._typing import ( + AnyArrayLike, + ArrayLike, + Dtype, + DtypeObj, + T, + ) + from pandas import ( Index, Series, ) - from pandas.core.arrays.base import ExtensionArray + from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + TimedeltaArray, + ) def array( @@ -286,9 +290,7 @@ def array( ExtensionArray, FloatingArray, 
IntegerArray, - IntervalArray, NumpyExtensionArray, - PeriodArray, TimedeltaArray, ) from pandas.core.arrays.string_ import StringDtype @@ -320,46 +322,58 @@ def array( return cls._from_sequence(data, dtype=dtype, copy=copy) if dtype is None: - inferred_dtype = lib.infer_dtype(data, skipna=True) - if inferred_dtype == "period": - period_data = cast(Union[Sequence[Optional[Period]], AnyArrayLike], data) - return PeriodArray._from_sequence(period_data, copy=copy) - - elif inferred_dtype == "interval": - return IntervalArray(data, copy=copy) - - elif inferred_dtype.startswith("datetime"): - # datetime, datetime64 - try: - return DatetimeArray._from_sequence(data, copy=copy) - except ValueError: - # Mixture of timezones, fall back to NumpyExtensionArray - pass - - elif inferred_dtype.startswith("timedelta"): - # timedelta, timedelta64 - return TimedeltaArray._from_sequence(data, copy=copy) - - elif inferred_dtype == "string": + was_ndarray = isinstance(data, np.ndarray) + # error: Item "Sequence[object]" of "Sequence[object] | ExtensionArray | + # ndarray[Any, Any]" has no attribute "dtype" + if not was_ndarray or data.dtype == object: # type: ignore[union-attr] + result = lib.maybe_convert_objects( + ensure_object(data), + convert_non_numeric=True, + convert_to_nullable_dtype=True, + dtype_if_all_nat=None, + ) + result = ensure_wrapped_if_datetimelike(result) + if isinstance(result, np.ndarray): + if len(result) == 0 and not was_ndarray: + # e.g. 
empty list + return FloatingArray._from_sequence(data, dtype="Float64") + return NumpyExtensionArray._from_sequence( + data, dtype=result.dtype, copy=copy + ) + if result is data and copy: + return result.copy() + return result + + data = cast(np.ndarray, data) + result = ensure_wrapped_if_datetimelike(data) + if result is not data: + result = cast("DatetimeArray | TimedeltaArray", result) + if copy and result.dtype == data.dtype: + return result.copy() + return result + + if data.dtype.kind in "SU": # StringArray/ArrowStringArray depending on pd.options.mode.string_storage dtype = StringDtype() cls = dtype.construct_array_type() return cls._from_sequence(data, dtype=dtype, copy=copy) - elif inferred_dtype == "integer": + elif data.dtype.kind in "iu": return IntegerArray._from_sequence(data, copy=copy) - elif inferred_dtype == "empty" and not hasattr(data, "dtype") and not len(data): - return FloatingArray._from_sequence(data, copy=copy) - elif ( - inferred_dtype in ("floating", "mixed-integer-float") - and getattr(data, "dtype", None) != np.float16 - ): + elif data.dtype.kind == "f": # GH#44715 Exclude np.float16 bc FloatingArray does not support it; # we will fall back to NumpyExtensionArray. + if data.dtype == np.float16: + return NumpyExtensionArray._from_sequence( + data, dtype=data.dtype, copy=copy + ) return FloatingArray._from_sequence(data, copy=copy) - elif inferred_dtype == "boolean": + elif data.dtype.kind == "b": return BooleanArray._from_sequence(data, dtype="boolean", copy=copy) + else: + # e.g. complex + return NumpyExtensionArray._from_sequence(data, dtype=data.dtype, copy=copy) # Pandas overrides NumPy for # 1. 
datetime64[ns,us,ms,s] diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 50dafb5dbbb06..857509e18fa8e 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -220,6 +220,14 @@ def test_dt64_array(dtype_unit): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + None, + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype()), + ), # Boolean ( [True, None], @@ -247,6 +255,14 @@ def test_dt64_array(dtype_unit): "category", pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]), ), + # Complex + ( + np.array([complex(1), complex(2)], dtype=np.complex128), + None, + NumpyExtensionArray( + np.array([complex(1), complex(2)], dtype=np.complex128) + ), + ), ], ) def test_array(data, dtype, expected): diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 668e7192c0e52..f4282c9c7ac3a 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -936,9 +936,9 @@ def test_maybe_convert_objects_bool_nan(self): def test_maybe_convert_objects_nullable_boolean(self): # GH50047 arr = np.array([True, False], dtype=object) - exp = np.array([True, False]) + exp = BooleanArray._from_sequence([True, False], dtype="boolean") out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) - tm.assert_numpy_array_equal(out, exp) + tm.assert_extension_array_equal(out, exp) arr = np.array([True, False, pd.NaT], dtype=object) exp = np.array([True, False, pd.NaT], dtype=object) From 2c8c0e2210dcf57875a9b991cf68fcd082271446 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 25 Apr 2024 22:43:16 +0530 Subject: [PATCH 042/100] DOC: Enforce Numpy Docstring Validation for pandas.Index.item (#58400) * DOC: add SA01 to pandas.Index.item * DOC: remove pandas.Index.item * DOC: remove 
pandas.Series.item --------- Co-authored-by: aBiR1D --- ci/code_checks.sh | 2 -- pandas/core/base.py | 5 +++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index bf7423dfe5825..570ea1272758a 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -134,7 +134,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.inferred_type SA01" \ -i "pandas.Index.insert PR07,RT03,SA01" \ -i "pandas.Index.intersection PR07,RT03,SA01" \ - -i "pandas.Index.item SA01" \ -i "pandas.Index.join PR07,RT03,SA01" \ -i "pandas.Index.memory_usage RT03" \ -i "pandas.Index.names GL08" \ @@ -288,7 +287,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.is_monotonic_decreasing SA01" \ -i "pandas.Series.is_monotonic_increasing SA01" \ -i "pandas.Series.is_unique SA01" \ - -i "pandas.Series.item SA01" \ -i "pandas.Series.kurt RT03,SA01" \ -i "pandas.Series.kurtosis RT03,SA01" \ -i "pandas.Series.le PR07,SA01" \ diff --git a/pandas/core/base.py b/pandas/core/base.py index d716a9ffb7bcc..ab27248308d74 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -398,6 +398,11 @@ def item(self): ValueError If the data is not length = 1. + See Also + -------- + Index.values : Returns an array representing the data in the Index. + Series.head : Returns the first `n` rows. 
+ Examples -------- >>> s = pd.Series([1]) From 8a9325fa6343f01fd3c9795283a84a160a52643d Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 25 Apr 2024 22:48:20 +0530 Subject: [PATCH 043/100] DOC: Enforce Numpy Docstring Validation for pandas.Index.fillna (#58417) * DOC: add RT03 to pandas.Index.fillna * DOC: remove pandas.Index.fillna --- ci/code_checks.sh | 1 - pandas/core/indexes/base.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 570ea1272758a..51745b208c786 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -124,7 +124,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.dropna RT03,SA01" \ -i "pandas.Index.duplicated RT03" \ -i "pandas.Index.empty GL08" \ - -i "pandas.Index.fillna RT03" \ -i "pandas.Index.get_indexer PR07,SA01" \ -i "pandas.Index.get_indexer_for PR01,SA01" \ -i "pandas.Index.get_indexer_non_unique PR07,SA01" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2bb0aedb8bd84..ffc228d57a95b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2578,6 +2578,7 @@ def fillna(self, value): Returns ------- Index + NA/NaN values replaced with `value`. 
See Also -------- From a0977f5b5d9614441b908409272eb97e211332ec Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 25 Apr 2024 22:48:53 +0530 Subject: [PATCH 044/100] DOC: Enforce Numpy Docstring Validation for pandas.DatetimeIndex.tzconvert (#58416) * DOC: remove pandas.DatetimeIndex.tz_convert * DOC: add RT03 to pandas.DatetimeIndex.tz_convert * DOC: removed RT03 from pandas.Series.dt.tz_convert --- ci/code_checks.sh | 3 +-- pandas/core/arrays/datetimes.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 51745b208c786..e2d125ad1fc68 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -109,7 +109,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DatetimeIndex.std PR01,RT03" \ -i "pandas.DatetimeIndex.to_period RT03" \ -i "pandas.DatetimeIndex.to_pydatetime RT03,SA01" \ - -i "pandas.DatetimeIndex.tz_convert RT03" \ -i "pandas.DatetimeTZDtype SA01" \ -i "pandas.DatetimeTZDtype.tz SA01" \ -i "pandas.Grouper PR02" \ @@ -272,7 +271,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.strftime PR01,PR02" \ -i "pandas.Series.dt.to_period PR01,PR02,RT03" \ -i "pandas.Series.dt.total_seconds PR01" \ - -i "pandas.Series.dt.tz_convert PR01,PR02,RT03" \ + -i "pandas.Series.dt.tz_convert PR01,PR02" \ -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.dtype SA01" \ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 25c7f926d19a8..106064ade8344 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -867,6 +867,7 @@ def tz_convert(self, tz) -> Self: Returns ------- Array or Index + Datetime Array/Index with target `tz`. 
Raises ------ From 12e47e96a81d65d3a781363b49d05787a5572d58 Mon Sep 17 00:00:00 2001 From: Pascal Corpet Date: Thu, 25 Apr 2024 19:51:33 +0200 Subject: [PATCH 045/100] [Typing] Enhance the WriteExcelBuffer protocol to be compatible with io.BinaryIO (#58422) TYP: Enhance the WriteExcelBuffer protocol to be compatible with io.BinaryIO --- pandas/_typing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 172b30c59fc13..ef68018f2721a 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -314,7 +314,7 @@ def readline(self) -> bytes: ... class WriteExcelBuffer(WriteBuffer[bytes], Protocol): - def truncate(self, size: int | None = ...) -> int: ... + def truncate(self, size: int | None = ..., /) -> int: ... class ReadCsvBuffer(ReadBuffer[AnyStr_co], Protocol): From cbbe3a26b4dbcebff5e68f361a46bc0f2610b2ff Mon Sep 17 00:00:00 2001 From: shriyakalakata <87483933+shriyakalakata@users.noreply.github.com> Date: Thu, 25 Apr 2024 16:18:00 -0400 Subject: [PATCH 046/100] DOC: Fix DataFrame.reorder_levels SA01 error (#58431) --- ci/code_checks.sh | 1 - pandas/core/frame.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e2d125ad1fc68..3286cb74c3119 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -92,7 +92,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.plot PR02,SA01" \ -i "pandas.DataFrame.prod RT03" \ -i "pandas.DataFrame.product RT03" \ - -i "pandas.DataFrame.reorder_levels SA01" \ -i "pandas.DataFrame.sem PR01,RT03,SA01" \ -i "pandas.DataFrame.skew RT03,SA01" \ -i "pandas.DataFrame.sparse PR01" \ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e8a0e37b70145..618218a70b557 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7696,6 +7696,10 @@ def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFram DataFrame DataFrame with indices or columns with reordered 
levels. + See Also + -------- + DataFrame.swaplevel : Swap levels i and j in a MultiIndex. + Examples -------- >>> data = { From 926a9c35fc8ae448be5dea0239ea1da1013a043a Mon Sep 17 00:00:00 2001 From: shriyakalakata <87483933+shriyakalakata@users.noreply.github.com> Date: Thu, 25 Apr 2024 16:19:00 -0400 Subject: [PATCH 047/100] DOC: Fix RT03 errors for DataFrame.infer_objects, DataFrame.hist, DataFrame.to_parquet (#58429) * Fix RT03 errors * Fix RT03 errors --- ci/code_checks.sh | 4 ---- pandas/core/frame.py | 3 +++ pandas/core/generic.py | 1 + pandas/plotting/_core.py | 1 + 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 3286cb74c3119..44017c575a516 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -81,8 +81,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.CategoricalIndex.ordered SA01" \ -i "pandas.DataFrame.__dataframe__ SA01" \ -i "pandas.DataFrame.at_time PR01" \ - -i "pandas.DataFrame.hist RT03" \ - -i "pandas.DataFrame.infer_objects RT03" \ -i "pandas.DataFrame.kurt RT03,SA01" \ -i "pandas.DataFrame.kurtosis RT03,SA01" \ -i "pandas.DataFrame.max RT03" \ @@ -99,7 +97,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.sum RT03" \ -i "pandas.DataFrame.swaplevel SA01" \ -i "pandas.DataFrame.to_markdown SA01" \ - -i "pandas.DataFrame.to_parquet RT03" \ -i "pandas.DataFrame.var PR01,RT03,SA01" \ -i "pandas.DatetimeIndex.freqstr SA01" \ -i "pandas.DatetimeIndex.indexer_at_time PR01,RT03" \ @@ -280,7 +277,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.ge PR07,SA01" \ -i "pandas.Series.gt PR07,SA01" \ -i "pandas.Series.hasnans SA01" \ - -i "pandas.Series.infer_objects RT03" \ -i "pandas.Series.is_monotonic_decreasing SA01" \ -i "pandas.Series.is_monotonic_increasing SA01" \ -i "pandas.Series.is_unique SA01" \ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 618218a70b557..9fbbc2c08efaa 100644 --- 
a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2876,6 +2876,9 @@ def to_parquet( Returns ------- bytes if no path argument is provided else None + Returns the DataFrame converted to the binary parquet format as bytes if no + path argument is provided. Returns None and writes the DataFrame to the specified + location in the Parquet format if the path argument is provided. See Also -------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a7f155ec93524..121f49cb7d1cf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6579,6 +6579,7 @@ def infer_objects(self, copy: bool | lib.NoDefault = lib.no_default) -> Self: Returns ------- same type as input object + Returns an object of the same type as the input object. See Also -------- diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 60bb45d3ac1dc..ea5daf02b7252 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -233,6 +233,7 @@ def hist_frame( Returns ------- matplotlib.Axes or numpy.ndarray of them + Returns an AxesSubplot object or a numpy array of AxesSubplot objects.
See Also -------- From 1fec924f9fb4096e80c9a732a62686a4ec275d8c Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 26 Apr 2024 01:49:55 +0530 Subject: [PATCH 048/100] DOC: Enforce Numpy Docstring Validation for pandas.DatetimeIndex.indexer_between_time (#58415) * DOC: add RT03 to pandas.DatetimeIndex.indexer_between_time * DOC: remove pandas.DatetimeIndex.indexer_between_time --- ci/code_checks.sh | 1 - pandas/core/indexes/datetimes.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 44017c575a516..740814151aaf4 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -100,7 +100,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.var PR01,RT03,SA01" \ -i "pandas.DatetimeIndex.freqstr SA01" \ -i "pandas.DatetimeIndex.indexer_at_time PR01,RT03" \ - -i "pandas.DatetimeIndex.indexer_between_time RT03" \ -i "pandas.DatetimeIndex.snap PR01,RT03" \ -i "pandas.DatetimeIndex.std PR01,RT03" \ -i "pandas.DatetimeIndex.to_period RT03" \ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 6d5f32774f485..951455b627fbd 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -759,6 +759,7 @@ def indexer_between_time( Returns ------- np.ndarray[np.intp] + Index locations of values between particular times of day. See Also -------- From 114845c952c3d3405c897b4566b584fec94373fe Mon Sep 17 00:00:00 2001 From: William Andrea <22385371+wjandrea@users.noreply.github.com> Date: Thu, 25 Apr 2024 16:21:30 -0400 Subject: [PATCH 049/100] DOC: Fix "versionadded" for case_when (#58426) Fix "versionadded" for case_when Tag was on parameter instead of function itself. 
--- pandas/core/series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index a72eb8e261e65..c1920312489c9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5359,6 +5359,8 @@ def case_when( """ Replace values where the conditions are True. + .. versionadded:: 2.2.0 + Parameters ---------- caselist : A list of tuples of conditions and expected replacements @@ -5376,8 +5378,6 @@ def case_when( must not change the input Series (though pandas doesn`t check it). - .. versionadded:: 2.2.0 - Returns ------- Series From 8f33ae0219d9c7b1260745d6090fd46a545e4fc4 Mon Sep 17 00:00:00 2001 From: KeiOshima Date: Thu, 25 Apr 2024 16:23:47 -0400 Subject: [PATCH 050/100] DOC: fixing SA01 error for Index: T and empty (#58430) * DOC: fixing SA01 error for Index: T and empty * fixing EXPECTED TO FAIL, BUT NOT FAILING error --- ci/code_checks.sh | 4 ---- pandas/core/base.py | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 740814151aaf4..49089e903c8ba 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -108,7 +108,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DatetimeTZDtype.tz SA01" \ -i "pandas.Grouper PR02" \ -i "pandas.Index PR07" \ - -i "pandas.Index.T SA01" \ -i "pandas.Index.append PR07,RT03,SA01" \ -i "pandas.Index.copy PR07,SA01" \ -i "pandas.Index.difference PR07,RT03,SA01" \ @@ -117,7 +116,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.droplevel RT03,SA01" \ -i "pandas.Index.dropna RT03,SA01" \ -i "pandas.Index.duplicated RT03" \ - -i "pandas.Index.empty GL08" \ -i "pandas.Index.get_indexer PR07,SA01" \ -i "pandas.Index.get_indexer_for PR01,SA01" \ -i "pandas.Index.get_indexer_non_unique PR07,SA01" \ @@ -229,7 +227,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.RangeIndex.step SA01" \ -i 
"pandas.RangeIndex.stop SA01" \ -i "pandas.Series SA01" \ - -i "pandas.Series.T SA01" \ -i "pandas.Series.__iter__ RT03,SA01" \ -i "pandas.Series.add PR07" \ -i "pandas.Series.at_time PR01" \ @@ -270,7 +267,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.dtype SA01" \ - -i "pandas.Series.empty GL08" \ -i "pandas.Series.eq PR07,SA01" \ -i "pandas.Series.floordiv PR07" \ -i "pandas.Series.ge PR07,SA01" \ diff --git a/pandas/core/base.py b/pandas/core/base.py index ab27248308d74..72d8c1b837398 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -309,6 +309,10 @@ def transpose(self, *args, **kwargs) -> Self: doc=""" Return the transpose, which is by definition self. + See Also + -------- + Index : Immutable sequence used for indexing and alignment. + Examples -------- For Series: @@ -691,6 +695,40 @@ def to_numpy( @final @property def empty(self) -> bool: + """ + Indicator whether Index is empty. + + Returns + ------- + bool + If Index is empty, return True, if not return False. + + See Also + -------- + Index.size : Return the number of elements in the underlying data. + + Examples + -------- + >>> idx_empty = pd.Index([1, 2, 3]) + >>> idx_empty + Index([1, 2, 3], dtype='int64') + >>> idx_empty.empty + False + + >>> idx_empty = pd.Index([]) + >>> idx_empty + Index([], dtype='object') + >>> idx_empty.empty + True + + If we only have NaNs in our DataFrame, it is not considered empty! 
+ + >>> idx_empty = pd.Index([np.nan, np.nan]) + >>> idx_empty + Index([nan, nan], dtype='float64') + >>> idx_empty.empty + False + """ return not self.size @doc(op="max", oppose="min", value="largest") From a149abd4d71ac07975b6e849a219c1db676eeceb Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 26 Apr 2024 01:54:39 +0530 Subject: [PATCH 051/100] DOC: Enforce Numpy Docstring Validation for pandas.DatetimeTZDtype.tz (#58401) * DOC: add SA01 to pandas.DatetimeTZDtype.tz * DOC: remove pandas.DatetimeTZDtype.tz --- ci/code_checks.sh | 1 - pandas/core/dtypes/dtypes.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 49089e903c8ba..c1d60c4d9900a 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -105,7 +105,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DatetimeIndex.to_period RT03" \ -i "pandas.DatetimeIndex.to_pydatetime RT03,SA01" \ -i "pandas.DatetimeTZDtype SA01" \ - -i "pandas.DatetimeTZDtype.tz SA01" \ -i "pandas.Grouper PR02" \ -i "pandas.Index PR07" \ -i "pandas.Index.append PR07,RT03,SA01" \ diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 0a97a0d03c22a..5ff7ca33d18bd 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -811,6 +811,10 @@ def tz(self) -> tzinfo: """ The timezone. + See Also + -------- + DatetimeTZDtype.unit : Retrieves precision of the datetime data. 
+ Examples -------- >>> from zoneinfo import ZoneInfo From 39363cfe531648a35b806d187e1fb3a39a0c0203 Mon Sep 17 00:00:00 2001 From: KeiOshima Date: Thu, 25 Apr 2024 16:26:56 -0400 Subject: [PATCH 052/100] DOC: ficing RT03 errors for Index: drop_duplicates and memory_usage (#58434) --- ci/code_checks.sh | 2 -- pandas/core/base.py | 1 + pandas/core/indexes/base.py | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c1d60c4d9900a..b912a40e6d04e 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -111,7 +111,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.copy PR07,SA01" \ -i "pandas.Index.difference PR07,RT03,SA01" \ -i "pandas.Index.drop PR07,SA01" \ - -i "pandas.Index.drop_duplicates RT03" \ -i "pandas.Index.droplevel RT03,SA01" \ -i "pandas.Index.dropna RT03,SA01" \ -i "pandas.Index.duplicated RT03" \ @@ -125,7 +124,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.insert PR07,RT03,SA01" \ -i "pandas.Index.intersection PR07,RT03,SA01" \ -i "pandas.Index.join PR07,RT03,SA01" \ - -i "pandas.Index.memory_usage RT03" \ -i "pandas.Index.names GL08" \ -i "pandas.Index.nunique RT03" \ -i "pandas.Index.putmask PR01,RT03" \ diff --git a/pandas/core/base.py b/pandas/core/base.py index 72d8c1b837398..f535f0c55415a 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1170,6 +1170,7 @@ def _memory_usage(self, deep: bool = False) -> int: Returns ------- bytes used + Returns memory usage of the values in the Index in bytes. See Also -------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ffc228d57a95b..ace082fba609a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2684,6 +2684,7 @@ def drop_duplicates(self, *, keep: DropKeep = "first") -> Self: Returns ------- Index + A new Index object with the duplicate values removed. 
See Also -------- From 7c836ed2ecaec55b788aedf053b74ee2a84685da Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 26 Apr 2024 01:59:26 +0530 Subject: [PATCH 053/100] DOC: Enforce Numpy Docstring Validation for pandas.DatetimeIndex.freqstr (#58309) * DOC: add SA01 to pandas.DatetimeIndex.freqstr * DOC: remove pandas.DatetimeIndex.freqstr * DOC: removed pandas.PeriodIndex.freqstr --- ci/code_checks.sh | 2 -- pandas/core/arrays/datetimelike.py | 5 +++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b912a40e6d04e..9aae477ca1af3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -98,7 +98,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.swaplevel SA01" \ -i "pandas.DataFrame.to_markdown SA01" \ -i "pandas.DataFrame.var PR01,RT03,SA01" \ - -i "pandas.DatetimeIndex.freqstr SA01" \ -i "pandas.DatetimeIndex.indexer_at_time PR01,RT03" \ -i "pandas.DatetimeIndex.snap PR01,RT03" \ -i "pandas.DatetimeIndex.std PR01,RT03" \ @@ -203,7 +202,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.PeriodIndex.dayofyear SA01" \ -i "pandas.PeriodIndex.days_in_month SA01" \ -i "pandas.PeriodIndex.daysinmonth SA01" \ - -i "pandas.PeriodIndex.freqstr SA01" \ -i "pandas.PeriodIndex.from_fields PR07,SA01" \ -i "pandas.PeriodIndex.from_ordinals SA01" \ -i "pandas.PeriodIndex.hour SA01" \ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ff8b16b3361ee..ab17ae43215d2 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -875,6 +875,11 @@ def freqstr(self) -> str | None: """ Return the frequency object as a string if it's set, otherwise None. + See Also + -------- + DatetimeIndex.inferred_freq : Returns a string representing a frequency + generated by infer_freq. 
+ Examples -------- For DatetimeIndex: From 87b5a827c6178216732057e866095dd1eb99f8c3 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 26 Apr 2024 23:23:26 +0530 Subject: [PATCH 054/100] DOC: Enforce Numpy Docstring Validation for pandas.DatetimeTZDtype (#58402) * DOC: add SA01 to pandas.DatetimeTZDtype * DOC: remove pandas.DatetimeTZDtype * DOC: add . * DOC: delete tz and tz_convert --- ci/code_checks.sh | 1 - pandas/core/dtypes/dtypes.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 9aae477ca1af3..2ae74cfbe6e2e 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -103,7 +103,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DatetimeIndex.std PR01,RT03" \ -i "pandas.DatetimeIndex.to_period RT03" \ -i "pandas.DatetimeIndex.to_pydatetime RT03,SA01" \ - -i "pandas.DatetimeTZDtype SA01" \ -i "pandas.Grouper PR02" \ -i "pandas.Index PR07" \ -i "pandas.Index.append PR07,RT03,SA01" \ diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 5ff7ca33d18bd..778b6bd6f3f18 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -717,6 +717,11 @@ class DatetimeTZDtype(PandasExtensionDtype): ZoneInfoNotFoundError When the requested timezone cannot be found. + See Also + -------- + numpy.datetime64 : Numpy data type for datetime. + datetime.datetime : Python datetime object. 
+ Examples -------- >>> from zoneinfo import ZoneInfo From 362278a1c4a6022b57be73d7d73a293c1c0abd76 Mon Sep 17 00:00:00 2001 From: shriyakalakata <87483933+shriyakalakata@users.noreply.github.com> Date: Fri, 26 Apr 2024 13:54:25 -0400 Subject: [PATCH 055/100] DOC: Fix Index.inferred type SA01 and Index.slice_locs RT03 errors (#58435) Fix Index.inferred type SA01 and Index.slice_locs RT03 errors --- ci/code_checks.sh | 2 -- pandas/core/indexes/base.py | 6 ++++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2ae74cfbe6e2e..f7eb16b4a85b5 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -118,7 +118,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.get_loc PR07,RT03,SA01" \ -i "pandas.Index.get_slice_bound PR07" \ -i "pandas.Index.identical PR01,SA01" \ - -i "pandas.Index.inferred_type SA01" \ -i "pandas.Index.insert PR07,RT03,SA01" \ -i "pandas.Index.intersection PR07,RT03,SA01" \ -i "pandas.Index.join PR07,RT03,SA01" \ @@ -128,7 +127,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.ravel PR01,RT03" \ -i "pandas.Index.reindex PR07" \ -i "pandas.Index.slice_indexer PR07,RT03,SA01" \ - -i "pandas.Index.slice_locs RT03" \ -i "pandas.Index.str PR01,SA01" \ -i "pandas.Index.symmetric_difference PR07,RT03,SA01" \ -i "pandas.Index.take PR01,PR07" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ace082fba609a..61ba2fc7088fd 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2360,6 +2360,10 @@ def inferred_type(self) -> str_t: """ Return a string of the type inferred from the values. + See Also + -------- + Index.dtype : Return the dtype object of the underlying data. 
+ Examples -------- >>> idx = pd.Index([1, 2, 3]) @@ -6471,6 +6475,8 @@ def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: Returns ------- tuple[int, int] + Returns a tuple of two integers representing the slice locations for the + input labels within the index. See Also -------- From 4f35184ac19d942ad1fef9f70ef860d5f6c0ff81 Mon Sep 17 00:00:00 2001 From: shriyakalakata <87483933+shriyakalakata@users.noreply.github.com> Date: Fri, 26 Apr 2024 13:56:19 -0400 Subject: [PATCH 056/100] DOC: Fix RT03 and SA01 errors for Index.droplevel, Index.dropna (#58433) * Fix RT03 and SA01 errors for Index.droplevel, Index.dropna * Remove line from code_checks.sh --- ci/code_checks.sh | 3 --- pandas/core/indexes/base.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f7eb16b4a85b5..08dedb1b13a66 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -109,8 +109,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.copy PR07,SA01" \ -i "pandas.Index.difference PR07,RT03,SA01" \ -i "pandas.Index.drop PR07,SA01" \ - -i "pandas.Index.droplevel RT03,SA01" \ - -i "pandas.Index.dropna RT03,SA01" \ -i "pandas.Index.duplicated RT03" \ -i "pandas.Index.get_indexer PR07,SA01" \ -i "pandas.Index.get_indexer_for PR01,SA01" \ @@ -158,7 +156,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.MultiIndex.append PR07,SA01" \ -i "pandas.MultiIndex.copy PR07,RT03,SA01" \ -i "pandas.MultiIndex.drop PR07,RT03,SA01" \ - -i "pandas.MultiIndex.droplevel RT03,SA01" \ -i "pandas.MultiIndex.dtypes SA01" \ -i "pandas.MultiIndex.get_indexer PR07,SA01" \ -i "pandas.MultiIndex.get_level_values SA01" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 61ba2fc7088fd..ebdaaf4be8419 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2093,6 +2093,12 @@ def droplevel(self, level: IndexLabel = 0): Returns ------- Index or 
MultiIndex + Returns an Index or MultiIndex object, depending on the resulting index + after removing the requested level(s). + + See Also + -------- + Index.dropna : Return Index without NA/NaN values. Examples -------- @@ -2619,6 +2625,12 @@ def dropna(self, how: AnyAll = "any") -> Self: Returns ------- Index + Returns an Index object after removing NA/NaN values. + + See Also + -------- + Index.fillna : Fill NA/NaN values with the specified value. + Index.isna : Detect missing values. Examples -------- From 13771ab411b37df9545b3b6cb16dc776a825eca1 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 26 Apr 2024 23:32:05 +0530 Subject: [PATCH 057/100] DOC: Enforce Numpy Docstring Validation for pandas.DatetimeIndex.std (#58439) * DOC: add PR01,RT03 for pandas.DatetimeIndex.std * DOC: remove PR01,RT03 for pandas.DatetimeIndex.std --- ci/code_checks.sh | 1 - pandas/core/arrays/datetimes.py | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 08dedb1b13a66..2639a7b25f389 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -100,7 +100,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.var PR01,RT03,SA01" \ -i "pandas.DatetimeIndex.indexer_at_time PR01,RT03" \ -i "pandas.DatetimeIndex.snap PR01,RT03" \ - -i "pandas.DatetimeIndex.std PR01,RT03" \ -i "pandas.DatetimeIndex.to_period RT03" \ -i "pandas.DatetimeIndex.to_pydatetime RT03,SA01" \ -i "pandas.Grouper PR02" \ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 106064ade8344..0f59d62339bf2 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2248,9 +2248,25 @@ def std( axis : int, optional Axis for the function to be applied on. For :class:`pandas.Series` this parameter is unused and defaults to ``None``. + dtype : dtype, optional, default None + Type to use in computing the standard deviation. 
For arrays of + integer type the default is float64, for arrays of float types + it is the same as the array type. + out : ndarray, optional, default None + Alternative output array in which to place the result. It must have + the same shape as the expected output but the type (of the + calculated values) will be cast if necessary. ddof : int, default 1 Degrees of Freedom. The divisor used in calculations is `N - ddof`, where `N` represents the number of elements. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left in the + result as dimensions with size one. With this option, the result + will broadcast correctly against the input array. If the default + value is passed, then keepdims will not be passed through to the + std method of sub-classes of ndarray, however any non-default value + will be. If the sub-class method does not implement keepdims any + exceptions will be raised. skipna : bool, default True Exclude NA/null values. If an entire row/column is ``NA``, the result will be ``NA``. @@ -2258,6 +2274,7 @@ def std( Returns ------- Timedelta + Standard deviation over requested axis. 
See Also -------- From a1fc8e8147efb0c7d7e10e674c3ee383b14f2d43 Mon Sep 17 00:00:00 2001 From: KeiOshima Date: Fri, 26 Apr 2024 14:03:47 -0400 Subject: [PATCH 058/100] DOC: fix PR07 and SA01 issue for Index: copy and get_slice_bound (#58443) * DOC: fix PR07 and SA01 issue for Index: copy and get_slice_bound * ficing line to long error --- ci/code_checks.sh | 2 -- pandas/core/indexes/base.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2639a7b25f389..26c8ae1298630 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -105,7 +105,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Grouper PR02" \ -i "pandas.Index PR07" \ -i "pandas.Index.append PR07,RT03,SA01" \ - -i "pandas.Index.copy PR07,SA01" \ -i "pandas.Index.difference PR07,RT03,SA01" \ -i "pandas.Index.drop PR07,SA01" \ -i "pandas.Index.duplicated RT03" \ @@ -113,7 +112,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.get_indexer_for PR01,SA01" \ -i "pandas.Index.get_indexer_non_unique PR07,SA01" \ -i "pandas.Index.get_loc PR07,RT03,SA01" \ - -i "pandas.Index.get_slice_bound PR07" \ -i "pandas.Index.identical PR01,SA01" \ -i "pandas.Index.insert PR07,RT03,SA01" \ -i "pandas.Index.intersection PR07,RT03,SA01" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ebdaaf4be8419..9acab2642f6be 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1262,12 +1262,19 @@ def copy( name : Label, optional Set name for new object. deep : bool, default False + If True attempts to make a deep copy of the Index. + Else makes a shallow copy. Returns ------- Index Index refer to new object which is a copy of this object. + See Also + -------- + Index.delete: Make new Index with passed location(-s) deleted. + Index.drop: Make new Index with passed list of labels deleted. 
+ Notes ----- In most cases, there should be no functional difference from using @@ -6398,7 +6405,10 @@ def get_slice_bound(self, label, side: Literal["left", "right"]) -> int: Parameters ---------- label : object + The label for which to calculate the slice bound. side : {'left', 'right'} + if 'left' return leftmost position of given label. + if 'right' return one-past-the-rightmost position of given label. Returns ------- From bd84be4aac6f84926ff00c594d5401da7a3dc068 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= <6618166+twoertwein@users.noreply.github.com> Date: Sun, 28 Apr 2024 23:36:30 -0400 Subject: [PATCH 059/100] TYP: misc return annotations (#58468) --- pandas/io/excel/_xlsxwriter.py | 2 +- pandas/io/pytables.py | 39 ++++++++++++++++++++-------------- pandas/util/_decorators.py | 4 ++-- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 6eacac8c064fb..b2fd24a670300 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -93,7 +93,7 @@ class _XlsxStyler: } @classmethod - def convert(cls, style_dict, num_format_str=None): + def convert(cls, style_dict, num_format_str=None) -> dict[str, Any]: """ converts a style_dict to an xlsxwriter format dict diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d585c59dd5581..5d325397a81ae 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -22,6 +22,7 @@ Final, Literal, cast, + overload, ) import warnings @@ -593,7 +594,7 @@ def __getitem__(self, key: str): def __setitem__(self, key: str, value) -> None: self.put(key, value) - def __delitem__(self, key: str) -> None: + def __delitem__(self, key: str) -> int | None: return self.remove(key) def __getattr__(self, name: str): @@ -1203,7 +1204,7 @@ def put( dropna=dropna, ) - def remove(self, key: str, where=None, start=None, stop=None) -> None: + def remove(self, key: str, where=None, start=None, stop=None) -> int | 
None: """ Remove pandas object partially by specifying the where condition @@ -1251,14 +1252,12 @@ def remove(self, key: str, where=None, start=None, stop=None) -> None: # remove the node if com.all_none(where, start, stop): s.group._f_remove(recursive=True) + return None # delete from the table - else: - if not s.is_table: - raise ValueError( - "can only remove with where on objects written as tables" - ) - return s.delete(where=where, start=start, stop=stop) + if not s.is_table: + raise ValueError("can only remove with where on objects written as tables") + return s.delete(where=where, start=start, stop=stop) def append( self, @@ -2895,7 +2894,7 @@ def read( columns=None, start: int | None = None, stop: int | None = None, - ): + ) -> Series | DataFrame: raise NotImplementedError( "cannot read on an abstract storer: subclasses should implement" ) @@ -2907,7 +2906,7 @@ def write(self, obj, **kwargs) -> None: def delete( self, where=None, start: int | None = None, stop: int | None = None - ) -> None: + ) -> int | None: """ support fully deleting the node in its entirety (only) - where specification must be None @@ -3601,7 +3600,7 @@ def queryables(self) -> dict[str, Any]: return dict(d1 + d2 + d3) - def index_cols(self): + def index_cols(self) -> list[tuple[Any, Any]]: """return a list of my index cols""" # Note: each `i.cname` below is assured to be a str. 
return [(i.axis, i.cname) for i in self.index_axes] @@ -3731,7 +3730,7 @@ def indexables(self): dc = set(self.data_columns) base_pos = len(_indexables) - def f(i, c): + def f(i, c: str) -> DataCol: assert isinstance(c, str) klass = DataCol if c in dc: @@ -3897,7 +3896,7 @@ def get_object(cls, obj, transposed: bool): """return the data for this obj""" return obj - def validate_data_columns(self, data_columns, min_itemsize, non_index_axes): + def validate_data_columns(self, data_columns, min_itemsize, non_index_axes) -> list: """ take the input data_columns and min_itemize and create a data columns spec @@ -4590,7 +4589,9 @@ def write_data_chunk( self.table.append(rows) self.table.flush() - def delete(self, where=None, start: int | None = None, stop: int | None = None): + def delete( + self, where=None, start: int | None = None, stop: int | None = None + ) -> int | None: # delete all rows (and return the nrows) if where is None or not len(where): if start is None and stop is None: @@ -4918,7 +4919,7 @@ def read( columns=None, start: int | None = None, stop: int | None = None, - ): + ) -> DataFrame: df = super().read(where=where, columns=columns, start=start, stop=stop) df = df.set_index(self.levels) @@ -5379,7 +5380,13 @@ def __init__( if self.terms is not None: self.condition, self.filter = self.terms.evaluate() - def generate(self, where): + @overload + def generate(self, where: dict | list | tuple | str) -> PyTablesExpr: ... + + @overload + def generate(self, where: None) -> None: ... 
+ + def generate(self, where: dict | list | tuple | str | None) -> PyTablesExpr | None: """where can be a : dict,list,tuple,string""" if where is None: return None diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index d287fa72d552d..bdfb0b1cad8ae 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -505,7 +505,7 @@ def indent(text: str | None, indents: int = 1) -> str: ] -def set_module(module): +def set_module(module) -> Callable[[F], F]: """Private decorator for overriding __module__ on a function or class. Example usage:: @@ -518,7 +518,7 @@ def example(): assert example.__module__ == "pandas" """ - def decorator(func): + def decorator(func: F) -> F: if module is not None: func.__module__ = module return func From 1593fb9f024156b0e69c8a82a0d472720d5c055e Mon Sep 17 00:00:00 2001 From: shriyakalakata <87483933+shriyakalakata@users.noreply.github.com> Date: Sun, 28 Apr 2024 23:52:32 -0400 Subject: [PATCH 060/100] Fix errors for Index.drop, Index.reindex (#58454) --- ci/code_checks.sh | 2 -- pandas/core/indexes/base.py | 8 ++++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 26c8ae1298630..c06277d66f7a9 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -106,7 +106,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index PR07" \ -i "pandas.Index.append PR07,RT03,SA01" \ -i "pandas.Index.difference PR07,RT03,SA01" \ - -i "pandas.Index.drop PR07,SA01" \ -i "pandas.Index.duplicated RT03" \ -i "pandas.Index.get_indexer PR07,SA01" \ -i "pandas.Index.get_indexer_for PR01,SA01" \ @@ -120,7 +119,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.nunique RT03" \ -i "pandas.Index.putmask PR01,RT03" \ -i "pandas.Index.ravel PR01,RT03" \ - -i "pandas.Index.reindex PR07" \ -i "pandas.Index.slice_indexer PR07,RT03,SA01" \ -i "pandas.Index.str PR01,SA01" \ -i "pandas.Index.symmetric_difference PR07,RT03,SA01" \ diff --git 
a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9acab2642f6be..8ea844d72326c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3953,6 +3953,7 @@ def reindex( Parameters ---------- target : an iterable + An iterable containing the values to be used for creating the new index. method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional * default: exact matches only. * pad / ffill: find the PREVIOUS index value if no exact match. @@ -6686,6 +6687,8 @@ def drop( Parameters ---------- labels : array-like or scalar + Array-like object or a scalar value, representing the labels to be removed + from the Index. errors : {'ignore', 'raise'}, default 'raise' If 'ignore', suppress error and existing labels are dropped. @@ -6699,6 +6702,11 @@ def drop( KeyError If not all of the labels are found in the selected axis + See Also + -------- + Index.dropna : Return Index without NA/NaN values. + Index.drop_duplicates : Return Index with duplicate values removed. 
+ Examples -------- >>> idx = pd.Index(["a", "b", "c"]) From cf0014ad9f7bfccac3cfb87cb66556825dba0bea Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 29 Apr 2024 09:37:59 +0530 Subject: [PATCH 061/100] DOC: Enforce Numpy Docstring Validation for pandas.DatetimeIndex.to_pydatetime (#58441) * DOC: add RT03,SA01 for pandas.DatetimeIndex.to_pydatetime * DOC: remove RT03,SA01 for pandas.DatetimeIndex.to_pydatetime --- ci/code_checks.sh | 1 - pandas/core/arrays/datetimes.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c06277d66f7a9..2b418d6655b0b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -101,7 +101,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DatetimeIndex.indexer_at_time PR01,RT03" \ -i "pandas.DatetimeIndex.snap PR01,RT03" \ -i "pandas.DatetimeIndex.to_period RT03" \ - -i "pandas.DatetimeIndex.to_pydatetime RT03,SA01" \ -i "pandas.Grouper PR02" \ -i "pandas.Index PR07" \ -i "pandas.Index.append PR07,RT03,SA01" \ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 0f59d62339bf2..b5048973755bc 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1127,6 +1127,12 @@ def to_pydatetime(self) -> npt.NDArray[np.object_]: Returns ------- numpy.ndarray + An ndarray of ``datetime.datetime`` objects. + + See Also + -------- + DatetimeIndex.to_julian_date : Converts Datetime Array to float64 ndarray + of Julian Dates. 
Examples -------- From a2bce66d04ed2addfb9782f0e824c60bc0b1b449 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 29 Apr 2024 05:28:46 -0700 Subject: [PATCH 062/100] REF: move MaskedArray subclass attributes to dtypes (#58423) --- pandas/_libs/lib.pyx | 4 +-- pandas/core/arrays/boolean.py | 10 ++---- pandas/core/arrays/floating.py | 10 ++---- pandas/core/arrays/integer.py | 10 ++---- pandas/core/arrays/masked.py | 60 +++++++++++++--------------------- pandas/core/arrays/numeric.py | 2 +- pandas/core/dtypes/dtypes.py | 20 ++++++++++++ 7 files changed, 53 insertions(+), 63 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5b6d83ba8e9ee..4fd68a1593e49 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2808,14 +2808,14 @@ def maybe_convert_objects(ndarray[object] objects, from pandas.core.arrays import IntegerArray # Set these values to 1 to be deterministic, match - # IntegerArray._internal_fill_value + # IntegerDtype._internal_fill_value result[mask] = 1 result = IntegerArray(result, mask) elif result is floats and convert_to_nullable_dtype: from pandas.core.arrays import FloatingArray # Set these values to 1.0 to be deterministic, match - # FloatingArray._internal_fill_value + # FloatingDtype._internal_fill_value result[mask] = 1.0 result = FloatingArray(result, mask) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 813b10eef5e4b..a326925545045 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -68,6 +68,9 @@ class BooleanDtype(BaseMaskedDtype): name: ClassVar[str] = "boolean" + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = False + # https://github.com/python/mypy/issues/4125 # error: Signature of "type" incompatible with supertype "BaseMaskedDtype" @property @@ -293,13 +296,6 @@ class BooleanArray(BaseMaskedArray): Length: 3, dtype: boolean """ - # The value used to fill '_data' to avoid upcasting - _internal_fill_value = False 
- # Fill values used for any/all - # Incompatible types in assignment (expression has type "bool", base class - # "BaseMaskedArray" defined the type as "") - _truthy_value = True # type: ignore[assignment] - _falsey_value = False # type: ignore[assignment] _TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"} _FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"} diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 653e63e9d1e2d..b3fbf0f92c32d 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -23,6 +23,8 @@ class FloatingDtype(NumericDtype): The attributes name & type are set when these subclasses are created. """ + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = np.nan _default_np_dtype = np.dtype(np.float64) _checker = is_float_dtype @@ -113,14 +115,6 @@ class FloatingArray(NumericArray): _dtype_cls = FloatingDtype - # The value used to fill '_data' to avoid upcasting - _internal_fill_value = np.nan - # Fill values used for any/all - # Incompatible types in assignment (expression has type "float", base class - # "BaseMaskedArray" defined the type as "") - _truthy_value = 1.0 # type: ignore[assignment] - _falsey_value = 0.0 # type: ignore[assignment] - _dtype_docstring = """ An ExtensionDtype for {dtype} data. diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index dc453f3e37c50..21a9b09227663 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -23,6 +23,8 @@ class IntegerDtype(NumericDtype): The attributes name & type are set when these subclasses are created. 
""" + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = 1 _default_np_dtype = np.dtype(np.int64) _checker = is_integer_dtype @@ -128,14 +130,6 @@ class IntegerArray(NumericArray): _dtype_cls = IntegerDtype - # The value used to fill '_data' to avoid upcasting - _internal_fill_value = 1 - # Fill values used for any/all - # Incompatible types in assignment (expression has type "int", base class - # "BaseMaskedArray" defined the type as "") - _truthy_value = 1 # type: ignore[assignment] - _falsey_value = 0 # type: ignore[assignment] - _dtype_docstring = """ An ExtensionDtype for {dtype} integer data. diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 190888d281ea9..df794183f67d1 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -5,6 +5,7 @@ Any, Callable, Literal, + cast, overload, ) import warnings @@ -16,22 +17,6 @@ missing as libmissing, ) from pandas._libs.tslibs import is_supported_dtype -from pandas._typing import ( - ArrayLike, - AstypeArg, - AxisInt, - DtypeObj, - FillnaOptions, - InterpolateOptions, - NpDtype, - PositionalIndexer, - Scalar, - ScalarIndexer, - Self, - SequenceIndexer, - Shape, - npt, -) from pandas.compat import ( IS64, is_platform_windows, @@ -97,6 +82,20 @@ from pandas._typing import ( NumpySorter, NumpyValueArrayLike, + ArrayLike, + AstypeArg, + AxisInt, + DtypeObj, + FillnaOptions, + InterpolateOptions, + NpDtype, + PositionalIndexer, + Scalar, + ScalarIndexer, + Self, + SequenceIndexer, + Shape, + npt, ) from pandas._libs.missing import NAType from pandas.core.arrays import FloatingArray @@ -111,16 +110,10 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): numpy based """ - # The value used to fill '_data' to avoid upcasting - _internal_fill_value: Scalar # our underlying data and mask are each ndarrays _data: np.ndarray _mask: npt.NDArray[np.bool_] - # Fill values used for any/all - _truthy_value = Scalar # bool(_truthy_value) = True - _falsey_value = 
Scalar # bool(_falsey_value) = False - @classmethod def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self: result = BaseMaskedArray.__new__(cls) @@ -155,8 +148,9 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: @classmethod @doc(ExtensionArray._empty) def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self: - values = np.empty(shape, dtype=dtype.type) - values.fill(cls._internal_fill_value) + dtype = cast(BaseMaskedDtype, dtype) + values: np.ndarray = np.empty(shape, dtype=dtype.type) + values.fill(dtype._internal_fill_value) mask = np.ones(shape, dtype=bool) result = cls(values, mask) if not isinstance(result, cls) or dtype != result.dtype: @@ -917,7 +911,9 @@ def take( ) -> Self: # we always fill with 1 internally # to avoid upcasting - data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value + data_fill_value = ( + self.dtype._internal_fill_value if isna(fill_value) else fill_value + ) result = take( self._data, indexer, @@ -1397,12 +1393,7 @@ def any( nv.validate_any((), kwargs) values = self._data.copy() - # error: Argument 3 to "putmask" has incompatible type "object"; - # expected "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], - # bool, int, float, complex, str, bytes, - # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" - np.putmask(values, self._mask, self._falsey_value) # type: ignore[arg-type] + np.putmask(values, self._mask, self.dtype._falsey_value) result = values.any() if skipna: return result @@ -1490,12 +1481,7 @@ def all( nv.validate_all((), kwargs) values = self._data.copy() - # error: Argument 3 to "putmask" has incompatible type "object"; - # expected "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], - # bool, int, float, complex, str, bytes, - # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" - np.putmask(values, self._mask, self._truthy_value) # type: 
ignore[arg-type] + np.putmask(values, self._mask, self.dtype._truthy_value) result = values.all(axis=axis) if skipna: diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index fe7b32ec9652e..c5e9ed8698ffe 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -221,7 +221,7 @@ def _coerce_to_data_and_mask( # we copy as need to coerce here if mask.any(): values = values.copy() - values[mask] = cls._internal_fill_value + values[mask] = dtype_cls._internal_fill_value if inferred_type in ("string", "unicode"): # casts from str are always safe since they raise # a ValueError if the str cannot be parsed into a float diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 778b6bd6f3f18..8c64a38bc1be3 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -79,6 +79,7 @@ DtypeObj, IntervalClosedType, Ordered, + Scalar, Self, npt, type_t, @@ -1551,6 +1552,25 @@ class BaseMaskedDtype(ExtensionDtype): base = None type: type + _internal_fill_value: Scalar + + @property + def _truthy_value(self): + # Fill values used for 'all' + if self.kind == "f": + return 1.0 + if self.kind in "iu": + return 1 + return True + + @property + def _falsey_value(self): + # Fill values used for 'any' + if self.kind == "f": + return 0.0 + if self.kind in "iu": + return 0 + return False @property def na_value(self) -> libmissing.NAType: From 2a7ad2e274c751015f8daf33ccba551770d53b55 Mon Sep 17 00:00:00 2001 From: shriyakalakata <87483933+shriyakalakata@users.noreply.github.com> Date: Mon, 29 Apr 2024 13:30:39 -0400 Subject: [PATCH 063/100] Fix PR07,RT03,SA01 errors for Index.insert, Index.intersection (#58456) * Fix PR07,RT03,SA01 errors for Index.insert, Index.intersection * Update pandas/core/indexes/base.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/core/indexes/base.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * 
Update pandas/core/indexes/base.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- ci/code_checks.sh | 2 -- pandas/core/indexes/base.py | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2b418d6655b0b..22f12ac0312d1 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -111,8 +111,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.get_indexer_non_unique PR07,SA01" \ -i "pandas.Index.get_loc PR07,RT03,SA01" \ -i "pandas.Index.identical PR01,SA01" \ - -i "pandas.Index.insert PR07,RT03,SA01" \ - -i "pandas.Index.intersection PR07,RT03,SA01" \ -i "pandas.Index.join PR07,RT03,SA01" \ -i "pandas.Index.names GL08" \ -i "pandas.Index.nunique RT03" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8ea844d72326c..f0ac8604ccd60 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3082,6 +3082,8 @@ def intersection(self, other, sort: bool = False): Parameters ---------- other : Index or array-like + An Index or an array-like object containing elements to form the + intersection with the original Index. sort : True, False or None, default False Whether to sort the resulting index. @@ -3093,6 +3095,14 @@ def intersection(self, other, sort: bool = False): Returns ------- Index + Returns a new Index object with elements common to both the original Index + and the `other` Index. + + See Also + -------- + Index.union : Form the union of two Index objects. + Index.difference : Return a new Index with elements of index not in other. + Index.isin : Return a boolean array where the index values are in values. Examples -------- @@ -6625,11 +6635,19 @@ def insert(self, loc: int, item) -> Index: Parameters ---------- loc : int + The integer location where the new item will be inserted. 
item : object + The new item to be inserted into the Index. Returns ------- Index + Returns a new Index object resulting from inserting the specified item at + the specified location within the original Index. + + See Also + -------- + Index.append : Append a collection of Indexes together. Examples -------- From 6af69a0dd14ca9e8b9ba8bb027c73009f0ec3377 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 29 Apr 2024 23:07:09 +0530 Subject: [PATCH 064/100] DOC: Enforce Numpy Docstring Validation for pandas.DatetimeIndex.to_period (#58440) * DOC: add RT03 for pandas.DatetimeIndex.to_period * DOC: remove RT03 for pandas.DatetimeIndex.to_period * DOC: remove RT03 for pandas.Series.dt.to_period --- ci/code_checks.sh | 3 +-- pandas/core/arrays/datetimes.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 22f12ac0312d1..f22bfe85c5c81 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -100,7 +100,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.var PR01,RT03,SA01" \ -i "pandas.DatetimeIndex.indexer_at_time PR01,RT03" \ -i "pandas.DatetimeIndex.snap PR01,RT03" \ - -i "pandas.DatetimeIndex.to_period RT03" \ -i "pandas.Grouper PR02" \ -i "pandas.Index PR07" \ -i "pandas.Index.append PR07,RT03,SA01" \ @@ -242,7 +241,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.round PR01,PR02" \ -i "pandas.Series.dt.seconds SA01" \ -i "pandas.Series.dt.strftime PR01,PR02" \ - -i "pandas.Series.dt.to_period PR01,PR02,RT03" \ + -i "pandas.Series.dt.to_period PR01,PR02" \ -i "pandas.Series.dt.total_seconds PR01" \ -i "pandas.Series.dt.tz_convert PR01,PR02" \ -i "pandas.Series.dt.tz_localize PR01,PR02" \ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index b5048973755bc..8747f795bebd8 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1207,6 +1207,7 @@ def to_period(self, freq=None) -> 
PeriodArray: Returns ------- PeriodArray/PeriodIndex + Immutable ndarray holding ordinal values at a particular frequency. Raises ------ From 95178690289e3c7278457e31aa289c9c88c77546 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 29 Apr 2024 23:21:38 +0530 Subject: [PATCH 065/100] DOC: Enforce Numpy Docstring Validation for pandas.DatetimeIndex.indexer_at_time (#58476) * DOC: add PR01,RT03 for pandas.DatetimeIndex.indexer_at_time * DOC: remove pandas.DatetimeIndex.indexer_at_time --- ci/code_checks.sh | 1 - pandas/core/indexes/datetimes.py | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f22bfe85c5c81..58ecae66e1bcc 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -98,7 +98,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.swaplevel SA01" \ -i "pandas.DataFrame.to_markdown SA01" \ -i "pandas.DataFrame.var PR01,RT03,SA01" \ - -i "pandas.DatetimeIndex.indexer_at_time PR01,RT03" \ -i "pandas.DatetimeIndex.snap PR01,RT03" \ -i "pandas.Grouper PR02" \ -i "pandas.Index PR07" \ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 951455b627fbd..742f66aa80728 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -703,10 +703,13 @@ def indexer_at_time(self, time, asof: bool = False) -> npt.NDArray[np.intp]: Time passed in either as object (datetime.time) or as string in appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"). + asof : bool, default False + This parameter is currently not supported. Returns ------- np.ndarray[np.intp] + Index locations of values at given `time` of day. 
See Also -------- From 3efe698611f43f5625694cf0d1d00422207eb810 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Apr 2024 08:01:06 -1000 Subject: [PATCH 066/100] PERF: RangeIndex.value_counts/searchsorted/to_numpy (#58376) * Add RangeIndex.value_counts,searchsorted,to_numpy * Undo engine stuff * Finish searchsorted, add wahtsnew * Remove old to_numpy implementation * Add whatsnew for to_numpy * add whatsnew number * Fix typing --- doc/source/whatsnew/v3.0.0.rst | 3 ++ pandas/core/base.py | 4 +- pandas/core/indexes/range.py | 65 +++++++++++++++++++++++ pandas/tests/indexes/ranges/test_range.py | 33 ++++++++++++ 4 files changed, 102 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c77348b365370..517510760e9c1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -340,6 +340,9 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.argmin` and :meth:`RangeIndex.argmax` (:issue:`57823`) - Performance improvement in :meth:`RangeIndex.insert` returning a :class:`RangeIndex` instead of a :class:`Index` when the :class:`RangeIndex` is empty. (:issue:`57833`) - Performance improvement in :meth:`RangeIndex.round` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57824`) +- Performance improvement in :meth:`RangeIndex.searchsorted` (:issue:`58376`) +- Performance improvement in :meth:`RangeIndex.to_numpy` when specifying an ``na_value`` (:issue:`58376`) +- Performance improvement in :meth:`RangeIndex.value_counts` (:issue:`58376`) - Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. 
(:issue:`57647`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) diff --git a/pandas/core/base.py b/pandas/core/base.py index f535f0c55415a..e54fac3da72a6 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -556,7 +556,6 @@ def array(self) -> ExtensionArray: """ raise AbstractMethodError(self) - @final def to_numpy( self, dtype: npt.DTypeLike | None = None, @@ -668,7 +667,7 @@ def to_numpy( ) values = self._values - if fillna: + if fillna and self.hasnans: if not can_hold_element(values, na_value): # if we can't hold the na_value asarray either makes a copy or we # error before modifying values. The asarray later on thus won't make @@ -943,7 +942,6 @@ def _map_values(self, mapper, na_action=None): return algorithms.map_array(arr, mapper, na_action=na_action) - @final def value_counts( self, normalize: bool = False, diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 0ba3c22093c69..bd9e8b84fd82a 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -57,9 +57,13 @@ Dtype, JoinHow, NaPosition, + NumpySorter, Self, npt, ) + + from pandas import Series + _empty_range = range(0) _dtype_int64 = np.dtype(np.int64) @@ -1359,3 +1363,64 @@ def take( # type: ignore[override] taken += self.start return self._shallow_copy(taken, name=self.name) + + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins=None, + dropna: bool = True, + ) -> Series: + from pandas import Series + + if bins is not None: + return super().value_counts( + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + dropna=dropna, + ) + name = "proportion" if normalize else "count" + data: npt.NDArray[np.floating] | npt.NDArray[np.signedinteger] = np.ones( + len(self), dtype=np.int64 + ) + if normalize: + data = data / len(self) + return Series(data, 
index=self.copy(), name=name) + + def searchsorted( # type: ignore[override] + self, + value, + side: Literal["left", "right"] = "left", + sorter: NumpySorter | None = None, + ) -> npt.NDArray[np.intp] | np.intp: + if side not in {"left", "right"} or sorter is not None: + return super().searchsorted(value=value, side=side, sorter=sorter) + + was_scalar = False + if is_scalar(value): + was_scalar = True + array_value = np.array([value]) + else: + array_value = np.asarray(value) + if array_value.dtype.kind not in "iu": + return super().searchsorted(value=value, side=side, sorter=sorter) + + if flip := (self.step < 0): + rng = self._range[::-1] + start = rng.start + step = rng.step + shift = side == "right" + else: + start = self.start + step = self.step + shift = side == "left" + result = (array_value - start - int(shift)) // step + 1 + if flip: + result = len(self) - result + result = np.maximum(np.minimum(result, len(self)), 0) + if was_scalar: + return np.intp(result.item()) + return result.astype(np.intp, copy=False) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 727edb7ae30ad..1f9df30d60c11 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -874,3 +874,36 @@ def test_getitem_integers_return_index(): result = RangeIndex(0, 10, 2, name="foo")[[0, 1, -1]] expected = Index([0, 2, 8], dtype="int64", name="foo") tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize( + "rng", + [ + range(3), + range(0), + range(0, 3, 2), + range(3, -3, -2), + ], +) +def test_value_counts(sort, dropna, ascending, normalize, rng): + ri = RangeIndex(rng, name="A") + result = ri.value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + expected = Index(list(rng), name="A").value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + 
tm.assert_series_equal(result, expected, check_index_type=False) + + +@pytest.mark.parametrize("side", ["left", "right"]) +@pytest.mark.parametrize("value", [0, -5, 5, -3, np.array([-5, -3, 0, 5])]) +def test_searchsorted(side, value): + ri = RangeIndex(-3, 3, 2) + result = ri.searchsorted(value=value, side=side) + expected = Index(list(ri)).searchsorted(value=value, side=side) + if isinstance(value, int): + assert result == expected + else: + tm.assert_numpy_array_equal(result, expected) From 72d06124e1c0dfaac288c0efd7ab595f6d92c075 Mon Sep 17 00:00:00 2001 From: Abel Tavares <121238257+abeltavares@users.noreply.github.com> Date: Mon, 29 Apr 2024 19:04:51 +0100 Subject: [PATCH 067/100] BUG: Series.plot(kind="pie") does not respect ylabel argument (#58254) Co-authored-by: Abel Tavares --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/plotting/_matplotlib/core.py | 3 --- pandas/tests/plotting/frame/test_frame.py | 2 +- pandas/tests/plotting/test_series.py | 2 +- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 517510760e9c1..a81fb584c8df9 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -42,6 +42,7 @@ Other enhancements - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) +- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) .. --------------------------------------------------------------------------- .. 
_whatsnew_300.notable_bug_fixes: diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 38a75e741d60e..fffeb9b82492f 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -2077,9 +2077,6 @@ def _make_plot(self, fig: Figure) -> None: for i, (label, y) in enumerate(self._iter_data(data=self.data)): ax = self._get_ax(i) - if label is not None: - label = pprint_thing(label) - ax.set_ylabel(label) kwds = self.kwds.copy() diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index c30cb96fef252..adb56a40b0071 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -1629,7 +1629,7 @@ def test_pie_df_subplots(self): for ax in axes: _check_text_labels(ax.texts, df.index) for ax, ylabel in zip(axes, df.columns): - assert ax.get_ylabel() == ylabel + assert ax.get_ylabel() == "" def test_pie_df_labels_colors(self): df = DataFrame( diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 9fbc20e10f5c1..54f09c7007330 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -378,7 +378,7 @@ def test_pie_series(self): ) ax = _check_plot_works(series.plot.pie) _check_text_labels(ax.texts, series.index) - assert ax.get_ylabel() == "YLABEL" + assert ax.get_ylabel() == "" def test_pie_series_no_label(self): series = Series( From d038da86c37e51fd104f00ce85fde7e620c31b1f Mon Sep 17 00:00:00 2001 From: Jason Mok <106209849+jasonmokk@users.noreply.github.com> Date: Mon, 29 Apr 2024 13:49:51 -0500 Subject: [PATCH 068/100] TST: Add tests for #55431 (#58367) * Add tests for #55431 * Fix inconsistent pandas namespace usage * Fix inconsistent pandas namespace usage again * Temp disable part of test potentialy due to known bug * Remove unnecessary comments and adjust implementation --------- Co-authored-by: Jason Mok Co-authored-by: Matthew Roeschke 
<10647082+mroeschke@users.noreply.github.com> --- pandas/tests/reshape/test_cut.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 0811c69859c0d..340c5c449aea7 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -789,3 +789,17 @@ def test_cut_with_nullable_int64(): result = cut(series, bins=bins) tm.assert_series_equal(result, expected) + + +def test_cut_datetime_array_no_attributeerror(): + # GH 55431 + ser = Series(to_datetime(["2023-10-06 12:00:00+0000", "2023-10-07 12:00:00+0000"])) + + result = cut(ser.array, bins=2) + + categories = result.categories + expected = Categorical.from_codes([0, 1], categories=categories, ordered=True) + + tm.assert_categorical_equal( + result, expected, check_dtype=True, check_category_order=True + ) From 4c6d9eb4b0037804204e63809e885e4f207b7894 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 30 Apr 2024 00:21:17 +0530 Subject: [PATCH 069/100] DOC: Enforce Numpy Docstring Validation for pandas.DatetimeIndex.snap (#58477) * DOC: add PR01,RT03 for pandas.DatetimeIndex.snap * DOC: remove pandas.DatetimeIndex.snap --- ci/code_checks.sh | 1 - pandas/core/indexes/datetimes.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 58ecae66e1bcc..ce53c9fca60e0 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -98,7 +98,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.swaplevel SA01" \ -i "pandas.DataFrame.to_markdown SA01" \ -i "pandas.DataFrame.var PR01,RT03,SA01" \ - -i "pandas.DatetimeIndex.snap PR01,RT03" \ -i "pandas.Grouper PR02" \ -i "pandas.Index PR07" \ -i "pandas.Index.append PR07,RT03,SA01" \ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 742f66aa80728..78f04f57029b1 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ 
-451,9 +451,17 @@ def snap(self, freq: Frequency = "S") -> DatetimeIndex: """ Snap time stamps to nearest occurring frequency. + Parameters + ---------- + freq : str, Timedelta, datetime.timedelta, or DateOffset, default 'S' + Frequency strings can have multiples, e.g. '5h'. See + :ref:`here <timeseries.offset_aliases>` for a list of + frequency aliases. + Returns ------- DatetimeIndex + Time stamps to nearest occurring `freq`. See Also -------- From 4de300da14bd03da3bd759bebdfcc65570d68094 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Mon, 29 Apr 2024 21:54:12 +0300 Subject: [PATCH 070/100] DOC: Update Categorical/CategoricalDtype methods' docstring to pass docstring validation (#58079) * Improve docstring for some methods in categorical/categoricaldtype * Remove cat.ordered --------- Co-authored-by: Abdulaziz Aloqeely <52792999+DAzVise@users.noreply.github.com> --- ci/code_checks.sh | 10 ---------- pandas/core/arrays/categorical.py | 28 ++++++++++++++++++++++++++++ pandas/core/dtypes/dtypes.py | 8 ++++++++ 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ce53c9fca60e0..1724fae98a6e5 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -70,15 +70,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then --format=actions \ -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ - -i "pandas.Categorical.__array__ SA01" \ - -i "pandas.Categorical.codes SA01" \ - -i "pandas.Categorical.dtype SA01" \ - -i "pandas.Categorical.from_codes SA01" \ - -i "pandas.Categorical.ordered SA01" \ - -i "pandas.CategoricalDtype.categories SA01" \ - -i "pandas.CategoricalDtype.ordered SA01" \ - -i "pandas.CategoricalIndex.codes SA01" \ - -i "pandas.CategoricalIndex.ordered SA01" \ -i "pandas.DataFrame.__dataframe__ SA01" \ -i "pandas.DataFrame.at_time 
PR01" \ -i "pandas.DataFrame.kurt RT03,SA01" \ @@ -215,7 +206,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.cat.as_ordered PR01" \ -i "pandas.Series.cat.as_unordered PR01" \ -i "pandas.Series.cat.codes SA01" \ - -i "pandas.Series.cat.ordered SA01" \ -i "pandas.Series.cat.remove_categories PR01,PR02" \ -i "pandas.Series.cat.remove_unused_categories PR01" \ -i "pandas.Series.cat.rename_categories PR01,PR02" \ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6a3cf4590568c..11dea697d9b93 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -497,6 +497,11 @@ def dtype(self) -> CategoricalDtype: """ The :class:`~pandas.api.types.CategoricalDtype` for this instance. + See Also + -------- + astype : Cast argument to a specified dtype. + CategoricalDtype : Type for categorical data. + Examples -------- >>> cat = pd.Categorical(["a", "b"], ordered=True) @@ -721,6 +726,11 @@ def from_codes( ------- Categorical + See Also + -------- + codes : The category codes of the categorical. + CategoricalIndex : An Index with an underlying ``Categorical``. + Examples -------- >>> dtype = pd.CategoricalDtype(["a", "b"], ordered=True) @@ -810,6 +820,12 @@ def ordered(self) -> Ordered: """ Whether the categories have an ordered relationship. + See Also + -------- + set_ordered : Set the ordered attribute. + as_ordered : Set the Categorical to be ordered. + as_unordered : Set the Categorical to be unordered. + Examples -------- For :class:`pandas.Series`: @@ -861,6 +877,11 @@ def codes(self) -> np.ndarray: ndarray[int] A non-writable view of the ``codes`` array. + See Also + -------- + Categorical.from_codes : Make a Categorical from codes. + CategoricalIndex : An Index with an underlying ``Categorical``. + Examples -------- For :class:`pandas.Categorical`: @@ -1641,6 +1662,9 @@ def __array__( """ The numpy array interface. + Users should not call this directly. 
Rather, it is invoked by + :func:`numpy.array` and :func:`numpy.asarray`. + Parameters ---------- dtype : np.dtype or None @@ -1656,6 +1680,10 @@ def __array__( if dtype==None (default), the same dtype as categorical.categories.dtype. + See Also + -------- + numpy.asarray : Convert input to numpy.ndarray. + Examples -------- diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 8c64a38bc1be3..e52cbff451700 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -623,6 +623,10 @@ def categories(self) -> Index: """ An ``Index`` containing the unique categories allowed. + See Also + -------- + ordered : Whether the categories have an ordered relationship. + Examples -------- >>> cat_type = pd.CategoricalDtype(categories=["a", "b"], ordered=True) @@ -636,6 +640,10 @@ def ordered(self) -> Ordered: """ Whether the categories have an ordered relationship. + See Also + -------- + categories : An Index containing the unique categories allowed. + Examples -------- >>> cat_type = pd.CategoricalDtype(categories=["a", "b"], ordered=True) From 2246a78e2a615207ee208bfa4cc3339a67214035 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20Smr=C5=BE?= Date: Mon, 29 Apr 2024 22:14:06 +0200 Subject: [PATCH 071/100] `pd.eval`: `Series` names are now preserved even for `"numexpr"` engine. (#58437) * Eval: Series names are preserved for numexpr Series names are now preserved even when using numexpr engine. Making the behavior consistent with python engine. 
* Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/core/computation/align.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/tests/computation/test_eval.py --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/computation/align.py | 19 ++++++++---- pandas/core/computation/engines.py | 11 +++++-- pandas/tests/computation/test_eval.py | 43 +++++++++++++++------------ pandas/tests/frame/test_query_eval.py | 16 ++++++---- 5 files changed, 57 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a81fb584c8df9..6ae3a8e00c02f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -469,6 +469,7 @@ Styler Other ^^^^^ - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) +- Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow to use ``tan`` function. 
(:issue:`55091`) diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index c5562fb0284b7..b4e33b8ac75cb 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -160,19 +160,24 @@ def align_terms(terms): # can't iterate so it must just be a constant or single variable if isinstance(terms.value, (ABCSeries, ABCDataFrame)): typ = type(terms.value) - return typ, _zip_axes_from_type(typ, terms.value.axes) - return np.result_type(terms.type), None + name = terms.value.name if isinstance(terms.value, ABCSeries) else None + return typ, _zip_axes_from_type(typ, terms.value.axes), name + return np.result_type(terms.type), None, None # if all resolved variables are numeric scalars if all(term.is_scalar for term in terms): - return result_type_many(*(term.value for term in terms)).type, None + return result_type_many(*(term.value for term in terms)).type, None, None + + # if all input series have a common name, propagate it to the returned series + names = {term.value.name for term in terms if isinstance(term.value, ABCSeries)} + name = names.pop() if len(names) == 1 else None # perform the main alignment typ, axes = _align_core(terms) - return typ, axes + return typ, axes, name -def reconstruct_object(typ, obj, axes, dtype): +def reconstruct_object(typ, obj, axes, dtype, name): """ Reconstruct an object given its type, raw value, and possibly empty (None) axes. 
@@ -200,7 +205,9 @@ def reconstruct_object(typ, obj, axes, dtype): res_t = np.result_type(obj.dtype, dtype) if not isinstance(typ, partial) and issubclass(typ, PandasObject): - return typ(obj, dtype=res_t, **axes) + if name is None: + return typ(obj, dtype=res_t, **axes) + return typ(obj, dtype=res_t, name=name, **axes) # special case for pathological things like ~True/~False if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_: diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index 5db05ebe33efd..d2a181cbb3c36 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -54,6 +54,7 @@ def __init__(self, expr) -> None: self.expr = expr self.aligned_axes = None self.result_type = None + self.result_name = None def convert(self) -> str: """ @@ -76,12 +77,18 @@ def evaluate(self) -> object: The result of the passed expression. """ if not self._is_aligned: - self.result_type, self.aligned_axes = align_terms(self.expr.terms) + self.result_type, self.aligned_axes, self.result_name = align_terms( + self.expr.terms + ) # make sure no names in resolvers and locals/globals clash res = self._evaluate() return reconstruct_object( - self.result_type, res, self.aligned_axes, self.expr.terms.return_type + self.result_type, + res, + self.aligned_axes, + self.expr.terms.return_type, + self.result_name, ) @property diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index ebbb31205e264..d8e5908b0c58f 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -737,6 +737,17 @@ def test_and_logic_string_match(self): assert pd.eval(f"{event.str.match('hello').a}") assert pd.eval(f"{event.str.match('hello').a and event.str.match('hello').a}") + def test_eval_keep_name(self, engine, parser): + df = Series([2, 15, 28], name="a").to_frame() + res = df.eval("a + a", engine=engine, parser=parser) + expected = Series([4, 30, 56], 
name="a") + tm.assert_series_equal(expected, res) + + def test_eval_unmatching_names(self, engine, parser): + variable_name = Series([42], name="series_name") + res = pd.eval("variable_name + 0", engine=engine, parser=parser) + tm.assert_series_equal(variable_name, res) + # ------------------------------------- # gh-12388: Typecasting rules consistency with python @@ -1269,14 +1280,12 @@ def test_assignment_explicit(self): expected["c"] = expected["a"] + expected["b"] tm.assert_frame_equal(df, expected) - def test_column_in(self): + def test_column_in(self, engine): # GH 11235 df = DataFrame({"a": [11], "b": [-32]}) - result = df.eval("a in [11, -32]") - expected = Series([True]) - # TODO: 2022-01-29: Name check failed with numexpr 2.7.3 in CI - # but cannot reproduce locally - tm.assert_series_equal(result, expected, check_names=False) + result = df.eval("a in [11, -32]", engine=engine) + expected = Series([True], name="a") + tm.assert_series_equal(result, expected) @pytest.mark.xfail(reason="Unknown: Omitted test_ in name prior.") def test_assignment_not_inplace(self): @@ -1505,7 +1514,7 @@ def test_date_boolean(self, engine, parser): parser=parser, ) expec = df.dates1 < "20130101" - tm.assert_series_equal(res, expec, check_names=False) + tm.assert_series_equal(res, expec) def test_simple_in_ops(self, engine, parser): if parser != "python": @@ -1620,7 +1629,7 @@ def test_unary_functions(self, fn, engine, parser): got = self.eval(expr, engine=engine, parser=parser) with np.errstate(all="ignore"): expect = getattr(np, fn)(a) - tm.assert_series_equal(got, expect, check_names=False) + tm.assert_series_equal(got, expect) @pytest.mark.parametrize("fn", _binary_math_ops) def test_binary_functions(self, fn, engine, parser): @@ -1637,7 +1646,7 @@ def test_binary_functions(self, fn, engine, parser): got = self.eval(expr, engine=engine, parser=parser) with np.errstate(all="ignore"): expect = getattr(np, fn)(a, b) - tm.assert_almost_equal(got, expect, check_names=False) + 
tm.assert_almost_equal(got, expect) def test_df_use_case(self, engine, parser): df = DataFrame( @@ -1653,8 +1662,8 @@ def test_df_use_case(self, engine, parser): inplace=True, ) got = df.e - expect = np.arctan2(np.sin(df.a), df.b) - tm.assert_series_equal(got, expect, check_names=False) + expect = np.arctan2(np.sin(df.a), df.b).rename("e") + tm.assert_series_equal(got, expect) def test_df_arithmetic_subexpression(self, engine, parser): df = DataFrame( @@ -1665,8 +1674,8 @@ def test_df_arithmetic_subexpression(self, engine, parser): ) df.eval("e = sin(a + b)", engine=engine, parser=parser, inplace=True) got = df.e - expect = np.sin(df.a + df.b) - tm.assert_series_equal(got, expect, check_names=False) + expect = np.sin(df.a + df.b).rename("e") + tm.assert_series_equal(got, expect) @pytest.mark.parametrize( "dtype, expect_dtype", @@ -1690,10 +1699,10 @@ def test_result_types(self, dtype, expect_dtype, engine, parser): assert df.a.dtype == dtype df.eval("b = sin(a)", engine=engine, parser=parser, inplace=True) got = df.b - expect = np.sin(df.a) + expect = np.sin(df.a).rename("b") assert expect.dtype == got.dtype assert expect_dtype == got.dtype - tm.assert_series_equal(got, expect, check_names=False) + tm.assert_series_equal(got, expect) def test_undefined_func(self, engine, parser): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10)}) @@ -1898,10 +1907,6 @@ def test_equals_various(other): df = DataFrame({"A": ["a", "b", "c"]}, dtype=object) result = df.eval(f"A == {other}") expected = Series([False, False, False], name="A") - if USE_NUMEXPR: - # https://github.com/pandas-dev/pandas/issues/10239 - # lose name with numexpr engine. Remove when that's fixed. 
- expected.name = None tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 94e8e469f21e7..643d342b052a4 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -58,26 +58,26 @@ def test_query_default(self, df, expected1, expected2): result = df.query("A>0") tm.assert_frame_equal(result, expected1) result = df.eval("A+1") - tm.assert_series_equal(result, expected2, check_names=False) + tm.assert_series_equal(result, expected2) def test_query_None(self, df, expected1, expected2): result = df.query("A>0", engine=None) tm.assert_frame_equal(result, expected1) result = df.eval("A+1", engine=None) - tm.assert_series_equal(result, expected2, check_names=False) + tm.assert_series_equal(result, expected2) def test_query_python(self, df, expected1, expected2): result = df.query("A>0", engine="python") tm.assert_frame_equal(result, expected1) result = df.eval("A+1", engine="python") - tm.assert_series_equal(result, expected2, check_names=False) + tm.assert_series_equal(result, expected2) def test_query_numexpr(self, df, expected1, expected2): if NUMEXPR_INSTALLED: result = df.query("A>0", engine="numexpr") tm.assert_frame_equal(result, expected1) result = df.eval("A+1", engine="numexpr") - tm.assert_series_equal(result, expected2, check_names=False) + tm.assert_series_equal(result, expected2) else: msg = ( r"'numexpr' is not installed or an unsupported version. 
" @@ -194,8 +194,12 @@ def test_using_numpy(self, engine, parser): df = Series([0.2, 1.5, 2.8], name="a").to_frame() res = df.eval("@np.floor(a)", engine=engine, parser=parser) expected = np.floor(df["a"]) - if engine == "numexpr": - expected.name = None # See GH 58069 + tm.assert_series_equal(expected, res) + + def test_eval_simple(self, engine, parser): + df = Series([0.2, 1.5, 2.8], name="a").to_frame() + res = df.eval("a", engine=engine, parser=parser) + expected = df["a"] tm.assert_series_equal(expected, res) From 9d3747f3b44ba7444c228d429217c1424a812380 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 29 Apr 2024 13:14:34 -0700 Subject: [PATCH 072/100] DEPR: Series setitem/getitem treating ints as positional (#58089) * DEPR: Series setitem/getitem treating ints as positional * 32bit build compat * update exception message for numpy 2 --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/series.py | 89 ++----------------- pandas/tests/copy_view/test_indexing.py | 11 +-- pandas/tests/extension/base/getitem.py | 9 +- pandas/tests/indexing/test_coercion.py | 12 +-- pandas/tests/indexing/test_floats.py | 15 ++-- pandas/tests/series/indexing/test_datetime.py | 7 -- pandas/tests/series/indexing/test_get.py | 26 ++---- pandas/tests/series/indexing/test_getitem.py | 75 ++++++---------- pandas/tests/series/indexing/test_indexing.py | 23 ++--- pandas/tests/series/indexing/test_setitem.py | 30 +++---- 11 files changed, 80 insertions(+), 218 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 6ae3a8e00c02f..66dafecffeb01 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -221,6 +221,7 @@ Removal of prior version deprecations/changes - :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`) - All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`) - All arguments except the first 
``path``-like argument in IO writers are now keyword only (:issue:`54229`) +- Changed behavior of :meth:`Series.__getitem__` and :meth:`Series.__setitem__` to always treat integer keys as labels, never as positional, consistent with :class:`DataFrame` behavior (:issue:`50617`) - Disallow allowing logical operations (``||``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``); wrap the objects in :class:`Series`, :class:`Index`, or ``np.array`` first instead (:issue:`52264`) - Disallow automatic casting to object in :class:`Series` logical operations (``&``, ``^``, ``||``) between series with mismatched indexes and dtypes other than ``object`` or ``bool`` (:issue:`52538`) - Disallow calling :meth:`Series.replace` or :meth:`DataFrame.replace` without a ``value`` and with non-dict-like ``to_replace`` (:issue:`33302`) diff --git a/pandas/core/series.py b/pandas/core/series.py index c1920312489c9..8a26d52bb5df1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -901,19 +901,9 @@ def __getitem__(self, key): if isinstance(key, (list, tuple)): key = unpack_1tuple(key) - if is_integer(key) and self.index._should_fallback_to_positional: - warnings.warn( - # GH#50617 - "Series.__getitem__ treating keys as positions is deprecated. " - "In a future version, integer keys will always be treated " - "as labels (consistent with DataFrame behavior). To access " - "a value by position, use `ser.iloc[pos]`", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._values[key] - elif key_is_scalar: + # Note: GH#50617 in 3.0 we changed int key to always be treated as + # a label, matching DataFrame behavior. return self._get_value(key) # Convert generator to list before going through hashable part @@ -958,35 +948,6 @@ def _get_with(self, key): elif isinstance(key, tuple): return self._get_values_tuple(key) - elif not is_list_like(key): - # e.g. 
scalars that aren't recognized by lib.is_scalar, GH#32684 - return self.loc[key] - - if not isinstance(key, (list, np.ndarray, ExtensionArray, Series, Index)): - key = list(key) - - key_type = lib.infer_dtype(key, skipna=False) - - # Note: The key_type == "boolean" case should be caught by the - # com.is_bool_indexer check in __getitem__ - if key_type == "integer": - # We need to decide whether to treat this as a positional indexer - # (i.e. self.iloc) or label-based (i.e. self.loc) - if not self.index._should_fallback_to_positional: - return self.loc[key] - else: - warnings.warn( - # GH#50617 - "Series.__getitem__ treating keys as positions is deprecated. " - "In a future version, integer keys will always be treated " - "as labels (consistent with DataFrame behavior). To access " - "a value by position, use `ser.iloc[pos]`", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.iloc[key] - - # handle the dup indexing case GH#4246 return self.loc[key] def _get_values_tuple(self, key: tuple): @@ -1076,27 +1037,8 @@ def __setitem__(self, key, value) -> None: except KeyError: # We have a scalar (or for MultiIndex or object-dtype, scalar-like) # key that is not present in self.index. - if is_integer(key): - if not self.index._should_fallback_to_positional: - # GH#33469 - self.loc[key] = value - else: - # positional setter - # can't use _mgr.setitem_inplace yet bc could have *both* - # KeyError and then ValueError, xref GH#45070 - warnings.warn( - # GH#50617 - "Series.__setitem__ treating keys as positions is deprecated. " - "In a future version, integer keys will always be treated " - "as labels (consistent with DataFrame behavior). 
To set " - "a value by position, use `ser.iloc[pos] = value`", - FutureWarning, - stacklevel=find_stack_level(), - ) - self._set_values(key, value) - else: - # GH#12862 adding a new key to the Series - self.loc[key] = value + # GH#12862 adding a new key to the Series + self.loc[key] = value except (TypeError, ValueError, LossySetitemError): # The key was OK, but we cannot set the value losslessly @@ -1155,28 +1097,7 @@ def _set_with(self, key, value) -> None: # Without this, the call to infer_dtype will consume the generator key = list(key) - if not self.index._should_fallback_to_positional: - # Regardless of the key type, we're treating it as labels - self._set_labels(key, value) - - else: - # Note: key_type == "boolean" should not occur because that - # should be caught by the is_bool_indexer check in __setitem__ - key_type = lib.infer_dtype(key, skipna=False) - - if key_type == "integer": - warnings.warn( - # GH#50617 - "Series.__setitem__ treating keys as positions is deprecated. " - "In a future version, integer keys will always be treated " - "as labels (consistent with DataFrame behavior). 
To set " - "a value by position, use `ser.iloc[pos] = value`", - FutureWarning, - stacklevel=find_stack_level(), - ) - self._set_values(key, value) - else: - self._set_labels(key, value) + self._set_labels(key, value) def _set_labels(self, key, value) -> None: key = com.asarray_tuplesafe(key) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 09d13677eef62..b10141b0d63f4 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -622,16 +622,17 @@ def test_series_subset_set_with_indexer(backend, indexer_si, indexer): s_orig = s.copy() subset = s[:] - warn = None - msg = "Series.__setitem__ treating keys as positions is deprecated" if ( indexer_si is tm.setitem and isinstance(indexer, np.ndarray) and indexer.dtype.kind == "i" ): - warn = FutureWarning - with tm.assert_produces_warning(warn, match=msg): - indexer_si(subset)[indexer] = 0 + # In 3.0 we treat integers as always-labels + with pytest.raises(KeyError): + indexer_si(subset)[indexer] = 0 + return + + indexer_si(subset)[indexer] = 0 expected = Series([0, 0, 3], index=["a", "b", "c"]) tm.assert_series_equal(subset, expected) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 1f89c7ad9d4e4..935edce32a0ab 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -329,11 +329,10 @@ def test_get(self, data): result = s.get("Z") assert result is None - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert s.get(4) == s.iloc[4] - assert s.get(-1) == s.iloc[-1] - assert s.get(len(s)) is None + # As of 3.0, getitem with int keys treats them as labels + assert s.get(4) is None + assert s.get(-1) is None + assert s.get(len(s)) is None # GH 21257 s = pd.Series(data) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py 
index d51a986a22f1e..d4bc0341e732e 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -117,16 +117,8 @@ def test_setitem_index_object(self, val, exp_dtype): obj = pd.Series([1, 2, 3, 4], index=pd.Index(list("abcd"), dtype=object)) assert obj.index.dtype == object - if exp_dtype is IndexError: - temp = obj.copy() - warn_msg = "Series.__setitem__ treating keys as positions is deprecated" - msg = "index 5 is out of bounds for axis 0 with size 4" - with pytest.raises(exp_dtype, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - temp[5] = 5 - else: - exp_index = pd.Index(list("abcd") + [val], dtype=object) - self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) + exp_index = pd.Index(list("abcd") + [val], dtype=object) + self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) @pytest.mark.parametrize( "val,exp_dtype", [(5, np.int64), (1.1, np.float64), ("x", object)] diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 1fe431e12f2a1..8597ee1198ff0 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -87,11 +87,11 @@ def test_scalar_non_numeric(self, index, frame_or_series, indexer_sl): ], ) def test_scalar_non_numeric_series_fallback(self, index): - # fallsback to position selection, series only + # starting in 3.0, integer keys are always treated as labels, no longer + # fall back to positional. 
s = Series(np.arange(len(index)), index=index) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(KeyError, match="3"): s[3] with pytest.raises(KeyError, match="^3.0$"): s[3.0] @@ -118,12 +118,9 @@ def test_scalar_with_mixed(self, indexer_sl): indexer_sl(s3)[1.0] if indexer_sl is not tm.loc: - # __getitem__ falls back to positional - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s3[1] - expected = 2 - assert result == expected + # as of 3.0, __getitem__ no longer falls back to positional + with pytest.raises(KeyError, match="^1$"): + s3[1] with pytest.raises(KeyError, match=r"^1\.0$"): indexer_sl(s3)[1.0] diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index e0ca4bf64ea91..3b41c8ee463d8 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -36,9 +36,6 @@ def test_fancy_getitem(): s = Series(np.arange(len(dti)), index=dti) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert s[48] == 48 assert s["1/2/2009"] == 48 assert s["2009-1-2"] == 48 assert s[datetime(2009, 1, 2)] == 48 @@ -57,10 +54,6 @@ def test_fancy_setitem(): s = Series(np.arange(len(dti)), index=dti) - msg = "Series.__setitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - s[48] = -1 - assert s.iloc[48] == -1 s["1/2/2009"] = -2 assert s.iloc[48] == -2 s["1/2/2009":"2009-06-05"] = -3 diff --git a/pandas/tests/series/indexing/test_get.py b/pandas/tests/series/indexing/test_get.py index 1f3711ad91903..5ff92ca89efba 100644 --- a/pandas/tests/series/indexing/test_get.py +++ b/pandas/tests/series/indexing/test_get.py @@ -157,13 +157,8 @@ def 
test_get_with_default(): assert s.get("e", "z") == "z" assert s.get("e", "e") == "e" - msg = "Series.__getitem__ treating keys as positions is deprecated" - warn = None - if index is d0: - warn = FutureWarning - with tm.assert_produces_warning(warn, match=msg): - assert s.get(10, "z") == "z" - assert s.get(10, 10) == 10 + assert s.get(10, "z") == "z" + assert s.get(10, 10) == 10 @pytest.mark.parametrize( @@ -201,13 +196,10 @@ def test_get_with_ea(arr): result = ser.get("Z") assert result is None - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert ser.get(4) == ser.iloc[4] - with tm.assert_produces_warning(FutureWarning, match=msg): - assert ser.get(-1) == ser.iloc[-1] - with tm.assert_produces_warning(FutureWarning, match=msg): - assert ser.get(len(ser)) is None + # As of 3.0, ints are treated as labels + assert ser.get(4) is None + assert ser.get(-1) is None + assert ser.get(len(ser)) is None # GH#21257 ser = Series(arr) @@ -216,16 +208,14 @@ def test_get_with_ea(arr): def test_getitem_get(string_series, object_series): - msg = "Series.__getitem__ treating keys as positions is deprecated" - for obj in [string_series, object_series]: idx = obj.index[5] assert obj[idx] == obj.get(idx) assert obj[idx] == obj.iloc[5] - with tm.assert_produces_warning(FutureWarning, match=msg): - assert string_series.get(-1) == string_series.get(string_series.index[-1]) + # As of 3.0, ints are treated as labels + assert string_series.get(-1) is None assert string_series.iloc[5] == string_series.get(string_series.index[5]) diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index fac543ac450a5..ede39ba61dfeb 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -15,6 +15,7 @@ conversion, timezones, ) +from pandas.compat.numpy import np_version_gt2 from pandas.core.dtypes.common import 
is_scalar @@ -72,19 +73,14 @@ def test_getitem_unrecognized_scalar(self): def test_getitem_negative_out_of_bounds(self): ser = Series(["a"] * 10, index=["a"] * 10) - msg = "index -11 is out of bounds for axis 0 with size 10|index out of bounds" - warn_msg = "Series.__getitem__ treating keys as positions is deprecated" - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - ser[-11] + with pytest.raises(KeyError, match="^-11$"): + ser[-11] def test_getitem_out_of_bounds_indexerror(self, datetime_series): # don't segfault, GH#495 - msg = r"index \d+ is out of bounds for axis 0 with size \d+" - warn_msg = "Series.__getitem__ treating keys as positions is deprecated" - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - datetime_series[len(datetime_series)] + N = len(datetime_series) + with pytest.raises(KeyError, match=str(N)): + datetime_series[N] def test_getitem_out_of_bounds_empty_rangeindex_keyerror(self): # GH#917 @@ -118,11 +114,13 @@ def test_getitem_keyerror_with_integer_index(self, any_int_numpy_dtype): ser["c"] def test_getitem_int64(self, datetime_series): + if np_version_gt2: + msg = r"^np.int64\(5\)$" + else: + msg = "^5$" idx = np.int64(5) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = datetime_series[idx] - assert res == datetime_series.iloc[5] + with pytest.raises(KeyError, match=msg): + datetime_series[idx] def test_getitem_full_range(self): # github.com/pandas-dev/pandas/commit/4f433773141d2eb384325714a2776bcc5b2e20f7 @@ -218,10 +216,8 @@ def test_getitem_str_with_timedeltaindex(self): def test_getitem_bool_index_positional(self): # GH#48653 ser = Series({True: 1, False: 0}) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser[0] - assert result == 1 + 
with pytest.raises(KeyError, match="^0$"): + ser[0] class TestSeriesGetitemSlices: @@ -384,17 +380,16 @@ def test_getitem_intlist_intindex_periodvalues(self): @pytest.mark.parametrize("box", [list, np.array, Index]) def test_getitem_intlist_intervalindex_non_int(self, box): - # GH#33404 fall back to positional since ints are unambiguous + # GH#33404 fall back to positional since ints are unambiguous; + # changed in 3.0 to never fallback dti = date_range("2000-01-03", periods=3)._with_freq(None) ii = pd.IntervalIndex.from_breaks(dti) ser = Series(range(len(ii)), index=ii) - expected = ser.iloc[:1] key = box([0]) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser[key] - tm.assert_series_equal(result, expected) + msg = r"None of \[Index\(\[0\], dtype='int(32|64)'\)\] are in the \[index\]" + with pytest.raises(KeyError, match=msg): + ser[key] @pytest.mark.parametrize("box", [list, np.array, Index]) @pytest.mark.parametrize("dtype", [np.int64, np.float64, np.uint64]) @@ -635,11 +630,6 @@ def test_getitem_preserve_name(datetime_series): result = datetime_series[datetime_series > 0] assert result.name == datetime_series.name - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = datetime_series[[0, 2, 4]] - assert result.name == datetime_series.name - result = datetime_series[5:10] assert result.name == datetime_series.name @@ -667,21 +657,16 @@ def test_getitem_missing(datetime_series): def test_getitem_fancy(string_series, object_series): - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - slice1 = string_series[[1, 2, 3]] - slice2 = object_series[[1, 2, 3]] - assert string_series.index[2] == slice1.index[1] - assert object_series.index[2] == slice2.index[1] - assert string_series.iloc[2] == slice1.iloc[1] - 
assert object_series.iloc[2] == slice2.iloc[1] + msg = r"None of \[Index\(\[1, 2, 3\], dtype='int(32|64)'\)\] are in the \[index\]" + with pytest.raises(KeyError, match=msg): + string_series[[1, 2, 3]] + with pytest.raises(KeyError, match=msg): + object_series[[1, 2, 3]] def test_getitem_box_float64(datetime_series): - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - value = datetime_series[5] - assert isinstance(value, np.float64) + with pytest.raises(KeyError, match="^5$"): + datetime_series[5] def test_getitem_unordered_dup(): @@ -712,13 +697,11 @@ def test_slice_can_reorder_not_uniquely_indexed(): @pytest.mark.parametrize("index_vals", ["aabcd", "aadcb"]) def test_duplicated_index_getitem_positional_indexer(index_vals): - # GH 11747 + # GH 11747; changed in 3.0 integers are treated as always-labels s = Series(range(5), index=list(index_vals)) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s[3] - assert result == 3 + with pytest.raises(KeyError, match="^3$"): + s[3] class TestGetitemDeprecatedIndexers: diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index a629d18131306..5002b6d20da09 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -32,27 +32,16 @@ def test_basic_indexing(): np.random.default_rng(2).standard_normal(5), index=["a", "b", "a", "a", "b"] ) - warn_msg = "Series.__[sg]etitem__ treating keys as positions is deprecated" - msg = "index 5 is out of bounds for axis 0 with size 5" - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - s[5] - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - s[5] = 0 + with pytest.raises(KeyError, match="^5$"): + 
s[5] with pytest.raises(KeyError, match=r"^'c'$"): s["c"] s = s.sort_index() - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - s[5] - msg = r"index 5 is out of bounds for axis (0|1) with size 5|^5$" - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - s[5] = 0 + with pytest.raises(KeyError, match="^5$"): + s[5] def test_getitem_numeric_should_not_fallback_to_positional(any_numeric_dtype): @@ -153,9 +142,7 @@ def test_series_box_timestamp(): assert isinstance(ser.iloc[4], Timestamp) ser = Series(rng, index=rng) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert isinstance(ser[0], Timestamp) + assert isinstance(ser[rng[0]], Timestamp) assert isinstance(ser.at[rng[1]], Timestamp) assert isinstance(ser.iat[2], Timestamp) assert isinstance(ser.loc[rng[3]], Timestamp) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 7a2a4892f61fb..b94e6b6f0c6c8 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -181,14 +181,12 @@ def test_object_series_setitem_dt64array_exact_match(self): class TestSetitemScalarIndexer: def test_setitem_negative_out_of_bounds(self): + # As of 3.0, int keys are treated as labels, so this becomes + # setitem-with-expansion ser = Series(["a"] * 10, index=["a"] * 10) - - # string index falls back to positional - msg = "index -11|-1 is out of bounds for axis 0 with size 10" - warn_msg = "Series.__setitem__ treating keys as positions is deprecated" - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - ser[-11] = "foo" + ser[-11] = "foo" + exp = Series(["a"] * 10 + ["foo"], index=["a"] * 10 + [-11]) + tm.assert_series_equal(ser, exp) @pytest.mark.parametrize("indexer", 
[tm.loc, tm.at]) @pytest.mark.parametrize("ser_index", [0, 1]) @@ -1749,24 +1747,24 @@ def test_setitem_bool_int_float_consistency(indexer_sli): def test_setitem_positional_with_casting(): # GH#45070 case where in __setitem__ we get a KeyError, then when # we fallback we *also* get a ValueError if we try to set inplace. + # As of 3.0 we always treat int keys as labels, so this becomes + # setitem-with-expansion ser = Series([1, 2, 3], index=["a", "b", "c"]) - warn_msg = "Series.__setitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - ser[0] = "X" - expected = Series(["X", 2, 3], index=["a", "b", "c"], dtype=object) + ser[0] = "X" + expected = Series([1, 2, 3, "X"], index=["a", "b", "c", 0], dtype=object) tm.assert_series_equal(ser, expected) def test_setitem_positional_float_into_int_coerces(): # Case where we hit a KeyError and then trying to set in-place incorrectly - # casts a float to an int + # casts a float to an int; + # As of 3.0 we always treat int keys as labels, so this becomes + # setitem-with-expansion ser = Series([1, 2, 3], index=["a", "b", "c"]) - warn_msg = "Series.__setitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - ser[0] = 1.5 - expected = Series([1.5, 2, 3], index=["a", "b", "c"]) + ser[0] = 1.5 + expected = Series([1, 2, 3, 1.5], index=["a", "b", "c", 0]) tm.assert_series_equal(ser, expected) From a052307e2deb36a3548b58de8888765fb4b7bed0 Mon Sep 17 00:00:00 2001 From: shriyakalakata <87483933+shriyakalakata@users.noreply.github.com> Date: Mon, 29 Apr 2024 16:15:54 -0400 Subject: [PATCH 073/100] Fix PR07,RT03,SA01 errors for Index.union, Index.symmetric_difference (#58457) * Fix PR07,RT03,SA01 errors for Index.union, Index.symmetric_difference * Update pandas/core/indexes/base.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Matthew Roeschke 
<10647082+mroeschke@users.noreply.github.com> --- ci/code_checks.sh | 2 -- pandas/core/indexes/base.py | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 1724fae98a6e5..8b1bccdaa8d1b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -106,9 +106,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.ravel PR01,RT03" \ -i "pandas.Index.slice_indexer PR07,RT03,SA01" \ -i "pandas.Index.str PR01,SA01" \ - -i "pandas.Index.symmetric_difference PR07,RT03,SA01" \ -i "pandas.Index.take PR01,PR07" \ - -i "pandas.Index.union PR07,RT03,SA01" \ -i "pandas.Index.view GL08" \ -i "pandas.Int16Dtype SA01" \ -i "pandas.Int32Dtype SA01" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f0ac8604ccd60..212d0bcef8f43 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2872,6 +2872,8 @@ def union(self, other, sort=None): Parameters ---------- other : Index or array-like + Index or an array-like object containing elements to form the union + with the original Index. sort : bool or None, default None Whether to sort the resulting Index. @@ -2888,6 +2890,14 @@ def union(self, other, sort=None): Returns ------- Index + Returns a new Index object with all unique elements from both the original + Index and the `other` Index. + + See Also + -------- + Index.unique : Return unique values in the index. + Index.intersection : Form the intersection of two Index objects. + Index.difference : Return a new Index with elements of index not in `other`. Examples -------- @@ -3312,7 +3322,10 @@ def symmetric_difference(self, other, result_name=None, sort=None): Parameters ---------- other : Index or array-like + Index or an array-like object with elements to compute the symmetric + difference with the original Index. result_name : str + A string representing the name of the resulting Index, if desired. 
sort : bool or None, default None Whether to sort the resulting index. By default, the values are attempted to be sorted, but any TypeError from @@ -3326,6 +3339,14 @@ def symmetric_difference(self, other, result_name=None, sort=None): Returns ------- Index + Returns a new Index object containing elements that appear in either the + original Index or the `other` Index, but not both. + + See Also + -------- + Index.difference : Return a new Index with elements of index not in other. + Index.union : Form the union of two Index objects. + Index.intersection : Form the intersection of two Index objects. Notes ----- From f3f3853cd7ed92108cfd53adaad6dd631d48fc72 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 30 Apr 2024 00:16:41 +0200 Subject: [PATCH 074/100] BUG: astype not casting values for dictionary dtype correctly (#58479) * BUG: astype not casting values for dictionary dtype correctly * Fixup --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/arrow/array.py | 2 ++ pandas/tests/extension/test_arrow.py | 8 ++++++++ 3 files changed, 11 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 66dafecffeb01..59926c0751d32 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -390,6 +390,7 @@ Numeric Conversion ^^^^^^^^^^ +- Bug in :meth:`DataFrame.astype` not casting ``values`` for Arrow-based dictionary dtype correctly (:issue:`58479`) - Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) - Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`) - Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 1154130b9bed3..0240433cdb683 100644 --- a/pandas/core/arrays/arrow/array.py +++ 
b/pandas/core/arrays/arrow/array.py @@ -525,6 +525,8 @@ def _box_pa_array( if pa_type is not None and pa_array.type != pa_type: if pa.types.is_dictionary(pa_type): pa_array = pa_array.dictionary_encode() + if pa_array.type != pa_type: + pa_array = pa_array.cast(pa_type) else: try: pa_array = pa_array.cast(pa_type) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 79440b55dd5dd..7d31fe6085c3a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3498,6 +3498,14 @@ def test_to_numpy_timestamp_to_int(): tm.assert_numpy_array_equal(result, expected) +@pytest.mark.parametrize("arrow_type", [pa.large_string(), pa.string()]) +def test_cast_dictionary_different_value_dtype(arrow_type): + df = pd.DataFrame({"a": ["x", "y"]}, dtype="string[pyarrow]") + data_type = ArrowDtype(pa.dictionary(pa.int32(), arrow_type)) + result = df.astype({"a": data_type}) + assert result.dtypes.iloc[0] == data_type + + def test_map_numeric_na_action(): ser = pd.Series([32, 40, None], dtype="int64[pyarrow]") result = ser.map(lambda x: 42, na_action="ignore") From 7cdee7a15670b3273e45425619b493c7d74c3719 Mon Sep 17 00:00:00 2001 From: KeiOshima Date: Mon, 29 Apr 2024 20:25:50 -0400 Subject: [PATCH 075/100] DOC: fixing RT03 erros for Index: duplicated and nunique (#58432) * DOC: fixing RT03 erros for Index: duplicated and nunique * deleting it lines from code_checks * fixing EXPECTED TO FAIL, BUT NOT FAILING error * fixing code_checks issue * fixed Expected to fail error --- ci/code_checks.sh | 3 --- pandas/core/base.py | 1 + pandas/core/indexes/base.py | 1 + 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 8b1bccdaa8d1b..45831f6030794 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -93,7 +93,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index PR07" \ -i "pandas.Index.append PR07,RT03,SA01" \ -i "pandas.Index.difference 
PR07,RT03,SA01" \ - -i "pandas.Index.duplicated RT03" \ -i "pandas.Index.get_indexer PR07,SA01" \ -i "pandas.Index.get_indexer_for PR01,SA01" \ -i "pandas.Index.get_indexer_non_unique PR07,SA01" \ @@ -101,7 +100,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.identical PR01,SA01" \ -i "pandas.Index.join PR07,RT03,SA01" \ -i "pandas.Index.names GL08" \ - -i "pandas.Index.nunique RT03" \ -i "pandas.Index.putmask PR01,RT03" \ -i "pandas.Index.ravel PR01,RT03" \ -i "pandas.Index.slice_indexer PR07,RT03,SA01" \ @@ -256,7 +254,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.mode SA01" \ -i "pandas.Series.mul PR07" \ -i "pandas.Series.ne PR07,SA01" \ - -i "pandas.Series.nunique RT03" \ -i "pandas.Series.pad PR01,SA01" \ -i "pandas.Series.plot PR02,SA01" \ -i "pandas.Series.pop RT03,SA01" \ diff --git a/pandas/core/base.py b/pandas/core/base.py index e54fac3da72a6..87e87538ca1d9 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1062,6 +1062,7 @@ def nunique(self, dropna: bool = True) -> int: Returns ------- int + A integer indicating the number of unique elements in the object. See Also -------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 212d0bcef8f43..73ba02c515344 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2767,6 +2767,7 @@ def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: Returns ------- np.ndarray[bool] + A numpy array of boolean values indicating duplicate index values. 
See Also -------- From 46bd88f795ad4ff51fbd97b5e29c1b216524c72d Mon Sep 17 00:00:00 2001 From: rohanjain101 <38412262+rohanjain101@users.noreply.github.com> Date: Mon, 29 Apr 2024 20:26:37 -0400 Subject: [PATCH 076/100] preserve index in list accessor (#58438) * preserve index in list accessor * gh reference * explode fix * cleanup * improve test * Update v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * f --------- Co-authored-by: Rohan Jain Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/arrow/accessors.py | 22 ++++++++++------ .../series/accessors/test_list_accessor.py | 25 ++++++++++++++++--- 3 files changed, 37 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 59926c0751d32..afe63b6785524 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -483,6 +483,7 @@ Other - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) +- Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) .. 
***DO NOT USE THIS SECTION*** diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 19ec253e81ef2..d8f948a37d206 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -110,7 +110,9 @@ def len(self) -> Series: from pandas import Series value_lengths = pc.list_value_length(self._pa_array) - return Series(value_lengths, dtype=ArrowDtype(value_lengths.type)) + return Series( + value_lengths, dtype=ArrowDtype(value_lengths.type), index=self._data.index + ) def __getitem__(self, key: int | slice) -> Series: """ @@ -149,7 +151,9 @@ def __getitem__(self, key: int | slice) -> Series: # if key < 0: # key = pc.add(key, pc.list_value_length(self._pa_array)) element = pc.list_element(self._pa_array, key) - return Series(element, dtype=ArrowDtype(element.type)) + return Series( + element, dtype=ArrowDtype(element.type), index=self._data.index + ) elif isinstance(key, slice): if pa_version_under11p0: raise NotImplementedError( @@ -167,7 +171,7 @@ def __getitem__(self, key: int | slice) -> Series: if step is None: step = 1 sliced = pc.list_slice(self._pa_array, start, stop, step) - return Series(sliced, dtype=ArrowDtype(sliced.type)) + return Series(sliced, dtype=ArrowDtype(sliced.type), index=self._data.index) else: raise ValueError(f"key must be an int or slice, got {type(key).__name__}") @@ -195,15 +199,17 @@ def flatten(self) -> Series: ... 
) >>> s.list.flatten() 0 1 - 1 2 - 2 3 - 3 3 + 0 2 + 0 3 + 1 3 dtype: int64[pyarrow] """ from pandas import Series - flattened = pc.list_flatten(self._pa_array) - return Series(flattened, dtype=ArrowDtype(flattened.type)) + counts = pa.compute.list_value_length(self._pa_array) + flattened = pa.compute.list_flatten(self._pa_array) + index = self._data.index.repeat(counts.fill_null(pa.scalar(0, counts.type))) + return Series(flattened, dtype=ArrowDtype(flattened.type), index=index) class StructAccessor(ArrowAccessor): diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index 1c60567c1a530..c153e800cb534 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -31,10 +31,23 @@ def test_list_getitem(list_dtype): tm.assert_series_equal(actual, expected) +def test_list_getitem_index(): + # GH 58425 + ser = Series( + [[1, 2, 3], [4, None, 5], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + index=[1, 3, 7], + ) + actual = ser.list[1] + expected = Series([2, None, None], dtype="int64[pyarrow]", index=[1, 3, 7]) + tm.assert_series_equal(actual, expected) + + def test_list_getitem_slice(): ser = Series( [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), + index=[1, 3, 7], ) if pa_version_under11p0: with pytest.raises( @@ -44,7 +57,9 @@ def test_list_getitem_slice(): else: actual = ser.list[1:None:None] expected = Series( - [[2, 3], [None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())) + [[2, 3], [None, 5], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + index=[1, 3, 7], ) tm.assert_series_equal(actual, expected) @@ -61,11 +76,15 @@ def test_list_len(): def test_list_flatten(): ser = Series( - [[1, 2, 3], [4, None], None], + [[1, 2, 3], None, [4, None], [], [7, 8]], dtype=ArrowDtype(pa.list_(pa.int64())), ) actual = ser.list.flatten() - expected = Series([1, 2, 3, 4, None], dtype=ArrowDtype(pa.int64())) + 
expected = Series( + [1, 2, 3, 4, None, 7, 8], + dtype=ArrowDtype(pa.int64()), + index=[0, 0, 0, 2, 2, 4, 4], + ) tm.assert_series_equal(actual, expected) From 78a2ef2f43a40e13f51c223b64d2325bd9e7716e Mon Sep 17 00:00:00 2001 From: KeiOshima Date: Mon, 29 Apr 2024 20:27:15 -0400 Subject: [PATCH 077/100] DOC: ficing PR01 and SA01 issue for Index: Identical (#58442) * DOC: ficing PR01 and SA01 issue for Index: Identical * fixing EXpected to fail issue --- ci/code_checks.sh | 1 - pandas/core/indexes/base.py | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 45831f6030794..161047197ff6f 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -97,7 +97,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.get_indexer_for PR01,SA01" \ -i "pandas.Index.get_indexer_non_unique PR07,SA01" \ -i "pandas.Index.get_loc PR07,RT03,SA01" \ - -i "pandas.Index.identical PR01,SA01" \ -i "pandas.Index.join PR07,RT03,SA01" \ -i "pandas.Index.names GL08" \ -i "pandas.Index.putmask PR01,RT03" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 73ba02c515344..2bf0aca31449e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5344,12 +5344,23 @@ def identical(self, other) -> bool: """ Similar to equals, but checks that object attributes and types are also equal. + Parameters + ---------- + other : Index + The Index object you want to compare with the current Index object. + Returns ------- bool If two Index objects have equal elements and same type True, otherwise False. + See Also + -------- + Index.equals: Determine if two Index object are equal. + Index.has_duplicates: Check if the Index has duplicate values. + Index.is_unique: Return if the index has unique values. 
+ Examples -------- >>> idx1 = pd.Index(["1", "2", "3"]) From f2909854ab6b2b7912ed68df5c5f0dd8a8fd3f3a Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 30 Apr 2024 22:01:16 +0530 Subject: [PATCH 078/100] DOC: Enforce Numpy Docstring Validation for pandas.Index.take (#58489) * DOC: add PR01,PR07 for pandas.Index.take * DOC: remove PR01,PR07 for pandas.Index.take --- ci/code_checks.sh | 1 - pandas/core/indexes/base.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 161047197ff6f..1bdd2d5e8aa33 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -103,7 +103,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.ravel PR01,RT03" \ -i "pandas.Index.slice_indexer PR07,RT03,SA01" \ -i "pandas.Index.str PR01,SA01" \ - -i "pandas.Index.take PR01,PR07" \ -i "pandas.Index.view GL08" \ -i "pandas.Int16Dtype SA01" \ -i "pandas.Int32Dtype SA01" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2bf0aca31449e..c7b009bc02dbe 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1120,9 +1120,21 @@ def astype(self, dtype, copy: bool = True): axis : int, optional The axis over which to select values, always 0. allow_fill : bool, default True + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + fill_value : scalar, default None If allow_fill=True and fill_value is not None, indices specified by -1 are regarded as NA. If Index doesn't hold NA, raise ValueError. + **kwargs + Required for compatibility with numpy. 
Returns ------- From 2e7fa91a30d28f92fb31ee891fd2a74f57e99f78 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 30 Apr 2024 22:07:35 +0530 Subject: [PATCH 079/100] DOC: Enforce Numpy Docstring Validation for pandas.Index.sliece_indexer (#58490) * DOC: add PR07,RT03 in pandas.Index.slice_indexer * DOC: add SA01 in pandas.Index.slice_indexer * DOC: remove pandas.Index.slice_indexer --- ci/code_checks.sh | 1 - pandas/core/indexes/base.py | 9 ++++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 1bdd2d5e8aa33..6b6ca25178720 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -101,7 +101,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.names GL08" \ -i "pandas.Index.putmask PR01,RT03" \ -i "pandas.Index.ravel PR01,RT03" \ - -i "pandas.Index.slice_indexer PR07,RT03,SA01" \ -i "pandas.Index.str PR01,SA01" \ -i "pandas.Index.view GL08" \ -i "pandas.Int16Dtype SA01" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c7b009bc02dbe..c83dd3be13424 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6342,19 +6342,26 @@ def slice_indexer( end : label, default None If None, defaults to the end. step : int, default None + If None, defaults to 1. Returns ------- slice + A slice object. Raises ------ KeyError : If key does not exist, or key is not unique and index is not ordered. + See Also + -------- + Index.slice_locs : Computes slice locations for input labels. + Index.get_slice_bound : Retrieves slice bound that corresponds to given label. + Notes ----- - This function assumes that the data is sorted, so use at your own peril + This function assumes that the data is sorted, so use at your own peril. 
Examples -------- From c150511159f7ef8dc4df8d45a99b1c49ea948dea Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 30 Apr 2024 22:08:17 +0530 Subject: [PATCH 080/100] DOC: Enforce Numpy Docstring Validation for pandas.DataFrame.__dataframe__ (#58491) * DOC: add SA01 in pandas.DataFrame.__dataframe__ * DOC: remove SA01 in pandas.DataFrame.__dataframe__ --- ci/code_checks.sh | 1 - pandas/core/frame.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 6b6ca25178720..d1ba0d24b4b7f 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -70,7 +70,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then --format=actions \ -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ - -i "pandas.DataFrame.__dataframe__ SA01" \ -i "pandas.DataFrame.at_time PR01" \ -i "pandas.DataFrame.kurt RT03,SA01" \ -i "pandas.DataFrame.kurtosis RT03,SA01" \ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9fbbc2c08efaa..b7eba737829ec 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -931,6 +931,11 @@ def __dataframe__( DataFrame interchange object The object which consuming library can use to ingress the dataframe. + See Also + -------- + DataFrame.from_records : Constructor from tuples, also record arrays. + DataFrame.from_dict : From dicts of Series, arrays, or dicts. 
+ Notes ----- Details on the interchange protocol: From fdcdcb84cfa675b756c7f4c42d3eb466c49bc098 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 30 Apr 2024 22:09:10 +0530 Subject: [PATCH 081/100] DOC: Enforce Numpy Docstring Validation for pandas.DataFrame.kurt and pandas.DataFrame.kurtosis (#58493) DOC: fix ruff issues --- ci/code_checks.sh | 2 -- pandas/core/frame.py | 80 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d1ba0d24b4b7f..8f6c5e0beee0b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -71,8 +71,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ -i "pandas.DataFrame.at_time PR01" \ - -i "pandas.DataFrame.kurt RT03,SA01" \ - -i "pandas.DataFrame.kurtosis RT03,SA01" \ -i "pandas.DataFrame.max RT03" \ -i "pandas.DataFrame.mean RT03,SA01" \ -i "pandas.DataFrame.median RT03,SA01" \ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b7eba737829ec..653b07b6e27ed 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -12069,7 +12069,6 @@ def kurt( ) -> Series | Any: ... @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="kurt") - @doc(make_doc("kurt", ndim=2)) def kurt( self, axis: Axis | None = 0, @@ -12077,6 +12076,85 @@ def kurt( numeric_only: bool = False, **kwargs, ) -> Series | Any: + """ + Return unbiased kurtosis over requested axis. + + Kurtosis obtained using Fisher's definition of + kurtosis (kurtosis of normal == 0.0). Normalized by N-1. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. 
+ + For DataFrames, specifying ``axis=None`` will apply the aggregation + across both axes. + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or scalar + Unbiased kurtosis over requested axis. + + See Also + -------- + Dataframe.kurtosis : Returns unbiased kurtosis over requested axis. + + Examples + -------- + >>> s = pd.Series([1, 2, 2, 3], index=["cat", "dog", "dog", "mouse"]) + >>> s + cat 1 + dog 2 + dog 2 + mouse 3 + dtype: int64 + >>> s.kurt() + 1.5 + + With a DataFrame + + >>> df = pd.DataFrame( + ... {"a": [1, 2, 2, 3], "b": [3, 4, 4, 4]}, + ... index=["cat", "dog", "dog", "mouse"], + ... ) + >>> df + a b + cat 1 3 + dog 2 4 + dog 2 4 + mouse 3 4 + >>> df.kurt() + a 1.5 + b 4.0 + dtype: float64 + + With axis=None + + >>> df.kurt(axis=None).round(6) + -0.988693 + + Using axis=1 + + >>> df = pd.DataFrame( + ... {"a": [1, 2], "b": [3, 4], "c": [3, 4], "d": [1, 2]}, + ... index=["cat", "dog"], + ... 
) + >>> df.kurt(axis=1) + cat -6.0 + dog -6.0 + dtype: float64 + """ result = super().kurt( axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) From 53b7f24258b878107e11f2b20ac4d9184ba72b49 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 30 Apr 2024 23:24:43 +0530 Subject: [PATCH 082/100] DOC: Enforce Numpy Docstring Validation for pandas.Index.view (#58486) * DOC: add GL08 for pandas.Index.view * DOC: remove GL08 for pandas.Index.view * DOC: fix examples in docstring * DOC: fix examples in docstring * DOC: fix examples in docstring * DOC: fix examples in docstring * DOC: fix examples in docstring --- ci/code_checks.sh | 1 - pandas/core/indexes/base.py | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 8f6c5e0beee0b..3ecca97b5dccd 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -99,7 +99,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.putmask PR01,RT03" \ -i "pandas.Index.ravel PR01,RT03" \ -i "pandas.Index.str PR01,SA01" \ - -i "pandas.Index.view GL08" \ -i "pandas.Int16Dtype SA01" \ -i "pandas.Int32Dtype SA01" \ -i "pandas.Int64Dtype SA01" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c83dd3be13424..baa8a7493a030 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1013,6 +1013,42 @@ def ravel(self, order: str_t = "C") -> Self: return self[:] def view(self, cls=None): + """ + Return a view on self. + + Parameters + ---------- + cls : data-type or ndarray sub-class, optional + Data-type descriptor of the returned view, e.g., float32 or int16. + Omitting it results in the view having the same data-type as `self`. + This argument can also be specified as an ndarray sub-class, + e.g., np.int64 or np.float32 which then specifies the type of + the returned object. + + Returns + ------- + numpy.ndarray + A new view of the same data in memory. 
+ + See Also + -------- + numpy.ndarray.view : Returns a new view of array with the same data. + + Examples + -------- + >>> s = pd.Series([1, 2, 3], index=["1", "2", "3"]) + >>> s.index.view("object") + array(['1', '2', '3'], dtype=object) + + >>> s = pd.Series([1, 2, 3], index=[-1, 0, 1]) + >>> s.index.view(np.int64) + array([-1, 0, 1]) + >>> s.index.view(np.float32) + array([ nan, nan, 0.e+00, 0.e+00, 1.e-45, 0.e+00], dtype=float32) + >>> s.index.view(np.uint64) + array([18446744073709551615, 0, 1], + dtype=uint64) + """ # we need to see if we are subclassing an # index type here if cls is not None: From c9bc4809528998313a609ab16168ca237bc186b6 Mon Sep 17 00:00:00 2001 From: Elliott Sales de Andrade Date: Tue, 30 Apr 2024 13:55:25 -0400 Subject: [PATCH 083/100] Remove deprecated plot_date calls (#58484) * Remove deprecated plot_date calls These were deprecated in Matplotlib 3.9. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- pandas/tests/plotting/test_datetimelike.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 6b709522bab70..4b4eeada58366 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -1432,13 +1432,19 @@ def test_mpl_nopandas(self): values1 = np.arange(10.0, 11.0, 0.5) values2 = np.arange(11.0, 12.0, 0.5) - kw = {"fmt": "-", "lw": 4} - _, ax = mpl.pyplot.subplots() - ax.plot_date([x.toordinal() for x in dates], values1, **kw) - ax.plot_date([x.toordinal() for x in dates], values2, **kw) - - line1, line2 = ax.get_lines() + ( + line1, + line2, + ) = ax.plot( + [x.toordinal() for x in dates], + values1, + "-", + [x.toordinal() for x in dates], + values2, + "-", + linewidth=4, + ) exp = np.array([x.toordinal() for 
x in dates], dtype=np.float64) tm.assert_numpy_array_equal(line1.get_xydata()[:, 0], exp) From cb8b213cbde4b677cc79e781e1c7f535e0724fe9 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 30 Apr 2024 23:25:58 +0530 Subject: [PATCH 084/100] DOC: Enforce Numpy Docstring Validation for pandas.DataFrame.attime (#58492) * DOC: add SA01PR01 in pandas.DataFrame.at_time * DOC: remove PR01 in pandas.DataFrame.at_time * DOC: remove pandas.Series.at_time --- ci/code_checks.sh | 2 -- pandas/core/generic.py | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 3ecca97b5dccd..af432dcd64c82 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -70,7 +70,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then --format=actions \ -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ - -i "pandas.DataFrame.at_time PR01" \ -i "pandas.DataFrame.max RT03" \ -i "pandas.DataFrame.mean RT03,SA01" \ -i "pandas.DataFrame.median RT03,SA01" \ @@ -187,7 +186,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series SA01" \ -i "pandas.Series.__iter__ RT03,SA01" \ -i "pandas.Series.add PR07" \ - -i "pandas.Series.at_time PR01" \ -i "pandas.Series.backfill PR01,SA01" \ -i "pandas.Series.case_when RT03" \ -i "pandas.Series.cat PR07,SA01" \ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 121f49cb7d1cf..24727bb9d83c1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8532,6 +8532,8 @@ def at_time(self, time, asof: bool = False, axis: Axis | None = None) -> Self: ---------- time : datetime.time or str The values to select. + asof : bool, default False + This parameter is currently not supported. axis : {0 or 'index', 1 or 'columns'}, default 0 For `Series` this parameter is unused and defaults to 0. 
From 66cfd806144f001f460679f9322c3c0b7d335685 Mon Sep 17 00:00:00 2001 From: shriyakalakata <87483933+shriyakalakata@users.noreply.github.com> Date: Tue, 30 Apr 2024 13:57:25 -0400 Subject: [PATCH 085/100] Fix PR07,RT03,SA01 errors for Index.append, Index.difference (#58453) * Fix errors for Index.append * Fixed errors for Index.difference --- ci/code_checks.sh | 2 -- pandas/core/indexes/base.py | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index af432dcd64c82..f49bfb1581332 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -87,8 +87,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.var PR01,RT03,SA01" \ -i "pandas.Grouper PR02" \ -i "pandas.Index PR07" \ - -i "pandas.Index.append PR07,RT03,SA01" \ - -i "pandas.Index.difference PR07,RT03,SA01" \ -i "pandas.Index.get_indexer PR07,SA01" \ -i "pandas.Index.get_indexer_for PR01,SA01" \ -i "pandas.Index.get_indexer_non_unique PR07,SA01" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index baa8a7493a030..054f522e7a37b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3297,6 +3297,8 @@ def difference(self, other, sort=None): Parameters ---------- other : Index or array-like + Index object or an array-like object containing elements to be compared + with the elements of the original Index. sort : bool or None, default None Whether to sort the resulting index. By default, the values are attempted to be sorted, but any TypeError from @@ -3310,6 +3312,14 @@ def difference(self, other, sort=None): Returns ------- Index + Returns a new Index object containing elements that are in the original + Index but not in the `other` Index. + + See Also + -------- + Index.symmetric_difference : Compute the symmetric difference of two Index + objects. + Index.intersection : Form the intersection of two Index objects. 
Examples -------- @@ -5192,10 +5202,18 @@ def append(self, other: Index | Sequence[Index]) -> Index: Parameters ---------- other : Index or list/tuple of indices + Single Index or a collection of indices, which can be either a list or a + tuple. Returns ------- Index + Returns a new Index object resulting from appending the provided other + indices to the original Index. + + See Also + -------- + Index.insert : Make new Index inserting new item at location. Examples -------- From 0f9adf86858a428ff7fc63d3b48f6dbc8321ba52 Mon Sep 17 00:00:00 2001 From: Khor Chean Wei Date: Wed, 1 May 2024 01:58:11 +0800 Subject: [PATCH 086/100] ENH: Allow parameter min_periods in DataFrame.corrwith() (#58231) * Testing * Testing * enhance test case * add test * testing * add * add test * enhance * add * add * add * add * add * add * enhance * enhance * enhance * Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * test * Update test_cov_corr.py --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/frame.py | 9 +++++++- pandas/tests/frame/methods/test_cov_corr.py | 25 +++++++++++++++++++++ pandas/tests/groupby/test_api.py | 2 ++ 4 files changed, 36 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index afe63b6785524..1fc2f1041e2ea 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -39,6 +39,7 @@ Other enhancements - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`) - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) +- :meth:`DataFrame.corrwith` now accepts ``min_periods`` 
as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 653b07b6e27ed..3d2a6093464a9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11132,6 +11132,7 @@ def corrwith( drop: bool = False, method: CorrelationMethod = "pearson", numeric_only: bool = False, + min_periods: int | None = None, ) -> Series: """ Compute pairwise correlation. @@ -11162,6 +11163,9 @@ def corrwith( numeric_only : bool, default False Include only `float`, `int` or `boolean` data. + min_periods : int, optional + Minimum number of observations needed to have a valid result. + .. versionadded:: 1.5.0 .. 
versionchanged:: 2.0.0 @@ -11205,7 +11209,10 @@ def corrwith( this = self._get_numeric_data() if numeric_only else self if isinstance(other, Series): - return this.apply(lambda x: other.corr(x, method=method), axis=axis) + return this.apply( + lambda x: other.corr(x, method=method, min_periods=min_periods), + axis=axis, + ) if numeric_only: other = other._get_numeric_data() diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 4d2d83d25e8da..53aa44f264c7a 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -461,3 +461,28 @@ def test_corrwith_spearman_with_tied_data(self): result = df_bool.corrwith(ser_bool) expected = Series([0.57735, 0.57735], index=["A", "B"]) tm.assert_series_equal(result, expected) + + def test_corrwith_min_periods_method(self): + # GH#9490 + pytest.importorskip("scipy") + df1 = DataFrame( + { + "A": [1, np.nan, 7, 8], + "B": [False, True, True, False], + "C": [10, 4, 9, 3], + } + ) + df2 = df1[["B", "C"]] + result = (df1 + 1).corrwith(df2.B, method="spearman", min_periods=2) + expected = Series([0.0, 1.0, 0.0], index=["A", "B", "C"]) + tm.assert_series_equal(result, expected) + + def test_corrwith_min_periods_boolean(self): + # GH#9490 + df_bool = DataFrame( + {"A": [True, True, False, False], "B": [True, False, False, True]} + ) + ser_bool = Series([True, True, False, True]) + result = df_bool.corrwith(ser_bool, min_periods=3) + expected = Series([0.57735, 0.57735], index=["A", "B"]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py index d2cfa530e7c65..33b39bad4ab81 100644 --- a/pandas/tests/groupby/test_api.py +++ b/pandas/tests/groupby/test_api.py @@ -192,6 +192,8 @@ def test_frame_consistency(groupby_func): exclude_expected = {"numeric_only"} elif groupby_func in ("quantile",): exclude_expected = {"method", "axis"} + elif groupby_func in ["corrwith"]: + 
exclude_expected = {"min_periods"} if groupby_func not in ["pct_change", "size"]: exclude_expected |= {"axis"} From 086b047242e8f2a1a2a8d5f7851cecb528eb4785 Mon Sep 17 00:00:00 2001 From: Gianluca Ficarelli <26835404+GianlucaFicarelli@users.noreply.github.com> Date: Tue, 30 Apr 2024 20:25:26 +0200 Subject: [PATCH 087/100] PERF: MultiIndex._engine use smaller dtypes (#58411) * PERF: MultiIndex._engine use smaller dtypes * Move offsets downcasting to MultiIndex._engine * Remove unused import uint64_t --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/index.pyi | 4 +- pandas/_libs/index.pyx | 48 +++++++-- pandas/core/indexes/multi.py | 110 +++++++++----------- pandas/tests/indexes/multi/test_indexing.py | 41 +++++--- 5 files changed, 114 insertions(+), 90 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1fc2f1041e2ea..ce9022bdc2967 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -336,6 +336,7 @@ Performance improvements - Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`) - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) - Performance improvement in :meth:`Index.to_frame` returning a :class:`RangeIndex` columns of a :class:`Index` when possible. (:issue:`58018`) +- Performance improvement in :meth:`MultiIndex._engine` to use smaller dtypes if possible (:issue:`58411`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) - Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`) - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. 
(:issue:`57588`) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 12a5bf245977e..bf6d8ba8973d3 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -74,13 +74,13 @@ class MaskedBoolEngine(MaskedUInt8Engine): ... class BaseMultiIndexCodesEngine: levels: list[np.ndarray] - offsets: np.ndarray # ndarray[uint64_t, ndim=1] + offsets: np.ndarray # np.ndarray[..., ndim=1] def __init__( self, levels: list[Index], # all entries hashable labels: list[np.ndarray], # all entries integer-dtyped - offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1] + offsets: np.ndarray, # np.ndarray[..., ndim=1] ) -> None: ... def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ... def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ... diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index a700074d46ba8..f1be8d97c71eb 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -9,7 +9,6 @@ from numpy cimport ( intp_t, ndarray, uint8_t, - uint64_t, ) cnp.import_array() @@ -699,8 +698,7 @@ cdef class BaseMultiIndexCodesEngine: Keys are located by first locating each component against the respective level, then locating (the integer representation of) codes. """ - def __init__(self, object levels, object labels, - ndarray[uint64_t, ndim=1] offsets): + def __init__(self, object levels, object labels, ndarray offsets): """ Parameters ---------- @@ -708,7 +706,7 @@ cdef class BaseMultiIndexCodesEngine: Levels of the MultiIndex. labels : list-like of numpy arrays of integer dtype Labels of the MultiIndex. - offsets : numpy array of uint64 dtype + offsets : numpy array of int dtype Pre-calculated offsets, one for each level of the index. """ self.levels = levels @@ -718,8 +716,9 @@ cdef class BaseMultiIndexCodesEngine: # with positive integers (-1 for NaN becomes 1). This enables us to # differentiate between values that are missing in other and matching # NaNs. 
We will set values that are not found to 0 later: - labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift - codes = labels_arr.astype("uint64", copy=False) + codes = np.array(labels).T + codes += multiindex_nulls_shift # inplace sum optimisation + self.level_has_nans = [-1 in lab for lab in labels] # Map each codes combination in the index to an integer unambiguously @@ -731,8 +730,37 @@ cdef class BaseMultiIndexCodesEngine: # integers representing labels: we will use its get_loc and get_indexer self._base.__init__(self, lab_ints) - def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray: - raise NotImplementedError("Implemented by subclass") # pragma: no cover + def _codes_to_ints(self, ndarray codes) -> np.ndarray: + """ + Transform combination(s) of uint in one uint or Python integer (each), in a + strictly monotonic way (i.e. respecting the lexicographic order of integer + combinations). + + Parameters + ---------- + codes : 1- or 2-dimensional array of dtype uint + Combinations of integers (one per row) + + Returns + ------- + scalar or 1-dimensional array, of dtype _codes_dtype + Integer(s) representing one combination (each). + """ + # To avoid overflows, first make sure we are working with the right dtype: + codes = codes.astype(self._codes_dtype, copy=False) + + # Shift the representation of each level by the pre-calculated number of bits: + codes <<= self.offsets # inplace shift optimisation + + # Now sum and OR are in fact interchangeable. This is a simple + # composition of the (disjunct) significant bits of each level (i.e. 
+ # each column in "codes") in a single positive integer (per row): + if codes.ndim == 1: + # Single key + return np.bitwise_or.reduce(codes) + + # Multiple keys + return np.bitwise_or.reduce(codes, axis=1) def _extract_level_codes(self, target) -> np.ndarray: """ @@ -757,7 +785,7 @@ cdef class BaseMultiIndexCodesEngine: codes[codes > 0] += 1 if self.level_has_nans[i]: codes[target.codes[i] == -1] += 1 - return self._codes_to_ints(np.array(level_codes, dtype="uint64").T) + return self._codes_to_ints(np.array(level_codes, dtype=self._codes_dtype).T) def get_indexer(self, target: np.ndarray) -> np.ndarray: """ @@ -788,7 +816,7 @@ cdef class BaseMultiIndexCodesEngine: raise KeyError(key) # Transform indices into single integer: - lab_int = self._codes_to_ints(np.array(indices, dtype="uint64")) + lab_int = self._codes_to_ints(np.array(indices, dtype=self._codes_dtype)) return self._base.get_loc(self, lab_int) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c8e16fad00d5b..a5bcf49c5490b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -123,84 +123,56 @@ ) -class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): - """ - This class manages a MultiIndex by mapping label combinations to positive - integers. +class MultiIndexUInt64Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): + """Manages a MultiIndex by mapping label combinations to positive integers. + + The number of possible label combinations must not overflow the 64 bits integers. """ _base = libindex.UInt64Engine + _codes_dtype = "uint64" - def _codes_to_ints(self, codes): - """ - Transform combination(s) of uint64 in one uint64 (each), in a strictly - monotonic way (i.e. respecting the lexicographic order of integer - combinations): see BaseMultiIndexCodesEngine documentation. 
- Parameters - ---------- - codes : 1- or 2-dimensional array of dtype uint64 - Combinations of integers (one per row) +class MultiIndexUInt32Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt32Engine): + """Manages a MultiIndex by mapping label combinations to positive integers. - Returns - ------- - scalar or 1-dimensional array, of dtype uint64 - Integer(s) representing one combination (each). - """ - # Shift the representation of each level by the pre-calculated number - # of bits: - codes <<= self.offsets + The number of possible label combinations must not overflow the 32 bits integers. + """ - # Now sum and OR are in fact interchangeable. This is a simple - # composition of the (disjunct) significant bits of each level (i.e. - # each column in "codes") in a single positive integer: - if codes.ndim == 1: - # Single key - return np.bitwise_or.reduce(codes) + _base = libindex.UInt32Engine + _codes_dtype = "uint32" - # Multiple keys - return np.bitwise_or.reduce(codes, axis=1) +class MultiIndexUInt16Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt16Engine): + """Manages a MultiIndex by mapping label combinations to positive integers. -class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine): - """ - This class manages those (extreme) cases in which the number of possible - label combinations overflows the 64 bits integers, and uses an ObjectEngine - containing Python integers. + The number of possible label combinations must not overflow the 16 bits integers. """ - _base = libindex.ObjectEngine + _base = libindex.UInt16Engine + _codes_dtype = "uint16" - def _codes_to_ints(self, codes): - """ - Transform combination(s) of uint64 in one Python integer (each), in a - strictly monotonic way (i.e. respecting the lexicographic order of - integer combinations): see BaseMultiIndexCodesEngine documentation. 
- Parameters - ---------- - codes : 1- or 2-dimensional array of dtype uint64 - Combinations of integers (one per row) +class MultiIndexUInt8Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt8Engine): + """Manages a MultiIndex by mapping label combinations to positive integers. - Returns - ------- - int, or 1-dimensional array of dtype object - Integer(s) representing one combination (each). - """ - # Shift the representation of each level by the pre-calculated number - # of bits. Since this can overflow uint64, first make sure we are - # working with Python integers: - codes = codes.astype("object") << self.offsets + The number of possible label combinations must not overflow the 8 bits integers. + """ - # Now sum and OR are in fact interchangeable. This is a simple - # composition of the (disjunct) significant bits of each level (i.e. - # each column in "codes") in a single positive integer (per row): - if codes.ndim == 1: - # Single key - return np.bitwise_or.reduce(codes) + _base = libindex.UInt8Engine + _codes_dtype = "uint8" - # Multiple keys - return np.bitwise_or.reduce(codes, axis=1) + +class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine): + """Manages a MultiIndex by mapping label combinations to positive integers. + + This class manages those (extreme) cases in which the number of possible + label combinations overflows the 64 bits integers, and uses an ObjectEngine + containing Python integers. + """ + + _base = libindex.ObjectEngine + _codes_dtype = "object" def names_compat(meth: F) -> F: @@ -1229,13 +1201,25 @@ def _engine(self): # equivalent to sorting lexicographically the codes themselves. 
Notice # that each level needs to be shifted by the number of bits needed to # represent the _previous_ ones: - offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64") + offsets = np.concatenate([lev_bits[1:], [0]]) + # Downcast the type if possible, to prevent upcasting when shifting codes: + offsets = offsets.astype(np.min_scalar_type(int(offsets[0]))) # Check the total number of bits needed for our representation: if lev_bits[0] > 64: # The levels would overflow a 64 bit uint - use Python integers: return MultiIndexPyIntEngine(self.levels, self.codes, offsets) - return MultiIndexUIntEngine(self.levels, self.codes, offsets) + if lev_bits[0] > 32: + # The levels would overflow a 32 bit uint - use uint64 + return MultiIndexUInt64Engine(self.levels, self.codes, offsets) + if lev_bits[0] > 16: + # The levels would overflow a 16 bit uint - use uint32 + return MultiIndexUInt32Engine(self.levels, self.codes, offsets) + if lev_bits[0] > 8: + # The levels would overflow a 8 bit uint - use uint16 + return MultiIndexUInt16Engine(self.levels, self.codes, offsets) + # The levels fit in an 8 bit uint - use uint8 + return MultiIndexUInt8Engine(self.levels, self.codes, offsets) # Return type "Callable[..., MultiIndex]" of "_constructor" incompatible with return # type "Type[MultiIndex]" in supertype "Index" diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 18d64999de496..f08a7625e7f8a 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -919,30 +919,41 @@ def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_id assert result == expected -def test_pyint_engine(): +@pytest.mark.parametrize( + "N, expected_dtype", + [ + (1, "uint8"), # 2*4*N = 8 + (2, "uint16"), # 2*4*N = 16 + (4, "uint32"), # 2*4*N = 32 + (8, "uint64"), # 2*4*N = 64 + (10, "object"), # 2*4*N = 80 + ], +) +def test_pyint_engine(N, expected_dtype): # GH#18519 : when
combinations of codes cannot be represented in 64 # bits, the index underlying the MultiIndex engine works with Python # integers, rather than uint64. - N = 5 keys = [ tuple(arr) for arr in [ - [0] * 10 * N, - [1] * 10 * N, - [2] * 10 * N, - [np.nan] * N + [2] * 9 * N, - [0] * N + [2] * 9 * N, - [np.nan] * N + [2] * 8 * N + [0] * N, + [0] * 4 * N, + [1] * 4 * N, + [np.nan] * N + [0] * 3 * N, + [0] * N + [1] * 3 * N, + [np.nan] * N + [1] * 2 * N + [0] * N, ] ] - # Each level contains 4 elements (including NaN), so it is represented - # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a - # 64 bit engine and truncating the first levels, the fourth and fifth - # keys would collide; if truncating the last levels, the fifth and - # sixth; if rotating bits rather than shifting, the third and fifth. + # Each level contains 3 elements (NaN, 0, 1), and it's represented + # in 2 bits to store 4 possible values (0=notfound, 1=NaN, 2=0, 3=1), for + # a total of 2*N*4 = 80 > 64 bits where N=10 and the number of levels is N*4. + # If we were using a 64 bit engine and truncating the first levels, the + # fourth and fifth keys would collide; if truncating the last levels, the + # fifth and sixth; if rotating bits rather than shifting, the third and fifth. 
+ + index = MultiIndex.from_tuples(keys) + assert index._engine.values.dtype == expected_dtype for idx, key_value in enumerate(keys): - index = MultiIndex.from_tuples(keys) assert index.get_loc(key_value) == idx expected = np.arange(idx + 1, dtype=np.intp) @@ -952,7 +963,7 @@ def test_pyint_engine(): # With missing key: idces = range(len(keys)) expected = np.array([-1] + list(idces), dtype=np.intp) - missing = tuple([0, 1] * 5 * N) + missing = tuple([0, 1, 0, 1] * N) result = index.get_indexer([missing] + [keys[i] for i in idces]) tm.assert_numpy_array_equal(result, expected) From 59f6a3373751bd6b8e257066e33fdc3c618030ea Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 30 Apr 2024 22:02:25 +0200 Subject: [PATCH 088/100] BUG: hashing read only object categories raises (#58481) --- pandas/_libs/hashing.pyx | 3 ++- pandas/tests/arrays/categorical/test_algos.py | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index a9bf784d5f973..a1fd70529efa7 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -11,6 +11,7 @@ import numpy as np from numpy cimport ( import_array, + ndarray, uint8_t, uint64_t, ) @@ -22,7 +23,7 @@ from pandas._libs.util cimport is_nan @cython.boundscheck(False) def hash_object_array( - object[:] arr, str key, str encoding="utf8" + ndarray[object, ndim=1] arr, str key, str encoding="utf8" ) -> np.ndarray[np.uint64]: """ Parameters diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 69c3364c7e98e..a7d0becc30dd9 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -86,3 +86,11 @@ def test_diff(): df = ser.to_frame(name="A") with pytest.raises(TypeError, match=msg): df.diff() + + +def test_hash_read_only_categorical(): + # GH#58481 + idx = pd.Index(pd.Index(["a", "b", "c"], 
dtype="object").values) + cat = pd.CategoricalDtype(idx) + arr = pd.Series(["a", "b"], dtype=cat).values + assert hash(arr.dtype) == hash(arr.dtype) From b804d9efda0392a9817f3350e6a8deb2d3c801a5 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 1 May 2024 22:22:21 +0530 Subject: [PATCH 089/100] DOC: Enforce Numpy Docstring Validation for pandas.Index.get_indexer (#58506) * DOC: add PR07,SA01 in pandas.Index.get_indexer * DOC: remove PR07,SA01 in pandas.Index.get_indexer * DOC: remove PR07,SA01 in pandas.IntervalIndex.get_indexer * DOC: remove PR07,SA01 in pandas.MultiIndex.get_indexer --- ci/code_checks.sh | 3 --- pandas/core/indexes/base.py | 7 +++++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f49bfb1581332..91335719cf303 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -87,7 +87,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.var PR01,RT03,SA01" \ -i "pandas.Grouper PR02" \ -i "pandas.Index PR07" \ - -i "pandas.Index.get_indexer PR07,SA01" \ -i "pandas.Index.get_indexer_for PR01,SA01" \ -i "pandas.Index.get_indexer_non_unique PR07,SA01" \ -i "pandas.Index.get_loc PR07,RT03,SA01" \ @@ -109,7 +108,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.IntervalDtype.subtype SA01" \ -i "pandas.IntervalIndex.closed SA01" \ -i "pandas.IntervalIndex.contains RT03" \ - -i "pandas.IntervalIndex.get_indexer PR07,SA01" \ -i "pandas.IntervalIndex.get_loc PR07,RT03,SA01" \ -i "pandas.IntervalIndex.is_non_overlapping_monotonic SA01" \ -i "pandas.IntervalIndex.left GL08" \ @@ -123,7 +121,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.MultiIndex.copy PR07,RT03,SA01" \ -i "pandas.MultiIndex.drop PR07,RT03,SA01" \ -i "pandas.MultiIndex.dtypes SA01" \ - -i "pandas.MultiIndex.get_indexer PR07,SA01" \ -i "pandas.MultiIndex.get_level_values SA01" \ -i "pandas.MultiIndex.get_loc PR07" \ -i "pandas.MultiIndex.get_loc_level PR07" \ diff 
--git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 054f522e7a37b..46da27e216986 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3543,6 +3543,7 @@ def get_indexer( Parameters ---------- target : Index + An iterable containing the values to be used for computing indexer. method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional * default: exact matches only. * pad / ffill: find the PREVIOUS index value if no exact match. @@ -3570,6 +3571,12 @@ def get_indexer( positions matches the corresponding target values. Missing values in the target are marked by -1. + See Also + -------- + Index.get_indexer_for : Returns an indexer even when non-unique. + Index.get_indexer_non_unique : Returns indexer and masks for new index given + the current index. + Notes ----- Returns -1 for unmatched values, for further explanation see the From 66daafc734873d0dac917f616ccc7e045becf153 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 1 May 2024 22:24:50 +0530 Subject: [PATCH 090/100] DOC: Enforce Numpy Docstring Validation for pandas.Index.get_indexer_non_unique (#58508) * DOC: remove PR07,SA01 in pandas.Index.get_indexer_non_unique * DOC: remove PR07,SA01 in pandas.Index.get_indexer_non_unique --- ci/code_checks.sh | 1 - pandas/core/indexes/base.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 91335719cf303..8081efd008147 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -88,7 +88,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Grouper PR02" \ -i "pandas.Index PR07" \ -i "pandas.Index.get_indexer_for PR01,SA01" \ - -i "pandas.Index.get_indexer_non_unique PR07,SA01" \ -i "pandas.Index.get_loc PR07,RT03,SA01" \ -i "pandas.Index.join PR07,RT03,SA01" \ -i "pandas.Index.names GL08" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 46da27e216986..93be22ca7d5f8 100644 --- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py @@ -5850,6 +5850,7 @@ def _should_fallback_to_positional(self) -> bool: Parameters ---------- target : %(target_klass)s + An iterable containing the values to be used for computing indexer. Returns ------- @@ -5861,6 +5862,12 @@ def _should_fallback_to_positional(self) -> bool: An indexer into the target of the values not found. These correspond to the -1 in the indexer array. + See Also + -------- + Index.get_indexer : Computes indexer and mask for new index given + the current index. + Index.get_indexer_for : Returns an indexer even when non-unique. + Examples -------- >>> index = pd.Index(['c', 'b', 'a', 'b', 'b']) From 7257a89b5e4c6be3eb92c596f4a85956a91de24a Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 1 May 2024 22:28:41 +0530 Subject: [PATCH 091/100] DOC: Enforce Numpy Docstring Validation for pandas.Index.putmask (#58510) * DOC: add PR01,RT03 in pandas.Index.putmask * DOC: remove PR01,RT03 in pandas.Index.putmask --- ci/code_checks.sh | 1 - pandas/core/indexes/base.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 8081efd008147..a3db2559315d0 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -91,7 +91,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.get_loc PR07,RT03,SA01" \ -i "pandas.Index.join PR07,RT03,SA01" \ -i "pandas.Index.names GL08" \ - -i "pandas.Index.putmask PR01,RT03" \ -i "pandas.Index.ravel PR01,RT03" \ -i "pandas.Index.str PR01,SA01" \ -i "pandas.Int16Dtype SA01" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 93be22ca7d5f8..3952503581bba 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5260,9 +5260,19 @@ def putmask(self, mask, value) -> Index: """ Return a new Index of the values set with the mask. 
+ Parameters + ---------- + mask : np.ndarray[bool] + Array of booleans denoting where values in the original + data are not ``NA``. + value : scalar + Scalar value to use to fill holes (e.g. 0). + This value cannot be a list-like. + Returns ------- Index + A new Index of the values set with the mask. See Also -------- From 439526c9d434c520a3a977178949285042b6d773 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 1 May 2024 22:33:12 +0530 Subject: [PATCH 092/100] DOC: Enforce Numpy Docstring Validation for pandas.DataFrame.prod (#58512) * DOC: add RT03 in pandas.DataFrame.prod * DOC: remove RT03 in pandas.DataFrame.prod * DOC: remove RT03 in pandas.DataFrame.product * DOC: add RT03 in pandas.DataFrame.prod --- ci/code_checks.sh | 2 -- pandas/core/frame.py | 68 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a3db2559315d0..10e36be9c3efc 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -75,8 +75,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.median RT03,SA01" \ -i "pandas.DataFrame.min RT03" \ -i "pandas.DataFrame.plot PR02,SA01" \ - -i "pandas.DataFrame.prod RT03" \ - -i "pandas.DataFrame.product RT03" \ -i "pandas.DataFrame.sem PR01,RT03,SA01" \ -i "pandas.DataFrame.skew RT03,SA01" \ -i "pandas.DataFrame.sparse PR01" \ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3d2a6093464a9..8bb0608e0bcd5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11730,7 +11730,6 @@ def sum( return result @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="prod") - @doc(make_doc("prod", ndim=2)) def prod( self, axis: Axis | None = 0, @@ -11739,6 +11738,73 @@ def prod( min_count: int = 0, **kwargs, ) -> Series: + """ + Return the product of the values over the requested axis. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on.
+ For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.prod with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar + To retain the old behavior, pass axis=0 (or do not pass axis). + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present the result will be NA. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or scalar + The product of the values over the requested axis. + + See Also + -------- + Series.sum : Return the sum. + Series.min : Return the minimum. + Series.max : Return the maximum. + Series.idxmin : Return the index of the minimum. + Series.idxmax : Return the index of the maximum. + DataFrame.sum : Return the sum over the requested axis. + DataFrame.min : Return the minimum over the requested axis. + DataFrame.max : Return the maximum over the requested axis. + DataFrame.idxmin : Return the index of the minimum over the requested axis. + DataFrame.idxmax : Return the index of the maximum over the requested axis. + + Examples + -------- + By default, the product of an empty or all-NA Series is ``1`` + + >>> pd.Series([], dtype="float64").prod() + 1.0 + + This can be controlled with the ``min_count`` parameter + + >>> pd.Series([], dtype="float64").prod(min_count=1) + nan + + Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and + empty series identically. 
+ + >>> pd.Series([np.nan]).prod() + 1.0 + + >>> pd.Series([np.nan]).prod(min_count=1) + nan + """ result = super().prod( axis=axis, skipna=skipna, From 8cc036ccf1c568e9077a29539edfa15028ab5ec1 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 1 May 2024 22:33:53 +0530 Subject: [PATCH 093/100] DOC: Enforce Numpy Docstring Validation for pandas.DataFrame.skew (#58514) DOC: remove RT03,SA01 in pandas.DataFrame.skew --- ci/code_checks.sh | 1 - pandas/core/frame.py | 75 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 10e36be9c3efc..7e4c7cb527a62 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -76,7 +76,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.min RT03" \ -i "pandas.DataFrame.plot PR02,SA01" \ -i "pandas.DataFrame.sem PR01,RT03,SA01" \ - -i "pandas.DataFrame.skew RT03,SA01" \ -i "pandas.DataFrame.sparse PR01" \ -i "pandas.DataFrame.std PR01,RT03,SA01" \ -i "pandas.DataFrame.sum RT03" \ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8bb0608e0bcd5..88e4d695b8328 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -12095,7 +12095,6 @@ def skew( ) -> Series | Any: ... @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="skew") - @doc(make_doc("skew", ndim=2)) def skew( self, axis: Axis | None = 0, @@ -12103,6 +12102,80 @@ def skew( numeric_only: bool = False, **kwargs, ) -> Series | Any: + """ + Return unbiased skew over requested axis. + + Normalized by N-1. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + For DataFrames, specifying ``axis=None`` will apply the aggregation + across both axes. + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. 
+ numeric_only : bool, default False + Include only float, int, boolean columns. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or scalar + Unbiased skew over requested axis. + + See Also + -------- + DataFrame.kurt : Returns unbiased kurtosis over requested axis. + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.skew() + 0.0 + + With a DataFrame + + >>> df = pd.DataFrame( + ... {"a": [1, 2, 3], "b": [2, 3, 4], "c": [1, 3, 5]}, + ... index=["tiger", "zebra", "cow"], + ... ) + >>> df + a b c + tiger 1 2 1 + zebra 2 3 3 + cow 3 4 5 + >>> df.skew() + a 0.0 + b 0.0 + c 0.0 + dtype: float64 + + Using axis=1 + + >>> df.skew(axis=1) + tiger 1.732051 + zebra -1.732051 + cow 0.000000 + dtype: float64 + + In this case, `numeric_only` should be set to `True` to avoid + getting an error. + + >>> df = pd.DataFrame( + ... {"a": [1, 2, 3], "b": ["T", "Z", "X"]}, index=["tiger", "zebra", "cow"] + ... ) + >>> df.skew(numeric_only=True) + a 0.0 + dtype: float64 + """ result = super().skew( axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) From d27670976c862e1039d954caad0b6388014b694a Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 2 May 2024 01:02:39 +0530 Subject: [PATCH 094/100] DOC: Enforce Numpy Docstring Validation for pandas.Index.get_indexer_for (#58507) * DOC: add PR01,SA01 in pandas.Index.get_indexer_for * DOC: remove PR01,SA01 in pandas.Index.get_indexer_for --- ci/code_checks.sh | 1 - pandas/core/indexes/base.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7e4c7cb527a62..da0d98eff5e46 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -84,7 +84,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.var PR01,RT03,SA01" \ -i "pandas.Grouper PR02" \ -i "pandas.Index PR07" \ - -i "pandas.Index.get_indexer_for PR01,SA01" \ -i "pandas.Index.get_loc PR07,RT03,SA01" \ -i
"pandas.Index.join PR07,RT03,SA01" \ -i "pandas.Index.names GL08" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3952503581bba..048362a28dfd7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5953,11 +5953,23 @@ def get_indexer_for(self, target) -> npt.NDArray[np.intp]: This dispatches to get_indexer or get_indexer_non_unique as appropriate. + Parameters + ---------- + target : Index + An iterable containing the values to be used for computing indexer. + Returns ------- np.ndarray[np.intp] List of indices. + See Also + -------- + Index.get_indexer : Computes indexer and mask for new index given + the current index. + Index.get_indexer_non_unique : Returns indexer and masks for new index given + the current index. + Examples -------- >>> idx = pd.Index([np.nan, "var1", np.nan]) From 33baa453e5a33cf704c9a9fb7e677e438f5fa1dc Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 2 May 2024 01:08:51 +0530 Subject: [PATCH 095/100] DOC: Enforce Numpy Docstring Validation for pandas.DataFrame.sparse (#58515) * DOC: remove PR01 in pandas.DataFrame.sparse * DOC: remove PR01 in pandas.DataFrame.sparse --- ci/code_checks.sh | 1 - pandas/core/arrays/sparse/accessor.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index da0d98eff5e46..8364314ca55be 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -76,7 +76,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.min RT03" \ -i "pandas.DataFrame.plot PR02,SA01" \ -i "pandas.DataFrame.sem PR01,RT03,SA01" \ - -i "pandas.DataFrame.sparse PR01" \ -i "pandas.DataFrame.std PR01,RT03,SA01" \ -i "pandas.DataFrame.sum RT03" \ -i "pandas.DataFrame.swaplevel SA01" \ diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 1f82285e3e40e..6a1c25711acb0 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -243,6
+243,11 @@ class SparseFrameAccessor(BaseAccessor, PandasDelegate): """ DataFrame accessor for sparse data. + Parameters + ---------- + data : scipy.sparse.spmatrix + Must be convertible to csc format. + See Also -------- DataFrame.sparse.density : Ratio of non-sparse points to total (dense) data points. From 7320430af7d1e19fbb4eb9e447ef270848495729 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 2 May 2024 01:09:28 +0530 Subject: [PATCH 096/100] DOC: Enforce Numpy Docstring Validation for pandas.Int (#58511) * DOC: add SA01 for Int16Dtype,Int32Dtype,Int64Dtype,Int8Dtype * DOC: remove SA01 for Int16Dtype,Int32Dtype,Int64Dtype,Int8Dtype * DOC: remove SA01 for Int16Dtype,Int32Dtype,Int64Dtype,Int8Dtype * DOC: change description to n-bit nullable integer type --- ci/code_checks.sh | 8 -------- pandas/core/arrays/integer.py | 7 +++++++ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 8364314ca55be..cde9f9dd43280 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -88,10 +88,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Index.names GL08" \ -i "pandas.Index.ravel PR01,RT03" \ -i "pandas.Index.str PR01,SA01" \ - -i "pandas.Int16Dtype SA01" \ - -i "pandas.Int32Dtype SA01" \ - -i "pandas.Int64Dtype SA01" \ - -i "pandas.Int8Dtype SA01" \ -i "pandas.Interval PR02" \ -i "pandas.Interval.closed SA01" \ -i "pandas.Interval.left SA01" \ @@ -391,10 +387,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.weekday SA01" \ -i "pandas.Timestamp.weekofyear SA01" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.UInt16Dtype SA01" \ - -i "pandas.UInt32Dtype SA01" \ - -i "pandas.UInt64Dtype SA01" \ - -i "pandas.UInt8Dtype SA01" \ -i "pandas.api.extensions.ExtensionArray SA01" \ -i "pandas.api.extensions.ExtensionArray._accumulate RT03,SA01" \ -i "pandas.api.extensions.ExtensionArray._concat_same_type PR07,SA01" \ diff --git a/pandas/core/arrays/integer.py 
b/pandas/core/arrays/integer.py index 21a9b09227663..f85fbd062b0c3 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -144,6 +144,13 @@ class IntegerArray(NumericArray): ------- None +See Also +-------- +Int8Dtype : 8-bit nullable integer type. +Int16Dtype : 16-bit nullable integer type. +Int32Dtype : 32-bit nullable integer type. +Int64Dtype : 64-bit nullable integer type. + Examples -------- For Int8Dtype: From f6932cb8c538e89d231bbd10b4b422f8b3d41f39 Mon Sep 17 00:00:00 2001 From: iangainey <109095667+iangainey@users.noreply.github.com> Date: Wed, 1 May 2024 17:01:14 -0400 Subject: [PATCH 097/100] REF: Read excel parse refactor (#58497) --- pandas/io/excel/_base.py | 304 +++++++++++++++++++++++---------------- 1 file changed, 178 insertions(+), 126 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 2b35cfa044ae9..6063ac098a4dc 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -780,143 +780,195 @@ def parse( output[asheetname] = DataFrame() continue - is_list_header = False - is_len_one_list_header = False - if is_list_like(header): - assert isinstance(header, Sequence) - is_list_header = True - if len(header) == 1: - is_len_one_list_header = True - - if is_len_one_list_header: - header = cast(Sequence[int], header)[0] - - # forward fill and pull out names for MultiIndex column - header_names = None - if header is not None and is_list_like(header): - assert isinstance(header, Sequence) - - header_names = [] - control_row = [True] * len(data[0]) - - for row in header: - if is_integer(skiprows): - assert isinstance(skiprows, int) - row += skiprows - - if row > len(data) - 1: - raise ValueError( - f"header index {row} exceeds maximum index " - f"{len(data) - 1} of data.", - ) - - data[row], control_row = fill_mi_header(data[row], control_row) - - if index_col is not None: - header_name, _ = pop_header_name(data[row], index_col) - header_names.append(header_name) - - # If there is a 
MultiIndex header and an index then there is also - # a row containing just the index name(s) - has_index_names = False - if is_list_header and not is_len_one_list_header and index_col is not None: - index_col_list: Sequence[int] - if isinstance(index_col, int): - index_col_list = [index_col] - else: - assert isinstance(index_col, Sequence) - index_col_list = index_col - - # We have to handle mi without names. If any of the entries in the data - # columns are not empty, this is a regular row - assert isinstance(header, Sequence) - if len(header) < len(data): - potential_index_names = data[len(header)] - potential_data = [ - x - for i, x in enumerate(potential_index_names) - if not control_row[i] and i not in index_col_list - ] - has_index_names = all(x == "" or x is None for x in potential_data) - - if is_list_like(index_col): - # Forward fill values for MultiIndex index. - if header is None: - offset = 0 - elif isinstance(header, int): - offset = 1 + header - else: - offset = 1 + max(header) + output = self._parse_sheet( + data=data, + output=output, + asheetname=asheetname, + header=header, + names=names, + index_col=index_col, + usecols=usecols, + dtype=dtype, + skiprows=skiprows, + nrows=nrows, + true_values=true_values, + false_values=false_values, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + date_format=date_format, + thousands=thousands, + decimal=decimal, + comment=comment, + skipfooter=skipfooter, + dtype_backend=dtype_backend, + **kwds, + ) - # GH34673: if MultiIndex names present and not defined in the header, - # offset needs to be incremented so that forward filling starts - # from the first MI value instead of the name - if has_index_names: - offset += 1 + if last_sheetname is None: + raise ValueError("Sheet name is an empty list") - # Check if we have an empty dataset - # before trying to collect data. 
- if offset < len(data): - assert isinstance(index_col, Sequence) + if ret_dict: + return output + else: + return output[last_sheetname] - for col in index_col: - last = data[offset][col] + def _parse_sheet( + self, + data: list, + output: dict, + asheetname: str | int | None = None, + header: int | Sequence[int] | None = 0, + names: SequenceNotStr[Hashable] | range | None = None, + index_col: int | Sequence[int] | None = None, + usecols=None, + dtype: DtypeArg | None = None, + skiprows: Sequence[int] | int | Callable[[int], object] | None = None, + nrows: int | None = None, + true_values: Iterable[Hashable] | None = None, + false_values: Iterable[Hashable] | None = None, + na_values=None, + parse_dates: list | dict | bool = False, + date_parser: Callable | lib.NoDefault = lib.no_default, + date_format: dict[Hashable, str] | str | None = None, + thousands: str | None = None, + decimal: str = ".", + comment: str | None = None, + skipfooter: int = 0, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + **kwds, + ): + is_list_header = False + is_len_one_list_header = False + if is_list_like(header): + assert isinstance(header, Sequence) + is_list_header = True + if len(header) == 1: + is_len_one_list_header = True + + if is_len_one_list_header: + header = cast(Sequence[int], header)[0] + + # forward fill and pull out names for MultiIndex column + header_names = None + if header is not None and is_list_like(header): + assert isinstance(header, Sequence) + + header_names = [] + control_row = [True] * len(data[0]) + + for row in header: + if is_integer(skiprows): + assert isinstance(skiprows, int) + row += skiprows + + if row > len(data) - 1: + raise ValueError( + f"header index {row} exceeds maximum index " + f"{len(data) - 1} of data.", + ) - for row in range(offset + 1, len(data)): - if data[row][col] == "" or data[row][col] is None: - data[row][col] = last - else: - last = data[row][col] + data[row], control_row = fill_mi_header(data[row], control_row) - 
# GH 12292 : error when read one empty column from excel file - try: - parser = TextParser( - data, - names=names, - header=header, - index_col=index_col, - has_index_names=has_index_names, - dtype=dtype, - true_values=true_values, - false_values=false_values, - skiprows=skiprows, - nrows=nrows, - na_values=na_values, - skip_blank_lines=False, # GH 39808 - parse_dates=parse_dates, - date_parser=date_parser, - date_format=date_format, - thousands=thousands, - decimal=decimal, - comment=comment, - skipfooter=skipfooter, - usecols=usecols, - dtype_backend=dtype_backend, - **kwds, - ) + if index_col is not None: + header_name, _ = pop_header_name(data[row], index_col) + header_names.append(header_name) - output[asheetname] = parser.read(nrows=nrows) + # If there is a MultiIndex header and an index then there is also + # a row containing just the index name(s) + has_index_names = False + if is_list_header and not is_len_one_list_header and index_col is not None: + index_col_list: Sequence[int] + if isinstance(index_col, int): + index_col_list = [index_col] + else: + assert isinstance(index_col, Sequence) + index_col_list = index_col + + # We have to handle mi without names. If any of the entries in the data + # columns are not empty, this is a regular row + assert isinstance(header, Sequence) + if len(header) < len(data): + potential_index_names = data[len(header)] + potential_data = [ + x + for i, x in enumerate(potential_index_names) + if not control_row[i] and i not in index_col_list + ] + has_index_names = all(x == "" or x is None for x in potential_data) + + if is_list_like(index_col): + # Forward fill values for MultiIndex index. 
+ if header is None: + offset = 0 + elif isinstance(header, int): + offset = 1 + header + else: + offset = 1 + max(header) + + # GH34673: if MultiIndex names present and not defined in the header, + # offset needs to be incremented so that forward filling starts + # from the first MI value instead of the name + if has_index_names: + offset += 1 + + # Check if we have an empty dataset + # before trying to collect data. + if offset < len(data): + assert isinstance(index_col, Sequence) + + for col in index_col: + last = data[offset][col] + + for row in range(offset + 1, len(data)): + if data[row][col] == "" or data[row][col] is None: + data[row][col] = last + else: + last = data[row][col] + + # GH 12292 : error when read one empty column from excel file + try: + parser = TextParser( + data, + names=names, + header=header, + index_col=index_col, + has_index_names=has_index_names, + dtype=dtype, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + skip_blank_lines=False, # GH 39808 + parse_dates=parse_dates, + date_parser=date_parser, + date_format=date_format, + thousands=thousands, + decimal=decimal, + comment=comment, + skipfooter=skipfooter, + usecols=usecols, + dtype_backend=dtype_backend, + **kwds, + ) - if header_names: - output[asheetname].columns = output[asheetname].columns.set_names( - header_names - ) + output[asheetname] = parser.read(nrows=nrows) - except EmptyDataError: - # No Data, return an empty DataFrame - output[asheetname] = DataFrame() + if header_names: + output[asheetname].columns = output[asheetname].columns.set_names( + header_names + ) - except Exception as err: - err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:]) - raise err + except EmptyDataError: + # No Data, return an empty DataFrame + output[asheetname] = DataFrame() - if last_sheetname is None: - raise ValueError("Sheet name is an empty list") + except Exception as err: + err.args = (f"{err.args[0]} (sheet: 
{asheetname})", *err.args[1:]) + raise err - if ret_dict: - return output - else: - return output[last_sheetname] + return output @doc(storage_options=_shared_docs["storage_options"]) From 564d0d9e04970538951e307911f0af2c44414841 Mon Sep 17 00:00:00 2001 From: undermyumbrella1 <120079323+undermyumbrella1@users.noreply.github.com> Date: Thu, 2 May 2024 05:05:26 +0800 Subject: [PATCH 098/100] BUG: as_index=False can return a MultiIndex in groupby.apply (#58369) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/groupby/groupby.py | 5 +---- pandas/tests/groupby/methods/test_value_counts.py | 9 +++------ pandas/tests/groupby/test_apply.py | 2 +- pandas/tests/groupby/test_apply_mutate.py | 4 +--- pandas/tests/groupby/test_groupby.py | 9 ++++----- 6 files changed, 11 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ce9022bdc2967..9e7349a061295 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -449,6 +449,7 @@ Groupby/resample/rolling - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) +- Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. 
(:issue:`58291`) Reshaping diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 79d9f49a3b355..f44ef8c4dbbfa 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1202,10 +1202,7 @@ def _concat_objects( sort=False, ) else: - # GH5610, returns a MI, with the first level being a - # range index - keys = RangeIndex(len(values)) - result = concat(values, axis=0, keys=keys) + result = concat(values, axis=0) elif not not_indexed_same: result = concat(values, axis=0) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index be52b4a591c26..0f136b06c782a 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -329,13 +329,10 @@ def test_against_frame_and_seriesgroupby( else: name = "proportion" if normalize else "count" expected = expected.reset_index().rename({0: name}, axis=1) - if groupby == "column": - expected = expected.rename({"level_0": "country"}, axis=1) - expected["country"] = np.where(expected["country"], "US", "FR") - elif groupby == "function": - expected["level_0"] = expected["level_0"] == 1 + if groupby in ["array", "function"] and (not as_index and frame): + expected.insert(loc=0, column="level_0", value=result["level_0"]) else: - expected["level_0"] = np.where(expected["level_0"], "US", "FR") + expected.insert(loc=0, column="country", value=result["country"]) tm.assert_frame_equal(result, expected) else: # compare against SeriesGroupBy value_counts diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 1a2589fe94ea5..e27c782c1bdcf 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -315,7 +315,7 @@ def test_groupby_as_index_apply(): # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here - exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 
2), (1, 1), (2, 4)]) + exp_not_as_apply = Index([0, 2, 1, 4]) tp = [(1, 0), (1, 2), (2, 1), (3, 4)] exp_as_apply = MultiIndex.from_tuples(tp, names=["user_id", None]) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py index e5028884e992b..fa20efad4da77 100644 --- a/pandas/tests/groupby/test_apply_mutate.py +++ b/pandas/tests/groupby/test_apply_mutate.py @@ -90,9 +90,7 @@ def fn(x): result = df.groupby(["col1"], as_index=False).apply(fn) expected = pd.Series( [1, 2, 0, 4, 5, 0], - index=pd.MultiIndex.from_tuples( - [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)] - ), + index=range(6), name="col2", ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 54d7895691f3f..d50fea459552a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -113,8 +113,9 @@ def f(x, q=None, axis=0): expected_seq = df_grouped.quantile([0.4, 0.8]) if not as_index: # apply treats the op as a transform; .quantile knows it's a reduction - apply_result = apply_result.reset_index() - apply_result["level_0"] = [1, 1, 2, 2] + apply_result.index = range(4) + apply_result.insert(loc=0, column="level_0", value=[1, 1, 2, 2]) + apply_result.insert(loc=1, column="level_1", value=[0.4, 0.8, 0.4, 0.8]) tm.assert_frame_equal(apply_result, expected_seq, check_names=False) agg_result = df_grouped.agg(f, q=80) @@ -519,9 +520,7 @@ def test_as_index_select_column(): result = df.groupby("A", as_index=False, group_keys=True)["B"].apply( lambda x: x.cumsum() ) - expected = Series( - [2, 6, 6], name="B", index=MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)]) - ) + expected = Series([2, 6, 6], name="B", index=range(3)) tm.assert_series_equal(result, expected) From 9250bf7829adeac62461dd7aed4d1b2cb790a35d Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 2 May 2024 04:07:09 +0530 Subject: [PATCH 099/100] DOC: Enforce Numpy Docstring Validation 
for pandas.DataFrame.sem (#58513) * DOC: add PR01,RT03,SA01 in pandas.DataFrame.sem * DOC: remove PR01,RT03,SA01 in pandas.DataFrame.sem --- ci/code_checks.sh | 1 - pandas/core/frame.py | 71 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 70 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index cde9f9dd43280..43c80cf80d487 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -75,7 +75,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.median RT03,SA01" \ -i "pandas.DataFrame.min RT03" \ -i "pandas.DataFrame.plot PR02,SA01" \ - -i "pandas.DataFrame.sem PR01,RT03,SA01" \ -i "pandas.DataFrame.std PR01,RT03,SA01" \ -i "pandas.DataFrame.sum RT03" \ -i "pandas.DataFrame.swaplevel SA01" \ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 88e4d695b8328..96943eb71c7bd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11945,7 +11945,6 @@ def sem( ) -> Series | Any: ... @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sem") - @doc(make_doc("sem", ndim=2)) def sem( self, axis: Axis | None = 0, @@ -11954,6 +11953,76 @@ def sem( numeric_only: bool = False, **kwargs, ) -> Series | Any: + """ + Return unbiased standard error of the mean over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument + + Parameters + ---------- + axis : {index (0), columns (1)} + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.sem with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar + To retain the old behavior, pass axis=0 (or do not pass axis). + + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. 
+ numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + **kwargs : + Additional keywords passed. + + Returns + ------- + Series or DataFrame (if level specified) + Unbiased standard error of the mean over requested axis. + + See Also + -------- + DataFrame.var : Return unbiased variance over requested axis. + DataFrame.std : Returns sample standard deviation over requested axis. + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.sem().round(6) + 0.57735 + + With a DataFrame + + >>> df = pd.DataFrame({"a": [1, 2], "b": [2, 3]}, index=["tiger", "zebra"]) + >>> df + a b + tiger 1 2 + zebra 2 3 + >>> df.sem() + a 0.5 + b 0.5 + dtype: float64 + + Using axis=1 + + >>> df.sem(axis=1) + tiger 0.5 + zebra 0.5 + dtype: float64 + + In this case, `numeric_only` should be set to `True` + to avoid getting an error. + + >>> df = pd.DataFrame({"a": [1, 2], "b": ["T", "Z"]}, index=["tiger", "zebra"]) + >>> df.sem(numeric_only=True) + a 0.5 + dtype: float64 + """ result = super().sem( axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs ) From 9110f7cdac46c69212512635a7fdc99963540c30 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 2 May 2024 04:07:33 +0530 Subject: [PATCH 100/100] DOC: Enforce Numpy Docstring Validation for pandas.Index.get_loc (#58509) * DOC: add PR07,RT03,SA01 in pandas.Index.get_loc * DOC: remove PR07,RT03,SA01 in pandas.Index.get_loc --- ci/code_checks.sh | 1 - pandas/core/indexes/base.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 43c80cf80d487..996f361e9440f 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -82,7 +82,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.DataFrame.var PR01,RT03,SA01" \ -i "pandas.Grouper PR02" \ -i "pandas.Index PR07" \ - -i "pandas.Index.get_loc PR07,RT03,SA01" \ -i "pandas.Index.join PR07,RT03,SA01" \ -i "pandas.Index.names GL08" 
\ -i "pandas.Index.ravel PR01,RT03" \ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 048362a28dfd7..e93db22906b39 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3490,10 +3490,22 @@ def get_loc(self, key): Parameters ---------- key : label + The label whose location in the index is looked up. Returns ------- int if unique index, slice if monotonic index, else mask + Integer location, slice or boolean mask. + + See Also + -------- + Index.get_slice_bound : Calculate slice bound that corresponds to + given label. + Index.get_indexer : Computes indexer and mask for new index given + the current index. + Index.get_indexer_non_unique : Returns indexer and masks for new index given + the current index. + Index.get_indexer_for : Returns an indexer even when non-unique. Examples --------