diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d1cdff8f7f56b..996f361e9440f 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -70,125 +70,22 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then --format=actions \ -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ - -i "pandas.Categorical.__array__ SA01" \ - -i "pandas.Categorical.codes SA01" \ - -i "pandas.Categorical.dtype SA01" \ - -i "pandas.Categorical.from_codes SA01" \ - -i "pandas.Categorical.ordered SA01" \ - -i "pandas.CategoricalDtype.categories SA01" \ - -i "pandas.CategoricalDtype.ordered SA01" \ - -i "pandas.CategoricalIndex.codes SA01" \ - -i "pandas.CategoricalIndex.ordered SA01" \ - -i "pandas.DataFrame.__dataframe__ SA01" \ - -i "pandas.DataFrame.__iter__ SA01" \ - -i "pandas.DataFrame.at_time PR01" \ - -i "pandas.DataFrame.columns SA01" \ - -i "pandas.DataFrame.droplevel SA01" \ - -i "pandas.DataFrame.hist RT03" \ - -i "pandas.DataFrame.infer_objects RT03" \ - -i "pandas.DataFrame.kurt RT03,SA01" \ - -i "pandas.DataFrame.kurtosis RT03,SA01" \ -i "pandas.DataFrame.max RT03" \ -i "pandas.DataFrame.mean RT03,SA01" \ -i "pandas.DataFrame.median RT03,SA01" \ -i "pandas.DataFrame.min RT03" \ -i "pandas.DataFrame.plot PR02,SA01" \ - -i "pandas.DataFrame.pop SA01" \ - -i "pandas.DataFrame.prod RT03" \ - -i "pandas.DataFrame.product RT03" \ - -i "pandas.DataFrame.reorder_levels SA01" \ - -i "pandas.DataFrame.sem PR01,RT03,SA01" \ - -i "pandas.DataFrame.skew RT03,SA01" \ - -i "pandas.DataFrame.sparse PR01" \ -i "pandas.DataFrame.std PR01,RT03,SA01" \ -i "pandas.DataFrame.sum RT03" \ -i "pandas.DataFrame.swaplevel SA01" \ - -i "pandas.DataFrame.to_feather SA01" \ -i "pandas.DataFrame.to_markdown SA01" \ - -i "pandas.DataFrame.to_parquet RT03" \ -i "pandas.DataFrame.var PR01,RT03,SA01" \ - -i "pandas.DatetimeIndex.ceil SA01" \ - -i "pandas.DatetimeIndex.date SA01" \ - -i "pandas.DatetimeIndex.day SA01" \ - -i "pandas.DatetimeIndex.day_of_year SA01" \ - -i "pandas.DatetimeIndex.dayofyear SA01" \ - -i "pandas.DatetimeIndex.floor SA01" \ - -i "pandas.DatetimeIndex.freqstr SA01" \ - -i "pandas.DatetimeIndex.indexer_at_time PR01,RT03" \ - -i "pandas.DatetimeIndex.indexer_between_time RT03" \ - -i "pandas.DatetimeIndex.inferred_freq SA01" \ - -i "pandas.DatetimeIndex.is_leap_year SA01" \ - -i "pandas.DatetimeIndex.microsecond SA01" \ - -i "pandas.DatetimeIndex.nanosecond SA01" \ - -i "pandas.DatetimeIndex.quarter SA01" \ - -i "pandas.DatetimeIndex.round SA01" \ - -i "pandas.DatetimeIndex.snap PR01,RT03,SA01" \ - -i "pandas.DatetimeIndex.std PR01,RT03" \ - -i "pandas.DatetimeIndex.time SA01" \ - -i "pandas.DatetimeIndex.timetz SA01" \ - -i "pandas.DatetimeIndex.to_period RT03" \ - -i "pandas.DatetimeIndex.to_pydatetime RT03,SA01" \ - -i "pandas.DatetimeIndex.tz SA01" \ - -i "pandas.DatetimeIndex.tz_convert RT03" \ - -i "pandas.DatetimeTZDtype SA01" \ - -i "pandas.DatetimeTZDtype.tz SA01" \ - -i "pandas.DatetimeTZDtype.unit SA01" \ -i "pandas.Grouper PR02" \ - -i "pandas.HDFStore.groups SA01" \ - -i "pandas.HDFStore.info RT03,SA01" \ - -i "pandas.HDFStore.keys SA01" \ - -i "pandas.HDFStore.put PR01,SA01" \ - -i "pandas.HDFStore.select SA01" \ - -i "pandas.HDFStore.walk SA01" \ -i "pandas.Index PR07" \ - -i "pandas.Index.T SA01" \ - -i "pandas.Index.append PR07,RT03,SA01" \ - -i "pandas.Index.astype SA01" \ - -i "pandas.Index.copy PR07,SA01" \ - -i "pandas.Index.difference 
PR07,RT03,SA01" \ - -i "pandas.Index.drop PR07,SA01" \ - -i "pandas.Index.drop_duplicates RT03" \ - -i "pandas.Index.droplevel RT03,SA01" \ - -i "pandas.Index.dropna RT03,SA01" \ - -i "pandas.Index.dtype SA01" \ - -i "pandas.Index.duplicated RT03" \ - -i "pandas.Index.empty GL08" \ - -i "pandas.Index.equals SA01" \ - -i "pandas.Index.fillna RT03" \ - -i "pandas.Index.get_indexer PR07,SA01" \ - -i "pandas.Index.get_indexer_for PR01,SA01" \ - -i "pandas.Index.get_indexer_non_unique PR07,SA01" \ - -i "pandas.Index.get_loc PR07,RT03,SA01" \ - -i "pandas.Index.get_slice_bound PR07" \ - -i "pandas.Index.hasnans SA01" \ - -i "pandas.Index.identical PR01,SA01" \ - -i "pandas.Index.inferred_type SA01" \ - -i "pandas.Index.insert PR07,RT03,SA01" \ - -i "pandas.Index.intersection PR07,RT03,SA01" \ - -i "pandas.Index.item SA01" \ -i "pandas.Index.join PR07,RT03,SA01" \ - -i "pandas.Index.map SA01" \ - -i "pandas.Index.memory_usage RT03" \ - -i "pandas.Index.name SA01" \ -i "pandas.Index.names GL08" \ - -i "pandas.Index.nbytes SA01" \ - -i "pandas.Index.nunique RT03" \ - -i "pandas.Index.putmask PR01,RT03" \ -i "pandas.Index.ravel PR01,RT03" \ - -i "pandas.Index.reindex PR07" \ - -i "pandas.Index.slice_indexer PR07,RT03,SA01" \ - -i "pandas.Index.slice_locs RT03" \ -i "pandas.Index.str PR01,SA01" \ - -i "pandas.Index.symmetric_difference PR07,RT03,SA01" \ - -i "pandas.Index.take PR01,PR07" \ - -i "pandas.Index.to_list RT03" \ - -i "pandas.Index.union PR07,RT03,SA01" \ - -i "pandas.Index.unique RT03" \ - -i "pandas.Index.view GL08" \ - -i "pandas.Int16Dtype SA01" \ - -i "pandas.Int32Dtype SA01" \ - -i "pandas.Int64Dtype SA01" \ - -i "pandas.Int8Dtype SA01" \ -i "pandas.Interval PR02" \ -i "pandas.Interval.closed SA01" \ -i "pandas.Interval.left SA01" \ @@ -198,7 +95,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.IntervalDtype.subtype SA01" \ -i "pandas.IntervalIndex.closed SA01" \ -i "pandas.IntervalIndex.contains RT03" \ - -i "pandas.IntervalIndex.get_indexer PR07,SA01" \ -i "pandas.IntervalIndex.get_loc PR07,RT03,SA01" \ -i "pandas.IntervalIndex.is_non_overlapping_monotonic SA01" \ -i "pandas.IntervalIndex.left GL08" \ @@ -211,9 +107,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.MultiIndex.append PR07,SA01" \ -i "pandas.MultiIndex.copy PR07,RT03,SA01" \ -i "pandas.MultiIndex.drop PR07,RT03,SA01" \ - -i "pandas.MultiIndex.droplevel RT03,SA01" \ -i "pandas.MultiIndex.dtypes SA01" \ - -i "pandas.MultiIndex.get_indexer PR07,SA01" \ -i "pandas.MultiIndex.get_level_values SA01" \ -i "pandas.MultiIndex.get_loc PR07" \ -i "pandas.MultiIndex.get_loc_level PR07" \ @@ -252,7 +146,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.PeriodIndex.dayofyear SA01" \ -i "pandas.PeriodIndex.days_in_month SA01" \ -i "pandas.PeriodIndex.daysinmonth SA01" \ - -i "pandas.PeriodIndex.freqstr SA01" \ -i "pandas.PeriodIndex.from_fields PR07,SA01" \ -i "pandas.PeriodIndex.from_ordinals SA01" \ -i "pandas.PeriodIndex.hour SA01" \ @@ -273,10 +166,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.RangeIndex.step SA01" \ -i "pandas.RangeIndex.stop SA01" \ -i "pandas.Series SA01" \ - -i "pandas.Series.T SA01" \ -i "pandas.Series.__iter__ RT03,SA01" \ -i "pandas.Series.add PR07" \ - -i "pandas.Series.at_time PR01" \ -i "pandas.Series.backfill PR01,SA01" \ -i "pandas.Series.case_when RT03" \ -i "pandas.Series.cat PR07,SA01" \ @@ -284,59 +175,43 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.cat.as_ordered PR01" \ -i 
"pandas.Series.cat.as_unordered PR01" \ -i "pandas.Series.cat.codes SA01" \ - -i "pandas.Series.cat.ordered SA01" \ -i "pandas.Series.cat.remove_categories PR01,PR02" \ -i "pandas.Series.cat.remove_unused_categories PR01" \ -i "pandas.Series.cat.rename_categories PR01,PR02" \ -i "pandas.Series.cat.reorder_categories PR01,PR02" \ -i "pandas.Series.cat.set_categories PR01,PR02" \ -i "pandas.Series.div PR07" \ - -i "pandas.Series.droplevel SA01" \ -i "pandas.Series.dt.as_unit PR01,PR02" \ - -i "pandas.Series.dt.ceil PR01,PR02,SA01" \ + -i "pandas.Series.dt.ceil PR01,PR02" \ -i "pandas.Series.dt.components SA01" \ - -i "pandas.Series.dt.date SA01" \ - -i "pandas.Series.dt.day SA01" \ -i "pandas.Series.dt.day_name PR01,PR02" \ - -i "pandas.Series.dt.day_of_year SA01" \ - -i "pandas.Series.dt.dayofyear SA01" \ -i "pandas.Series.dt.days SA01" \ -i "pandas.Series.dt.days_in_month SA01" \ -i "pandas.Series.dt.daysinmonth SA01" \ - -i "pandas.Series.dt.floor PR01,PR02,SA01" \ + -i "pandas.Series.dt.floor PR01,PR02" \ -i "pandas.Series.dt.freq GL08" \ - -i "pandas.Series.dt.is_leap_year SA01" \ - -i "pandas.Series.dt.microsecond SA01" \ -i "pandas.Series.dt.microseconds SA01" \ -i "pandas.Series.dt.month_name PR01,PR02" \ - -i "pandas.Series.dt.nanosecond SA01" \ -i "pandas.Series.dt.nanoseconds SA01" \ -i "pandas.Series.dt.normalize PR01" \ - -i "pandas.Series.dt.quarter SA01" \ -i "pandas.Series.dt.qyear GL08" \ - -i "pandas.Series.dt.round PR01,PR02,SA01" \ + -i "pandas.Series.dt.round PR01,PR02" \ -i "pandas.Series.dt.seconds SA01" \ -i "pandas.Series.dt.strftime PR01,PR02" \ - -i "pandas.Series.dt.time SA01" \ - -i "pandas.Series.dt.timetz SA01" \ - -i "pandas.Series.dt.to_period PR01,PR02,RT03" \ + -i "pandas.Series.dt.to_period PR01,PR02" \ -i "pandas.Series.dt.total_seconds PR01" \ - -i "pandas.Series.dt.tz SA01" \ - -i "pandas.Series.dt.tz_convert PR01,PR02,RT03" \ + -i "pandas.Series.dt.tz_convert PR01,PR02" \ -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.dtype SA01" \ - -i "pandas.Series.empty GL08" \ -i "pandas.Series.eq PR07,SA01" \ -i "pandas.Series.floordiv PR07" \ -i "pandas.Series.ge PR07,SA01" \ -i "pandas.Series.gt PR07,SA01" \ -i "pandas.Series.hasnans SA01" \ - -i "pandas.Series.infer_objects RT03" \ -i "pandas.Series.is_monotonic_decreasing SA01" \ -i "pandas.Series.is_monotonic_increasing SA01" \ -i "pandas.Series.is_unique SA01" \ - -i "pandas.Series.item SA01" \ -i "pandas.Series.kurt RT03,SA01" \ -i "pandas.Series.kurtosis RT03,SA01" \ -i "pandas.Series.le PR07,SA01" \ @@ -351,9 +226,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.mod PR07" \ -i "pandas.Series.mode SA01" \ -i "pandas.Series.mul PR07" \ - -i "pandas.Series.nbytes SA01" \ -i "pandas.Series.ne PR07,SA01" \ - -i "pandas.Series.nunique RT03" \ -i "pandas.Series.pad PR01,SA01" \ -i "pandas.Series.plot PR02,SA01" \ -i "pandas.Series.pop RT03,SA01" \ @@ -416,7 +289,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.swaplevel SA01" \ -i "pandas.Series.to_dict SA01" \ -i "pandas.Series.to_frame SA01" \ - -i "pandas.Series.to_list RT03" \ -i "pandas.Series.to_markdown SA01" \ -i "pandas.Series.to_string SA01" \ -i "pandas.Series.truediv PR07" \ @@ -439,14 +311,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timedelta.total_seconds SA01" \ -i "pandas.Timedelta.view SA01" \ -i "pandas.TimedeltaIndex.as_unit RT03,SA01" \ - -i "pandas.TimedeltaIndex.ceil SA01" \ -i 
"pandas.TimedeltaIndex.components SA01" \ -i "pandas.TimedeltaIndex.days SA01" \ - -i "pandas.TimedeltaIndex.floor SA01" \ - -i "pandas.TimedeltaIndex.inferred_freq SA01" \ -i "pandas.TimedeltaIndex.microseconds SA01" \ -i "pandas.TimedeltaIndex.nanoseconds SA01" \ - -i "pandas.TimedeltaIndex.round SA01" \ -i "pandas.TimedeltaIndex.seconds SA01" \ -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \ -i "pandas.Timestamp PR07,SA01" \ @@ -517,10 +385,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.weekday SA01" \ -i "pandas.Timestamp.weekofyear SA01" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.UInt16Dtype SA01" \ - -i "pandas.UInt32Dtype SA01" \ - -i "pandas.UInt64Dtype SA01" \ - -i "pandas.UInt8Dtype SA01" \ -i "pandas.api.extensions.ExtensionArray SA01" \ -i "pandas.api.extensions.ExtensionArray._accumulate RT03,SA01" \ -i "pandas.api.extensions.ExtensionArray._concat_same_type PR07,SA01" \ diff --git a/doc/source/development/community.rst b/doc/source/development/community.rst index ccf7be8e47748..ab8294b8f135a 100644 --- a/doc/source/development/community.rst +++ b/doc/source/development/community.rst @@ -100,6 +100,8 @@ The pandas mailing list `pandas-dev@python.org `_. + .. _community.slack: Community slack diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 39e279fd5c917..28129440b86d7 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -557,11 +557,12 @@ is being raised, using ``pytest.raises`` instead. Testing a warning ^^^^^^^^^^^^^^^^^ -Use ``tm.assert_produces_warning`` as a context manager to check that a block of code raises a warning. +Use ``tm.assert_produces_warning`` as a context manager to check that a block of code raises a warning +and specify the warning message using the ``match`` argument. .. code-block:: python - with tm.assert_produces_warning(DeprecationWarning): + with tm.assert_produces_warning(DeprecationWarning, match="the warning message"): pd.deprecated_function() If a warning should specifically not happen in a block of code, pass ``False`` into the context manager. 
diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index f831723f44931..43da43a983429 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -1908,7 +1908,7 @@ "- Provide an API that is pleasing to use interactively and is \"good enough\" for many tasks\n", "- Provide the foundations for dedicated libraries to build on\n", "\n", - "If you build a great library on top of this, let us know and we'll [link](https://pandas.pydata.org/pandas-docs/stable/ecosystem.html) to it.\n", + "If you build a great library on top of this, let us know and we'll [link](https://pandas.pydata.org/community/ecosystem.html) to it.\n", "\n", "### Subclassing\n", "\n", diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a8653cc76fae1..a3fbd83336858 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -29,6 +29,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) +- :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`) @@ -38,8 +39,11 @@ Other enhancements - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`) - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) +- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as an optional argument, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) +- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`) +- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: @@ -157,6 +161,7 @@ Other API changes - Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`) - pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`) - pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`) +- When comparing the indexes in :func:`testing.assert_series_equal`, ``check_exact`` now defaults to ``True`` if an :class:`Index` is of integer dtype. (:issue:`57386`) ..
--------------------------------------------------------------------------- .. _whatsnew_300.deprecations: @@ -198,6 +203,7 @@ Other Deprecations - Deprecated allowing non-keyword arguments in :meth:`DataFrame.all`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.var`, :meth:`DataFrame.std`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt`, :meth:`Series.all`, :meth:`Series.min`, :meth:`Series.max`, :meth:`Series.sum`, :meth:`Series.prod`, :meth:`Series.mean`, :meth:`Series.median`, :meth:`Series.sem`, :meth:`Series.var`, :meth:`Series.std`, :meth:`Series.skew`, and :meth:`Series.kurt`. (:issue:`57087`) - Deprecated allowing non-keyword arguments in :meth:`Series.to_markdown` except ``buf``. (:issue:`57280`) - Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf``. (:issue:`57280`) +- Deprecated the behavior of :meth:`Series.dt.to_pytimedelta`; in a future version this will return a :class:`Series` containing Python ``datetime.timedelta`` objects instead of an ``ndarray`` of timedeltas, matching the behavior of other :meth:`Series.dt` properties. (:issue:`57463`) - Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`) - @@ -210,15 +216,18 @@ Removal of prior version deprecations/changes - :func:`concat` no longer ignores empty objects when determining output dtypes (:issue:`39122`) - :func:`concat` with all-NA entries no longer ignores the dtype of those entries when determining the result dtype (:issue:`40893`) - :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`) +- :func:`to_datetime` with a ``unit`` specified no longer parses strings into floats, instead parses them the same way as without ``unit`` (:issue:`50735`) - :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will no longer exclude from the result the groupings that do not arise from the input (:issue:`49519`) - :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`) - :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`) - All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`) - All arguments except the first ``path``-like argument in IO writers are now keyword only (:issue:`54229`) +- Changed behavior of :meth:`Series.__getitem__` and :meth:`Series.__setitem__` to always treat integer keys as labels, never as positional, consistent with :class:`DataFrame` behavior (:issue:`50617`) - Disallow allowing logical operations (``||``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g.
``list``, ``tuple``); wrap the objects in :class:`Series`, :class:`Index`, or ``np.array`` first instead (:issue:`52264`) - Disallow automatic casting to object in :class:`Series` logical operations (``&``, ``^``, ``||``) between series with mismatched indexes and dtypes other than ``object`` or ``bool`` (:issue:`52538`) - Disallow calling :meth:`Series.replace` or :meth:`DataFrame.replace` without a ``value`` and with non-dict-like ``to_replace`` (:issue:`33302`) - Disallow constructing a :class:`arrays.SparseArray` with scalar data (:issue:`53039`) +- Disallow indexing an :class:`Index` with a boolean indexer of length zero; it now raises ``ValueError`` (:issue:`55820`) - Disallow non-standard (``np.ndarray``, :class:`Index`, :class:`ExtensionArray`, or :class:`Series`) to :func:`isin`, :func:`unique`, :func:`factorize` (:issue:`52986`) - Disallow passing a pandas type to :meth:`Index.view` (:issue:`55709`) - Disallow units other than "s", "ms", "us", "ns" for datetime64 and timedelta64 dtypes in :func:`array` (:issue:`53817`) @@ -327,12 +336,17 @@ Performance improvements - Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`) - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) - Performance improvement in :meth:`Index.to_frame` returning a :class:`RangeIndex` columns of a :class:`Index` when possible. (:issue:`58018`) +- Performance improvement in :meth:`MultiIndex._engine` by using smaller dtypes if possible (:issue:`58411`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) +- Performance improvement in :meth:`MultiIndex.memory_usage` by ignoring the index engine when it isn't already cached. (:issue:`58385`) - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) - Performance improvement in :meth:`RangeIndex.argmin` and :meth:`RangeIndex.argmax` (:issue:`57823`) - Performance improvement in :meth:`RangeIndex.insert` returning a :class:`RangeIndex` instead of a :class:`Index` when the :class:`RangeIndex` is empty. (:issue:`57833`) - Performance improvement in :meth:`RangeIndex.round` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57824`) +- Performance improvement in :meth:`RangeIndex.searchsorted` (:issue:`58376`) +- Performance improvement in :meth:`RangeIndex.to_numpy` when specifying an ``na_value`` (:issue:`58376`) +- Performance improvement in :meth:`RangeIndex.value_counts` (:issue:`58376`) - Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible.
(:issue:`57445`, :issue:`57752`) @@ -359,6 +373,7 @@ Datetimelike - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`) - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`) - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) +- Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) Timedelta ^^^^^^^^^ @@ -377,6 +392,7 @@ Numeric Conversion ^^^^^^^^^^ +- Bug in :meth:`DataFrame.astype` not casting ``values`` for Arrow-based dictionary dtype correctly (:issue:`58479`) - Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) - Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`) - Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`) @@ -409,6 +425,7 @@ MultiIndex I/O ^^^ - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. (:issue:`57915`) +- Bug in :meth:`DataFrame.to_dict` raising an unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) @@ -429,9 +446,11 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`) - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) +- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index resulting in wrong interpolation (:issue:`21351`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) - Bug in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` that was returning numpy dtype values when input values are pyarrow dtype values, instead of returning pyarrow dtype values. (:issue:`53030`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) +- Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of :class:`Index`. (:issue:`58291`) Reshaping @@ -456,8 +475,10 @@ Styler Other ^^^^^ - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) +- Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``.
(:issue:`10239`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) +- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow the use of the ``tan`` function. (:issue:`55091`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) @@ -466,6 +487,7 @@ Other - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) +- Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) .. ***DO NOT USE THIS SECTION*** diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index a9bf784d5f973..a1fd70529efa7 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -11,6 +11,7 @@ import numpy as np from numpy cimport ( import_array, + ndarray, uint8_t, uint64_t, ) @@ -22,7 +23,7 @@ from pandas._libs.util cimport is_nan @cython.boundscheck(False) def hash_object_array( - object[:] arr, str key, str encoding="utf8" + ndarray[object, ndim=1] arr, str key, str encoding="utf8" ) -> np.ndarray[np.uint64]: """ Parameters diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 12a5bf245977e..bf6d8ba8973d3 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -74,13 +74,13 @@ class MaskedBoolEngine(MaskedUInt8Engine): ... class BaseMultiIndexCodesEngine: levels: list[np.ndarray] - offsets: np.ndarray # ndarray[uint64_t, ndim=1] + offsets: np.ndarray # np.ndarray[..., ndim=1] def __init__( self, levels: list[Index], # all entries hashable labels: list[np.ndarray], # all entries integer-dtyped - offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1] + offsets: np.ndarray, # np.ndarray[..., ndim=1] ) -> None: ... def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ... def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ... diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index a700074d46ba8..f1be8d97c71eb 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -9,7 +9,6 @@ from numpy cimport ( intp_t, ndarray, uint8_t, - uint64_t, ) cnp.import_array() @@ -699,8 +698,7 @@ cdef class BaseMultiIndexCodesEngine: Keys are located by first locating each component against the respective level, then locating (the integer representation of) codes. """ - def __init__(self, object levels, object labels, - ndarray[uint64_t, ndim=1] offsets): + def __init__(self, object levels, object labels, ndarray offsets): """ Parameters ---------- @@ -708,7 +706,7 @@ Levels of the MultiIndex.
labels : list-like of numpy arrays of integer dtype Labels of the MultiIndex. - offsets : numpy array of uint64 dtype + offsets : numpy array of int dtype Pre-calculated offsets, one for each level of the index. """ self.levels = levels @@ -718,8 +716,9 @@ cdef class BaseMultiIndexCodesEngine: # with positive integers (-1 for NaN becomes 1). This enables us to # differentiate between values that are missing in other and matching # NaNs. We will set values that are not found to 0 later: - labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift - codes = labels_arr.astype("uint64", copy=False) + codes = np.array(labels).T + codes += multiindex_nulls_shift # inplace sum optimisation + self.level_has_nans = [-1 in lab for lab in labels] # Map each codes combination in the index to an integer unambiguously @@ -731,8 +730,37 @@ cdef class BaseMultiIndexCodesEngine: # integers representing labels: we will use its get_loc and get_indexer self._base.__init__(self, lab_ints) - def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray: - raise NotImplementedError("Implemented by subclass") # pragma: no cover + def _codes_to_ints(self, ndarray codes) -> np.ndarray: + """ + Transform combination(s) of uints into one uint or Python integer (each), in a + strictly monotonic way (i.e. respecting the lexicographic order of integer + combinations). + + Parameters + ---------- + codes : 1- or 2-dimensional array of dtype uint + Combinations of integers (one per row) + + Returns + ------- + scalar or 1-dimensional array, of dtype _codes_dtype + Integer(s) representing one combination (each). + """ + # To avoid overflows, first make sure we are working with the right dtype: + codes = codes.astype(self._codes_dtype, copy=False) + + # Shift the representation of each level by the pre-calculated number of bits: + codes <<= self.offsets # inplace shift optimisation + + # Now sum and OR are in fact interchangeable. This is a simple + # composition of the (disjunct) significant bits of each level (i.e.
+ # each column in "codes") in a single positive integer (per row): + if codes.ndim == 1: + # Single key + return np.bitwise_or.reduce(codes) + + # Multiple keys + return np.bitwise_or.reduce(codes, axis=1) def _extract_level_codes(self, target) -> np.ndarray: """ @@ -757,7 +785,7 @@ cdef class BaseMultiIndexCodesEngine: codes[codes > 0] += 1 if self.level_has_nans[i]: codes[target.codes[i] == -1] += 1 - return self._codes_to_ints(np.array(level_codes, dtype="uint64").T) + return self._codes_to_ints(np.array(level_codes, dtype=self._codes_dtype).T) def get_indexer(self, target: np.ndarray) -> np.ndarray: """ @@ -788,7 +816,7 @@ cdef class BaseMultiIndexCodesEngine: raise KeyError(key) # Transform indices into single integer: - lab_int = self._codes_to_ints(np.array(indices, dtype="uint64")) + lab_int = self._codes_to_ints(np.array(indices, dtype=self._codes_dtype)) return self._base.get_loc(self, lab_int) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7aa1cb715521e..4fd68a1593e49 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -477,7 +477,7 @@ def has_infs(const floating[:] arr) -> bool: @cython.boundscheck(False) @cython.wraparound(False) -def has_only_ints_or_nan(floating[:] arr) -> bool: +def has_only_ints_or_nan(const floating[:] arr) -> bool: cdef: floating val intp_t i @@ -631,7 +631,7 @@ ctypedef fused int6432_t: @cython.wraparound(False) @cython.boundscheck(False) -def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n) -> bool: +def is_range_indexer(const int6432_t[:] left, Py_ssize_t n) -> bool: """ Perform an element by element comparison on 1-d integer arrays, meant for indexer comparisons @@ -652,7 +652,7 @@ def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n) -> bool: @cython.wraparound(False) @cython.boundscheck(False) -def is_sequence_range(ndarray[int6432_t, ndim=1] sequence, int64_t step) -> bool: +def is_sequence_range(const int6432_t[:] sequence, int64_t step) -> bool: """ Check if sequence is equivalent to a range with the specified step. 
""" @@ -2628,7 +2628,11 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True break elif val is C_NA: - seen.object_ = True + if convert_to_nullable_dtype: + seen.null_ = True + mask[i] = True + else: + seen.object_ = True continue else: seen.object_ = True @@ -2691,6 +2695,12 @@ def maybe_convert_objects(ndarray[object] objects, dtype = StringDtype(storage="pyarrow_numpy") return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype() + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + seen.object_ = True elif seen.interval_: if is_interval_array(objects): @@ -2734,12 +2744,12 @@ def maybe_convert_objects(ndarray[object] objects, return objects if seen.bool_: - if seen.is_bool: - # is_bool property rules out everything else - return bools.view(np.bool_) - elif convert_to_nullable_dtype and seen.is_bool_or_na: + if convert_to_nullable_dtype and seen.is_bool_or_na: from pandas.core.arrays import BooleanArray return BooleanArray(bools.view(np.bool_), mask) + elif seen.is_bool: + # is_bool property rules out everything else + return bools.view(np.bool_) seen.object_ = True if not seen.object_: @@ -2752,11 +2762,11 @@ def maybe_convert_objects(ndarray[object] objects, result = floats elif seen.int_ or seen.uint_: if convert_to_nullable_dtype: - from pandas.core.arrays import IntegerArray + # Below we will wrap in IntegerArray if seen.uint_: - result = IntegerArray(uints, mask) + result = uints else: - result = IntegerArray(ints, mask) + result = ints else: result = floats elif seen.nan_: @@ -2771,7 +2781,6 @@ def maybe_convert_objects(ndarray[object] objects, result = uints else: result = ints - else: # don't cast int to float, etc. if seen.null_: @@ -2794,6 +2803,22 @@ def maybe_convert_objects(ndarray[object] objects, else: result = ints + # TODO: do these after the itemsize check? 
+ if (result is ints or result is uints) and convert_to_nullable_dtype: + from pandas.core.arrays import IntegerArray + + # Set these values to 1 to be deterministic, match + # IntegerDtype._internal_fill_value + result[mask] = 1 + result = IntegerArray(result, mask) + elif result is floats and convert_to_nullable_dtype: + from pandas.core.arrays import FloatingArray + + # Set these values to 1.0 to be deterministic, match + # FloatingDtype._internal_fill_value + result[mask] = 1.0 + result = FloatingArray(result, mask) + if result is uints or result is ints or result is floats or result is complexes: # cast to the largest itemsize when all values are NumPy scalars if itemsize_max > 0 and itemsize_max != result.dtype.itemsize: diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 21d1405328da6..28ea06739e0c8 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -19,7 +19,7 @@ from pandas._libs.lib cimport c_is_list_like @cython.wraparound(False) @cython.boundscheck(False) -def unstack(numeric_object_t[:, :] values, const uint8_t[:] mask, +def unstack(const numeric_object_t[:, :] values, const uint8_t[:] mask, Py_ssize_t stride, Py_ssize_t length, Py_ssize_t width, numeric_object_t[:, :] new_values, uint8_t[:, :] new_mask) -> None: """ @@ -80,7 +80,7 @@ def unstack(numeric_object_t[:, :] values, const uint8_t[:] mask, @cython.wraparound(False) @cython.boundscheck(False) -def explode(ndarray[object] values): +def explode(object[:] values): """ transform array list-likes to long form preserve non-list entries diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 5a340c1d88bc4..7e3372a80db9d 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -11,11 +11,6 @@ def format_array_from_datetime( na_rep: str | float = ..., reso: int = ..., # NPY_DATETIMEUNIT ) -> npt.NDArray[np.object_]: ... -def array_with_unit_to_datetime( - values: npt.NDArray[np.object_], - unit: str, - errors: str = ..., -) -> tuple[np.ndarray, tzinfo | None]: ... def first_non_null(values: np.ndarray) -> int: ... def array_to_datetime( values: npt.NDArray[np.object_], @@ -24,6 +19,7 @@ def array_to_datetime( yearfirst: bool = ..., utc: bool = ..., creso: int = ..., + unit_for_numerics: str | None = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... # returned ndarray may be object dtype or datetime64[ns] diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index aecf9f2e46bd4..dca3ba0ce49b3 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1,7 +1,3 @@ -import warnings - -from pandas.util._exceptions import find_stack_level - cimport cython from datetime import timezone @@ -234,117 +230,6 @@ def format_array_from_datetime( return result -def array_with_unit_to_datetime( - ndarray[object] values, - str unit, - str errors="coerce" -): - """ - Convert the ndarray to datetime according to the time unit. - - This function converts an array of objects into a numpy array of - datetime64[ns]. It returns the converted array - and also returns the timezone offset - - if errors: - - raise: return converted values or raise OutOfBoundsDatetime - if out of range on the conversion or - ValueError for other conversions (e.g. a string) - - ignore: return non-convertible values as the same unit - - coerce: NaT for non-convertibles - - Parameters - ---------- - values : ndarray - Date-like objects to convert. - unit : str - Time unit to use during conversion. - errors : str, default 'raise' - Error behavior when parsing. 
- - Returns - ------- - result : ndarray of m8 values - tz : parsed timezone offset or None - """ - cdef: - Py_ssize_t i, n=len(values) - bint is_coerce = errors == "coerce" - bint is_raise = errors == "raise" - ndarray[int64_t] iresult - tzinfo tz = None - double fval - - assert is_coerce or is_raise - - if unit == "ns": - result, tz = array_to_datetime( - values.astype(object, copy=False), - errors=errors, - creso=NPY_FR_ns, - ) - return result, tz - - result = np.empty(n, dtype="M8[ns]") - iresult = result.view("i8") - - for i in range(n): - val = values[i] - - try: - if checknull_with_nat_and_na(val): - iresult[i] = NPY_NAT - - elif is_integer_object(val) or is_float_object(val): - - if val != val or val == NPY_NAT: - iresult[i] = NPY_NAT - else: - iresult[i] = cast_from_unit(val, unit) - - elif isinstance(val, str): - if len(val) == 0 or val in nat_strings: - iresult[i] = NPY_NAT - - else: - - try: - fval = float(val) - except ValueError: - raise ValueError( - f"non convertible value {val} with the unit '{unit}'" - ) - warnings.warn( - "The behavior of 'to_datetime' with 'unit' when parsing " - "strings is deprecated. In a future version, strings will " - "be parsed as datetime strings, matching the behavior " - "without a 'unit'. To retain the old behavior, explicitly " - "cast ints or floats to numeric type before calling " - "to_datetime.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - iresult[i] = cast_from_unit(fval, unit) - - else: - # TODO: makes more sense as TypeError, but that would be an - # API change. - raise ValueError( - f"unit='{unit}' not valid with non-numerical val='{val}'" - ) - - except (ValueError, TypeError) as err: - if is_raise: - err.args = (f"{err}, at position {i}",) - raise - else: - # is_coerce - iresult[i] = NPY_NAT - - return result, tz - - @cython.wraparound(False) @cython.boundscheck(False) def first_non_null(values: ndarray) -> int: @@ -376,6 +261,7 @@ cpdef array_to_datetime( bint yearfirst=False, bint utc=False, NPY_DATETIMEUNIT creso=NPY_FR_ns, + str unit_for_numerics=None, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -404,6 +290,7 @@ cpdef array_to_datetime( indicator whether the dates should be UTC creso : NPY_DATETIMEUNIT, default NPY_FR_ns Set to NPY_FR_GENERIC to infer a resolution. + unit_for_numerics : str, default "ns" Returns ------- @@ -434,6 +321,13 @@ cpdef array_to_datetime( abbrev = "ns" else: abbrev = npy_unit_to_abbrev(creso) + + if unit_for_numerics is not None: + # either creso or unit_for_numerics should be passed, not both + assert creso == NPY_FR_ns + else: + unit_for_numerics = abbrev + result = np.empty((values).shape, dtype=f"M8[{abbrev}]") iresult = result.view("i8").ravel() @@ -485,7 +379,8 @@ cpdef array_to_datetime( creso = state.creso # we now need to parse this as if unit=abbrev - iresult[i] = cast_from_unit(val, abbrev, out_reso=creso) + iresult[i] = cast_from_unit(val, unit_for_numerics, out_reso=creso) + state.found_other = True elif isinstance(val, str): diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 3aacd3099c334..543d7944e4c5d 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -861,12 +861,19 @@ def assert_series_equal( check_names : bool, default True Whether to check the Series and Index names attribute. check_exact : bool, default False - Whether to compare number exactly. + Whether to compare number exactly. This also applies when checking + Index equivalence. .. 
versionchanged:: 2.2.0 Defaults to True for integer dtypes if none of ``check_exact``, ``rtol`` and ``atol`` are specified. + + .. versionchanged:: 3.0.0 + + ``check_exact`` for comparing the Indexes defaults to ``True`` if + an Index is of integer dtype. + check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. check_categorical : bool, default True @@ -902,7 +909,6 @@ assert_series_equal( >>> tm.assert_series_equal(a, b) """ __tracebackhide__ = True - check_exact_index = False if check_exact is lib.no_default else check_exact if ( check_exact is lib.no_default and rtol is lib.no_default @@ -914,8 +920,20 @@ or is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype) ) + left_index_dtypes = ( + [left.index.dtype] if left.index.nlevels == 1 else left.index.dtypes + ) + right_index_dtypes = ( + [right.index.dtype] if right.index.nlevels == 1 else right.index.dtypes + ) + check_exact_index = all( + dtype.kind in "iu" for dtype in left_index_dtypes + ) or all(dtype.kind in "iu" for dtype in right_index_dtypes) elif check_exact is lib.no_default: check_exact = False + check_exact_index = False + else: + check_exact_index = check_exact rtol = rtol if rtol is not lib.no_default else 1.0e-5 atol = atol if atol is not lib.no_default else 1.0e-8 diff --git a/pandas/_typing.py b/pandas/_typing.py index 172b30c59fc13..ef68018f2721a 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -314,7 +314,7 @@ def readline(self) -> bytes: ... class WriteExcelBuffer(WriteBuffer[bytes], Protocol): - def truncate(self, size: int | None = ...) -> int: ... + def truncate(self, size: int | None = ..., /) -> int: ... class ReadCsvBuffer(ReadBuffer[AnyStr_co], Protocol): diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py index df6392bf692a2..c58fa0f085266 100644 --- a/pandas/api/typing/__init__.py +++ b/pandas/api/typing/__init__.py @@ -30,6 +30,7 @@ # TODO: Can't import Styler without importing jinja2 # from pandas.io.formats.style import Styler from pandas.io.json._json import JsonReader +from pandas.io.sas.sasreader import SASReader from pandas.io.stata import StataReader __all__ = [ @@ -49,6 +50,7 @@ "RollingGroupby", "SeriesGroupBy", "StataReader", + "SASReader", # See TODO above # "Styler", "TimedeltaIndexResamplerGroupby", diff --git a/pandas/conftest.py b/pandas/conftest.py index 34489bb70575a..21100178262c8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -157,6 +157,7 @@ def pytest_collection_modifyitems(items, config) -> None: ("SeriesGroupBy.fillna", "SeriesGroupBy.fillna is deprecated"), ("SeriesGroupBy.idxmin", "The behavior of Series.idxmin"), ("SeriesGroupBy.idxmax", "The behavior of Series.idxmax"), + ("to_pytimedelta", "The behavior of TimedeltaProperties.to_pytimedelta"), # Docstring divides by zero to show behavior difference ("missing.mask_zero_div_zero", "divide by zero encountered"), ( diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 19ec253e81ef2..d8f948a37d206 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -110,7 +110,9 @@ def len(self) -> Series: from pandas import Series value_lengths = pc.list_value_length(self._pa_array) - return Series(value_lengths, dtype=ArrowDtype(value_lengths.type)) + return Series( + value_lengths, dtype=ArrowDtype(value_lengths.type), index=self._data.index + ) def __getitem__(self, key: int | slice) -> Series: """ @@ -149,7 +151,9 @@
def __getitem__(self, key: int | slice) -> Series: # if key < 0: # key = pc.add(key, pc.list_value_length(self._pa_array)) element = pc.list_element(self._pa_array, key) - return Series(element, dtype=ArrowDtype(element.type)) + return Series( + element, dtype=ArrowDtype(element.type), index=self._data.index + ) elif isinstance(key, slice): if pa_version_under11p0: raise NotImplementedError( @@ -167,7 +171,7 @@ def __getitem__(self, key: int | slice) -> Series: if step is None: step = 1 sliced = pc.list_slice(self._pa_array, start, stop, step) - return Series(sliced, dtype=ArrowDtype(sliced.type)) + return Series(sliced, dtype=ArrowDtype(sliced.type), index=self._data.index) else: raise ValueError(f"key must be an int or slice, got {type(key).__name__}") @@ -195,15 +199,17 @@ def flatten(self) -> Series: ... ) >>> s.list.flatten() 0 1 - 1 2 - 2 3 - 3 3 + 0 2 + 0 3 + 1 3 dtype: int64[pyarrow] """ from pandas import Series - flattened = pc.list_flatten(self._pa_array) - return Series(flattened, dtype=ArrowDtype(flattened.type)) + counts = pa.compute.list_value_length(self._pa_array) + flattened = pa.compute.list_flatten(self._pa_array) + index = self._data.index.repeat(counts.fill_null(pa.scalar(0, counts.type))) + return Series(flattened, dtype=ArrowDtype(flattened.type), index=index) class StructAccessor(ArrowAccessor): diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 1154130b9bed3..0240433cdb683 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -525,6 +525,8 @@ def _box_pa_array( if pa_type is not None and pa_array.type != pa_type: if pa.types.is_dictionary(pa_type): pa_array = pa_array.dictionary_encode() + if pa_array.type != pa_type: + pa_array = pa_array.cast(pa_type) else: try: pa_array = pa_array.cast(pa_type) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 813b10eef5e4b..a326925545045 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -68,6 +68,9 @@ class BooleanDtype(BaseMaskedDtype): name: ClassVar[str] = "boolean" + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = False + # https://github.com/python/mypy/issues/4125 # error: Signature of "type" incompatible with supertype "BaseMaskedDtype" @property @@ -293,13 +296,6 @@ class BooleanArray(BaseMaskedArray): Length: 3, dtype: boolean """ - # The value used to fill '_data' to avoid upcasting - _internal_fill_value = False - # Fill values used for any/all - # Incompatible types in assignment (expression has type "bool", base class - # "BaseMaskedArray" defined the type as "") - _truthy_value = True # type: ignore[assignment] - _falsey_value = False # type: ignore[assignment] _TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"} _FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"} diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 8d6880fc2acb3..11dea697d9b93 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -6,6 +6,7 @@ from shutil import get_terminal_size from typing import ( TYPE_CHECKING, + Callable, Literal, cast, overload, @@ -496,6 +497,11 @@ def dtype(self) -> CategoricalDtype: """ The :class:`~pandas.api.types.CategoricalDtype` for this instance. + See Also + -------- + astype : Cast argument to a specified dtype. + CategoricalDtype : Type for categorical data. 
+ Examples -------- >>> cat = pd.Categorical(["a", "b"], ordered=True) @@ -720,6 +726,11 @@ def from_codes( ------- Categorical + See Also + -------- + codes : The category codes of the categorical. + CategoricalIndex : An Index with an underlying ``Categorical``. + Examples -------- >>> dtype = pd.CategoricalDtype(["a", "b"], ordered=True) @@ -809,6 +820,12 @@ def ordered(self) -> Ordered: """ Whether the categories have an ordered relationship. + See Also + -------- + set_ordered : Set the ordered attribute. + as_ordered : Set the Categorical to be ordered. + as_unordered : Set the Categorical to be unordered. + Examples -------- For :class:`pandas.Series`: @@ -860,6 +877,11 @@ def codes(self) -> np.ndarray: ndarray[int] A non-writable view of the ``codes`` array. + See Also + -------- + Categorical.from_codes : Make a Categorical from codes. + CategoricalIndex : An Index with an underlying ``Categorical``. + Examples -------- For :class:`pandas.Categorical`: @@ -1640,6 +1662,9 @@ def __array__( """ The numpy array interface. + Users should not call this directly. Rather, it is invoked by + :func:`numpy.array` and :func:`numpy.asarray`. + Parameters ---------- dtype : np.dtype or None @@ -1655,6 +1680,10 @@ def __array__( if dtype==None (default), the same dtype as categorical.categories.dtype. + See Also + -------- + numpy.asarray : Convert input to numpy.ndarray. + Examples -------- @@ -2508,6 +2537,28 @@ def equals(self, other: object) -> bool: return np.array_equal(self._codes, other._codes) return False + def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> Self: + func: Callable + if name == "cummin": + func = np.minimum.accumulate + elif name == "cummax": + func = np.maximum.accumulate + else: + raise TypeError(f"Accumulation {name} not supported for {type(self)}") + self.check_for_ordered(name) + + codes = self.codes.copy() + mask = self.isna() + if func == np.minimum.accumulate: + codes[mask] = np.iinfo(codes.dtype.type).max + # no need to change codes for maximum because codes[mask] is already -1 + if not skipna: + mask = np.maximum.accumulate(mask) + + codes = func(codes) + codes[mask] = -1 + return self._simple_new(codes, dtype=self._dtype) + @classmethod def _concat_same_type(cls, to_concat: Sequence[Self], axis: AxisInt = 0) -> Self: from pandas.core.dtypes.concat import union_categoricals diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 8ada9d88e08bc..ab17ae43215d2 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -875,6 +875,11 @@ def freqstr(self) -> str | None: """ Return the frequency object as a string if it's set, otherwise None. + See Also + -------- + DatetimeIndex.inferred_freq : Returns a string representing a frequency + generated by infer_freq. + Examples -------- For DatetimeIndex: @@ -908,6 +913,11 @@ def inferred_freq(self) -> str | None: Returns None if it can't autodetect the frequency. + See Also + -------- + DatetimeIndex.freqstr : Return the frequency object as a string if it's set, + otherwise None. + Examples -------- For DatetimeIndex: @@ -1825,6 +1835,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: ------ ValueError if the `freq` cannot be converted. + See Also + -------- + DatetimeIndex.floor : Perform floor operation on the data to the specified `freq`. + DatetimeIndex.snap : Snap time stamps to nearest occurring frequency. 
+ Notes ----- If the timestamps have a timezone, {op}ing will take place relative to the diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 5d0dfc67bd90a..8747f795bebd8 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -539,7 +539,7 @@ def _unbox_scalar(self, value) -> np.datetime64: if value is NaT: return np.datetime64(value._value, self.unit) else: - return value.as_unit(self.unit).asm8 + return value.as_unit(self.unit, round_ok=False).asm8 def _scalar_from_string(self, value) -> Timestamp | NaTType: return Timestamp(value, tz=self.tz) @@ -593,6 +593,13 @@ def tz(self) -> tzinfo | None: datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None Returns None when the array is tz-naive. + See Also + -------- + DatetimeIndex.tz_localize : Localize tz-naive DatetimeIndex to a + given time zone, or remove timezone from a tz-aware DatetimeIndex. + DatetimeIndex.tz_convert : Convert tz-aware DatetimeIndex from + one time zone to another. + Examples -------- For Series: @@ -860,6 +867,7 @@ def tz_convert(self, tz) -> Self: Returns ------- Array or Index + Datetime Array/Index with target `tz`. Raises ------ @@ -1119,6 +1127,12 @@ def to_pydatetime(self) -> npt.NDArray[np.object_]: Returns ------- numpy.ndarray + An ndarray of ``datetime.datetime`` objects. + + See Also + -------- + DatetimeIndex.to_julian_date : Converts Datetime Array to float64 ndarray + of Julian Dates. Examples -------- @@ -1193,6 +1207,7 @@ def to_period(self, freq=None) -> PeriodArray: Returns ------- PeriodArray/PeriodIndex + Immutable ndarray holding ordinal values at a particular frequency. Raises ------ @@ -1391,6 +1406,14 @@ def time(self) -> npt.NDArray[np.object_]: The time part of the Timestamps. + See Also + -------- + DatetimeIndex.timetz : Returns numpy array of :class:`datetime.time` + objects with timezones. The time part of the Timestamps. + DatetimeIndex.date : Returns numpy array of Python :class:`datetime.date` + objects. Namely, the date part of Timestamps without time and timezone + information. + Examples -------- For Series: @@ -1428,6 +1451,12 @@ def timetz(self) -> npt.NDArray[np.object_]: The time part of the Timestamps. + See Also + -------- + DatetimeIndex.time : Returns numpy array of :class:`datetime.time` objects. + The time part of the Timestamps. + DatetimeIndex.tz : Return the timezone. + Examples -------- For Series: @@ -1462,6 +1491,14 @@ def date(self) -> npt.NDArray[np.object_]: Namely, the date part of Timestamps without time and timezone information. + See Also + -------- + DatetimeIndex.time : Returns numpy array of :class:`datetime.time` objects. + The time part of the Timestamps. + DatetimeIndex.year : The year of the datetime. + DatetimeIndex.month : The month as January=1, December=12. + DatetimeIndex.day : The day of the datetime. + Examples -------- For Series: @@ -1597,6 +1634,12 @@ def isocalendar(self) -> DataFrame: """ The day of the datetime. + See Also + -------- + DatetimeIndex.year: The year of the datetime. + DatetimeIndex.month: The month as January=1, December=12. + DatetimeIndex.hour: The hours of the datetime. + Examples -------- >>> datetime_series = pd.Series( @@ -1706,6 +1749,11 @@ def isocalendar(self) -> DataFrame: """ The microseconds of the datetime. + See Also + -------- + DatetimeIndex.second: The seconds of the datetime. + DatetimeIndex.nanosecond: The nanoseconds of the datetime.
+ Examples -------- >>> datetime_series = pd.Series( @@ -1729,6 +1777,11 @@ def isocalendar(self) -> DataFrame: """ The nanoseconds of the datetime. + See Also + -------- + DatetimeIndex.second: The seconds of the datetime. + DatetimeIndex.microsecond: The microseconds of the datetime. + Examples -------- >>> datetime_series = pd.Series( @@ -1790,6 +1843,11 @@ def isocalendar(self) -> DataFrame: """ The ordinal day of the year. + See Also + -------- + DatetimeIndex.dayofweek : The day of the week with Monday=0, Sunday=6. + DatetimeIndex.day : The day of the datetime. + Examples -------- For Series: @@ -1820,6 +1878,12 @@ def isocalendar(self) -> DataFrame: """ The quarter of the date. + See Also + -------- + DatetimeIndex.snap : Snap time stamps to nearest occurring frequency. + DatetimeIndex.time : Returns numpy array of datetime.time objects. + The time part of the Timestamps. + Examples -------- For Series: @@ -2104,6 +2168,13 @@ def isocalendar(self) -> DataFrame: Series or ndarray Booleans indicating if dates belong to a leap year. + See Also + -------- + DatetimeIndex.is_year_end : Indicate whether the date is the + last day of the year. + DatetimeIndex.is_year_start : Indicate whether the date is the first + day of a year. + Examples -------- This method is available on Series with datetime values under @@ -2184,9 +2255,25 @@ def std( axis : int, optional Axis for the function to be applied on. For :class:`pandas.Series` this parameter is unused and defaults to ``None``. + dtype : dtype, optional, default None + Type to use in computing the standard deviation. For arrays of + integer type the default is float64, for arrays of float types + it is the same as the array type. + out : ndarray, optional, default None + Alternative output array in which to place the result. It must have + the same shape as the expected output but the type (of the + calculated values) will be cast if necessary. ddof : int, default 1 Degrees of Freedom. The divisor used in calculations is `N - ddof`, where `N` represents the number of elements. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left in the + result as dimensions with size one. With this option, the result + will broadcast correctly against the input array. If the default + value is passed, then keepdims will not be passed through to the + std method of sub-classes of ndarray, however any non-default value + will be. If the sub-class method does not implement keepdims any + exceptions will be raised. skipna : bool, default True Exclude NA/null values. If an entire row/column is ``NA``, the result will be ``NA``. @@ -2194,6 +2281,7 @@ def std( Returns ------- Timedelta + Standard deviation over requested axis. See Also -------- diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 653e63e9d1e2d..b3fbf0f92c32d 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -23,6 +23,8 @@ class FloatingDtype(NumericDtype): The attributes name & type are set when these subclasses are created. 
""" + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = np.nan _default_np_dtype = np.dtype(np.float64) _checker = is_float_dtype @@ -113,14 +115,6 @@ class FloatingArray(NumericArray): _dtype_cls = FloatingDtype - # The value used to fill '_data' to avoid upcasting - _internal_fill_value = np.nan - # Fill values used for any/all - # Incompatible types in assignment (expression has type "float", base class - # "BaseMaskedArray" defined the type as "") - _truthy_value = 1.0 # type: ignore[assignment] - _falsey_value = 0.0 # type: ignore[assignment] - _dtype_docstring = """ An ExtensionDtype for {dtype} data. diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index dc453f3e37c50..f85fbd062b0c3 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -23,6 +23,8 @@ class IntegerDtype(NumericDtype): The attributes name & type are set when these subclasses are created. """ + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = 1 _default_np_dtype = np.dtype(np.int64) _checker = is_integer_dtype @@ -128,14 +130,6 @@ class IntegerArray(NumericArray): _dtype_cls = IntegerDtype - # The value used to fill '_data' to avoid upcasting - _internal_fill_value = 1 - # Fill values used for any/all - # Incompatible types in assignment (expression has type "int", base class - # "BaseMaskedArray" defined the type as "") - _truthy_value = 1 # type: ignore[assignment] - _falsey_value = 0 # type: ignore[assignment] - _dtype_docstring = """ An ExtensionDtype for {dtype} integer data. @@ -150,6 +144,13 @@ class IntegerArray(NumericArray): ------- None +See Also +-------- +Int8Dtype : 8-bit nullable integer type. +Int16Dtype : 16-bit nullable integer type. +Int32Dtype : 32-bit nullable integer type. +Int64Dtype : 64-bit nullable integer type. 
+ Examples -------- For Int8Dtype: diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 190888d281ea9..df794183f67d1 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -5,6 +5,7 @@ Any, Callable, Literal, + cast, overload, ) import warnings @@ -16,22 +17,6 @@ missing as libmissing, ) from pandas._libs.tslibs import is_supported_dtype -from pandas._typing import ( - ArrayLike, - AstypeArg, - AxisInt, - DtypeObj, - FillnaOptions, - InterpolateOptions, - NpDtype, - PositionalIndexer, - Scalar, - ScalarIndexer, - Self, - SequenceIndexer, - Shape, - npt, -) from pandas.compat import ( IS64, is_platform_windows, @@ -97,6 +82,20 @@ from pandas._typing import ( NumpySorter, NumpyValueArrayLike, + ArrayLike, + AstypeArg, + AxisInt, + DtypeObj, + FillnaOptions, + InterpolateOptions, + NpDtype, + PositionalIndexer, + Scalar, + ScalarIndexer, + Self, + SequenceIndexer, + Shape, + npt, ) from pandas._libs.missing import NAType from pandas.core.arrays import FloatingArray @@ -111,16 +110,10 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): numpy based """ - # The value used to fill '_data' to avoid upcasting - _internal_fill_value: Scalar # our underlying data and mask are each ndarrays _data: np.ndarray _mask: npt.NDArray[np.bool_] - # Fill values used for any/all - _truthy_value = Scalar # bool(_truthy_value) = True - _falsey_value = Scalar # bool(_falsey_value) = False - @classmethod def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self: result = BaseMaskedArray.__new__(cls) @@ -155,8 +148,9 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: @classmethod @doc(ExtensionArray._empty) def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self: - values = np.empty(shape, dtype=dtype.type) - values.fill(cls._internal_fill_value) + dtype = cast(BaseMaskedDtype, dtype) + values: np.ndarray = np.empty(shape, dtype=dtype.type) + values.fill(dtype._internal_fill_value) mask = np.ones(shape, dtype=bool) result = cls(values, mask) if not isinstance(result, cls) or dtype != result.dtype: @@ -917,7 +911,9 @@ def take( ) -> Self: # we always fill with 1 internally # to avoid upcasting - data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value + data_fill_value = ( + self.dtype._internal_fill_value if isna(fill_value) else fill_value + ) result = take( self._data, indexer, @@ -1397,12 +1393,7 @@ def any( nv.validate_any((), kwargs) values = self._data.copy() - # error: Argument 3 to "putmask" has incompatible type "object"; - # expected "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], - # bool, int, float, complex, str, bytes, - # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" - np.putmask(values, self._mask, self._falsey_value) # type: ignore[arg-type] + np.putmask(values, self._mask, self.dtype._falsey_value) result = values.any() if skipna: return result @@ -1490,12 +1481,7 @@ def all( nv.validate_all((), kwargs) values = self._data.copy() - # error: Argument 3 to "putmask" has incompatible type "object"; - # expected "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], - # bool, int, float, complex, str, bytes, - # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" - np.putmask(values, self._mask, self._truthy_value) # type: ignore[arg-type] + np.putmask(values, self._mask, self.dtype._truthy_value) result = values.all(axis=axis) if skipna: diff --git a/pandas/core/arrays/numeric.py 
b/pandas/core/arrays/numeric.py index fe7b32ec9652e..c5e9ed8698ffe 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -221,7 +221,7 @@ def _coerce_to_data_and_mask( # we copy as need to coerce here if mask.any(): values = values.copy() - values[mask] = cls._internal_fill_value + values[mask] = dtype_cls._internal_fill_value if inferred_type in ("string", "unicode"): # casts from str are always safe since they raise # a ValueError if the str cannot be parsed into a float diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 1f82285e3e40e..6a1c25711acb0 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -243,6 +243,11 @@ class SparseFrameAccessor(BaseAccessor, PandasDelegate): """ DataFrame accessor for sparse data. + Parameters + ---------- + data : scipy.sparse.spmatrix + Must be convertible to csc format. + See Also -------- DataFrame.sparse.density : Ratio of non-sparse points to total (dense) data points. diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 6eb4d234b349d..ff43f97161136 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -322,7 +322,7 @@ def _unbox_scalar(self, value) -> np.timedelta64: if value is NaT: return np.timedelta64(value._value, self.unit) else: - return value.as_unit(self.unit).asm8 + return value.as_unit(self.unit, round_ok=False).asm8 def _scalar_from_string(self, value) -> Timedelta | NaTType: return Timedelta(value) diff --git a/pandas/core/base.py b/pandas/core/base.py index 9b1251a4ef5d8..87e87538ca1d9 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -309,6 +309,10 @@ def transpose(self, *args, **kwargs) -> Self: doc=""" Return the transpose, which is by definition self. + See Also + -------- + Index : Immutable sequence used for indexing and alignment. + Examples -------- For Series: @@ -398,6 +402,11 @@ def item(self): ValueError If the data is not length = 1. + See Also + -------- + Index.values : Returns an array representing the data in the Index. + Series.head : Returns the first `n` rows. + Examples -------- >>> s = pd.Series([1]) @@ -419,6 +428,11 @@ def nbytes(self) -> int: """ Return the number of bytes in the underlying data. + See Also + -------- + Series.ndim : Number of dimensions of the underlying data. + Series.size : Return the number of elements in the underlying data. + Examples -------- For Series: @@ -542,7 +556,6 @@ def array(self) -> ExtensionArray: """ raise AbstractMethodError(self) - @final def to_numpy( self, dtype: npt.DTypeLike | None = None, @@ -654,7 +667,7 @@ def to_numpy( ) values = self._values - if fillna: + if fillna and self.hasnans: if not can_hold_element(values, na_value): # if we can't hold the na_value asarray either makes a copy or we # error before modifying values. The asarray later on thus won't make @@ -681,6 +694,40 @@ def to_numpy( @final @property def empty(self) -> bool: + """ + Indicator whether Index is empty. + + Returns + ------- + bool + If Index is empty, return True, if not return False. + + See Also + -------- + Index.size : Return the number of elements in the underlying data. + + Examples + -------- + >>> idx_empty = pd.Index([1, 2, 3]) + >>> idx_empty + Index([1, 2, 3], dtype='int64') + >>> idx_empty.empty + False + + >>> idx_empty = pd.Index([]) + >>> idx_empty + Index([], dtype='object') + >>> idx_empty.empty + True + + If we only have NaNs in our Index, it is not considered empty!
+ + >>> idx_empty = pd.Index([np.nan, np.nan]) + >>> idx_empty + Index([nan, nan], dtype='float64') + >>> idx_empty.empty + False + """ return not self.size @doc(op="max", oppose="min", value="largest") @@ -784,6 +831,7 @@ def tolist(self) -> list: Returns ------- list + List containing the values as Python or pandas scalars. See Also -------- @@ -894,7 +942,6 @@ def _map_values(self, mapper, na_action=None): return algorithms.map_array(arr, mapper, na_action=na_action) - @final def value_counts( self, normalize: bool = False, @@ -1015,6 +1062,7 @@ def nunique(self, dropna: bool = True) -> int: Returns ------- int + An integer indicating the number of unique elements in the object. See Also -------- @@ -1121,6 +1169,7 @@ def _memory_usage(self, deep: bool = False) -> int: Returns ------- bytes used + Returns memory usage of the values in the Index in bytes. See Also -------- diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index c5562fb0284b7..b4e33b8ac75cb 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -160,19 +160,24 @@ def align_terms(terms): # can't iterate so it must just be a constant or single variable if isinstance(terms.value, (ABCSeries, ABCDataFrame)): typ = type(terms.value) - return typ, _zip_axes_from_type(typ, terms.value.axes) - return np.result_type(terms.type), None + name = terms.value.name if isinstance(terms.value, ABCSeries) else None + return typ, _zip_axes_from_type(typ, terms.value.axes), name + return np.result_type(terms.type), None, None # if all resolved variables are numeric scalars if all(term.is_scalar for term in terms): - return result_type_many(*(term.value for term in terms)).type, None + return result_type_many(*(term.value for term in terms)).type, None, None + + # if all input series have a common name, propagate it to the returned series + names = {term.value.name for term in terms if isinstance(term.value, ABCSeries)} + name = names.pop() if len(names) == 1 else None # perform the main alignment typ, axes = _align_core(terms) - return typ, axes + return typ, axes, name -def reconstruct_object(typ, obj, axes, dtype): +def reconstruct_object(typ, obj, axes, dtype, name): """ Reconstruct an object given its type, raw value, and possibly empty (None) axes. @@ -200,7 +205,9 @@ def reconstruct_object(typ, obj, axes, dtype): res_t = np.result_type(obj.dtype, dtype) if not isinstance(typ, partial) and issubclass(typ, PandasObject): - return typ(obj, dtype=res_t, **axes) + if name is None: + return typ(obj, dtype=res_t, **axes) + return typ(obj, dtype=res_t, name=name, **axes) # special case for pathological things like ~True/~False if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_: diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index 5db05ebe33efd..d2a181cbb3c36 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -54,6 +54,7 @@ def __init__(self, expr) -> None: self.expr = expr self.aligned_axes = None self.result_type = None + self.result_name = None def convert(self) -> str: """ @@ -76,12 +77,18 @@ def evaluate(self) -> object: The result of the passed expression.
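The `align_terms`/`reconstruct_object` changes above thread a result name through evaluation: when every Series operand carries the same name, the name survives; otherwise it is dropped. A sketch of the intended effect under this patch (hypothetical session):

>>> import pandas as pd
>>> s1 = pd.Series([1, 2], name="x")
>>> s2 = pd.Series([10, 20], name="x")
>>> pd.eval("s1 + s2").name  # common name propagates
'x'
>>> s3 = pd.Series([5, 6], name="y")
>>> pd.eval("s1 + s3").name is None  # conflicting names fall back to None
True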
""" if not self._is_aligned: - self.result_type, self.aligned_axes = align_terms(self.expr.terms) + self.result_type, self.aligned_axes, self.result_name = align_terms( + self.expr.terms + ) # make sure no names in resolvers and locals/globals clash res = self._evaluate() return reconstruct_object( - self.result_type, res, self.aligned_axes, self.expr.terms.return_type + self.result_type, + res, + self.aligned_axes, + self.expr.terms.return_type, + self.result_name, ) @property diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 7d8e23abf43b6..b7a1cb173f659 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -45,6 +45,7 @@ _unary_math_ops = ( "sin", "cos", + "tan", "exp", "log", "expm1", diff --git a/pandas/core/construction.py b/pandas/core/construction.py index ec49340e9a516..2718e9819cdf8 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -7,11 +7,8 @@ from __future__ import annotations -from collections.abc import Sequence from typing import ( TYPE_CHECKING, - Optional, - Union, cast, overload, ) @@ -23,17 +20,9 @@ from pandas._libs import lib from pandas._libs.tslibs import ( - Period, get_supported_dtype, is_supported_dtype, ) -from pandas._typing import ( - AnyArrayLike, - ArrayLike, - Dtype, - DtypeObj, - T, -) from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import ( @@ -46,6 +35,7 @@ maybe_promote, ) from pandas.core.dtypes.common import ( + ensure_object, is_list_like, is_object_dtype, is_string_dtype, @@ -63,11 +53,25 @@ import pandas.core.common as com if TYPE_CHECKING: + from collections.abc import Sequence + + from pandas._typing import ( + AnyArrayLike, + ArrayLike, + Dtype, + DtypeObj, + T, + ) + from pandas import ( Index, Series, ) - from pandas.core.arrays.base import ExtensionArray + from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + TimedeltaArray, + ) def array( @@ -286,9 +290,7 @@ def array( ExtensionArray, FloatingArray, IntegerArray, - IntervalArray, NumpyExtensionArray, - PeriodArray, TimedeltaArray, ) from pandas.core.arrays.string_ import StringDtype @@ -320,46 +322,58 @@ def array( return cls._from_sequence(data, dtype=dtype, copy=copy) if dtype is None: - inferred_dtype = lib.infer_dtype(data, skipna=True) - if inferred_dtype == "period": - period_data = cast(Union[Sequence[Optional[Period]], AnyArrayLike], data) - return PeriodArray._from_sequence(period_data, copy=copy) - - elif inferred_dtype == "interval": - return IntervalArray(data, copy=copy) - - elif inferred_dtype.startswith("datetime"): - # datetime, datetime64 - try: - return DatetimeArray._from_sequence(data, copy=copy) - except ValueError: - # Mixture of timezones, fall back to NumpyExtensionArray - pass - - elif inferred_dtype.startswith("timedelta"): - # timedelta, timedelta64 - return TimedeltaArray._from_sequence(data, copy=copy) - - elif inferred_dtype == "string": + was_ndarray = isinstance(data, np.ndarray) + # error: Item "Sequence[object]" of "Sequence[object] | ExtensionArray | + # ndarray[Any, Any]" has no attribute "dtype" + if not was_ndarray or data.dtype == object: # type: ignore[union-attr] + result = lib.maybe_convert_objects( + ensure_object(data), + convert_non_numeric=True, + convert_to_nullable_dtype=True, + dtype_if_all_nat=None, + ) + result = ensure_wrapped_if_datetimelike(result) + if isinstance(result, np.ndarray): + if len(result) == 0 and not was_ndarray: + # e.g. 
empty list + return FloatingArray._from_sequence(data, dtype="Float64") + return NumpyExtensionArray._from_sequence( + data, dtype=result.dtype, copy=copy + ) + if result is data and copy: + return result.copy() + return result + + data = cast(np.ndarray, data) + result = ensure_wrapped_if_datetimelike(data) + if result is not data: + result = cast("DatetimeArray | TimedeltaArray", result) + if copy and result.dtype == data.dtype: + return result.copy() + return result + + if data.dtype.kind in "SU": # StringArray/ArrowStringArray depending on pd.options.mode.string_storage dtype = StringDtype() cls = dtype.construct_array_type() return cls._from_sequence(data, dtype=dtype, copy=copy) - elif inferred_dtype == "integer": + elif data.dtype.kind in "iu": return IntegerArray._from_sequence(data, copy=copy) - elif inferred_dtype == "empty" and not hasattr(data, "dtype") and not len(data): - return FloatingArray._from_sequence(data, copy=copy) - elif ( - inferred_dtype in ("floating", "mixed-integer-float") - and getattr(data, "dtype", None) != np.float16 - ): + elif data.dtype.kind == "f": # GH#44715 Exclude np.float16 bc FloatingArray does not support it; # we will fall back to NumpyExtensionArray. + if data.dtype == np.float16: + return NumpyExtensionArray._from_sequence( + data, dtype=data.dtype, copy=copy + ) return FloatingArray._from_sequence(data, copy=copy) - elif inferred_dtype == "boolean": + elif data.dtype.kind == "b": return BooleanArray._from_sequence(data, dtype="boolean", copy=copy) + else: + # e.g. complex + return NumpyExtensionArray._from_sequence(data, dtype=data.dtype, copy=copy) # Pandas overrides NumPy for # 1. datetime64[ns,us,ms,s] diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 98e689528744e..e52cbff451700 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -79,6 +79,7 @@ DtypeObj, IntervalClosedType, Ordered, + Scalar, Self, npt, type_t, @@ -622,6 +623,10 @@ def categories(self) -> Index: """ An ``Index`` containing the unique categories allowed. + See Also + -------- + ordered : Whether the categories have an ordered relationship. + Examples -------- >>> cat_type = pd.CategoricalDtype(categories=["a", "b"], ordered=True) @@ -635,6 +640,10 @@ def ordered(self) -> Ordered: """ Whether the categories have an ordered relationship. + See Also + -------- + categories : An Index containing the unique categories allowed. + Examples -------- >>> cat_type = pd.CategoricalDtype(categories=["a", "b"], ordered=True) @@ -717,6 +726,11 @@ class DatetimeTZDtype(PandasExtensionDtype): ZoneInfoNotFoundError When the requested timezone cannot be found. + See Also + -------- + numpy.datetime64 : Numpy data type for datetime. + datetime.datetime : Python datetime object. + Examples -------- >>> from zoneinfo import ZoneInfo @@ -793,6 +807,10 @@ def unit(self) -> str_type: """ The precision of the datetime data. + See Also + -------- + DatetimeTZDtype.tz : Retrieves the timezone. + Examples -------- >>> from zoneinfo import ZoneInfo @@ -807,6 +825,10 @@ def tz(self) -> tzinfo: """ The timezone. + See Also + -------- + DatetimeTZDtype.unit : Retrieves precision of the datetime data. 
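The rewritten `pd.array` dispatch above keys off the ndarray dtype kind ("iu", "f", "b", ...) instead of `lib.infer_dtype`, with `float16` still routed to `NumpyExtensionArray` because no nullable Float16 dtype exists (GH#44715). Roughly, under this patch:

>>> import numpy as np
>>> import pandas as pd
>>> pd.array(np.array([1.0, 2.0]))
<FloatingArray>
[1.0, 2.0]
Length: 2, dtype: Float64
>>> pd.array(np.array([1.0, 2.0], dtype=np.float16))
<NumpyExtensionArray>
[1.0, 2.0]
Length: 2, dtype: float16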
+ Examples -------- >>> from zoneinfo import ZoneInfo @@ -1538,6 +1560,25 @@ class BaseMaskedDtype(ExtensionDtype): base = None type: type + _internal_fill_value: Scalar + + @property + def _truthy_value(self): + # Fill values used for 'any' + if self.kind == "f": + return 1.0 + if self.kind in "iu": + return 1 + return True + + @property + def _falsey_value(self): + # Fill values used for 'all' + if self.kind == "f": + return 0.0 + if self.kind in "iu": + return 0 + return False @property def na_value(self) -> libmissing.NAType: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0185ca8241617..96943eb71c7bd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -931,6 +931,11 @@ def __dataframe__( DataFrame interchange object The object which consuming library can use to ingress the dataframe. + See Also + -------- + DataFrame.from_records : Constructor from tuples, also record arrays. + DataFrame.from_dict : From dicts of Series, arrays, or dicts. + Notes ----- Details on the interchange protocol: @@ -2685,6 +2690,16 @@ def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: This includes the `compression`, `compression_level`, `chunksize` and `version` keywords. + See Also + -------- + DataFrame.to_parquet : Write a DataFrame to the binary parquet format. + DataFrame.to_excel : Write object to an Excel sheet. + DataFrame.to_sql : Write to a sql table. + DataFrame.to_csv : Write a csv file. + DataFrame.to_json : Convert the object to a JSON string. + DataFrame.to_html : Render a DataFrame as an HTML table. + DataFrame.to_string : Convert DataFrame to a string. + Notes ----- This function writes the dataframe as a `feather file @@ -2866,6 +2881,9 @@ def to_parquet( Returns ------- bytes if no path argument is provided else None + Returns the DataFrame converted to the binary parquet format as bytes if no + path argument is provided. Returns None and writes the DataFrame to the specified + location in the Parquet format if the path argument is provided. See Also -------- @@ -4012,7 +4030,6 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: return series._values[index] series = self._get_item(col) - engine = self.index._engine if not isinstance(self.index, MultiIndex): # CategoricalIndex: Trying to use the engine fastpath may give incorrect @@ -4023,7 +4040,7 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: # For MultiIndex going through engine effectively restricts us to # same-length tuples; see test_get_set_value_no_partial_indexing - loc = engine.get_loc(index) + loc = self.index._engine.get_loc(index) return series._values[loc] def isetitem(self, loc, value) -> None: @@ -5535,6 +5552,11 @@ def pop(self, item: Hashable) -> Series: Series Series representing the item that is dropped. + See Also + -------- + DataFrame.drop: Drop specified labels from rows or columns. + DataFrame.drop_duplicates: Return DataFrame with duplicate rows removed. + Examples -------- >>> df = pd.DataFrame( @@ -7682,6 +7704,10 @@ def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFram DataFrame DataFrame with indices or columns with reordered levels. + See Also + -------- + DataFrame.swaplevel : Swap levels i and j in a MultiIndex. + Examples -------- >>> data = { @@ -11106,6 +11132,7 @@ def corrwith( drop: bool = False, method: CorrelationMethod = "pearson", numeric_only: bool = False, + min_periods: int | None = None, ) -> Series: """ Compute pairwise correlation.
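The kind-based `_truthy_value`/`_falsey_value` properties above supply fill values that cannot flip a reduction, which is what keeps `any`/`all` correct over masked entries. The observable behavior (unchanged by this refactor) looks like:

>>> import pandas as pd
>>> a = pd.array([True, None], dtype="boolean")
>>> bool(a.any()), bool(a.all())  # NA filled with falsey/truthy respectively
(True, True)
>>> a.all(skipna=False)
<NA>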
@@ -11136,6 +11163,9 @@ def corrwith( numeric_only : bool, default False Include only `float`, `int` or `boolean` data. + min_periods : int, optional + Minimum number of observations needed to have a valid result. + .. versionadded:: 1.5.0 .. versionchanged:: 2.0.0 @@ -11179,7 +11209,10 @@ def corrwith( this = self._get_numeric_data() if numeric_only else self if isinstance(other, Series): - return this.apply(lambda x: other.corr(x, method=method), axis=axis) + return this.apply( + lambda x: other.corr(x, method=method, min_periods=min_periods), + axis=axis, + ) if numeric_only: other = other._get_numeric_data() @@ -11697,7 +11730,6 @@ def sum( return result @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="prod") - @doc(make_doc("prod", ndim=2)) def prod( self, axis: Axis | None = 0, @@ -11706,6 +11738,73 @@ def prod( min_count: int = 0, **kwargs, ) -> Series: + """ + Return the product of the values over the requested axis. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.prod with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar. + To retain the old behavior, pass axis=0 (or do not pass axis). + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present the result will be NA. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or scalar + The product of the values over the requested axis. + + See Also + -------- + Series.sum : Return the sum. + Series.min : Return the minimum. + Series.max : Return the maximum. + Series.idxmin : Return the index of the minimum. + Series.idxmax : Return the index of the maximum. + DataFrame.sum : Return the sum over the requested axis. + DataFrame.min : Return the minimum over the requested axis. + DataFrame.max : Return the maximum over the requested axis. + DataFrame.idxmin : Return the index of the minimum over the requested axis. + DataFrame.idxmax : Return the index of the maximum over the requested axis. + + Examples + -------- + By default, the product of an empty or all-NA Series is ``1`` + + >>> pd.Series([], dtype="float64").prod() + 1.0 + + This can be controlled with the ``min_count`` parameter + + >>> pd.Series([], dtype="float64").prod(min_count=1) + nan + + Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and + empty series identically. + + >>> pd.Series([np.nan]).prod() + 1.0 + + >>> pd.Series([np.nan]).prod(min_count=1) + nan + """ result = super().prod( axis=axis, skipna=skipna, @@ -11846,7 +11945,6 @@ def sem( ) -> Series | Any: ... @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sem") - @doc(make_doc("sem", ndim=2)) def sem( self, axis: Axis | None = 0, @@ -11855,6 +11953,76 @@ def sem( numeric_only: bool = False, **kwargs, ) -> Series | Any: + """ + Return unbiased standard error of the mean over requested axis. + + Normalized by N-1 by default.
This can be changed using the ddof argument. + + Parameters + ---------- + axis : {index (0), columns (1)} + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.sem with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar. + To retain the old behavior, pass axis=0 (or do not pass axis). + + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + **kwargs : + Additional keywords passed. + + Returns + ------- + Series or DataFrame (if level specified) + Unbiased standard error of the mean over requested axis. + + See Also + -------- + DataFrame.var : Return unbiased variance over requested axis. + DataFrame.std : Returns sample standard deviation over requested axis. + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.sem().round(6) + 0.57735 + + With a DataFrame + + >>> df = pd.DataFrame({"a": [1, 2], "b": [2, 3]}, index=["tiger", "zebra"]) + >>> df + a b + tiger 1 2 + zebra 2 3 + >>> df.sem() + a 0.5 + b 0.5 + dtype: float64 + + Using axis=1 + + >>> df.sem(axis=1) + tiger 0.5 + zebra 0.5 + dtype: float64 + + In this case, `numeric_only` should be set to `True` + to avoid getting an error. + + >>> df = pd.DataFrame({"a": [1, 2], "b": ["T", "Z"]}, index=["tiger", "zebra"]) + >>> df.sem(numeric_only=True) + a 0.5 + dtype: float64 + """ result = super().sem( axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs ) @@ -11996,7 +12164,6 @@ def skew( ) -> Series | Any: ... @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="skew") - @doc(make_doc("skew", ndim=2)) def skew( self, axis: Axis | None = 0, @@ -12004,6 +12171,80 @@ def skew( numeric_only: bool = False, **kwargs, ) -> Series | Any: + """ + Return unbiased skew over requested axis. + + Normalized by N-1. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + For DataFrames, specifying ``axis=None`` will apply the aggregation + across both axes. + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or scalar + Unbiased skew over requested axis. + + See Also + -------- + DataFrame.kurt : Returns unbiased kurtosis over requested axis. + + Examples + -------- + >>> s = pd.Series([1, 2, 3]) + >>> s.skew() + 0.0 + + With a DataFrame + + >>> df = pd.DataFrame( + ... {"a": [1, 2, 3], "b": [2, 3, 4], "c": [1, 3, 5]}, + ... index=["tiger", "zebra", "cow"], + ... ) + >>> df + a b c + tiger 1 2 1 + zebra 2 3 3 + cow 3 4 5 + >>> df.skew() + a 0.0 + b 0.0 + c 0.0 + dtype: float64 + + Using axis=1 + + >>> df.skew(axis=1) + tiger 1.732051 + zebra -1.732051 + cow 0.000000 + dtype: float64 + + In this case, `numeric_only` should be set to `True` to avoid + getting an error. + + >>> df = pd.DataFrame( + ... {"a": [1, 2, 3], "b": ["T", "Z", "X"]}, index=["tiger", "zebra", "cow"] + ...
) + >>> df.skew(numeric_only=True) + a 0.0 + dtype: float64 + """ result = super().skew( axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) @@ -12043,7 +12284,6 @@ def kurt( ) -> Series | Any: ... @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="kurt") - @doc(make_doc("kurt", ndim=2)) def kurt( self, axis: Axis | None = 0, @@ -12051,6 +12291,85 @@ def kurt( numeric_only: bool = False, **kwargs, ) -> Series | Any: + """ + Return unbiased kurtosis over requested axis. + + Kurtosis obtained using Fisher's definition of + kurtosis (kurtosis of normal == 0.0). Normalized by N-1. + + Parameters + ---------- + axis : {index (0), columns (1)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + For DataFrames, specifying ``axis=None`` will apply the aggregation + across both axes. + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series or scalar + Unbiased kurtosis over requested axis. + + See Also + -------- + DataFrame.kurtosis : Returns unbiased kurtosis over requested axis. + + Examples + -------- + >>> s = pd.Series([1, 2, 2, 3], index=["cat", "dog", "dog", "mouse"]) + >>> s + cat 1 + dog 2 + dog 2 + mouse 3 + dtype: int64 + >>> s.kurt() + 1.5 + + With a DataFrame + + >>> df = pd.DataFrame( + ... {"a": [1, 2, 2, 3], "b": [3, 4, 4, 4]}, + ... index=["cat", "dog", "dog", "mouse"], + ... ) + >>> df + a b + cat 1 3 + dog 2 4 + dog 2 4 + mouse 3 4 + >>> df.kurt() + a 1.5 + b 4.0 + dtype: float64 + + With axis=None + + >>> df.kurt(axis=None).round(6) + -0.988693 + + Using axis=1 + + >>> df = pd.DataFrame( + ... {"a": [1, 2], "b": [3, 4], "c": [3, 4], "d": [1, 2]}, + ... index=["cat", "dog"], + ... ) + >>> df.kurt(axis=1) + cat -6.0 + dog -6.0 + dtype: float64 + """ result = super().kurt( axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) @@ -12893,6 +13212,11 @@ def isin_(x): """ The column labels of the DataFrame. + See Also + -------- + DataFrame.index: The index (row labels) of the DataFrame. + DataFrame.axes: Return a list representing the axes of the DataFrame. + Examples -------- >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) @@ -12921,12 +13245,12 @@ def _to_dict_of_blocks(self): Return a dict of dtype -> Constructor Types that each is a homogeneous dtype. - Internal ONLY - only works for BlockManager + Internal ONLY. """ mgr = self._mgr return { k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self) - for k, v in mgr.to_dict().items() + for k, v in mgr.to_iter_dict() } @property diff --git a/pandas/core/generic.py b/pandas/core/generic.py index dbe2006642484..24727bb9d83c1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -783,6 +783,12 @@ def droplevel(self, level: IndexLabel, axis: Axis = 0) -> Self: {klass} {klass} with requested index / column level(s) removed. + See Also + -------- + DataFrame.replace : Replace values given in `to_replace` with `value`. + DataFrame.pivot : Return reshaped DataFrame organized by given + index / column values. + Examples -------- >>> df = ( @@ -1862,6 +1868,11 @@ def __iter__(self) -> Iterator: iterator Info axis as iterator. + See Also + -------- + DataFrame.items : Iterate over (column name, Series) pairs. + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples.
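Since `kurt` above uses Fisher's definition, a large sample from a normal distribution should score near zero; a quick sanity sketch (seeded, so deterministic, though the exact value is immaterial):

>>> import numpy as np
>>> import pandas as pd
>>> s = pd.Series(np.random.default_rng(0).standard_normal(100_000))
>>> bool(abs(s.kurt()) < 0.1)  # excess kurtosis of normal data is ~0
True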
+ Examples -------- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) @@ -6568,6 +6579,7 @@ def infer_objects(self, copy: bool | lib.NoDefault = lib.no_default) -> Self: Returns ------- same type as input object + Returns an object of the same type as the input object. See Also -------- @@ -8520,6 +8532,8 @@ def at_time(self, time, asof: bool = False, axis: Axis | None = None) -> Self: ---------- time : datetime.time or str The values to select. + asof : bool, default False + This parameter is currently not supported. axis : {0 or 'index', 1 or 'columns'}, default 0 For `Series` this parameter is unused and defaults to 0. diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 79d9f49a3b355..f44ef8c4dbbfa 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1202,10 +1202,7 @@ def _concat_objects( sort=False, ) else: - # GH5610, returns a MI, with the first level being a - # range index - keys = RangeIndex(len(values)) - result = concat(values, axis=0, keys=keys) + result = concat(values, axis=0) elif not not_indexed_same: result = concat(values, axis=0) diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 2e6bcda520aba..d108f840a1b4f 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -53,11 +53,8 @@ class BaseIndexer: >>> from pandas.api.indexers import BaseIndexer >>> class CustomIndexer(BaseIndexer): ... def get_window_bounds(self, num_values, min_periods, center, closed, step): - ... start = np.empty(num_values, dtype=np.int64) - ... end = np.empty(num_values, dtype=np.int64) - ... for i in range(num_values): - ... start[i] = i - ... end[i] = i + self.window_size + ... start = np.arange(num_values, dtype=np.int64) + ... end = np.arange(num_values, dtype=np.int64) + self.window_size ... return start, end >>> df = pd.DataFrame({"values": range(5)}) >>> indexer = CustomIndexer(window_size=2) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 2bb234e174563..3dcd1fedc8d64 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -9,10 +9,12 @@ NoReturn, cast, ) +import warnings import numpy as np from pandas._libs import lib +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_integer_dtype, @@ -210,6 +212,15 @@ def _delegate_method(self, name: str, *args, **kwargs): return result def to_pytimedelta(self): + # GH 57463 + warnings.warn( + f"The behavior of {type(self).__name__}.to_pytimedelta is deprecated, " + "in a future version this will return a Series containing python " + "datetime.timedelta objects instead of an ndarray. To retain the " + "old behavior, call `np.array` on the result", + FutureWarning, + stacklevel=find_stack_level(), + ) return cast(ArrowExtensionArray, self._parent.array)._dt_to_pytimedelta() def to_pydatetime(self) -> Series: @@ -462,6 +473,15 @@ def to_pytimedelta(self) -> np.ndarray: datetime.timedelta(days=2), datetime.timedelta(days=3), datetime.timedelta(days=4)], dtype=object) """ + # GH 57463 + warnings.warn( + f"The behavior of {type(self).__name__}.to_pytimedelta is deprecated, " + "in a future version this will return a Series containing python " + "datetime.timedelta objects instead of an ndarray. 
To retain the " + "old behavior, call `np.array` on the result", + FutureWarning, + stacklevel=find_stack_level(), + ) return self._get_values().to_pytimedelta() @property diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8ede401f37184..e93db22906b39 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -832,7 +832,8 @@ def _reset_identity(self) -> None: @final def _cleanup(self) -> None: - self._engine.clear_mapping() + if "_engine" in self._cache: + self._engine.clear_mapping() @cache_readonly def _engine( @@ -976,6 +977,10 @@ def dtype(self) -> DtypeObj: """ Return the dtype object of the underlying data. + See Also + -------- + Index.inferred_type: Return a string of the type inferred from the values. + Examples -------- >>> idx = pd.Index([1, 2, 3]) @@ -1008,6 +1013,42 @@ def ravel(self, order: str_t = "C") -> Self: return self[:] def view(self, cls=None): + """ + Return a view on self. + + Parameters + ---------- + cls : data-type or ndarray sub-class, optional + Data-type descriptor of the returned view, e.g., float32 or int16. + Omitting it results in the view having the same data-type as `self`. + This argument can also be specified as an ndarray sub-class, + e.g., np.int64 or np.float32 which then specifies the type of + the returned object. + + Returns + ------- + numpy.ndarray + A new view of the same data in memory. + + See Also + -------- + numpy.ndarray.view : Returns a new view of array with the same data. + + Examples + -------- + >>> s = pd.Series([1, 2, 3], index=["1", "2", "3"]) + >>> s.index.view("object") + array(['1', '2', '3'], dtype=object) + + >>> s = pd.Series([1, 2, 3], index=[-1, 0, 1]) + >>> s.index.view(np.int64) + array([-1, 0, 1]) + >>> s.index.view(np.float32) + array([ nan, nan, 0.e+00, 0.e+00, 1.e-45, 0.e+00], dtype=float32) + >>> s.index.view(np.uint64) + array([18446744073709551615, 0, 1], + dtype=uint64) + """ # we need to see if we are subclassing an # index type here if cls is not None: @@ -1056,6 +1097,12 @@ def astype(self, dtype, copy: bool = True): Index Index with values cast to specified dtype. + See Also + -------- + Index.dtype: Return the dtype object of the underlying data. + Index.dtypes: Return the dtype object of the underlying data. + Index.convert_dtypes: Convert columns to the best possible dtypes. + Examples -------- >>> idx = pd.Index([1, 2, 3]) @@ -1109,9 +1156,21 @@ def astype(self, dtype, copy: bool = True): axis : int, optional The axis over which to select values, always 0. allow_fill : bool, default True + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + fill_value : scalar, default None If allow_fill=True and fill_value is not None, indices specified by -1 are regarded as NA. If Index doesn't hold NA, raise ValueError. + **kwargs + Required for compatibility with numpy. Returns ------- @@ -1251,12 +1310,19 @@ def copy( name : Label, optional Set name for new object. deep : bool, default False + If True attempts to make a deep copy of the Index. + Else makes a shallow copy. Returns ------- Index Index refer to new object which is a copy of this object. + See Also + -------- + Index.delete: Make new Index with passed location(-s) deleted. 
+ Index.drop: Make new Index with passed list of labels deleted. + Notes ----- In most cases, there should be no functional difference from using @@ -1638,6 +1704,11 @@ def name(self) -> Hashable: """ Return Index or MultiIndex name. + See Also + -------- + Index.set_names: Able to set new names partially and by level. + Index.rename: Able to set new names partially and by level. + Examples -------- >>> idx = pd.Index([1, 2, 3], name="x") @@ -2077,6 +2148,12 @@ def droplevel(self, level: IndexLabel = 0): Returns ------- Index or MultiIndex + Returns an Index or MultiIndex object, depending on the resulting index + after removing the requested level(s). + + See Also + -------- + Index.dropna : Return Index without NA/NaN values. Examples -------- @@ -2344,6 +2421,10 @@ def inferred_type(self) -> str_t: """ Return a string of the type inferred from the values. + See Also + -------- + Index.dtype : Return the dtype object of the underlying data. + Examples -------- >>> idx = pd.Index([1, 2, 3]) @@ -2423,6 +2504,12 @@ def hasnans(self) -> bool: ------- bool + See Also + -------- + Index.isna : Detect missing values. + Index.dropna : Return Index without NA/NaN values. + Index.fillna : Fill NA/NaN values with the specified value. + Examples -------- >>> s = pd.Series([1, 2, 3], index=["a", "b", None]) @@ -2556,6 +2643,7 @@ def fillna(self, value): Returns ------- Index + NA/NaN values replaced with `value`. See Also -------- @@ -2592,6 +2680,12 @@ def dropna(self, how: AnyAll = "any") -> Self: Returns ------- Index + Returns an Index object after removing NA/NaN values. + + See Also + -------- + Index.fillna : Fill NA/NaN values with the specified value. + Index.isna : Detect missing values. Examples -------- @@ -2625,6 +2719,7 @@ def unique(self, level: Hashable | None = None) -> Self: Returns ------- Index + Unique values in the index. See Also -------- @@ -2660,6 +2755,7 @@ def drop_duplicates(self, *, keep: DropKeep = "first") -> Self: Returns ------- Index + A new Index object with the duplicate values removed. See Also -------- @@ -2719,6 +2815,7 @@ def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: Returns ------- np.ndarray[bool] + A numpy array of boolean values indicating duplicate index values. See Also -------- @@ -2824,6 +2921,8 @@ def union(self, other, sort=None): Parameters ---------- other : Index or array-like + Index or an array-like object containing elements to form the union + with the original Index. sort : bool or None, default None Whether to sort the resulting Index. @@ -2840,6 +2939,14 @@ def union(self, other, sort=None): Returns ------- Index + Returns a new Index object with all unique elements from both the original + Index and the `other` Index. + + See Also + -------- + Index.unique : Return unique values in the index. + Index.intersection : Form the intersection of two Index objects. + Index.difference : Return a new Index with elements of index not in `other`. Examples -------- @@ -3034,6 +3141,8 @@ def intersection(self, other, sort: bool = False): Parameters ---------- other : Index or array-like + An Index or an array-like object containing elements to form the + intersection with the original Index. sort : True, False or None, default False Whether to sort the resulting index. @@ -3045,6 +3154,14 @@ def intersection(self, other, sort: bool = False): Returns ------- Index + Returns a new Index object with elements common to both the original Index + and the `other` Index. 
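The set operations documented above (`union`, `intersection`) and just below (`difference`, `symmetric_difference`) compose as expected; an illustrative session:

>>> import pandas as pd
>>> left, right = pd.Index([1, 2, 3, 4]), pd.Index([3, 4, 5, 6])
>>> left.union(right)
Index([1, 2, 3, 4, 5, 6], dtype='int64')
>>> left.intersection(right)
Index([3, 4], dtype='int64')
>>> left.difference(right)
Index([1, 2], dtype='int64')
>>> left.symmetric_difference(right)
Index([1, 2, 5, 6], dtype='int64')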
+ + See Also + -------- + Index.union : Form the union of two Index objects. + Index.difference : Return a new Index with elements of index not in other. + Index.isin : Return a boolean array where the index values are in values. Examples -------- @@ -3180,6 +3297,8 @@ def difference(self, other, sort=None): Parameters ---------- other : Index or array-like + Index object or an array-like object containing elements to be compared + with the elements of the original Index. sort : bool or None, default None Whether to sort the resulting index. By default, the values are attempted to be sorted, but any TypeError from @@ -3193,6 +3312,14 @@ def difference(self, other, sort=None): Returns ------- Index + Returns a new Index object containing elements that are in the original + Index but not in the `other` Index. + + See Also + -------- + Index.symmetric_difference : Compute the symmetric difference of two Index + objects. + Index.intersection : Form the intersection of two Index objects. Examples -------- @@ -3254,7 +3381,10 @@ def symmetric_difference(self, other, result_name=None, sort=None): Parameters ---------- other : Index or array-like + Index or an array-like object with elements to compute the symmetric + difference with the original Index. result_name : str + A string representing the name of the resulting Index, if desired. sort : bool or None, default None Whether to sort the resulting index. By default, the values are attempted to be sorted, but any TypeError from @@ -3268,6 +3398,14 @@ def symmetric_difference(self, other, result_name=None, sort=None): Returns ------- Index + Returns a new Index object containing elements that appear in either the + original Index or the `other` Index, but not both. + + See Also + -------- + Index.difference : Return a new Index with elements of index not in other. + Index.union : Form the union of two Index objects. + Index.intersection : Form the intersection of two Index objects. Notes ----- @@ -3352,10 +3490,22 @@ def get_loc(self, key): Parameters ---------- key : label + The key to check its location if it is present in the index. Returns ------- int if unique index, slice if monotonic index, else mask + Integer location, slice or boolean mask. + + See Also + -------- + Index.get_slice_bound : Calculate slice bound that corresponds to + given label. + Index.get_indexer : Computes indexer and mask for new index given + the current index. + Index.get_indexer_non_unique : Returns indexer and masks for new index given + the current index. + Index.get_indexer_for : Returns an indexer even when non-unique. Examples -------- @@ -3405,6 +3555,7 @@ def get_indexer( Parameters ---------- target : Index + An iterable containing the values to be used for computing indexer. method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional * default: exact matches only. * pad / ffill: find the PREVIOUS index value if no exact match. @@ -3432,6 +3583,12 @@ def get_indexer( positions matches the corresponding target values. Missing values in the target are marked by -1. + See Also + -------- + Index.get_indexer_for : Returns an indexer even when non-unique. + Index.get_indexer_non_unique : Returns indexer and masks for new index given + the current index. + Notes ----- Returns -1 for unmatched values, for further explanation see the @@ -3905,6 +4062,7 @@ def reindex( Parameters ---------- target : an iterable + An iterable containing the values to be used for creating the new index.
method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional * default: exact matches only. * pad / ffill: find the PREVIOUS index value if no exact match. @@ -4841,8 +4999,9 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike: def memory_usage(self, deep: bool = False) -> int: result = self._memory_usage(deep=deep) - # include our engine hashtable - result += self._engine.sizeof(deep=deep) + # include our engine hashtable, only if it's already cached + if "_engine" in self._cache: + result += self._engine.sizeof(deep=deep) return result @final @@ -5011,12 +5170,9 @@ def __getitem__(self, key): if not isinstance(self.dtype, ExtensionDtype): if len(key) == 0 and len(key) != len(self): - warnings.warn( - "Using a boolean indexer with length 0 on an Index with " - "length greater than 0 is deprecated and will raise in a " - "future version.", - FutureWarning, - stacklevel=find_stack_level(), + raise ValueError( + "The length of the boolean indexer cannot be 0 " + "when the Index has length greater than 0." ) result = getitem(key) @@ -5065,10 +5221,18 @@ def append(self, other: Index | Sequence[Index]) -> Index: Parameters ---------- other : Index or list/tuple of indices + Single Index or a collection of indices, which can be either a list or a + tuple. Returns ------- Index + Returns a new Index object resulting from appending the provided other + indices to the original Index. + + See Also + -------- + Index.insert : Make new Index inserting new item at location. Examples -------- @@ -5108,9 +5272,19 @@ def putmask(self, mask, value) -> Index: """ Return a new Index of the values set with the mask. + Parameters + ---------- + mask : np.ndarray[bool] + Array of booleans denoting where values in the original + data are not ``NA``. + value : scalar + Scalar value to use to fill holes (e.g. 0). + This value cannot be a list-like. + Returns ------- Index + A new Index of the values set with the mask. See Also -------- @@ -5175,6 +5349,12 @@ def equals(self, other: Any) -> bool: True if "other" is an Index and it has the same elements and order as the calling index; False otherwise. + See Also + -------- + Index.identical: Checks that object attributes and types are also equal. + Index.has_duplicates: Check if the Index has duplicate values. + Index.is_unique: Return if the index has unique values. + Examples -------- >>> idx1 = pd.Index([1, 2, 3]) @@ -5259,12 +5439,23 @@ def identical(self, other) -> bool: """ Similar to equals, but checks that object attributes and types are also equal. + Parameters + ---------- + other : Index + The Index object you want to compare with the current Index object. + Returns ------- bool If two Index objects have equal elements and same type True, otherwise False. + See Also + -------- + Index.equals: Determine if two Index object are equal. + Index.has_duplicates: Check if the Index has duplicate values. + Index.is_unique: Return if the index has unique values. + Examples -------- >>> idx1 = pd.Index(["1", "2", "3"]) @@ -5681,6 +5872,7 @@ def _should_fallback_to_positional(self) -> bool: Parameters ---------- target : %(target_klass)s + An iterable containing the values to be used for computing indexer. Returns ------- @@ -5692,6 +5884,12 @@ def _should_fallback_to_positional(self) -> bool: An indexer into the target of the values not found. These correspond to the -1 in the indexer array. + See Also + -------- + Index.get_indexer : Computes indexer and mask for new index given + the current index.
+ Index.get_indexer_for : Returns an indexer even when non-unique. + Examples -------- >>> index = pd.Index(['c', 'b', 'a', 'b', 'b']) @@ -5767,11 +5965,23 @@ def get_indexer_for(self, target) -> npt.NDArray[np.intp]: This dispatches to get_indexer or get_indexer_non_unique as appropriate. + Parameters + ---------- + target : Index + An iterable containing the values to be used for computing indexer. + Returns ------- np.ndarray[np.intp] List of indices. + See Also + -------- + Index.get_indexer : Computes indexer and mask for new index given + the current index. + Index.get_indexer_non_unique : Returns indexer and masks for new index given + the current index. + Examples -------- >>> idx = pd.Index([np.nan, "var1", np.nan]) @@ -6067,6 +6277,10 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): If the function returns a tuple with more than one element a MultiIndex will be returned. + See Also + -------- + Index.where : Replace values where the condition is False. + Examples -------- >>> idx = pd.Index([1, 2, 3]) @@ -6230,19 +6444,26 @@ def slice_indexer( end : label, default None If None, defaults to the end. step : int, default None + If None, defaults to 1. Returns ------- slice + A slice object. Raises ------ KeyError : If key does not exist, or key is not unique and index is not ordered. + See Also + -------- + Index.slice_locs : Computes slice locations for input labels. + Index.get_slice_bound : Retrieves slice bound that corresponds to given label. + Notes ----- - This function assumes that the data is sorted, so use at your own peril + This function assumes that the data is sorted, so use at your own peril. Examples -------- @@ -6349,7 +6570,10 @@ def get_slice_bound(self, label, side: Literal["left", "right"]) -> int: Parameters ---------- label : object + The label for which to calculate the slice bound. side : {'left', 'right'} + If 'left', return leftmost position of given label. + If 'right', return one-past-the-rightmost position of given label. Returns ------- @@ -6438,6 +6662,8 @@ def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]: Returns ------- tuple[int, int] + Returns a tuple of two integers representing the slice locations for the + input labels within the index. See Also -------- @@ -6563,11 +6789,19 @@ def insert(self, loc: int, item) -> Index: Parameters ---------- loc : int + The integer location where the new item will be inserted. item : object + The new item to be inserted into the Index. Returns ------- Index + Returns a new Index object resulting from inserting the specified item at + the specified location within the original Index. + + See Also + -------- + Index.append : Append a collection of Indexes together. Examples -------- @@ -6625,6 +6859,8 @@ def drop( Parameters ---------- labels : array-like or scalar + Array-like object or a scalar value, representing the labels to be removed + from the Index. errors : {'ignore', 'raise'}, default 'raise' If 'ignore', suppress error and existing labels are dropped. @@ -6638,6 +6874,11 @@ def drop( KeyError If not all of the labels are found in the selected axis + See Also + -------- + Index.dropna : Return Index without NA/NaN values. + Index.drop_duplicates : Return Index with duplicate values removed.
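The `__getitem__` change a few hunks up promotes what used to be a FutureWarning into an error, so a zero-length boolean mask on a non-empty Index now raises (message as added in this patch):

>>> import numpy as np
>>> import pandas as pd
>>> idx = pd.Index([1, 2, 3])
>>> idx[np.array([], dtype=bool)]
Traceback (most recent call last):
...
ValueError: The length of the boolean indexer cannot be 0 when the Index has length greater than 0.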
+ Examples -------- >>> idx = pd.Index(["a", "b", "c"]) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index cefdc14145d1f..78f04f57029b1 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -451,9 +451,24 @@ def snap(self, freq: Frequency = "S") -> DatetimeIndex: """ Snap time stamps to nearest occurring frequency. + Parameters + ---------- + freq : str, Timedelta, datetime.timedelta, or DateOffset, default 'S' + Frequency strings can have multiples, e.g. '5h'. See + :ref:`here <timeseries.offset_aliases>` for a list of + frequency aliases. + Returns ------- DatetimeIndex + Time stamps to nearest occurring `freq`. + + See Also + -------- + DatetimeIndex.round : Perform round operation on the data to the + specified `freq`. + DatetimeIndex.floor : Perform floor operation on the data to the + specified `freq`. Examples -------- @@ -508,6 +523,8 @@ def _parsed_string_to_bounds( freq = OFFSET_TO_PERIOD_FREQSTR.get(reso.attr_abbrev, reso.attr_abbrev) per = Period(parsed, freq=freq) start, end = per.start_time, per.end_time + start = start.as_unit(self.unit) + end = end.as_unit(self.unit) # GH 24076 # If an incoming date string contained a UTC offset, need to localize @@ -694,10 +711,13 @@ def indexer_at_time(self, time, asof: bool = False) -> npt.NDArray[np.intp]: Time passed in either as object (datetime.time) or as string in appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"). + asof : bool, default False + This parameter is currently not supported. Returns ------- np.ndarray[np.intp] + Index locations of values at given `time` of day. See Also -------- @@ -750,6 +770,7 @@ def indexer_between_time( Returns ------- np.ndarray[np.intp] + Index locations of values between particular times of day. See Also -------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 21ce9b759f2df..a5bcf49c5490b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -123,84 +123,56 @@ ) -class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): - """ - This class manages a MultiIndex by mapping label combinations to positive - integers. +class MultiIndexUInt64Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): + """Manages a MultiIndex by mapping label combinations to positive integers. + + The number of possible label combinations must not overflow the 64 bits integers. """ _base = libindex.UInt64Engine + _codes_dtype = "uint64" - def _codes_to_ints(self, codes): - """ - Transform combination(s) of uint64 in one uint64 (each), in a strictly - monotonic way (i.e. respecting the lexicographic order of integer - combinations): see BaseMultiIndexCodesEngine documentation. Parameters ---------- codes : 1- or 2-dimensional array of dtype uint64 Combinations of integers (one per row) +class MultiIndexUInt32Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt32Engine): + """Manages a MultiIndex by mapping label combinations to positive integers. - Returns - ------- - scalar or 1-dimensional array, of dtype uint64 - Integer(s) representing one combination (each). - """ - # Shift the representation of each level by the pre-calculated number - # of bits: - codes <<= self.offsets + The number of possible label combinations must not overflow the 32 bits integers. + """ - # Now sum and OR are in fact interchangeable. This is a simple - # composition of the (disjunct) significant bits of each level (i.e.
- # each column in "codes") in a single positive integer: - if codes.ndim == 1: - # Single key - return np.bitwise_or.reduce(codes) + _base = libindex.UInt32Engine + _codes_dtype = "uint32" - # Multiple keys - return np.bitwise_or.reduce(codes, axis=1) +class MultiIndexUInt16Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt16Engine): + """Manages a MultiIndex by mapping label combinations to positive integers. -class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine): - """ - This class manages those (extreme) cases in which the number of possible - label combinations overflows the 64 bits integers, and uses an ObjectEngine - containing Python integers. + The number of possible label combinations must not overflow the 16-bit integers. """ - _base = libindex.ObjectEngine + _base = libindex.UInt16Engine + _codes_dtype = "uint16" - def _codes_to_ints(self, codes): - """ - Transform combination(s) of uint64 in one Python integer (each), in a - strictly monotonic way (i.e. respecting the lexicographic order of - integer combinations): see BaseMultiIndexCodesEngine documentation. - Parameters - ---------- - codes : 1- or 2-dimensional array of dtype uint64 - Combinations of integers (one per row) +class MultiIndexUInt8Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt8Engine): + """Manages a MultiIndex by mapping label combinations to positive integers. - Returns - ------- - int, or 1-dimensional array of dtype object - Integer(s) representing one combination (each). - """ - # Shift the representation of each level by the pre-calculated number - # of bits. Since this can overflow uint64, first make sure we are - # working with Python integers: - codes = codes.astype("object") << self.offsets + The number of possible label combinations must not overflow the 8-bit integers. + """ - # Now sum and OR are in fact interchangeable. This is a simple - # composition of the (disjunct) significant bits of each level (i.e. - # each column in "codes") in a single positive integer (per row): - if codes.ndim == 1: - # Single key - return np.bitwise_or.reduce(codes) + _base = libindex.UInt8Engine + _codes_dtype = "uint8" - # Multiple keys - return np.bitwise_or.reduce(codes, axis=1) + +class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine): + """Manages a MultiIndex by mapping label combinations to positive integers. + + This class manages those (extreme) cases in which the number of possible + label combinations overflows the 64-bit integers, and uses an ObjectEngine + containing Python integers. + """ + + _base = libindex.ObjectEngine + _codes_dtype = "object" def names_compat(meth: F) -> F: @@ -1229,13 +1201,25 @@ def _engine(self): # equivalent to sorting lexicographically the codes themselves.
Notice # that each level needs to be shifted by the number of bits needed to # represent the _previous_ ones: - offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64") + offsets = np.concatenate([lev_bits[1:], [0]]) + # Downcast the type if possible, to prevent upcasting when shifting codes: + offsets = offsets.astype(np.min_scalar_type(int(offsets[0]))) # Check the total number of bits needed for our representation: if lev_bits[0] > 64: # The levels would overflow a 64 bit uint - use Python integers: return MultiIndexPyIntEngine(self.levels, self.codes, offsets) - return MultiIndexUIntEngine(self.levels, self.codes, offsets) + if lev_bits[0] > 32: + # The levels would overflow a 32 bit uint - use uint64 + return MultiIndexUInt64Engine(self.levels, self.codes, offsets) + if lev_bits[0] > 16: + # The levels would overflow a 16 bit uint - use uint32 + return MultiIndexUInt32Engine(self.levels, self.codes, offsets) + if lev_bits[0] > 8: + # The levels would overflow an 8 bit uint - use uint16 + return MultiIndexUInt16Engine(self.levels, self.codes, offsets) + # The levels fit in an 8 bit uint - use uint8 + return MultiIndexUInt8Engine(self.levels, self.codes, offsets) # Return type "Callable[..., MultiIndex]" of "_constructor" incompatible with return # type "Type[MultiIndex]" in supertype "Index" @@ -1391,8 +1375,9 @@ def _nbytes(self, deep: bool = False) -> int: names_nbytes = sum(getsizeof(i, objsize) for i in self.names) result = level_nbytes + label_nbytes + names_nbytes - # include our engine hashtable - result += self._engine.sizeof(deep=deep) + # include our engine hashtable, only if it's already cached + if "_engine" in self._cache: + result += self._engine.sizeof(deep=deep) return result # --------------------------------------------------------------------
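
The engine selection above packs each row's per-level codes into one unsigned integer by shifting every level by the combined bit-width of the levels after it, then picks the smallest engine dtype that fits. A rough standalone sketch of that packing (simplified and with illustrative values; the real BaseMultiIndexCodesEngine also reserves a code slot for missing values, which is omitted here):

```python
import numpy as np

# Two levels with 3 and 10 distinct labels; codes are the per-level
# integer positions of each row's labels.
level_sizes = np.array([3, 10])
bits = np.ceil(np.log2(level_sizes + 1)).astype("uint64")  # bits per level

# Each level is shifted by the total bit-width of the levels after it, so
# packed keys sort lexicographically; the offsets dtype is downcast (as in
# the diff) to avoid upcasting the codes when shifting.
lev_bits = np.cumsum(bits[::-1])[::-1]
offsets = np.concatenate([lev_bits[1:], [0]])
offsets = offsets.astype(np.min_scalar_type(int(offsets[0])))

codes = np.array([[0, 1], [2, 9]], dtype="uint64")  # rows of (level0, level1) codes
packed = np.bitwise_or.reduce(codes << offsets, axis=1)
print(packed)  # [ 1 41] - one scalar key per row, lexicographic order preserved
```
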
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 0ba3c22093c69..bd9e8b84fd82a 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -57,9 +57,13 @@ Dtype, JoinHow, NaPosition, + NumpySorter, Self, npt, ) + + from pandas import Series + _empty_range = range(0) _dtype_int64 = np.dtype(np.int64) @@ -1359,3 +1363,64 @@ def take( # type: ignore[override] taken += self.start return self._shallow_copy(taken, name=self.name) + + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins=None, + dropna: bool = True, + ) -> Series: + from pandas import Series + + if bins is not None: + return super().value_counts( + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + dropna=dropna, + ) + name = "proportion" if normalize else "count" + data: npt.NDArray[np.floating] | npt.NDArray[np.signedinteger] = np.ones( + len(self), dtype=np.int64 + ) + if normalize: + data = data / len(self) + return Series(data, index=self.copy(), name=name) + + def searchsorted( # type: ignore[override] + self, + value, + side: Literal["left", "right"] = "left", + sorter: NumpySorter | None = None, + ) -> npt.NDArray[np.intp] | np.intp: + if side not in {"left", "right"} or sorter is not None: + return super().searchsorted(value=value, side=side, sorter=sorter) + + was_scalar = False + if is_scalar(value): + was_scalar = True + array_value = np.array([value]) + else: + array_value = np.asarray(value) + if array_value.dtype.kind not in "iu": + return super().searchsorted(value=value, side=side, sorter=sorter) + + if flip := (self.step < 0): + rng = self._range[::-1] + start = rng.start + step = rng.step + shift = side == "right" + else: start = self.start + step = self.step + shift = side == "left" + result = (array_value - start - int(shift)) // step + 1 + if flip: + result = len(self) - result + result = np.maximum(np.minimum(result, len(self)), 0) + if was_scalar: + return np.intp(result.item()) + return result.astype(np.intp, copy=False) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7be1d5d95ffdf..28d3292a1c65b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -38,7 +38,10 @@ Shape, npt, ) -from pandas.errors import AbstractMethodError +from pandas.errors import ( + AbstractMethodError, + OutOfBoundsDatetime, +) from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg @@ -118,6 +121,7 @@ if TYPE_CHECKING: from collections.abc import ( + Generator, Iterable, Sequence, ) @@ -385,20 +389,18 @@ def _split_op_result(self, result: ArrayLike) -> list[Block]: return [nb] @final - def _split(self) -> list[Block]: + def _split(self) -> Generator[Block, None, None]: """ Split a block into a list of single-column blocks. """ assert self.ndim == 2 - new_blocks = [] for i, ref_loc in enumerate(self._mgr_locs): vals = self.values[slice(i, i + 1)] bp = BlockPlacement(ref_loc) nb = type(self)(vals, placement=bp, ndim=2, refs=self.refs) - new_blocks.append(nb) - return new_blocks + yield nb @final def split_and_operate(self, func, *args, **kwargs) -> list[Block]: @@ -479,7 +481,17 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: f"{self.values.dtype}. Please report a bug at " "https://github.com/pandas-dev/pandas/issues." ) - return self.astype(new_dtype) + try: + return self.astype(new_dtype) + except OutOfBoundsDatetime as err: + # e.g. GH#56419 if self.dtype is a low-resolution dt64 and we try to + # upcast to a higher-resolution dt64, we may have entries that are + # out of bounds for the higher resolution. + # Re-raise with a more informative message. + raise OutOfBoundsDatetime( + f"Incompatible (high-resolution) value for dtype='{self.dtype}'. " + "Explicitly cast before operating."
+ ) from err @final def convert(self) -> list[Block]: @@ -537,7 +549,9 @@ def convert_dtypes( rbs = [] for blk in blks: # Determine dtype column by column - sub_blks = [blk] if blk.ndim == 1 or self.shape[0] == 1 else blk._split() + sub_blks = ( + [blk] if blk.ndim == 1 or self.shape[0] == 1 else list(blk._split()) + ) dtypes = [ convert_dtypes( b.values, @@ -1190,8 +1204,7 @@ def putmask(self, mask, new) -> list[Block]: is_array = isinstance(new, np.ndarray) res_blocks = [] - nbs = self._split() - for i, nb in enumerate(nbs): + for i, nb in enumerate(self._split()): n = new if is_array: # we have a different value per-column @@ -1255,8 +1268,7 @@ def where(self, other, cond) -> list[Block]: is_array = isinstance(other, (np.ndarray, ExtensionArray)) res_blocks = [] - nbs = self._split() - for i, nb in enumerate(nbs): + for i, nb in enumerate(self._split()): oth = other if is_array: # we have a different value per-column @@ -1698,8 +1710,7 @@ def where(self, other, cond) -> list[Block]: is_array = isinstance(orig_other, (np.ndarray, ExtensionArray)) res_blocks = [] - nbs = self._split() - for i, nb in enumerate(nbs): + for i, nb in enumerate(self._split()): n = orig_other if is_array: # we have a different value per-column @@ -1760,8 +1771,7 @@ def putmask(self, mask, new) -> list[Block]: is_array = isinstance(orig_new, (np.ndarray, ExtensionArray)) res_blocks = [] - nbs = self._split() - for i, nb in enumerate(nbs): + for i, nb in enumerate(self._split()): n = orig_new if is_array: # we have a different value per-column diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 8fda9cd23b508..7c1bcbec1d3f2 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -92,6 +92,8 @@ ) if TYPE_CHECKING: + from collections.abc import Generator + from pandas._typing import ( ArrayLike, AxisInt, @@ -645,8 +647,7 @@ def get_bool_data(self) -> Self: new_blocks.append(blk) elif blk.is_object: - nbs = blk._split() - new_blocks.extend(nb for nb in nbs if nb.is_bool) + new_blocks.extend(nb for nb in blk._split() if nb.is_bool) return self._combine(new_blocks) @@ -1525,7 +1526,9 @@ def _insert_update_mgr_locs(self, loc) -> None: When inserting a new Block at location 'loc', we increment all of the mgr_locs of blocks above that by one. 
""" - for blkno, count in _fast_count_smallints(self.blknos[loc:]): + # Faster version of set(arr) for sequences of small numbers + blknos = np.bincount(self.blknos[loc:]).nonzero()[0] + for blkno in blknos: # .620 this way, .326 of which is in increment_above blk = self.blocks[blkno] blk._mgr_locs = blk._mgr_locs.increment_above(loc) @@ -1597,7 +1600,7 @@ def grouped_reduce(self, func: Callable) -> Self: nrows = 0 else: nrows = result_blocks[0].values.shape[-1] - index = Index(range(nrows)) + index = default_index(nrows) return type(self).from_blocks(result_blocks, [self.axes[0], index]) @@ -1735,21 +1738,18 @@ def unstack(self, unstacker, fill_value) -> BlockManager: bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False) return bm - def to_dict(self) -> dict[str, Self]: + def to_iter_dict(self) -> Generator[tuple[str, Self], None, None]: """ - Return a dict of str(dtype) -> BlockManager + Yield a tuple of (str(dtype), BlockManager) Returns ------- - values : a dict of dtype -> BlockManager + values : a tuple of (str(dtype), BlockManager) """ - - bd: dict[str, list[Block]] = {} - for b in self.blocks: - bd.setdefault(str(b.dtype), []).append(b) - - # TODO(EA2D): the combine will be unnecessary with 2D EAs - return {dtype: self._combine(blocks) for dtype, blocks in bd.items()} + key = lambda block: str(block.dtype) + for dtype, blocks in itertools.groupby(sorted(self.blocks, key=key), key=key): + # TODO(EA2D): the combine will be unnecessary with 2D EAs + yield dtype, self._combine(list(blocks)) def as_array( self, @@ -2330,7 +2330,7 @@ def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, DtypeObj]: def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]: - tuples = list(enumerate(arrays)) + tuples = enumerate(arrays) if not consolidate: return _tuples_to_blocks_no_consolidate(tuples, refs) @@ -2351,7 +2351,7 @@ def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list if issubclass(dtype.type, (str, bytes)): dtype = np.dtype(object) - values, placement = _stack_arrays(list(tup_block), dtype) + values, placement = _stack_arrays(tup_block, dtype) if is_dtlike: values = ensure_wrapped_if_datetimelike(values) blk = block_type(values, placement=BlockPlacement(placement), ndim=2) @@ -2450,15 +2450,6 @@ def _merge_blocks( return blocks, False -def _fast_count_smallints(arr: npt.NDArray[np.intp]): - """Faster version of set(arr) for sequences of small numbers.""" - counts = np.bincount(arr) - nz = counts.nonzero()[0] - # Note: list(zip(...) outperforms list(np.c_[nz, counts[nz]]) here, - # in one benchmark by a factor of 11 - return zip(nz, counts[nz]) - - def _preprocess_slice_or_indexer( slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool ): diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index 57e03dedc384d..84202a4fcc840 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -148,7 +148,7 @@ def to_dict( Return a collections.abc.MutableMapping object representing the DataFrame. The resulting transformation depends on the `orient` parameter. 
""" - if not df.columns.is_unique: + if orient != "tight" and not df.columns.is_unique: warnings.warn( "DataFrame columns are not unique, some columns will be omitted.", UserWarning, diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 9fef78d9f8c3d..039d868bccd16 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -314,7 +314,16 @@ def get_interp_index(method, index: Index) -> Index: # prior default from pandas import Index - index = Index(np.arange(len(index))) + if isinstance(index.dtype, DatetimeTZDtype) or lib.is_np_dtype( + index.dtype, "mM" + ): + # Convert datetime-like indexes to int64 + index = Index(index.view("i8")) + + elif not is_numeric_dtype(index.dtype): + # We keep behavior consistent with prior versions of pandas for + # non-numeric, non-datetime indexes + index = Index(range(len(index))) else: methods = {"index", "values", "nearest", "time"} is_numeric_or_datetime = ( @@ -616,6 +625,9 @@ def _interpolate_scipy_wrapper( terp = alt_methods.get(method, None) if terp is None: raise ValueError(f"Can not interpolate with method={method}.") + + # Make sure downcast is not in kwargs for alt methods + kwargs.pop("downcast", None) new_y = terp(x, y, new_x, **kwargs) return new_y diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 86d1f55f38c05..ccbe25fdae841 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -80,6 +80,7 @@ TimedeltaIndex, timedelta_range, ) +from pandas.core.reshape.concat import concat from pandas.tseries.frequencies import ( is_subperiod, @@ -885,30 +886,59 @@ def interpolate( Freq: 500ms, dtype: float64 Internal reindexing with ``asfreq()`` prior to interpolation leads to - an interpolated timeseries on the basis the reindexed timestamps (anchors). - Since not all datapoints from original series become anchors, - it can lead to misleading interpolation results as in the following example: + an interpolated timeseries on the basis of the reindexed timestamps + (anchors). It is assured that all available datapoints from original + series become anchors, so it also works for resampling-cases that lead + to non-aligned timestamps, as in the following example: >>> series.resample("400ms").interpolate("linear") 2023-03-01 07:00:00.000 1.0 - 2023-03-01 07:00:00.400 1.2 - 2023-03-01 07:00:00.800 1.4 - 2023-03-01 07:00:01.200 1.6 - 2023-03-01 07:00:01.600 1.8 + 2023-03-01 07:00:00.400 0.2 + 2023-03-01 07:00:00.800 -0.6 + 2023-03-01 07:00:01.200 -0.4 + 2023-03-01 07:00:01.600 0.8 2023-03-01 07:00:02.000 2.0 - 2023-03-01 07:00:02.400 2.2 - 2023-03-01 07:00:02.800 2.4 - 2023-03-01 07:00:03.200 2.6 - 2023-03-01 07:00:03.600 2.8 + 2023-03-01 07:00:02.400 1.6 + 2023-03-01 07:00:02.800 1.2 + 2023-03-01 07:00:03.200 1.4 + 2023-03-01 07:00:03.600 2.2 2023-03-01 07:00:04.000 3.0 Freq: 400ms, dtype: float64 - Note that the series erroneously increases between two anchors + Note that the series correctly decreases between two anchors ``07:00:00`` and ``07:00:02``. """ assert downcast is lib.no_default # just checking coverage result = self._upsample("asfreq") - return result.interpolate( + + # If the original data has timestamps which are not aligned with the + # target timestamps, we need to add those points back to the data frame + # that is supposed to be interpolated. This does not work with + # PeriodIndex, so we skip this case. 
GH#21351 + obj = self._selected_obj + is_period_index = isinstance(obj.index, PeriodIndex) + + # Skip this step for PeriodIndex + if not is_period_index: + final_index = result.index + if isinstance(final_index, MultiIndex): + raise NotImplementedError( + "Direct interpolation of MultiIndex data frames is not " + "supported. If you tried to resample and interpolate on a " + "grouped data frame, please use:\n" + "`df.groupby(...).apply(lambda x: x.resample(...)." + "interpolate(...), include_groups=False)`" + "\ninstead, as resampling and interpolation has to be " + "performed for each group independently." + ) + + missing_data_points_index = obj.index.difference(final_index) + if len(missing_data_points_index) > 0: + result = concat( + [result, obj.loc[missing_data_points_index]] + ).sort_index() + + result_interpolated = result.interpolate( method=method, axis=axis, limit=limit, @@ -919,6 +949,18 @@ def interpolate( **kwargs, ) + # No further steps if the original data has a PeriodIndex + if is_period_index: + return result_interpolated + + # Make sure that original data points which do not align with the + # resampled index are removed + result_interpolated = result_interpolated.loc[final_index] + + # Make sure frequency indexes are preserved + result_interpolated.index = final_index + return result_interpolated + @final def asfreq(self, fill_value=None): """ diff --git a/pandas/core/series.py b/pandas/core/series.py index a72eb8e261e65..8a26d52bb5df1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -901,19 +901,9 @@ def __getitem__(self, key): if isinstance(key, (list, tuple)): key = unpack_1tuple(key) - if is_integer(key) and self.index._should_fallback_to_positional: - warnings.warn( - # GH#50617 - "Series.__getitem__ treating keys as positions is deprecated. " - "In a future version, integer keys will always be treated " - "as labels (consistent with DataFrame behavior). To access " - "a value by position, use `ser.iloc[pos]`", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._values[key] - elif key_is_scalar: + # Note: GH#50617 in 3.0 we changed int key to always be treated as + # a label, matching DataFrame behavior. return self._get_value(key) # Convert generator to list before going through hashable part @@ -958,35 +948,6 @@ def _get_with(self, key): elif isinstance(key, tuple): return self._get_values_tuple(key) - elif not is_list_like(key): - # e.g. scalars that aren't recognized by lib.is_scalar, GH#32684 - return self.loc[key] - - if not isinstance(key, (list, np.ndarray, ExtensionArray, Series, Index)): - key = list(key) - - key_type = lib.infer_dtype(key, skipna=False) - - # Note: The key_type == "boolean" case should be caught by the - # com.is_bool_indexer check in __getitem__ - if key_type == "integer": - # We need to decide whether to treat this as a positional indexer - # (i.e. self.iloc) or label-based (i.e. self.loc) - if not self.index._should_fallback_to_positional: - return self.loc[key] - else: - warnings.warn( - # GH#50617 - "Series.__getitem__ treating keys as positions is deprecated. " - "In a future version, integer keys will always be treated " - "as labels (consistent with DataFrame behavior). 
To access " - "a value by position, use `ser.iloc[pos]`", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self.iloc[key] - - # handle the dup indexing case GH#4246 return self.loc[key] def _get_values_tuple(self, key: tuple): @@ -1076,27 +1037,8 @@ def __setitem__(self, key, value) -> None: except KeyError: # We have a scalar (or for MultiIndex or object-dtype, scalar-like) # key that is not present in self.index. - if is_integer(key): - if not self.index._should_fallback_to_positional: - # GH#33469 - self.loc[key] = value - else: - # positional setter - # can't use _mgr.setitem_inplace yet bc could have *both* - # KeyError and then ValueError, xref GH#45070 - warnings.warn( - # GH#50617 - "Series.__setitem__ treating keys as positions is deprecated. " - "In a future version, integer keys will always be treated " - "as labels (consistent with DataFrame behavior). To set " - "a value by position, use `ser.iloc[pos] = value`", - FutureWarning, - stacklevel=find_stack_level(), - ) - self._set_values(key, value) - else: - # GH#12862 adding a new key to the Series - self.loc[key] = value + # GH#12862 adding a new key to the Series + self.loc[key] = value except (TypeError, ValueError, LossySetitemError): # The key was OK, but we cannot set the value losslessly @@ -1155,28 +1097,7 @@ def _set_with(self, key, value) -> None: # Without this, the call to infer_dtype will consume the generator key = list(key) - if not self.index._should_fallback_to_positional: - # Regardless of the key type, we're treating it as labels - self._set_labels(key, value) - - else: - # Note: key_type == "boolean" should not occur because that - # should be caught by the is_bool_indexer check in __setitem__ - key_type = lib.infer_dtype(key, skipna=False) - - if key_type == "integer": - warnings.warn( - # GH#50617 - "Series.__setitem__ treating keys as positions is deprecated. " - "In a future version, integer keys will always be treated " - "as labels (consistent with DataFrame behavior). To set " - "a value by position, use `ser.iloc[pos] = value`", - FutureWarning, - stacklevel=find_stack_level(), - ) - self._set_values(key, value) - else: - self._set_labels(key, value) + self._set_labels(key, value) def _set_labels(self, key, value) -> None: key = com.asarray_tuplesafe(key) @@ -5359,6 +5280,8 @@ def case_when( """ Replace values where the conditions are True. + .. versionadded:: 2.2.0 + Parameters ---------- caselist : A list of tuples of conditions and expected replacements @@ -5376,8 +5299,6 @@ def case_when( must not change the input Series (though pandas doesn`t check it). - .. 
versionadded:: 2.2.0 - Returns ------- Series diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index df7a6cdb1ea52..b01cdb335ec46 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -481,7 +481,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: """ arg = extract_array(arg, extract_numpy=True) - # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime + # GH#30050 pass an ndarray to tslib.array_to_datetime # because it expects an ndarray argument if isinstance(arg, IntegerArray): arr = arg.astype(f"datetime64[{unit}]") @@ -519,7 +519,12 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: tz_parsed = None else: arg = arg.astype(object, copy=False) - arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) + arr, tz_parsed = tslib.array_to_datetime( + arg, + utc=utc, + errors=errors, + unit_for_numerics=unit, + ) result = DatetimeIndex(arr, name=name) if not isinstance(result, DatetimeIndex): diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 2b35cfa044ae9..6063ac098a4dc 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -780,143 +780,195 @@ def parse( output[asheetname] = DataFrame() continue - is_list_header = False - is_len_one_list_header = False - if is_list_like(header): - assert isinstance(header, Sequence) - is_list_header = True - if len(header) == 1: - is_len_one_list_header = True - - if is_len_one_list_header: - header = cast(Sequence[int], header)[0] - - # forward fill and pull out names for MultiIndex column - header_names = None - if header is not None and is_list_like(header): - assert isinstance(header, Sequence) - - header_names = [] - control_row = [True] * len(data[0]) - - for row in header: - if is_integer(skiprows): - assert isinstance(skiprows, int) - row += skiprows - - if row > len(data) - 1: - raise ValueError( - f"header index {row} exceeds maximum index " - f"{len(data) - 1} of data.", - ) - - data[row], control_row = fill_mi_header(data[row], control_row) - - if index_col is not None: - header_name, _ = pop_header_name(data[row], index_col) - header_names.append(header_name) - - # If there is a MultiIndex header and an index then there is also - # a row containing just the index name(s) - has_index_names = False - if is_list_header and not is_len_one_list_header and index_col is not None: - index_col_list: Sequence[int] - if isinstance(index_col, int): - index_col_list = [index_col] - else: - assert isinstance(index_col, Sequence) - index_col_list = index_col - - # We have to handle mi without names. If any of the entries in the data - # columns are not empty, this is a regular row - assert isinstance(header, Sequence) - if len(header) < len(data): - potential_index_names = data[len(header)] - potential_data = [ - x - for i, x in enumerate(potential_index_names) - if not control_row[i] and i not in index_col_list - ] - has_index_names = all(x == "" or x is None for x in potential_data) - - if is_list_like(index_col): - # Forward fill values for MultiIndex index. 
- if header is None: - offset = 0 - elif isinstance(header, int): - offset = 1 + header - else: - offset = 1 + max(header) + output = self._parse_sheet( + data=data, + output=output, + asheetname=asheetname, + header=header, + names=names, + index_col=index_col, + usecols=usecols, + dtype=dtype, + skiprows=skiprows, + nrows=nrows, + true_values=true_values, + false_values=false_values, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + date_format=date_format, + thousands=thousands, + decimal=decimal, + comment=comment, + skipfooter=skipfooter, + dtype_backend=dtype_backend, + **kwds, + ) - # GH34673: if MultiIndex names present and not defined in the header, - # offset needs to be incremented so that forward filling starts - # from the first MI value instead of the name - if has_index_names: - offset += 1 + if last_sheetname is None: + raise ValueError("Sheet name is an empty list") - # Check if we have an empty dataset - # before trying to collect data. - if offset < len(data): - assert isinstance(index_col, Sequence) + if ret_dict: + return output + else: + return output[last_sheetname] - for col in index_col: - last = data[offset][col] + def _parse_sheet( + self, + data: list, + output: dict, + asheetname: str | int | None = None, + header: int | Sequence[int] | None = 0, + names: SequenceNotStr[Hashable] | range | None = None, + index_col: int | Sequence[int] | None = None, + usecols=None, + dtype: DtypeArg | None = None, + skiprows: Sequence[int] | int | Callable[[int], object] | None = None, + nrows: int | None = None, + true_values: Iterable[Hashable] | None = None, + false_values: Iterable[Hashable] | None = None, + na_values=None, + parse_dates: list | dict | bool = False, + date_parser: Callable | lib.NoDefault = lib.no_default, + date_format: dict[Hashable, str] | str | None = None, + thousands: str | None = None, + decimal: str = ".", + comment: str | None = None, + skipfooter: int = 0, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + **kwds, + ): + is_list_header = False + is_len_one_list_header = False + if is_list_like(header): + assert isinstance(header, Sequence) + is_list_header = True + if len(header) == 1: + is_len_one_list_header = True + + if is_len_one_list_header: + header = cast(Sequence[int], header)[0] + + # forward fill and pull out names for MultiIndex column + header_names = None + if header is not None and is_list_like(header): + assert isinstance(header, Sequence) + + header_names = [] + control_row = [True] * len(data[0]) + + for row in header: + if is_integer(skiprows): + assert isinstance(skiprows, int) + row += skiprows + + if row > len(data) - 1: + raise ValueError( + f"header index {row} exceeds maximum index " + f"{len(data) - 1} of data.", + ) - for row in range(offset + 1, len(data)): - if data[row][col] == "" or data[row][col] is None: - data[row][col] = last - else: - last = data[row][col] + data[row], control_row = fill_mi_header(data[row], control_row) - # GH 12292 : error when read one empty column from excel file - try: - parser = TextParser( - data, - names=names, - header=header, - index_col=index_col, - has_index_names=has_index_names, - dtype=dtype, - true_values=true_values, - false_values=false_values, - skiprows=skiprows, - nrows=nrows, - na_values=na_values, - skip_blank_lines=False, # GH 39808 - parse_dates=parse_dates, - date_parser=date_parser, - date_format=date_format, - thousands=thousands, - decimal=decimal, - comment=comment, - skipfooter=skipfooter, - usecols=usecols, - 
dtype_backend=dtype_backend, - **kwds, - ) + if index_col is not None: + header_name, _ = pop_header_name(data[row], index_col) + header_names.append(header_name) - output[asheetname] = parser.read(nrows=nrows) + # If there is a MultiIndex header and an index then there is also + # a row containing just the index name(s) + has_index_names = False + if is_list_header and not is_len_one_list_header and index_col is not None: + index_col_list: Sequence[int] + if isinstance(index_col, int): + index_col_list = [index_col] + else: + assert isinstance(index_col, Sequence) + index_col_list = index_col + + # We have to handle mi without names. If any of the entries in the data + # columns are not empty, this is a regular row + assert isinstance(header, Sequence) + if len(header) < len(data): + potential_index_names = data[len(header)] + potential_data = [ + x + for i, x in enumerate(potential_index_names) + if not control_row[i] and i not in index_col_list + ] + has_index_names = all(x == "" or x is None for x in potential_data) + + if is_list_like(index_col): + # Forward fill values for MultiIndex index. + if header is None: + offset = 0 + elif isinstance(header, int): + offset = 1 + header + else: + offset = 1 + max(header) + + # GH34673: if MultiIndex names present and not defined in the header, + # offset needs to be incremented so that forward filling starts + # from the first MI value instead of the name + if has_index_names: + offset += 1 + + # Check if we have an empty dataset + # before trying to collect data. + if offset < len(data): + assert isinstance(index_col, Sequence) + + for col in index_col: + last = data[offset][col] + + for row in range(offset + 1, len(data)): + if data[row][col] == "" or data[row][col] is None: + data[row][col] = last + else: + last = data[row][col] + + # GH 12292 : error when read one empty column from excel file + try: + parser = TextParser( + data, + names=names, + header=header, + index_col=index_col, + has_index_names=has_index_names, + dtype=dtype, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + skip_blank_lines=False, # GH 39808 + parse_dates=parse_dates, + date_parser=date_parser, + date_format=date_format, + thousands=thousands, + decimal=decimal, + comment=comment, + skipfooter=skipfooter, + usecols=usecols, + dtype_backend=dtype_backend, + **kwds, + ) - if header_names: - output[asheetname].columns = output[asheetname].columns.set_names( - header_names - ) + output[asheetname] = parser.read(nrows=nrows) - except EmptyDataError: - # No Data, return an empty DataFrame - output[asheetname] = DataFrame() + if header_names: + output[asheetname].columns = output[asheetname].columns.set_names( + header_names + ) - except Exception as err: - err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:]) - raise err + except EmptyDataError: + # No Data, return an empty DataFrame + output[asheetname] = DataFrame() - if last_sheetname is None: - raise ValueError("Sheet name is an empty list") + except Exception as err: + err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:]) + raise err - if ret_dict: - return output - else: - return output[last_sheetname] + return output @doc(storage_options=_shared_docs["storage_options"]) diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 6eacac8c064fb..b2fd24a670300 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -93,7 +93,7 @@ class _XlsxStyler: } @classmethod - def 
convert(cls, style_dict, num_format_str=None) -> dict[str, Any]: """ converts a style_dict to an xlsxwriter format dict diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 25808f5b4a132..5d325397a81ae 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -22,6 +22,7 @@ Final, Literal, cast, + overload, ) import warnings @@ -593,7 +594,7 @@ def __getitem__(self, key: str): def __setitem__(self, key: str, value) -> None: self.put(key, value) - def __delitem__(self, key: str) -> None: + def __delitem__(self, key: str) -> int | None: return self.remove(key) def __getattr__(self, name: str): @@ -656,6 +657,12 @@ def keys(self, include: str = "pandas") -> list[str]: ------ raises ValueError if kind has an illegal value + See Also + -------- + HDFStore.info : Prints detailed information on the store. + HDFStore.get_node : Returns the node with the key. + HDFStore.get_storer : Returns the storer object for a key. + Examples -------- >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) @@ -853,6 +860,12 @@ def select( object Retrieved object from file. + See Also + -------- + HDFStore.select_as_coordinates : Returns the selection as an index. + HDFStore.select_column : Returns a single column from the table. + HDFStore.select_as_multiple : Retrieves pandas objects from multiple tables. + Examples -------- >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) @@ -1132,12 +1145,27 @@ def put( Write DataFrame index as a column. append : bool, default False This will force Table format, append the input data to the existing. + complib : default None + This parameter is currently not accepted. + complevel : int, 0-9, default None + Specifies a compression level for data. + A value of 0 or None disables compression. + min_itemsize : int, dict, or None + Dict of columns that specify minimum string sizes. + nan_rep : str + String to use as the NaN representation. data_columns : list of columns or True, default None List of columns to create as data columns, or True to use all columns. See `here `__. encoding : str, default None Provide an encoding for strings. + errors : str, default 'strict' + The error handling scheme to use for encoding errors. + The default is 'strict' meaning that encoding errors raise a + UnicodeEncodeError. Other possible values are 'ignore', 'replace' and + 'xmlcharrefreplace' as well as any other name registered with + codecs.register_error that can handle UnicodeEncodeErrors. track_times : bool, default True Parameter is propagated to 'create_table' method of 'PyTables'. If set to False it enables to have the same h5 files (same hashes) @@ -1145,6 +1173,11 @@ def put( dropna : bool, default False, optional Remove missing values. + See Also + -------- + HDFStore.info : Prints detailed information on the store. + HDFStore.get_storer : Returns the storer object for a key.
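
A minimal round trip through the HDFStore methods whose docstrings are extended above (a sketch; it requires the optional PyTables dependency, "store.h5" is a scratch path, and printed values are illustrative):

```python
import pandas as pd

df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
with pd.HDFStore("store.h5", "w") as store:
    store.put("data", df, format="table")  # table format supports where-based removal
    print(store.keys())                    # ['/data']
    print(store.select("data", where="index > 0"))
    removed = store.remove("data", where="index > 0")
    print(removed)  # number of rows removed (remove now returns int | None)
```
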
+ Examples -------- >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP >>> store.put("data", df) # doctest: +SKIP >>> store.close() # doctest: +SKIP @@ -1171,7 +1204,7 @@ def put( dropna=dropna, ) - def remove(self, key: str, where=None, start=None, stop=None) -> None: + def remove(self, key: str, where=None, start=None, stop=None) -> int | None: """ Remove pandas object partially by specifying the where condition @@ -1219,14 +1252,12 @@ def remove(self, key: str, where=None, start=None, stop=None) -> None: # remove the node if com.all_none(where, start, stop): s.group._f_remove(recursive=True) + return None # delete from the table - else: - if not s.is_table: - raise ValueError( - "can only remove with where on objects written as tables" - ) - return s.delete(where=where, start=start, stop=stop) + if not s.is_table: + raise ValueError("can only remove with where on objects written as tables") + return s.delete(where=where, start=start, stop=stop) def append( self, @@ -1504,6 +1535,10 @@ def groups(self) -> list: list List of objects. + See Also + -------- + HDFStore.get_node : Returns the node with the key. + Examples -------- >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) @@ -1559,6 +1594,10 @@ def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]: leaves : list Names (strings) of the pandas objects contained in `path`. + See Also + -------- + HDFStore.info : Prints detailed information on the store. + Examples -------- >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) @@ -1684,17 +1723,26 @@ def info(self) -> str: Returns ------- str + A string containing the pandas class name, the filepath to the HDF5 + file, and all the object keys along with their respective DataFrame shapes. + + See Also + -------- + HDFStore.get_storer : Returns the storer object for a key. Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=["C", "D"]) >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP - >>> store.put("data", df) # doctest: +SKIP + >>> store.put("data1", df1) # doctest: +SKIP + >>> store.put("data2", df2) # doctest: +SKIP >>> print(store.info()) # doctest: +SKIP >>> store.close() # doctest: +SKIP File path: store.h5 - /data frame (shape->[2,2]) + /data1 frame (shape->[2,2]) + /data2 frame (shape->[2,2]) """ path = pprint_thing(self._path) output = f"{type(self)}\nFile path: {path}\n" @@ -2846,7 +2894,7 @@ def read( columns=None, start: int | None = None, stop: int | None = None, - ): + ) -> Series | DataFrame: raise NotImplementedError( "cannot read on an abstract storer: subclasses should implement" ) @@ -2858,7 +2906,7 @@ def write(self, obj, **kwargs) -> None: def delete( self, where=None, start: int | None = None, stop: int | None = None - ) -> None: + ) -> int | None: """ support fully deleting the node in its entirety (only) - where specification must be None @@ -3552,7 +3600,7 @@ def queryables(self) -> dict[str, Any]: return dict(d1 + d2 + d3) - def index_cols(self): + def index_cols(self) -> list[tuple[Any, Any]]: """return a list of my index cols""" # Note: each `i.cname` below is assured to be a str.
return [(i.axis, i.cname) for i in self.index_axes] @@ -3682,7 +3730,7 @@ def indexables(self): dc = set(self.data_columns) base_pos = len(_indexables) - def f(i, c): + def f(i, c: str) -> DataCol: assert isinstance(c, str) klass = DataCol if c in dc: @@ -3848,7 +3896,7 @@ def get_object(cls, obj, transposed: bool): """return the data for this obj""" return obj - def validate_data_columns(self, data_columns, min_itemsize, non_index_axes): + def validate_data_columns(self, data_columns, min_itemsize, non_index_axes) -> list: """ take the input data_columns and min_itemize and create a data columns spec @@ -4541,7 +4589,9 @@ def write_data_chunk( self.table.append(rows) self.table.flush() - def delete(self, where=None, start: int | None = None, stop: int | None = None): + def delete( + self, where=None, start: int | None = None, stop: int | None = None + ) -> int | None: # delete all rows (and return the nrows) if where is None or not len(where): if start is None and stop is None: @@ -4869,7 +4919,7 @@ def read( columns=None, start: int | None = None, stop: int | None = None, - ): + ) -> DataFrame: df = super().read(where=where, columns=columns, start=start, stop=stop) df = df.set_index(self.levels) @@ -5330,7 +5380,13 @@ def __init__( if self.terms is not None: self.condition, self.filter = self.terms.evaluate() - def generate(self, where): + @overload + def generate(self, where: dict | list | tuple | str) -> PyTablesExpr: ... + + @overload + def generate(self, where: None) -> None: ... + + def generate(self, where: dict | list | tuple | str | None) -> PyTablesExpr | None: """where can be a : dict,list,tuple,string""" if where is None: return None diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 6a392a0f02caf..25257d5fcc192 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -16,7 +16,6 @@ from __future__ import annotations -from collections import abc from datetime import datetime import sys from typing import TYPE_CHECKING @@ -45,7 +44,7 @@ from pandas.io.common import get_handle import pandas.io.sas.sas_constants as const -from pandas.io.sas.sasreader import ReaderBase +from pandas.io.sas.sasreader import SASReader if TYPE_CHECKING: from pandas._typing import ( @@ -116,7 +115,7 @@ def __init__( # SAS7BDAT represents a SAS data file in SAS7BDAT format. -class SAS7BDATReader(ReaderBase, abc.Iterator): +class SAS7BDATReader(SASReader): """ Read SAS files in SAS7BDAT format. 
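
With SASReader now an abstract Iterator[DataFrame] (replacing the old ReaderBase plus collections.abc.Iterator mix-in, per the sas_xport and sasreader hunks below), a chunked reader can be consumed as a plain iterator. A sketch of the usage this enables ("example.sas7bdat" is a placeholder path, not a file from this diff):

```python
import pandas as pd

# read_sas with chunksize returns a SASReader instead of a full DataFrame.
reader = pd.read_sas("example.sas7bdat", chunksize=1000)
for chunk in reader:   # each chunk is a DataFrame of up to 1000 rows
    print(chunk.shape)
reader.close()
```
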
diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index adba9bf117a8e..89dbdab64c23c 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -10,7 +10,6 @@ from __future__ import annotations -from collections import abc from datetime import datetime import struct from typing import TYPE_CHECKING @@ -24,7 +23,7 @@ import pandas as pd from pandas.io.common import get_handle -from pandas.io.sas.sasreader import ReaderBase +from pandas.io.sas.sasreader import SASReader if TYPE_CHECKING: from pandas._typing import ( @@ -252,7 +251,7 @@ def _parse_float_vec(vec): return ieee -class XportReader(ReaderBase, abc.Iterator): +class XportReader(SASReader): __doc__ = _xport_reader_doc def __init__( diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 69d911863338f..12d698a4f76a8 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -8,6 +8,7 @@ ABC, abstractmethod, ) +from collections.abc import Iterator from typing import ( TYPE_CHECKING, overload, @@ -33,9 +34,9 @@ from pandas import DataFrame -class ReaderBase(ABC): +class SASReader(Iterator["DataFrame"], ABC): """ - Protocol for XportReader and SAS7BDATReader classes. + Abstract class for XportReader and SAS7BDATReader. """ @abstractmethod def read_sas( chunksize: int = ..., iterator: bool = ..., compression: CompressionOptions = ..., -) -> ReaderBase: ... +) -> SASReader: ... @overload def read_sas( chunksize: None = ..., iterator: bool = ..., compression: CompressionOptions = ..., -) -> DataFrame | ReaderBase: ... +) -> DataFrame | SASReader: ... @doc(decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer") def read_sas( chunksize: int | None = None, iterator: bool = False, compression: CompressionOptions = "infer", -) -> DataFrame | ReaderBase: +) -> DataFrame | SASReader: """ Read SAS files stored as either XPORT or SAS7BDAT format files. @@ -145,7 +146,7 @@ read_sas f"unable to infer format of SAS file from filename: {fname!r}" ) - reader: ReaderBase + reader: SASReader if format.lower() == "xport": from pandas.io.sas.sas_xport import XportReader diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 60bb45d3ac1dc..ea5daf02b7252 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -233,6 +233,7 @@ def hist_frame( Returns ------- matplotlib.Axes or numpy.ndarray of them + Returns an AxesSubplot object or a numpy array of AxesSubplot objects.
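
A quick illustration of the corrected Returns wording above (a sketch; requires matplotlib, and the printed shape is illustrative):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "a": np.random.default_rng(2).standard_normal(10),
    "b": np.random.default_rng(3).standard_normal(10),
})
axes = df.hist()  # more than one column -> a numpy array of Axes
print(type(axes), axes.shape)  # <class 'numpy.ndarray'> (1, 2)
```
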
See Also -------- diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 38a75e741d60e..fffeb9b82492f 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -2077,9 +2077,6 @@ def _make_plot(self, fig: Figure) -> None: for i, (label, y) in enumerate(self._iter_data(data=self.data)): ax = self._get_ax(i) - if label is not None: - label = pprint_thing(label) - ax.set_ylabel(label) kwds = self.kwds.copy() diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 0f2a641d13b11..b23876d9280f7 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -267,6 +267,7 @@ class TestApi(Base): "RollingGroupby", "SeriesGroupBy", "StataReader", + "SASReader", "TimedeltaIndexResamplerGroupby", "TimeGrouper", "Window", diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 69c3364c7e98e..a7d0becc30dd9 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -86,3 +86,11 @@ def test_diff(): df = ser.to_frame(name="A") with pytest.raises(TypeError, match=msg): df.diff() + + +def test_hash_read_only_categorical(): + # GH#58481 + idx = pd.Index(pd.Index(["a", "b", "c"], dtype="object").values) + cat = pd.CategoricalDtype(idx) + arr = pd.Series(["a", "b"], dtype=cat).values + assert hash(arr.dtype) == hash(arr.dtype) diff --git a/pandas/tests/arrays/sparse/test_constructors.py b/pandas/tests/arrays/sparse/test_constructors.py index 012ff1da0d431..0bf3ab77e9eed 100644 --- a/pandas/tests/arrays/sparse/test_constructors.py +++ b/pandas/tests/arrays/sparse/test_constructors.py @@ -90,13 +90,13 @@ def test_constructor_warns_when_losing_timezone(self): dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") expected = SparseArray(np.asarray(dti, dtype="datetime64[ns]")) - - with tm.assert_produces_warning(UserWarning): + msg = "loses timezone information" + with tm.assert_produces_warning(UserWarning, match=msg): result = SparseArray(dti) tm.assert_sp_array_equal(result, expected) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=msg): result = SparseArray(pd.Series(dti)) tm.assert_sp_array_equal(result, expected) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 50dafb5dbbb06..857509e18fa8e 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -220,6 +220,14 @@ def test_dt64_array(dtype_unit): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + None, + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype()), + ), # Boolean ( [True, None], @@ -247,6 +255,14 @@ def test_dt64_array(dtype_unit): "category", pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]), ), + # Complex + ( + np.array([complex(1), complex(2)], dtype=np.complex128), + None, + NumpyExtensionArray( + np.array([complex(1), complex(2)], dtype=np.complex128) + ), + ), ], ) def test_array(data, dtype, expected): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index cfc04b5c91354..3d8f8d791b763 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -661,7 +661,9 @@ def test_array_interface(self, datetime_index): assert result is expected 
tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype="datetime64[ns]") - assert result is not expected + if not np_version_gt2: + # TODO: GH 57739 + assert result is not expected tm.assert_numpy_array_equal(result, expected) # to object dtype @@ -778,7 +780,7 @@ def test_to_period_2d(self, arr1d): arr2d = arr1d.reshape(1, -1) warn = None if arr1d.tz is None else UserWarning - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(warn, match="will drop timezone information"): result = arr2d.to_period("D") expected = arr1d.to_period("D").reshape(1, -1) tm.assert_period_array_equal(result, expected) @@ -976,7 +978,9 @@ def test_array_interface(self, timedelta_index): assert result is expected tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype="timedelta64[ns]") - assert result is not expected + if not np_version_gt2: + # TODO: GH 57739 + assert result is not expected tm.assert_numpy_array_equal(result, expected) # to object dtype diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 8f14c562fa7c3..d8e5908b0c58f 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -737,6 +737,17 @@ def test_and_logic_string_match(self): assert pd.eval(f"{event.str.match('hello').a}") assert pd.eval(f"{event.str.match('hello').a and event.str.match('hello').a}") + def test_eval_keep_name(self, engine, parser): + df = Series([2, 15, 28], name="a").to_frame() + res = df.eval("a + a", engine=engine, parser=parser) + expected = Series([4, 30, 56], name="a") + tm.assert_series_equal(expected, res) + + def test_eval_unmatching_names(self, engine, parser): + variable_name = Series([42], name="series_name") + res = pd.eval("variable_name + 0", engine=engine, parser=parser) + tm.assert_series_equal(variable_name, res) + # ------------------------------------- # gh-12388: Typecasting rules consistency with python @@ -1014,7 +1025,8 @@ def test_performance_warning_for_poor_alignment( else: seen = False - with tm.assert_produces_warning(seen): + msg = "Alignment difference on axis 1 is larger than an order of magnitude" + with tm.assert_produces_warning(seen, match=msg): pd.eval("df + s", engine=engine, parser=parser) s = Series(np.random.default_rng(2).standard_normal(1000)) @@ -1036,7 +1048,7 @@ def test_performance_warning_for_poor_alignment( else: wrn = False - with tm.assert_produces_warning(wrn) as w: + with tm.assert_produces_warning(wrn, match=msg) as w: pd.eval("df + s", engine=engine, parser=parser) if not is_python_engine and performance_warning: @@ -1268,14 +1280,12 @@ def test_assignment_explicit(self): expected["c"] = expected["a"] + expected["b"] tm.assert_frame_equal(df, expected) - def test_column_in(self): + def test_column_in(self, engine): # GH 11235 df = DataFrame({"a": [11], "b": [-32]}) - result = df.eval("a in [11, -32]") - expected = Series([True]) - # TODO: 2022-01-29: Name check failed with numexpr 2.7.3 in CI - # but cannot reproduce locally - tm.assert_series_equal(result, expected, check_names=False) + result = df.eval("a in [11, -32]", engine=engine) + expected = Series([True], name="a") + tm.assert_series_equal(result, expected) @pytest.mark.xfail(reason="Unknown: Omitted test_ in name prior.") def test_assignment_not_inplace(self): @@ -1504,7 +1514,7 @@ def test_date_boolean(self, engine, parser): parser=parser, ) expec = df.dates1 < "20130101" - tm.assert_series_equal(res, expec, check_names=False) + tm.assert_series_equal(res, expec) def 
test_simple_in_ops(self, engine, parser): if parser != "python": @@ -1609,22 +1619,20 @@ def eval(self, *args, **kwargs): kwargs["level"] = kwargs.pop("level", 0) + 1 return pd.eval(*args, **kwargs) - @pytest.mark.skipif( - not NUMEXPR_INSTALLED, reason="Unary ops only implemented for numexpr" - ) + @pytest.mark.filterwarnings("ignore::RuntimeWarning") @pytest.mark.parametrize("fn", _unary_math_ops) - def test_unary_functions(self, fn): + def test_unary_functions(self, fn, engine, parser): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10)}) a = df.a expr = f"{fn}(a)" - got = self.eval(expr) + got = self.eval(expr, engine=engine, parser=parser) with np.errstate(all="ignore"): expect = getattr(np, fn)(a) - tm.assert_series_equal(got, expect, check_names=False) + tm.assert_series_equal(got, expect) @pytest.mark.parametrize("fn", _binary_math_ops) - def test_binary_functions(self, fn): + def test_binary_functions(self, fn, engine, parser): df = DataFrame( { "a": np.random.default_rng(2).standard_normal(10), @@ -1635,10 +1643,10 @@ def test_binary_functions(self, fn): b = df.b expr = f"{fn}(a, b)" - got = self.eval(expr) + got = self.eval(expr, engine=engine, parser=parser) with np.errstate(all="ignore"): expect = getattr(np, fn)(a, b) - tm.assert_almost_equal(got, expect, check_names=False) + tm.assert_almost_equal(got, expect) def test_df_use_case(self, engine, parser): df = DataFrame( @@ -1654,8 +1662,8 @@ def test_df_use_case(self, engine, parser): inplace=True, ) got = df.e - expect = np.arctan2(np.sin(df.a), df.b) - tm.assert_series_equal(got, expect, check_names=False) + expect = np.arctan2(np.sin(df.a), df.b).rename("e") + tm.assert_series_equal(got, expect) def test_df_arithmetic_subexpression(self, engine, parser): df = DataFrame( @@ -1666,8 +1674,8 @@ def test_df_arithmetic_subexpression(self, engine, parser): ) df.eval("e = sin(a + b)", engine=engine, parser=parser, inplace=True) got = df.e - expect = np.sin(df.a + df.b) - tm.assert_series_equal(got, expect, check_names=False) + expect = np.sin(df.a + df.b).rename("e") + tm.assert_series_equal(got, expect) @pytest.mark.parametrize( "dtype, expect_dtype", @@ -1691,10 +1699,10 @@ def test_result_types(self, dtype, expect_dtype, engine, parser): assert df.a.dtype == dtype df.eval("b = sin(a)", engine=engine, parser=parser, inplace=True) got = df.b - expect = np.sin(df.a) + expect = np.sin(df.a).rename("b") assert expect.dtype == got.dtype assert expect_dtype == got.dtype - tm.assert_series_equal(got, expect, check_names=False) + tm.assert_series_equal(got, expect) def test_undefined_func(self, engine, parser): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10)}) @@ -1899,10 +1907,6 @@ def test_equals_various(other): df = DataFrame({"A": ["a", "b", "c"]}, dtype=object) result = df.eval(f"A == {other}") expected = Series([False, False, False], name="A") - if USE_NUMEXPR: - # https://github.com/pandas-dev/pandas/issues/10239 - # lose name with numexpr engine. Remove when that's fixed. 
- expected.name = None tm.assert_series_equal(result, expected) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 09d13677eef62..b10141b0d63f4 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -622,16 +622,17 @@ def test_series_subset_set_with_indexer(backend, indexer_si, indexer): s_orig = s.copy() subset = s[:] - warn = None - msg = "Series.__setitem__ treating keys as positions is deprecated" if ( indexer_si is tm.setitem and isinstance(indexer, np.ndarray) and indexer.dtype.kind == "i" ): - warn = FutureWarning - with tm.assert_produces_warning(warn, match=msg): - indexer_si(subset)[indexer] = 0 + # In 3.0 we treat integers as always-labels + with pytest.raises(KeyError): + indexer_si(subset)[indexer] = 0 + return + + indexer_si(subset)[indexer] = 0 expected = Series([0, 0, 3], index=["a", "b", "c"]) tm.assert_series_equal(subset, expected) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index c34c97b6e4f04..f47815ee059af 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -797,5 +797,5 @@ def test_pandas_dtype_numpy_warning(): def test_pandas_dtype_ea_not_instance(): # GH 31356 GH 54592 - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="without any arguments"): assert pandas_dtype(CategoricalDtype) == CategoricalDtype() diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py index 02c827853b29d..261f86bfb0326 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -124,7 +124,7 @@ def test_setattr_warnings(): # this should not raise a warning df.two.not_an_index = [1, 2] - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="doesn't allow columns"): # warn when setting column to nonexistent name df.four = df.two + 2 assert df.four.sum() > df.two.sum() diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 668e7192c0e52..f4282c9c7ac3a 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -936,9 +936,9 @@ def test_maybe_convert_objects_bool_nan(self): def test_maybe_convert_objects_nullable_boolean(self): # GH50047 arr = np.array([True, False], dtype=object) - exp = np.array([True, False]) + exp = BooleanArray._from_sequence([True, False], dtype="boolean") out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) - tm.assert_numpy_array_equal(out, exp) + tm.assert_extension_array_equal(out, exp) arr = np.array([True, False, pd.NaT], dtype=object) exp = np.array([True, False, pd.NaT], dtype=object) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 1f89c7ad9d4e4..935edce32a0ab 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -329,11 +329,10 @@ def test_get(self, data): result = s.get("Z") assert result is None - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert s.get(4) == s.iloc[4] - assert s.get(-1) == s.iloc[-1] - assert s.get(len(s)) is None + # As of 3.0, getitem with int keys treats them as labels + assert s.get(4) is None + assert s.get(-1) is None + assert s.get(len(s)) is None # GH 21257 s = pd.Series(data) diff --git a/pandas/tests/extension/base/missing.py 
b/pandas/tests/extension/base/missing.py index 4b9234a9904a2..cee565d4f7c1e 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -27,7 +27,9 @@ def test_isna_returns_copy(self, data_missing, na_func): expected = result.copy() mask = getattr(result, na_func)() if isinstance(mask.dtype, pd.SparseDtype): + # TODO: GH 57739 mask = np.array(mask) + mask.flags.writeable = True mask[:] = True tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0f5c2d1ec6199..cf1e502a723ee 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2893,12 +2893,16 @@ def test_dt_to_pytimedelta(): data = [timedelta(1, 2, 3), timedelta(1, 2, 4)] ser = pd.Series(data, dtype=ArrowDtype(pa.duration("ns"))) - result = ser.dt.to_pytimedelta() + msg = "The behavior of ArrowTemporalProperties.to_pytimedelta is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.dt.to_pytimedelta() expected = np.array(data, dtype=object) tm.assert_numpy_array_equal(result, expected) assert all(type(res) is timedelta for res in result) - expected = ser.astype("timedelta64[ns]").dt.to_pytimedelta() + msg = "The behavior of TimedeltaProperties.to_pytimedelta is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = ser.astype("timedelta64[ns]").dt.to_pytimedelta() tm.assert_numpy_array_equal(result, expected) @@ -3526,6 +3530,14 @@ def test_to_numpy_timestamp_to_int(): tm.assert_numpy_array_equal(result, expected) +@pytest.mark.parametrize("arrow_type", [pa.large_string(), pa.string()]) +def test_cast_dictionary_different_value_dtype(arrow_type): + df = pd.DataFrame({"a": ["x", "y"]}, dtype="string[pyarrow]") + data_type = ArrowDtype(pa.dictionary(pa.int32(), arrow_type)) + result = df.astype({"a": data_type}) + assert result.dtypes.iloc[0] == data_type + + def test_map_numeric_na_action(): ser = pd.Series([32, 40, None], dtype="int64[pyarrow]") result = ser.map(lambda x: 42, na_action="ignore") diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 5a6fe07aa007b..69e6228d6efde 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -145,7 +145,7 @@ def test_getitem_boolean(self, mixed_float_frame, mixed_int_frame, datetime_fram # we are producing a warning that since the passed boolean # key is not the same as the given index, we will reindex # not sure this is really necessary - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="will be reindexed"): indexer_obj = indexer_obj.reindex(datetime_frame.index[::-1]) subframe_obj = datetime_frame[indexer_obj] tm.assert_frame_equal(subframe_obj, subframe) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 3f98f49cd1877..ed81e8c8b8129 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -711,7 +711,10 @@ def test_setitem_npmatrix_2d(self): df["np-array"] = a # Instantiation of `np.matrix` gives PendingDeprecationWarning - with tm.assert_produces_warning(PendingDeprecationWarning): + with tm.assert_produces_warning( + PendingDeprecationWarning, + match="matrix subclass is not the recommended way to represent matrices", + ): df["np-matrix"] = np.matrix(a) tm.assert_frame_equal(df, expected) diff 
--git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 4d2d83d25e8da..53aa44f264c7a 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -461,3 +461,28 @@ def test_corrwith_spearman_with_tied_data(self): result = df_bool.corrwith(ser_bool) expected = Series([0.57735, 0.57735], index=["A", "B"]) tm.assert_series_equal(result, expected) + + def test_corrwith_min_periods_method(self): + # GH#9490 + pytest.importorskip("scipy") + df1 = DataFrame( + { + "A": [1, np.nan, 7, 8], + "B": [False, True, True, False], + "C": [10, 4, 9, 3], + } + ) + df2 = df1[["B", "C"]] + result = (df1 + 1).corrwith(df2.B, method="spearman", min_periods=2) + expected = Series([0.0, 1.0, 0.0], index=["A", "B", "C"]) + tm.assert_series_equal(result, expected) + + def test_corrwith_min_periods_boolean(self): + # GH#9490 + df_bool = DataFrame( + {"A": [True, True, False, False], "B": [True, False, False, True]} + ) + ser_bool = Series([True, True, False, True]) + result = df_bool.corrwith(ser_bool, min_periods=3) + expected = Series([0.57735, 0.57735], index=["A", "B"]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 0a9d059736e6f..cdb9ff8a67b6b 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -109,7 +109,7 @@ def test_interp_basic_with_non_range_index(self, using_infer_string): else: result = df.set_index("C").interpolate() expected = df.set_index("C") - expected.loc[3, "A"] = 3 + expected.loc[3, "A"] = 2.66667 expected.loc[5, "B"] = 9 tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index b8631d95a6399..0272b679e85a2 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -166,7 +166,7 @@ def test_to_dict_not_unique_warning(self): # GH#16927: When converting to a dict, if a column has a non-unique name # it will be dropped, throwing a warning. df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"]) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="columns will be omitted"): df.to_dict() @pytest.mark.filterwarnings("ignore::UserWarning") @@ -513,6 +513,20 @@ def test_to_dict_masked_native_python(self): result = df.to_dict(orient="records") assert isinstance(result[0]["a"], int) + def test_to_dict_tight_no_warning_with_duplicate_column(self): + # GH#58281 + df = DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "A"]) + with tm.assert_produces_warning(None): + result = df.to_dict(orient="tight") + expected = { + "index": [0, 1, 2], + "columns": ["A", "A"], + "data": [[1, 2], [3, 4], [5, 6]], + "index_names": [None], + "column_names": [None], + } + assert result == expected + @pytest.mark.parametrize( "val", [Timestamp(2020, 1, 1), Timedelta(1), Period("2020"), Interval(1, 2)] diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index f463b3f94fa55..91b5f905ada22 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1097,7 +1097,7 @@ def test_binop_other(self, op, value, dtype, switch_numexpr_min_elements): and expr.USE_NUMEXPR and switch_numexpr_min_elements == 0 ): - warn = UserWarning # "evaluating in Python space because ..." 
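# Sketch of the DataFrame.corrwith(min_periods=...) support added alongside the
# tests above (GH 9490). Hypothetical data, assuming the same min_periods
# semantics as DataFrame.corr: a pair with fewer overlapping non-NA
# observations than min_periods yields NaN.
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1.0, np.nan, 7.0, 8.0], "B": [10.0, 4.0, 9.0, 3.0]})
other = pd.Series([1.0, 2.0, 3.0, 4.0])
result = df.corrwith(other, min_periods=4)
assert np.isnan(result["A"])  # only 3 paired observations
assert not np.isnan(result["B"])  # all 4 rows overlap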
+ warn = UserWarning else: msg = ( f"cannot perform __{op.__name__}__ with this " @@ -1105,17 +1105,16 @@ def test_binop_other(self, op, value, dtype, switch_numexpr_min_elements): ) with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(warn, match="evaluating in Python"): op(df, elem.value) elif (op, dtype) in skip: if op in [operator.add, operator.mul]: if expr.USE_NUMEXPR and switch_numexpr_min_elements == 0: - # "evaluating in Python space because ..." warn = UserWarning else: warn = None - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(warn, match="evaluating in Python"): op(df, elem.value) else: diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 94e8e469f21e7..643d342b052a4 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -58,26 +58,26 @@ def test_query_default(self, df, expected1, expected2): result = df.query("A>0") tm.assert_frame_equal(result, expected1) result = df.eval("A+1") - tm.assert_series_equal(result, expected2, check_names=False) + tm.assert_series_equal(result, expected2) def test_query_None(self, df, expected1, expected2): result = df.query("A>0", engine=None) tm.assert_frame_equal(result, expected1) result = df.eval("A+1", engine=None) - tm.assert_series_equal(result, expected2, check_names=False) + tm.assert_series_equal(result, expected2) def test_query_python(self, df, expected1, expected2): result = df.query("A>0", engine="python") tm.assert_frame_equal(result, expected1) result = df.eval("A+1", engine="python") - tm.assert_series_equal(result, expected2, check_names=False) + tm.assert_series_equal(result, expected2) def test_query_numexpr(self, df, expected1, expected2): if NUMEXPR_INSTALLED: result = df.query("A>0", engine="numexpr") tm.assert_frame_equal(result, expected1) result = df.eval("A+1", engine="numexpr") - tm.assert_series_equal(result, expected2, check_names=False) + tm.assert_series_equal(result, expected2) else: msg = ( r"'numexpr' is not installed or an unsupported version. 
" @@ -194,8 +194,12 @@ def test_using_numpy(self, engine, parser): df = Series([0.2, 1.5, 2.8], name="a").to_frame() res = df.eval("@np.floor(a)", engine=engine, parser=parser) expected = np.floor(df["a"]) - if engine == "numexpr": - expected.name = None # See GH 58069 + tm.assert_series_equal(expected, res) + + def test_eval_simple(self, engine, parser): + df = Series([0.2, 1.5, 2.8], name="a").to_frame() + res = df.eval("a", engine=engine, parser=parser) + expected = df["a"] tm.assert_series_equal(expected, res) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 8ccd7b2ca83ba..5118561f67338 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -699,7 +699,7 @@ def test_mode_sortwarning(self, using_infer_string): expected = DataFrame({"A": ["a", np.nan]}) warning = None if using_infer_string else UserWarning - with tm.assert_produces_warning(warning): + with tm.assert_produces_warning(warning, match="Unable to sort modes"): result = df.mode(dropna=False) result = result.sort_values(by="A").reset_index(drop=True) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index be52b4a591c26..0f136b06c782a 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -329,13 +329,10 @@ def test_against_frame_and_seriesgroupby( else: name = "proportion" if normalize else "count" expected = expected.reset_index().rename({0: name}, axis=1) - if groupby == "column": - expected = expected.rename({"level_0": "country"}, axis=1) - expected["country"] = np.where(expected["country"], "US", "FR") - elif groupby == "function": - expected["level_0"] = expected["level_0"] == 1 + if groupby in ["array", "function"] and (not as_index and frame): + expected.insert(loc=0, column="level_0", value=result["level_0"]) else: - expected["level_0"] = np.where(expected["level_0"], "US", "FR") + expected.insert(loc=0, column="country", value=result["country"]) tm.assert_frame_equal(result, expected) else: # compare against SeriesGroupBy value_counts diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py index d2cfa530e7c65..33b39bad4ab81 100644 --- a/pandas/tests/groupby/test_api.py +++ b/pandas/tests/groupby/test_api.py @@ -192,6 +192,8 @@ def test_frame_consistency(groupby_func): exclude_expected = {"numeric_only"} elif groupby_func in ("quantile",): exclude_expected = {"method", "axis"} + elif groupby_func in ["corrwith"]: + exclude_expected = {"min_periods"} if groupby_func not in ["pct_change", "size"]: exclude_expected |= {"axis"} diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 1a2589fe94ea5..e27c782c1bdcf 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -315,7 +315,7 @@ def test_groupby_as_index_apply(): # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here - exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (2, 4)]) + exp_not_as_apply = Index([0, 2, 1, 4]) tp = [(1, 0), (1, 2), (2, 1), (3, 4)] exp_as_apply = MultiIndex.from_tuples(tp, names=["user_id", None]) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py index e5028884e992b..fa20efad4da77 100644 --- a/pandas/tests/groupby/test_apply_mutate.py +++ b/pandas/tests/groupby/test_apply_mutate.py @@ -90,9 +90,7 @@ def fn(x): result 
= df.groupby(["col1"], as_index=False).apply(fn) expected = pd.Series( [1, 2, 0, 4, 5, 0], - index=pd.MultiIndex.from_tuples( - [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)] - ), + index=range(6), name="col2", ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4764bcb64fd0c..b66664f05122c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -114,8 +114,9 @@ def f(x, q=None, axis=0): expected_seq = df_grouped.quantile([0.4, 0.8]) if not as_index: # apply treats the op as a transform; .quantile knows it's a reduction - apply_result = apply_result.reset_index() - apply_result["level_0"] = [1, 1, 2, 2] + apply_result.index = range(4) + apply_result.insert(loc=0, column="level_0", value=[1, 1, 2, 2]) + apply_result.insert(loc=1, column="level_1", value=[0.4, 0.8, 0.4, 0.8]) tm.assert_frame_equal(apply_result, expected_seq, check_names=False) agg_result = df_grouped.agg(f, q=80) @@ -520,9 +521,7 @@ def test_as_index_select_column(): result = df.groupby("A", as_index=False, group_keys=True)["B"].apply( lambda x: x.cumsum() ) - expected = Series( - [2, 6, 6], name="B", index=MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)]) - ) + expected = Series([2, 6, 6], name="B", index=range(3)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 49c6a91236db7..d57df82b2358c 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -84,13 +84,13 @@ def test_union_sort_other_incomparable(self): # https://github.com/pandas-dev/pandas/issues/24959 idx = Index([1, pd.Timestamp("2000")]) # default (sort=None) - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="not supported between"): result = idx.union(idx[:1]) tm.assert_index_equal(result, idx) # sort=None - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="not supported between"): result = idx.union(idx[:1], sort=None) tm.assert_index_equal(result, idx) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 05e9a294d74a6..5b2cc55d6dc56 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -117,10 +117,10 @@ def test_to_period_infer(self): freq="5min", ) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): pi1 = rng.to_period("5min") - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): pi2 = rng.to_period() tm.assert_index_equal(pi1, pi2) @@ -143,8 +143,7 @@ def test_to_period_millisecond(self): ] ) - with tm.assert_produces_warning(UserWarning): - # warning that timezone info will be lost + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): period = index.to_period(freq="ms") assert 2 == len(period) assert period[0] == Period("2007-01-01 10:11:12.123Z", "ms") @@ -158,8 +157,7 @@ def test_to_period_microsecond(self): ] ) - with tm.assert_produces_warning(UserWarning): - # warning that timezone info will be lost + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): period = index.to_period(freq="us") assert 2 == 
len(period) assert period[0] == Period("2007-01-01 10:11:12.123456Z", "us") @@ -172,10 +170,7 @@ def test_to_period_microsecond(self): def test_to_period_tz(self, tz): ts = date_range("1/1/2000", "2/1/2000", tz=tz) - with tm.assert_produces_warning(UserWarning): - # GH#21333 warning that timezone info will be lost - # filter warning about freq deprecation - + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): result = ts.to_period()[0] expected = ts[0].to_period(ts.freq) @@ -183,8 +178,7 @@ def test_to_period_tz(self, tz): expected = date_range("1/1/2000", "2/1/2000").to_period() - with tm.assert_produces_warning(UserWarning): - # GH#21333 warning that timezone info will be lost + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): result = ts.to_period(ts.freq) tm.assert_index_equal(result, expected) @@ -193,7 +187,7 @@ def test_to_period_tz(self, tz): def test_to_period_tz_utc_offset_consistency(self, tz): # GH#22905 ts = date_range("1/1/2000", "2/1/2000", tz="Etc/GMT-1") - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="drop timezone info"): result = ts.to_period()[0] expected = ts[0].to_period(ts.freq) assert result == expected diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 18d64999de496..f08a7625e7f8a 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -919,30 +919,41 @@ def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_id assert result == expected -def test_pyint_engine(): +@pytest.mark.parametrize( + "N, expected_dtype", + [ + (1, "uint8"), # 2*4*N = 8 + (2, "uint16"), # 2*4*N = 16 + (4, "uint32"), # 2*4*N = 32 + (8, "uint64"), # 2*4*N = 64 + (10, "object"), # 2*4*N = 80 + ], +) +def test_pyint_engine(N, expected_dtype): # GH#18519 : when combinations of codes cannot be represented in 64 # bits, the index underlying the MultiIndex engine works with Python # integers, rather than uint64. - N = 5 keys = [ tuple(arr) for arr in [ - [0] * 10 * N, - [1] * 10 * N, - [2] * 10 * N, - [np.nan] * N + [2] * 9 * N, - [0] * N + [2] * 9 * N, - [np.nan] * N + [2] * 8 * N + [0] * N, + [0] * 4 * N, + [1] * 4 * N, + [np.nan] * N + [0] * 3 * N, + [0] * N + [1] * 3 * N, + [np.nan] * N + [1] * 2 * N + [0] * N, ] ] - # Each level contains 4 elements (including NaN), so it is represented - # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a - # 64 bit engine and truncating the first levels, the fourth and fifth - # keys would collide; if truncating the last levels, the fifth and - # sixth; if rotating bits rather than shifting, the third and fifth. + # Each level contains 3 elements (NaN, 0, 1), and it's represented + # in 2 bits to store 4 possible values (0=notfound, 1=NaN, 2=0, 3=1), for + # a total of 2*N*4 = 80 > 64 bits where N=10 and the number of levels is N*4. + # If we were using a 64 bit engine and truncating the first levels, the + # fourth and fifth keys would collide; if truncating the last levels, the + # fifth and sixth; if rotating bits rather than shifting, the third and fifth. 
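# Worked form of the bit-packing arithmetic in the comment above: each of the
# 4*N levels needs 2 bits, the engine picks the smallest unsigned integer
# dtype that holds the 2*4*N packed bits, and falls back to Python ints
# (object dtype) past 64 bits. A self-contained sketch reproducing the
# parametrization table:
def expected_engine_dtype(N: int) -> str:
    bits = 2 * 4 * N  # 2 bits per level, 4*N levels per key
    for width in (8, 16, 32, 64):
        if bits <= width:
            return f"uint{width}"
    return "object"

assert [expected_engine_dtype(N) for N in (1, 2, 4, 8, 10)] == [
    "uint8", "uint16", "uint32", "uint64", "object"
]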
+ + index = MultiIndex.from_tuples(keys) + assert index._engine.values.dtype == expected_dtype for idx, key_value in enumerate(keys): - index = MultiIndex.from_tuples(keys) assert index.get_loc(key_value) == idx expected = np.arange(idx + 1, dtype=np.intp) @@ -952,7 +963,7 @@ def test_pyint_engine(): # With missing key: idces = range(len(keys)) expected = np.array([-1] + list(idces), dtype=np.intp) - missing = tuple([0, 1] * 5 * N) + missing = tuple([0, 1, 0, 1] * N) result = index.get_indexer([missing] + [keys[i] for i in idces]) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 9354984538c58..47f21cc7f8182 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -382,7 +382,7 @@ def test_union_sort_other_incomparable(): idx = MultiIndex.from_product([[1, pd.Timestamp("2000")], ["a", "b"]]) # default, sort=None - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="are unorderable"): result = idx.union(idx[:1]) tm.assert_index_equal(result, idx) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 727edb7ae30ad..1f9df30d60c11 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -874,3 +874,36 @@ def test_getitem_integers_return_index(): result = RangeIndex(0, 10, 2, name="foo")[[0, 1, -1]] expected = Index([0, 2, 8], dtype="int64", name="foo") tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize( + "rng", + [ + range(3), + range(0), + range(0, 3, 2), + range(3, -3, -2), + ], +) +def test_value_counts(sort, dropna, ascending, normalize, rng): + ri = RangeIndex(rng, name="A") + result = ri.value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + expected = Index(list(rng), name="A").value_counts( + normalize=normalize, sort=sort, ascending=ascending, dropna=dropna + ) + tm.assert_series_equal(result, expected, check_index_type=False) + + +@pytest.mark.parametrize("side", ["left", "right"]) +@pytest.mark.parametrize("value", [0, -5, 5, -3, np.array([-5, -3, 0, 5])]) +def test_searchsorted(side, value): + ri = RangeIndex(-3, 3, 2) + result = ri.searchsorted(value=value, side=side) + expected = Index(list(ri)).searchsorted(value=value, side=side) + if isinstance(value, int): + assert result == expected + else: + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 3a2d04d3ffdc2..2e94961b673f8 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -71,8 +71,8 @@ def test_constructor_casting(self, index): tm.assert_contains_all(arr, new_index) tm.assert_index_equal(index, new_index) - @pytest.mark.parametrize("index", ["string"], indirect=True) - def test_constructor_copy(self, index, using_infer_string): + def test_constructor_copy(self, using_infer_string): + index = Index(list("abc"), name="name") arr = np.array(index) new_index = Index(arr, copy=True, name="name") assert isinstance(new_index, Index) @@ -481,7 +481,7 @@ def test_empty_fancy(self, index, dtype, request, using_infer_string): assert index[[]].identical(empty_index) if dtype == np.bool_: - with tm.assert_produces_warning(FutureWarning, match="is deprecated"): + with pytest.raises(ValueError, match="length of the 
boolean indexer"): assert index[empty_arr].identical(empty_index) else: assert index[empty_arr].identical(empty_index) @@ -1065,10 +1065,10 @@ def test_outer_join_sort(self): left_index = Index(np.random.default_rng(2).permutation(15)) right_index = date_range("2020-01-01", periods=10) - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="not supported between"): result = left_index.join(right_index, how="outer") - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="not supported between"): expected = left_index.astype(object).union(right_index.astype(object)) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index 21cb0b8723d59..b544ebac43ece 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -142,25 +142,18 @@ def test_constructor_infer_nat_dt_like( data = [ctor] data.insert(pos, nulls_fixture) - warn = None if nulls_fixture is NA: expected = Index([NA, NaT]) mark = pytest.mark.xfail(reason="Broken with np.NaT ctor; see GH 31884") request.applymarker(mark) - # GH#35942 numpy will emit a DeprecationWarning within the - # assert_index_equal calls. Since we can't do anything - # about it until GH#31884 is fixed, we suppress that warning. - warn = DeprecationWarning result = Index(data) - with tm.assert_produces_warning(warn): - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) result = Index(np.array(data, dtype=object)) - with tm.assert_produces_warning(warn): - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("swap_objs", [True, False]) def test_constructor_mixed_nat_objs_infers_object(self, swap_objs): diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 9b4470021cc1d..b929616c814ee 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -326,6 +326,30 @@ def test_memory_usage(self, index): if index.inferred_type == "object": assert result3 > result2 + def test_memory_usage_doesnt_trigger_engine(self, index): + index._cache.clear() + assert "_engine" not in index._cache + + res_without_engine = index.memory_usage() + assert "_engine" not in index._cache + + # explicitly load and cache the engine + _ = index._engine + assert "_engine" in index._cache + + res_with_engine = index.memory_usage() + + # the empty engine doesn't affect the result even when initialized with values, + # because engine.sizeof() doesn't consider the content of engine.values + assert res_with_engine == res_without_engine + + if len(index) == 0: + assert res_without_engine == 0 + assert res_with_engine == 0 + else: + assert res_without_engine > 0 + assert res_with_engine > 0 + def test_argsort(self, index): if isinstance(index, CategoricalIndex): pytest.skip(f"{type(self).__name__} separately tested") diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 9a3471fe526c1..8fd349dacf9e9 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -882,7 +882,7 @@ def test_difference_incomparable(self, opname): b = Index([2, Timestamp("1999"), 1]) op = operator.methodcaller(opname, b) - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="not supported between"): # sort=None, the default 
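# The pattern threaded through this whole patch: giving
# tm.assert_produces_warning a match= regex (checked with re.search against
# the warning message) makes each test fail if a different warning of the
# same category fires instead. Self-contained illustration:
import warnings
import pandas._testing as tm

with tm.assert_produces_warning(UserWarning, match="will be reindexed"):
    warnings.warn("Boolean Series key will be reindexed to match.", UserWarning)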
result = op(a) expected = Index([3, Timestamp("2000"), 2, Timestamp("1999")]) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index d51a986a22f1e..d4bc0341e732e 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -117,16 +117,8 @@ def test_setitem_index_object(self, val, exp_dtype): obj = pd.Series([1, 2, 3, 4], index=pd.Index(list("abcd"), dtype=object)) assert obj.index.dtype == object - if exp_dtype is IndexError: - temp = obj.copy() - warn_msg = "Series.__setitem__ treating keys as positions is deprecated" - msg = "index 5 is out of bounds for axis 0 with size 4" - with pytest.raises(exp_dtype, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - temp[5] = 5 - else: - exp_index = pd.Index(list("abcd") + [val], dtype=object) - self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) + exp_index = pd.Index(list("abcd") + [val], dtype=object) + self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) @pytest.mark.parametrize( "val,exp_dtype", [(5, np.int64), (1.1, np.float64), ("x", object)] diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 1fe431e12f2a1..8597ee1198ff0 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -87,11 +87,11 @@ def test_scalar_non_numeric(self, index, frame_or_series, indexer_sl): ], ) def test_scalar_non_numeric_series_fallback(self, index): - # fallsback to position selection, series only + # starting in 3.0, integer keys are always treated as labels, no longer + # fall back to positional. s = Series(np.arange(len(index)), index=index) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(KeyError, match="3"): s[3] with pytest.raises(KeyError, match="^3.0$"): s[3.0] @@ -118,12 +118,9 @@ def test_scalar_with_mixed(self, indexer_sl): indexer_sl(s3)[1.0] if indexer_sl is not tm.loc: - # __getitem__ falls back to positional - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s3[1] - expected = 2 - assert result == expected + # as of 3.0, __getitem__ no longer falls back to positional + with pytest.raises(KeyError, match="^1$"): + s3[1] with pytest.raises(KeyError, match=r"^1\.0$"): indexer_sl(s3)[1.0] diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 92addeb29252a..749e2c4a86b55 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -347,7 +347,7 @@ def test_split(self): # GH#37799 values = np.random.default_rng(2).standard_normal((3, 4)) blk = new_block(values, placement=BlockPlacement([3, 1, 6]), ndim=2) - result = blk._split() + result = list(blk._split()) # check that we get views, not copies values[:] = -9999 @@ -1280,19 +1280,20 @@ def test_interval_can_hold_element(self, dtype, element): # `elem` to not have the same length as `arr` ii2 = IntervalIndex.from_breaks(arr[:-1], closed="neither") elem = element(ii2) - with tm.assert_produces_warning(FutureWarning): + msg = "Setting an item of incompatible dtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) ii3 = IntervalIndex.from_breaks([Timestamp(1), Timestamp(3), Timestamp(4)]) elem = 
element(ii3) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, match=msg): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) ii4 = IntervalIndex.from_breaks([Timedelta(1), Timedelta(3), Timedelta(4)]) elem = element(ii4) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, match=msg): self.check_series_setitem(elem, ii, False) assert not blk._can_hold_element(elem) @@ -1312,12 +1313,13 @@ def test_period_can_hold_element(self, element): # `elem` to not have the same length as `arr` pi2 = pi.asfreq("D")[:-1] elem = element(pi2) - with tm.assert_produces_warning(FutureWarning): + msg = "Setting an item of incompatible dtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): self.check_series_setitem(elem, pi, False) dti = pi.to_timestamp("s")[:-1] elem = element(dti) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, match=msg): self.check_series_setitem(elem, pi, False) def check_can_hold_element(self, obj, elem, inplace: bool): diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py index 8bf9aa4ac04d3..c4ecb48006cb1 100644 --- a/pandas/tests/io/formats/test_css.py +++ b/pandas/tests/io/formats/test_css.py @@ -38,30 +38,31 @@ def test_css_parse_normalisation(name, norm, abnorm): @pytest.mark.parametrize( - "invalid_css,remainder", + "invalid_css,remainder,msg", [ # No colon - ("hello-world", ""), - ("border-style: solid; hello-world", "border-style: solid"), + ("hello-world", "", "expected a colon"), + ("border-style: solid; hello-world", "border-style: solid", "expected a colon"), ( "border-style: solid; hello-world; font-weight: bold", "border-style: solid; font-weight: bold", + "expected a colon", ), # Unclosed string fail # Invalid size - ("font-size: blah", "font-size: 1em"), - ("font-size: 1a2b", "font-size: 1em"), - ("font-size: 1e5pt", "font-size: 1em"), - ("font-size: 1+6pt", "font-size: 1em"), - ("font-size: 1unknownunit", "font-size: 1em"), - ("font-size: 10", "font-size: 1em"), - ("font-size: 10 pt", "font-size: 1em"), + ("font-size: blah", "font-size: 1em", "Unhandled size"), + ("font-size: 1a2b", "font-size: 1em", "Unhandled size"), + ("font-size: 1e5pt", "font-size: 1em", "Unhandled size"), + ("font-size: 1+6pt", "font-size: 1em", "Unhandled size"), + ("font-size: 1unknownunit", "font-size: 1em", "Unhandled size"), + ("font-size: 10", "font-size: 1em", "Unhandled size"), + ("font-size: 10 pt", "font-size: 1em", "Unhandled size"), # Too many args - ("border-top: 1pt solid red green", "border-top: 1pt solid green"), + ("border-top: 1pt solid red green", "border-top: 1pt solid green", "Too many"), ], ) -def test_css_parse_invalid(invalid_css, remainder): - with tm.assert_produces_warning(CSSWarning): +def test_css_parse_invalid(invalid_css, remainder, msg): + with tm.assert_produces_warning(CSSWarning, match=msg): assert_same_resolution(invalid_css, remainder) @@ -120,7 +121,7 @@ def test_css_side_shorthands(shorthand, expansions): {top: "1pt", right: "4pt", bottom: "2pt", left: "0pt"}, ) - with tm.assert_produces_warning(CSSWarning): + with tm.assert_produces_warning(CSSWarning, match="Could not expand"): assert_resolves(f"{shorthand}: 1pt 1pt 1pt 1pt 1pt", {}) diff --git a/pandas/tests/io/formats/test_to_excel.py b/pandas/tests/io/formats/test_to_excel.py index 3b782713eed6c..b40201b9ba1e6 100644 --- a/pandas/tests/io/formats/test_to_excel.py +++ 
b/pandas/tests/io/formats/test_to_excel.py @@ -325,7 +325,7 @@ def test_css_to_excel_bad_colors(input_color): if input_color is not None: expected["fill"] = {"patternType": "solid"} - with tm.assert_produces_warning(CSSWarning): + with tm.assert_produces_warning(CSSWarning, match="Unhandled color format"): convert = CSSToExcelConverter() assert expected == convert(css) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index ec49b7644ea0e..a0d5b3a741aaf 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -639,7 +639,7 @@ def test_warns_non_roundtrippable_names(self, idx): # GH 19130 df = DataFrame(index=idx) df.index.name = "index" - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match="not round-trippable"): set_default_names(df) def test_timestamp_in_columns(self): diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 5f19c15817ce7..babbddafa3b49 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -222,7 +222,7 @@ def test_excel_sep_warning(self, df): # Separator is ignored when excel=False and should produce a warning def test_copy_delim_warning(self, df): - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match="ignores the sep argument"): df.to_clipboard(excel=False, sep="\t") # Tests that the default behavior of to_clipboard is tab diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index f5880d8a894f8..ad729d2346a3b 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -463,7 +463,7 @@ def test_warning_missing_utf_bom(self, encoding, compression_): index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), ) with tm.ensure_clean() as path: - with tm.assert_produces_warning(UnicodeWarning): + with tm.assert_produces_warning(UnicodeWarning, match="byte order mark"): df.to_csv(path, compression=compression_, encoding=encoding) # reading should fail (otherwise we wouldn't need the warning) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 3a58dda9e8dc4..00082be7e07e8 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -133,7 +133,7 @@ def test_compression_warning(compression_only): ) with tm.ensure_clean() as path: with icom.get_handle(path, "w", compression=compression_only) as handles: - with tm.assert_produces_warning(RuntimeWarning): + with tm.assert_produces_warning(RuntimeWarning, match="has no effect"): df.to_csv(handles.handle, compression=compression_only) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 3083fa24ba8b5..af77972d9fd26 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2602,7 +2602,7 @@ def close(self): self.conn.close() with contextlib.closing(MockSqliteConnection(":memory:")) as conn: - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="only supports SQLAlchemy"): sql.read_sql("SELECT 1", conn) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 43c62237c6786..d7fb3c0049965 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -189,11 +189,12 @@ def test_read_dta2(self, datapath): path2 = datapath("io", "data", "stata", "stata2_115.dta") path3 = datapath("io", "data", "stata", "stata2_117.dta") - with 
tm.assert_produces_warning(UserWarning): + msg = "Leaving in Stata Internal Format" + with tm.assert_produces_warning(UserWarning, match=msg): parsed_114 = self.read_dta(path1) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=msg): parsed_115 = self.read_dta(path2) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=msg): parsed_117 = self.read_dta(path3) # FIXME: don't leave commented-out # 113 is buggy due to limits of date format support in Stata @@ -478,7 +479,8 @@ def test_read_write_dta11(self, temp_file): formatted = formatted.astype(np.int32) path = temp_file - with tm.assert_produces_warning(InvalidColumnName): + msg = "Not all pandas column names were valid Stata variable names" + with tm.assert_produces_warning(InvalidColumnName, match=msg): original.to_stata(path, convert_dates=None) written_and_read_again = self.read_dta(path) @@ -515,7 +517,8 @@ def test_read_write_dta12(self, version, temp_file): formatted = formatted.astype(np.int32) path = temp_file - with tm.assert_produces_warning(InvalidColumnName): + msg = "Not all pandas column names were valid Stata variable names" + with tm.assert_produces_warning(InvalidColumnName, match=msg): original.to_stata(path, convert_dates=None, version=version) # should get a warning for that format. @@ -612,7 +615,8 @@ def test_numeric_column_names(self, temp_file): original.index.name = "index" path = temp_file # should get a warning for that format. - with tm.assert_produces_warning(InvalidColumnName): + msg = "Not all pandas column names were valid Stata variable names" + with tm.assert_produces_warning(InvalidColumnName, match=msg): original.to_stata(path) written_and_read_again = self.read_dta(path) @@ -672,7 +676,7 @@ def test_large_value_conversion(self, temp_file): original = DataFrame({"s0": s0, "s1": s1, "s2": s2, "s3": s3}) original.index.name = "index" path = temp_file - with tm.assert_produces_warning(PossiblePrecisionLoss): + with tm.assert_produces_warning(PossiblePrecisionLoss, match="from int64 to"): original.to_stata(path) written_and_read_again = self.read_dta(path) @@ -687,7 +691,8 @@ def test_dates_invalid_column(self, temp_file): original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)]) original.index.name = "index" path = temp_file - with tm.assert_produces_warning(InvalidColumnName): + msg = "Not all pandas column names were valid Stata variable names" + with tm.assert_produces_warning(InvalidColumnName, match=msg): original.to_stata(path, convert_dates={0: "tc"}) written_and_read_again = self.read_dta(path) @@ -1111,7 +1116,8 @@ def test_categorical_warnings_and_errors(self, temp_file): [["a"], ["b"], ["c"], ["d"], [1]], columns=["Too_long"] ).astype("category") - with tm.assert_produces_warning(ValueLabelTypeMismatch): + msg = "data file created has not lost information due to duplicate labels" + with tm.assert_produces_warning(ValueLabelTypeMismatch, match=msg): original.to_stata(path) # should get a warning for mixed content @@ -1732,7 +1738,8 @@ def test_convert_strl_name_swap(self, temp_file): ) original.index.name = "index" - with tm.assert_produces_warning(InvalidColumnName): + msg = "Not all pandas column names were valid Stata variable names" + with tm.assert_produces_warning(InvalidColumnName, match=msg): path = temp_file original.to_stata(path, convert_strl=["long", 1], version=117) reread = self.read_dta(path) @@ -1962,7 +1969,7 @@ def test_writer_118_exceptions(self, temp_file): "dtype_backend", 
["numpy_nullable", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))], ) - def test_read_write_ea_dtypes(self, dtype_backend, temp_file): + def test_read_write_ea_dtypes(self, dtype_backend, temp_file, tmp_path): df = DataFrame( { "a": [1, 2, None], @@ -1974,7 +1981,8 @@ def test_read_write_ea_dtypes(self, dtype_backend, temp_file): index=pd.Index([0, 1, 2], name="index"), ) df = df.convert_dtypes(dtype_backend=dtype_backend) - df.to_stata("test_stata.dta", version=118) + stata_path = tmp_path / "test_stata.dta" + df.to_stata(stata_path, version=118) df.to_stata(temp_file) written_and_read_again = self.read_dta(temp_file) @@ -2138,8 +2146,9 @@ def test_chunked_categorical(version, temp_file): def test_chunked_categorical_partial(datapath): dta_file = datapath("io", "data", "stata", "stata-dta-partially-labeled.dta") values = ["a", "b", "a", "b", 3.0] + msg = "series with value labels are not fully labeled" with StataReader(dta_file, chunksize=2) as reader: - with tm.assert_produces_warning(CategoricalConversionWarning): + with tm.assert_produces_warning(CategoricalConversionWarning, match=msg): for i, block in enumerate(reader): assert list(block.cats) == values[2 * i : 2 * (i + 1)] if i < 2: @@ -2147,7 +2156,7 @@ def test_chunked_categorical_partial(datapath): else: idx = pd.Index([3.0], dtype="float64") tm.assert_index_equal(block.cats.cat.categories, idx) - with tm.assert_produces_warning(CategoricalConversionWarning): + with tm.assert_produces_warning(CategoricalConversionWarning, match=msg): with StataReader(dta_file, chunksize=5) as reader: large_chunk = reader.__next__() direct = read_stata(dta_file) @@ -2303,7 +2312,8 @@ def test_non_categorical_value_label_name_conversion(temp_file): "_1__2_": {3: "three"}, } - with tm.assert_produces_warning(InvalidColumnName): + msg = "Not all pandas column names were valid Stata variable names" + with tm.assert_produces_warning(InvalidColumnName, match=msg): data.to_stata(temp_file, value_labels=value_labels) with StataReader(temp_file) as reader: diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 65c9083d9fe2b..adb56a40b0071 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -1629,7 +1629,7 @@ def test_pie_df_subplots(self): for ax in axes: _check_text_labels(ax.texts, df.index) for ax, ylabel in zip(axes, df.columns): - assert ax.get_ylabel() == ylabel + assert ax.get_ylabel() == "" def test_pie_df_labels_colors(self): df = DataFrame( @@ -2001,7 +2001,7 @@ def _check(axes): plt.close("all") gs, axes = _generate_4_axes_via_gridspec() - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=axes, sharex=True) _check(axes) @@ -2065,7 +2065,7 @@ def _check(axes): plt.close("all") gs, axes = _generate_4_axes_via_gridspec() - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=axes, sharey=True) gs.tight_layout(plt.gcf()) @@ -2186,7 +2186,7 @@ def _get_horizontal_grid(): # vertical / subplots / sharex=True / sharey=True ax1, ax2 = _get_vertical_grid() - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, sharey=True) assert len(axes[0].lines) == 1 assert len(axes[1].lines) == 1 @@ -2202,7 +2202,7 @@ def 
_get_horizontal_grid(): # horizontal / subplots / sharex=True / sharey=True ax1, ax2 = _get_horizontal_grid() - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, sharey=True) assert len(axes[0].lines) == 1 assert len(axes[1].lines) == 1 @@ -2252,7 +2252,7 @@ def _get_boxed_grid(): # subplots / sharex=True / sharey=True axes = _get_boxed_grid() - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=axes, sharex=True, sharey=True) for ax in axes: assert len(ax.lines) == 1 diff --git a/pandas/tests/plotting/frame/test_frame_subplots.py b/pandas/tests/plotting/frame/test_frame_subplots.py index 511266d5786c5..a98f4b56ebf4d 100644 --- a/pandas/tests/plotting/frame/test_frame_subplots.py +++ b/pandas/tests/plotting/frame/test_frame_subplots.py @@ -335,7 +335,7 @@ def test_subplots_multiple_axes_2_dim(self, layout, exp_layout): np.random.default_rng(2).random((10, 4)), index=list(string.ascii_letters[:10]), ) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="layout keyword is ignored"): returned = df.plot( subplots=True, ax=axes, layout=layout, sharex=False, sharey=False ) @@ -501,7 +501,7 @@ def test_df_subplots_patterns_minorticks_1st_ax_hidden(self): columns=list("AB"), ) _, axes = plt.subplots(2, 1) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.plot(subplots=True, ax=axes, sharex=True) for ax in axes: assert len(ax.lines) == 1 diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index f8029a1c1ee40..573f95eed15ef 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -129,7 +129,8 @@ def test_boxplot_legacy2_with_multi_col(self): df["Y"] = Series(["A"] * 10) # Multiple columns with an ax argument should use same figure fig, ax = mpl.pyplot.subplots() - with tm.assert_produces_warning(UserWarning): + msg = "the figure containing the passed axes is being cleared" + with tm.assert_produces_warning(UserWarning, match=msg): axes = df.boxplot( column=["Col1", "Col2"], by="X", ax=ax, return_type="axes" ) @@ -607,7 +608,7 @@ def test_grouped_box_multiple_axes(self, hist_df): # passes multiple axes to plot, hist or boxplot # location should be changed if other test is added # which has earlier alphabetical order - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): _, axes = mpl.pyplot.subplots(2, 2) df.groupby("category").boxplot(column="height", return_type="axes", ax=axes) _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(2, 2)) @@ -617,7 +618,7 @@ def test_grouped_box_multiple_axes_on_fig(self, hist_df): # GH 6970, GH 7069 df = hist_df fig, axes = mpl.pyplot.subplots(2, 3) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): returned = df.boxplot( column=["height", "weight", "category"], by="gender", @@ -630,7 +631,7 @@ def test_grouped_box_multiple_axes_on_fig(self, hist_df): assert returned[0].figure is fig # draw on second row - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): returned = 
df.groupby("classroom").boxplot( column=["height", "weight", "category"], return_type="axes", ax=axes[1] ) @@ -647,7 +648,7 @@ def test_grouped_box_multiple_axes_ax_error(self, hist_df): _, axes = mpl.pyplot.subplots(2, 3) with pytest.raises(ValueError, match=msg): # pass different number of axes from required - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="sharex and sharey"): axes = df.groupby("classroom").boxplot(ax=axes) def test_fontsize(self): diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 6b709522bab70..4b4eeada58366 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -1432,13 +1432,19 @@ def test_mpl_nopandas(self): values1 = np.arange(10.0, 11.0, 0.5) values2 = np.arange(11.0, 12.0, 0.5) - kw = {"fmt": "-", "lw": 4} - _, ax = mpl.pyplot.subplots() - ax.plot_date([x.toordinal() for x in dates], values1, **kw) - ax.plot_date([x.toordinal() for x in dates], values2, **kw) - - line1, line2 = ax.get_lines() + ( + line1, + line2, + ) = ax.plot( + [x.toordinal() for x in dates], + values1, + "-", + [x.toordinal() for x in dates], + values2, + "-", + linewidth=4, + ) exp = np.array([x.toordinal() for x in dates], dtype=np.float64) tm.assert_numpy_array_equal(line1.get_xydata()[:, 0], exp) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 9fbc20e10f5c1..54f09c7007330 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -378,7 +378,7 @@ def test_pie_series(self): ) ax = _check_plot_works(series.plot.pie) _check_text_labels(ax.texts, series.index) - assert ax.get_ylabel() == "YLABEL" + assert ax.get_ylabel() == "" def test_pie_series_no_label(self): series = Series( diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 46753b668a8b0..422ed8d4f3d2b 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1558,7 +1558,7 @@ def test_mode_sortwarning(self): expected = Series(["foo", np.nan]) s = Series([1, "foo", "foo", np.nan, np.nan]) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="Unable to sort modes"): result = s.mode(dropna=False) result = result.sort_values().reset_index(drop=True) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 9cd51b95d6efd..3428abacd509e 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -25,6 +25,29 @@ from pandas.core.resample import _asfreq_compat +@pytest.fixture( + params=[ + "linear", + "time", + "index", + "values", + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "barycentric", + "krogh", + "from_derivatives", + "piecewise_polynomial", + "pchip", + "akima", + ], +) +def all_1d_no_arg_interpolation_methods(request): + return request.param + + @pytest.mark.parametrize("freq", ["2D", "1h"]) @pytest.mark.parametrize( "index", @@ -91,6 +114,56 @@ def test_resample_interpolate(index): tm.assert_frame_equal(result, expected) +def test_resample_interpolate_regular_sampling_off_grid( + all_1d_no_arg_interpolation_methods, +): + pytest.importorskip("scipy") + # GH#21351 + index = date_range("2000-01-01 00:01:00", periods=5, freq="2h") + ser = Series(np.arange(5.0), index) + + method = all_1d_no_arg_interpolation_methods + # Resample to 1 hour sampling and 
interpolate with the given method + ser_resampled = ser.resample("1h").interpolate(method) + + # Check that none of the resampled values are NaN, except the first one + # which lies 1 minute before the first actual data point + assert np.isnan(ser_resampled.iloc[0]) + assert not ser_resampled.iloc[1:].isna().any() + + if method not in ["nearest", "zero"]: + # Check that the resampled values are close to the expected values + # except for methods with known inaccuracies + assert np.all( + np.isclose(ser_resampled.values[1:], np.arange(0.5, 4.5, 0.5), rtol=1.0e-1) + ) + + +def test_resample_interpolate_irregular_sampling(all_1d_no_arg_interpolation_methods): + pytest.importorskip("scipy") + # GH#21351 + ser = Series( + np.linspace(0.0, 1.0, 5), + index=DatetimeIndex( + [ + "2000-01-01 00:00:03", + "2000-01-01 00:00:22", + "2000-01-01 00:00:24", + "2000-01-01 00:00:31", + "2000-01-01 00:00:39", + ] + ), + ) + + # Resample to 5 second sampling and interpolate with the given method + ser_resampled = ser.resample("5s").interpolate(all_1d_no_arg_interpolation_methods) + + # Check that none of the resampled values are NaN, except the first one + # which lies 3 seconds before the first actual data point + assert np.isnan(ser_resampled.iloc[0]) + assert not ser_resampled.iloc[1:].isna().any() + + def test_raises_on_non_datetimelike_index(): # this is a non datetimelike index xp = DataFrame() diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 11ad9240527d5..5f5a54c4d92a3 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -333,26 +333,98 @@ def test_upsample_sum(method, method_args, expected_values): tm.assert_series_equal(result, expected) -def test_groupby_resample_interpolate(): +@pytest.fixture +def groupby_test_df(): + return DataFrame( + {"price": [10, 11, 9], "volume": [50, 60, 50]}, + index=date_range("01/01/2018", periods=3, freq="W"), + ) + + +def test_groupby_resample_interpolate_raises(groupby_test_df): # GH 35325 + + # Make a copy of the test data frame that has index.name=None + groupby_test_df_without_index_name = groupby_test_df.copy() + groupby_test_df_without_index_name.index.name = None + + dfs = [groupby_test_df, groupby_test_df_without_index_name] + + for df in dfs: + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + with pytest.raises( + NotImplementedError, + match="Direct interpolation of MultiIndex data frames is " + "not supported", + ): + df.groupby("volume").resample("1D").interpolate(method="linear") + + +def test_groupby_resample_interpolate_with_apply_syntax(groupby_test_df): # GH 35325 - d = {"price": [10, 11, 9], "volume": [50, 60, 50]} - df = DataFrame(d) + # Make a copy of the test data frame that has index.name=None + groupby_test_df_without_index_name = groupby_test_df.copy() - df["week_starting"] = date_range("01/01/2018", periods=3, freq="W") + groupby_test_df_without_index_name.index.name = None - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = ( - df.set_index("week_starting") - .groupby("volume") - .resample("1D") - .interpolate(method="linear") + dfs = [groupby_test_df, groupby_test_df_without_index_name] + + for df in dfs: + result = df.groupby("volume").apply( + lambda x: x.resample("1d").interpolate(method="linear"), + include_groups=False, ) - volume = [50] * 15 + [60] - 
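# Sketch of the GH#21351 behavior the resample/interpolate tests above pin
# down: interpolation is now evaluated at the resampled timestamps themselves,
# so stamps that fall between the original (off-grid) observations receive
# interpolated values rather than NaN; only stamps before the first
# observation stay NaN.
import numpy as np
import pandas as pd

ser = pd.Series(
    np.arange(5.0), pd.date_range("2000-01-01 00:01:00", periods=5, freq="2h")
)
out = ser.resample("1h").interpolate("linear")
assert np.isnan(out.iloc[0]) and out.iloc[1:].notna().all()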
week_starting = list(date_range("2018-01-07", "2018-01-21")) + [ - Timestamp("2018-01-14") + volume = [50] * 15 + [60] + week_starting = list(date_range("2018-01-07", "2018-01-21")) + [ + Timestamp("2018-01-14") + ] + expected_ind = pd.MultiIndex.from_arrays( + [volume, week_starting], + names=["volume", df.index.name], + ) + + expected = DataFrame( + data={ + "price": [ + 10.0, + 9.928571428571429, + 9.857142857142858, + 9.785714285714286, + 9.714285714285714, + 9.642857142857142, + 9.571428571428571, + 9.5, + 9.428571428571429, + 9.357142857142858, + 9.285714285714286, + 9.214285714285714, + 9.142857142857142, + 9.071428571428571, + 9.0, + 11.0, + ] + }, + index=expected_ind, + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_resample_interpolate_with_apply_syntax_off_grid(groupby_test_df): + """Similar test to test_groupby_resample_interpolate_with_apply_syntax, but + with resampling that results in missing anchor points when interpolating. + See GH#21351.""" + # GH#21351 + result = groupby_test_df.groupby("volume").apply( + lambda x: x.resample("265h").interpolate(method="linear"), include_groups=False + ) + + volume = [50, 50, 60] + week_starting = [ + Timestamp("2018-01-07"), + Timestamp("2018-01-18 01:00:00"), + Timestamp("2018-01-14"), + ] expected_ind = pd.MultiIndex.from_arrays( [volume, week_starting], @@ -363,24 +435,10 @@ def test_groupby_resample_interpolate(): data={ "price": [ 10.0, - 9.928571428571429, - 9.857142857142858, - 9.785714285714286, - 9.714285714285714, - 9.642857142857142, - 9.571428571428571, - 9.5, - 9.428571428571429, - 9.357142857142858, - 9.285714285714286, - 9.214285714285714, - 9.142857142857142, - 9.071428571428571, - 9.0, + 9.21131, 11.0, - ], - "volume": [50.0] * 15 + [60], + ] }, index=expected_ind, ) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_names=False) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 7ab8ee24bd194..5c5c06dea0008 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1565,11 +1565,12 @@ def test_merge_on_ints_floats_warning(self): B = DataFrame({"Y": [1.1, 2.5, 3.0]}) expected = DataFrame({"X": [3], "Y": [3.0]}) - with tm.assert_produces_warning(UserWarning): + msg = "the float values are not equal to their int representation" + with tm.assert_produces_warning(UserWarning, match=msg): result = A.merge(B, left_on="X", right_on="Y") tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=msg): result = B.merge(A, left_on="Y", right_on="X") tm.assert_frame_equal(result, expected[["Y", "X"]]) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 0811c69859c0d..340c5c449aea7 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -789,3 +789,17 @@ def test_cut_with_nullable_int64(): result = cut(series, bins=bins) tm.assert_series_equal(result, expected) + + +def test_cut_datetime_array_no_attributeerror(): + # GH 55431 + ser = Series(to_datetime(["2023-10-06 12:00:00+0000", "2023-10-07 12:00:00+0000"])) + + result = cut(ser.array, bins=2) + + categories = result.categories + expected = Categorical.from_codes([0, 1], categories=categories, ordered=True) + + tm.assert_categorical_equal( + result, expected, check_dtype=True, check_category_order=True + ) diff --git a/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py 
b/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py index 57f57e56201c8..be6ec7dbc24c7 100644 --- a/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py +++ b/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py @@ -24,7 +24,8 @@ def test_to_pydatetime_nonzero_nano(self): ts = Timestamp("2011-01-01 9:00:00.123456789") # Warn the user of data loss (nanoseconds). - with tm.assert_produces_warning(UserWarning): + msg = "Discarding nonzero nanoseconds in conversion" + with tm.assert_produces_warning(UserWarning, match=msg): expected = datetime(2011, 1, 1, 9, 0, 0, 123456) result = ts.to_pydatetime() assert result == expected diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index ea970433464fc..79fd285073983 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -501,8 +501,7 @@ def test_to_period_tz_warning(self): # GH#21333 make sure a warning is issued when timezone # info is lost ts = Timestamp("2009-04-15 16:17:18", tz="US/Eastern") - with tm.assert_produces_warning(UserWarning): - # warning that timezone info will be lost + with tm.assert_produces_warning(UserWarning, match="drop timezone information"): ts.to_period("D") def test_to_numpy_alias(self): diff --git a/pandas/tests/series/accessors/test_cat_accessor.py b/pandas/tests/series/accessors/test_cat_accessor.py index ca2768efd5c68..ce8ea27ea1fa2 100644 --- a/pandas/tests/series/accessors/test_cat_accessor.py +++ b/pandas/tests/series/accessors/test_cat_accessor.py @@ -200,6 +200,9 @@ def test_dt_accessor_api_for_categorical(self, idx): if func == "to_period" and getattr(idx, "tz", None) is not None: # dropping TZ warn_cls.append(UserWarning) + elif func == "to_pytimedelta": + # GH 57463 + warn_cls.append(FutureWarning) if warn_cls: warn_cls = tuple(warn_cls) else: diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 5f0057ac50b47..8c60f7beb317d 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -192,7 +192,9 @@ def test_dt_namespace_accessor_timedelta(self): assert isinstance(result, DataFrame) tm.assert_index_equal(result.index, ser.index) - result = ser.dt.to_pytimedelta() + msg = "The behavior of TimedeltaProperties.to_pytimedelta is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.dt.to_pytimedelta() assert isinstance(result, np.ndarray) assert result.dtype == object diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index 1c60567c1a530..c153e800cb534 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -31,10 +31,23 @@ def test_list_getitem(list_dtype): tm.assert_series_equal(actual, expected) +def test_list_getitem_index(): + # GH 58425 + ser = Series( + [[1, 2, 3], [4, None, 5], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + index=[1, 3, 7], + ) + actual = ser.list[1] + expected = Series([2, None, None], dtype="int64[pyarrow]", index=[1, 3, 7]) + tm.assert_series_equal(actual, expected) + + def test_list_getitem_slice(): ser = Series( [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), + index=[1, 3, 7], ) if pa_version_under11p0: with pytest.raises( @@ -44,7 +57,9 @@ def test_list_getitem_slice(): else: actual = ser.list[1:None:None] expected 
= Series( - [[2, 3], [None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())) + [[2, 3], [None, 5], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + index=[1, 3, 7], ) tm.assert_series_equal(actual, expected) @@ -61,11 +76,15 @@ def test_list_len(): def test_list_flatten(): ser = Series( - [[1, 2, 3], [4, None], None], + [[1, 2, 3], None, [4, None], [], [7, 8]], dtype=ArrowDtype(pa.list_(pa.int64())), ) actual = ser.list.flatten() - expected = Series([1, 2, 3, 4, None], dtype=ArrowDtype(pa.int64())) + expected = Series( + [1, 2, 3, 4, None, 7, 8], + dtype=ArrowDtype(pa.int64()), + index=[0, 0, 0, 2, 2, 4, 4], + ) tm.assert_series_equal(actual, expected) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index e0ca4bf64ea91..3b41c8ee463d8 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -36,9 +36,6 @@ def test_fancy_getitem(): s = Series(np.arange(len(dti)), index=dti) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert s[48] == 48 assert s["1/2/2009"] == 48 assert s["2009-1-2"] == 48 assert s[datetime(2009, 1, 2)] == 48 @@ -57,10 +54,6 @@ def test_fancy_setitem(): s = Series(np.arange(len(dti)), index=dti) - msg = "Series.__setitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - s[48] = -1 - assert s.iloc[48] == -1 s["1/2/2009"] = -2 assert s.iloc[48] == -2 s["1/2/2009":"2009-06-05"] = -3 diff --git a/pandas/tests/series/indexing/test_get.py b/pandas/tests/series/indexing/test_get.py index 1f3711ad91903..5ff92ca89efba 100644 --- a/pandas/tests/series/indexing/test_get.py +++ b/pandas/tests/series/indexing/test_get.py @@ -157,13 +157,8 @@ def test_get_with_default(): assert s.get("e", "z") == "z" assert s.get("e", "e") == "e" - msg = "Series.__getitem__ treating keys as positions is deprecated" - warn = None - if index is d0: - warn = FutureWarning - with tm.assert_produces_warning(warn, match=msg): - assert s.get(10, "z") == "z" - assert s.get(10, 10) == 10 + assert s.get(10, "z") == "z" + assert s.get(10, 10) == 10 @pytest.mark.parametrize( @@ -201,13 +196,10 @@ def test_get_with_ea(arr): result = ser.get("Z") assert result is None - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert ser.get(4) == ser.iloc[4] - with tm.assert_produces_warning(FutureWarning, match=msg): - assert ser.get(-1) == ser.iloc[-1] - with tm.assert_produces_warning(FutureWarning, match=msg): - assert ser.get(len(ser)) is None + # As of 3.0, ints are treated as labels + assert ser.get(4) is None + assert ser.get(-1) is None + assert ser.get(len(ser)) is None # GH#21257 ser = Series(arr) @@ -216,16 +208,14 @@ def test_get_with_ea(arr): def test_getitem_get(string_series, object_series): - msg = "Series.__getitem__ treating keys as positions is deprecated" - for obj in [string_series, object_series]: idx = obj.index[5] assert obj[idx] == obj.get(idx) assert obj[idx] == obj.iloc[5] - with tm.assert_produces_warning(FutureWarning, match=msg): - assert string_series.get(-1) == string_series.get(string_series.index[-1]) + # As of 3.0, ints are treated as labels + assert string_series.get(-1) is None assert string_series.iloc[5] == string_series.get(string_series.index[5]) diff --git a/pandas/tests/series/indexing/test_getitem.py 
b/pandas/tests/series/indexing/test_getitem.py index fac543ac450a5..ede39ba61dfeb 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -15,6 +15,7 @@ conversion, timezones, ) +from pandas.compat.numpy import np_version_gt2 from pandas.core.dtypes.common import is_scalar @@ -72,19 +73,14 @@ def test_getitem_unrecognized_scalar(self): def test_getitem_negative_out_of_bounds(self): ser = Series(["a"] * 10, index=["a"] * 10) - msg = "index -11 is out of bounds for axis 0 with size 10|index out of bounds" - warn_msg = "Series.__getitem__ treating keys as positions is deprecated" - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - ser[-11] + with pytest.raises(KeyError, match="^-11$"): + ser[-11] def test_getitem_out_of_bounds_indexerror(self, datetime_series): # don't segfault, GH#495 - msg = r"index \d+ is out of bounds for axis 0 with size \d+" - warn_msg = "Series.__getitem__ treating keys as positions is deprecated" - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - datetime_series[len(datetime_series)] + N = len(datetime_series) + with pytest.raises(KeyError, match=str(N)): + datetime_series[N] def test_getitem_out_of_bounds_empty_rangeindex_keyerror(self): # GH#917 @@ -118,11 +114,13 @@ def test_getitem_keyerror_with_integer_index(self, any_int_numpy_dtype): ser["c"] def test_getitem_int64(self, datetime_series): + if np_version_gt2: + msg = r"^np.int64\(5\)$" + else: + msg = "^5$" idx = np.int64(5) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = datetime_series[idx] - assert res == datetime_series.iloc[5] + with pytest.raises(KeyError, match=msg): + datetime_series[idx] def test_getitem_full_range(self): # github.com/pandas-dev/pandas/commit/4f433773141d2eb384325714a2776bcc5b2e20f7 @@ -218,10 +216,8 @@ def test_getitem_str_with_timedeltaindex(self): def test_getitem_bool_index_positional(self): # GH#48653 ser = Series({True: 1, False: 0}) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser[0] - assert result == 1 + with pytest.raises(KeyError, match="^0$"): + ser[0] class TestSeriesGetitemSlices: @@ -384,17 +380,16 @@ def test_getitem_intlist_intindex_periodvalues(self): @pytest.mark.parametrize("box", [list, np.array, Index]) def test_getitem_intlist_intervalindex_non_int(self, box): - # GH#33404 fall back to positional since ints are unambiguous + # GH#33404 fall back to positional since ints are unambiguous; + # changed in 3.0 to never fall back dti = date_range("2000-01-03", periods=3)._with_freq(None) ii = pd.IntervalIndex.from_breaks(dti) ser = Series(range(len(ii)), index=ii) - expected = ser.iloc[:1] key = box([0]) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser[key] - tm.assert_series_equal(result, expected) + msg = r"None of \[Index\(\[0\], dtype='int(32|64)'\)\] are in the \[index\]" + with pytest.raises(KeyError, match=msg): + ser[key] @pytest.mark.parametrize("box", [list, np.array, Index]) @pytest.mark.parametrize("dtype", [np.int64, np.float64, np.uint64]) @@ -635,11 +630,6 @@ def test_getitem_preserve_name(datetime_series): result = datetime_series[datetime_series > 0] assert result.name == datetime_series.name - msg
= "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = datetime_series[[0, 2, 4]] - assert result.name == datetime_series.name - result = datetime_series[5:10] assert result.name == datetime_series.name @@ -667,21 +657,16 @@ def test_getitem_missing(datetime_series): def test_getitem_fancy(string_series, object_series): - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - slice1 = string_series[[1, 2, 3]] - slice2 = object_series[[1, 2, 3]] - assert string_series.index[2] == slice1.index[1] - assert object_series.index[2] == slice2.index[1] - assert string_series.iloc[2] == slice1.iloc[1] - assert object_series.iloc[2] == slice2.iloc[1] + msg = r"None of \[Index\(\[1, 2, 3\], dtype='int(32|64)'\)\] are in the \[index\]" + with pytest.raises(KeyError, match=msg): + string_series[[1, 2, 3]] + with pytest.raises(KeyError, match=msg): + object_series[[1, 2, 3]] def test_getitem_box_float64(datetime_series): - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - value = datetime_series[5] - assert isinstance(value, np.float64) + with pytest.raises(KeyError, match="^5$"): + datetime_series[5] def test_getitem_unordered_dup(): @@ -712,13 +697,11 @@ def test_slice_can_reorder_not_uniquely_indexed(): @pytest.mark.parametrize("index_vals", ["aabcd", "aadcb"]) def test_duplicated_index_getitem_positional_indexer(index_vals): - # GH 11747 + # GH 11747; changed in 3.0 integers are treated as always-labels s = Series(range(5), index=list(index_vals)) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = s[3] - assert result == 3 + with pytest.raises(KeyError, match="^3$"): + s[3] class TestGetitemDeprecatedIndexers: diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index a629d18131306..5002b6d20da09 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -32,27 +32,16 @@ def test_basic_indexing(): np.random.default_rng(2).standard_normal(5), index=["a", "b", "a", "a", "b"] ) - warn_msg = "Series.__[sg]etitem__ treating keys as positions is deprecated" - msg = "index 5 is out of bounds for axis 0 with size 5" - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - s[5] - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - s[5] = 0 + with pytest.raises(KeyError, match="^5$"): + s[5] with pytest.raises(KeyError, match=r"^'c'$"): s["c"] s = s.sort_index() - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - s[5] - msg = r"index 5 is out of bounds for axis (0|1) with size 5|^5$" - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - s[5] = 0 + with pytest.raises(KeyError, match="^5$"): + s[5] def test_getitem_numeric_should_not_fallback_to_positional(any_numeric_dtype): @@ -153,9 +142,7 @@ def test_series_box_timestamp(): assert isinstance(ser.iloc[4], Timestamp) ser = Series(rng, index=rng) - msg = "Series.__getitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - assert 
isinstance(ser[0], Timestamp) + assert isinstance(ser[rng[0]], Timestamp) assert isinstance(ser.at[rng[1]], Timestamp) assert isinstance(ser.iat[2], Timestamp) assert isinstance(ser.loc[rng[3]], Timestamp) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 99535f273075c..b94e6b6f0c6c8 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -181,14 +181,12 @@ def test_object_series_setitem_dt64array_exact_match(self): class TestSetitemScalarIndexer: def test_setitem_negative_out_of_bounds(self): + # As of 3.0, int keys are treated as labels, so this becomes + # setitem-with-expansion ser = Series(["a"] * 10, index=["a"] * 10) - - # string index falls back to positional - msg = "index -11|-1 is out of bounds for axis 0 with size 10" - warn_msg = "Series.__setitem__ treating keys as positions is deprecated" - with pytest.raises(IndexError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - ser[-11] = "foo" + ser[-11] = "foo" + exp = Series(["a"] * 10 + ["foo"], index=["a"] * 10 + [-11]) + tm.assert_series_equal(ser, exp) @pytest.mark.parametrize("indexer", [tm.loc, tm.at]) @pytest.mark.parametrize("ser_index", [0, 1]) @@ -1467,6 +1465,39 @@ def test_slice_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace) raise AssertionError("xfail not relevant for this test.") +@pytest.mark.parametrize( + "exp_dtype", + [ + "M8[ms]", + "M8[ms, UTC]", + "m8[ms]", + ], +) +class TestCoercionDatetime64HigherReso(CoercionTest): + @pytest.fixture + def obj(self, exp_dtype): + idx = date_range("2011-01-01", freq="D", periods=4, unit="s") + if exp_dtype == "m8[ms]": + idx = idx - Timestamp("1970-01-01") + assert idx.dtype == "m8[s]" + elif exp_dtype == "M8[ms, UTC]": + idx = idx.tz_localize("UTC") + return Series(idx) + + @pytest.fixture + def val(self, exp_dtype): + ts = Timestamp("2011-01-02 03:04:05.678").as_unit("ms") + if exp_dtype == "m8[ms]": + return ts - Timestamp("1970-01-01") + elif exp_dtype == "M8[ms, UTC]": + return ts.tz_localize("UTC") + return ts + + @pytest.fixture + def warn(self): + return FutureWarning + + @pytest.mark.parametrize( "val,exp_dtype,warn", [ @@ -1716,24 +1747,24 @@ def test_setitem_bool_int_float_consistency(indexer_sli): def test_setitem_positional_with_casting(): # GH#45070 case where in __setitem__ we get a KeyError, then when # we fallback we *also* get a ValueError if we try to set inplace. 
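# For illustration, a minimal standalone sketch (assuming the pandas 3.0
# semantics this patch asserts) of label-based setitem on a missing int key:
import pandas as pd

ser = pd.Series([1, 2, 3], index=["a", "b", "c"])
ser[0] = "X"  # 0 is not an existing label, so the Series is enlarged
assert list(ser.index) == ["a", "b", "c", 0]
assert ser.dtype == object  # inserting the string upcasts the values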
+ # As of 3.0 we always treat int keys as labels, so this becomes + # setitem-with-expansion ser = Series([1, 2, 3], index=["a", "b", "c"]) - warn_msg = "Series.__setitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - ser[0] = "X" - expected = Series(["X", 2, 3], index=["a", "b", "c"], dtype=object) + ser[0] = "X" + expected = Series([1, 2, 3, "X"], index=["a", "b", "c", 0], dtype=object) tm.assert_series_equal(ser, expected) def test_setitem_positional_float_into_int_coerces(): # Case where we hit a KeyError and then trying to set in-place incorrectly - # casts a float to an int + # casts a float to an int; + # As of 3.0 we always treat int keys as labels, so this becomes + # setitem-with-expansion ser = Series([1, 2, 3], index=["a", "b", "c"]) - warn_msg = "Series.__setitem__ treating keys as positions is deprecated" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - ser[0] = 1.5 - expected = Series([1.5, 2, 3], index=["a", "b", "c"]) + ser[0] = 1.5 + expected = Series([1, 2, 3, 1.5], index=["a", "b", "c", 0]) tm.assert_series_equal(ser, expected) diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index 75b4050c18afe..8ed422fc118dc 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.errors import OutOfBoundsDatetime + import pandas as pd from pandas import ( Series, @@ -131,12 +133,30 @@ def test_clip_with_datetimes(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("dtype", [object, "M8[us]"]) - def test_clip_with_timestamps_and_oob_datetimes(self, dtype): + def test_clip_with_timestamps_and_oob_datetimes_object(self): # GH-42794 - ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)], dtype=dtype) + ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)], dtype=object) result = ser.clip(lower=Timestamp.min, upper=Timestamp.max) - expected = Series([Timestamp.min, Timestamp.max], dtype=dtype) + expected = Series([Timestamp.min, Timestamp.max], dtype=object) + + tm.assert_series_equal(result, expected) + + def test_clip_with_timestamps_and_oob_datetimes_non_nano(self): + # GH#56410 + dtype = "M8[us]" + ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)], dtype=dtype) + + msg = ( + r"Incompatible \(high-resolution\) value for dtype='datetime64\[us\]'. 
" + "Explicitly cast before operating" + ) + with pytest.raises(OutOfBoundsDatetime, match=msg): + ser.clip(lower=Timestamp.min, upper=Timestamp.max) + + lower = Timestamp.min.as_unit("us") + upper = Timestamp.max.as_unit("us") + result = ser.clip(lower=lower, upper=upper) + expected = Series([lower, upper], dtype=dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 0965d36e4827d..592dba253532d 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -308,12 +308,7 @@ def test_datetime64_fillna(self): "scalar", [ False, - pytest.param( - True, - marks=pytest.mark.xfail( - reason="GH#56410 scalar case not yet addressed" - ), - ), + True, ], ) @pytest.mark.parametrize("tz", [None, "UTC"]) @@ -342,12 +337,7 @@ def test_datetime64_fillna_mismatched_reso_no_rounding(self, tz, scalar): "scalar", [ False, - pytest.param( - True, - marks=pytest.mark.xfail( - reason="GH#56410 scalar case not yet addressed" - ), - ), + True, ], ) def test_timedelta64_fillna_mismatched_reso_no_rounding(self, scalar): diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index 1008c2c87dc9e..ff7f8d0b7fa72 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -94,7 +94,12 @@ def test_interpolate(self, datetime_series): ts = Series(np.arange(len(datetime_series), dtype=float), datetime_series.index) ts_copy = ts.copy() - ts_copy[5:10] = np.nan + + # Set data between Tuesday and Thursday to NaN for 2 consecutive weeks. + # Linear interpolation should fill in the missing values correctly, + # as the index is equally-spaced within each week. 
+ ts_copy[1:4] = np.nan + ts_copy[6:9] = np.nan linear_interp = ts_copy.interpolate(method="linear") tm.assert_series_equal(linear_interp, ts) @@ -265,7 +270,7 @@ def test_nan_interpolate(self, kwargs): def test_nan_irregular_index(self): s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9]) result = s.interpolate() - expected = Series([1.0, 2.0, 3.0, 4.0], index=[1, 3, 5, 9]) + expected = Series([1.0, 2.0, 2.6666666666666665, 4.0], index=[1, 3, 5, 9]) tm.assert_series_equal(result, expected) def test_nan_str_index(self): diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 44bf3475b85a6..f0930a831e98d 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -359,12 +359,13 @@ def test_add_list_to_masked_array_boolean(self, request): else None ) ser = Series([True, None, False], dtype="boolean") - with tm.assert_produces_warning(warning): + msg = "operator is not supported by numexpr for the bool dtype" + with tm.assert_produces_warning(warning, match=msg): result = ser + [True, None, True] expected = Series([True, None, True], dtype="boolean") tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(warning): + with tm.assert_produces_warning(warning, match=msg): result = [True, None, True] + ser tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 9b7b08127a550..a9d5486139b46 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -170,6 +170,58 @@ def test_cummethods_bool_in_object_dtype(self, method, expected): result = getattr(ser, method)() tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "method, order", + [ + ["cummax", "abc"], + ["cummin", "cba"], + ], + ) + def test_cummax_cummin_on_ordered_categorical(self, method, order): + # GH#52335 + cat = pd.CategoricalDtype(list(order), ordered=True) + ser = pd.Series( + list("ababcab"), + dtype=cat, + ) + result = getattr(ser, method)() + expected = pd.Series( + list("abbbccc"), + dtype=cat, + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "skip, exp", + [ + [True, ["a", np.nan, "b", "b", "c"]], + [False, ["a", np.nan, np.nan, np.nan, np.nan]], + ], + ) + @pytest.mark.parametrize( + "method, order", + [ + ["cummax", "abc"], + ["cummin", "cba"], + ], + ) + def test_cummax_cummin_ordered_categorical_nan(self, skip, exp, method, order): + # GH#52335 + cat = pd.CategoricalDtype(list(order), ordered=True) + ser = pd.Series( + ["a", np.nan, "b", "a", "c"], + dtype=cat, + ) + result = getattr(ser, method)(skipna=skip) + expected = pd.Series( + exp, + dtype=cat, + ) + tm.assert_series_equal( + result, + expected, + ) + def test_cumprod_timedelta(self): # GH#48111 ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)]) diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 68dcc1a18eda7..8f275345a7819 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -339,35 +339,36 @@ def test_bool_ops_warn_on_arithmetic(self, op_str, opname, monkeypatch): # raises TypeError return + msg = "operator is not supported by numexpr" with monkeypatch.context() as m: m.setattr(expr, "_MIN_ELEMENTS", 5) with option_context("compute.use_numexpr", True): - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match=msg): r = f(df, df) e = fe(df, df) tm.assert_frame_equal(r, e) - with 
tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match=msg): r = f(df.a, df.b) e = fe(df.a, df.b) tm.assert_series_equal(r, e) - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match=msg): r = f(df.a, True) e = fe(df.a, True) tm.assert_series_equal(r, e) - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match=msg): r = f(False, df.a) e = fe(False, df.a) tm.assert_series_equal(r, e) - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match=msg): r = f(False, df) e = fe(False, df) tm.assert_frame_equal(r, e) - with tm.assert_produces_warning(): + with tm.assert_produces_warning(UserWarning, match=msg): r = f(df, True) e = fe(df, True) tm.assert_frame_equal(r, e) diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index 52b5f636b1254..9127981d1845d 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -42,7 +42,7 @@ def test_bad_version(monkeypatch): result = import_optional_dependency("fakemodule", min_version="0.8") assert result is module - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=match): result = import_optional_dependency("fakemodule", errors="warn") assert result is None @@ -53,7 +53,7 @@ def test_bad_version(monkeypatch): with pytest.raises(ImportError, match="Pandas requires version '1.1.0'"): import_optional_dependency("fakemodule", min_version="1.1.0") - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match="Pandas requires version"): result = import_optional_dependency( "fakemodule", errors="warn", min_version="1.1.0" ) @@ -81,7 +81,7 @@ def test_submodule(monkeypatch): with pytest.raises(ImportError, match=match): import_optional_dependency("fakemodule.submodule") - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, match=match): result = import_optional_dependency("fakemodule.submodule", errors="warn") assert result is None diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index b59dd194cac27..f4042acd05dc3 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1705,22 +1705,24 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): # GH#50301 # Match Timestamp behavior in disallowing non-round floats with # Y or M unit - warn_msg = "strings will be parsed as datetime strings" msg = f"Conversion of non-round float with unit={unit} is ambiguous" with pytest.raises(ValueError, match=msg): to_datetime([1.5], unit=unit, errors="raise") with pytest.raises(ValueError, match=msg): to_datetime(np.array([1.5]), unit=unit, errors="raise") + + msg = r"Given date string \"1.5\" not likely a datetime, at position 0" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - to_datetime(["1.5"], unit=unit, errors="raise") + to_datetime(["1.5"], unit=unit, errors="raise") res = to_datetime([1.5], unit=unit, errors="coerce") expected = Index([NaT], dtype="M8[ns]") tm.assert_index_equal(res, expected) - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - res = to_datetime(["1.5"], unit=unit, errors="coerce") + # In 3.0, the string "1.5" is parsed as it would be without unit, + # which fails. With errors="coerce" this becomes NaT.
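# For illustration, the coerce path sketched standalone (unit="Y" stands in
# for the parametrized Y/M units; assumes the 3.0 semantics asserted here):
import pandas as pd

out = pd.to_datetime(["1.5"], unit="Y", errors="coerce")
assert out.isna().all()  # the unparseable string becomes NaT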
+ res = to_datetime(["1.5"], unit=unit, errors="coerce") + expected = to_datetime([NaT]) tm.assert_index_equal(res, expected) # round floats are OK @@ -1735,14 +1737,6 @@ def test_unit(self, cache): with pytest.raises(ValueError, match=msg): to_datetime([1], unit="D", format="%Y%m%d", cache=cache) - def test_unit_str(self, cache): - # GH 57051 - # Test that strs aren't dropping precision to 32-bit accidentally. - with tm.assert_produces_warning(FutureWarning): - res = to_datetime(["1704660000"], unit="s", origin="unix") - expected = to_datetime([1704660000], unit="s", origin="unix") - tm.assert_index_equal(res, expected) - def test_unit_array_mixed_nans(self, cache): values = [11111111111111111, 1, 1.0, iNaT, NaT, np.nan, "NaT", ""] @@ -1771,7 +1765,7 @@ def test_unit_array_mixed_nans_large_int(self, cache): def test_to_datetime_invalid_str_not_out_of_bounds_valuerror(self, cache): # if we have a string, then we raise a ValueError # and NOT an OutOfBoundsDatetime - msg = "non convertible value foo with the unit 's'" + msg = "Unknown datetime string format, unable to parse: foo, at position 0" with pytest.raises(ValueError, match=msg): to_datetime("foo", errors="raise", unit="s", cache=cache) @@ -1906,7 +1900,13 @@ def test_to_datetime_unit_na_values(self): @pytest.mark.parametrize("bad_val", ["foo", 111111111]) def test_to_datetime_unit_invalid(self, bad_val): - msg = f"{bad_val} with the unit 'D'" + if bad_val == "foo": + msg = ( + "Unknown datetime string format, unable to parse: " + f"{bad_val}, at position 2" + ) + else: + msg = "cannot convert input 111111111 with the unit 'D', at position 2" with pytest.raises(ValueError, match=msg): to_datetime([1, 2, bad_val], unit="D") diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 0b3bc07c17452..f75f48157aad2 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -475,9 +475,44 @@ def test_assert_series_equal_int_tol(): ) -def test_assert_series_equal_index_exact_default(): +@pytest.mark.parametrize( + "left_idx, right_idx", + [ + ( + pd.Index([0, 0.2, 0.4, 0.6, 0.8, 1]), + pd.Index(np.linspace(0, 1, 6)), + ), + ( + pd.MultiIndex.from_arrays([[0, 0, 0, 0, 1, 1], [0, 0.2, 0.4, 0.6, 0.8, 1]]), + pd.MultiIndex.from_arrays([[0, 0, 0, 0, 1, 1], np.linspace(0, 1, 6)]), + ), + ( + pd.MultiIndex.from_arrays( + [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 4, 5, 10000000000001]] + ), + pd.MultiIndex.from_arrays( + [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 4, 5, 10000000000002]] + ), + ), + pytest.param( + pd.Index([1, 2, 3, 4, 5, 10000000000001]), + pd.Index([1, 2, 3, 4, 5, 10000000000002]), + marks=pytest.mark.xfail(reason="check_exact_index defaults to True"), + ), + pytest.param( + pd.MultiIndex.from_arrays( + [[0, 0, 0, 0, 1, 1], [1, 2, 3, 4, 5, 10000000000001]] + ), + pd.MultiIndex.from_arrays( + [[0, 0, 0, 0, 1, 1], [1, 2, 3, 4, 5, 10000000000002]] + ), + marks=pytest.mark.xfail(reason="check_exact_index defaults to True"), + ), + ], +) +def test_assert_series_equal_check_exact_index_default(left_idx, right_idx): # GH#57067 - ser1 = Series(np.zeros(6, dtype=int), [0, 0.2, 0.4, 0.6, 0.8, 1]) - ser2 = Series(np.zeros(6, dtype=int), np.linspace(0, 1, 6)) + ser1 = Series(np.zeros(6, dtype=int), left_idx) + ser2 = Series(np.zeros(6, dtype=int), right_idx) tm.assert_series_equal(ser1, ser2) tm.assert_frame_equal(ser1.to_frame(), ser2.to_frame()) diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py 
index d375010aff3cc..510a69a2ff3e4 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -696,5 +696,7 @@ def test_numeric_only_corr_cov_series(kernel, use_arg, numeric_only, dtype): def test_keyword_quantile_deprecated(): # GH #52550 ser = Series([1, 2, 3, 4]) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning( + FutureWarning, match="the 'quantile' keyword is deprecated, use 'q' instead" + ): ser.expanding().quantile(quantile=0.5) diff --git a/pandas/tests/window/test_rolling_quantile.py b/pandas/tests/window/test_rolling_quantile.py index d5a7010923563..1604d72d4f9b1 100644 --- a/pandas/tests/window/test_rolling_quantile.py +++ b/pandas/tests/window/test_rolling_quantile.py @@ -178,5 +178,7 @@ def test_center_reindex_frame(frame, q): def test_keyword_quantile_deprecated(): # GH #52550 s = Series([1, 2, 3, 4]) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning( + FutureWarning, match="the 'quantile' keyword is deprecated, use 'q' instead" + ): s.rolling(2).quantile(quantile=0.4) diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index d287fa72d552d..bdfb0b1cad8ae 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -505,7 +505,7 @@ def indent(text: str | None, indents: int = 1) -> str: ] -def set_module(module): +def set_module(module) -> Callable[[F], F]: """Private decorator for overriding __module__ on a function or class. Example usage:: @@ -518,7 +518,7 @@ def example(): assert example.__module__ == "pandas" """ - def decorator(func): + def decorator(func: F) -> F: if module is not None: func.__module__ = module return func
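For reference, the non-deprecated spelling of the two quantile calls exercised above uses the q keyword, as the deprecation message itself suggests (a minimal sketch):

    import pandas as pd

    ser = pd.Series([1, 2, 3, 4])
    ser.expanding().quantile(q=0.5)  # no FutureWarning
    ser.rolling(2).quantile(q=0.4)   # no FutureWarning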
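And a minimal usage sketch of the now-annotated set_module decorator, mirroring the docstring shown in the hunk above:

    from pandas.util._decorators import set_module

    @set_module("pandas")
    def example():
        pass

    assert example.__module__ == "pandas"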