
Commit

Merge branch 'main' into fix/group_by_agg_pyarrow_bool_numpy_same_type
Kei committed Apr 8, 2024
2 parents d510052 + b8a4691 commit 62a31d9
Showing 23 changed files with 86 additions and 156 deletions.
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/categoricals.py
@@ -24,7 +24,7 @@ def setup(self):
self.codes = np.tile(range(len(self.categories)), N)

self.datetimes = pd.Series(
pd.date_range("1995-01-01 00:00:00", periods=N / 10, freq="s")
pd.date_range("1995-01-01 00:00:00", periods=N // 10, freq="s")
)
self.datetimes_with_nat = self.datetimes.copy()
self.datetimes_with_nat.iloc[-1] = pd.NaT
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/timeseries.py
@@ -29,7 +29,7 @@ def setup(self, index_type):
"dst": date_range(
start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="s"
),
"repeated": date_range(start="2000", periods=N / 10, freq="s").repeat(10),
"repeated": date_range(start="2000", periods=N // 10, freq="s").repeat(10),
"tz_aware": date_range(start="2000", periods=N, freq="s", tz="US/Eastern"),
"tz_local": date_range(
start="2000", periods=N, freq="s", tz=dateutil.tz.tzlocal()
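Both benchmark edits make the same fix: `N / 10` is a float under true division, and `periods` must be an integer once the deprecation enforced later in this diff lands. A minimal sketch of the distinction:

```python
N = 100_000

print(N / 10)   # 10000.0 -- true division yields a float, now rejected as `periods`
print(N // 10)  # 10000   -- floor division yields an int, which `periods` requires
```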
1 change: 0 additions & 1 deletion doc/redirects.csv
@@ -1422,7 +1422,6 @@ reference/api/pandas.Series.transpose,pandas.Series.T
reference/api/pandas.Index.transpose,pandas.Index.T
reference/api/pandas.Index.notnull,pandas.Index.notna
reference/api/pandas.Index.tolist,pandas.Index.to_list
- reference/api/pandas.arrays.PandasArray,pandas.arrays.NumpyExtensionArray
reference/api/pandas.core.groupby.DataFrameGroupBy.backfill,pandas.core.groupby.DataFrameGroupBy.bfill
reference/api/pandas.core.groupby.GroupBy.backfill,pandas.core.groupby.DataFrameGroupBy.bfill
reference/api/pandas.core.resample.Resampler.backfill,pandas.core.resample.Resampler.bfill
8 changes: 7 additions & 1 deletion doc/source/whatsnew/v3.0.0.rst
@@ -208,20 +208,25 @@ Removal of prior version deprecations/changes
- All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`)
- All arguments except the first ``path``-like argument in IO writers are now keyword only (:issue:`54229`)
- Disallow calling :meth:`Series.replace` or :meth:`DataFrame.replace` without a ``value`` and with non-dict-like ``to_replace`` (:issue:`33302`)
- Disallow constructing a :class:`arrays.SparseArray` with scalar data (:issue:`53039`)
- Disallow non-standard inputs (anything other than ``np.ndarray``, :class:`Index`, :class:`ExtensionArray`, or :class:`Series`) to :func:`isin`, :func:`unique`, :func:`factorize` (:issue:`52986`)
- Disallow passing a pandas type to :meth:`Index.view` (:issue:`55709`)
- Disallow units other than "s", "ms", "us", "ns" for datetime64 and timedelta64 dtypes in :func:`array` (:issue:`53817`)
- Removed "freq" keyword from :class:`PeriodArray` constructor, use "dtype" instead (:issue:`52462`)
- Removed 'fastpath' keyword in :class:`Categorical` constructor (:issue:`20110`)
- Removed alias :class:`arrays.PandasArray` for :class:`arrays.NumpyExtensionArray` (:issue:`53694`)
- Removed deprecated "method" and "limit" keywords from :meth:`Series.replace` and :meth:`DataFrame.replace` (:issue:`53492`)
- Removed extension test classes ``BaseNoReduceTests``, ``BaseNumericReduceTests``, ``BaseBooleanReduceTests`` (:issue:`54663`)
- Removed the "closed" and "normalize" keywords in :meth:`DatetimeIndex.__new__` (:issue:`52628`)
- Require :meth:`SparseDtype.fill_value` to be a valid value for the :meth:`SparseDtype.subtype` (:issue:`53043`)
- Stopped performing dtype inference in :meth:`Index.insert` with object-dtype index; this often affects the index/columns that result when setting new entries into an empty :class:`Series` or :class:`DataFrame` (:issue:`51363`)
- Removed the "closed" and "unit" keywords in :meth:`TimedeltaIndex.__new__` (:issue:`52628`, :issue:`55499`)
- All arguments in :meth:`Index.sort_values` are now keyword only (:issue:`56493`)
- All arguments in :meth:`Series.to_dict` are now keyword only (:issue:`56493`)
- Changed the default value of ``observed`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` to ``True`` (:issue:`51811`)
- Enforced deprecation in :func:`testing.assert_series_equal` and :func:`testing.assert_frame_equal` with object dtype and mismatched null-like values, which are now considered not-equal (:issue:`18463`)
- Enforced deprecation ``all`` and ``any`` reductions with ``datetime64`` and :class:`DatetimeTZDtype` dtypes (:issue:`58029`)
- Enforced deprecation ``all`` and ``any`` reductions with ``datetime64``, :class:`DatetimeTZDtype`, and :class:`PeriodDtype` dtypes (:issue:`58029`)
- Enforced deprecation disallowing ``float`` "periods" in :func:`date_range`, :func:`period_range`, :func:`timedelta_range`, :func:`interval_range` (:issue:`56036`)
- Enforced deprecation disallowing parsing datetimes with mixed time zones unless user passes ``utc=True`` to :func:`to_datetime` (:issue:`57275`)
- Enforced deprecation in :meth:`Series.value_counts` and :meth:`Index.value_counts` with object dtype performing dtype inference on the ``.index`` of the result (:issue:`56161`)
- Enforced deprecation of :meth:`.DataFrameGroupBy.get_group` and :meth:`.SeriesGroupBy.get_group` allowing the ``name`` argument to be a non-tuple when grouping by a list of length 1 (:issue:`54155`)
@@ -448,6 +453,7 @@ Other
- Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`)
- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning :class:`RangeIndex` columns (:issue:`57293`)
- Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would raise a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
- Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`)
- Bug in the DataFrame Interchange Protocol implementation returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)

.. ***DO NOT USE THIS SECTION***
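A sketch of one of the enforced deprecations above, using the replacement spelled out in the new ``TypeError`` text added in this commit; behavior is per the whatsnew entry, not independently verified here:

```python
import pandas as pd

ser = pd.Series(pd.to_datetime(["2024-01-01", "2024-01-02"]))

# any()/all() reductions on datetime64 dtypes now raise TypeError;
# the error message suggests comparing against a sentinel instead:
print((ser != pd.Timestamp(0)).any())  # True
```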
17 changes: 0 additions & 17 deletions pandas/arrays/__init__.py
@@ -35,20 +35,3 @@
"StringArray",
"TimedeltaArray",
]


- def __getattr__(name: str) -> type[NumpyExtensionArray]:
- if name == "PandasArray":
- # GH#53694
- import warnings
-
- from pandas.util._exceptions import find_stack_level
-
- warnings.warn(
- "PandasArray has been renamed NumpyExtensionArray. Use that "
- "instead. This alias will be removed in a future version.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return NumpyExtensionArray
- raise AttributeError(f"module 'pandas.arrays' has no attribute '{name}'")
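With the alias and its module ``__getattr__`` hook gone, ``pd.arrays.PandasArray`` now raises ``AttributeError``; only the new name works:

```python
import numpy as np
import pandas as pd

# pd.arrays.PandasArray  # AttributeError after this change
arr = pd.arrays.NumpyExtensionArray(np.array([1, 2, 3]))
print(arr)
```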
21 changes: 0 additions & 21 deletions pandas/core/arrays/categorical.py
@@ -276,9 +276,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi
provided).
dtype : CategoricalDtype
An instance of ``CategoricalDtype`` to use for this categorical.
- fastpath : bool
- The 'fastpath' keyword in Categorical is deprecated and will be
- removed in a future version. Use Categorical.from_codes instead.
copy : bool, default True
Whether to copy if the codes are unchanged.
@@ -391,33 +388,15 @@ def __init__(
categories=None,
ordered=None,
dtype: Dtype | None = None,
- fastpath: bool | lib.NoDefault = lib.no_default,
copy: bool = True,
) -> None:
- if fastpath is not lib.no_default:
- # GH#20110
- warnings.warn(
- "The 'fastpath' keyword in Categorical is deprecated and will "
- "be removed in a future version. Use Categorical.from_codes instead",
- DeprecationWarning,
- stacklevel=find_stack_level(),
- )
- else:
- fastpath = False

dtype = CategoricalDtype._from_values_or_dtype(
values, categories, ordered, dtype
)
# At this point, dtype is always a CategoricalDtype, but
# we may have dtype.categories be None, and we need to
# infer categories in a factorization step further below

- if fastpath:
- codes = coerce_indexer_dtype(values, dtype.categories)
- dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
- super().__init__(codes, dtype)
- return

if not is_list_like(values):
# GH#38433
raise TypeError("Categorical input must be list-like")
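The removed ``fastpath`` keyword's deprecation message pointed at ``Categorical.from_codes``, which covers the same build-from-integer-codes use case:

```python
import pandas as pd

dtype = pd.CategoricalDtype(categories=["a", "b", "c", "d"])

# replaces Categorical(codes, dtype=dtype, fastpath=True)
cat = pd.Categorical.from_codes([1, 2, 3], dtype=dtype)
print(cat)  # ['b', 'c', 'd'] with categories ['a', 'b', 'c', 'd']
```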
43 changes: 19 additions & 24 deletions pandas/core/arrays/datetimelike.py
@@ -1661,20 +1661,24 @@ def _groupby_op(
dtype = self.dtype
if dtype.kind == "M":
# Adding/multiplying datetimes is not valid
if how in ["any", "all", "sum", "prod", "cumsum", "cumprod", "var", "skew"]:
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
raise TypeError(f"datetime64 type does not support operation '{how}'")
if how in ["any", "all"]:
# GH#34479
raise TypeError(
f"'{how}' with datetime64 dtypes is no longer supported. "
f"Use (obj != pd.Timestamp(0)).{how}() instead."
)

elif isinstance(dtype, PeriodDtype):
# Adding/multiplying Periods is not valid
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
raise TypeError(f"Period type does not support {how} operations")
if how in ["any", "all"]:
# GH#34479
- warnings.warn(
- f"'{how}' with PeriodDtype is deprecated and will raise in a "
- f"future version. Use (obj != pd.Period(0, freq)).{how}() instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
+ raise TypeError(
+ f"'{how}' with PeriodDtype is no longer supported. "
+ f"Use (obj != pd.Period(0, freq)).{how}() instead."
)
else:
# timedeltas we can add but not multiply
@@ -2424,17 +2428,17 @@ def validate_periods(periods: None) -> None: ...


@overload
- def validate_periods(periods: int | float) -> int: ...
+ def validate_periods(periods: int) -> int: ...


- def validate_periods(periods: int | float | None) -> int | None:
+ def validate_periods(periods: int | None) -> int | None:
"""
If a `periods` argument is passed to the Datetime/Timedelta Array/Index
constructor, validate that it is an integer.
Parameters
----------
- periods : None, float, int
+ periods : None, int
Returns
-------
@@ -2443,22 +2447,13 @@ def validate_periods(periods: int | float | None) -> int | None:
Raises
------
TypeError
- if periods is None, float, or int
+ if periods is neither None nor an integer
"""
- if periods is not None:
- if lib.is_float(periods):
- warnings.warn(
- # GH#56036
- "Non-integer 'periods' in pd.date_range, pd.timedelta_range, "
- "pd.period_range, and pd.interval_range are deprecated and "
- "will raise in a future version.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- periods = int(periods)
- elif not lib.is_integer(periods):
- raise TypeError(f"periods must be a number, got {periods}")
- return periods
+ if periods is not None and not lib.is_integer(periods):
+ raise TypeError(f"periods must be an integer, got {periods}")
+ # error: Incompatible return value type (got "int | integer[Any] | None",
+ # expected "int | None")
+ return periods  # type: ignore[return-value]


def _validate_inferred_freq(
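Two enforcements land in this file: ``any``/``all`` groupby reductions on ``PeriodDtype`` now raise instead of warning, and ``validate_periods`` rejects a float ``periods`` instead of silently casting it. A sketch of the latter as it surfaces through ``date_range`` (error text taken from the new code above):

```python
import pandas as pd

pd.date_range("2000-01-01", periods=10)  # int periods: fine

try:
    pd.date_range("2000-01-01", periods=10.0)  # float periods: now rejected
except TypeError as err:
    print(err)  # periods must be an integer, got 10.0
```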
16 changes: 3 additions & 13 deletions pandas/core/arrays/sparse/array.py
@@ -40,7 +40,6 @@

from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.cast import (
- construct_1d_arraylike_from_scalar,
find_common_type,
maybe_box_datetimelike,
)
Expand Down Expand Up @@ -399,19 +398,10 @@ def __init__(
dtype = dtype.subtype

if is_scalar(data):
- warnings.warn(
- f"Constructing {type(self).__name__} with scalar data is deprecated "
- "and will raise in a future version. Pass a sequence instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
+ raise TypeError(
+ f"Cannot construct {type(self).__name__} from scalar data. "
+ "Pass a sequence instead."
)
- if sparse_index is None:
- npoints = 1
- else:
- npoints = sparse_index.length
-
- data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None)
- dtype = data.dtype

if dtype is not None:
dtype = pandas_dtype(dtype)
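The scalar branch now fails fast rather than broadcasting to a one-element array; under this change the broadcast must be written out as a sequence:

```python
from pandas.arrays import SparseArray

print(SparseArray([1]))  # sequence: still fine

try:
    SparseArray(1)  # scalar: TypeError after this change
except TypeError as err:
    print(err)  # Cannot construct SparseArray from scalar data. Pass a sequence instead.
```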
18 changes: 6 additions & 12 deletions pandas/core/dtypes/dtypes.py
@@ -1762,24 +1762,18 @@ def _check_fill_value(self) -> None:
val = self._fill_value
if isna(val):
if not is_valid_na_for_dtype(val, self.subtype):
- warnings.warn(
- "Allowing arbitrary scalar fill_value in SparseDtype is "
- "deprecated. In a future version, the fill_value must be "
- "a valid value for the SparseDtype.subtype.",
- FutureWarning,
- stacklevel=find_stack_level(),
+ raise ValueError(
+ # GH#53043
+ "fill_value must be a valid value for the SparseDtype.subtype"
)
else:
dummy = np.empty(0, dtype=self.subtype)
dummy = ensure_wrapped_if_datetimelike(dummy)

if not can_hold_element(dummy, val):
- warnings.warn(
- "Allowing arbitrary scalar fill_value in SparseDtype is "
- "deprecated. In a future version, the fill_value must be "
- "a valid value for the SparseDtype.subtype.",
- FutureWarning,
- stacklevel=find_stack_level(),
+ raise ValueError(
+ # GH#53043
+ "fill_value must be a valid value for the SparseDtype.subtype"
)

@property
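A sketch of the now-enforced validation, assuming ``SparseDtype`` runs ``_check_fill_value`` at construction time: a ``fill_value`` the ``subtype`` cannot hold is rejected outright.

```python
import numpy as np
from pandas import SparseDtype

SparseDtype(np.int64, fill_value=0)  # 0 is a valid int64 value

try:
    SparseDtype(np.int64, fill_value=3.1)  # 3.1 is not representable as int64
except ValueError as err:
    print(err)  # fill_value must be a valid value for the SparseDtype.subtype
```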
2 changes: 1 addition & 1 deletion pandas/core/sorting.py
@@ -577,7 +577,7 @@ def ensure_key_mapped(
if isinstance(
values, Index
): # convert to a new Index subclass, not necessarily the same
- result = Index(result)
+ result = Index(result, tupleize_cols=False)
else:
# try to revert to original type otherwise
type_of_values = type(values)
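``tupleize_cols=False`` keeps tuple-valued key results as plain object values instead of promoting them to a ``MultiIndex``, which is what made ``Index.sort_values`` raise ``TypeError`` for natsort-style keys (GH 56081). A minimal sketch with a hypothetical stand-in key:

```python
import pandas as pd

idx = pd.Index(["item10", "item2"])

def natural_key(values):
    # hypothetical stand-in for natsort.natsort_key: one tuple per label
    return [(len(s), s) for s in values]

# with the fix, the tuples sort as ordinary objects
print(idx.sort_values(key=natural_key))  # Index(['item2', 'item10'], dtype='object')
```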
8 changes: 0 additions & 8 deletions pandas/tests/api/test_api.py
@@ -395,13 +395,5 @@ def test_util_in_top_level(self):
pd.util.foo


- def test_pandas_array_alias():
- msg = "PandasArray has been renamed NumpyExtensionArray"
- with tm.assert_produces_warning(FutureWarning, match=msg):
- res = pd.arrays.PandasArray
-
- assert res is pd.arrays.NumpyExtensionArray


def test_set_module():
assert pd.DataFrame.__module__ == "pandas"
7 changes: 0 additions & 7 deletions pandas/tests/arrays/categorical/test_constructors.py
@@ -35,13 +35,6 @@


class TestCategoricalConstructors:
- def test_fastpath_deprecated(self):
- codes = np.array([1, 2, 3])
- dtype = CategoricalDtype(categories=["a", "b", "c", "d"], ordered=False)
- msg = "The 'fastpath' keyword in Categorical is deprecated"
- with tm.assert_produces_warning(DeprecationWarning, match=msg):
- Categorical(codes, dtype=dtype, fastpath=True)

def test_categorical_from_cat_and_dtype_str_preserve_ordered(self):
# GH#49309 we should preserve orderedness in `res`
cat = Categorical([3, 1], categories=[3, 2, 1], ordered=True)
Expand Down
10 changes: 6 additions & 4 deletions pandas/tests/arrays/sparse/test_array.py
@@ -52,10 +52,11 @@ def test_set_fill_value(self):
arr.fill_value = 2
assert arr.fill_value == 2

msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
msg = "fill_value must be a valid value for the SparseDtype.subtype"
with pytest.raises(ValueError, match=msg):
# GH#53043
arr.fill_value = 3.1
- assert arr.fill_value == 3.1
+ assert arr.fill_value == 2

arr.fill_value = np.nan
assert np.isnan(arr.fill_value)
@@ -64,8 +65,9 @@ def test_set_fill_value(self):
arr.fill_value = True
assert arr.fill_value is True

- with tm.assert_produces_warning(FutureWarning, match=msg):
+ with pytest.raises(ValueError, match=msg):
arr.fill_value = 0
+ assert arr.fill_value is True

arr.fill_value = np.nan
assert np.isnan(arr.fill_value)
18 changes: 5 additions & 13 deletions pandas/tests/arrays/sparse/test_constructors.py
@@ -144,20 +144,12 @@ def test_constructor_spindex_dtype(self):
@pytest.mark.parametrize("sparse_index", [None, IntIndex(1, [0])])
def test_constructor_spindex_dtype_scalar(self, sparse_index):
# scalar input
msg = "Constructing SparseArray with scalar data is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None)
exp = SparseArray([1], dtype=None)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
msg = "Cannot construct SparseArray from scalar data. Pass a sequence instead"
with pytest.raises(TypeError, match=msg):
SparseArray(data=1, sparse_index=sparse_index, dtype=None)

- with tm.assert_produces_warning(FutureWarning, match=msg):
- arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None)
- exp = SparseArray([1], dtype=None)
- tm.assert_sp_array_equal(arr, exp)
- assert arr.dtype == SparseDtype(np.int64)
- assert arr.fill_value == 0
+ with pytest.raises(TypeError, match=msg):
+ SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None)

def test_constructor_spindex_dtype_scalar_broadcasts(self):
arr = SparseArray(
1 change: 0 additions & 1 deletion pandas/tests/arrays/sparse/test_dtype.py
@@ -84,7 +84,6 @@ def test_nans_not_equal():
(SparseDtype("float64"), SparseDtype("float32")),
(SparseDtype("float64"), SparseDtype("float64", 0)),
(SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)),
- (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)),
(SparseDtype("float64"), np.dtype("float64")),
]

8 changes: 5 additions & 3 deletions pandas/tests/frame/methods/test_to_csv.py
@@ -1406,19 +1406,21 @@ def test_to_csv_categorical_and_interval(self):
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert result == expected

- def test_to_csv_warn_when_zip_tar_and_append_mode(self):
+ def test_to_csv_warn_when_zip_tar_and_append_mode(self, tmp_path):
# GH57875
df = DataFrame({"a": [1, 2, 3]})
msg = (
"zip and tar do not support mode 'a' properly. This combination will "
"result in multiple files with same name being added to the archive"
)
+ zip_path = tmp_path / "test.zip"
+ tar_path = tmp_path / "test.tar"
with tm.assert_produces_warning(
RuntimeWarning, match=msg, raise_on_extra_warnings=False
):
df.to_csv("test.zip", mode="a")
df.to_csv(zip_path, mode="a")

with tm.assert_produces_warning(
RuntimeWarning, match=msg, raise_on_extra_warnings=False
):
df.to_csv("test.tar", mode="a")
df.to_csv(tar_path, mode="a")
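The test now writes its archives through pytest's ``tmp_path`` fixture instead of dropping ``test.zip``/``test.tar`` into the working directory. A minimal illustration of the fixture (hypothetical test, not from this diff):

```python
# tmp_path is a pathlib.Path to a fresh per-test temporary directory
def test_writes_to_tmp(tmp_path):
    out = tmp_path / "out.csv"
    out.write_text("a,b\n1,2\n")
    assert out.exists()
```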
