Commit e131cf0
Merge remote-tracking branch 'upstream/main' into arrow-non-nano
lukemanley committed May 12, 2023
2 parents b460e6e + e19ceb7
Showing 15 changed files with 193 additions and 102 deletions.
6 changes: 3 additions & 3 deletions doc/source/conf.py
@@ -237,14 +237,14 @@

 html_theme_options = {
     "external_links": [],
-    "footer_items": ["pandas_footer", "sphinx-version"],
+    "footer_start": ["pandas_footer", "sphinx-version"],
     "github_url": "https://github.com/pandas-dev/pandas",
     "twitter_url": "https://twitter.com/pandas_dev",
-    "google_analytics_id": "UA-27880019-2",
+    "analytics": {"google_analytics_id": "UA-27880019-2"},
     "logo": {"image_dark": "https://pandas.pydata.org/static/img/pandas_white.svg"},
     "navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"],
     "switcher": {
-        "json_url": "/versions.json",
+        "json_url": "https://pandas.pydata.org/versions.json",
         "version_match": switcher_version,
     },
     "icon_links": [
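Net effect: the options now target the newer pydata-sphinx-theme API ("footer_start" replacing "footer_items", analytics nested under an "analytics" dict, and an absolute switcher URL), matching the unpinned theme in environment.yml below. A sketch of just the migrated keys, with unchanged context keys omitted:

    html_theme_options = {
        "footer_start": ["pandas_footer", "sphinx-version"],
        "analytics": {"google_analytics_id": "UA-27880019-2"},
        "switcher": {
            "json_url": "https://pandas.pydata.org/versions.json",
            "version_match": switcher_version,
        },
    }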
2 changes: 1 addition & 1 deletion doc/source/development/contributing_environment.rst
@@ -225,7 +225,7 @@ To compile pandas with meson, run::
    # Build and install pandas
    python -m pip install -ve . --no-build-isolation

-** Build options **
+**Build options**

 It is possible to pass options from the pip frontend to the meson backend if you would like to configure your
 install. Occasionally, you'll want to use this to adjust the build directory, and/or toggle debug/optimization levels.
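As an illustration of that passthrough (a sketch, not part of this diff — the exact keys supported depend on the meson-python version in use):

    # hypothetical example: choose a custom build directory through the meson backend
    python -m pip install -ve . --no-build-isolation --config-settings=builddir="builddir"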
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v2.1.0.rst
@@ -92,13 +92,13 @@ Other enhancements
 - Implemented ``__pandas_priority__`` to allow custom types to take precedence over :class:`DataFrame`, :class:`Series`, :class:`Index`, or :class:`ExtensionArray` for arithmetic operations, :ref:`see the developer guide <extending.pandas_priority>` (:issue:`48347`)
 - Improve error message when having incompatible columns using :meth:`DataFrame.merge` (:issue:`51861`)
 - Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`)
+- Improved error handling when using :meth:`DataFrame.to_json` with incompatible ``index`` and ``orient`` arguments (:issue:`52143`)
 - Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`)
 - Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`)
 - Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"``
 - :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`)
 - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
 - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`)
--

 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.notable_bug_fixes:
@@ -261,6 +261,7 @@ Deprecations
 - Deprecated unused "closed" keyword in the :class:`TimedeltaIndex` constructor (:issue:`52628`)
 - Deprecated logical operation between two non boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs. (:issue:`52500`, :issue:`52538`)
 - Deprecated allowing ``downcast`` keyword other than ``None``, ``False``, "infer", or a dict with these as values in :meth:`Series.fillna`, :meth:`DataFrame.fillna` (:issue:`40988`)
+- Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`; in a future version the ``fill_value`` will need to be compatible with the ``dtype.subtype``, either a scalar that can be held by that subtype or ``NaN`` for integer or bool subtypes (:issue:`23124`)
 - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
 -

@@ -426,6 +427,7 @@ Reshaping

 Sparse
 ^^^^^^
+- Bug in :class:`SparseDtype` constructor failing to raise ``TypeError`` when given an incompatible ``dtype`` for its subtype, which must be a ``numpy`` dtype (:issue:`53160`)
 - Bug in :meth:`arrays.SparseArray.map` allowed the fill value to be included in the sparse values (:issue:`52095`)
 -

2 changes: 1 addition & 1 deletion environment.yml
@@ -89,7 +89,7 @@ dependencies:
   - gitdb
   - natsort # DataFrame.sort_values doctest
   - numpydoc
-  - pydata-sphinx-theme<0.11
+  - pydata-sphinx-theme
   - pytest-cython # doctest
   - sphinx
   - sphinx-design
55 changes: 43 additions & 12 deletions pandas/core/arrays/sparse/dtype.py
@@ -18,18 +18,23 @@
     ExtensionDtype,
     register_extension_dtype,
 )
+from pandas.core.dtypes.cast import can_hold_element
 from pandas.core.dtypes.common import (
     is_bool_dtype,
     is_object_dtype,
     is_scalar,
     is_string_dtype,
     pandas_dtype,
 )
+from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import (
+    is_valid_na_for_dtype,
     isna,
     na_value_for_dtype,
 )

+from pandas.core.construction import ensure_wrapped_if_datetimelike
+
 if TYPE_CHECKING:
     from pandas._typing import (
         Dtype,
@@ -91,6 +96,9 @@ def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None:
         dtype = pandas_dtype(dtype)
         if is_string_dtype(dtype):
             dtype = np.dtype("object")
+        if not isinstance(dtype, np.dtype):
+            # GH#53160
+            raise TypeError("SparseDtype subtype must be a numpy dtype")

         if fill_value is None:
             fill_value = na_value_for_dtype(dtype)
@@ -161,18 +169,41 @@ def _check_fill_value(self):
             raise ValueError(
                 f"fill_value must be a scalar. Got {self._fill_value} instead"
             )
-        # TODO: Right now we can use Sparse boolean array
-        # with any fill_value. Here was an attempt
-        # to allow only 3 value: True, False or nan
-        # but plenty test has failed.
-        # see pull 44955
-        # if self._is_boolean and not (
-        #     is_bool(self._fill_value) or isna(self._fill_value)
-        # ):
-        #     raise ValueError(
-        #         "fill_value must be True, False or nan "
-        #         f"for boolean type. Got {self._fill_value} instead"
-        #     )
+
+        # GH#23124 require fill_value and subtype to match
+        val = self._fill_value
+        if isna(val):
+            if not is_valid_na_for_dtype(val, self.subtype):
+                warnings.warn(
+                    "Allowing arbitrary scalar fill_value in SparseDtype is "
+                    "deprecated. In a future version, the fill_value must be "
+                    "a valid value for the SparseDtype.subtype.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+        elif isinstance(self.subtype, CategoricalDtype):
+            # TODO: is this even supported? It is reached in
+            # test_dtype_sparse_with_fill_value_not_present_in_data
+            if self.subtype.categories is None or val not in self.subtype.categories:
+                warnings.warn(
+                    "Allowing arbitrary scalar fill_value in SparseDtype is "
+                    "deprecated. In a future version, the fill_value must be "
+                    "a valid value for the SparseDtype.subtype.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+        else:
+            dummy = np.empty(0, dtype=self.subtype)
+            dummy = ensure_wrapped_if_datetimelike(dummy)
+
+            if not can_hold_element(dummy, val):
+                warnings.warn(
+                    "Allowing arbitrary scalar fill_value in SparseDtype is "
+                    "deprecated. In a future version, the fill_value must be "
+                    "a valid value for the SparseDtype.subtype.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )

     @property
     def _is_na_fill_value(self) -> bool:
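Taken together, these hunks tighten SparseDtype validation. A sketch of the user-visible behavior implied by the diff (illustrative, not taken from the commit):

    import warnings

    import numpy as np
    import pandas as pd

    # GH#53160: the subtype must now be a numpy dtype
    try:
        pd.SparseDtype("category", fill_value="c")
    except TypeError as err:
        print(err)  # SparseDtype subtype must be a numpy dtype

    # NaN remains a valid fill_value for integer subtypes (no warning)
    pd.SparseDtype(np.int64, fill_value=np.nan)

    # GH#23124: a fill_value the subtype cannot hold is deprecated, not yet an error
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        pd.SparseDtype(np.int64, fill_value=pd.NaT)
    assert any(issubclass(w.category, FutureWarning) for w in caught)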
11 changes: 6 additions & 5 deletions pandas/core/generic.py
@@ -2306,7 +2306,7 @@ def to_json(
         default_handler: Callable[[Any], JSONSerializable] | None = None,
         lines: bool_t = False,
         compression: CompressionOptions = "infer",
-        index: bool_t = True,
+        index: bool_t | None = None,
         indent: int | None = None,
         storage_options: StorageOptions = None,
         mode: Literal["a", "w"] = "w",
@@ -2375,10 +2375,11 @@
             .. versionchanged:: 1.4.0 Zstandard support.

-        index : bool, default True
-            Whether to include the index values in the JSON string. Not
-            including the index (``index=False``) is only supported when
-            orient is 'split' or 'table'.
+        index : bool or None, default None
+            The index is only used when 'orient' is 'split', 'index', 'columns',
+            or 'table'. Of these, 'index' and 'columns' do not support
+            ``index=False``.

         indent : int, optional
             Length of whitespace used to indent each record.
19 changes: 14 additions & 5 deletions pandas/io/json/_json.py
@@ -100,7 +100,7 @@ def to_json(
     default_handler: Callable[[Any], JSONSerializable] | None = ...,
     lines: bool = ...,
     compression: CompressionOptions = ...,
-    index: bool = ...,
+    index: bool | None = ...,
     indent: int = ...,
     storage_options: StorageOptions = ...,
     mode: Literal["a", "w"] = ...,
@@ -120,7 +120,7 @@ def to_json(
     default_handler: Callable[[Any], JSONSerializable] | None = ...,
     lines: bool = ...,
     compression: CompressionOptions = ...,
-    index: bool = ...,
+    index: bool | None = ...,
     indent: int = ...,
     storage_options: StorageOptions = ...,
     mode: Literal["a", "w"] = ...,
@@ -139,15 +139,24 @@
     default_handler: Callable[[Any], JSONSerializable] | None = None,
     lines: bool = False,
     compression: CompressionOptions = "infer",
-    index: bool = True,
+    index: bool | None = None,
     indent: int = 0,
     storage_options: StorageOptions = None,
     mode: Literal["a", "w"] = "w",
 ) -> str | None:
-    if not index and orient not in ["split", "table"]:
+    if orient in ["records", "values"] and index is True:
         raise ValueError(
-            "'index=False' is only valid when 'orient' is 'split' or 'table'"
+            "'index=True' is only valid when 'orient' is 'split', 'table', "
+            "'index', or 'columns'."
         )
+    elif orient in ["index", "columns"] and index is False:
+        raise ValueError(
+            "'index=False' is only valid when 'orient' is 'split', 'table', "
+            "'records', or 'values'."
+        )
+    elif index is None:
+        # will be ignored for orient='records' and 'values'
+        index = True

     if lines and orient != "records":
         raise ValueError("'lines' keyword only valid when 'orient' is records")
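The reworked check is now symmetric: orients that never serialize the index reject an explicit index=True, and orients that require the index reject index=False. A sketch of both error paths (illustrative):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})

    # 'records' and 'values' never write the index
    try:
        df.to_json(orient="records", index=True)
    except ValueError as err:
        print(err)  # 'index=True' is only valid when 'orient' is 'split', 'table', 'index', or 'columns'.

    # 'index' and 'columns' cannot omit it
    try:
        df.to_json(orient="columns", index=False)
    except ValueError as err:
        print(err)  # 'index=False' is only valid when 'orient' is 'split', 'table', 'records', or 'values'.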
28 changes: 8 additions & 20 deletions pandas/tests/arrays/sparse/test_array.py
@@ -52,33 +52,21 @@ def test_set_fill_value(self):
         arr.fill_value = 2
         assert arr.fill_value == 2

-        # TODO: this seems fine? You can construct an integer
-        # sparsearray with NaN fill value, why not update one?
-        # coerces to int
-        # msg = "unable to set fill_value 3\\.1 to int64 dtype"
-        # with pytest.raises(ValueError, match=msg):
-        arr.fill_value = 3.1
+        msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            arr.fill_value = 3.1
         assert arr.fill_value == 3.1

-        # msg = "unable to set fill_value nan to int64 dtype"
-        # with pytest.raises(ValueError, match=msg):
         arr.fill_value = np.nan
         assert np.isnan(arr.fill_value)

         arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
         arr.fill_value = True
-        assert arr.fill_value
-
-        # FIXME: don't leave commented-out
-        # coerces to bool
-        # TODO: we can construct an sparse array of bool
-        # type and use as fill_value any value
-        # msg = "fill_value must be True, False or nan"
-        # with pytest.raises(ValueError, match=msg):
-        #     arr.fill_value = 0
-
-        # msg = "unable to set fill_value nan to bool dtype"
-        # with pytest.raises(ValueError, match=msg):
+        assert arr.fill_value is True
+
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            arr.fill_value = 0
+
         arr.fill_value = np.nan
         assert np.isnan(arr.fill_value)
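From the user's side, the fill_value setter now warns instead of silently accepting any scalar; a sketch of the behavior the updated test pins down:

    import numpy as np
    from pandas.arrays import SparseArray

    arr = SparseArray([0, 1, 2], dtype=np.int64)  # integer subtype
    arr.fill_value = 2        # compatible scalar: no warning
    arr.fill_value = np.nan   # NaN is always accepted for int/bool subtypes
    arr.fill_value = 3.1      # FutureWarning: arbitrary fill_value is deprecated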
16 changes: 1 addition & 15 deletions pandas/tests/arrays/sparse/test_astype.py
@@ -3,11 +3,7 @@

 from pandas._libs.sparse import IntIndex

-from pandas import (
-    DataFrame,
-    Series,
-    Timestamp,
-)
+from pandas import Timestamp
 import pandas._testing as tm
 from pandas.core.arrays.sparse import (
     SparseArray,
@@ -135,13 +131,3 @@ def test_astype_dt64_to_int64(self):
         arr3 = SparseArray(values, dtype=dtype)
         result3 = arr3.astype("int64")
         tm.assert_numpy_array_equal(result3, expected)
-
-
-def test_dtype_sparse_with_fill_value_not_present_in_data():
-    # GH 49987
-    df = DataFrame([["a", 0], ["b", 1], ["b", 2]], columns=["A", "B"])
-    result = df["A"].astype(SparseDtype("category", fill_value="c"))
-    expected = Series(
-        ["a", "b", "b"], name="A", dtype=SparseDtype("object", fill_value="c")
-    )
-    tm.assert_series_equal(result, expected)
23 changes: 19 additions & 4 deletions pandas/tests/arrays/sparse/test_dtype.py
@@ -1,4 +1,5 @@
 import re
+import warnings

 import numpy as np
 import pytest
@@ -67,15 +68,22 @@ def test_nans_equal():
     assert b == a


-@pytest.mark.parametrize(
-    "a, b",
-    [
+with warnings.catch_warnings():
+    msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
+    warnings.filterwarnings("ignore", msg, category=FutureWarning)
+
+    tups = [
         (SparseDtype("float64"), SparseDtype("float32")),
         (SparseDtype("float64"), SparseDtype("float64", 0)),
         (SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)),
         (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)),
         (SparseDtype("float64"), np.dtype("float64")),
-    ],
+    ]
+
+
+@pytest.mark.parametrize(
+    "a, b",
+    tups,
 )
 def test_not_equal(a, b):
     assert a != b
@@ -207,3 +215,10 @@ def test_repr():
     result = str(SparseDtype(object, fill_value="0"))
     expected = "Sparse[object, '0']"
     assert result == expected
+
+
+def test_sparse_dtype_subtype_must_be_numpy_dtype():
+    # GH#53160
+    msg = "SparseDtype subtype must be a numpy dtype"
+    with pytest.raises(TypeError, match=msg):
+        SparseDtype("category", fill_value="c")
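The module-level catch_warnings block is needed because pytest builds parametrize arguments at collection time, so constructing SparseDtype(int, pd.NaT) at import would otherwise emit the new FutureWarning before any test runs. The general pattern, as a sketch (make_deprecated_case is a hypothetical stand-in):

    import warnings

    with warnings.catch_warnings():
        # suppress one known, intentional deprecation during collection only
        warnings.filterwarnings("ignore", "known deprecation message", category=FutureWarning)
        CASES = [make_deprecated_case()]  # hypothetical helper evaluated at import time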
24 changes: 24 additions & 0 deletions pandas/tests/groupby/test_groupby.py
@@ -3058,3 +3058,27 @@ def test_groupby_selection_other_methods(df):
     tm.assert_frame_equal(
         g.filter(lambda x: len(x) == 3), g_exp.filter(lambda x: len(x) == 3)
     )
+
+
+def test_groupby_with_Time_Grouper():
+    idx2 = [
+        to_datetime("2016-08-31 22:08:12.000"),
+        to_datetime("2016-08-31 22:09:12.200"),
+        to_datetime("2016-08-31 22:20:12.400"),
+    ]
+
+    test_data = DataFrame(
+        {"quant": [1.0, 1.0, 3.0], "quant2": [1.0, 1.0, 3.0], "time2": idx2}
+    )
+
+    expected_output = DataFrame(
+        {
+            "time2": date_range("2016-08-31 22:08:00", periods=13, freq="1T"),
+            "quant": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
+            "quant2": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
+        }
+    )
+
+    df = test_data.groupby(Grouper(key="time2", freq="1T")).count().reset_index()
+
+    tm.assert_frame_equal(df, expected_output)
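For context, Grouper(key=..., freq=...) bins rows into fixed time intervals much like a resample, and count() reports intervals with no rows as 0, which is why the expected frame spans all 13 minutes between the first and last timestamp. A minimal sketch (illustrative data):

    import pandas as pd

    df = pd.DataFrame(
        {
            "time2": pd.to_datetime(["2016-08-31 22:08:12", "2016-08-31 22:20:12"]),
            "quant": [1.0, 3.0],
        }
    )
    # one row per minute from 22:08 to 22:20; empty minutes count as 0
    out = df.groupby(pd.Grouper(key="time2", freq="1T")).count().reset_index()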