Commit e131cf0
Merge remote-tracking branch 'upstream/main' into arrow-non-nano
lukemanley committed May 12, 2023
2 parents b460e6e + e19ceb7
Showing 15 changed files with 193 additions and 102 deletions.
6 changes: 3 additions & 3 deletions doc/source/conf.py
@@ -237,14 +237,14 @@

 html_theme_options = {
     "external_links": [],
-    "footer_items": ["pandas_footer", "sphinx-version"],
+    "footer_start": ["pandas_footer", "sphinx-version"],
     "github_url": "https://github.com/pandas-dev/pandas",
     "twitter_url": "https://twitter.com/pandas_dev",
-    "google_analytics_id": "UA-27880019-2",
+    "analytics": {"google_analytics_id": "UA-27880019-2"},
     "logo": {"image_dark": "https://pandas.pydata.org/static/img/pandas_white.svg"},
     "navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"],
     "switcher": {
-        "json_url": "/versions.json",
+        "json_url": "https://pandas.pydata.org/versions.json",
         "version_match": switcher_version,
     },
     "icon_links": [
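Net effect: the options now target the newer pydata-sphinx-theme API ("footer_start" replacing "footer_items", analytics nested under an "analytics" dict, and an absolute switcher URL), matching the unpinned theme in environment.yml below. A sketch of just the migrated keys, with unchanged context keys omitted:

    html_theme_options = {
        "footer_start": ["pandas_footer", "sphinx-version"],
        "analytics": {"google_analytics_id": "UA-27880019-2"},
        "switcher": {
            "json_url": "https://pandas.pydata.org/versions.json",
            "version_match": switcher_version,
        },
    }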
2 changes: 1 addition & 1 deletion doc/source/development/contributing_environment.rst
@@ -225,7 +225,7 @@ To compile pandas with meson, run::
    # Build and install pandas
    python -m pip install -ve . --no-build-isolation

-** Build options **
+**Build options**

 It is possible to pass options from the pip frontend to the meson backend if you would like to configure your
 install. Occasionally, you'll want to use this to adjust the build directory, and/or toggle debug/optimization levels.
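As an illustration of that passthrough (a sketch, not part of this diff — the exact keys supported depend on the meson-python version in use):

    # hypothetical example: choose a custom build directory through the meson backend
    python -m pip install -ve . --no-build-isolation --config-settings=builddir="builddir"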
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v2.1.0.rst
@@ -92,13 +92,13 @@ Other enhancements
 - Implemented ``__pandas_priority__`` to allow custom types to take precedence over :class:`DataFrame`, :class:`Series`, :class:`Index`, or :class:`ExtensionArray` for arithmetic operations, :ref:`see the developer guide <extending.pandas_priority>` (:issue:`48347`)
 - Improve error message when having incompatible columns using :meth:`DataFrame.merge` (:issue:`51861`)
 - Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`)
+- Improved error handling when using :meth:`DataFrame.to_json` with incompatible ``index`` and ``orient`` arguments (:issue:`52143`)
 - Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`)
 - Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`)
 - Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"``
 - :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`)
 - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
 - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`)
--

 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.notable_bug_fixes:
@@ -261,6 +261,7 @@ Deprecations
 - Deprecated unused "closed" keyword in the :class:`TimedeltaIndex` constructor (:issue:`52628`)
 - Deprecated logical operation between two non boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs. (:issue:`52500`, :issue:`52538`)
 - Deprecated allowing ``downcast`` keyword other than ``None``, ``False``, "infer", or a dict with these as values in :meth:`Series.fillna`, :meth:`DataFrame.fillna` (:issue:`40988`)
+- Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`; in a future version the ``fill_value`` will need to be compatible with the ``dtype.subtype``, either a scalar that can be held by that subtype or ``NaN`` for integer or bool subtypes (:issue:`23124`)
 - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
 -

@@ -426,6 +427,7 @@ Reshaping

 Sparse
 ^^^^^^
+- Bug in :class:`SparseDtype` constructor failing to raise ``TypeError`` when given an incompatible ``dtype`` for its subtype, which must be a ``numpy`` dtype (:issue:`53160`)
 - Bug in :meth:`arrays.SparseArray.map` allowed the fill value to be included in the sparse values (:issue:`52095`)
 -

2 changes: 1 addition & 1 deletion environment.yml
@@ -89,7 +89,7 @@ dependencies:
   - gitdb
   - natsort # DataFrame.sort_values doctest
   - numpydoc
-  - pydata-sphinx-theme<0.11
+  - pydata-sphinx-theme
   - pytest-cython # doctest
   - sphinx
   - sphinx-design
55 changes: 43 additions & 12 deletions pandas/core/arrays/sparse/dtype.py
@@ -18,18 +18,23 @@
     ExtensionDtype,
     register_extension_dtype,
 )
+from pandas.core.dtypes.cast import can_hold_element
 from pandas.core.dtypes.common import (
     is_bool_dtype,
     is_object_dtype,
     is_scalar,
     is_string_dtype,
     pandas_dtype,
 )
+from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import (
+    is_valid_na_for_dtype,
     isna,
     na_value_for_dtype,
 )

+from pandas.core.construction import ensure_wrapped_if_datetimelike
+
 if TYPE_CHECKING:
     from pandas._typing import (
         Dtype,
@@ -91,6 +96,9 @@ def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None:
         dtype = pandas_dtype(dtype)
         if is_string_dtype(dtype):
             dtype = np.dtype("object")
+        if not isinstance(dtype, np.dtype):
+            # GH#53160
+            raise TypeError("SparseDtype subtype must be a numpy dtype")

         if fill_value is None:
             fill_value = na_value_for_dtype(dtype)
@@ -161,18 +169,41 @@ def _check_fill_value(self):
             raise ValueError(
                 f"fill_value must be a scalar. Got {self._fill_value} instead"
             )
-        # TODO: Right now we can use Sparse boolean array
-        # with any fill_value. Here was an attempt
-        # to allow only 3 value: True, False or nan
-        # but plenty test has failed.
-        # see pull 44955
-        # if self._is_boolean and not (
-        #     is_bool(self._fill_value) or isna(self._fill_value)
-        # ):
-        #     raise ValueError(
-        #         "fill_value must be True, False or nan "
-        #         f"for boolean type. Got {self._fill_value} instead"
-        #     )
+
+        # GH#23124 require fill_value and subtype to match
+        val = self._fill_value
+        if isna(val):
+            if not is_valid_na_for_dtype(val, self.subtype):
+                warnings.warn(
+                    "Allowing arbitrary scalar fill_value in SparseDtype is "
+                    "deprecated. In a future version, the fill_value must be "
+                    "a valid value for the SparseDtype.subtype.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+        elif isinstance(self.subtype, CategoricalDtype):
+            # TODO: is this even supported? It is reached in
+            # test_dtype_sparse_with_fill_value_not_present_in_data
+            if self.subtype.categories is None or val not in self.subtype.categories:
+                warnings.warn(
+                    "Allowing arbitrary scalar fill_value in SparseDtype is "
+                    "deprecated. In a future version, the fill_value must be "
+                    "a valid value for the SparseDtype.subtype.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+        else:
+            dummy = np.empty(0, dtype=self.subtype)
+            dummy = ensure_wrapped_if_datetimelike(dummy)
+
+            if not can_hold_element(dummy, val):
+                warnings.warn(
+                    "Allowing arbitrary scalar fill_value in SparseDtype is "
+                    "deprecated. In a future version, the fill_value must be "
+                    "a valid value for the SparseDtype.subtype.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )

     @property
     def _is_na_fill_value(self) -> bool:
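Taken together, these hunks tighten SparseDtype validation. A sketch of the user-visible behavior implied by the diff (illustrative, not taken from the commit):

    import warnings

    import numpy as np
    import pandas as pd

    # GH#53160: the subtype must now be a numpy dtype
    try:
        pd.SparseDtype("category", fill_value="c")
    except TypeError as err:
        print(err)  # SparseDtype subtype must be a numpy dtype

    # NaN remains a valid fill_value for integer subtypes (no warning)
    pd.SparseDtype(np.int64, fill_value=np.nan)

    # GH#23124: a fill_value the subtype cannot hold is deprecated, not yet an error
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        pd.SparseDtype(np.int64, fill_value=pd.NaT)
    assert any(issubclass(w.category, FutureWarning) for w in caught)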
11 changes: 6 additions & 5 deletions pandas/core/generic.py
@@ -2306,7 +2306,7 @@ def to_json(
         default_handler: Callable[[Any], JSONSerializable] | None = None,
         lines: bool_t = False,
         compression: CompressionOptions = "infer",
-        index: bool_t = True,
+        index: bool_t | None = None,
         indent: int | None = None,
         storage_options: StorageOptions = None,
         mode: Literal["a", "w"] = "w",
@@ -2375,10 +2375,11 @@
             .. versionchanged:: 1.4.0 Zstandard support.

-        index : bool, default True
-            Whether to include the index values in the JSON string. Not
-            including the index (``index=False``) is only supported when
-            orient is 'split' or 'table'.
+        index : bool or None, default None
+            The index is only used when 'orient' is 'split', 'index', 'columns',
+            or 'table'. Of these, 'index' and 'columns' do not support
+            ``index=False``.

         indent : int, optional
             Length of whitespace used to indent each record.
19 changes: 14 additions & 5 deletions pandas/io/json/_json.py
@@ -100,7 +100,7 @@ def to_json(
     default_handler: Callable[[Any], JSONSerializable] | None = ...,
     lines: bool = ...,
     compression: CompressionOptions = ...,
-    index: bool = ...,
+    index: bool | None = ...,
     indent: int = ...,
     storage_options: StorageOptions = ...,
     mode: Literal["a", "w"] = ...,
@@ -120,7 +120,7 @@ def to_json(
     default_handler: Callable[[Any], JSONSerializable] | None = ...,
     lines: bool = ...,
     compression: CompressionOptions = ...,
-    index: bool = ...,
+    index: bool | None = ...,
     indent: int = ...,
     storage_options: StorageOptions = ...,
     mode: Literal["a", "w"] = ...,
@@ -139,15 +139,24 @@
     default_handler: Callable[[Any], JSONSerializable] | None = None,
     lines: bool = False,
     compression: CompressionOptions = "infer",
-    index: bool = True,
+    index: bool | None = None,
     indent: int = 0,
     storage_options: StorageOptions = None,
     mode: Literal["a", "w"] = "w",
 ) -> str | None:
-    if not index and orient not in ["split", "table"]:
+    if orient in ["records", "values"] and index is True:
         raise ValueError(
-            "'index=False' is only valid when 'orient' is 'split' or 'table'"
+            "'index=True' is only valid when 'orient' is 'split', 'table', "
+            "'index', or 'columns'."
         )
+    elif orient in ["index", "columns"] and index is False:
+        raise ValueError(
+            "'index=False' is only valid when 'orient' is 'split', 'table', "
+            "'records', or 'values'."
+        )
+    elif index is None:
+        # will be ignored for orient='records' and 'values'
+        index = True

     if lines and orient != "records":
         raise ValueError("'lines' keyword only valid when 'orient' is records")
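The reworked check is now symmetric: orients that never serialize the index reject an explicit index=True, and orients that require the index reject index=False. A sketch of both error paths (illustrative):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})

    # 'records' and 'values' never write the index
    try:
        df.to_json(orient="records", index=True)
    except ValueError as err:
        print(err)  # 'index=True' is only valid when 'orient' is 'split', 'table', 'index', or 'columns'.

    # 'index' and 'columns' cannot omit it
    try:
        df.to_json(orient="columns", index=False)
    except ValueError as err:
        print(err)  # 'index=False' is only valid when 'orient' is 'split', 'table', 'records', or 'values'.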
28 changes: 8 additions & 20 deletions pandas/tests/arrays/sparse/test_array.py
@@ -52,33 +52,21 @@ def test_set_fill_value(self):
         arr.fill_value = 2
         assert arr.fill_value == 2

-        # TODO: this seems fine? You can construct an integer
-        # sparsearray with NaN fill value, why not update one?
-        # coerces to int
-        # msg = "unable to set fill_value 3\\.1 to int64 dtype"
-        # with pytest.raises(ValueError, match=msg):
-        arr.fill_value = 3.1
+        msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            arr.fill_value = 3.1
         assert arr.fill_value == 3.1

-        # msg = "unable to set fill_value nan to int64 dtype"
-        # with pytest.raises(ValueError, match=msg):
         arr.fill_value = np.nan
         assert np.isnan(arr.fill_value)

         arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
         arr.fill_value = True
-        assert arr.fill_value
-
-        # FIXME: don't leave commented-out
-        # coerces to bool
-        # TODO: we can construct an sparse array of bool
-        # type and use as fill_value any value
-        # msg = "fill_value must be True, False or nan"
-        # with pytest.raises(ValueError, match=msg):
-        #     arr.fill_value = 0
-
-        # msg = "unable to set fill_value nan to bool dtype"
-        # with pytest.raises(ValueError, match=msg):
+        assert arr.fill_value is True
+
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            arr.fill_value = 0
+
         arr.fill_value = np.nan
         assert np.isnan(arr.fill_value)
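From the user's side, the fill_value setter now warns instead of silently accepting any scalar; a sketch of the behavior the updated test pins down:

    import numpy as np
    from pandas.arrays import SparseArray

    arr = SparseArray([0, 1, 2], dtype=np.int64)  # integer subtype
    arr.fill_value = 2        # compatible scalar: no warning
    arr.fill_value = np.nan   # NaN is always accepted for int/bool subtypes
    arr.fill_value = 3.1      # FutureWarning: arbitrary fill_value is deprecated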
16 changes: 1 addition & 15 deletions pandas/tests/arrays/sparse/test_astype.py
@@ -3,11 +3,7 @@

 from pandas._libs.sparse import IntIndex

-from pandas import (
-    DataFrame,
-    Series,
-    Timestamp,
-)
+from pandas import Timestamp
 import pandas._testing as tm
 from pandas.core.arrays.sparse import (
     SparseArray,
@@ -135,13 +131,3 @@ def test_astype_dt64_to_int64(self):
         arr3 = SparseArray(values, dtype=dtype)
         result3 = arr3.astype("int64")
         tm.assert_numpy_array_equal(result3, expected)
-
-
-def test_dtype_sparse_with_fill_value_not_present_in_data():
-    # GH 49987
-    df = DataFrame([["a", 0], ["b", 1], ["b", 2]], columns=["A", "B"])
-    result = df["A"].astype(SparseDtype("category", fill_value="c"))
-    expected = Series(
-        ["a", "b", "b"], name="A", dtype=SparseDtype("object", fill_value="c")
-    )
-    tm.assert_series_equal(result, expected)
23 changes: 19 additions & 4 deletions pandas/tests/arrays/sparse/test_dtype.py
@@ -1,4 +1,5 @@
 import re
+import warnings

 import numpy as np
 import pytest
@@ -67,15 +68,22 @@ def test_nans_equal():
     assert b == a


-@pytest.mark.parametrize(
-    "a, b",
-    [
+with warnings.catch_warnings():
+    msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
+    warnings.filterwarnings("ignore", msg, category=FutureWarning)
+
+    tups = [
         (SparseDtype("float64"), SparseDtype("float32")),
         (SparseDtype("float64"), SparseDtype("float64", 0)),
         (SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)),
         (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)),
         (SparseDtype("float64"), np.dtype("float64")),
-    ],
+    ]
+
+
+@pytest.mark.parametrize(
+    "a, b",
+    tups,
 )
 def test_not_equal(a, b):
     assert a != b
@@ -207,3 +215,10 @@ def test_repr():
     result = str(SparseDtype(object, fill_value="0"))
     expected = "Sparse[object, '0']"
     assert result == expected
+
+
+def test_sparse_dtype_subtype_must_be_numpy_dtype():
+    # GH#53160
+    msg = "SparseDtype subtype must be a numpy dtype"
+    with pytest.raises(TypeError, match=msg):
+        SparseDtype("category", fill_value="c")
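The module-level catch_warnings block is needed because pytest builds parametrize arguments at collection time, so constructing SparseDtype(int, pd.NaT) at import would otherwise emit the new FutureWarning before any test runs. The general pattern, as a sketch (make_deprecated_case is a hypothetical stand-in):

    import warnings

    with warnings.catch_warnings():
        # suppress one known, intentional deprecation during collection only
        warnings.filterwarnings("ignore", "known deprecation message", category=FutureWarning)
        CASES = [make_deprecated_case()]  # hypothetical helper evaluated at import time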
24 changes: 24 additions & 0 deletions pandas/tests/groupby/test_groupby.py
@@ -3058,3 +3058,27 @@ def test_groupby_selection_other_methods(df):
     tm.assert_frame_equal(
         g.filter(lambda x: len(x) == 3), g_exp.filter(lambda x: len(x) == 3)
     )
+
+
+def test_groupby_with_Time_Grouper():
+    idx2 = [
+        to_datetime("2016-08-31 22:08:12.000"),
+        to_datetime("2016-08-31 22:09:12.200"),
+        to_datetime("2016-08-31 22:20:12.400"),
+    ]
+
+    test_data = DataFrame(
+        {"quant": [1.0, 1.0, 3.0], "quant2": [1.0, 1.0, 3.0], "time2": idx2}
+    )
+
+    expected_output = DataFrame(
+        {
+            "time2": date_range("2016-08-31 22:08:00", periods=13, freq="1T"),
+            "quant": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
+            "quant2": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
+        }
+    )
+
+    df = test_data.groupby(Grouper(key="time2", freq="1T")).count().reset_index()
+
+    tm.assert_frame_equal(df, expected_output)
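For context, Grouper(key=..., freq=...) bins rows into fixed time intervals much like a resample, and count() reports intervals with no rows as 0, which is why the expected frame spans all 13 minutes between the first and last timestamp. A minimal sketch (illustrative data):

    import pandas as pd

    df = pd.DataFrame(
        {
            "time2": pd.to_datetime(["2016-08-31 22:08:12", "2016-08-31 22:20:12"]),
            "quant": [1.0, 3.0],
        }
    )
    # one row per minute from 22:08 to 22:20; empty minutes count as 0
    out = df.groupby(pd.Grouper(key="time2", freq="1T")).count().reset_index()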