Merge branch 'main' into ref-dedup
jbrockmendel committed Sep 10, 2024
2 parents feb30a4 + 50ac190 commit f970e3f
Showing 56 changed files with 533 additions and 277 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unit-tests.yml
@@ -380,7 +380,7 @@ jobs:
fetch-depth: 0

- name: Set up Python Free-threading Version
uses: deadsnakes/action@v3.1.0
uses: deadsnakes/action@v3.2.0
with:
python-version: 3.13-dev
nogil: true
15 changes: 0 additions & 15 deletions ci/code_checks.sh
@@ -75,9 +75,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Period.ordinal GL08" \
-i "pandas.PeriodDtype.freq SA01" \
-i "pandas.RangeIndex.from_range PR01,SA01" \
-i "pandas.RangeIndex.start SA01" \
-i "pandas.RangeIndex.step SA01" \
-i "pandas.RangeIndex.stop SA01" \
-i "pandas.Series.cat.add_categories PR01,PR02" \
-i "pandas.Series.cat.as_ordered PR01" \
-i "pandas.Series.cat.as_unordered PR01" \
@@ -92,10 +90,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Series.dt.floor PR01,PR02" \
-i "pandas.Series.dt.freq GL08" \
-i "pandas.Series.dt.month_name PR01,PR02" \
-i "pandas.Series.dt.nanoseconds SA01" \
-i "pandas.Series.dt.normalize PR01" \
-i "pandas.Series.dt.round PR01,PR02" \
-i "pandas.Series.dt.seconds SA01" \
-i "pandas.Series.dt.strftime PR01,PR02" \
-i "pandas.Series.dt.to_period PR01,PR02" \
-i "pandas.Series.dt.total_seconds PR01" \
@@ -113,8 +109,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Timedelta.resolution PR02" \
-i "pandas.Timedelta.to_timedelta64 SA01" \
-i "pandas.Timedelta.total_seconds SA01" \
-i "pandas.TimedeltaIndex.nanoseconds SA01" \
-i "pandas.TimedeltaIndex.seconds SA01" \
-i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \
-i "pandas.Timestamp.max PR02" \
-i "pandas.Timestamp.min PR02" \
@@ -123,13 +117,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Timestamp.tzinfo GL08" \
-i "pandas.Timestamp.year GL08" \
-i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \
-i "pandas.api.types.is_bool PR01,SA01" \
-i "pandas.api.types.is_categorical_dtype SA01" \
-i "pandas.api.types.is_complex PR01,SA01" \
-i "pandas.api.types.is_complex_dtype SA01" \
-i "pandas.api.types.is_datetime64_dtype SA01" \
-i "pandas.api.types.is_datetime64_ns_dtype SA01" \
-i "pandas.api.types.is_datetime64tz_dtype SA01" \
-i "pandas.api.types.is_dict_like PR07,SA01" \
-i "pandas.api.types.is_extension_array_dtype SA01" \
-i "pandas.api.types.is_file_like PR07,SA01" \
@@ -163,7 +150,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.core.groupby.DataFrameGroupBy.agg RT03" \
-i "pandas.core.groupby.DataFrameGroupBy.aggregate RT03" \
-i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \
-i "pandas.core.groupby.DataFrameGroupBy.filter SA01" \
-i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \
-i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \
-i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \
@@ -179,7 +165,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \
-i "pandas.core.groupby.SeriesGroupBy.agg RT03" \
-i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \
-i "pandas.core.groupby.SeriesGroupBy.filter PR01,SA01" \
-i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \
-i "pandas.core.groupby.SeriesGroupBy.groups SA01" \
-i "pandas.core.groupby.SeriesGroupBy.indices SA01" \
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.3.0.rst
@@ -103,8 +103,9 @@ Conversion
Strings
^^^^^^^
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
- Bug in :meth:`Series.str.slice` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`); see the example below
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the Python behavior in corner cases with an odd number of fill characters (:issue:`54792`)

-
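For illustration, a doctest of the fixed Series.str.slice behavior (assumes pyarrow is installed; output formatting is indicative):

>>> import pandas as pd
>>> ser = pd.Series(["abcd", "efgh"], dtype="string[pyarrow]")
>>> ser.str.slice(step=-1)  # negative step now matches Python slicing
0    dcba
1    hgfe
dtype: string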

Interval
^^^^^^^^
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -55,6 +55,7 @@ Other enhancements
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`); see the example below
- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
- Support passing an :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)
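As an illustration of the new ``dtype`` parameter (doctest is indicative):

>>> import pandas as pd
>>> pd.Series(["a|b", "b|c"]).str.get_dummies(sep="|", dtype=bool)
       a      b      c
0   True   True  False
1  False   True   True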
41 changes: 35 additions & 6 deletions pandas/_libs/lib.pyx
@@ -733,7 +733,9 @@ cpdef ndarray[object] ensure_string_array(
convert_na_value : bool, default True
If False, existing na values will be used unchanged in the new array.
copy : bool, default True
Whether to ensure that a new array is returned.
Whether to ensure that a new array is returned. When True, a new array
is always returned. When False, a new array is only returned when needed
to avoid mutating the input array.
skipna : bool, default True
Whether or not to coerce nulls to their stringified form
(e.g. if False, NaN becomes 'nan').
@@ -762,11 +764,15 @@

result = np.asarray(arr, dtype="object")

if copy and (result is arr or np.shares_memory(arr, result)):
# GH#54654
result = result.copy()
elif not copy and result is arr:
already_copied = False
if result is arr or np.may_share_memory(arr, result):
# if np.asarray(..) did not make a copy of the input arr, we still need
# to do that to avoid mutating the input array
# GH#54654: share_memory check is needed for rare cases where np.asarray
# returns a new object without making a copy of the actual data
if copy:
result = result.copy()
else:
already_copied = False
elif not copy and not result.flags.writeable:
# Weird edge case where result is a view
already_copied = False
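A minimal NumPy doctest (illustrative, outside the Cython code) of the case this branch guards against: np.asarray can return the input array itself, so mutating the result without copying first would mutate the caller's data.

>>> import numpy as np
>>> arr = np.array(["x", "y"], dtype=object)
>>> result = np.asarray(arr, dtype="object")
>>> result is arr  # no copy was made, so a copy is needed before mutation
True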
@@ -1123,10 +1129,21 @@ def is_bool(obj: object) -> bool:
"""
Return True if given object is boolean.

Parameters
----------
obj : object
Object to check.

Returns
-------
bool

See Also
--------
api.types.is_scalar : Check if the input is a scalar.
api.types.is_integer : Check if the input is an integer.
api.types.is_float : Check if the input is a float.

Examples
--------
>>> pd.api.types.is_bool(True)
@@ -1142,10 +1159,22 @@ def is_complex(obj: object) -> bool:
"""
Return True if given object is complex.

Parameters
----------
obj : object
Object to check.

Returns
-------
bool

See Also
--------
api.types.is_complex_dtype : Check whether the provided array or
    dtype is of a complex dtype.
api.types.is_number : Check if the object is a number.
api.types.is_integer : Return True if given object is integer.

Examples
--------
>>> pd.api.types.is_complex(1 + 1j)
28 changes: 28 additions & 0 deletions pandas/conftest.py
@@ -1272,6 +1272,34 @@ def string_dtype(request):
return request.param


@pytest.fixture(
params=[
("python", pd.NA),
pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
("python", np.nan),
],
ids=[
"string=string[python]",
"string=string[pyarrow]",
"string=str[pyarrow]",
"string=str[python]",
],
)
def string_dtype_no_object(request):
"""
Parametrized fixture for string dtypes.
* 'string[python]' (NA variant)
* 'string[pyarrow]' (NA variant)
* 'str' (NaN variant, with pyarrow)
* 'str' (NaN variant, without pyarrow)
"""
# need to instantiate the StringDtype here instead of in the params
# to avoid importing pyarrow during test collection
storage, na_value = request.param
return pd.StringDtype(storage, na_value)
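A hypothetical usage sketch (test name and assertion invented for illustration; pytest injects one parametrized StringDtype per test run):

def test_string_dtype_variants(string_dtype_no_object):
    # missing values become the variant's na_value (pd.NA or np.nan)
    ser = pd.Series(["a", None], dtype=string_dtype_no_object)
    assert ser.dtype.storage in ("python", "pyarrow")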


@pytest.fixture(
params=[
"string[python]",
32 changes: 23 additions & 9 deletions pandas/core/arrays/_arrow_string_mixins.py
@@ -11,6 +11,7 @@

from pandas.compat import (
pa_version_under10p1,
pa_version_under11p0,
pa_version_under13p0,
pa_version_under17p0,
)
@@ -22,10 +23,7 @@
import pyarrow.compute as pc

if TYPE_CHECKING:
from collections.abc import (
Callable,
Sized,
)
from collections.abc import Callable

from pandas._typing import (
Scalar,
@@ -34,7 +32,7 @@


class ArrowStringArrayMixin:
_pa_array: Sized
_pa_array: pa.ChunkedArray

def __init__(self, *args, **kwargs) -> None:
raise NotImplementedError
@@ -127,13 +125,29 @@ def _str_get(self, i: int) -> Self:
selected = pc.utf8_slice_codeunits(
self._pa_array, start=start, stop=stop, step=step
)
null_value = pa.scalar(
None,
type=self._pa_array.type, # type: ignore[attr-defined]
)
null_value = pa.scalar(None, type=self._pa_array.type)
result = pc.if_else(not_out_of_bounds, selected, null_value)
return type(self)(result)

def _str_slice(
self, start: int | None = None, stop: int | None = None, step: int | None = None
) -> Self:
if pa_version_under11p0:
# GH#59724
result = self._apply_elementwise(lambda val: val[start:stop:step])
return type(self)(pa.chunked_array(result, type=self._pa_array.type))
if start is None:
if step is not None and step < 0:
# GH#59710
start = -1
else:
start = 0
if step is None:
step = 1
return type(self)(
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
)

def _str_slice_replace(
self, start: int | None = None, stop: int | None = None, repl: str | None = None
) -> Self:
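The start = -1 default above mirrors Python's implicit start position for a negative step; a plain-Python doctest of the equivalence (illustrative):

>>> "abcd"[::-1]    # implicit start at the last character
'dcba'
>>> "abcd"[-1::-1]  # explicit form matching start=-1 with a negative step
'dcba'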
26 changes: 13 additions & 13 deletions pandas/core/arrays/arrow/array.py
@@ -41,6 +41,7 @@
is_list_like,
is_numeric_dtype,
is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna
@@ -2376,17 +2377,6 @@ def _str_rpartition(self, sep: str, expand: bool) -> Self:
result = self._apply_elementwise(predicate)
return type(self)(pa.chunked_array(result))

def _str_slice(
self, start: int | None = None, stop: int | None = None, step: int | None = None
) -> Self:
if start is None:
start = 0
if step is None:
step = 1
return type(self)(
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
)

def _str_removeprefix(self, prefix: str):
if not pa_version_under13p0:
starts_with = pc.starts_with(self._pa_array, pattern=prefix)
@@ -2428,7 +2418,9 @@ def _str_findall(self, pat: str, flags: int = 0) -> Self:
result = self._apply_elementwise(predicate)
return type(self)(pa.chunked_array(result))

def _str_get_dummies(self, sep: str = "|"):
def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
if dtype is None:
dtype = np.bool_
split = pc.split_pattern(self._pa_array, sep)
flattened_values = pc.list_flatten(split)
uniques = flattened_values.unique()
Expand All @@ -2438,7 +2430,15 @@ def _str_get_dummies(self, sep: str = "|"):
n_cols = len(uniques)
indices = pc.index_in(flattened_values, uniques_sorted).to_numpy()
indices = indices + np.arange(n_rows).repeat(lengths) * n_cols
dummies = np.zeros(n_rows * n_cols, dtype=np.bool_)
_dtype = pandas_dtype(dtype)
dummies_dtype: NpDtype
if isinstance(_dtype, np.dtype):
dummies_dtype = _dtype
else:
dummies_dtype = np.bool_
dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype)
if dtype == str:
dummies[:] = False
dummies[indices] = True
dummies = dummies.reshape((n_rows, n_cols))
result = type(self)(pa.array(list(dummies)))
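For clarity, the index arithmetic above maps a value in row i and column j to flat position i * n_cols + j; a small worked doctest with invented values:

>>> import numpy as np
>>> n_rows, n_cols = 2, 3
>>> lengths = np.array([2, 1])            # values per row after splitting
>>> col_of_value = np.array([0, 2, 1])    # column index of each flattened value
>>> flat = col_of_value + np.arange(n_rows).repeat(lengths) * n_cols
>>> flat
array([0, 2, 4])
>>> dummies = np.zeros(n_rows * n_cols, dtype=bool)
>>> dummies[flat] = True
>>> dummies.reshape((n_rows, n_cols))
array([[ True, False,  True],
       [False,  True, False]])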
4 changes: 2 additions & 2 deletions pandas/core/arrays/categorical.py
@@ -2681,11 +2681,11 @@ def _str_map(
result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype)
return take_nd(result, codes, fill_value=na_value)

def _str_get_dummies(self, sep: str = "|"):
def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
# sep may not be in categories. Just bail on this.
from pandas.core.arrays import NumpyExtensionArray

return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep)
return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep, dtype)

# ------------------------------------------------------------------------
# GroupBy Methods
27 changes: 16 additions & 11 deletions pandas/core/arrays/string_arrow.py
@@ -55,6 +55,7 @@
from pandas._typing import (
ArrayLike,
Dtype,
NpDtype,
Self,
npt,
)
@@ -305,6 +306,7 @@ def astype(self, dtype, copy: bool = True):
_str_swapcase = ArrowStringArrayMixin._str_swapcase
_str_slice_replace = ArrowStringArrayMixin._str_slice_replace
_str_len = ArrowStringArrayMixin._str_len
_str_slice = ArrowStringArrayMixin._str_slice

def _str_contains(
self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
@@ -351,13 +353,6 @@ def _str_repeat(self, repeats: int | Sequence[int]):
else:
return ArrowExtensionArray._str_repeat(self, repeats=repeats)

def _str_slice(
self, start: int | None = None, stop: int | None = None, step: int | None = None
) -> Self:
if stop is None:
return super()._str_slice(start, stop, step)
return ArrowExtensionArray._str_slice(self, start=start, stop=stop, step=step)

def _str_removeprefix(self, prefix: str):
if not pa_version_under13p0:
return ArrowStringArrayMixin._str_removeprefix(self, prefix)
@@ -379,12 +374,22 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
return super()._str_find(sub, start, end)
return ArrowStringArrayMixin._str_find(self, sub, start, end)

def _str_get_dummies(self, sep: str = "|"):
dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep)
def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
if dtype is None:
dtype = np.int64
dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(
sep, dtype
)
if len(labels) == 0:
return np.empty(shape=(0, 0), dtype=np.int64), labels
return np.empty(shape=(0, 0), dtype=dtype), labels
dummies = np.vstack(dummies_pa.to_numpy())
return dummies.astype(np.int64, copy=False), labels
_dtype = pandas_dtype(dtype)
dummies_dtype: NpDtype
if isinstance(_dtype, np.dtype):
dummies_dtype = _dtype
else:
dummies_dtype = np.bool_
return dummies.astype(dummies_dtype, copy=False), labels

def _convert_int_result(self, result):
if self.dtype.na_value is np.nan:
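Given the dtype = np.int64 default above, behavior is unchanged when no dtype is passed; an indicative doctest (assumes pyarrow is installed):

>>> import pandas as pd
>>> pd.Series(["a|b", "b"], dtype="string[pyarrow]").str.get_dummies(sep="|")
   a  b
0  1  1
1  0  1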
