Skip to content

Commit

Permalink
BUG (string): Series.str.slice with negative step (#59724)
Browse files Browse the repository at this point in the history
Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
  • Loading branch information
jbrockmendel and jorisvandenbossche authored Sep 10, 2024
1 parent 715585d commit 50ac190
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 34 deletions.
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,9 @@ Conversion
Strings
^^^^^^^
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)

-

Interval
^^^^^^^^
Expand Down
32 changes: 23 additions & 9 deletions pandas/core/arrays/_arrow_string_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from pandas.compat import (
pa_version_under10p1,
pa_version_under11p0,
pa_version_under13p0,
pa_version_under17p0,
)
Expand All @@ -22,10 +23,7 @@
import pyarrow.compute as pc

if TYPE_CHECKING:
from collections.abc import (
Callable,
Sized,
)
from collections.abc import Callable

from pandas._typing import (
Scalar,
Expand All @@ -34,7 +32,7 @@


class ArrowStringArrayMixin:
_pa_array: Sized
_pa_array: pa.ChunkedArray

def __init__(self, *args, **kwargs) -> None:
raise NotImplementedError
Expand Down Expand Up @@ -96,13 +94,29 @@ def _str_get(self, i: int) -> Self:
selected = pc.utf8_slice_codeunits(
self._pa_array, start=start, stop=stop, step=step
)
null_value = pa.scalar(
None,
type=self._pa_array.type, # type: ignore[attr-defined]
)
null_value = pa.scalar(None, type=self._pa_array.type)
result = pc.if_else(not_out_of_bounds, selected, null_value)
return type(self)(result)

def _str_slice(
self, start: int | None = None, stop: int | None = None, step: int | None = None
) -> Self:
if pa_version_under11p0:
# GH#59724
result = self._apply_elementwise(lambda val: val[start:stop:step])
return type(self)(pa.chunked_array(result, type=self._pa_array.type))
if start is None:
if step is not None and step < 0:
# GH#59710
start = -1
else:
start = 0
if step is None:
step = 1
return type(self)(
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
)

def _str_slice_replace(
self, start: int | None = None, stop: int | None = None, repl: str | None = None
) -> Self:
Expand Down
11 changes: 0 additions & 11 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2394,17 +2394,6 @@ def _str_rpartition(self, sep: str, expand: bool) -> Self:
result = self._apply_elementwise(predicate)
return type(self)(pa.chunked_array(result))

def _str_slice(
self, start: int | None = None, stop: int | None = None, step: int | None = None
) -> Self:
if start is None:
start = 0
if step is None:
step = 1
return type(self)(
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
)

def _str_len(self) -> Self:
return type(self)(pc.utf8_length(self._pa_array))

Expand Down
14 changes: 1 addition & 13 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ def astype(self, dtype, copy: bool = True):
_str_startswith = ArrowStringArrayMixin._str_startswith
_str_endswith = ArrowStringArrayMixin._str_endswith
_str_pad = ArrowStringArrayMixin._str_pad
_str_slice = ArrowStringArrayMixin._str_slice

def _str_contains(
self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
Expand Down Expand Up @@ -352,19 +353,6 @@ def _str_fullmatch(
pat = f"{pat}$"
return self._str_match(pat, case, flags, na)

def _str_slice(
self, start: int | None = None, stop: int | None = None, step: int | None = None
) -> Self:
if stop is None:
return super()._str_slice(start, stop, step)
if start is None:
start = 0
if step is None:
step = 1
return type(self)(
pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
)

def _str_len(self):
result = pc.utf8_length(self._pa_array)
return self._convert_int_result(result)
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2036,6 +2036,7 @@ def test_str_join_string_type():
[None, 2, None, ["ab", None]],
[None, 2, 1, ["ab", None]],
[1, 3, 1, ["bc", None]],
(None, None, -1, ["dcba", None]),
],
)
def test_str_slice(start, stop, step, exp):
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/strings/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,7 @@ def test_pipe_failures(any_string_dtype):
(2, 5, None, ["foo", "bar", np.nan, "baz"]),
(0, 3, -1, ["", "", np.nan, ""]),
(None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]),
(None, 2, -1, ["owtoo", "owtra", np.nan, "xuqza"]),
(3, 10, 2, ["oto", "ato", np.nan, "aqx"]),
(3, 0, -1, ["ofa", "aba", np.nan, "aba"]),
],
Expand Down

0 comments on commit 50ac190

Please sign in to comment.