Skip to content

Commit

Permalink
String dtype: remove fallback Performance warnings for string methods (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche authored Sep 10, 2024
1 parent 50ac190 commit de51d33
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 117 deletions.
19 changes: 0 additions & 19 deletions pandas/core/arrays/arrow/_arrow_utils.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,8 @@
from __future__ import annotations

import warnings

import numpy as np
import pyarrow

from pandas._config.config import get_option

from pandas.errors import PerformanceWarning
from pandas.util._exceptions import find_stack_level


def fallback_performancewarning(version: str | None = None) -> None:
"""
Raise a PerformanceWarning for falling back to ExtensionArray's
non-pyarrow method
"""
if get_option("performance_warnings"):
msg = "Falling back on a non-pyarrow code path which may decrease performance."
if version is not None:
msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning."
warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())


def pyarrow_array_to_numpy_and_mask(
arr, dtype: np.dtype
Expand Down
8 changes: 0 additions & 8 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@

import numpy as np

from pandas._config.config import get_option

from pandas._libs import (
lib,
missing as libmissing,
Expand Down Expand Up @@ -43,8 +41,6 @@
import pyarrow as pa
import pyarrow.compute as pc

from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning


if TYPE_CHECKING:
from collections.abc import (
Expand Down Expand Up @@ -300,8 +296,6 @@ def _str_contains(
self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
):
if flags:
if get_option("mode.performance_warnings"):
fallback_performancewarning()
return super()._str_contains(pat, case, flags, na, regex)

if not isna(na):
Expand All @@ -327,8 +321,6 @@ def _str_replace(
regex: bool = True,
):
if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
if get_option("mode.performance_warnings"):
fallback_performancewarning()
return super()._str_replace(pat, repl, n, case, flags, regex)

return ArrowExtensionArray._str_replace(self, pat, repl, n, case, flags, regex)
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,6 @@ def test_compare_scalar(self, data, comparison_op):
ser = pd.Series(data)
self._compare_other(ser, data, comparison_op, "abc")

@pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning")
def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op)

Expand Down
12 changes: 0 additions & 12 deletions pandas/tests/indexes/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,9 +246,6 @@ def test_intersection_base(self, index):
with pytest.raises(TypeError, match=msg):
first.intersection([1, 2, 3])

@pytest.mark.filterwarnings(
"ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning"
)
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_union_base(self, index):
index = index.unique()
Expand Down Expand Up @@ -276,9 +273,6 @@ def test_union_base(self, index):
first.union([1, 2, 3])

@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
@pytest.mark.filterwarnings(
"ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning"
)
def test_difference_base(self, sort, index):
first = index[2:]
second = index[:4]
Expand All @@ -305,9 +299,6 @@ def test_difference_base(self, sort, index):
first.difference([1, 2, 3], sort)

@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
@pytest.mark.filterwarnings(
"ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning"
)
def test_symmetric_difference(self, index):
if isinstance(index, CategoricalIndex):
pytest.skip(f"Not relevant for {type(index).__name__}")
Expand Down Expand Up @@ -529,9 +520,6 @@ def test_intersection_difference_match_empty(self, index, sort):


@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
@pytest.mark.filterwarnings(
"ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning"
)
@pytest.mark.parametrize(
"method", ["intersection", "union", "difference", "symmetric_difference"]
)
Expand Down
103 changes: 27 additions & 76 deletions pandas/tests/strings/test_find_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,6 @@
# --------------------------------------------------------------------------------------


def using_pyarrow(dtype):
return dtype == "string" and dtype.storage == "pyarrow"


def test_contains(any_string_dtype):
values = np.array(
["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_
Expand Down Expand Up @@ -458,13 +454,10 @@ def test_replace_mixed_object():
tm.assert_series_equal(result, expected)


def test_replace_unicode(any_string_dtype, performance_warning):
def test_replace_unicode(any_string_dtype):
ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
with tm.maybe_produces_warning(
performance_warning, using_pyarrow(any_string_dtype)
):
result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True)
result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True)
tm.assert_series_equal(result, expected)


Expand All @@ -478,24 +471,21 @@ def test_replace_wrong_repl_type_raises(any_string_dtype, index_or_series, repl,
obj.str.replace("a", repl)


def test_replace_callable(any_string_dtype, performance_warning):
def test_replace_callable(any_string_dtype):
# GH 15055
ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)

# test with callable
repl = lambda m: m.group(0).swapcase()
with tm.maybe_produces_warning(
performance_warning, using_pyarrow(any_string_dtype)
):
result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True)
result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True)
expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"repl", [lambda: None, lambda m, x: None, lambda m, x, y=None: None]
)
def test_replace_callable_raises(any_string_dtype, performance_warning, repl):
def test_replace_callable_raises(any_string_dtype, repl):
# GH 15055
values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)

Expand All @@ -504,43 +494,31 @@ def test_replace_callable_raises(any_string_dtype, performance_warning, repl):
r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
r"(?(3)required )positional arguments?"
)
if not using_pyarrow(any_string_dtype):
performance_warning = False
with pytest.raises(TypeError, match=msg):
with tm.assert_produces_warning(performance_warning):
values.str.replace("a", repl, regex=True)
values.str.replace("a", repl, regex=True)


def test_replace_callable_named_groups(any_string_dtype, performance_warning):
def test_replace_callable_named_groups(any_string_dtype):
# test regex named groups
ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype)
pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
repl = lambda m: m.group("middle").swapcase()
with tm.maybe_produces_warning(
performance_warning, using_pyarrow(any_string_dtype)
):
result = ser.str.replace(pat, repl, regex=True)
result = ser.str.replace(pat, repl, regex=True)
expected = Series(["bAR", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)


def test_replace_compiled_regex(any_string_dtype, performance_warning):
def test_replace_compiled_regex(any_string_dtype):
# GH 15446
ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)

# test with compiled regex
pat = re.compile(r"BAD_*")
with tm.maybe_produces_warning(
performance_warning, using_pyarrow(any_string_dtype)
):
result = ser.str.replace(pat, "", regex=True)
result = ser.str.replace(pat, "", regex=True)
expected = Series(["foobar", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)

with tm.maybe_produces_warning(
performance_warning, using_pyarrow(any_string_dtype)
):
result = ser.str.replace(pat, "", n=1, regex=True)
result = ser.str.replace(pat, "", n=1, regex=True)
expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)

Expand All @@ -557,14 +535,11 @@ def test_replace_compiled_regex_mixed_object():
tm.assert_series_equal(result, expected)


def test_replace_compiled_regex_unicode(any_string_dtype, performance_warning):
def test_replace_compiled_regex_unicode(any_string_dtype):
ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
with tm.maybe_produces_warning(
performance_warning, using_pyarrow(any_string_dtype)
):
result = ser.str.replace(pat, ", ", regex=True)
result = ser.str.replace(pat, ", ", regex=True)
tm.assert_series_equal(result, expected)


Expand All @@ -586,15 +561,12 @@ def test_replace_compiled_regex_raises(any_string_dtype):
ser.str.replace(pat, "", case=True, regex=True)


def test_replace_compiled_regex_callable(any_string_dtype, performance_warning):
def test_replace_compiled_regex_callable(any_string_dtype):
# test with callable
ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
repl = lambda m: m.group(0).swapcase()
pat = re.compile("[a-z][A-Z]{2}")
with tm.maybe_produces_warning(
performance_warning, using_pyarrow(any_string_dtype)
):
result = ser.str.replace(pat, repl, n=2, regex=True)
result = ser.str.replace(pat, repl, n=2, regex=True)
expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -626,7 +598,7 @@ def test_replace_literal_compiled_raises(any_string_dtype):
ser.str.replace(pat, "", regex=False)


def test_replace_moar(any_string_dtype, performance_warning):
def test_replace_moar(any_string_dtype):
# PR #1179
ser = Series(
["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
Expand All @@ -640,10 +612,7 @@ def test_replace_moar(any_string_dtype, performance_warning):
)
tm.assert_series_equal(result, expected)

with tm.maybe_produces_warning(
performance_warning, using_pyarrow(any_string_dtype)
):
result = ser.str.replace("A", "YYY", case=False)
result = ser.str.replace("A", "YYY", case=False)
expected = Series(
[
"YYY",
Expand All @@ -661,10 +630,7 @@ def test_replace_moar(any_string_dtype, performance_warning):
)
tm.assert_series_equal(result, expected)

with tm.maybe_produces_warning(
performance_warning, using_pyarrow(any_string_dtype)
):
result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True)
result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True)
expected = Series(
[
"A",
Expand All @@ -683,21 +649,15 @@ def test_replace_moar(any_string_dtype, performance_warning):
tm.assert_series_equal(result, expected)


def test_replace_not_case_sensitive_not_regex(any_string_dtype, performance_warning):
def test_replace_not_case_sensitive_not_regex(any_string_dtype):
# https://github.com/pandas-dev/pandas/issues/41602
ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype)

with tm.maybe_produces_warning(
performance_warning, using_pyarrow(any_string_dtype)
):
result = ser.str.replace("a", "c", case=False, regex=False)
result = ser.str.replace("a", "c", case=False, regex=False)
expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)

with tm.maybe_produces_warning(
performance_warning, using_pyarrow(any_string_dtype)
):
result = ser.str.replace("a.", "c.", case=False, regex=False)
result = ser.str.replace("a.", "c.", case=False, regex=False)
expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -853,7 +813,7 @@ def test_fullmatch_na_kwarg(any_string_dtype):
tm.assert_series_equal(result, expected)


def test_fullmatch_case_kwarg(any_string_dtype, performance_warning):
def test_fullmatch_case_kwarg(any_string_dtype):
ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
expected_dtype = (
np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
Expand All @@ -869,10 +829,7 @@ def test_fullmatch_case_kwarg(any_string_dtype, performance_warning):
result = ser.str.fullmatch("ab", case=False)
tm.assert_series_equal(result, expected)

with tm.maybe_produces_warning(
performance_warning, using_pyarrow(any_string_dtype)
):
result = ser.str.fullmatch("ab", flags=re.IGNORECASE)
result = ser.str.fullmatch("ab", flags=re.IGNORECASE)
tm.assert_series_equal(result, expected)


Expand Down Expand Up @@ -1046,7 +1003,7 @@ def test_translate_mixed_object():
# --------------------------------------------------------------------------------------


def test_flags_kwarg(any_string_dtype, performance_warning):
def test_flags_kwarg(any_string_dtype):
data = {
"Dave": "dave@google.com",
"Steve": "steve@gmail.com",
Expand All @@ -1057,17 +1014,13 @@ def test_flags_kwarg(any_string_dtype, performance_warning):

pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"

use_pyarrow = using_pyarrow(any_string_dtype)

result = data.str.extract(pat, flags=re.IGNORECASE, expand=True)
assert result.iloc[0].tolist() == ["dave", "google", "com"]

with tm.maybe_produces_warning(performance_warning, use_pyarrow):
result = data.str.match(pat, flags=re.IGNORECASE)
result = data.str.match(pat, flags=re.IGNORECASE)
assert result.iloc[0]

with tm.maybe_produces_warning(performance_warning, use_pyarrow):
result = data.str.fullmatch(pat, flags=re.IGNORECASE)
result = data.str.fullmatch(pat, flags=re.IGNORECASE)
assert result.iloc[0]

result = data.str.findall(pat, flags=re.IGNORECASE)
Expand All @@ -1077,8 +1030,6 @@ def test_flags_kwarg(any_string_dtype, performance_warning):
assert result.iloc[0] == 1

msg = "has match groups"
with tm.assert_produces_warning(
UserWarning, match=msg, raise_on_extra_warnings=not use_pyarrow
):
with tm.assert_produces_warning(UserWarning, match=msg):
result = data.str.contains(pat, flags=re.IGNORECASE)
assert result.iloc[0]
1 change: 0 additions & 1 deletion pandas/tests/strings/test_string_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
)


@pytest.mark.filterwarnings("ignore:Falling back")
def test_string_array(nullable_string_dtype, any_string_method):
method_name, args, kwargs = any_string_method

Expand Down

0 comments on commit de51d33

Please sign in to comment.