Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: RecursionError with categorical merge keys #56485

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -604,7 +604,7 @@ Indexing
^^^^^^^^
- Bug in :meth:`DataFrame.loc` when setting :class:`Series` with extension dtype into NumPy dtype (:issue:`55604`)
- Bug in :meth:`Index.difference` not returning a unique set of values when ``other`` is empty or ``other`` is considered non-comparable (:issue:`55113`)
- Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`)
- Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`, :issue:`56376`)

Missing
^^^^^^^
Expand Down
26 changes: 22 additions & 4 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1749,6 +1749,7 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any:

tipo = _maybe_infer_dtype_type(element)

casted: Any # For mypy
if dtype.kind in "iu":
if isinstance(element, range):
if _dtype_can_hold_range(element, dtype):
Expand Down Expand Up @@ -1780,9 +1781,9 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any:
return casted
raise LossySetitemError

elif isinstance(element, ABCExtensionArray) and isinstance(
element.dtype, CategoricalDtype
):
elif isinstance(
element, (ABCExtensionArray, ABCIndex, ABCSeries)
) and isinstance(element.dtype, CategoricalDtype):
# GH#52927 setting Categorical value into non-EA frame
# TODO: general-case for EAs?
try:
Expand Down Expand Up @@ -1834,14 +1835,31 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any:
raise LossySetitemError

if tipo is not None:
if isinstance(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure this is the right place for the fix if the issue only occurs in merge.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The issue can show up in any code path that uses np_can_hold_element to determine the result type of an operation and then calls itself again in its error handling. In the bug report the problem occurred in putmask, but a similar issue previously occurred with loc.

element, (ABCExtensionArray, ABCIndex, ABCSeries)
) and isinstance(element.dtype, CategoricalDtype):
# GH#52927,56376 setting Categorical value into non-EA frame
# TODO: general-case for EAs?
try:
casted = element.astype(dtype)
except (ValueError, TypeError):
raise LossySetitemError
# Check for cases of either
# a) lossy overflow/rounding or
# b) semantic changes like dt64->int64
comp = casted == element
if not comp.all():
raise LossySetitemError
return casted

# TODO: itemsize check?
if tipo.kind not in "iuf":
# Anything other than float/integer we cannot hold
raise LossySetitemError
if not isinstance(tipo, np.dtype):
# i.e. nullable IntegerDtype or FloatingDtype;
# we can put this into an ndarray losslessly iff it has no NAs
if element._hasna:
if element._hasna: # type: ignore[union-attr]
raise LossySetitemError
return element
elif tipo.itemsize > dtype.itemsize or tipo.kind != dtype.kind:
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5504,6 +5504,14 @@ def putmask(self, mask, value) -> Index:

# See also: Block.coerce_to_target_dtype
dtype = self._find_common_type_compat(value)

# Prevent an infinite putmask loop GH56376
if dtype == self.dtype:
raise AssertionError(
"Something has gone wrong, please report a bug at "
"https://github.com/pandas-dev/pandas/issues"
) from err

return self.astype(dtype).putmask(mask, value)

values = self._values.copy()
Expand Down
32 changes: 32 additions & 0 deletions pandas/tests/indexes/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,15 @@

from pandas import (
NA,
Categorical,
CategoricalDtype,
DatetimeIndex,
Index,
IntervalIndex,
MultiIndex,
NaT,
PeriodIndex,
Series,
TimedeltaIndex,
)
import pandas._testing as tm
Expand Down Expand Up @@ -303,6 +306,35 @@ def test_putmask_with_wrong_mask(self, index):
index.putmask("foo", fill)


def test_putmask_categorical():
    # GH56376: putmask should accept the same categorical data whether it is
    # wrapped in a Categorical, a Series or an Index
    cat_dtype = CategoricalDtype(categories=np.asarray([1, 2, 3], dtype="float64"))
    base = Index([2, 1, 0], dtype="int64")
    mask = [True, True, False]
    expected = Index([1, 2, 0], dtype="int64")

    # Each constructor takes identical (data, dtype) arguments, so only the
    # container type varies between iterations
    for box in (Categorical, Series, Index):
        replacement = box([1.0, 2.0, 3.0], dtype=cat_dtype)
        tm.assert_index_equal(base.putmask(mask, replacement), expected)


def test_putmask_infinite_loop():
    # GH56376: putmask is expected to raise the "please report a bug"
    # AssertionError for this input instead of looping forever
    cat_dtype = CategoricalDtype(categories=np.asarray([1, 2, 3], dtype="float64"))
    filler = Index([1.0, np.nan, 3.0], dtype=cat_dtype)
    target = Index([1, 2, 0], dtype="int64")
    mask = [True, True, False]

    with pytest.raises(AssertionError, match="please report a bug"):
        target.putmask(mask, filler)


@pytest.mark.parametrize(
"idx", [Index([1, 2, 3]), Index([0.1, 0.2, 0.3]), Index(["a", "b", "c"])]
)
Expand Down
11 changes: 8 additions & 3 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1677,12 +1677,17 @@ def test_loc_setitem_range_key(self, frame_or_series):
expected = frame_or_series([0, 1, 10, 9, 11], index=obj.index)
tm.assert_equal(obj, expected)

def test_loc_setitem_numpy_frame_categorical_value(self):
@pytest.mark.parametrize("dtype", ["int64", "float64"])
def test_loc_setitem_numpy_frame_categorical_value(self, dtype):
# GH#52927
df = DataFrame({"a": [1, 1, 1, 1, 1], "b": ["a", "a", "a", "a", "a"]})
df = DataFrame({"a": [1, 1, 1, 1, 1], "b": ["a", "a", "a", "a", "a"]}).astype(
{"a": dtype}
)
df.loc[1:2, "a"] = Categorical([2, 2], categories=[1, 2])

expected = DataFrame({"a": [1, 2, 2, 1, 1], "b": ["a", "a", "a", "a", "a"]})
expected = DataFrame(
{"a": [1, 2, 2, 1, 1], "b": ["a", "a", "a", "a", "a"]}
).astype({"a": dtype})
tm.assert_frame_equal(df, expected)


Expand Down