Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix potential segfault after pd.Categorical(pd.Series(...), categories=...) #25368

Merged
merged 1 commit into from
Mar 3, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ Fixed Regressions
- Fixed regression in :class:`TimedeltaIndex` where `np.sum(index)` incorrectly returned a zero-dimensional object instead of a scalar (:issue:`25282`)
- Fixed regression in ``IntervalDtype`` construction where passing an incorrect string with 'Interval' as a prefix could result in a ``RecursionError``. (:issue:`25338`)

- Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`)

.. _whatsnew_0242.enhancements:

Enhancements
Expand Down
13 changes: 4 additions & 9 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,14 +323,6 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
# we may have dtype.categories be None, and we need to
# infer categories in a factorization step futher below

if is_categorical(values):
# GH23814, for perf, if values._values already an instance of
# Categorical, set values to codes, and run fastpath
if (isinstance(values, (ABCSeries, ABCIndexClass)) and
isinstance(values._values, type(self))):
batterseapower marked this conversation as resolved.
Show resolved Hide resolved
values = values._values.codes.copy()
fastpath = True

if fastpath:
self._codes = coerce_indexer_dtype(values, dtype.categories)
self._dtype = self._dtype.update_dtype(dtype)
Expand Down Expand Up @@ -382,7 +374,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
dtype = CategoricalDtype(categories, dtype.ordered)

elif is_categorical_dtype(values):
old_codes = (values.cat.codes if isinstance(values, ABCSeries)
old_codes = (values._values.codes if isinstance(values, ABCSeries)
else values.codes)
codes = _recode_for_categories(old_codes, values.dtype.categories,
dtype.categories)
Expand Down Expand Up @@ -2625,6 +2617,9 @@ def _recode_for_categories(codes, old_categories, new_categories):
if len(old_categories) == 0:
# All null anyway, so just retain the nulls
return codes.copy()
elif new_categories.equals(old_categories):
# Same categories, so no need to actually recode
return codes.copy()
indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories),
new_categories)
new_codes = take_1d(indexer, codes.copy(), fill_value=-1)
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,18 @@ def test_constructor(self):
c = Categorical(np.array([], dtype='int64'), # noqa
categories=[3, 2, 1], ordered=True)

def test_constructor_with_existing_categories(self):
# GH25318: constructing with pd.Series used to bogusly skip recoding
# categories
c0 = Categorical(["a", "b", "c", "a"])
c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])

c2 = Categorical(c0, categories=c1.categories)
tm.assert_categorical_equal(c1, c2)

c3 = Categorical(Series(c0), categories=c1.categories)
tm.assert_categorical_equal(c1, c3)

def test_constructor_not_sequence(self):
# https://github.com/pandas-dev/pandas/issues/16022
msg = r"^Parameter 'categories' must be list-like, was"
Expand Down