Skip to content

REF: implement Categorical._validate_listlike #36274

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Sep 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 32 additions & 12 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1716,6 +1716,35 @@ def _box_func(self, i: int):
return np.NaN
return self.categories[i]

def _validate_listlike(self, target: ArrayLike) -> np.ndarray:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks great; yeah, these have probably built up over time. Only nit is that this should not be private, as we are cross-importing, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only nit is that this should not be private, as we are cross-importing, right?

For now I'm following the patterns that we use in the datetimelike arrays so we can share more code. Eventually we can revisit whether to deprivatize (it is easier to deprivatize than the other way around).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

kk

"""
Extract integer codes we can use for comparison.

Notes
-----
If a value in target is not present, it gets coded as -1.
"""

if isinstance(target, Categorical):
# Indexing on codes is more efficient if categories are the same,
# so we can apply some optimizations based on the degree of
# dtype-matching.
if self.categories.equals(target.categories):
# We use the same codes, so can go directly to the engine
codes = target.codes
elif self.is_dtype_equal(target):
# We have the same categories up to a reshuffling of codes.
codes = recode_for_categories(
target.codes, target.categories, self.categories
)
else:
code_indexer = self.categories.get_indexer(target.categories)
codes = take_1d(code_indexer, target.codes, fill_value=-1)
else:
codes = self.categories.get_indexer(target)

return codes

# ------------------------------------------------------------------

def take_nd(self, indexer, allow_fill: bool = False, fill_value=None):
Expand Down Expand Up @@ -1890,11 +1919,8 @@ def _validate_setitem_value(self, value):
"Cannot set a Categorical with another, "
"without identical categories"
)
if not self.categories.equals(value.categories):
new_codes = recode_for_categories(
value.codes, value.categories, self.categories
)
value = Categorical.from_codes(new_codes, dtype=self.dtype)
new_codes = self._validate_listlike(value)
value = Categorical.from_codes(new_codes, dtype=self.dtype)

rvalue = value if is_list_like(value) else [value]

Expand Down Expand Up @@ -2164,13 +2190,7 @@ def equals(self, other: object) -> bool:
if not isinstance(other, Categorical):
return False
elif self.is_dtype_equal(other):
if self.categories.equals(other.categories):
# fastpath to avoid re-coding
other_codes = other._codes
else:
other_codes = recode_for_categories(
other.codes, other.categories, self.categories
)
other_codes = self._validate_listlike(other)
return np.array_equal(self._codes, other_codes)
return False

Expand Down
10 changes: 2 additions & 8 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,14 +310,8 @@ def _maybe_unwrap(x):
categories = first.categories
ordered = first.ordered

if all(first.categories.equals(other.categories) for other in to_union[1:]):
new_codes = np.concatenate([c.codes for c in to_union])
else:
codes = [first.codes] + [
recode_for_categories(other.codes, other.categories, first.categories)
for other in to_union[1:]
]
new_codes = np.concatenate(codes)
all_codes = [first._validate_listlike(x) for x in to_union]
new_codes = np.concatenate(all_codes)

if sort_categories and not ignore_order and ordered:
raise TypeError("Cannot use sort_categories=True with ordered Categoricals")
Expand Down
29 changes: 3 additions & 26 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@
from pandas.core.dtypes.missing import is_valid_nat_for_dtype, notna

from pandas.core import accessor
from pandas.core.algorithms import take_1d
from pandas.core.arrays.categorical import Categorical, contains, recode_for_categories
from pandas.core.arrays.categorical import Categorical, contains
import pandas.core.common as com
from pandas.core.construction import extract_array
import pandas.core.indexes.base as ibase
Expand Down Expand Up @@ -558,37 +557,15 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
"method='nearest' not implemented yet for CategoricalIndex"
)

if isinstance(target, CategoricalIndex) and self._values.is_dtype_equal(target):
if self._values.equals(target._values):
# we have the same codes
codes = target.codes
else:
codes = recode_for_categories(
target.codes, target.categories, self._values.categories
)
else:
if isinstance(target, CategoricalIndex):
code_indexer = self.categories.get_indexer(target.categories)
codes = take_1d(code_indexer, target.codes, fill_value=-1)
else:
codes = self.categories.get_indexer(target)

codes = self._values._validate_listlike(target._values)
indexer, _ = self._engine.get_indexer_non_unique(codes)
return ensure_platform_int(indexer)

@Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
def get_indexer_non_unique(self, target):
target = ibase.ensure_index(target)

if isinstance(target, CategoricalIndex):
# Indexing on codes is more efficient if categories are the same:
if target.categories is self.categories:
target = target.codes
indexer, missing = self._engine.get_indexer_non_unique(target)
return ensure_platform_int(indexer), missing
target = target._values

codes = self.categories.get_indexer(target)
codes = self._values._validate_listlike(target._values)
indexer, missing = self._engine.get_indexer_non_unique(codes)
return ensure_platform_int(indexer), missing

Expand Down
9 changes: 2 additions & 7 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
from pandas import Categorical, Index, MultiIndex
from pandas.core import groupby
import pandas.core.algorithms as algos
from pandas.core.arrays.categorical import recode_for_categories
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.frame import _merge_doc
Expand Down Expand Up @@ -1936,12 +1935,8 @@ def _factorize_keys(
):
assert isinstance(lk, Categorical)
assert isinstance(rk, Categorical)
if lk.categories.equals(rk.categories):
# if we exactly match in categories, allow us to factorize on codes
rk = rk.codes
else:
# Same categories in different orders -> recode
rk = recode_for_categories(rk.codes, rk.categories, lk.categories)
# Cast rk to encoding so we can compare codes with lk
rk = lk._validate_listlike(rk)

lk = ensure_int64(lk.codes)
rk = ensure_int64(rk)
Expand Down