Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Align CategoricalIndex APIs with pandas 2.x #16369

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 75 additions & 55 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,37 +262,10 @@ def add_categories(self, new_categories: Any) -> SeriesOrIndex | None:
dtype: category
Categories (2, int64): [1, 2]
"""
old_categories = self._column.categories
new_categories = column.as_column(
new_categories,
dtype=old_categories.dtype if len(new_categories) == 0 else None,
)

if is_mixed_with_object_dtype(old_categories, new_categories):
raise TypeError(
f"cudf does not support adding categories with existing "
f"categories of dtype `{old_categories.dtype}` and new "
f"categories of dtype `{new_categories.dtype}`, please "
f"type-cast new_categories to the same type as "
f"existing categories."
)
common_dtype = find_common_type(
[old_categories.dtype, new_categories.dtype]
return self._return_or_inplace(
self._column.add_categories(new_categories=new_categories)
)

new_categories = new_categories.astype(common_dtype)
old_categories = old_categories.astype(common_dtype)

if old_categories.isin(new_categories).any():
raise ValueError("new categories must not include old categories")

new_categories = old_categories.append(new_categories)
out_col = self._column
if not out_col._categories_equal(new_categories):
out_col = out_col._set_categories(new_categories)

return self._return_or_inplace(out_col)

def remove_categories(
self,
removals: Any,
Expand Down Expand Up @@ -349,23 +322,9 @@ def remove_categories(
dtype: category
Categories (3, int64): [1, 2, 10]
"""

cats = self.categories.to_series()
removals = cudf.Series(removals, dtype=cats.dtype)
removals_mask = removals.isin(cats)

# ensure all the removals are in the current categories
# list. If not, raise an error to match Pandas behavior
if not removals_mask.all():
vals = removals[~removals_mask].to_numpy()
raise ValueError(f"removals must all be in old categories: {vals}")

new_categories = cats[~cats.isin(removals)]._column
out_col = self._column
if not out_col._categories_equal(new_categories):
out_col = out_col._set_categories(new_categories)

return self._return_or_inplace(out_col)
return self._return_or_inplace(
self._column.remove_categories(removals=removals)
)

def set_categories(
self,
Expand Down Expand Up @@ -1319,7 +1278,7 @@ def _set_categories(
new_categories: Any,
is_unique: bool = False,
ordered: bool = False,
) -> CategoricalColumn:
) -> Self:
"""Returns a new CategoricalColumn with the categories set to the
specified *new_categories*.

Expand Down Expand Up @@ -1376,17 +1335,68 @@ def _set_categories(
new_codes = df._data["new_codes"]

# codes can't have masks, so take mask out before moving in
return column.build_categorical_column(
categories=new_cats,
codes=column.build_column(
new_codes.base_data, dtype=new_codes.dtype
return cast(
Self,
column.build_categorical_column(
categories=new_cats,
codes=column.build_column(
new_codes.base_data, dtype=new_codes.dtype
),
mask=new_codes.base_mask,
size=new_codes.size,
offset=new_codes.offset,
ordered=ordered,
),
mask=new_codes.base_mask,
size=new_codes.size,
offset=new_codes.offset,
ordered=ordered,
)

def add_categories(self, new_categories: Any) -> Self:
    """Return a column whose categories are extended by *new_categories*.

    The new values are appended after the existing categories. When the
    combined categories compare equal to the current ones (according to
    ``_categories_equal``), ``self`` is returned unchanged.

    Parameters
    ----------
    new_categories : Any
        Values to append; converted with ``column.as_column``.

    Raises
    ------
    TypeError
        If ``is_mixed_with_object_dtype`` reports that the existing and
        new categories cannot be combined.
    ValueError
        If any existing category also appears in *new_categories*.
    """
    current = self.categories
    # NOTE(review): presumably an empty input has no usable dtype of its
    # own, hence the existing categories' dtype is borrowed - confirm.
    incoming_dtype = current.dtype if len(new_categories) == 0 else None
    incoming = column.as_column(new_categories, dtype=incoming_dtype)

    if is_mixed_with_object_dtype(current, incoming):
        raise TypeError(
            f"cudf does not support adding categories with existing "
            f"categories of dtype `{current.dtype}` and new "
            f"categories of dtype `{incoming.dtype}`, please "
            f"type-cast new_categories to the same type as "
            f"existing categories."
        )

    # Bring both sides to one dtype before comparing and appending.
    shared_dtype = find_common_type([current.dtype, incoming.dtype])
    incoming = incoming.astype(shared_dtype)
    current = current.astype(shared_dtype)

    if current.isin(incoming).any():
        raise ValueError("new categories must not include old categories")

    combined = current.append(incoming)
    if self._categories_equal(combined):
        return self
    return self._set_categories(combined)

def remove_categories(
    self,
    removals: Any,
) -> Self:
    """Return a column with the given categories dropped.

    Parameters
    ----------
    removals : Any
        Categories to drop; converted with ``column.as_column`` and cast
        to the dtype of the current categories.

    Raises
    ------
    ValueError
        If any value in *removals* is not a current category.
    """
    current = self.categories
    to_drop = column.as_column(removals).astype(current.dtype)

    # Every requested removal has to be an existing category; otherwise
    # raise to match pandas behavior.
    if not to_drop.isin(current).all():
        raise ValueError("removals must all be in old categories")

    keep_mask = current.isin(to_drop).unary_operator("not")
    remaining = current.apply_boolean_mask(keep_mask)
    if self._categories_equal(remaining):
        return self
    return self._set_categories(remaining)

def reorder_categories(
self,
new_categories: Any,
Expand All @@ -1404,6 +1414,16 @@ def reorder_categories(
)
return self._set_categories(new_categories, ordered=ordered)

def rename_categories(self, new_categories: Any) -> Self:
    """Rename the categories (not implemented).

    Parameters
    ----------
    new_categories : Any
        Accepted for signature compatibility; never read.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError(
        "rename_categories is currently not supported."
    )

def remove_unused_categories(self) -> Self:
    """Drop categories that are not used (not implemented).

    Raises
    ------
    NotImplementedError
        Always.
    """
    message = "remove_unused_categories is currently not supported."
    raise NotImplementedError(message)

def as_ordered(self, ordered: bool):
if self.dtype.ordered == ordered:
return self
Expand Down
116 changes: 116 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2705,6 +2705,10 @@ def __init__(
data = data.as_ordered(ordered=False)
super().__init__(data, name=name)

@property
def ordered(self) -> bool:
    """Return the ``ordered`` flag of the underlying column."""
    backing_column = self._column
    return backing_column.ordered

@property # type: ignore
@_performance_tracking
def codes(self):
Expand All @@ -2727,6 +2731,118 @@ def _is_boolean(self):
def _is_categorical(self):
return True

def add_categories(self, new_categories) -> Self:
    """
    Add new categories.

    `new_categories` will be included at the last/highest place in the
    categories and will be unused directly after this call.

    The work is delegated to the underlying column's ``add_categories``;
    the result is wrapped in a new index of the same type under the
    current name.
    """
    extended = self._column.add_categories(new_categories)
    return type(self)._from_data({self.name: extended})

def as_ordered(self) -> Self:
    """
    Set the Categorical to be ordered.

    Returns a new index of the same type, built from the underlying
    column's ``as_ordered(ordered=True)`` result.
    """
    ordered_column = self._column.as_ordered(ordered=True)
    return type(self)._from_data({self.name: ordered_column})

def as_unordered(self) -> Self:
    """
    Set the Categorical to be unordered.

    Returns a new index of the same type, built from the underlying
    column's ``as_ordered(ordered=False)`` result.
    """
    unordered_column = self._column.as_ordered(ordered=False)
    return type(self)._from_data({self.name: unordered_column})

def remove_categories(self, removals) -> Self:
    """
    Remove the specified categories.

    `removals` must be included in the old categories.

    Parameters
    ----------
    removals : category or list of categories
        The categories which should be removed.

    Returns
    -------
    A new index of the same type wrapping the column returned by the
    underlying column's ``remove_categories``.
    """
    trimmed = self._column.remove_categories(removals)
    return type(self)._from_data({self.name: trimmed})

def remove_unused_categories(self) -> Self:
    """
    Remove categories which are not used.

    This method is currently not supported.

    It forwards to the underlying column's ``remove_unused_categories``
    and wraps whatever that returns in a new index of the same type.
    """
    pruned = self._column.remove_unused_categories()
    return type(self)._from_data({self.name: pruned})

def rename_categories(self, new_categories) -> Self:
    """
    Rename categories.

    This method is currently not supported.

    It forwards *new_categories* to the underlying column's
    ``rename_categories`` and wraps whatever that returns in a new index
    of the same type.
    """
    renamed = self._column.rename_categories(new_categories)
    return type(self)._from_data({self.name: renamed})

def reorder_categories(self, new_categories, ordered=None) -> Self:
    """
    Reorder categories as specified in new_categories.

    ``new_categories`` need to include all old categories and no new category
    items.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered categorical.
        If not given, do not change the ordered information.

    Returns
    -------
    A new index of the same type wrapping the reordered column.
    """
    if ordered is None:
        # Resolve "not given" to the current flag here, so the documented
        # promise holds instead of forwarding None to the column method.
        ordered = self._column.ordered
    reordered = self._column.reorder_categories(
        new_categories, ordered=ordered
    )
    return type(self)._from_data({self.name: reordered})

def set_categories(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we add pytests coverage for these new APIs?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, added tests in fc64424.

self, new_categories, ordered=None, rename: bool = False
) -> Self:
"""
Set the categories to the specified new_categories.

Parameters
----------
new_categories : list-like
The categories in new order.
ordered : bool, default None
Whether or not the categorical is treated as
a ordered categorical. If not given, do
not change the ordered information.
rename : bool, default False
Whether or not the `new_categories` should be
considered as a rename of the old categories
or as reordered categories.
"""
return type(self)._from_data(
{
self.name: self._column.set_categories(
new_categories, ordered=ordered, rename=rename
)
}
)


@_performance_tracking
def interval_range(
Expand Down
56 changes: 56 additions & 0 deletions python/cudf/cudf/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,3 +891,59 @@ def test_categorical_maxima(op):
result = getattr(ser.cat.as_ordered(), op)()
result_pd = getattr(ser_pd.cat.as_ordered(), op)()
assert_eq(result, result_pd)


@pytest.mark.parametrize("ordered", [True, False])
def test_index_ordered(ordered):
    """``CategoricalIndex.ordered`` agrees with pandas after conversion."""
    pandas_index = pd.CategoricalIndex([1, 2, 3], ordered=ordered)
    gpu_index = cudf.from_pandas(pandas_index)
    assert pandas_index.ordered == gpu_index.ordered


@pytest.mark.parametrize("method", ["as_ordered", "as_unordered"])
@pytest.mark.parametrize("ordered", [True, False])
def test_index_as_ordered(method, ordered):
    """``as_ordered``/``as_unordered`` give the same result as pandas."""
    pandas_index = pd.CategoricalIndex([1, 2, 3], ordered=ordered)
    gpu_index = cudf.from_pandas(pandas_index)

    # Look the method up by name so both variants share one test body.
    want = getattr(pandas_index, method)()
    got = getattr(gpu_index, method)()
    assert_eq(got, want)


def test_index_add_categories():
    """Adding an unseen category gives the same index as pandas."""
    pandas_index = pd.CategoricalIndex([1, 2, 3])
    gpu_index = cudf.from_pandas(pandas_index)

    want = pandas_index.add_categories([4])
    got = gpu_index.add_categories([4])
    assert_eq(got, want)


def test_index_remove_categories():
    """Removing an unused category gives the same index as pandas."""
    pandas_index = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3, 4])
    gpu_index = cudf.from_pandas(pandas_index)

    want = pandas_index.remove_categories([4])
    got = gpu_index.remove_categories([4])
    assert_eq(got, want)


@pytest.mark.parametrize("ordered", [True, False])
def test_index_reorder_categories(ordered):
    """Reordering categories matches pandas for both ``ordered`` values."""
    pandas_index = pd.CategoricalIndex([1, 2, 3], categories=[1, 3, 2, 4])
    gpu_index = cudf.from_pandas(pandas_index)

    target_order = [1, 2, 3, 4]
    want = pandas_index.reorder_categories(target_order, ordered=ordered)
    got = gpu_index.reorder_categories(target_order, ordered=ordered)
    assert_eq(got, want)


@pytest.mark.parametrize("ordered", [True, False])
def test_index_set_categories(ordered):
    """Setting a superset of categories matches pandas."""
    pandas_index = pd.CategoricalIndex([1, 2, 3])
    gpu_index = cudf.from_pandas(pandas_index)

    target_categories = [1, 2, 3, 4]
    want = pandas_index.set_categories(target_categories, ordered=ordered)
    got = gpu_index.set_categories(target_categories, ordered=ordered)
    assert_eq(got, want)
Loading