Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove cudf._lib.column in favor of pylibcudf. #17760

Open
wants to merge 21 commits into
base: branch-25.04
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
b511f4a
Remove cudf._lib.column
mroeschke Jan 16, 2025
94a96f0
Start working through circular import errors
mroeschke Jan 16, 2025
119c0fd
Remove other access of cudf._lib.column as libcudf
mroeschke Jan 16, 2025
00d8709
Fix more tests
mroeschke Jan 16, 2025
bff0fa5
Add column_from_self_view, and fix other tests
mroeschke Jan 17, 2025
0581459
Merge remote-tracking branch 'upstream/branch-25.02' into cudf/_lib/column
mroeschke Jan 17, 2025
1034df9
Merge remote-tracking branch 'upstream/branch-25.02' into cudf/_lib/column
mroeschke Jan 21, 2025
cd60209
Type output of dtype_from_pylibcudf_column
mroeschke Jan 21, 2025
d578f6f
Replace column_from_self_view with existing copy
mroeschke Jan 21, 2025
cde7573
Merge branch 'branch-25.02' into cudf/_lib/column
mroeschke Jan 22, 2025
01b20bc
Merge remote-tracking branch 'upstream/branch-25.02' into cudf/_lib/column
mroeschke Jan 24, 2025
323b1b0
Merge branch 'cudf/_lib/column' of https://github.com/mroeschke/cudf …
mroeschke Jan 24, 2025
c1ff886
Fold in some doc changes from scalar refactor
mroeschke Jan 24, 2025
1e85b3e
Merge remote-tracking branch 'upstream/branch-25.02' into cudf/_lib/column
mroeschke Jan 24, 2025
622b4ef
Merge remote-tracking branch 'upstream/branch-25.02' into cudf/_lib/column
mroeschke Jan 27, 2025
b3aa5cd
Adjust search_sorted output return
mroeschke Jan 27, 2025
16626f2
Remove c_value
mroeschke Jan 27, 2025
5e134e6
Merge branch 'branch-25.02' into cudf/_lib/column
galipremsagar Jan 28, 2025
05936c6
Merge remote-tracking branch 'upstream/branch-25.02' into cudf/_lib/column
mroeschke Jan 28, 2025
ec92112
Merge branch 'cudf/_lib/column' of https://github.com/mroeschke/cudf …
mroeschke Jan 28, 2025
5a9725e
Merge remote-tracking branch 'upstream/branch-25.04' into cudf/_lib/column
mroeschke Jan 31, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Start working through circular import errors
  • Loading branch information
mroeschke committed Jan 16, 2025
commit 94a96f070809f150d50c0bc4c73b5b75924e6cb2
18 changes: 13 additions & 5 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2032,10 +2032,13 @@ def dropna(self, how="any"):
data_columns = [col.nans_to_nulls() for col in self._columns]

return self._from_columns_like_self(
drop_nulls(
data_columns,
how=how,
),
[
ColumnBase.from_pylibcudf(col)
for col in drop_nulls(
data_columns,
how=how,
)
],
self._column_names,
)

Expand Down Expand Up @@ -2103,7 +2106,12 @@ def _apply_boolean_mask(self, boolean_mask):
raise ValueError("boolean_mask is not boolean type.")

return self._from_columns_like_self(
apply_boolean_mask(list(self._columns), boolean_mask),
[
ColumnBase.from_pylibcudf(col)
for col in apply_boolean_mask(
list(self._columns), boolean_mask
)
],
column_names=self._column_names,
)

Expand Down
12 changes: 7 additions & 5 deletions python/cudf/cudf/core/_internals/copying.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@
import pylibcudf as plc

from cudf.core.buffer import acquire_spill_lock
from cudf.core.column import ColumnBase

if TYPE_CHECKING:
from collections.abc import Iterable

from cudf import Scalar

# ruff does not identify that there's a relative import in use
from cudf.core.column import ColumnBase # noqa: TC004
from cudf.core.column.numerical import NumericalColumn


Expand All @@ -20,15 +22,15 @@ def gather(
columns: Iterable[ColumnBase],
gather_map: NumericalColumn,
nullify: bool = False,
) -> list[ColumnBase]:
) -> list[plc.Column]:
plc_tbl = plc.copying.gather(
plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
gather_map.to_pylibcudf(mode="read"),
plc.copying.OutOfBoundsPolicy.NULLIFY
if nullify
else plc.copying.OutOfBoundsPolicy.DONT_CHECK,
)
return [ColumnBase.from_pylibcudf(col) for col in plc_tbl.columns()]
return plc_tbl.columns()


@acquire_spill_lock()
Expand Down Expand Up @@ -69,13 +71,13 @@ def scatter(
plc.Table([col.to_pylibcudf(mode="read") for col in target_columns]),
)

return [ColumnBase.from_pylibcudf(col) for col in plc_tbl.columns()]
return plc_tbl.columns()


@acquire_spill_lock()
def columns_split(
input_columns: Iterable[ColumnBase], splits: list[int]
) -> list[list[ColumnBase]]:
) -> list[list[plc.Column]]:
return [
[ColumnBase.from_pylibcudf(col) for col in plc_tbl.columns()]
for plc_tbl in plc.copying.split(
Expand Down
36 changes: 16 additions & 20 deletions python/cudf/cudf/core/_internals/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
import pylibcudf as plc

from cudf.core.buffer import acquire_spill_lock
from cudf.core.column import ColumnBase

if TYPE_CHECKING:
from collections.abc import Iterable

from cudf.core.column import ColumnBase


@acquire_spill_lock()
def is_sorted(
Expand Down Expand Up @@ -118,7 +119,7 @@ def order_by(
na_position: Literal["first", "last"],
*,
stable: bool,
):
) -> plc.Column:
"""
Get index to sort the table in ascending/descending order.

Expand All @@ -144,14 +145,12 @@ def order_by(
func = (
plc.sorting.stable_sorted_order if stable else plc.sorting.sorted_order
)
return ColumnBase.from_pylibcudf(
func(
plc.Table(
[col.to_pylibcudf(mode="read") for col in columns_from_table],
),
order[0],
order[1],
)
return func(
plc.Table(
[col.to_pylibcudf(mode="read") for col in columns_from_table],
),
order[0],
order[1],
)


Expand All @@ -163,7 +162,7 @@ def sort_by_key(
na_position: list[Literal["first", "last"]],
*,
stable: bool,
) -> list[ColumnBase]:
) -> list[plc.Column]:
"""
Sort a table by given keys

Expand Down Expand Up @@ -192,12 +191,9 @@ def sort_by_key(
func = (
plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key
)
return [
ColumnBase.from_pylibcudf(col)
for col in func(
plc.Table([col.to_pylibcudf(mode="read") for col in values]),
plc.Table([col.to_pylibcudf(mode="read") for col in keys]),
order[0],
order[1],
).columns()
]
return func(
plc.Table([col.to_pylibcudf(mode="read") for col in values]),
plc.Table([col.to_pylibcudf(mode="read") for col in keys]),
order[0],
order[1],
).columns()
18 changes: 10 additions & 8 deletions python/cudf/cudf/core/_internals/stream_compaction.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
from __future__ import annotations

from typing import Literal
from typing import TYPE_CHECKING, Literal

import pylibcudf as plc

from cudf.core.buffer import acquire_spill_lock
from cudf.core.column import ColumnBase

if TYPE_CHECKING:
from cudf.core.column import ColumnBase


@acquire_spill_lock()
Expand All @@ -15,7 +17,7 @@ def drop_nulls(
how: Literal["any", "all"] = "any",
keys: list[int] | None = None,
thresh: int | None = None,
) -> list[ColumnBase]:
) -> list[plc.Column]:
"""
Drops null rows from cols depending on key columns.

Expand Down Expand Up @@ -50,13 +52,13 @@ def drop_nulls(
keys,
keep_threshold,
)
return [ColumnBase.from_pylibcudf(col) for col in plc_table.columns()]
return plc_table.columns()


@acquire_spill_lock()
def apply_boolean_mask(
columns: list[ColumnBase], boolean_mask: ColumnBase
) -> list[ColumnBase]:
) -> list[plc.Column]:
"""
Drops the rows which correspond to False in boolean_mask.

Expand All @@ -73,7 +75,7 @@ def apply_boolean_mask(
plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
boolean_mask.to_pylibcudf(mode="read"),
)
return [ColumnBase.from_pylibcudf(col) for col in plc_table.columns()]
return plc_table.columns()


@acquire_spill_lock()
Expand All @@ -82,7 +84,7 @@ def drop_duplicates(
keys: list[int] | None = None,
keep: Literal["first", "last", False] = "first",
nulls_are_equal: bool = True,
) -> list[ColumnBase]:
) -> list[plc.Column]:
"""
Drops rows in source_table as per duplicate rows in keys.

Expand Down Expand Up @@ -115,4 +117,4 @@ def drop_duplicates(
else plc.types.NullEquality.UNEQUAL,
plc.types.NanEquality.ALL_EQUAL,
)
return [ColumnBase.from_pylibcudf(col) for col in plc_table.columns()]
return plc_table.columns()
34 changes: 19 additions & 15 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,7 +719,9 @@ def any(self, skipna: bool = True) -> bool:

def dropna(self) -> Self:
if self.has_nulls():
return drop_nulls([self])[0]._with_type_metadata(self.dtype) # type: ignore[return-value]
return ColumnBase.from_pylibcudf(
drop_nulls([self])[0]
)._with_type_metadata(self.dtype) # type: ignore[return-value]
else:
return self.copy()

Expand Down Expand Up @@ -1300,9 +1302,9 @@ def indices_of(
else:
value = as_column(value, dtype=self.dtype, length=1)
mask = value.contains(self)
return apply_boolean_mask( # type: ignore[return-value]
[as_column(range(0, len(self)), dtype=SIZE_TYPE_DTYPE)], mask
)[0]
return as_column(
range(len(self)), dtype=SIZE_TYPE_DTYPE
).apply_boolean_mask(mask) # type: ignore[return-value]

def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]:
indices = self.indices_of(value)
Expand Down Expand Up @@ -1682,9 +1684,9 @@ def apply_boolean_mask(self, mask) -> ColumnBase:
if mask.dtype.kind != "b":
raise ValueError("boolean_mask is not boolean type.")

return apply_boolean_mask([self], mask)[0]._with_type_metadata(
self.dtype
)
return ColumnBase.from_pylibcudf(
apply_boolean_mask([self], mask)[0]
)._with_type_metadata(self.dtype)

def argsort(
self,
Expand All @@ -1705,8 +1707,8 @@ def argsort(
as_column(range(len(self) - 1, -1, -1)),
)
else:
return sorting.order_by(
[self], [ascending], na_position, stable=True
return ColumnBase.from_pylibcudf( # type: ignore[return-value]
sorting.order_by([self], [ascending], na_position, stable=True)
)

def __arrow_array__(self, type=None):
Expand Down Expand Up @@ -1772,9 +1774,11 @@ def unique(self) -> Self:
if self.is_unique:
return self.copy()
else:
return drop_duplicates([self], keep="first")[ # type: ignore[return-value]
0
]._with_type_metadata(self.dtype)
return ColumnBase.from_pylibcudf(
drop_duplicates([self], keep="first")[ # type: ignore[return-value]
0
]
)._with_type_metadata(self.dtype)

def serialize(self) -> tuple[dict, list]:
# data model:
Expand Down Expand Up @@ -2010,10 +2014,10 @@ def _return_sentinel_column():
del right_rows
# reorder `codes` so that its values correspond to the
# values of `self`:
(codes,) = sorting.sort_by_key(
plc_codes = sorting.sort_by_key(
[codes], [left_gather_map], [True], ["last"], stable=True
)
return codes.fillna(na_sentinel.value)
)[0]
return ColumnBase.from_pylibcudf(plc_codes).fillna(na_sentinel.value)

@acquire_spill_lock()
def copy_if_else(
Expand Down
7 changes: 3 additions & 4 deletions python/cudf/cudf/core/column/numerical_base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
# Copyright (c) 2018-2025, NVIDIA CORPORATION.
"""Define an interface for columns that can perform numerical operations."""

from __future__ import annotations
Expand All @@ -10,7 +10,6 @@
import pylibcudf as plc

import cudf
from cudf.core._internals import sorting
from cudf.core.buffer import Buffer, acquire_spill_lock
from cudf.core.column.column import ColumnBase
from cudf.core.missing import NA
Expand Down Expand Up @@ -145,8 +144,8 @@ def quantile(
else:
no_nans = self.nans_to_nulls()
# get sorted indices and exclude nulls
indices = sorting.order_by(
[no_nans], [True], "first", stable=True
indices = no_nans.argsort(
ascending=True, na_position="first"
).slice(no_nans.null_count, len(no_nans))
with acquire_spill_lock():
plc_column = plc.quantiles.quantile(
Expand Down
Loading