Skip to content

Commit

Permalink
Make cudf._lib.string_udf work with pylibcudf Columns instead of cudf…
Browse files Browse the repository at this point in the history
…._lib Columns (#17715)

Because `cudf._lib.Column` may be refactored (possibly into a pure-Python implementation), this PR makes the `strings_udf` routines depend on pylibcudf Columns instead of the classic Cython-based cudf Column.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17715
  • Loading branch information
mroeschke authored Jan 15, 2025
1 parent f863090 commit 8538ec8
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 16 deletions.
9 changes: 3 additions & 6 deletions python/cudf/cudf/_lib/strings_udf.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
# Copyright (c) 2022-2025, NVIDIA CORPORATION.

from libc.stdint cimport uint8_t, uint16_t, uintptr_t
from pylibcudf.libcudf.strings_udf cimport (
Expand All @@ -25,15 +25,14 @@ from pylibcudf.libcudf.strings_udf cimport (
from rmm.librmm.device_buffer cimport device_buffer
from rmm.pylibrmm.device_buffer cimport DeviceBuffer

from cudf._lib.column cimport Column
from pylibcudf cimport Column as plc_Column


def get_cuda_build_version():
return cpp_get_cuda_build_version()


def column_to_string_view_array(Column strings_col):
def column_to_string_view_array(plc_Column strings_col):
cdef unique_ptr[device_buffer] c_buffer
cdef column_view input_view = strings_col.view()
with nogil:
Expand All @@ -52,9 +51,7 @@ def column_from_udf_string_array(DeviceBuffer d_buffer):
c_result = move(cpp_column_from_udf_string_array(data, size))
cpp_free_udf_string_array(data, size)

return Column.from_pylibcudf(
plc_Column.from_libcudf(move(c_result))
)
return plc_Column.from_libcudf(move(c_result))


def get_character_flags_table_ptr():
Expand Down
22 changes: 16 additions & 6 deletions python/cudf/cudf/core/udf/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
from __future__ import annotations

import functools
Expand All @@ -20,6 +20,7 @@
import rmm

from cudf._lib import strings_udf
from cudf._lib.column import Column
from cudf.api.types import is_scalar
from cudf.core.column.column import as_column
from cudf.core.dtypes import dtype
Expand All @@ -44,6 +45,11 @@
if TYPE_CHECKING:
from collections.abc import Callable

import pylibcudf as plc

from cudf.core.buffer.buffer import Buffer
from cudf.core.indexed_frame import IndexedFrame

# Maximum size of a string column is 2 GiB
_STRINGS_UDF_DEFAULT_HEAP_SIZE = os.environ.get("STRINGS_UDF_HEAP_SIZE", 2**31)
_heap_size = 0
Expand Down Expand Up @@ -298,12 +304,14 @@ def _get_kernel(kernel_string, globals_, sig, func):
return kernel


def _get_input_args_from_frame(fr):
args = []
def _get_input_args_from_frame(fr: IndexedFrame) -> list:
args: list[Buffer | tuple[Buffer, Buffer]] = []
offsets = []
for col in _supported_cols_from_frame(fr).values():
if col.dtype == _cudf_str_dtype:
data = column_to_string_view_array_init_heap(col)
data = column_to_string_view_array_init_heap(
col.to_pylibcudf(mode="read")
)
else:
data = col.data
if col.mask is not None:
Expand All @@ -325,7 +333,9 @@ def _return_arr_from_dtype(dtype, size):

def _post_process_output_col(col, retty):
if retty == _cudf_str_dtype:
return strings_udf.column_from_udf_string_array(col)
return Column.from_pylibcudf(
strings_udf.column_from_udf_string_array(col)
)
return as_column(col, retty)


Expand Down Expand Up @@ -365,7 +375,7 @@ def set_malloc_heap_size(size=None):
_heap_size = size


def column_to_string_view_array_init_heap(col):
def column_to_string_view_array_init_heap(col: plc.Column) -> Buffer:
# lazily allocate heap only when a string needs to be returned
return strings_udf.column_to_string_view_array(col)

Expand Down
11 changes: 7 additions & 4 deletions python/cudf/cudf/tests/test_string_udfs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
# Copyright (c) 2022-2025, NVIDIA CORPORATION.

import numba
import numpy as np
Expand All @@ -11,6 +11,7 @@
import rmm

import cudf
from cudf._lib.column import Column
from cudf._lib.strings_udf import (
column_from_udf_string_array,
column_to_string_view_array,
Expand Down Expand Up @@ -87,14 +88,16 @@ def run_udf_test(data, func, dtype):
)

cudf_column = cudf.core.column.as_column(data)
str_views = column_to_string_view_array(cudf_column)
str_views = column_to_string_view_array(
cudf_column.to_pylibcudf(mode="read")
)
sv_kernel, udf_str_kernel = get_kernels(func, dtype, len(data))

expect = pd.Series(data).apply(func)
with _CUDFNumbaConfig():
sv_kernel.forall(len(data))(str_views, output)
if dtype == "str":
result = column_from_udf_string_array(output)
result = Column.from_pylibcudf(column_from_udf_string_array(output))
else:
result = output

Expand All @@ -103,7 +106,7 @@ def run_udf_test(data, func, dtype):
with _CUDFNumbaConfig():
udf_str_kernel.forall(len(data))(str_views, output)
if dtype == "str":
result = column_from_udf_string_array(output)
result = Column.from_pylibcudf(column_from_udf_string_array(output))
else:
result = output

Expand Down

0 comments on commit 8538ec8

Please sign in to comment.