Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate lists/set_operations to pylibcudf #16190

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Initial commit
  • Loading branch information
Matt711 committed Jul 3, 2024
commit b7a22d8ee910c6f49a98106015fce354e9448775
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr

from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
lists_column_view,
)
from cudf._lib.pylibcudf.libcudf.types cimport nan_equality, null_equality


# Cython declarations for the libcudf list set-operation entry points found in
# "cudf/lists/set_operations.hpp" (C++ namespace ``cudf::lists``).
#
# The extern block is declared ``nogil``, so these functions may be called
# without holding the GIL. Each one is marked ``except +`` so that Cython
# translates a thrown C++ exception into a Python exception. All four share
# the same parameter list: two lists column views plus the null/NaN equality
# policies, and each returns ownership of a newly created column.
cdef extern from "cudf/lists/set_operations.hpp" namespace "cudf::lists" nogil:
    # Per-row difference: elements of ``lhs`` that are not in ``rhs``.
    cdef unique_ptr[column] difference_distinct(
        const lists_column_view& lhs,
        const lists_column_view& rhs,
        null_equality nulls_equal,
        nan_equality nans_equal
    ) except +

    # Per-row check of whether the ``lhs`` and ``rhs`` lists overlap.
    cdef unique_ptr[column] have_overlap(
        const lists_column_view& lhs,
        const lists_column_view& rhs,
        null_equality nulls_equal,
        nan_equality nans_equal
    ) except +

    # Per-row intersection: distinct elements common to ``lhs`` and ``rhs``.
    cdef unique_ptr[column] intersect_distinct(
        const lists_column_view& lhs,
        const lists_column_view& rhs,
        null_equality nulls_equal,
        nan_equality nans_equal
    ) except +

    # Per-row union: distinct elements found in either ``lhs`` or ``rhs``.
    cdef unique_ptr[column] union_distinct(
        const lists_column_view& lhs,
        const lists_column_view& rhs,
        null_equality nulls_equal,
        nan_equality nans_equal
    ) except +
8 changes: 8 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/lists.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,11 @@ cpdef Column contains(Column, ColumnOrScalar)
cpdef Column contains_nulls(Column)

cpdef Column index_of(Column, ColumnOrScalar, bool)

# Prototypes for the list set operations implemented in lists.pyx. Each takes
# the left and right lists columns followed by two boolean equality flags.
# ``=*`` marks an optional argument whose default value is supplied by the
# implementation rather than by this declaration file.

# Distinct elements of the left lists that are absent from the right lists.
cpdef Column difference_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*)

# Whether the lists at each row of the two columns overlap.
cpdef Column have_overlap(Column, Column, bool nulls_equal=*, bool nans_equal=*)

# Distinct elements common to both lists columns.
cpdef Column intersect_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*)

# Distinct elements found in either of the two lists columns.
cpdef Column union_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*)
201 changes: 200 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,19 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.lists cimport (
contains as cpp_contains,
explode as cpp_explode,
set_operations as cpp_set_operations,
)
from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
concatenate_list_elements as cpp_concatenate_list_elements,
concatenate_null_policy,
concatenate_rows as cpp_concatenate_rows,
)
from cudf._lib.pylibcudf.libcudf.table.table cimport table
from cudf._lib.pylibcudf.libcudf.types cimport size_type
from cudf._lib.pylibcudf.libcudf.types cimport (
nan_equality,
null_equality,
size_type,
)
from cudf._lib.pylibcudf.lists cimport ColumnOrScalar

from .column cimport Column, ListColumnView
Expand Down Expand Up @@ -206,3 +211,197 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o
find_option,
))
return Column.from_libcudf(move(c_result))


cpdef Column difference_distinct(
    Column lhs,
    Column rhs,
    bool nulls_equal=True,
    bool nans_equal=True
):
    """Create a lists column of distinct elements found only
    in the left input column.

    For details, see :cpp:func:`difference_distinct`.

    Parameters
    ----------
    lhs : Column
        The input lists column of elements that may be included.
    rhs : Column
        The input lists column of elements to exclude.
    nulls_equal : bool
        If true, null elements are considered equal. Otherwise, unequal.
    nans_equal : bool
        If true, libcudf will treat nan elements from {-nan, +nan}
        as equal. Otherwise, unequal.

    Returns
    -------
    Column
        A lists column containing the difference results.
    """
    cdef unique_ptr[column] c_result
    cdef ListColumnView lhs_view = lhs.list_view()
    cdef ListColumnView rhs_view = rhs.list_view()

    # Translate the Python-level booleans into the libcudf policy enums.
    cdef null_equality c_nulls_equal = (
        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
    )
    cdef nan_equality c_nans_equal = (
        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
    )

    # Only C++ objects are touched here, so the GIL is released for the call.
    with nogil:
        c_result = move(cpp_set_operations.difference_distinct(
            lhs_view.view(),
            rhs_view.view(),
            c_nulls_equal,
            c_nans_equal,
        ))
    return Column.from_libcudf(move(c_result))


cpdef Column have_overlap(
    Column lhs,
    Column rhs,
    bool nulls_equal=True,
    bool nans_equal=True
):
    """Check, row by row, whether the lists of two lists columns overlap.

    For details, see :cpp:func:`have_overlap`.

    Parameters
    ----------
    lhs : Column
        The lists column on one side of the comparison.
    rhs : Column
        The lists column on the other side of the comparison.
    nulls_equal : bool
        Whether null elements compare equal to each other.
    nans_equal : bool
        Whether nan elements from {-nan, +nan} compare equal
        to each other.

    Returns
    -------
    Column
        A column containing the check results.
    """
    cdef ListColumnView left = lhs.list_view()
    cdef ListColumnView right = rhs.list_view()
    cdef null_equality null_policy
    cdef nan_equality nan_policy
    cdef unique_ptr[column] overlap

    # Map each boolean flag onto the matching libcudf policy enum.
    if nulls_equal:
        null_policy = null_equality.EQUAL
    else:
        null_policy = null_equality.UNEQUAL

    if nans_equal:
        nan_policy = nan_equality.ALL_EQUAL
    else:
        nan_policy = nan_equality.UNEQUAL

    # The libcudf call runs with the GIL released.
    with nogil:
        overlap = move(
            cpp_set_operations.have_overlap(
                left.view(),
                right.view(),
                null_policy,
                nan_policy,
            )
        )
    return Column.from_libcudf(move(overlap))


cpdef Column intersect_distinct(
    Column lhs,
    Column rhs,
    bool nulls_equal=True,
    bool nans_equal=True
):
    """Create a lists column of distinct elements common to
    two input lists columns.

    For details, see :cpp:func:`intersect_distinct`.

    Parameters
    ----------
    lhs : Column
        The input lists column for one side.
    rhs : Column
        The input lists column for the other side.
    nulls_equal : bool
        If true, null elements are considered equal. Otherwise, unequal.
    nans_equal : bool
        If true, libcudf will treat nan elements from {-nan, +nan}
        as equal. Otherwise, unequal.

    Returns
    -------
    Column
        A lists column containing the intersection results.
    """
    cdef unique_ptr[column] c_result
    cdef ListColumnView lhs_view = lhs.list_view()
    cdef ListColumnView rhs_view = rhs.list_view()

    # Translate the Python-level booleans into the libcudf policy enums.
    cdef null_equality c_nulls_equal = (
        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
    )
    cdef nan_equality c_nans_equal = (
        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
    )

    # Only C++ objects are touched here, so the GIL is released for the call.
    with nogil:
        c_result = move(cpp_set_operations.intersect_distinct(
            lhs_view.view(),
            rhs_view.view(),
            c_nulls_equal,
            c_nans_equal,
        ))
    return Column.from_libcudf(move(c_result))


cpdef Column union_distinct(
    Column lhs,
    Column rhs,
    bool nulls_equal=True,
    bool nans_equal=True
):
    """Create a lists column of distinct elements found in
    either of two input lists columns.

    For details, see :cpp:func:`union_distinct`.

    Parameters
    ----------
    lhs : Column
        The input lists column for one side.
    rhs : Column
        The input lists column for the other side.
    nulls_equal : bool
        If true, null elements are considered equal. Otherwise, unequal.
    nans_equal : bool
        If true, libcudf will treat nan elements from {-nan, +nan}
        as equal. Otherwise, unequal.

    Returns
    -------
    Column
        A lists column containing the union results.
    """
    cdef unique_ptr[column] c_result
    cdef ListColumnView lhs_view = lhs.list_view()
    cdef ListColumnView rhs_view = rhs.list_view()

    # Translate the Python-level booleans into the libcudf policy enums.
    cdef null_equality c_nulls_equal = (
        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
    )
    cdef nan_equality c_nans_equal = (
        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
    )

    # Only C++ objects are touched here, so the GIL is released for the call.
    with nogil:
        c_result = move(cpp_set_operations.union_distinct(
            lhs_view.view(),
            rhs_view.view(),
            c_nulls_equal,
            c_nans_equal,
        ))
    return Column.from_libcudf(move(c_result))
39 changes: 39 additions & 0 deletions python/cudf/cudf/pylibcudf_tests/test_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@ def column():
return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32())


@pytest.fixture
def set_lists_column():
    """Return the ``(lhs, rhs)`` nested lists fed to the set-operation test.

    The rows include duplicate elements, a null row and null elements.
    """
    return (
        [[2, 1, 2], [1, 2, 3], None, [4, None, 5]],
        [[1, 2, 3], [4, 5], [None, 7, 8], [None, None]],
    )


def test_concatenate_rows(test_data):
arrow_tbl = pa.Table.from_arrays(test_data[0], names=["a", "b"])
plc_tbl = plc.interop.from_arrow(arrow_tbl)
Expand Down Expand Up @@ -134,3 +141,35 @@ def test_index_of_list_column(test_data, column):
expect = pa.array(column[1], type=pa.int32())

assert_column_eq(expect, res)


def test_set_operations(set_lists_column):
    """Check every list set operation against the shared fixture."""
    left_rows, right_rows = set_lists_column

    # (operation, expected rows) pairs, evaluated in this order.
    cases = (
        (plc.lists.difference_distinct, [[], [1, 2, 3], None, [4, 5]]),
        (plc.lists.have_overlap, [True, False, None, True]),
        (plc.lists.intersect_distinct, [[1, 2], [], None, [None]]),
        (
            plc.lists.union_distinct,
            [[2, 1, 3], [1, 2, 3, 4, 5], None, [4, None, 5]],
        ),
    )

    for set_op, expected_rows in cases:
        # Fresh input columns are built for every operation.
        got = set_op(
            plc.interop.from_arrow(pa.array(left_rows)),
            plc.interop.from_arrow(pa.array(right_rows)),
        )
        assert_column_eq(pa.array(expected_rows), got)
Loading