Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate quantile.pxd to pylibcudf #15874

Merged
merged 13 commits into from
Jun 6, 2024
Prev Previous commit
Next Next commit
More tests
  • Loading branch information
lithomas1 committed Jun 4, 2024
commit 0259dbc28a94d33af5e85d62e4104134ea52308c
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/quantiles.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ cpdef Table quantiles(
The Table to calculate row quantiles on.
q: array-like
The quantiles to calculate in range [0,1]
interp: Interpolation, default Interpolation.LINEAR
interp: Interpolation, default Interpolation.NEAREST
The strategy used to select between values adjacent to a specified quantile.

Must be a non-arithmetic interpolation strategy
lithomas1 marked this conversation as resolved.
Show resolved Hide resolved
Expand Down
105 changes: 79 additions & 26 deletions python/cudf/cudf/_lib/quantiles.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,69 +3,122 @@
from cudf.core.buffer import acquire_spill_lock

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from libcpp.vector cimport vector

from cudf._lib.column cimport Column
from cudf._lib.types cimport (
underlying_type_t_interpolation,
underlying_type_t_null_order,
underlying_type_t_order,
underlying_type_t_sorted,
)

from cudf._lib.types import Interpolation

from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted
from cudf._lib.utils cimport columns_from_pylibcudf_table

import cudf._lib.pylibcudf as plc
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
from cudf._lib.pylibcudf.libcudf.quantiles cimport (
quantile as cpp_quantile,
quantiles as cpp_quantile_table,
)
from cudf._lib.pylibcudf.libcudf.table.table cimport table
from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
from cudf._lib.pylibcudf.libcudf.types cimport (
interpolation,
null_order,
order,
sorted,
)
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns


@acquire_spill_lock()
def quantile(
Column input,
double[:] q,
object q,
str interp,
Column ordered_indices,
bool exact,

):
cdef column_view c_input = input.view()
cdef column_view c_ordered_indices = (
column_view() if ordered_indices is None
else ordered_indices.view()
)
cdef interpolation c_interp = <interpolation>(
<underlying_type_t_interpolation> Interpolation[interp.upper()]
)
cdef bool c_exact = exact

return Column.from_pylibcudf(
plc.quantiles.quantile(
input.to_pylibcudf(mode="read"),
q,
c_interp,
ordered_indices.to_pylibcudf(mode="read"),
<bool>exact
cdef vector[double] c_q
c_q.reserve(len(q))

for value in q:
c_q.push_back(value)

cdef unique_ptr[column] c_result

with nogil:
c_result = move(
cpp_quantile(
c_input,
c_q,
c_interp,
c_ordered_indices,
c_exact,
)
)
)

return Column.from_unique_ptr(move(c_result))


def quantile_table(
list source_columns,
double[:] q,
vector[double] q,
object interp,
object is_input_sorted,
list column_order,
list null_precedence,
):

cdef table_view c_input = table_view_from_columns(source_columns)
cdef vector[double] c_q = q
cdef interpolation c_interp = <interpolation>(
<underlying_type_t_interpolation> interp
)
cdef sorted c_is_input_sorted = <sorted>(
<underlying_type_t_sorted> is_input_sorted
)
cdef vector[order] c_column_order
cdef vector[null_order] c_null_precedence

c_column_order.reserve(len(column_order))
c_null_precedence.reserve(len(null_precedence))

return columns_from_pylibcudf_table(
plc.quantiles.quantiles(
plc.Table([
c.to_pylibcudf(mode="read") for c in source_columns
]),
q,
c_interp,
c_is_input_sorted,
column_order,
null_precedence
for value in column_order:
c_column_order.push_back(
<order>(<underlying_type_t_order> value)
)
)

for value in null_precedence:
c_null_precedence.push_back(
<null_order>(<underlying_type_t_null_order> value)
)

cdef unique_ptr[table] c_result

with nogil:
c_result = move(
cpp_quantile_table(
c_input,
c_q,
c_interp,
c_is_input_sorted,
c_column_order,
c_null_precedence,
)
)

return columns_from_unique_ptr(move(c_result))
3 changes: 0 additions & 3 deletions python/cudf/cudf/core/column/numerical_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,6 @@ def quantile(
indices = libcudf.sort.order_by(
[self], [True], "first", stable=True
).slice(self.null_count, len(self))

q = np.asarray(q, dtype="float64")

result = libcudf.quantiles.quantile(
self, q, interpolation, indices, exact
)
Expand Down
2 changes: 0 additions & 2 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -833,8 +833,6 @@ def _quantile_table(
libcudf.types.NullOrder[key] for key in null_precedence
]

q = np.asarray(q, dtype="float64")

return self._from_columns_like_self(
libcudf.quantiles.quantile_table(
[*self._columns],
Expand Down
133 changes: 102 additions & 31 deletions python/cudf/cudf/pylibcudf_tests/test_quantiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,6 @@ def plc_tbl_data(request):
@pytest.mark.parametrize("q", [[], [0], [0.5], [0.1, 0.5, 0.7, 0.9]])
@pytest.mark.parametrize("exact", [True, False])
def test_quantile(pa_col_data, plc_col_data, interp_opt, q, exact):
q = np.asarray(q, dtype="float64")

ordered_indices = plc.interop.from_arrow(
pc.cast(pc.sort_indices(pa_col_data), pa.int32())
)
Expand All @@ -87,38 +85,23 @@ def test_quantile(pa_col_data, plc_col_data, interp_opt, q, exact):
assert_column_eq(exp, res)


@pytest.mark.parametrize(
"q", [[], [0.1], [0.2], [0.3], [0.4], [0.5], [0.1, 0.5, 0.7, 0.9]]
)
@pytest.mark.parametrize(
"column_order", [[plc.types.Order.ASCENDING, plc.types.Order.ASCENDING]]
)
@pytest.mark.parametrize(
"null_precedence",
[
[plc.types.NullOrder.BEFORE, plc.types.NullOrder.BEFORE],
[plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER],
],
)
def test_quantiles(
plc_tbl_data, interp_opt, q, sorted_opt, column_order, null_precedence
def _pyarrow_quantiles(
pa_tbl_data,
q,
interp_opt=plc.types.Interpolation.NEAREST,
sorted_opt=plc.types.Sorted.NO,
column_order=None,
null_precedence=None,
):
if interp_opt in {
plc.types.Interpolation.LINEAR,
plc.types.Interpolation.MIDPOINT,
}:
pytest.skip(
"interp cannot be an arithmetic interpolation strategy for quantiles"
)

q = np.asarray(q, dtype="float64")

pa_tbl_data = plc.interop.to_arrow(plc_tbl_data, ["a", "b"])
"""
The pyarrow equivalent of plc.quantiles.quantiles

res = plc.quantiles.quantiles(
plc_tbl_data, q, interp_opt, sorted_opt, column_order, null_precedence
)
    Takes the same arguments (except input should be a pyarrow table instead
    of a pylibcudf table)

NOTE: This function doesn't support having different null precedences because of
a lack of support in pyarrow.
"""
if len(q) > 0:
# pyarrow quantile doesn't support empty q
pa_interp_opt = interp_mapping[interp_opt]
Expand All @@ -128,6 +111,25 @@ def test_quantiles(
plc.types.Order.ASCENDING: "ascending",
plc.types.Order.DESCENDING: "descending",
}
if null_precedence is None:
null_precedence = [plc.types.NullOrder.BEFORE] * len(
pa_tbl_data.columns
)
if column_order is None:
column_order = [plc.types.Order.ASCENDING] * len(
pa_tbl_data.columns
)

if not all(
[
null_prec == null_precedence[0]
for null_prec in null_precedence
]
):
raise NotImplementedError(
"Having varying null precendences is not implemented!"
)

pa_tbl_data = pa_tbl_data.sort_by(
[
(name, order_mapper[order])
Expand All @@ -148,6 +150,47 @@ def test_quantiles(
[[] for _ in range(len(pa_tbl_data.schema))],
schema=pa_tbl_data.schema,
)
return exp


@pytest.mark.parametrize(
    "q", [[], [0.1], [0.2], [0.3], [0.4], [0.5], [0.1, 0.5, 0.7, 0.9]]
)
@pytest.mark.parametrize(
    "column_order", [[plc.types.Order.ASCENDING, plc.types.Order.ASCENDING]]
)
@pytest.mark.parametrize(
    "null_precedence",
    [
        [plc.types.NullOrder.BEFORE, plc.types.NullOrder.BEFORE],
        [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER],
    ],
)
def test_quantiles(
    plc_tbl_data, interp_opt, q, sorted_opt, column_order, null_precedence
):
    """Compare plc.quantiles.quantiles with the pyarrow-based reference.

    LINEAR and MIDPOINT interpolation are skipped; every other
    parametrized combination is checked against ``_pyarrow_quantiles``.
    """
    arithmetic_interps = {
        plc.types.Interpolation.LINEAR,
        plc.types.Interpolation.MIDPOINT,
    }
    if interp_opt in arithmetic_interps:
        pytest.skip(
            "interp cannot be an arithmetic interpolation strategy for quantiles"
        )

    # The reference implementation works on a pyarrow table, so convert the
    # pylibcudf input first (columns are named "a" and "b").
    arrow_table = plc.interop.to_arrow(plc_tbl_data, ["a", "b"])

    reference_kwargs = {
        "q": q,
        "interp_opt": interp_opt,
        "sorted_opt": sorted_opt,
        "column_order": column_order,
        "null_precedence": null_precedence,
    }
    expected = _pyarrow_quantiles(arrow_table, **reference_kwargs)

    result = plc.quantiles.quantiles(
        plc_tbl_data,
        q,
        interp_opt,
        sorted_opt,
        column_order,
        null_precedence,
    )

    assert_table_eq(expected, result)

Expand All @@ -161,3 +204,31 @@ def test_quantiles_invalid_interp(plc_tbl_data, invalid_interp):
plc.quantiles.quantiles(
plc_tbl_data, q=np.array([0.1]), interp=invalid_interp
)


@pytest.mark.parametrize(
    "q",
    [[0.1], (0.1,), np.array([0.1])],
)
def test_quantile_q_array_like(pa_col_data, plc_col_data, q):
    """plc.quantiles.quantile should accept q as a list, tuple or ndarray."""
    # Sort indices are computed with pyarrow, cast to int32, and handed to
    # pylibcudf through plc.interop.from_arrow.
    arrow_sort_idx = pc.cast(pc.sort_indices(pa_col_data), pa.int32())
    sort_idx = plc.interop.from_arrow(arrow_sort_idx)

    result = plc.quantiles.quantile(
        plc_col_data, q=q, ordered_indices=sort_idx
    )
    expected = pc.quantile(pa_col_data, q=q)

    assert_column_eq(expected, result)


@pytest.mark.parametrize(
    "q",
    [[0.1], (0.1,), np.array([0.1])],
)
def test_quantiles_q_array_like(plc_tbl_data, q):
    """plc.quantiles.quantiles should accept q as a list, tuple or ndarray."""
    # Only q is supplied; every other option is left at its default on both
    # the pylibcudf call and the pyarrow reference helper.
    result = plc.quantiles.quantiles(plc_tbl_data, q=q)

    arrow_table = plc.interop.to_arrow(plc_tbl_data, ["a", "b"])
    expected = _pyarrow_quantiles(arrow_table, q=q)

    assert_table_eq(expected, result)
Loading