Plumb through top-k to Python

rapidsai · wence- · Mar 11, 2024 · Mar 11, 2024 · Mar 11, 2024 · Mar 12, 2024
commit c9357794ff8945e4b8208a7fd17747bbc05eb2ef
@@ -76,6 +76,17 @@ class Aggregation:
     def nth(cls, size):
         return cls(pylibcudf.aggregation.nth_element(size))
 
+    @classmethod
+    def top_k(cls, size, descending=True):
+        return cls(
+            pylibcudf.aggregation.top_k(
+                size,
+                pylibcudf.types.Order.DESCENDING
+                if descending
+                else pylibcudf.types.Order.ASCENDING
+            )
+        )
+
     @classmethod
     def product(cls):
         return cls(pylibcudf.aggregation.product())

@@ -43,6 +43,7 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:
         RANK
         COLLECT_LIST
         COLLECT_SET
+        TOP_K
         PTX
         CUDA
         CORRELATION
@@ -135,6 +136,10 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:
         null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal
     ) except +
 
+    # Templated factory returning an owning pointer to a top-k aggregation
+    # built from a count ``k`` and a sort ``order``. ``except +`` asks Cython
+    # to translate C++ exceptions raised by the call into Python exceptions.
+    cdef unique_ptr[T] make_top_k_aggregation[T](
+        size_type k, order order
+    ) except +
+
     cdef unique_ptr[T] make_udf_aggregation[T](
         udf_type type,
         string user_defined_aggregator,

@@ -88,6 +88,8 @@ cpdef Aggregation collect_list(null_policy null_handling = *)
 
 cpdef Aggregation collect_set(null_handling = *, nulls_equal = *, nans_equal = *)
 
+# Declaration of the top_k aggregation factory. ``= *`` marks
+# ``column_order`` as optional; the default value itself is given where
+# the function is implemented.
+cpdef Aggregation top_k(size_type k, order column_order = *)
+
 cpdef Aggregation udf(str operation, DataType output_type)
 
 cpdef Aggregation correlation(correlation_type type, size_type min_periods)

@@ -31,6 +31,7 @@ from cudf._lib.cpp.aggregation cimport (
     make_std_aggregation,
     make_sum_aggregation,
     make_sum_of_squares_aggregation,
+    make_top_k_aggregation,
     make_udf_aggregation,
     make_variance_aggregation,
     rank_method,
@@ -457,6 +458,34 @@ cpdef Aggregation collect_set(
         )
     )
 
+
+cpdef Aggregation top_k(
+    size_type k,
+    order column_order = order.DESCENDING
+):
+    """Construct an aggregation that yields k values per group.
+
+    Parameters
+    ----------
+    k : size_type
+        How many values the aggregation returns.
+    column_order : order, default DESCENDING
+        Ordering applied within each group.
+
+    Returns
+    -------
+    Aggregation
+        Wrapper around the aggregation produced by
+        ``make_top_k_aggregation``.
+    """
+    return Aggregation.from_libcudf(
+        move(make_top_k_aggregation[aggregation](k, column_order))
+    )
+
+
 cpdef Aggregation udf(str operation, DataType output_type):
     """Create a udf aggregation.
 

@@ -947,6 +947,20 @@ def nth(self, n):
         del self.obj._data["__groupbynth_order__"]
         return result
 
+    @_cudf_nvtx_annotate
+    def topk(self, k, bottom=False):
+        """
+        Return the top-k values from each group
+
+        Parameters
+        ----------
+        k : int
+           Number of values to obtain
+        bottom : bool
+           Should one return the bottom-k values?
+        """
+        return self.agg(lambda x: x.top_k(k, descending=not bottom))
+
     @_cudf_nvtx_annotate
     def ngroup(self, ascending=True):
         """