Single-partition Dask executor for cuDF-Polars #17262

Status: Draft. Wants to merge 46 commits into base branch-24.12 (changes shown from 37 of the 46 commits).

Commits (46):
- a590076 cleanup (rjzamora, Nov 6, 2024)
- 7f1bec7 rename to parallel (rjzamora, Nov 7, 2024)
- 023e085 Merge branch 'branch-24.12' into cudf-polars-dask-simple (rjzamora, Nov 7, 2024)
- e7a2fce Merge branch 'branch-24.12' into cudf-polars-dask-simple (rjzamora, Nov 7, 2024)
- 69a3374 simplify solution (rjzamora, Nov 7, 2024)
- 6aa3694 Merge branch 'cudf-polars-dask-simple' of github.com:rjzamora/cudf in… (rjzamora, Nov 7, 2024)
- ea22a9a Merge branch 'branch-24.12' into cudf-polars-dask-simple (rjzamora, Nov 7, 2024)
- 915a779 deeper dive (rjzamora, Nov 8, 2024)
- bd9d783 improve simple agg reduction (rjzamora, Nov 8, 2024)
- 7363d91 cleanup fundamental bugs (rjzamora, Nov 10, 2024)
- 58ee5f4 move PartitionInfo (rjzamora, Nov 10, 2024)
- ecc51ef add Literal (rjzamora, Nov 10, 2024)
- 75eae0c Merge branch 'branch-24.12' into cudf-polars-dask-simple (rjzamora, Nov 12, 2024)
- fb2d6bf add lower_ir_graph (rjzamora, Nov 12, 2024)
- c17564c Merge remote-tracking branch 'upstream/branch-24.12' into cudf-polars… (rjzamora, Nov 12, 2024)
- 6e66998 strip out most exploratory logic (rjzamora, Nov 12, 2024)
- c41723d Merge branch 'branch-24.12' into cudf-polars-dask-simple (rjzamora, Nov 12, 2024)
- d774f38 Merge branch 'branch-24.12' into cudf-polars-dask-simple (rjzamora, Nov 13, 2024)
- 6886f8d Add basic Dask evaluate test (pentschev, Nov 13, 2024)
- 29b2d7b Replace environment variable with new `"executor"` config (pentschev, Nov 13, 2024)
- 3a68a6d Add kwarg to specify executor in `assert_gpu_result_equal` (pentschev, Nov 13, 2024)
- 8079ac0 Add couple of Dask executor tests (pentschev, Nov 13, 2024)
- 6f7ccee Merge remote-tracking branch 'upstream/branch-24.12' into cudf-polars… (pentschev, Nov 13, 2024)
- af4c5f5 Merge remote-tracking branch 'rjzamora/cudf-polars-dask-simple' into … (pentschev, Nov 13, 2024)
- 8aed94f Improve `count` code (pentschev, Nov 13, 2024)
- aadaf10 Pass `executor` to `GPUEngine` in `assert_gpu_result_equal` (pentschev, Nov 13, 2024)
- c3a6907 Merge remote-tracking branch 'upstream/branch-24.12' into cudf-polars… (pentschev, Nov 14, 2024)
- 4f67819 Merge branch 'branch-24.12' into cudf-polars-dask-simple (rjzamora, Nov 14, 2024)
- c8ca09e Clarify intent renaming executor to "dask-experimental" (pentschev, Nov 14, 2024)
- 3fd51bb move PartitionInfo out of ir module (rjzamora, Nov 14, 2024)
- bf182e4 Merge remote-tracking branch 'rjzamora/cudf-polars-dask-simple' into … (pentschev, Nov 14, 2024)
- 453e274 skip coverage on sanity-check errors (rjzamora, Nov 14, 2024)
- 2b74f28 Add `--executor` to pytest (pentschev, Nov 14, 2024)
- 6d3cd55 Merge remote-tracking branch 'rjzamora/cudf-polars-dask-simple' into … (pentschev, Nov 14, 2024)
- 2398a2e Enable dask-experimental tests in CI, remove duplicates (pentschev, Nov 14, 2024)
- 9aa479a Fix wrong protocol name in deserialization test (pentschev, Nov 14, 2024)
- 64ea98e Merge remote-tracking branch 'upstream/branch-24.12' into cudf-polars… (pentschev, Nov 14, 2024)
- 22678a5 Remove `executor` kwarg from `assert_gpu_result_equal` (pentschev, Nov 14, 2024)
- 41441ca Merge remote-tracking branch 'upstream/branch-24.12' into cudf-polars… (rjzamora, Nov 15, 2024)
- efadb78 Reintroduce `executor` kwarg in `assert_gpu_result_equal` (pentschev, Nov 15, 2024)
- 9b78d8f Add basic tests for all executors to ensure 100% coverage (pentschev, Nov 15, 2024)
- c54c217 Merge remote-tracking branch 'rjzamora/cudf-polars-dask-simple' into … (pentschev, Nov 15, 2024)
- 70da7a9 Merge remote-tracking branch 'upstream/branch-24.12' into cudf-polars… (pentschev, Nov 15, 2024)
- 3aeb1e4 Fix `executor` in `assert_gpu_result_equal` (pentschev, Nov 18, 2024)
- 485a161 Merge remote-tracking branch 'upstream/branch-24.12' into cudf-polars… (pentschev, Nov 18, 2024)
- eb41100 Merge remote-tracking branch 'upstream/branch-24.12' into cudf-polars… (rjzamora, Nov 19, 2024)
4 changes: 4 additions & 0 deletions ci/run_cudf_polars_pytests.sh
@@ -8,4 +8,8 @@ set -euo pipefail
# Support invoking run_cudf_polars_pytests.sh outside the script directory
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_polars/

# Test the default "cudf" executor
python -m pytest --cache-clear "$@" tests

# Test the "dask-experimental" executor
python -m pytest --cache-clear "$@" tests --executor dask-experimental
21 changes: 18 additions & 3 deletions python/cudf_polars/cudf_polars/callback.py
@@ -135,6 +135,7 @@ def _callback(
*,
device: int | None,
memory_resource: int | None,
executor: str | None,
) -> pl.DataFrame:
assert with_columns is None
assert pyarrow_predicate is None
@@ -145,7 +146,14 @@
set_device(device),
set_memory_resource(memory_resource),
):
return ir.evaluate(cache={}).to_polars()
if executor is None or executor == "cudf":
return ir.evaluate(cache={}).to_polars()
elif executor == "dask-experimental":
from cudf_polars.experimental.parallel import evaluate_dask

return evaluate_dask(ir).to_polars()
else:
raise ValueError(f"Unknown executor '{executor}'")


def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
@@ -174,7 +182,8 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
device = config.device
memory_resource = config.memory_resource
raise_on_fail = config.config.get("raise_on_fail", False)
if unsupported := (config.config.keys() - {"raise_on_fail"}):
executor = config.config.get("executor", None)
if unsupported := (config.config.keys() - {"raise_on_fail", "executor"}):
raise ValueError(
f"Engine configuration contains unsupported settings {unsupported}"
)
@@ -200,5 +209,11 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
raise exception
else:
nt.set_udf(
partial(_callback, ir, device=device, memory_resource=memory_resource)
partial(
_callback,
ir,
device=device,
memory_resource=memory_resource,
executor=executor,
)
)
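For context, a minimal sketch of how the new option is expected to surface to users, inferred from the config plumbing above (the `GPUEngine` keyword-passing is an assumption based on how `raise_on_fail` is already handled, not documented API):

    import polars as pl

    q = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a") * 2)

    # Default single-GPU executor (equivalent to executor=None or "cudf"):
    q.collect(engine=pl.GPUEngine(raise_on_fail=True))

    # Experimental Dask executor added by this PR:
    q.collect(engine=pl.GPUEngine(raise_on_fail=True, executor="dask-experimental"))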
@@ -69,7 +69,7 @@ def __init__(
*by: Expr,
) -> None:
self.dtype = dtype
self.options = options
self.options = (options[0], tuple(options[1]), tuple(options[2]))
self.children = (column, *by)

def do_evaluate(
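The list-to-tuple conversion above matters for the experimental executor: task-graph keys are derived from `hash(node)`, so everything folded into a node's hashable representation, including `options`, must itself be hashable. A minimal sketch of the failure mode being avoided (the option values here are invented for illustration):

    # Options as received may contain lists, which are unhashable:
    options = (True, [False, True], [False, True])
    try:
        hash(options)
    except TypeError as err:
        print(err)  # unhashable type: 'list'

    # Converting the list members to tuples makes the whole thing hashable:
    normalized = (options[0], tuple(options[1]), tuple(options[2]))
    print(hash(normalized))  # works; safe to use inside a task-graph key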
26 changes: 18 additions & 8 deletions python/cudf_polars/cudf_polars/dsl/ir.py
@@ -1554,13 +1554,20 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR):
self.options = (
tuple(indices),
tuple(pivotees),
(variable_name, schema[variable_name]),
(value_name, schema[value_name]),
variable_name,
value_name,
)
self._non_child_args = (name, self.options)
self._non_child_args = (schema, name, self.options)

def get_hashable(self) -> Hashable:
Review comment (Member, PR author):

Suggested change:
-    def get_hashable(self) -> Hashable:
+    def get_hashable(self) -> Hashable:  # pragma: no cover; Needed by experimental

Pretty sure this is lowering test coverage.

Reply (Member): I introduced basic testing for all executors, independent of the --executor pytest argument, to ensure 100% coverage always.

Reply (Member): See 9b78d8f.
"""Hashable representation of the node."""
schema_hash = tuple(self.schema.items())
return (type(self), schema_hash, self.name, str(self.options), *self.children)

@classmethod
def do_evaluate(cls, name: str, options: Any, df: DataFrame) -> DataFrame:
def do_evaluate(
cls, schema: Schema, name: str, options: Any, df: DataFrame
) -> DataFrame:
"""Evaluate and return a dataframe."""
if name == "rechunk":
# No-op in our data model
Expand All @@ -1582,8 +1589,8 @@ def do_evaluate(cls, name: str, options: Any, df: DataFrame) -> DataFrame:
(
indices,
pivotees,
(variable_name, variable_dtype),
(value_name, value_dtype),
variable_name,
value_name,
) = options
npiv = len(pivotees)
index_columns = [
Expand All @@ -1600,15 +1607,18 @@ def do_evaluate(cls, name: str, options: Any, df: DataFrame) -> DataFrame:
plc.interop.from_arrow(
pa.array(
pivotees,
type=plc.interop.to_arrow(variable_dtype),
type=plc.interop.to_arrow(schema[variable_name]),
),
)
]
),
df.num_rows,
).columns()
value_column = plc.concatenate.concatenate(
[df.column_map[pivotee].astype(value_dtype).obj for pivotee in pivotees]
[
df.column_map[pivotee].astype(schema[value_name]).obj
for pivotee in pivotees
]
)
return DataFrame(
[
Expand Down
152 changes: 152 additions & 0 deletions python/cudf_polars/cudf_polars/experimental/parallel.py
@@ -0,0 +1,152 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0
"""Partitioned LogicalPlan nodes."""

from __future__ import annotations

from functools import singledispatch
from typing import TYPE_CHECKING, Any

from cudf_polars.dsl.expr import NamedExpr
from cudf_polars.dsl.traversal import reuse_if_unchanged, traversal

if TYPE_CHECKING:
from collections.abc import MutableMapping

from cudf_polars.containers import DataFrame
from cudf_polars.dsl.ir import IR
from cudf_polars.dsl.nodebase import Node


class PartitionInfo:
"""
Partitioning information.

This class only tracks the partition count (for now).
"""

__slots__ = ("count",)

def __init__(self, count: int):
self.count = count


# The hash of an IR object must always map to a
# unique PartitionInfo object, and we can cache
# this mapping until evaluation is complete.
_IR_PARTS_CACHE: MutableMapping[int, PartitionInfo] = {}


def _clear_parts_info_cache() -> None:
"""Clear cached partitioning information."""
_IR_PARTS_CACHE.clear()


def get_key_name(node: Node | NamedExpr) -> str:
"""Generate the key name for a Node."""
if isinstance(node, NamedExpr):
return f"named-{get_key_name(node.value)}" # pragma: no cover
return f"{type(node).__name__.lower()}-{hash(node)}"


@singledispatch
def lower_ir_node(ir: IR, rec) -> IR:
"""Rewrite an IR node with proper partitioning."""
# Return same node by default
return reuse_if_unchanged(ir, rec)


def lower_ir_graph(ir: IR) -> IR:
"""Rewrite an IR graph with proper partitioning."""
from cudf_polars.dsl.traversal import CachingVisitor

mapper = CachingVisitor(lower_ir_node)
return mapper(ir)


def _default_ir_parts_info(ir: IR) -> PartitionInfo:
# Single-partition default behavior.
# This is used by `_ir_parts_info` for all unregistered IR sub-types.
count = max((ir_parts_info(child).count for child in ir.children), default=1)
if count > 1:
raise NotImplementedError(
f"Class {type(ir)} does not support multiple partitions."
) # pragma: no cover
return PartitionInfo(count=count)


@singledispatch
def _ir_parts_info(ir: IR) -> PartitionInfo:
"""IR partitioning-info dispatch."""
return _default_ir_parts_info(ir)


def ir_parts_info(ir: IR) -> PartitionInfo:
"""Return the partitioning info for an IR node."""
key = hash(ir)
try:
return _IR_PARTS_CACHE[key]
except KeyError:
_IR_PARTS_CACHE[key] = _ir_parts_info(ir)
return _IR_PARTS_CACHE[key]


def _default_ir_tasks(ir: IR) -> MutableMapping[Any, Any]:
# Single-partition default behavior.
# This is used by `generate_ir_tasks` for all unregistered IR sub-types.
if ir_parts_info(ir).count > 1:
raise NotImplementedError(
f"Failed to generate multiple output tasks for {ir}."
) # pragma: no cover

child_names = []
for child in ir.children:
child_names.append(get_key_name(child))
if ir_parts_info(child).count > 1:
raise NotImplementedError(
f"Failed to generate tasks for {ir} with child {child}."
) # pragma: no cover

key_name = get_key_name(ir)
return {
(key_name, 0): (
ir.do_evaluate,
*ir._non_child_args,
*((child_name, 0) for child_name in child_names),
)
}


@singledispatch
def generate_ir_tasks(ir: IR) -> MutableMapping[Any, Any]:
"""
Generate tasks for an IR node.

An IR node only needs to generate the graph for
the current IR logic (not including child IRs).
"""
return _default_ir_tasks(ir)


def task_graph(_ir: IR) -> tuple[MutableMapping[str, Any], str]:
"""Construct a Dask-compatible task graph."""
ir: IR = lower_ir_graph(_ir)

graph = {
k: v
for layer in [generate_ir_tasks(n) for n in traversal(ir)]
for k, v in layer.items()
}
key_name = get_key_name(ir)
graph[key_name] = (key_name, 0)

_clear_parts_info_cache()
return graph, key_name


def evaluate_dask(ir: IR) -> DataFrame:
"""Evaluate an IR graph with Dask."""
from dask import get

graph, key = task_graph(ir)
return get(graph, key)
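To make the graph shape concrete, here is a self-contained toy version of what `task_graph` produces and how `evaluate_dask` consumes it. The key names and callables are invented for illustration; real keys come from `get_key_name` and real tasks call `ir.do_evaluate`:

    from dask import get

    # Each IR node contributes one `(name, partition_index)` task; child
    # results are referenced by their keys. `task_graph` also adds a
    # plain-string alias for the root so the result can be fetched by name.
    graph = {
        ("scan-123abc", 0): (lambda: [1, 2, 3],),         # leaf node, partition 0
        ("select-456def", 0): (sum, ("scan-123abc", 0)),  # child key becomes an argument
        "select-456def": ("select-456def", 0),            # root alias
    }
    print(get(graph, "select-456def"))  # -> 6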
10 changes: 9 additions & 1 deletion python/cudf_polars/cudf_polars/testing/asserts.py
@@ -20,6 +20,11 @@
__all__: list[str] = ["assert_gpu_result_equal", "assert_ir_translation_raises"]


# Will be overridden by `conftest.py` with the value from the `--executor`
# command-line argument
Executor = None


def assert_gpu_result_equal(
lazydf: pl.LazyFrame,
*,
@@ -33,6 +38,7 @@ def assert_gpu_result_equal(
rtol: float = 1e-05,
atol: float = 1e-08,
categorical_as_str: bool = False,
executor: str | None = None,
) -> None:
"""
Assert that collection of a lazyframe on GPU produces correct results.
@@ -68,6 +74,8 @@ def assert_gpu_result_equal(
Absolute tolerance for float comparisons
categorical_as_str
Decat categoricals to strings before comparing
executor
The executor configuration to pass to `GPUEngine`

Raises
------
@@ -81,7 +89,7 @@
)

expect = lazydf.collect(**final_polars_collect_kwargs)
engine = GPUEngine(raise_on_fail=True)
engine = GPUEngine(raise_on_fail=True, executor=Executor)
Review comment (Member, PR author): Should this be something like `executor=executor or Executor`? Right now, it seems like the `executor` kwarg is always ignored.

Reply (Member): This was a leftover from a previous change; I intended to remove the `executor` kwarg. I've done that now in 22678a5, but we may still want to change this depending on how the discussion in #17262 (comment) goes.

got = lazydf.collect(**final_cudf_collect_kwargs, engine=engine)
assert_frame_equal(
expect,
16 changes: 16 additions & 0 deletions python/cudf_polars/tests/conftest.py
@@ -8,3 +8,19 @@
@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"], scope="session")
def with_nulls(request):
return request.param


def pytest_addoption(parser):
parser.addoption(
"--executor",
action="store",
default="cudf",
choices=("cudf", "dask-experimental"),
help="Executor to use for GPUEngine.",
)


def pytest_configure(config):
import cudf_polars.testing.asserts

cudf_polars.testing.asserts.Executor = config.getoption("--executor")
24 changes: 24 additions & 0 deletions python/cudf_polars/tests/experimental/test_parallel.py
Review comment (Member, PR author): I think it's a good idea to keep parallel tests here.

With that said, I wonder if it makes sense to somehow run the entire test suite with executor="dask" when dask is installed? (Not sure how this would work, but all tests should technically work with a single partition.)

Reply (Member): Yes, apparently tests do work; I just copied a couple for some initial testing, since I didn't want to duplicate everything. We do have a few options if we want to test everything:

  1. Explicitly parametrize all tests with executor: [None, "dask"] (None and "cudf" both mean the "default" executor);
  2. Add some sort of fixture to automatically parametrize tests with both executors (a sketch follows this thread);
  3. Add a pytest argument to control the behavior of option 2, so that we can enable Dask tests explicitly for now and turn them on by default later;
  4. Others?

Reply (Member, PR author): > Add some sort of fixture to automatically parametrize tests with both executors;

We probably don't need to test everything for this specific PR. However, I think it may make sense to go in this direction pretty soon. We will probably want to make sure that single-partition execution continues working for the entire test suite as multi-partition support is added.

Reply (Member): @wence- @rjzamora I have made the changes we discussed earlier today in 2b74f28. It adds a new --executor pytest command-line argument that defaults to "cudf" (the default executor) but lets us rerun the test suite with --executor dask-experimental (I've also renamed the executor from "dask" to "dask-experimental" in c8ca09e, as discussed). The caveat is that, to be as unintrusive as possible in the API, I had to add an Executor variable to cudf_polars.testing.asserts, which pytest_configure in conftest.py overrides on pytest entry. The advantage of this approach is that we don't have to force the user to always pass the executor to assert_gpu_result_equal via its API (preventing mistakes like forgetting to pass it); the obvious downside is that modifying the cudf_polars.testing.asserts.Executor module variable always feels like a bit of a hacky solution.

I'm happy to change this to whatever way you feel suits best; if you can think of a better solution, please let me know too.
@@ -0,0 +1,24 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import polars as pl
from polars.testing import assert_frame_equal

from cudf_polars import Translator
from cudf_polars.experimental.parallel import evaluate_dask


def test_evaluate_dask():
df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": [5, 6, 7], "d": [7, 9, 8]})

q = df.select(pl.col("a") - (pl.col("b") + pl.col("c") * 2), pl.col("d")).sort("d")

qir = Translator(q._ldf.visit()).translate_ir()

expected = qir.evaluate(cache={}).to_polars()

got = evaluate_dask(qir).to_polars()

assert_frame_equal(expected, got)