refactor: Factor out executor.head method into cache and execute slice (#1676)

TrevorBergeron · web-flow · commit 7489c270d435 · 2025-05-01T15:37:39.000-07:00
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
@@ -69,6 +69,7 @@
 import bigframes.exceptions as bfe
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
+from bigframes.session import executor as executors
 
 # Type constraint for wherever column labels are used
 Label = typing.Hashable
@@ -1560,12 +1561,19 @@ def retrieve_repr_request_results(
         """
 
         # head caches full underlying expression, so row_count will be free after
-        head_result = self.session._executor.head(self.expr, max_results)
+        executor = self.session._executor
+        executor.cached(
+            array_value=self.expr,
+            config=executors.CacheConfig(optimize_for="head", if_cached="reuse-strict"),
+        )
+        head_result = self.session._executor.execute(
+            self.expr.slice(start=None, stop=max_results, step=None)
+        )
         row_count = self.session._executor.execute(self.expr.row_count()).to_py_scalar()
 
-        df = head_result.to_pandas()
-        self._copy_index_to_pandas(df)
-        return df, row_count, head_result.query_job
+        head_df = head_result.to_pandas()
+        self._copy_index_to_pandas(head_df)
+        return head_df, row_count, head_result.query_job
 
     def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]:
         expr, result_id = self._expr.promote_offsets()
@@ -2535,9 +2543,12 @@ def cached(self, *, force: bool = False, session_aware: bool = False) -> None:
         # use a heuristic for whether something needs to be cached
         self.session._executor.cached(
             self.expr,
-            force=force,
-            use_session=session_aware,
-            cluster_cols=self.index_columns,
+            config=executors.CacheConfig(
+                optimize_for="auto"
+                if session_aware
+                else executors.HierarchicalKey(tuple(self.index_columns)),
+                if_cached="replace" if force else "reuse-any",
+            ),
         )
 
     def _is_monotonic(
diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py
@@ -243,46 +243,37 @@ def peek(
             plan, ordered=False, destination=destination_table, peek=n_rows
         )
 
-    def head(
-        self, array_value: bigframes.core.ArrayValue, n_rows: int
-    ) -> executor.ExecuteResult:
-        plan = self.logical_plan(array_value.node)
-        if (plan.row_count is not None) and (plan.row_count <= n_rows):
-            return self._execute_plan(plan, ordered=True)
-
-        if not self.strictly_ordered and not array_value.node.explicitly_ordered:
-            # No user-provided ordering, so just get any N rows, its faster!
-            return self.peek(array_value, n_rows)
-
-        if not tree_properties.can_fast_head(plan):
-            # If can't get head fast, we are going to need to execute the whole query
-            # Will want to do this in a way such that the result is reusable, but the first
-            # N values can be easily extracted.
-            # This currently requires clustering on offsets.
-            self._cache_with_offsets(array_value)
-            # Get a new optimized plan after caching
-            plan = self.logical_plan(array_value.node)
-            assert tree_properties.can_fast_head(plan)
-
-        head_plan = generate_head_plan(plan, n_rows)
-        return self._execute_plan(head_plan, ordered=True)
-
     def cached(
-        self,
-        array_value: bigframes.core.ArrayValue,
-        *,
-        force: bool = False,
-        use_session: bool = False,
-        cluster_cols: Sequence[str] = (),
+        self, array_value: bigframes.core.ArrayValue, *, config: executor.CacheConfig
     ) -> None:
         """Write the block to a session table."""
-        # use a heuristic for whether something needs to be cached
-        if (not force) and self._is_trivially_executable(array_value):
-            return
-        if use_session:
+        # First, see if we can reuse the existing cache
+        # TODO(b/415105423): Provide feedback to user on whether new caching action was deemed necessary
+        # TODO(b/415105218): Make cached a deferred action
+        if config.if_cached == "reuse-any":
+            if self._is_trivially_executable(array_value):
+                return
+        elif config.if_cached == "reuse-strict":
+            # This path basically exists to make sure that repr in head mode is optimized for subsequent repr operations.
+            if config.optimize_for == "head":
+                if tree_properties.can_fast_head(array_value.node):
+                    return
+            else:
+                raise NotImplementedError(
+                    "if_cached='reuse-strict' currently only supported with optimize_for='head'"
+                )
+        elif config.if_cached != "replace":
+            raise ValueError(f"Unexpected 'if_cached' arg: {config.if_cached}")
+
+        if config.optimize_for == "auto":
             self._cache_with_session_awareness(array_value)
+        elif config.optimize_for == "head":
+            self._cache_with_offsets(array_value)
         else:
-            self._cache_with_cluster_cols(array_value, cluster_cols=cluster_cols)
+            assert isinstance(config.optimize_for, executor.HierarchicalKey)
+            self._cache_with_cluster_cols(
+                array_value, cluster_cols=config.optimize_for.columns
+            )
 
     # Helpers
     def _run_execute_query(
@@ -571,7 +562,3 @@ def _sanitize(
         )
         for f in schema
     )
-
-
-def generate_head_plan(node: nodes.BigFrameNode, n: int):
-    return nodes.SliceNode(node, start=None, stop=n)
diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py
@@ -73,6 +73,17 @@ def to_py_scalar(self):
         return column[0]
 
 
+@dataclasses.dataclass(frozen=True)
+class HierarchicalKey:
+    columns: tuple[str, ...]
+
+
+@dataclasses.dataclass(frozen=True)
+class CacheConfig(abc.ABC):
+    optimize_for: Union[Literal["auto", "head"], HierarchicalKey] = "auto"
+    if_cached: Literal["reuse-strict", "reuse-any", "replace"] = "reuse-any"
+
+
 class Executor(abc.ABC):
     """
     Interface for an executor, which compiles and executes ArrayValue objects.
@@ -149,21 +160,10 @@ def peek(
         """
         raise NotImplementedError("peek not implemented for this executor")
 
-    # TODO: Remove this and replace with efficient slice operator that can use execute()
-    def head(
-        self, array_value: bigframes.core.ArrayValue, n_rows: int
-    ) -> ExecuteResult:
-        """
-        Preview the first n rows of the dataframe. This is less efficient than the unordered peek preview op.
-        """
-        raise NotImplementedError("head not implemented for this executor")
-
     def cached(
         self,
         array_value: bigframes.core.ArrayValue,
         *,
-        force: bool = False,
-        use_session: bool = False,
-        cluster_cols: Sequence[str] = (),
+        config: CacheConfig,
     ) -> None:
         raise NotImplementedError("cached not implemented for this executor")