Skip to content

Commit a944c58

Browse files
feat: Short circuit query for local scan
1 parent 087a32a commit a944c58

File tree

6 files changed

+102
-7
lines changed

6 files changed

+102
-7
lines changed

bigframes/core/local_data.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,15 +98,14 @@ def from_pyarrow(self, table: pa.Table) -> ManagedArrowTable:
9898
mat.validate()
9999
return mat
100100

101-
def to_parquet(
101+
def to_pyarrow_table(
102102
self,
103-
dst: Union[str, io.IOBase],
104103
*,
105104
offsets_col: Optional[str] = None,
106105
geo_format: Literal["wkb", "wkt"] = "wkt",
107106
duration_type: Literal["int", "duration"] = "duration",
108107
json_type: Literal["string"] = "string",
109-
):
108+
) -> pa.Table:
110109
pa_table = self.data
111110
if offsets_col is not None:
112111
pa_table = pa_table.append_column(
@@ -119,6 +118,23 @@ def to_parquet(
119118
f"duration as {duration_type} not yet implemented"
120119
)
121120
assert json_type == "string"
121+
return pa_table
122+
123+
def to_parquet(
    self,
    dst: Union[str, io.IOBase],
    *,
    offsets_col: Optional[str] = None,
    geo_format: Literal["wkb", "wkt"] = "wkt",
    duration_type: Literal["int", "duration"] = "duration",
    json_type: Literal["string"] = "string",
):
    """Write this table to *dst* as parquet.

    Thin wrapper: conversion options are forwarded unchanged to
    ``to_pyarrow_table`` and the resulting table is handed to
    ``pyarrow.parquet.write_table``.
    """
    table = self.to_pyarrow_table(
        offsets_col=offsets_col,
        geo_format=geo_format,
        duration_type=duration_type,
        json_type=json_type,
    )
    pyarrow.parquet.write_table(table, where=dst)
123139

124140
def itertuples(

bigframes/core/rewrite/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,10 @@
1717
from bigframes.core.rewrite.legacy_align import legacy_join_as_projection
1818
from bigframes.core.rewrite.order import pull_up_order
1919
from bigframes.core.rewrite.pruning import column_pruning
20-
from bigframes.core.rewrite.scan_reduction import try_reduce_to_table_scan
20+
from bigframes.core.rewrite.scan_reduction import (
21+
try_reduce_to_local_scan,
22+
try_reduce_to_table_scan,
23+
)
2124
from bigframes.core.rewrite.slices import pullup_limit_from_slice, rewrite_slice
2225
from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions
2326
from bigframes.core.rewrite.windows import rewrite_range_rolling
@@ -33,4 +36,5 @@
3336
"column_pruning",
3437
"rewrite_range_rolling",
3538
"try_reduce_to_table_scan",
39+
"try_reduce_to_local_scan",
3640
]

bigframes/core/rewrite/scan_reduction.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,28 @@ def try_reduce_to_table_scan(root: nodes.BigFrameNode) -> Optional[nodes.ReadTab
2828
return None
2929

3030

31+
def try_reduce_to_local_scan(node: nodes.BigFrameNode) -> Optional[nodes.ReadLocalNode]:
    """Attempt to collapse *node* into a single ``ReadLocalNode``.

    Only trees composed entirely of ``ReadLocalNode`` and ``SelectionNode``
    are eligible; for any other tree, or if merging does not bottom out in a
    single local read, ``None`` is returned.
    """
    permitted = (nodes.ReadLocalNode, nodes.SelectionNode)
    if any(not isinstance(n, permitted) for n in node.unique_nodes()):
        return None
    collapsed = node.bottom_up(merge_scan)
    return collapsed if isinstance(collapsed, nodes.ReadLocalNode) else None
43+
44+
3145
@functools.singledispatch
def merge_scan(node: nodes.BigFrameNode) -> nodes.BigFrameNode:
    # Default case of the single-dispatch merge: node types without a
    # registered handler are returned unchanged. Type-specific collapsing
    # (e.g. for SelectionNode) is attached via @merge_scan.register.
    return node
3448

3549

3650
@merge_scan.register
3751
def _(node: nodes.SelectionNode) -> nodes.BigFrameNode:
38-
if not isinstance(node.child, nodes.ReadTableNode):
52+
if not isinstance(node.child, (nodes.ReadTableNode, nodes.ReadLocalNode)):
3953
return node
4054
if node.has_multi_referenced_ids:
4155
return node

bigframes/session/bq_caching_executor.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
import bigframes.dtypes
3636
import bigframes.exceptions as bfe
3737
import bigframes.features
38-
from bigframes.session import executor, read_api_execution
38+
from bigframes.session import executor, local_scan_execution, read_api_execution
3939
import bigframes.session._io.bigquery as bq_io
4040
import bigframes.session.metrics
4141
import bigframes.session.planner
@@ -84,6 +84,7 @@ def __init__(
8484
bqstoragereadclient=bqstoragereadclient,
8585
project=self.bqclient.project,
8686
),
87+
local_scan_execution.LocalScanExecutor(),
8788
)
8889

8990
def to_sql(
bigframes/session/local_scan_execution.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
from __future__ import annotations
15+
16+
from typing import Optional
17+
18+
from bigframes.core import bigframe_node, rewrite
19+
from bigframes.session import executor, semi_executor
20+
21+
22+
class LocalScanExecutor(semi_executor.SemiExecutor):
    """
    Executes plans reducible to an arrow table scan.
    """

    def execute(
        self,
        plan: bigframe_node.BigFrameNode,
        ordered: bool,
        peek: Optional[int] = None,
    ) -> Optional[executor.ExecuteResult]:
        """Try to execute *plan* as a pure in-memory arrow scan.

        Returns ``None`` when the plan cannot be reduced to a local scan,
        allowing a subsequent executor in the chain to handle it.
        """
        node = rewrite.try_reduce_to_local_scan(plan)
        if not node:
            return None

        # TODO: Can support some slicing, sorting
        def iterator_supplier():
            offsets_col = (
                node.offsets_col.sql if (node.offsets_col is not None) else None
            )
            arrow_table = node.local_data_source.to_pyarrow_table(
                offsets_col=offsets_col
            )
            # Use an explicit None check so peek=0 yields zero rows rather
            # than being treated as "no peek".
            if peek is not None:
                arrow_table = arrow_table.slice(0, peek)
            for batch in arrow_table.to_batches():
                # Project to just the scanned columns, then rename source
                # ids to the output ids the plan schema expects.
                batch = batch.select([item.source_id for item in node.scan_list.items])
                batch = batch.rename_columns(
                    {item.source_id: item.id.sql for item in node.scan_list.items}
                )
                yield batch

        # Clamp the reported row count to the actual data size:
        # Table.slice truncates silently, so a peek larger than the table
        # must not overstate total_rows.
        total_rows = node.local_data_source.metadata.row_count
        if peek is not None:
            total_rows = min(peek, total_rows)

        return executor.ExecuteResult(
            arrow_batches=iterator_supplier,
            schema=plan.schema,
            query_job=None,
            total_bytes=None,
            total_rows=total_rows,
        )

tests/system/small/bigquery/test_json.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def test_json_set_w_more_pairs():
6666
s, json_path_value_pairs=[("$.a", 1), ("$.b", 2), ("$.a", [3, 4, 5])]
6767
)
6868

69-
expected_json = ['{"a": 3, "b": 2}', '{"a": 4, "b": 2}', '{"a": 5, "b": 2, "c": 1}']
69+
expected_json = ['{"a":3,"b":2}', '{"a":4,"b":2}', '{"a":5,"b":2,"c":1}']
7070
expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE)
7171

7272
pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas())

0 commit comments

Comments
 (0)