perf: Produce simpler sql #1836

Open · wants to merge 6 commits into base: main

1 change: 1 addition & 0 deletions bigframes/core/compile/compiler.py
@@ -65,6 +65,7 @@ def compile_sql(request: configs.CompileRequest) -> configs.CompileResult:
ordering: Optional[bf_ordering.RowOrdering] = result_node.order_by
result_node = dataclasses.replace(result_node, order_by=None)
result_node = cast(nodes.ResultNode, rewrites.column_pruning(result_node))
result_node = cast(nodes.ResultNode, rewrites.defer_selection(result_node))
sql = compile_result_node(result_node)
# Return the ordering iff no extra columns are needed to define the row order
if ordering is not None:
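For orientation, a minimal stand-in sketch of the ordering this hunk establishes in compile_sql: prune columns, then defer (pull up) selections, then emit SQL. Only the sequencing mirrors the diff; the stub bodies below are illustrative, not the bigframes implementation.

# Sketch only: stand-in rewrites showing the pipeline order this hunk sets up.
def column_pruning(node: dict) -> dict:      # stand-in for rewrites.column_pruning
    return node

def defer_selection(node: dict) -> dict:     # stand-in for rewrites.defer_selection (the new step)
    return node

def compile_result_node(node: dict) -> str:  # stand-in for the SQL emitter
    return "SELECT 1"

def compile_sql(result_node: dict) -> str:
    result_node = column_pruning(result_node)   # drop unused columns first
    result_node = defer_selection(result_node)  # then pull selections upward
    return compile_result_node(result_node)     # finally emit SQL
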
2 changes: 1 addition & 1 deletion bigframes/core/compile/googlesql/query.py
@@ -125,7 +125,7 @@ def sql(self) -> str:
return "\n".join(text)


@dataclasses.dataclass
@dataclasses.dataclass(frozen=True)
class SelectExpression(abc.SQLSyntax):
"""This class represents `select_expression`."""

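A note on the one-line change above: frozen=True makes dataclass instances immutable and hashable, so equal SelectExpression values can be compared, deduplicated, or used as set/dict keys. That this is the motivation here is an assumption, and the fields in the sketch below are placeholders rather than the real class definition.

import dataclasses

# Illustrative only: what frozen=True buys for a dataclass like SelectExpression.
@dataclasses.dataclass(frozen=True)
class SelectExpr:
    expression: str
    alias: str

a = SelectExpr("`bfcol_0`", "x")
b = SelectExpr("`bfcol_0`", "x")
assert a == b            # value equality
assert len({a, b}) == 1  # hashable, so duplicates collapse in sets
# a.alias = "y"          # would raise dataclasses.FrozenInstanceError
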
6 changes: 6 additions & 0 deletions bigframes/core/compile/sqlglot/compiler.py
@@ -87,6 +87,9 @@ def _compile_sql(self, request: configs.CompileRequest) -> configs.CompileResult
nodes.ResultNode, rewrite.column_pruning(result_node)
)
result_node = self._remap_variables(result_node)
result_node = typing.cast(
nodes.ResultNode, rewrite.defer_selection(result_node)
)
sql = self._compile_result_node(result_node)
return configs.CompileResult(
sql, result_node.schema.to_bigquery(), result_node.order_by
@@ -97,6 +100,9 @@ def _compile_sql(self, request: configs.CompileRequest) -> configs.CompileResult
result_node = typing.cast(nodes.ResultNode, rewrite.column_pruning(result_node))

result_node = self._remap_variables(result_node)
result_node = typing.cast(
nodes.ResultNode, rewrite.defer_selection(result_node)
)
sql = self._compile_result_node(result_node)
# Return the ordering iff no extra columns are needed to define the row order
if ordering is not None:
10 changes: 9 additions & 1 deletion bigframes/core/nodes.py
@@ -75,7 +75,7 @@ def additive_base(self) -> BigFrameNode:
...

@abc.abstractmethod
def replace_additive_base(self, BigFrameNode):
def replace_additive_base(self, BigFrameNode) -> BigFrameNode:
...


@@ -1568,6 +1568,10 @@ class ExplodeNode(UnaryNode):
# Offsets are generated only if this is non-null
offsets_col: Optional[identifiers.ColumnId] = None

def _validate(self):
for col in self.column_ids:
assert col.id in self.child.ids

@property
def row_preserving(self) -> bool:
return False
@@ -1646,6 +1650,10 @@ class ResultNode(UnaryNode):
limit: Optional[int] = None
# TODO: CTE definitions

def _validate(self):
for ref, name in self.output_cols:
assert ref.id in self.child.ids

@property
def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]:
return ()
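Both new _validate hooks assert the same invariant: every column a node references must be defined by its child. A tiny standalone sketch of that check, using plain strings instead of the real identifiers module:

# Toy version of the invariant asserted by the new _validate hooks.
def check_refs(referenced_ids: set, child_ids: set) -> None:
    for col_id in referenced_ids:
        assert col_id in child_ids, f"dangling column reference: {col_id}"

check_refs({"bfcol_0", "bfcol_1"}, {"bfcol_0", "bfcol_1", "bfcol_2"})  # passes
# check_refs({"bfcol_9"}, {"bfcol_0"})  # would fail the assertion
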
2 changes: 2 additions & 0 deletions bigframes/core/rewrite/__init__.py
@@ -22,6 +22,7 @@
try_reduce_to_local_scan,
try_reduce_to_table_scan,
)
from bigframes.core.rewrite.select_pullup import defer_selection
from bigframes.core.rewrite.slices import pull_out_limit, pull_up_limits, rewrite_slice
from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions
from bigframes.core.rewrite.windows import pull_out_window_order, rewrite_range_rolling
@@ -42,4 +43,5 @@
"try_reduce_to_local_scan",
"fold_row_counts",
"pull_out_window_order",
"defer_selection",
]
144 changes: 144 additions & 0 deletions bigframes/core/rewrite/select_pullup.py
@@ -0,0 +1,144 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses
from typing import cast

from bigframes.core import expression, nodes


def defer_selection(
root: nodes.BigFrameNode,
) -> nodes.BigFrameNode:
"""
Defers SelectionNode operations in the tree, pulling them up.

In many cases, these nodes will be merged or eliminated entirely, simplifying the overall tree.
"""
return nodes.bottom_up(root, pull_up_select)


def pull_up_select(node: nodes.BigFrameNode) -> nodes.BigFrameNode:
if isinstance(node, nodes.LeafNode):
return node
if isinstance(node, nodes.JoinNode):
return pull_up_selects_under_join(node)
if isinstance(node, nodes.ConcatNode):
return handle_selects_under_concat(node)
if isinstance(node, nodes.UnaryNode):
return pull_up_select_unary(node)
# shouldn't hit this, but not worth crashing over
return node


def pull_up_select_unary(node: nodes.UnaryNode) -> nodes.BigFrameNode:
child = node.child
if not isinstance(child, nodes.SelectionNode):
return node

# Schema-preserving nodes
if isinstance(
node,
(
nodes.ReversedNode,
nodes.OrderByNode,
nodes.SliceNode,
nodes.FilterNode,
nodes.RandomSampleNode,
),
):
pushed_down_node: nodes.BigFrameNode = node.remap_refs(
{id: ref.id for ref, id in child.input_output_pairs}
).replace_child(child.child)
pulled_up_select = cast(
nodes.SelectionNode, child.replace_child(pushed_down_node)
)
return pulled_up_select
elif isinstance(
node,
(
nodes.SelectionNode,
nodes.ResultNode,
),
):
return node.remap_refs(
{id: ref.id for ref, id in child.input_output_pairs}
).replace_child(child.child)
elif isinstance(node, nodes.AggregateNode):
pushed_down_agg = node.remap_refs(
{id: ref.id for ref, id in child.input_output_pairs}
).replace_child(child.child)
new_selection = tuple(
nodes.AliasedRef.identity(id).remap_refs(
{id: ref.id for ref, id in child.input_output_pairs}
)
for id in node.ids
)
return nodes.SelectionNode(pushed_down_agg, new_selection)
elif isinstance(node, nodes.ExplodeNode):
pushed_down_node = node.remap_refs(
{id: ref.id for ref, id in child.input_output_pairs}
).replace_child(child.child)
pulled_up_select = cast(
nodes.SelectionNode, child.replace_child(pushed_down_node)
)
if node.offsets_col:
pulled_up_select = dataclasses.replace(
pulled_up_select,
input_output_pairs=(
*pulled_up_select.input_output_pairs,
nodes.AliasedRef(
expression.DerefOp(node.offsets_col), node.offsets_col
),
),
)
return pulled_up_select
elif isinstance(node, nodes.AdditiveNode):
pushed_down_node = node.replace_additive_base(child.child).remap_refs(
{id: ref.id for ref, id in child.input_output_pairs}
)
new_selection = (
*child.input_output_pairs,
*(
nodes.AliasedRef(expression.DerefOp(col.id), col.id)
for col in node.added_fields
),
)
pulled_up_select = dataclasses.replace(
child, child=pushed_down_node, input_output_pairs=new_selection
)
return pulled_up_select
# shouldn't hit this, but not worth crashing over
return node


def pull_up_selects_under_join(node: nodes.JoinNode) -> nodes.JoinNode:
# Selects could in theory be pulled up here too, but doing so is risky, in particular for self-joins, when there are more transforms still to apply.
# TODO: Safely pull up selects above join
return node


def handle_selects_under_concat(node: nodes.ConcatNode) -> nodes.ConcatNode:
new_children = []
for child in node.child_nodes:
# remove select if no-op
if not isinstance(child, nodes.SelectionNode):
new_children.append(child)
else:
inputs = tuple(ref.id for ref in child.input_output_pairs)
if inputs == tuple(child.child.ids):
new_children.append(child.child)
else:
new_children.append(child)
return dataclasses.replace(node, children=tuple(new_children))
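
To make the rewrite concrete, below is a small self-contained sketch of the pull-up idea on a toy plan, assuming simplified stand-in node classes (Scan/Select/Filter are illustrative, not bigframes types): a renaming-only projection underneath a filter is hoisted above it, so the filter reads the underlying columns and the projection can later merge with, or be absorbed by, the final select.

from dataclasses import dataclass
from typing import Tuple

# Toy plan nodes; illustrative stand-ins, not bigframes node types.
@dataclass(frozen=True)
class Scan:
    columns: Tuple[str, ...]

@dataclass(frozen=True)
class Select:
    child: object
    mapping: Tuple[Tuple[str, str], ...]  # (source column, output name) pairs

@dataclass(frozen=True)
class Filter:
    child: object
    predicate_col: str  # refers to a column name of its child

def pull_up_select(node):
    """If a Filter sits on a Select, swap them: remap the filter's column
    reference to the pre-rename name and hoist the Select above the Filter."""
    if isinstance(node, Filter) and isinstance(node.child, Select):
        select = node.child
        out_to_src = {out: src for src, out in select.mapping}
        pushed_filter = Filter(select.child, out_to_src[node.predicate_col])
        return Select(pushed_filter, select.mapping)
    return node

plan = Filter(Select(Scan(("bfcol_0",)), (("bfcol_0", "x"),)), predicate_col="x")
print(pull_up_select(plan))
# Select(child=Filter(child=Scan(columns=('bfcol_0',)), predicate_col='bfcol_0'),
#        mapping=(('bfcol_0', 'x'),))

The real pass generalizes this swap to the other unary nodes handled above (order-by, slice, sample, filter, aggregate, explode, additive nodes) and applies it everywhere via nodes.bottom_up.
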
6 changes: 6 additions & 0 deletions bigframes/core/utils.py
@@ -148,6 +148,12 @@ def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str:
# first character must be letter or underscore
identifier = "_" + identifier

else:
# Even with flexible column names, there are constraints
# Convert illegal characters
# See: https://cloud.google.com/bigquery/docs/schemas#flexible-column-names
identifier = re.sub(r"[!\"$\(\)\*\,\./;\?@[\]^`{}~]", "_", identifier)

# Except in special circumstances (true anonymous query results tables),
# field names are not allowed to start with these (case-insensitive)
# prefixes.
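The new branch covers labels that are acceptable under BigQuery's flexible column names but still contain characters that are always illegal in field names; those characters are mapped to underscores. A standalone illustration of the same substitution (this snippet is not the bigframes helper itself):

import re

# Characters that remain illegal even under BigQuery flexible column names
# (see https://cloud.google.com/bigquery/docs/schemas#flexible-column-names);
# the same pattern as in the diff, applied to a few sample labels.
_ILLEGAL = re.compile(r"[!\"$\(\)\*\,\./;\?@\[\]^`{}~]")

for label in ["total.sales", "price ($)", "résumé", "a/b?c"]:
    print(label, "->", _ILLEGAL.sub("_", label))
# total.sales -> total_sales
# price ($) -> price ___
# résumé -> résumé   (non-ASCII letters are allowed and left as-is)
# a/b?c -> a_b_c
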
@@ -4,79 +4,47 @@ WITH `bfcte_1` AS (
FROM UNNEST(ARRAY<STRUCT<`bfcol_0` INT64, `bfcol_1` INT64, `bfcol_2` INT64, `bfcol_3` STRING, `bfcol_4` INT64>>[STRUCT(0, 123456789, 0, 'Hello, World!', 0), STRUCT(1, -987654321, 1, 'こんにちは', 1), STRUCT(2, 314159, 2, ' ¡Hola Mundo! ', 2), STRUCT(3, CAST(NULL AS INT64), 3, CAST(NULL AS STRING), 3), STRUCT(4, -234892, 4, 'Hello, World!', 4), STRUCT(5, 55555, 5, 'Güten Tag!', 5), STRUCT(6, 101202303, 6, 'capitalize, This ', 6), STRUCT(7, -214748367, 7, ' سلام', 7), STRUCT(8, 2, 8, 'T', 8)])
), `bfcte_3` AS (
SELECT
`bfcol_0` AS `bfcol_5`,
`bfcol_2` AS `bfcol_6`,
`bfcol_1` AS `bfcol_7`,
`bfcol_3` AS `bfcol_8`,
`bfcol_4` AS `bfcol_9`
*,
`bfcol_4` AS `bfcol_10`
FROM `bfcte_1`
), `bfcte_5` AS (
SELECT
*,
`bfcol_9` AS `bfcol_10`
FROM `bfcte_3`
), `bfcte_7` AS (
SELECT
`bfcol_5` AS `bfcol_11`,
`bfcol_6` AS `bfcol_12`,
`bfcol_7` AS `bfcol_13`,
`bfcol_8` AS `bfcol_14`,
`bfcol_10` AS `bfcol_15`
FROM `bfcte_5`
), `bfcte_9` AS (
SELECT
*,
0 AS `bfcol_16`
FROM `bfcte_7`
), `bfcte_10` AS (
FROM `bfcte_3`
), `bfcte_6` AS (
SELECT
`bfcol_11` AS `bfcol_17`,
`bfcol_12` AS `bfcol_18`,
`bfcol_13` AS `bfcol_19`,
`bfcol_14` AS `bfcol_20`,
`bfcol_0` AS `bfcol_17`,
`bfcol_2` AS `bfcol_18`,
`bfcol_1` AS `bfcol_19`,
`bfcol_3` AS `bfcol_20`,
`bfcol_16` AS `bfcol_21`,
`bfcol_15` AS `bfcol_22`
FROM `bfcte_9`
`bfcol_10` AS `bfcol_22`
FROM `bfcte_5`
), `bfcte_0` AS (
SELECT
*
FROM UNNEST(ARRAY<STRUCT<`bfcol_23` INT64, `bfcol_24` INT64, `bfcol_25` INT64, `bfcol_26` STRING, `bfcol_27` INT64>>[STRUCT(0, 123456789, 0, 'Hello, World!', 0), STRUCT(1, -987654321, 1, 'こんにちは', 1), STRUCT(2, 314159, 2, ' ¡Hola Mundo! ', 2), STRUCT(3, CAST(NULL AS INT64), 3, CAST(NULL AS STRING), 3), STRUCT(4, -234892, 4, 'Hello, World!', 4), STRUCT(5, 55555, 5, 'Güten Tag!', 5), STRUCT(6, 101202303, 6, 'capitalize, This ', 6), STRUCT(7, -214748367, 7, ' سلام', 7), STRUCT(8, 2, 8, 'T', 8)])
), `bfcte_2` AS (
SELECT
`bfcol_23` AS `bfcol_28`,
`bfcol_25` AS `bfcol_29`,
`bfcol_24` AS `bfcol_30`,
`bfcol_26` AS `bfcol_31`,
`bfcol_27` AS `bfcol_32`
*,
`bfcol_27` AS `bfcol_33`
FROM `bfcte_0`
), `bfcte_4` AS (
SELECT
*,
`bfcol_32` AS `bfcol_33`
1 AS `bfcol_39`
FROM `bfcte_2`
), `bfcte_6` AS (
), `bfcte_7` AS (
SELECT
`bfcol_28` AS `bfcol_34`,
`bfcol_29` AS `bfcol_35`,
`bfcol_30` AS `bfcol_36`,
`bfcol_31` AS `bfcol_37`,
`bfcol_33` AS `bfcol_38`
`bfcol_23` AS `bfcol_40`,
`bfcol_25` AS `bfcol_41`,
`bfcol_24` AS `bfcol_42`,
`bfcol_26` AS `bfcol_43`,
`bfcol_39` AS `bfcol_44`,
`bfcol_33` AS `bfcol_45`
FROM `bfcte_4`
), `bfcte_8` AS (
SELECT
*,
1 AS `bfcol_39`
FROM `bfcte_6`
), `bfcte_11` AS (
SELECT
`bfcol_34` AS `bfcol_40`,
`bfcol_35` AS `bfcol_41`,
`bfcol_36` AS `bfcol_42`,
`bfcol_37` AS `bfcol_43`,
`bfcol_39` AS `bfcol_44`,
`bfcol_38` AS `bfcol_45`
FROM `bfcte_8`
), `bfcte_12` AS (
SELECT
*
FROM (
@@ -87,7 +55,7 @@
bfcol_20 AS `bfcol_49`,
bfcol_21 AS `bfcol_50`,
bfcol_22 AS `bfcol_51`
FROM `bfcte_10`
FROM `bfcte_6`
UNION ALL
SELECT
bfcol_40 AS `bfcol_46`,
Expand All @@ -96,15 +64,15 @@ WITH `bfcte_1` AS (
bfcol_43 AS `bfcol_49`,
bfcol_44 AS `bfcol_50`,
bfcol_45 AS `bfcol_51`
FROM `bfcte_11`
FROM `bfcte_7`
)
)
SELECT
`bfcol_46` AS `rowindex`,
`bfcol_47` AS `rowindex_1`,
`bfcol_48` AS `int64_col`,
`bfcol_49` AS `string_col`
FROM `bfcte_12`
FROM `bfcte_8`
ORDER BY
`bfcol_50` ASC NULLS LAST,
`bfcol_51` ASC NULLS LAST