perf: Produce simpler sql #1836

Open · wants to merge 6 commits into base: main

1 change: 1 addition & 0 deletions bigframes/core/compile/compiler.py
@@ -65,6 +65,7 @@ def compile_sql(request: configs.CompileRequest) -> configs.CompileResult:
ordering: Optional[bf_ordering.RowOrdering] = result_node.order_by
result_node = dataclasses.replace(result_node, order_by=None)
result_node = cast(nodes.ResultNode, rewrites.column_pruning(result_node))
result_node = cast(nodes.ResultNode, rewrites.defer_selection(result_node))
sql = compile_result_node(result_node)
# Return the ordering iff no extra columns are needed to define the row order
if ordering is not None:
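For orientation, a minimal stand-in sketch of the ordering this hunk establishes in compile_sql: prune columns, then defer (pull up) selections, then emit SQL. Only the sequencing mirrors the diff; the stub bodies below are illustrative, not the bigframes implementation.

# Sketch only: stand-in rewrites showing the pipeline order this hunk sets up.
def column_pruning(node: dict) -> dict:      # stand-in for rewrites.column_pruning
    return node

def defer_selection(node: dict) -> dict:     # stand-in for rewrites.defer_selection (the new step)
    return node

def compile_result_node(node: dict) -> str:  # stand-in for the SQL emitter
    return "SELECT 1"

def compile_sql(result_node: dict) -> str:
    result_node = column_pruning(result_node)   # drop unused columns first
    result_node = defer_selection(result_node)  # then pull selections upward
    return compile_result_node(result_node)     # finally emit SQL
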
2 changes: 1 addition & 1 deletion bigframes/core/compile/googlesql/query.py
@@ -125,7 +125,7 @@ def sql(self) -> str:
return "\n".join(text)


@dataclasses.dataclass
@dataclasses.dataclass(frozen=True)
class SelectExpression(abc.SQLSyntax):
"""This class represents `select_expression`."""

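A note on the one-line change above: frozen=True makes dataclass instances immutable and hashable, so equal SelectExpression values can be compared, deduplicated, or used as set/dict keys. That this is the motivation here is an assumption, and the fields in the sketch below are placeholders rather than the real class definition.

import dataclasses

# Illustrative only: what frozen=True buys for a dataclass like SelectExpression.
@dataclasses.dataclass(frozen=True)
class SelectExpr:
    expression: str
    alias: str

a = SelectExpr("`bfcol_0`", "x")
b = SelectExpr("`bfcol_0`", "x")
assert a == b            # value equality
assert len({a, b}) == 1  # hashable, so duplicates collapse in sets
# a.alias = "y"          # would raise dataclasses.FrozenInstanceError
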
6 changes: 6 additions & 0 deletions bigframes/core/compile/sqlglot/compiler.py
@@ -87,6 +87,9 @@ def _compile_sql(self, request: configs.CompileRequest) -> configs.CompileResult
nodes.ResultNode, rewrite.column_pruning(result_node)
)
result_node = self._remap_variables(result_node)
result_node = typing.cast(
nodes.ResultNode, rewrite.defer_selection(result_node)
)
sql = self._compile_result_node(result_node)
return configs.CompileResult(
sql, result_node.schema.to_bigquery(), result_node.order_by
@@ -97,6 +100,9 @@ def _compile_sql(self, request: configs.CompileRequest) -> configs.CompileResult
result_node = typing.cast(nodes.ResultNode, rewrite.column_pruning(result_node))

result_node = self._remap_variables(result_node)
result_node = typing.cast(
nodes.ResultNode, rewrite.defer_selection(result_node)
)
sql = self._compile_result_node(result_node)
# Return the ordering iff no extra columns are needed to define the row order
if ordering is not None:
10 changes: 9 additions & 1 deletion bigframes/core/nodes.py
@@ -75,7 +75,7 @@ def additive_base(self) -> BigFrameNode:
...

@abc.abstractmethod
def replace_additive_base(self, BigFrameNode):
def replace_additive_base(self, BigFrameNode) -> BigFrameNode:
...


@@ -1568,6 +1568,10 @@ class ExplodeNode(UnaryNode):
# Offsets are generated only if this is non-null
offsets_col: Optional[identifiers.ColumnId] = None

def _validate(self):
for col in self.column_ids:
assert col.id in self.child.ids

@property
def row_preserving(self) -> bool:
return False
@@ -1646,6 +1650,10 @@ class ResultNode(UnaryNode):
limit: Optional[int] = None
# TODO: CTE definitions

def _validate(self):
for ref, name in self.output_cols:
assert ref.id in self.child.ids

@property
def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]:
return ()
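Both new _validate hooks assert the same invariant: every column a node references must be defined by its child. A tiny standalone sketch of that check, using plain strings instead of the real identifiers module:

# Toy version of the invariant asserted by the new _validate hooks.
def check_refs(referenced_ids: set, child_ids: set) -> None:
    for col_id in referenced_ids:
        assert col_id in child_ids, f"dangling column reference: {col_id}"

check_refs({"bfcol_0", "bfcol_1"}, {"bfcol_0", "bfcol_1", "bfcol_2"})  # passes
# check_refs({"bfcol_9"}, {"bfcol_0"})  # would fail the assertion
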
2 changes: 2 additions & 0 deletions bigframes/core/rewrite/__init__.py
@@ -22,6 +22,7 @@
try_reduce_to_local_scan,
try_reduce_to_table_scan,
)
from bigframes.core.rewrite.select_pullup import defer_selection
from bigframes.core.rewrite.slices import pull_out_limit, pull_up_limits, rewrite_slice
from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions
from bigframes.core.rewrite.windows import pull_out_window_order, rewrite_range_rolling
@@ -42,4 +43,5 @@
"try_reduce_to_local_scan",
"fold_row_counts",
"pull_out_window_order",
"defer_selection",
]
144 changes: 144 additions & 0 deletions bigframes/core/rewrite/select_pullup.py
@@ -0,0 +1,144 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses
from typing import cast

from bigframes.core import expression, nodes


def defer_selection(
root: nodes.BigFrameNode,
) -> nodes.BigFrameNode:
"""
Defers SelectionNode operations in the tree, pulling them up.

In many cases, these nodes will be merged or eliminated entirely, simplifying the overall tree.
"""
return nodes.bottom_up(root, pull_up_select)


def pull_up_select(node: nodes.BigFrameNode) -> nodes.BigFrameNode:
if isinstance(node, nodes.LeafNode):
return node
if isinstance(node, nodes.JoinNode):
return pull_up_selects_under_join(node)
if isinstance(node, nodes.ConcatNode):
return handle_selects_under_concat(node)
if isinstance(node, nodes.UnaryNode):
return pull_up_select_unary(node)
# shouldn't hit this, but not worth crashing over
return node


def pull_up_select_unary(node: nodes.UnaryNode) -> nodes.BigFrameNode:
child = node.child
if not isinstance(child, nodes.SelectionNode):
return node

# Schema-preserving nodes
if isinstance(
node,
(
nodes.ReversedNode,
nodes.OrderByNode,
nodes.SliceNode,
nodes.FilterNode,
nodes.RandomSampleNode,
),
):
pushed_down_node: nodes.BigFrameNode = node.remap_refs(
{id: ref.id for ref, id in child.input_output_pairs}
).replace_child(child.child)
pulled_up_select = cast(
nodes.SelectionNode, child.replace_child(pushed_down_node)
)
return pulled_up_select
elif isinstance(
node,
(
nodes.SelectionNode,
nodes.ResultNode,
),
):
return node.remap_refs(
{id: ref.id for ref, id in child.input_output_pairs}
).replace_child(child.child)
elif isinstance(node, nodes.AggregateNode):
pushed_down_agg = node.remap_refs(
{id: ref.id for ref, id in child.input_output_pairs}
).replace_child(child.child)
new_selection = tuple(
nodes.AliasedRef.identity(id).remap_refs(
{id: ref.id for ref, id in child.input_output_pairs}
)
for id in node.ids
)
return nodes.SelectionNode(pushed_down_agg, new_selection)
elif isinstance(node, nodes.ExplodeNode):
pushed_down_node = node.remap_refs(
{id: ref.id for ref, id in child.input_output_pairs}
).replace_child(child.child)
pulled_up_select = cast(
nodes.SelectionNode, child.replace_child(pushed_down_node)
)
if node.offsets_col:
pulled_up_select = dataclasses.replace(
pulled_up_select,
input_output_pairs=(
*pulled_up_select.input_output_pairs,
nodes.AliasedRef(
expression.DerefOp(node.offsets_col), node.offsets_col
),
),
)
return pulled_up_select
elif isinstance(node, nodes.AdditiveNode):
pushed_down_node = node.replace_additive_base(child.child).remap_refs(
{id: ref.id for ref, id in child.input_output_pairs}
)
new_selection = (
*child.input_output_pairs,
*(
nodes.AliasedRef(expression.DerefOp(col.id), col.id)
for col in node.added_fields
),
)
pulled_up_select = dataclasses.replace(
child, child=pushed_down_node, input_output_pairs=new_selection
)
return pulled_up_select
# shouldn't hit this, but not worth crashing over
return node


def pull_up_selects_under_join(node: nodes.JoinNode) -> nodes.JoinNode:
# Selects could in theory be pulled up here too, but doing so is risky, in particular for self-joins, when there are more transforms still to apply.
# TODO: Safely pull up selects above join
return node


def handle_selects_under_concat(node: nodes.ConcatNode) -> nodes.ConcatNode:
new_children = []
for child in node.child_nodes:
# remove select if no-op
if not isinstance(child, nodes.SelectionNode):
new_children.append(child)
else:
inputs = tuple(ref.id for ref in child.input_output_pairs)
if inputs == tuple(child.child.ids):
new_children.append(child.child)
else:
new_children.append(child)
return dataclasses.replace(node, children=tuple(new_children))
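
To make the rewrite concrete, below is a small self-contained sketch of the pull-up idea on a toy plan, assuming simplified stand-in node classes (Scan/Select/Filter are illustrative, not bigframes types): a renaming-only projection underneath a filter is hoisted above it, so the filter reads the underlying columns and the projection can later merge with, or be absorbed by, the final select.

from dataclasses import dataclass
from typing import Tuple

# Toy plan nodes; illustrative stand-ins, not bigframes node types.
@dataclass(frozen=True)
class Scan:
    columns: Tuple[str, ...]

@dataclass(frozen=True)
class Select:
    child: object
    mapping: Tuple[Tuple[str, str], ...]  # (source column, output name) pairs

@dataclass(frozen=True)
class Filter:
    child: object
    predicate_col: str  # refers to a column name of its child

def pull_up_select(node):
    """If a Filter sits on a Select, swap them: remap the filter's column
    reference to the pre-rename name and hoist the Select above the Filter."""
    if isinstance(node, Filter) and isinstance(node.child, Select):
        select = node.child
        out_to_src = {out: src for src, out in select.mapping}
        pushed_filter = Filter(select.child, out_to_src[node.predicate_col])
        return Select(pushed_filter, select.mapping)
    return node

plan = Filter(Select(Scan(("bfcol_0",)), (("bfcol_0", "x"),)), predicate_col="x")
print(pull_up_select(plan))
# Select(child=Filter(child=Scan(columns=('bfcol_0',)), predicate_col='bfcol_0'),
#        mapping=(('bfcol_0', 'x'),))

The real pass generalizes this swap to the other unary nodes handled above (order-by, slice, sample, filter, aggregate, explode, additive nodes) and applies it everywhere via nodes.bottom_up.
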
6 changes: 6 additions & 0 deletions bigframes/core/utils.py
@@ -148,6 +148,12 @@ def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str:
# first character must be letter or underscore
identifier = "_" + identifier

else:
# Even with flexible column names, there are constraints
# Convert illegal characters
# See: https://cloud.google.com/bigquery/docs/schemas#flexible-column-names
identifier = re.sub(r"[!\"$\(\)\*\,\./;\?@[\]^`{}~]", "_", identifier)

# Except in special circumstances (true anonymous query results tables),
# field names are not allowed to start with these (case-insensitive)
# prefixes.
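The new branch covers labels that are acceptable under BigQuery's flexible column names but still contain characters that are always illegal in field names; those characters are mapped to underscores. A standalone illustration of the same substitution (this snippet is not the bigframes helper itself):

import re

# Characters that remain illegal even under BigQuery flexible column names
# (see https://cloud.google.com/bigquery/docs/schemas#flexible-column-names);
# the same pattern as in the diff, applied to a few sample labels.
_ILLEGAL = re.compile(r"[!\"$\(\)\*\,\./;\?@\[\]^`{}~]")

for label in ["total.sales", "price ($)", "résumé", "a/b?c"]:
    print(label, "->", _ILLEGAL.sub("_", label))
# total.sales -> total_sales
# price ($) -> price ___
# résumé -> résumé   (non-ASCII letters are allowed and left as-is)
# a/b?c -> a_b_c
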
@@ -4,79 +4,47 @@ WITH `bfcte_1` AS (
FROM UNNEST(ARRAY<STRUCT<`bfcol_0` INT64, `bfcol_1` INT64, `bfcol_2` INT64, `bfcol_3` STRING, `bfcol_4` INT64>>[STRUCT(0, 123456789, 0, 'Hello, World!', 0), STRUCT(1, -987654321, 1, 'こんにちは', 1), STRUCT(2, 314159, 2, ' ¡Hola Mundo! ', 2), STRUCT(3, CAST(NULL AS INT64), 3, CAST(NULL AS STRING), 3), STRUCT(4, -234892, 4, 'Hello, World!', 4), STRUCT(5, 55555, 5, 'Güten Tag!', 5), STRUCT(6, 101202303, 6, 'capitalize, This ', 6), STRUCT(7, -214748367, 7, ' سلام', 7), STRUCT(8, 2, 8, 'T', 8)])
), `bfcte_3` AS (
SELECT
`bfcol_0` AS `bfcol_5`,
`bfcol_2` AS `bfcol_6`,
`bfcol_1` AS `bfcol_7`,
`bfcol_3` AS `bfcol_8`,
`bfcol_4` AS `bfcol_9`
*,
`bfcol_4` AS `bfcol_10`
FROM `bfcte_1`
), `bfcte_5` AS (
SELECT
*,
`bfcol_9` AS `bfcol_10`
FROM `bfcte_3`
), `bfcte_7` AS (
SELECT
`bfcol_5` AS `bfcol_11`,
`bfcol_6` AS `bfcol_12`,
`bfcol_7` AS `bfcol_13`,
`bfcol_8` AS `bfcol_14`,
`bfcol_10` AS `bfcol_15`
FROM `bfcte_5`
), `bfcte_9` AS (
SELECT
*,
0 AS `bfcol_16`
FROM `bfcte_7`
), `bfcte_10` AS (
FROM `bfcte_3`
), `bfcte_6` AS (
SELECT
`bfcol_11` AS `bfcol_17`,
`bfcol_12` AS `bfcol_18`,
`bfcol_13` AS `bfcol_19`,
`bfcol_14` AS `bfcol_20`,
`bfcol_0` AS `bfcol_17`,
`bfcol_2` AS `bfcol_18`,
`bfcol_1` AS `bfcol_19`,
`bfcol_3` AS `bfcol_20`,
`bfcol_16` AS `bfcol_21`,
`bfcol_15` AS `bfcol_22`
FROM `bfcte_9`
`bfcol_10` AS `bfcol_22`
FROM `bfcte_5`
), `bfcte_0` AS (
SELECT
*
FROM UNNEST(ARRAY<STRUCT<`bfcol_23` INT64, `bfcol_24` INT64, `bfcol_25` INT64, `bfcol_26` STRING, `bfcol_27` INT64>>[STRUCT(0, 123456789, 0, 'Hello, World!', 0), STRUCT(1, -987654321, 1, 'こんにちは', 1), STRUCT(2, 314159, 2, ' ¡Hola Mundo! ', 2), STRUCT(3, CAST(NULL AS INT64), 3, CAST(NULL AS STRING), 3), STRUCT(4, -234892, 4, 'Hello, World!', 4), STRUCT(5, 55555, 5, 'Güten Tag!', 5), STRUCT(6, 101202303, 6, 'capitalize, This ', 6), STRUCT(7, -214748367, 7, ' سلام', 7), STRUCT(8, 2, 8, 'T', 8)])
), `bfcte_2` AS (
SELECT
`bfcol_23` AS `bfcol_28`,
`bfcol_25` AS `bfcol_29`,
`bfcol_24` AS `bfcol_30`,
`bfcol_26` AS `bfcol_31`,
`bfcol_27` AS `bfcol_32`
*,
`bfcol_27` AS `bfcol_33`
FROM `bfcte_0`
), `bfcte_4` AS (
SELECT
*,
`bfcol_32` AS `bfcol_33`
1 AS `bfcol_39`
FROM `bfcte_2`
), `bfcte_6` AS (
), `bfcte_7` AS (
SELECT
`bfcol_28` AS `bfcol_34`,
`bfcol_29` AS `bfcol_35`,
`bfcol_30` AS `bfcol_36`,
`bfcol_31` AS `bfcol_37`,
`bfcol_33` AS `bfcol_38`
`bfcol_23` AS `bfcol_40`,
`bfcol_25` AS `bfcol_41`,
`bfcol_24` AS `bfcol_42`,
`bfcol_26` AS `bfcol_43`,
`bfcol_39` AS `bfcol_44`,
`bfcol_33` AS `bfcol_45`
FROM `bfcte_4`
), `bfcte_8` AS (
SELECT
*,
1 AS `bfcol_39`
FROM `bfcte_6`
), `bfcte_11` AS (
SELECT
`bfcol_34` AS `bfcol_40`,
`bfcol_35` AS `bfcol_41`,
`bfcol_36` AS `bfcol_42`,
`bfcol_37` AS `bfcol_43`,
`bfcol_39` AS `bfcol_44`,
`bfcol_38` AS `bfcol_45`
FROM `bfcte_8`
), `bfcte_12` AS (
SELECT
*
FROM (
@@ -87,7 +55,7 @@
bfcol_20 AS `bfcol_49`,
bfcol_21 AS `bfcol_50`,
bfcol_22 AS `bfcol_51`
FROM `bfcte_10`
FROM `bfcte_6`
UNION ALL
SELECT
bfcol_40 AS `bfcol_46`,
Expand All @@ -96,15 +64,15 @@ WITH `bfcte_1` AS (
bfcol_43 AS `bfcol_49`,
bfcol_44 AS `bfcol_50`,
bfcol_45 AS `bfcol_51`
FROM `bfcte_11`
FROM `bfcte_7`
)
)
SELECT
`bfcol_46` AS `rowindex`,
`bfcol_47` AS `rowindex_1`,
`bfcol_48` AS `int64_col`,
`bfcol_49` AS `string_col`
FROM `bfcte_12`
FROM `bfcte_8`
ORDER BY
`bfcol_50` ASC NULLS LAST,
`bfcol_51` ASC NULLS LAST