Skip to content

perf: Use CTEs more aggressively for smaller sql #1883

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions bigframes/core/compile/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ def compile_sql(request: configs.CompileRequest) -> configs.CompileResult:
# Need to do this before replacing unsupported ops, as that will rewrite slice ops
result_node = rewrites.pull_up_limits(result_node)
result_node = _replace_unsupported_ops(result_node)

# must extract ctes before column pruning, which pushes constraints down
result_node = cast(nodes.ResultNode, rewrites.extract_ctes(result_node))

# prune before pulling up order to avoid unnecessary row_number() ops
result_node = cast(nodes.ResultNode, rewrites.column_pruning(result_node))
result_node = rewrites.defer_order(
Expand Down Expand Up @@ -284,3 +288,10 @@ def compile_explode(node: nodes.ExplodeNode, child: compiled.UnorderedIR):
@_compile_node.register
def compile_random_sample(node: nodes.RandomSampleNode, child: compiled.UnorderedIR):
    """Compile a ``RandomSampleNode`` against its already-compiled child.

    Delegates to the child's ``_uniform_sampling`` with the node's
    ``fraction`` and returns whatever that call produces.
    """
    sample_fraction = node.fraction
    return child._uniform_sampling(sample_fraction)


@_compile_node.register
def compile_cte_node(node: nodes.CteNode, child: compiled.UnorderedIR):
    """Compile a ``CteNode`` by handing back its compiled child untouched.

    The node is only an optimization barrier for the ibis compiler; ibis
    itself identifies CTE candidates and extracts them, so nothing extra is
    emitted here.
    """
    return child
30 changes: 30 additions & 0 deletions bigframes/core/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1235,6 +1235,36 @@ def remap_refs(
return dataclasses.replace(self, input_output_pairs=new_fields) # type: ignore


@dataclasses.dataclass(frozen=True, eq=False)
class CteNode(UnaryNode):
    """Pass-through unary node that wraps a child without altering its output.

    Fields and row count are forwarded from the child, and the node defines
    no column ids of its own.
    """

    @property
    def row_count(self) -> Optional[int]:
        """Row count of the wrapped child, forwarded as-is."""
        return self.child.row_count

    @property
    def fields(self) -> Sequence[Field]:
        """Fields of the wrapped child, forwarded as-is."""
        return self.child.fields

    @property
    def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]:
        """Always empty: this node defines no column ids."""
        return ()

    @property
    def variables_introduced(self) -> int:
        """Always zero: the child's fields are exposed without new variables."""
        return 0

    def remap_refs(
        self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId]
    ) -> CteNode:
        """Return this node unchanged; ``mappings`` is not consulted."""
        return self

    def remap_vars(
        self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId]
    ) -> CteNode:
        """Return this node unchanged; ``mappings`` is not consulted."""
        return self


@dataclasses.dataclass(frozen=True, eq=False)
class ProjectionNode(UnaryNode, AdditiveNode):
"""Assigns new variables (without modifying existing ones)"""
Expand Down
2 changes: 2 additions & 0 deletions bigframes/core/rewrite/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from bigframes.core.rewrite.exract_ctes import extract_ctes
from bigframes.core.rewrite.fold_row_count import fold_row_counts
from bigframes.core.rewrite.identifiers import remap_variables
from bigframes.core.rewrite.implicit_align import try_row_join
Expand Down Expand Up @@ -44,4 +45,5 @@
"fold_row_counts",
"pull_out_window_order",
"defer_selection",
"extract_ctes",
]
36 changes: 36 additions & 0 deletions bigframes/core/rewrite/exract_ctes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

from collections import defaultdict

from bigframes.core import nodes


def extract_ctes(root: nodes.BigFrameNode) -> nodes.BigFrameNode:
    """Mark nodes that are referenced by more than one edge as CTE candidates.

    Counts how many times each node appears as the child end of an edge
    yielded by ``root.edges()``, then rewrites the tree with ``top_down``,
    wrapping every node counted more than once in a ``nodes.CteNode``.

    Nodes are marked in place rather than pulled out of the tree; pulling
    them out would require extracting bottom-up.

    Args:
        root: Root of the tree to scan and rewrite.

    Returns:
        The result of ``root.top_down`` applied with the marking transform.
    """
    # Tally incoming edges per node; more than one marks a candidate.
    incoming_edges: dict[nodes.BigFrameNode, int] = defaultdict(int)
    for _, referenced in root.edges():
        incoming_edges[referenced] += 1

    # NOTE(review): the wrapped node becomes the child of the new CteNode and
    # still has a count above one -- confirm top_down does not wrap it again.
    def mark_if_shared(node: nodes.BigFrameNode) -> nodes.BigFrameNode:
        is_shared = incoming_edges[node] > 1
        return nodes.CteNode(node) if is_shared else node

    return root.top_down(mark_if_shared)
3 changes: 3 additions & 0 deletions bigframes/core/rewrite/order.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ def pull_up_order_inner(
elif isinstance(node, bigframes.core.nodes.ProjectionNode):
child_result, child_order = pull_up_order_inner(node.child)
return node.replace_child(child_result), child_order
elif isinstance(node, bigframes.core.nodes.CteNode):
child_result, child_order = pull_up_order_inner(node.child)
return node.replace_child(child_result), child_order
elif isinstance(node, bigframes.core.nodes.JoinNode):
if node.propogate_order:
return pull_order_join(node)
Expand Down