Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 28 additions & 34 deletions datafusion/core/tests/dataframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2744,23 +2744,20 @@ async fn test_count_wildcard_on_where_exist() -> Result<()> {

assert_snapshot!(
pretty_format_batches(&sql_results).unwrap(),
@r###"
+---------------+---------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------+
| logical_plan | LeftSemi Join: |
| | TableScan: t1 projection=[a, b] |
| | SubqueryAlias: __correlated_sq_1 |
| | Projection: |
| | Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] |
| | TableScan: t2 projection=[] |
| physical_plan | NestedLoopJoinExec: join_type=RightSemi |
| | ProjectionExec: expr=[] |
| | PlaceholderRowExec |
| | DataSourceExec: partitions=1, partition_sizes=[1] |
| | |
+---------------+---------------------------------------------------------+
"###
@r"
+---------------+-----------------------------------------------------+
| plan_type | plan |
+---------------+-----------------------------------------------------+
| logical_plan | LeftSemi Join: |
| | TableScan: t1 projection=[a, b] |
| | SubqueryAlias: __correlated_sq_1 |
| | EmptyRelation |
| physical_plan | NestedLoopJoinExec: join_type=RightSemi |
| | PlaceholderRowExec |
| | DataSourceExec: partitions=1, partition_sizes=[1] |
| | |
+---------------+-----------------------------------------------------+
"
);

let df_results = ctx
Expand All @@ -2783,23 +2780,20 @@ async fn test_count_wildcard_on_where_exist() -> Result<()> {

assert_snapshot!(
pretty_format_batches(&df_results).unwrap(),
@r###"
+---------------+---------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------+
| logical_plan | LeftSemi Join: |
| | TableScan: t1 projection=[a, b] |
| | SubqueryAlias: __correlated_sq_1 |
| | Projection: |
| | Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] |
| | TableScan: t2 projection=[] |
| physical_plan | NestedLoopJoinExec: join_type=RightSemi |
| | ProjectionExec: expr=[] |
| | PlaceholderRowExec |
| | DataSourceExec: partitions=1, partition_sizes=[1] |
| | |
+---------------+---------------------------------------------------------------------+
"###
@r"
+---------------+-----------------------------------------------------+
| plan_type | plan |
+---------------+-----------------------------------------------------+
| logical_plan | LeftSemi Join: |
| | TableScan: t1 projection=[a, b] |
| | SubqueryAlias: __correlated_sq_1 |
| | EmptyRelation |
| physical_plan | NestedLoopJoinExec: join_type=RightSemi |
| | PlaceholderRowExec |
| | DataSourceExec: partitions=1, partition_sizes=[1] |
| | |
+---------------+-----------------------------------------------------+
"
);

Ok(())
Expand Down
5 changes: 4 additions & 1 deletion datafusion/expr/src/logical_plan/plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3522,7 +3522,10 @@ impl Aggregate {
) -> Result<Self> {
if group_expr.is_empty() && aggr_expr.is_empty() {
return plan_err!(
"Aggregate requires at least one grouping or aggregate expression"
"Aggregate requires at least one grouping or aggregate expression. \
Aggregate without grouping expressions nor aggregate expressions is \
logically equivalent to, but less efficient than, VALUES producing \
single row. Please use VALUES instead."
);
}
let group_expr_count = grouping_set_expr_count(&group_expr)?;
Expand Down
37 changes: 14 additions & 23 deletions datafusion/optimizer/src/optimize_projections/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@ use std::sync::Arc;

use datafusion_common::{
get_required_group_by_exprs_indices, internal_datafusion_err, internal_err, Column,
HashMap, JoinType, Result,
DFSchema, HashMap, JoinType, Result,
};
use datafusion_expr::expr::Alias;
use datafusion_expr::{
logical_plan::LogicalPlan, Aggregate, Distinct, Expr, Projection, TableScan, Unnest,
Window,
logical_plan::LogicalPlan, Aggregate, Distinct, EmptyRelation, Expr, Projection,
TableScan, Unnest, Window,
};

use crate::optimize_projections::required_indices::RequiredIndices;
Expand Down Expand Up @@ -153,23 +153,16 @@ fn optimize_projections(

// Only use the absolutely necessary aggregate expressions required
// by the parent:
let mut new_aggr_expr = aggregate_reqs.get_at_indices(&aggregate.aggr_expr);

// Aggregations always need at least one aggregate expression.
// With a nested count, we don't require any column as input, but
// still need to create a correct aggregate, which may be optimized
// out later. As an example, consider the following query:
//
// SELECT count(*) FROM (SELECT count(*) FROM [...])
//
// which always returns 1.
if new_aggr_expr.is_empty()
&& new_group_bys.is_empty()
&& !aggregate.aggr_expr.is_empty()
{
// take the old, first aggregate expression
new_aggr_expr = aggregate.aggr_expr;
new_aggr_expr.resize_with(1, || unreachable!());
let new_aggr_expr = aggregate_reqs.get_at_indices(&aggregate.aggr_expr);

if new_group_bys.is_empty() && new_aggr_expr.is_empty() {
// Global aggregation with no aggregate functions always produces 1 row and no columns.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this seems like a good rule to have, though perhaps it would be eaiser to find if it were with related rules for aggregates, for example

https://github.com/apache/datafusion/blob/main/datafusion/optimizer/src/eliminate_group_by_constant.rs

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO here is a natural place to place this logic. We're pruning and we need to somehow handle the case where we prune the last aggregate function. The code should do something reasonable and now it does.
alternative is to let this place create Agg with empty group by and no aggregate functions (I had so initially), and then have a separate rule that finds such trivial Aggs and replaces with VALUES.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keeping it here just makes sense

return Ok(Transformed::yes(LogicalPlan::EmptyRelation(
EmptyRelation {
produce_one_row: true,
schema: Arc::new(DFSchema::empty()),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not related to this PR, but why we need schema for EmptyRelation, the structure description presumes the schema always empty?

/// Produces no rows: An empty relation with an empty schema
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct EmptyRelation

We probably can simplify using this relationship

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

UPD: actually the description is not accurate, EmptyRelation used with non empty schema in joins and some other places, we need to update the description

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we don't produce any symbols, but maybe we use EmptyRelation also for other cases?
For exmple SELECT a, b, c FROM t WHERE false could be replaced with EmptyRelation, but something needs to project a, b, c symbols. It can be a project above EmptyRelation or part of EmptyRelation itself.

A different question -- why do we have both, EmptyRelation and Values?
The latter is strictly more generic, without any visible downsides. It even has a better name (EmptyRelation is only conditionally an empty relation).
Can we deprecate EmptyRelation and replace with Values?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Both structs have much in common, but EmptyRelation also serves for producing 0 rows, not sure if that can work for Values?

Although EmptyRelation with produce_one_row == true that probably feasible to replace with Values 🤔

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Both structs have much in common, but EmptyRelation also serves for producing 0 rows, not sure if that can work for Values?

Why not?
SQL doesn't allow empty values, but I see no reason for LP not to allow them.

},
)));
}

let all_exprs_iter = new_group_bys.iter().chain(new_aggr_expr.iter());
Expand Down Expand Up @@ -1146,9 +1139,7 @@ mod tests {
plan,
@r"
Aggregate: groupBy=[[]], aggr=[[count(Int32(1))]]
Projection:
Aggregate: groupBy=[[]], aggr=[[count(Int32(1))]]
TableScan: ?table? projection=[]
EmptyRelation
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

table scan gets replaced with 1-row VALUES (that would be more visible if we merge #17145)

we could further eliminate count(*) on top of a relation with known cardinality. follow-up potential

"
)
}
Expand Down
7 changes: 2 additions & 5 deletions datafusion/sqllogictest/test_files/explain.slt
Original file line number Diff line number Diff line change
Expand Up @@ -429,14 +429,11 @@ logical_plan
01)LeftSemi Join:
02)--TableScan: t1 projection=[a]
03)--SubqueryAlias: __correlated_sq_1
04)----Projection:
05)------Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
06)--------TableScan: t2 projection=[]
04)----EmptyRelation
physical_plan
01)NestedLoopJoinExec: join_type=LeftSemi
02)--DataSourceExec: partitions=1, partition_sizes=[0]
03)--ProjectionExec: expr=[]
04)----PlaceholderRowExec
03)--PlaceholderRowExec

statement ok
drop table t1;
Expand Down
7 changes: 2 additions & 5 deletions datafusion/sqllogictest/test_files/explain_tree.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1263,14 +1263,11 @@ physical_plan
04)│ join_type: LeftSemi │ │
05)└─────────────┬─────────────┘ │
06)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
07)│ DataSourceExec ││ ProjectionExec
07)│ DataSourceExec ││ PlaceholderRowExec
08)│ -------------------- ││ │
09)│ files: 1 ││ │
10)│ format: csv ││ │
11)└───────────────────────────┘└─────────────┬─────────────┘
12)-----------------------------┌─────────────┴─────────────┐
13)-----------------------------│ PlaceholderRowExec │
14)-----------------------------└───────────────────────────┘
11)└───────────────────────────┘└───────────────────────────┘

# Query with cross join.
query TT
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/expr/date_part.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1089,4 +1089,4 @@ SELECT EXTRACT("isodow" FROM to_timestamp('2020-09-08T12:00:00+00:00'))
query I
SELECT EXTRACT('isodow' FROM to_timestamp('2020-09-08T12:00:00+00:00'))
----
1
1
36 changes: 36 additions & 0 deletions datafusion/sqllogictest/test_files/issue_17138.slt
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
statement ok
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I recommend we name this file more specifically, or put it in an existing function, perhaps aggregate.slt ?

Not a blocker, just a suggestion

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This showed up as a regression so i named this file as a regression test.
We have regression tests stucked in other files, but this creates risk of inadvertently changing the scenario and so thus breaking the regression properties of a test. They are better of isolated.
Would it help if this file is moved under a directory for just regression cases?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, maybe that would be better. But if there is already precedent for regression cases perhaps we can leave this PR as is and move it in a subsequent PR

CREATE TABLE tab1(col0 INTEGER, col1 INTEGER, col2 INTEGER)

statement ok
INSERT INTO tab1 VALUES(51,14,96)

query R
SELECT NULL * AVG(DISTINCT 4) + SUM(col1) AS col0 FROM tab1
----
NULL

query TT
EXPLAIN SELECT NULL * AVG(DISTINCT 4) + SUM(col1) AS col0 FROM tab1
----
logical_plan
01)Projection: Float64(NULL) AS col0
02)--EmptyRelation
physical_plan
01)ProjectionExec: expr=[NULL as col0]
02)--PlaceholderRowExec

# Similar, with a few more arithmetic operations
query R
SELECT + CAST ( NULL AS INTEGER ) * + + AVG ( DISTINCT 4 ) + - SUM ( ALL + col1 ) AS col0 FROM tab1
----
NULL

query TT
EXPLAIN SELECT + CAST ( NULL AS INTEGER ) * + + AVG ( DISTINCT 4 ) + - SUM ( ALL + col1 ) AS col0 FROM tab1
----
logical_plan
01)Projection: Float64(NULL) AS col0
02)--EmptyRelation
physical_plan
01)ProjectionExec: expr=[NULL as col0]
02)--PlaceholderRowExec
Original file line number Diff line number Diff line change
Expand Up @@ -73,4 +73,3 @@ query I
SELECT getbit(11, NULL);
----
NULL

4 changes: 1 addition & 3 deletions datafusion/sqllogictest/test_files/subquery.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1453,9 +1453,7 @@ logical_plan
01)LeftSemi Join:
02)--TableScan: t1 projection=[a]
03)--SubqueryAlias: __correlated_sq_1
04)----Projection:
05)------Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
06)--------TableScan: t2 projection=[]
04)----EmptyRelation
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems like an improvement to me


statement count 0
drop table t1;
Expand Down