diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 34cf0477160c8..57ddd392e180a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -808,8 +808,7 @@ object CollapseRepartition extends Rule[LogicalPlan] { } /** - * Substitute the aggregate expression which uses [[First]] as the aggregate function - * in the window with the window function [[NthValue]]. + * Replaces first(col) to nth_value(col, 1) for better performance. */ object OptimizeWindowFunctions extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeWindowFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeWindowFunctionsSuite.scala index 1449d41b22f34..c89208dce45d6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeWindowFunctionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeWindowFunctionsSuite.scala @@ -31,21 +31,35 @@ class OptimizeWindowFunctionsSuite extends PlanTest { OptimizeWindowFunctions) :: Nil } - test("check OptimizeWindowFunctions") { - val testRelation = LocalRelation('a.double, 'b.double, 'c.string) - val a = testRelation.output.head + val testRelation = LocalRelation('a.double, 'b.double, 'c.string) + val a = testRelation.output(0) + val b = testRelation.output(1) + val c = testRelation.output(2) + + test("replace first(col) by nth_value(col, 1) if the window frame is ordered") { val inputPlan = testRelation.select( WindowExpression( First(a, false).toAggregateExpression(), - WindowSpecDefinition(Nil, a.asc :: Nil, + WindowSpecDefinition(b :: Nil, c.asc :: Nil, SpecifiedWindowFrame(RowFrame, UnboundedPreceding, CurrentRow)))) val correctAnswer = testRelation.select( WindowExpression( NthValue(a, Literal(1), false), - WindowSpecDefinition(Nil, a.asc :: Nil, + WindowSpecDefinition(b :: Nil, c.asc :: Nil, SpecifiedWindowFrame(RowFrame, UnboundedPreceding, CurrentRow)))) val optimized = Optimize.execute(inputPlan) assert(optimized == correctAnswer) } + + test("can't replace first(col) by nth_value(col, 1) if the window frame isn't ordered") { + val inputPlan = testRelation.select( + WindowExpression( + First(a, false).toAggregateExpression(), + WindowSpecDefinition(b :: Nil, Nil, + SpecifiedWindowFrame(RowFrame, UnboundedPreceding, CurrentRow)))) + + val optimized = Optimize.execute(inputPlan) + assert(optimized == inputPlan) + } } diff --git a/sql/core/src/test/resources/sql-tests/inputs/window.sql b/sql/core/src/test/resources/sql-tests/inputs/window.sql index 83ab4e2e7bbf5..f5223af9125f6 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/window.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/window.sql @@ -150,130 +150,104 @@ FROM testData ORDER BY cate, val; SELECT employee_name, salary, - first_value(employee_name) OVER (ORDER BY salary DESC) highest_salary, - nth_value(employee_name, 2) OVER (ORDER BY salary DESC) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary DESC) ORDER BY salary DESC; SELECT employee_name, salary, - first_value(employee_name) OVER ( - ORDER BY salary DESC - RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) highest_salary, - nth_value(employee_name, 2) OVER ( - ORDER BY salary DESC - RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary DESC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) ORDER BY salary DESC; SELECT employee_name, salary, - first_value(employee_name) OVER ( - ORDER BY salary DESC - ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) highest_salary, - nth_value(employee_name, 2) OVER ( - ORDER BY salary DESC - ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) ORDER BY salary DESC; SELECT employee_name, salary, - first_value(employee_name) OVER ( - ORDER BY salary - RANGE BETWEEN 2000 PRECEDING AND 1000 FOLLOWING) highest_salary, - nth_value(employee_name, 2) OVER ( - ORDER BY salary - RANGE BETWEEN 2000 PRECEDING AND 1000 FOLLOWING) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary RANGE BETWEEN 2000 PRECEDING AND 1000 FOLLOWING) ORDER BY salary; SELECT employee_name, salary, - first_value(employee_name) OVER ( - ORDER BY salary DESC - ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING) highest_salary, - nth_value(employee_name, 2) OVER ( - ORDER BY salary DESC - ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary DESC ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING) ORDER BY salary DESC; SELECT employee_name, salary, - first_value(employee_name) OVER ( - ORDER BY salary DESC - RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) highest_salary, - nth_value(employee_name, 2) OVER ( - ORDER BY salary DESC - RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary DESC RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) ORDER BY salary DESC; SELECT employee_name, salary, - first_value(employee_name) OVER ( - ORDER BY salary DESC - RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) highest_salary, - nth_value(employee_name, 2) OVER ( - ORDER BY salary DESC - RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary DESC RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) ORDER BY salary DESC; SELECT employee_name, salary, - first_value(employee_name) OVER ( - ORDER BY salary DESC - ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) highest_salary, - nth_value(employee_name, 2) OVER ( - ORDER BY salary DESC - ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) ORDER BY salary DESC; SELECT employee_name, salary, - first_value(employee_name) OVER ( - ORDER BY salary DESC - ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) highest_salary, - nth_value(employee_name, 2) OVER ( - ORDER BY salary DESC - ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) ORDER BY salary DESC; SELECT employee_name, department, salary, - FIRST_VALUE(employee_name) OVER ( - PARTITION BY department - ORDER BY salary DESC - RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING - ) highest_salary, - NTH_VALUE(employee_name, 2) OVER ( - PARTITION BY department - ORDER BY salary DESC - RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING - ) second_highest_salary + FIRST_VALUE(employee_name) OVER w highest_salary, + NTH_VALUE(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS ( + PARTITION BY department + ORDER BY salary DESC + RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING +) ORDER BY department; \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/results/window.sql.out b/sql/core/src/test/resources/sql-tests/results/window.sql.out index b3d0a47c74243..1304dcf21d0b3 100644 --- a/sql/core/src/test/resources/sql-tests/results/window.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/window.sql.out @@ -421,10 +421,11 @@ window aggregate function with filter predicate is not supported yet.; SELECT employee_name, salary, - first_value(employee_name) OVER (ORDER BY salary DESC) highest_salary, - nth_value(employee_name, 2) OVER (ORDER BY salary DESC) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary DESC) ORDER BY salary DESC -- !query schema struct @@ -452,14 +453,11 @@ Leslie Thompson 5186 Larry Bott Gerard Bondur SELECT employee_name, salary, - first_value(employee_name) OVER ( - ORDER BY salary DESC - RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) highest_salary, - nth_value(employee_name, 2) OVER ( - ORDER BY salary DESC - RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary DESC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) ORDER BY salary DESC -- !query schema struct @@ -487,14 +485,11 @@ Leslie Thompson 5186 Larry Bott Gerard Bondur SELECT employee_name, salary, - first_value(employee_name) OVER ( - ORDER BY salary DESC - ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) highest_salary, - nth_value(employee_name, 2) OVER ( - ORDER BY salary DESC - ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) ORDER BY salary DESC -- !query schema struct @@ -522,14 +517,11 @@ Leslie Thompson 5186 Larry Bott Gerard Bondur SELECT employee_name, salary, - first_value(employee_name) OVER ( - ORDER BY salary - RANGE BETWEEN 2000 PRECEDING AND 1000 FOLLOWING) highest_salary, - nth_value(employee_name, 2) OVER ( - ORDER BY salary - RANGE BETWEEN 2000 PRECEDING AND 1000 FOLLOWING) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary RANGE BETWEEN 2000 PRECEDING AND 1000 FOLLOWING) ORDER BY salary -- !query schema struct @@ -557,14 +549,11 @@ Larry Bott 11798 Mary Patterson Loui Bondur SELECT employee_name, salary, - first_value(employee_name) OVER ( - ORDER BY salary DESC - ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING) highest_salary, - nth_value(employee_name, 2) OVER ( - ORDER BY salary DESC - ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary DESC ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING) ORDER BY salary DESC -- !query schema struct @@ -592,14 +581,11 @@ Leslie Thompson 5186 Foon Yue Tseng Anthony Bow SELECT employee_name, salary, - first_value(employee_name) OVER ( - ORDER BY salary DESC - RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) highest_salary, - nth_value(employee_name, 2) OVER ( - ORDER BY salary DESC - RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary DESC RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) ORDER BY salary DESC -- !query schema struct @@ -627,14 +613,11 @@ Leslie Thompson 5186 Leslie Thompson NULL SELECT employee_name, salary, - first_value(employee_name) OVER ( - ORDER BY salary DESC - RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) highest_salary, - nth_value(employee_name, 2) OVER ( - ORDER BY salary DESC - RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary DESC RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) ORDER BY salary DESC -- !query schema struct @@ -662,14 +645,11 @@ Leslie Thompson 5186 Larry Bott Gerard Bondur SELECT employee_name, salary, - first_value(employee_name) OVER ( - ORDER BY salary DESC - ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) highest_salary, - nth_value(employee_name, 2) OVER ( - ORDER BY salary DESC - ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) ORDER BY salary DESC -- !query schema struct @@ -697,14 +677,11 @@ Leslie Thompson 5186 Larry Bott Gerard Bondur SELECT employee_name, salary, - first_value(employee_name) OVER ( - ORDER BY salary DESC - ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) highest_salary, - nth_value(employee_name, 2) OVER ( - ORDER BY salary DESC - ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) second_highest_salary + first_value(employee_name) OVER w highest_salary, + nth_value(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) ORDER BY salary DESC -- !query schema struct @@ -733,18 +710,15 @@ SELECT employee_name, department, salary, - FIRST_VALUE(employee_name) OVER ( - PARTITION BY department - ORDER BY salary DESC - RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING - ) highest_salary, - NTH_VALUE(employee_name, 2) OVER ( - PARTITION BY department - ORDER BY salary DESC - RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING - ) second_highest_salary + FIRST_VALUE(employee_name) OVER w highest_salary, + NTH_VALUE(employee_name, 2) OVER w second_highest_salary FROM basic_pays +WINDOW w AS ( + PARTITION BY department + ORDER BY salary DESC + RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING +) ORDER BY department -- !query schema struct