Skip to content

Commit 2f07c56

Browse files
beliefer and cloud-fan
authored and committed
[SPARK-33278][SQL] Improve the performance for FIRST_VALUE
### What changes were proposed in this pull request? #29800 provides a performance improvement for `NTH_VALUE`. `FIRST_VALUE` could also use the `UnboundedOffsetWindowFunctionFrame` and `UnboundedPrecedingOffsetWindowFunctionFrame`. ### Why are the changes needed? To improve the performance of `FIRST_VALUE`. ### Does this PR introduce _any_ user-facing change? 'No'. ### How was this patch tested? Jenkins test. Closes #30178 from beliefer/SPARK-33278. Lead-authored-by: gengjiaan <gengjiaan@360.cn> Co-authored-by: beliefer <beliefer@163.com> Co-authored-by: Jiaan Geng <beliefer@163.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent a3d2954 commit 2f07c56

File tree

4 files changed

+339
-242
lines changed

4 files changed

+339
-242
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ abstract class Optimizer(catalogManager: CatalogManager)
8282
// Operator combine
8383
CollapseRepartition,
8484
CollapseProject,
85+
OptimizeWindowFunctions,
8586
CollapseWindow,
8687
CombineFilters,
8788
CombineLimits,
@@ -806,6 +807,18 @@ object CollapseRepartition extends Rule[LogicalPlan] {
806807
}
807808
}
808809

810+
/**
 * Rewrites `first(col)` window aggregates into `nth_value(col, 1)` for better performance.
 *
 * The rewrite is applied only when both of the following hold for the window spec:
 *  - its order spec is non-empty, and
 *  - its frame is a [[SpecifiedWindowFrame]] whose frame type is [[RowFrame]].
 *
 * Every other window expression is left unchanged.
 */
object OptimizeWindowFunctions extends Rule[LogicalPlan] {
  def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions {
    case we @ WindowExpression(AggregateExpression(first: First, _, _, _, _), spec)
        if spec.orderSpec.nonEmpty && (spec.frameSpecification match {
          // Match on the frame instead of casting it: a frame that is not a
          // SpecifiedWindowFrame just fails the guard rather than throwing a
          // ClassCastException in the middle of optimization.
          case SpecifiedWindowFrame(RowFrame, _, _) => true
          case _ => false
        }) =>
      // Keep the original window spec; only the function is swapped, carrying over the
      // child expression and the ignoreNulls flag of the original `first`.
      we.copy(windowFunction = NthValue(first.child, Literal(1), first.ignoreNulls))
  }
}
821+
809822
/**
810823
* Collapse Adjacent Window Expression.
811824
* - If the partition specs and order specs are the same and the window expression are
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.sql.catalyst.optimizer
19+
20+
import org.apache.spark.sql.catalyst.dsl.expressions._
21+
import org.apache.spark.sql.catalyst.dsl.plans._
22+
import org.apache.spark.sql.catalyst.expressions._
23+
import org.apache.spark.sql.catalyst.expressions.aggregate.First
24+
import org.apache.spark.sql.catalyst.plans.PlanTest
25+
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
26+
import org.apache.spark.sql.catalyst.rules.RuleExecutor
27+
28+
/**
 * Tests for the [[OptimizeWindowFunctions]] optimizer rule: `first(col)` must be rewritten to
 * `nth_value(col, 1)` for an ordered ROWS frame, and must be left alone otherwise.
 */
class OptimizeWindowFunctionsSuite extends PlanTest {
  // Executor that runs only the rule under test.
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Batch("OptimizeWindowFunctions", FixedPoint(10),
      OptimizeWindowFunctions) :: Nil
  }

  val testRelation = LocalRelation('a.double, 'b.double, 'c.string)
  val a = testRelation.output(0)
  val b = testRelation.output(1)
  val c = testRelation.output(2)

  /** A fresh `first(a)` aggregate expression that does not ignore nulls. */
  private def firstOfA: Expression = First(a, false).toAggregateExpression()

  /**
   * Projects `function` over a window partitioned by `b`, ordered by `ordering`, with a
   * frame of the given type running from UNBOUNDED PRECEDING to CURRENT ROW.
   */
  private def windowed(
      function: Expression,
      ordering: Seq[SortOrder],
      frameType: FrameType): LogicalPlan = {
    val frame = SpecifiedWindowFrame(frameType, UnboundedPreceding, CurrentRow)
    val spec = WindowSpecDefinition(b :: Nil, ordering, frame)
    testRelation.select(WindowExpression(function, spec))
  }

  test("replace first(col) by nth_value(col, 1)") {
    val original = windowed(firstOfA, c.asc :: Nil, RowFrame)
    val expected = windowed(NthValue(a, Literal(1), false), c.asc :: Nil, RowFrame)

    assert(Optimize.execute(original) == expected)
  }

  test("can't replace first(col) by nth_value(col, 1) if the window frame type is range") {
    val original = windowed(firstOfA, c.asc :: Nil, RangeFrame)

    // The rule must leave a RANGE frame untouched.
    assert(Optimize.execute(original) == original)
  }

  test("can't replace first(col) by nth_value(col, 1) if the window frame isn't ordered") {
    val original = windowed(firstOfA, Nil, RowFrame)

    // Without an ordering the rule must not fire.
    assert(Optimize.execute(original) == original)
  }
}

sql/core/src/test/resources/sql-tests/inputs/window.sql

Lines changed: 35 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -146,104 +146,108 @@ SELECT val, cate,
146146
count(val) FILTER (WHERE val > 1) OVER(PARTITION BY cate)
147147
FROM testData ORDER BY cate, val;
148148

149-
-- nth_value() over ()
149+
-- nth_value()/first_value() over ()
150150
SELECT
151151
employee_name,
152152
salary,
153-
nth_value(employee_name, 2) OVER (ORDER BY salary DESC) second_highest_salary
153+
first_value(employee_name) OVER w highest_salary,
154+
nth_value(employee_name, 2) OVER w second_highest_salary
154155
FROM
155156
basic_pays
157+
WINDOW w AS (ORDER BY salary DESC)
156158
ORDER BY salary DESC;
157159

158160
SELECT
159161
employee_name,
160162
salary,
161-
nth_value(employee_name, 2) OVER (
162-
ORDER BY salary DESC
163-
RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) second_highest_salary
163+
first_value(employee_name) OVER w highest_salary,
164+
nth_value(employee_name, 2) OVER w second_highest_salary
164165
FROM
165166
basic_pays
167+
WINDOW w AS (ORDER BY salary DESC RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
166168
ORDER BY salary DESC;
167169

168170
SELECT
169171
employee_name,
170172
salary,
171-
nth_value(employee_name, 2) OVER (
172-
ORDER BY salary DESC
173-
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) second_highest_salary
173+
first_value(employee_name) OVER w highest_salary,
174+
nth_value(employee_name, 2) OVER w second_highest_salary
174175
FROM
175176
basic_pays
177+
WINDOW w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
176178
ORDER BY salary DESC;
177179

178180
SELECT
179181
employee_name,
180182
salary,
181-
nth_value(employee_name, 2) OVER (
182-
ORDER BY salary
183-
RANGE BETWEEN 2000 PRECEDING AND 1000 FOLLOWING) second_highest_salary
183+
first_value(employee_name) OVER w highest_salary,
184+
nth_value(employee_name, 2) OVER w second_highest_salary
184185
FROM
185186
basic_pays
187+
WINDOW w AS (ORDER BY salary RANGE BETWEEN 2000 PRECEDING AND 1000 FOLLOWING)
186188
ORDER BY salary;
187189

188190
SELECT
189191
employee_name,
190192
salary,
191-
nth_value(employee_name, 2) OVER (
192-
ORDER BY salary DESC
193-
ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING) second_highest_salary
193+
first_value(employee_name) OVER w highest_salary,
194+
nth_value(employee_name, 2) OVER w second_highest_salary
194195
FROM
195196
basic_pays
197+
WINDOW w AS (ORDER BY salary DESC ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING)
196198
ORDER BY salary DESC;
197199

198200
SELECT
199201
employee_name,
200202
salary,
201-
nth_value(employee_name, 2) OVER (
202-
ORDER BY salary DESC
203-
RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) second_highest_salary
203+
first_value(employee_name) OVER w highest_salary,
204+
nth_value(employee_name, 2) OVER w second_highest_salary
204205
FROM
205206
basic_pays
207+
WINDOW w AS (ORDER BY salary DESC RANGE BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING)
206208
ORDER BY salary DESC;
207209

208210
SELECT
209211
employee_name,
210212
salary,
211-
nth_value(employee_name, 2) OVER (
212-
ORDER BY salary DESC
213-
RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) second_highest_salary
213+
first_value(employee_name) OVER w highest_salary,
214+
nth_value(employee_name, 2) OVER w second_highest_salary
214215
FROM
215216
basic_pays
217+
WINDOW w AS (ORDER BY salary DESC RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
216218
ORDER BY salary DESC;
217219

218220
SELECT
219221
employee_name,
220222
salary,
221-
nth_value(employee_name, 2) OVER (
222-
ORDER BY salary DESC
223-
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) second_highest_salary
223+
first_value(employee_name) OVER w highest_salary,
224+
nth_value(employee_name, 2) OVER w second_highest_salary
224225
FROM
225226
basic_pays
227+
WINDOW w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
226228
ORDER BY salary DESC;
227229

228230
SELECT
229231
employee_name,
230232
salary,
231-
nth_value(employee_name, 2) OVER (
232-
ORDER BY salary DESC
233-
ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) second_highest_salary
233+
first_value(employee_name) OVER w highest_salary,
234+
nth_value(employee_name, 2) OVER w second_highest_salary
234235
FROM
235236
basic_pays
237+
WINDOW w AS (ORDER BY salary DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING)
236238
ORDER BY salary DESC;
237239

238240
SELECT
239241
employee_name,
240242
department,
241243
salary,
242-
NTH_VALUE(employee_name, 2) OVER (
243-
PARTITION BY department
244-
ORDER BY salary DESC
245-
RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
246-
) second_highest_salary
244+
FIRST_VALUE(employee_name) OVER w highest_salary,
245+
NTH_VALUE(employee_name, 2) OVER w second_highest_salary
247246
FROM
248247
basic_pays
248+
WINDOW w AS (
249+
PARTITION BY department
250+
ORDER BY salary DESC
251+
RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
252+
)
249253
ORDER BY department;

0 commit comments

Comments
 (0)