-
Notifications
You must be signed in to change notification settings - Fork 28.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SPARK-44871][SQL][3.3] Fix percentile_disc behaviour
### What changes were proposed in this pull request? This PR fixes the `percentile_disc()` function, as it currently returns incorrect results in some cases. E.g.: ``` SELECT percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 FROM VALUES (0), (1), (2), (3), (4) AS v(a) ``` currently returns: ``` +---+---+---+---+---+---+---+---+---+---+---+ | p0| p1| p2| p3| p4| p5| p6| p7| p8| p9|p10| +---+---+---+---+---+---+---+---+---+---+---+ |0.0|0.0|0.0|1.0|1.0|2.0|2.0|2.0|3.0|3.0|4.0| +---+---+---+---+---+---+---+---+---+---+---+ ``` but after this PR it returns the correct result: ``` +---+---+---+---+---+---+---+---+---+---+---+ | p0| p1| p2| p3| p4| p5| p6| p7| p8| p9|p10| +---+---+---+---+---+---+---+---+---+---+---+ |0.0|0.0|0.0|1.0|1.0|2.0|2.0|3.0|3.0|4.0|4.0| +---+---+---+---+---+---+---+---+---+---+---+ ``` ### Why are the changes needed? Bugfix. ### Does this PR introduce _any_ user-facing change? Yes, fixes a correctness bug, but the old behaviour can be restored with `spark.sql.legacy.percentileDiscCalculation=true`. ### How was this patch tested? Added new UTs. Closes #42611 from peter-toth/SPARK-44871-fix-percentile-disc-behaviour-3.3. Authored-by: Peter Toth <peter.toth@gmail.com> Signed-off-by: Max Gekk <max.gekk@gmail.com>
- Loading branch information
1 parent
352810b
commit aa6f6f7
Showing
4 changed files
with
234 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
74 changes: 74 additions & 0 deletions
74
sql/core/src/test/resources/sql-tests/inputs/percentiles.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
-- SPARK-44871: Fix percentile_disc behaviour | ||
-- Single-row input v(a) = (0): evaluate percentile_disc at 0.0 through 1.0 in steps of 0.1 (aliases p0..p10). | ||
SELECT | ||
percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, | ||
percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, | ||
percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, | ||
percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, | ||
percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, | ||
percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, | ||
percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, | ||
percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, | ||
percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, | ||
percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, | ||
percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 | ||
FROM VALUES (0) AS v(a); | ||
|
||
-- Two-row input v(a) = (0), (1): same eleven percentages, 0.0 through 1.0 in steps of 0.1. | ||
SELECT | ||
percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, | ||
percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, | ||
percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, | ||
percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, | ||
percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, | ||
percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, | ||
percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, | ||
percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, | ||
percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, | ||
percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, | ||
percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 | ||
FROM VALUES (0), (1) AS v(a); | ||
|
||
-- Three-row input v(a) = (0), (1), (2): same eleven percentages, 0.0 through 1.0 in steps of 0.1. | ||
SELECT | ||
percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, | ||
percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, | ||
percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, | ||
percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, | ||
percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, | ||
percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, | ||
percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, | ||
percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, | ||
percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, | ||
percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, | ||
percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 | ||
FROM VALUES (0), (1), (2) AS v(a); | ||
|
||
-- Five-row input v(a) = (0), (1), (2), (3), (4): same eleven percentages, 0.0 through 1.0 in steps of 0.1. | ||
SELECT | ||
percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, | ||
percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, | ||
percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, | ||
percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, | ||
percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, | ||
percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, | ||
percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, | ||
percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, | ||
percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, | ||
percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, | ||
percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 | ||
FROM VALUES (0), (1), (2), (3), (4) AS v(a); | ||
|
||
-- Turn on the legacy percentile_disc calculation for the statements that follow. | ||
SET spark.sql.legacy.percentileDiscCalculation = true; | ||
|
||
-- Five-row input v(a) = (0), (1), (2), (3), (4) again, placed after the SET that turns the legacy flag on. | ||
SELECT | ||
percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, | ||
percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, | ||
percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, | ||
percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, | ||
percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, | ||
percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, | ||
percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, | ||
percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, | ||
percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, | ||
percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, | ||
percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 | ||
FROM VALUES (0), (1), (2), (3), (4) AS v(a); | ||
|
||
-- Turn the legacy percentile_disc calculation back off. | ||
SET spark.sql.legacy.percentileDiscCalculation = false; |
118 changes: 118 additions & 0 deletions
118
sql/core/src/test/resources/sql-tests/results/percentiles.sql.out
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
-- Automatically generated by SQLQueryTestSuite | ||
-- Number of queries: 7 | ||
|
||
|
||
-- !query | ||
SELECT | ||
percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, | ||
percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, | ||
percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, | ||
percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, | ||
percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, | ||
percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, | ||
percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, | ||
percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, | ||
percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, | ||
percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, | ||
percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 | ||
FROM VALUES (0) AS v(a) | ||
-- !query schema | ||
struct<p0:double,p1:double,p2:double,p3:double,p4:double,p5:double,p6:double,p7:double,p8:double,p9:double,p10:double> | ||
-- !query output | ||
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 | ||
|
||
|
||
-- !query | ||
SELECT | ||
percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, | ||
percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, | ||
percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, | ||
percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, | ||
percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, | ||
percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, | ||
percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, | ||
percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, | ||
percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, | ||
percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, | ||
percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 | ||
FROM VALUES (0), (1) AS v(a) | ||
-- !query schema | ||
struct<p0:double,p1:double,p2:double,p3:double,p4:double,p5:double,p6:double,p7:double,p8:double,p9:double,p10:double> | ||
-- !query output | ||
0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 1.0 | ||
|
||
|
||
-- !query | ||
SELECT | ||
percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, | ||
percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, | ||
percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, | ||
percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, | ||
percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, | ||
percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, | ||
percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, | ||
percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, | ||
percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, | ||
percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, | ||
percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 | ||
FROM VALUES (0), (1), (2) AS v(a) | ||
-- !query schema | ||
struct<p0:double,p1:double,p2:double,p3:double,p4:double,p5:double,p6:double,p7:double,p8:double,p9:double,p10:double> | ||
-- !query output | ||
0.0 0.0 0.0 0.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 | ||
|
||
|
||
-- !query | ||
SELECT | ||
percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, | ||
percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, | ||
percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, | ||
percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, | ||
percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, | ||
percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, | ||
percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, | ||
percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, | ||
percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, | ||
percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, | ||
percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 | ||
FROM VALUES (0), (1), (2), (3), (4) AS v(a) | ||
-- !query schema | ||
struct<p0:double,p1:double,p2:double,p3:double,p4:double,p5:double,p6:double,p7:double,p8:double,p9:double,p10:double> | ||
-- !query output | ||
0.0 0.0 0.0 1.0 1.0 2.0 2.0 3.0 3.0 4.0 4.0 | ||
|
||
|
||
-- !query | ||
SET spark.sql.legacy.percentileDiscCalculation = true | ||
-- !query schema | ||
struct<key:string,value:string> | ||
-- !query output | ||
spark.sql.legacy.percentileDiscCalculation true | ||
|
||
|
||
-- !query | ||
SELECT | ||
percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, | ||
percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, | ||
percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, | ||
percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, | ||
percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, | ||
percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, | ||
percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, | ||
percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, | ||
percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, | ||
percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, | ||
percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 | ||
FROM VALUES (0), (1), (2), (3), (4) AS v(a) | ||
-- !query schema | ||
struct<p0:double,p1:double,p2:double,p3:double,p4:double,p5:double,p6:double,p7:double,p8:double,p9:double,p10:double> | ||
-- !query output | ||
0.0 0.0 0.0 1.0 1.0 2.0 2.0 2.0 3.0 3.0 4.0 | ||
|
||
|
||
-- !query | ||
SET spark.sql.legacy.percentileDiscCalculation = false | ||
-- !query schema | ||
struct<key:string,value:string> | ||
-- !query output | ||
spark.sql.legacy.percentileDiscCalculation false |