From bd8fbf465796038b1975c228325b354888eb9c0b Mon Sep 17 00:00:00 2001 From: Peter Toth Date: Tue, 22 Aug 2023 14:13:03 +0200 Subject: [PATCH] [SPARK-44871][SQL] Fix percentile_disc behaviour ### What changes were proposed in this pull request? This PR fixes the `percentile_disc()` function as currently it returns incorrect results in some cases. E.g.: ``` SELECT percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 FROM VALUES (0), (1), (2), (3), (4) AS v(a) ``` currently returns: ``` +---+---+---+---+---+---+---+---+---+---+---+ | p0| p1| p2| p3| p4| p5| p6| p7| p8| p9|p10| +---+---+---+---+---+---+---+---+---+---+---+ |0.0|0.0|0.0|1.0|1.0|2.0|2.0|2.0|3.0|3.0|4.0| +---+---+---+---+---+---+---+---+---+---+---+ ``` but after this PR it returns the correct: ``` +---+---+---+---+---+---+---+---+---+---+---+ | p0| p1| p2| p3| p4| p5| p6| p7| p8| p9|p10| +---+---+---+---+---+---+---+---+---+---+---+ |0.0|0.0|0.0|1.0|1.0|2.0|2.0|3.0|3.0|4.0|4.0| +---+---+---+---+---+---+---+---+---+---+---+ ``` ### Why are the changes needed? Bugfix. ### Does this PR introduce _any_ user-facing change? Yes, fixes a correctness bug, but the old behaviour can be restored with `spark.sql.legacy.percentileDiscCalculation=true`. ### How was this patch tested? Added new UTs. Closes #42559 from peter-toth/SPARK-44871-fix-percentile-disc-behaviour. 
Authored-by: Peter Toth Signed-off-by: Peter Toth --- .../expressions/aggregate/percentiles.scala | 39 +++++- .../apache/spark/sql/internal/SQLConf.scala | 10 ++ .../analyzer-results/percentiles.sql.out | 128 ++++++++++++++++-- .../postgreSQL/aggregates_part4.sql.out | 2 +- .../postgreSQL/udf-aggregates_part4.sql.out | 2 +- .../sql-tests/inputs/percentiles.sql | 77 ++++++++++- .../sql-tests/results/percentiles.sql.out | 116 ++++++++++++++++ 7 files changed, 356 insertions(+), 18 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala index b26502a176339..74d848298698b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.trees.{BinaryLike, TernaryLike, UnaryLike} import org.apache.spark.sql.catalyst.types.PhysicalDataType import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.sql.types.TypeCollection.NumericAndAnsiInterval import org.apache.spark.util.collection.OpenHashMap @@ -164,11 +165,8 @@ abstract class PercentileBase val accumulatedCounts = sortedCounts.scanLeft((sortedCounts.head._1, 0L)) { case ((key1, count1), (key2, count2)) => (key2, count1 + count2) }.tail - val maxPosition = accumulatedCounts.last._2 - 1 - percentages.map { percentile => - getPercentile(accumulatedCounts, maxPosition * percentile) - } + percentages.map(getPercentile(accumulatedCounts, _)) } private def generateOutput(percentiles: Seq[Double]): Any = { @@ -191,8 +189,11 @@ abstract class PercentileBase * This function has been based upon similar function from 
HIVE * `org.apache.hadoop.hive.ql.udf.UDAFPercentile.getPercentile()`. */ - private def getPercentile( - accumulatedCounts: Seq[(AnyRef, Long)], position: Double): Double = { + protected def getPercentile( + accumulatedCounts: Seq[(AnyRef, Long)], + percentile: Double): Double = { + val position = (accumulatedCounts.last._2 - 1) * percentile + // We may need to do linear interpolation to get the exact percentile val lower = position.floor.toLong val higher = position.ceil.toLong @@ -215,6 +216,7 @@ abstract class PercentileBase } if (discrete) { + // We end up here only if spark.sql.legacy.percentileDiscCalculation=true toDoubleValue(lowerKey) } else { // Linear interpolation to get the exact percentile @@ -384,7 +386,9 @@ case class PercentileDisc( percentageExpression: Expression, reverse: Boolean = false, mutableAggBufferOffset: Int = 0, - inputAggBufferOffset: Int = 0) extends PercentileBase with BinaryLike[Expression] { + inputAggBufferOffset: Int = 0, + legacyCalculation: Boolean = SQLConf.get.getConf(SQLConf.LEGACY_PERCENTILE_DISC_CALCULATION)) + extends PercentileBase with BinaryLike[Expression] { val frequencyExpression: Expression = Literal(1L) @@ -412,4 +416,25 @@ case class PercentileDisc( child = newLeft, percentageExpression = newRight ) + + override protected def getPercentile( + accumulatedCounts: Seq[(AnyRef, Long)], + percentile: Double): Double = { + if (legacyCalculation) { + super.getPercentile(accumulatedCounts, percentile) + } else { + // `percentile_disc(p)` returns the value with the smallest `cume_dist()` value given that is + // greater than or equal to `p` so `position` here is `p` adjusted by max position. + val position = accumulatedCounts.last._2 * percentile + + val higher = position.ceil.toLong + + // Use binary search to find the higher position. 
+ val countsArray = accumulatedCounts.map(_._2).toArray[Long] + val higherIndex = binarySearchCount(countsArray, 0, accumulatedCounts.size, higher) + val higherKey = accumulatedCounts(higherIndex)._1 + + toDoubleValue(higherKey) + } + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 5d720c856478a..9b421251cf6f9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -4368,6 +4368,16 @@ object SQLConf { .booleanConf .createWithDefault(true) + val LEGACY_PERCENTILE_DISC_CALCULATION = buildConf("spark.sql.legacy.percentileDiscCalculation") + .internal() + .doc("If true, the old bogus percentile_disc calculation is used. The old calculation " + + "incorrectly mapped the requested percentile to the sorted range of values in some cases " + + "and so returned incorrect results. Also, the new implementation is faster as it doesn't " + + "contain the interpolation logic that the old percentile_cont based one did.") + .version("3.3.4") + .booleanConf + .createWithDefault(false) + /** * Holds information about keys that have been deprecated. 
* diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/percentiles.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/percentiles.sql.out index fa7529b594eae..36845ce63466e 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/percentiles.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/percentiles.sql.out @@ -103,7 +103,7 @@ SELECT percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC) FILTER (WHERE k > 0) FROM aggr -- !query analysis -Aggregate [percentile_disc(v#x, cast(0.25 as double), false, 0, 0) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY v)#x, percentile_disc(v#x, cast(0.25 as double), false, 0, 0) FILTER (WHERE (k#x > 0)) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY v) FILTER (WHERE (k > 0))#x, percentile_disc(v#x, cast(0.25 as double), true, 0, 0) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)#x, percentile_disc(v#x, cast(0.25 as double), true, 0, 0) FILTER (WHERE (k#x > 0)) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC) FILTER (WHERE (k > 0))#x] +Aggregate [percentile_disc(v#x, cast(0.25 as double), false, 0, 0, false) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY v)#x, percentile_disc(v#x, cast(0.25 as double), false, 0, 0, false) FILTER (WHERE (k#x > 0)) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY v) FILTER (WHERE (k > 0))#x, percentile_disc(v#x, cast(0.25 as double), true, 0, 0, false) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)#x, percentile_disc(v#x, cast(0.25 as double), true, 0, 0, false) FILTER (WHERE (k#x > 0)) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC) FILTER (WHERE (k > 0))#x] +- SubqueryAlias aggr +- View (`aggr`, [k#x,v#x]) +- Project [cast(k#x as int) AS k#x, cast(v#x as int) AS v#x] @@ -124,7 +124,7 @@ GROUP BY k ORDER BY k -- !query analysis Sort [k#x ASC NULLS FIRST], true -+- Aggregate [k#x], [k#x, percentile_disc(v#x, cast(0.25 as double), false, 0, 0) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY v)#x, 
percentile_disc(v#x, cast(0.25 as double), false, 0, 0) FILTER (WHERE (k#x > 0)) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY v) FILTER (WHERE (k > 0))#x, percentile_disc(v#x, cast(0.25 as double), true, 0, 0) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)#x, percentile_disc(v#x, cast(0.25 as double), true, 0, 0) FILTER (WHERE (k#x > 0)) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC) FILTER (WHERE (k > 0))#x] ++- Aggregate [k#x], [k#x, percentile_disc(v#x, cast(0.25 as double), false, 0, 0, false) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY v)#x, percentile_disc(v#x, cast(0.25 as double), false, 0, 0, false) FILTER (WHERE (k#x > 0)) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY v) FILTER (WHERE (k > 0))#x, percentile_disc(v#x, cast(0.25 as double), true, 0, 0, false) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)#x, percentile_disc(v#x, cast(0.25 as double), true, 0, 0, false) FILTER (WHERE (k#x > 0)) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC) FILTER (WHERE (k > 0))#x] +- SubqueryAlias aggr +- View (`aggr`, [k#x,v#x]) +- Project [cast(k#x as int) AS k#x, cast(v#x as int) AS v#x] @@ -184,7 +184,7 @@ ORDER BY salary Sort [salary#x ASC NULLS FIRST], true +- Project [employee_name#x, department#x, salary#x, percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x] +- Project [employee_name#x, department#x, salary#x, percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS 
BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x] - +- Window [percentile_cont(salary#x, cast(0.25 as double), false) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(salary#x, cast(0.25 as double), false, 0, 0) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(salary#x, cast(0.25 as double), true) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS 
BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(salary#x, cast(0.25 as double), true, 0, 0) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x], [department#x] + +- Window [percentile_cont(salary#x, cast(0.25 as double), false) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(salary#x, cast(0.25 as double), false, 0, 0, false) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(salary#x, cast(0.25 as double), true) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(salary#x, cast(0.25 as double), true, 0, 0, false) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x], [department#x] +- Project [employee_name#x, department#x, salary#x] +- SubqueryAlias basic_pays +- View (`basic_pays`, [employee_name#x,department#x,salary#x]) @@ -364,7 +364,7 @@ ORDER BY salary Sort [salary#x ASC NULLS FIRST], true +- Project [employee_name#x, department#x, salary#x, 
percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x] +- Project [employee_name#x, department#x, salary#x, percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x] - +- Window [percentile_cont(salary#x, cast(0.25 as double), false) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS 
percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(salary#x, cast(0.25 as double), false, 0, 0) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(salary#x, cast(0.25 as double), true) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(salary#x, cast(0.25 as double), true, 0, 0) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x], [department#x] + +- Window [percentile_cont(salary#x, cast(0.25 as double), false) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(salary#x, cast(0.25 as double), false, 0, 0, false) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(salary#x, cast(0.25 as double), true) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_cont(0.25) WITHIN GROUP (ORDER BY 
salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(salary#x, cast(0.25 as double), true, 0, 0, false) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x], [department#x] +- Project [employee_name#x, department#x, salary#x] +- SubqueryAlias basic_pays +- View (`basic_pays`, [employee_name#x,department#x,salary#x]) @@ -392,7 +392,7 @@ ORDER BY salary Sort [salary#x ASC NULLS FIRST], true +- Project [employee_name#x, department#x, salary#x, median(salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(0.5) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.5) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(0.5) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.5) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x] +- Project [employee_name#x, department#x, salary#x, median(salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(0.5) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.5) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(0.5) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED 
FOLLOWING)#x, percentile_disc(0.5) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, median(salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(0.5) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.5) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(0.5) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(0.5) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x] - +- Window [median(salary#x) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS median(salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(salary#x, cast(0.5 as double), false) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_cont(0.5) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(salary#x, cast(0.5 as double), false, 0, 0) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_disc(0.5) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(salary#x, cast(0.5 as double), true) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_cont(0.5) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS 
BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(salary#x, cast(0.5 as double), true, 0, 0) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_disc(0.5) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x], [department#x] + +- Window [median(salary#x) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS median(salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(salary#x, cast(0.5 as double), false) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_cont(0.5) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(salary#x, cast(0.5 as double), false, 0, 0, false) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_disc(0.5) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_cont(salary#x, cast(0.5 as double), true) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_cont(0.5) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x, percentile_disc(salary#x, cast(0.5 as double), true, 0, 0, false) windowspecdefinition(department#x, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS percentile_disc(0.5) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x], [department#x] +- Project [employee_name#x, department#x, 
salary#x] +- Filter (salary#x > 8900) +- SubqueryAlias basic_pays @@ -659,7 +659,7 @@ SELECT percentile_disc(0.25) WITHIN GROUP (ORDER BY dt DESC) FROM intervals -- !query analysis -Aggregate [percentile_disc(dt#x, cast(0.25 as double), false, 0, 0) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY dt)#x, percentile_disc(dt#x, cast(0.25 as double), true, 0, 0) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY dt DESC)#x] +Aggregate [percentile_disc(dt#x, cast(0.25 as double), false, 0, 0, false) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY dt)#x, percentile_disc(dt#x, cast(0.25 as double), true, 0, 0, false) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY dt DESC)#x] +- SubqueryAlias intervals +- View (`intervals`, [k#x,dt#x,ym#x,dt2#x]) +- Project [cast(k#x as int) AS k#x, cast(dt#x as interval month) AS dt#x, cast(ym#x as interval second) AS ym#x, cast(dt2#x as interval minute) AS dt2#x] @@ -678,7 +678,7 @@ GROUP BY k ORDER BY k -- !query analysis Sort [k#x ASC NULLS FIRST], true -+- Aggregate [k#x], [k#x, percentile_disc(ym#x, cast(0.25 as double), false, 0, 0) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY ym)#x, percentile_disc(ym#x, cast(0.25 as double), true, 0, 0) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY ym DESC)#x] ++- Aggregate [k#x], [k#x, percentile_disc(ym#x, cast(0.25 as double), false, 0, 0, false) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY ym)#x, percentile_disc(ym#x, cast(0.25 as double), true, 0, 0, false) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY ym DESC)#x] +- SubqueryAlias intervals +- View (`intervals`, [k#x,dt#x,ym#x,dt2#x]) +- Project [cast(k#x as int) AS k#x, cast(dt#x as interval month) AS dt#x, cast(ym#x as interval second) AS ym#x, cast(dt2#x as interval minute) AS dt2#x] @@ -697,7 +697,7 @@ GROUP BY k ORDER BY k -- !query analysis Sort [k#x ASC NULLS FIRST], true -+- Aggregate [k#x], [k#x, percentile_disc(dt2#x, cast(0.25 as double), false, 0, 0) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY dt2)#x, 
percentile_disc(dt2#x, cast(0.25 as double), true, 0, 0) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY dt2 DESC)#x] ++- Aggregate [k#x], [k#x, percentile_disc(dt2#x, cast(0.25 as double), false, 0, 0, false) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY dt2)#x, percentile_disc(dt2#x, cast(0.25 as double), true, 0, 0, false) AS percentile_disc(0.25) WITHIN GROUP (ORDER BY dt2 DESC)#x] +- SubqueryAlias intervals +- View (`intervals`, [k#x,dt#x,ym#x,dt2#x]) +- Project [cast(k#x as int) AS k#x, cast(dt#x as interval month) AS dt#x, cast(ym#x as interval second) AS ym#x, cast(dt2#x as interval minute) AS dt2#x] @@ -760,3 +760,115 @@ Sort [k#x ASC NULLS FIRST], true +- Project [k#x, dt#x, ym#x, dt2#x] +- SubqueryAlias intervals +- LocalRelation [k#x, dt#x, ym#x, dt2#x] + + +-- !query +SELECT + percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, + percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, + percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, + percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, + percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, + percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, + percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, + percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, + percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, + percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, + percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 +FROM VALUES (0) AS v(a) +-- !query analysis +Aggregate [percentile_disc(a#x, cast(0.0 as double), false, 0, 0, false) AS p0#x, percentile_disc(a#x, cast(0.1 as double), false, 0, 0, false) AS p1#x, percentile_disc(a#x, cast(0.2 as double), false, 0, 0, false) AS p2#x, percentile_disc(a#x, cast(0.3 as double), false, 0, 0, false) AS p3#x, percentile_disc(a#x, cast(0.4 as double), false, 0, 0, false) AS p4#x, percentile_disc(a#x, cast(0.5 as double), false, 0, 0, false) AS p5#x, percentile_disc(a#x, cast(0.6 as double), false, 0, 0, false) AS p6#x, percentile_disc(a#x, 
cast(0.7 as double), false, 0, 0, false) AS p7#x, percentile_disc(a#x, cast(0.8 as double), false, 0, 0, false) AS p8#x, percentile_disc(a#x, cast(0.9 as double), false, 0, 0, false) AS p9#x, percentile_disc(a#x, cast(1.0 as double), false, 0, 0, false) AS p10#x] ++- SubqueryAlias v + +- LocalRelation [a#x] + + +-- !query +SELECT + percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, + percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, + percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, + percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, + percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, + percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, + percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, + percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, + percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, + percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, + percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 +FROM VALUES (0), (1) AS v(a) +-- !query analysis +Aggregate [percentile_disc(a#x, cast(0.0 as double), false, 0, 0, false) AS p0#x, percentile_disc(a#x, cast(0.1 as double), false, 0, 0, false) AS p1#x, percentile_disc(a#x, cast(0.2 as double), false, 0, 0, false) AS p2#x, percentile_disc(a#x, cast(0.3 as double), false, 0, 0, false) AS p3#x, percentile_disc(a#x, cast(0.4 as double), false, 0, 0, false) AS p4#x, percentile_disc(a#x, cast(0.5 as double), false, 0, 0, false) AS p5#x, percentile_disc(a#x, cast(0.6 as double), false, 0, 0, false) AS p6#x, percentile_disc(a#x, cast(0.7 as double), false, 0, 0, false) AS p7#x, percentile_disc(a#x, cast(0.8 as double), false, 0, 0, false) AS p8#x, percentile_disc(a#x, cast(0.9 as double), false, 0, 0, false) AS p9#x, percentile_disc(a#x, cast(1.0 as double), false, 0, 0, false) AS p10#x] ++- SubqueryAlias v + +- LocalRelation [a#x] + + +-- !query +SELECT + percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, + percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, + percentile_disc(0.2) 
WITHIN GROUP (ORDER BY a) as p2, + percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, + percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, + percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, + percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, + percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, + percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, + percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, + percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 +FROM VALUES (0), (1), (2) AS v(a) +-- !query analysis +Aggregate [percentile_disc(a#x, cast(0.0 as double), false, 0, 0, false) AS p0#x, percentile_disc(a#x, cast(0.1 as double), false, 0, 0, false) AS p1#x, percentile_disc(a#x, cast(0.2 as double), false, 0, 0, false) AS p2#x, percentile_disc(a#x, cast(0.3 as double), false, 0, 0, false) AS p3#x, percentile_disc(a#x, cast(0.4 as double), false, 0, 0, false) AS p4#x, percentile_disc(a#x, cast(0.5 as double), false, 0, 0, false) AS p5#x, percentile_disc(a#x, cast(0.6 as double), false, 0, 0, false) AS p6#x, percentile_disc(a#x, cast(0.7 as double), false, 0, 0, false) AS p7#x, percentile_disc(a#x, cast(0.8 as double), false, 0, 0, false) AS p8#x, percentile_disc(a#x, cast(0.9 as double), false, 0, 0, false) AS p9#x, percentile_disc(a#x, cast(1.0 as double), false, 0, 0, false) AS p10#x] ++- SubqueryAlias v + +- LocalRelation [a#x] + + +-- !query +SELECT + percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, + percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, + percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, + percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, + percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, + percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, + percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, + percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, + percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, + percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, + percentile_disc(1.0) WITHIN GROUP 
(ORDER BY a) as p10 +FROM VALUES (0), (1), (2), (3), (4) AS v(a) +-- !query analysis +Aggregate [percentile_disc(a#x, cast(0.0 as double), false, 0, 0, false) AS p0#x, percentile_disc(a#x, cast(0.1 as double), false, 0, 0, false) AS p1#x, percentile_disc(a#x, cast(0.2 as double), false, 0, 0, false) AS p2#x, percentile_disc(a#x, cast(0.3 as double), false, 0, 0, false) AS p3#x, percentile_disc(a#x, cast(0.4 as double), false, 0, 0, false) AS p4#x, percentile_disc(a#x, cast(0.5 as double), false, 0, 0, false) AS p5#x, percentile_disc(a#x, cast(0.6 as double), false, 0, 0, false) AS p6#x, percentile_disc(a#x, cast(0.7 as double), false, 0, 0, false) AS p7#x, percentile_disc(a#x, cast(0.8 as double), false, 0, 0, false) AS p8#x, percentile_disc(a#x, cast(0.9 as double), false, 0, 0, false) AS p9#x, percentile_disc(a#x, cast(1.0 as double), false, 0, 0, false) AS p10#x] ++- SubqueryAlias v + +- LocalRelation [a#x] + + +-- !query +SET spark.sql.legacy.percentileDiscCalculation = true +-- !query analysis +SetCommand (spark.sql.legacy.percentileDiscCalculation,Some(true)) + + +-- !query +SELECT + percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, + percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, + percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, + percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, + percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, + percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, + percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, + percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, + percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, + percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, + percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 +FROM VALUES (0), (1), (2), (3), (4) AS v(a) +-- !query analysis +Aggregate [percentile_disc(a#x, cast(0.0 as double), false, 0, 0, true) AS p0#x, percentile_disc(a#x, cast(0.1 as double), false, 0, 0, true) AS p1#x, percentile_disc(a#x, cast(0.2 as double), false, 0, 0, true) 
AS p2#x, percentile_disc(a#x, cast(0.3 as double), false, 0, 0, true) AS p3#x, percentile_disc(a#x, cast(0.4 as double), false, 0, 0, true) AS p4#x, percentile_disc(a#x, cast(0.5 as double), false, 0, 0, true) AS p5#x, percentile_disc(a#x, cast(0.6 as double), false, 0, 0, true) AS p6#x, percentile_disc(a#x, cast(0.7 as double), false, 0, 0, true) AS p7#x, percentile_disc(a#x, cast(0.8 as double), false, 0, 0, true) AS p8#x, percentile_disc(a#x, cast(0.9 as double), false, 0, 0, true) AS p9#x, percentile_disc(a#x, cast(1.0 as double), false, 0, 0, true) AS p10#x] ++- SubqueryAlias v + +- LocalRelation [a#x] + + +-- !query +SET spark.sql.legacy.percentileDiscCalculation = false +-- !query analysis +SetCommand (spark.sql.legacy.percentileDiscCalculation,Some(false)) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/aggregates_part4.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/aggregates_part4.sql.out index 11670e7056c0c..0422aaf40f3c9 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/aggregates_part4.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/aggregates_part4.sql.out @@ -26,6 +26,6 @@ Aggregate [percentile_cont(thousand#x, cast(0.5 as double), false) AS percentile -- !query select percentile_disc(0.5) within group (order by thousand) from tenk1 -- !query analysis -Aggregate [percentile_disc(thousand#x, cast(0.5 as double), false, 0, 0) AS percentile_disc(0.5) WITHIN GROUP (ORDER BY thousand)#x] +Aggregate [percentile_disc(thousand#x, cast(0.5 as double), false, 0, 0, false) AS percentile_disc(0.5) WITHIN GROUP (ORDER BY thousand)#x] +- SubqueryAlias spark_catalog.default.tenk1 +- Relation spark_catalog.default.tenk1[unique1#x,unique2#x,two#x,four#x,ten#x,twenty#x,hundred#x,thousand#x,twothousand#x,fivethous#x,tenthous#x,odd#x,even#x,stringu1#x,stringu2#x,string4#x] parquet diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-aggregates_part4.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-aggregates_part4.sql.out index 11670e7056c0c..0422aaf40f3c9 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-aggregates_part4.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-aggregates_part4.sql.out @@ -26,6 +26,6 @@ Aggregate [percentile_cont(thousand#x, cast(0.5 as double), false) AS percentile -- !query select percentile_disc(0.5) within group (order by thousand) from tenk1 -- !query analysis -Aggregate [percentile_disc(thousand#x, cast(0.5 as double), false, 0, 0) AS percentile_disc(0.5) WITHIN GROUP (ORDER BY thousand)#x] +Aggregate [percentile_disc(thousand#x, cast(0.5 as double), false, 0, 0, false) AS percentile_disc(0.5) WITHIN GROUP (ORDER BY thousand)#x] +- SubqueryAlias spark_catalog.default.tenk1 +- Relation spark_catalog.default.tenk1[unique1#x,unique2#x,two#x,four#x,ten#x,twenty#x,hundred#x,thousand#x,twothousand#x,fivethous#x,tenthous#x,odd#x,even#x,stringu1#x,stringu2#x,string4#x] parquet diff --git a/sql/core/src/test/resources/sql-tests/inputs/percentiles.sql b/sql/core/src/test/resources/sql-tests/inputs/percentiles.sql index c55c300b5e805..87c5d4be90ce1 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/percentiles.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/percentiles.sql @@ -299,4 +299,79 @@ SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY dt2) FROM intervals GROUP BY k -ORDER BY k; \ No newline at end of file +ORDER BY k; + +-- SPARK-44871: Fix percentile_disc behaviour +SELECT + percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, + percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, + percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, + percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, + percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, + 
percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, + percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, + percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, + percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, + percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, + percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 +FROM VALUES (0) AS v(a); + +SELECT + percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, + percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, + percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, + percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, + percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, + percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, + percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, + percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, + percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, + percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, + percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 +FROM VALUES (0), (1) AS v(a); + +SELECT + percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, + percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, + percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, + percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, + percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, + percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, + percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, + percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, + percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, + percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, + percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 +FROM VALUES (0), (1), (2) AS v(a); + +SELECT + percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, + percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, + percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, + percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, + percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, + percentile_disc(0.5) WITHIN 
GROUP (ORDER BY a) as p5, + percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, + percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, + percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, + percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, + percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 +FROM VALUES (0), (1), (2), (3), (4) AS v(a); + +SET spark.sql.legacy.percentileDiscCalculation = true; + +SELECT + percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, + percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, + percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, + percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, + percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, + percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, + percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, + percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, + percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, + percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, + percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 +FROM VALUES (0), (1), (2), (3), (4) AS v(a); + +SET spark.sql.legacy.percentileDiscCalculation = false; diff --git a/sql/core/src/test/resources/sql-tests/results/percentiles.sql.out b/sql/core/src/test/resources/sql-tests/results/percentiles.sql.out index a83eee69fec37..0d6ab54286110 100644 --- a/sql/core/src/test/resources/sql-tests/results/percentiles.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/percentiles.sql.out @@ -730,3 +730,119 @@ struct +-- !query output +0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 + + +-- !query +SELECT + percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, + percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, + percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, + percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, + percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, + percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, + percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, + 
percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, + percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, + percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, + percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 +FROM VALUES (0), (1) AS v(a) +-- !query schema +struct<p0:double,p1:double,p2:double,p3:double,p4:double,p5:double,p6:double,p7:double,p8:double,p9:double,p10:double> +-- !query output +0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 1.0 + + +-- !query +SELECT + percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, + percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, + percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, + percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, + percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, + percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, + percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, + percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, + percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, + percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, + percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 +FROM VALUES (0), (1), (2) AS v(a) +-- !query schema +struct<p0:double,p1:double,p2:double,p3:double,p4:double,p5:double,p6:double,p7:double,p8:double,p9:double,p10:double> +-- !query output +0.0 0.0 0.0 0.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 + + +-- !query +SELECT + percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, + percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, + percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, + percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, + percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, + percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, + percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, + percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, + percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, + percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, + percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 +FROM VALUES (0), (1), (2), (3), (4) AS v(a) +-- !query schema +struct<p0:double,p1:double,p2:double,p3:double,p4:double,p5:double,p6:double,p7:double,p8:double,p9:double,p10:double> +-- !query output +0.0 0.0 0.0 1.0 1.0 2.0 2.0 3.0 3.0 4.0 4.0 + + +-- !query +SET spark.sql.legacy.percentileDiscCalculation = true +-- !query schema +struct<key:string,value:string> +-- !query output
+spark.sql.legacy.percentileDiscCalculation true + + +-- !query +SELECT + percentile_disc(0.0) WITHIN GROUP (ORDER BY a) as p0, + percentile_disc(0.1) WITHIN GROUP (ORDER BY a) as p1, + percentile_disc(0.2) WITHIN GROUP (ORDER BY a) as p2, + percentile_disc(0.3) WITHIN GROUP (ORDER BY a) as p3, + percentile_disc(0.4) WITHIN GROUP (ORDER BY a) as p4, + percentile_disc(0.5) WITHIN GROUP (ORDER BY a) as p5, + percentile_disc(0.6) WITHIN GROUP (ORDER BY a) as p6, + percentile_disc(0.7) WITHIN GROUP (ORDER BY a) as p7, + percentile_disc(0.8) WITHIN GROUP (ORDER BY a) as p8, + percentile_disc(0.9) WITHIN GROUP (ORDER BY a) as p9, + percentile_disc(1.0) WITHIN GROUP (ORDER BY a) as p10 +FROM VALUES (0), (1), (2), (3), (4) AS v(a) +-- !query schema +struct<p0:double,p1:double,p2:double,p3:double,p4:double,p5:double,p6:double,p7:double,p8:double,p9:double,p10:double> +-- !query output +0.0 0.0 0.0 1.0 1.0 2.0 2.0 2.0 3.0 3.0 4.0 + + +-- !query +SET spark.sql.legacy.percentileDiscCalculation = false +-- !query schema +struct<key:string,value:string> +-- !query output +spark.sql.legacy.percentileDiscCalculation false