[SPARK-41793][SQL] Incorrect result for window frames defined by a range clause on large decimals

ulysses-you · cloud-fan · commit 0a440e907f48 · 2023-02-23T20:36:36.000+08:00
### What changes were proposed in this pull request? Use `DecimalAddNoOverflowCheck` instead of `Add` to create the bound ordering for a window range frame. ### Why are the changes needed? Before 3.4, `Add` did not check overflow. Instead, we always wrapped `Add` with a `CheckOverflow`. After #36698, we made `Add` check overflow by itself. However, the bound ordering of a window range frame uses `Add` to calculate the boundary that is used to determine which input row lies within the frame boundaries of an output row. As a result, the behavior changed because of the extra overflow check. Technically, we could allow an overflowing value if it is just an intermediate result. So this PR uses `DecimalAddNoOverflowCheck` to replace `Add` and restore the previous behavior. ### Does this PR introduce _any_ user-facing change? Yes, it restores the previous (before 3.4) behavior. ### How was this patch tested? Added a test. Closes #40138 from ulysses-you/SPARK-41793. Authored-by: ulysses-you <ulyssesyou18@gmail.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com> (cherry picked from commit fec4f7f) Signed-off-by: Wenchen Fan <wenchen@databricks.com>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalExpressions.scala
@@ -213,7 +213,7 @@ case class CheckOverflowInSum(
 }
 
 /**
- * An add expression for decimal values which is only used internally by Sum/Avg.
+ * An add expression for decimal values which is only used internally by Sum/Avg/Window.
  *
  * Nota that, this expression does not check overflow which is different with `Add`. When
  * aggregating values, Spark writes the aggregation buffer values to `UnsafeRow` via
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExecBase.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExecBase.scala
@@ -128,6 +128,7 @@ trait WindowExecBase extends UnaryExecNode {
             TimestampAddYMInterval(expr, boundOffset, Some(timeZone))
           case (TimestampType | TimestampNTZType, _: DayTimeIntervalType) =>
             TimeAdd(expr, boundOffset, Some(timeZone))
+          case (d: DecimalType, _: DecimalType) => DecimalAddNoOverflowCheck(expr, boundOffset, d)
           case (a, b) if a == b => Add(expr, boundOffset)
         }
         val bound = MutableProjection.create(boundExpr :: Nil, child.output)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql
 
-import org.apache.spark.sql.catalyst.expressions.{Ascending, Literal, NonFoldableLiteral, RangeFrame, SortOrder, SpecifiedWindowFrame, UnspecifiedFrame}
+import org.apache.spark.sql.catalyst.expressions.{Ascending, Literal, NonFoldableLiteral, RangeFrame, SortOrder, SpecifiedWindowFrame, UnaryMinus, UnspecifiedFrame}
 import org.apache.spark.sql.catalyst.plans.logical.{Window => WindowNode}
 import org.apache.spark.sql.expressions.{Window, WindowSpec}
 import org.apache.spark.sql.functions._
@@ -474,4 +474,22 @@ class DataFrameWindowFramesSuite extends QueryTest with SharedSparkSession {
     checkAnswer(df,
       Row(3, 1.5) :: Row(3, 1.5) :: Row(6, 2.0) :: Row(6, 2.0) :: Row(6, 2.0) :: Nil)
   }
+
+  test("SPARK-41793: Incorrect result for window frames defined by a range clause on large " +
+    "decimals") {
+    val window = new WindowSpec(Seq($"a".expr), Seq(SortOrder($"b".expr, Ascending)),
+      SpecifiedWindowFrame(RangeFrame,
+        UnaryMinus(Literal(BigDecimal(10.2345))), Literal(BigDecimal(6.7890))))
+
+    val df = Seq(
+      1 -> "11342371013783243717493546650944543.47",
+      1 -> "999999999999999999999999999999999999.99"
+    ).toDF("a", "b")
+      .select($"a", $"b".cast("decimal(38, 2)"))
+      .select(count("*").over(window))
+
+    checkAnswer(
+      df,
+      Row(1) :: Row(1) :: Nil)
+  }
 }

Original file line number	Diff line number	Diff line change
`@@ -213,7 +213,7 @@ case class CheckOverflowInSum(`
`213`	`213`	`}`
`214`	`214`
`215`	`215`	`/**`
`216`		`- * An add expression for decimal values which is only used internally by Sum/Avg.`
	`216`	`+ * An add expression for decimal values which is only used internally by Sum/Avg/Window.`
`217`	`217`	`*`
`218`	`218`	* Nota that, this expression does not check overflow which is different with `Add`. When
`219`	`219`	* aggregating values, Spark writes the aggregation buffer values to `UnsafeRow` via
Original file line number	Diff line number	Diff line change
`@@ -128,6 +128,7 @@ trait WindowExecBase extends UnaryExecNode {`
`128`	`128`	`TimestampAddYMInterval(expr, boundOffset, Some(timeZone))`
`129`	`129`	`case (TimestampType \| TimestampNTZType, _: DayTimeIntervalType) =>`
`130`	`130`	`TimeAdd(expr, boundOffset, Some(timeZone))`
	`131`	`+ case (d: DecimalType, _: DecimalType) => DecimalAddNoOverflowCheck(expr, boundOffset, d)`
`131`	`132`	`case (a, b) if a == b => Add(expr, boundOffset)`
`132`	`133`	`}`
`133`	`134`	`val bound = MutableProjection.create(boundExpr :: Nil, child.output)`