
Commit 41a940f

c21 authored and cloud-fan committed
[SPARK-37557][SQL] Replace object hash with sort aggregate if child is already sorted
### What changes were proposed in this pull request?

This is a follow-up of #34702 (comment), where object hash aggregate can be replaced with sort aggregate as well. This PR handles object hash aggregate.

### Why are the changes needed?

Increase coverage of the rule by handling object hash aggregate as well.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Modified unit test in `ReplaceHashWithSortAggSuite.scala` to cover object hash aggregate (by using the aggregate expression `COLLECT_LIST`).

Closes #34824 from c21/agg-rule-followup.

Authored-by: Cheng Su <chengsu@fb.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent 116255d commit 41a940f
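
For orientation, here is a minimal sketch (an assumed spark-shell session, not part of this patch) of the query shape the change targets: a COLLECT_LIST aggregation whose child is already sorted on the grouping key. The config key spark.sql.execution.replaceHashWithSortAgg and the assumption that the rule is disabled by default are inferred from the rule's name, not from this diff.

    // Assumed config key; enable the replacement rule for this session.
    spark.conf.set("spark.sql.execution.replaceHashWithSortAgg", "true")

    spark.range(100).selectExpr("id AS key").repartition(10).createOrReplaceTempView("t")

    // Mirrors the query in ReplaceHashWithSortAggSuite: the subquery ends with SORT BY key,
    // so the partial aggregate's child is already ordered on the grouping key.
    spark.sql(
      """
        |SELECT key, COLLECT_LIST(key)
        |FROM (SELECT key FROM t WHERE key > 10 SORT BY key)
        |GROUP BY key
      """.stripMargin).explain()
    // Expected with this change: the partial aggregate shows up as SortAggregate instead of
    // ObjectHashAggregate, while the final aggregate stays hash-based.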

4 files changed (+94, -69 lines)

sql/core/src/main/scala/org/apache/spark/sql/execution/ReplaceHashWithSortAgg.scala
27 additions & 13 deletions

@@ -20,17 +20,18 @@ package org.apache.spark.sql.execution
 import org.apache.spark.sql.catalyst.expressions.SortOrder
 import org.apache.spark.sql.catalyst.expressions.aggregate.{Complete, Final, Partial}
 import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.execution.aggregate.HashAggregateExec
+import org.apache.spark.sql.execution.aggregate.{BaseAggregateExec, HashAggregateExec, ObjectHashAggregateExec}
 import org.apache.spark.sql.internal.SQLConf
 
 /**
- * Replace [[HashAggregateExec]] with [[SortAggregateExec]] in the spark plan if:
+ * Replace hash-based aggregate with sort aggregate in the spark plan if:
  *
- * 1. The plan is a pair of partial and final [[HashAggregateExec]], and the child of partial
- *    aggregate satisfies the sort order of corresponding [[SortAggregateExec]].
+ * 1. The plan is a pair of partial and final [[HashAggregateExec]] or [[ObjectHashAggregateExec]],
+ *    and the child of partial aggregate satisfies the sort order of corresponding
+ *    [[SortAggregateExec]].
  * or
- * 2. The plan is a [[HashAggregateExec]], and the child satisfies the sort order of
- *    corresponding [[SortAggregateExec]].
+ * 2. The plan is a [[HashAggregateExec]] or [[ObjectHashAggregateExec]], and the child satisfies
+ *    the sort order of corresponding [[SortAggregateExec]].
  *
  * Examples:
  * 1. aggregate after join:
@@ -47,9 +48,9 @@ import org.apache.spark.sql.internal.SQLConf
  *              |            =>            |
  *          Sort(t1.i)                 Sort(t1.i)
  *
- * [[HashAggregateExec]] can be replaced when its child satisfies the sort order of
- * corresponding [[SortAggregateExec]]. [[SortAggregateExec]] is faster in the sense that
- * it does not have hashing overhead of [[HashAggregateExec]].
+ * Hash-based aggregate can be replaced when its child satisfies the sort order of
+ * corresponding sort aggregate. Sort aggregate is faster in the sense that
+ * it does not have hashing overhead of hash aggregate.
  */
 object ReplaceHashWithSortAgg extends Rule[SparkPlan] {
   def apply(plan: SparkPlan): SparkPlan = {
@@ -61,14 +62,15 @@ object ReplaceHashWithSortAgg extends Rule[SparkPlan] {
   }
 
   /**
-   * Replace [[HashAggregateExec]] with [[SortAggregateExec]].
+   * Replace [[HashAggregateExec]] and [[ObjectHashAggregateExec]] with [[SortAggregateExec]].
    */
   private def replaceHashAgg(plan: SparkPlan): SparkPlan = {
     plan.transformDown {
-      case hashAgg: HashAggregateExec if hashAgg.groupingExpressions.nonEmpty =>
+      case hashAgg: BaseAggregateExec if isHashBasedAggWithKeys(hashAgg) =>
         val sortAgg = hashAgg.toSortAggregate
         hashAgg.child match {
-          case partialAgg: HashAggregateExec if isPartialAgg(partialAgg, hashAgg) =>
+          case partialAgg: BaseAggregateExec
+              if isHashBasedAggWithKeys(partialAgg) && isPartialAgg(partialAgg, hashAgg) =>
             if (SortOrder.orderingSatisfies(
                 partialAgg.child.outputOrdering, sortAgg.requiredChildOrdering.head)) {
               sortAgg.copy(
@@ -92,7 +94,7 @@ object ReplaceHashWithSortAgg extends Rule[SparkPlan] {
   /**
    * Check if `partialAgg` to be partial aggregate of `finalAgg`.
   */
-  private def isPartialAgg(partialAgg: HashAggregateExec, finalAgg: HashAggregateExec): Boolean = {
+  private def isPartialAgg(partialAgg: BaseAggregateExec, finalAgg: BaseAggregateExec): Boolean = {
     if (partialAgg.aggregateExpressions.forall(_.mode == Partial) &&
       finalAgg.aggregateExpressions.forall(_.mode == Final)) {
       (finalAgg.logicalLink, partialAgg.logicalLink) match {
@@ -103,4 +105,16 @@ object ReplaceHashWithSortAgg extends Rule[SparkPlan] {
       false
     }
   }
+
+  /**
+   * Check if `agg` is [[HashAggregateExec]] or [[ObjectHashAggregateExec]],
+   * and has grouping keys.
+   */
+  private def isHashBasedAggWithKeys(agg: BaseAggregateExec): Boolean = {
+    val isHashBasedAgg = agg match {
+      case _: HashAggregateExec | _: ObjectHashAggregateExec => true
+      case _ => false
+    }
+    isHashBasedAgg && agg.groupingExpressions.nonEmpty
+  }
 }
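
The condition the rule checks is visible above: the aggregate child's output ordering must already satisfy the ordering the corresponding SortAggregateExec would require. A small exploration sketch (assumed spark-shell session; it uses only the APIs shown in this diff) that inspects that condition on a planned query without invoking the rule:

    import org.apache.spark.sql.catalyst.expressions.SortOrder
    import org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec

    // Pre-AQE physical plan of a COLLECT_LIST aggregation over a sorted child.
    val plan = spark.sql(
      "SELECT key, COLLECT_LIST(key) FROM (SELECT id AS key FROM range(100) SORT BY key) GROUP BY key"
    ).queryExecution.sparkPlan

    // For each object hash aggregate, check whether its child ordering already satisfies the
    // sort order required by the equivalent sort aggregate (the same check the rule performs).
    plan.collect { case agg: ObjectHashAggregateExec => agg }.foreach { agg =>
      val sortAgg = agg.toSortAggregate
      val satisfied = SortOrder.orderingSatisfies(
        agg.child.outputOrdering, sortAgg.requiredChildOrdering.head)
      println(s"${agg.nodeName}: child ordering satisfies sort aggregate requirement = $satisfied")
    }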

sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/BaseAggregateExec.scala
10 additions & 0 deletions

@@ -30,6 +30,7 @@ trait BaseAggregateExec extends UnaryExecNode with AliasAwareOutputPartitioning
   def groupingExpressions: Seq[NamedExpression]
   def aggregateExpressions: Seq[AggregateExpression]
   def aggregateAttributes: Seq[Attribute]
+  def initialInputBufferOffset: Int
   def resultExpressions: Seq[NamedExpression]
 
   override def verboseStringWithOperatorId(): String = {
@@ -95,4 +96,13 @@
       case None => UnspecifiedDistribution :: Nil
     }
   }
+
+  /**
+   * The corresponding [[SortAggregateExec]] to get same result as this node.
+   */
+  def toSortAggregate: SortAggregateExec = {
+    SortAggregateExec(
+      requiredChildDistributionExpressions, groupingExpressions, aggregateExpressions,
+      aggregateAttributes, initialInputBufferOffset, resultExpressions, child)
+  }
 }
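
Because toSortAggregate now lives on the BaseAggregateExec trait rather than only on HashAggregateExec, both hash-based operators expose the same conversion. A hypothetical helper sketch (not part of this patch) showing how a caller can treat them uniformly, in the spirit of the rule above:

    import org.apache.spark.sql.execution.SparkPlan
    import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec}

    // Hypothetical helper: convert either hash-based aggregate operator to its sort-based
    // equivalent via the shared trait method, leaving all other plan nodes untouched.
    def asSortAggregate(plan: SparkPlan): SparkPlan = plan match {
      case agg: HashAggregateExec => agg.toSortAggregate
      case agg: ObjectHashAggregateExec => agg.toSortAggregate
      case other => other
    }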

sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala
0 additions & 9 deletions

@@ -1153,15 +1153,6 @@ case class HashAggregateExec(
     }
   }
 
-  /**
-   * The corresponding [[SortAggregateExec]] to get same result as this node.
-   */
-  def toSortAggregate: SortAggregateExec = {
-    SortAggregateExec(
-      requiredChildDistributionExpressions, groupingExpressions, aggregateExpressions,
-      aggregateAttributes, initialInputBufferOffset, resultExpressions, child)
-  }
-
   override protected def withNewChildInternal(newChild: SparkPlan): HashAggregateExec =
     copy(child = newChild)
 }

sql/core/src/test/scala/org/apache/spark/sql/execution/ReplaceHashWithSortAggSuite.scala
57 additions & 47 deletions

@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution
 
 import org.apache.spark.sql.{DataFrame, QueryTest}
 import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanHelper, DisableAdaptiveExecutionSuite, EnableAdaptiveExecutionSuite}
-import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, SortAggregateExec}
+import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
 
@@ -30,7 +30,9 @@ abstract class ReplaceHashWithSortAggSuiteBase
 
   private def checkNumAggs(df: DataFrame, hashAggCount: Int, sortAggCount: Int): Unit = {
     val plan = df.queryExecution.executedPlan
-    assert(collectWithSubqueries(plan) { case s: HashAggregateExec => s }.length == hashAggCount)
+    assert(collectWithSubqueries(plan) {
+      case s @ (_: HashAggregateExec | _: ObjectHashAggregateExec) => s
+    }.length == hashAggCount)
     assert(collectWithSubqueries(plan) { case s: SortAggregateExec => s }.length == sortAggCount)
   }
 
@@ -55,71 +57,79 @@ abstract class ReplaceHashWithSortAggSuiteBase
   test("replace partial hash aggregate with sort aggregate") {
     withTempView("t") {
       spark.range(100).selectExpr("id as key").repartition(10).createOrReplaceTempView("t")
-      val query =
-        """
-          |SELECT key, FIRST(key)
-          |FROM
-          |(
-          |  SELECT key
-          |  FROM t
-          |  WHERE key > 10
-          |  SORT BY key
-          |)
-          |GROUP BY key
-        """.stripMargin
-      checkAggs(query, 1, 1, 2, 0)
+      Seq("FIRST", "COLLECT_LIST").foreach { aggExpr =>
+        val query =
+          s"""
+             |SELECT key, $aggExpr(key)
+             |FROM
+             |(
+             |  SELECT key
+             |  FROM t
+             |  WHERE key > 10
+             |  SORT BY key
+             |)
+             |GROUP BY key
+           """.stripMargin
+        checkAggs(query, 1, 1, 2, 0)
+      }
     }
   }
 
   test("replace partial and final hash aggregate together with sort aggregate") {
     withTempView("t1", "t2") {
       spark.range(100).selectExpr("id as key").createOrReplaceTempView("t1")
       spark.range(50).selectExpr("id as key").createOrReplaceTempView("t2")
-      val query =
-        """
-          |SELECT key, COUNT(key)
-          |FROM
-          |(
-          |  SELECT /*+ SHUFFLE_MERGE(t1) */ t1.key AS key
-          |  FROM t1
-          |  JOIN t2
-          |  ON t1.key = t2.key
-          |)
-          |GROUP BY key
-        """.stripMargin
-      checkAggs(query, 0, 1, 2, 0)
+      Seq("COUNT", "COLLECT_LIST").foreach { aggExpr =>
+        val query =
+          s"""
+             |SELECT key, $aggExpr(key)
+             |FROM
+             |(
+             |  SELECT /*+ SHUFFLE_MERGE(t1) */ t1.key AS key
+             |  FROM t1
+             |  JOIN t2
+             |  ON t1.key = t2.key
+             |)
+             |GROUP BY key
+           """.stripMargin
+        checkAggs(query, 0, 1, 2, 0)
+      }
     }
   }
 
   test("do not replace hash aggregate if child does not have sort order") {
     withTempView("t1", "t2") {
       spark.range(100).selectExpr("id as key").createOrReplaceTempView("t1")
       spark.range(50).selectExpr("id as key").createOrReplaceTempView("t2")
-      val query =
-        """
-          |SELECT key, COUNT(key)
-          |FROM
-          |(
-          |  SELECT /*+ BROADCAST(t1) */ t1.key AS key
-          |  FROM t1
-          |  JOIN t2
-          |  ON t1.key = t2.key
-          |)
-          |GROUP BY key
-        """.stripMargin
-      checkAggs(query, 2, 0, 2, 0)
+      Seq("COUNT", "COLLECT_LIST").foreach { aggExpr =>
+        val query =
+          s"""
+             |SELECT key, $aggExpr(key)
+             |FROM
+             |(
+             |  SELECT /*+ BROADCAST(t1) */ t1.key AS key
+             |  FROM t1
+             |  JOIN t2
+             |  ON t1.key = t2.key
+             |)
+             |GROUP BY key
+           """.stripMargin
+        checkAggs(query, 2, 0, 2, 0)
+      }
     }
   }
 
   test("do not replace hash aggregate if there is no group-by column") {
     withTempView("t1") {
       spark.range(100).selectExpr("id as key").createOrReplaceTempView("t1")
-      val query =
-        """
-          |SELECT COUNT(key)
-          |FROM t1
-        """.stripMargin
-      checkAggs(query, 2, 0, 2, 0)
+      Seq("COUNT", "COLLECT_LIST").foreach { aggExpr =>
+        val query =
+          s"""
+             |SELECT $aggExpr(key)
+             |FROM t1
+           """.stripMargin
+        checkAggs(query, 2, 0, 2, 0)
+      }
     }
   }
 }
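
A note on why COLLECT_LIST is the aggregate chosen to extend coverage: it is a TypedImperativeAggregate, so with default settings Spark plans it with ObjectHashAggregateExec, while COUNT and FIRST plan as HashAggregateExec. A quick sanity-check sketch (assumed spark-shell session; the role of spark.sql.execution.useObjectHashAggregateExec is stated from memory, not from this diff):

    import org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec
    import org.apache.spark.sql.functions.collect_list

    // With spark.sql.execution.useObjectHashAggregateExec at its default (true), a COLLECT_LIST
    // aggregation is expected to plan as ObjectHashAggregateExec, which is what lets this suite
    // exercise the object hash path of ReplaceHashWithSortAgg.
    val aggPlan = spark.range(100).selectExpr("id AS key")
      .groupBy("key")
      .agg(collect_list("key"))
      .queryExecution
      .sparkPlan  // pre-AQE physical plan, so the aggregate nodes are directly visible

    assert(aggPlan.collect { case agg: ObjectHashAggregateExec => agg }.nonEmpty)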
