
Commit edbdea1

Jiajia Li authored and carsonwang committed
Disable change reduce number if the joins are changed (apache#81)
* Disable change reduce number if the joins are changed
* Change reduce number when all leaf nodes are shuffle query stages and not local shuffles
* Ensure all leaf nodes are shuffle query stages
* Update comments.
1 parent 61bd1c9 commit edbdea1
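
In short: before changing the number of reduce tasks for a query stage, the planner now collects the stage's leaf nodes and proceeds only when every leaf is a non-local ShuffleQueryStageInput or a SkewedShuffleQueryStageInput. A toy sketch of that guard follows; the types are illustrative stand-ins, not the real Spark plan nodes, and the authoritative change is the QueryStage.scala diff below.

// Toy model of the guard added by this commit (stand-in types, not Spark's).
object LeafGuardSketch extends App {
  sealed trait Plan { def children: Seq[Plan] }
  case class ShuffleStageInput(isLocalShuffle: Boolean) extends Plan { def children = Nil }
  case object SkewedStageInput extends Plan { def children = Nil }
  case class Join(left: Plan, right: Plan) extends Plan { def children = Seq(left, right) }

  def leaves(p: Plan): Seq[Plan] =
    if (p.children.isEmpty) Seq(p) else p.children.flatMap(leaves)

  // Only change the reduce number when every leaf is a non-local shuffle
  // query stage or a skewed shuffle query stage input.
  def canChangeReduceNumber(root: Plan): Boolean = {
    val ls = leaves(root)
    val shuffles = ls.count {
      case s: ShuffleStageInput => !s.isLocalShuffle
      case _ => false
    }
    val skewed = ls.count(_ == SkewedStageInput)
    ls.length == shuffles + skewed
  }

  // Per the commit message, a local shuffle leaf (which a changed join may
  // leave behind) blocks the change; an all-shuffle plan still allows it.
  assert(!canChangeReduceNumber(Join(ShuffleStageInput(isLocalShuffle = true), ShuffleStageInput(false))))
  assert(canChangeReduceNumber(Join(ShuffleStageInput(false), ShuffleStageInput(false))))
}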

File tree: 2 files changed, +105 -24 lines

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStage.scala

Lines changed: 36 additions & 24 deletions
@@ -113,38 +113,50 @@ abstract class QueryStage extends UnaryExecNode {
     val queryStageInputs: Seq[ShuffleQueryStageInput] = child.collect {
       case input: ShuffleQueryStageInput if !input.isLocalShuffle => input
     }
-    val childMapOutputStatistics = queryStageInputs.map(_.childStage.mapOutputStatistics)
-      .filter(_ != null).toArray
-    // Right now, Adaptive execution only supports HashPartitionings.
-    val supportAdaptive = queryStageInputs.forall {
+
+    val skewedShuffleQueryStageInputs: Seq[SkewedShuffleQueryStageInput] = child.collect {
+      case input: SkewedShuffleQueryStageInput => input
+    }
+
+    val leafNodes = child.collect {
+      case s: SparkPlan if s.children.isEmpty => s
+    }
+
+    // Ensure all leaf nodes are shuffle query stages
+    if (leafNodes.length == queryStageInputs.length + skewedShuffleQueryStageInputs.length) {
+      val childMapOutputStatistics = queryStageInputs.map(_.childStage.mapOutputStatistics)
+        .filter(_ != null).toArray
+      // Right now, Adaptive execution only supports HashPartitionings.
+      val supportAdaptive = queryStageInputs.forall {
       _.outputPartitioning match {
         case hash: HashPartitioning => true
         case collection: PartitioningCollection =>
           collection.partitionings.forall(_.isInstanceOf[HashPartitioning])
         case _ => false
       }
-    }
+      }
 
-    if (childMapOutputStatistics.length > 0 && supportAdaptive) {
-      val exchangeCoordinator = new ExchangeCoordinator(
-        conf.targetPostShuffleInputSize,
-        conf.adaptiveTargetPostShuffleRowCount,
-        conf.minNumPostShufflePartitions)
-
-      if (queryStageInputs.length == 2 && queryStageInputs.forall(_.skewedPartitions.isDefined)) {
-        // If a skewed join is detected and optimized, we will omit the skewed partitions when
-        // estimating the partition start and end indices.
-        val (partitionStartIndices, partitionEndIndices) =
-          exchangeCoordinator.estimatePartitionStartEndIndices(
-            childMapOutputStatistics, queryStageInputs(0).skewedPartitions.get)
-        queryStageInputs.foreach { i =>
-          i.partitionStartIndices = Some(partitionStartIndices)
-          i.partitionEndIndices = Some(partitionEndIndices)
+      if (childMapOutputStatistics.length > 0 && supportAdaptive) {
+        val exchangeCoordinator = new ExchangeCoordinator(
+          conf.targetPostShuffleInputSize,
+          conf.adaptiveTargetPostShuffleRowCount,
+          conf.minNumPostShufflePartitions)
+
+        if (queryStageInputs.length == 2 && queryStageInputs.forall(_.skewedPartitions.isDefined)) {
+          // If a skewed join is detected and optimized, we will omit the skewed partitions when
+          // estimating the partition start and end indices.
+          val (partitionStartIndices, partitionEndIndices) =
+            exchangeCoordinator.estimatePartitionStartEndIndices(
+              childMapOutputStatistics, queryStageInputs(0).skewedPartitions.get)
+          queryStageInputs.foreach { i =>
+            i.partitionStartIndices = Some(partitionStartIndices)
+            i.partitionEndIndices = Some(partitionEndIndices)
+          }
+        } else {
+          val partitionStartIndices =
+            exchangeCoordinator.estimatePartitionStartIndices(childMapOutputStatistics)
+          queryStageInputs.foreach(_.partitionStartIndices = Some(partitionStartIndices))
         }
-      } else {
-        val partitionStartIndices =
-          exchangeCoordinator.estimatePartitionStartIndices(childMapOutputStatistics)
-        queryStageInputs.foreach(_.partitionStartIndices = Some(partitionStartIndices))
       }
     }
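
For context on estimatePartitionStartIndices used above: a minimal sketch of the coalescing idea, modeled on upstream Spark 2.x's ExchangeCoordinator (an assumption on my part; this fork's coordinator also takes a row-count target, adaptiveTargetPostShuffleRowCount, which the sketch omits). Sizes of the same map-output partition are summed across all child shuffles, and a new post-shuffle partition starts whenever the running total would exceed the byte target.

// Simplified model of post-shuffle partition coalescing (illustrative only;
// modeled on upstream Spark 2.x, not this fork's exact implementation).
object CoalesceSketch extends App {
  def estimatePartitionStartIndices(
      bytesByShuffle: Array[Array[Long]], // per shuffle: bytes per map-output partition
      targetSize: Long): Array[Int] = {
    val numPartitions = bytesByShuffle.head.length
    val starts = scala.collection.mutable.ArrayBuffer(0)
    var running = 0L
    var i = 0
    while (i < numPartitions) {
      // The same partition id across all shuffles is read by one reducer.
      val size = bytesByShuffle.map(_(i)).sum
      if (i > 0 && running + size > targetSize) {
        starts += i // close the current post-shuffle partition, open a new one
        running = size
      } else {
        running += size
      }
      i += 1
    }
    starts.toArray
  }

  // Two shuffles, five partitions of 50 combined bytes each, 100-byte target.
  val stats = Array(Array(30L, 30L, 30L, 30L, 30L), Array(20L, 20L, 20L, 20L, 20L))
  println(estimatePartitionStartIndices(stats, targetSize = 100L).mkString(", ")) // 0, 2, 4
}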

sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/QueryStageSuite.scala

Lines changed: 69 additions & 0 deletions
@@ -262,6 +262,75 @@ class QueryStageSuite extends SparkFunSuite with BeforeAndAfterAll {
     }
   }
 
+  test("One of two sort merge inner joins to broadcast join") {
+    // t1 is smaller than spark.sql.adaptiveBroadcastJoinThreshold
+    // t2 and t3 are greater than spark.sql.adaptiveBroadcastJoinThreshold
+    // Join1 is changed to broadcast join.
+    //
+    //        Join2
+    //        /   \
+    //    Join1    Ex (Exchange)
+    //    /   \     \
+    //   Ex    Ex    t3
+    //   /      \
+    //  t1      t2
+    val spark = defaultSparkSession
+    spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ALLOW_ADDITIONAL_SHUFFLE.key, "true")
+    withSparkSession(spark) { spark: SparkSession =>
+      val df1 =
+        spark
+          .range(0, 1000, 1, numInputPartitions)
+          .selectExpr("id % 500 as key1", "id as value1")
+      val df2 =
+        spark
+          .range(0, 1000, 1, numInputPartitions)
+          .selectExpr("id % 500 as key2", "id as value2")
+      val df3 =
+        spark
+          .range(0, 1500, 1, numInputPartitions)
+          .selectExpr("id % 500 as key3", "id as value3")
+
+      val join =
+        df1
+          .join(df2, col("key1") === col("key2"))
+          .join(df3, col("key2") === col("key3"))
+          .select(col("key3"), col("value1"))
+
+      // Before execution, there are two SortMergeJoins
+      val smjBeforeExecution = join.queryExecution.executedPlan.collect {
+        case smj: SortMergeJoinExec => smj
+      }
+      assert(smjBeforeExecution.length === 2)
+
+      // Check the answer.
+      val partResult =
+        spark
+          .range(0, 1000)
+          .selectExpr("id % 500 as key", "id as value")
+          .union(spark.range(0, 1000).selectExpr("id % 500 as key", "id as value"))
+      val expectedAnswer = partResult.union(partResult).union(partResult)
+      checkAnswer(
+        join,
+        expectedAnswer.collect())
+
+      // During execution, one SortMergeJoin is changed to BroadcastHashJoin
+      val numSmjAfterExecution = join.queryExecution.executedPlan.collect {
+        case smj: SortMergeJoinExec => smj
+      }.length
+      assert(numSmjAfterExecution === 1)
+
+      val numBhjAfterExecution = join.queryExecution.executedPlan.collect {
+        case bhj: BroadcastHashJoinExec => bhj
+      }.length
+      assert(numBhjAfterExecution === 1)
+
+      val queryStageInputs = join.queryExecution.executedPlan.collect {
+        case q: QueryStageInput => q
+      }
+      assert(queryStageInputs.length === 3)
+    }
+  }
+
   test("Reuse QueryStage in adaptive execution") {
     withSparkSession(defaultSparkSession) { spark: SparkSession =>
       val df = spark.range(0, 1000, 1, numInputPartitions).toDF()

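A quick check of the arithmetic behind expectedAnswer in the new test (my reasoning, not part of the commit): each key appears twice in df1 and df2 and three times in df3, so the join emits 2 * 2 * 3 = 12 rows per key and 500 * 12 = 6000 rows in total; partResult has 2000 rows, so three copies of it match, each (key, value) pair occurring six times on both sides.

// Row-count arithmetic for the test's expected answer (illustrative only).
object ExpectedAnswerCheck extends App {
  val keys = 500
  val perKey1 = 1000 / keys             // each key appears twice in df1
  val perKey2 = 1000 / keys             // and twice in df2
  val perKey3 = 1500 / keys             // and three times in df3
  val joinRows = keys * perKey1 * perKey2 * perKey3 // 500 * 12 = 6000
  val partResultRows = 2 * 1000         // union of two 1000-row frames
  assert(joinRows == 3 * partResultRows) // expectedAnswer = 3 copies of partResult
  println(s"$joinRows rows from the join == 3 x $partResultRows expected")
}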