
Commit dfa3978

maryannxue authored and gatorsmile committed
[SPARK-33551][SQL] Do not use custom shuffle reader for repartition
### What changes were proposed in this pull request?

This PR fixes an AQE issue where local shuffle reader, partition coalescing, or skew join optimization can be mistakenly applied to a shuffle introduced by repartition, or to a regular shuffle that logically replaces a repartition shuffle. The proposed solution checks for the presence of any repartition shuffle and filters out the optimization rules that are not applicable to the final stage of an AQE plan.

### Why are the changes needed?

Without the change, the output of a repartition query may not be correct.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Added UT.

Closes #30494 from maryannxue/csr-repartition.

Authored-by: Maryann Xue <maryann.xue@gmail.com>
Signed-off-by: Xiao Li <gatorsmile@gmail.com>
1 parent ed9e6fc commit dfa3978
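
For context, a minimal sketch (not from the patch; assumes a live `SparkSession` named `spark` with AQE enabled) of the invariant the fix protects: a partition count requested via `repartition` must survive AQE's final-stage optimizations.

```scala
// Hypothetical illustration, not part of this commit: AQE must not coalesce
// away a user-requested partition count.
import org.apache.spark.sql.functions.col

val df = spark.range(100).repartition(3, col("id") % 10)
df.collect()
// With the fix, the final-stage reader rules are skipped for this
// REPARTITION_WITH_NUM shuffle, so the requested count of 3 is preserved.
assert(df.rdd.getNumPartitions == 3)
```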

File tree

7 files changed (+187, -29)

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -509,7 +509,7 @@ object SQLConf {
         "'spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes'")
       .version("3.0.0")
       .intConf
-      .checkValue(_ > 0, "The skew factor must be positive.")
+      .checkValue(_ >= 0, "The skew factor cannot be negative.")
       .createWithDefault(5)

   val SKEW_JOIN_SKEWED_PARTITION_THRESHOLD =
```
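
The relaxed bound is what lets the new test force skew detection on tiny data. A hedged usage sketch (assumes a live `SparkSession` named `spark`; the config keys match the ones referenced in the test below):

```scala
// With the relaxed check, a skew factor of 0 is now a valid setting, which the
// new UT uses to make every partition qualify as skewed.
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")
spark.conf.set("spark.sql.adaptive.skewJoin.skewedPartitionFactor", "0")
```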

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala

Lines changed: 18 additions & 13 deletions
```diff
@@ -37,8 +37,6 @@ import org.apache.spark.sql.catalyst.trees.TreeNodeTag
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec._
 import org.apache.spark.sql.execution.bucketing.DisableUnnecessaryBucketedScan
-import org.apache.spark.sql.execution.command.DataWritingCommandExec
-import org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec
 import org.apache.spark.sql.execution.exchange._
 import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SparkListenerSQLAdaptiveSQLMetricUpdates, SQLPlanMetric}
 import org.apache.spark.sql.internal.SQLConf
@@ -104,23 +102,30 @@ case class AdaptiveSparkPlanExec(
     OptimizeLocalShuffleReader
   )

-  private def finalStageOptimizerRules: Seq[Rule[SparkPlan]] =
-    context.qe.sparkPlan match {
-      case _: DataWritingCommandExec | _: V2TableWriteExec =>
-        // SPARK-32932: Local shuffle reader could break partitioning that works best
-        // for the following writing command
-        queryStageOptimizerRules.filterNot(_ == OptimizeLocalShuffleReader)
-      case _ =>
-        queryStageOptimizerRules
-    }
-
   // A list of physical optimizer rules to be applied right after a new stage is created. The input
   // plan to these rules has exchange as its root node.
   @transient private val postStageCreationRules = Seq(
     ApplyColumnarRulesAndInsertTransitions(context.session.sessionState.columnarRules),
     CollapseCodegenStages()
   )

+  // The partitioning of the query output depends on the shuffle(s) in the final stage. If the
+  // original plan contains a repartition operator, we need to preserve the specified partitioning,
+  // whether or not the repartition-introduced shuffle is optimized out because of an underlying
+  // shuffle of the same partitioning. Thus, we need to exclude some `CustomShuffleReaderRule`s
+  // from the final stage, depending on the presence and properties of repartition operators.
+  private def finalStageOptimizerRules: Seq[Rule[SparkPlan]] = {
+    val origins = inputPlan.collect {
+      case s: ShuffleExchangeLike => s.shuffleOrigin
+    }
+    val allRules = queryStageOptimizerRules ++ postStageCreationRules
+    allRules.filter {
+      case c: CustomShuffleReaderRule =>
+        origins.forall(c.supportedShuffleOrigins.contains)
+      case _ => true
+    }
+  }
+
   @transient private val costEvaluator = SimpleCostEvaluator

   @transient private val initialPlan = context.session.withActive {
@@ -249,7 +254,7 @@ case class AdaptiveSparkPlanExec(
         // Run the final plan when there's no more unfinished stages.
         currentPhysicalPlan = applyPhysicalRules(
           result.newPlan,
-          finalStageOptimizerRules ++ postStageCreationRules,
+          finalStageOptimizerRules,
           Some((planChangeLogger, "AQE Final Query Stage Optimization")))
         isFinalPlan = true
         executionId.foreach(onUpdatePlan(_, Seq(currentPhysicalPlan)))
```
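
To see what the new `finalStageOptimizerRules` computes, here is a self-contained sketch of the same origin-based filtering using stand-in types (the real `ShuffleOrigin`, rule, and plan classes are Spark internals; everything below is illustrative only):

```scala
// Stand-in types, not the real Spark classes.
sealed trait ShuffleOrigin
case object ENSURE_REQUIREMENTS extends ShuffleOrigin
case object REPARTITION extends ShuffleOrigin
case object REPARTITION_WITH_NUM extends ShuffleOrigin

sealed trait PlanRule { def name: String }
// Stand-in for CustomShuffleReaderRule: valid only for the origins it declares.
case class ReaderRule(name: String, supportedShuffleOrigins: Seq[ShuffleOrigin]) extends PlanRule
case class OtherRule(name: String) extends PlanRule

// Keep a reader rule only if it supports every shuffle origin present in the
// plan; all other rules always stay. This mirrors the filter above.
def finalStageRules(origins: Seq[ShuffleOrigin], rules: Seq[PlanRule]): Seq[PlanRule] =
  rules.filter {
    case r: ReaderRule => origins.forall(r.supportedShuffleOrigins.contains)
    case _             => true
  }

val rules = Seq(
  ReaderRule("OptimizeLocalShuffleReader", Seq(ENSURE_REQUIREMENTS)),
  ReaderRule("CoalesceShufflePartitions", Seq(ENSURE_REQUIREMENTS, REPARTITION)),
  OtherRule("CollapseCodegenStages"))

// A plan containing a REPARTITION_WITH_NUM shuffle keeps none of the reader
// rules, so the user-specified partitioning cannot be altered in the final stage.
assert(finalStageRules(Seq(ENSURE_REQUIREMENTS, REPARTITION_WITH_NUM), rules).map(_.name) ==
  Seq("CollapseCodegenStages"))
```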

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CoalesceShufflePartitions.scala

Lines changed: 6 additions & 5 deletions
```diff
@@ -19,16 +19,18 @@ package org.apache.spark.sql.execution.adaptive

 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.plans.physical.SinglePartition
-import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, REPARTITION, ShuffleExchangeLike}
+import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, REPARTITION, ShuffleExchangeLike, ShuffleOrigin}
 import org.apache.spark.sql.internal.SQLConf

 /**
  * A rule to coalesce the shuffle partitions based on the map output statistics, which can
  * avoid many small reduce tasks that hurt performance.
  */
-case class CoalesceShufflePartitions(session: SparkSession) extends Rule[SparkPlan] {
+case class CoalesceShufflePartitions(session: SparkSession) extends CustomShuffleReaderRule {
+
+  override val supportedShuffleOrigins: Seq[ShuffleOrigin] = Seq(ENSURE_REQUIREMENTS, REPARTITION)

   override def apply(plan: SparkPlan): SparkPlan = {
     if (!conf.coalesceShufflePartitionsEnabled) {
       return plan
@@ -86,7 +88,6 @@ case class CoalesceShufflePartitions(session: SparkSession) extends Rule[SparkPl
   }

   private def supportCoalesce(s: ShuffleExchangeLike): Boolean = {
-    s.outputPartitioning != SinglePartition &&
-      (s.shuffleOrigin == ENSURE_REQUIREMENTS || s.shuffleOrigin == REPARTITION)
+    s.outputPartitioning != SinglePartition && supportedShuffleOrigins.contains(s.shuffleOrigin)
   }
 }
```
sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CustomShuffleReaderRule.scala

Lines changed: 33 additions & 0 deletions

```diff
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.adaptive
+
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.exchange.ShuffleOrigin
+
+/**
+ * Adaptive Query Execution rule that may create [[CustomShuffleReaderExec]] on top of query stages.
+ */
+trait CustomShuffleReaderRule extends Rule[SparkPlan] {
+
+  /**
+   * Returns the list of [[ShuffleOrigin]]s supported by this rule.
+   */
+  def supportedShuffleOrigins: Seq[ShuffleOrigin]
+}
```
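
For illustration only, a hypothetical rule implementing the new trait might look like this (the object name and no-op behavior are invented for the example; the concrete implementors in this commit are `CoalesceShufflePartitions`, `OptimizeLocalShuffleReader`, and `OptimizeSkewedJoin`):

```scala
// Hypothetical example rule, not part of this commit: declares that it only
// understands shuffles inserted by EnsureRequirements, and rewrites nothing.
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, ShuffleOrigin}

object NoOpShuffleReaderRule extends CustomShuffleReaderRule {
  override val supportedShuffleOrigins: Seq[ShuffleOrigin] = Seq(ENSURE_REQUIREMENTS)
  override def apply(plan: SparkPlan): SparkPlan = plan
}
```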

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala

Lines changed: 5 additions & 4 deletions
```diff
@@ -19,9 +19,8 @@ package org.apache.spark.sql.execution.adaptive

 import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, BuildSide}
 import org.apache.spark.sql.catalyst.plans.physical.SinglePartition
-import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution._
-import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, EnsureRequirements, ShuffleExchangeExec, ShuffleExchangeLike}
+import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, EnsureRequirements, ShuffleExchangeExec, ShuffleExchangeLike, ShuffleOrigin}
 import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec
 import org.apache.spark.sql.internal.SQLConf

@@ -34,7 +33,9 @@ import org.apache.spark.sql.internal.SQLConf
  * then run `EnsureRequirements` to check whether additional shuffle introduced.
  * If introduced, we will revert all the local readers.
  */
-object OptimizeLocalShuffleReader extends Rule[SparkPlan] {
+object OptimizeLocalShuffleReader extends CustomShuffleReaderRule {
+
+  override val supportedShuffleOrigins: Seq[ShuffleOrigin] = Seq(ENSURE_REQUIREMENTS)

   private val ensureRequirements = EnsureRequirements

@@ -144,6 +145,6 @@ object OptimizeLocalShuffleReader extends Rule[SparkPlan] {
   }

   private def supportLocalReader(s: ShuffleExchangeLike): Boolean = {
-    s.outputPartitioning != SinglePartition && s.shuffleOrigin == ENSURE_REQUIREMENTS
+    s.outputPartitioning != SinglePartition && supportedShuffleOrigins.contains(s.shuffleOrigin)
   }
 }
```

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala

Lines changed: 9 additions & 5 deletions
```diff
@@ -23,9 +23,8 @@ import org.apache.commons.io.FileUtils

 import org.apache.spark.{MapOutputStatistics, MapOutputTrackerMaster, SparkEnv}
 import org.apache.spark.sql.catalyst.plans._
-import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution._
-import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ShuffleExchangeExec}
+import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, EnsureRequirements, ShuffleExchangeExec, ShuffleOrigin}
 import org.apache.spark.sql.execution.joins.SortMergeJoinExec
 import org.apache.spark.sql.internal.SQLConf

@@ -53,7 +52,9 @@ import org.apache.spark.sql.internal.SQLConf
  * Note that, when this rule is enabled, it also coalesces non-skewed partitions like
  * `CoalesceShufflePartitions` does.
  */
-object OptimizeSkewedJoin extends Rule[SparkPlan] {
+object OptimizeSkewedJoin extends CustomShuffleReaderRule {
+
+  override val supportedShuffleOrigins: Seq[ShuffleOrigin] = Seq(ENSURE_REQUIREMENTS)

   private val ensureRequirements = EnsureRequirements

@@ -290,7 +291,9 @@ object OptimizeSkewedJoin extends Rule[SparkPlan] {

 private object ShuffleStage {
   def unapply(plan: SparkPlan): Option[ShuffleStageInfo] = plan match {
-    case s: ShuffleQueryStageExec if s.mapStats.isDefined =>
+    case s: ShuffleQueryStageExec
+        if s.mapStats.isDefined &&
+          OptimizeSkewedJoin.supportedShuffleOrigins.contains(s.shuffle.shuffleOrigin) =>
       val mapStats = s.mapStats.get
       val sizes = mapStats.bytesByPartitionId
       val partitions = sizes.zipWithIndex.map {
@@ -299,7 +302,8 @@ private object ShuffleStage {
       Some(ShuffleStageInfo(s, mapStats, partitions))

     case CustomShuffleReaderExec(s: ShuffleQueryStageExec, partitionSpecs)
-      if s.mapStats.isDefined && partitionSpecs.nonEmpty =>
+      if s.mapStats.isDefined && partitionSpecs.nonEmpty &&
+        OptimizeSkewedJoin.supportedShuffleOrigins.contains(s.shuffle.shuffleOrigin) =>
       val mapStats = s.mapStats.get
       val sizes = mapStats.bytesByPartitionId
       val partitions = partitionSpecs.map {
```

sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala

Lines changed: 115 additions & 1 deletion
```diff
@@ -30,7 +30,7 @@ import org.apache.spark.sql.execution.{PartialReducerPartitionSpec, QueryExecuti
 import org.apache.spark.sql.execution.command.DataWritingCommandExec
 import org.apache.spark.sql.execution.datasources.noop.NoopDataSource
 import org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec
-import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, Exchange, ReusedExchangeExec, ShuffleExchangeExec}
+import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, Exchange, REPARTITION, REPARTITION_WITH_NUM, ReusedExchangeExec, ShuffleExchangeExec, ShuffleExchangeLike}
 import org.apache.spark.sql.execution.joins.{BaseJoinExec, BroadcastHashJoinExec, SortMergeJoinExec}
 import org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate
 import org.apache.spark.sql.functions._
@@ -1317,4 +1317,118 @@ class AdaptiveQueryExecSuite
       checkNumLocalShuffleReaders(df.queryExecution.executedPlan, numShufflesWithoutLocalReader = 1)
     }
   }
+
+  test("SPARK-33551: Do not use custom shuffle reader for repartition") {
+    def hasRepartitionShuffle(plan: SparkPlan): Boolean = {
+      find(plan) {
+        case s: ShuffleExchangeLike =>
+          s.shuffleOrigin == REPARTITION || s.shuffleOrigin == REPARTITION_WITH_NUM
+        case _ => false
+      }.isDefined
+    }
+
+    withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true",
+      SQLConf.SHUFFLE_PARTITIONS.key -> "5") {
+      val df = sql(
+        """
+          |SELECT * FROM (
+          |  SELECT * FROM testData WHERE key = 1
+          |)
+          |RIGHT OUTER JOIN testData2
+          |ON value = b
+        """.stripMargin)
+
+      withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "80") {
+        // Repartition with no partition num specified.
+        val dfRepartition = df.repartition('b)
+        dfRepartition.collect()
+        val plan = dfRepartition.queryExecution.executedPlan
+        // The top shuffle from repartition is optimized out.
+        assert(!hasRepartitionShuffle(plan))
+        val bhj = findTopLevelBroadcastHashJoin(plan)
+        assert(bhj.length == 1)
+        checkNumLocalShuffleReaders(plan, 1)
+        // Probe side is coalesced.
+        val customReader = bhj.head.right.find(_.isInstanceOf[CustomShuffleReaderExec])
+        assert(customReader.isDefined)
+        assert(customReader.get.asInstanceOf[CustomShuffleReaderExec].hasCoalescedPartition)
+
+        // Repartition with default partition num specified.
+        val dfRepartitionWithNum = df.repartition(5, 'b)
+        dfRepartitionWithNum.collect()
+        val planWithNum = dfRepartitionWithNum.queryExecution.executedPlan
+        // The top shuffle from repartition is optimized out.
+        assert(!hasRepartitionShuffle(planWithNum))
+        val bhjWithNum = findTopLevelBroadcastHashJoin(planWithNum)
+        assert(bhjWithNum.length == 1)
+        checkNumLocalShuffleReaders(planWithNum, 1)
+        // Probe side is not coalesced.
+        assert(bhjWithNum.head.right.find(_.isInstanceOf[CustomShuffleReaderExec]).isEmpty)
+
+        // Repartition with non-default partition num specified.
+        val dfRepartitionWithNum2 = df.repartition(3, 'b)
+        dfRepartitionWithNum2.collect()
+        val planWithNum2 = dfRepartitionWithNum2.queryExecution.executedPlan
+        // The top shuffle from repartition is not optimized out, and this is the only shuffle
+        // that does not have local shuffle reader.
+        assert(hasRepartitionShuffle(planWithNum2))
+        val bhjWithNum2 = findTopLevelBroadcastHashJoin(planWithNum2)
+        assert(bhjWithNum2.length == 1)
+        checkNumLocalShuffleReaders(planWithNum2, 1)
+        val customReader2 = bhjWithNum2.head.right.find(_.isInstanceOf[CustomShuffleReaderExec])
+        assert(customReader2.isDefined)
+        assert(customReader2.get.asInstanceOf[CustomShuffleReaderExec].isLocalReader)
+      }
+
+      // Force skew join
+      withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1",
+        SQLConf.SKEW_JOIN_ENABLED.key -> "true",
+        SQLConf.SKEW_JOIN_SKEWED_PARTITION_THRESHOLD.key -> "1",
+        SQLConf.SKEW_JOIN_SKEWED_PARTITION_FACTOR.key -> "0",
+        SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "10") {
+        // Repartition with no partition num specified.
+        val dfRepartition = df.repartition('b)
+        dfRepartition.collect()
+        val plan = dfRepartition.queryExecution.executedPlan
+        // The top shuffle from repartition is optimized out.
+        assert(!hasRepartitionShuffle(plan))
+        val smj = findTopLevelSortMergeJoin(plan)
+        assert(smj.length == 1)
+        // No skew join due to the repartition.
+        assert(!smj.head.isSkewJoin)
+        // Both sides are coalesced.
+        val customReaders = collect(smj.head) {
+          case c: CustomShuffleReaderExec if c.hasCoalescedPartition => c
+        }
+        assert(customReaders.length == 2)
+
+        // Repartition with default partition num specified.
+        val dfRepartitionWithNum = df.repartition(5, 'b)
+        dfRepartitionWithNum.collect()
+        val planWithNum = dfRepartitionWithNum.queryExecution.executedPlan
+        // The top shuffle from repartition is optimized out.
+        assert(!hasRepartitionShuffle(planWithNum))
+        val smjWithNum = findTopLevelSortMergeJoin(planWithNum)
+        assert(smjWithNum.length == 1)
+        // No skew join due to the repartition.
+        assert(!smjWithNum.head.isSkewJoin)
+        // No coalesce due to the num in repartition.
+        val customReadersWithNum = collect(smjWithNum.head) {
+          case c: CustomShuffleReaderExec if c.hasCoalescedPartition => c
+        }
+        assert(customReadersWithNum.isEmpty)
+
+        // Repartition with non-default partition num specified.
+        val dfRepartitionWithNum2 = df.repartition(3, 'b)
+        dfRepartitionWithNum2.collect()
+        val planWithNum2 = dfRepartitionWithNum2.queryExecution.executedPlan
+        // The top shuffle from repartition is not optimized out.
+        assert(hasRepartitionShuffle(planWithNum2))
+        val smjWithNum2 = findTopLevelSortMergeJoin(planWithNum2)
+        assert(smjWithNum2.length == 1)
+        // Skew join can apply as the repartition is not optimized out.
+        assert(smjWithNum2.head.isSkewJoin)
+      }
+    }
+  }
 }
```
