
Commit 7e8e4c0

JkSelf authored and cloud-fan committed
[SPARK-29552][SQL] Execute the "OptimizeLocalShuffleReader" rule when creating new query stage and then can optimize the shuffle reader to local shuffle reader as much as possible
### What changes were proposed in this pull request?

The `OptimizeLocalShuffleReader` rule is very conservative and gives up optimization as soon as any extra shuffles are introduced. It is very likely that most of the added local shuffle readers are fine and only one introduces an extra shuffle. However, it is very hard to make `OptimizeLocalShuffleReader` optimal; a simple workaround is to run this rule again right before executing a query stage.

### Why are the changes needed?

To optimize more shuffle readers into local shuffle readers.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Existing unit tests.

Closes #26207 from JkSelf/resolve-multi-joins-issue.

Authored-by: jiake <ke.a.jia@intel.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
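The trade-off described above can be illustrated with a toy model (plain Python, not Spark code; the stage structure, function names, and flag values are purely illustrative): the old all-or-nothing behaviour reverts every local reader if any candidate anywhere in the plan would introduce an extra shuffle, while re-running the rule per query stage only gives up the readers inside the offending stage.

```python
# Toy model of the optimization, NOT Spark's implementation.
# Each query stage is a list of candidate local shuffle readers;
# True means converting that reader would introduce an extra shuffle.

def optimize_global(stages):
    """All-or-nothing: revert every local reader if any candidate
    anywhere in the plan introduces an extra shuffle."""
    if any(bad for stage in stages for bad in stage):
        return 0
    return sum(len(stage) for stage in stages)

def optimize_per_stage(stages):
    """Re-run the rule on each stage's sub-plan-tree: only the
    offending stage reverts its readers; the others keep theirs."""
    total = 0
    for stage in stages:
        if not any(stage):
            total += len(stage)
    return total

# Three stages; only the middle one has a reader that adds a shuffle.
plan = [[False, False], [True], [False]]
print(optimize_global(plan))     # old behaviour keeps 0 local readers
print(optimize_per_stage(plan))  # per-stage re-run keeps 3
```

This is the intuition behind adding `OptimizeLocalShuffleReader(conf)` to `queryStageOptimizerRules` below: the rule gets a second chance on each stage's sub-plan-tree, so one bad candidate no longer disables the optimization for the whole plan.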
1 parent bfbf282 commit 7e8e4c0

2 files changed: +16 −4 lines


sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala

Lines changed: 9 additions & 0 deletions
@@ -92,6 +92,15 @@ case class AdaptiveSparkPlanExec(
     // optimizations should be stage-independent.
     @transient private val queryStageOptimizerRules: Seq[Rule[SparkPlan]] = Seq(
       ReuseAdaptiveSubquery(conf, subqueryCache),
+      // When adding local shuffle readers in 'OptimizeLocalShuffleReader`, we revert all the local
+      // readers if additional shuffles are introduced. This may be too conservative: maybe there is
+      // only one local reader that introduces shuffle, and we can still keep other local readers.
+      // Here we re-execute this rule with the sub-plan-tree of a query stage, to make sure necessary
+      // local readers are added before executing the query stage.
+      // This rule must be executed before `ReduceNumShufflePartitions`, as local shuffle readers
+      // can't change number of partitions.
+      OptimizeLocalShuffleReader(conf),
       ReduceNumShufflePartitions(conf),
       ApplyColumnarRulesAndInsertTransitions(session.sessionState.conf,
         session.sessionState.columnarRules),

sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala

Lines changed: 7 additions & 4 deletions
@@ -163,8 +163,9 @@ class AdaptiveQueryExecSuite
       assert(smj.size == 3)
       val bhj = findTopLevelBroadcastHashJoin(adaptivePlan)
       assert(bhj.size == 3)
-      // additional shuffle exchange introduced, only one shuffle reader to local shuffle reader.
-      checkNumLocalShuffleReaders(adaptivePlan, 1)
+      // The child of remaining one BroadcastHashJoin is not ShuffleQueryStage.
+      // So only two LocalShuffleReader.
+      checkNumLocalShuffleReaders(adaptivePlan, 2)
     }
   }

@@ -188,7 +189,8 @@
       assert(smj.size == 3)
       val bhj = findTopLevelBroadcastHashJoin(adaptivePlan)
       assert(bhj.size == 3)
-      // additional shuffle exchange introduced, only one shuffle reader to local shuffle reader.
+      // The child of remaining two BroadcastHashJoin is not ShuffleQueryStage.
+      // So only two LocalShuffleReader.
       checkNumLocalShuffleReaders(adaptivePlan, 1)
     }
   }

@@ -213,7 +215,8 @@
       assert(smj.size == 3)
       val bhj = findTopLevelBroadcastHashJoin(adaptivePlan)
       assert(bhj.size == 3)
-      // additional shuffle exchange introduced, only one shuffle reader to local shuffle reader.
+      // The child of remaining two BroadcastHashJoin is not ShuffleQueryStage.
+      // So only two LocalShuffleReader.
       checkNumLocalShuffleReaders(adaptivePlan, 1)
     }
   }
