Commit 9a4f8c3

wakun authored and GitHub Enterprise committed
[CARMEL-6174][FOLLOWUP] Change prefer shuffled hash join condition (#1099)
* [CARMEL-6174][FOLLOWUP] Change prefer shuffled hash join condition
* Select SHJ when the max partition size is larger than ADAPTIVE_SHUFFLE_HASH_JOIN_ADVISORY_STREAM_PARTITION_SIZE
1 parent 41e5524 commit 9a4f8c3

3 files changed: +106 -40 lines


sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 8 additions & 0 deletions

@@ -736,6 +736,14 @@ object SQLConf {
       .bytesConf(ByteUnit.BYTE)
       .createWithDefault(0L)
 
+  val ADAPTIVE_SHUFFLE_HASH_JOIN_ADVISORY_STREAM_PARTITION_SIZE =
+    buildConf("spark.sql.adaptive.shuffledHashJoinAdvisoryStreamPartitionSize")
+      .doc("If the median partition size is larger than this config, join selection prefers " +
+        "to use shuffled hash join.")
+      .version("3.2.0")
+      .bytesConf(ByteUnit.BYTE)
+      .createWithDefault(0L)
+
   val SUBEXPRESSION_ELIMINATION_ENABLED =
     buildConf("spark.sql.subexpressionElimination.enabled")
       .internal()
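
For context (not part of the commit): both knobs default to 0, and the selection logic below bails out when either is non-positive, so a session has to set both for the new SHJ preference to apply. A minimal sketch, with illustrative byte values:

    // Hypothetical session setup; values are examples, not recommendations.
    // Both must be > 0, otherwise preferShuffledHashJoin() returns false early.
    spark.conf.set("spark.sql.adaptive.maxShuffledHashJoinLocalMapThreshold", "128m")
    spark.conf.set("spark.sql.adaptive.shuffledHashJoinAdvisoryStreamPartitionSize", "64m")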

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/DynamicJoinSelection.scala

Lines changed: 51 additions & 35 deletions

@@ -23,26 +23,37 @@ import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
 import org.apache.spark.sql.catalyst.plans.{LeftAnti, LeftOuter, RightOuter}
 import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, Join, JoinStrategyHint, LogicalPlan, NO_BROADCAST_HASH, PREFER_SHUFFLE_HASH, SHUFFLE_HASH}
 import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.catalyst.trees.TreeNodeTag
-import org.apache.spark.sql.execution.{CoalescedPartitionSpec, SparkPlan}
+import org.apache.spark.sql.execution.CoalescedPartitionSpec
 import org.apache.spark.sql.execution.adaptive.OptimizeSkewedJoin.getSkewThreshold
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.util.Utils
 
 /**
  * This optimization rule includes four join selections:
- * 1. detects a join child that has a high ratio of empty partitions and adds a
+ * 1. Do not add any hint until all the children are materialized and need no additional shuffle.
+ *    1.1 Won't select any join strategy for the following query, as it needs an additional
+ *        shuffle after all ShuffleQueryStageExec are materialized:
+ *        SortMergeJoin
+ *        :- ShuffleQueryStageExec (hashpartitioning(ID#1, 10000))
+ *        +- SortMergeJoin
+ *           :- ShuffleQueryStageExec (hashpartitioning(ID#2, 500))
+ *           +- ShuffleQueryStageExec (hashpartitioning(ID#3, 500))
+ *
+ *    1.2 Won't select any join strategy if the other side contains a bucket table:
+ *        SortMergeJoin
+ *        :- ShuffleQueryStageExec (hashpartitioning(ID#1, 10000))
+ *        +- BucketTableScan
+ *
+ * 2. detects a join child that has a high ratio of empty partitions and adds a
 *    NO_BROADCAST_HASH hint to avoid it being broadcast, as shuffle join is faster in this case:
 *    many tasks complete immediately since one join side is empty.
- * 2. detects a join child that every partition size is less than local map threshold and adds a
+ * 3. detects a join child whose partitions are all smaller than the local map threshold and adds a
 *    PREFER_SHUFFLE_HASH hint to encourage using shuffled hash join instead of sort merge join.
- * 3. if a join satisfies both NO_BROADCAST_HASH and PREFER_SHUFFLE_HASH,
+ * 4. if a join satisfies both NO_BROADCAST_HASH and PREFER_SHUFFLE_HASH,
 *    then add a SHUFFLE_HASH hint.
 */
 object DynamicJoinSelection extends Rule[LogicalPlan] with JoinSelectionHelper {
 
-  val USER_DEFINED_HINT_TAG = TreeNodeTag[Boolean]("USER_DEFINED_HINT")
-
   private def hasManyEmptyPartitions(mapStats: MapOutputStatistics): Boolean = {
     val partitionCnt = mapStats.bytesByPartitionId.length
     val nonZeroCnt = mapStats.bytesByPartitionId.count(_ > 0)

@@ -65,22 +76,20 @@ object DynamicJoinSelection extends Rule[LogicalPlan] with JoinSelectionHelper {
       streamedStats: Seq[MapOutputStatistics]): Boolean = {
     val maxShuffledHashJoinLocalMapThreshold =
       conf.getConf(SQLConf.ADAPTIVE_MAX_SHUFFLE_HASH_JOIN_LOCAL_MAP_THRESHOLD)
+    val advisoryStreamPartitionSize =
+      conf.getConf(SQLConf.ADAPTIVE_SHUFFLE_HASH_JOIN_ADVISORY_STREAM_PARTITION_SIZE)
     // If the join is skewed, since CARMEL will not handle SHJ skew join, and we are not sure SHJ
     // will be faster than SMJ for the left skew join pattern, do not convert to SHJ if any
     // join side is skewed.
-    if (maxShuffledHashJoinLocalMapThreshold <= 0 || streamedStats.exists(isSkew(_))) {
+    if (maxShuffledHashJoinLocalMapThreshold <= 0 || advisoryStreamPartitionSize <= 0 ||
+        streamedStats.exists(isSkew(_))) {
       return false
     }
-    val partitionSpecs = ShufflePartitionsUtil.coalescePartitions(
-      Array(mapStats) ++ streamedStats,
-      advisoryTargetSize = conf.getConf(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES),
-      minNumPartitions = 0)
-    partitionSpecs.nonEmpty &&
-      partitionSpecs.forall(_.isInstanceOf[CoalescedPartitionSpec]) &&
-      partitionSpecs.collect {
-        case CoalescedPartitionSpec(startReducerIndex, endReducerIndex) =>
-          mapStats.bytesByPartitionId.slice(startReducerIndex, endReducerIndex).sum
-      }.forall(_ <= maxShuffledHashJoinLocalMapThreshold)
+
+    mapStats.bytesByPartitionId.forall(_ <= maxShuffledHashJoinLocalMapThreshold) &&
+      streamedStats.filter(_.bytesByPartitionId.length > 0).exists { stats =>
+        Utils.median(stats.bytesByPartitionId, false) > advisoryStreamPartitionSize
+      }
   }
 
   private def selectJoinStrategy(
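
Read together, the rewritten predicate drops the coalesce-based estimate and compares raw map output sizes directly. A standalone restatement of the condition (a sketch with hypothetical helper names; Spark's Utils.median is approximated by a simple middle element, and the skew check is elided):

    // SHJ is preferred when every build-side partition fits under the local map
    // threshold AND some non-empty stream side has a median partition size larger
    // than the advisory stream partition size.
    def preferSHJSketch(
        buildSizes: Array[Long],        // mapStats.bytesByPartitionId
        streamSides: Seq[Array[Long]],  // streamed sides' bytesByPartitionId
        localMapThreshold: Long,
        advisoryStreamSize: Long): Boolean = {
      def median(xs: Array[Long]): Long = xs.sorted.apply(xs.length / 2)
      localMapThreshold > 0 && advisoryStreamSize > 0 &&
        buildSizes.forall(_ <= localMapThreshold) &&
        streamSides.filter(_.nonEmpty).exists(xs => median(xs) > advisoryStreamSize)
    }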
@@ -123,16 +132,29 @@ object DynamicJoinSelection extends Rule[LogicalPlan] with JoinSelectionHelper {
       stage.computeStats().exists(_.rowCount.exists(_.toLong >= conf.broadcastMaxRowNum))
     val adjustDemoteBroadcastHash = rowNumberExceeded || demoteBroadcastHash
 
-    def collectShuffleStats(plan: LogicalPlan): Seq[MapOutputStatistics] = plan match {
+    var bucketedPlan = false
+    def collectShuffleStats(plan: LogicalPlan): Seq[Option[MapOutputStatistics]] = plan match {
       case LogicalQueryStage(_, streamedStage: ShuffleQueryStageExec)
           if streamedStage.isMaterialized && streamedStage.mapStats.isDefined =>
-        Seq(streamedStage.mapStats.get)
-      case _ => plan.children.flatMap(collectShuffleStats)
+        Seq(streamedStage.mapStats)
+      case LogicalQueryStage(_, _: ShuffleQueryStageExec) => Seq(None)
+      case _ if plan.children.nonEmpty => plan.children.flatMap(collectShuffleStats)
+      case _ =>
+        bucketedPlan = true
+        Seq()
     }
-    val preferShuffleHash =
-      preferShuffledHashJoin(stage.mapStats.get, collectShuffleStats(streamedPlan))
 
-    logInfo(s"canBroadcastPlan = $canBroadcastPlan, rowNumberExceeded = " +
+    val streamedStats = collectShuffleStats(streamedPlan)
+    val allStats = Array(stage.mapStats) ++ streamedStats
+
+    val shuffleMaterialized =
+      allStats.forall(_.isDefined) &&
+        allStats.map(_.get.bytesByPartitionId.length).distinct.length == 1
+    val preferShuffleHash = !bucketedPlan && shuffleMaterialized &&
+      preferShuffledHashJoin(stage.mapStats.get, streamedStats.map(_.get))
+
+    logInfo(s"isLeft = $isLeft, shuffleMaterialized = $shuffleMaterialized, " +
+      s"canBroadcastPlan = $canBroadcastPlan, rowNumberExceeded = " +
       s"$rowNumberExceeded, adjustDemoteBroadcastHash = $adjustDemoteBroadcastHash, " +
       s"preferShuffleHash = $preferShuffleHash")
     if (adjustDemoteBroadcastHash && preferShuffleHash) {
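
The shuffleMaterialized guard above encodes "no extra shuffle needed": every collected stage must have map stats (i.e. be materialized), and all stages must report the same reducer partition count. A minimal restatement (a sketch; Array[Long] stands in for MapOutputStatistics.bytesByPartitionId):

    // All shuffle stages materialized and their partition counts agree.
    def shuffleMaterializedSketch(allStats: Seq[Option[Array[Long]]]): Boolean =
      allStats.forall(_.isDefined) &&
        allStats.map(_.get.length).distinct.length == 1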
@@ -150,24 +172,18 @@ object DynamicJoinSelection extends Rule[LogicalPlan] with JoinSelectionHelper {
   }
 
   def apply(plan: LogicalPlan): LogicalPlan = plan.transformDown {
-    case j @ ExtractEquiJoinKeys(_, _, _, _, left, right, hint) =>
-      if (left.getTagValue(USER_DEFINED_HINT_TAG).isEmpty) {
-        left.setTagValue(USER_DEFINED_HINT_TAG, hint.leftHint.exists(_.strategy.isDefined))
-      }
-      if (right.getTagValue(USER_DEFINED_HINT_TAG).isEmpty) {
-        right.setTagValue(USER_DEFINED_HINT_TAG, hint.rightHint.exists(_.strategy.isDefined))
-      }
+    case j @ ExtractEquiJoinKeys(_, _, _, _, _, _, hint) =>
       var newHint = hint
-      if (!left.getTagValue(USER_DEFINED_HINT_TAG).getOrElse(false)) {
+      if (!hint.leftHint.exists(_.strategy.isDefined) ||
+          hint.leftHint.get.strategy.contains(NO_BROADCAST_HASH)) {
         selectJoinStrategy(j, true).foreach { strategy =>
-          logInfo(s"Set left side join strategy: $strategy")
           newHint = newHint.copy(leftHint =
             Some(hint.leftHint.getOrElse(HintInfo()).copy(strategy = Some(strategy))))
         }
       }
-      if (!right.getTagValue(USER_DEFINED_HINT_TAG).getOrElse(false)) {
+      if (!hint.rightHint.exists(_.strategy.isDefined) ||
+          hint.rightHint.get.strategy.contains(NO_BROADCAST_HASH)) {
         selectJoinStrategy(j, false).foreach { strategy =>
-          logInfo(s"Set right side join strategy: $strategy")
           newHint = newHint.copy(rightHint =
             Some(hint.rightHint.getOrElse(HintInfo()).copy(strategy = Some(strategy))))
         }
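
The tree-tag bookkeeping is replaced by a direct check of the join hint itself: a side may receive a dynamically selected strategy only when the user set no strategy at all, or set exactly NO_BROADCAST_HASH (which the rule may still strengthen to SHUFFLE_HASH). As a plain predicate (a sketch, reusing the HintInfo type imported above):

    def canOverrideUserHint(userHint: Option[HintInfo]): Boolean =
      !userHint.exists(_.strategy.isDefined) ||
        userHint.get.strategy.contains(NO_BROADCAST_HASH)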

sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala

Lines changed: 47 additions & 5 deletions

@@ -2496,22 +2496,64 @@ class AdaptiveQueryExecSuite
         (1 to 30).map(i => TestData(i, i.toString)), 5)
         .toDF("c1", "c2").createOrReplaceTempView("t2")
 
-      // left partition size: [926, 729, 731] after coalesce : [926, 1460]
-      // right partition size: [416, 258, 252] after coalesce : [416, 510]
+      // left partition size: [926, 729, 731]
+      // right partition size: [416, 258, 252]
       withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "3",
         SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "450",
-        SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "2000",
+        SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "300",
+        SQLConf.ADAPTIVE_SHUFFLE_HASH_JOIN_ADVISORY_STREAM_PARTITION_SIZE.key -> "700",
         SQLConf.PREFER_SORTMERGEJOIN.key -> "true") {
         // check default value ADAPTIVE_MAX_SHUFFLE_HASH_JOIN_LOCAL_MAP_THRESHOLD = 0
         checkJoinStrategy(false)
-        withSQLConf(SQLConf.ADAPTIVE_MAX_SHUFFLE_HASH_JOIN_LOCAL_MAP_THRESHOLD.key -> "500") {
+        withSQLConf(SQLConf.ADAPTIVE_MAX_SHUFFLE_HASH_JOIN_LOCAL_MAP_THRESHOLD.key -> "300") {
           checkJoinStrategy(false)
         }
-        withSQLConf(SQLConf.ADAPTIVE_MAX_SHUFFLE_HASH_JOIN_LOCAL_MAP_THRESHOLD.key -> "800") {
+        withSQLConf(SQLConf.ADAPTIVE_MAX_SHUFFLE_HASH_JOIN_LOCAL_MAP_THRESHOLD.key -> "500") {
           checkJoinStrategy(true)
         }
       }
     }
   }
+
+  test("CARMEL-6174: Won't use SHJ for Bucket") {
+    withTempView("t1", "t2") {
+      def checkJoinStrategy(shouldShuffleHashJoin: Boolean): Unit = {
+        val (origin1, adaptive1) = runAdaptiveAndVerifyResult(
+          "SELECT t1.c1, t2.c1 FROM t1 JOIN t2 ON t1.c1 = t2.c1")
+        assert(findTopLevelSortMergeJoin(origin1).size === 1)
+        if (shouldShuffleHashJoin) {
+          val shj = findTopLevelShuffledHashJoin(adaptive1)
+          assert(shj.size === 1)
+          assert(shj.head.buildSide == BuildRight)
+        } else {
+          assert(findTopLevelSortMergeJoin(adaptive1).size === 1)
+        }
+
+        withSQLConf(SQLConf.ADAPTIVE_MAX_SHUFFLE_HASH_JOIN_LOCAL_MAP_THRESHOLD.key -> "0") {
+          // respect user specified join hint
+          val (origin2, adaptive2) = runAdaptiveAndVerifyResult(
+            "SELECT /*+ MERGE(t1) */ t1.c1, t2.c1 FROM t1 JOIN t2 ON t1.c1 = t2.c1")
+          assert(findTopLevelSortMergeJoin(origin2).size === 1)
+          assert(findTopLevelSortMergeJoin(adaptive2).size === 1)
+        }
+      }
+
+      withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true") {
+        spark.range(1, 100, 10).map(i => (i, i.toString)).toDF("c1", "c2")
+          .write.format("parquet").saveAsTable("t1")
+        spark.range(1, 30, 5).map(i => (i, i.toString)).toDF("c1", "c2")
+          .write.format("parquet").bucketBy(2, "c1").sortBy("c1").saveAsTable("t2")
+
+        withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> "3",
+          SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "0",
+          SQLConf.ADAPTIVE_MAX_SHUFFLE_HASH_JOIN_LOCAL_MAP_THRESHOLD.key -> "1024000",
+          SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "200",
+          SQLConf.ADAPTIVE_SHUFFLE_HASH_JOIN_ADVISORY_STREAM_PARTITION_SIZE.key -> "500",
+          SQLConf.PREFER_SORTMERGEJOIN.key -> "true") {
+          checkJoinStrategy(false)
+        }
+      }
+    }
+  }
 }
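
Plugging the first test's partition sizes into the new predicate shows why the thresholds flip the strategy. A worked sketch (values copied from the test comments above; median here is the middle element of three, standing in for Utils.median):

    // build (right) side: [416, 258, 252]; stream (left) side: [926, 729, 731]
    val build = Array(416L, 258L, 252L)
    val stream = Array(926L, 729L, 731L)
    def median(xs: Array[Long]): Long = xs.sorted.apply(xs.length / 2)  // 731 for stream
    // threshold 300: 416 > 300, so SHJ is not preferred          -> SMJ kept
    // threshold 500: all of [416, 258, 252] <= 500 and 731 > 700 -> SHJ with BuildRight
    def preferSHJ(threshold: Long): Boolean =
      build.forall(_ <= threshold) && median(stream) > 700L
    assert(!preferSHJ(300L) && preferSHJ(500L))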
