Commit dface2a

Preserve shuffled hash join build side partitioning
1 parent db47c6e commit dface2a

3 files changed, +31 -1 lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala

Lines changed: 1 addition & 1 deletion
@@ -206,7 +206,7 @@ trait HashJoin extends BaseJoinExec {
         existenceJoin(streamedIter, hashed)
       case x =>
         throw new IllegalArgumentException(
-          s"BroadcastHashJoin should not take $x as the JoinType")
+          s"HashJoin should not take $x as the JoinType")
     }
 
     val resultProj = createResultProjection
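A brief note on this rename: the HashJoin trait is shared by both the broadcast and shuffled hash join operators, so the error text should no longer single out BroadcastHashJoin when an unsupported JoinType reaches this branch.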

sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala

Lines changed: 12 additions & 0 deletions
@@ -47,6 +47,18 @@ case class ShuffledHashJoinExec(
     "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"),
     "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map"))
 
+  override def outputPartitioning: Partitioning = joinType match {
+    case _: InnerLike =>
+      PartitioningCollection(Seq(left.outputPartitioning, right.outputPartitioning))
+    // For left and right outer joins, the output is partitioned by the streamed input's join keys.
+    case LeftOuter => left.outputPartitioning
+    case RightOuter => right.outputPartitioning
+    case LeftExistence(_) => left.outputPartitioning
+    case x =>
+      throw new IllegalArgumentException(
+        s"${getClass.getSimpleName} should not take $x as the JoinType")
+  }
+
   override def requiredChildDistribution: Seq[Distribution] =
     HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil
 
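With this override, a downstream operator that needs the data clustered on the join keys (such as the aggregation in the test below) can reuse the hash partitioning already produced for the join's children instead of triggering another ShuffleExchangeExec. A minimal spark-shell style sketch of the effect, mirroring the new test's configuration; the config values, row counts, and column names here are illustrative, not part of the commit:

// Illustrative spark-shell sketch (not part of this commit): these thresholds
// make the planner pick ShuffledHashJoinExec rather than broadcast or sort merge join.
import spark.implicits._

spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "50")
spark.conf.set("spark.sql.shuffle.partitions", "2")
spark.conf.set("spark.sql.join.preferSortMergeJoin", "false")

val df1 = spark.range(10).select($"id".as("k1"))
val df2 = spark.range(30).select($"id".as("k2"))

// With outputPartitioning preserved, the plan shows only the two exchanges that
// feed the join and no additional ShuffleExchangeExec in front of the aggregate.
df1.join(df2, $"k1" === $"k2").groupBy($"k1").count().explain()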

sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala

Lines changed: 18 additions & 0 deletions
@@ -31,6 +31,7 @@ import org.apache.spark.sql.catalyst.expressions.{Ascending, GenericRow, SortOrd
 import org.apache.spark.sql.catalyst.plans.logical.Filter
 import org.apache.spark.sql.execution.{BinaryExecNode, FilterExec, SortExec, SparkPlan}
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
+import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec
 import org.apache.spark.sql.execution.joins._
 import org.apache.spark.sql.execution.python.BatchEvalPythonExec
 import org.apache.spark.sql.internal.SQLConf
@@ -1086,4 +1087,21 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan
       assert(df2.join(df1, "id").collect().isEmpty)
     }
   }
+
+  test("SPARK-32330: Preserve shuffled hash join build side partitioning") {
+    withSQLConf(
+      SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "50",
+      SQLConf.SHUFFLE_PARTITIONS.key -> "2",
+      SQLConf.PREFER_SORTMERGEJOIN.key -> "false") {
+      val df1 = spark.range(10).select($"id".as("k1"))
+      val df2 = spark.range(30).select($"id".as("k2"))
+      Seq("inner", "cross").foreach(joinType => {
+        val plan = df1.join(df2, $"k1" === $"k2", joinType).groupBy($"k1").count()
+          .queryExecution.executedPlan
+        assert(plan.collect { case _: ShuffledHashJoinExec => true }.size === 1)
+        // No extra shuffle before aggregate
+        assert(plan.collect { case _: ShuffleExchangeExec => true }.size === 2)
+      })
+    }
+  }
 }
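For reference, the two ShuffleExchangeExec nodes the test expects are the exchanges that hash-partition each join input to satisfy HashClusteredDistribution; because the join now reports its children's partitioning, the groupBy on k1 no longer introduces a third.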
