Commit f7f8ec1

colinmjj authored and GitHub Enterprise committed
[HADP-55702] Fix incorrect output binding in BroadcastRangeJoinExec (apache#652)
1 parent ad98038 commit f7f8ec1

3 files changed: +180 -4 lines changed

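What the fix does: BroadcastRangeJoinExec normalizes the streamed-side range keys and plan output against streamedPlan.output, i.e. the logical column order. When the streamed side is a partitioned table (here, a partitioned materialized view), the file scan emits the partition columns after the data columns, so the normalized keys end up bound to the wrong attribute positions. The commit detects the partition columns of the (single) underlying relation and rebuilds the streamed output with those columns moved to the end before normalizing.

A minimal plain-Scala sketch of that reordering rule (hypothetical names, not the Spark API): non-partition columns keep their relative order, and partition columns move to the end in partition-spec order.

object StreamOutputOrderSketch {
  // data columns first (original order), then partition columns (spec order)
  def reorder(output: Seq[String], partitionColumns: Seq[String]): Seq[String] =
    output.filterNot(partitionColumns.contains) ++ partitionColumns.filter(output.contains)

  def main(args: Array[String]): Unit = {
    // hypothetical output (d1, d2, col1) partitioned by (d1): d1 moves last
    println(reorder(Seq("d1", "d2", "col1"), Seq("d1")))         // List(d2, col1, d1)
    // partitioned by (d1, col1): (d2, d1, col1), the order testWithRangerJoin2 asserts
    println(reorder(Seq("d1", "d2", "col1"), Seq("d1", "col1"))) // List(d2, d1, col1)
  }
}
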

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala

Lines changed: 38 additions & 3 deletions

@@ -35,7 +35,7 @@ import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors
 import org.apache.spark.sql.execution.aggregate.AggUtils
 import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableScanExec}
 import org.apache.spark.sql.execution.command._
-import org.apache.spark.sql.execution.datasources.{WriteFiles, WriteFilesExec}
+import org.apache.spark.sql.execution.datasources.{LogicalRelation, WriteFiles, WriteFilesExec}
 import org.apache.spark.sql.execution.exchange.{REBALANCE_PARTITIONS_BY_COL, REBALANCE_PARTITIONS_BY_NONE, REPARTITION_BY_COL, REPARTITION_BY_NUM, ShuffleExchangeExec}
 import org.apache.spark.sql.execution.joins.{BroadcastRangeJoinExec, RangeInfo}
 import org.apache.spark.sql.execution.python._

@@ -414,6 +414,22 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
         }
       }

+      // only process the plan with only one LogicalRelation
+      def getPartitionColumns(plan: LogicalPlan): Seq[String] = {
+        var partitionColumns = Seq.empty[String]
+        var findRelation = false
+        plan foreach {
+          case LogicalRelation(_, _, catalogTable, _) if catalogTable.isDefined =>
+            if (findRelation) {
+              return Seq.empty
+            }
+            findRelation = true
+            partitionColumns = catalogTable.get.partitionColumnNames
+          case _ =>
+        }
+        partitionColumns
+      }
+
       def createBroadcastRangeJoinExec(leftRangeKeys: Seq[Expression],
           rightRangeKeys: Seq[Expression],
           equality: Seq[Boolean],

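The helper above gives up unless the plan contains exactly one catalog-backed LogicalRelation, since with several relations there is no single partition spec to order the output by. A plain-Scala sketch of that guard, using a hypothetical stand-in for the relation node:

object SingleRelationGuardSketch {
  // stand-in for a catalog-backed relation carrying its partition columns
  final case class Relation(partitionColumnNames: Seq[String])

  def getPartitionColumns(relations: Seq[Relation]): Seq[String] = relations match {
    case Seq(only) => only.partitionColumnNames // exactly one relation: trust its spec
    case _ => Seq.empty                         // zero or several: no usable spec
  }

  def main(args: Array[String]): Unit = {
    println(getPartitionColumns(Seq(Relation(Seq("d1")))))                // List(d1)
    println(getPartitionColumns(Seq(Relation(Seq("d1")), Relation(Nil)))) // List()
  }
}
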
@@ -435,10 +451,29 @@
         (buildKeys.flatMap(e => QueryPlan.normalizePredicates(e :: Nil, buildPlan.output)),
           buildPlan.output.map(QueryPlan.normalizeExpressions(_, buildPlan.output)))

+        // for partition table, the partition columns should be put at the end of the output
+        val streamPartitionColumns = getPartitionColumns(streamedPlan)
+        val streamOutput = if (streamPartitionColumns.isEmpty) {
+          streamedPlan.output
+        } else {
+          var orderedPartitionOutput = Seq.empty[Attribute]
+          // order partition columns
+          streamPartitionColumns.foreach(col => {
+            streamedPlan.output.foreach(att => {
+              if (att.name == col) {
+                orderedPartitionOutput = orderedPartitionOutput :+ att
+              }
+            })
+          })
+          // merge all output with order
+          streamedPlan.output.filter(
+            att => !streamPartitionColumns.contains(att.name)) ++ orderedPartitionOutput
+        }
+
         val (normalizedStreamedKeys, normalizedStreamedPlanOutput) =
           (streamedKeys.flatMap(e =>
-            QueryPlan.normalizePredicates(e :: Nil, streamedPlan.output)),
-            streamedPlan.output.map(QueryPlan.normalizeExpressions(_, streamedPlan.output)))
+            QueryPlan.normalizePredicates(e :: Nil, streamOutput)),
+            streamOutput.map(QueryPlan.normalizeExpressions(_, streamOutput)))

         val allOutput = left.output ++ right.output

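Why the sequence passed to normalization matters: attributes are rebound by their position in the supplied output, so normalizing the streamed keys against the logical column order while the scan actually emits partition columns last leaves the range key pointing at the wrong slot. A toy sketch (hypothetical bind helper, not Spark's QueryPlan API):

object PositionalBindingSketch {
  // bind an attribute name to its ordinal in the given output
  def bind(key: String, output: Seq[String]): Int = output.indexOf(key)

  def main(args: Array[String]): Unit = {
    val logicalOrder = Seq("d1", "d2", "col1") // order used before this fix
    val scanOrder = Seq("d2", "col1", "d1")    // partitioned scan: d1 emitted last
    println(bind("d1", logicalOrder)) // 0, a stale slot at execution time
    println(bind("d1", scanOrder))    // 2, matching what the scan emits
  }
}
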
sql/core/src/test/scala/org/apache/spark/sql/execution/mv/MaterializedViewOptimizerBaseSuite.scala

Lines changed: 65 additions & 1 deletion

@@ -21,6 +21,7 @@ import java.io.File

 import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
 import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.util.Utils

@@ -179,6 +180,26 @@ class MaterializedViewOptimizerBaseSuite extends SparkFunSuite with SharedSparkS
         |) using parquet
         |""".stripMargin)

+    sql(
+      """
+        |create table db2.range_t1 (
+        |col1 int,
+        |col2 string,
+        |d1 date,
+        |d2 date
+        |) using parquet
+        |""".stripMargin)
+
+    sql(
+      """
+        |create table db2.range_t2 (
+        |col1 int,
+        |col2 string,
+        |d1 date,
+        |d2 date
+        |) using parquet
+        |""".stripMargin)
+
     mvDbPath = Utils.createTempDir(dir.getAbsolutePath, "mv_db")
     sql(s"create database mv_db location '${mvDbPath.getAbsolutePath}'")
   }

@@ -194,6 +215,8 @@ class MaterializedViewOptimizerBaseSuite extends SparkFunSuite with SharedSparkS
     sql("drop table db2.company")
     sql("drop table db2.dependents")
     sql("drop table db2.locations")
+    sql("drop table db2.range_t1")
+    sql("drop table db2.range_t2")
     sql("drop database db1")
     sql("drop database db2")
     sql("drop database mv_db")

@@ -254,12 +277,19 @@ class MaterializedViewOptimizerBaseSuite extends SparkFunSuite with SharedSparkS
       mvName: String,
       mvQuery: String,
       query: String,
-      expectedResult: String): Unit = {
+      expectedResult: String,
+      partitionColumns: Seq[String] = Seq.empty): Unit = {
     val mvTablePath = new File(mvDbPath, mvName)
+    val partitionDesc = if (!partitionColumns.isEmpty) {
+      "partitioned by (" + partitionColumns.mkString(", ") + ")"
+    } else {
+      ""
+    }
     try {
       sql(
         s"""
           |create materialized view mv_db.$mvName using parquet
+          |$partitionDesc
           |as $mvQuery
           |""".stripMargin)

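For reference, the partitionDesc fragment interpolated into the generated CREATE statement expands as follows; this is a runnable, standalone copy of the same expression (plain Scala):

object PartitionDescSketch {
  def partitionDesc(partitionColumns: Seq[String]): String =
    if (!partitionColumns.isEmpty) {
      "partitioned by (" + partitionColumns.mkString(", ") + ")"
    } else {
      ""
    }

  def main(args: Array[String]): Unit = {
    println(partitionDesc(Seq("d1")))         // partitioned by (d1)
    println(partitionDesc(Seq("d1", "col1"))) // partitioned by (d1, col1)
    println(partitionDesc(Seq.empty))         // empty string: unpartitioned view
  }
}
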
@@ -281,6 +311,40 @@ class MaterializedViewOptimizerBaseSuite extends SparkFunSuite with SharedSparkS
     }
   }

+  protected def checkSparkPlanWithMaterializedView(
+      mvName: String,
+      mvQuery: String,
+      query: String,
+      partitionColumns: Seq[String] = Seq.empty)(f: SparkPlan => Unit): Unit = {
+    val mvTablePath = new File(mvDbPath, mvName)
+    val partitionDesc = if (!partitionColumns.isEmpty) {
+      "partitioned by (" + partitionColumns.mkString(", ") + ")"
+    } else {
+      ""
+    }
+    try {
+      sql(
+        s"""
+          |create materialized view mv_db.$mvName using parquet
+          |$partitionDesc
+          |as $mvQuery
+          |""".stripMargin)
+
+      MaterializedViewManager.cacheMaterializedView()
+
+      val dfResult = sql(query)
+      assert(dfResult.queryExecution.materialized.
+        getOptimizeTags().contains("MATERIALIZED_VIEW_OPTIMIZED"))
+      f(dfResult.queryExecution.sparkPlan)
+    } catch {
+      case ex: Exception =>
+        fail(ex)
+    } finally {
+      dropMaterializedView(mvName, mvTablePath)
+      MaterializedViewManager.clearCache()
+    }
+  }
+
   protected def withUnSatisfiedMaterializedView(
       mvName: String,
       mvQuery: String,

sql/core/src/test/scala/org/apache/spark/sql/execution/mv/MaterializedViewOptimizerSuite.scala

Lines changed: 77 additions & 0 deletions

@@ -18,6 +18,10 @@
 package org.apache.spark.sql.execution.mv

 import org.apache.spark.SparkContext
+import org.apache.spark.sql.catalyst.expressions.AttributeReference
+import org.apache.spark.sql.execution.joins.BroadcastRangeJoinExec
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.IntegerType

 class MaterializedViewOptimizerSuite extends MaterializedViewOptimizerBaseSuite {

@@ -3256,4 +3260,77 @@
       mvOptimizedCount("testmv") == 2)
     }
   }
+
+  test("testWithRangerJoin1") {
+    withSQLConf(SQLConf.RANGE_JOIN_ENABLED.key -> "true") {
+      checkSparkPlanWithMaterializedView("testmv",
+        mvQuery =
+          """
+            |select * from db2.range_t1
+            |""".stripMargin,
+        partitionColumns = Seq("d1"),
+        query =
+          """
+            |select t1.d1, t1.d2, t1.col1, t2.col2
+            |from
+            |  db2.range_t1 t1
+            |join
+            |  db2.range_t2 t2
+            |  on t1.d1 < t2.d1
+            |  and t1.d1 >= t2.d1 - interval '30' day
+            |""".stripMargin
+      ) {
+        p => {
+          assert(p.exists(_.isInstanceOf[BroadcastRangeJoinExec]))
+          p.foreach {
+            case b: BroadcastRangeJoinExec =>
+              val rangeInfo = b.rangeInfo
+              val keys = rangeInfo.normalizedStreamedKeys
+              val output = rangeInfo.normalizedStreamedPlanOutput
+              assert(output.last.exprId == keys.head.asInstanceOf[AttributeReference].exprId)
+            case _ =>
+          }
+        }
+      }
+    }
+  }
+
+  test("testWithRangerJoin2") {
+    withSQLConf(SQLConf.RANGE_JOIN_ENABLED.key -> "true") {
+      checkSparkPlanWithMaterializedView("testmv",
+        mvQuery =
+          """
+            |select * from db2.range_t1
+            |""".stripMargin,
+        partitionColumns = Seq("d1", "col1"),
+        query =
+          """
+            |select t1.d1, t1.d2, t1.col1, t2.col2
+            |from
+            |  db2.range_t1 t1
+            |join
+            |  db2.range_t2 t2
+            |  on t1.d1 < t2.d1
+            |  and t1.d1 >= t2.d1 - interval '30' day
+            |""".stripMargin
+      ) {
+        p => {
+          assert(p.exists(_.isInstanceOf[BroadcastRangeJoinExec]))
+          p.foreach {
+            case b: BroadcastRangeJoinExec =>
+              val rangeInfo = b.rangeInfo
+              val keys = rangeInfo.normalizedStreamedKeys
+              val output = rangeInfo.normalizedStreamedPlanOutput
+              // should be d2, d1, col1
+              assert(output.size == 3)
+              // should be d1
+              assert(output(1).exprId == keys.head.asInstanceOf[AttributeReference].exprId)
+              // should be col1
+              assert(output.last.dataType == IntegerType)
+            case _ =>
+          }
+        }
+      }
+    }
+  }
 }
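
Reading the new tests against the SparkStrategies change: testWithRangerJoin1 partitions the view by (d1) and asserts only that the rebound streamed output ends with the range key d1 (by exprId). testWithRangerJoin2 partitions by (d1, col1); under the reordering rule the streamed output becomes (d2, d1, col1), so the assertions pin the size to 3, the range key d1 to index 1, and an IntegerType col1 in last position.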
