
Commit aaa94c4

mihailotim-db authored and cloud-fan committed
[SPARK-51544][SQL] Add only unique and necessary metadata columns
### What changes were proposed in this pull request?

`AddMetadataColumns` should add only unique and necessary metadata columns, not the entire child's metadata output.

### Why are the changes needed?

There are 3 reasons to make this change:
1. Adding duplicates of metadata columns creates problems for the single-pass analyzer, where we need to hack our way around adding these columns, because both `AddMetadataColumns` and `ResolveReferences` can add the same attributes.
2. Adding only unique and necessary metadata columns is more semantically correct.
3. This PR is also a preparation to fix [SPARK-51545](https://issues.apache.org/jira/browse/SPARK-51545).

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Added a new suite to test the `AddMetadataColumns` rule; also covered by existing tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #50304 from mihailotim-db/mihailotim-db/unique_metadata_cols.

Authored-by: Mihailo Timotic <mihailo.timotic@databricks.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
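To make the affected query shape concrete, here is a minimal sketch (a spark-shell session is assumed; the `t1`/`t2` tables are illustrative, not part of this patch). A USING join keeps the original join keys around as hidden metadata columns, and selecting a qualified key pulls metadata columns into the inner Project:

```scala
// Illustrative sketch: qualified access to a USING-join key goes through
// metadata columns, which AddMetadataColumns surfaces in the inner Project.
spark.sql("CREATE TABLE t1(k INT, v1 INT)")
spark.sql("CREATE TABLE t2(k INT, v2 INT)")
val analyzed = spark.sql(
  "SELECT t1.k FROM t1 FULL OUTER JOIN t2 USING (k)").queryExecution.analyzed
println(analyzed)
// Before this patch the inner Project carried both original keys
// (exprIds below are illustrative):
//   +- Project [coalesce(k#0, k#2) AS k#4, v1#1, v2#3, k#0, k#2]
// After, it carries only the key that is actually referenced:
//   +- Project [coalesce(k#0, k#2) AS k#4, v1#1, v2#3, k#0]
```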
1 parent f487515 commit aaa94c4

File tree

5 files changed: +225 −31 lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala

Lines changed: 40 additions & 21 deletions
@@ -985,25 +985,30 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor
   object AddMetadataColumns extends Rule[LogicalPlan] {
     import org.apache.spark.sql.catalyst.util._
 
-    def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsDownWithPruning(
-      AlwaysProcess.fn, ruleId) {
-      case hint: UnresolvedHint => hint
-      // Add metadata output to all node types
-      case node if node.children.nonEmpty && node.resolved && hasMetadataCol(node) =>
-        val inputAttrs = AttributeSet(node.children.flatMap(_.output))
-        val metaCols = getMetadataAttributes(node).filterNot(inputAttrs.contains)
-        if (metaCols.isEmpty) {
-          node
-        } else {
-          val newNode = node.mapChildren(addMetadataCol(_, metaCols.map(_.exprId).toSet))
-          // We should not change the output schema of the plan. We should project away the extra
-          // metadata columns if necessary.
-          if (newNode.sameOutput(node)) {
-            newNode
+    def apply(plan: LogicalPlan): LogicalPlan = {
+      val onlyUniqueAndNecessaryMetadataColumns =
+        conf.getConf(SQLConf.ONLY_NECESSARY_AND_UNIQUE_METADATA_COLUMNS)
+      plan.resolveOperatorsDownWithPruning(AlwaysProcess.fn, ruleId) {
+        case hint: UnresolvedHint => hint
+        // Add metadata output to all node types
+        case node if node.children.nonEmpty && node.resolved && hasMetadataCol(node) =>
+          val inputAttrs = AttributeSet(node.children.flatMap(_.output))
+          val metaCols = getMetadataAttributes(node).filterNot(inputAttrs.contains)
+          if (metaCols.isEmpty) {
+            node
           } else {
-            Project(node.output, newNode)
+            val newNode = node.mapChildren(
+              addMetadataCol(_, metaCols.map(_.exprId).toSet, onlyUniqueAndNecessaryMetadataColumns)
+            )
+            // We should not change the output schema of the plan. We should project away the extra
+            // metadata columns if necessary.
+            if (newNode.sameOutput(node)) {
+              newNode
+            } else {
+              Project(node.output, newNode)
+            }
           }
-        }
+      }
     }
 
     private def getMetadataAttributes(plan: LogicalPlan): Seq[Attribute] = {
@@ -1031,18 +1036,32 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor
 
     private def addMetadataCol(
         plan: LogicalPlan,
-        requiredAttrIds: Set[ExprId]): LogicalPlan = plan match {
+        requiredAttrIds: Set[ExprId],
+        onlyUniqueAndNecessaryMetadataColumns: Boolean = true): LogicalPlan = plan match {
       case s: ExposesMetadataColumns if s.metadataOutput.exists( a =>
         requiredAttrIds.contains(a.exprId)) =>
         s.withMetadataColumns()
       case p: Project if p.metadataOutput.exists(a => requiredAttrIds.contains(a.exprId)) =>
+        val uniqueMetadataColumns = if (onlyUniqueAndNecessaryMetadataColumns) {
+          val actualRequiredExprIds = new util.HashSet[ExprId](requiredAttrIds.asJava)
+          p.projectList.foreach(ne => actualRequiredExprIds.remove(ne.exprId))
+          p.metadataOutput.filter(attr => actualRequiredExprIds.contains(attr.exprId))
+        } else {
+          p.metadataOutput
+        }
+
        val newProj = p.copy(
          // Do not leak the qualified-access-only restriction to normal plan outputs.
-          projectList = p.projectList ++ p.metadataOutput.map(_.markAsAllowAnyAccess()),
-          child = addMetadataCol(p.child, requiredAttrIds))
+          projectList = p.projectList ++ uniqueMetadataColumns.map(_.markAsAllowAnyAccess()),
+          child = addMetadataCol(p.child, requiredAttrIds, onlyUniqueAndNecessaryMetadataColumns)
+        )
        newProj.copyTagsFrom(p)
        newProj
-      case _ => plan.withNewChildren(plan.children.map(addMetadataCol(_, requiredAttrIds)))
+      case _ =>
+        plan.withNewChildren(
+          plan.children
+            .map(addMetadataCol(_, requiredAttrIds, onlyUniqueAndNecessaryMetadataColumns))
+        )
     }
   }
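The core of the new behavior is a set difference over expression IDs: drop every required ID that the Project already produces, then append only the metadata attributes that are still missing. Below is a minimal standalone sketch; `Attr` and `ExprId` here are simplified stand-ins, not the real Catalyst classes (the actual rule uses a `java.util.HashSet` to the same effect):

```scala
// Simplified sketch of the dedup step in addMetadataCol.
case class ExprId(id: Long)
case class Attr(name: String, exprId: ExprId)

def uniqueMetadataColumns(
    requiredAttrIds: Set[ExprId],
    projectList: Seq[Attr],
    metadataOutput: Seq[Attr]): Seq[Attr] = {
  // IDs still required after accounting for what the Project already emits.
  val stillRequired = requiredAttrIds -- projectList.map(_.exprId)
  // Keep only metadata attributes that are required and not yet present.
  metadataOutput.filter(attr => stillRequired.contains(attr.exprId))
}

// The project already emits #0, so only the metadata column #2 is appended.
val result = uniqueMetadataColumns(
  requiredAttrIds = Set(ExprId(0), ExprId(2)),
  projectList = Seq(Attr("k", ExprId(0))),
  metadataOutput = Seq(Attr("k", ExprId(0)), Attr("k", ExprId(2))))
assert(result == Seq(Attr("k", ExprId(2))))
```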

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 9 additions & 0 deletions
@@ -241,6 +241,15 @@ object SQLConf {
     }
   }
 
+  val ONLY_NECESSARY_AND_UNIQUE_METADATA_COLUMNS =
+    buildConf("spark.sql.analyzer.uniqueNecessaryMetadataColumns")
+      .internal()
+      .doc(
+        "When this conf is enabled, AddMetadataColumns rule should only add necessary metadata " +
+        "columns and only if those columns are not already present in the project list.")
+      .booleanConf
+      .createWithDefault(true)
+
   val ANALYZER_MAX_ITERATIONS = buildConf("spark.sql.analyzer.maxIterations")
     .internal()
     .doc("The max number of iterations the analyzer runs.")

sql/core/src/test/resources/sql-tests/analyzer-results/pipe-operators.sql.out

Lines changed: 4 additions & 4 deletions
@@ -1025,10 +1025,10 @@ Project [a#x, a#x, z2#x]
    +- PipeOperator
       +- Filter (z2#x = 0)
          +- PipeOperator
-            +- Project [a#x, z2#x, a#x, a#x]
-               +- Project [a#x, z1#x, (a#x - a#x) AS z2#x, a#x, a#x]
-                  +- Project [a#x, (a#x + a#x) AS z1#x, a#x, a#x, a#x]
-                     +- Project [a#x, a#x, a#x, a#x, a#x]
+            +- Project [a#x, z2#x, a#x]
+               +- Project [a#x, z1#x, (a#x - a#x) AS z2#x, a#x]
+                  +- Project [a#x, (a#x + a#x) AS z1#x, a#x]
+                     +- Project [a#x, a#x]
                        +- Join Inner, (a#x = a#x)
                           :- SubqueryAlias lhs
                           :  +- LocalRelation [a#x]

sql/core/src/test/resources/sql-tests/analyzer-results/using-join.sql.out

Lines changed: 6 additions & 6 deletions
@@ -568,7 +568,7 @@ SELECT k FROM (SELECT nt2.k FROM nt1 full outer join nt2 using (k))
 Project [k#x]
 +- SubqueryAlias __auto_generated_subquery_name
    +- Project [k#x]
-      +- Project [coalesce(k#x, k#x) AS k#x, v1#x, v2#x, k#x, k#x]
+      +- Project [coalesce(k#x, k#x) AS k#x, v1#x, v2#x, k#x]
          +- Join FullOuter, (k#x = k#x)
             :- SubqueryAlias nt1
             :  +- View (`nt1`, [k#x, v1#x])
@@ -589,7 +589,7 @@ SELECT nt2.k AS key FROM nt1 full outer join nt2 using (k) ORDER BY key
 -- !query analysis
 Sort [key#x ASC NULLS FIRST], true
 +- Project [k#x AS key#x]
-   +- Project [coalesce(k#x, k#x) AS k#x, v1#x, v2#x, k#x, k#x]
+   +- Project [coalesce(k#x, k#x) AS k#x, v1#x, v2#x, k#x]
       +- Join FullOuter, (k#x = k#x)
          :- SubqueryAlias nt1
          :  +- View (`nt1`, [k#x, v1#x])
@@ -609,7 +609,7 @@ Sort [key#x ASC NULLS FIRST], true
 SELECT k, nt1.k FROM nt1 full outer join nt2 using (k)
 -- !query analysis
 Project [k#x, k#x]
-+- Project [coalesce(k#x, k#x) AS k#x, v1#x, v2#x, k#x, k#x]
++- Project [coalesce(k#x, k#x) AS k#x, v1#x, v2#x, k#x]
    +- Join FullOuter, (k#x = k#x)
       :- SubqueryAlias nt1
       :  +- View (`nt1`, [k#x, v1#x])
@@ -629,7 +629,7 @@ Project [k#x, k#x]
 SELECT k, nt2.k FROM nt1 full outer join nt2 using (k)
 -- !query analysis
 Project [k#x, k#x]
-+- Project [coalesce(k#x, k#x) AS k#x, v1#x, v2#x, k#x, k#x]
++- Project [coalesce(k#x, k#x) AS k#x, v1#x, v2#x, k#x]
    +- Join FullOuter, (k#x = k#x)
       :- SubqueryAlias nt1
       :  +- View (`nt1`, [k#x, v1#x])
@@ -828,9 +828,9 @@ WithCTE
 :     +- SubqueryAlias t
 :        +- LocalRelation [key#x]
 +- Project [key#x]
-   +- Project [key#x, key#x, key#x]
+   +- Project [key#x, key#x]
       +- Filter NOT key#x LIKE bb.%
-         +- Project [coalesce(key#x, key#x) AS key#x, key#x, key#x, key#x]
+         +- Project [coalesce(key#x, key#x) AS key#x, key#x]
             +- Join FullOuter, (key#x = key#x)
               :- SubqueryAlias t1
               :  +- CTERelationRef xxxx, true, [key#x], false, false, 1
sql/core/src/test/scala/org/apache/spark/sql/catalyst/analysis/AddMetadataColumnsSuite.scala

Lines changed: 166 additions & 0 deletions
@@ -0,0 +1,166 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.expressions.NamedExpression
import org.apache.spark.sql.catalyst.plans.logical.{Filter, Project}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSparkSession

class AddMetadataColumnsSuite extends QueryTest with SharedSparkSession {

  test("Add only necessary metadata columns") {
    // For a query like:
    //
    // {{{
    //   SELECT t1.k
    //   FROM VALUES(1,2) AS t1(k,v1) FULL OUTER JOIN VALUES(1,2) AS t2(k,v2) USING (k)
    // }}}
    //
    // the analyzed plan would look like:
    //   Project [k#0]
    //   +- Project [coalesce(k#0, k#2) AS k#4, v1#1, v2#3, k#0, k#2]
    //      +- Join FullOuter, (k#0 = k#2)
    //         :- SubqueryAlias t1
    //         :  +- LocalRelation [k#0, v1#1]
    //         +- SubqueryAlias t2
    //            +- LocalRelation [k#2, v2#3]
    //
    // The inner Project in this case contains a reference to k#2, which is not needed in the
    // top-most Project. With `spark.sql.analyzer.uniqueNecessaryMetadataColumns` set to false,
    // we will add k#2 to the project list because it is a metadata column. Otherwise, we don't
    // need it and can avoid adding it in the AddMetadataColumns rule.
    withTable("t1", "t2") {
      sql("CREATE TABLE t1(k INT, v1 INT)")
      sql("CREATE TABLE t2(k INT, v2 INT)")
      val left = sql("select * from t1")
      val right = sql("select * from t2")
      val join = left.join(right, Seq("k"), "full_outer")

      val rightKeyExprId = right
        .select(right("k"))
        .queryExecution
        .analyzed
        .asInstanceOf[Project]
        .projectList
        .head
        .exprId

      withSQLConf(SQLConf.ONLY_NECESSARY_AND_UNIQUE_METADATA_COLUMNS.key -> "true") {
        // Inner project list shouldn't contain a reference to the right key.
        val analyzed = join.select(left("k")).queryExecution.analyzed
        analyzed match {
          case Project(_, Project(innerProjectList: Seq[NamedExpression], _)) =>
            assert(Seq("k", "v1", "v2", "k") == innerProjectList.map(_.name))
            assert(!innerProjectList.map(_.exprId).contains(rightKeyExprId))
        }
      }

      withSQLConf(SQLConf.ONLY_NECESSARY_AND_UNIQUE_METADATA_COLUMNS.key -> "false") {
        // Inner project list should contain a reference to the right key.
        val analyzed = join.select(left("k")).queryExecution.analyzed
        analyzed match {
          case Project(_, Project(innerProjectList: Seq[NamedExpression], _)) =>
            assert(Seq("k", "v1", "v2", "k", "k") == innerProjectList.map(_.name))
            assert(innerProjectList.map(_.exprId).contains(rightKeyExprId))
        }
      }
    }
  }

  test("Add only unique metadata columns") {
    // For a query like:
    //
    // {{{
    //   SELECT t1.k
    //   FROM VALUES(1,2) AS t1(k,v1) FULL OUTER JOIN VALUES(1,2) AS t2(k,v2) USING (k)
    //   WHERE t1.k IS NOT NULL
    // }}}
    //
    // the analyzed plan will look like:
    //   Project [k#0]
    //   +- Project [k#4, v1#1, v2#3, k#0, k#2]
    //      +- Filter isnotnull(k#0)
    //         +- Project [coalesce(k#0, k#2) AS k#4, v1#1, v2#3, k#0, k#0, k#2]
    //            +- Join FullOuter, (k#0 = k#2)
    //               :- SubqueryAlias t1
    //               :  +- LocalRelation [k#0, v1#1]
    //               +- SubqueryAlias t2
    //                  +- LocalRelation [k#2, v2#3]
    //
    // In this case, the Project under Filter contains a duplicate k#0 attribute reference as
    // well as an unnecessary k#2 attribute reference. Additionally, the second top-most Project
    // has an extra k#2 that can also be removed. The duplicate reference comes from the fact
    // that this attribute is first added by the ResolveReferences rule as missing input, but
    // AddMetadataColumns doesn't respect the fact that this attribute already exists in the
    // project list and duplicates it. With `spark.sql.analyzer.uniqueNecessaryMetadataColumns`
    // set to true, we remove this duplication and the unnecessary attribute.
    withTable("t1", "t2") {
      sql("CREATE TABLE t1(k INT, v1 INT)")
      sql("CREATE TABLE t2(k INT, v2 INT)")
      val left = sql("select * from t1")
      val right = sql("select * from t2")
      val join = left.join(right, Seq("k"), "full_outer")
      val filter = join.filter(left("k").isNotNull)

      val leftKeyExprId = left
        .select(left("k"))
        .queryExecution
        .analyzed
        .asInstanceOf[Project]
        .projectList
        .head
        .exprId
      val rightKeyExprId = right
        .select(right("k"))
        .queryExecution
        .analyzed
        .asInstanceOf[Project]
        .projectList
        .head
        .exprId

      withSQLConf(SQLConf.ONLY_NECESSARY_AND_UNIQUE_METADATA_COLUMNS.key -> "true") {
        // With the conf on, no duplication of the left key and no unnecessary right key.
        val analyzed = filter.select(left("k")).queryExecution.analyzed
        analyzed match {
          case Project(
              _,
              Project(_, Filter(_, Project(innerProjectList: Seq[NamedExpression], _)))
            ) =>
            assert(Seq("k", "v1", "v2", "k") == innerProjectList.map(_.name))
            assert(innerProjectList.map(_.exprId).count(_ == rightKeyExprId) == 0)
            assert(innerProjectList.map(_.exprId).count(_ == leftKeyExprId) == 1)
        }
      }

      withSQLConf(SQLConf.ONLY_NECESSARY_AND_UNIQUE_METADATA_COLUMNS.key -> "false") {
        // With the conf off, a duplicated left key and an unnecessary right key.
        val analyzed = filter.select(left("k")).queryExecution.analyzed
        analyzed match {
          case Project(
              _,
              Project(_, Filter(_, Project(innerProjectList: Seq[NamedExpression], _)))
            ) =>
            assert(Seq("k", "v1", "v2", "k", "k", "k") == innerProjectList.map(_.name))
            assert(innerProjectList.map(_.exprId).count(_ == rightKeyExprId) == 1)
            assert(innerProjectList.map(_.exprId).count(_ == leftKeyExprId) == 2)
        }
      }
    }
  }
}
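A quick way to see both behaviors side by side outside the suite (a sketch, assuming a spark-shell session and the `t1`/`t2` tables from the earlier example):

```scala
// Compare analyzed plans under both settings. Re-create the Dataset after
// each conf change, since analysis runs eagerly when sql() is called.
Seq("true", "false").foreach { flag =>
  spark.conf.set("spark.sql.analyzer.uniqueNecessaryMetadataColumns", flag)
  val df = spark.sql(
    "SELECT t1.k FROM t1 FULL OUTER JOIN t2 USING (k) WHERE t1.k IS NOT NULL")
  println(s"uniqueNecessaryMetadataColumns=$flag:\n${df.queryExecution.analyzed}")
}
```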
