
Commit c31f727

wangyum authored; GitHub Enterprise committed
[CARMEL-5845] Add a logical plan visitor to propagate the distinct attributes (#884)
* Remove the aggregation from a left semi/anti join if the same aggregation has already been done on the left side
* Add more tests
* grouping -> groupingExps
* Add DistinctAttributesVisitor
* Fix test name
* Improve DistinctAttributesVisitor
* Fix test
* DistinctKeyVisitor
* Address comments
* Fix Scala 2.13
* Address comments
* Address all comments
* Address all comments
* fix
* fix test
* [SPARK-38489][SQL] Aggregate.groupOnly support foldable expressions

  ### What changes were proposed in this pull request?
  This PR makes `Aggregate.groupOnly` support foldable expressions.

  ### Why are the changes needed?
  Improve query performance.

  ### Does this PR introduce _any_ user-facing change?
  No.

  ### How was this patch tested?
  Unit test.

  Closes #35795 from wangyum/SPARK-38489.

  Authored-by: Yuming Wang <yumwang@ebay.com>
  Signed-off-by: Wenchen Fan <wenchen@databricks.com>
  (cherry picked from commit bcf7849)
* Remove
* fix
* Update DistinctKeyVisitor.scala
1 parent 8adae4c · commit c31f727

File tree: 11 files changed (+565, −7 lines)


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
(1 addition, 0 deletions)

@@ -108,6 +108,7 @@ abstract class Optimizer(catalogManager: CatalogManager)
       RewriteCorrelatedScalarSubquery,
       EliminateSerialization,
       RemoveRedundantAliases,
+      RemoveRedundantAggregates,
       UnwrapCastInBinaryComparison,
       RemoveNoopOperators,
       SimplifyExtractValueOps,
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAggregates.scala
(new file, 34 additions)

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.expressions.{Alias, AliasHelper, ExpressionSet}
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.Rule

/**
 * Remove redundant aggregates from a query plan. A redundant aggregate is an aggregate whose
 * only goal is to keep distinct values, while its parent aggregate would ignore duplicate values.
 */
object RemoveRedundantAggregates extends Rule[LogicalPlan] with AliasHelper {
  def apply(plan: LogicalPlan): LogicalPlan = plan.transform {
    case agg @ Aggregate(groupingExps, _, child)
        if agg.groupOnly && child.distinctKeys.exists(_.subsetOf(ExpressionSet(groupingExps))) =>
      Project(agg.aggregateExpressions, child)
  }
}
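
For orientation, a minimal sketch of a query shape this rule simplifies. It is an illustration, not part of the commit; the local session and the table t(a, b) are made up for the example.

import org.apache.spark.sql.SparkSession

// Hypothetical setup for a self-contained demo.
val spark = SparkSession.builder().master("local[*]").appName("demo").getOrCreate()
import spark.implicits._
Seq((1, "x"), (1, "x"), (2, "y")).toDF("a", "b").createOrReplaceTempView("t")

// The inner DISTINCT is an Aggregate that already guarantees {a, b} as a
// distinct key, so the outer group-only Aggregate adds nothing: the rule
// rewrites it into a Project over the child.
spark.sql("SELECT a, b FROM (SELECT DISTINCT a, b FROM t) GROUP BY a, b").explain(true)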

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala
(4 additions, 4 deletions)

@@ -169,16 +169,16 @@ object EliminateOuterJoin extends Rule[LogicalPlan] with PredicateHelper {
       if (j.joinType == newJoinType) f else Filter(condition, j.copy(joinType = newJoinType))

     case a @ Aggregate(_, _, Join(left, _, LeftOuter, _, _))
-        if a.groupOnly && a.references.subsetOf(AttributeSet(left.output)) =>
+        if a.groupOnly && a.references.subsetOf(left.outputSet) =>
       a.copy(child = left)
     case a @ Aggregate(_, _, Join(_, right, RightOuter, _, _))
-        if a.groupOnly && a.references.subsetOf(AttributeSet(right.output)) =>
+        if a.groupOnly && a.references.subsetOf(right.outputSet) =>
       a.copy(child = right)
     case a @ Aggregate(_, _, p @ Project(_, Join(left, _, LeftOuter, _, _)))
-        if a.groupOnly && a.references.subsetOf(AttributeSet(left.output)) =>
+        if a.groupOnly && p.references.subsetOf(left.outputSet) =>
       a.copy(child = p.copy(child = left))
     case a @ Aggregate(_, _, p @ Project(_, Join(_, right, RightOuter, _, _)))
-        if a.groupOnly && a.references.subsetOf(AttributeSet(right.output)) =>
+        if a.groupOnly && p.references.subsetOf(right.outputSet) =>
       a.copy(child = p.copy(child = right))
   }
 }
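
The switch from a.references to p.references in the two Project cases is what lets the rule fire when the Project introduces aliases, which the SPARK-37292 test below exercises. A sketch of the reasoning; the plan shape is illustrative, not taken from the suite:

// Aggregate [newAlias], [newAlias]      -- a.references = {newAlias}
// +- Project [x.b AS newAlias]          -- p.references = {x.b}
//    +- Join LeftOuter, (x.a = y.a)
//
// The Aggregate resolves against the Project's output, so once an alias is
// involved a.references contains `newAlias`, which never appears in
// left.outputSet, and the old guard could never match. Checking p.references
// asks the right question (does the Project read only the left side?) and
// lets the unused side of the outer join be dropped.
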
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DistinctKeyVisitor.scala
(new file, 130 additions)

/* ... Apache License, Version 2.0 header, identical to the one above ... */

package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Alias, Expression, ExpressionSet, NamedExpression}
import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
import org.apache.spark.sql.catalyst.plans.{Inner, LeftOuter, LeftSemiOrAnti, RightOuter}

/**
 * A visitor pattern for traversing a [[LogicalPlan]] tree and propagating the distinct attributes.
 */
object DistinctKeyVisitor extends LogicalPlanVisitor[Set[ExpressionSet]] {

  private def projectDistinctKeys(
      keys: Set[ExpressionSet], projectList: Seq[NamedExpression]): Set[ExpressionSet] = {
    val outputSet = ExpressionSet(projectList.map(_.toAttribute))
    val aliases = projectList.filter(_.isInstanceOf[Alias])
    if (aliases.isEmpty) {
      keys.filter(_.subsetOf(outputSet)).filter(_.nonEmpty)
    } else {
      val expressions = keys.flatMap(_.toSet)
      projectList.filter {
        case a: Alias => expressions.exists(_.semanticEquals(a.child))
        case ne => expressions.exists(_.semanticEquals(ne))
      }.toSet.subsets(keys.map(_.size).min).take(50).filter { s =>
        val references = s.map {
          case a: Alias => a.child
          case ne => ne
        }
        keys.exists(_.equals(ExpressionSet(references)))
      }.map(s => ExpressionSet(s.map(_.toAttribute))).filter(_.nonEmpty).toSet
    }
  }

  override def default(p: LogicalPlan): Set[ExpressionSet] = Set.empty[ExpressionSet]

  override def visitAggregate(p: Aggregate): Set[ExpressionSet] = {
    val groupingExps = ExpressionSet(p.groupingExpressions) // handle group by a, a
    projectDistinctKeys(Set(groupingExps), p.aggregateExpressions)
  }

  override def visitDistinct(p: Distinct): Set[ExpressionSet] = Set(ExpressionSet(p.output))

  override def visitExcept(p: Except): Set[ExpressionSet] =
    if (!p.isAll) Set(ExpressionSet(p.output)) else default(p)

  override def visitExpand(p: Expand): Set[ExpressionSet] = default(p)

  override def visitFilter(p: Filter): Set[ExpressionSet] = p.child.distinctKeys

  override def visitGenerate(p: Generate): Set[ExpressionSet] = default(p)

  override def visitGlobalLimit(p: GlobalLimit): Set[ExpressionSet] = {
    p.maxRows match {
      case Some(value) if value <= 1 => Set(ExpressionSet(p.output))
      case _ => p.child.distinctKeys
    }
  }

  override def visitIntersect(p: Intersect): Set[ExpressionSet] = {
    if (!p.isAll) Set(ExpressionSet(p.output)) else default(p)
  }

  override def visitJoin(p: Join): Set[ExpressionSet] = {
    p match {
      case Join(_, _, LeftSemiOrAnti(_), _, _) =>
        p.left.distinctKeys
      case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, _, left, right, _)
          if left.distinctKeys.nonEmpty || right.distinctKeys.nonEmpty =>
        val rightJoinKeySet = ExpressionSet(rightKeys)
        val leftJoinKeySet = ExpressionSet(leftKeys)
        joinType match {
          case Inner if left.distinctKeys.exists(_.subsetOf(leftJoinKeySet)) &&
              right.distinctKeys.exists(_.subsetOf(rightJoinKeySet)) =>
            left.distinctKeys ++ right.distinctKeys
          case Inner | LeftOuter if right.distinctKeys.exists(_.subsetOf(rightJoinKeySet)) =>
            p.left.distinctKeys
          case Inner | RightOuter if left.distinctKeys.exists(_.subsetOf(leftJoinKeySet)) =>
            p.right.distinctKeys
          case _ =>
            default(p)
        }
      case _ => default(p)
    }
  }

  override def visitLocalLimit(p: LocalLimit): Set[ExpressionSet] = p.child.distinctKeys

  override def visitPivot(p: Pivot): Set[ExpressionSet] = default(p)

  override def visitProject(p: Project): Set[ExpressionSet] = {
    if (p.child.distinctKeys.nonEmpty) {
      projectDistinctKeys(p.child.distinctKeys, p.projectList)
    } else {
      default(p)
    }
  }

  override def visitRepartition(p: Repartition): Set[ExpressionSet] = p.child.distinctKeys

  override def visitRepartitionByExpr(p: RepartitionByExpression): Set[ExpressionSet] =
    p.child.distinctKeys

  override def visitSample(p: Sample): Set[ExpressionSet] = {
    if (!p.withReplacement) p.child.distinctKeys else default(p)
  }

  override def visitScriptTransform(p: ScriptTransformation): Set[ExpressionSet] = default(p)

  override def visitUnion(p: Union): Set[ExpressionSet] = default(p)

  override def visitWindow(p: Window): Set[ExpressionSet] = p.child.distinctKeys

  override def visitSort(p: Sort): Set[ExpressionSet] = p.child.distinctKeys
}
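
To see the propagation end to end, a small sketch in the style of the Catalyst DSL used by the test suite below; the relation and alias names are made up for the example:

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation

val r = LocalRelation('a.int, 'b.int, 'c.int)

// visitAggregate reports the grouping keys {a, b} as a distinct key;
// visitProject then maps that key through the aliases via projectDistinctKeys
// (x -> a, y -> b), so the top of the plan reports {x, y}.
val plan = r.groupBy('a, 'b)('a, 'b).select('a.attr.as("x"), 'b.attr.as("y")).analyze
println(plan.distinctKeys) // expected: a single ExpressionSet containing x and y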

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
(1 addition, 0 deletions)

@@ -30,6 +30,7 @@ abstract class LogicalPlan
   extends QueryPlan[LogicalPlan]
   with AnalysisHelper
   with LogicalPlanStats
+  with LogicalPlanDistinctKeys
   with QueryPlanConstraints
   with Logging {

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanDistinctKeys.scala
(new file, 34 additions)

/* ... Apache License, Version 2.0 header, identical to the one above ... */

package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.ExpressionSet
import org.apache.spark.sql.internal.SQLConf.PROPAGATE_DISTINCT_KEYS_ENABLED

/**
 * A trait to add distinct attributes to [[LogicalPlan]]. For example:
 * {{{
 *   SELECT a, b, SUM(c) FROM Tab1 GROUP BY a, b
 *   // returns a, b
 * }}}
 */
trait LogicalPlanDistinctKeys { self: LogicalPlan =>
  lazy val distinctKeys: Set[ExpressionSet] = {
    if (conf.getConf(PROPAGATE_DISTINCT_KEYS_ENABLED)) DistinctKeyVisitor.visit(self) else Set.empty
  }
}

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
(6 additions, 1 deletion)

@@ -648,7 +648,12 @@ case class Aggregate(

   // Whether this Aggregate operator is group only. For example: SELECT a, a FROM t GROUP BY a
   private[sql] def groupOnly: Boolean = {
-    aggregateExpressions.forall(a => groupingExpressions.exists(g => a.semanticEquals(g)))
+    // aggregateExpressions can be empty through Dataset.agg,
+    // so we should also check that groupingExpressions is non-empty
+    groupingExpressions.nonEmpty && aggregateExpressions.map {
+      case Alias(child, _) => child
+      case e => e
+    }.forall(a => a.foldable || groupingExpressions.exists(g => a.semanticEquals(g)))
   }
 }
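
For intuition, a few hypothetical cases under the new definition (the SQL shapes are illustrative, not from the patch):

// SELECT a, a FROM t GROUP BY a          -- group only: every output is a grouping expression
// SELECT a, 1, true AS flag FROM t GROUP BY a
//                                        -- now also group only: 1 and true are foldable
// SELECT a, sum(b) FROM t GROUP BY a     -- not group only: sum(b) is a genuine aggregate
// ds.agg(sum($"b"))                      -- not group only: groupingExpressions is empty
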
sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
(9 additions, 0 deletions)

@@ -731,6 +731,15 @@ object SQLConf {
     .booleanConf
     .createWithDefault(true)

+  val PROPAGATE_DISTINCT_KEYS_ENABLED =
+    buildConf("spark.sql.optimizer.propagateDistinctKeys.enabled")
+      .internal()
+      .doc("When true, the query optimizer will propagate a set of distinct attributes from " +
+        "the current node and use it to optimize the query.")
+      .version("3.3.0")
+      .booleanConf
+      .createWithDefault(true)
+
   val ESCAPED_STRING_LITERALS = buildConf("spark.sql.parser.escapedStringLiterals")
     .internal()
     .doc("When true, string literals (including regex patterns) remain escaped in our SQL " +

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/AggregateOptimizeSuite.scala
(38 additions, 2 deletions)

@@ -21,6 +21,7 @@ import org.apache.spark.sql.catalyst.analysis.AnalysisTest
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
 import org.apache.spark.sql.catalyst.expressions.Literal
+import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral}
 import org.apache.spark.sql.catalyst.plans.{LeftOuter, RightOuter}
 import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Distinct, LocalRelation, LogicalPlan}
 import org.apache.spark.sql.catalyst.rules.RuleExecutor
@@ -122,7 +123,42 @@ class AggregateOptimizeSuite extends AnalysisTest {
     Optimize.execute(
       x.join(y, LeftOuter, Some("x.a".attr === "y.a".attr))
         .groupBy("x.a".attr)("x.a".attr, Literal(1)).analyze),
-      x.join(y, LeftOuter, Some("x.a".attr === "y.a".attr))
-        .groupBy("x.a".attr)("x.a".attr, Literal(1)).analyze)
+      x.groupBy("x.a".attr)("x.a".attr, Literal(1)).analyze)
+  }
+
+  test("SPARK-37292: Removes outer join if it only has DISTINCT on streamed side with alias") {
+    val x = testRelation.subquery('x)
+    val y = testRelation.subquery('y)
+    comparePlans(
+      Optimize.execute(
+        Distinct(x.join(y, LeftOuter, Some("x.a".attr === "y.a".attr))
+          .select("x.b".attr.as("newAlias"))).analyze),
+      x.select("x.b".attr.as("newAlias")).groupBy("newAlias".attr)("newAlias".attr).analyze)
+
+    comparePlans(
+      Optimize.execute(
+        Distinct(x.join(y, RightOuter, Some("x.a".attr === "y.a".attr))
+          .select("y.b".attr.as("newAlias"))).analyze),
+      y.select("y.b".attr.as("newAlias")).groupBy("newAlias".attr)("newAlias".attr).analyze)
+
+    comparePlans(
+      Optimize.execute(
+        Distinct(x.join(y, LeftOuter, Some("x.a".attr === "y.a".attr))
+          .select("x.b".attr.as("newAlias1"), "x.b".attr.as("newAlias2"))).analyze),
+      x.select("x.b".attr.as("newAlias1"), "x.b".attr.as("newAlias2"))
+        .groupBy("newAlias1".attr, "newAlias2".attr)("newAlias1".attr, "newAlias2".attr).analyze)
+  }
+
+  test("SPARK-38489: Aggregate.groupOnly support foldable expressions") {
+    val x = testRelation.subquery('x)
+    val y = testRelation.subquery('y)
+    comparePlans(
+      Optimize.execute(
+        Distinct(x.join(y, LeftOuter, Some("x.a".attr === "y.a".attr))
+          .select("x.b".attr, TrueLiteral, FalseLiteral.as("newAlias")))
+          .analyze),
+      x.select("x.b".attr, TrueLiteral, FalseLiteral.as("newAlias"))
+        .groupBy("x.b".attr)("x.b".attr, TrueLiteral, FalseLiteral.as("newAlias"))
+        .analyze)
   }
 }
