Commit 4f38e3b

Don't inherit expression id in dropDuplicates
1 parent 13f54a9 commit 4f38e3b
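
For context: dropDuplicates is planned as an Aggregate that groups on the key columns and keeps every other column through a first(...) alias, and this commit stops that alias from inheriting the child attribute's expression id. A user-level sketch of the plan shape involved (the session setup and sample data are illustrative, not part of the commit):

    import org.apache.spark.sql.SparkSession

    object DropDuplicatesSketch extends App {
      val spark = SparkSession.builder().master("local[2]").appName("sketch").getOrCreate()
      import spark.implicits._

      // dropDuplicates("_1") becomes an Aggregate grouping on _1; column _2 survives
      // through an Alias over first(_2). Before this commit that Alias reused _2's
      // exprId, leaving two named expressions in the plan with the same id.
      val ds = Seq(("a", 1), ("a", 2), ("b", 1)).toDS().dropDuplicates("_1")
      ds.explain(extended = true) // the aggregate's aliases now carry fresh exprIds

      spark.stop()
    }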

File tree

4 files changed: +12, -41 lines

sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala

Lines changed: 1 addition & 4 deletions

@@ -2003,10 +2003,7 @@ class Dataset[T] private[sql](
       if (groupColExprIds.contains(attr.exprId)) {
         attr
       } else {
-        // Removing duplicate rows should not change output attributes. We should keep
-        // the original exprId of the attribute. Otherwise, to select a column in original
-        // dataset will cause analysis exception due to unresolved attribute.
-        Alias(new First(attr).toAggregateExpression(), attr.name)(exprId = attr.exprId)
+        Alias(new First(attr).toAggregateExpression(), attr.name)()
       }
     }
     Aggregate(groupCols, aggCols, logicalPlan)
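
The fix in isolation: leaving the second parameter list empty makes the Alias constructor fall back to its exprId = NamedExpression.newExprId default instead of copying the child attribute's id. A minimal sketch against catalyst's internal API (the AttributeReference here is fabricated for illustration):

    import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference}
    import org.apache.spark.sql.catalyst.expressions.aggregate.First
    import org.apache.spark.sql.types.IntegerType

    val attr = AttributeReference("_2", IntegerType)()

    // Old behavior: the alias reused the child attribute's id.
    val inherited = Alias(new First(attr).toAggregateExpression(), attr.name)(exprId = attr.exprId)
    assert(inherited.exprId == attr.exprId)

    // New behavior: the empty second parameter list allocates a fresh id,
    // so the aggregate's output no longer collides with the child attribute.
    val fresh = Alias(new First(attr).toAggregateExpression(), attr.name)()
    assert(fresh.exprId != attr.exprId)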

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala

Lines changed: 1 addition & 6 deletions

@@ -29,7 +29,7 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
-import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp}
 import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
 import org.apache.spark.sql.execution.QueryExecution
 import org.apache.spark.sql.execution.command.ExplainCommand
@@ -495,13 +495,8 @@ class StreamExecution(
 
     // Rewire the plan to use the new attributes that were returned by the source.
     val replacementMap = AttributeMap(replacements)
-    val exprIdMap =
-      replacements.map { case (oldAttr, newAttr) => (oldAttr.exprId, newAttr.exprId)}.toMap
     val triggerLogicalPlan = withNewSources transformAllExpressions {
       case a: Attribute if replacementMap.contains(a) => replacementMap(a)
-      case a: Alias if exprIdMap.contains(a.exprId) =>
-        // Also rewrite `Alias`s as they may use the same `exprId` of `Attribute`s.
-        Alias(a.child, a.name)(exprIdMap(a.exprId), a.qualifier, a.explicitMetadata, a.isGenerated)
       case ct: CurrentTimestamp =>
         CurrentBatchTimestamp(offsetSeqMetadata.batchTimestampMs,
           ct.dataType)
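
With dropDuplicates no longer emitting duplicate ids, the Alias rewrite above has nothing left to match, so the whole exprIdMap pass can go: AttributeMap keys its lookups on exprId, and rewriting Attributes alone rewires the plan. A small sketch of that lookup behavior (the attributes are fabricated for illustration):

    import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference}
    import org.apache.spark.sql.types.IntegerType

    // Two attributes with the same name but distinct exprIds, as a restarted
    // streaming source would produce; AttributeMap matches on exprId, not name.
    val oldAttr = AttributeReference("value", IntegerType)()
    val newAttr = AttributeReference("value", IntegerType)()
    val replacementMap = AttributeMap(Seq(oldAttr -> newAttr))

    assert(replacementMap.contains(oldAttr))  // hit: same exprId
    assert(!replacementMap.contains(newAttr)) // miss: different exprId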

sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala

Lines changed: 10 additions & 5 deletions

@@ -21,6 +21,7 @@ import java.io.{Externalizable, ObjectInput, ObjectOutput}
 import java.sql.{Date, Timestamp}
 
 import org.apache.spark.sql.catalyst.encoders.{OuterScopes, RowEncoder}
+import org.apache.spark.sql.catalyst.expressions.NamedExpression
 import org.apache.spark.sql.catalyst.util.sideBySide
 import org.apache.spark.sql.execution.{LogicalRDD, RDDScanExec, SortExec}
 import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ShuffleExchange}
@@ -898,11 +899,15 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
       (1, 2), (1, 1), (2, 1), (2, 2))
   }
 
-  test("dropDuplicates should not change child plan output") {
-    val ds = Seq(("a", 1), ("a", 2), ("b", 1), ("a", 1)).toDS()
-    checkDataset(
-      ds.dropDuplicates("_1").select(ds("_1").as[String], ds("_2").as[Int]),
-      ("a", 1), ("b", 1))
+  test("SPARK-19065 dropDuplicates should not create expressions using the same id") {
+    val ds = Seq(("a", 1), ("a", 2), ("b", 1), ("a", 1)).toDS().dropDuplicates("_1")
+    var exprs = Set.empty[NamedExpression]
+    ds.logicalPlan.transformAllExpressions { case e: NamedExpression =>
+      exprs += e
+      e
+    }
+    val duplicatedExprs = exprs.groupBy(expr => expr.exprId).filter(_._2.size > 1).values
+    assert(duplicatedExprs.isEmpty)
   }
 
   test("SPARK-16097: Encoders.tuple should handle null object correctly") {

sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala

Lines changed: 0 additions & 26 deletions

@@ -304,32 +304,6 @@ class StreamSuite extends StreamTest {
       q.stop()
     }
   }
-
-  test("SPARK-19065 Alia should be replaced as well") {
-    withTempPath { testPath =>
-      val data = Seq((1, 2), (2, 3), (3, 4))
-      data.toDS.write.mode("overwrite").json(testPath.getCanonicalPath)
-      val schema = spark.read.json(testPath.getCanonicalPath).schema
-      val query = spark
-        .readStream
-        .schema(schema)
-        .json(testPath.getCanonicalPath)
-        .dropDuplicates("_1") // dropDuplicates will create an Alias using the same exprId.
-        .writeStream
-        .format("memory")
-        .queryName("testquery")
-        .outputMode("complete")
-        .start()
-      try {
-        query.processAllAvailable()
-        if (query.exception.isDefined) {
-          throw query.exception.get
-        }
-      } finally {
-        query.stop()
-      }
-    }
-  }
 }
 
 /**
