
Commit addb3ab

Merge pull request #23 from marmbrus/streaming-attributes

Fix attribute rewiring

2 parents: 0630d29 + 764aac9

6 files changed: +60 -22 lines


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala

Lines changed: 22 additions & 16 deletions
@@ -1215,6 +1215,8 @@ object CleanupAliases extends Rule[LogicalPlan] {
       Window(projectList, cleanedWindowExprs, partitionSpec.map(trimAliases),
         orderSpec.map(trimAliases(_).asInstanceOf[SortOrder]), child)

+    case o: ObjectOperator => o
+
     case other =>
       var stop = false
       other transformExpressionsDown {
@@ -1265,22 +1267,26 @@ object ResolveUpCast extends Rule[LogicalPlan] {
   }

   def apply(plan: LogicalPlan): LogicalPlan = {
-    plan transformAllExpressions {
-      case u @ UpCast(child, _, _) if !child.resolved => u
-
-      case UpCast(child, dataType, walkedTypePath) => (child.dataType, dataType) match {
-        case (from: NumericType, to: DecimalType) if !to.isWiderThan(from) =>
-          fail(child, to, walkedTypePath)
-        case (from: DecimalType, to: NumericType) if !from.isTighterThan(to) =>
-          fail(child, to, walkedTypePath)
-        case (from, to) if illegalNumericPrecedence(from, to) =>
-          fail(child, to, walkedTypePath)
-        case (TimestampType, DateType) =>
-          fail(child, DateType, walkedTypePath)
-        case (StringType, to: NumericType) =>
-          fail(child, to, walkedTypePath)
-        case _ => Cast(child, dataType)
-      }
+    plan transform {
+      case o: ObjectOperator => o
+      case other =>
+        other transformExpressions {
+          case u @ UpCast(child, _, _) if !child.resolved => u
+
+          case UpCast(child, dataType, walkedTypePath) => (child.dataType, dataType) match {
+            case (from: NumericType, to: DecimalType) if !to.isWiderThan(from) =>
+              fail(child, to, walkedTypePath)
+            case (from: DecimalType, to: NumericType) if !from.isTighterThan(to) =>
+              fail(child, to, walkedTypePath)
+            case (from, to) if illegalNumericPrecedence(from, to) =>
+              fail(child, to, walkedTypePath)
+            case (TimestampType, DateType) =>
+              fail(child, DateType, walkedTypePath)
+            case (StringType, to: NumericType) =>
+              fail(child, to, walkedTypePath)
+            case _ => Cast(child, dataType)
+          }
+        }
     }
   }
 }
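Both Analyzer changes apply the same fix: operators that embed ExpressionEncoders (marked by the new `ObjectOperator` trait, introduced in basicOperators.scala below) carry serializer/deserializer expressions that ordinary expression rewrites would corrupt, so each rule now returns those nodes unchanged before falling through to its usual logic. A minimal sketch of the pattern; `SkipObjectOperators` is an illustrative name, not a Catalyst rule:

    import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, ObjectOperator}
    import org.apache.spark.sql.catalyst.rules.Rule

    // Sketch only: shows the skip-the-marker pattern used by CleanupAliases
    // and ResolveUpCast above.
    object SkipObjectOperators extends Rule[LogicalPlan] {
      def apply(plan: LogicalPlan): LogicalPlan = plan transform {
        // Encoder-bearing operators pass through untouched; their expressions
        // describe object (de)serialization, not ordinary column references.
        case o: ObjectOperator => o
        // Every other operator gets the rule's normal expression rewrite.
        case other => other transformExpressions {
          case e => e // rule-specific rewrites would go here
        }
      }
    }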

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala

Lines changed: 1 addition & 0 deletions
@@ -448,6 +448,7 @@ object NullPropagation extends Rule[LogicalPlan] {
   */
 object ConstantFolding extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+    case o: ObjectOperator => o
     case q: LogicalPlan => q transformExpressionsDown {
       // Skip redundant folding of literals. This rule is technically not necessary. Placing this
       // here avoids running the next rule for Literal values, which would create a new Literal
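One subtlety worth noting, since `MapPartitions` and the other object operators are themselves `LogicalPlan`s: Scala tries match arms in order, so the `ObjectOperator` case must come before the `case q: LogicalPlan` catch-all or it would never fire. A toy standalone illustration (these types are stand-ins, not Catalyst's):

    trait Marker                  // stands in for ObjectOperator
    class Plan                    // stands in for LogicalPlan
    class ObjectPlan extends Plan with Marker

    def describe(p: Plan): String = p match {
      case _: Marker => "skipped" // must come first: an ObjectPlan is also a Plan
      case _: Plan   => "folded"
    }
    // describe(new ObjectPlan) == "skipped"; describe(new Plan) == "folded"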

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala

Lines changed: 10 additions & 0 deletions
@@ -17,9 +17,11 @@

 package org.apache.spark.sql.catalyst.plans

+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression, VirtualColumn}
 import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.types.{DataType, StructType}
+import org.apache.spark.sql.catalyst.util._

 abstract class QueryPlan[PlanType <: TreeNode[PlanType]] extends TreeNode[PlanType] {
   self: PlanType =>
@@ -83,6 +85,14 @@ abstract class QueryPlan[PlanType <: TreeNode[PlanType]] extends TreeNode[PlanType] {
     }

     def recursiveTransform(arg: Any): AnyRef = arg match {
+      case e: ExpressionEncoder[_] =>
+        val newEncoder = new ExpressionEncoder(
+          e.schema,
+          e.flat,
+          e.toRowExpressions.map(transformExpressionDown),
+          transformExpressionDown(e.fromRowExpression),
+          e.clsTag)
+        newEncoder
       case e: Expression => transformExpressionDown(e)
       case Some(e: Expression) => Some(transformExpressionDown(e))
       case m: Map[_, _] => m
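With the new case in place, `transformExpressions` (and everything built on it, such as attribute rewiring) reaches the expressions embedded in any `ExpressionEncoder` argument of an operator, rebuilding the encoder with its `toRowExpressions` and `fromRowExpression` rewritten. A hedged sketch of the effect; `staleAttr` and `freshAttr` are hypothetical attributes, not part of the patch:

    // Rewire one attribute everywhere in the operator, now including inside
    // its encoders (before this patch, encoder internals were skipped).
    val rewired = plan transformExpressions {
      case a: AttributeReference if a.exprId == staleAttr.exprId => freshAttr
    }

One caveat of this approach is that the encoder is rebuilt field by field, so the match above must be kept in sync if `ExpressionEncoder` ever gains constructor parameters.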

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicOperators.scala

Lines changed: 6 additions & 4 deletions
@@ -479,6 +479,8 @@ case object OneRowRelation extends LeafNode {
   override def statistics: Statistics = Statistics(sizeInBytes = 1)
 }

+trait ObjectOperator
+
 /**
  * A relation produced by applying `func` to each partition of the `child`. tEncoder/uEncoder are
  * used respectively to decode/encode from the JVM object representation expected by `func.`
@@ -488,7 +490,7 @@ case class MapPartitions[T, U](
     tEncoder: ExpressionEncoder[T],
     uEncoder: ExpressionEncoder[U],
     output: Seq[Attribute],
-    child: LogicalPlan) extends UnaryNode {
+    child: LogicalPlan) extends UnaryNode with ObjectOperator {
   override def producedAttributes: AttributeSet = outputSet
 }

@@ -513,7 +515,7 @@ case class AppendColumns[T, U](
     tEncoder: ExpressionEncoder[T],
     uEncoder: ExpressionEncoder[U],
     newColumns: Seq[Attribute],
-    child: LogicalPlan) extends UnaryNode {
+    child: LogicalPlan) extends UnaryNode with ObjectOperator {
   override def output: Seq[Attribute] = child.output ++ newColumns
   override def producedAttributes: AttributeSet = AttributeSet(newColumns)
 }
@@ -549,7 +551,7 @@ case class MapGroups[K, T, U](
     uEncoder: ExpressionEncoder[U],
     groupingAttributes: Seq[Attribute],
     output: Seq[Attribute],
-    child: LogicalPlan) extends UnaryNode {
+    child: LogicalPlan) extends UnaryNode with ObjectOperator {
   override def producedAttributes: AttributeSet = outputSet
 }

@@ -592,6 +594,6 @@ case class CoGroup[Key, Left, Right, Result](
     leftGroup: Seq[Attribute],
     rightGroup: Seq[Attribute],
     left: LogicalPlan,
-    right: LogicalPlan) extends BinaryNode {
+    right: LogicalPlan) extends BinaryNode with ObjectOperator {
   override def producedAttributes: AttributeSet = outputSet
 }
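`ObjectOperator` is deliberately a bare marker trait: it has no members and exists only so rules can pattern-match on "operators that embed encoders". It also makes these operators easy to find, for example in a test; a small sketch, assuming `plan` is any `LogicalPlan`:

    // Collect every encoder-bearing operator in a plan, e.g. to assert that
    // an optimizer rule left them structurally unchanged.
    val objectOps = plan.collect { case o: ObjectOperator => o }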

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/package.scala

Lines changed: 20 additions & 1 deletion
@@ -19,9 +19,10 @@ package org.apache.spark.sql.catalyst

 import java.io._

+import org.apache.spark.Logging
 import org.apache.spark.util.Utils

-package object util {
+package object util extends Logging {

   /** Silences output to stderr or stdout for the duration of f */
   def quietly[A](f: => A): A = {
@@ -42,6 +43,24 @@ package object util {
     }
   }

+  private val analysisRule = """.*org\.apache\.spark\.sql\.catalyst\.analysis\.([A-Za-z]+).*""".r
+
+  /**
+   * Logs `msg` along with the name of the analyzer rule that is currently running. This is
+   * pretty expensive, so it always logs at warning level.
+   */
+  def logRule(msg: String): Unit = {
+    val error = try sys.error("") catch {
+      case e: Exception =>
+        stackTraceToString(e)
+    }
+
+    val rule = error.split("\n").collect {
+      case analysisRule(r) => r
+    }.headOption.getOrElse("unknown rule")
+    logWarning(s"$rule: $msg")
+  }
+
   def fileToString(file: File, encoding: String = "UTF-8"): String = {
     val inStream = new FileInputStream(file)
     val outStream = new ByteArrayOutputStream
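`logRule` recovers the name of the running rule by throwing and catching a throwaway exception, then scanning the resulting stack trace for the first frame under `org.apache.spark.sql.catalyst.analysis`. The extraction can be exercised on its own; the stack-frame string below is made up for illustration:

    // Standalone demo of the regex used by logRule; the frame text is invented.
    val analysisRule = """.*org\.apache\.spark\.sql\.catalyst\.analysis\.([A-Za-z]+).*""".r
    val frame = "at org.apache.spark.sql.catalyst.analysis.SomeRule$.apply(Analyzer.scala)"
    val ruleName = frame match {
      case analysisRule(r) => r   // full-string match; the group grabs the class name
      case _               => "unknown rule"
    }
    // ruleName == "SomeRule"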

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala

Lines changed: 1 addition & 1 deletion
@@ -108,7 +108,7 @@ class StreamExecution(
       val newPlan = batch.data.logicalPlan

       assert(output.size == newPlan.output.size)
-      replacements ++= newPlan.output.zip(output)
+      replacements ++= output.zip(newPlan.output)
       newPlan
     }.getOrElse {
       LocalRelation(output)
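The direction of the zip is the entire fix: `replacements` is consumed as a map keyed by the stale attributes of the pre-existing `output`, yielding the fresh attributes produced by the new batch's plan. A hedged sketch of the consuming side, simplified from the surrounding StreamExecution logic rather than quoted from it:

    // replacements holds (staleAttr, freshAttr) pairs, so the map is keyed by
    // the attributes the old plan still references.
    val replacementMap = AttributeMap(replacements)
    val rewired = logicalPlan transformAllExpressions {
      case a: Attribute if replacementMap.contains(a) => replacementMap(a)
    }
    // Zipped the other way, the map would be keyed by the *new* attributes, so
    // lookups against the old plan's references would always miss and nothing
    // would be rewired. That is the bug this commit fixes.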
