
Commit 7dc6a34

Adds more in-memory table statistics and propagates them properly
1 parent: 7e63bb4

File tree

9 files changed: +164 −119 lines


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala

Lines changed: 4 additions & 6 deletions

@@ -23,19 +23,17 @@ package org.apache.spark.sql.catalyst.expressions
  * of the name, or the expected nullability).
  */
 object AttributeMap {
-  def apply[A](kvs: Seq[(Attribute, A)]) =
-    new AttributeMap(kvs.map(kv => (kv._1.exprId, (kv._1, kv._2))).toMap)
+  def apply[A](kvs: Seq[(Attribute, A)]) = new AttributeMap(kvs.map(kv => (kv._1.exprId, kv)).toMap)
 }
 
 class AttributeMap[A](baseMap: Map[ExprId, (Attribute, A)])
   extends Map[Attribute, A] with Serializable {
 
   override def get(k: Attribute): Option[A] = baseMap.get(k.exprId).map(_._2)
 
-  override def + [B1 >: A](kv: (Attribute, B1)): Map[Attribute, B1] =
-    (baseMap.map(_._2) + kv).toMap
+  override def + [B1 >: A](kv: (Attribute, B1)): Map[Attribute, B1] = baseMap.values.toMap + kv
 
-  override def iterator: Iterator[(Attribute, A)] = baseMap.map(_._2).iterator
+  override def iterator: Iterator[(Attribute, A)] = baseMap.valuesIterator
 
-  override def -(key: Attribute): Map[Attribute, A] = (baseMap.map(_._2) - key).toMap
+  override def -(key: Attribute): Map[Attribute, A] = baseMap.values.toMap - key
 }
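Editor's note: `AttributeMap` resolves keys by `exprId` rather than by attribute equality, so two references to the same logical attribute that differ only cosmetically (name casing, nullability) hit the same entry. A minimal sketch of that behavior, assuming the `AttributeReference` constructor of this era of catalyst; the explicit `exprId` argument is passed here purely for illustration:

import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.types.IntegerType

// Two references to the same logical attribute: they share an ExprId but
// differ in name casing and nullability.
val id = AttributeReference("id", IntegerType, nullable = false)()
val idToo = AttributeReference("ID", IntegerType, nullable = true)(exprId = id.exprId)

val m = AttributeMap(Seq(id -> 42))

// Both lookups succeed because get() keys on exprId, not on the reference.
assert(m.get(id) == Some(42))
assert(m.get(idToo) == Some(42))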

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala

Lines changed: 15 additions & 16 deletions

@@ -26,25 +26,24 @@ import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.catalyst.types.StructType
 import org.apache.spark.sql.catalyst.trees
 
+/**
+ * Estimates of various statistics. The default estimation logic simply lazily multiplies the
+ * corresponding statistic produced by the children. To override this behavior, override
+ * `statistics` and assign it an overridden version of `Statistics`.
+ *
+ * '''NOTE''': concrete and/or overridden versions of statistics fields should pay attention to the
+ * performance of the implementations. The reason is that estimations might get triggered in
+ * performance-critical processes, such as query plan planning.
+ *
+ * @param sizeInBytes Physical size in bytes. For leaf operators this defaults to 1, otherwise it
+ *                    defaults to the product of children's `sizeInBytes`.
+ */
+case class Statistics(sizeInBytes: BigInt)
+
 abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
   self: Product =>
 
-  /**
-   * Estimates of various statistics. The default estimation logic simply lazily multiplies the
-   * corresponding statistic produced by the children. To override this behavior, override
-   * `statistics` and assign it an overridden version of `Statistics`.
-   *
-   * '''NOTE''': concrete and/or overridden versions of statistics fields should pay attention to the
-   * performance of the implementations. The reason is that estimations might get triggered in
-   * performance-critical processes, such as query plan planning.
-   *
-   * @param sizeInBytes Physical size in bytes. For leaf operators this defaults to 1, otherwise it
-   *                    defaults to the product of children's `sizeInBytes`.
-   */
-  case class Statistics(
-    sizeInBytes: BigInt
-  )
-  lazy val statistics: Statistics = {
+  def statistics: Statistics = {
     if (children.size == 0) {
       throw new UnsupportedOperationException(s"LeafNode $nodeName must implement statistics.")
     }
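Editor's note: per the doc comment, the default estimate for a non-leaf node is the product of its children's `sizeInBytes`, and a leaf node has nothing to derive an estimate from, so it must override `statistics` itself. Turning `lazy val statistics` into `def statistics` presumably also frees subclasses to override it with a `val`, `lazy val`, or `def` of their own. A minimal sketch of the default logic, written against the names in this diff:

import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}

// The default described in the doc comment above: non-leaf nodes multiply
// their children's sizes; a leaf must override statistics or this throws.
def defaultStatistics(plan: LogicalPlan): Statistics = {
  if (plan.children.isEmpty) {
    throw new UnsupportedOperationException(
      s"LeafNode ${plan.nodeName} must implement statistics.")
  }
  Statistics(sizeInBytes = plan.children.map(_.statistics.sizeInBytes).product)
}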

sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnStats.scala

Lines changed: 55 additions & 40 deletions

@@ -24,11 +24,13 @@ import org.apache.spark.sql.catalyst.expressions.{AttributeMap, Attribute, Attri
 import org.apache.spark.sql.catalyst.types._
 
 private[sql] class ColumnStatisticsSchema(a: Attribute) extends Serializable {
-  val upperBound = AttributeReference(a.name + ".upperBound", a.dataType, nullable = false)()
-  val lowerBound = AttributeReference(a.name + ".lowerBound", a.dataType, nullable = false)()
-  val nullCount = AttributeReference(a.name + ".nullCount", IntegerType, nullable = false)()
+  val upperBound = AttributeReference(a.name + ".upperBound", a.dataType, nullable = true)()
+  val lowerBound = AttributeReference(a.name + ".lowerBound", a.dataType, nullable = true)()
+  val nullCount = AttributeReference(a.name + ".nullCount", IntegerType, nullable = false)()
+  val count = AttributeReference(a.name + ".count", IntegerType, nullable = false)()
+  val sizeInBytes = AttributeReference(a.name + ".sizeInBytes", LongType, nullable = false)()
 
-  val schema = Seq(lowerBound, upperBound, nullCount)
+  val schema = Seq(lowerBound, upperBound, nullCount, count, sizeInBytes)
 }
 
 private[sql] class PartitionStatistics(tableSchema: Seq[Attribute]) extends Serializable {
@@ -45,6 +47,10 @@ private[sql] class PartitionStatistics(tableSchema: Seq[Attribute]) extends Seri
  * brings significant performance penalty.
  */
 private[sql] sealed trait ColumnStats extends Serializable {
+  protected var count = 0
+  protected var nullCount = 0
+  protected var sizeInBytes = 0L
+
   /**
    * Gathers statistics information from `row(ordinal)`.
    */
@@ -65,9 +71,8 @@ private[sql] class NoopColumnStats extends ColumnStats {
 }
 
 private[sql] class ByteColumnStats extends ColumnStats {
-  var upper = Byte.MinValue
-  var lower = Byte.MaxValue
-  var nullCount = 0
+  protected var upper = Byte.MinValue
+  protected var lower = Byte.MaxValue
 
   override def gatherStats(row: Row, ordinal: Int): Unit = {
     if (!row.isNullAt(ordinal)) {
@@ -77,15 +82,16 @@ private[sql] class ByteColumnStats extends ColumnStats {
     } else {
       nullCount += 1
     }
+    count += 1
+    sizeInBytes += BYTE.defaultSize
   }
 
-  def collectedStatistics = Row(lower, upper, nullCount)
+  def collectedStatistics = Row(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class ShortColumnStats extends ColumnStats {
-  var upper = Short.MinValue
-  var lower = Short.MaxValue
-  var nullCount = 0
+  protected var upper = Short.MinValue
+  protected var lower = Short.MaxValue
 
   override def gatherStats(row: Row, ordinal: Int): Unit = {
     if (!row.isNullAt(ordinal)) {
@@ -95,15 +101,16 @@ private[sql] class ShortColumnStats extends ColumnStats {
     } else {
       nullCount += 1
     }
+    count += 1
+    sizeInBytes += SHORT.defaultSize
   }
 
-  def collectedStatistics = Row(lower, upper, nullCount)
+  def collectedStatistics = Row(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class LongColumnStats extends ColumnStats {
-  var upper = Long.MinValue
-  var lower = Long.MaxValue
-  var nullCount = 0
+  protected var upper = Long.MinValue
+  protected var lower = Long.MaxValue
 
   override def gatherStats(row: Row, ordinal: Int): Unit = {
     if (!row.isNullAt(ordinal)) {
@@ -113,15 +120,16 @@ private[sql] class LongColumnStats extends ColumnStats {
     } else {
       nullCount += 1
     }
+    count += 1
+    sizeInBytes += LONG.defaultSize
   }
 
-  def collectedStatistics = Row(lower, upper, nullCount)
+  def collectedStatistics = Row(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class DoubleColumnStats extends ColumnStats {
-  var upper = Double.MinValue
-  var lower = Double.MaxValue
-  var nullCount = 0
+  protected var upper = Double.MinValue
+  protected var lower = Double.MaxValue
 
   override def gatherStats(row: Row, ordinal: Int): Unit = {
     if (!row.isNullAt(ordinal)) {
@@ -131,15 +139,16 @@ private[sql] class DoubleColumnStats extends ColumnStats {
     } else {
       nullCount += 1
     }
+    count += 1
+    sizeInBytes += DOUBLE.defaultSize
   }
 
-  def collectedStatistics = Row(lower, upper, nullCount)
+  def collectedStatistics = Row(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class FloatColumnStats extends ColumnStats {
-  var upper = Float.MinValue
-  var lower = Float.MaxValue
-  var nullCount = 0
+  protected var upper = Float.MinValue
+  protected var lower = Float.MaxValue
 
   override def gatherStats(row: Row, ordinal: Int): Unit = {
     if (!row.isNullAt(ordinal)) {
@@ -149,15 +158,16 @@ private[sql] class FloatColumnStats extends ColumnStats {
     } else {
       nullCount += 1
     }
+    count += 1
+    sizeInBytes += FLOAT.defaultSize
   }
 
-  def collectedStatistics = Row(lower, upper, nullCount)
+  def collectedStatistics = Row(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class IntColumnStats extends ColumnStats {
-  var upper = Int.MinValue
-  var lower = Int.MaxValue
-  var nullCount = 0
+  protected var upper = Int.MinValue
+  protected var lower = Int.MaxValue
 
   override def gatherStats(row: Row, ordinal: Int): Unit = {
     if (!row.isNullAt(ordinal)) {
@@ -167,15 +177,16 @@ private[sql] class IntColumnStats extends ColumnStats {
     } else {
       nullCount += 1
     }
+    count += 1
+    sizeInBytes += INT.defaultSize
   }
 
-  def collectedStatistics = Row(lower, upper, nullCount)
+  def collectedStatistics = Row(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class StringColumnStats extends ColumnStats {
-  var upper: String = null
-  var lower: String = null
-  var nullCount = 0
+  protected var upper: String = null
+  protected var lower: String = null
 
   override def gatherStats(row: Row, ordinal: Int): Unit = {
     if (!row.isNullAt(ordinal)) {
@@ -185,15 +196,16 @@ private[sql] class StringColumnStats extends ColumnStats {
     } else {
       nullCount += 1
     }
+    count += 1
+    sizeInBytes += STRING.actualSize(row, ordinal)
   }
 
-  def collectedStatistics = Row(lower, upper, nullCount)
+  def collectedStatistics = Row(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class DateColumnStats extends ColumnStats {
-  var upper: Date = null
-  var lower: Date = null
-  var nullCount = 0
+  protected var upper: Date = null
+  protected var lower: Date = null
 
   override def gatherStats(row: Row, ordinal: Int) {
     if (!row.isNullAt(ordinal)) {
@@ -203,15 +215,16 @@ private[sql] class DateColumnStats extends ColumnStats {
     } else {
       nullCount += 1
     }
+    count += 1
+    sizeInBytes += DATE.defaultSize
   }
 
-  def collectedStatistics = Row(lower, upper, nullCount)
+  def collectedStatistics = Row(lower, upper, nullCount, count, sizeInBytes)
 }
 
 private[sql] class TimestampColumnStats extends ColumnStats {
-  var upper: Timestamp = null
-  var lower: Timestamp = null
-  var nullCount = 0
+  protected var upper: Timestamp = null
+  protected var lower: Timestamp = null
 
   override def gatherStats(row: Row, ordinal: Int): Unit = {
     if (!row.isNullAt(ordinal)) {
@@ -221,7 +234,9 @@ private[sql] class TimestampColumnStats extends ColumnStats {
     } else {
       nullCount += 1
     }
+    count += 1
+    sizeInBytes += TIMESTAMP.defaultSize
   }
 
-  def collectedStatistics = Row(lower, upper, nullCount)
+  def collectedStatistics = Row(lower, upper, nullCount, count, sizeInBytes)
 }
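Editor's note: every concrete stats class now follows the same shape — type-specific bound tracking plus the `count`, `nullCount`, and `sizeInBytes` counters hoisted into the `ColumnStats` trait, with `count` and `sizeInBytes` bumped for null and non-null rows alike. A standalone sketch of that accumulation pattern in plain Scala, with no catalyst dependencies; `IntStats` is a hypothetical stand-in for `IntColumnStats`:

// Hypothetical stand-in mirroring IntColumnStats without the catalyst Row API.
class IntStats {
  private var upper = Int.MinValue
  private var lower = Int.MaxValue
  private var count = 0
  private var nullCount = 0
  private var sizeInBytes = 0L

  def gatherStats(value: Option[Int]): Unit = {
    value match {
      case Some(v) =>
        if (v > upper) upper = v
        if (v < lower) lower = v
      case None =>
        nullCount += 1
    }
    // As in the diff: every row bumps count and the size estimate,
    // null or not (the real code adds INT.defaultSize here).
    count += 1
    sizeInBytes += 4
  }

  def collectedStatistics: (Int, Int, Int, Int, Long) =
    (lower, upper, nullCount, count, sizeInBytes)
}

// Usage: three rows, one of them null.
val s = new IntStats
Seq(Some(3), None, Some(7)).foreach(s.gatherStats)
assert(s.collectedStatistics == (3, 7, 1, 3, 12L))

Presumably these per-column `sizeInBytes` values are what the rest of this commit rolls up into the table-level `Statistics(sizeInBytes)` that the planner consumes, which is the "propagates them properly" part of the commit message.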
