
Partition level pruning 2 #4


Closed
wants to merge 47 commits into from
Changes from all commits

47 commits
6a34b15
improve the doc for "spark.memory.offHeap.size"
CodingCat Mar 7, 2016
3a2be8c
fix
CodingCat Mar 7, 2016
eb535d6
add configuration for partition_metadata
CodingCat Oct 27, 2017
f910b27
framework of CachedColumnarRDD
CodingCat Oct 27, 2017
846b032
code framework
CodingCat Oct 27, 2017
f511b6f
remove cachedcolumnarbatchRDD
CodingCat Oct 30, 2017
50f2612
temp
CodingCat Oct 31, 2017
2b945c9
'CachedColumnarRDD'
CodingCat Nov 1, 2017
89f0a98
change types
CodingCat Nov 1, 2017
5aa1808
fix compilation error
CodingCat Nov 1, 2017
a5adc56
update
CodingCat Nov 2, 2017
00b1642
fix storage level
CodingCat Nov 2, 2017
311fe5a
fix getOrCompute
CodingCat Nov 2, 2017
6411b82
evaluate with partition metadata
CodingCat Nov 2, 2017
41f6ad2
fix getOrCompute
CodingCat Nov 2, 2017
c50b743
add logging
CodingCat Nov 2, 2017
78f774f
add logging for skipped partition
CodingCat Nov 2, 2017
71456bd
try to print stats
CodingCat Nov 2, 2017
97544a6
add logging for skipped partition
CodingCat Nov 2, 2017
c131b2d
add logging for skipped partition
CodingCat Nov 2, 2017
d588fb0
add logging for skipped partition
CodingCat Nov 2, 2017
1ba1f80
refactor the code
CodingCat Nov 9, 2017
62f358d
fix compilation issue
CodingCat Nov 9, 2017
500d4fd
refactor the code
CodingCat Nov 9, 2017
63c5897
test
CodingCat Nov 9, 2017
9031eaf
fix compilation issue
CodingCat Nov 9, 2017
1692303
add missing filtering
CodingCat Nov 9, 2017
f9f3d20
test
CodingCat Nov 9, 2017
da5f06f
test
CodingCat Nov 9, 2017
2c0b6cd
fix redundant read
CodingCat Nov 9, 2017
e1d8c43
compact iterators
CodingCat Nov 10, 2017
c900808
update
CodingCat Nov 10, 2017
2caa7fc
add first test case
CodingCat Nov 10, 2017
a9f1256
test for remove metadata block
CodingCat Nov 10, 2017
b5d6094
generate correct results when data block is removed
CodingCat Nov 11, 2017
3b51c9a
try to avoid unnecessary tasks
CodingCat Nov 21, 2017
59b0684
collect data on the fly
CodingCat Nov 21, 2017
3e972ce
fix the compilation issue
CodingCat Nov 21, 2017
35c9361
fix the compilation issue
CodingCat Nov 21, 2017
20b72a8
fix NPE
CodingCat Nov 21, 2017
bfa357a
fix NPE
CodingCat Nov 21, 2017
0fd6b68
fix incompatibility of partition
CodingCat Nov 21, 2017
dc54be5
fix the parent inheritance mechanism
CodingCat Nov 21, 2017
1afbf13
add missing filtering for cachedBatch
CodingCat Nov 22, 2017
accd549
remove CachedColumnarIterator
CodingCat Nov 24, 2017
a853ce6
fix filtering logic
CodingCat Nov 24, 2017
9d450ad
fix the failed test
CodingCat Nov 25, 2017
10 changes: 9 additions & 1 deletion core/src/main/scala/org/apache/spark/storage/BlockId.scala
@@ -31,7 +31,7 @@ import org.apache.spark.annotation.DeveloperApi
* If your BlockId should be serializable, be sure to add it to the BlockId.apply() method.
*/
@DeveloperApi
sealed abstract class BlockId {
abstract class BlockId {
/** A globally unique identifier for this Block. Can be used for ser/de. */
def name: String

@@ -49,6 +49,11 @@ case class RDDBlockId(rddId: Int, splitIndex: Int) extends BlockId {
override def name: String = "rdd_" + rddId + "_" + splitIndex
}

@DeveloperApi
case class RDDPartitionMetadataBlockId(rddId: Int, splitIndex: Int) extends BlockId {
override def name: String = "rdd_" + rddId + "_" + splitIndex + ".metadata"
}

// Format of the shuffle block ids (including data and index) should be kept in sync with
// org.apache.spark.network.shuffle.ExternalShuffleBlockResolver#getBlockData().
@DeveloperApi
@@ -103,6 +108,7 @@ class UnrecognizedBlockId(name: String)
@DeveloperApi
object BlockId {
val RDD = "rdd_([0-9]+)_([0-9]+)".r
val PARTITION_METADATA = "rdd_([0-9]+)_([0-9]+).metadata".r
val SHUFFLE = "shuffle_([0-9]+)_([0-9]+)_([0-9]+)".r
val SHUFFLE_DATA = "shuffle_([0-9]+)_([0-9]+)_([0-9]+).data".r
val SHUFFLE_INDEX = "shuffle_([0-9]+)_([0-9]+)_([0-9]+).index".r
@@ -116,6 +122,8 @@ object BlockId {
def apply(name: String): BlockId = name match {
case RDD(rddId, splitIndex) =>
RDDBlockId(rddId.toInt, splitIndex.toInt)
case PARTITION_METADATA(rddId, splitIndex) =>
RDDPartitionMetadataBlockId(rddId.toInt, splitIndex.toInt)
case SHUFFLE(shuffleId, mapId, reduceId) =>
ShuffleBlockId(shuffleId.toInt, mapId.toInt, reduceId.toInt)
case SHUFFLE_DATA(shuffleId, mapId, reduceId) =>
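A quick illustration of how the new block id is expected to behave (a reviewer-facing sketch, not part of the diff; the ids are made up). Because the Regex extractors in BlockId.apply only succeed on a full match, the trailing ".metadata" keeps the new id from being captured by the plain RDD pattern:

import org.apache.spark.storage.{BlockId, RDDBlockId, RDDPartitionMetadataBlockId}

val metaId = RDDPartitionMetadataBlockId(rddId = 42, splitIndex = 3)
assert(metaId.name == "rdd_42_3.metadata")            // naming convention of the new id
assert(BlockId("rdd_42_3.metadata") == metaId)        // round-trips through BlockId.apply
assert(BlockId("rdd_42_3") == RDDBlockId(42, 3))      // plain RDD block ids still parse as before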
sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -136,7 +136,7 @@ object SQLConf {
val IN_MEMORY_PARTITION_PRUNING =
buildConf("spark.sql.inMemoryColumnarStorage.partitionPruning")
.internal()
.doc("When true, enable partition pruning for in-memory columnar tables.")
.doc("When true, enable partition batch pruning for in-memory columnar tables.")
.booleanConf
.createWithDefault(true)

@@ -147,6 +147,14 @@
.booleanConf
.createWithDefault(false)

val IN_MEMORY_PARTITION_METADATA =
buildConf("spark.sql.inMemoryColumnarStorage.partitionMetadata")
.internal()
.doc("When true, spark sql will collect partition level stats for in-memory columnar" +
" tables and do coarse-grained pruning")
.booleanConf
.createWithDefault(false)

val PREFER_SORTMERGEJOIN = buildConf("spark.sql.join.preferSortMergeJoin")
.internal()
.doc("When true, prefer sort merge join over shuffle hash join.")
@@ -1219,6 +1227,8 @@ class SQLConf extends Serializable with Logging {

def offHeapColumnVectorEnabled: Boolean = getConf(COLUMN_VECTOR_OFFHEAP_ENABLED)

def inMemoryPartitionMetadata: Boolean = getConf(IN_MEMORY_PARTITION_METADATA)

def columnNameOfCorruptRecord: String = getConf(COLUMN_NAME_OF_CORRUPT_RECORD)

def broadcastTimeout: Long = getConf(BROADCAST_TIMEOUT)
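Since spark.sql.inMemoryColumnarStorage.partitionMetadata is internal and defaults to false, it has to be switched on before a table is cached for the new code path to take effect. A minimal session sketch (the application name and the "events" table are hypothetical):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("partition-metadata-demo").getOrCreate()
spark.conf.set("spark.sql.inMemoryColumnarStorage.partitionMetadata", "true")

// With the flag on, caching also stores one stats row per partition as a separate
// RDDPartitionMetadataBlockId block, which the scan side can consult for coarse-grained pruning.
spark.table("events").cache()
spark.table("events").count()   // materializes the cached, metadata-carrying partitions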
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/CachedColumnarRDD.scala
@@ -0,0 +1,99 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.execution.columnar

import java.util.concurrent.ConcurrentHashMap

import scala.collection.JavaConverters._
import scala.collection.mutable

import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.storage.{RDDPartitionMetadataBlockId, StorageLevel}

private[columnar] class CachedColumnarRDD(
@transient private var _sc: SparkContext,
private var dataRDD: RDD[CachedBatch],
private[columnar] val containsPartitionMetadata: Boolean,
expectedStorageLevel: StorageLevel)
extends RDD[CachedBatch](_sc, Seq(new OneToOneDependency(dataRDD))) {

override def compute(split: Partition, context: TaskContext): Iterator[CachedBatch] = {
firstParent.iterator(split, context)
}

override def unpersist(blocking: Boolean = true): this.type = {
CachedColumnarRDD.allMetadataFetched.remove(id)
CachedColumnarRDD.rddIdToMetadata.remove(id)
super.unpersist(blocking)
}

override protected def getPartitions: Array[Partition] = dataRDD.partitions

override private[spark] def getOrCompute(split: Partition, context: TaskContext):
Iterator[CachedBatch] = {
val metadataBlockId = RDDPartitionMetadataBlockId(id, split.index)
val superGetOrCompute: (Partition, TaskContext) => Iterator[CachedBatch] = super.getOrCompute
SparkEnv.get.blockManager.getSingle[InternalRow](metadataBlockId).map(_ =>
superGetOrCompute(split, context)
).getOrElse {
val batchIter = superGetOrCompute(split, context)
if (containsPartitionMetadata && getStorageLevel != StorageLevel.NONE && batchIter.hasNext) {
val cachedBatch = batchIter.next()
SparkEnv.get.blockManager.putSingle(metadataBlockId, cachedBatch.stats,
expectedStorageLevel)
new InterruptibleIterator[CachedBatch](context, Iterator(cachedBatch))
} else {
batchIter
}
}
}
}

private[columnar] object CachedColumnarRDD {

private val rddIdToMetadata = new ConcurrentHashMap[Int, mutable.ArraySeq[Option[InternalRow]]]()
private val allMetadataFetched = new ConcurrentHashMap[Int, Boolean]()

def collectStats(rdd: RDD[CachedBatch]): IndexedSeq[Option[InternalRow]] = {
if (allMetadataFetched.containsKey(rdd.id)) {
rddIdToMetadata.get(rdd.id)
} else {
val updatedMetadataBlocks = rdd.partitions.indices.map {
partitionId => {
if (!rddIdToMetadata.containsKey(rdd.id)) {
val initSeq = new mutable.ArraySeq[Option[InternalRow]](rdd.partitions.length)
initSeq.indices.foreach(idx => initSeq(idx) = None)
rddIdToMetadata.put(rdd.id, initSeq)
}
rddIdToMetadata.get(rdd.id)(partitionId).orElse{
val metadata = SparkEnv.get.blockManager.getSingle[InternalRow](
RDDPartitionMetadataBlockId(rdd.id, partitionId))
rddIdToMetadata.get(rdd.id).update(partitionId, metadata)
metadata
}
}
}
if (updatedMetadataBlocks.forall(_.isDefined)) {
allMetadataFetched.put(rdd.id, true)
}
updatedMetadataBlocks
}
}
}
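How CachedColumnarRDD.collectStats is meant to be consumed (a sketch under assumed names, since the scan-side caller is not shown in this excerpt): it returns one Option[InternalRow] per partition, and a None entry means the metadata block for that partition is not in the block manager, so the caller cannot prune that partition and keeps it conservatively.

import org.apache.spark.sql.catalyst.InternalRow

// cachedRDD: CachedColumnarRDD; statsAcceptedByFilter: InternalRow => Boolean (both placeholders)
val partitionStats: IndexedSeq[Option[InternalRow]] = CachedColumnarRDD.collectStats(cachedRDD)
val acceptedIndices: Seq[Int] = partitionStats.zipWithIndex.collect {
  case (None, idx) => idx                                         // no stats cached: keep conservatively
  case (Some(stats), idx) if statsAcceptedByFilter(stats) => idx  // stats survive the partition filter
}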
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/FilteredCachedColumnarRDD.scala
@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.execution.columnar

import org.apache.spark._
import org.apache.spark.rdd.RDD

private[columnar] class FilteredCachedColumnarPartition(
val partitionIndex: Int,
val parentPartition: Partition) extends Partition {

override def index: Int = partitionIndex
}

private class PartialDependency[T](rdd: RDD[T], partitions: Array[Partition])
extends NarrowDependency[T](rdd) {

override def getParents(partitionId: Int): Seq[Int] = {
List(partitions(partitionId).asInstanceOf[FilteredCachedColumnarPartition].
parentPartition.index)
}
}

private[columnar] class FilteredCachedColumnarRDD (
@transient private var _sc: SparkContext,
private var cachedColumnarRDD: CachedColumnarRDD,
acceptedPartitions: Seq[Partition])
extends RDD[CachedBatch](
_sc, Seq(new PartialDependency(cachedColumnarRDD, acceptedPartitions.toArray))) {

override def compute(split: Partition, context: TaskContext): Iterator[CachedBatch] = {
val filteredCachedColumnarPartition = split.asInstanceOf[FilteredCachedColumnarPartition]
firstParent.iterator(filteredCachedColumnarPartition.parentPartition, context)
}

override protected def getPartitions: Array[Partition] = acceptedPartitions.toArray

}
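Continuing the sketch above (names remain placeholders): the surviving parent partitions are wrapped so that the pruned RDD exposes a dense 0..n-1 index range, while each wrapper still points at its original parent partition, which is what PartialDependency resolves back to.

import org.apache.spark.Partition

val acceptedPartitions: Seq[Partition] = acceptedIndices.zipWithIndex.map {
  case (parentIndex, newIndex) =>
    new FilteredCachedColumnarPartition(newIndex, cachedRDD.partitions(parentIndex))
}
val prunedRDD = new FilteredCachedColumnarRDD(cachedRDD.sparkContext, cachedRDD, acceptedPartitions)
// prunedRDD.compute reads each wrapper's parentPartition from the cached parent RDD,
// so tasks are only launched for the accepted partitions.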
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala
@@ -52,6 +52,68 @@ object InMemoryRelation {
private[columnar]
case class CachedBatch(numRows: Int, buffers: Array[Array[Byte]], stats: InternalRow)

private[columnar] class CachedBatchIterator(
rowIterator: Iterator[InternalRow],
output: Seq[Attribute],
batchSize: Int,
useCompression: Boolean,
batchStats: LongAccumulator,
singleBatchPerPartition: Boolean) extends Iterator[CachedBatch] {

def next(): CachedBatch = {
val columnBuilders = output.map { attribute =>
ColumnBuilder(attribute.dataType, batchSize, attribute.name, useCompression)
}.toArray

var rowCount = 0
var totalSize = 0L

val terminateLoop = (singleBatch: Boolean, rowIter: Iterator[InternalRow],
rowCount: Int, size: Long) => {
if (!singleBatch) {
rowIter.hasNext && rowCount < batchSize && totalSize < ColumnBuilder.MAX_BATCH_SIZE_IN_BYTE
} else {
rowIter.hasNext
}
}

while (terminateLoop(singleBatchPerPartition, rowIterator, rowCount, totalSize)) {
val row = rowIterator.next()

// Added for SPARK-6082. This assertion can be useful for scenarios when something
// like Hive TRANSFORM is used. The external data generation script used in TRANSFORM
// may result malformed rows, causing ArrayIndexOutOfBoundsException, which is somewhat
// hard to decipher.
assert(
row.numFields == columnBuilders.length,
s"Row column number mismatch, expected ${output.size} columns, " +
s"but got ${row.numFields}." +
s"\nRow content: $row")

var i = 0
totalSize = 0
while (i < row.numFields) {
columnBuilders(i).appendFrom(row, i)
totalSize += columnBuilders(i).columnStats.sizeInBytes
i += 1
}
rowCount += 1
}

batchStats.add(totalSize)

val statsInSeq = columnBuilders.flatMap(_.columnStats.collectedStatistics)

val stats = InternalRow.fromSeq(statsInSeq)

CachedBatch(rowCount, columnBuilders.map { builder =>
JavaUtils.bufferToArray(builder.build())
}, stats)
}

def hasNext: Boolean = rowIterator.hasNext
}

case class InMemoryRelation(
output: Seq[Attribute],
useCompression: Boolean,
@@ -69,6 +131,8 @@ case class InMemoryRelation(

@transient val partitionStatistics = new PartitionStatistics(output)

private val usePartitionLevelMetadata = conf.inMemoryPartitionMetadata

override def computeStats(): Statistics = {
if (batchStats.value == 0L) {
// Underlying columnar RDD hasn't been materialized, no useful statistics information
@@ -87,51 +151,14 @@

private def buildBuffers(): Unit = {
val output = child.output
val cached = child.execute().mapPartitionsInternal { rowIterator =>
new Iterator[CachedBatch] {
def next(): CachedBatch = {
val columnBuilders = output.map { attribute =>
ColumnBuilder(attribute.dataType, batchSize, attribute.name, useCompression)
}.toArray

var rowCount = 0
var totalSize = 0L
while (rowIterator.hasNext && rowCount < batchSize
&& totalSize < ColumnBuilder.MAX_BATCH_SIZE_IN_BYTE) {
val row = rowIterator.next()

// Added for SPARK-6082. This assertion can be useful for scenarios when something
// like Hive TRANSFORM is used. The external data generation script used in TRANSFORM
// may result malformed rows, causing ArrayIndexOutOfBoundsException, which is somewhat
// hard to decipher.
assert(
row.numFields == columnBuilders.length,
s"Row column number mismatch, expected ${output.size} columns, " +
s"but got ${row.numFields}." +
s"\nRow content: $row")

var i = 0
totalSize = 0
while (i < row.numFields) {
columnBuilders(i).appendFrom(row, i)
totalSize += columnBuilders(i).columnStats.sizeInBytes
i += 1
}
rowCount += 1
}

batchStats.add(totalSize)

val stats = InternalRow.fromSeq(
columnBuilders.flatMap(_.columnStats.collectedStatistics))
CachedBatch(rowCount, columnBuilders.map { builder =>
JavaUtils.bufferToArray(builder.build())
}, stats)
}

def hasNext: Boolean = rowIterator.hasNext
}
}.persist(storageLevel)

val batchedRDD = child.execute().mapPartitionsInternal { rowIterator =>
new CachedBatchIterator(rowIterator, output, batchSize, useCompression, batchStats,
usePartitionLevelMetadata)
}

val cached = new CachedColumnarRDD(batchedRDD.sparkContext, batchedRDD,
usePartitionLevelMetadata, storageLevel).persist(storageLevel)

cached.setName(
tableName.map(n => s"In-memory table $n")
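The practical effect of the singleBatchPerPartition flag in CachedBatchIterator, as an illustration for review (not code from the diff; the numbers are made up): with partition metadata disabled, a 25,000-row partition and batchSize = 10,000 is cut into three CachedBatch objects, each with its own stats row. With the flag enabled, the row-count and byte-size limits are ignored, the whole partition becomes a single CachedBatch, and its one stats row describes the entire partition, which is what CachedColumnarRDD stores under the partition's RDDPartitionMetadataBlockId.

// Inside mapPartitionsInternal, with rowIterator, output and batchStats taken from the surrounding scope:
val iter = new CachedBatchIterator(rowIterator, output, batchSize = 10000,
  useCompression = true, batchStats = batchStats, singleBatchPerPartition = true)
val onlyBatch = iter.next()   // drains rowIterator completely: one batch for the whole partition
assert(!iter.hasNext)
// onlyBatch.stats is the per-partition metadata row later used for coarse-grained pruning.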