Commit 3a56341

More partial work towards sort-based shuffle
1 parent 7a0895d · commit 3a56341

File tree: 3 files changed, +123 -24 lines

core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala

Lines changed: 21 additions & 12 deletions
@@ -21,9 +21,10 @@ import org.apache.spark.shuffle.{ShuffleWriter, BaseShuffleHandle}
 import org.apache.spark.{SparkEnv, Logging, TaskContext}
 import org.apache.spark.scheduler.MapStatus
 import org.apache.spark.serializer.Serializer
+import org.apache.spark.util.collection.ExternalSorter
 
-private[spark] class SortShuffleWriter[K, V](
-    handle: BaseShuffleHandle[K, V, _],
+private[spark] class SortShuffleWriter[K, V, C](
+    handle: BaseShuffleHandle[K, V, C],
     mapId: Int,
     context: TaskContext)
   extends ShuffleWriter[K, V] with Logging {
@@ -38,19 +39,27 @@ private[spark] class SortShuffleWriter[K, V](
 
   /** Write a bunch of records to this task's output */
   override def write(records: Iterator[_ <: Product2[K, V]]): Unit = {
-    val iter = if (dep.aggregator.isDefined) {
+    var sorter: ExternalSorter[K, V, _] = null
+
+    val partitions: Iterator[(Int, Iterator[Product2[K, _]])] = {
       if (dep.mapSideCombine) {
-        // TODO: This does an external merge-sort if the data is highly combinable, and then we
-        // do another one later to sort them by output partition. We can improve this by doing
-        // the merging as part of the SortedFileWriter.
-        dep.aggregator.get.combineValuesByKey(records, context)
+        if (!dep.aggregator.isDefined) {
+          throw new IllegalStateException("Aggregator is empty for map-side combine")
+        }
+        sorter = new ExternalSorter[K, V, C](
+          dep.aggregator, Some(dep.partitioner), dep.keyOrdering, dep.serializer)
+        sorter.write(records)
+        sorter.partitionedIterator
       } else {
-        records
+        sorter = new ExternalSorter[K, V, V](
+          None, Some(dep.partitioner), dep.keyOrdering, dep.serializer)
+        sorter.write(records)
+        sorter.partitionedIterator
       }
-    } else if (dep.aggregator.isEmpty && dep.mapSideCombine) {
-      throw new IllegalStateException("Aggregator is empty for map-side combine")
-    } else {
-      records
+    }
+
+    for ((id, elements) <- partitions) {
+
     }
 
     ???
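
In the new write() path, the map-side-combine branch routes records through an ExternalSorter[K, V, C] built with the dependency's aggregator, while the non-combining branch uses ExternalSorter[K, V, V] with no aggregator; both branches end in sorter.partitionedIterator, and the for ((id, elements) <- partitions) loop that will consume it is still empty in this commit. The snippet below is not part of the commit and not Spark's implementation; it is a self-contained toy sketch of the (partitionId, elements) shape that partitionedIterator hands back, with a simple hash-modulo bucket standing in for dep.partitioner and all names (PartitionedIteratorSketch, buckets) purely illustrative.

object PartitionedIteratorSketch {
  def partitionedIterator[K, V](
      records: Iterator[Product2[K, V]],
      numPartitions: Int): Iterator[(Int, Iterator[Product2[K, V]])] = {
    // Toy in-memory version: bucket every record by partition, then emit buckets in id order
    val buckets = Array.fill(numPartitions)(scala.collection.mutable.ArrayBuffer[Product2[K, V]]())
    for (kv <- records) {
      val partition = math.abs(kv._1.hashCode % numPartitions)  // stand-in for dep.partitioner
      buckets(partition) += kv
    }
    (0 until numPartitions).iterator.map(p => (p, buckets(p).iterator))
  }

  def main(args: Array[String]): Unit = {
    val records = Iterator(("a", 1), ("b", 2), ("c", 3), ("a", 4))
    // This is the shape the still-empty `for ((id, elements) <- partitions)` loop receives
    for ((id, elements) <- partitionedIterator[String, Int](records, 2)) {
      println(s"partition $id: ${elements.mkString(", ")}")
    }
  }
}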

core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala

Lines changed: 94 additions & 5 deletions
@@ -17,12 +17,15 @@
 
 package org.apache.spark.util.collection
 
-import org.apache.spark.{SparkEnv, Aggregator, Logging, Partitioner}
-import org.apache.spark.serializer.Serializer
+import java.io._
 
 import scala.collection.mutable.ArrayBuffer
+
+import com.google.common.io.ByteStreams
+
+import org.apache.spark.{Aggregator, SparkEnv, Logging, Partitioner}
+import org.apache.spark.serializer.Serializer
 import org.apache.spark.storage.BlockId
-import java.io.File
 
 /**
  * Sorts and potentially merges a number of key-value pairs of type (K, V) to produce key-combiner
@@ -49,6 +52,7 @@ private[spark] class ExternalSorter[K, V, C](
   private val blockManager = SparkEnv.get.blockManager
   private val diskBlockManager = blockManager.diskBlockManager
   private val ser = Serializer.getSerializer(serializer.getOrElse(null))
+  private val serInstance = ser.newInstance()
 
   private val conf = SparkEnv.get.conf
   private val fileBufferSize = conf.getInt("spark.shuffle.file.buffer.kb", 100) * 1024
@@ -232,7 +236,7 @@ private[spark] class ExternalSorter[K, V, C](
     // TODO: merge intermediate results if they are sorted by the comparator
     val readers = spills.map(new SpillReader(_))
     (0 until numPartitions).iterator.map { p =>
-      (p, readers.iterator.flatMap(_.readPartition(p)))
+      (p, readers.iterator.flatMap(_.readNextPartition()))
     }
   }
 
@@ -241,7 +245,92 @@
   * partitions to be requested in order.
   */
  private class SpillReader(spill: SpilledFile) {
-    def readPartition(id: Int): Iterator[Product2[K, C]] = ???
+    val fileStream = new FileInputStream(spill.file)
+    val bufferedStream = new BufferedInputStream(fileStream, fileBufferSize)
+
+    // An intermediate stream that reads from exactly one batch
+    // This guards against pre-fetching and other arbitrary behavior of higher level streams
+    var batchStream = nextBatchStream()
+    var compressedStream = blockManager.wrapForCompression(spill.blockId, batchStream)
+    var deserStream = serInstance.deserializeStream(compressedStream)
+    var nextItem: (K, C) = null
+    var finished = false
+
+    // Track which partition and which batch stream we're in
+    var partitionId = 0
+    var indexInPartition = -1L  // Just to make sure we start at index 0
+    var batchStreamsRead = 0
+    var indexInBatch = -1
+
+    /** Construct a stream that only reads from the next batch */
+    def nextBatchStream(): InputStream = {
+      if (batchStreamsRead < spill.serializerBatchSizes.length) {
+        batchStreamsRead += 1
+        ByteStreams.limit(bufferedStream, spill.serializerBatchSizes(batchStreamsRead - 1))
+      } else {
+        // No more batches left
+        bufferedStream
+      }
+    }
+
+    /**
+     * Return the next (K, C) pair from the deserialization stream and update partitionId,
+     * indexInPartition, indexInBatch and such to match its location.
+     *
+     * If the current batch is drained, construct a stream for the next batch and read from it.
+     * If no more pairs are left, return null.
+     */
+    private def readNextItem(): (K, C) = {
+      try {
+        if (finished) {
+          return null
+        }
+        // Start reading the next batch if we're done with this one
+        indexInBatch += 1
+        if (indexInBatch == serializerBatchSize) {
+          batchStream = nextBatchStream()
+          compressedStream = blockManager.wrapForCompression(spill.blockId, batchStream)
+          deserStream = serInstance.deserializeStream(compressedStream)
+          indexInBatch = 0
+        }
+        // Update the partition location of the element we're reading
+        indexInPartition += 1
+        while (indexInPartition == spill.elementsPerPartition(partitionId)) {
+          partitionId += 1
+          indexInPartition = 0
+        }
+        val k = deserStream.readObject().asInstanceOf[K]
+        val c = deserStream.readObject().asInstanceOf[C]
+        (k, c)
+      } catch {
+        case e: EOFException =>
+          finished = true
+          deserStream.close()
+          null
+      }
+    }
+
+    var nextPartitionToRead = 0
+
+    def readNextPartition(): Iterator[Product2[K, C]] = new Iterator[Product2[K, C]] {
+      val myPartition = nextPartitionToRead
+      nextPartitionToRead += 1
+
+      override def hasNext: Boolean = {
+        if (nextItem == null) {
+          nextItem = readNextItem()
+        }
+        // Check that we're still in the right partition; will be numPartitions at EOF
+        partitionId == myPartition
+      }

+      override def next(): Product2[K, C] = {
+        if (!hasNext) {
+          throw new NoSuchElementException
+        }
+        nextItem
+      }
+    }
   }
 
  /**
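
A note on the SpillReader added above: it assumes each spill file was written as a sequence of serialization batches whose byte lengths were recorded in spill.serializerBatchSizes, so the reader can cap each batch with ByteStreams.limit() and the deserializer can never pre-fetch past its own batch. The standalone sketch below is not Spark code; plain Java serialization and Guava's CountingOutputStream stand in for Spark's Serializer and the writer-side byte accounting, and every name in it (BatchedSpillSketch, batchBytes, and so on) is illustrative only. It shows the write-then-read pattern in isolation.

import java.io._

import scala.collection.mutable.ArrayBuffer

import com.google.common.io.{ByteStreams, CountingOutputStream}

object BatchedSpillSketch {
  def main(args: Array[String]): Unit = {
    val records = (1 to 10).map(i => (s"key$i", i))
    val batchSize = 3                       // elements per batch, like serializerBatchSize
    val batchBytes = new ArrayBuffer[Long]  // bytes per batch, like spill.serializerBatchSizes

    // Write each batch through its own serialization stream and record how many bytes it took
    val file = File.createTempFile("spill-sketch", ".bin")
    val countingOut = new CountingOutputStream(new FileOutputStream(file))
    var bytesSoFar = 0L
    for (batch <- records.grouped(batchSize)) {
      val objOut = new ObjectOutputStream(countingOut)
      batch.foreach(objOut.writeObject)
      objOut.flush()                        // flush but don't close: closing would close the file
      batchBytes += countingOut.getCount - bytesSoFar
      bytesSoFar = countingOut.getCount
    }
    countingOut.close()

    // Read the batches back: ByteStreams.limit() caps each deserialization stream at exactly
    // the recorded batch size, so it can never read into the next batch
    val fileIn = new BufferedInputStream(new FileInputStream(file))
    for ((size, batchIndex) <- batchBytes.zipWithIndex) {
      val elementsInBatch = math.min(batchSize, records.size - batchIndex * batchSize)
      val objIn = new ObjectInputStream(ByteStreams.limit(fileIn, size))
      for (_ <- 1 to elementsInBatch) {
        println(objIn.readObject())
      }
      // Don't close objIn here either, since it shares the underlying file stream
    }
    fileIn.close()
    file.delete()
  }
}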

core/src/main/scala/org/apache/spark/util/collection/SizeTrackingBuffer.scala

Lines changed: 8 additions & 7 deletions
@@ -20,7 +20,6 @@ package org.apache.spark.util.collection
 import java.util.Arrays
 import java.util.Comparator
 
-import scala.reflect.ClassTag
 import scala.collection.mutable.ArrayBuffer
 
 import org.apache.spark.util.SizeEstimator
@@ -32,13 +31,15 @@ import org.apache.spark.util.SizeEstimator
  *
  * The tracking code is copied from SizeTrackingAppendOnlyMap -- we'll factor that out soon.
  */
-private[spark] class SizeTrackingBuffer[T: ClassTag](initialCapacity: Int = 64)
+private[spark] class SizeTrackingBuffer[T <: AnyRef](initialCapacity: Int = 64)
   extends SizeTrackingCollection[T]
 {
-  // Basic growable array data structure
+  // Basic growable array data structure. NOTE: We use an Array of AnyRef because Arrays.sort()
+  // is not easy to call on an Array[T], and Scala doesn't provide a great way to sort a generic
+  // array using a Comparator.
   private var capacity = initialCapacity
   private var curSize = 0
-  private var data = new Array[T](initialCapacity)
+  private var data = new Array[AnyRef](initialCapacity)
 
   // Size-tracking variables: we maintain a sequence of samples since the size of the collection
   // depends on both the array and how many of its elements are filled. We reset this each time
@@ -91,7 +92,7 @@ private[spark] class SizeTrackingBuffer[T: ClassTag](initialCapacity: Int = 64)
     override def next(): T = {
       val elem = data(pos)
       pos += 1
-      elem
+      elem.asInstanceOf[T]
     }
   }
 
@@ -112,7 +113,7 @@ private[spark] class SizeTrackingBuffer[T: ClassTag](initialCapacity: Int = 64)
       throw new Exception("Can't grow buffer beyond 2^30 elements")
     }
     val newCapacity = capacity * 2
-    val newArray = new Array[T](newCapacity)
+    val newArray = new Array[AnyRef](newCapacity)
    System.arraycopy(data, 0, newArray, 0, capacity)
    data = newArray
    capacity = newCapacity
@@ -143,7 +144,7 @@ private[spark] class SizeTrackingBuffer[T: ClassTag](initialCapacity: Int = 64)
 
   /** Iterate through the data in a given order. For this class this is not really destructive. */
   override def destructiveSortedIterator(cmp: Comparator[T]): Iterator[T] = {
-    Arrays.sort(data, 0, curSize, cmp)
+    Arrays.sort(data, 0, curSize, cmp.asInstanceOf[Comparator[AnyRef]])
     iterator
   }
 }
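
The NOTE added in this diff is the main design point of the file: java.util.Arrays.sort has no overload that sorts a slice of a generic Array[T] with a Comparator, so the buffer stores AnyRef and casts elements back to T on the way out. The tiny standalone snippet below is not Spark code (names are illustrative) and only demonstrates that store-as-AnyRef, sort-a-prefix, cast-on-read pattern.

import java.util.{Arrays, Comparator}

object AnyRefSortSketch {
  def main(args: Array[String]): Unit = {
    val data = new Array[AnyRef](8)          // over-allocated, like the buffer's backing array
    val pairs = Seq(("b", 2), ("a", 1), ("c", 3))
    pairs.zipWithIndex.foreach { case (p, i) => data(i) = p }

    // Sort only the filled prefix [0, curSize), comparing by key
    val cmp = new Comparator[AnyRef] {
      def compare(x: AnyRef, y: AnyRef): Int =
        x.asInstanceOf[(String, Int)]._1.compareTo(y.asInstanceOf[(String, Int)]._1)
    }
    Arrays.sort(data, 0, pairs.length, cmp)

    // Elements come back out with a cast, as in the buffer's iterator
    val sorted = data.take(pairs.length).map(_.asInstanceOf[(String, Int)])
    println(sorted.mkString(", "))  // (a,1), (b,2), (c,3)
  }
}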
