
Commit e579302

Merge branch 'master' of git://git.apache.org/spark into SPARK-2583

2 parents: 281589c + f776bc9
10 files changed, +42 -20 lines

core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala

Lines changed: 4 additions & 0 deletions
@@ -39,16 +39,20 @@ class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  *
  * @param prev RDD to be sampled
  * @param sampler a random sampler
+ * @param preservesPartitioning whether the sampler preserves the partitioner of the parent RDD
  * @param seed random seed
  * @tparam T input RDD item type
  * @tparam U sampled RDD item type
  */
 private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
     prev: RDD[T],
     sampler: RandomSampler[T, U],
+    @transient preservesPartitioning: Boolean,
     @transient seed: Long = Utils.random.nextLong)
   extends RDD[U](prev) {
 
+  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None
+
   override def getPartitions: Array[Partition] = {
     val random = new Random(seed)
     firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
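With this change, a sampler that does not move records between partitions can report the parent's partitioner. A minimal sketch of the user-visible effect, assuming a hypothetical pair RDD (not part of this commit):

// Hedged sketch: sampling a hash-partitioned pair RDD now keeps the
// partitioner, so a later join by key can avoid a shuffle.
import org.apache.spark.HashPartitioner

val pairs = sc.parallelize(Seq((1, "a"), (2, "b"), (3, "c")))
  .partitionBy(new HashPartitioner(4))
val sampled = pairs.sample(withReplacement = false, fraction = 0.5)
assert(sampled.partitioner == pairs.partitioner)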

core/src/main/scala/org/apache/spark/rdd/RDD.scala

Lines changed: 13 additions & 4 deletions
@@ -356,9 +356,9 @@ abstract class RDD[T: ClassTag](
       seed: Long = Utils.random.nextLong): RDD[T] = {
     require(fraction >= 0.0, "Invalid fraction value: " + fraction)
     if (withReplacement) {
-      new PartitionwiseSampledRDD[T, T](this, new PoissonSampler[T](fraction), seed)
+      new PartitionwiseSampledRDD[T, T](this, new PoissonSampler[T](fraction), true, seed)
     } else {
-      new PartitionwiseSampledRDD[T, T](this, new BernoulliSampler[T](fraction), seed)
+      new PartitionwiseSampledRDD[T, T](this, new BernoulliSampler[T](fraction), true, seed)
     }
   }

@@ -374,7 +374,7 @@ abstract class RDD[T: ClassTag](
     val sum = weights.sum
     val normalizedCumWeights = weights.map(_ / sum).scanLeft(0.0d)(_ + _)
     normalizedCumWeights.sliding(2).map { x =>
-      new PartitionwiseSampledRDD[T, T](this, new BernoulliSampler[T](x(0), x(1)), seed)
+      new PartitionwiseSampledRDD[T, T](this, new BernoulliSampler[T](x(0), x(1)), true, seed)
     }.toArray
   }

@@ -586,6 +586,9 @@ abstract class RDD[T: ClassTag](
 
   /**
    * Return a new RDD by applying a function to each partition of this RDD.
+   *
+   * `preservesPartitioning` indicates whether the input function preserves the partitioner, which
+   * should be `false` unless this is a pair RDD and the input function doesn't modify the keys.
    */
   def mapPartitions[U: ClassTag](
       f: Iterator[T] => Iterator[U], preservesPartitioning: Boolean = false): RDD[U] = {

@@ -596,6 +599,9 @@ abstract class RDD[T: ClassTag](
   /**
    * Return a new RDD by applying a function to each partition of this RDD, while tracking the index
    * of the original partition.
+   *
+   * `preservesPartitioning` indicates whether the input function preserves the partitioner, which
+   * should be `false` unless this is a pair RDD and the input function doesn't modify the keys.
    */
   def mapPartitionsWithIndex[U: ClassTag](
       f: (Int, Iterator[T]) => Iterator[U], preservesPartitioning: Boolean = false): RDD[U] = {

@@ -607,6 +613,9 @@ abstract class RDD[T: ClassTag](
    * :: DeveloperApi ::
    * Return a new RDD by applying a function to each partition of this RDD. This is a variant of
    * mapPartitions that also passes the TaskContext into the closure.
+   *
+   * `preservesPartitioning` indicates whether the input function preserves the partitioner, which
+   * should be `false` unless this is a pair RDD and the input function doesn't modify the keys.
    */
   @DeveloperApi
   def mapPartitionsWithContext[U: ClassTag](

@@ -689,7 +698,7 @@ abstract class RDD[T: ClassTag](
    * a map on the other).
    */
   def zip[U: ClassTag](other: RDD[U]): RDD[(T, U)] = {
-    zipPartitions(other, true) { (thisIter, otherIter) =>
+    zipPartitions(other, preservesPartitioning = false) { (thisIter, otherIter) =>
       new Iterator[(T, U)] {
         def hasNext = (thisIter.hasNext, otherIter.hasNext) match {
           case (true, true) => true
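The new doc comment pins down the `preservesPartitioning` contract: it is only safe to pass `true` on a pair RDD whose keys the closure leaves untouched. A hedged sketch of the rule, using a hypothetical RDD that is not from the commit:

// Hedged sketch: keys are untouched, so preservesPartitioning = true is safe.
import org.apache.spark.HashPartitioner

val byKey = sc.parallelize(Seq((1, 1.0), (2, 2.0))).partitionBy(new HashPartitioner(2))
val scaled = byKey.mapPartitions(
  iter => iter.map { case (k, v) => (k, v * 2.0) },  // same keys, new values
  preservesPartitioning = true)
assert(scaled.partitioner == byKey.partitioner)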

core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala

Lines changed: 2 additions & 2 deletions
@@ -57,7 +57,7 @@ private[spark] class LocalActor(
     case StatusUpdate(taskId, state, serializedData) =>
       scheduler.statusUpdate(taskId, state, serializedData)
       if (TaskState.isFinished(state)) {
-        freeCores += 1
+        freeCores += scheduler.CPUS_PER_TASK
         reviveOffers()
       }

@@ -68,7 +68,7 @@ private[spark] class LocalActor(
   def reviveOffers() {
     val offers = Seq(new WorkerOffer(localExecutorId, localExecutorHostname, freeCores))
     for (task <- scheduler.resourceOffers(offers).flatten) {
-      freeCores -= 1
+      freeCores -= scheduler.CPUS_PER_TASK
       executor.launchTask(executorBackend, task.taskId, task.name, task.serializedTask)
     }
   }
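The local backend now books `scheduler.CPUS_PER_TASK` cores per task instead of a hard-coded 1, matching the cluster backends. `CPUS_PER_TASK` is driven by the `spark.task.cpus` setting; a minimal sketch of exercising it (app name and values are illustrative):

// Hedged sketch: with spark.task.cpus = 2 on local[4], the scheduler
// should run at most 4 / 2 = 2 tasks concurrently.
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf()
  .setMaster("local[4]")
  .setAppName("cpus-per-task-demo")  // illustrative
  .set("spark.task.cpus", "2")       // read into scheduler.CPUS_PER_TASK
val sc = new SparkContext(conf)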

core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala

Lines changed: 3 additions & 3 deletions
@@ -106,6 +106,7 @@ class ExternalAppendOnlyMap[K, V, C](
   private val fileBufferSize = sparkConf.getInt("spark.shuffle.file.buffer.kb", 100) * 1024
   private val keyComparator = new HashComparator[K]
   private val ser = serializer.newInstance()
+  private val threadId = Thread.currentThread().getId
 
   /**
    * Insert the given key and value into the map.

@@ -128,7 +129,6 @@ class ExternalAppendOnlyMap[K, V, C](
       // Atomically check whether there is sufficient memory in the global pool for
       // this map to grow and, if possible, allocate the required amount
       shuffleMemoryMap.synchronized {
-        val threadId = Thread.currentThread().getId
         val previouslyOccupiedMemory = shuffleMemoryMap.get(threadId)
         val availableMemory = maxMemoryThreshold -
           (shuffleMemoryMap.values.sum - previouslyOccupiedMemory.getOrElse(0L))

@@ -153,8 +153,8 @@ class ExternalAppendOnlyMap[K, V, C](
    */
   private def spill(mapSize: Long) {
     spillCount += 1
-    logWarning("Spilling in-memory map of %d MB to disk (%d time%s so far)"
-      .format(mapSize / (1024 * 1024), spillCount, if (spillCount > 1) "s" else ""))
+    logWarning("Thread %d spilling in-memory map of %d MB to disk (%d time%s so far)"
+      .format(threadId, mapSize / (1024 * 1024), spillCount, if (spillCount > 1) "s" else ""))
     val (blockId, file) = diskBlockManager.createTempBlock()
     var writer = blockManager.getDiskWriter(blockId, file, serializer, fileBufferSize)
     var objectsWritten = 0
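The thread id is now captured once at construction (each map belongs to a single task thread) and included in the spill warning, which makes concurrent spills distinguishable in the logs. With made-up numbers, the new message formats as:

// Hedged illustration with hypothetical values (threadId = 63,
// mapSize = 44040192 bytes, spillCount = 3):
val msg = "Thread %d spilling in-memory map of %d MB to disk (%d time%s so far)"
  .format(63L, 44040192L / (1024 * 1024), 3, if (3 > 1) "s" else "")
// msg == "Thread 63 spilling in-memory map of 42 MB to disk (3 times so far)"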

core/src/test/scala/org/apache/spark/rdd/PartitionwiseSampledRDDSuite.scala

Lines changed: 2 additions & 2 deletions
@@ -43,7 +43,7 @@ class PartitionwiseSampledRDDSuite extends FunSuite with SharedSparkContext {
   test("seed distribution") {
     val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
     val sampler = new MockSampler
-    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, 0L)
+    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
     assert(sample.distinct().count == 2, "Seeds must be different.")
   }

@@ -52,7 +52,7 @@ class PartitionwiseSampledRDDSuite extends FunSuite with SharedSparkContext {
     // We want to make sure there are no concurrency issues.
     val rdd = sc.parallelize(0 until 111, 10)
     for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
-      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler)
+      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
       sampled.zip(sampled).count()
     }
   }

core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala

Lines changed: 9 additions & 0 deletions
@@ -523,6 +523,15 @@ class RDDSuite extends FunSuite with SharedSparkContext {
     assert(sortedTopK === nums.sorted(ord).take(5))
   }
 
+  test("sample preserves partitioner") {
+    val partitioner = new HashPartitioner(2)
+    val rdd = sc.parallelize(Seq((0, 1), (2, 3))).partitionBy(partitioner)
+    for (withReplacement <- Seq(true, false)) {
+      val sampled = rdd.sample(withReplacement, 1.0)
+      assert(sampled.partitioner === rdd.partitioner)
+    }
+  }
+
   test("takeSample") {
     val n = 1000000
     val data = sc.parallelize(1 to n, 2)

mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala

Lines changed: 2 additions & 2 deletions
@@ -103,11 +103,11 @@ class BinaryClassificationMetrics(scoreAndLabels: RDD[(Double, Double)]) extends
       mergeValue = (c: BinaryLabelCounter, label: Double) => c += label,
       mergeCombiners = (c1: BinaryLabelCounter, c2: BinaryLabelCounter) => c1 += c2
     ).sortByKey(ascending = false)
-    val agg = counts.values.mapPartitions({ iter =>
+    val agg = counts.values.mapPartitions { iter =>
       val agg = new BinaryLabelCounter()
       iter.foreach(agg += _)
       Iterator(agg)
-    }, preservesPartitioning = true).collect()
+    }.collect()
     val partitionwiseCumulativeCounts =
       agg.scanLeft(new BinaryLabelCounter())(
         (agg: BinaryLabelCounter, c: BinaryLabelCounter) => agg.clone() += c)
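Dropping the flag here follows the contract now documented in RDD.scala: this closure replaces each partition's contents with a single aggregate, so keys are not preserved and no partitioner should be claimed. A hedged sketch of the same rule, using a hypothetical RDD that is not from the commit:

// Hedged sketch: the output is one aggregate per partition, not the
// original keyed records, so the partitioner is correctly dropped.
import org.apache.spark.HashPartitioner

val byKey = sc.parallelize(Seq((1, 1.0), (2, 2.0))).partitionBy(new HashPartitioner(2))
val perPartition = byKey.mapPartitions { iter => Iterator(iter.map(_._2).sum) }
assert(perPartition.partitioner == None)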

mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala

Lines changed: 4 additions & 4 deletions
@@ -377,18 +377,18 @@ class RowMatrix(
       s"Only support dense matrix at this time but found ${B.getClass.getName}.")
 
     val Bb = rows.context.broadcast(B.toBreeze.asInstanceOf[BDM[Double]].toDenseVector.toArray)
-    val AB = rows.mapPartitions({ iter =>
+    val AB = rows.mapPartitions { iter =>
       val Bi = Bb.value
-      iter.map(row => {
+      iter.map { row =>
         val v = BDV.zeros[Double](k)
         var i = 0
         while (i < k) {
           v(i) = row.toBreeze.dot(new BDV(Bi, i * n, 1, n))
           i += 1
         }
         Vectors.fromBreeze(v)
-      })
-    }, preservesPartitioning = true)
+      }
+    }
 
     new RowMatrix(AB, nRows, B.numCols)
   }
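Same rationale: `rows` is an RDD of vectors, not a pair RDD, so there is no partitioner to preserve and the flag was misleading. A hedged usage sketch of the method being edited, `RowMatrix.multiply`, with illustrative values:

// Hedged sketch: multiply a distributed 2x2 RowMatrix by a local 2x1
// matrix of ones; each row of A maps to the corresponding row of A * B.
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix

val A = new RowMatrix(sc.parallelize(Seq(
  Vectors.dense(1.0, 2.0),
  Vectors.dense(3.0, 4.0))))
val B = Matrices.dense(2, 1, Array(1.0, 1.0))
val AB = A.multiply(B)  // rows: [3.0], [7.0]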

mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala

Lines changed: 1 addition & 1 deletion
@@ -430,7 +430,7 @@ class ALS private (
       val inLinkBlock = makeInLinkBlock(numProductBlocks, ratings, productPartitioner)
       val outLinkBlock = makeOutLinkBlock(numProductBlocks, ratings, productPartitioner)
       Iterator.single((blockId, (inLinkBlock, outLinkBlock)))
-    }, true)
+    }, preservesPartitioning = true)
     val inLinks = links.mapValues(_._1)
     val outLinks = links.mapValues(_._2)
     inLinks.persist(StorageLevel.MEMORY_AND_DISK)

mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala

Lines changed: 2 additions & 2 deletions
@@ -264,8 +264,8 @@ object MLUtils {
     (1 to numFolds).map { fold =>
       val sampler = new BernoulliSampler[T]((fold - 1) / numFoldsF, fold / numFoldsF,
         complement = false)
-      val validation = new PartitionwiseSampledRDD(rdd, sampler, seed)
-      val training = new PartitionwiseSampledRDD(rdd, sampler.cloneComplement(), seed)
+      val validation = new PartitionwiseSampledRDD(rdd, sampler, true, seed)
+      val training = new PartitionwiseSampledRDD(rdd, sampler.cloneComplement(), true, seed)
       (training, validation)
     }.toArray
   }
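`kFold` carves [0, 1) into `numFolds` Bernoulli slices: each fold's validation set is one slice and its training set is the complement, both sampled partition-wise with the same seed so every element lands in exactly one validation fold. A hedged usage sketch (values illustrative):

// Hedged sketch: split an RDD into 3 (training, validation) pairs.
import org.apache.spark.mllib.util.MLUtils

val data = sc.parallelize(1 to 100, 4)
val folds = MLUtils.kFold(data, 3, 42)
folds.foreach { case (training, validation) =>
  assert(training.count() + validation.count() == 100)
}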
