Skip to content

Commit db6ecc5

Browse files
author
liguoqiang
committed
Merge branch 'master' into SPARK-1149
2 parents 1e3331e + 923dba5 commit db6ecc5

File tree

8 files changed: +28 additions, −19 deletions

core/src/main/scala/org/apache/spark/BlockStoreShuffleFetcher.scala

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,6 @@ private[spark] class BlockStoreShuffleFetcher extends ShuffleFetcher with Loggin
7979
val completionIter = CompletionIterator[T, Iterator[T]](itr, {
8080
val shuffleMetrics = new ShuffleReadMetrics
8181
shuffleMetrics.shuffleFinishTime = System.currentTimeMillis
82-
shuffleMetrics.remoteFetchTime = blockFetcherItr.remoteFetchTime
8382
shuffleMetrics.fetchWaitTime = blockFetcherItr.fetchWaitTime
8483
shuffleMetrics.remoteBytesRead = blockFetcherItr.remoteBytesRead
8584
shuffleMetrics.totalBlocksFetched = blockFetcherItr.totalBlocks

core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -103,13 +103,6 @@ class ShuffleReadMetrics extends Serializable {
103103
*/
104104
var fetchWaitTime: Long = _
105105

106-
/**
107-
* Total time spent fetching remote shuffle blocks. This aggregates the time spent fetching all
108-
* input blocks. Since block fetches are both pipelined and parallelized, this can
109-
* exceed fetchWaitTime and executorRunTime.
110-
*/
111-
var remoteFetchTime: Long = _
112-
113106
/**
114107
* Total number of remote bytes read from the shuffle by this task
115108
*/

core/src/main/scala/org/apache/spark/network/ConnectionManager.scala

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -550,9 +550,6 @@ private[spark] class ConnectionManager(port: Int, conf: SparkConf) extends Loggi
550550

551551
newConnection
552552
}
553-
// I removed the lookupKey stuff as part of merge ... should I re-add it ? We did not find it
554-
// useful in our test-env ... If we do re-add it, we should consistently use it everywhere I
555-
// guess ?
556553
val connection = connectionsById.getOrElseUpdate(connectionManagerId, startNewConnection())
557554
message.senderAddress = id.toSocketAddress()
558555
logDebug("Sending [" + message + "] to [" + connectionManagerId + "]")

core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,6 @@ class JobLogger(val user: String, val logDirName: String)
275275
" BLOCK_FETCHED_LOCAL=" + metrics.localBlocksFetched +
276276
" BLOCK_FETCHED_REMOTE=" + metrics.remoteBlocksFetched +
277277
" REMOTE_FETCH_WAIT_TIME=" + metrics.fetchWaitTime +
278-
" REMOTE_FETCH_TIME=" + metrics.remoteFetchTime +
279278
" REMOTE_BYTES_READ=" + metrics.remoteBytesRead
280279
case None => ""
281280
}

core/src/main/scala/org/apache/spark/storage/BlockFetcherIterator.scala

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ trait BlockFetcherIterator extends Iterator[(BlockId, Option[Iterator[Any]])] wi
4949
def totalBlocks: Int
5050
def numLocalBlocks: Int
5151
def numRemoteBlocks: Int
52-
def remoteFetchTime: Long
5352
def fetchWaitTime: Long
5453
def remoteBytesRead: Long
5554
}
@@ -79,7 +78,6 @@ object BlockFetcherIterator {
7978
import blockManager._
8079

8180
private var _remoteBytesRead = 0L
82-
private var _remoteFetchTime = 0L
8381
private var _fetchWaitTime = 0L
8482

8583
if (blocksByAddress == null) {
@@ -125,7 +123,6 @@ object BlockFetcherIterator {
125123
future.onSuccess {
126124
case Some(message) => {
127125
val fetchDone = System.currentTimeMillis()
128-
_remoteFetchTime += fetchDone - fetchStart
129126
val bufferMessage = message.asInstanceOf[BufferMessage]
130127
val blockMessageArray = BlockMessageArray.fromBufferMessage(bufferMessage)
131128
for (blockMessage <- blockMessageArray) {
@@ -241,7 +238,6 @@ object BlockFetcherIterator {
241238
override def totalBlocks: Int = numLocal + numRemote
242239
override def numLocalBlocks: Int = numLocal
243240
override def numRemoteBlocks: Int = numRemote
244-
override def remoteFetchTime: Long = _remoteFetchTime
245241
override def fetchWaitTime: Long = _fetchWaitTime
246242
override def remoteBytesRead: Long = _remoteBytesRead
247243

core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,32 @@ class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext {
347347
*/
348348
pairs.saveAsNewAPIHadoopFile[ConfigTestFormat]("ignored")
349349
}
350+
351+
test("lookup") {
352+
val pairs = sc.parallelize(Array((1,2), (3,4), (5,6), (5,7)))
353+
354+
assert(pairs.partitioner === None)
355+
assert(pairs.lookup(1) === Seq(2))
356+
assert(pairs.lookup(5) === Seq(6,7))
357+
assert(pairs.lookup(-1) === Seq())
358+
359+
}
360+
361+
test("lookup with partitioner") {
362+
val pairs = sc.parallelize(Array((1,2), (3,4), (5,6), (5,7)))
363+
364+
val p = new Partitioner {
365+
def numPartitions: Int = 2
366+
367+
def getPartition(key: Any): Int = Math.abs(key.hashCode() % 2)
368+
}
369+
val shuffled = pairs.partitionBy(p)
370+
371+
assert(shuffled.partitioner === Some(p))
372+
assert(shuffled.lookup(1) === Seq(2))
373+
assert(shuffled.lookup(5) === Seq(6,7))
374+
assert(shuffled.lookup(-1) === Seq())
375+
}
350376
}
351377

352378
/*

core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,6 @@ class SparkListenerSuite extends FunSuite with LocalSparkContext with ShouldMatc
129129
sm.localBlocksFetched should be > (0)
130130
sm.remoteBlocksFetched should be (0)
131131
sm.remoteBytesRead should be (0l)
132-
sm.remoteFetchTime should be (0l)
133132
}
134133
}
135134
}

docs/tuning.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,8 +163,8 @@ their work directories), *not* on your driver program.
163163
**Cache Size Tuning**
164164

165165
One important configuration parameter for GC is the amount of memory that should be used for caching RDDs.
166-
By default, Spark uses 66% of the configured executor memory (`spark.executor.memory` or `SPARK_MEM`) to
167-
cache RDDs. This means that 33% of memory is available for any objects created during task execution.
166+
By default, Spark uses 60% of the configured executor memory (`spark.executor.memory` or `SPARK_MEM`) to
167+
cache RDDs. This means that 40% of memory is available for any objects created during task execution.
168168

169169
In case your tasks slow down and you find that your JVM is garbage-collecting frequently or running out of
170170
memory, lowering this value will help reduce the memory consumption. To change this to say 50%, you can call

0 commit comments

Comments (0)