apache
diff --git a/‎core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala
Lines changed: 2 additions & 0 deletions b/‎core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala
Lines changed: 2 additions & 0 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
Lines changed: 24 additions & 13 deletions b/‎core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
Lines changed: 24 additions & 13 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala
Lines changed: 7 additions & 1 deletion b/‎core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala
Lines changed: 7 additions & 1 deletion
diff --git a/‎core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala
Lines changed: 8 additions & 7 deletions b/‎core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala
Lines changed: 8 additions & 7 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
Lines changed: 45 additions & 7 deletions b/‎core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala
Lines changed: 45 additions & 7 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/executor/Executor.scala
Lines changed: 1 addition & 1 deletion b/‎core/src/main/scala/org/apache/spark/executor/Executor.scala
Lines changed: 1 addition & 1 deletion
diff --git a/‎core/src/main/scala/org/apache/spark/network/netty/SparkTransportConf.scala
Lines changed: 37 additions & 7 deletions b/‎core/src/main/scala/org/apache/spark/network/netty/SparkTransportConf.scala
Lines changed: 37 additions & 7 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
Lines changed: 4 additions & 0 deletions b/‎core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
Lines changed: 4 additions & 0 deletions
diff --git a/‎core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala
Lines changed: 12 additions & 3 deletions b/‎core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala
Lines changed: 12 additions & 3 deletions
diff --git a/‎dev/create-release/create-release.sh
Lines changed: 1 addition & 0 deletions b/‎dev/create-release/create-release.sh
Lines changed: 1 addition & 0 deletions
@@ -92,6 +92,8 @@ private[deploy] object DeployMessages {
 
   case object WorkDirCleanup      // Sent to Worker actor periodically for cleaning up app folders
 
+  case object ReregisterWithMaster // used when a worker attempts to reconnect to a master
+
   // AppClient to Master
 
   case class RegisterApplication(appDescription: ApplicationDescription)
 
@@ -29,22 +29,27 @@ import org.apache.spark.scheduler._
 import org.apache.spark.ui.SparkUI
 import org.apache.spark.util.Utils
 
+/**
+ * A class that provides application history from event logs stored in the file system.
+ * This provider checks for new finished applications in the background periodically and
+ * renders the history application UI by parsing the associated event logs.
+ */
 private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHistoryProvider
   with Logging {
 
+  import FsHistoryProvider._
+
   private val NOT_STARTED = "<Not Started>"
 
   // Interval between each check for event log updates
   private val UPDATE_INTERVAL_MS = conf.getInt("spark.history.fs.updateInterval",
     conf.getInt("spark.history.updateInterval", 10)) * 1000
 
-  private val logDir = conf.get("spark.history.fs.logDirectory", null)
-  private val resolvedLogDir = Option(logDir)
-    .map { d => Utils.resolveURI(d) }
-    .getOrElse { throw new IllegalArgumentException("Logging directory must be specified.") }
+  private val logDir = conf.getOption("spark.history.fs.logDirectory")
+    .map { d => Utils.resolveURI(d).toString }
+    .getOrElse(DEFAULT_LOG_DIR)
 
-  private val fs = Utils.getHadoopFileSystem(resolvedLogDir,
-    SparkHadoopUtil.get.newConfiguration(conf))
+  private val fs = Utils.getHadoopFileSystem(logDir, SparkHadoopUtil.get.newConfiguration(conf))
 
   // A timestamp of when the disk was last accessed to check for log updates
   private var lastLogCheckTimeMs = -1L
@@ -87,14 +92,17 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
 
   private def initialize() {
     // Validate the log directory.
-    val path = new Path(resolvedLogDir)
+    val path = new Path(logDir)
     if (!fs.exists(path)) {
-      throw new IllegalArgumentException(
-        "Logging directory specified does not exist: %s".format(resolvedLogDir))
+      var msg = s"Log directory specified does not exist: $logDir."
+      if (logDir == DEFAULT_LOG_DIR) {
+        msg += " Did you configure the correct one through spark.fs.history.logDirectory?"
+      }
+      throw new IllegalArgumentException(msg)
     }
     if (!fs.getFileStatus(path).isDir) {
       throw new IllegalArgumentException(
-        "Logging directory specified is not a directory: %s".format(resolvedLogDir))
+        "Logging directory specified is not a directory: %s".format(logDir))
     }
 
     checkForLogs()
@@ -134,8 +142,7 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
     }
   }
 
-  override def getConfig(): Map[String, String] =
-    Map("Event Log Location" -> resolvedLogDir.toString)
+  override def getConfig(): Map[String, String] = Map("Event log directory" -> logDir.toString)
 
   /**
    * Builds the application list based on the current contents of the log directory.
@@ -146,7 +153,7 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
     lastLogCheckTimeMs = getMonotonicTimeMs()
     logDebug("Checking for logs. Time is now %d.".format(lastLogCheckTimeMs))
     try {
-      val logStatus = fs.listStatus(new Path(resolvedLogDir))
+      val logStatus = fs.listStatus(new Path(logDir))
       val logDirs = if (logStatus != null) logStatus.filter(_.isDir).toSeq else Seq[FileStatus]()
 
       // Load all new logs from the log directory. Only directories that have a modification time
@@ -244,6 +251,10 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
 
 }
 
+private object FsHistoryProvider {
+  val DEFAULT_LOG_DIR = "file:/tmp/spark-events"
+}
+
 private class FsApplicationHistoryInfo(
     val logDir: String,
     id: String,
 
@@ -58,7 +58,13 @@ private[spark] class HistoryPage(parent: HistoryServer) extends WebUIPage("") {
               </h4> ++
               appTable
             } else {
-              <h4>No Completed Applications Found</h4>
+              <h4>No completed applications found!</h4> ++
+              <p>Did you specify the correct logging directory?
+                Please verify your setting of <span style="font-style:italic">
+                spark.history.fs.logDirectory</span> and whether you have the permissions to
+                access it.<br /> It is also possible that your application did not run to
+                completion or did not stop the SparkContext.
+              </p>
             }
           }
         </div>
 
@@ -17,22 +17,22 @@
 
 package org.apache.spark.deploy.history
 
-import org.apache.spark.SparkConf
+import org.apache.spark.{Logging, SparkConf}
 import org.apache.spark.util.Utils
 
 /**
  * Command-line parser for the master.
  */
-private[spark] class HistoryServerArguments(conf: SparkConf, args: Array[String]) {
-  private var logDir: String = null
+private[spark] class HistoryServerArguments(conf: SparkConf, args: Array[String]) extends Logging {
   private var propertiesFile: String = null
 
   parse(args.toList)
 
   private def parse(args: List[String]): Unit = {
     args match {
       case ("--dir" | "-d") :: value :: tail =>
-        logDir = value
+        logWarning("Setting log directory through the command line is deprecated as of " +
+          "Spark 1.1.0. Please set this through spark.history.fs.logDirectory instead.")
         conf.set("spark.history.fs.logDirectory", value)
         System.setProperty("spark.history.fs.logDirectory", value)
         parse(tail)
@@ -78,9 +78,10 @@ private[spark] class HistoryServerArguments(conf: SparkConf, args: Array[String]
       |                                     (default 50)
       |FsHistoryProvider options:
       |
-      |  spark.history.fs.logDirectory      Directory where app logs are stored (required)
-      |  spark.history.fs.updateInterval    How often to reload log data from storage (in seconds,
-      |                                     default 10)
+      |  spark.history.fs.logDirectory      Directory where app logs are stored
+      |                                     (default: file:/tmp/spark-events)
+      |  spark.history.fs.updateInterval    How often to reload log data from storage
+      |                                     (in seconds, default: 10)
       |""".stripMargin)
     System.exit(exitCode)
   }
 
@@ -21,7 +21,6 @@ import java.io.File
 import java.io.IOException
 import java.text.SimpleDateFormat
 import java.util.{UUID, Date}
-import java.util.concurrent.TimeUnit
 
 import scala.collection.JavaConversions._
 import scala.collection.mutable.HashMap
@@ -177,6 +176,9 @@ private[spark] class Worker(
         throw new SparkException("Invalid spark URL: " + x)
     }
     connected = true
+    // Cancel any outstanding re-registration attempts because we found a new master
+    registrationRetryTimer.foreach(_.cancel())
+    registrationRetryTimer = None
   }
 
   private def tryRegisterAllMasters() {
@@ -187,20 +189,53 @@ private[spark] class Worker(
     }
   }
 
-  private def retryConnectToMaster() {
+  /**
+   * Re-register with the master because a network failure or a master failure has occurred.
+   * If the re-registration attempt threshold is exceeded, the worker exits with error.
+   * Note that for thread-safety this should only be called from the actor.
+   */
+  private def reregisterWithMaster(): Unit = {
     Utils.tryOrExit {
       connectionAttemptCount += 1
       if (registered) {
         registrationRetryTimer.foreach(_.cancel())
         registrationRetryTimer = None
       } else if (connectionAttemptCount <= TOTAL_REGISTRATION_RETRIES) {
         logInfo(s"Retrying connection to master (attempt # $connectionAttemptCount)")
-        tryRegisterAllMasters()
+        /**
+         * Re-register with the active master this worker has been communicating with. If there
+         * is none, then it means this worker is still bootstrapping and hasn't established a
+         * connection with a master yet, in which case we should re-register with all masters.
+         *
+         * It is important to re-register only with the active master during failures. Otherwise,
+         * if the worker unconditionally attempts to re-register with all masters, the following
+         * race condition may arise and cause a "duplicate worker" error detailed in SPARK-4592:
+         *
+         *   (1) Master A fails and Worker attempts to reconnect to all masters
+         *   (2) Master B takes over and notifies Worker
+         *   (3) Worker responds by registering with Master B
+         *   (4) Meanwhile, Worker's previous reconnection attempt reaches Master B,
+         *       causing the same Worker to register with Master B twice
+         *
+         * Instead, if we only register with the known active master, we can assume that the
+         * old master must have died because another master has taken over. Note that this is
+         * still not safe if the old master recovers within this interval, but this is a much
+         * less likely scenario.
+         */
+        if (master != null) {
+          master ! RegisterWorker(
+            workerId, host, port, cores, memory, webUi.boundPort, publicAddress)
+        } else {
+          // We are retrying the initial registration
+          tryRegisterAllMasters()
+        }
+        // We have exceeded the initial registration retry threshold
+        // All retries from now on should use a higher interval
         if (connectionAttemptCount == INITIAL_REGISTRATION_RETRIES) {
           registrationRetryTimer.foreach(_.cancel())
           registrationRetryTimer = Some {
             context.system.scheduler.schedule(PROLONGED_REGISTRATION_RETRY_INTERVAL,
-              PROLONGED_REGISTRATION_RETRY_INTERVAL)(retryConnectToMaster)
+              PROLONGED_REGISTRATION_RETRY_INTERVAL, self, ReregisterWithMaster)
           }
         }
       } else {
@@ -220,7 +255,7 @@ private[spark] class Worker(
         connectionAttemptCount = 0
         registrationRetryTimer = Some {
           context.system.scheduler.schedule(INITIAL_REGISTRATION_RETRY_INTERVAL,
-            INITIAL_REGISTRATION_RETRY_INTERVAL)(retryConnectToMaster)
+            INITIAL_REGISTRATION_RETRY_INTERVAL, self, ReregisterWithMaster)
         }
       case Some(_) =>
         logInfo("Not spawning another attempt to register with the master, since there is an" +
@@ -400,12 +435,15 @@ private[spark] class Worker(
       logInfo(s"$x Disassociated !")
       masterDisconnected()
 
-    case RequestWorkerState => {
+    case RequestWorkerState =>
       sender ! WorkerStateResponse(host, port, workerId, executors.values.toList,
         finishedExecutors.values.toList, drivers.values.toList,
         finishedDrivers.values.toList, activeMasterUrl, cores, memory,
         coresUsed, memoryUsed, activeMasterWebUiUrl)
-    }
+
+    case ReregisterWithMaster =>
+      reregisterWithMaster()
+
   }
 
   private def masterDisconnected() {
 
@@ -334,7 +334,7 @@ private[spark] class Executor(
    * SparkContext. Also adds any new JARs we fetched to the class loader.
    */
   private def updateDependencies(newFiles: HashMap[String, Long], newJars: HashMap[String, Long]) {
-    val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
+    lazy val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
     synchronized {
       // Fetch missing dependencies
       for ((name, timestamp) <- newFiles if currentFiles.getOrElse(name, -1L) < timestamp) {
 
@@ -20,7 +20,24 @@ package org.apache.spark.network.netty
 import org.apache.spark.SparkConf
 import org.apache.spark.network.util.{TransportConf, ConfigProvider}
 
+/**
+ * Provides a utility for transforming from a SparkConf inside a Spark JVM (e.g., Executor,
+ * Driver, or a standalone shuffle service) into a TransportConf with details on our environment
+ * like the number of cores that are allocated to this JVM.
+ */
 object SparkTransportConf {
+  /**
+   * Specifies an upper bound on the number of Netty threads that Spark requires by default.
+   * In practice, only 2-4 cores should be required to transfer roughly 10 Gb/s, and each core
+   * that we use will have an initial overhead of roughly 32 MB of off-heap memory, which comes
+   * at a premium.
+   *
+   * Thus, this value should still retain maximum throughput and reduce wasted off-heap memory
+   * allocation. It can be overridden by setting the number of serverThreads and clientThreads
+   * manually in Spark's configuration.
+   */
+  private val MAX_DEFAULT_NETTY_THREADS = 8
+
   /**
    * Utility for creating a [[TransportConf]] from a [[SparkConf]].
    * @param numUsableCores if nonzero, this will restrict the server and client threads to only
@@ -29,15 +46,28 @@ object SparkTransportConf {
    */
   def fromSparkConf(_conf: SparkConf, numUsableCores: Int = 0): TransportConf = {
     val conf = _conf.clone
-    if (numUsableCores > 0) {
-      // Only set if serverThreads/clientThreads not already set.
-      conf.set("spark.shuffle.io.serverThreads",
-        conf.get("spark.shuffle.io.serverThreads", numUsableCores.toString))
-      conf.set("spark.shuffle.io.clientThreads",
-        conf.get("spark.shuffle.io.clientThreads", numUsableCores.toString))
-    }
+
+    // Specify thread configuration based on our JVM's allocation of cores (rather than necessarily
+    // assuming we have all the machine's cores).
+    // NB: Only set if serverThreads/clientThreads not already set.
+    val numThreads = defaultNumThreads(numUsableCores)
+    conf.set("spark.shuffle.io.serverThreads",
+      conf.get("spark.shuffle.io.serverThreads", numThreads.toString))
+    conf.set("spark.shuffle.io.clientThreads",
+      conf.get("spark.shuffle.io.clientThreads", numThreads.toString))
+
     new TransportConf(new ConfigProvider {
       override def get(name: String): String = conf.get(name)
     })
   }
+
+  /**
+   * Returns the default number of threads for both the Netty client and server thread pools.
+   * If numUsableCores is 0, we will use Runtime get an approximate number of available cores.
+   */
+  private def defaultNumThreads(numUsableCores: Int): Int = {
+    val availableCores =
+      if (numUsableCores > 0) numUsableCores else Runtime.getRuntime.availableProcessors()
+    math.min(availableCores, MAX_DEFAULT_NETTY_THREADS)
+  }
 }
@@ -688,6 +688,10 @@ private[spark] object JsonProtocol {
   }
 
   def blockManagerIdFromJson(json: JValue): BlockManagerId = {
+    // On metadata fetch fail, block manager ID can be null (SPARK-4471)
+    if (json == JNothing) {
+      return null
+    }
     val executorId = (json \ "Executor ID").extract[String]
     val host = (json \ "Host").extract[String]
     val port = (json \ "Port").extract[Int]
 
@@ -19,6 +19,8 @@ package org.apache.spark.util
 
 import java.util.Properties
 
+import org.apache.spark.shuffle.MetadataFetchFailedException
+
 import scala.collection.Map
 
 import org.json4s.jackson.JsonMethods._
@@ -116,10 +118,13 @@ class JsonProtocolSuite extends FunSuite {
     // TaskEndReason
     val fetchFailed = FetchFailed(BlockManagerId("With or", "without you", 15), 17, 18, 19,
       "Some exception")
+    val fetchMetadataFailed = new MetadataFetchFailedException(17,
+      19, "metadata Fetch failed exception").toTaskEndReason
     val exceptionFailure = new ExceptionFailure(exception, None)
     testTaskEndReason(Success)
     testTaskEndReason(Resubmitted)
     testTaskEndReason(fetchFailed)
+    testTaskEndReason(fetchMetadataFailed)
     testTaskEndReason(exceptionFailure)
     testTaskEndReason(TaskResultLost)
     testTaskEndReason(TaskKilled)
@@ -431,9 +436,13 @@ class JsonProtocolSuite extends FunSuite {
   }
 
   private def assertEquals(bm1: BlockManagerId, bm2: BlockManagerId) {
-    assert(bm1.executorId === bm2.executorId)
-    assert(bm1.host === bm2.host)
-    assert(bm1.port === bm2.port)
+    if (bm1 == null || bm2 == null) {
+      assert(bm1 === bm2)
+    } else {
+      assert(bm1.executorId === bm2.executorId)
+      assert(bm1.host === bm2.host)
+      assert(bm1.port === bm2.port)
+    }
   }
 
   private def assertEquals(result1: JobResult, result2: JobResult) {
 
@@ -197,6 +197,7 @@ make_binary_release "hadoop2.3" "-Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn"
 make_binary_release "hadoop2.4" "-Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" &
 make_binary_release "mapr3" "-Pmapr3 -Phive -Phive-thriftserver" &
 make_binary_release "mapr4" "-Pmapr4 -Pyarn -Phive -Phive-thriftserver" &
+make_binary_release "hadoop2.4-without-hive" "-Phadoop-2.4 -Pyarn" &
 wait
 
 # Copy data