
Commit 8897d74

Shutdown executor once we are done decommissioning
1 parent ac096f4 commit 8897d74

9 files changed: +144, -20 lines changed

core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala

Lines changed: 0 additions & 2 deletions
@@ -158,8 +158,6 @@ private[deploy] object DeployMessages {

   case object ReregisterWithMaster // used when a worker attempts to reconnect to a master

-  case object DecommissionSelf // Mark as decommissioned. May be Master to Worker in the future.
-
   // AppClient to Master

   case class RegisterApplication(appDescription: ApplicationDescription, driver: RpcEndpointRef)

core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala

Lines changed: 1 addition & 1 deletion
@@ -668,7 +668,7 @@ private[deploy] class Worker(
       finishedApps += id
       maybeCleanupApplication(id)

-    case DecommissionSelf =>
+    case WorkerDecommission(_, _) =>
       decommissionSelf()
   }


core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala

Lines changed: 51 additions & 4 deletions
@@ -64,7 +64,6 @@ private[spark] class CoarseGrainedExecutorBackend(

   private[this] val stopping = new AtomicBoolean(false)
   var executor: Executor = null
-  @volatile private var decommissioned = false
   @volatile var driver: Option[RpcEndpointRef] = None

   // If this CoarseGrainedExecutorBackend is changed to support multiple threads, then this may need
@@ -80,6 +79,9 @@ private[spark] class CoarseGrainedExecutorBackend(
    */
   private[executor] val taskResources = new mutable.HashMap[Long, Map[String, ResourceInformation]]

+  // Track our decommissioning status internally.
+  @volatile private var decommissioned = false
+
   override def onStart(): Unit = {
     logInfo("Registering PWR handler.")
     SignalUtils.register("PWR", "Failed to register SIGPWR handler - " +
@@ -210,6 +212,10 @@ private[spark] class CoarseGrainedExecutorBackend(
     case UpdateDelegationTokens(tokenBytes) =>
       logInfo(s"Received tokens of ${tokenBytes.length} bytes")
       SparkHadoopUtil.get.addDelegationTokens(tokenBytes, env.conf)
+
+    case DecommissionSelf =>
+      logInfo("Received decommission self")
+      decommissionSelf()
   }

   override def onDisconnected(remoteAddress: RpcAddress): Unit = {
@@ -259,7 +265,7 @@ private[spark] class CoarseGrainedExecutorBackend(
   }

   private def decommissionSelf(): Boolean = {
-    logInfo("Decommissioning self w/sync")
+    logInfo("Decommissioning self")
     try {
       decommissioned = true
       // Tell master we are are decommissioned so it stops trying to schedule us
@@ -271,12 +277,53 @@ private[spark] class CoarseGrainedExecutorBackend(
       if (executor != null) {
         executor.decommission()
       }
-      logInfo("Done decommissioning self.")
+      // Shutdown the executor once all tasks are gone & any configured migrations completed.
+      // Detecting migrations completion doesn't need to be perfect and we want to minimize the
+      // overhead for executors that are not in decommissioning state as overall that will be
+      // more of the executors. For example, this will not catch a block which is already in
+      // the process of being put from a remote executor before migration starts. This trade-off
+      // is viewed as acceptable to minimize introduction of any new locking structures in critical
+      // code paths.
+
+      val shutdownThread = new Thread() {
+        var lastTaskRunningTime = System.nanoTime()
+        val sleep_time = 1000 // 1s
+
+        while (true) {
+          logInfo("Checking to see if we can shutdown.")
+          if (executor == null || executor.numRunningTasks == 0) {
+            if (env.conf.get(STORAGE_DECOMMISSION_ENABLED)) {
+              logInfo("No running tasks, checking migrations")
+              val allBlocksMigrated = env.blockManager.lastMigrationInfo()
+              // We can only trust allBlocksMigrated boolean value if there were no tasks running
+              // since the start of computing it.
+              if (allBlocksMigrated._2 &&
+                  (allBlocksMigrated._1 > lastTaskRunningTime)) {
+                logInfo("No running tasks, all blocks migrated, stopping.")
+                exitExecutor(0, "Finished decommissioning", notifyDriver = true)
+              } else {
+                logInfo("All blocks not yet migrated.")
+              }
+            } else {
+              logInfo("No running tasks, no block migration configured, stopping.")
+              exitExecutor(0, "Finished decommissioning", notifyDriver = true)
+            }
+            Thread.sleep(sleep_time)
+          } else {
+            logInfo("Blocked from shutdown by running task")
+            // If there is a running task it could store blocks, so make sure we wait for a
+            // migration loop to complete after the last task is done.
+            Thread.sleep(sleep_time)
+            lastTaskRunningTime = System.nanoTime()
+          }
+        }
+      }
+      logInfo("Will exit when finished decommissioning")
       // Return true since we are handling a signal
       true
     } catch {
       case e: Exception =>
-        logError(s"Error ${e} during attempt to decommission self")
+        logError("Unexpected error while decommissioning self", e)
         false
     }
   }
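Aside: the polling loop added above only lets the executor exit once two conditions hold: no tasks are running, and the most recent full migration pass finished after the last time a task was observed running. The sketch below is a minimal standalone Scala distillation of that exit condition; the names (MigrationInfo, canExit) are hypothetical and not part of this patch or Spark's API.

object DecommissionExitCheck {
  // Hypothetical stand-in for the (timestamp, allMigrated) pair returned by the block manager.
  final case class MigrationInfo(finishedAtNanos: Long, allBlocksMigrated: Boolean)

  def canExit(
      numRunningTasks: Int,
      lastTaskRunningTimeNanos: Long,
      storageDecommissionEnabled: Boolean,
      migration: MigrationInfo): Boolean = {
    if (numRunningTasks > 0) {
      // A running task may still produce new blocks, so the migration snapshot cannot be trusted.
      false
    } else if (!storageDecommissionEnabled) {
      // Nothing to migrate; safe to exit as soon as tasks are done.
      true
    } else {
      // Trust the snapshot only if it was computed after the last observed running task.
      migration.allBlocksMigrated && migration.finishedAtNanos > lastTaskRunningTimeNanos
    }
  }

  def main(args: Array[String]): Unit = {
    val lastTask = System.nanoTime()
    val snapshot = MigrationInfo(System.nanoTime(), allBlocksMigrated = true)
    println(canExit(0, lastTask, storageDecommissionEnabled = true, snapshot)) // prints true
  }
}

The start-of-pass timestamp comparison is what guards against a task that wrote new blocks after the migration snapshot was taken.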

core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala

Lines changed: 2 additions & 0 deletions
@@ -132,4 +132,6 @@ private[spark] object CoarseGrainedClusterMessages {
   // Used internally by executors to shut themselves down.
   case object Shutdown extends CoarseGrainedClusterMessage

+  // Used to ask an executor to decommission it's self.
+  case object DecommissionSelf extends CoarseGrainedClusterMessage
 }

core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala

Lines changed: 9 additions & 0 deletions
@@ -437,6 +437,15 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
       case e: Exception =>
         logError(s"Unexpected error during decommissioning ${e.toString}", e)
     }
+    // Send decommission message to the executor (it could have originated on the executor
+    // but not necessarily.
+    executorDataMap.get(executorId) match {
+      case Some(executorInfo) =>
+        executorInfo.executorEndpoint.send(DecommissionSelf)
+      case None =>
+        // Ignoring the executor since it is not registered.
+        logWarning(s"Attempted to decommission unknown executor $executorId.")
+    }
     logInfo(s"Finished decommissioning executor $executorId.")

     if (conf.get(STORAGE_DECOMMISSION_ENABLED)) {
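Aside: decommissioning may be initiated on either the driver or the executor, so the scheduler backend now always forwards a DecommissionSelf message to the executor endpoint if it is still registered, and only logs a warning otherwise. A standalone sketch of that lookup-and-forward pattern follows; EndpointRef, forward, and ForwardDecommission are hypothetical names, not Spark's RpcEnv API.

object ForwardDecommission {
  sealed trait ControlMessage
  case object DecommissionSelf extends ControlMessage

  // Hypothetical stand-in for an RPC endpoint reference.
  trait EndpointRef { def send(msg: ControlMessage): Unit }

  def forward(executorId: String, registered: Map[String, EndpointRef]): Unit = {
    registered.get(executorId) match {
      case Some(endpoint) => endpoint.send(DecommissionSelf)
      case None => println(s"Attempted to decommission unknown executor $executorId.")
    }
  }

  def main(args: Array[String]): Unit = {
    val endpoints = Map("exec-1" -> new EndpointRef {
      def send(msg: ControlMessage): Unit = println(s"exec-1 received $msg")
    })
    forward("exec-1", endpoints) // delivered to the registered endpoint
    forward("exec-2", endpoints) // unknown executor: warning path, nothing sent
  }
}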

core/src/main/scala/org/apache/spark/storage/BlockManager.scala

Lines changed: 8 additions & 0 deletions
@@ -1818,6 +1818,14 @@ private[spark] class BlockManager(
     }
   }

+  /*
+   * Returns the last migration time and a boolean for if all blocks have been migrated.
+   * If there are any tasks running since that time the boolean may be incorrect.
+   */
+  private[spark] def lastMigrationInfo(): (Long, Boolean) = {
+    decommissioner.map(_.lastMigrationInfo()).getOrElse((0, false))
+  }
+
   private[storage] def getMigratableRDDBlocks(): Seq[ReplicateBlock] =
     master.getReplicateInfoForRDDBlocks(blockManagerId)


core/src/main/scala/org/apache/spark/storage/BlockManagerDecommissioner.scala

Lines changed: 64 additions & 7 deletions
@@ -18,6 +18,7 @@
 package org.apache.spark.storage

 import java.util.concurrent.ExecutorService
+import java.util.concurrent.atomic.AtomicInteger

 import scala.collection.JavaConverters._
 import scala.collection.mutable
@@ -41,6 +42,10 @@ private[storage] class BlockManagerDecommissioner(
   private val maxReplicationFailuresForDecommission =
     conf.get(config.STORAGE_DECOMMISSION_MAX_REPLICATION_FAILURE_PER_BLOCK)

+  // This is only valid if there are no tasks running since lastMigrationTime
+  @volatile private[storage] var lastMigrationTime: Long = 0
+  @volatile private[storage] var allBlocksMigrated = false
+
   /**
    * This runnable consumes any shuffle blocks in the queue for migration. This part of a
    * producer/consumer where the main migration loop updates the queue of blocks to be migrated
@@ -90,7 +95,8 @@ private[storage] class BlockManagerDecommissioner(
               null)// class tag, we don't need for shuffle
             logDebug(s"Migrated sub block ${blockId}")
           }
-          logInfo(s"Migrated ${shuffleBlockInfo} to ${peer}")
+          logInfo(s"Migrated ${shuffleBlockInfo}")
+          numMigratedShuffles.incrementAndGet()
         }
       }
       // This catch is intentionally outside of the while running block.
@@ -111,6 +117,12 @@ private[storage] class BlockManagerDecommissioner(
   // Shuffles which are either in queue for migrations or migrated
   private val migratingShuffles = mutable.HashSet[ShuffleBlockInfo]()

+  // Shuffles which have migrated. This used to know when we are "done", being done can change
+  // if a new shuffle file is created by a running task.
+  private val numMigratedShuffles = new AtomicInteger(0)
+
+
+
   // Shuffles which are queued for migration
   private[storage] val shufflesToMigrate =
     new java.util.concurrent.ConcurrentLinkedQueue[ShuffleBlockInfo]()
@@ -123,6 +135,7 @@ private[storage] class BlockManagerDecommissioner(
   private lazy val blockMigrationExecutor =
     ThreadUtils.newDaemonSingleThreadExecutor("block-manager-decommission")

+
   private val blockMigrationRunnable = new Runnable {
     val sleepInterval = conf.get(config.STORAGE_DECOMMISSION_REPLICATION_REATTEMPT_INTERVAL)

@@ -133,21 +146,47 @@ private[storage] class BlockManagerDecommissioner(
           s"${config.STORAGE_RDD_DECOMMISSION_ENABLED.key}\n" +
           s"${config.STORAGE_SHUFFLE_DECOMMISSION_ENABLED.key}")
         stopped = true
+        allBlocksMigrated = true
       }
+      var blocksLeft = false
       while (!stopped && !Thread.interrupted()) {
        logInfo("Iterating on migrating from the block manager.")
        try {
+          val startMigrationTime = System.nanoTime()
          // If enabled we migrate shuffle blocks first as they are more expensive.
          if (conf.get(config.STORAGE_SHUFFLE_DECOMMISSION_ENABLED)) {
            logDebug("Attempting to replicate all shuffle blocks")
-            offloadShuffleBlocks()
-            logInfo("Done starting workers to migrate shuffle blocks")
+            blocksLeft = offloadShuffleBlocks()
+            logInfo(s"Done starting workers to migrate shuffle blocks ${blocksLeft}")
          }
          if (conf.get(config.STORAGE_RDD_DECOMMISSION_ENABLED)) {
            logDebug("Attempting to replicate all cached RDD blocks")
-            decommissionRddCacheBlocks()
+            val cacheBlocksLeft = decommissionRddCacheBlocks()
+            blocksLeft = blocksLeft || cacheBlocksLeft
            logInfo("Attempt to replicate all cached blocks done")
          }
+
+          // Only update the migration info if it block have not changed under us.
+          if (lastMigrationTime < startMigrationTime) {
+            lastMigrationTime = startMigrationTime
+            allBlocksMigrated = ! blocksLeft
+            logInfo(s"Updating migration info to ${startMigrationTime}, ${allBlocksMigrated}")
+          } else {
+            logInfo(s"Blocks changed under us (last migration time is ${lastMigrationTime})")
+            allBlocksMigrated = false
+          }
+
+          // Stop if we don't have any migrations configured.
+          if (!conf.get(config.STORAGE_RDD_DECOMMISSION_ENABLED) &&
+              !conf.get(config.STORAGE_SHUFFLE_DECOMMISSION_ENABLED)) {
+            logWarning("Decommissioning, but no task configured set one or both:\n" +
+              "spark.storage.decommission.shuffle_blocks\n" +
+              "spark.storage.decommission.rdd_blocks")
+            lastMigrationTime = System.nanoTime()
+            allBlocksMigrated = true
+            stopped = true
+          }
+
          logInfo(s"Waiting for ${sleepInterval} before refreshing migrations.")
          Thread.sleep(sleepInterval)
        } catch {
@@ -172,8 +211,9 @@ private[storage] class BlockManagerDecommissioner(
   * but rather shadows them.
   * Requires an Indexed based shuffle resolver.
   * Note: if called in testing please call stopOffloadingShuffleBlocks to avoid thread leakage.
+  * Returns true if we are not done migrating shuffle blocks.
   */
-  private[storage] def offloadShuffleBlocks(): Unit = {
+  private[storage] def offloadShuffleBlocks(): Boolean = {
    // Update the queue of shuffles to be migrated
    logInfo("Offloading shuffle blocks")
    val localShuffles = bm.migratableResolver.getStoredShuffles()
@@ -197,6 +237,8 @@ private[storage] class BlockManagerDecommissioner(
    deadPeers.foreach { peer =>
      migrationPeers.get(peer).foreach(_.running = false)
    }
+    // If we found any new shuffles to migrate or otherwise have not migrated everything.
+    newShufflesToMigrate.nonEmpty || migratingShuffles.size < numMigratedShuffles.get()
  }

  /**
@@ -213,16 +255,17 @@ private[storage] class BlockManagerDecommissioner(
  /**
   * Tries to offload all cached RDD blocks from this BlockManager to peer BlockManagers
   * Visible for testing
+   * Returns true if we have not migrated all of our RDD blocks.
   */
-  private[storage] def decommissionRddCacheBlocks(): Unit = {
+  private[storage] def decommissionRddCacheBlocks(): Boolean = {
    val replicateBlocksInfo = bm.getMigratableRDDBlocks()

    if (replicateBlocksInfo.nonEmpty) {
      logInfo(s"Need to replicate ${replicateBlocksInfo.size} RDD blocks " +
        "for block manager decommissioning")
    } else {
      logWarning(s"Asked to decommission RDD cache blocks, but no blocks to migrate")
-      return
+      return false
    }

    // TODO: We can sort these blocks based on some policy (LRU/blockSize etc)
@@ -234,7 +277,9 @@ private[storage] class BlockManagerDecommissioner(
    if (blocksFailedReplication.nonEmpty) {
      logWarning("Blocks failed replication in cache decommissioning " +
        s"process: ${blocksFailedReplication.mkString(",")}")
+      return true
    }
+    return false
  }

  private def migrateBlock(blockToReplicate: ReplicateBlock): Boolean = {
@@ -277,4 +322,16 @@ private[storage] class BlockManagerDecommissioner(
    logInfo("Stopping block migration thread")
    blockMigrationExecutor.shutdownNow()
  }
+
+  /*
+   * Returns the last migration time and a boolean for if all blocks have been migrated.
+   * If there are any tasks running since that time the boolean may be incorrect.
+   */
+  private[storage] def lastMigrationInfo(): (Long, Boolean) = {
+    if (stopped) {
+      (System.nanoTime(), true)
+    } else {
+      (lastMigrationTime, allBlocksMigrated)
+    }
+  }
 }
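Aside: the migration loop above stamps each pass with its start time and only publishes (lastMigrationTime, allBlocksMigrated) when nothing newer has already been published, which is what lets the executor-side shutdown thread trust the snapshot. The following minimal standalone sketch models that bookkeeping; the MigrationProgress class and recordPass name are hypothetical, not this file's API.

class MigrationProgress {
  @volatile private var lastMigrationTime: Long = 0L
  @volatile private var allBlocksMigrated: Boolean = false

  // runPass performs one shuffle/RDD offload pass and returns true if blocks are still left.
  def recordPass(runPass: () => Boolean): Unit = {
    val start = System.nanoTime()
    val blocksLeft = runPass()
    if (lastMigrationTime < start) {
      // Publish the snapshot stamped with the time the pass *started*, so a consumer can
      // compare it against the last time it saw a task running.
      lastMigrationTime = start
      allBlocksMigrated = !blocksLeft
    } else {
      // Something newer already published; be conservative and report "not done".
      allBlocksMigrated = false
    }
  }

  // Consumers compare the returned timestamp against their own last-task-running time.
  def lastMigrationInfo(): (Long, Boolean) = (lastMigrationTime, allBlocksMigrated)
}

object MigrationProgressExample {
  def main(args: Array[String]): Unit = {
    val progress = new MigrationProgress
    progress.recordPass(() => false) // pretend a pass migrated everything
    println(progress.lastMigrationInfo()) // (start-of-pass nano time, true)
  }
}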

core/src/test/scala/org/apache/spark/storage/BlockManagerDecommissionIntegrationSuite.scala

Lines changed: 2 additions & 5 deletions
@@ -59,7 +59,7 @@ class BlockManagerDecommissionIntegrationSuite extends SparkFunSuite with LocalS
       .set(config.STORAGE_SHUFFLE_DECOMMISSION_ENABLED, shuffle)
       // Just replicate blocks as fast as we can during testing, there isn't another
       // workload we need to worry about.
-      .set(config.STORAGE_DECOMMISSION_REPLICATION_REATTEMPT_INTERVAL, 1L)
+      .set(config.STORAGE_DECOMMISSION_REPLICATION_REATTEMPT_INTERVAL, 10L)

     sc = new SparkContext(master, "test", conf)

@@ -223,10 +223,7 @@ class BlockManagerDecommissionIntegrationSuite extends SparkFunSuite with LocalS
       assert(execIdToBlocksMapping.values.flatMap(_.keys).count(_.isRDD) === numParts)
     }

-    // Make the executor we decommissioned exit
-    sched.client.killExecutors(List(execToDecommission))
-
-    // Wait for the executor to be removed
+    // Wait for the executor to be removed automatically after migration.
     executorRemovedSem.acquire(1)

     // Since the RDD is cached or shuffled so further usage of same RDD should use the

core/src/test/scala/org/apache/spark/storage/BlockManagerDecommissionUnitSuite.scala

Lines changed: 7 additions & 1 deletion
@@ -38,6 +38,9 @@ class BlockManagerDecommissionUnitSuite extends SparkFunSuite with Matchers {
   private val sparkConf = new SparkConf(false)
     .set(config.STORAGE_SHUFFLE_DECOMMISSION_ENABLED, true)
     .set(config.STORAGE_RDD_DECOMMISSION_ENABLED, true)
+    // Just replicate blocks as fast as we can during testing, there isn't another
+    // workload we need to worry about.
+    .set(config.STORAGE_DECOMMISSION_REPLICATION_REATTEMPT_INTERVAL, 10L)

   private def registerShuffleBlocks(
       mockMigratableShuffleResolver: MigratableResolver,
@@ -77,7 +80,8 @@ class BlockManagerDecommissionUnitSuite extends SparkFunSuite with Matchers {
     try {
       bmDecomManager.start()

-      eventually(timeout(5.second), interval(10.milliseconds)) {
+      // We don't check that all blocks are migrated because out mock is always returning an RDD.
+      eventually(timeout(10.second), interval(10.milliseconds)) {
         assert(bmDecomManager.shufflesToMigrate.isEmpty == true)
         verify(bm, times(1)).replicateBlock(
           mc.eq(storedBlockId1), mc.any(), mc.any(), mc.eq(Some(3)))
@@ -88,5 +92,7 @@ class BlockManagerDecommissionUnitSuite extends SparkFunSuite with Matchers {
     } finally {
       bmDecomManager.stop()
     }
+
+    bmDecomManager.stop()
   }
 }
