Commit 9bb0293

Track decommissioning executors in the core dynamic scheduler so we don't scale down too low, and update the streaming ExecutorAllocationManager to also delegate to decommission.
Fix up executor add for resource profiles.
1 parent 7691d2d commit 9bb0293

7 files changed: 184 additions, 40 deletions

core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala

Lines changed: 20 additions & 1 deletion
@@ -87,9 +87,28 @@ private[spark] trait ExecutorAllocationClient {
    * if it supports graceful decommissioning.
    *
    * @param executorIds identifiers of executors to decommission
+   * @param adjustTargetNumExecutors whether the target number of executors will be adjusted down
+   *                                 after these executors have been decommissioned.
    * @return the ids of the executors acknowledged by the cluster manager to be removed.
    */
-  def decommissionExecutors(executorIds: Seq[String]): Seq[String]
+  def decommissionExecutors(
+    executorIds: Seq[String],
+    adjustTargetNumExecutors: Boolean): Seq[String]
+
+
+  /**
+   * Request that the cluster manager decommission the specified executors.
+   * Default implementation delegates to kill, scheduler must override
+   * if it supports graceful decommissioning.
+   *
+   * @param executorIds identifiers of executors to decommission
+   * @return whether the request is acknowledged by the cluster manager.
+   */
+  def decommissionExecutor(executorId: String): Boolean = {
+    val decommissionedExecutors = decommissionExecutors(Seq(executorId),
+      adjustTargetNumExecutors = true)
+    decommissionedExecutors.nonEmpty && decommissionedExecutors(0).equals(executorId)
+  }
 
   /**
    * Request that the cluster manager kill every executor on the specified host.
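
The snippet below is a standalone sketch (not part of the commit) of the delegation pattern this trait change introduces: a batch decommissionExecutors method plus a single-executor convenience wrapper that succeeds only when the batch call acknowledges the requested id. The trait name and the in-memory client are hypothetical stand-ins for ExecutorAllocationClient and a real scheduler backend.

// Standalone sketch of the batch-plus-convenience-wrapper pattern.
trait AllocationClientSketch {
  def decommissionExecutors(
      executorIds: Seq[String],
      adjustTargetNumExecutors: Boolean): Seq[String]

  // Single-executor wrapper with a default implementation, mirroring the diff:
  // it succeeds only when the batch call acknowledges exactly the requested id.
  def decommissionExecutor(executorId: String): Boolean = {
    val decommissioned = decommissionExecutors(Seq(executorId), adjustTargetNumExecutors = true)
    decommissioned.nonEmpty && decommissioned.head == executorId
  }
}

object AllocationClientSketchDemo extends App {
  val client = new AllocationClientSketch {
    private val alive = scala.collection.mutable.Set("exec-1", "exec-2")
    // Acknowledge only executors that are still alive, like the backend's isExecutorActive check.
    def decommissionExecutors(ids: Seq[String], adjustTargetNumExecutors: Boolean): Seq[String] =
      ids.filter(alive.remove)
  }
  println(client.decommissionExecutor("exec-1")) // true: acknowledged
  println(client.decommissionExecutor("exec-9")) // false: unknown executor, nothing acknowledged
}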

core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala

Lines changed: 9 additions & 3 deletions
@@ -529,7 +529,9 @@ private[spark] class ExecutorAllocationManager(
       // get the running total as we remove or initialize it to the count - pendingRemoval
       val newExecutorTotal = numExecutorsTotalPerRpId.getOrElseUpdate(rpId,
         (executorMonitor.executorCountWithResourceProfile(rpId) -
-          executorMonitor.pendingRemovalCountPerResourceProfileId(rpId)))
+          executorMonitor.pendingRemovalCountPerResourceProfileId(rpId) -
+          executorMonitor.pendingDecommissioningPerResourceProfileId(rpId)
+        ))
       if (newExecutorTotal - 1 < minNumExecutors) {
         logDebug(s"Not removing idle executor $executorIdToBeRemoved because there " +
           s"are only $newExecutorTotal executor(s) left (minimum number of executor limit " +
@@ -556,7 +558,7 @@
       // We don't want to change our target number of executors, because we already did that
       // when the task backlog decreased.
       if (conf.get(WORKER_DECOMMISSION_ENABLED)) {
-        client.decommissionExecutors(executorIdsToBeRemoved)
+        client.decommissionExecutors(executorIdsToBeRemoved, adjustTargetNumExecutors = false)
       } else {
         client.killExecutors(executorIdsToBeRemoved, adjustTargetNumExecutors = false,
           countFailures = false, force = false)
@@ -572,7 +574,11 @@
 
     // reset the newExecutorTotal to the existing number of executors
     if (testing || executorsRemoved.nonEmpty) {
-      executorMonitor.executorsKilled(executorsRemoved)
+      if (conf.get(WORKER_DECOMMISSION_ENABLED)) {
+        executorMonitor.executorsDecommissioned(executorsRemoved)
+      } else {
+        executorMonitor.executorsKilled(executorsRemoved)
+      }
       logInfo(s"Executors ${executorsRemoved.mkString(",")} removed due to idle timeout.")
       executorsRemoved
     } else {
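
A standalone arithmetic sketch (not Spark code) of why the running total now subtracts executors pending decommission as well as those pending removal before the minimum-executor check; all counts below are made up for illustration.

object MinExecutorFloorSketch extends App {
  val minNumExecutors = 1
  val executorCount = 5          // executors known for this resource profile
  val pendingRemoval = 1         // already being killed
  val pendingDecommissioning = 3 // already being decommissioned

  // Without counting decommissions the manager would think 4 executors remain and keep removing;
  // counting them shows only 1 effective executor is left, so removal must stop at the floor.
  val newExecutorTotal = executorCount - pendingRemoval - pendingDecommissioning
  if (newExecutorTotal - 1 < minNumExecutors) {
    println(s"Not removing: only $newExecutorTotal executor(s) effectively left (min $minNumExecutors)")
  } else {
    println(s"Safe to remove one more, $newExecutorTotal effectively left")
  }
}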

core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala

Lines changed: 40 additions & 29 deletions
@@ -415,13 +415,6 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
     scheduler.workerRemoved(workerId, host, message)
   }
 
-  /**
-   * Mark given executors as decommissioned and stop making resource offers for it.
-   */
-  private def decommissionExecutor(executorId: String): Boolean = {
-    (! decommissionExecutors(List(executorId)).isEmpty)
-  }
-
   /**
    * Stop making resource offers for the given executor. The executor is marked as lost with
    * the loss reason still pending.
@@ -457,30 +450,58 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
    * Request that the cluster manager decommission the specified executors.
    *
    * @param executorIds identifiers of executors to decommission
+   * @param adjustTargetNumExecutors whether the target number of executors will be adjusted down
+   *                                 after these executors have been decommissioned.
    * @return the ids of the executors acknowledged by the cluster manager to be removed.
    */
-  override def decommissionExecutors(executorIds: Seq[String]): Seq[String] = {
-    val executorsToDecommission = executorIds.filter{executorId =>
-      CoarseGrainedSchedulerBackend.this.synchronized {
-        // Only bother decommissioning executors which are alive.
-        if (isExecutorActive(executorId)) {
-          executorsPendingDecommission += executorId
-          true
-        } else {
-          false
+  override def decommissionExecutors(executorIds: Seq[String],
+      adjustTargetNumExecutors: Boolean): Seq[String] = {
+
+    withLock {
+      val executorsToDecommission = executorIds.filter{executorId =>
+        CoarseGrainedSchedulerBackend.this.synchronized {
+          // Only bother decommissioning executors which are alive.
+          if (isExecutorActive(executorId)) {
+            executorsPendingDecommission += executorId
+            true
+          } else {
+            false
+          }
         }
       }
-    }
 
-    executorsToDecommission.filter{executorId =>
-      doDecommission(executorId)
+      // If we don't want to replace the executors we are decommissioning
+      if (adjustTargetNumExecutors) {
+        executorsToDecommission.foreach { exec =>
+          val rpId = executorDataMap(exec).resourceProfileId
+          val rp = scheduler.sc.resourceProfileManager.resourceProfileFromId(rpId)
+          if (requestedTotalExecutorsPerResourceProfile.isEmpty) {
+            // Assume that we are killing an executor that was started by default and
+            // not through the request api
+            requestedTotalExecutorsPerResourceProfile(rp) = 0
+          } else {
+            val requestedTotalForRp = requestedTotalExecutorsPerResourceProfile(rp)
+            requestedTotalExecutorsPerResourceProfile(rp) = math.max(requestedTotalForRp - 1, 0)
+          }
+        }
+        doRequestTotalExecutors(requestedTotalExecutorsPerResourceProfile.toMap)
+      }
+
+      val decommissioned = executorsToDecommission.filter{executorId =>
+        doDecommission(executorId)
+      }
+      decommissioned
     }
   }
 
   private def doDecommission(executorId: String): Boolean = {
     logInfo(s"Starting decommissioning executor $executorId.")
     try {
       scheduler.executorDecommission(executorId)
+      if (driverEndpoint != null) {
+        logInfo("Propagating executor decommission to driver.")
+        driverEndpoint.send(DecommissionExecutor(executorId))
+      }
     } catch {
       case e: Exception =>
         logError(s"Unexpected error during decommissioning ${e.toString}", e)
@@ -609,16 +630,6 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
     driverEndpoint.send(RemoveWorker(workerId, host, message))
   }
 
-  /**
-   * Called by subclasses when notified of a decommissioning executor.
-   */
-  private[spark] def decommissionExecutor(executorId: String): Unit = {
-    if (driverEndpoint != null) {
-      logInfo("Propagating executor decommission to driver.")
-      driverEndpoint.send(DecommissionExecutor(executorId))
-    }
-  }
-
   def sufficientResourcesRegistered(): Boolean = true
 
   override def isReady(): Boolean = {
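
A standalone sketch (not Spark code) of the target-adjustment bookkeeping added to decommissionExecutors: when adjustTargetNumExecutors is true, the requested total for the executor's resource profile is decremented, never below zero, so the cluster manager does not immediately replace the decommissioned executor. The profile ids, counts, and the getOrElse simplification here are hypothetical.

object TargetAdjustmentSketch extends App {
  import scala.collection.mutable

  // Requested executor totals keyed by a made-up resource profile id.
  val requestedTotalPerProfile = mutable.Map[Int, Int](0 -> 3, 1 -> 2)
  val executorToProfile = Map("exec-1" -> 0, "exec-2" -> 1, "exec-3" -> 1)

  def decommission(execs: Seq[String], adjustTargetNumExecutors: Boolean): Unit = {
    if (adjustTargetNumExecutors) {
      execs.foreach { exec =>
        val rpId = executorToProfile(exec)
        val current = requestedTotalPerProfile.getOrElse(rpId, 0)
        // Lower the target so the cluster manager does not spin up a replacement.
        requestedTotalPerProfile(rpId) = math.max(current - 1, 0)
      }
      // A real backend would now push the new targets to the cluster manager,
      // analogous to doRequestTotalExecutors(requestedTotalPerProfile.toMap).
    }
  }

  decommission(Seq("exec-2", "exec-3"), adjustTargetNumExecutors = true)
  println(requestedTotalPerProfile) // Map(0 -> 3, 1 -> 0): profile 1's target dropped to zero
  decommission(Seq("exec-1"), adjustTargetNumExecutors = false)
  println(requestedTotalPerProfile) // unchanged: the caller already lowered the target elsewhere
}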

core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala

Lines changed: 32 additions & 2 deletions
@@ -114,7 +114,8 @@ private[spark] class ExecutorMonitor(
 
     var newNextTimeout = Long.MaxValue
     timedOutExecs = executors.asScala
-      .filter { case (_, exec) => !exec.pendingRemoval && !exec.hasActiveShuffle }
+      .filter { case (_, exec) =>
+        !exec.pendingRemoval && !exec.hasActiveShuffle && !exec.pendingDecommissioning}
       .filter { case (_, exec) =>
         val deadline = exec.timeoutAt
         if (deadline > now) {
@@ -150,6 +151,19 @@
     nextTimeout.set(Long.MinValue)
   }
 
+  private[spark] def executorsDecommissioned(ids: Seq[String]): Unit = {
+    ids.foreach { id =>
+      val tracker = executors.get(id)
+      if (tracker != null) {
+        tracker.pendingDecommissioning = true
+      }
+    }
+
+    // Recompute timed out executors in the next EAM callback, since this call invalidates
+    // the current list.
+    nextTimeout.set(Long.MinValue)
+  }
+
   def executorCount: Int = executors.size()
 
   def executorCountWithResourceProfile(id: Int): Int = {
@@ -172,6 +186,16 @@
     executors.asScala.filter { case (k, v) => v.resourceProfileId == id && v.pendingRemoval }.size
   }
 
+  def pendingDecommissioningCount: Int = executors.asScala.count { case (_, exec) =>
+    exec.pendingDecommissioning
+  }
+
+  def pendingDecommissioningPerResourceProfileId(id: Int): Int = {
+    executors.asScala.filter { case (k, v) =>
+      v.resourceProfileId == id && v.pendingDecommissioning
+    }.size
+  }
+
   override def onJobStart(event: SparkListenerJobStart): Unit = {
     if (!shuffleTrackingEnabled) {
       return
@@ -328,7 +352,7 @@
     val removed = executors.remove(event.executorId)
     if (removed != null) {
       decrementExecResourceProfileCount(removed.resourceProfileId)
-      if (!removed.pendingRemoval) {
+      if (!removed.pendingRemoval || !removed.pendingDecommissioning) {
         nextTimeout.set(Long.MinValue)
       }
     }
@@ -431,6 +455,11 @@
     executors.asScala.filter { case (_, exec) => exec.pendingRemoval }.keys.toSet
   }
 
+  // Visible for testing
+  def executorsDecommissioning(): Set[String] = {
+    executors.asScala.filter { case (_, exec) => exec.pendingDecommissioning }.keys.toSet
+  }
+
   /**
    * This method should be used when updating executor state. It guards against a race condition in
    * which the `SparkListenerTaskStart` event is posted before the `SparkListenerBlockManagerAdded`
@@ -483,6 +512,7 @@
     @volatile var timedOut: Boolean = false
 
     var pendingRemoval: Boolean = false
+    var pendingDecommissioning: Boolean = false
     var hasActiveShuffle: Boolean = false
 
     private var idleStart: Long = -1
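
A standalone sketch (not Spark code) of the new ExecutorMonitor bookkeeping: executors flagged pendingDecommissioning are excluded from idle-timeout candidates and counted per resource profile, mirroring pendingRemoval. The Tracker case class is a hypothetical stand-in for the monitor's internal per-executor state.

object MonitorSketch extends App {
  // Simplified per-executor state; the real monitor tracks much more.
  final case class Tracker(
      resourceProfileId: Int,
      var pendingRemoval: Boolean = false,
      var pendingDecommissioning: Boolean = false,
      var hasActiveShuffle: Boolean = false)

  val executors = scala.collection.mutable.Map(
    "exec-1" -> Tracker(0),
    "exec-2" -> Tracker(0, pendingDecommissioning = true),
    "exec-3" -> Tracker(1, pendingRemoval = true))

  // Count executors of a given profile that are already being decommissioned.
  def pendingDecommissioningPerResourceProfileId(id: Int): Int =
    executors.count { case (_, t) => t.resourceProfileId == id && t.pendingDecommissioning }

  // Only executors not already leaving (by kill or decommission) can time out as idle.
  def timeoutCandidates: Set[String] = executors.collect {
    case (id, t) if !t.pendingRemoval && !t.hasActiveShuffle && !t.pendingDecommissioning => id
  }.toSet

  println(pendingDecommissioningPerResourceProfileId(0)) // 1
  println(timeoutCandidates)                             // Set(exec-1)
}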

core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala

Lines changed: 70 additions & 1 deletion
@@ -28,6 +28,7 @@ import org.scalatest.PrivateMethodTester
 import org.apache.spark.executor.ExecutorMetrics
 import org.apache.spark.internal.config
 import org.apache.spark.internal.config.Tests.TEST_SCHEDULE_INTERVAL
+import org.apache.spark.internal.config.Worker.WORKER_DECOMMISSION_ENABLED
 import org.apache.spark.metrics.MetricsSystem
 import org.apache.spark.resource.{ExecutorResourceRequests, ResourceProfile, ResourceProfileBuilder, ResourceProfileManager, TaskResourceRequests}
 import org.apache.spark.resource.ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID
@@ -1101,6 +1102,68 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite {
     assert(executorsPendingToRemove(manager).size === 6) // limit reached (1 executor remaining)
   }
 
+  test("mock polling loop remove with decommissioning") {
+    val clock = new ManualClock(2020L)
+    val manager = createManager(createConf(1, 20, 1, true), clock = clock)
+
+    // Remove idle executors on timeout
+    onExecutorAddedDefaultProfile(manager, "executor-1")
+    onExecutorAddedDefaultProfile(manager, "executor-2")
+    onExecutorAddedDefaultProfile(manager, "executor-3")
+    assert(executorsDecommissioning(manager).isEmpty)
+    assert(executorsPendingToRemove(manager).isEmpty)
+
+    // idle threshold not reached yet
+    clock.advance(executorIdleTimeout * 1000 / 2)
+    schedule(manager)
+    assert(manager.executorMonitor.timedOutExecutors().isEmpty)
+    assert(executorsPendingToRemove(manager).isEmpty)
+    assert(executorsDecommissioning(manager).isEmpty)
+
+    // idle threshold exceeded
+    clock.advance(executorIdleTimeout * 1000)
+    assert(manager.executorMonitor.timedOutExecutors().size === 3)
+    schedule(manager)
+    assert(executorsPendingToRemove(manager).isEmpty) // limit reached (1 executor remaining)
+    assert(executorsDecommissioning(manager).size === 2) // limit reached (1 executor remaining)
+
+    // Mark a subset as busy - only idle executors should be removed
+    onExecutorAddedDefaultProfile(manager, "executor-4")
+    onExecutorAddedDefaultProfile(manager, "executor-5")
+    onExecutorAddedDefaultProfile(manager, "executor-6")
+    onExecutorAddedDefaultProfile(manager, "executor-7")
+    assert(manager.executorMonitor.executorCount === 7)
+    assert(executorsPendingToRemove(manager).isEmpty) // no pending to be removed
+    assert(executorsDecommissioning(manager).size === 2) // 2 decommissioning
+    onExecutorBusy(manager, "executor-4")
+    onExecutorBusy(manager, "executor-5")
+    onExecutorBusy(manager, "executor-6") // 3 busy and 2 idle (of the 5 active ones)
+
+    // after scheduling, the previously timed out executor should be removed, since
+    // there are new active ones.
+    schedule(manager)
+    assert(executorsDecommissioning(manager).size === 3)
+
+    // advance the clock so that idle executors should time out and move to the pending list
+    clock.advance(executorIdleTimeout * 1000)
+    schedule(manager)
+    assert(executorsPendingToRemove(manager).size === 0)
+    assert(executorsDecommissioning(manager).size === 4)
+    assert(!executorsDecommissioning(manager).contains("executor-4"))
+    assert(!executorsDecommissioning(manager).contains("executor-5"))
+    assert(!executorsDecommissioning(manager).contains("executor-6"))
+
+    // Busy executors are now idle and should be removed
+    onExecutorIdle(manager, "executor-4")
+    onExecutorIdle(manager, "executor-5")
+    onExecutorIdle(manager, "executor-6")
+    schedule(manager)
+    assert(executorsDecommissioning(manager).size === 4)
+    clock.advance(executorIdleTimeout * 1000)
+    schedule(manager)
+    assert(executorsDecommissioning(manager).size === 6) // limit reached (1 executor remaining)
+  }
+
   test("listeners trigger add executors correctly") {
     val manager = createManager(createConf(1, 20, 1))
     assert(addTime(manager) === NOT_SET)
@@ -1419,7 +1482,8 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite {
   private def createConf(
       minExecutors: Int = 1,
       maxExecutors: Int = 5,
-      initialExecutors: Int = 1): SparkConf = {
+      initialExecutors: Int = 1,
+      decommissioningEnabled: Boolean = false): SparkConf = {
     val sparkConf = new SparkConf()
       .set(config.DYN_ALLOCATION_ENABLED, true)
       .set(config.DYN_ALLOCATION_MIN_EXECUTORS, minExecutors)
@@ -1435,6 +1499,7 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite {
       // SPARK-22864: effectively disable the allocation schedule by setting the period to a
       // really long value.
       .set(TEST_SCHEDULE_INTERVAL, 10000L)
+      .set(WORKER_DECOMMISSION_ENABLED, decommissioningEnabled)
     sparkConf
   }
 
@@ -1501,6 +1566,10 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite {
   private def executorsPendingToRemove(manager: ExecutorAllocationManager): Set[String] = {
     manager.executorMonitor.executorsPendingToRemove()
   }
+
+  private def executorsDecommissioning(manager: ExecutorAllocationManager): Set[String] = {
+    manager.executorMonitor.executorsDecommissioning()
+  }
 }
 
 /**

streaming/src/main/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManager.scala

Lines changed: 6 additions & 1 deletion
@@ -23,6 +23,7 @@ import scala.util.Random
 import org.apache.spark.{ExecutorAllocationClient, SparkConf}
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config.Streaming._
+import org.apache.spark.internal.config.Worker.WORKER_DECOMMISSION_ENABLED
 import org.apache.spark.resource.ResourceProfile
 import org.apache.spark.streaming.util.RecurringTimer
 import org.apache.spark.util.{Clock, Utils}
@@ -133,7 +134,11 @@ private[streaming] class ExecutorAllocationManager(
     logDebug(s"Removable executors (${removableExecIds.size}): ${removableExecIds}")
     if (removableExecIds.nonEmpty) {
       val execIdToRemove = removableExecIds(Random.nextInt(removableExecIds.size))
-      client.killExecutor(execIdToRemove)
+      if (conf.get(WORKER_DECOMMISSION_ENABLED)) {
+        client.decommissionExecutor(execIdToRemove)
+      } else {
+        client.killExecutor(execIdToRemove)
+      }
       logInfo(s"Requested to kill executor $execIdToRemove")
     } else {
       logInfo(s"No non-receiver executors to kill")
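
A standalone sketch (not Spark code) of the config gate the streaming allocation manager now applies: with worker decommissioning enabled it asks the client to decommission the chosen executor, otherwise it falls back to the old kill path. The Client trait is a hypothetical stand-in for ExecutorAllocationClient.

object StreamingRemovalSketch extends App {
  trait Client {
    def decommissionExecutor(id: String): Boolean
    def killExecutor(id: String): Boolean
  }

  // Mirrors the new branch: graceful decommission when enabled, otherwise the legacy kill.
  def removeExecutor(client: Client, execId: String, decommissionEnabled: Boolean): Unit = {
    if (decommissionEnabled) client.decommissionExecutor(execId)
    else client.killExecutor(execId)
  }

  val logging = new Client {
    def decommissionExecutor(id: String): Boolean = { println(s"decommission $id"); true }
    def killExecutor(id: String): Boolean = { println(s"kill $id"); true }
  }
  removeExecutor(logging, "executor-3", decommissionEnabled = true)  // graceful path
  removeExecutor(logging, "executor-3", decommissionEnabled = false) // legacy kill path
}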
