
Commit 345fdd9

wangyum authored and GitHub Enterprise committed
[CARMEL-7377][CARMEL-5063] Improve spark dynamic allocation mechanism (apache#121)
1 parent d11af64 commit 345fdd9

File tree

8 files changed (+342, -15 lines changed)


core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala

Lines changed: 220 additions & 12 deletions
@@ -28,10 +28,10 @@ import com.codahale.metrics.{Counter, Gauge, MetricRegistry}
 import org.apache.spark.internal.{config, Logging}
 import org.apache.spark.internal.config._
 import org.apache.spark.internal.config.DECOMMISSION_ENABLED
-import org.apache.spark.internal.config.Tests.TEST_DYNAMIC_ALLOCATION_SCHEDULE_ENABLED
+import org.apache.spark.internal.config.Tests.{TEST_DYNAMIC_ALLOCATION_SCHEDULE_ENABLED, TEST_SCHEDULE_INTERVAL}
 import org.apache.spark.metrics.source.Source
+import org.apache.spark.resource.{ResourceProfile, ResourceProfileManager}
 import org.apache.spark.resource.ResourceProfile.UNKNOWN_RESOURCE_PROFILE_ID
-import org.apache.spark.resource.ResourceProfileManager
 import org.apache.spark.scheduler._
 import org.apache.spark.scheduler.dynalloc.ExecutorMonitor
 import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils}
@@ -104,7 +104,8 @@ private[spark] class ExecutorAllocationManager(
     cleaner: Option[ContextCleaner] = None,
     clock: Clock = new SystemClock(),
     resourceProfileManager: ResourceProfileManager,
-    reliableShuffleStorage: Boolean)
+    reliableShuffleStorage: Boolean,
+    taskScheduler: Option[TaskScheduler] = None)
   extends Logging {

   allocationManager =>
@@ -126,6 +127,10 @@ private[spark] class ExecutorAllocationManager(
   // During testing, the methods to actually kill and add executors are mocked out
   private val testing = conf.get(DYN_ALLOCATION_TESTING)

+  private val tasksPerExecutorForFullParallelism = resourceProfileManager
+    .resourceProfileFromId(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)
+    .maxTasksPerExecutor(conf)
+
   private val executorAllocationRatio =
     conf.get(DYN_ALLOCATION_EXECUTOR_ALLOCATION_RATIO)

@@ -151,7 +156,8 @@ private[spark] class ExecutorAllocationManager(
   private var addTime: Long = NOT_SET

   // Polling loop interval (ms)
-  private val intervalMillis: Long = 100
+  private val intervalMillis: Long =
+    if (Utils.isTesting) conf.get(TEST_SCHEDULE_INTERVAL) else 1000

   // Listener for Spark events that impact the allocation policy
   val listener = new ExecutorAllocationListener
@@ -180,6 +186,18 @@ private[spark] class ExecutorAllocationManager(
   // ResourceProfile id to Host to possible task running on it, used for executor placement.
   private var rpIdToHostToLocalTaskCount: Map[Int, Map[String, Int]] = Map.empty

+  private val removeExecutorsPerStats = conf.get(DYN_ALLOCATION_REMOVE_EXECUTOR_PER_STATS)
+  private val removeExecutorsPerStatsIntervalS =
+    conf.get(DYN_ALLOCATION_REMOVE_EXECUTOR_PER_STATS_INTERVAL)
+  // the executor number kept rate compared to system task stats
+  private val executorNumKeepRatio = 1.3
+  private[spark] val systemTaskStats: SystemTaskStats = conf.get(DYN_ALLOCATION_STATS_TYPE) match {
+    case "avg" => new SystemAvgTaskStats
+    case _ => new SystemMaxTaskStats
+  }
+
+  private var nextRemoveTime: Long = -1
+
   /**
    * Verify that the settings specified through the config are valid.
    * If not, throw an appropriate exception.
@@ -288,11 +306,9 @@ private[spark] class ExecutorAllocationManager(
    * under the current load to satisfy all running and pending tasks, rounded up.
    */
   private[spark] def maxNumExecutorsNeededPerResourceProfile(rpId: Int): Int = {
-    val pendingTask = listener.pendingTasksPerResourceProfile(rpId)
     val pendingSpeculative = listener.pendingSpeculativeTasksPerResourceProfile(rpId)
     val unschedulableTaskSets = listener.pendingUnschedulableTaskSetsPerResourceProfile(rpId)
-    val running = listener.totalRunningTasksPerResourceProfile(rpId)
-    val numRunningOrPendingTasks = pendingTask + pendingSpeculative + running
+    val numRunningOrPendingTasks = currTotalTaskNum(Some(rpId))
     val rp = resourceProfileManager.resourceProfileFromId(rpId)
     val tasksPerExecutor = rp.maxTasksPerExecutor(conf)
     logDebug(s"max needed for rpId: $rpId numpending: $numRunningOrPendingTasks," +
@@ -341,13 +357,56 @@ private[spark] class ExecutorAllocationManager(
       initializing = false
     }

+    val now = clock.nanoTime()
     // Update executor target number only after initializing flag is unset
-    updateAndSyncNumExecutorsTarget(clock.nanoTime())
-    if (executorIdsToBeRemoved.nonEmpty) {
-      removeExecutors(executorIdsToBeRemoved)
+    updateAndSyncNumExecutorsTarget(now)
+    if (removeExecutorsPerStats) {
+      executorIdsToBeRemoved.head._2
+      val currTaskNum = systemTaskStats.updateCurrTaskNum(now, currTotalTaskNum)
+      if (currTaskNum > 0) {
+        removeExecutorsBaseOnStats(now, currTaskNum)
+      }
+    } else if (executorIdsToBeRemoved.nonEmpty) {
+      removeExecutors(executorIdsToBeRemoved, "idle timeout")
+    }
+  }
+
+  private def currTotalTaskNum(rpId: Option[Int] = None): Int = {
+    taskScheduler.map(_.totalTasks()).getOrElse {
+      // The total task number in listener may not be accurate
+      // because of the possible event loss in the event queue
+      rpId.map { id =>
+        listener.pendingTasksPerResourceProfile(id) +
+          listener.pendingSpeculativeTasksPerResourceProfile(id) +
+          listener.totalRunningTasksPerResourceProfile(id)
+      }.getOrElse(listener.totalTasks())
     }
   }

+  private def removeExecutorsBaseOnStats(now: Long, latestTotalTaskNum: Int): Unit = {
+    if (now < nextRemoveTime) {
+      return
+    }
+    nextRemoveTime = now + TimeUnit.SECONDS.toNanos(removeExecutorsPerStatsIntervalS)
+    val consideredTotalTaskNum = systemTaskStats.lastFiveMinStat
+    var expectedExecutorCnt = math.ceil(consideredTotalTaskNum *
+      executorNumKeepRatio / tasksPerExecutorForFullParallelism).toInt
+
+    val currExecutorCnt = executorMonitor.executorCount
+    if (expectedExecutorCnt < minNumExecutors) {
+      expectedExecutorCnt = minNumExecutors
+    }
+    if (currExecutorCnt <= expectedExecutorCnt) {
+      return
+    }
+
+    val removeCnt = currExecutorCnt - expectedExecutorCnt
+    logInfo(s"Current executor cnt:$currExecutorCnt, " +
+      s"expected executor cnt:$expectedExecutorCnt, will remove $removeCnt executors")
+    val removeExecutorIDs = executorMonitor.executorsWithLeastRunningTasks(removeCnt)
+    removeExecutors(removeExecutorIDs, "less executors needed based on stats")
+  }
+
   /**
    * Updates our target number of executors for each ResourceProfile and then syncs the result
    * with the cluster manager.
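For intuition, removeExecutorsBaseOnStats sizes the fleet as ceil(lastFiveMinStat * executorNumKeepRatio / tasksPerExecutorForFullParallelism). A quick worked example in Scala with made-up numbers (a five-minute stat of 480 tasks and 4 tasks per executor; these values are not from the commit):

// Hypothetical inputs chosen only for illustration.
val lastFiveMinStat = 480
val executorNumKeepRatio = 1.3
val tasksPerExecutor = 4
val expectedExecutorCnt = math.ceil(lastFiveMinStat * executorNumKeepRatio / tasksPerExecutor).toInt
// expectedExecutorCnt == 156; with 200 live executors the manager would remove 200 - 156 = 44,
// picked via executorsWithLeastRunningTasks, at most once per removeExecutorPerSystemStats interval.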
@@ -533,7 +592,8 @@ private[spark] class ExecutorAllocationManager(
    * Request the cluster manager to remove the given executors.
    * Returns the list of executors which are removed.
    */
-  private def removeExecutors(executors: Seq[(String, Int)]): Seq[String] = synchronized {
+  private def removeExecutors(
+      executors: Seq[(String, Int)], reason: String): Seq[String] = synchronized {
     val executorIdsToBeRemoved = new ArrayBuffer[String]
     logDebug(s"Request to remove executorIds: ${executors.mkString(", ")}")
     val numExecutorsTotalPerRpId = mutable.Map[Int, Int]()
@@ -603,7 +663,7 @@ private[spark] class ExecutorAllocationManager(
       } else {
         executorMonitor.executorsKilled(executorsRemoved.toSeq)
       }
-      logInfo(s"Executors ${executorsRemoved.mkString(",")} removed due to idle timeout.")
+      logInfo(s"Executors ${executorsRemoved.mkString(",")} removed due to $reason.")
       executorsRemoved.toSeq
     } else {
       logWarning(s"Unable to reach the cluster manager to kill executor/s " +
@@ -936,6 +996,40 @@ private[spark] class ExecutorAllocationManager(
       }.sum
     }

+    /**
+     * An estimate of the total number of pending tasks remaining for currently running stages. Does
+     * not account for tasks which may have failed and been resubmitted.
+     *
+     * Note: This is not thread-safe without the caller owning the `allocationManager` lock.
+     */
+    def pendingTasks(): Int = {
+      stageAttemptToNumTasks.map { case (stageAttempt, numTasks) =>
+        numTasks - stageAttemptToTaskIndices.get(stageAttempt).map(_.size).getOrElse(0)
+      }.sum
+    }
+
+    def pendingSpeculativeTasks(): Int = {
+      stageAttemptToPendingSpeculativeTasks.map { case (sa, tasks) =>
+        tasks.size - stageAttemptToSpeculativeTaskIndices.get(sa).map(_.size).getOrElse(0)
+      }.sum
+    }
+
+    def totalPendingTasks(): Int = {
+      pendingTasks + pendingSpeculativeTasks
+    }
+
+    /**
+     * The number of tasks currently running across all stages.
+     * Include running-but-zombie stage attempts
+     */
+    def totalRunningTasks(): Int = {
+      stageAttemptToNumRunningTask.values.sum
+    }
+
+    def totalTasks(): Int = {
+      totalRunningTasks() + totalPendingTasks()
+    }
+
     /**
      * Update the Executor placement hints (the number of tasks with locality preferences,
      * a map where each pair is a node and the number of tasks that would like to be scheduled
@@ -1011,6 +1105,9 @@ private[spark] class ExecutorAllocationManagerSource(
       .map(executorAllocationManager.maxNumExecutorsNeededPerResourceProfile(_)).sum, 0)
   registerGauge("numberDecommissioningExecutors",
     executorAllocationManager.executorMonitor.decommissioningCount, 0)
+  registerGauge("lastOneMinStat", executorAllocationManager.systemTaskStats.lastOneMinStat, 0)
+  registerGauge("lastFiveMinStat", executorAllocationManager.systemTaskStats.lastFiveMinStat, 0)
+  registerGauge("currActiveCoresNum", executorAllocationManager.totalCores, 0)
 }

 private object ExecutorAllocationManager {
@@ -1020,3 +1117,114 @@ private object ExecutorAllocationManager {
   private[spark] case class TargetNumUpdates(delta: Int, oldNumExecutorsTarget: Int)

 }
+
+trait SystemTaskStats {
+  def updateCurrTaskNum(now: Long, taskNumCal: Option[Int] => Int): Int
+
+  def lastOneMinStat: Int
+
+  def lastFiveMinStat: Int
+
+  def lastFifteenMinStat: Int
+}
+
+class SystemMaxTaskStats extends SystemTaskStats {
+  val TICK_INTERVAL: Long = TimeUnit.SECONDS.toNanos(5)
+  private var nextTick: Long = ExecutorAllocationManager.NOT_SET
+
+  private final val SLOT_NUM = 180
+  private val taskNumArray = new Array[Int](SLOT_NUM)
+  private var currSlot: Int = 0
+
+  def updateCurrTaskNum(now: Long, taskNumCal: Option[Int] => Int): Int = {
+    if (nextTick == ExecutorAllocationManager.NOT_SET) {
+      nextTick = now
+    }
+    if (now < nextTick) return -1
+    val totalTaskNum = taskNumCal(None)
+    nextTick += TICK_INTERVAL
+    taskNumArray(currSlot) = totalTaskNum
+    currSlot += 1
+    if (currSlot > (SLOT_NUM - 1)) {
+      currSlot = 0
+    }
+    totalTaskNum
+  }
+
+  def lastOneMinStat: Int = {
+    calMaxTaskNumInPreviousSlots(12)
+  }
+
+  def lastFiveMinStat: Int = {
+    calMaxTaskNumInPreviousSlots(60)
+  }
+
+  def lastFifteenMinStat: Int = {
+    calMaxTaskNumInPreviousSlots(180)
+  }
+
+  def calMaxTaskNumInPreviousSlots(n: Int): Int = {
+    var maxTaskNum = 0
+    var nextSlot = currSlot
+    for (_ <- 1 to n) {
+      nextSlot = nextSlot - 1
+      if (nextSlot < 0) {
+        nextSlot = SLOT_NUM - 1
+      }
+      if (taskNumArray(nextSlot) > maxTaskNum) {
+        maxTaskNum = taskNumArray(nextSlot)
+      }
+    }
+    maxTaskNum
+  }
+}
+
+/**
+ * Moving average like unix load average
+ * @see <a href="
+ * https://www.helpsystems.com/resources/guides/unix-load-average-part-1-how-it-works">
+ * UNIX Load Average Part 1: How It Works</a>
+ */
+class SystemAvgTaskStats extends SystemTaskStats {
+  val EXP_1 = 1884 /* 1/exp(5sec/1min) as fixed-point */
+  val EXP_5 = 2014 /* 1/exp(5sec/5min) */
+  val EXP_15 = 2037 /* 1/exp(5sec/15min) */
+  val F_SHIFT = 11
+  val FIXED_1 = 1 << F_SHIFT
+
+  val TICK_INTERVAL: Long = TimeUnit.SECONDS.toNanos(5)
+
+  private var nextTick: Long = ExecutorAllocationManager.NOT_SET
+  private var _lastOneMinAvgTaskNum: Int = _
+  private var _lastFiveMinAvgTaskNum: Int = _
+  private var _lastFifteenMinAvgTaskNum: Int = _
+
+  def updateCurrTaskNum(now: Long, taskNumCal: Option[Int] => Int): Int = {
+    if (nextTick == ExecutorAllocationManager.NOT_SET) {
+      nextTick = now
+      val taskNum = taskNumCal(None)
+      _lastOneMinAvgTaskNum = taskNum
+      _lastFiveMinAvgTaskNum = taskNum
+      _lastFifteenMinAvgTaskNum = taskNum
+    }
+    if (now < nextTick) return -1
+    nextTick += TICK_INTERVAL
+    val taskNum = taskNumCal(None)
+    _lastOneMinAvgTaskNum = calNewLoad(_lastOneMinAvgTaskNum, EXP_1, taskNum)
+    _lastFiveMinAvgTaskNum = calNewLoad(_lastFiveMinAvgTaskNum, EXP_5, taskNum)
+    _lastFifteenMinAvgTaskNum = calNewLoad(_lastFifteenMinAvgTaskNum, EXP_15, taskNum)
+    taskNum
+  }
+
+  def calNewLoad(load: Int, exp: Int, newVal: Int): Int = {
+    var newLoad: Long = load
+    newLoad *= exp
+    newLoad += newVal * (FIXED_1 - exp)
+    newLoad = newLoad >> F_SHIFT
+    newLoad.toInt
+  }
+
+  def lastOneMinStat: Int = _lastOneMinAvgTaskNum
+  def lastFiveMinStat: Int = _lastFiveMinAvgTaskNum
+  def lastFifteenMinStat: Int = _lastFifteenMinAvgTaskNum
+}
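For the "avg" flavor, a self-contained Scala sketch of the fixed-point smoothing that calNewLoad performs; the constants mirror the diff, while the 5-second samples below are invented for illustration:

object LoadAvgSketch {
  // Fixed-point EMA in the style of the unix load average: FIXED_1 = 2048,
  // EXP_5 = 2014 ≈ FIXED_1 / exp(5 s / 5 min), so each tick keeps ~98.3% of the old value.
  val F_SHIFT = 11
  val FIXED_1: Int = 1 << F_SHIFT
  val EXP_5 = 2014

  def calNewLoad(load: Int, exp: Int, newVal: Int): Int =
    ((load.toLong * exp + newVal.toLong * (FIXED_1 - exp)) >> F_SHIFT).toInt

  def main(args: Array[String]): Unit = {
    // Invented cluster-wide task counts sampled every 5 seconds.
    val samples = Seq(400, 400, 400, 0, 0, 0)
    var fiveMinAvg = 0
    samples.foreach { s => fiveMinAvg = calNewLoad(fiveMinAvg, EXP_5, s) }
    println(fiveMinAvg) // a short burst of 400 tasks barely moves the 5-minute figure
  }
}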

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 2 additions & 1 deletion
@@ -689,7 +689,8 @@ class SparkContext(config: SparkConf) extends Logging {
           Some(new ExecutorAllocationManager(
             schedulerBackend.asInstanceOf[ExecutorAllocationClient], listenerBus, _conf,
             cleaner = cleaner, resourceProfileManager = resourceProfileManager,
-            reliableShuffleStorage = _shuffleDriverComponents.supportsReliableStorage()))
+            reliableShuffleStorage = _shuffleDriverComponents.supportsReliableStorage(),
+            taskScheduler = Some(_taskScheduler)))
         case _ =>
           None
       }

core/src/main/scala/org/apache/spark/internal/config/Tests.scala

Lines changed: 6 additions & 0 deletions
@@ -26,6 +26,12 @@ private[spark] object Tests {
     .longConf
     .createWithDefault(Runtime.getRuntime.maxMemory)

+  val TEST_SCHEDULE_INTERVAL =
+    ConfigBuilder("spark.testing.dynamicAllocation.scheduleInterval")
+      .version("3.5.0")
+      .longConf
+      .createWithDefault(100)
+
   val TEST_DYNAMIC_ALLOCATION_SCHEDULE_ENABLED =
     ConfigBuilder("spark.testing.dynamicAllocation.schedule.enabled")
       .version("3.1.0")

core/src/main/scala/org/apache/spark/internal/config/package.scala

Lines changed: 18 additions & 0 deletions
@@ -662,6 +662,24 @@ package object config {
     .version("1.2.0")
     .timeConf(TimeUnit.SECONDS).createWithDefault(1)

+  private[spark] val DYN_ALLOCATION_REMOVE_EXECUTOR_PER_STATS =
+    ConfigBuilder("spark.dynamicAllocation.removeExecutorPerSystemStats")
+      .version("3.5.0")
+      .booleanConf
+      .createWithDefault(false)
+
+  private[spark] val DYN_ALLOCATION_REMOVE_EXECUTOR_PER_STATS_INTERVAL =
+    ConfigBuilder("spark.dynamicAllocation.removeExecutorPerSystemStats.interval")
+      .version("3.5.0")
+      .timeConf(TimeUnit.SECONDS).createWithDefault(300)
+
+  private[spark] val DYN_ALLOCATION_STATS_TYPE =
+    ConfigBuilder("spark.dynamicAllocation.system.stats.type")
+      .version("3.5.0")
+      .stringConf
+      .checkValues(Set("max", "avg"))
+      .createWithDefault("max")
+
   private[spark] val DYN_ALLOCATION_SUSTAINED_SCHEDULER_BACKLOG_TIMEOUT =
     ConfigBuilder("spark.dynamicAllocation.sustainedSchedulerBacklogTimeout")
       .version("1.2.0")
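A sketch of how the new knobs might be set on a driver, assuming the usual dynamic-allocation prerequisites (shuffle tracking or an external shuffle service) are already in place; only the keys come from this diff, the values are examples:

import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf()
  .setAppName("stats-based-downscaling-sketch")
  .set("spark.dynamicAllocation.enabled", "true")
  // Switch executor removal from the idle-timeout path to the system-stats path.
  .set("spark.dynamicAllocation.removeExecutorPerSystemStats", "true")
  // Re-evaluate the expected executor count every 5 minutes (the default above).
  .set("spark.dynamicAllocation.removeExecutorPerSystemStats.interval", "300s")
  // "max" keeps the peak recent task count; "avg" uses the load-average smoothing.
  .set("spark.dynamicAllocation.system.stats.type", "max")
val sc = new SparkContext(conf)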

core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala

Lines changed: 10 additions & 0 deletions
@@ -1332,6 +1332,16 @@ private[spark] class TaskSchedulerImpl(
     }
   }

+  override def totalTasks(): Int = {
+    var totalTaskNum = 0
+    for (taskSet <- rootPool.getTaskSetQueue) {
+      if (!taskSet.isZombie) {
+        totalTaskNum += (taskSet.numTasks - taskSet.successfulTasks)
+      }
+    }
+    totalTaskNum
+  }
+
   override def taskSummary(): TaskSummary = {
     var totalTasks = 0
     var runningTasks = 0
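For intuition only, a toy Scala version of the same count over stand-in task-set snapshots (the case class below is hypothetical, not the real TaskSetManager API):

// Hypothetical stand-in for the fields totalTasks() reads off each task set.
case class TaskSetSnapshot(numTasks: Int, successfulTasks: Int, isZombie: Boolean)

val queue = Seq(
  TaskSetSnapshot(numTasks = 200, successfulTasks = 150, isZombie = false), // 50 remain
  TaskSetSnapshot(numTasks = 80, successfulTasks = 80, isZombie = true),    // zombie, skipped
  TaskSetSnapshot(numTasks = 40, successfulTasks = 0, isZombie = false))    // 40 remain

val totalTasks = queue.filterNot(_.isZombie).map(t => t.numTasks - t.successfulTasks).sum
// totalTasks == 90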
