
Commit 8897d74

Shutdown executor once we are done decommissioning
1 parent ac096f4 commit 8897d74

9 files changed: +144, -20 lines changed

core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala

Lines changed: 0 additions & 2 deletions
@@ -158,8 +158,6 @@ private[deploy] object DeployMessages {

   case object ReregisterWithMaster // used when a worker attempts to reconnect to a master

-  case object DecommissionSelf // Mark as decommissioned. May be Master to Worker in the future.
-
   // AppClient to Master

   case class RegisterApplication(appDescription: ApplicationDescription, driver: RpcEndpointRef)

core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala

Lines changed: 1 addition & 1 deletion
@@ -668,7 +668,7 @@ private[deploy] class Worker(
       finishedApps += id
       maybeCleanupApplication(id)

-    case DecommissionSelf =>
+    case WorkerDecommission(_, _) =>
       decommissionSelf()
   }


core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala

Lines changed: 51 additions & 4 deletions
@@ -64,7 +64,6 @@ private[spark] class CoarseGrainedExecutorBackend(

   private[this] val stopping = new AtomicBoolean(false)
   var executor: Executor = null
-  @volatile private var decommissioned = false
   @volatile var driver: Option[RpcEndpointRef] = None

   // If this CoarseGrainedExecutorBackend is changed to support multiple threads, then this may need
@@ -80,6 +79,9 @@ private[spark] class CoarseGrainedExecutorBackend(
    */
   private[executor] val taskResources = new mutable.HashMap[Long, Map[String, ResourceInformation]]

+  // Track our decommissioning status internally.
+  @volatile private var decommissioned = false
+
   override def onStart(): Unit = {
     logInfo("Registering PWR handler.")
     SignalUtils.register("PWR", "Failed to register SIGPWR handler - " +
@@ -210,6 +212,10 @@ private[spark] class CoarseGrainedExecutorBackend(
     case UpdateDelegationTokens(tokenBytes) =>
       logInfo(s"Received tokens of ${tokenBytes.length} bytes")
       SparkHadoopUtil.get.addDelegationTokens(tokenBytes, env.conf)
+
+    case DecommissionSelf =>
+      logInfo("Received decommission self")
+      decommissionSelf()
   }

   override def onDisconnected(remoteAddress: RpcAddress): Unit = {
@@ -259,7 +265,7 @@ private[spark] class CoarseGrainedExecutorBackend(
   }

   private def decommissionSelf(): Boolean = {
-    logInfo("Decommissioning self w/sync")
+    logInfo("Decommissioning self")
     try {
       decommissioned = true
       // Tell master we are are decommissioned so it stops trying to schedule us
@@ -271,12 +277,53 @@ private[spark] class CoarseGrainedExecutorBackend(
       if (executor != null) {
         executor.decommission()
       }
-      logInfo("Done decommissioning self.")
+      // Shutdown the executor once all tasks are gone & any configured migrations completed.
+      // Detecting migrations completion doesn't need to be perfect and we want to minimize the
+      // overhead for executors that are not in decommissioning state as overall that will be
+      // more of the executors. For example, this will not catch a block which is already in
+      // the process of being put from a remote executor before migration starts. This trade-off
+      // is viewed as acceptable to minimize introduction of any new locking structures in critical
+      // code paths.
+
+      val shutdownThread = new Thread() {
+        var lastTaskRunningTime = System.nanoTime()
+        val sleep_time = 1000 // 1s
+
+        while (true) {
+          logInfo("Checking to see if we can shutdown.")
+          if (executor == null || executor.numRunningTasks == 0) {
+            if (env.conf.get(STORAGE_DECOMMISSION_ENABLED)) {
+              logInfo("No running tasks, checking migrations")
+              val allBlocksMigrated = env.blockManager.lastMigrationInfo()
+              // We can only trust allBlocksMigrated boolean value if there were no tasks running
+              // since the start of computing it.
+              if (allBlocksMigrated._2 &&
+                  (allBlocksMigrated._1 > lastTaskRunningTime)) {
+                logInfo("No running tasks, all blocks migrated, stopping.")
+                exitExecutor(0, "Finished decommissioning", notifyDriver = true)
+              } else {
+                logInfo("All blocks not yet migrated.")
+              }
+            } else {
+              logInfo("No running tasks, no block migration configured, stopping.")
+              exitExecutor(0, "Finished decommissioning", notifyDriver = true)
+            }
+            Thread.sleep(sleep_time)
+          } else {
+            logInfo("Blocked from shutdown by running task")
+            // If there is a running task it could store blocks, so make sure we wait for a
+            // migration loop to complete after the last task is done.
+            Thread.sleep(sleep_time)
+            lastTaskRunningTime = System.nanoTime()
+          }
+        }
+      }
+      logInfo("Will exit when finished decommissioning")
       // Return true since we are handling a signal
       true
     } catch {
       case e: Exception =>
-        logError(s"Error ${e} during attempt to decommission self")
+        logError("Unexpected error while decommissioning self", e)
         false
     }
   }
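Aside: the polling loop added above only lets the executor exit once two conditions hold: no tasks are running, and the most recent full migration pass finished after the last time a task was observed running. The sketch below is a minimal standalone Scala distillation of that exit condition; the names (MigrationInfo, canExit) are hypothetical and not part of this patch or Spark's API.

object DecommissionExitCheck {
  // Hypothetical stand-in for the (timestamp, allMigrated) pair returned by the block manager.
  final case class MigrationInfo(finishedAtNanos: Long, allBlocksMigrated: Boolean)

  def canExit(
      numRunningTasks: Int,
      lastTaskRunningTimeNanos: Long,
      storageDecommissionEnabled: Boolean,
      migration: MigrationInfo): Boolean = {
    if (numRunningTasks > 0) {
      // A running task may still produce new blocks, so the migration snapshot cannot be trusted.
      false
    } else if (!storageDecommissionEnabled) {
      // Nothing to migrate; safe to exit as soon as tasks are done.
      true
    } else {
      // Trust the snapshot only if it was computed after the last observed running task.
      migration.allBlocksMigrated && migration.finishedAtNanos > lastTaskRunningTimeNanos
    }
  }

  def main(args: Array[String]): Unit = {
    val lastTask = System.nanoTime()
    val snapshot = MigrationInfo(System.nanoTime(), allBlocksMigrated = true)
    println(canExit(0, lastTask, storageDecommissionEnabled = true, snapshot)) // prints true
  }
}

The start-of-pass timestamp comparison is what guards against a task that wrote new blocks after the migration snapshot was taken.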

core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala

Lines changed: 2 additions & 0 deletions
@@ -132,4 +132,6 @@ private[spark] object CoarseGrainedClusterMessages {
   // Used internally by executors to shut themselves down.
   case object Shutdown extends CoarseGrainedClusterMessage

+  // Used to ask an executor to decommission it's self.
+  case object DecommissionSelf extends CoarseGrainedClusterMessage
 }

core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala

Lines changed: 9 additions & 0 deletions
@@ -437,6 +437,15 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
       case e: Exception =>
         logError(s"Unexpected error during decommissioning ${e.toString}", e)
     }
+    // Send decommission message to the executor (it could have originated on the executor
+    // but not necessarily.
+    executorDataMap.get(executorId) match {
+      case Some(executorInfo) =>
+        executorInfo.executorEndpoint.send(DecommissionSelf)
+      case None =>
+        // Ignoring the executor since it is not registered.
+        logWarning(s"Attempted to decommission unknown executor $executorId.")
+    }
     logInfo(s"Finished decommissioning executor $executorId.")

     if (conf.get(STORAGE_DECOMMISSION_ENABLED)) {
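Aside: decommissioning may be initiated on either the driver or the executor, so the scheduler backend now always forwards a DecommissionSelf message to the executor endpoint if it is still registered, and only logs a warning otherwise. A standalone sketch of that lookup-and-forward pattern follows; EndpointRef, forward, and ForwardDecommission are hypothetical names, not Spark's RpcEnv API.

object ForwardDecommission {
  sealed trait ControlMessage
  case object DecommissionSelf extends ControlMessage

  // Hypothetical stand-in for an RPC endpoint reference.
  trait EndpointRef { def send(msg: ControlMessage): Unit }

  def forward(executorId: String, registered: Map[String, EndpointRef]): Unit = {
    registered.get(executorId) match {
      case Some(endpoint) => endpoint.send(DecommissionSelf)
      case None => println(s"Attempted to decommission unknown executor $executorId.")
    }
  }

  def main(args: Array[String]): Unit = {
    val endpoints = Map("exec-1" -> new EndpointRef {
      def send(msg: ControlMessage): Unit = println(s"exec-1 received $msg")
    })
    forward("exec-1", endpoints) // delivered to the registered endpoint
    forward("exec-2", endpoints) // unknown executor: warning path, nothing sent
  }
}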

core/src/main/scala/org/apache/spark/storage/BlockManager.scala

Lines changed: 8 additions & 0 deletions
@@ -1818,6 +1818,14 @@ private[spark] class BlockManager(
     }
   }

+  /*
+   * Returns the last migration time and a boolean for if all blocks have been migrated.
+   * If there are any tasks running since that time the boolean may be incorrect.
+   */
+  private[spark] def lastMigrationInfo(): (Long, Boolean) = {
+    decommissioner.map(_.lastMigrationInfo()).getOrElse((0, false))
+  }
+
   private[storage] def getMigratableRDDBlocks(): Seq[ReplicateBlock] =
     master.getReplicateInfoForRDDBlocks(blockManagerId)


core/src/main/scala/org/apache/spark/storage/BlockManagerDecommissioner.scala

Lines changed: 64 additions & 7 deletions
@@ -18,6 +18,7 @@
 package org.apache.spark.storage

 import java.util.concurrent.ExecutorService
+import java.util.concurrent.atomic.AtomicInteger

 import scala.collection.JavaConverters._
 import scala.collection.mutable
@@ -41,6 +42,10 @@ private[storage] class BlockManagerDecommissioner(
   private val maxReplicationFailuresForDecommission =
     conf.get(config.STORAGE_DECOMMISSION_MAX_REPLICATION_FAILURE_PER_BLOCK)

+  // This is only valid if there are no tasks running since lastMigrationTime
+  @volatile private[storage] var lastMigrationTime: Long = 0
+  @volatile private[storage] var allBlocksMigrated = false
+
   /**
    * This runnable consumes any shuffle blocks in the queue for migration. This part of a
    * producer/consumer where the main migration loop updates the queue of blocks to be migrated
@@ -90,7 +95,8 @@ private[storage] class BlockManagerDecommissioner(
               null)// class tag, we don't need for shuffle
             logDebug(s"Migrated sub block ${blockId}")
           }
-          logInfo(s"Migrated ${shuffleBlockInfo} to ${peer}")
+          logInfo(s"Migrated ${shuffleBlockInfo}")
+          numMigratedShuffles.incrementAndGet()
         }
       }
       // This catch is intentionally outside of the while running block.
@@ -111,6 +117,12 @@ private[storage] class BlockManagerDecommissioner(
   // Shuffles which are either in queue for migrations or migrated
   private val migratingShuffles = mutable.HashSet[ShuffleBlockInfo]()

+  // Shuffles which have migrated. This used to know when we are "done", being done can change
+  // if a new shuffle file is created by a running task.
+  private val numMigratedShuffles = new AtomicInteger(0)
+
+
+
   // Shuffles which are queued for migration
   private[storage] val shufflesToMigrate =
     new java.util.concurrent.ConcurrentLinkedQueue[ShuffleBlockInfo]()
@@ -123,6 +135,7 @@ private[storage] class BlockManagerDecommissioner(
   private lazy val blockMigrationExecutor =
     ThreadUtils.newDaemonSingleThreadExecutor("block-manager-decommission")

+
   private val blockMigrationRunnable = new Runnable {
     val sleepInterval = conf.get(config.STORAGE_DECOMMISSION_REPLICATION_REATTEMPT_INTERVAL)

@@ -133,21 +146,47 @@ private[storage] class BlockManagerDecommissioner(
           s"${config.STORAGE_RDD_DECOMMISSION_ENABLED.key}\n" +
           s"${config.STORAGE_SHUFFLE_DECOMMISSION_ENABLED.key}")
         stopped = true
+        allBlocksMigrated = true
       }
+      var blocksLeft = false
       while (!stopped && !Thread.interrupted()) {
        logInfo("Iterating on migrating from the block manager.")
        try {
+          val startMigrationTime = System.nanoTime()
          // If enabled we migrate shuffle blocks first as they are more expensive.
          if (conf.get(config.STORAGE_SHUFFLE_DECOMMISSION_ENABLED)) {
            logDebug("Attempting to replicate all shuffle blocks")
-            offloadShuffleBlocks()
-            logInfo("Done starting workers to migrate shuffle blocks")
+            blocksLeft = offloadShuffleBlocks()
+            logInfo(s"Done starting workers to migrate shuffle blocks ${blocksLeft}")
          }
          if (conf.get(config.STORAGE_RDD_DECOMMISSION_ENABLED)) {
            logDebug("Attempting to replicate all cached RDD blocks")
-            decommissionRddCacheBlocks()
+            val cacheBlocksLeft = decommissionRddCacheBlocks()
+            blocksLeft = blocksLeft || cacheBlocksLeft
            logInfo("Attempt to replicate all cached blocks done")
          }
+
+          // Only update the migration info if it block have not changed under us.
+          if (lastMigrationTime < startMigrationTime) {
+            lastMigrationTime = startMigrationTime
+            allBlocksMigrated = ! blocksLeft
+            logInfo(s"Updating migration info to ${startMigrationTime}, ${allBlocksMigrated}")
+          } else {
+            logInfo(s"Blocks changed under us (last migration time is ${lastMigrationTime})")
+            allBlocksMigrated = false
+          }
+
+          // Stop if we don't have any migrations configured.
+          if (!conf.get(config.STORAGE_RDD_DECOMMISSION_ENABLED) &&
+              !conf.get(config.STORAGE_SHUFFLE_DECOMMISSION_ENABLED)) {
+            logWarning("Decommissioning, but no task configured set one or both:\n" +
+              "spark.storage.decommission.shuffle_blocks\n" +
+              "spark.storage.decommission.rdd_blocks")
+            lastMigrationTime = System.nanoTime()
+            allBlocksMigrated = true
+            stopped = true
+          }
+
          logInfo(s"Waiting for ${sleepInterval} before refreshing migrations.")
          Thread.sleep(sleepInterval)
        } catch {
@@ -172,8 +211,9 @@ private[storage] class BlockManagerDecommissioner(
   * but rather shadows them.
   * Requires an Indexed based shuffle resolver.
   * Note: if called in testing please call stopOffloadingShuffleBlocks to avoid thread leakage.
+  * Returns true if we are not done migrating shuffle blocks.
   */
-  private[storage] def offloadShuffleBlocks(): Unit = {
+  private[storage] def offloadShuffleBlocks(): Boolean = {
    // Update the queue of shuffles to be migrated
    logInfo("Offloading shuffle blocks")
    val localShuffles = bm.migratableResolver.getStoredShuffles()
@@ -197,6 +237,8 @@ private[storage] class BlockManagerDecommissioner(
    deadPeers.foreach { peer =>
      migrationPeers.get(peer).foreach(_.running = false)
    }
+    // If we found any new shuffles to migrate or otherwise have not migrated everything.
+    newShufflesToMigrate.nonEmpty || migratingShuffles.size < numMigratedShuffles.get()
  }

  /**
@@ -213,16 +255,17 @@ private[storage] class BlockManagerDecommissioner(
  /**
   * Tries to offload all cached RDD blocks from this BlockManager to peer BlockManagers
   * Visible for testing
+   * Returns true if we have not migrated all of our RDD blocks.
   */
-  private[storage] def decommissionRddCacheBlocks(): Unit = {
+  private[storage] def decommissionRddCacheBlocks(): Boolean = {
    val replicateBlocksInfo = bm.getMigratableRDDBlocks()

    if (replicateBlocksInfo.nonEmpty) {
      logInfo(s"Need to replicate ${replicateBlocksInfo.size} RDD blocks " +
        "for block manager decommissioning")
    } else {
      logWarning(s"Asked to decommission RDD cache blocks, but no blocks to migrate")
-      return
+      return false
    }

    // TODO: We can sort these blocks based on some policy (LRU/blockSize etc)
@@ -234,7 +277,9 @@ private[storage] class BlockManagerDecommissioner(
    if (blocksFailedReplication.nonEmpty) {
      logWarning("Blocks failed replication in cache decommissioning " +
        s"process: ${blocksFailedReplication.mkString(",")}")
+      return true
    }
+    return false
  }

  private def migrateBlock(blockToReplicate: ReplicateBlock): Boolean = {
@@ -277,4 +322,16 @@ private[storage] class BlockManagerDecommissioner(
    logInfo("Stopping block migration thread")
    blockMigrationExecutor.shutdownNow()
  }
+
+  /*
+   * Returns the last migration time and a boolean for if all blocks have been migrated.
+   * If there are any tasks running since that time the boolean may be incorrect.
+   */
+  private[storage] def lastMigrationInfo(): (Long, Boolean) = {
+    if (stopped) {
+      (System.nanoTime(), true)
+    } else {
+      (lastMigrationTime, allBlocksMigrated)
+    }
+  }
 }
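Aside: the migration loop above stamps each pass with its start time and only publishes (lastMigrationTime, allBlocksMigrated) when nothing newer has already been published, which is what lets the executor-side shutdown thread trust the snapshot. The following minimal standalone sketch models that bookkeeping; the MigrationProgress class and recordPass name are hypothetical, not this file's API.

class MigrationProgress {
  @volatile private var lastMigrationTime: Long = 0L
  @volatile private var allBlocksMigrated: Boolean = false

  // runPass performs one shuffle/RDD offload pass and returns true if blocks are still left.
  def recordPass(runPass: () => Boolean): Unit = {
    val start = System.nanoTime()
    val blocksLeft = runPass()
    if (lastMigrationTime < start) {
      // Publish the snapshot stamped with the time the pass *started*, so a consumer can
      // compare it against the last time it saw a task running.
      lastMigrationTime = start
      allBlocksMigrated = !blocksLeft
    } else {
      // Something newer already published; be conservative and report "not done".
      allBlocksMigrated = false
    }
  }

  // Consumers compare the returned timestamp against their own last-task-running time.
  def lastMigrationInfo(): (Long, Boolean) = (lastMigrationTime, allBlocksMigrated)
}

object MigrationProgressExample {
  def main(args: Array[String]): Unit = {
    val progress = new MigrationProgress
    progress.recordPass(() => false) // pretend a pass migrated everything
    println(progress.lastMigrationInfo()) // (start-of-pass nano time, true)
  }
}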

core/src/test/scala/org/apache/spark/storage/BlockManagerDecommissionIntegrationSuite.scala

Lines changed: 2 additions & 5 deletions
@@ -59,7 +59,7 @@ class BlockManagerDecommissionIntegrationSuite extends SparkFunSuite with LocalS
       .set(config.STORAGE_SHUFFLE_DECOMMISSION_ENABLED, shuffle)
       // Just replicate blocks as fast as we can during testing, there isn't another
       // workload we need to worry about.
-      .set(config.STORAGE_DECOMMISSION_REPLICATION_REATTEMPT_INTERVAL, 1L)
+      .set(config.STORAGE_DECOMMISSION_REPLICATION_REATTEMPT_INTERVAL, 10L)

     sc = new SparkContext(master, "test", conf)

@@ -223,10 +223,7 @@ class BlockManagerDecommissionIntegrationSuite extends SparkFunSuite with LocalS
       assert(execIdToBlocksMapping.values.flatMap(_.keys).count(_.isRDD) === numParts)
     }

-    // Make the executor we decommissioned exit
-    sched.client.killExecutors(List(execToDecommission))
-
-    // Wait for the executor to be removed
+    // Wait for the executor to be removed automatically after migration.
     executorRemovedSem.acquire(1)

     // Since the RDD is cached or shuffled so further usage of same RDD should use the

core/src/test/scala/org/apache/spark/storage/BlockManagerDecommissionUnitSuite.scala

Lines changed: 7 additions & 1 deletion
@@ -38,6 +38,9 @@ class BlockManagerDecommissionUnitSuite extends SparkFunSuite with Matchers {
   private val sparkConf = new SparkConf(false)
     .set(config.STORAGE_SHUFFLE_DECOMMISSION_ENABLED, true)
     .set(config.STORAGE_RDD_DECOMMISSION_ENABLED, true)
+    // Just replicate blocks as fast as we can during testing, there isn't another
+    // workload we need to worry about.
+    .set(config.STORAGE_DECOMMISSION_REPLICATION_REATTEMPT_INTERVAL, 10L)

   private def registerShuffleBlocks(
       mockMigratableShuffleResolver: MigratableResolver,
@@ -77,7 +80,8 @@ class BlockManagerDecommissionUnitSuite extends SparkFunSuite with Matchers {
     try {
       bmDecomManager.start()

-      eventually(timeout(5.second), interval(10.milliseconds)) {
+      // We don't check that all blocks are migrated because out mock is always returning an RDD.
+      eventually(timeout(10.second), interval(10.milliseconds)) {
         assert(bmDecomManager.shufflesToMigrate.isEmpty == true)
         verify(bm, times(1)).replicateBlock(
           mc.eq(storedBlockId1), mc.any(), mc.any(), mc.eq(Some(3)))
@@ -88,5 +92,7 @@ class BlockManagerDecommissionUnitSuite extends SparkFunSuite with Matchers {
     } finally {
       bmDecomManager.stop()
     }
+
+    bmDecomManager.stop()
   }
 }
