@@ -18,6 +18,7 @@
 package org.apache.spark.storage
 
 import java.util.concurrent.ExecutorService
+import java.util.concurrent.atomic.AtomicInteger
 
 import scala.collection.JavaConverters._
 import scala.collection.mutable
@@ -41,6 +42,12 @@ private[storage] class BlockManagerDecommissioner(
   private val maxReplicationFailuresForDecommission =
     conf.get(config.STORAGE_DECOMMISSION_MAX_REPLICATION_FAILURE_PER_BLOCK)
 
+  // Used for tracking if our migrations are complete. Readable for testing
+  @volatile private[storage] var lastRDDMigrationTime: Long = 0
+  @volatile private[storage] var lastShuffleMigrationTime: Long = 0
+  @volatile private[storage] var rddBlocksLeft: Boolean = true
+  @volatile private[storage] var shuffleBlocksLeft: Boolean = true
+
   /**
    * This runnable consumes any shuffle blocks in the queue for migration. This part of a
    * producer/consumer where the main migration loop updates the queue of blocks to be migrated
@@ -91,10 +98,11 @@ private[storage] class BlockManagerDecommissioner(
                     null)// class tag, we don't need for shuffle
                   logDebug(s"Migrated sub block ${blockId}")
                 }
-                logInfo(s"Migrated ${shuffleBlockInfo} to ${peer}")
+                logDebug(s"Migrated ${shuffleBlockInfo} to ${peer}")
               } else {
                 logError(s"Skipping block ${shuffleBlockInfo} because it has failed ${retryCount}")
               }
+              numMigratedShuffles.incrementAndGet()
           }
         }
         // This catch is intentionally outside of the while running block.
@@ -115,12 +123,21 @@ private[storage] class BlockManagerDecommissioner(
   // Shuffles which are either in queue for migrations or migrated
   private val migratingShuffles = mutable.HashSet[ShuffleBlockInfo]()
 
+  // Shuffles which have migrated. This is used to know when we are "done"; being done can change
+  // if a new shuffle file is created by a running task.
+  private val numMigratedShuffles = new AtomicInteger(0)
+
   // Shuffles which are queued for migration & number of retries so far.
+  // Visible in storage for testing.
   private[storage] val shufflesToMigrate =
     new java.util.concurrent.ConcurrentLinkedQueue[(ShuffleBlockInfo, Int)]()
 
   // Set if we encounter an error attempting to migrate and stop.
   @volatile private var stopped = false
+  @volatile private var stoppedRDD =
+    !conf.get(config.STORAGE_DECOMMISSION_RDD_BLOCKS_ENABLED)
+  @volatile private var stoppedShuffle =
+    !conf.get(config.STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED)
 
   private val migrationPeers =
     mutable.HashMap[BlockManagerId, ShuffleMigrationRunnable]()
@@ -133,22 +150,31 @@ private[storage] class BlockManagerDecommissioner(
 
     override def run(): Unit = {
       assert(conf.get(config.STORAGE_DECOMMISSION_RDD_BLOCKS_ENABLED))
-      while (!stopped && !Thread.interrupted()) {
+      while (!stopped && !stoppedRDD && !Thread.interrupted()) {
         logInfo("Iterating on migrating from the block manager.")
+        // Validate we have peers to migrate to.
+        val peers = bm.getPeers(false)
+        // If we have no peers give up.
+        if (peers.isEmpty) {
+          stopped = true
+          stoppedRDD = true
+        }
         try {
+          val startTime = System.nanoTime()
           logDebug("Attempting to replicate all cached RDD blocks")
-          decommissionRddCacheBlocks()
+          rddBlocksLeft = decommissionRddCacheBlocks()
+          lastRDDMigrationTime = startTime
           logInfo("Attempt to replicate all cached blocks done")
           logInfo(s"Waiting for ${sleepInterval} before refreshing migrations.")
           Thread.sleep(sleepInterval)
         } catch {
           case e: InterruptedException =>
-            logInfo("Interrupted during migration, will not refresh migrations.")
-            stopped = true
+            logInfo("Interrupted during RDD migration, stopping")
+            stoppedRDD = true
           case NonFatal(e) =>
-            logError("Error occurred while trying to replicate for block manager decommissioning.",
+            logError("Error occurred replicating RDD for block manager decommissioning.",
              e)
-            stopped = true
+            stoppedRDD = true
         }
       }
     }
@@ -162,20 +188,22 @@ private[storage] class BlockManagerDecommissioner(
 
     override def run() {
       assert(conf.get(config.STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED))
-      while (!stopped && !Thread.interrupted()) {
+      while (!stopped && !stoppedShuffle && !Thread.interrupted()) {
         try {
           logDebug("Attempting to replicate all shuffle blocks")
-          refreshOffloadingShuffleBlocks()
+          val startTime = System.nanoTime()
+          shuffleBlocksLeft = refreshOffloadingShuffleBlocks()
+          lastShuffleMigrationTime = startTime
           logInfo("Done starting workers to migrate shuffle blocks")
           Thread.sleep(sleepInterval)
         } catch {
           case e: InterruptedException =>
             logInfo("Interrupted during migration, will not refresh migrations.")
-            stopped = true
+            stoppedShuffle = true
           case NonFatal(e) =>
             logError("Error occurred while trying to replicate for block manager decommissioning.",
              e)
-            stopped = true
+            stoppedShuffle = true
         }
       }
     }
@@ -191,8 +219,9 @@ private[storage] class BlockManagerDecommissioner(
    * but rather shadows them.
    * Requires an Indexed based shuffle resolver.
    * Note: if called in testing please call stopOffloadingShuffleBlocks to avoid thread leakage.
+   * Returns true if we are not done migrating shuffle blocks.
    */
-  private[storage] def refreshOffloadingShuffleBlocks(): Unit = {
+  private[storage] def refreshOffloadingShuffleBlocks(): Boolean = {
     // Update the queue of shuffles to be migrated
     logInfo("Offloading shuffle blocks")
     val localShuffles = bm.migratableResolver.getStoredShuffles().toSet
@@ -215,6 +244,12 @@ private[storage] class BlockManagerDecommissioner(
     deadPeers.foreach { peer =>
       migrationPeers.get(peer).foreach(_.running = false)
     }
+    // If we don't have anyone to migrate to give up
+    if (migrationPeers.values.find(_.running == true).isEmpty) {
+      stoppedShuffle = true
+    }
+    // If we found any new shuffles to migrate or otherwise have not migrated everything.
+    newShufflesToMigrate.nonEmpty || migratingShuffles.size > numMigratedShuffles.get()
   }
 
   /**
@@ -231,16 +266,18 @@ private[storage] class BlockManagerDecommissioner(
   /**
    * Tries to offload all cached RDD blocks from this BlockManager to peer BlockManagers
    * Visible for testing
+   * Returns true if we have not migrated all of our RDD blocks.
    */
-  private[storage] def decommissionRddCacheBlocks(): Unit = {
+  private[storage] def decommissionRddCacheBlocks(): Boolean = {
     val replicateBlocksInfo = bm.getMigratableRDDBlocks()
+    // Refresh peers and validate we have somewhere to move blocks.
 
     if (replicateBlocksInfo.nonEmpty) {
       logInfo(s"Need to replicate ${replicateBlocksInfo.size} RDD blocks " +
         "for block manager decommissioning")
     } else {
       logWarning(s"Asked to decommission RDD cache blocks, but no blocks to migrate")
-      return
+      return false
     }
 
     // TODO: We can sort these blocks based on some policy (LRU/blockSize etc)
@@ -252,7 +289,9 @@ private[storage] class BlockManagerDecommissioner(
     if (blocksFailedReplication.nonEmpty) {
       logWarning("Blocks failed replication in cache decommissioning " +
         s"process: ${blocksFailedReplication.mkString(",")}")
+      return true
     }
+    return false
   }
 
   private def migrateBlock(blockToReplicate: ReplicateBlock): Boolean = {
@@ -327,4 +366,33 @@ private[storage] class BlockManagerDecommissioner(
     }
     logInfo("Stopped storage decommissioner")
   }
+
+  /*
+   * Returns the last migration time and a boolean indicating if all blocks have been migrated.
+   * The last migration time is calculated as the minimum of the last migration time of any
+   * running migration (and if there are no currently running migrations it is set to current).
+   * This provides a timestamp such that, if there have been no tasks running since that time,
+   * we know that all blocks which could be migrated have been migrated off.
+   */
+  private[storage] def lastMigrationInfo(): (Long, Boolean) = {
+    if (stopped || (stoppedRDD && stoppedShuffle)) {
+      // Since we don't have anything left to migrate ever (since we don't restart once
+      // stopped), return that we're done with a validity timestamp that doesn't expire.
+      (Long.MaxValue, true)
+    } else {
+      // Choose the min of the active times. See the function description for more information.
+      val lastMigrationTime = if (!stoppedRDD && !stoppedShuffle) {
+        Math.min(lastRDDMigrationTime, lastShuffleMigrationTime)
+      } else if (!stoppedShuffle) {
+        lastShuffleMigrationTime
+      } else {
+        lastRDDMigrationTime
+      }
+
+      // Technically we could have blocks left if we encountered an error, but those blocks will
+      // never be migrated, so we don't care about them.
+      val blocksMigrated = (!shuffleBlocksLeft || stoppedShuffle) && (!rddBlocksLeft || stoppedRDD)
+      (lastMigrationTime, blocksMigrated)
+    }
+  }
 }
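
As a reading aid, here is a rough sketch (not part of the patch) of how a caller might consume the new `lastMigrationInfo()` hook to decide when it is safe to exit. The `readyToExit` helper and the `lastTaskStartTime` parameter are illustrative assumptions only, and access modifiers are ignored for brevity; the sketch assumes the caller records task start times with the same clock the decommissioner uses (`System.nanoTime()`).

```scala
// Hypothetical usage sketch; nothing here is defined by the diff above.
def readyToExit(
    decommissioner: BlockManagerDecommissioner,
    lastTaskStartTime: Long): Boolean = {
  val (lastMigrationTime, allBlocksMigrated) = decommissioner.lastMigrationInfo()
  // Safe to exit only if all known blocks are reported migrated and no task has
  // started since that snapshot, since a newer task could have produced fresh
  // shuffle files that still need to be migrated. When migration is fully
  // stopped, lastMigrationTime is Long.MaxValue, so this check always passes.
  allBlocksMigrated && lastMigrationTime > lastTaskStartTime
}
```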