54
54
import org .apache .flink .runtime .jobgraph .JobVertex ;
55
55
import org .apache .flink .runtime .jobgraph .JobVertexID ;
56
56
import org .apache .flink .runtime .jobgraph .OperatorID ;
57
+ import org .apache .flink .runtime .jobmaster .TestingExecutionDeploymentTrackerWrapper ;
57
58
import org .apache .flink .runtime .jobmaster .event .ExecutionVertexFinishedEvent ;
58
59
import org .apache .flink .runtime .jobmaster .event .FileSystemJobEventStore ;
59
60
import org .apache .flink .runtime .jobmaster .event .JobEvent ;
@@ -159,6 +160,9 @@ public class BatchJobRecoveryTest {
159
160
private ScheduledExecutor delayedExecutor =
160
161
new ScheduledExecutorServiceAdapter (EXECUTOR_RESOURCE .getExecutor ());
161
162
163
+ private TestingExecutionDeploymentTrackerWrapper executionDeploymentTracker =
164
+ new TestingExecutionDeploymentTrackerWrapper ();
165
+
162
166
private static final OperatorID OPERATOR_ID = new OperatorID (1234L , 5678L );
163
167
private static final int NUM_SPLITS = 10 ;
164
168
private static final int SOURCE_PARALLELISM = 5 ;
@@ -216,6 +220,7 @@ void setUp() throws IOException {
216
220
217
221
this .serializedJobGraph = serializeJobGraph (createDefaultJobGraph ());
218
222
allPartitionWithMetrics .clear ();
223
+ executionDeploymentTracker = new TestingExecutionDeploymentTrackerWrapper ();
219
224
}
220
225
221
226
@ AfterEach
@@ -238,11 +243,14 @@ void testRecoverFromJMFailover() throws Exception {
238
243
239
244
runInMainThread (scheduler ::startScheduling );
240
245
246
+ waitUntilAllExecutionsDeployed (SOURCE_ID , scheduler );
241
247
runInMainThread (
242
248
() -> {
243
249
// transition all sources to finished.
244
250
transitionExecutionsState (scheduler , ExecutionState .FINISHED , SOURCE_ID );
245
251
});
252
+
253
+ waitUntilAllExecutionsDeployed (MIDDLE_ID , scheduler );
246
254
runInMainThread (
247
255
() -> {
248
256
// transition all middle tasks to RUNNING state
@@ -338,11 +346,14 @@ void testJobVertexUnFinishedAndOperatorCoordinatorNotSupportBatchSnapshot() thro
338
346
339
347
runInMainThread (scheduler ::startScheduling );
340
348
349
+ waitUntilAllExecutionsDeployed (SOURCE_ID , scheduler );
341
350
runInMainThread (
342
351
() -> {
343
352
// transition all sources to finished.
344
353
transitionExecutionsState (scheduler , ExecutionState .FINISHED , SOURCE_ID );
345
354
});
355
+
356
+ waitUntilAllExecutionsDeployed (MIDDLE_ID , scheduler );
346
357
runInMainThread (
347
358
() -> {
348
359
// transition first middle task to finished.
@@ -451,6 +462,7 @@ void testJobVertexFinishedAndOperatorCoordinatorNotSupportBatchSnapshotAndPartit
451
462
452
463
runInMainThread (scheduler ::startScheduling );
453
464
465
+ waitUntilAllExecutionsDeployed (SOURCE_ID , scheduler );
454
466
runInMainThread (
455
467
() -> {
456
468
// transition all sources to finished.
@@ -495,14 +507,13 @@ void testJobVertexFinishedAndOperatorCoordinatorNotSupportBatchSnapshotAndPartit
495
507
}
496
508
}
497
509
498
- for (ExecutionVertex taskVertex :
499
- getExecutionVertices (MIDDLE_ID , newScheduler .getExecutionGraph ())) {
500
- waitUntilExecutionVertexState (taskVertex , ExecutionState .DEPLOYING , 15000L );
501
- }
510
+ waitUntilAllExecutionsDeployed (MIDDLE_ID , newScheduler );
502
511
512
+ waitUntilAllExecutionsDeployed (MIDDLE_ID , scheduler );
503
513
runInMainThread (
504
514
() -> {
505
515
// transition all middle tasks to running
516
+ transitionExecutionsState (scheduler , ExecutionState .INITIALIZING , MIDDLE_ID );
506
517
transitionExecutionsState (scheduler , ExecutionState .RUNNING , MIDDLE_ID );
507
518
});
508
519
@@ -539,6 +550,7 @@ void testRecoverFromJMFailoverAndPartitionsUnavailable() throws Exception {
539
550
540
551
runInMainThread (scheduler ::startScheduling );
541
552
553
+ waitUntilAllExecutionsDeployed (SOURCE_ID , scheduler );
542
554
runInMainThread (
543
555
() -> {
544
556
// transition all sources to finished.
@@ -596,15 +608,20 @@ void testRecoverDecidedParallelismFromTheSameJobGraphInstance() throws Exception
596
608
597
609
runInMainThread (scheduler ::startScheduling );
598
610
611
+ waitUntilAllExecutionsDeployed (SOURCE_ID , scheduler );
599
612
runInMainThread (
600
613
() -> {
601
614
// transition all sources to finished.
602
615
transitionExecutionsState (scheduler , ExecutionState .FINISHED , SOURCE_ID );
603
616
});
617
+
618
+ waitUntilAllExecutionsDeployed (MIDDLE_ID , scheduler );
604
619
runInMainThread (
605
620
() -> { // transition all middle tasks to finished.
606
621
transitionExecutionsState (scheduler , ExecutionState .FINISHED , MIDDLE_ID );
607
622
});
623
+
624
+ waitUntilAllExecutionsDeployed (SINK_ID , scheduler );
608
625
runInMainThread (
609
626
() -> {
610
627
// transition all sinks to finished.
@@ -676,6 +693,7 @@ void testPartitionNotFoundTwiceAfterJMFailover() throws Exception {
676
693
});
677
694
678
695
// transition all sources to finished.
696
+ waitUntilAllExecutionsDeployed (SOURCE_ID , scheduler );
679
697
runInMainThread (
680
698
() -> transitionExecutionsState (scheduler , ExecutionState .FINISHED , SOURCE_ID ));
681
699
@@ -1124,6 +1142,7 @@ private AdaptiveBatchScheduler createScheduler(
1124
1142
jobGraph ,
1125
1143
mainThreadExecutor .getMainThreadExecutor (),
1126
1144
EXECUTOR_RESOURCE .getExecutor ())
1145
+ .setExecutionDeploymentTracker (executionDeploymentTracker )
1127
1146
.setRestartBackoffTimeStrategy (
1128
1147
new FixedDelayRestartBackoffTimeStrategy
1129
1148
.FixedDelayRestartBackoffTimeStrategyFactory (10 , 0 )
@@ -1212,4 +1231,30 @@ public Optional<ResourceID> storesLocalResourcesOn() {
1212
1231
};
1213
1232
}
1214
1233
}
1234
+
1235
+ private void waitUntilAllExecutionsDeployed (
1236
+ JobVertexID vertexId , AdaptiveBatchScheduler scheduler ) throws Exception {
1237
+ AtomicBoolean isAllExecutionDeployed = new AtomicBoolean (false );
1238
+
1239
+ while (!isAllExecutionDeployed .get ()) {
1240
+ runInMainThread (
1241
+ () -> {
1242
+ List <ExecutionAttemptID > attemptIds =
1243
+ Arrays .stream (
1244
+ scheduler
1245
+ .getExecutionJobVertex (vertexId )
1246
+ .getTaskVertices ())
1247
+ .map (ExecutionVertex ::getCurrentExecutionAttempt )
1248
+ .map (Execution ::getAttemptId )
1249
+ .collect (Collectors .toList ());
1250
+ if (!attemptIds .isEmpty ()
1251
+ && executionDeploymentTracker
1252
+ .getDeployedExecutions ()
1253
+ .containsAll (attemptIds )) {
1254
+ isAllExecutionDeployed .set (true );
1255
+ }
1256
+ });
1257
+ Thread .sleep (2 );
1258
+ }
1259
+ }
1215
1260
}
0 commit comments