Skip to content

Commit

Permalink
YARN-4968. A couple of AM retry unit tests need to wait SchedulerAppl…
Browse files Browse the repository at this point in the history
…icationAttempt stopped. (Wangda Tan via gtcarrera9)
  • Loading branch information
gtCarrera committed Apr 21, 2016
1 parent a749ba0 commit 7c6339f
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.TestSchedulerUtils;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
Expand Down Expand Up @@ -2328,6 +2329,8 @@ public void testRMRestartAfterPreemption() throws Exception {
// start RM
MockRM rm1 = new MockRM(conf, memStore);
rm1.start();
CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();

MockNM nm1 =
new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
nm1.registerNode();
Expand All @@ -2338,17 +2341,21 @@ public void testRMRestartAfterPreemption() throws Exception {
nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 1,
ContainerState.COMPLETE);
am0.waitForState(RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(cs,
am0.getApplicationAttemptId());

for (int i = 0; i < 4; i++) {
am0 = MockRM.launchAM(app0, rm1, nm1);
am0.registerAppAttempt();
CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
// get scheduler app
FiCaSchedulerApp schedulerAppAttempt = cs.getSchedulerApplications()
.get(app0.getApplicationId()).getCurrentAppAttempt();
// kill app0-attempt
cs.markContainerForKillable(schedulerAppAttempt.getRMContainer(
app0.getCurrentAppAttempt().getMasterContainer().getId()));
am0.waitForState(RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(cs,
am0.getApplicationAttemptId());
}
am0 = MockRM.launchAM(app0, rm1, nm1);
am0.registerAppAttempt();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.TestSchedulerUtils;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
Expand Down Expand Up @@ -568,6 +569,9 @@ public void testShouldNotCountFailureToMaxAttemptRetry() throws Exception {
scheduler.markContainerForKillable(scheduler.getRMContainer(amContainer));

am1.waitForState(RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler,
am1.getApplicationAttemptId());

Assert.assertTrue(! attempt1.shouldCountTowardsMaxAttemptRetry());
rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
ApplicationStateData appState =
Expand All @@ -584,6 +588,9 @@ public void testShouldNotCountFailureToMaxAttemptRetry() throws Exception {
scheduler.markContainerForKillable(scheduler.getRMContainer(amContainer2));

am2.waitForState(RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler,
am2.getApplicationAttemptId());

Assert.assertTrue(! attempt2.shouldCountTowardsMaxAttemptRetry());
rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
MockAM am3 =
Expand All @@ -604,6 +611,9 @@ public void testShouldNotCountFailureToMaxAttemptRetry() throws Exception {
nm1.nodeHeartbeat(conts, true);

am3.waitForState(RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler,
am3.getApplicationAttemptId());

Assert.assertTrue(! attempt3.shouldCountTowardsMaxAttemptRetry());
Assert.assertEquals(ContainerExitStatus.DISKS_FAILED,
appState.getAttempt(am3.getApplicationAttemptId())
Expand All @@ -623,6 +633,9 @@ public void testShouldNotCountFailureToMaxAttemptRetry() throws Exception {
// This will mimic ContainerExitStatus.ABORT
nm1.nodeHeartbeat(false);
am4.waitForState(RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler,
am4.getApplicationAttemptId());

Assert.assertTrue(! attempt4.shouldCountTowardsMaxAttemptRetry());
Assert.assertEquals(ContainerExitStatus.ABORTED,
appState.getAttempt(am4.getApplicationAttemptId())
Expand All @@ -637,6 +650,9 @@ public void testShouldNotCountFailureToMaxAttemptRetry() throws Exception {
nm2
.nodeHeartbeat(am5.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
am5.waitForState(RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler,
am5.getApplicationAttemptId());

Assert.assertTrue(attempt5.shouldCountTowardsMaxAttemptRetry());

// AM should not be restarted.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppRemovedSchedulerEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
Expand Down Expand Up @@ -775,6 +777,27 @@ public void testNormalizeNodeLabelExpression()
}
}

public static void waitSchedulerApplicationAttemptStopped(CapacityScheduler cs,
ApplicationAttemptId attemptId) throws InterruptedException {
FiCaSchedulerApp schedulerApp = cs.getApplicationAttempt(attemptId);
if (null == schedulerApp) {
return;
}

// Wait at most 5 secs to make sure SchedulerApplicationAttempt stopped
int tick = 0;
while (tick < 100) {
if (schedulerApp.isStopped()) {
return;
}
tick++;
Thread.sleep(50);
}

// Only print, don't throw exception
System.err.println("Failed to wait scheduler application attempt stopped.");
}

public static SchedulerApplication<SchedulerApplicationAttempt>
verifyAppAddedAndRemovedFromScheduler(
Map<ApplicationId, SchedulerApplication<SchedulerApplicationAttempt>> applications,
Expand Down

0 comments on commit 7c6339f

Please sign in to comment.