Skip to content

Commit ab2bda5

Browse files
author
Giovanni Matteo Fumarola
committed
YARN-9428. Add metrics for paused containers in NodeManager. Contributed by Abhishek Modi.
1 parent da7f8c2 commit ab2bda5

File tree

3 files changed

+31
-0
lines changed
  • hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src

3 files changed

+31
-0
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ private ReInitializationContext createContextForRollback() {
161161
private final StringBuilder diagnostics;
162162
private final int diagnosticsMaxSize;
163163
private boolean wasLaunched;
164+
private boolean wasPaused;
164165
private long containerLocalizationStartTime;
165166
private long containerLaunchStartTime;
166167
private ContainerMetrics containerMetrics;
@@ -1541,6 +1542,7 @@ static class RecoveredContainerTransition extends ContainerTransition {
15411542
public void transition(ContainerImpl container, ContainerEvent event) {
15421543
container.sendContainerMonitorStartEvent();
15431544
container.wasLaunched = true;
1545+
container.setIsPaused(true);
15441546
}
15451547
}
15461548

@@ -1561,6 +1563,7 @@ public ExitedWithSuccessTransition(boolean clCleanupRequired) {
15611563
public void transition(ContainerImpl container, ContainerEvent event) {
15621564

15631565
container.setIsReInitializing(false);
1566+
container.setIsPaused(false);
15641567
// Set exit code to 0 on success
15651568
container.exitCode = 0;
15661569

@@ -1591,6 +1594,7 @@ public ExitedWithFailureTransition(boolean clCleanupRequired) {
15911594

15921595
@Override
15931596
public void transition(ContainerImpl container, ContainerEvent event) {
1597+
container.setIsPaused(false);
15941598
container.setIsReInitializing(false);
15951599
ContainerExitEvent exitEvent = (ContainerExitEvent) event;
15961600
container.exitCode = exitEvent.getExitCode();
@@ -1835,6 +1839,7 @@ static class KillTransition implements
18351839
public void transition(ContainerImpl container, ContainerEvent event) {
18361840
// Kill the process/process-grp
18371841
container.setIsReInitializing(false);
1842+
container.setIsPaused(false);
18381843
container.dispatcher.getEventHandler().handle(
18391844
new ContainersLauncherEvent(container,
18401845
ContainersLauncherEventType.CLEANUP_CONTAINER));
@@ -2080,6 +2085,8 @@ static class PausedContainerTransition implements
20802085
SingleArcTransition<ContainerImpl, ContainerEvent> {
20812086
@Override
20822087
public void transition(ContainerImpl container, ContainerEvent event) {
2088+
container.setIsPaused(true);
2089+
container.metrics.pausedContainer();
20832090
// Container was PAUSED so tell the scheduler
20842091
container.dispatcher.getEventHandler().handle(
20852092
new ContainerSchedulerEvent(container,
@@ -2096,6 +2103,7 @@ static class ResumeContainerTransition implements
20962103
SingleArcTransition<ContainerImpl, ContainerEvent> {
20972104
@Override
20982105
public void transition(ContainerImpl container, ContainerEvent event) {
2106+
container.setIsPaused(false);
20992107
// Resume the process/process-grp if it is supported by the container
21002108
container.dispatcher.getEventHandler().handle(
21012109
new ContainersLauncherEvent(container,
@@ -2154,6 +2162,13 @@ private static boolean shouldBeUploadedToSharedCache(ContainerImpl container,
21542162
return container.resourceSet.getResourcesUploadPolicies().get(resource);
21552163
}
21562164

2165+
private void setIsPaused(boolean paused) {
2166+
if (this.wasPaused && !paused) {
2167+
this.metrics.endPausedContainer();
2168+
}
2169+
this.wasPaused = paused;
2170+
}
2171+
21572172
@VisibleForTesting
21582173
ContainerRetryContext getContainerRetryContext() {
21592174
return containerRetryContext;

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ public class NodeManagerMetrics {
4444
@Metric("# of initializing containers")
4545
MutableGaugeInt containersIniting;
4646
@Metric MutableGaugeInt containersRunning;
47+
@Metric("# of paused containers") MutableGaugeInt containersPaused;
4748
@Metric("Current allocated memory in GB")
4849
MutableGaugeInt allocatedGB;
4950
@Metric("Current # of allocated containers")
@@ -168,6 +169,14 @@ public void endReInitingContainer() {
168169
containersReIniting.decr();
169170
}
170171

172+
public void pausedContainer() {
173+
containersPaused.incr();
174+
}
175+
176+
public void endPausedContainer() {
177+
containersPaused.decr();
178+
}
179+
171180
public void allocateContainer(Resource res) {
172181
allocatedContainers.incr();
173182
allocatedMB = allocatedMB + res.getMemorySize();
@@ -268,6 +277,10 @@ public int getRunningContainers() {
268277
return containersRunning.value();
269278
}
270279

280+
public int getPausedContainers() {
281+
return containersPaused.value();
282+
}
283+
271284
@VisibleForTesting
272285
public int getKilledContainers() {
273286
return containersKilled.value();

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,13 +246,16 @@ public void testContainerPauseAndResume() throws Exception {
246246
wc.initContainer();
247247
wc.localizeResources();
248248
int running = metrics.getRunningContainers();
249+
int paused = metrics.getPausedContainers();
249250
wc.launchContainer();
250251
assertEquals(running + 1, metrics.getRunningContainers());
251252
reset(wc.localizerBus);
252253
wc.pauseContainer();
253254
assertEquals(ContainerState.PAUSED,
254255
wc.c.getContainerState());
256+
assertEquals(paused + 1, metrics.getPausedContainers());
255257
wc.resumeContainer();
258+
assertEquals(paused, metrics.getPausedContainers());
256259
assertEquals(ContainerState.RUNNING,
257260
wc.c.getContainerState());
258261
wc.containerKilledOnRequest();

0 commit comments

Comments
 (0)