Skip to content

Commit 5dadf96

Browse files
committed
YARN-10208. Add capacityScheduler metric for NODE_UPDATE interval. Contributed by Pranjal Protim Borah.
1 parent 026dce5 commit 5dadf96

File tree

3 files changed

+32
-0
lines changed

3 files changed

+32
-0
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1828,6 +1828,7 @@ public void handle(SchedulerEvent event) {
18281828
case NODE_UPDATE:
18291829
{
18301830
NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent)event;
1831+
updateSchedulerNodeHBIntervalMetrics(nodeUpdatedEvent);
18311832
nodeUpdate(nodeUpdatedEvent.getRMNode());
18321833
}
18331834
break;
@@ -2114,6 +2115,19 @@ private void removeNode(RMNode nodeInfo) {
21142115
}
21152116
}
21162117

2118+
/**
 * Records the time elapsed since the last heartbeat of the node that
 * triggered the given NODE_UPDATE event as a sample of the
 * scheduler node heartbeat interval metric.
 *
 * The node is looked up in {@code nodeTracker} by the event's node ID;
 * when no scheduler node is found, no sample is recorded.
 *
 * NOTE(review): the NODE_UPDATE handler calls this before
 * {@code nodeUpdate(...)}; presumably nodeUpdate refreshes the node's
 * last-heartbeat timestamp, so the ordering matters - confirm.
 *
 * @param nodeUpdatedEvent the node update event whose RMNode identifies
 *                         the scheduler node to measure
 */
private void updateSchedulerNodeHBIntervalMetrics(
2119+
NodeUpdateSchedulerEvent nodeUpdatedEvent) {
2120+
// Add metrics for evaluating the time difference between heartbeats.
2121+
SchedulerNode node =
2122+
nodeTracker.getNode(nodeUpdatedEvent.getRMNode().getNodeID());
2123+
// The tracker may not know this node; skip the sample in that case.
if (node != null) {
2124+
// Elapsed monotonic time since the node's recorded last heartbeat
// (presumably milliseconds, per Time.monotonicNow - confirm).
long lastInterval =
2125+
Time.monotonicNow() - node.getLastHeartbeatMonotonicTime();
2126+
CapacitySchedulerMetrics.getMetrics()
2127+
.addSchedulerNodeHBInterval(lastInterval);
2128+
}
2129+
}
2130+
21172131
@Override
21182132
protected void completedContainerInternal(
21192133
RMContainer rmContainer, ContainerStatus containerStatus,

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerMetrics.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import org.apache.hadoop.metrics2.annotation.Metrics;
2727
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
2828
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
29+
import org.apache.hadoop.metrics2.lib.MutableQuantiles;
2930
import org.apache.hadoop.metrics2.lib.MutableRate;
3031

3132
import java.util.concurrent.atomic.AtomicBoolean;
@@ -49,6 +50,8 @@ public class CapacitySchedulerMetrics {
4950
@Metric("Scheduler commit success") MutableRate commitSuccess;
5051
@Metric("Scheduler commit failure") MutableRate commitFailure;
5152
@Metric("Scheduler node update") MutableRate nodeUpdate;
53+
@Metric("Scheduler node heartbeat interval") MutableQuantiles
54+
schedulerNodeHBInterval;
5255

5356
private static volatile CapacitySchedulerMetrics INSTANCE = null;
5457
private static MetricsRegistry registry;
@@ -116,4 +119,13 @@ public long getNumOfAllocates() {
116119
public long getNumOfCommitSuccess() {
117120
return this.commitSuccess.lastStat().numSamples();
118121
}
122+
123+
/**
 * Adds one sample to the "Scheduler node heartbeat interval" quantiles
 * metric.
 *
 * @param heartbeatInterval the interval value to record; the unit is
 *                          whatever the caller measured in (presumably
 *                          milliseconds - confirm against the caller)
 */
public void addSchedulerNodeHBInterval(long heartbeatInterval) {
124+
schedulerNodeHBInterval.add(heartbeatInterval);
125+
}
126+
127+
/**
 * Returns the sample count reported by the estimator backing the
 * scheduler node heartbeat interval quantiles metric.
 *
 * NOTE(review): the estimator may be cleared on each quantile rollover
 * window, in which case this is a per-window count rather than a
 * cumulative total - confirm before asserting exact values in tests.
 *
 * @return the estimator's current sample count
 */
@VisibleForTesting
128+
public long getNumOfSchedulerNodeHBInterval() {
129+
return this.schedulerNodeHBInterval.getEstimator().getCount();
130+
}
119131
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestCapacitySchedulerMetrics.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ public RMNodeLabelsManager createNodeLabelManager() {
7171
try {
7272
GenericTestUtils.waitFor(()
7373
-> csMetrics.getNumOfNodeUpdate() == 2, 100, 3000);
74+
GenericTestUtils
75+
.waitFor(() -> csMetrics.getNumOfSchedulerNodeHBInterval() == 2,
76+
100, 3000);
7477
} catch(TimeoutException e) {
7578
Assert.fail("CS metrics not updated on node-update events.");
7679
}
@@ -101,6 +104,9 @@ public RMNodeLabelsManager createNodeLabelManager() {
101104
// Verify HB metrics updated
102105
GenericTestUtils.waitFor(()
103106
-> csMetrics.getNumOfNodeUpdate() == 4, 100, 3000);
107+
GenericTestUtils
108+
.waitFor(() -> csMetrics.getNumOfSchedulerNodeHBInterval() == 4,
109+
100, 3000);
104110
// For async mode, the number of alloc might be bigger than 1
105111
Assert.assertTrue(csMetrics.getNumOfAllocates() > 0);
106112
// But there will be only 2 successful commit (1 AM + 1 task)

0 commit comments

Comments
 (0)