Skip to content

Commit ec33119

Browse files
virajjasani authored and jojochuang committed
HADOOP-16290. Enable RpcMetrics units to be configurable (#3198)
Signed-off-by: Akira Ajisaka <aajisaka@apache.org> (cherry picked from commit e1d00ad)
1 parent 177569f commit ec33119

File tree

8 files changed

+133
-21
lines changed

8 files changed

+133
-21
lines changed

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,9 @@ public class CommonConfigurationKeys extends CommonConfigurationKeysPublic {
378378
public static final boolean RPC_METRICS_QUANTILE_ENABLE_DEFAULT = false;
379379
public static final String RPC_METRICS_PERCENTILES_INTERVALS_KEY =
380380
"rpc.metrics.percentiles.intervals";
381-
381+
382+
public static final String RPC_METRICS_TIME_UNIT = "rpc.metrics.timeunit";
383+
382384
/** Allowed hosts for nfs exports */
383385
public static final String NFS_EXPORTS_ALLOWED_HOSTS_SEPARATOR = ";";
384386
public static final String NFS_EXPORTS_ALLOWED_HOSTS_KEY = "nfs.exports.allowed.hosts";

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/DecayRpcScheduler.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ public class DecayRpcScheduler implements RpcScheduler,
178178
private final String namespace;
179179
private final int topUsersCount; // e.g., report top 10 users' metrics
180180
private static final double PRECISION = 0.0001;
181+
private final TimeUnit metricsTimeUnit;
181182
private MetricsProxy metricsProxy;
182183
private final CostProvider costProvider;
183184

@@ -249,6 +250,8 @@ public DecayRpcScheduler(int numLevels, String ns, Configuration conf) {
249250
DecayRpcSchedulerDetailedMetrics.create(ns);
250251
decayRpcSchedulerDetailedMetrics.init(numLevels);
251252

253+
metricsTimeUnit = RpcMetrics.getMetricsTimeUnit(conf);
254+
252255
// Setup delay timer
253256
Timer timer = new Timer(true);
254257
DecayTask task = new DecayTask(this, timer);
@@ -676,8 +679,9 @@ public void addResponseTime(String callName, Schedulable schedulable,
676679
addCost(user, processingCost);
677680

678681
int priorityLevel = schedulable.getPriorityLevel();
679-
long queueTime = details.get(Timing.QUEUE, RpcMetrics.TIMEUNIT);
680-
long processingTime = details.get(Timing.PROCESSING, RpcMetrics.TIMEUNIT);
682+
long queueTime = details.get(Timing.QUEUE, metricsTimeUnit);
683+
long processingTime = details.get(Timing.PROCESSING,
684+
metricsTimeUnit);
681685

682686
this.decayRpcSchedulerDetailedMetrics.addQueueTime(
683687
priorityLevel, queueTime);

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RpcScheduler.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,10 @@ default void addResponseTime(String callName, Schedulable schedulable,
6262
// this interface, a default implementation is supplied which uses the old
6363
// method. All new implementations MUST override this interface and should
6464
// NOT use the other addResponseTime method.
65-
int queueTime = (int)
66-
details.get(ProcessingDetails.Timing.QUEUE, RpcMetrics.TIMEUNIT);
67-
int processingTime = (int)
68-
details.get(ProcessingDetails.Timing.PROCESSING, RpcMetrics.TIMEUNIT);
65+
int queueTime = (int) details.get(ProcessingDetails.Timing.QUEUE,
66+
RpcMetrics.DEFAULT_METRIC_TIME_UNIT);
67+
int processingTime = (int) details.get(ProcessingDetails.Timing.PROCESSING,
68+
RpcMetrics.DEFAULT_METRIC_TIME_UNIT);
6969
addResponseTime(callName, schedulable.getPriorityLevel(),
7070
queueTime, processingTime);
7171
}

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -542,13 +542,13 @@ void logSlowRpcCalls(String methodName, Call call,
542542
(rpcMetrics.getProcessingStdDev() * deviation);
543543

544544
long processingTime =
545-
details.get(Timing.PROCESSING, RpcMetrics.TIMEUNIT);
545+
details.get(Timing.PROCESSING, rpcMetrics.getMetricsTimeUnit());
546546
if ((rpcMetrics.getProcessingSampleCount() > minSampleSize) &&
547547
(processingTime > threeSigma)) {
548548
LOG.warn(
549549
"Slow RPC : {} took {} {} to process from client {},"
550550
+ " the processing detail is {}",
551-
methodName, processingTime, RpcMetrics.TIMEUNIT, call,
551+
methodName, processingTime, rpcMetrics.getMetricsTimeUnit(), call,
552552
details.toString());
553553
rpcMetrics.incrSlowRpc();
554554
}
@@ -568,7 +568,7 @@ void updateMetrics(Call call, long startTime, boolean connDropped) {
568568
deltaNanos -= details.get(Timing.RESPONSE);
569569
details.set(Timing.HANDLER, deltaNanos);
570570

571-
long queueTime = details.get(Timing.QUEUE, RpcMetrics.TIMEUNIT);
571+
long queueTime = details.get(Timing.QUEUE, rpcMetrics.getMetricsTimeUnit());
572572
rpcMetrics.addRpcQueueTime(queueTime);
573573

574574
if (call.isResponseDeferred() || connDropped) {
@@ -577,9 +577,9 @@ void updateMetrics(Call call, long startTime, boolean connDropped) {
577577
}
578578

579579
long processingTime =
580-
details.get(Timing.PROCESSING, RpcMetrics.TIMEUNIT);
580+
details.get(Timing.PROCESSING, rpcMetrics.getMetricsTimeUnit());
581581
long waitTime =
582-
details.get(Timing.LOCKWAIT, RpcMetrics.TIMEUNIT);
582+
details.get(Timing.LOCKWAIT, rpcMetrics.getMetricsTimeUnit());
583583
rpcMetrics.addRpcLockWaitTime(waitTime);
584584
rpcMetrics.addRpcProcessingTime(processingTime);
585585
// don't include lock wait for detailed metrics.

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/metrics/RpcMetrics.java

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import java.util.concurrent.TimeUnit;
2121

22+
import org.apache.commons.lang3.StringUtils;
2223
import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting;
2324
import org.apache.hadoop.fs.CommonConfigurationKeys;
2425
import org.apache.hadoop.ipc.Server;
@@ -48,9 +49,12 @@ public class RpcMetrics {
4849
final MetricsRegistry registry;
4950
final String name;
5051
final boolean rpcQuantileEnable;
52+
53+
public static final TimeUnit DEFAULT_METRIC_TIME_UNIT =
54+
TimeUnit.MILLISECONDS;
5155
/** The time unit used when storing/accessing time durations. */
52-
public final static TimeUnit TIMEUNIT = TimeUnit.MILLISECONDS;
53-
56+
private final TimeUnit metricsTimeUnit;
57+
5458
RpcMetrics(Server server, Configuration conf) {
5559
String port = String.valueOf(server.getListenerAddress().getPort());
5660
name = "RpcActivityForPort" + port;
@@ -63,6 +67,7 @@ public class RpcMetrics {
6367
rpcQuantileEnable = (intervals.length > 0) && conf.getBoolean(
6468
CommonConfigurationKeys.RPC_METRICS_QUANTILE_ENABLE,
6569
CommonConfigurationKeys.RPC_METRICS_QUANTILE_ENABLE_DEFAULT);
70+
metricsTimeUnit = getMetricsTimeUnit(conf);
6671
if (rpcQuantileEnable) {
6772
rpcQueueTimeQuantiles =
6873
new MutableQuantiles[intervals.length];
@@ -75,19 +80,19 @@ public class RpcMetrics {
7580
for (int i = 0; i < intervals.length; i++) {
7681
int interval = intervals[i];
7782
rpcQueueTimeQuantiles[i] = registry.newQuantiles("rpcQueueTime"
78-
+ interval + "s", "rpc queue time in " + TIMEUNIT, "ops",
83+
+ interval + "s", "rpc queue time in " + metricsTimeUnit, "ops",
7984
"latency", interval);
8085
rpcLockWaitTimeQuantiles[i] = registry.newQuantiles(
8186
"rpcLockWaitTime" + interval + "s",
82-
"rpc lock wait time in " + TIMEUNIT, "ops",
87+
"rpc lock wait time in " + metricsTimeUnit, "ops",
8388
"latency", interval);
8489
rpcProcessingTimeQuantiles[i] = registry.newQuantiles(
8590
"rpcProcessingTime" + interval + "s",
86-
"rpc processing time in " + TIMEUNIT, "ops",
91+
"rpc processing time in " + metricsTimeUnit, "ops",
8792
"latency", interval);
8893
deferredRpcProcessingTimeQuantiles[i] = registry.newQuantiles(
8994
"deferredRpcProcessingTime" + interval + "s",
90-
"deferred rpc processing time in " + TIMEUNIT, "ops",
95+
"deferred rpc processing time in " + metricsTimeUnit, "ops",
9196
"latency", interval);
9297
}
9398
}
@@ -141,6 +146,27 @@ public String numOpenConnectionsPerUser() {
141146
return server.getNumDroppedConnections();
142147
}
143148

149+
public TimeUnit getMetricsTimeUnit() {
150+
return metricsTimeUnit;
151+
}
152+
153+
public static TimeUnit getMetricsTimeUnit(Configuration conf) {
154+
TimeUnit metricsTimeUnit = RpcMetrics.DEFAULT_METRIC_TIME_UNIT;
155+
String timeunit = conf.get(CommonConfigurationKeys.RPC_METRICS_TIME_UNIT);
156+
if (StringUtils.isNotEmpty(timeunit)) {
157+
try {
158+
metricsTimeUnit = TimeUnit.valueOf(timeunit);
159+
} catch (IllegalArgumentException e) {
160+
LOG.info("Config key {} 's value {} does not correspond to enum values"
161+
+ " of java.util.concurrent.TimeUnit. Hence default unit"
162+
+ " {} will be used",
163+
CommonConfigurationKeys.RPC_METRICS_TIME_UNIT, timeunit,
164+
RpcMetrics.DEFAULT_METRIC_TIME_UNIT);
165+
}
166+
}
167+
return metricsTimeUnit;
168+
}
169+
144170
// Public instrumentation methods that could be extracted to an
145171
// abstract class if we decide to do custom instrumentation classes a la
146172
// JobTrackerInstrumentation. The methods with //@Override comment are

hadoop-common-project/hadoop-common/src/main/resources/core-default.xml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3293,6 +3293,21 @@
32933293
</description>
32943294
</property>
32953295

3296+
<property>
3297+
<name>rpc.metrics.timeunit</name>
3298+
<value>MILLISECONDS</value>
3299+
<description>
3300+
This property is used to configure timeunit for various RPC Metrics
3301+
e.g. rpcQueueTime, rpcLockWaitTime, rpcProcessingTime,
3302+
deferredRpcProcessingTime. In the absence of this property,
3303+
default timeunit used is milliseconds.
3304+
The value of this property should match one of the values of the enum:
3305+
java.util.concurrent.TimeUnit.
3306+
Some of the valid values: NANOSECONDS, MICROSECONDS, MILLISECONDS,
3307+
SECONDS etc.
3308+
</description>
3309+
</property>
3310+
32963311
<property>
32973312
<name>rpc.metrics.percentiles.intervals</name>
32983313
<value></value>

hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ rpc
6565
---
6666

6767
Each metrics record contains tags such as Hostname and port (number to which server is bound) as additional information along with metrics.
68+
The `rpc.metrics.timeunit` config can be used to configure the time unit for RPC metrics.
69+
The default time unit used for RPC metrics is milliseconds (as reflected in the descriptions below).
6870

6971
| Name | Description |
7072
|:---- |:---- |

hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestRPC.java

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
import org.apache.hadoop.ipc.Client.ConnectionId;
3030
import org.apache.hadoop.ipc.Server.Call;
3131
import org.apache.hadoop.ipc.Server.Connection;
32-
import org.apache.hadoop.ipc.metrics.RpcMetrics;
3332
import org.apache.hadoop.ipc.protobuf.RpcHeaderProtos.RpcResponseHeaderProto.RpcErrorCodeProto;
3433
import org.apache.hadoop.ipc.protobuf.RpcHeaderProtos.RpcResponseHeaderProto.RpcStatusProto;
3534
import org.apache.hadoop.ipc.protobuf.TestProtos;
@@ -1098,8 +1097,8 @@ public TestRpcService run() {
10981097
proxy.lockAndSleep(null, newSleepRequest(5));
10991098
rpcMetrics = getMetrics(server.getRpcMetrics().name());
11001099
assertGauge("RpcLockWaitTimeAvgTime",
1101-
(double)(RpcMetrics.TIMEUNIT.convert(10L, TimeUnit.SECONDS)),
1102-
rpcMetrics);
1100+
(double)(server.getRpcMetrics().getMetricsTimeUnit().convert(10L,
1101+
TimeUnit.SECONDS)), rpcMetrics);
11031102
} finally {
11041103
if (proxy2 != null) {
11051104
RPC.stopProxy(proxy2);
@@ -1603,6 +1602,70 @@ public void testSetProtocolEngine() {
16031602
assertTrue(rpcEngine instanceof StoppedRpcEngine);
16041603
}
16051604

1605+
@Test
1606+
public void testRpcMetricsInNanos() throws Exception {
1607+
final Server server;
1608+
TestRpcService proxy = null;
1609+
1610+
final int interval = 1;
1611+
conf.setBoolean(CommonConfigurationKeys.
1612+
RPC_METRICS_QUANTILE_ENABLE, true);
1613+
conf.set(CommonConfigurationKeys.
1614+
RPC_METRICS_PERCENTILES_INTERVALS_KEY, "" + interval);
1615+
conf.set(CommonConfigurationKeys.RPC_METRICS_TIME_UNIT, "NANOSECONDS");
1616+
1617+
server = setupTestServer(conf, 5);
1618+
String testUser = "testUserInNanos";
1619+
UserGroupInformation anotherUser =
1620+
UserGroupInformation.createRemoteUser(testUser);
1621+
TestRpcService proxy2 =
1622+
anotherUser.doAs((PrivilegedAction<TestRpcService>) () -> {
1623+
try {
1624+
return RPC.getProxy(TestRpcService.class, 0,
1625+
server.getListenerAddress(), conf);
1626+
} catch (IOException e) {
1627+
LOG.error("Something went wrong.", e);
1628+
}
1629+
return null;
1630+
});
1631+
try {
1632+
proxy = getClient(addr, conf);
1633+
for (int i = 0; i < 100; i++) {
1634+
proxy.ping(null, newEmptyRequest());
1635+
proxy.echo(null, newEchoRequest("" + i));
1636+
proxy2.echo(null, newEchoRequest("" + i));
1637+
}
1638+
MetricsRecordBuilder rpcMetrics =
1639+
getMetrics(server.getRpcMetrics().name());
1640+
assertEquals("Expected zero rpc lock wait time",
1641+
0, getDoubleGauge("RpcLockWaitTimeAvgTime", rpcMetrics), 0.001);
1642+
MetricsAsserts.assertQuantileGauges("RpcQueueTime" + interval + "s",
1643+
rpcMetrics);
1644+
MetricsAsserts.assertQuantileGauges("RpcProcessingTime" + interval + "s",
1645+
rpcMetrics);
1646+
1647+
proxy.lockAndSleep(null, newSleepRequest(5));
1648+
rpcMetrics = getMetrics(server.getRpcMetrics().name());
1649+
assertGauge("RpcLockWaitTimeAvgTime",
1650+
(double)(server.getRpcMetrics().getMetricsTimeUnit().convert(10L,
1651+
TimeUnit.SECONDS)), rpcMetrics);
1652+
LOG.info("RpcProcessingTimeAvgTime: {} , RpcQueueTimeAvgTime: {}",
1653+
getDoubleGauge("RpcProcessingTimeAvgTime", rpcMetrics),
1654+
getDoubleGauge("RpcQueueTimeAvgTime", rpcMetrics));
1655+
1656+
assertTrue(getDoubleGauge("RpcProcessingTimeAvgTime", rpcMetrics)
1657+
> 4000000D);
1658+
assertTrue(getDoubleGauge("RpcQueueTimeAvgTime", rpcMetrics)
1659+
> 4000D);
1660+
} finally {
1661+
if (proxy2 != null) {
1662+
RPC.stopProxy(proxy2);
1663+
}
1664+
stop(server, proxy);
1665+
}
1666+
}
1667+
1668+
16061669
public static void main(String[] args) throws Exception {
16071670
new TestRPC().testCallsInternal(conf);
16081671
}

0 commit comments

Comments
 (0)