
Commit 1aee5f0

chenxu14 authored and saintstack committed

HBASE-23083 Collect Executor status info periodically and report to metrics system (#664)

Signed-off-by: stack <stack@apache.org>
1 parent 473816d commit 1aee5f0

5 files changed: +238 additions, −9 deletions

hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java

Lines changed: 3 additions & 0 deletions
@@ -1191,6 +1191,9 @@ public enum OperationStatusCode {
      "hbase.node.health.failure.threshold";
  public static final int DEFAULT_HEALTH_FAILURE_THRESHOLD = 3;

+ public static final String EXECUTOR_STATUS_COLLECT_ENABLED =
+     "hbase.executors.status.collect.enabled";
+ public static final boolean DEFAULT_EXECUTOR_STATUS_COLLECT_ENABLED = true;

  /**
   * Setting to activate, or not, the publication of the status by the master. Default
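
For orientation (not part of the diff): a minimal sketch of setting these constants programmatically. The variable name conf and the shortened period are assumptions; the period key is the one defined by ExecutorStatusChore below.

  // Sketch only: the collector is on by default; this shortens its collection period.
  Configuration conf = HBaseConfiguration.create();
  conf.setBoolean(HConstants.EXECUTOR_STATUS_COLLECT_ENABLED, true);
  conf.setInt(ExecutorStatusChore.WAKE_FREQ, 30000); // default is 60000 ms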
hbase-server/src/main/java/org/apache/hadoop/hbase/ExecutorStatusChore.java

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase;

import java.util.Map;
import org.apache.hadoop.hbase.executor.ExecutorService;
import org.apache.hadoop.hbase.executor.ExecutorService.ExecutorStatus;
import org.apache.hadoop.hbase.regionserver.MetricsRegionServerSource;
import org.apache.hadoop.hbase.regionserver.MetricsRegionServerSourceImpl;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.metrics2.lib.DynamicMetricsRegistry;
import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
import org.apache.hadoop.util.StringUtils;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;

/**
 * A chore that collects executor status info periodically and reports it
 * to the metrics system.
 */
@InterfaceAudience.Private
public class ExecutorStatusChore extends ScheduledChore {
  private static final Logger LOG = LoggerFactory.getLogger(ExecutorStatusChore.class);
  public static final String WAKE_FREQ = "hbase.executors.status.collect.period";
  public static final int DEFAULT_WAKE_FREQ = 60000;
  private ExecutorService service;
  private DynamicMetricsRegistry metricsRegistry;

  public ExecutorStatusChore(int sleepTime, Stoppable stopper, ExecutorService service,
      MetricsRegionServerSource metrics) {
    super("ExecutorStatusChore", stopper, sleepTime);
    LOG.info("ExecutorStatusChore runs every {}", StringUtils.formatTime(sleepTime));
    this.service = service;
    this.metricsRegistry = ((MetricsRegionServerSourceImpl) metrics).getMetricsRegistry();
  }

  @Override
  protected void chore() {
    try {
      // Thread pool monitor.
      Map<String, ExecutorStatus> statuses = service.getAllExecutorStatuses();
      for (Map.Entry<String, ExecutorStatus> statusEntry : statuses.entrySet()) {
        String name = statusEntry.getKey();
        // The executor's name is generated by ExecutorType#getExecutorName and includes the
        // ExecutorType and the server name (separated by '-'); only the ExecutorType is needed.
        String poolName = name.split("-")[0];
        ExecutorStatus status = statusEntry.getValue();
        MutableGaugeLong queued = metricsRegistry.getGauge(poolName + "_queued", 0L);
        MutableGaugeLong running = metricsRegistry.getGauge(poolName + "_running", 0L);
        int queueSize = status.getQueuedEvents().size();
        int runningSize = status.getRunning().size();
        if (queueSize > 0) {
          LOG.warn("{}'s size info, queued: {}, running: {}", poolName, queueSize, runningSize);
        }
        queued.set(queueSize);
        running.set(runningSize);
      }
    } catch (Throwable e) {
      LOG.error(e.getMessage(), e);
    }
  }

  @VisibleForTesting
  public Pair<Long, Long> getExecutorStatus(String poolName) {
    MutableGaugeLong running = metricsRegistry.getGauge(poolName + "_running", 0L);
    MutableGaugeLong queued = metricsRegistry.getGauge(poolName + "_queued", 0L);
    return new Pair<Long, Long>(running.value(), queued.value());
  }
}
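
As a reading aid (not in the commit): the gauges above are named after the executor type, because the chore splits the full executor name on '-'. A small hedged illustration follows; the example executor name is an assumption about what ExecutorType#getExecutorName produces.

  // Hypothetical executor name; the part after '-' is the server name and is discarded.
  String executorName = "RS_PARALLEL_SEEK-regionserver/host:16020";
  String poolName = executorName.split("-")[0];   // "RS_PARALLEL_SEEK"
  String queuedGauge = poolName + "_queued";      // gauge updated with the queue size
  String runningGauge = poolName + "_running";    // gauge updated with the running count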

hbase-server/src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java

Lines changed: 8 additions & 0 deletions
@@ -313,6 +313,14 @@ public static class ExecutorStatus {
      this.running = running;
    }

+   public List<EventHandler> getQueuedEvents() {
+     return queuedEvents;
+   }
+
+   public List<RunningEventStatus> getRunning() {
+     return running;
+   }
+
    /**
     * Dump a textual representation of the executor's status
     * to the given writer.
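
A brief hedged sketch of how a caller can use these new accessors, mirroring what ExecutorStatusChore does above; the variable executorService is assumed to be an already-started ExecutorService.

  // Sketch only: poll every executor and read the newly exposed collections.
  Map<String, ExecutorService.ExecutorStatus> statuses =
      executorService.getAllExecutorStatuses();
  for (Map.Entry<String, ExecutorService.ExecutorStatus> entry : statuses.entrySet()) {
    int queued = entry.getValue().getQueuedEvents().size();  // events still waiting
    int running = entry.getValue().getRunning().size();      // handlers currently executing
    System.out.println(entry.getKey() + " queued=" + queued + " running=" + running);
  }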

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java

Lines changed: 39 additions & 9 deletions
@@ -69,6 +69,7 @@
  import org.apache.hadoop.hbase.ClockOutOfSyncException;
  import org.apache.hadoop.hbase.CoordinatedStateManager;
  import org.apache.hadoop.hbase.DoNotRetryIOException;
+ import org.apache.hadoop.hbase.ExecutorStatusChore;
  import org.apache.hadoop.hbase.HBaseConfiguration;
  import org.apache.hadoop.hbase.HBaseInterfaceAudience;
  import org.apache.hadoop.hbase.HConstants;
@@ -442,6 +443,9 @@ public class HRegionServer extends HasThread implements
  /** The health check chore. */
  private HealthCheckChore healthCheckChore;

+ /** The Executor status collect chore. */
+ private ExecutorStatusChore executorStatusChore;
+
  /** The nonce manager chore. */
  private ScheduledChore nonceManagerChore;
@@ -1922,6 +1926,14 @@ private void startServices() throws IOException {
        HConstants.DEFAULT_THREAD_WAKE_FREQUENCY);
      healthCheckChore = new HealthCheckChore(sleepTime, this, getConfiguration());
    }
+   // Executor status collect thread.
+   if (this.conf.getBoolean(HConstants.EXECUTOR_STATUS_COLLECT_ENABLED,
+       HConstants.DEFAULT_EXECUTOR_STATUS_COLLECT_ENABLED)) {
+     int sleepTime = this.conf.getInt(ExecutorStatusChore.WAKE_FREQ,
+         ExecutorStatusChore.DEFAULT_WAKE_FREQ);
+     executorStatusChore = new ExecutorStatusChore(sleepTime, this, this.getExecutorService(),
+       this.getRegionServerMetrics().getMetricsSource());
+   }

    this.walRoller = new LogRoller(this, this);
    this.flushThroughputController = FlushThroughputControllerFactory.create(this, conf);
@@ -1970,25 +1982,42 @@ private void startServices() throws IOException {
      conf.getInt("hbase.regionserver.executor.switch.rpc.throttle.threads", 1));

    Threads.setDaemonThreadRunning(this.walRoller.getThread(), getName() + ".logRoller",
-       uncaughtExceptionHandler);
+     uncaughtExceptionHandler);
    if (this.cacheFlusher != null) {
      this.cacheFlusher.start(uncaughtExceptionHandler);
    }
    Threads.setDaemonThreadRunning(this.procedureResultReporter,
      getName() + ".procedureResultReporter", uncaughtExceptionHandler);

-   if (this.compactionChecker != null) choreService.scheduleChore(compactionChecker);
-   if (this.periodicFlusher != null) choreService.scheduleChore(periodicFlusher);
-   if (this.healthCheckChore != null) choreService.scheduleChore(healthCheckChore);
-   if (this.nonceManagerChore != null) choreService.scheduleChore(nonceManagerChore);
-   if (this.storefileRefresher != null) choreService.scheduleChore(storefileRefresher);
-   if (this.movedRegionsCleaner != null) choreService.scheduleChore(movedRegionsCleaner);
-   if (this.fsUtilizationChore != null) choreService.scheduleChore(fsUtilizationChore);
+   if (this.compactionChecker != null) {
+     choreService.scheduleChore(compactionChecker);
+   }
+   if (this.periodicFlusher != null) {
+     choreService.scheduleChore(periodicFlusher);
+   }
+   if (this.healthCheckChore != null) {
+     choreService.scheduleChore(healthCheckChore);
+   }
+   if (this.executorStatusChore != null) {
+     choreService.scheduleChore(executorStatusChore);
+   }
+   if (this.nonceManagerChore != null) {
+     choreService.scheduleChore(nonceManagerChore);
+   }
+   if (this.storefileRefresher != null) {
+     choreService.scheduleChore(storefileRefresher);
+   }
+   if (this.movedRegionsCleaner != null) {
+     choreService.scheduleChore(movedRegionsCleaner);
+   }
+   if (this.fsUtilizationChore != null) {
+     choreService.scheduleChore(fsUtilizationChore);
+   }

    // Leases is not a Thread. Internally it runs a daemon thread. If it gets
    // an unhandled exception, it will just exit.
    Threads.setDaemonThreadRunning(this.leases.getThread(), getName() + ".leaseChecker",
-       uncaughtExceptionHandler);
+     uncaughtExceptionHandler);

    // Create the log splitting worker and start it
    // set a smaller retries to fast fail otherwise splitlogworker could be blocked for
// Create the log splitting worker and start it
19942023
// set a smaller retries to fast fail otherwise splitlogworker could be blocked for
@@ -2500,6 +2529,7 @@ protected void stopServiceThreads() {
      choreService.cancelChore(compactionChecker);
      choreService.cancelChore(periodicFlusher);
      choreService.cancelChore(healthCheckChore);
+     choreService.cancelChore(executorStatusChore);
      choreService.cancelChore(storefileRefresher);
      choreService.cancelChore(movedRegionsCleaner);
      choreService.cancelChore(fsUtilizationChore);
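
Outside HRegionServer, the same wiring can be sketched in a few lines; choreService, stopper, executorService, and metricsSource are assumed to exist and are not part of this change.

  // Hypothetical standalone wiring that mirrors the HRegionServer changes above.
  if (conf.getBoolean(HConstants.EXECUTOR_STATUS_COLLECT_ENABLED,
      HConstants.DEFAULT_EXECUTOR_STATUS_COLLECT_ENABLED)) {
    int period = conf.getInt(ExecutorStatusChore.WAKE_FREQ,
        ExecutorStatusChore.DEFAULT_WAKE_FREQ);
    ExecutorStatusChore chore =
        new ExecutorStatusChore(period, stopper, executorService, metricsSource);
    choreService.scheduleChore(chore); // runs every `period` ms until cancelled or stopped
  }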
hbase-server/src/test/java/org/apache/hadoop/hbase/TestExecutorStatusChore.java

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.hbase.executor.EventType;
import org.apache.hadoop.hbase.executor.ExecutorService;
import org.apache.hadoop.hbase.executor.ExecutorType;
import org.apache.hadoop.hbase.executor.TestExecutorService.TestEventHandler;
import org.apache.hadoop.hbase.regionserver.MetricsRegionServerSource;
import org.apache.hadoop.hbase.regionserver.MetricsRegionServerSourceFactory;
import org.apache.hadoop.hbase.regionserver.MetricsRegionServerSourceImpl;
import org.apache.hadoop.hbase.testclassification.MiscTests;
import org.apache.hadoop.hbase.testclassification.SmallTests;
import org.apache.hadoop.hbase.util.Pair;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@Category({MiscTests.class, SmallTests.class})
public class TestExecutorStatusChore {
  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
      HBaseClassTestRule.forClass(TestExecutorStatusChore.class);

  private static final Logger LOG = LoggerFactory.getLogger(TestExecutorStatusChore.class);

  @Test
  public void testMetricsCollect() throws Exception {
    int maxThreads = 5;
    int maxTries = 10;
    int sleepInterval = 10;

    Server mockedServer = mock(Server.class);
    when(mockedServer.getConfiguration()).thenReturn(HBaseConfiguration.create());

    // Start an executor service pool with max 5 threads
    ExecutorService executorService = new ExecutorService("unit_test");
    executorService.startExecutorService(
        ExecutorType.RS_PARALLEL_SEEK, maxThreads);

    MetricsRegionServerSource serverSource = CompatibilitySingletonFactory
        .getInstance(MetricsRegionServerSourceFactory.class).createServer(null);
    assertTrue(serverSource instanceof MetricsRegionServerSourceImpl);

    ExecutorStatusChore statusChore = new ExecutorStatusChore(60000,
        mockedServer, executorService, serverSource);

    AtomicBoolean lock = new AtomicBoolean(true);
    AtomicInteger counter = new AtomicInteger(0);

    for (int i = 0; i < maxThreads + 1; i++) {
      executorService.submit(new TestEventHandler(mockedServer,
          EventType.RS_PARALLEL_SEEK, lock, counter));
    }

    // The TestEventHandler will increment counter when it starts.
    int tries = 0;
    while (counter.get() < maxThreads && tries < maxTries) {
      LOG.info("Waiting for all event handlers to start...");
      Thread.sleep(sleepInterval);
      tries++;
    }

    // Assert that pool is at max threads.
    assertEquals(maxThreads, counter.get());

    statusChore.chore();
    Pair<Long, Long> executorStatus = statusChore.getExecutorStatus("RS_PARALLEL_SEEK");
    assertEquals(maxThreads, executorStatus.getFirst().intValue()); // running
    assertEquals(1, executorStatus.getSecond().intValue()); // pending

    // Now interrupt the running Executor
    synchronized (lock) {
      lock.set(false);
      lock.notifyAll();
    }
    executorService.shutdown();
  }
}
