Skip to content

Commit f05ef0d

Browse files
GEODE-8977: change ThreadMonitor to reduce how long it does a "stop the world" ThreadDump vm op (#7751)
Now uses a cheaper getThreadInfo that does not get lock info by default and calls getThreadInfo for each stuck thread. These are the defaults because they have the shortest time do the the VM ThreadDump operation. To get locks set the system property "gemfire.threadmonitor.showLocks" to "true". To get ThreadInfo on all stuck threads with a single call set the system property "gemfire.threadmonitor.batchCalls" to "true". (cherry picked from commit 3df1e76)
1 parent 3c9bb2e commit f05ef0d

File tree

5 files changed

+290
-116
lines changed

5 files changed

+290
-116
lines changed

geode-core/src/main/java/org/apache/geode/internal/monitoring/ThreadsMonitoringProcess.java

Lines changed: 68 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
import java.lang.management.ManagementFactory;
1919
import java.lang.management.ThreadInfo;
20+
import java.lang.management.ThreadMXBean;
2021
import java.util.Collection;
2122
import java.util.Collections;
2223
import java.util.HashMap;
@@ -119,30 +120,87 @@ private int checkForStuckThreads(Collection<AbstractExecutor> executors, long cu
119120
return result;
120121
}
121122

122-
@VisibleForTesting
123-
public static Map<Long, ThreadInfo> createThreadInfoMap(Set<Long> stuckThreadIds) {
124-
/*
125-
* NOTE: at least some implementations of getThreadInfo(long[], boolean, boolean)
126-
* will core dump if the long array contains a duplicate value.
127-
* That is why stuckThreadIds is a Set instead of a List.
128-
*/
123+
/**
124+
* If set to true, then the JVM will be asked for what locks a thread holds.
125+
* This is extra expensive to ask for on some JVMs so be careful setting this to true.
126+
*/
127+
private static final boolean SHOW_LOCKS = Boolean.getBoolean("gemfire.threadmonitor.showLocks");
128+
/**
129+
* If set to true, then the JVM will be asked for all potential stuck threads with one call.
130+
* Since getThreadInfo on many JVMs, stops ALL threads from running, and since getting info
131+
* on multiple threads with one call is additional work, setting this can cause an extra long
132+
* stop the world that can then cause other problems (like a forced disconnect).
133+
* So be careful setting this to true.
134+
*/
135+
private static final boolean BATCH_CALLS = Boolean.getBoolean("gemfire.threadmonitor.batchCalls");
136+
137+
private static Map<Long, ThreadInfo> createThreadInfoMap(Set<Long> stuckThreadIds) {
138+
return createThreadInfoMap(stuckThreadIds, SHOW_LOCKS, BATCH_CALLS);
139+
}
140+
141+
public static Map<Long, ThreadInfo> createThreadInfoMap(Set<Long> stuckThreadIds,
142+
boolean showLocks, boolean batchCalls) {
143+
return createThreadInfoMap(ManagementFactory.getThreadMXBean(), stuckThreadIds, showLocks,
144+
batchCalls);
145+
}
146+
147+
static Map<Long, ThreadInfo> createThreadInfoMap(ThreadMXBean threadMXBean,
148+
Set<Long> stuckThreadIds,
149+
final boolean showLocks, final boolean batchCalls) {
129150
if (stuckThreadIds.isEmpty()) {
130151
return Collections.emptyMap();
131152
}
153+
logger.info(
154+
"Obtaining ThreadInfo for {} threads. Configuration: showLocks={} batchCalls={}. This is an expensive operation for the JVM and on most JVMs causes all threads to be paused.",
155+
stuckThreadIds.size(), showLocks, batchCalls);
156+
Map<Long, ThreadInfo> result = new HashMap<>();
157+
if (batchCalls) {
158+
createThreadInfoMapUsingSingleCall(threadMXBean, stuckThreadIds, showLocks, result);
159+
} else {
160+
for (long id : stuckThreadIds) {
161+
ThreadInfo threadInfo = createThreadInfoForSingleThread(threadMXBean, showLocks, id);
162+
if (threadInfo != null) {
163+
result.put(threadInfo.getThreadId(), threadInfo);
164+
}
165+
}
166+
}
167+
logger.info("finished obtaining ThreadInfo");
168+
return result;
169+
}
170+
171+
private static ThreadInfo createThreadInfoForSingleThread(
172+
ThreadMXBean threadMXBean, boolean showLocks, long id) {
173+
ThreadInfo threadInfo;
174+
if (showLocks) {
175+
ThreadInfo[] threadInfos =
176+
threadMXBean.getThreadInfo(new long[] {id}, true, true);
177+
threadInfo = threadInfos[0];
178+
} else {
179+
threadInfo = threadMXBean.getThreadInfo(id, Integer.MAX_VALUE);
180+
}
181+
return threadInfo;
182+
}
183+
184+
private static void createThreadInfoMapUsingSingleCall(
185+
ThreadMXBean threadMXBean, Set<Long> stuckThreadIds, boolean showLocks,
186+
Map<Long, ThreadInfo> result) {
132187
long[] ids = new long[stuckThreadIds.size()];
133188
int idx = 0;
134189
for (long id : stuckThreadIds) {
135190
ids[idx] = id;
136191
idx++;
137192
}
138-
ThreadInfo[] threadInfos = ManagementFactory.getThreadMXBean().getThreadInfo(ids, true, true);
139-
Map<Long, ThreadInfo> result = new HashMap<>();
193+
/*
194+
* NOTE: at least some implementations of getThreadInfo(long[], boolean, boolean)
195+
* will core dump if the long array contains a duplicate value.
196+
* That is why stuckThreadIds is a Set instead of a List.
197+
*/
198+
ThreadInfo[] threadInfos = threadMXBean.getThreadInfo(ids, showLocks, showLocks);
140199
for (ThreadInfo threadInfo : threadInfos) {
141200
if (threadInfo != null) {
142201
result.put(threadInfo.getThreadId(), threadInfo);
143202
}
144203
}
145-
return result;
146204
}
147205

148206
private void addLockOwnerThreadId(Set<Long> stuckThreadIds, long threadId) {
Lines changed: 68 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,37 +15,35 @@
1515
package org.apache.geode.internal.monitoring;
1616

1717
import static org.assertj.core.api.Assertions.assertThat;
18-
import static org.junit.Assert.assertFalse;
19-
import static org.junit.Assert.assertTrue;
18+
import static org.junit.jupiter.api.Assertions.assertFalse;
19+
import static org.junit.jupiter.api.Assertions.assertNotNull;
20+
import static org.junit.jupiter.api.Assertions.assertNull;
21+
import static org.junit.jupiter.api.Assertions.assertTrue;
2022
import static org.mockito.ArgumentMatchers.any;
2123
import static org.mockito.Mockito.mock;
2224
import static org.mockito.Mockito.never;
2325
import static org.mockito.Mockito.times;
2426
import static org.mockito.Mockito.verify;
2527

26-
import org.junit.After;
27-
import org.junit.Before;
28-
import org.junit.Test;
28+
import org.junit.jupiter.api.AfterEach;
29+
import org.junit.jupiter.api.Test;
2930

3031
import org.apache.geode.internal.monitoring.ThreadsMonitoring.Mode;
3132
import org.apache.geode.internal.monitoring.executor.AbstractExecutor;
3233
import org.apache.geode.internal.monitoring.executor.FunctionExecutionPooledExecutorGroup;
34+
import org.apache.geode.internal.monitoring.executor.PooledExecutorGroup;
3335

3436
/**
3537
* Contains simple tests for the {@link org.apache.geode.internal.monitoring.ThreadsMonitoringImpl}.
3638
*
3739
* @since Geode 1.5
3840
*/
39-
public class ThreadsMonitoringImplJUnitTest {
41+
public class ThreadsMonitoringImplTest {
42+
private static final int TIME_LIMIT_MILLIS = 1000;
4043

4144
private ThreadsMonitoringImpl threadsMonitoringImpl;
4245

43-
@Before
44-
public void before() {
45-
threadsMonitoringImpl = new ThreadsMonitoringImpl(null, 100000, 0, false);
46-
}
47-
48-
@After
46+
@AfterEach
4947
public void after() {
5048
threadsMonitoringImpl.close();
5149
}
@@ -55,6 +53,7 @@ public void after() {
5553
*/
5654
@Test
5755
public void testStartMonitor() {
56+
threadsMonitoringImpl = new ThreadsMonitoringImpl(null, 100000, 0, false);
5857
assertTrue(threadsMonitoringImpl.startMonitor(Mode.FunctionExecutor));
5958
assertTrue(threadsMonitoringImpl.startMonitor(Mode.PooledExecutor));
6059
assertTrue(threadsMonitoringImpl.startMonitor(Mode.SerialQueuedExecutor));
@@ -67,6 +66,7 @@ public void testStartMonitor() {
6766

6867
@Test
6968
public void verifyMonitorLifeCycle() {
69+
threadsMonitoringImpl = new ThreadsMonitoringImpl(null, 100000, 0, false);
7070
assertFalse(threadsMonitoringImpl.isMonitoring());
7171
threadsMonitoringImpl.startMonitor(Mode.FunctionExecutor);
7272
assertTrue(threadsMonitoringImpl.isMonitoring());
@@ -76,6 +76,7 @@ public void verifyMonitorLifeCycle() {
7676

7777
@Test
7878
public void verifyExecutorMonitoringLifeCycle() {
79+
threadsMonitoringImpl = new ThreadsMonitoringImpl(null, 100000, 0, false);
7980
AbstractExecutor executor =
8081
threadsMonitoringImpl.createAbstractExecutor(Mode.P2PReaderExecutor);
8182
assertThat(threadsMonitoringImpl.isMonitoring(executor)).isFalse();
@@ -95,24 +96,28 @@ public void verifyExecutorMonitoringLifeCycle() {
9596

9697
@Test
9798
public void createAbstractExecutorIsAssociatedWithCallingThread() {
99+
threadsMonitoringImpl = new ThreadsMonitoringImpl(null, 100000, 0, false);
98100
AbstractExecutor executor = threadsMonitoringImpl.createAbstractExecutor(Mode.FunctionExecutor);
99101
assertThat(executor.getThreadID()).isEqualTo(Thread.currentThread().getId());
100102
}
101103

102104
@Test
103105
public void createAbstractExecutorDoesNotSetStartTime() {
106+
threadsMonitoringImpl = new ThreadsMonitoringImpl(null, 100000, 0, false);
104107
AbstractExecutor executor = threadsMonitoringImpl.createAbstractExecutor(Mode.FunctionExecutor);
105108
assertThat(executor.getStartTime()).isEqualTo(0);
106109
}
107110

108111
@Test
109112
public void createAbstractExecutorSetsNumIterationsStuckToZero() {
113+
threadsMonitoringImpl = new ThreadsMonitoringImpl(null, 100000, 0, false);
110114
AbstractExecutor executor = threadsMonitoringImpl.createAbstractExecutor(Mode.FunctionExecutor);
111115
assertThat(executor.getNumIterationsStuck()).isEqualTo((short) 0);
112116
}
113117

114118
@Test
115119
public void createAbstractExecutorSetsExpectedGroupName() {
120+
threadsMonitoringImpl = new ThreadsMonitoringImpl(null, 100000, 0, false);
116121
AbstractExecutor executor = threadsMonitoringImpl.createAbstractExecutor(Mode.FunctionExecutor);
117122
assertThat(executor.getGroupName()).isEqualTo(FunctionExecutionPooledExecutorGroup.GROUPNAME);
118123
}
@@ -122,16 +127,17 @@ public void createAbstractExecutorSetsExpectedGroupName() {
122127
*/
123128
@Test
124129
public void testClosure() {
125-
ThreadsMonitoringImpl liveMonitor = new ThreadsMonitoringImpl(null, 100000, 0, true);
126-
assertTrue(liveMonitor.getThreadsMonitoringProcess() != null);
127-
assertFalse(liveMonitor.isClosed());
128-
liveMonitor.close();
129-
assertTrue(liveMonitor.isClosed());
130-
assertFalse(liveMonitor.getThreadsMonitoringProcess() != null);
130+
threadsMonitoringImpl = new ThreadsMonitoringImpl(null, 100000, 0, true);
131+
assertNotNull(threadsMonitoringImpl.getThreadsMonitoringProcess());
132+
assertFalse(threadsMonitoringImpl.isClosed());
133+
threadsMonitoringImpl.close();
134+
assertTrue(threadsMonitoringImpl.isClosed());
135+
assertNull(threadsMonitoringImpl.getThreadsMonitoringProcess());
131136
}
132137

133138
@Test
134139
public void updateThreadStatusCallsSetStartTime() {
140+
threadsMonitoringImpl = new ThreadsMonitoringImpl(null, 100000, 0, false);
135141
AbstractExecutor executor = mock(AbstractExecutor.class);
136142
long threadId = Thread.currentThread().getId();
137143

@@ -142,8 +148,52 @@ public void updateThreadStatusCallsSetStartTime() {
142148

143149
@Test
144150
public void updateThreadStatusWithoutExecutorInMapDoesNotCallSetStartTime() {
151+
threadsMonitoringImpl = new ThreadsMonitoringImpl(null, 100000, 0, false);
145152
AbstractExecutor executor = mock(AbstractExecutor.class);
146153
threadsMonitoringImpl.updateThreadStatus();
147154
verify(executor, never()).setStartTime(any(Long.class));
148155
}
156+
157+
/**
158+
* Tests that indeed thread is considered stuck when it should
159+
*/
160+
@Test
161+
public void testThreadIsStuck() {
162+
threadsMonitoringImpl = new ThreadsMonitoringImpl(null, 100000, TIME_LIMIT_MILLIS);
163+
164+
final long threadID = 123456;
165+
166+
AbstractExecutor absExtgroup = new PooledExecutorGroup();
167+
absExtgroup.setStartTime(absExtgroup.getStartTime() - TIME_LIMIT_MILLIS - 1);
168+
169+
threadsMonitoringImpl.getMonitorMap().put(threadID, absExtgroup);
170+
171+
assertTrue(threadsMonitoringImpl.getThreadsMonitoringProcess().mapValidation());
172+
}
173+
174+
@Test
175+
public void monitorHandlesDefunctThread() {
176+
threadsMonitoringImpl = new ThreadsMonitoringImpl(null, 100000, TIME_LIMIT_MILLIS);
177+
final long threadID = Long.MAX_VALUE;
178+
179+
AbstractExecutor absExtgroup = new PooledExecutorGroup(threadID);
180+
absExtgroup.setStartTime(absExtgroup.getStartTime() - TIME_LIMIT_MILLIS - 1);
181+
182+
threadsMonitoringImpl.getMonitorMap().put(threadID, absExtgroup);
183+
184+
assertTrue(threadsMonitoringImpl.getThreadsMonitoringProcess().mapValidation());
185+
}
186+
187+
/**
188+
* Tests that indeed thread is NOT considered stuck when it shouldn't
189+
*/
190+
@Test
191+
public void testThreadIsNotStuck() {
192+
threadsMonitoringImpl = new ThreadsMonitoringImpl(null, 100000, TIME_LIMIT_MILLIS);
193+
194+
threadsMonitoringImpl.startMonitor(Mode.PooledExecutor);
195+
196+
assertFalse(threadsMonitoringImpl.getThreadsMonitoringProcess().mapValidation());
197+
}
198+
149199
}

geode-core/src/test/java/org/apache/geode/internal/monitoring/ThreadsMonitoringProcessJUnitTest.java

Lines changed: 0 additions & 87 deletions
This file was deleted.

0 commit comments

Comments
 (0)