Skip to content

HBASE-26468 Region Server doesn't exit cleanly incase it crashes. #3862

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Dec 1, 2021
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,12 @@

import java.io.PrintStream;
import java.lang.Thread.UncaughtExceptionHandler;
import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.hadoop.util.StringUtils;
import org.apache.yetus.audience.InterfaceAudience;
Expand Down Expand Up @@ -197,4 +199,33 @@ public static void setLoggingUncaughtExceptionHandler(Thread t) {
public static void printThreadInfo(PrintStream stream, String title) {
ReflectionUtils.printThreadInfo(stream, title);
}

/**
* Checks whether any non-daemon thread is running.
* @return true if there are non daemon threads running, otherwise false
*/
public static boolean isNonDaemonThreadRunning() {
AtomicInteger nonDaemonThreadCount = new AtomicInteger();
Set<Thread> threads = Thread.getAllStackTraces().keySet();
threads.forEach(t -> {
// Exclude current thread
if (t.getId() != Thread.currentThread().getId() && !t.isDaemon()) {
nonDaemonThreadCount.getAndIncrement();
LOG.info("Non daemon thread {} is still alive", t.getName());
LOG.info(printStackTrace(t));
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the sample stack trace after deploying it to internal cluster

2021-11-26 19:21:16,747 INFO  [main] util.Threads - Non daemon thread main.named-queue-events-pool-pool1-t1 is still alive
2021-11-26 19:21:16,748 INFO  [main] util.Threads - 
    sun.misc.Unsafe.park(Native Method)
    java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
    java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2044)
    com.lmax.disruptor.BlockingWaitStrategy.waitFor(BlockingWaitStrategy.java:45)
    com.lmax.disruptor.ProcessingSequenceBarrier.waitFor(ProcessingSequenceBarrier.java:56)
    com.lmax.disruptor.BatchEventProcessor.run(BatchEventProcessor.java:124)
    java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    java.lang.Thread.run(Thread.java:748)

There is no easy method to print stack trace given a thread handle. Thread#dumpStack dumps stack of current thread but we don't need that.
Cc @virajjasani @Apache9

}
});
return nonDaemonThreadCount.get() > 0;
}

/*
Print stack trace of the passed thread
*/
public static String printStackTrace(Thread t) {
StringBuilder sb = new StringBuilder();
for (StackTraceElement frame: t.getStackTrace()) {
sb.append("\n").append(" ").append(frame.toString());
}
return sb.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.hadoop.hbase.util.Threads.isNonDaemonThreadRunning;

/**
* Base class for command lines that start up various HBase daemons.
Expand Down Expand Up @@ -141,15 +142,31 @@ public static void logProcessInfo(Configuration conf) {
}

/**
* Parse and run the given command line. This may exit the JVM if
* a nonzero exit code is returned from <code>run()</code>.
* Parse and run the given command line. This will exit the JVM with
* the exit code returned from <code>run()</code>.
* If return code is 0, wait for atmost 30 seconds for all non-daemon threads to quit,
* otherwise exit the jvm
*/
public void doMain(String args[]) {
try {
int ret = ToolRunner.run(HBaseConfiguration.create(), this, args);
if (ret != 0) {
System.exit(ret);
}
// Return code is 0 here.
boolean forceStop = false;
long startTime = EnvironmentEdgeManager.currentTime();
while (isNonDaemonThreadRunning()) {
if (EnvironmentEdgeManager.currentTime() - startTime > 30 * 1000) {
forceStop = true;
break;
}
Thread.sleep(1000);
}
if (forceStop) {
LOG.error("Failed to stop all non-daemon threads, so terminating JVM");
System.exit(-1);
}
} catch (Exception e) {
LOG.error("Failed to run", e);
System.exit(-1);
Expand Down