From 070a2c38134a4ff4afaca150218289a3c8927603 Mon Sep 17 00:00:00 2001 From: Mark Logan <103447440+mystenmark@users.noreply.github.com> Date: Fri, 30 Aug 2024 01:58:44 +0100 Subject: [PATCH] Improvements to thread stall monitor (#19151) Use a watchdog thread for monitoring thread stalls. This allows us to detect a thread stall while it is still occurring, rather than only after the fact. --- .../src/thread_stall_monitor.rs | 92 ++++++++++++++----- 1 file changed, 71 insertions(+), 21 deletions(-) diff --git a/crates/mysten-metrics/src/thread_stall_monitor.rs b/crates/mysten-metrics/src/thread_stall_monitor.rs index a23c0d2783465..4a214518f35b7 100644 --- a/crates/mysten-metrics/src/thread_stall_monitor.rs +++ b/crates/mysten-metrics/src/thread_stall_monitor.rs @@ -1,7 +1,11 @@ // Copyright (c) Mysten Labs, Inc. // SPDX-License-Identifier: Apache-2.0 +use std::sync::Arc; +use std::sync::Mutex; use std::sync::Once; +use std::time::Duration; +use std::time::Instant; use tracing::{error, info}; @@ -9,11 +13,44 @@ use crate::{get_metrics, spawn_logged_monitored_task}; static THREAD_STALL_MONITOR: Once = Once::new(); -const MONITOR_INTERVAL: std::time::Duration = std::time::Duration::from_millis(500); +const MONITOR_INTERVAL: Duration = Duration::from_millis(50); +const ALERT_THRESHOLD: Duration = Duration::from_millis(500); + +// These funcs are extern in order to be easily findable by debuggers +// To catch a thread stall in the act, do the following: +// +// Create a file `gdbcmd` with +// +// set logging file gdb.txt +// set logging on +// set pagination off +// set breakpoint pending on +// +// b thread_monitor_report_stall +// commands +// thread apply all bt +// continue +// end +// +// Then run gdb with: +// gdb -x gdbmcmd -p +// +// You will need to type `c` to continue the process after it loads. +// +// The debugger will now print out all thread stacks every time a thread stall is detected. +#[inline(never)] +extern "C" fn thread_monitor_report_stall(duration_ms: u64) { + error!("Thread stalled for {}ms", duration_ms); +} + +#[inline(never)] +extern "C" fn thread_monitor_report_stall_cleared(duration_ms: u64) { + error!("Thread stall cleared after {}ms", duration_ms); +} /// Monitors temporary stalls in tokio scheduling every MONITOR_INTERVAL. -/// Logs an error and increments a metric if more than 2 * MONITOR_INTERVAL has elapsed, -/// which means the stall lasted longer than MONITOR_INTERVAL. +/// Calls `thread_monitor_report_stall` if more than ALERT_THRESHOLD has elapsed. +/// When the stall clears, we observer the duration in a histogram. pub fn start_thread_stall_monitor() { let mut called = true; THREAD_STALL_MONITOR.call_once(|| { @@ -27,28 +64,41 @@ pub fn start_thread_stall_monitor() { return; } + let last_update: Arc> = Arc::new(Mutex::new(Instant::now())); + + { + let last_update = last_update.clone(); + std::thread::spawn(move || { + info!("Starting thread stall monitor watchdog thread"); + let mut stall_duration = None; + + loop { + std::thread::sleep(MONITOR_INTERVAL); + let now = Instant::now(); + let last_update = *last_update.lock().unwrap(); + let time_since_last_update = now - last_update; + if time_since_last_update > ALERT_THRESHOLD { + if stall_duration.is_none() { + thread_monitor_report_stall(time_since_last_update.as_millis() as u64); + } + stall_duration = Some(time_since_last_update); + } else if let Some(dur) = stall_duration { + stall_duration = None; + thread_monitor_report_stall_cleared(dur.as_millis() as u64); + if let Some(metrics) = get_metrics() { + metrics.thread_stall_duration_sec.observe(dur.as_secs_f64()); + } + } + } + }); + } + spawn_logged_monitored_task!( async move { - let Some(metrics) = get_metrics() else { - info!("Metrics uninitialized, not starting thread stall monitor."); - return; - }; - let mut last_sleep_time = tokio::time::Instant::now(); + info!("Starting thread stall monitor update task"); loop { tokio::time::sleep(MONITOR_INTERVAL).await; - let current_time = tokio::time::Instant::now(); - let stalled_duration = current_time - last_sleep_time - MONITOR_INTERVAL; - last_sleep_time = current_time; - if stalled_duration > MONITOR_INTERVAL { - metrics - .thread_stall_duration_sec - .observe(stalled_duration.as_secs_f64()); - // TODO: disable this in simulation tests with artificial thread stalls? - error!( - "Thread stalled for {}s. Possible causes include CPU overload or too much blocking calls.", - stalled_duration.as_secs_f64() - ); - } + *last_update.lock().unwrap() = Instant::now(); } }, "ThreadStallMonitor"