Skip to content

Commit 1fff8cc

Browse files
authored
add option to dump backtrace of stop-the-world straggler (JuliaLang#57045) (#208)
This is still a work in progress, but it should help determine what a straggler thread was doing during the stop-the-world phase and why it failed to reach a safepoint in a timely manner. We've encountered long TTSP issues in production, and this tool should provide a valuable means to accurately diagnose them.
1 parent 25fb6f8 commit 1fff8cc

File tree

8 files changed

+82
-1
lines changed

8 files changed

+82
-1
lines changed

base/options.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ struct JLOptions
6161
trace_compile_timing::Int8
6262
safe_crash_log_file::Ptr{UInt8}
6363
task_metrics::Int8
64+
timeout_for_safepoint_straggler_s::Int16
6465
end
6566

6667
# This runs early in the sysimage != is not defined yet

src/gc.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,8 +245,22 @@ void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads)
245245
// We're currently also using atomic store release in mutator threads
246246
// (in jl_gc_state_set), but we may want to use signals to flush the
247247
// memory operations on those threads lazily instead.
248-
while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state))
248+
const int64_t timeout = jl_options.timeout_for_safepoint_straggler_s * 1000000000; // convert to nanoseconds
249+
uint64_t t0 = jl_hrtime();
250+
while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) {
249251
jl_cpu_pause(); // yield?
252+
if ((jl_hrtime() - t0) > timeout) {
253+
jl_safe_printf("===== Thread %d failed to reach safepoint after %d seconds, printing backtrace below =====\n", ptls2->tid + 1, jl_options.timeout_for_safepoint_straggler_s);
254+
// Try to record the backtrace of the straggler using `jl_try_record_thread_backtrace`
255+
jl_ptls_t ptls = jl_current_task->ptls;
256+
size_t bt_size = jl_try_record_thread_backtrace(ptls2, ptls->bt_data, JL_MAX_BT_SIZE);
257+
// Print the backtrace of the straggler
258+
for (size_t i = 0; i < bt_size; i += jl_bt_entry_size(ptls->bt_data + i)) {
259+
jl_print_bt_entry_codeloc(-1, ptls->bt_data + i);
260+
}
261+
t0 = jl_hrtime();
262+
}
263+
}
250264
}
251265
}
252266
}

src/jloptions.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ JL_DLLEXPORT void jl_init_options(void)
9494
0, // trace_compile_timing
9595
NULL, // safe_crash_log_file
9696
0, // task_metrics
97+
-1, // timeout_for_safepoint_straggler_s
9798
};
9899
jl_options_initialized = 1;
99100
}
@@ -214,6 +215,8 @@ static const char opts_hidden[] =
214215
" --output-asm <name> Generate an assembly file (.s)\n"
215216
" --output-incremental={yes|no*} Generate an incremental output file (rather than\n"
216217
" complete)\n"
218+
" --timeout-for-safepoint-straggler <seconds> If this value is set, then we will dump the backtrace for a thread\n"
219+
" that fails to reach a safepoint within the specified time\n"
217220
" --trace-compile={stderr|name} Print precompile statements for methods compiled\n"
218221
" during execution or save to stderr or a path. Methods that\n"
219222
" were recompiled are printed in yellow or with a trailing\n"
@@ -242,6 +245,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
242245
opt_warn_scope,
243246
opt_inline,
244247
opt_polly,
248+
opt_timeout_for_safepoint_straggler,
245249
opt_trace_compile,
246250
opt_trace_compile_timing,
247251
opt_trace_dispatch,
@@ -321,6 +325,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
321325
{ "warn-scope", required_argument, 0, opt_warn_scope },
322326
{ "inline", required_argument, 0, opt_inline },
323327
{ "polly", required_argument, 0, opt_polly },
328+
{ "timeout-for-safepoint-straggler", required_argument, 0, opt_timeout_for_safepoint_straggler },
324329
{ "trace-compile", required_argument, 0, opt_trace_compile },
325330
{ "trace-compile-timing", no_argument, 0, opt_trace_compile_timing },
326331
{ "trace-dispatch", required_argument, 0, opt_trace_dispatch },
@@ -881,6 +886,13 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp)
881886
if (jl_options.safe_crash_log_file == NULL)
882887
jl_error("julia: failed to allocate memory for --safe-crash-log-file");
883888
break;
889+
case opt_timeout_for_safepoint_straggler:
890+
errno = 0;
891+
long timeout = strtol(optarg, &endptr, 10);
892+
if (errno != 0 || optarg == endptr || timeout < 1 || timeout > INT16_MAX)
893+
jl_errorf("julia: --timeout-for-safepoint-straggler=<seconds>; seconds must be an integer between 1 and %d", INT16_MAX);
894+
jl_options.timeout_for_safepoint_straggler_s = (int16_t)timeout;
895+
break;
884896
case opt_task_metrics:
885897
if (!strcmp(optarg, "no"))
886898
jl_options.task_metrics = JL_OPTIONS_TASK_METRICS_OFF;

src/jloptions.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ typedef struct {
6565
int8_t trace_compile_timing;
6666
const char *safe_crash_log_file;
6767
int8_t task_metrics;
68+
int16_t timeout_for_safepoint_straggler_s;
6869
} jl_options_t;
6970

7071
#endif

src/julia_internal.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,8 @@ typedef struct {
213213
size_t bt_size;
214214
int tid;
215215
} jl_record_backtrace_result_t;
216+
JL_DLLEXPORT JL_DLLEXPORT size_t jl_try_record_thread_backtrace(jl_ptls_t ptls2, struct _jl_bt_element_t *bt_data,
217+
size_t max_bt_size) JL_NOTSAFEPOINT;
216218
JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, struct _jl_bt_element_t *bt_data,
217219
size_t max_bt_size, int all_tasks_profiler) JL_NOTSAFEPOINT;
218220
extern volatile struct _jl_bt_element_t *profile_bt_data_prof;

src/stackwalk.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -889,6 +889,26 @@ _os_ptr_munge(uintptr_t ptr)
889889

890890
extern bt_context_t *jl_to_bt_context(void *sigctx);
891891

892+
JL_DLLEXPORT size_t jl_try_record_thread_backtrace(jl_ptls_t ptls2, jl_bt_element_t *bt_data, size_t max_bt_size) JL_NOTSAFEPOINT
893+
{
894+
int16_t tid = ptls2->tid;
895+
jl_task_t *t = NULL;
896+
bt_context_t *context = NULL;
897+
bt_context_t c;
898+
int lockret = jl_lock_stackwalk();
899+
if (!jl_thread_suspend_and_get_state(tid, 0, &c)) {
900+
jl_unlock_stackwalk(lockret);
901+
return 0;
902+
}
903+
jl_unlock_stackwalk(lockret);
904+
// thread is stopped, safe to read the task it was running before we stopped it
905+
t = jl_atomic_load_relaxed(&ptls2->current_task);
906+
context = &c;
907+
size_t bt_size = rec_backtrace_ctx(bt_data, max_bt_size, context, ptls2->previous_task ? NULL : t->gcstack);
908+
jl_thread_resume(tid);
909+
return bt_size;
910+
}
911+
892912
JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, jl_bt_element_t *bt_data, size_t max_bt_size, int all_tasks_profiler) JL_NOTSAFEPOINT
893913
{
894914
int16_t tid = INT16_MAX;

test/cmdlineargs.jl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,3 +1126,9 @@ end
11261126
#heap-size-hint, we reserve 250 MB for non GC memory (llvm, etc.)
11271127
@test readchomp(`$(Base.julia_cmd()) --startup-file=no --heap-size-hint=500M -e "println(@ccall jl_gc_get_max_memory()::UInt64)"`) == "$((500-250)*1024*1024)"
11281128
end
1129+
1130+
@testset "--timeout-for-safepoint-straggler" begin
1131+
exename = `$(Base.julia_cmd())`
1132+
timeout = 120
1133+
@test parse(Int,read(`$exename --timeout-for-safepoint-straggler=$timeout -E "Base.JLOptions().timeout_for_safepoint_straggler_s"`, String)) == timeout
1134+
end

test/threads_exec.jl

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1313,4 +1313,29 @@ end
13131313
end
13141314
end
13151315

1316+
@testset "--timeout-for-safepoint-straggler command-line flag" begin
1317+
program = "
1318+
function main()
1319+
t = Threads.@spawn begin
1320+
ccall(:uv_sleep, Cvoid, (Cuint,), 5000)
1321+
end
1322+
# Force a GC
1323+
ccall(:uv_sleep, Cvoid, (Cuint,), 1000)
1324+
GC.gc()
1325+
wait(t)
1326+
end
1327+
main()
1328+
"
1329+
tmp_output_filename = tempname()
1330+
tmp_output_file = open(tmp_output_filename, "w")
1331+
if isnothing(tmp_output_file)
1332+
error("Failed to open file $tmp_output_filename")
1333+
end
1334+
run(pipeline(`$(Base.julia_cmd()) --threads=4 --timeout-for-safepoint-straggler=1 -e $program`, stderr=tmp_output_file))
1335+
# Check whether we printed the straggler's backtrace
1336+
@test !isempty(read(tmp_output_filename, String))
1337+
close(tmp_output_file)
1338+
rm(tmp_output_filename)
1339+
end
1340+
13161341
end # main testset

0 commit comments

Comments
 (0)