diff --git a/NEWS.md b/NEWS.md index fd217eb7f954f0..e159a47c9c4ee9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -146,7 +146,7 @@ Standard library changes * Now uses `textwidth` for formatting `%s` and `%c` widths ([#41085]). #### Profile -* Profiling now records sample metadata including thread and task. `Profile.print()` has a new `groupby` kwarg that allows +* CPU profiling now records sample metadata including thread and task. `Profile.print()` has a new `groupby` kwarg that allows grouping by thread, task, or nested thread/task, task/thread, and `threads` and `tasks` kwargs to allow filtering. Further, percent utilization is now reported as a total or per-thread, based on whether the thread is idle or not at each sample. `Profile.fetch()` by default strips out the new metadata to ensure backwards compatibility with external @@ -154,6 +154,9 @@ Standard library changes * The new `Profile.Allocs` module allows memory allocations to be profiled. The stack trace, type, and size of each allocation is recorded, and a `sample_rate` argument allows a tunable amount of allocations to be skipped, reducing performance overhead. ([#42768]) +* A fixed duration cpu profile can now be triggered by the user during running tasks without `Profile` being loaded + first and the report will show during execution. On MacOS & FreeBSD press `ctrl-t` or raise a `SIGINFO`. + For other platforms raise a `SIGUSR1` i.e. `% kill -USR1 $julia_pid`. Not currently available on windows. ([#43179]) #### Random diff --git a/contrib/generate_precompile.jl b/contrib/generate_precompile.jl index 983f5c3c8ef363..f2d39918ffe0d3 100644 --- a/contrib/generate_precompile.jl +++ b/contrib/generate_precompile.jl @@ -221,6 +221,7 @@ Profile = get(Base.loaded_modules, Base.PkgId(Base.UUID("9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"), "Profile"), nothing) if Profile !== nothing + repl_script *= Profile.precompile_script hardcoded_precompile_statements *= """ precompile(Tuple{typeof(Profile.tree!), Profile.StackFrameTree{UInt64}, Vector{UInt64}, Dict{UInt64, Vector{Base.StackTraces.StackFrame}}, Bool, Symbol, Int, UInt}) precompile(Tuple{typeof(Profile.tree!), Profile.StackFrameTree{UInt64}, Vector{UInt64}, Dict{UInt64, Vector{Base.StackTraces.StackFrame}}, Bool, Symbol, Int, UnitRange{UInt}}) diff --git a/src/julia_internal.h b/src/julia_internal.h index 96d064c9df067a..759fd8e3c76547 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -128,6 +128,10 @@ int jl_running_under_rr(int recheck) JL_NOTSAFEPOINT; // Returns time in nanosec JL_DLLEXPORT uint64_t jl_hrtime(void) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_set_peek_cond(uintptr_t); +JL_DLLEXPORT double jl_get_profile_peek_duration(void); +JL_DLLEXPORT void jl_set_profile_peek_duration(double); + // number of cycles since power-on static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT { diff --git a/src/signal-handling.c b/src/signal-handling.c index 0f3b715d430611..142f03b6c899db 100644 --- a/src/signal-handling.c +++ b/src/signal-handling.c @@ -129,6 +129,35 @@ static size_t jl_safe_read_mem(const volatile char *ptr, char *out, size_t len) return i; } +static double profile_autostop_time = -1.0; +static double profile_peek_duration = 1.0; // seconds + +double jl_get_profile_peek_duration(void) { + return profile_peek_duration; +} +void jl_set_profile_peek_duration(double t) { + profile_peek_duration = t; + return; +} + +uintptr_t profile_show_peek_cond_loc; +JL_DLLEXPORT void jl_set_peek_cond(uintptr_t cond) +{ + profile_show_peek_cond_loc = cond; + return; +} + +static void jl_check_profile_autostop(void) { + if ((profile_autostop_time != -1.0) && (jl_hrtime() > profile_autostop_time)) { + profile_autostop_time = -1.0; + jl_profile_stop_timer(); + jl_safe_printf("\n==============================================================\n"); + jl_safe_printf("Profile collected. A report will print at the next yield point\n"); + jl_safe_printf("==============================================================\n\n"); + uv_async_send((uv_async_t*)profile_show_peek_cond_loc); + } +} + #if defined(_WIN32) #include "signals-win.c" #else diff --git a/src/signals-mach.c b/src/signals-mach.c index 4ddfad4e10600f..130603931c5aa2 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -643,6 +643,7 @@ void *mach_profile_listener(void *arg) } jl_unlock_profile_mach(0, keymgr_locked); if (running) { + jl_check_profile_autostop(); // Reset the alarm kern_return_t ret = clock_alarm(clk, TIME_RELATIVE, timerprof, profile_port); HANDLE_MACH_ERROR("clock_alarm", ret) diff --git a/src/signals-unix.c b/src/signals-unix.c index 67af5c0b1358de..b2e3ef29520299 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -61,7 +61,6 @@ bt_context_t *jl_to_bt_context(void *sigctx) #endif } - static int thread0_exit_count = 0; static void jl_exit_thread0(int exitstate, jl_bt_element_t *bt_data, size_t bt_size); @@ -519,17 +518,22 @@ JL_DLLEXPORT int jl_profile_start_timer(void) sigprof.sigev_notify = SIGEV_SIGNAL; sigprof.sigev_signo = SIGUSR1; sigprof.sigev_value.sival_ptr = &timerprof; - if (timer_create(CLOCK_REALTIME, &sigprof, &timerprof) == -1) + // Because SIGUSR1 is multipurpose, set `running` before so that we know that the first SIGUSR1 came from the timer + running = 1; + if (timer_create(CLOCK_REALTIME, &sigprof, &timerprof) == -1) { + running = 0; return -2; + } // Start the timer itsprof.it_interval.tv_sec = 0; itsprof.it_interval.tv_nsec = 0; itsprof.it_value.tv_sec = nsecprof / GIGA; itsprof.it_value.tv_nsec = nsecprof % GIGA; - if (timer_settime(timerprof, 0, &itsprof, NULL) == -1) + if (timer_settime(timerprof, 0, &itsprof, NULL) == -1) { + running = 0; return -3; - running = 1; + } return 0; } @@ -648,6 +652,18 @@ static void kqueue_signal(int *sigqueue, struct kevent *ev, int sig) } #endif +void trigger_profile_peek(void) +{ + jl_safe_printf("\n======================================================================================\n"); + jl_safe_printf("Information request received. A stacktrace will print followed by a %.1f second profile\n", profile_peek_duration); + jl_safe_printf("======================================================================================\n"); + bt_size_cur = 0; // clear profile buffer + if (jl_profile_start_timer() < 0) + jl_safe_printf("ERROR: Could not start profile timer\n"); + else + profile_autostop_time = jl_hrtime() + (profile_peek_duration * 1e9); +} + static void *signal_listener(void *arg) { static jl_bt_element_t bt_data[JL_MAX_BT_SIZE + 1]; @@ -753,11 +769,17 @@ static void *signal_listener(void *arg) int doexit = critical; #ifdef SIGINFO - if (sig == SIGINFO) + if (sig == SIGINFO) { + if (running != 1) + trigger_profile_peek(); doexit = 0; + } #else - if (sig == SIGUSR1) + if (sig == SIGUSR1) { + if (running != 1) + trigger_profile_peek(); doexit = 0; + } #endif bt_size = 0; @@ -835,6 +857,7 @@ static void *signal_listener(void *arg) } #ifndef HAVE_MACH if (profile && running) { + jl_check_profile_autostop(); #if defined(HAVE_TIMER) timer_settime(timerprof, 0, &itsprof, NULL); #elif defined(HAVE_ITIMER) diff --git a/src/signals-win.c b/src/signals-win.c index fefb29a529847d..178a7463b8d502 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -388,6 +388,7 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) jl_gc_debug_critical_error(); abort(); } + jl_check_profile_autostop(); } } } diff --git a/stdlib/Profile/docs/src/index.md b/stdlib/Profile/docs/src/index.md index 89894723b11165..8701dded0d4275 100644 --- a/stdlib/Profile/docs/src/index.md +++ b/stdlib/Profile/docs/src/index.md @@ -2,6 +2,79 @@ ## CPU Profiling +There are two main approaches to CPU profiling julia code: + +## Via `@profile` + +Where profiling is enabled for a given call via the `@profile` macro. + +```julia-repl +julia> using Profile + +julia> @profile foo() + +julia> Profile.print() +Overhead ╎ [+additional indent] Count File:Line; Function +========================================================= + ╎147 @Base/client.jl:506; _start() + ╎ 147 @Base/client.jl:318; exec_options(opts::Base.JLOptions) +... +``` + +## Triggered During Execution + +Tasks that are already running can also be profiled for a fixed time period at any user-triggered time. + +To trigger the profiling: +- MacOS & FreeBSD (BSD-based platforms): Use `ctrl-t` or pass a `SIGINFO` signal to the julia process i.e. `% kill -INFO $julia_pid` +- Linux: Pass a `SIGUSR1` signal to the julia process i.e. `% kill -USR1 $julia_pid` +- Windows: Not currently supported. + +First, a single stack trace at the instant that the signal was thrown is shown, then a 1 second profile is collected, +followed by the profile report at the next yield point, which may be at task completion for code without yield points +e.g. tight loops. + +```julia-repl +julia> foo() +##== the user sends a trigger while foo is running ==## +load: 2.53 cmd: julia 88903 running 6.16u 0.97s + +====================================================================================== +Information request received. A stacktrace will print followed by a 1.0 second profile +====================================================================================== + +signal (29): Information request: 29 +__psynch_cvwait at /usr/lib/system/libsystem_kernel.dylib (unknown line) +_pthread_cond_wait at /usr/lib/system/libsystem_pthread.dylib (unknown line) +... + +====================================================================== +Profile collected. A report will print if the Profile module is loaded +====================================================================== + +Overhead ╎ [+additional indent] Count File:Line; Function +========================================================= +Thread 1 Task 0x000000011687c010 Total snapshots: 572. Utilization: 100% + ╎147 @Base/client.jl:506; _start() + ╎ 147 @Base/client.jl:318; exec_options(opts::Base.JLOptions) +... + +Thread 2 Task 0x0000000116960010 Total snapshots: 572. Utilization: 0% + ╎572 @Base/task.jl:587; task_done_hook(t::Task) + ╎ 572 @Base/task.jl:879; wait() +... +``` + +### Customization + +The duration of the profiling can be adjusted via [`Profile.set_peek_duration`](@ref) + +The profile report is broken down by thread and task. Pass a no-arg function to `Profile.peek_report[]` to override this. +i.e. `Profile.peek_report[] = () -> Profile.print()` to remove any grouping. This could also be overridden by an external +profile data consumer. + +## Reference + ```@docs Profile.@profile ``` @@ -16,6 +89,8 @@ Profile.fetch Profile.retrieve Profile.callers Profile.clear_malloc_data +Profile.get_peek_duration +Profile.set_peek_duration ``` ## Memory profiling diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index 6916f400e16daa..a5f383b77efa1a 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -34,6 +34,52 @@ macro profile(ex) end end +# triggers printing the report after a SIGINFO/SIGUSR1 profile request +const PROFILE_PRINT_COND = Ref{Base.AsyncCondition}() +function profile_printing_listener() + try + while true + wait(PROFILE_PRINT_COND[]) + peek_report[]() + end + catch ex + if !isa(ex, InterruptException) + @error "Profile printing listener crashed" exception=ex,catch_backtrace() + end + end +end + +# An internal function called to show the report after an information request (SIGINFO or SIGUSR1). +function _peek_report() + iob = IOBuffer() + ioc = IOContext(IOContext(iob, stdout), :displaysize=>displaysize(stdout)) + print(ioc, groupby = [:thread, :task]) + Base.print(stdout, String(resize!(iob.data, iob.size))) +end +# This is a ref so that it can be overridden by other profile info consumers. +const peek_report = Ref{Function}(_peek_report) + +""" + get_peek_duration() + +Get the duration in seconds of the profile "peek" that is triggered via `SIGINFO` or `SIGUSR1`, depending on platform. +""" +get_peek_duration() = ccall(:jl_get_profile_peek_duration, Float64, ()) +""" + set_peek_duration(t::Float64) + +Set the duration in seconds of the profile "peek" that is triggered via `SIGINFO` or `SIGUSR1`, depending on platform. +""" +set_peek_duration(t::Float64) = ccall(:jl_set_profile_peek_duration, Cvoid, (Float64,), t) + +precompile_script = """ +let _Profile = Base.PkgId(Base.UUID("9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"), "Profile") + Base.loaded_modules[_Profile].@profile sleep(1) + Base.loaded_modules[_Profile].peek_report[]() + Base.loaded_modules[_Profile].clear() +end +""" + #### #### User-level functions #### @@ -81,15 +127,23 @@ function init(n::Integer, delay::Real; limitwarn::Bool = true) end end -# init with default values -# Use a max size of 10M profile samples, and fire timer every 1ms -# (that should typically give around 100 seconds of record) -if Sys.iswindows() && Sys.WORD_SIZE == 32 - # The Win32 unwinder is 1000x slower than elsewhere (around 1ms/frame), - # so we don't want to slow the program down by quite that much - __init__() = init(1_000_000, 0.01, limitwarn = false) -else - __init__() = init(10_000_000, 0.001, limitwarn = false) +function __init__() + # init with default values + # Use a max size of 10M profile samples, and fire timer every 1ms + # (that should typically give around 100 seconds of record) + @static if Sys.iswindows() && Sys.WORD_SIZE == 32 + # The Win32 unwinder is 1000x slower than elsewhere (around 1ms/frame), + # so we don't want to slow the program down by quite that much + n = 1_000_000 + delay = 0.01 + else + n = 10_000_000 + delay = 0.001 + end + init(n, delay, limitwarn = false) + PROFILE_PRINT_COND[] = Base.AsyncCondition() + ccall(:jl_set_peek_cond, Cvoid, (Ptr{Cvoid},), PROFILE_PRINT_COND[].handle) + errormonitor(Threads.@spawn(profile_printing_listener())) end """ @@ -129,7 +183,7 @@ const META_OFFSET_TASKID = 4 const META_OFFSET_THREADID = 5 """ - print([io::IO = stdout,] [data::Vector]; kwargs...) + print([io::IO = stdout,] [data::Vector = fetch()], [lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data)]; kwargs...) Prints profiling results to `io` (by default, `stdout`). If you do not supply a `data` vector, the internal buffer of accumulated backtraces @@ -170,7 +224,7 @@ The keyword arguments can be any combination of: does not control which tasks samples are collected within. """ function print(io::IO, - data::Vector{<:Unsigned} = fetch(include_meta = true), + data::Vector{<:Unsigned} = fetch(), lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data) ; format = :tree, @@ -206,7 +260,7 @@ function print(io::IO, nl = length(threadids) > 1 ? "\n" : "" printstyled(io, "Task $(Base.repr(taskid))$nl"; bold=true, color=Base.debug_color()) for threadid in threadids - printstyled(io, " Thread $threadid\n"; bold=true, color=Base.info_color()) + printstyled(io, " Thread $threadid "; bold=true, color=Base.info_color()) nosamples = print(io, data, lidict, pf, format, threadid, taskid, true) nosamples && (any_nosamples = true) println(io) @@ -222,7 +276,7 @@ function print(io::IO, nl = length(taskids) > 1 ? "\n" : "" printstyled(io, "Thread $threadid$nl"; bold=true, color=Base.info_color()) for taskid in taskids - printstyled(io, " Task $(Base.repr(taskid))\n"; bold=true, color=Base.debug_color()) + printstyled(io, " Task $(Base.repr(taskid)) "; bold=true, color=Base.debug_color()) nosamples = print(io, data, lidict, pf, format, threadid, taskid, true) nosamples && (any_nosamples = true) println(io) @@ -232,7 +286,7 @@ function print(io::IO, elseif groupby == :task threads = 1:typemax(Int) for taskid in intersect(get_task_ids(data), tasks) - printstyled(io, "Task $(Base.repr(taskid))\n"; bold=true, color=Base.debug_color()) + printstyled(io, "Task $(Base.repr(taskid)) "; bold=true, color=Base.debug_color()) nosamples = print(io, data, lidict, pf, format, threads, taskid, true) nosamples && (any_nosamples = true) println(io) @@ -240,7 +294,7 @@ function print(io::IO, elseif groupby == :thread tasks = 1:typemax(UInt) for threadid in intersect(get_thread_ids(data), threads) - printstyled(io, "Thread $threadid\n"; bold=true, color=Base.info_color()) + printstyled(io, "Thread $threadid "; bold=true, color=Base.info_color()) nosamples = print(io, data, lidict, pf, format, threadid, tasks, true) nosamples && (any_nosamples = true) println(io) @@ -251,6 +305,18 @@ function print(io::IO, return end +""" + print([io::IO = stdout,] data::Vector, lidict::LineInfoDict; kwargs...) + +Prints profiling results to `io`. This variant is used to examine results exported by a +previous call to [`retrieve`](@ref). Supply the vector `data` of backtraces and +a dictionary `lidict` of line information. + +See `Profile.print([io], data)` for an explanation of the valid keyword arguments. +""" +print(data::Vector{<:Unsigned} = fetch(), lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data); kwargs...) = + print(stdout, data, lidict; kwargs...) + function print(io::IO, data::Vector{<:Unsigned}, lidict::Union{LineInfoDict, LineInfoFlatDict}, fmt::ProfileFormat, format::Symbol, threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool = false) @@ -318,18 +384,6 @@ function has_meta(data) return false end -""" - print([io::IO = stdout,] data::Vector, lidict::LineInfoDict; kwargs...) - -Prints profiling results to `io`. This variant is used to examine results exported by a -previous call to [`retrieve`](@ref). Supply the vector `data` of backtraces and -a dictionary `lidict` of line information. - -See `Profile.print([io], data)` for an explanation of the valid keyword arguments. -""" -print(data::Vector{<:Unsigned} = fetch(include_meta = true), lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data); kwargs...) = - print(stdout, data, lidict; kwargs...) - """ retrieve(; kwargs...) -> data, lidict @@ -541,10 +595,10 @@ depends on the exact memory addresses used in JIT-compiling. This function is pr internal use; [`retrieve`](@ref) may be a better choice for most users. By default metadata such as threadid and taskid is included. Set `include_meta` to `false` to strip metadata. """ -function fetch(;include_meta = true) +function fetch(;include_meta = true, limitwarn = true) maxlen = maxlen_data() len = len_data() - if is_buffer_full() + if limitwarn && is_buffer_full() @warn """The profile data buffer is full; profiling probably terminated before your program finished. To profile for longer runs, call `Profile.init()` with a larger buffer and/or larger delay.""" @@ -1058,22 +1112,24 @@ function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, Line root, nsleeping = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur, threads, tasks) end util_perc = (1 - (nsleeping / root.count)) * 100 + !is_subsection && print_tree(io, root, cols, fmt, is_subsection) if isempty(root.down) if is_subsection Base.print(io, "Total snapshots: ") printstyled(io, "$(root.count)", color=Base.warn_color()) - Base.println(io, " (", round(Int, util_perc), "% utilization)") + Base.println(io, ". Utilization: ", round(Int, util_perc), "%") else warning_empty() end return true + else + Base.print(io, "Total snapshots: ", root.count, ". Utilization: ", round(Int, util_perc), "%") end - print_tree(io, root, cols, fmt, is_subsection) - Base.print(io, "Total snapshots: ", root.count, " (", round(Int, util_perc), "% utilization") if is_subsection - println(io, ")") + println(io) + print_tree(io, root, cols, fmt, is_subsection) else - println(io, " across all threads and tasks. Use the `groupby` kwarg to break down by thread and/or task)") + println(io, " across all threads and tasks. Use the `groupby` kwarg to break down by thread and/or task") end return false end diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl index ddddac55fcd931..50917b9797c7dc 100644 --- a/stdlib/Profile/test/runtests.jl +++ b/stdlib/Profile/test/runtests.jl @@ -187,6 +187,49 @@ let cmd = Base.julia_cmd() @test parse(Int, s) > 100 end +if Sys.isbsd() || Sys.islinux() + @testset "SIGINFO/SIGUSR1 profile triggering" begin + let cmd = Base.julia_cmd() + script = """ + x = rand(1000, 1000) + println("started") + while true + x * x + yield() + end + """ + iob = Base.BufferStream() + p = run(pipeline(`$cmd -e $script`, stderr = devnull, stdout = iob), wait = false) + t = Timer(60) do t # should be done in under 10 seconds + kill(p, Base.SIGKILL) + sleep(5) + close(iob) + end + try + s = readuntil(iob, "started", keep = true) + @assert occursin("started", s) + @assert process_running(p) + for _ in 1:2 + sleep(2) + if Sys.isbsd() + kill(p, 29) # SIGINFO + elseif Sys.islinux() + kill(p, 10) # SIGUSR1 + end + s = readuntil(iob, "Overhead ╎", keep = true) + @test process_running(p) + @test occursin("Overhead ╎", s) + end + finally + kill(p, Base.SIGKILL) + close(t) + end + end + end +else + @warn "Skipping \"SIGINFO/SIGUSR1 profile triggering\" test as it is not supported on this platform" +end + @testset "FlameGraphs" begin # FlameGraphs makes use of some Profile's internals. Detect possible breakage by mimicking some of its tests. # Breakage is acceptable since these internals are not part of the stable API, but it's better to know, and ideally