Skip to content

Commit 0d4ca9c

Browse files
committed
all tasks profiler
1 parent 17445fe commit 0d4ca9c

File tree

8 files changed

+420
-215
lines changed

8 files changed

+420
-215
lines changed

src/julia_internal.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,23 @@ JL_DLLEXPORT void jl_unlock_profile_wr(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEA
211211
int jl_lock_stackwalk(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
212212
void jl_unlock_stackwalk(int lockret) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;
213213

214+
arraylist_t *jl_get_all_tasks_arraylist(void) JL_NOTSAFEPOINT;
215+
// Result pair returned by value from `jl_record_backtrace`.
typedef struct {
216+
size_t bt_size; // presumably the number of entries written to `bt_data` -- TODO confirm against the definition
217+
int tid; // presumably the id of the thread involved in the recording -- TODO confirm
218+
} jl_record_backtrace_result_t;
219+
JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, struct _jl_bt_element_t *bt_data,
220+
size_t max_bt_size) JL_NOTSAFEPOINT;
221+
extern volatile struct _jl_bt_element_t *profile_bt_data_prof;
222+
extern volatile size_t profile_bt_size_max;
223+
extern volatile size_t profile_bt_size_cur;
224+
extern volatile int profile_running;
225+
extern volatile int profile_all_tasks;
226+
// Returns nonzero only while both `profile_running` and `profile_all_tasks` are set.
STATIC_INLINE int all_tasks_profile_running(void) JL_NOTSAFEPOINT
227+
{
228+
return profile_running && profile_all_tasks;
229+
}
230+
214231
// number of cycles since power-on
215232
static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT
216233
{

src/signal-handling.c

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,46 +18,46 @@ extern "C" {
1818
#include <threading.h>
1919

2020
// Profiler control variables
21-
// Note: these "static" variables are also used in "signals-*.c"
22-
static volatile jl_bt_element_t *bt_data_prof = NULL;
23-
static volatile size_t bt_size_max = 0;
24-
static volatile size_t bt_size_cur = 0;
21+
volatile jl_bt_element_t *profile_bt_data_prof = NULL;
22+
volatile size_t profile_bt_size_max = 0;
23+
volatile size_t profile_bt_size_cur = 0;
2524
static volatile uint64_t nsecprof = 0;
26-
static volatile int running = 0;
27-
static const uint64_t GIGA = 1000000000ULL;
25+
volatile int profile_running = 0;
26+
volatile int profile_all_tasks = 0;
27+
static const uint64_t GIGA = 1000000000ULL;
2828
// Timers to take samples at intervals
2929
JL_DLLEXPORT void jl_profile_stop_timer(void);
30-
JL_DLLEXPORT int jl_profile_start_timer(void);
30+
JL_DLLEXPORT int jl_profile_start_timer(uint8_t);
3131

3232
///////////////////////
3333
// Utility functions //
3434
///////////////////////
3535
JL_DLLEXPORT int jl_profile_init(size_t maxsize, uint64_t delay_nsec)
3636
{
37-
bt_size_max = maxsize;
37+
profile_bt_size_max = maxsize;
3838
nsecprof = delay_nsec;
39-
if (bt_data_prof != NULL)
40-
free((void*)bt_data_prof);
41-
bt_data_prof = (jl_bt_element_t*) calloc(maxsize, sizeof(jl_bt_element_t));
42-
if (bt_data_prof == NULL && maxsize > 0)
39+
if (profile_bt_data_prof != NULL)
40+
free((void*)profile_bt_data_prof);
41+
profile_bt_data_prof = (jl_bt_element_t*) calloc(maxsize, sizeof(jl_bt_element_t));
42+
if (profile_bt_data_prof == NULL && maxsize > 0)
4343
return -1;
44-
bt_size_cur = 0;
44+
profile_bt_size_cur = 0;
4545
return 0;
4646
}
4747

4848
JL_DLLEXPORT uint8_t *jl_profile_get_data(void)
4949
{
50-
return (uint8_t*) bt_data_prof;
50+
return (uint8_t*) profile_bt_data_prof;
5151
}
5252

5353
JL_DLLEXPORT size_t jl_profile_len_data(void)
5454
{
55-
return bt_size_cur;
55+
return profile_bt_size_cur;
5656
}
5757

5858
JL_DLLEXPORT size_t jl_profile_maxlen_data(void)
5959
{
60-
return bt_size_max;
60+
return profile_bt_size_max;
6161
}
6262

6363
JL_DLLEXPORT uint64_t jl_profile_delay_nsec(void)
@@ -67,12 +67,12 @@ JL_DLLEXPORT uint64_t jl_profile_delay_nsec(void)
6767

6868
JL_DLLEXPORT void jl_profile_clear_data(void)
6969
{
70-
bt_size_cur = 0;
70+
profile_bt_size_cur = 0;
7171
}
7272

7373
JL_DLLEXPORT int jl_profile_is_running(void)
7474
{
75-
return running;
75+
return profile_running;
7676
}
7777

7878
// Any function that acquires this lock must be either an unmanaged thread
@@ -184,7 +184,7 @@ JL_DLLEXPORT int jl_profile_is_buffer_full(void)
184184
// Declare buffer full if there isn't enough room to sample even just the
185185
// thread metadata and one max-sized frame. The `+ 6` is for the two block
186186
// terminator `0`'s plus the 4 metadata entries.
187-
return bt_size_cur + ((JL_BT_MAX_ENTRY_SIZE + 1) + 6) > bt_size_max;
187+
return profile_bt_size_cur + ((JL_BT_MAX_ENTRY_SIZE + 1) + 6) > profile_bt_size_max;
188188
}
189189

190190
static uint64_t jl_last_sigint_trigger = 0;

src/signals-mach.c

Lines changed: 96 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -724,6 +724,85 @@ void jl_unlock_stackwalk(int lockret)
724724
jl_unlock_profile_mach(1, lockret);
725725
}
726726

727+
// Sample a single thread, identified by its index `tid`, into the profile buffer.
// Assumes the caller holds `jl_lock_profile_mach`.
//
// If `jl_profile_is_buffer_full()` reports no room, the timer is stopped and
// nothing is recorded. Otherwise the thread is suspended via
// `jl_thread_suspend_and_get_state2`; if that call reports failure, the function
// returns without sampling. While `profile_running` is set, the backtrace is
// appended at `profile_bt_size_cur`, followed by four metadata entries
// (thread id + 1, current task, cycle clock, sleep state + 1) and two `0`
// terminators. The thread is resumed before returning.
728+
void jl_profile_thread_mach(int tid)
729+
{
730+
// if there is no space left, return early
731+
if (jl_profile_is_buffer_full()) {
732+
jl_profile_stop_timer();
733+
return;
734+
}
735+
if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
736+
_dyld_dlopen_atfork_prepare();
737+
if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
738+
_dyld_atfork_prepare(); // briefly acquire the dlsym lock
739+
host_thread_state_t state;
740+
int valid_thread = jl_thread_suspend_and_get_state2(tid, &state);
741+
unw_context_t *uc = (unw_context_t*)&state;
742+
if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
743+
_dyld_atfork_parent(); // quickly release the dlsym lock
744+
if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
745+
_dyld_dlopen_atfork_parent();
746+
if (!valid_thread)
747+
return;
748+
if (profile_running) {
749+
#ifdef LLVMLIBUNWIND
750+
/*
751+
* Unfortunately compact unwind info is incorrectly generated for quite a number of
752+
* libraries by quite a large number of compilers. We can fall back to DWARF unwind info
753+
* in some cases, but in quite a number of cases (especially libraries not compiled in debug
754+
* mode), only the compact unwind info may be available. Even more unfortunately, there is no
755+
* way to detect such bogus compact unwind info (other than noticing the resulting segfault).
756+
* What we do here is ugly, but necessary until the compact unwind info situation improves.
757+
* We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info.
758+
* Note that in a small number of cases this may result in bogus stack traces, but at least the topmost
759+
* entry will always be correct, and the number of cases in which this is an issue is rather small.
760+
* Other than that, this implementation is not incorrect as the other thread is paused while we are profiling
761+
* and during stack unwinding we only ever read memory, but never write it.
762+
*/
763+
764+
forceDwarf = 0;
765+
unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point
766+
767+
if (forceDwarf == 0) {
768+
// Save the backtrace
769+
profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL);
770+
}
771+
else if (forceDwarf == 1) {
772+
profile_bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL);
773+
}
774+
else if (forceDwarf == -1) {
775+
jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
776+
}
777+
778+
forceDwarf = -2;
779+
#else
780+
profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL);
781+
#endif
782+
jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
783+
784+
// store threadid but add 1 as 0 is preserved to indicate end of block
785+
profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls->tid + 1;
786+
787+
// store task id (never null)
788+
profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
789+
790+
// store cpu cycle clock
791+
profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock();
792+
793+
// store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
794+
profile_bt_data_prof[profile_bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1;
795+
796+
// Mark the end of this block with two 0's
797+
profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
798+
profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
799+
}
800+
// We're done! Resume the thread.
801+
jl_thread_resume(tid);
802+
}
803+
804+
void jl_profile_task_unix(void);
805+
727806
void *mach_profile_listener(void *arg)
728807
{
729808
(void)arg;
@@ -741,88 +820,21 @@ void *mach_profile_listener(void *arg)
741820
// sample each thread, round-robin style in reverse order
742821
// (so that thread zero gets notified last)
743822
int keymgr_locked = jl_lock_profile_mach(0);
744-
745823
int nthreads = jl_atomic_load_acquire(&jl_n_threads);
746-
int *randperm = profile_get_randperm(nthreads);
747-
for (int idx = nthreads; idx-- > 0; ) {
748-
// Stop the threads in the random or reverse round-robin order.
749-
int i = randperm[idx];
750-
// if there is no space left, break early
751-
if (jl_profile_is_buffer_full()) {
752-
jl_profile_stop_timer();
753-
break;
754-
}
755-
756-
if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
757-
_dyld_dlopen_atfork_prepare();
758-
if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
759-
_dyld_atfork_prepare(); // briefly acquire the dlsym lock
760-
host_thread_state_t state;
761-
int valid_thread = jl_thread_suspend_and_get_state2(i, &state);
762-
unw_context_t *uc = (unw_context_t*)&state;
763-
if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
764-
_dyld_atfork_parent(); // quickly release the dlsym lock
765-
if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
766-
_dyld_dlopen_atfork_parent();
767-
if (!valid_thread)
768-
continue;
769-
if (running) {
770-
#ifdef LLVMLIBUNWIND
771-
/*
772-
* Unfortunately compact unwind info is incorrectly generated for quite a number of
773-
* libraries by quite a large number of compilers. We can fall back to DWARF unwind info
774-
* in some cases, but in quite a number of cases (especially libraries not compiled in debug
775-
* mode, only the compact unwind info may be available). Even more unfortunately, there is no
776-
* way to detect such bogus compact unwind info (other than noticing the resulting segfault).
777-
* What we do here is ugly, but necessary until the compact unwind info situation improves.
778-
* We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info.
779-
* Note that in a small number of cases this may result in bogus stack traces, but at least the topmost
780-
* entry will always be correct, and the number of cases in which this is an issue is rather small.
781-
* Other than that, this implementation is not incorrect as the other thread is paused while we are profiling
782-
* and during stack unwinding we only ever read memory, but never write it.
783-
*/
784-
785-
forceDwarf = 0;
786-
unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point
787-
788-
if (forceDwarf == 0) {
789-
// Save the backtrace
790-
bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
791-
}
792-
else if (forceDwarf == 1) {
793-
bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
794-
}
795-
else if (forceDwarf == -1) {
796-
jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
797-
}
798-
799-
forceDwarf = -2;
800-
#else
801-
bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
802-
#endif
803-
jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[i];
804-
805-
// META_OFFSET_THREADID store threadid but add 1 as 0 is preserved to indicate end of block
806-
bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1;
807-
808-
// META_OFFSET_TASKID store task id (never null)
809-
bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
810-
811-
// META_OFFSET_CPUCYCLECLOCK store cpu cycle clock
812-
bt_data_prof[bt_size_cur++].uintptr = cycleclock();
813-
814-
// META_OFFSET_SLEEPSTATE store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
815-
bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1;
816-
817-
// Mark the end of this block with two 0's
818-
bt_data_prof[bt_size_cur++].uintptr = 0;
819-
bt_data_prof[bt_size_cur++].uintptr = 0;
824+
if (profile_all_tasks) {
825+
// Don't take the stackwalk lock here since it's already taken in `jl_rec_backtrace`
826+
jl_profile_task_unix();
827+
}
828+
else {
829+
int *randperm = profile_get_randperm(nthreads);
830+
for (int idx = nthreads; idx-- > 0; ) {
831+
// Stop the threads in random order.
832+
int i = randperm[idx];
833+
jl_profile_thread_mach(i);
820834
}
821-
// We're done! Resume the thread.
822-
jl_thread_resume(i);
823835
}
824836
jl_unlock_profile_mach(0, keymgr_locked);
825-
if (running) {
837+
if (profile_running) {
826838
jl_check_profile_autostop();
827839
// Reset the alarm
828840
kern_return_t ret = clock_alarm(clk, TIME_RELATIVE, timerprof, profile_port);
@@ -831,7 +843,8 @@ void *mach_profile_listener(void *arg)
831843
}
832844
}
833845

834-
JL_DLLEXPORT int jl_profile_start_timer(void)
846+
847+
JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks)
835848
{
836849
kern_return_t ret;
837850
if (!profile_started) {
@@ -860,7 +873,8 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
860873
timerprof.tv_sec = nsecprof/GIGA;
861874
timerprof.tv_nsec = nsecprof%GIGA;
862875

863-
running = 1;
876+
profile_running = 1;
877+
profile_all_tasks = all_tasks;
864878
// ensure the alarm is running
865879
ret = clock_alarm(clk, TIME_RELATIVE, timerprof, profile_port);
866880
HANDLE_MACH_ERROR("clock_alarm", ret);
@@ -870,5 +884,6 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
870884

871885
JL_DLLEXPORT void jl_profile_stop_timer(void)
872886
{
873-
running = 0;
887+
profile_running = 0;
888+
profile_all_tasks = 0;
874889
}

0 commit comments

Comments
 (0)