Skip to content

Commit b1e5a86

Browse files
authored
add try/catch around scheduler to reset sleep state (#54721)
Fixes #54700. This is mostly just an indentation change, so viewing the diff with whitespace changes hidden is recommended (both when reviewing and when backporting).
1 parent 525b95e commit b1e5a86

File tree

2 files changed

+138
-127
lines changed

2 files changed

+138
-127
lines changed

src/julia.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2376,8 +2376,8 @@ extern int had_exception;
23762376
size_t __excstack_state = jl_excstack_state(__eh_ct); \
23772377
jl_enter_handler(__eh_ct, &__eh); \
23782378
__eh_ct->eh = &__eh; \
2379-
if (1)
2380-
/* TRY BLOCK; */
2379+
for (i__try=1; i__try; i__try=0)
2380+
23812381
#define JL_CATCH \
23822382
if (!had_exception) \
23832383
jl_eh_restore_state_noexcept(__eh_ct, &__eh); \

src/scheduler.c

Lines changed: 136 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -456,145 +456,156 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q,
456456
}
457457
continue;
458458
}
459-
task = get_next_task(trypoptask, q); // note: this should not yield
460-
if (ptls != ct->ptls) {
461-
// sigh, a yield was detected, so let's go ahead and handle it anyway by starting over
462-
ptls = ct->ptls;
463-
if (set_not_sleeping(ptls)) {
464-
JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls);
459+
volatile int isrunning = 1;
460+
JL_TRY {
461+
task = get_next_task(trypoptask, q); // note: this should not yield
462+
if (ptls != ct->ptls) {
463+
// sigh, a yield was detected, so let's go ahead and handle it anyway by starting over
464+
ptls = ct->ptls;
465+
if (set_not_sleeping(ptls)) {
466+
JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls);
467+
}
468+
continue; // jump to JL_CATCH
465469
}
466-
if (task)
467-
return task;
468-
continue;
469-
}
470-
if (task) {
471-
if (set_not_sleeping(ptls)) {
472-
JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls);
470+
if (task) {
471+
if (set_not_sleeping(ptls)) {
472+
JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls);
473+
}
474+
continue; // jump to JL_CATCH
473475
}
474-
return task;
475-
}
476476

477-
// IO is always permitted, but outside a threaded region, only
478-
// thread 0 will process messages.
479-
// Inside a threaded region, any thread can listen for IO messages,
480-
// and one thread should win this race and watch the event loop,
481-
// but we bias away from idle threads getting parked here.
482-
//
483-
// The reason this works is somewhat convoluted, and closely tied to [^store_buffering_1]:
484-
// - After decrementing _threadedregion, the thread is required to
485-
// call jl_wakeup_thread(0), that will kick out any thread who is
486-
// already there, and then eventually thread 0 will get here.
487-
// - Inside a _threadedregion, there must exist at least one
488-
// thread that has a happens-before relationship on the libuv lock
489-
// before reaching this decision point in the code who will see
490-
// the lock as unlocked and thus must win this race here.
491-
int uvlock = 0;
492-
if (jl_atomic_load_relaxed(&_threadedregion)) {
493-
uvlock = jl_mutex_trylock(&jl_uv_mutex);
494-
}
495-
else if (ptls->tid == jl_atomic_load_relaxed(&io_loop_tid)) {
496-
uvlock = 1;
497-
JL_UV_LOCK();
498-
}
499-
else {
500-
// Since we might have started some IO work, we might need
501-
// to ensure tid = 0 will go watch that new event source.
502-
// If trylock would have succeeded, that may have been our
503-
// responsibility, so need to make sure thread 0 will take care
504-
// of us.
505-
if (jl_atomic_load_relaxed(&jl_uv_mutex.owner) == NULL) // aka trylock
506-
jl_wakeup_thread(0);
507-
}
508-
if (uvlock) {
509-
int enter_eventloop = may_sleep(ptls);
510-
int active = 0;
511-
if (jl_atomic_load_relaxed(&jl_uv_n_waiters) != 0)
512-
// if we won the race against someone who actually needs
513-
// the lock to do real work, we need to let them have it instead
514-
enter_eventloop = 0;
515-
if (enter_eventloop) {
516-
uv_loop_t *loop = jl_global_event_loop();
517-
loop->stop_flag = 0;
518-
JULIA_DEBUG_SLEEPWAKE( ptls->uv_run_enter = cycleclock() );
519-
active = uv_run(loop, UV_RUN_ONCE);
520-
JULIA_DEBUG_SLEEPWAKE( ptls->uv_run_leave = cycleclock() );
521-
jl_gc_safepoint();
477+
// IO is always permitted, but outside a threaded region, only
478+
// thread 0 will process messages.
479+
// Inside a threaded region, any thread can listen for IO messages,
480+
// and one thread should win this race and watch the event loop,
481+
// but we bias away from idle threads getting parked here.
482+
//
483+
// The reason this works is somewhat convoluted, and closely tied to [^store_buffering_1]:
484+
// - After decrementing _threadedregion, the thread is required to
485+
// call jl_wakeup_thread(0), that will kick out any thread who is
486+
// already there, and then eventually thread 0 will get here.
487+
// - Inside a _threadedregion, there must exist at least one
488+
// thread that has a happens-before relationship on the libuv lock
489+
// before reaching this decision point in the code who will see
490+
// the lock as unlocked and thus must win this race here.
491+
int uvlock = 0;
492+
if (jl_atomic_load_relaxed(&_threadedregion)) {
493+
uvlock = jl_mutex_trylock(&jl_uv_mutex);
522494
}
523-
JL_UV_UNLOCK();
524-
// optimization: check again first if we may have work to do.
525-
// Otherwise we got a spurious wakeup since some other thread
526-
// that just wanted to steal libuv from us. We will just go
527-
// right back to sleep on the individual wake signal to let
528-
// them take it from us without conflict.
529-
if (active || !may_sleep(ptls)) {
530-
if (set_not_sleeping(ptls)) {
531-
JL_PROBE_RT_SLEEP_CHECK_UV_WAKE(ptls);
532-
}
533-
start_cycles = 0;
534-
continue;
495+
else if (ptls->tid == jl_atomic_load_relaxed(&io_loop_tid)) {
496+
uvlock = 1;
497+
JL_UV_LOCK();
535498
}
536-
if (!enter_eventloop && !jl_atomic_load_relaxed(&_threadedregion) && ptls->tid == jl_atomic_load_relaxed(&io_loop_tid)) {
537-
// thread 0 is the only thread permitted to run the event loop
538-
// so it needs to stay alive, just spin-looping if necessary
539-
if (set_not_sleeping(ptls)) {
540-
JL_PROBE_RT_SLEEP_CHECK_UV_WAKE(ptls);
499+
else {
500+
// Since we might have started some IO work, we might need
501+
// to ensure tid = 0 will go watch that new event source.
502+
// If trylock would have succeeded, that may have been our
503+
// responsibility, so need to make sure thread 0 will take care
504+
// of us.
505+
if (jl_atomic_load_relaxed(&jl_uv_mutex.owner) == NULL) // aka trylock
506+
jl_wakeup_thread(0);
507+
}
508+
if (uvlock) {
509+
int enter_eventloop = may_sleep(ptls);
510+
int active = 0;
511+
if (jl_atomic_load_relaxed(&jl_uv_n_waiters) != 0)
512+
// if we won the race against someone who actually needs
513+
// the lock to do real work, we need to let them have it instead
514+
enter_eventloop = 0;
515+
if (enter_eventloop) {
516+
uv_loop_t *loop = jl_global_event_loop();
517+
loop->stop_flag = 0;
518+
JULIA_DEBUG_SLEEPWAKE( ptls->uv_run_enter = cycleclock() );
519+
active = uv_run(loop, UV_RUN_ONCE);
520+
JULIA_DEBUG_SLEEPWAKE( ptls->uv_run_leave = cycleclock() );
521+
jl_gc_safepoint();
522+
}
523+
JL_UV_UNLOCK();
524+
// optimization: check again first if we may have work to do.
525+
// Otherwise we got a spurious wakeup since some other thread
526+
// that just wanted to steal libuv from us. We will just go
527+
// right back to sleep on the individual wake signal to let
528+
// them take it from us without conflict.
529+
if (active || !may_sleep(ptls)) {
530+
if (set_not_sleeping(ptls)) {
531+
JL_PROBE_RT_SLEEP_CHECK_UV_WAKE(ptls);
532+
}
533+
start_cycles = 0;
534+
continue; // jump to JL_CATCH
535+
}
536+
if (!enter_eventloop && !jl_atomic_load_relaxed(&_threadedregion) && ptls->tid == jl_atomic_load_relaxed(&io_loop_tid)) {
537+
// thread 0 is the only thread permitted to run the event loop
538+
// so it needs to stay alive, just spin-looping if necessary
539+
if (set_not_sleeping(ptls)) {
540+
JL_PROBE_RT_SLEEP_CHECK_UV_WAKE(ptls);
541+
}
542+
start_cycles = 0;
543+
continue; // jump to JL_CATCH
541544
}
542-
start_cycles = 0;
543-
continue;
544545
}
545-
}
546546

547-
// any thread which wants us running again will have to observe
548-
// sleep_check_state==sleeping and increment nrunning for us
549-
int wasrunning = jl_atomic_fetch_add_relaxed(&nrunning, -1);
550-
assert(wasrunning);
551-
if (wasrunning == 1) {
552-
// This was the last running thread, and there is no thread with !may_sleep
553-
// so make sure io_loop_tid is notified to check wait_empty
554-
// TODO: this also might be a good time to check again that
555-
// libuv's queue is truly empty, instead of during delete_thread
556-
int16_t tid2 = 0;
557-
if (ptls->tid != tid2) {
558-
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid2];
559-
uv_mutex_lock(&ptls2->sleep_lock);
560-
uv_cond_signal(&ptls2->wake_signal);
561-
uv_mutex_unlock(&ptls2->sleep_lock);
547+
// any thread which wants us running again will have to observe
548+
// sleep_check_state==sleeping and increment nrunning for us
549+
int wasrunning = jl_atomic_fetch_add_relaxed(&nrunning, -1);
550+
assert(wasrunning);
551+
isrunning = 0;
552+
if (wasrunning == 1) {
553+
// This was the last running thread, and there is no thread with !may_sleep
554+
// so make sure io_loop_tid is notified to check wait_empty
555+
// TODO: this also might be a good time to check again that
556+
// libuv's queue is truly empty, instead of during delete_thread
557+
int16_t tid2 = 0;
558+
if (ptls->tid != tid2) {
559+
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid2];
560+
uv_mutex_lock(&ptls2->sleep_lock);
561+
uv_cond_signal(&ptls2->wake_signal);
562+
uv_mutex_unlock(&ptls2->sleep_lock);
563+
}
562564
}
563-
}
564565

565-
// the other threads will just wait for an individual wake signal to resume
566-
JULIA_DEBUG_SLEEPWAKE( ptls->sleep_enter = cycleclock() );
567-
int8_t gc_state = jl_gc_safe_enter(ptls);
568-
uv_mutex_lock(&ptls->sleep_lock);
569-
while (may_sleep(ptls)) {
570-
if (ptls->tid == 0) {
571-
task = wait_empty;
572-
if (task && jl_atomic_load_relaxed(&nrunning) == 0) {
573-
wasrunning = jl_atomic_fetch_add_relaxed(&nrunning, 1);
574-
assert(!wasrunning);
575-
wasrunning = !set_not_sleeping(ptls);
576-
assert(!wasrunning);
577-
JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls);
578-
if (!ptls->finalizers_inhibited)
579-
ptls->finalizers_inhibited++; // this annoyingly is rather sticky (we should like to reset it at the end of jl_task_wait_empty)
580-
break;
566+
// the other threads will just wait for an individual wake signal to resume
567+
JULIA_DEBUG_SLEEPWAKE( ptls->sleep_enter = cycleclock() );
568+
int8_t gc_state = jl_gc_safe_enter(ptls);
569+
uv_mutex_lock(&ptls->sleep_lock);
570+
while (may_sleep(ptls)) {
571+
if (ptls->tid == 0) {
572+
task = wait_empty;
573+
if (task && jl_atomic_load_relaxed(&nrunning) == 0) {
574+
wasrunning = jl_atomic_fetch_add_relaxed(&nrunning, 1);
575+
assert(!wasrunning);
576+
wasrunning = !set_not_sleeping(ptls);
577+
assert(!wasrunning);
578+
JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls);
579+
if (!ptls->finalizers_inhibited)
580+
ptls->finalizers_inhibited++; // this annoyingly is rather sticky (we should like to reset it at the end of jl_task_wait_empty)
581+
break;
582+
}
583+
task = NULL;
581584
}
582-
task = NULL;
585+
// else should we warn the user of certain deadlock here if tid == 0 && nrunning == 0?
586+
uv_cond_wait(&ptls->wake_signal, &ptls->sleep_lock);
587+
}
588+
assert(jl_atomic_load_relaxed(&ptls->sleep_check_state) == not_sleeping);
589+
assert(jl_atomic_load_relaxed(&nrunning));
590+
start_cycles = 0;
591+
uv_mutex_unlock(&ptls->sleep_lock);
592+
JULIA_DEBUG_SLEEPWAKE( ptls->sleep_leave = cycleclock() );
593+
jl_gc_safe_leave(ptls, gc_state); // contains jl_gc_safepoint
594+
if (task) {
595+
assert(task == wait_empty);
596+
wait_empty = NULL;
597+
continue;
583598
}
584-
// else should we warn the user of certain deadlock here if tid == 0 && nrunning == 0?
585-
uv_cond_wait(&ptls->wake_signal, &ptls->sleep_lock);
586599
}
587-
assert(jl_atomic_load_relaxed(&ptls->sleep_check_state) == not_sleeping);
588-
assert(jl_atomic_load_relaxed(&nrunning));
589-
start_cycles = 0;
590-
uv_mutex_unlock(&ptls->sleep_lock);
591-
JULIA_DEBUG_SLEEPWAKE( ptls->sleep_leave = cycleclock() );
592-
jl_gc_safe_leave(ptls, gc_state); // contains jl_gc_safepoint
593-
if (task) {
594-
assert(task == wait_empty);
595-
wait_empty = NULL;
596-
return task;
600+
JL_CATCH {
601+
// probably SIGINT, but possibly a user mistake in trypoptask
602+
if (!isrunning)
603+
jl_atomic_fetch_add_relaxed(&nrunning, 1);
604+
set_not_sleeping(ptls);
605+
jl_rethrow();
597606
}
607+
if (task)
608+
return task;
598609
}
599610
else {
600611
// maybe check the kernel for new messages too

0 commit comments

Comments
 (0)