Skip to content

Commit 5120d05

Browse files
committed
add pending state to jl_thread_suspend_and_get_state-machine
Fixes an issue with #55500, where signals may abruptly abort the process as they observe it is still processing the resume SIGUSR2 message and are not able to wait for that processing to end before setting the new message to exit.
1 parent 378f192 commit 5120d05

File tree

1 file changed

+58
-13
lines changed

1 file changed

+58
-13
lines changed

src/signals-unix.c

Lines changed: 58 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,7 @@ static int signal_caught_cond = -1;
448448

449449
int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
450450
{
451+
int err;
451452
pthread_mutex_lock(&in_signal_lock);
452453
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
453454
jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL;
@@ -456,22 +457,51 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
456457
pthread_mutex_unlock(&in_signal_lock);
457458
return 0;
458459
}
459-
sig_atomic_t request = 0;
460-
if (!jl_atomic_cmpswap(&ptls2->signal_request, &request, 1)) {
460+
if (jl_atomic_load(&ptls2->signal_request) != 0) {
461461
// something is wrong, or there is already a usr2 in flight elsewhere
462+
// try to wait for it to finish or wait for timeout
463+
struct pollfd event = {signal_caught_cond, POLLIN, 0};
464+
do {
465+
err = poll(&event, 1, timeout * 1000);
466+
} while (err == -1 && errno == EINTR);
467+
if (err == -1 || (event.revents & POLLIN) == 0) {
468+
// not ready after timeout: cancel this request
469+
pthread_mutex_unlock(&in_signal_lock);
470+
return 0;
471+
}
472+
}
473+
// check for any stale signal_caught_cond events
474+
struct pollfd event = {signal_caught_cond, POLLIN, 0};
475+
do {
476+
err = poll(&event, 1, 0);
477+
} while (err == -1 && errno == EINTR);
478+
if (err == -1) {
462479
pthread_mutex_unlock(&in_signal_lock);
463480
return 0;
464481
}
482+
if ((event.revents & POLLIN) != 0) {
483+
// consume it before continuing
484+
eventfd_t got;
485+
do {
486+
err = read(signal_caught_cond, &got, sizeof(eventfd_t));
487+
} while (err == -1 && errno == EINTR);
488+
if (err != sizeof(eventfd_t)) abort();
489+
assert(got == 1); (void) got;
490+
}
491+
sig_atomic_t request = jl_atomic_exchange(&ptls2->signal_request, 1);
492+
assert(request == 0 || request == -1);
465493
request = 1;
466-
int err = pthread_kill(ptls2->system_id, SIGUSR2);
467-
// wait for thread to acknowledge or timeout
468-
struct pollfd event = {signal_caught_cond, POLLIN, 0};
494+
err = pthread_kill(ptls2->system_id, SIGUSR2);
469495
if (err == 0) {
496+
// wait for thread to acknowledge or timeout
497+
struct pollfd event = {signal_caught_cond, POLLIN, 0};
470498
do {
471499
err = poll(&event, 1, timeout * 1000);
472500
} while (err == -1 && errno == EINTR);
501+
if (err != 1 || (event.revents & POLLIN) == 0)
502+
err = -1;
473503
}
474-
if ((event.revents & POLLIN) == 0) {
504+
if (err == -1) {
475505
// not ready after timeout: try to cancel this request
476506
if (jl_atomic_cmpswap(&ptls2->signal_request, &request, 0)) {
477507
pthread_mutex_unlock(&in_signal_lock);
@@ -487,7 +517,7 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
487517
// Now the other thread is waiting on exit_signal_cond (verify that here by
488518
// checking it is 0, and add an acquire barrier for good measure)
489519
request = jl_atomic_load_acquire(&ptls2->signal_request);
490-
assert(request == 0); (void) request;
520+
assert(request == 0 || request == -1); (void) request;
491521
jl_atomic_store_release(&ptls2->signal_request, 4); // prepare to resume normally, but later code may change this
492522
*ctx = *signal_context;
493523
return 1;
@@ -546,6 +576,7 @@ static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size)
546576
}
547577

548578
// request:
579+
// -1: processing
549580
// 0: nothing [not from here]
550581
// 1: get state & wait for request
551582
// 2: throw sigint if `!defer_signal && io_wait` or if force throw threshold
@@ -561,21 +592,35 @@ void usr2_handler(int sig, siginfo_t *info, void *ctx)
561592
if (ptls == NULL)
562593
return;
563594
int errno_save = errno;
595+
sig_atomic_t request = jl_atomic_load(&ptls->signal_request);
596+
if (request == 0)
597+
return;
598+
if (!jl_atomic_cmpswap(&ptls->signal_request, &request, -1))
599+
return;
564600
// acknowledge that we saw the signal_request
565-
sig_atomic_t request = jl_atomic_exchange(&ptls->signal_request, 0);
601+
int err;
602+
eventfd_t got = 1;
603+
err = write(signal_caught_cond, &got, sizeof(eventfd_t));
604+
if (err != sizeof(eventfd_t)) abort();
605+
int processing = -1;
606+
jl_atomic_cmpswap(&ptls->signal_request, &processing, 0);
566607
if (request == 1) {
567608
signal_context = jl_to_bt_context(ctx);
568-
int err;
569-
eventfd_t got = 1;
570-
err = write(signal_caught_cond, &got, sizeof(eventfd_t));
571-
if (err != sizeof(eventfd_t)) abort();
572609
do {
573610
err = read(exit_signal_cond, &got, sizeof(eventfd_t));
574611
} while (err == -1 && errno == EINTR);
575612
if (err != sizeof(eventfd_t)) abort();
576613
assert(got == 1);
577-
request = jl_atomic_exchange(&ptls->signal_request, 0);
614+
request = jl_atomic_exchange(&ptls->signal_request, -1);
615+
if (request != 0) {
616+
int err;
617+
eventfd_t got = 1;
618+
err = write(signal_caught_cond, &got, sizeof(eventfd_t));
619+
if (err != sizeof(eventfd_t)) abort();
620+
}
578621
assert(request == 2 || request == 3 || request == 4);
622+
int processing = -1;
623+
jl_atomic_cmpswap(&ptls->signal_request, &processing, 0);
579624
}
580625
if (request == 2) {
581626
int force = jl_check_force_sigint();

0 commit comments

Comments
 (0)