Skip to content

Commit bdf78c9

Browse files
authored
fix jl_mutex_lock deadlock under rr (#56644)
1 parent 522f496 commit bdf78c9

File tree

6 files changed

+34
-33
lines changed

6 files changed

+34
-33
lines changed

src/gc-stock.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3333,10 +3333,10 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
33333333
jl_atomic_store_release(&ptls->gc_state, JL_GC_STATE_WAITING);
33343334
// `jl_safepoint_start_gc()` makes sure only one thread can run the GC.
33353335
uint64_t t0 = jl_hrtime();
3336-
if (!jl_safepoint_start_gc()) {
3336+
if (!jl_safepoint_start_gc(ct)) {
33373337
// either another thread is running GC, or the GC got disabled just now.
33383338
jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING);
3339-
jl_safepoint_wait_thread_resume(); // block in thread-suspend now if requested, after clearing the gc_state
3339+
jl_safepoint_wait_thread_resume(ct); // block in thread-suspend now if requested, after clearing the gc_state
33403340
return;
33413341
}
33423342

@@ -3390,7 +3390,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
33903390
jl_safepoint_end_gc();
33913391
jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING);
33923392
JL_PROBE_GC_END();
3393-
jl_safepoint_wait_thread_resume(); // block in thread-suspend now if requested, after clearing the gc_state
3393+
jl_safepoint_wait_thread_resume(ct); // block in thread-suspend now if requested, after clearing the gc_state
33943394

33953395
// Only disable finalizers on current thread
33963396
// Doing this on all threads is racy (it's impossible to check

src/julia_internal.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1133,7 +1133,7 @@ void jl_safepoint_init(void);
11331133
// before calling this function. If the calling thread is to run the GC,
11341134
// it should also wait for the mutator threads to hit a safepoint **AFTER**
11351135
// this function returns
1136-
int jl_safepoint_start_gc(void);
1136+
int jl_safepoint_start_gc(jl_task_t *ct);
11371137
// Can only be called by the thread that have got a `1` return value from
11381138
// `jl_safepoint_start_gc()`. This disables the safepoint (for GC,
11391139
// the `mprotect` may not be removed if there's pending SIGINT) and wake
@@ -1143,8 +1143,8 @@ void jl_safepoint_end_gc(void);
11431143
// Wait for the GC to finish
11441144
// This function does **NOT** modify the `gc_state` to inform the GC thread
11451145
// The caller should set it **BEFORE** calling this function.
1146-
void jl_safepoint_wait_gc(void) JL_NOTSAFEPOINT;
1147-
void jl_safepoint_wait_thread_resume(void) JL_NOTSAFEPOINT;
1146+
void jl_safepoint_wait_gc(jl_task_t *ct) JL_NOTSAFEPOINT;
1147+
void jl_safepoint_wait_thread_resume(jl_task_t *ct) JL_NOTSAFEPOINT;
11481148
int8_t jl_safepoint_take_sleep_lock(jl_ptls_t ptls) JL_NOTSAFEPOINT_ENTER;
11491149
// Set pending sigint and enable the mechanisms to deliver the sigint.
11501150
void jl_safepoint_enable_sigint(void);
@@ -1170,7 +1170,7 @@ JL_DLLEXPORT void jl_pgcstack_getkey(jl_get_pgcstack_func **f, jl_pgcstack_key_t
11701170
extern pthread_mutex_t in_signal_lock;
11711171
#endif
11721172

1173-
void jl_set_gc_and_wait(void); // n.b. not used on _OS_DARWIN_
1173+
void jl_set_gc_and_wait(jl_task_t *ct); // n.b. not used on _OS_DARWIN_
11741174

11751175
// Query if a Julia object is if a permalloc region (due to part of a sys- pkg-image)
11761176
STATIC_INLINE size_t n_linkage_blobs(void) JL_NOTSAFEPOINT

src/safepoint.c

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -158,17 +158,16 @@ void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads)
158158
}
159159
}
160160

161-
int jl_safepoint_start_gc(void)
161+
int jl_safepoint_start_gc(jl_task_t *ct)
162162
{
163163
// The thread should have just set this before entry
164-
assert(jl_atomic_load_relaxed(&jl_current_task->ptls->gc_state) == JL_GC_STATE_WAITING);
164+
assert(jl_atomic_load_relaxed(&ct->ptls->gc_state) == JL_GC_STATE_WAITING);
165165
uv_mutex_lock(&safepoint_lock);
166166
uv_cond_broadcast(&safepoint_cond_begin);
167167
// make sure we are permitted to run GC now (we might be required to stop instead)
168-
jl_task_t *ct = jl_current_task;
169168
while (jl_atomic_load_relaxed(&ct->ptls->suspend_count)) {
170169
uv_mutex_unlock(&safepoint_lock);
171-
jl_safepoint_wait_thread_resume();
170+
jl_safepoint_wait_thread_resume(ct);
172171
uv_mutex_lock(&safepoint_lock);
173172
}
174173
// In case multiple threads enter the GC at the same time, only allow
@@ -178,7 +177,7 @@ int jl_safepoint_start_gc(void)
178177
uint32_t running = 0;
179178
if (!jl_atomic_cmpswap(&jl_gc_running, &running, 1)) {
180179
uv_mutex_unlock(&safepoint_lock);
181-
jl_safepoint_wait_gc();
180+
jl_safepoint_wait_gc(ct);
182181
return 0;
183182
}
184183
// Foreign thread adoption disables the GC and waits for it to finish, however, that may
@@ -213,28 +212,28 @@ void jl_safepoint_end_gc(void)
213212
uv_cond_broadcast(&safepoint_cond_end);
214213
}
215214

216-
void jl_set_gc_and_wait(void) // n.b. not used on _OS_DARWIN_
215+
void jl_set_gc_and_wait(jl_task_t *ct) // n.b. not used on _OS_DARWIN_
217216
{
218-
jl_task_t *ct = jl_current_task;
219217
// reading own gc state doesn't need atomic ops since no one else
220218
// should store to it.
221219
int8_t state = jl_atomic_load_relaxed(&ct->ptls->gc_state);
222220
jl_atomic_store_release(&ct->ptls->gc_state, JL_GC_STATE_WAITING);
223221
uv_mutex_lock(&safepoint_lock);
224222
uv_cond_broadcast(&safepoint_cond_begin);
225223
uv_mutex_unlock(&safepoint_lock);
226-
jl_safepoint_wait_gc();
224+
jl_safepoint_wait_gc(ct);
227225
jl_atomic_store_release(&ct->ptls->gc_state, state);
228-
jl_safepoint_wait_thread_resume(); // block in thread-suspend now if requested, after clearing the gc_state
226+
jl_safepoint_wait_thread_resume(ct); // block in thread-suspend now if requested, after clearing the gc_state
229227
}
230228

231229
// this is the core of jl_set_gc_and_wait
232-
void jl_safepoint_wait_gc(void) JL_NOTSAFEPOINT
230+
void jl_safepoint_wait_gc(jl_task_t *ct) JL_NOTSAFEPOINT
233231
{
234-
jl_task_t *ct = jl_current_task; (void)ct;
235-
JL_TIMING_SUSPEND_TASK(GC_SAFEPOINT, ct);
236-
// The thread should have set this is already
237-
assert(jl_atomic_load_relaxed(&ct->ptls->gc_state) != JL_GC_STATE_UNSAFE);
232+
if (ct) {
233+
JL_TIMING_SUSPEND_TASK(GC_SAFEPOINT, ct);
234+
// The thread should have set this is already
235+
assert(jl_atomic_load_relaxed(&ct->ptls->gc_state) != JL_GC_STATE_UNSAFE);
236+
}
238237
// Use normal volatile load in the loop for speed until GC finishes.
239238
// Then use an acquire load to make sure the GC result is visible on this thread.
240239
while (jl_atomic_load_relaxed(&jl_gc_running) || jl_atomic_load_acquire(&jl_gc_running)) {
@@ -249,9 +248,8 @@ void jl_safepoint_wait_gc(void) JL_NOTSAFEPOINT
249248
}
250249

251250
// equivalent to jl_set_gc_and_wait, but waiting on resume-thread lock instead
252-
void jl_safepoint_wait_thread_resume(void)
251+
void jl_safepoint_wait_thread_resume(jl_task_t *ct)
253252
{
254-
jl_task_t *ct = jl_current_task;
255253
// n.b. we do not permit a fast-path here that skips the lock acquire since
256254
// we otherwise have no synchronization point to ensure that this thread
257255
// will observe the change to the safepoint, even though the other thread
@@ -333,7 +331,7 @@ int jl_safepoint_suspend_thread(int tid, int waitstate)
333331
// It will be unable to reenter helping with GC because we have
334332
// changed its safepoint page.
335333
uv_mutex_unlock(&safepoint_lock);
336-
jl_set_gc_and_wait();
334+
jl_set_gc_and_wait(jl_current_task);
337335
uv_mutex_lock(&safepoint_lock);
338336
}
339337
while (jl_atomic_load_acquire(&ptls2->suspend_count) != 0) {

src/signals-unix.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -392,7 +392,7 @@ JL_NO_ASAN static void segv_handler(int sig, siginfo_t *info, void *context)
392392
return;
393393
}
394394
if (sig == SIGSEGV && info->si_code == SEGV_ACCERR && jl_addr_is_safepoint((uintptr_t)info->si_addr) && !is_write_fault(context)) {
395-
jl_set_gc_and_wait();
395+
jl_set_gc_and_wait(ct);
396396
// Do not raise sigint on worker thread
397397
if (jl_atomic_load_relaxed(&ct->tid) != 0)
398398
return;

src/signals-win.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ LONG WINAPI jl_exception_handler(struct _EXCEPTION_POINTERS *ExceptionInfo)
256256
break;
257257
case EXCEPTION_ACCESS_VIOLATION:
258258
if (jl_addr_is_safepoint(ExceptionInfo->ExceptionRecord->ExceptionInformation[1])) {
259-
jl_set_gc_and_wait();
259+
jl_set_gc_and_wait(ct);
260260
// Do not raise sigint on worker thread
261261
if (ptls->tid != 0)
262262
return EXCEPTION_CONTINUE_EXECUTION;

src/threading.c

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -427,11 +427,9 @@ JL_DLLEXPORT jl_gcframe_t **jl_adopt_thread(void)
427427
{
428428
// `jl_init_threadtls` puts us in a GC unsafe region, so ensure GC isn't running.
429429
// we can't use a normal safepoint because we don't have signal handlers yet.
430-
// we also can't use jl_safepoint_wait_gc because that assumes we're in a task.
431430
jl_atomic_fetch_add(&jl_gc_disable_counter, 1);
432-
while (jl_atomic_load_acquire(&jl_gc_running)) {
433-
jl_cpu_pause();
434-
}
431+
// pass NULL as a special token to indicate we are running on an unmanaged task
432+
jl_safepoint_wait_gc(NULL);
435433
// this check is coupled with the one in `jl_safepoint_wait_gc`, where we observe if a
436434
// foreign thread has asked to disable the GC, guaranteeing the order of events.
437435

@@ -915,15 +913,20 @@ void _jl_mutex_wait(jl_task_t *self, jl_mutex_t *lock, int safepoint)
915913
jl_profile_lock_acquired(lock);
916914
return;
917915
}
918-
if (safepoint) {
919-
jl_gc_safepoint_(self->ptls);
920-
}
921916
if (jl_running_under_rr(0)) {
922917
// when running under `rr`, use system mutexes rather than spin locking
918+
int8_t gc_state;
919+
if (safepoint)
920+
gc_state = jl_gc_safe_enter(self->ptls);
923921
uv_mutex_lock(&tls_lock);
924922
if (jl_atomic_load_relaxed(&lock->owner))
925923
uv_cond_wait(&cond, &tls_lock);
926924
uv_mutex_unlock(&tls_lock);
925+
if (safepoint)
926+
jl_gc_safe_leave(self->ptls, gc_state);
927+
}
928+
else if (safepoint) {
929+
jl_gc_safepoint_(self->ptls);
927930
}
928931
jl_cpu_suspend();
929932
owner = jl_atomic_load_relaxed(&lock->owner);

0 commit comments

Comments
 (0)