Skip to content

Commit

Permalink
fix jl_mutex_lock deadlock under rr (#56644)
Browse files Browse the repository at this point in the history
(cherry picked from commit bdf78c9)
  • Loading branch information
vtjnash authored and KristofferC committed Jan 2, 2025
1 parent 905e39a commit 1b75f96
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 34 deletions.
6 changes: 3 additions & 3 deletions src/gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -3853,10 +3853,10 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
jl_atomic_store_release(&ptls->gc_state, JL_GC_STATE_WAITING);
// `jl_safepoint_start_gc()` makes sure only one thread can run the GC.
uint64_t t0 = jl_hrtime();
if (!jl_safepoint_start_gc()) {
if (!jl_safepoint_start_gc(ct)) {
// either another thread is running GC, or the GC got disabled just now.
jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING);
jl_safepoint_wait_thread_resume(); // block in thread-suspend now if requested, after clearing the gc_state
jl_safepoint_wait_thread_resume(ct); // block in thread-suspend now if requested, after clearing the gc_state
return;
}

Expand Down Expand Up @@ -3910,7 +3910,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
jl_safepoint_end_gc();
jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING);
JL_PROBE_GC_END();
jl_safepoint_wait_thread_resume(); // block in thread-suspend now if requested, after clearing the gc_state
jl_safepoint_wait_thread_resume(ct); // block in thread-suspend now if requested, after clearing the gc_state

// Only disable finalizers on current thread
// Doing this on all threads is racy (it's impossible to check
Expand Down
9 changes: 4 additions & 5 deletions src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -967,7 +967,7 @@ void jl_safepoint_init(void);
// before calling this function. If the calling thread is to run the GC,
// it should also wait for the mutator threads to hit a safepoint **AFTER**
// this function returns
int jl_safepoint_start_gc(void);
int jl_safepoint_start_gc(jl_task_t *ct);
// Can only be called by the thread that have got a `1` return value from
// `jl_safepoint_start_gc()`. This disables the safepoint (for GC,
// the `mprotect` may not be removed if there's pending SIGINT) and wake
Expand All @@ -977,9 +977,8 @@ void jl_safepoint_end_gc(void);
// Wait for the GC to finish
// This function does **NOT** modify the `gc_state` to inform the GC thread
// The caller should set it **BEFORE** calling this function.
void jl_safepoint_wait_gc(void) JL_NOTSAFEPOINT;
void jl_safepoint_wait_thread_resume(void) JL_NOTSAFEPOINT;

void jl_safepoint_wait_gc(jl_task_t *ct) JL_NOTSAFEPOINT;
void jl_safepoint_wait_thread_resume(jl_task_t *ct) JL_NOTSAFEPOINT;
// Set pending sigint and enable the mechanisms to deliver the sigint.
void jl_safepoint_enable_sigint(void);
// If the safepoint is enabled to deliver sigint, disable it
Expand All @@ -1006,7 +1005,7 @@ JL_DLLEXPORT void jl_pgcstack_getkey(jl_get_pgcstack_func **f, jl_pgcstack_key_t
extern pthread_mutex_t in_signal_lock;
#endif

void jl_set_gc_and_wait(void); // n.b. not used on _OS_DARWIN_
void jl_set_gc_and_wait(jl_task_t *ct); // n.b. not used on _OS_DARWIN_

// Query if a Julia object is if a permalloc region (due to part of a sys- pkg-image)
STATIC_INLINE size_t n_linkage_blobs(void) JL_NOTSAFEPOINT
Expand Down
34 changes: 17 additions & 17 deletions src/safepoint.c
Original file line number Diff line number Diff line change
Expand Up @@ -158,17 +158,16 @@ void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads)
}
}

int jl_safepoint_start_gc(void)
int jl_safepoint_start_gc(jl_task_t *ct)
{
// The thread should have just set this before entry
assert(jl_atomic_load_relaxed(&jl_current_task->ptls->gc_state) == JL_GC_STATE_WAITING);
assert(jl_atomic_load_relaxed(&ct->ptls->gc_state) == JL_GC_STATE_WAITING);
uv_mutex_lock(&safepoint_lock);
uv_cond_broadcast(&safepoint_cond_begin);
// make sure we are permitted to run GC now (we might be required to stop instead)
jl_task_t *ct = jl_current_task;
while (jl_atomic_load_relaxed(&ct->ptls->suspend_count)) {
uv_mutex_unlock(&safepoint_lock);
jl_safepoint_wait_thread_resume();
jl_safepoint_wait_thread_resume(ct);
uv_mutex_lock(&safepoint_lock);
}
// In case multiple threads enter the GC at the same time, only allow
Expand All @@ -178,7 +177,7 @@ int jl_safepoint_start_gc(void)
uint32_t running = 0;
if (!jl_atomic_cmpswap(&jl_gc_running, &running, 1)) {
uv_mutex_unlock(&safepoint_lock);
jl_safepoint_wait_gc();
jl_safepoint_wait_gc(ct);
return 0;
}
// Foreign thread adoption disables the GC and waits for it to finish, however, that may
Expand Down Expand Up @@ -213,28 +212,28 @@ void jl_safepoint_end_gc(void)
uv_cond_broadcast(&safepoint_cond_end);
}

void jl_set_gc_and_wait(void) // n.b. not used on _OS_DARWIN_
void jl_set_gc_and_wait(jl_task_t *ct) // n.b. not used on _OS_DARWIN_
{
jl_task_t *ct = jl_current_task;
// reading own gc state doesn't need atomic ops since no one else
// should store to it.
int8_t state = jl_atomic_load_relaxed(&ct->ptls->gc_state);
jl_atomic_store_release(&ct->ptls->gc_state, JL_GC_STATE_WAITING);
uv_mutex_lock(&safepoint_lock);
uv_cond_broadcast(&safepoint_cond_begin);
uv_mutex_unlock(&safepoint_lock);
jl_safepoint_wait_gc();
jl_safepoint_wait_gc(ct);
jl_atomic_store_release(&ct->ptls->gc_state, state);
jl_safepoint_wait_thread_resume(); // block in thread-suspend now if requested, after clearing the gc_state
jl_safepoint_wait_thread_resume(ct); // block in thread-suspend now if requested, after clearing the gc_state
}

// this is the core of jl_set_gc_and_wait
void jl_safepoint_wait_gc(void) JL_NOTSAFEPOINT
void jl_safepoint_wait_gc(jl_task_t *ct) JL_NOTSAFEPOINT
{
jl_task_t *ct = jl_current_task; (void)ct;
JL_TIMING_SUSPEND_TASK(GC_SAFEPOINT, ct);
// The thread should have set this is already
assert(jl_atomic_load_relaxed(&ct->ptls->gc_state) != JL_GC_STATE_UNSAFE);
if (ct) {
JL_TIMING_SUSPEND_TASK(GC_SAFEPOINT, ct);
// The thread should have set this is already
assert(jl_atomic_load_relaxed(&ct->ptls->gc_state) != JL_GC_STATE_UNSAFE);
}
// Use normal volatile load in the loop for speed until GC finishes.
// Then use an acquire load to make sure the GC result is visible on this thread.
while (jl_atomic_load_relaxed(&jl_gc_running) || jl_atomic_load_acquire(&jl_gc_running)) {
Expand All @@ -249,9 +248,8 @@ void jl_safepoint_wait_gc(void) JL_NOTSAFEPOINT
}

// equivalent to jl_set_gc_and_wait, but waiting on resume-thread lock instead
void jl_safepoint_wait_thread_resume(void)
void jl_safepoint_wait_thread_resume(jl_task_t *ct)
{
jl_task_t *ct = jl_current_task;
// n.b. we do not permit a fast-path here that skips the lock acquire since
// we otherwise have no synchronization point to ensure that this thread
// will observe the change to the safepoint, even though the other thread
Expand Down Expand Up @@ -305,7 +303,9 @@ int jl_safepoint_suspend_thread(int tid, int waitstate)
// not, so assume it is running GC and wait for GC to finish first.
// It will be unable to reenter helping with GC because we have
// changed its safepoint page.
jl_set_gc_and_wait();
uv_mutex_unlock(&safepoint_lock);
jl_set_gc_and_wait(jl_current_task);
uv_mutex_lock(&safepoint_lock);
}
while (jl_atomic_load_acquire(&ptls2->suspend_count) != 0) {
int8_t state2 = jl_atomic_load_acquire(&ptls2->gc_state);
Expand Down
2 changes: 1 addition & 1 deletion src/signals-unix.c
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ JL_NO_ASAN static void segv_handler(int sig, siginfo_t *info, void *context)
return;
}
if (sig == SIGSEGV && info->si_code == SEGV_ACCERR && jl_addr_is_safepoint((uintptr_t)info->si_addr) && !is_write_fault(context)) {
jl_set_gc_and_wait();
jl_set_gc_and_wait(ct);
// Do not raise sigint on worker thread
if (jl_atomic_load_relaxed(&ct->tid) != 0)
return;
Expand Down
2 changes: 1 addition & 1 deletion src/signals-win.c
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ LONG WINAPI jl_exception_handler(struct _EXCEPTION_POINTERS *ExceptionInfo)
break;
case EXCEPTION_ACCESS_VIOLATION:
if (jl_addr_is_safepoint(ExceptionInfo->ExceptionRecord->ExceptionInformation[1])) {
jl_set_gc_and_wait();
jl_set_gc_and_wait(ct);
// Do not raise sigint on worker thread
if (ptls->tid != 0)
return EXCEPTION_CONTINUE_EXECUTION;
Expand Down
17 changes: 10 additions & 7 deletions src/threading.c
Original file line number Diff line number Diff line change
Expand Up @@ -411,11 +411,9 @@ JL_DLLEXPORT jl_gcframe_t **jl_adopt_thread(void)
{
// `jl_init_threadtls` puts us in a GC unsafe region, so ensure GC isn't running.
// we can't use a normal safepoint because we don't have signal handlers yet.
// we also can't use jl_safepoint_wait_gc because that assumes we're in a task.
jl_atomic_fetch_add(&jl_gc_disable_counter, 1);
while (jl_atomic_load_acquire(&jl_gc_running)) {
jl_cpu_pause();
}
// pass NULL as a special token to indicate we are running on an unmanaged task
jl_safepoint_wait_gc(NULL);
// this check is coupled with the one in `jl_safepoint_wait_gc`, where we observe if a
// foreign thread has asked to disable the GC, guaranteeing the order of events.

Expand Down Expand Up @@ -855,15 +853,20 @@ void _jl_mutex_wait(jl_task_t *self, jl_mutex_t *lock, int safepoint)
jl_profile_lock_acquired(lock);
return;
}
if (safepoint) {
jl_gc_safepoint_(self->ptls);
}
if (jl_running_under_rr(0)) {
// when running under `rr`, use system mutexes rather than spin locking
int8_t gc_state;
if (safepoint)
gc_state = jl_gc_safe_enter(self->ptls);
uv_mutex_lock(&tls_lock);
if (jl_atomic_load_relaxed(&lock->owner))
uv_cond_wait(&cond, &tls_lock);
uv_mutex_unlock(&tls_lock);
if (safepoint)
jl_gc_safe_leave(self->ptls, gc_state);
}
else if (safepoint) {
jl_gc_safepoint_(self->ptls);
}
jl_cpu_suspend();
owner = jl_atomic_load_relaxed(&lock->owner);
Expand Down

0 comments on commit 1b75f96

Please sign in to comment.