Skip to content

Commit

Permalink
all tasks profiler
Browse files Browse the repository at this point in the history
  • Loading branch information
d-netto committed Oct 25, 2024
1 parent b81e33f commit 6c033d6
Show file tree
Hide file tree
Showing 16 changed files with 588 additions and 236 deletions.
2 changes: 1 addition & 1 deletion src/gc-alloc-profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ jl_raw_backtrace_t get_raw_backtrace() JL_NOTSAFEPOINT {
ptls->profiling_bt_buffer = shared_bt_data_buffer;
}

size_t bt_size = rec_backtrace(shared_bt_data_buffer, JL_MAX_BT_SIZE, 2);
size_t bt_size = rec_backtrace(shared_bt_data_buffer, JL_MAX_BT_SIZE, 2, 0);

// Then we copy only the needed bytes out of the buffer into our profile.
size_t bt_bytes = bt_size * sizeof(jl_bt_element_t);
Expand Down
29 changes: 29 additions & 0 deletions src/gc-stacks.c
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,35 @@ void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT
jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, -1);
}

// Builds a freshly allocated list of the live tasks: for every registered thread,
// its root task followed by the entries of that thread's `live_tasks` list. Only
// tasks whose `ctx.stkbuf` is non-NULL are included.
// The returned list is heap-allocated; the caller releases it with
// `arraylist_free(tasks)` followed by `free(tasks)`.
// Racy: `live_tasks` can expand at any time, so tasks registered while the list is
// being built may be missing from it.
arraylist_t *jl_get_all_tasks_arraylist(void) JL_NOTSAFEPOINT
{
    arraylist_t *tasks = (arraylist_t*)malloc_s(sizeof(arraylist_t));
    arraylist_new(tasks, 0);
    size_t nthreads = jl_atomic_load_acquire(&jl_n_threads);
    jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states);
    for (size_t tid = 0; tid < nthreads; tid++) {
        jl_ptls_t ptls2 = allstates[tid];
        if (ptls2 == NULL) {
            continue;
        }
        // NOTE(review): a thread state is presumably visible in `jl_all_tls_states`
        // before its root task has been installed, so do not dereference blindly.
        jl_task_t *root = ptls2->root_task;
        if (root != NULL && root->ctx.stkbuf != NULL) {
            arraylist_push(tasks, root);
        }
        small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks;
        // Length is sampled once; entries appended afterwards are not visited.
        size_t n = mtarraylist_length(live_tasks);
        for (size_t j = 0; j < n; j++) {
            jl_task_t *live = (jl_task_t*)mtarraylist_get(live_tasks, j);
            assert(live != NULL);
            if (live->ctx.stkbuf != NULL) {
                arraylist_push(tasks, live);
            }
        }
    }
    return tasks;
}

JL_DLLEXPORT jl_array_t *jl_live_tasks(void)
{
size_t nthreads = jl_atomic_load_acquire(&jl_n_threads);
Expand Down
24 changes: 22 additions & 2 deletions src/gc-stock.c
Original file line number Diff line number Diff line change
Expand Up @@ -1025,7 +1025,22 @@ void gc_sweep_wait_for_all_stacks(void) JL_NOTSAFEPOINT
}
}

void sweep_stack_pools(jl_ptls_t ptls) JL_NOTSAFEPOINT
// Releases every buffer queued on each thread's `lazily_freed_mtarraylist_buffers`
// list. Walks all `gc_n_threads` entries of `gc_all_tls_states`, skipping empty slots,
// and pops buffers until `small_arraylist_pop` yields NULL (presumably: list drained).
void sweep_mtarraylist_buffers(void) JL_NOTSAFEPOINT
{
    for (int tid = 0; tid < gc_n_threads; tid++) {
        jl_ptls_t ptls2 = gc_all_tls_states[tid];
        if (ptls2 != NULL) {
            small_arraylist_t *pending = &ptls2->lazily_freed_mtarraylist_buffers;
            for (void *old = small_arraylist_pop(pending); old != NULL;
                 old = small_arraylist_pop(pending)) {
                free(old);
            }
        }
    }
}

void sweep_stack_pools_and_mtarraylist_buffers(jl_ptls_t ptls) JL_NOTSAFEPOINT
{
// initialize ptls index for parallel sweeping of stack pools
assert(gc_n_threads);
Expand All @@ -1035,9 +1050,12 @@ void sweep_stack_pools(jl_ptls_t ptls) JL_NOTSAFEPOINT
else
jl_atomic_store_relaxed(&gc_stack_free_idx, stack_free_idx + 1);
jl_atomic_store_release(&gc_ptls_sweep_idx, gc_n_threads - 1); // idx == gc_n_threads = release stacks to the OS so it's serial
uv_mutex_lock(&live_tasks_lock);
gc_sweep_wake_all_stacks(ptls);
sweep_stack_pool_loop();
gc_sweep_wait_for_all_stacks();
sweep_mtarraylist_buffers();
uv_mutex_unlock(&live_tasks_lock);
}

static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_NOTSAFEPOINT
Expand Down Expand Up @@ -3084,7 +3102,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
current_sweep_full = sweep_full;
sweep_weak_refs();
uint64_t stack_pool_time = jl_hrtime();
sweep_stack_pools(ptls);
sweep_stack_pools_and_mtarraylist_buffers(ptls);
stack_pool_time = jl_hrtime() - stack_pool_time;
gc_num.total_stack_pool_sweep_time += stack_pool_time;
gc_num.stack_pool_sweep_time = stack_pool_time;
Expand Down Expand Up @@ -3453,6 +3471,8 @@ void jl_init_thread_heap(jl_ptls_t ptls)
jl_atomic_store_relaxed(&q->bottom, 0);
jl_atomic_store_relaxed(&q->array, wsa2);
arraylist_new(&mq->reclaim_set, 32);
// Initialize `lazily_freed_mtarraylist_buffers`
small_arraylist_new(&ptls->lazily_freed_mtarraylist_buffers, 0);

memset(&ptls->gc_tls_common.gc_num, 0, sizeof(ptls->gc_tls_common.gc_num));
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval);
Expand Down
2 changes: 1 addition & 1 deletion src/gf.c
Original file line number Diff line number Diff line change
Expand Up @@ -2352,7 +2352,7 @@ static void JL_NORETURN jl_method_error_bare(jl_value_t *f, jl_value_t *args, si
jl_static_show((JL_STREAM*)STDERR_FILENO,(jl_value_t*)f); jl_printf((JL_STREAM*)STDERR_FILENO," world %u\n", (unsigned)world);
jl_static_show((JL_STREAM*)STDERR_FILENO,args); jl_printf((JL_STREAM*)STDERR_FILENO,"\n");
jl_ptls_t ptls = jl_current_task->ptls;
ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0);
ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0, 0);
jl_critical_error(0, 0, NULL, jl_current_task);
abort();
}
Expand Down
6 changes: 5 additions & 1 deletion src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -744,6 +744,10 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel)

// initialize symbol-table lock
uv_mutex_init(&symtab_lock);
// initialize the live tasks lock
uv_mutex_init(&live_tasks_lock);
// initialize the profiler buffer lock
uv_mutex_init(&bt_data_prof_lock);

// initialize backtraces
jl_init_profile_lock();
Expand All @@ -758,7 +762,7 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel)
// nongnu libunwind initialization is only threadsafe on architecture where the
// author could access TSAN, per https://github.com/libunwind/libunwind/pull/109
// so we need to do this once early (before threads)
rec_backtrace(NULL, 0, 0);
rec_backtrace(NULL, 0, 0, 0);
#endif

libsupport_init();
Expand Down
31 changes: 30 additions & 1 deletion src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,35 @@ JL_DLLEXPORT void jl_unlock_profile_wr(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEA
int jl_lock_stackwalk(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
void jl_unlock_stackwalk(int lockret) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;

arraylist_t *jl_get_all_tasks_arraylist(void) JL_NOTSAFEPOINT;
// Return value of `jl_record_backtrace`.
typedef struct {
    // Number of buffer elements written for the backtrace; the task profiler
    // advances its write index into the profile buffer by this amount and treats
    // 0 as "no backtrace could be recorded".
    size_t bt_size;
    // Thread id associated with the sample; the task profiler stores `tid + 1`
    // in the sample's metadata (0 is reserved as the end-of-block marker).
    int tid;
} jl_record_backtrace_result_t;
JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, struct _jl_bt_element_t *bt_data,
size_t max_bt_size, int all_tasks_profiler) JL_NOTSAFEPOINT;
extern volatile struct _jl_bt_element_t *profile_bt_data_prof;
extern volatile size_t profile_bt_size_max;
extern volatile size_t profile_bt_size_cur;
extern volatile int profile_running;
extern volatile int profile_all_tasks;
// Ensures that we can safely read the `live_tasks` field of every TLS when profiling.
// We want to avoid the case that a GC gets interleaved with `jl_profile_task` and shrinks
// the `live_tasks` array while we are reading it or frees tasks that are being profiled.
// Because of that, this lock must be held in `jl_profile_task` and `sweep_stack_pools_and_mtarraylist_buffers`.
extern uv_mutex_t live_tasks_lock;
// Ensures that we can safely write to `profile_bt_data_prof` and `profile_bt_size_cur`.
// We want to avoid the case that:
// - We start to profile a task very close to the profiling time window end.
// - The profiling time window ends and we start to read the profile data in a compute thread.
// - We write to the profile in a profiler thread while the compute thread is reading it.
// Locking discipline: `bt_data_prof_lock` must be held inside the scope of `live_tasks_lock`.
extern uv_mutex_t bt_data_prof_lock;
#define PROFILE_STATE_THREAD_NOT_SLEEPING (1)
#define PROFILE_STATE_THREAD_SLEEPING (2)
#define PROFILE_STATE_WALL_TIME_PROFILING (3)
void jl_profile_task(void);

// number of cycles since power-on
static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT
{
Expand Down Expand Up @@ -1332,7 +1361,7 @@ typedef unw_cursor_t bt_cursor_t;
typedef int bt_context_t;
typedef int bt_cursor_t;
#endif
size_t rec_backtrace(jl_bt_element_t *bt_data, size_t maxsize, int skip) JL_NOTSAFEPOINT;
size_t rec_backtrace(jl_bt_element_t *bt_data, size_t maxsize, int skip, int no_gcstack) JL_NOTSAFEPOINT;
// Record backtrace from a signal handler. `ctx` is the context of the code
// which was asynchronously interrupted.
size_t rec_backtrace_ctx(jl_bt_element_t *bt_data, size_t maxsize, bt_context_t *ctx,
Expand Down
1 change: 1 addition & 0 deletions src/julia_threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ typedef struct _jl_tls_states_t {
int finalizers_inhibited;
jl_gc_tls_states_t gc_tls; // this is very large, and the offset of the first member is baked into codegen
jl_gc_tls_states_common_t gc_tls_common; // common tls for both GCs
small_arraylist_t lazily_freed_mtarraylist_buffers;
volatile sig_atomic_t defer_signal;
_Atomic(struct _jl_task_t*) current_task;
struct _jl_task_t *next_task;
Expand Down
2 changes: 1 addition & 1 deletion src/mtarraylist.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ static void mtarraylist_resizeto(small_mtarraylist_t *a, size_t len, size_t newl
a->max = nm;
if (olditems != (void*)&a->_space[0]) {
jl_task_t *ct = jl_current_task;
jl_gc_add_quiescent(ct->ptls, (void**)olditems, free);
small_arraylist_push(&ct->ptls->lazily_freed_mtarraylist_buffers, olditems);
}
}
}
Expand Down
135 changes: 116 additions & 19 deletions src/signal-handling.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,46 +18,48 @@ extern "C" {
#include <threading.h>

// Profiler control variables
// Note: these "static" variables are also used in "signals-*.c"
static volatile jl_bt_element_t *bt_data_prof = NULL;
static volatile size_t bt_size_max = 0;
static volatile size_t bt_size_cur = 0;
uv_mutex_t live_tasks_lock;
uv_mutex_t bt_data_prof_lock;
volatile jl_bt_element_t *profile_bt_data_prof = NULL;
volatile size_t profile_bt_size_max = 0;
volatile size_t profile_bt_size_cur = 0;
static volatile uint64_t nsecprof = 0;
static volatile int running = 0;
static const uint64_t GIGA = 1000000000ULL;
volatile int profile_running = 0;
volatile int profile_all_tasks = 0;
static const uint64_t GIGA = 1000000000ULL;
// Timers to take samples at intervals
JL_DLLEXPORT void jl_profile_stop_timer(void);
JL_DLLEXPORT int jl_profile_start_timer(void);
JL_DLLEXPORT int jl_profile_start_timer(uint8_t);

///////////////////////
// Utility functions //
///////////////////////
JL_DLLEXPORT int jl_profile_init(size_t maxsize, uint64_t delay_nsec)
{
bt_size_max = maxsize;
profile_bt_size_max = maxsize;
nsecprof = delay_nsec;
if (bt_data_prof != NULL)
free((void*)bt_data_prof);
bt_data_prof = (jl_bt_element_t*) calloc(maxsize, sizeof(jl_bt_element_t));
if (bt_data_prof == NULL && maxsize > 0)
if (profile_bt_data_prof != NULL)
free((void*)profile_bt_data_prof);
profile_bt_data_prof = (jl_bt_element_t*) calloc(maxsize, sizeof(jl_bt_element_t));
if (profile_bt_data_prof == NULL && maxsize > 0)
return -1;
bt_size_cur = 0;
profile_bt_size_cur = 0;
return 0;
}

JL_DLLEXPORT uint8_t *jl_profile_get_data(void)
{
return (uint8_t*) bt_data_prof;
return (uint8_t*) profile_bt_data_prof;
}

JL_DLLEXPORT size_t jl_profile_len_data(void)
{
return bt_size_cur;
return profile_bt_size_cur;
}

JL_DLLEXPORT size_t jl_profile_maxlen_data(void)
{
return bt_size_max;
return profile_bt_size_max;
}

JL_DLLEXPORT uint64_t jl_profile_delay_nsec(void)
Expand All @@ -67,12 +69,12 @@ JL_DLLEXPORT uint64_t jl_profile_delay_nsec(void)

JL_DLLEXPORT void jl_profile_clear_data(void)
{
bt_size_cur = 0;
profile_bt_size_cur = 0;
}

JL_DLLEXPORT int jl_profile_is_running(void)
{
return running;
return profile_running;
}

// Any function that acquires this lock must be either an unmanaged thread
Expand Down Expand Up @@ -184,7 +186,102 @@ JL_DLLEXPORT int jl_profile_is_buffer_full(void)
// Declare buffer full if there isn't enough room to sample even just the
// thread metadata and one max-sized frame. The `+ 6` is for the two block
// terminator `0`'s plus the 4 metadata entries.
return bt_size_cur + ((JL_BT_MAX_ENTRY_SIZE + 1) + 6) > bt_size_max;
return profile_bt_size_cur + ((JL_BT_MAX_ENTRY_SIZE + 1) + 6) > profile_bt_size_max;
}

NOINLINE int failed_to_sample_task_fun(jl_bt_element_t *bt_data, size_t maxsize, int skip) JL_NOTSAFEPOINT;
NOINLINE int failed_to_stop_thread_fun(jl_bt_element_t *bt_data, size_t maxsize, int skip) JL_NOTSAFEPOINT;

#define PROFILE_TASK_DEBUG_FORCE_SAMPLING_FAILURE (0)
#define PROFILE_TASK_DEBUG_FORCE_STOP_THREAD_FAILURE (0)

void jl_profile_task(void)
{
if (jl_profile_is_buffer_full()) {
// Buffer full: Delete the timer
jl_profile_stop_timer();
return;
}

jl_task_t *t = NULL;
int got_mutex = 0;
if (uv_mutex_trylock(&live_tasks_lock) != 0) {
goto collect_backtrace;
}
got_mutex = 1;

arraylist_t *tasks = jl_get_all_tasks_arraylist();
uint64_t seed = jl_rand();
const int n_max_random_attempts = (1 << 2);
// randomly select a task that is not done
for (int i = 0; i < n_max_random_attempts; i++) {
t = (jl_task_t*)tasks->items[cong(tasks->len, &seed)];
assert(t == NULL || jl_is_task(t));
if (t == NULL) {
continue;
}
int t_state = jl_atomic_load_relaxed(&t->_state);
if (t_state == JL_TASK_STATE_DONE) {
continue;
}
break;
}
arraylist_free(tasks);
free(tasks);

collect_backtrace:

uv_mutex_lock(&bt_data_prof_lock);
if (profile_running == 0) {
uv_mutex_unlock(&bt_data_prof_lock);
if (got_mutex) {
uv_mutex_unlock(&live_tasks_lock);
}
return;
}

jl_record_backtrace_result_t r = {0, INT16_MAX};
jl_bt_element_t *bt_data_prof = (jl_bt_element_t*)(profile_bt_data_prof + profile_bt_size_cur);
size_t bt_size_max = profile_bt_size_max - profile_bt_size_cur - 1;
if (t == NULL || PROFILE_TASK_DEBUG_FORCE_SAMPLING_FAILURE) {
// failed to find a task
r.bt_size = failed_to_sample_task_fun(bt_data_prof, bt_size_max, 0);
}
else {
if (!PROFILE_TASK_DEBUG_FORCE_STOP_THREAD_FAILURE) {
r = jl_record_backtrace(t, bt_data_prof, bt_size_max, 1);
}
// we failed to get a backtrace
if (r.bt_size == 0) {
r.bt_size = failed_to_stop_thread_fun(bt_data_prof, bt_size_max, 0);
}
}

// update the profile buffer size
profile_bt_size_cur += r.bt_size;

// store threadid but add 1 as 0 is preserved to indicate end of block
profile_bt_data_prof[profile_bt_size_cur++].uintptr = (uintptr_t)r.tid + 1;

// store task id (never null)
profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)t;

// store cpu cycle clock
profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock();

// the thread profiler uses this block to record whether the thread is not sleeping (1) or sleeping (2)
// let's use a dummy value which is not 1 or 2 to
// indicate that we are profiling a task, and therefore, this block is not about the thread state
profile_bt_data_prof[profile_bt_size_cur++].uintptr = 3;

// Mark the end of this block with two 0's
profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;

uv_mutex_unlock(&bt_data_prof_lock);
if (got_mutex) {
uv_mutex_unlock(&live_tasks_lock);
}
}

static uint64_t jl_last_sigint_trigger = 0;
Expand Down
Loading

0 comments on commit 6c033d6

Please sign in to comment.