Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP thread cpu time #175

Closed
wants to merge 10 commits into from
2 changes: 2 additions & 0 deletions base/locks-mt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,10 @@ end
Base.assert_havelock(l::SpinLock) = islocked(l) ? nothing : Base.concurrency_violation()

function lock(l::SpinLock)
# t0 = time_ns()
while true
if @inline trylock(l)
# ccall(:jl_record_lock_spin_time, Cvoid, (Culonglong,), time_ns() - t0)
return
end
ccall(:jl_cpu_suspend, Cvoid, ())
Expand Down
26 changes: 26 additions & 0 deletions base/timing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,32 @@ function jit_total_bytes()
return ccall(:jl_jit_total_bytes, Csize_t, ())
end

thread_up_time() = ccall(:jl_thread_up_time, UInt64, ())

_unsafe_thread_user_time(tid::Integer) = ccall(:jl_thread_user_time, UInt64, (Cint,), Cint(tid))

function thread_user_time(tid::Integer)
max_tid = Threads.threadpoolsize(:interactive) + Threads.threadpoolsize(:default)
0 < tid ≤ max_tid || throw(ArgumentError("invalid thread id `$(tid)`"))
return _unsafe_thread_user_time(tid)
end

function thread_user_time(pool::Symbol=:all)
total = UInt64(0)
pool in (:all, :default, :interactive) || throw(ArgumentError("invalid threadpool `$(repr(pool))`"))
if pool === :all || pool === :default
for tid in Threads.threadpooltids(:default)
total += _unsafe_thread_user_time(tid)
end
end
if pool === :all || pool === :interactive
for tid in Threads.threadpooltids(:interactive)
total += _unsafe_thread_user_time(tid)
end
end
return total
end

# print elapsed time, return expression value
const _mem_units = ["byte", "KiB", "MiB", "GiB", "TiB", "PiB"]
const _cnt_units = ["", " k", " M", " G", " T", " P"]
Expand Down
6 changes: 4 additions & 2 deletions src/gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -3741,7 +3741,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
int8_t old_state = jl_atomic_load_relaxed(&ptls->gc_state);
jl_atomic_store_release(&ptls->gc_state, JL_GC_STATE_WAITING);
// `jl_safepoint_start_gc()` makes sure only one thread can run the GC.
uint64_t t0 = jl_hrtime();
uint64_t t0 = jl_hrtime(); // time we entered GC
if (!jl_safepoint_start_gc()) {
// either another thread is running GC, or the GC got disabled just now.
jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING);
Expand Down Expand Up @@ -3770,7 +3770,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
jl_gc_wait_for_the_world(gc_all_tls_states, gc_n_threads);
JL_PROBE_GC_STOP_THE_WORLD();

uint64_t t1 = jl_hrtime();
uint64_t t1 = jl_hrtime(); // time user code stopped running
uint64_t duration = t1 - t0;
if (duration > gc_num.max_time_to_safepoint)
gc_num.max_time_to_safepoint = duration;
Expand Down Expand Up @@ -3798,6 +3798,8 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
jl_safepoint_end_gc();
jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING);
JL_PROBE_GC_END();
// Time how long GC took.
ptls->timing_tls.gc_time += jl_hrtime() - t1;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GC time for the thread coordinating the GC


// Only disable finalizers on current thread
// Doing this on all threads is racy (it's impossible to check
Expand Down
9 changes: 9 additions & 0 deletions src/julia_threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,14 @@ typedef struct {

struct _jl_bt_element_t;

typedef struct {
uint64_t start_time;
uint64_t sleep_time;
uint64_t scheduler_time;
/* uint64_t lock_spin_time; */
uint64_t gc_time;
} jl_timing_tls_states_t;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: should this be like GC_Num and have a corresponding struct on the Julia side, so on that side we work with the struct rather than individual numbers?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

downside: any user-facing struct is impossible to expand, so probably just want to be exposing functions that return numbers... could still be passing data from C -> Julia side as a struct but idk if that gains us much tbh


// This includes all the thread local states we care about for a thread.
// Changes to TLS field types must be reflected in codegen.
#define JL_MAX_BT_SIZE 80000
Expand Down Expand Up @@ -155,6 +163,7 @@ typedef struct _jl_tls_states_t {
// Counter to disable finalizer **on the current thread**
int finalizers_inhibited;
jl_gc_tls_states_t gc_tls; // this is very large, and the offset of the first member is baked into codegen
jl_timing_tls_states_t timing_tls;
volatile sig_atomic_t defer_signal;
_Atomic(struct _jl_task_t*) current_task;
struct _jl_task_t *next_task;
Expand Down
13 changes: 9 additions & 4 deletions src/partr.c
Original file line number Diff line number Diff line change
Expand Up @@ -389,12 +389,14 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q,
{
jl_task_t *ct = jl_current_task;
uint64_t start_cycles = 0;

uint64_t t0 = jl_hrtime();
while (1) {
jl_ptls_t ptls = ct->ptls;
jl_task_t *task = get_next_task(trypoptask, q);
if (task)
if (task) {
ptls->timing_tls.scheduler_time += jl_hrtime() - t0;
return task;

}
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is the fast-path for task-switches (i think?), but i think we've concluded this shouldn't add too much overhead (given jl_hrtime is a vdso call)... still need to verify that experimentally though

// quick, race-y check to see if there seems to be any stuff in there
jl_cpu_pause();
if (!check_empty(checkempty)) {
Expand All @@ -403,7 +405,6 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q,
}

jl_cpu_pause();
jl_ptls_t ptls = ct->ptls;
if (sleep_check_after_threshold(&start_cycles) || (ptls->tid == 0 && (!jl_atomic_load_relaxed(&_threadedregion) || wait_empty))) {
// acquire sleep-check lock
jl_atomic_store_relaxed(&ptls->sleep_check_state, sleeping);
Expand All @@ -425,6 +426,7 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q,
JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls);
}
if (task)
ptls->timing_tls.scheduler_time += jl_hrtime() - t0;
return task;
continue;
}
Expand All @@ -433,6 +435,7 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q,
jl_atomic_store_relaxed(&ptls->sleep_check_state, not_sleeping); // let other threads know they don't need to wake us
JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls);
}
ptls->timing_tls.scheduler_time += jl_hrtime() - t0;
return task;
}

Expand Down Expand Up @@ -507,6 +510,7 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q,

// the other threads will just wait for an individual wake signal to resume
JULIA_DEBUG_SLEEPWAKE( ptls->sleep_enter = cycleclock() );
uint64_t tsleep0 = jl_hrtime();
int8_t gc_state = jl_gc_safe_enter(ptls);
uv_mutex_lock(&ptls->sleep_lock);
while (may_sleep(ptls)) {
Expand All @@ -523,6 +527,7 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q,
assert(jl_atomic_load_relaxed(&ptls->sleep_check_state) == not_sleeping);
uv_mutex_unlock(&ptls->sleep_lock);
JULIA_DEBUG_SLEEPWAKE( ptls->sleep_leave = cycleclock() );
ptls->timing_tls.sleep_time += jl_hrtime() - tsleep0;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

currently sleep_time is a subset of scheduler_time (so we might want to rename scheduler_time to make that clear, or do the extra accounting so that we stop accumulating scheduler_time when we start accumulating sleep_time?)

jl_gc_safe_leave(ptls, gc_state); // contains jl_gc_safepoint
start_cycles = 0;
if (task) {
Expand Down
6 changes: 5 additions & 1 deletion src/safepoint.c
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,10 @@ void jl_safepoint_wait_gc(void)
jl_task_t *ct = jl_current_task; (void)ct;
JL_TIMING_SUSPEND_TASK(GC_SAFEPOINT, ct);
// The thread should have set this is already
assert(jl_atomic_load_relaxed(&ct->ptls->gc_state) != 0);
jl_ptls_t ptls = ct->ptls;
assert(jl_atomic_load_relaxed(&ptls->gc_state) != 0);
// Time how long this thread is stopped while GC is running.
uint64_t t0 = jl_hrtime();
// Use normal volatile load in the loop for speed until GC finishes.
// Then use an acquire load to make sure the GC result is visible on this thread.
while (jl_atomic_load_relaxed(&jl_gc_running) || jl_atomic_load_acquire(&jl_gc_running)) {
Expand All @@ -173,6 +176,7 @@ void jl_safepoint_wait_gc(void)
uv_cond_wait(&safepoint_cond, &safepoint_lock);
uv_mutex_unlock(&safepoint_lock);
}
ptls->timing_tls.gc_time = jl_hrtime() - t0;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GC time for the other threads

}

void jl_safepoint_enable_sigint(void)
Expand Down
29 changes: 28 additions & 1 deletion src/threading.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,24 @@ JL_DLLEXPORT _Atomic(uint8_t) jl_measure_compile_time_enabled = 0;
JL_DLLEXPORT _Atomic(uint64_t) jl_cumulative_compile_time = 0;
JL_DLLEXPORT _Atomic(uint64_t) jl_cumulative_recompile_time = 0;

static uint64_t jl_thread_start_time;
void jl_set_thread_start_time(void)
{
jl_thread_start_time = jl_hrtime();
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is global shared by all threads, which technically isn't correct since threads will start at very slightly different times, but i think this is fine at least for a first pass?

}

// TODO: not just current thread
JL_DLLEXPORT uint64_t jl_thread_up_time(void)
{
return jl_hrtime() - jl_thread_start_time;
}
JL_DLLEXPORT uint64_t jl_thread_user_time(uint8_t tid)
{
jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
jl_timing_tls_states_t *timing = &ptls->timing_tls;
return jl_thread_up_time() - timing->gc_time - timing->scheduler_time;
}

JL_DLLEXPORT void *jl_get_ptls_states(void)
{
// mostly deprecated: use current_task instead
Expand Down Expand Up @@ -765,6 +783,7 @@ void jl_start_threads(void)
}
uv_thread_detach(&uvtid);
}
jl_set_thread_start_time();
}

_Atomic(unsigned) _threadedregion; // HACK: keep track of whether to prioritize IO or threading
Expand Down Expand Up @@ -794,6 +813,13 @@ JL_DLLEXPORT void jl_exit_threaded_region(void)

// Profiling stubs

/* JL_DLLEXPORT void jl_record_lock_spin_time(uint64_t time) JL_NOTSAFEPOINT */
/* { */
/* jl_task_t *ct = jl_current_task; */
/* jl_ptls_t ptls = ct->ptls; */
/* ptls->timing_tls.lock_spin_time += time; */
/* } */

void _jl_mutex_init(jl_mutex_t *lock, const char *name) JL_NOTSAFEPOINT
{
jl_atomic_store_relaxed(&lock->owner, (jl_task_t*)NULL);
Expand All @@ -816,10 +842,12 @@ void _jl_mutex_wait(jl_task_t *self, jl_mutex_t *lock, int safepoint)
return;
}
JL_TIMING(LOCK_SPIN, LOCK_SPIN);
/* uint64_t t0 = jl_hrtime(); */
while (1) {
if (owner == NULL && jl_atomic_cmpswap(&lock->owner, &owner, self)) {
lock->count = 1;
jl_profile_lock_acquired(lock);
/* jl_record_lock_spin_time(jl_hrtime() - t0); */
return;
}
if (safepoint) {
Expand Down Expand Up @@ -919,7 +947,6 @@ void _jl_mutex_unlock(jl_task_t *self, jl_mutex_t *lock)
}
}


// Make gc alignment available for threading
// see threads.jl alignment
JL_DLLEXPORT int jl_alignment(size_t sz)
Expand Down
1 change: 1 addition & 0 deletions src/threading.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ void jl_init_threadinginfra(void);
void jl_parallel_gc_threadfun(void *arg);
void jl_concurrent_gc_threadfun(void *arg);
void jl_threadfun(void *arg);
void jl_set_thread_start_time(void);

#ifdef __cplusplus
}
Expand Down