threads: improve ggml_barrier scaling with large number of threads #9598

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
ggml/src/ggml.c (59 changes: 45 additions & 14 deletions)
@@ -63,6 +63,25 @@ int ggml_sve_cnt_b = 0;
 #pragma warning(disable: 4702)
 #endif
 
+// Note: once we move threading into a separate C++ file
+// will use std::hardware_destructive_interference_size instead of hardcoding it here
+// and we'll use C++ attribute syntax.
+#define GGML_CACHE_LINE 64
+
+#if defined(__clang__) || defined(__GNUC__)
+#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE)))
+#endif
+
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define GGML_TSAN_ENABLED 1
+#endif
+#else // __has_feature
+#if defined(__SANITIZE_THREAD__)
+#define GGML_TSAN_ENABLED 1
+#endif
+#endif // __has_feature
+
 #if defined(_WIN32)
 
 #define WIN32_LEAN_AND_MEAN
@@ -72,6 +91,8 @@ int ggml_sve_cnt_b = 0;
 #include <windows.h>
 
 #if !defined(__clang__)
+#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
+
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
 typedef atomic_int atomic_flag;
@@ -2007,8 +2028,8 @@ struct ggml_threadpool {
 
     // synchronization primitives
     atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
-    atomic_int n_barrier;
-    atomic_int n_barrier_passed;
+    atomic_int GGML_CACHE_ALIGN n_barrier;
+    atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
 
     // these are atomic as an annotation for thread-sanitizer
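Why the alignment matters: n_barrier and n_barrier_passed are the two hottest words in the pool and are written by different threads at different moments, so giving each its own cache line keeps the arriving threads (bumping n_barrier) from invalidating the line that the waiters are spinning on (false sharing). A minimal standalone sketch of the effect, not ggml code: it assumes a 64-byte line and uses standard C11 alignas in place of the GGML_CACHE_ALIGN macro.

#include <stdalign.h>
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

#define CACHE_LINE 64  // assumed line size, same value the PR hardcodes

// both counters can land on one cache line: every write to `a` also
// invalidates the line that readers of `b` are spinning on (false sharing)
struct counters_packed {
    atomic_int a;
    atomic_int b;
};

// each counter starts on its own cache line, so traffic on one
// does not disturb readers of the other
struct counters_padded {
    alignas(CACHE_LINE) atomic_int a;
    alignas(CACHE_LINE) atomic_int b;
};

int main(void) {
    printf("packed: a@%zu b@%zu\n",
           offsetof(struct counters_packed, a), offsetof(struct counters_packed, b));
    printf("padded: a@%zu b@%zu\n",
           offsetof(struct counters_padded, a), offsetof(struct counters_padded, b));
    return 0;
}

On a typical x86-64 build the padded struct reports b at offset 64 instead of 4, which is exactly the separation GGML_CACHE_ALIGN buys for the two counters above.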
@@ -3196,20 +3217,27 @@ static void ggml_barrier(struct ggml_threadpool * tp) {
     // enter barrier (full seq-cst fence)
     int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
 
-    int last = 0;
     if (n_barrier == (n_threads - 1)) {
         // last thread
         atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
-        last = 1;
-    } else {
-        // wait for other threads
-        while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
-            ggml_thread_cpu_relax();
-        }
+
+        // exit barrier (fill seq-cst fence)
+        atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
+        return;
     }
 
+    // wait for other threads
+    while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
+        ggml_thread_cpu_relax();
+    }
+
     // exit barrier (full seq-cst fence)
-    atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
+    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
+#ifdef GGML_TSAN_ENABLED
+    atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst);
+#else
+    atomic_thread_fence(memory_order_seq_cst);
+#endif
 #endif
 }
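For readers following the barrier change outside the diff view, the new control flow is: every thread snapshots n_barrier_passed, counts itself in via n_barrier, the last arrival resets n_barrier and bumps n_barrier_passed to release everyone, and the waiters spin on n_barrier_passed with relaxed loads and pay for a single full fence on the way out. Below is a stripped-down, self-contained sketch of the same scheme, illustrative only: it assumes C11 <threads.h> and <stdatomic.h>, demo_barrier, worker and N_THREADS are made-up names, and the TSAN fallback is omitted.

#include <stdatomic.h>
#include <stdio.h>
#include <threads.h>

#define N_THREADS 4

static atomic_int n_barrier;        // threads that have arrived at the current barrier
static atomic_int n_barrier_passed; // incremented once per completed barrier ("generation")

static void demo_barrier(void) {
    int n_passed = atomic_load_explicit(&n_barrier_passed, memory_order_relaxed);

    // count this thread in; seq-cst so writes made before the barrier are published
    int arrived = atomic_fetch_add_explicit(&n_barrier, 1, memory_order_seq_cst);

    if (arrived == N_THREADS - 1) {
        // last thread: reset the arrival counter and release the waiters
        atomic_store_explicit(&n_barrier, 0, memory_order_relaxed);
        atomic_fetch_add_explicit(&n_barrier_passed, 1, memory_order_seq_cst);
        return;
    }

    // everyone else spins until the generation counter moves
    while (atomic_load_explicit(&n_barrier_passed, memory_order_relaxed) == n_passed) {
        // busy-wait; ggml inserts a cpu-relax hint here
    }

    // full fence on exit so writes made by other threads before the barrier are visible
    atomic_thread_fence(memory_order_seq_cst);
}

static int worker(void * arg) {
    int id = *(int *) arg;
    for (int pass = 0; pass < 3; pass++) {
        printf("thread %d reached barrier %d\n", id, pass);
        demo_barrier();
    }
    return 0;
}

int main(void) {
    thrd_t thread[N_THREADS];
    int    id[N_THREADS];
    for (int i = 0; i < N_THREADS; i++) {
        id[i] = i;
        thrd_create(&thread[i], worker, &id[i]);
    }
    for (int i = 0; i < N_THREADS; i++) {
        thrd_join(thread[i], NULL);
    }
    return 0;
}

Compared to the old code, the waiters no longer perform a read-modify-write on n_barrier_passed at exit (they only fence), so with many threads the exit path stops ping-ponging that cache line.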

@@ -20240,10 +20268,13 @@ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * s
 
 // sync thread state after polling
 static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-    // this should just be atomic_thread_fence(seq_cst) but it confuses thread-sanitizer
-    // so instead we just use a dummy read-modify-write
-    atomic_fetch_add_explicit(&threadpool->n_graph, 0, memory_order_seq_cst);
+    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
+#ifdef GGML_TSAN_ENABLED
+    atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst);
+#else
+    atomic_thread_fence(memory_order_seq_cst);
+#endif
+    UNUSED(state);
 }
 
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
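The ggml_graph_compute_thread_sync change follows the same idiom as the barrier exit: poll an atomic with relaxed loads, then issue one full fence (or, under TSAN, an equivalent dummy read-modify-write) before touching the data that was published before the counter moved. A compilable fragment of that publish/poll pattern in plain C11, with illustrative names and no driver code:

#include <stdatomic.h>

static int        shared_work; // plain payload written by the producer
static atomic_int work_seq;    // bumped once per newly published piece of work

// producer: write the payload, fence, then bump the sequence counter
static void publish(int value) {
    shared_work = value;
    atomic_thread_fence(memory_order_seq_cst);                     // release side
    atomic_fetch_add_explicit(&work_seq, 1, memory_order_relaxed);
}

// consumer: poll cheaply with relaxed loads, fence, then read the payload
static int wait_and_consume(int last_seq) {
    while (atomic_load_explicit(&work_seq, memory_order_relaxed) == last_seq) {
        // spin; ggml adds a cpu-relax hint here
    }
    atomic_thread_fence(memory_order_seq_cst);                     // acquire side
    return shared_work;
}

The two fences pair through the relaxed accesses to work_seq, which is what makes the non-atomic read of shared_work safe after the polling loop.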