
Commit f0c7b5e

threads: improve ggml_barrier scaling with large number of threads (#9598)
Make sure n_barrier and n_barrier_passed do not share a cache line, to avoid cache-line bouncing. This optimization shows performance improvements even for n_threads <= 8 cases.

Resurrect the TSAN (Thread Sanitizer) check so that we can avoid doing the expensive read-modify-write in the normal case and just use a thread fence as originally intended.

---

Here is the original description and suggestions from Willy Tarreau:

There's currently some false sharing between n_barrier and n_barrier_passed that is amplified in ggml_barrier() by the fact that all threads need to increment n_barrier when entering, while all previous threads continue to read n_barrier_passed, waiting for the last one to release them all. The side effect is that all these readers slow down all new threads by making the cache line bounce back and forth between readers and writers.

Just placing the two counters in distinct cache lines is sufficient to boost performance by 21% on an 80-core ARM server compared to the no-openmp version, and by 3% compared to the openmp version. Note that the variables could have been spread apart in the structure as well, but the size of this threadpool struct does not seem critical, so here we simply align them.

Finally, the same issue was present when leaving the barrier, since all threads had to update the n_barrier_passed counter even though only one would add a non-zero value. This alone is responsible for half of the cost, due to undesired serialization.

It might be possible that using a small array of n_barrier counters could make things even faster on many-core systems, but it would likely complicate the logic needed to detect the last thread.

Co-authored-by: Willy Tarreau <w@1wt.eu>
1 parent 1d48e98 commit f0c7b5e
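
For illustration only, here is a minimal standalone sketch of the padding idea described above; struct demo_counters and its member names are hypothetical, invented for this note, and not part of ggml. Forcing each hot atomic onto its own 64-byte cache line means threads spinning on one counter are no longer stalled by threads writing the other:

#include <stdatomic.h>

// Hypothetical illustration of the false-sharing fix: without the _Alignas,
// 'n_enter' and 'n_leave' would typically share one cache line, so every
// write to 'n_enter' invalidates the line that waiters are busy reading
// 'n_leave' from.
struct demo_counters {
    _Alignas(64) atomic_int n_enter;  // incremented by each thread entering the barrier
    _Alignas(64) atomic_int n_leave;  // incremented once per round by the releasing thread
};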

1 file changed (+45, -14)


ggml/src/ggml.c

Lines changed: 45 additions & 14 deletions
@@ -63,6 +63,25 @@ int ggml_sve_cnt_b = 0;
 #pragma warning(disable: 4702)
 #endif
 
+// Note: once we move threading into a separate C++ file
+// will use std::hardware_destructive_interference_size instead of hardcoding it here
+// and we'll use C++ attribute syntax.
+#define GGML_CACHE_LINE 64
+
+#if defined(__clang__) || defined(__GNUC__)
+#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE)))
+#endif
+
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define GGML_TSAN_ENABLED 1
+#endif
+#else // __has_feature
+#if defined(__SANITIZE_THREAD__)
+#define GGML_TSAN_ENABLED 1
+#endif
+#endif // __has_feature
+
 #if defined(_WIN32)
 
 #define WIN32_LEAN_AND_MEAN
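
As a side note on the hunk above: Clang exposes TSAN through __has_feature(thread_sanitizer), while GCC defines __SANITIZE_THREAD__ when compiling with -fsanitize=thread, which is why both probes are needed. A minimal sketch of how such a flag is typically consumed follows; tsan_friendly_fence is a hypothetical helper written for this note, not something added by the commit:

#include <stdatomic.h>

// Hypothetical helper: TSAN does not instrument standalone fences, so under
// TSAN we substitute a dummy read-modify-write on the variable being
// synchronized, which TSAN can track; otherwise a plain fence is cheaper.
static inline void tsan_friendly_fence(atomic_int * sync_var) {
#ifdef GGML_TSAN_ENABLED
    atomic_fetch_add_explicit(sync_var, 0, memory_order_seq_cst);
#else
    (void) sync_var;  // unused in the fence path
    atomic_thread_fence(memory_order_seq_cst);
#endif
}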
@@ -72,6 +91,8 @@ int ggml_sve_cnt_b = 0;
 #include <windows.h>
 
 #if !defined(__clang__)
+#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
+
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
 typedef atomic_int atomic_flag;
@@ -2007,8 +2028,8 @@ struct ggml_threadpool {
 
     // synchronization primitives
     atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
-    atomic_int n_barrier;
-    atomic_int n_barrier_passed;
+    atomic_int GGML_CACHE_ALIGN n_barrier;
+    atomic_int GGML_CACHE_ALIGN n_barrier_passed;
    atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
 
    // these are atomic as an annotation for thread-sanitizer
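
One way to sanity-check the layout change above, offered here as a hypothetical compile-time test rather than anything in the commit, is to mirror the members in a demo struct and assert that the two counters sit at least one cache line apart:

#include <assert.h>     // static_assert (C11)
#include <stdatomic.h>
#include <stddef.h>     // offsetof

// Hypothetical mirror of the relevant ggml_threadpool members.
struct demo_tp {
    atomic_int n_graph;
    _Alignas(64) atomic_int n_barrier;
    _Alignas(64) atomic_int n_barrier_passed;
};

// offsetof is a constant expression, so the check costs nothing at runtime.
static_assert(offsetof(struct demo_tp, n_barrier_passed) -
              offsetof(struct demo_tp, n_barrier) >= 64,
              "counters must not share a cache line");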
@@ -3196,20 +3217,27 @@ static void ggml_barrier(struct ggml_threadpool * tp) {
     // enter barrier (full seq-cst fence)
     int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
 
-    int last = 0;
     if (n_barrier == (n_threads - 1)) {
         // last thread
         atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
-        last = 1;
-    } else {
-        // wait for other threads
-        while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
-            ggml_thread_cpu_relax();
-        }
+
+        // exit barrier (full seq-cst fence)
+        atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
+        return;
+    }
+
+    // wait for other threads
+    while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
+        ggml_thread_cpu_relax();
     }
 
     // exit barrier (full seq-cst fence)
-    atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
+    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
+    #ifdef GGML_TSAN_ENABLED
+    atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst);
+    #else
+    atomic_thread_fence(memory_order_seq_cst);
+    #endif
 #endif
 }
 
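To see the whole barrier in one place, here is a self-contained sketch of the pattern the hunk above converges to, with hypothetical names (demo_barrier, demo_barrier_wait) rather than the ggml implementation: each thread snapshots the pass counter, increments the arrival counter with a seq-cst read-modify-write, and either releases everyone (last arrival) or spins until the pass counter advances:

#include <stdatomic.h>

struct demo_barrier {
    _Alignas(64) atomic_int n_arrived;   // threads that entered the current round
    _Alignas(64) atomic_int n_released;  // completed rounds; waiters spin on this
};

static void demo_barrier_wait(struct demo_barrier * b, int n_threads) {
    int round = atomic_load_explicit(&b->n_released, memory_order_relaxed);

    // enter: the seq-cst RMW doubles as a full fence
    int arrived = atomic_fetch_add_explicit(&b->n_arrived, 1, memory_order_seq_cst);

    if (arrived == n_threads - 1) {
        // last thread: reset for the next round, then release the waiters;
        // only this one thread writes n_released, avoiding the old
        // serialization where every thread performed the final RMW
        atomic_store_explicit(&b->n_arrived, 0, memory_order_relaxed);
        atomic_fetch_add_explicit(&b->n_released, 1, memory_order_seq_cst);
        return;
    }

    // everyone else: spin until the round counter advances
    while (atomic_load_explicit(&b->n_released, memory_order_relaxed) == round) {
        // a real implementation inserts a cpu-relax / pause hint here
    }

    // exit: full fence so post-barrier reads observe pre-barrier writes
    atomic_thread_fence(memory_order_seq_cst);
}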
@@ -20240,10 +20268,13 @@ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * s
 
 // sync thread state after polling
 static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-    // this should just be atomic_thread_fence(seq_cst) but it confuses thread-sanitizer
-    // so instead we just use a dummy read-modify-write
-    atomic_fetch_add_explicit(&threadpool->n_graph, 0, memory_order_seq_cst);
+    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
+    #ifdef GGML_TSAN_ENABLED
+    atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst);
+    #else
+    atomic_thread_fence(memory_order_seq_cst);
+    #endif
+    UNUSED(state);
 }
 
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
