threads: improve ggml_barrier scaling with large number of threads
Make sure n_barrier and n_barrier_passed do not share a cache line, to avoid cache-line bouncing.
This optimization shows performance improvements even for n_threads <= 8.

Resurrect the TSAN (Thread Sanitizer) check so that we can avoid doing an expensive read-modify-write
in the normal case and just use a thread fence as originally intended.

---
Here is the original description and suggestions from Willy Tarreau:

There's currently some false sharing between n_barrier and
n_barrier_passed that is amplified in ggml_barrier() by the fact that
all threads need to increment n_barrier when entering, while all
previous threads continue to read n_barrier_passed, waiting for the last
one to release them all. The side effect is that all these readers are
slowing down all new threads by making the cache line bounce back and
forth between readers and writers.

Just placing them in two distinct cache lines is sufficient to boost
the performance by 21% on a 80-core ARM server compared to the
no-openmp version, and by 3% compared to the openmp version.

Note that the variables could have been spread apart in the structure
as well, but it doesn't seem that the size of this threadpool struct is
critical so here we're simply aligning them.

Finally, the same issue was present when leaving the barrier since all
threads had to update the n_barrier_passed counter, though only one
would add a non-zero value. This alone is responsible for half of the
cost due to undesired serialization.

It might be possible that using a small array of n_barrier counters
could make things even faster on many-core systems, but it would likely
complicate the logic needed to detect the last thread.

Co-authored-by: Willy Tarreau <w@1wt.eu>
max-krasnyansky and wtarreau committed Sep 23, 2024
1 parent d09770c commit 32f945d
Showing 1 changed file with 34 additions and 11 deletions.
45 changes: 34 additions & 11 deletions ggml/src/ggml.c
@@ -63,6 +63,21 @@ int ggml_sve_cnt_b = 0;
#pragma warning(disable: 4702)
#endif

// Note: once we move threading into a separate C++ file
// will use std::hardware_destructive_interference_size instead of hardcoding it here
// and we'll use C++ attribute syntax.
#define GGML_CACHE_LINE 64

#if defined(__clang__) || defined(__GNUC__)
#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE)))
#endif

#if defined(__has_feature)
#if __has_feature(thread_sanitizer)
#define GGML_TSAN_ENABLED 1
#endif
#endif

#if defined(_WIN32)

#define WIN32_LEAN_AND_MEAN
@@ -72,6 +87,8 @@ int ggml_sve_cnt_b = 0;
#include <windows.h>

#if !defined(__clang__)
#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))

typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
typedef atomic_int atomic_flag;
@@ -2007,8 +2024,8 @@ struct ggml_threadpool {

// synchronization primitives
atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
atomic_int n_barrier;
atomic_int n_barrier_passed;
atomic_int GGML_CACHE_ALIGN n_barrier;
atomic_int GGML_CACHE_ALIGN n_barrier_passed;
atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.

// these are atomic as an annotation for thread-sanitizer
@@ -3196,20 +3213,23 @@ static void ggml_barrier(struct ggml_threadpool * tp) {
// enter barrier (full seq-cst fence)
int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);

int last = 0;
if (n_barrier == (n_threads - 1)) {
// last thread
atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
last = 1;
atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
} else {
// wait for other threads
while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
ggml_thread_cpu_relax();
}
}

// exit barrier (full seq-cst fence)
atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
#ifdef GGML_TSAN_ENABLED
// TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst);
#else
atomic_thread_fence(memory_order_seq_cst);
#endif
}
#endif
}

@@ -20240,10 +20260,13 @@ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * s

// sync thread state after polling
static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
struct ggml_threadpool * threadpool = state->threadpool;
// this should just be atomic_thread_fence(seq_cst) but it confuses thread-sanitizer
// so instead we just use a dummy read-modify-write
atomic_fetch_add_explicit(&threadpool->n_graph, 0, memory_order_seq_cst);
#ifdef GGML_TSAN_ENABLED
// TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst);
#else
atomic_thread_fence(memory_order_seq_cst);
#endif
UNUSED(state);
}

static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
