ggml : reduce hash table reset cost (ggerganov#8698)

* ggml : reduce hash table reset cost * fix unreachable code warnings after GGML_ASSERT(false) * GGML_ASSERT(false) -> GGML_ABORT("fatal error") * GGML_ABORT use format string
teleprint-me · Jul 27, 2024 · 2b1f616 · 2b1f616
1 parent 01245f5
commit 2b1f616
Show file tree

Hide file tree

Showing 46 changed files with 851 additions and 754 deletions.
diff --git a/Makefile b/Makefile
@@ -325,9 +325,9 @@ ifdef LLAMA_DEBUG
 	endif
 else
 	MK_CPPFLAGS   += -DNDEBUG
-	MK_CFLAGS     += -O3
-	MK_CXXFLAGS   += -O3
-	MK_NVCCFLAGS  += -O3
+	MK_CFLAGS     += -O3 -g
+	MK_CXXFLAGS   += -O3 -g
+	MK_NVCCFLAGS  += -O3 -g
 endif
 
 ifdef LLAMA_SANITIZE_THREAD

diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
@@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                     } else if (type == GGML_TYPE_I8) {
                         v = (float) *(int8_t *) &data[i];
                     } else {
-                        GGML_ASSERT(false);
+                        GGML_ABORT("fatal error");
                     }
                     printf("%12.4f", v);
                     sum += v;

diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
@@ -127,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         }
         else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
             fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
-            exit(1); //GGML_ASSERT(false);
+            exit(1); //GGML_ABORT("fatal error");
         }
         if (m_params.verbosity > 1) {
             printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
@@ -176,7 +176,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         }
         else if (e.values.size() != (size_t)src1->ne[0]) {
             fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
-            exit(1); //GGML_ASSERT(false);
+            exit(1); //GGML_ABORT("fatal error");
         }
         ++e.ncall;
         if (m_params.verbosity > 1) {

diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
@@ -150,7 +150,7 @@ static const char * output_format_str(output_formats format) {
         case JSON:     return "json";
         case MARKDOWN: return "md";
         case SQL:      return "sql";
-        default: GGML_ASSERT(!"invalid output format");
+        default: GGML_ABORT("invalid output format");
     }
 }
 
@@ -176,7 +176,7 @@ static const char * split_mode_str(llama_split_mode mode) {
         case LLAMA_SPLIT_MODE_NONE:  return "none";
         case LLAMA_SPLIT_MODE_LAYER: return "layer";
         case LLAMA_SPLIT_MODE_ROW:   return "row";
-        default: GGML_ASSERT(!"invalid split mode");
+        default: GGML_ABORT("invalid split mode");
     }
 }
 
@@ -1326,7 +1326,7 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
         case SQL:
             return std::unique_ptr<printer>(new sql_printer());
     }
-    GGML_ASSERT(false);
+    GGML_ABORT("fatal error");
 }
 
 int main(int argc, char ** argv) {

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
@@ -869,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             embeddings = peg_0;
         }
         else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
 

diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp
@@ -163,7 +163,7 @@ static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
                 printf(">");
                 return;
             }
-            GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
+            GGML_ABORT("MultiByteToWideChar() failed in an unexpected way.");
         }
 
         LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
@@ -254,18 +254,8 @@
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
 
-#define GGML_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fflush(stdout); \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            ggml_print_backtrace(); \
-            abort(); \
-        } \
-    } while (0)
-
 #ifndef NDEBUG
-#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
 #elif defined(__GNUC__)
 #define GGML_UNREACHABLE() __builtin_unreachable()
 #elif defined(_MSC_VER)
@@ -274,6 +264,17 @@
 #define GGML_UNREACHABLE() ((void) 0)
 #endif
 
+#ifdef __cplusplus
+#define GGML_NORETURN [[noreturn]]
+#elif defined(_MSC_VER)
+#define GGML_NORETURN __declspec(noreturn)
+#else
+#define GGML_NORETURN _Noreturn
+#endif
+
+#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
+#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
+
 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
 //
@@ -322,6 +323,9 @@
 extern "C" {
 #endif
 
+    GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
+    GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
+
     enum ggml_status {
         GGML_STATUS_ALLOC_FAILED = -2,
         GGML_STATUS_FAILED = -1,
@@ -636,8 +640,11 @@ extern "C" {
         GGML_CGRAPH_EVAL_ORDER_COUNT
     };
 
+    typedef uint32_t ggml_bitset_t;
+
     struct ggml_hash_set {
         size_t size;
+        ggml_bitset_t * used;
         struct ggml_tensor ** keys;
     };
 
@@ -651,7 +658,7 @@ extern "C" {
         struct ggml_tensor ** grads;
         struct ggml_tensor ** leafs;
 
-        struct ggml_hash_set visited_hash_table;
+        struct ggml_hash_set visited_hash_set;
 
         enum ggml_cgraph_eval_order order;
     };
@@ -698,8 +705,6 @@ extern "C" {
     GGML_API int64_t ggml_cycles(void);
     GGML_API int64_t ggml_cycles_per_ms(void);
 
-    GGML_API void    ggml_print_backtrace(void);
-
     // accepts a UTF-8 path, even on Windows
     GGML_API FILE *  ggml_fopen(const char * fname, const char * mode);
 
@@ -2005,8 +2010,8 @@ extern "C" {
 
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan            (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API enum ggml_status  ggml_graph_compute         (      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API enum ggml_status  ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
     GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
@@ -91,8 +91,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
     if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
         fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
-        GGML_ASSERT(!"not enough space in the buffer");
-        return;
+        GGML_ABORT("not enough space in the buffer");
     }
 
     void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
             return;
         }
     }
-    GGML_ASSERT(!"out of allocated_tensors");
+    GGML_ABORT("out of allocated_tensors");
 }
 static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
@@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
             return;
         }
     }
-    fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
-    GGML_ASSERT(!"tensor not found");
+    GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
 }
 #endif
 
@@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
             // this should never happen
             fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                     __func__, size, max_avail);
-            GGML_ASSERT(!"not enough space in the buffer");
-            GGML_UNREACHABLE();
+            GGML_ABORT("not enough space in the buffer");
         }
     }
 
@@ -443,7 +440,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
         }
     }
 
-    free(galloc->hash_set.keys);
+    ggml_hash_set_free(&galloc->hash_set);
     free(galloc->hash_values);
     free(galloc->bufts);
     free(galloc->buffers);
@@ -456,7 +453,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
 typedef struct ggml_gallocr * ggml_gallocr_t;
 
 static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
+    size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
     return &galloc->hash_values[i];
 }
 
@@ -565,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
 
 static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
-    memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
-    memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
+    ggml_hash_set_reset(&galloc->hash_set);
+    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
 
     // allocate leafs
     // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
@@ -671,21 +668,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
 }
 
 bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
-    size_t hash_size = graph->visited_hash_table.size;
+    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
+    // add 25% margin to avoid hash collisions
+    min_hash_size += min_hash_size / 4;
 
     // initialize hash table
-    if (galloc->hash_set.size < hash_size) {
-        free(galloc->hash_set.keys);
-        free(galloc->hash_values);
-        galloc->hash_set.size = hash_size;
-        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
-        galloc->hash_values   = calloc(hash_size, sizeof(struct hash_node));
+    if (galloc->hash_set.size < min_hash_size) {
+        ggml_hash_set_free(&galloc->hash_set);
+        galloc->hash_set = ggml_hash_set_new(min_hash_size);
         GGML_ASSERT(galloc->hash_set.keys != NULL);
+
+        free(galloc->hash_values);
+        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
         GGML_ASSERT(galloc->hash_values != NULL);
-    } else {
-        // reset hash table
-        memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
-        memset(galloc->hash_values,   0, sizeof(struct hash_node) * galloc->hash_set.size);
     }
 
     // reset allocators
@@ -817,8 +812,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 }
 
 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
-    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
+    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
     return talloc->size_max >= node_size;
 }