From afaba33eddc7c7097b4d672a8ba9b9c8cc5a715c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 15 Aug 2023 08:51:07 +0300
Subject: [PATCH] llama : style formatting + remove helper methods

---
 ggml.h         |  10 +--
 gguf-llama.cpp | 172 +++++++++++++++++++------------------------------
 gguf-util.h    |  12 ++--
 3 files changed, 79 insertions(+), 115 deletions(-)

diff --git a/ggml.h b/ggml.h
index fb3db10e2cedb2..9a9c7ab391f025 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1744,12 +1744,12 @@ extern "C" {
     GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
     GGML_API void * gguf_get_data       (struct gguf_context * ctx);
 
-    GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
-    GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
-    GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
+    GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);
+    GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);
+    GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
+
     GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
-    GGML_API enum gguf_type gguf_get_arr_type (struct gguf_context * ctx, int i);
-    GGML_API void gguf_get_val (struct gguf_context * ctx, int i, void * val);
+    GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
 
     GGML_API const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i);
     GGML_API float        gguf_get_arr_f32(struct gguf_context * ctx, int key_id, int i);
diff --git a/gguf-llama.cpp b/gguf-llama.cpp
index 1c1d6718e5740a..9b5a0f97139e2b 100644
--- a/gguf-llama.cpp
+++ b/gguf-llama.cpp
@@ -510,22 +510,9 @@ struct llama_state {
 // global state
 static llama_state g_state;
 
-template <typename T>
-static T checked_mul(T a, T b) {
-    T ret = a * b;
-    if (a != 0 && ret / a != b) {
-        throw std::runtime_error(format("overflow multiplying %llu * %llu",
-                (unsigned long long) a, (unsigned long long) b));
-    }
-    return ret;
-}
-
-static size_t checked_div(size_t a, size_t b) {
-    if (b == 0 || a % b != 0) {
-        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
-    }
-    return a / b;
-}
+//
+// model loading and saving
+//
 
 static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     char buf[256];
@@ -536,14 +523,6 @@ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     return buf;
 }
 
-static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
-    size_t size = ggml_type_size(type);
-    for (uint32_t dim : ne) {
-        size = checked_mul<size_t>(size, dim);
-    }
-    return size / ggml_blck_size(type);
-}
-
 struct gguf_load_tensor {
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
@@ -573,20 +552,19 @@ struct gguf_file_loader {
 
     struct ggml_context * ctx_data = NULL;
 
-    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map)
-        : file(fname, "rb") {
+    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map) : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
 
-            struct gguf_init_params params = {
-                /*.no_alloc = */ true,
-                /*.ctx      = */ &ctx_data,
-            };
+        struct gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx_data,
+        };
 
-            gguf_ctx = gguf_init_from_file(fname, params);
-            file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);
+        gguf_ctx = gguf_init_from_file(fname, params);
+        file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);
 
-            read_hparams();
-            read_vocab();
+        read_hparams();
+        read_vocab();
         read_tensor_metadata(tensors_map);
     }
@@ -636,18 +614,18 @@ struct gguf_file_loader {
     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
 
-        int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
+
+        const int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
         if (token_idx == -1) {
             throw std::runtime_error("cannot find token list in GGUF file\n");
         }
 
-        int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
+        const int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
         if (score_idx == -1) {
             throw std::runtime_error("cannot find token scores list in GGUF file\n");
         }
 
         for (uint32_t i = 0; i < hparams.n_vocab; i++) {
-
             std::string word = gguf_get_arr_str(gguf_ctx, token_idx, i);
 
             vocab.token_to_id[word] = i;
@@ -701,7 +679,7 @@ struct gguf_file_loader {
             tensor.file_off = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, i);
 
             tensor.name = name;
-            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            tensor.size = ggml_nbytes(cur);
 
             tensors_map.tensors.push_back(tensor);
             tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
@@ -786,7 +764,7 @@ struct gguf_file_saver {
         gguf_type arr_type;
         int n_arr;
 
-        switch(vtype) {
+        switch (vtype) {
             case GGUF_TYPE_BOOL:
                 bool_val = gguf_get_val_bool(fl->gguf_ctx, i);
                 file.write_val(key, GGUF_TYPE_BOOL, bool_val);
@@ -809,7 +787,7 @@ struct gguf_file_saver {
                 break;
             case GGUF_TYPE_STRING:
                 str_val = gguf_get_val_str(fl->gguf_ctx, i);
-                file.write_val(key, GGUF_TYPE_STRING, str_val);
+                file.write_str(key, GGUF_TYPE_STRING, str_val);
                 break;
             case GGUF_TYPE_UINT16:
                 u16_val = gguf_get_val_u16(fl->gguf_ctx, i);
@@ -825,7 +803,7 @@ struct gguf_file_saver {
                 break;
             case GGUF_TYPE_ARRAY:
                 arr_type = gguf_get_arr_type(fl->gguf_ctx, i);
-                n_arr = gguf_get_arr_n(fl->gguf_ctx, i);
+                n_arr    = gguf_get_arr_n   (fl->gguf_ctx, i);
                 if (arr_type == GGUF_TYPE_FLOAT32) {
                     write_hparam_arr_f32(key, arr_type, i, n_arr);
                 } else if (arr_type == GGUF_TYPE_STRING) {
@@ -922,20 +900,6 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
-        auto it = tensors_map.name_to_idx.find(name);
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
-        }
-        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
-        if (lt.ne != ne) {
-            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                    name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
-        }
-
-        return get_tensor_for(lt, backend);
-    }
-
     struct ggml_tensor * get_tensor_for(gguf_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (backend != GGML_BACKEND_CPU) {
@@ -959,16 +923,41 @@ struct llama_model_loader {
         return tensor;
     }
 
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+        auto it = tensors_map.name_to_idx.find(name);
+        if (it == tensors_map.name_to_idx.end()) {
+            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
+        }
+        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
+        if (lt.ne != ne) {
+            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                    name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
+        }
+
+        return get_tensor_for(lt, backend);
+    }
+
     void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
             throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
         }
     }
 
-    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
-        size_t data_size = 0;
+    void load_data_for(gguf_load_tensor & lt) const {
+        if (use_mmap) {
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            gguf_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
+            file.read_raw(lt.data, lt.size);
+        }
+    }
+
+    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
+        size_t data_size     = 0;
         size_t prefetch_size = 0;
-        size_t lock_size = 0;
+        size_t lock_size     = 0;
+
         for (const gguf_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -1030,31 +1019,6 @@ struct llama_model_loader {
             done_size += lt.size;
         }
     }
-
-    void load_data_for(gguf_load_tensor & lt) {
-        if (use_mmap) {
-            lt.data = (uint8_t *) mapping->addr + lt.file_off;
-        } else {
-            gguf_file & file = file_loader->file;
-            file.seek(lt.file_off, SEEK_SET);
-            file.read_raw(lt.data, lt.size);
-        }
-
-        if (0) {
-            print_checksum(lt);
-        }
-    }
-
-    static void print_checksum(gguf_load_tensor & lt) {
-        uint32_t sum = 0;
-        for (size_t i = 0; i < lt.size; i++) {
-            uint8_t byte = lt.data[i];
-            sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
-        }
-        fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
-                llama_format_tensor_shape(lt.ne).c_str(), lt.size);
-    }
-
 };
 
 //
@@ -1184,18 +1148,18 @@ int64_t llama_time_us() {
 }
 
 //
-// model loading
+// load LLaMA models
 //
 
-static const char *gguf_file_version_name(gguf_file_version version) {
+static const char * gguf_file_version_name(gguf_file_version version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
-        }
+    }
 
     return "unknown";
 }
 
-static const char *llama_ftype_name(enum llama_ftype ftype) {
+static const char * llama_ftype_name(enum llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
@@ -1206,8 +1170,9 @@ static const char * llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "mostly Q2_K";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
@@ -1215,15 +1180,16 @@ static const char * llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
-        default: return "unknown, may not work";
+        case LLAMA_FTYPE_MOSTLY_Q6_K:   return "mostly Q6_K";
+
+        default: return "unknown, may not work";
     }
 }
 
-static const char *llama_model_type_name(e_model type) {
+static const char * llama_model_type_name(e_model type) {
     switch (type) {
-        case MODEL_3B: return "3B";
-        case MODEL_7B: return "7B";
+        case MODEL_3B:  return "3B";
+        case MODEL_7B:  return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
         case MODEL_65B: return "65B";
@@ -1604,7 +1570,6 @@ static struct ggml_cgraph * llama_build_graph(
     const int64_t n_embd_head = hparams.n_embd_head();
     const int64_t n_embd_gqa  = hparams.n_embd_gqa();
 
-
     GGML_ASSERT(n_embd_head == hparams.n_rot);
 
     const float freq_base = hparams.rope_freq_base;
@@ -1713,7 +1678,7 @@ static struct ggml_cgraph * llama_build_graph(
 
         struct ggml_tensor * inpSA = inpL;
 
-        lctx.use_buf(ctx0, 0);
+        llama_context::use_buf(ctx0, 0);
 
         // norm
         {
@@ -1852,7 +1817,7 @@ static struct ggml_cgraph * llama_build_graph(
             ggml_set_name(cur, "result_wo");
         }
 
-        lctx.use_buf(ctx0, 1);
+        llama_context::use_buf(ctx0, 1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
         offload_func(inpFF);
@@ -1908,7 +1873,7 @@ static struct ggml_cgraph * llama_build_graph(
         inpL = cur;
     }
 
-    lctx.use_buf(ctx0, 0);
+    llama_context::use_buf(ctx0, 0);
 
     // norm
     {
@@ -1926,7 +1891,7 @@ static struct ggml_cgraph * llama_build_graph(
     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");
 
-    lctx.use_buf(ctx0, -1);
+    llama_context::use_buf(ctx0, -1);
 
     // logits -> probs
     //cur = ggml_soft_max_inplace(ctx0, cur);
@@ -2996,9 +2961,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }
 
-    const auto rejects =
-        llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
-    for (auto & reject : rejects) {
+    const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+    for (const auto & reject : rejects) {
         candidates->data[reject.index].logit = -INFINITY;
     }
 
@@ -3725,7 +3689,7 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        const llama_model_quantize_params *params) {
+        const llama_model_quantize_params * params) {
     try {
         llama_model_quantize_internal(fname_inp, fname_out, params);
         return 0;
@@ -4343,8 +4307,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
     GGML_UNUSED(n_token_capacity);
     GGML_UNUSED(n_token_count_out);
 
-
-// TODO: implement with GGUF format
+    // TODO: implement with GGUF format
     return true;
 }
 
@@ -4389,7 +4352,6 @@ int llama_eval(
     return 0;
 }
 
-
 int llama_eval_embd(
         struct llama_context * ctx,
         const float * embd,
diff --git a/gguf-util.h b/gguf-util.h
index d8557d94f114d5..b6a20cf5be4e5d 100644
--- a/gguf-util.h
+++ b/gguf-util.h
@@ -122,9 +122,10 @@ struct gguf_file {
 
     template <typename T>
     void write_val(const std::string & key, enum gguf_type type, const T & val) {
+        static_assert(std::is_fundamental<T>::value, "T must be a primitive type");
         write_str(key);
         fwrite((const char *) &type, sizeof(type), 1, fp);
-        fwrite((const char *) &val, sizeof(val), 1, fp);
+        fwrite((const char *) &val,  sizeof(val),  1, fp);
     }
 
     template <typename T>
@@ -137,7 +138,7 @@ struct gguf_file {
 
         const int32_t n = val.size();
         fwrite((const char *) &type, sizeof(type), 1, fp);
-        fwrite((const char *) &n, sizeof(n), 1, fp);
+        fwrite((const char *) &n,    sizeof(n),    1, fp);
         fwrite(val.data(), sizeof(T), n, fp);
     }
 
@@ -159,7 +160,7 @@ struct gguf_file {
 
         const int32_t n = val.size();
         fwrite((const char *) &type, sizeof(type), 1, fp);
-        fwrite((const char *) &n, sizeof(n), 1, fp);
+        fwrite((const char *) &n,    sizeof(n),    1, fp);
         for (int i = 0; i < n; ++i) {
             const int32_t nstr = val[i].size();
             fwrite((const char *) &nstr, sizeof(nstr), 1, fp);
@@ -265,7 +266,7 @@ struct gguf_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;
 
-    gguf_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
+    gguf_mmap(struct gguf_file * file, bool prefetch = true, bool numa = false) {
         (void) numa;
 
         size = file->size;
@@ -312,7 +313,8 @@ struct gguf_mmap {
 #else
     static constexpr bool SUPPORTED = false;
 
-    gguf_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
+    gguf_mmap(struct gguf_file * file, bool prefetch = true, bool numa = false) {
+        (void) file;
         (void) prefetch;
         (void) numa;
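
Note (illustration, not part of the patch): the one behavioral change hiding in this style pass is in gguf_file_saver — the GGUF_TYPE_STRING case now goes through file.write_str() instead of file.write_val(), and write_val() in gguf-util.h gains a static_assert so that only primitive types can reach the raw fwrite of &val. Below is a minimal standalone sketch of that guard; the *_sketch helpers are hypothetical stand-ins, not the gguf_file API:

    // Writing a std::string through the primitive path would fwrite the
    // string *object* (heap pointer + size fields), not its characters,
    // so the guard turns that misuse into a compile error.
    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <type_traits>

    template <typename T>
    void write_val_sketch(std::FILE * fp, const T & val) {
        static_assert(std::is_fundamental<T>::value, "T must be a primitive type");
        std::fwrite(&val, sizeof(val), 1, fp); // safe: T is a plain scalar
    }

    void write_str_sketch(std::FILE * fp, const std::string & val) {
        const int32_t n = (int32_t) val.size();
        std::fwrite(&n, sizeof(n), 1, fp); // length prefix...
        std::fwrite(val.data(), 1, n, fp); // ...then the raw bytes
    }

    int main() {
        std::FILE * fp = std::fopen("demo.bin", "wb");
        if (!fp) {
            return 1;
        }
        write_val_sketch(fp, true);                  // ok: bool is fundamental
        write_str_sketch(fp, std::string("hello"));  // strings take the string path
      //write_val_sketch(fp, std::string("hello"));  // would no longer compile
        std::fclose(fp);
        return 0;
    }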