Merge branch 'feature/llama_supports_rpc' of github.com:martindevans/llama.cpp into feature/llama_supports_rpc

martindevans · martindevans · commit f87e6ac5e343 · 2024-06-09T16:16:27.000+01:00
diff --git a/llama.h b/llama.h
@@ -97,7 +97,7 @@ extern "C" {
         LLAMA_ROPE_TYPE_GLM  =  4,
     };
 
-    enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
+    enum llama_token_type {
         LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
         LLAMA_TOKEN_TYPE_NORMAL       = 1,
         LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
@@ -107,20 +107,6 @@ extern "C" {
         LLAMA_TOKEN_TYPE_BYTE         = 6,
     };
 
-    enum llama_token_attr {
-        LLAMA_TOKEN_ATTR_UNDEFINED    = 0,
-        LLAMA_TOKEN_ATTR_UNKNOWN      = 1 << 0,
-        LLAMA_TOKEN_ATTR_UNUSED       = 1 << 1,
-        LLAMA_TOKEN_ATTR_NORMAL       = 1 << 2,
-        LLAMA_TOKEN_ATTR_CONTROL      = 1 << 3,  // SPECIAL?
-        LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
-        LLAMA_TOKEN_ATTR_BYTE         = 1 << 5,
-        LLAMA_TOKEN_ATTR_NORMALIZED   = 1 << 6,
-        LLAMA_TOKEN_ATTR_LSTRIP       = 1 << 7,
-        LLAMA_TOKEN_ATTR_RSTRIP       = 1 << 8,
-        LLAMA_TOKEN_ATTR_SINGLE_WORD  = 1 << 9,
-    };
-
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32              = 0,
@@ -365,9 +351,6 @@ extern "C" {
         // modifies a preceding LLAMA_GRETYPE_CHAR or
         // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
         LLAMA_GRETYPE_CHAR_ALT       = 6,
-
-        // any character (.)
-        LLAMA_GRETYPE_CHAR_ANY       = 7,
     };
 
     typedef struct llama_grammar_element {
@@ -430,7 +413,7 @@ extern "C" {
 
     LLAMA_API bool llama_supports_mmap       (void);
     LLAMA_API bool llama_supports_mlock      (void);
-    LLAMA_API bool llama_supports_rpc        (void);
+    LLAMA_API bool llama_supports_rpc        (void); // TMP: https://github.com/ggerganov/llama.cpp/pull/7647#issuecomment-2140234367
     LLAMA_API bool llama_supports_gpu_offload(void);
 
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
@@ -442,8 +425,8 @@ extern "C" {
 
     LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
 
-    LLAMA_API enum llama_vocab_type   llama_vocab_type  (const struct llama_model * model);
-    LLAMA_API enum llama_rope_type    llama_rope_type   (const struct llama_model * model);
+    LLAMA_API enum llama_vocab_type   llama_vocab_type  (const struct llama_model   * model);
+    LLAMA_API enum llama_rope_type    llama_rope_type   (const struct llama_model   * model);
 
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
@@ -839,7 +822,7 @@ extern "C" {
 
     LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
 
-    LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
 
     // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
     LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
@@ -1060,9 +1043,49 @@ extern "C" {
                      llama_token   token);
 
     //
-    // Model split
+    // Beam search
     //
 
+    struct llama_beam_view { // One beam as seen by the beam search callback; element type of llama_beams_state.beam_views.
+        const llama_token * tokens; // Read-only through this view; presumably the beam's token sequence - TODO confirm
+
+        size_t n_tokens; // Presumably the number of entries in tokens - TODO confirm
+        float  p;        // Cumulative beam probability (renormalized relative to all beams)
+        bool   eob;      // Callback should set this to true when a beam is at end-of-beam.
+    };
+
+    // State of all beams, passed by value to the beam search callback (llama_beam_search_callback_fn_t).
+    // Whenever 0 < common_prefix_length, the callback should copy that many tokens from any one beam
+    // (e.g. beam_views[0]), as they will be removed (shifted) from all beams in all subsequent callbacks.
+    // The pointers are valid only during the synchronous callback, so they should not be saved.
+    struct llama_beams_state {
+        struct llama_beam_view * beam_views;
+
+        size_t n_beams;               // Number of elements in beam_views[].
+        size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
+        bool   last_call;             // True iff this is the last callback invocation.
+    };
+
+    // Pointer type of the callback that llama_beam_search invokes.
+    // callback_data is the custom pointer given to llama_beam_search and passed back to the callback, which
+    // avoids global variables; the second (unnamed) argument is the current llama_beams_state, by value.
+    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
+
+    /// @details Deterministically runs a beam search. Returns void; the tokens are presumably collected via callback - TODO confirm.
+    /// @param ctx Pointer to the llama_context.
+    /// @param callback Invoked for each iteration of the beam_search loop, receiving the current llama_beams_state.
+    /// @param callback_data A pointer that is simply passed back to callback as its first argument.
+    /// @param n_beams Number of beams to use.
+    /// @param n_past Number of tokens already evaluated.
+    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
+    LLAMA_API void llama_beam_search(
+                   struct llama_context * ctx,
+        llama_beam_search_callback_fn_t   callback,
+                                   void * callback_data,
+                                 size_t   n_beams,
+                                int32_t   n_past,
+                                int32_t   n_predict);
+
     /// @details Build a split GGUF final path for this chunk.
     ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
     //  Returns the split_path length.