
Commit 6b3e11c

Update llama.cpp API code 20250513
1 parent 152698a commit 6b3e11c

File tree

6 files changed: +192 -7 lines changed


llama_cpp/_ggml.py

Lines changed: 62 additions & 0 deletions
@@ -5,6 +5,7 @@
 import enum
 import os
 import pathlib
+import ctypes

 import llama_cpp._ctypes_extensions as ctypes_ext

@@ -27,3 +28,64 @@ class GGMLLogLevel(enum.IntEnum):
     GGML_LOG_LEVEL_WARN = 3
     GGML_LOG_LEVEL_ERROR = 4
     GGML_LOG_LEVEL_CONT = 5  # continue previous log
+
+# // ====== ggml-opt.h ======
+
+# enum ggml_opt_build_type {
+#     GGML_OPT_BUILD_TYPE_FORWARD = 10,
+#     GGML_OPT_BUILD_TYPE_GRAD    = 20,
+#     GGML_OPT_BUILD_TYPE_OPT     = 30,
+# };
+class GGMLOptBuildType(enum.IntEnum):
+    GGML_OPT_BUILD_TYPE_FORWARD = 10
+    GGML_OPT_BUILD_TYPE_GRAD = 20
+    GGML_OPT_BUILD_TYPE_OPT = 30
+
+
+# // built-in loss types, i.e. the built-in quantities minimized by the optimizer
+# // custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value
+# enum ggml_opt_loss_type {
+#     GGML_OPT_LOSS_TYPE_MEAN,
+#     GGML_OPT_LOSS_TYPE_SUM,
+#     GGML_OPT_LOSS_TYPE_CROSS_ENTROPY,
+#     GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
+# };
+class GGMLOptLossType(enum.IntEnum):
+    GGML_OPT_LOSS_TYPE_MEAN = 0
+    GGML_OPT_LOSS_TYPE_SUM = 1
+    GGML_OPT_LOSS_TYPE_CROSS_ENTROPY = 2
+    GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR = 3
+
+
+# // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
+# struct ggml_opt_optimizer_params {
+#     // AdamW optimizer parameters
+#     struct {
+#         float alpha; // learning rate
+#         float beta1;
+#         float beta2;
+#         float eps;   // epsilon for numerical stability
+#         float wd;    // weight decay for AdamW, use 0.0f to disable
+#     } adamw;
+# };
+class ggml_opt_adamw_params(ctypes.Structure):
+    _fields_ = [
+        ('alpha', ctypes.c_float),  # learning rate
+        ('beta1', ctypes.c_float),
+        ('beta2', ctypes.c_float),
+        ('eps', ctypes.c_float),    # epsilon for numerical stability
+        ('wd', ctypes.c_float),     # weight decay for AdamW, use 0.0f to disable
+    ]
+
+class ggml_opt_optimizer_params(ctypes.Structure):
+    _fields_ = [
+        ('adamw', ggml_opt_adamw_params),  # Nested AdamW parameters
+    ]
+
+
+# // callback to calculate optimizer parameters prior to a backward pass
+# // userdata can be used to pass arbitrary data
+# typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
+ggml_opt_get_optimizer_params = ctypes.CFUNCTYPE(
+    ctypes.POINTER(ggml_opt_optimizer_params), ctypes.c_void_p
+)
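
The new _ggml.py bindings can be exercised directly from Python. Below is a minimal sketch (not part of the commit) that fills a ggml_opt_optimizer_params struct with AdamW values and wraps it in the ggml_opt_get_optimizer_params callback type declared above; the struct is kept at module scope so it outlives the callback.

import ctypes
from llama_cpp._ggml import ggml_opt_optimizer_params, ggml_opt_get_optimizer_params

# Keep the struct alive for as long as the callback may be invoked.
_opt_pars = ggml_opt_optimizer_params()
_opt_pars.adamw.alpha = 1e-4   # learning rate
_opt_pars.adamw.beta1 = 0.9
_opt_pars.adamw.beta2 = 0.999
_opt_pars.adamw.eps = 1e-8
_opt_pars.adamw.wd = 0.0       # 0.0 disables weight decay

@ggml_opt_get_optimizer_params
def get_opt_pars(userdata):
    # userdata is an opaque void*; unused in this sketch
    return ctypes.pointer(_opt_pars)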

llama_cpp/_internals.py

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ def __init__(
         def free_model():
             if self.model is None:
                 return
-            llama_cpp.llama_free_model(self.model)
+            llama_cpp.llama_model_free(self.model)
             self.model = None

         self._exit_stack.callback(free_model)

llama_cpp/llama.py

Lines changed: 9 additions & 1 deletion
@@ -89,9 +89,11 @@ def __init__(
         yarn_beta_fast: float = 32.0,
         yarn_beta_slow: float = 1.0,
         yarn_orig_ctx: int = 0,
+        defrag_thold: float = -1.0,
         embedding: bool = False,
         offload_kqv: bool = True,
         flash_attn: bool = False,
+        op_offload: bool = True,
         # Sampling Params
         no_perf: bool = False,
         last_n_tokens_size: int = 64,
@@ -169,9 +171,11 @@ def __init__(
             yarn_beta_fast: YaRN low correction dim
             yarn_beta_slow: YaRN high correction dim
             yarn_orig_ctx: YaRN original context size
+            defrag_thold: Defragment the KV cache if holes/size > thold, <= 0 disabled (default)
            embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
+            op_offload: whether to offload host tensor operations to device
             no_perf: Measure performance timings.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
@@ -339,9 +343,11 @@ def __init__(
             yarn_beta_slow if yarn_beta_slow != 0.0 else 0
         )
         self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
+        self.context_params.defrag_thold = defrag_thold
         self.context_params.embeddings = embedding  # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
         self.context_params.flash_attn = flash_attn
+        self.context_params.op_offload = op_offload
         # KV cache quantization
         if type_k is not None:
             self.context_params.type_k = type_k
@@ -561,7 +567,7 @@ def eval_tokens(self) -> Deque[int]:
     def eval_logits(self) -> Deque[List[float]]:
         return deque(
             self.scores[: self.n_tokens, :].tolist(),
-            maxlen=self._n_ctx
+            maxlen = 1
         )

     def tokenize(
@@ -2189,9 +2195,11 @@ def __getstate__(self):
            yarn_beta_fast=self.context_params.yarn_beta_fast,
            yarn_beta_slow=self.context_params.yarn_beta_slow,
            yarn_orig_ctx=self.context_params.yarn_orig_ctx,
+           defrag_thold=self.context_params.defrag_thold,
            embedding=self.context_params.embeddings,
            offload_kqv=self.context_params.offload_kqv,
            flash_attn=self.context_params.flash_attn,
+           op_offload=self.context_params.op_offload,
            # Sampling Params
            no_perf=self.context_params.no_perf,
            last_n_tokens_size=self.last_n_tokens_size,
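
For the high-level API, the two new constructor arguments can be passed straight to Llama. A brief usage sketch (the model path is a placeholder):

from llama_cpp import Llama

llm = Llama(
    model_path="./models/model.gguf",  # placeholder path
    defrag_thold=0.1,    # defragment the KV cache when holes/size > 0.1; <= 0 disables
    op_offload=False,    # keep host tensor operations on the host instead of the device
)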

llama_cpp/llama_cpp.py

Lines changed: 114 additions & 5 deletions
@@ -1,9 +1,13 @@
 from __future__ import annotations

-import os
 import ctypes
+import os
 import pathlib

+from ._ggml import (
+    ggml_opt_get_optimizer_params
+)
+
 from typing import (
     Callable,
     Union,
@@ -171,6 +175,10 @@
 # llama_sampler_p = NewType("llama_sampler_p", int)
 # llama_sampler_p_ctypes = ctypes.c_void_p

+# struct llama_opt_params;
+llama_opt_params_p = NewType("llama_opt_params_p", int)
+llama_opt_params_p_ctypes = ctypes.c_void_p
+
 # struct llama_kv_cache;
 llama_kv_cache_p = NewType("llama_kv_cache_p", int)
 llama_kv_cache_p_ctypes = ctypes.c_void_p
@@ -243,6 +251,7 @@
 #     LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
 #     LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
 #     LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
+#     LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -279,6 +288,7 @@
 LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32
 LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33
 LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34
+LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35


 # // note: these values should be synchronized with ggml_rope
@@ -790,6 +800,7 @@ class llama_model_params(ctypes.Structure):
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
 #     bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
 #     bool no_perf;     // whether to measure performance timings
+#     bool op_offload;  // whether to offload host tensor operations to device
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context
@@ -811,7 +822,7 @@ class llama_context_params(ctypes.Structure):
         yarn_beta_fast (float): YaRN low correction dim
         yarn_beta_slow (float): YaRN high correction dim
         yarn_orig_ctx (int): YaRN original context size
-        defrag_thold (float): defragment the KV cache if holes/size > thold, < 0 disabled (default)
+        defrag_thold (float): defragment the KV cache if holes/size > thold, <= 0 disabled (default)
         cb_eval (ggml_backend_sched_eval_callback): callback for scheduling eval
         cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
         type_k (int): data type for K cache
@@ -822,6 +833,7 @@ class llama_context_params(ctypes.Structure):
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
         flash_attn (bool): whether to use flash attention
         no_perf (bool): whether to measure performance timings
+        op_offload (bool): whether to offload host tensor operations to device
     """

     if TYPE_CHECKING:
@@ -852,6 +864,7 @@ class llama_context_params(ctypes.Structure):
         offload_kqv: bool
         flash_attn: bool
         no_perf: bool
+        op_offload: bool

     _fields_ = [
         ("n_ctx", ctypes.c_uint32),
@@ -881,6 +894,7 @@ class llama_context_params(ctypes.Structure):
         ("offload_kqv", ctypes.c_bool),
         ("flash_attn", ctypes.c_bool),
         ("no_perf", ctypes.c_bool),
+        ("op_offload", ctypes.c_bool),
     ]


@@ -1193,7 +1207,20 @@ def llama_model_load_from_splits(
     ...


-# LLAMA_API void llama_free_model(struct llama_model * model);
+# LLAMA_API void llama_model_save_to_file(
+#                const struct llama_model * model,
+#                const char * path_model);
+@ctypes_function(
+    "llama_model_save_to_file",
+    [llama_model_p_ctypes, ctypes.c_char_p],
+    None,
+)
+def llama_model_save_to_file(model: llama_model_p, path_model: bytes, /):
+    ...
+
+
+# DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
+#            "use llama_model_free instead");
 @ctypes_function(
     "llama_free_model",
     [llama_model_p_ctypes],
@@ -4128,8 +4155,8 @@ def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int:
         llama_token,
 )
 def llama_sampler_sample(
-    smpl: llama_sampler_p, ctx: llama_context_p, idx: int, /
-) -> int:
+    smpl: llama_sampler_p, ctx: llama_context_p, idx: ctypes.c_int32, /
+) -> ctypes.c_int32:
     ...


@@ -4306,3 +4333,85 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /):
     ...


+# //
+# // training
+# //
+
+# // function that returns whether or not a given tensor contains trainable parameters
+# typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
+llama_opt_param_filter = ctypes.CFUNCTYPE(
+    ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p
+)
+
+
+# // always returns true
+# LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
+@ctypes_function("llama_opt_param_filter_all", [ctypes.c_void_p, ctypes.c_void_p], ctypes.c_bool)
+def llama_opt_param_filter_all(
+    tensor: llama_model_p,
+    userdata: ctypes.c_void_p, /
+) -> bool:
+    ...
+
+# struct llama_opt_params {
+#     uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0
+
+#     llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
+#     void * param_filter_ud;              // userdata for determining which tensors contain trainable parameters
+
+#     ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+#     void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+# };
+class llama_opt_params(ctypes.Structure):
+    _fields_ = [
+        ("n_ctx_train", ctypes.c_uint32),
+        ("param_filter", llama_opt_param_filter),
+        ("param_filter_ud", ctypes.c_void_p),
+        ("get_opt_pars", ggml_opt_get_optimizer_params),
+        ("get_opt_pars_ud", ctypes.c_void_p),
+    ]


+# LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
+@ctypes_function(
+    "llama_opt_init",
+    [llama_context_p_ctypes, llama_model_p_ctypes, llama_opt_params_p_ctypes],
+    None,
+)
+def llama_opt_init(
+    lctx: llama_context_p,
+    model: llama_model_p,
+    lopt_params: llama_opt_params_p, /
+):
+    ...

+# LLAMA_API void llama_opt_epoch(
+#         struct llama_context * lctx,
+#         ggml_opt_dataset_t dataset,
+#         ggml_opt_result_t result_train,
+#         ggml_opt_result_t result_eval,
+#         int64_t idata_split,
+#         ggml_opt_epoch_callback callback_train,
+#         ggml_opt_epoch_callback callback_eval);
+@ctypes_function(
+    "llama_opt_epoch", [
+        llama_context_p_ctypes,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_int64,
+        ctypes.c_void_p,
+        ctypes.c_void_p
+    ],
+    None,
+)
+def llama_opt_epoch(
+    lctx: llama_context_p,
+    dataset: ctypes.c_void_p,
+    result_train: ctypes.c_void_p,
+    result_eval: ctypes.c_void_p,
+    idata_split: ctypes.c_int64,
+    callback_train: ctypes.c_void_p,
+    callback_eval: ctypes.c_void_p, /
+):
+    ...
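
To round out the picture, here is a rough usage sketch of the new training entry points; it is not taken from the commit. It assumes the usual low-level loaders (llama_model_load_from_file / llama_init_from_model) are available in this build, uses a placeholder model path, and, because llama_opt_params_p_ctypes is declared as a plain void pointer, passes the params struct by address.

import ctypes
import llama_cpp.llama_cpp as llama_cpp
from llama_cpp._ggml import ggml_opt_optimizer_params, ggml_opt_get_optimizer_params

llama_cpp.llama_backend_init()

# Placeholder model path; model/context setup uses the standard low-level helpers.
model = llama_cpp.llama_model_load_from_file(b"./models/model.gguf", llama_cpp.llama_model_default_params())
ctx = llama_cpp.llama_init_from_model(model, llama_cpp.llama_context_default_params())

# AdamW parameters handed back to the optimizer before each backward pass.
_adamw = ggml_opt_optimizer_params()
_adamw.adamw.alpha, _adamw.adamw.beta1, _adamw.adamw.beta2 = 1e-4, 0.9, 0.999
_adamw.adamw.eps, _adamw.adamw.wd = 1e-8, 0.0

@ggml_opt_get_optimizer_params
def _get_opt_pars(userdata):
    return ctypes.pointer(_adamw)

@llama_cpp.llama_opt_param_filter
def _train_all(tensor, userdata):
    return True  # treat every tensor as trainable

opt_params = llama_cpp.llama_opt_params(
    n_ctx_train=0,            # 0: reuse the context size of the llama_context
    param_filter=_train_all,
    param_filter_ud=None,
    get_opt_pars=_get_opt_pars,
    get_opt_pars_ud=None,
)

# The binding types the params argument as a void pointer, so pass the struct by address.
llama_cpp.llama_opt_init(
    ctx, model, ctypes.cast(ctypes.pointer(opt_params), ctypes.c_void_p)
)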

llama_cpp/server/model.py

Lines changed: 2 additions & 0 deletions
@@ -260,10 +260,12 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         yarn_beta_fast=settings.yarn_beta_fast,
         yarn_beta_slow=settings.yarn_beta_slow,
         yarn_orig_ctx=settings.yarn_orig_ctx,
+        defrag_thold=settings.defrag_thold,
         mul_mat_q=settings.mul_mat_q,
         embedding=settings.embedding,
         offload_kqv=settings.offload_kqv,
         flash_attn=settings.flash_attn,
+        op_offload=settings.op_offload,
         # Sampling Params
         last_n_tokens_size=settings.last_n_tokens_size,
         # LoRA Params

llama_cpp/server/settings.py

Lines changed: 4 additions & 0 deletions
@@ -95,6 +95,7 @@ class ModelSettings(BaseSettings):
     yarn_beta_fast: float = Field(default=32.0)
     yarn_beta_slow: float = Field(default=1.0)
     yarn_orig_ctx: int = Field(default=0)
+    defrag_thold: float = Field(default=-1.0)
     mul_mat_q: bool = Field(
         default=True, description="if true, use experimental mul_mat_q kernels"
     )
@@ -105,6 +106,9 @@ class ModelSettings(BaseSettings):
     flash_attn: bool = Field(
         default=False, description="Whether to use flash attention."
     )
+    op_offload: bool = Field(
+        default=True, description="Whether to offload host tensor operations to device"
+    )
     # Sampling Params
     last_n_tokens_size: int = Field(
         default=64,
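
On the server side, the new fields ride along with ModelSettings, so they can be set like any other model option. A small sketch (model path is a placeholder) using the settings and loader touched in this commit:

from llama_cpp.server.settings import ModelSettings
from llama_cpp.server.model import load_llama_from_model_settings

settings = ModelSettings(
    model="./models/model.gguf",  # placeholder path
    defrag_thold=0.1,             # enable KV-cache defragmentation above 10% holes
    op_offload=True,              # offload host tensor operations to the device (default)
)
llm = load_llama_from_model_settings(settings)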

0 commit comments
