
Commit 6b3e11c

Update llama.cpp API code 20250513
1 parent 152698a commit 6b3e11c

File tree

6 files changed: +192 -7 lines changed


llama_cpp/_ggml.py

Lines changed: 62 additions & 0 deletions
@@ -5,6 +5,7 @@
 import enum
 import os
 import pathlib
+import ctypes

 import llama_cpp._ctypes_extensions as ctypes_ext

@@ -27,3 +28,64 @@ class GGMLLogLevel(enum.IntEnum):
     GGML_LOG_LEVEL_WARN = 3
     GGML_LOG_LEVEL_ERROR = 4
     GGML_LOG_LEVEL_CONT = 5  # continue previous log
+
+# // ====== ggml-opt.h ======
+
+# enum ggml_opt_build_type {
+#     GGML_OPT_BUILD_TYPE_FORWARD = 10,
+#     GGML_OPT_BUILD_TYPE_GRAD    = 20,
+#     GGML_OPT_BUILD_TYPE_OPT     = 30,
+# };
+class GGMLOptBuildType(enum.IntEnum):
+    GGML_OPT_BUILD_TYPE_FORWARD = 10
+    GGML_OPT_BUILD_TYPE_GRAD = 20
+    GGML_OPT_BUILD_TYPE_OPT = 30
+
+
+# // built-in loss types, i.e. the built-in quantities minimized by the optimizer
+# // custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value
+# enum ggml_opt_loss_type {
+#     GGML_OPT_LOSS_TYPE_MEAN,
+#     GGML_OPT_LOSS_TYPE_SUM,
+#     GGML_OPT_LOSS_TYPE_CROSS_ENTROPY,
+#     GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
+# };
+class GGMLOptLossType(enum.IntEnum):
+    GGML_OPT_LOSS_TYPE_MEAN = 0
+    GGML_OPT_LOSS_TYPE_SUM = 1
+    GGML_OPT_LOSS_TYPE_CROSS_ENTROPY = 2
+    GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR = 3
+
+
+# // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
+# struct ggml_opt_optimizer_params {
+#     // AdamW optimizer parameters
+#     struct {
+#         float alpha; // learning rate
+#         float beta1;
+#         float beta2;
+#         float eps;   // epsilon for numerical stability
+#         float wd;    // weight decay for AdamW, use 0.0f to disable
+#     } adamw;
+# };
+class ggml_opt_adamw_params(ctypes.Structure):
+    _fields_ = [
+        ('alpha', ctypes.c_float),  # learning rate
+        ('beta1', ctypes.c_float),
+        ('beta2', ctypes.c_float),
+        ('eps', ctypes.c_float),    # epsilon for numerical stability
+        ('wd', ctypes.c_float),     # weight decay for AdamW, use 0.0f to disable
+    ]
+
+class ggml_opt_optimizer_params(ctypes.Structure):
+    _fields_ = [
+        ('adamw', ggml_opt_adamw_params),  # Nested AdamW parameters
+    ]
+
+
+# // callback to calculate optimizer parameters prior to a backward pass
+# // userdata can be used to pass arbitrary data
+# typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
+ggml_opt_get_optimizer_params = ctypes.CFUNCTYPE(
+    ctypes.POINTER(ggml_opt_optimizer_params), ctypes.c_void_p
+)
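
The new _ggml.py bindings can be exercised directly from Python. Below is a minimal sketch (not part of the commit) that fills a ggml_opt_optimizer_params struct with AdamW values and wraps it in the ggml_opt_get_optimizer_params callback type declared above; the struct is kept at module scope so it outlives the callback.

import ctypes
from llama_cpp._ggml import ggml_opt_optimizer_params, ggml_opt_get_optimizer_params

# Keep the struct alive for as long as the callback may be invoked.
_opt_pars = ggml_opt_optimizer_params()
_opt_pars.adamw.alpha = 1e-4   # learning rate
_opt_pars.adamw.beta1 = 0.9
_opt_pars.adamw.beta2 = 0.999
_opt_pars.adamw.eps = 1e-8
_opt_pars.adamw.wd = 0.0       # 0.0 disables weight decay

@ggml_opt_get_optimizer_params
def get_opt_pars(userdata):
    # userdata is an opaque void*; unused in this sketch
    return ctypes.pointer(_opt_pars)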

llama_cpp/_internals.py

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ def __init__(
         def free_model():
             if self.model is None:
                 return
-            llama_cpp.llama_free_model(self.model)
+            llama_cpp.llama_model_free(self.model)
             self.model = None

         self._exit_stack.callback(free_model)

llama_cpp/llama.py

Lines changed: 9 additions & 1 deletion
@@ -89,9 +89,11 @@ def __init__(
         yarn_beta_fast: float = 32.0,
         yarn_beta_slow: float = 1.0,
         yarn_orig_ctx: int = 0,
+        defrag_thold: float = -1.0,
         embedding: bool = False,
         offload_kqv: bool = True,
         flash_attn: bool = False,
+        op_offload: bool = True,
         # Sampling Params
         no_perf: bool = False,
         last_n_tokens_size: int = 64,
@@ -169,9 +171,11 @@ def __init__(
             yarn_beta_fast: YaRN low correction dim
             yarn_beta_slow: YaRN high correction dim
             yarn_orig_ctx: YaRN original context size
+            defrag_thold: Defragment the KV cache if holes/size > thold, <= 0 disabled (default)
            embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
+            op_offload: whether to offload host tensor operations to device
             no_perf: Measure performance timings.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
@@ -339,9 +343,11 @@ def __init__(
             yarn_beta_slow if yarn_beta_slow != 0.0 else 0
         )
         self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
+        self.context_params.defrag_thold = defrag_thold
         self.context_params.embeddings = embedding  # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
         self.context_params.flash_attn = flash_attn
+        self.context_params.op_offload = op_offload
         # KV cache quantization
         if type_k is not None:
             self.context_params.type_k = type_k
@@ -561,7 +567,7 @@ def eval_tokens(self) -> Deque[int]:
     def eval_logits(self) -> Deque[List[float]]:
         return deque(
             self.scores[: self.n_tokens, :].tolist(),
-            maxlen=self._n_ctx
+            maxlen = 1
         )

     def tokenize(
@@ -2189,9 +2195,11 @@ def __getstate__(self):
            yarn_beta_fast=self.context_params.yarn_beta_fast,
            yarn_beta_slow=self.context_params.yarn_beta_slow,
            yarn_orig_ctx=self.context_params.yarn_orig_ctx,
+           defrag_thold=self.context_params.defrag_thold,
            embedding=self.context_params.embeddings,
            offload_kqv=self.context_params.offload_kqv,
            flash_attn=self.context_params.flash_attn,
+           op_offload=self.context_params.op_offload,
            # Sampling Params
            no_perf=self.context_params.no_perf,
            last_n_tokens_size=self.last_n_tokens_size,
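
For the high-level API, the two new constructor arguments can be passed straight to Llama. A brief usage sketch (the model path is a placeholder):

from llama_cpp import Llama

llm = Llama(
    model_path="./models/model.gguf",  # placeholder path
    defrag_thold=0.1,    # defragment the KV cache when holes/size > 0.1; <= 0 disables
    op_offload=False,    # keep host tensor operations on the host instead of the device
)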

llama_cpp/llama_cpp.py

Lines changed: 114 additions & 5 deletions
@@ -1,9 +1,13 @@
 from __future__ import annotations

-import os
 import ctypes
+import os
 import pathlib

+from ._ggml import (
+    ggml_opt_get_optimizer_params
+)
+
 from typing import (
     Callable,
     Union,
@@ -171,6 +175,10 @@
 # llama_sampler_p = NewType("llama_sampler_p", int)
 # llama_sampler_p_ctypes = ctypes.c_void_p

+# struct llama_opt_params;
+llama_opt_params_p = NewType("llama_opt_params_p", int)
+llama_opt_params_p_ctypes = ctypes.c_void_p
+
 # struct llama_kv_cache;
 llama_kv_cache_p = NewType("llama_kv_cache_p", int)
 llama_kv_cache_p_ctypes = ctypes.c_void_p
@@ -243,6 +251,7 @@
 #     LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
 #     LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
 #     LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
+#     LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -279,6 +288,7 @@
 LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32
 LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33
 LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34
+LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35


 # // note: these values should be synchronized with ggml_rope
@@ -790,6 +800,7 @@ class llama_model_params(ctypes.Structure):
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
 #     bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
 #     bool no_perf;     // whether to measure performance timings
+#     bool op_offload;  // whether to offload host tensor operations to device
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context
@@ -811,7 +822,7 @@ class llama_context_params(ctypes.Structure):
         yarn_beta_fast (float): YaRN low correction dim
         yarn_beta_slow (float): YaRN high correction dim
         yarn_orig_ctx (int): YaRN original context size
-        defrag_thold (float): defragment the KV cache if holes/size > thold, < 0 disabled (default)
+        defrag_thold (float): defragment the KV cache if holes/size > thold, <= 0 disabled (default)
         cb_eval (ggml_backend_sched_eval_callback): callback for scheduling eval
         cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
         type_k (int): data type for K cache
@@ -822,6 +833,7 @@ class llama_context_params(ctypes.Structure):
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
         flash_attn (bool): whether to use flash attention
         no_perf (bool): whether to measure performance timings
+        op_offload (bool): whether to offload host tensor operations to device
     """

     if TYPE_CHECKING:
@@ -852,6 +864,7 @@ class llama_context_params(ctypes.Structure):
         offload_kqv: bool
         flash_attn: bool
         no_perf: bool
+        op_offload: bool

     _fields_ = [
         ("n_ctx", ctypes.c_uint32),
@@ -881,6 +894,7 @@ class llama_context_params(ctypes.Structure):
         ("offload_kqv", ctypes.c_bool),
         ("flash_attn", ctypes.c_bool),
         ("no_perf", ctypes.c_bool),
+        ("op_offload", ctypes.c_bool),
     ]


@@ -1193,7 +1207,20 @@ def llama_model_load_from_splits(
     ...


-# LLAMA_API void llama_free_model(struct llama_model * model);
+# LLAMA_API void llama_model_save_to_file(
+#                const struct llama_model * model,
+#                const char * path_model);
+@ctypes_function(
+    "llama_model_save_to_file",
+    [llama_model_p_ctypes, ctypes.c_char_p],
+    None,
+)
+def llama_model_save_to_file(model: llama_model_p, path_model: bytes, /):
+    ...
+
+
+# DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
+#            "use llama_model_free instead");
 @ctypes_function(
     "llama_free_model",
     [llama_model_p_ctypes],
@@ -4128,8 +4155,8 @@ def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int:
         llama_token,
 )
 def llama_sampler_sample(
-    smpl: llama_sampler_p, ctx: llama_context_p, idx: int, /
-) -> int:
+    smpl: llama_sampler_p, ctx: llama_context_p, idx: ctypes.c_int32, /
+) -> ctypes.c_int32:
     ...


@@ -4306,3 +4333,85 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /):
     ...


+# //
+# // training
+# //
+
+# // function that returns whether or not a given tensor contains trainable parameters
+# typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
+llama_opt_param_filter = ctypes.CFUNCTYPE(
+    ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p
+)
+
+
+# // always returns true
+# LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
+@ctypes_function("llama_opt_param_filter_all", [ctypes.c_void_p, ctypes.c_void_p], ctypes.c_bool)
+def llama_opt_param_filter_all(
+    tensor: llama_model_p,
+    userdata: ctypes.c_void_p, /
+) -> bool:
+    ...
+
+# struct llama_opt_params {
+#     uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0
+
+#     llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
+#     void * param_filter_ud;              // userdata for determining which tensors contain trainable parameters
+
+#     ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+#     void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+# };
+class llama_opt_params(ctypes.Structure):
+    _fields_ = [
+        ("n_ctx_train", ctypes.c_uint32),
+        ("param_filter", llama_opt_param_filter),
+        ("param_filter_ud", ctypes.c_void_p),
+        ("get_opt_pars", ggml_opt_get_optimizer_params),
+        ("get_opt_pars_ud", ctypes.c_void_p),
+    ]


+# LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
+@ctypes_function(
+    "llama_opt_init",
+    [llama_context_p_ctypes, llama_model_p_ctypes, llama_opt_params_p_ctypes],
+    None,
+)
+def llama_opt_init(
+    lctx: llama_context_p,
+    model: llama_model_p,
+    lopt_params: llama_opt_params_p, /
+):
+    ...

+# LLAMA_API void llama_opt_epoch(
+#         struct llama_context * lctx,
+#         ggml_opt_dataset_t dataset,
+#         ggml_opt_result_t result_train,
+#         ggml_opt_result_t result_eval,
+#         int64_t idata_split,
+#         ggml_opt_epoch_callback callback_train,
+#         ggml_opt_epoch_callback callback_eval);
+@ctypes_function(
+    "llama_opt_epoch", [
+        llama_context_p_ctypes,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_void_p,
+        ctypes.c_int64,
+        ctypes.c_void_p,
+        ctypes.c_void_p
+    ],
+    None,
+)
+def llama_opt_epoch(
+    lctx: llama_context_p,
+    dataset: ctypes.c_void_p,
+    result_train: ctypes.c_void_p,
+    result_eval: ctypes.c_void_p,
+    idata_split: ctypes.c_int64,
+    callback_train: ctypes.c_void_p,
+    callback_eval: ctypes.c_void_p, /
+):
+    ...
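
To round out the picture, here is a rough usage sketch of the new training entry points; it is not taken from the commit. It assumes the usual low-level loaders (llama_model_load_from_file / llama_init_from_model) are available in this build, uses a placeholder model path, and, because llama_opt_params_p_ctypes is declared as a plain void pointer, passes the params struct by address.

import ctypes
import llama_cpp.llama_cpp as llama_cpp
from llama_cpp._ggml import ggml_opt_optimizer_params, ggml_opt_get_optimizer_params

llama_cpp.llama_backend_init()

# Placeholder model path; model/context setup uses the standard low-level helpers.
model = llama_cpp.llama_model_load_from_file(b"./models/model.gguf", llama_cpp.llama_model_default_params())
ctx = llama_cpp.llama_init_from_model(model, llama_cpp.llama_context_default_params())

# AdamW parameters handed back to the optimizer before each backward pass.
_adamw = ggml_opt_optimizer_params()
_adamw.adamw.alpha, _adamw.adamw.beta1, _adamw.adamw.beta2 = 1e-4, 0.9, 0.999
_adamw.adamw.eps, _adamw.adamw.wd = 1e-8, 0.0

@ggml_opt_get_optimizer_params
def _get_opt_pars(userdata):
    return ctypes.pointer(_adamw)

@llama_cpp.llama_opt_param_filter
def _train_all(tensor, userdata):
    return True  # treat every tensor as trainable

opt_params = llama_cpp.llama_opt_params(
    n_ctx_train=0,            # 0: reuse the context size of the llama_context
    param_filter=_train_all,
    param_filter_ud=None,
    get_opt_pars=_get_opt_pars,
    get_opt_pars_ud=None,
)

# The binding types the params argument as a void pointer, so pass the struct by address.
llama_cpp.llama_opt_init(
    ctx, model, ctypes.cast(ctypes.pointer(opt_params), ctypes.c_void_p)
)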

llama_cpp/server/model.py

Lines changed: 2 additions & 0 deletions
@@ -260,10 +260,12 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         yarn_beta_fast=settings.yarn_beta_fast,
         yarn_beta_slow=settings.yarn_beta_slow,
         yarn_orig_ctx=settings.yarn_orig_ctx,
+        defrag_thold=settings.defrag_thold,
         mul_mat_q=settings.mul_mat_q,
         embedding=settings.embedding,
         offload_kqv=settings.offload_kqv,
         flash_attn=settings.flash_attn,
+        op_offload=settings.op_offload,
         # Sampling Params
         last_n_tokens_size=settings.last_n_tokens_size,
         # LoRA Params

llama_cpp/server/settings.py

Lines changed: 4 additions & 0 deletions
@@ -95,6 +95,7 @@ class ModelSettings(BaseSettings):
     yarn_beta_fast: float = Field(default=32.0)
     yarn_beta_slow: float = Field(default=1.0)
     yarn_orig_ctx: int = Field(default=0)
+    defrag_thold: float = Field(default=-1.0)
     mul_mat_q: bool = Field(
         default=True, description="if true, use experimental mul_mat_q kernels"
     )
@@ -105,6 +106,9 @@ class ModelSettings(BaseSettings):
     flash_attn: bool = Field(
         default=False, description="Whether to use flash attention."
     )
+    op_offload: bool = Field(
+        default=True, description="Whether to offload host tensor operations to device"
+    )
     # Sampling Params
     last_n_tokens_size: int = Field(
         default=64,
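
On the server side, the new fields ride along with ModelSettings, so they can be set like any other model option. A small sketch (model path is a placeholder) using the settings and loader touched in this commit:

from llama_cpp.server.settings import ModelSettings
from llama_cpp.server.model import load_llama_from_model_settings

settings = ModelSettings(
    model="./models/model.gguf",  # placeholder path
    defrag_thold=0.1,             # enable KV-cache defragmentation above 10% holes
    op_offload=True,              # offload host tensor operations to the device (default)
)
llm = load_llama_from_model_settings(settings)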

0 commit comments
