Commit 02d007e

chore(llama.cpp): Add Missing llama.cpp Options to gRPC Server
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent: 8ac7e8c · commit: 02d007e

3 files changed: +127 −1 lines changed
backend/cpp/llama-cpp/Makefile

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,5 +1,5 @@
 
-LLAMA_VERSION?=5c8a717128cc98aa9e5b1c44652f5cf458fd426e
+LLAMA_VERSION?=9d52f17ae33e8df958e20f3f1b13bfec53ab5a1d
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 
 CMAKE_ARGS?=
```

backend/cpp/llama-cpp/grpc-server.cpp

Lines changed: 111 additions & 0 deletions

```diff
@@ -392,6 +392,34 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
     // Initialize grpc_servers to empty (can be overridden by options)
     std::string grpc_servers_option = "";
 
+    // Initialize fit_params options (can be overridden by options)
+    // fit_params: whether to auto-adjust params to fit device memory (default: true as in llama.cpp)
+    params.fit_params = true;
+    // fit_params_target: target margin per device in bytes (default: 1GB)
+    params.fit_params_target = 1024 * 1024 * 1024;
+    // fit_params_min_ctx: minimum context size for fit (default: 4096)
+    params.fit_params_min_ctx = 4096;
+
+    // Initialize additional server options (can be overridden by options)
+    // n_cache_reuse: min chunk size for KV cache reuse via shifting (default: 0 = disabled)
+    params.n_cache_reuse = 0;
+    // slot_prompt_similarity: threshold for slot prompt matching (default: 0.1)
+    params.slot_prompt_similarity = 0.1f;
+    // swa_full: use full-size SWA cache (default: false)
+    params.swa_full = false;
+    // cont_batching: continuous batching (default: true, auto-enabled when n_parallel > 1)
+    params.cont_batching = true;
+    // check_tensors: validate tensor data (default: false)
+    params.check_tensors = false;
+    // warmup: enable warmup run (default: true)
+    params.warmup = true;
+    // no_op_offload: disable host tensor op offload (default: false)
+    params.no_op_offload = false;
+    // kv_unified: enable unified KV cache (default: false)
+    params.kv_unified = false;
+    // n_ctx_checkpoints: max context checkpoints per slot (default: 8)
+    params.n_ctx_checkpoints = 8;
+
     // decode options. Options are in form optname:optval, or if booleans only optname.
     for (int i = 0; i < request->options_size(); i++) {
         std::string opt = request->options(i);
@@ -436,6 +464,89 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
             if (optval != NULL) {
                 grpc_servers_option = optval_str;
             }
+        } else if (!strcmp(optname, "fit_params") || !strcmp(optname, "fit")) {
+            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+                params.fit_params = true;
+            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+                params.fit_params = false;
+            }
+        } else if (!strcmp(optname, "fit_params_target") || !strcmp(optname, "fit_target")) {
+            if (optval != NULL) {
+                try {
+                    // Value is in MiB, convert to bytes
+                    params.fit_params_target = static_cast<size_t>(std::stoi(optval_str)) * 1024 * 1024;
+                } catch (const std::exception& e) {
+                    // If conversion fails, keep default value (1GB)
+                }
+            }
+        } else if (!strcmp(optname, "fit_params_min_ctx") || !strcmp(optname, "fit_ctx")) {
+            if (optval != NULL) {
+                try {
+                    params.fit_params_min_ctx = std::stoi(optval_str);
+                } catch (const std::exception& e) {
+                    // If conversion fails, keep default value (4096)
+                }
+            }
+        } else if (!strcmp(optname, "n_cache_reuse") || !strcmp(optname, "cache_reuse")) {
+            if (optval != NULL) {
+                try {
+                    params.n_cache_reuse = std::stoi(optval_str);
+                } catch (const std::exception& e) {
+                    // If conversion fails, keep default value (0)
+                }
+            }
+        } else if (!strcmp(optname, "slot_prompt_similarity") || !strcmp(optname, "sps")) {
+            if (optval != NULL) {
+                try {
+                    params.slot_prompt_similarity = std::stof(optval_str);
+                } catch (const std::exception& e) {
+                    // If conversion fails, keep default value (0.1)
+                }
+            }
+        } else if (!strcmp(optname, "swa_full")) {
+            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+                params.swa_full = true;
+            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+                params.swa_full = false;
+            }
+        } else if (!strcmp(optname, "cont_batching") || !strcmp(optname, "continuous_batching")) {
+            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+                params.cont_batching = true;
+            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+                params.cont_batching = false;
+            }
+        } else if (!strcmp(optname, "check_tensors")) {
+            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+                params.check_tensors = true;
+            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+                params.check_tensors = false;
+            }
+        } else if (!strcmp(optname, "warmup")) {
+            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+                params.warmup = true;
+            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+                params.warmup = false;
+            }
+        } else if (!strcmp(optname, "no_op_offload")) {
+            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+                params.no_op_offload = true;
+            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+                params.no_op_offload = false;
+            }
+        } else if (!strcmp(optname, "kv_unified") || !strcmp(optname, "unified_kv")) {
+            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+                params.kv_unified = true;
+            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+                params.kv_unified = false;
+            }
+        } else if (!strcmp(optname, "n_ctx_checkpoints") || !strcmp(optname, "ctx_checkpoints")) {
+            if (optval != NULL) {
+                try {
+                    params.n_ctx_checkpoints = std::stoi(optval_str);
+                } catch (const std::exception& e) {
+                    // If conversion fails, keep default value (8)
+                }
+            }
         }
     }
 
```

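The new branches slot into the existing decode loop, which, per the comment in the first hunk, receives each option as `optname:optval` (or a bare `optname` for booleans). The code that splits the raw string into `optname` / `optval` / `optval_str` sits outside these hunks, so the sketch below is a hypothetical, self-contained reconstruction of that convention rather than the committed code, shown only to make the `strcmp`/`stoi` pattern above concrete:

```cpp
#include <cstdio>
#include <cstring>
#include <exception>
#include <string>

int main() {
    // One option as it arrives over gRPC; booleans may come as a bare "fit_params".
    std::string opt = "fit_target:2048";

    // Split on the first ':' into optname / optval / optval_str (assumed convention).
    size_t sep = opt.find(':');
    std::string optname_s  = opt.substr(0, sep);
    std::string optval_str = (sep == std::string::npos) ? "" : opt.substr(sep + 1);
    const char *optname = optname_s.c_str();
    const char *optval  = (sep == std::string::npos) ? NULL : optval_str.c_str();

    // Default from the first hunk: 1 GiB target margin per device.
    size_t fit_params_target = 1024ULL * 1024 * 1024;

    if (!strcmp(optname, "fit_params_target") || !strcmp(optname, "fit_target")) {
        if (optval != NULL) {
            try {
                // The option value is given in MiB; the server stores bytes.
                fit_params_target = static_cast<size_t>(std::stoi(optval_str)) * 1024 * 1024;
            } catch (const std::exception &) {
                // On parse failure, fall back to the default initialized above.
            }
        }
    }

    std::printf("fit_params_target = %zu bytes\n", fit_params_target);
    return 0;
}
```

Note that parse failures deliberately fall through to the defaults initialized in the first hunk, so a malformed option value never aborts model loading.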
docs/content/features/text-generation.md

Lines changed: 15 additions & 0 deletions

````diff
@@ -149,6 +149,18 @@ The `llama.cpp` backend supports additional configuration options that can be sp
 | `cache_ram` | integer | Set the maximum RAM cache size in MiB for KV cache. Use `-1` for unlimited (default). | `cache_ram:2048` |
 | `parallel` or `n_parallel` | integer | Enable parallel request processing. When set to a value greater than 1, enables continuous batching for handling multiple requests concurrently. | `parallel:4` |
 | `grpc_servers` or `rpc_servers` | string | Comma-separated list of gRPC server addresses for distributed inference. Allows distributing workload across multiple llama.cpp workers. | `grpc_servers:localhost:50051,localhost:50052` |
+| `fit_params` or `fit` | boolean | Enable auto-adjustment of model/context parameters to fit available device memory. Default: `true`. | `fit_params:true` |
+| `fit_params_target` or `fit_target` | integer | Target margin per device in MiB when using fit_params. Default: `1024` (1GB). | `fit_target:2048` |
+| `fit_params_min_ctx` or `fit_ctx` | integer | Minimum context size that can be set by fit_params. Default: `4096`. | `fit_ctx:2048` |
+| `n_cache_reuse` or `cache_reuse` | integer | Minimum chunk size to attempt reusing from the cache via KV shifting. Default: `0` (disabled). | `cache_reuse:256` |
+| `slot_prompt_similarity` or `sps` | float | How much the prompt of a request must match the prompt of a slot to use that slot. Default: `0.1`. Set to `0` to disable. | `sps:0.5` |
+| `swa_full` | boolean | Use full-size SWA (Sliding Window Attention) cache. Default: `false`. | `swa_full:true` |
+| `cont_batching` or `continuous_batching` | boolean | Enable continuous batching for handling multiple sequences. Default: `true`. | `cont_batching:true` |
+| `check_tensors` | boolean | Validate tensor data for invalid values during model loading. Default: `false`. | `check_tensors:true` |
+| `warmup` | boolean | Enable warmup run after model loading. Default: `true`. | `warmup:false` |
+| `no_op_offload` | boolean | Disable offloading host tensor operations to device. Default: `false`. | `no_op_offload:true` |
+| `kv_unified` or `unified_kv` | boolean | Enable unified KV cache. Default: `false`. | `kv_unified:true` |
+| `n_ctx_checkpoints` or `ctx_checkpoints` | integer | Maximum number of context checkpoints per slot. Default: `8`. | `ctx_checkpoints:4` |
 
 **Example configuration with options:**
 
@@ -162,6 +174,9 @@ options:
 - context_shift:true
 - cache_ram:4096
 - parallel:2
+- fit_params:true
+- fit_target:1024
+- slot_prompt_similarity:0.5
 ```
 
 **Note:** The `parallel` option can also be set via the `LLAMACPP_PARALLEL` environment variable, and `grpc_servers` can be set via the `LLAMACPP_GRPC_SERVERS` environment variable. Options specified in the YAML file take precedence over environment variables.
````
