2 changes: 1 addition & 1 deletion backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=5c8a717128cc98aa9e5b1c44652f5cf458fd426e
+LLAMA_VERSION?=9d52f17ae33e8df958e20f3f1b13bfec53ab5a1d
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

CMAKE_ARGS?=
111 changes: 111 additions & 0 deletions backend/cpp/llama-cpp/grpc-server.cpp
@@ -392,6 +392,34 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
    // Initialize grpc_servers to empty (can be overridden by options)
    std::string grpc_servers_option = "";

    // Initialize fit_params options (can be overridden by options)
    // fit_params: whether to auto-adjust params to fit device memory (default: true, as in llama.cpp)
    params.fit_params = true;
    // fit_params_target: target margin per device in bytes (default: 1 GiB)
    params.fit_params_target = 1024 * 1024 * 1024;
    // fit_params_min_ctx: minimum context size for fit (default: 4096)
    params.fit_params_min_ctx = 4096;

    // Initialize additional server options (can be overridden by options)
    // n_cache_reuse: min chunk size for KV cache reuse via shifting (default: 0 = disabled)
    params.n_cache_reuse = 0;
    // slot_prompt_similarity: threshold for slot prompt matching (default: 0.1)
    params.slot_prompt_similarity = 0.1f;
    // swa_full: use full-size SWA cache (default: false)
    params.swa_full = false;
    // cont_batching: continuous batching (default: true, auto-enabled when n_parallel > 1)
    params.cont_batching = true;
    // check_tensors: validate tensor data (default: false)
    params.check_tensors = false;
    // warmup: enable warmup run (default: true)
    params.warmup = true;
    // no_op_offload: disable host tensor op offload (default: false)
    params.no_op_offload = false;
    // kv_unified: enable unified KV cache (default: false)
    params.kv_unified = false;
    // n_ctx_checkpoints: max context checkpoints per slot (default: 8)
    params.n_ctx_checkpoints = 8;

    // Decode options. Options are in the form optname:optvalue, or for booleans just optname.
    for (int i = 0; i < request->options_size(); i++) {
        std::string opt = request->options(i);
@@ -436,6 +464,89 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
            if (optval != NULL) {
                grpc_servers_option = optval_str;
            }
        } else if (!strcmp(optname, "fit_params") || !strcmp(optname, "fit")) {
            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
                params.fit_params = true;
            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
                params.fit_params = false;
            }
        } else if (!strcmp(optname, "fit_params_target") || !strcmp(optname, "fit_target")) {
            if (optval != NULL) {
                try {
                    // Value is in MiB, convert to bytes; ignore non-positive values,
                    // which would otherwise wrap around when cast to size_t
                    const int target_mib = std::stoi(optval_str);
                    if (target_mib > 0) {
                        params.fit_params_target = static_cast<size_t>(target_mib) * 1024 * 1024;
                    }
                } catch (const std::exception& e) {
                    // If conversion fails, keep default value (1 GiB)
                }
            }
        } else if (!strcmp(optname, "fit_params_min_ctx") || !strcmp(optname, "fit_ctx")) {
            if (optval != NULL) {
                try {
                    params.fit_params_min_ctx = std::stoi(optval_str);
                } catch (const std::exception& e) {
                    // If conversion fails, keep default value (4096)
                }
            }
        } else if (!strcmp(optname, "n_cache_reuse") || !strcmp(optname, "cache_reuse")) {
            if (optval != NULL) {
                try {
                    params.n_cache_reuse = std::stoi(optval_str);
                } catch (const std::exception& e) {
                    // If conversion fails, keep default value (0)
                }
            }
        } else if (!strcmp(optname, "slot_prompt_similarity") || !strcmp(optname, "sps")) {
            if (optval != NULL) {
                try {
                    params.slot_prompt_similarity = std::stof(optval_str);
                } catch (const std::exception& e) {
                    // If conversion fails, keep default value (0.1)
                }
            }
        } else if (!strcmp(optname, "swa_full")) {
            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
                params.swa_full = true;
            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
                params.swa_full = false;
            }
        } else if (!strcmp(optname, "cont_batching") || !strcmp(optname, "continuous_batching")) {
            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
                params.cont_batching = true;
            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
                params.cont_batching = false;
            }
        } else if (!strcmp(optname, "check_tensors")) {
            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
                params.check_tensors = true;
            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
                params.check_tensors = false;
            }
        } else if (!strcmp(optname, "warmup")) {
            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
                params.warmup = true;
            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
                params.warmup = false;
            }
        } else if (!strcmp(optname, "no_op_offload")) {
            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
                params.no_op_offload = true;
            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
                params.no_op_offload = false;
            }
        } else if (!strcmp(optname, "kv_unified") || !strcmp(optname, "unified_kv")) {
            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
                params.kv_unified = true;
            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
                params.kv_unified = false;
            }
        } else if (!strcmp(optname, "n_ctx_checkpoints") || !strcmp(optname, "ctx_checkpoints")) {
            if (optval != NULL) {
                try {
                    params.n_ctx_checkpoints = std::stoi(optval_str);
                } catch (const std::exception& e) {
                    // If conversion fails, keep default value (8)
                }
            }
        }
    }

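The seven boolean branches above repeat the same literal comparisons. One possible follow-up refactor (sketched here for illustration; the helper is hypothetical and not part of this PR) is to centralize the accepted truthy/falsy strings in a small helper:

```cpp
#include <optional>
#include <string>

// Hypothetical helper, not in the PR: maps the accepted option literals
// to a boolean, returning std::nullopt for any other value so the caller
// keeps its current default.
static std::optional<bool> parse_bool_option(const std::string& v) {
    if (v == "true" || v == "1" || v == "yes" || v == "on" || v == "enabled") {
        return true;
    }
    if (v == "false" || v == "0" || v == "no" || v == "off" || v == "disabled") {
        return false;
    }
    return std::nullopt; // unknown value: leave the existing default untouched
}
```

Each branch would then collapse to, for example, `if (auto b = parse_bool_option(optval_str)) params.swa_full = *b;`.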
15 changes: 15 additions & 0 deletions docs/content/features/text-generation.md
@@ -149,6 +149,18 @@ The `llama.cpp` backend supports additional configuration options that can be sp
| `cache_ram` | integer | Set the maximum RAM cache size in MiB for KV cache. Use `-1` for unlimited (default). | `cache_ram:2048` |
| `parallel` or `n_parallel` | integer | Enable parallel request processing. When set to a value greater than 1, enables continuous batching for handling multiple requests concurrently. | `parallel:4` |
| `grpc_servers` or `rpc_servers` | string | Comma-separated list of gRPC server addresses for distributed inference. Allows distributing workload across multiple llama.cpp workers. | `grpc_servers:localhost:50051,localhost:50052` |
| `fit_params` or `fit` | boolean | Enable auto-adjustment of model/context parameters to fit available device memory. Default: `true`. | `fit_params:true` |
| `fit_params_target` or `fit_target` | integer | Target margin per device in MiB when using fit_params. Default: `1024` (1 GiB). | `fit_target:2048` |
| `fit_params_min_ctx` or `fit_ctx` | integer | Minimum context size that can be set by fit_params. Default: `4096`. | `fit_ctx:2048` |
| `n_cache_reuse` or `cache_reuse` | integer | Minimum chunk size to attempt reusing from the cache via KV shifting. Default: `0` (disabled). | `cache_reuse:256` |
| `slot_prompt_similarity` or `sps` | float | Minimum similarity required between a request's prompt and a slot's cached prompt for the slot to be reused. Default: `0.1`. Set to `0` to disable. | `sps:0.5` |
| `swa_full` | boolean | Use full-size SWA (Sliding Window Attention) cache. Default: `false`. | `swa_full:true` |
| `cont_batching` or `continuous_batching` | boolean | Enable continuous batching for handling multiple sequences. Default: `true`. | `cont_batching:true` |
| `check_tensors` | boolean | Validate tensor data for invalid values during model loading. Default: `false`. | `check_tensors:true` |
| `warmup` | boolean | Enable warmup run after model loading. Default: `true`. | `warmup:false` |
| `no_op_offload` | boolean | Disable offloading host tensor operations to device. Default: `false`. | `no_op_offload:true` |
| `kv_unified` or `unified_kv` | boolean | Enable unified KV cache. Default: `false`. | `kv_unified:true` |
| `n_ctx_checkpoints` or `ctx_checkpoints` | integer | Maximum number of context checkpoints per slot. Default: `8`. | `ctx_checkpoints:4` |
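Boolean options accept `true`/`1`/`yes`/`on`/`enabled` and `false`/`0`/`no`/`off`/`disabled`; any other value leaves the default in place. Integer and float options that fail to parse likewise fall back to their defaults.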

**Example configuration with options:**

@@ -162,6 +174,9 @@ options:
- context_shift:true
- cache_ram:4096
- parallel:2
- fit_params:true
- fit_target:1024
- slot_prompt_similarity:0.5
```

**Note:** The `parallel` option can also be set via the `LLAMACPP_PARALLEL` environment variable, and `grpc_servers` can be set via the `LLAMACPP_GRPC_SERVERS` environment variable. Options specified in the YAML file take precedence over environment variables.
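For example, with `LLAMACPP_PARALLEL=8` exported in the environment (an illustrative value), the following model configuration still results in an effective parallelism of 2:

```yaml
options:
- parallel:2   # takes precedence over LLAMACPP_PARALLEL=8
```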