
Commit d99d952

llama-router: replace implicit arg injection with explicit placeholders
Remove automatic --model/--port/--host appending in favor of $path, $port, $host placeholders in spawn commands. All parameters now visible in configuration for full transparency and flexibility
1 parent 1a014b2 commit d99d952

4 files changed, +55 −15 lines changed

tools/router/ARCHITECTURE.md

Lines changed: 7 additions & 0 deletions

```diff
@@ -148,6 +148,13 @@ Spawn commands support both absolute/relative paths and PATH-based binaries:
 
 The router only validates file existence for commands containing `/` or `\\` path separators, allowing seamless use of system-installed binaries.
 
+### Spawn Command Placeholders
+
+The router expands placeholders in spawn commands:
+- `$path` → The model file path from `path` field
+- `$port` → Dynamically assigned port (increments from `base_port`)
+- `$host` → Always expands to `127.0.0.1` for security
+
 ### Model-Scoped Route Stripping
 
 Routes like `/<model>/health` are router-side aliases for convenience. Before proxying to the backend, the router strips the model prefix:
```

tools/router/README.md

Lines changed: 35 additions & 8 deletions

````diff
@@ -196,7 +196,15 @@ Override with `--config`:
     "notify_model_swap": false
   },
   "default_spawn": {
-    "command": ["llama-server", "--jinja", "--ctx-size", "4096", "--n-gpu-layers", "99"],
+    "command": [
+      "llama-server",
+      "-m", "$path",
+      "--port", "$port",
+      "--host", "$host",
+      "--jinja",
+      "--ctx-size", "4096",
+      "--n-gpu-layers", "99"
+    ],
     "proxy_endpoints": ["/v1/", "/health", "/slots", "/props"],
     "health_endpoint": "/health"
   },
@@ -233,16 +241,31 @@ The `default_spawn` block defines how llama-server instances are launched:
 
 ```json
 {
-  "command": ["llama-server", "--jinja", "--ctx-size", "4096", "--n-gpu-layers", "99"],
+  "command": [
+    "llama-server",
+    "-m", "$path",
+    "--port", "$port",
+    "--host", "$host",
+    "--jinja",
+    "--ctx-size", "4096",
+    "--n-gpu-layers", "99"
+  ],
   "proxy_endpoints": ["/v1/", "/health", "/slots", "/props"],
   "health_endpoint": "/health"
 }
 ```
 
-The router automatically appends these arguments:
-- `--model <path>` - The model file path
-- `--port <port>` - Dynamically assigned port
-- `--host 127.0.0.1` - Localhost binding for security
+### Spawn Command Placeholders
+
+The router supports placeholders in spawn commands for dynamic value injection:
+
+| Placeholder | Description | Example expansion |
+|-------------|-------------|-------------------|
+| `$path` | Model file path from configuration | `/home/user/.cache/llama.cpp/model.gguf` |
+| `$port` | Dynamically assigned port | `50000`, `50001`, etc. |
+| `$host` | Bind address (always `127.0.0.1`) | `127.0.0.1` |
+
+This makes all spawn parameters explicit and visible in the configuration.
 
 ### Optimizing for Your Hardware
 
@@ -253,6 +276,9 @@ The `default_spawn` is where you tune performance for your specific hardware. **
   "default_spawn": {
     "command": [
       "llama-server",
+      "-m", "$path",
+      "--port", "$port",
+      "--host", "$host",
       "-ngl", "999",
       "-ctk", "q8_0",
       "-ctv", "q8_0",
@@ -277,8 +303,6 @@ The `default_spawn` is where you tune performance for your specific hardware. **
 - `-kvu`: Use single unified KV buffer for all sequences (also `--kv-unified`)
 - `--jinja`: Enable Jinja template support
 
-**Note:** The router automatically appends `--model`, `--port`, and `--host` - do not include these in your command.
-
 Change `default_spawn`, reload the router, and all `auto` models instantly use the new configuration.
 
 ### Per-Model Spawn Override
@@ -293,6 +317,9 @@ Individual models can override the default spawn configuration:
   "spawn": {
     "command": [
       "llama-server",
+      "-m", "$path",
+      "--port", "$port",
+      "--host", "$host",
       "--jinja",
       "--ctx-size", "8192",
       "--n-gpu-layers", "99",
````

tools/router/router-app.cpp

Lines changed: 12 additions & 6 deletions

```diff
@@ -82,12 +82,18 @@ bool RouterApp::ensure_running(const std::string & model_name, std::string & err
     const SpawnConfig spawn_cfg = resolve_spawn_config(cfg);
 
     std::vector<std::string> command = spawn_cfg.command;
-    command.push_back("--model");
-    command.push_back(expand_user_path(cfg.path));
-    command.push_back("--port");
-    command.push_back(std::to_string(port));
-    command.push_back("--host");
-    command.push_back("127.0.0.1");
+    const std::string model_path = expand_user_path(cfg.path);
+
+    // Replace all placeholders
+    for (auto & arg : command) {
+        if (arg == "$path") {
+            arg = model_path;
+        } else if (arg == "$port") {
+            arg = std::to_string(port);
+        } else if (arg == "$host") {
+            arg = "127.0.0.1";
+        }
+    }
 
     LOG_INF("Starting %s on port %d\n", model_name.c_str(), port);
```

tools/router/router-config.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -107,7 +107,7 @@ static json serialize_spawn_config(const SpawnConfig & spawn) {
 const SpawnConfig & get_default_spawn() {
     static const SpawnConfig spawn = [] {
         SpawnConfig default_spawn = {
-            /*command =*/ {"llama-server", "--jinja", "--ctx-size", "4096", "--n-gpu-layers", "99"},
+            /*command =*/ {"llama-server", "-m", "$path", "--port", "$port", "--host", "$host", "--jinja", "--ctx-size", "4096", "--n-gpu-layers", "99"},
             /*proxy_endpoints =*/ {"/v1/", "/health", "/slots", "/props"},
             /*health_endpoint =*/ "/health",
         };
```
