Commit eae5d97

Merge pull request #1 from ochafik/Telosnex_phi4_tools_template
Fixes for phi-4 support
1 parent c3aac4e commit eae5d97

4 files changed: +68 additions, -126 deletions

common/chat.cpp

Lines changed: 10 additions & 89 deletions
@@ -584,10 +584,7 @@ static common_chat_msg parse_json_tool_calls(
     }
 
     if (!result.tool_calls.empty()) {
-        if (!string_strip(result.content).empty()) {
-            LOG_WRN("Content found with tool calls: %s\n", result.content.c_str());
-        }
-        result.content = "";
+        result.content = string_strip(result.content);
     }
     return result;
 }
@@ -1371,14 +1368,15 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa
             std::string name = function.at("name");
             auto parameters = function.at("parameters");
             builder.resolve_refs(parameters);
-            tool_rules.push_back(builder.add_schema(name + "-call", {
+            auto call_rule = builder.add_schema(name + "-call", {
                 {"type", "object"},
                 {"properties", {
                     {"name", {{"const", name}}},
                     {"arguments", parameters},
                 }},
                 {"required", json::array({"name", "arguments"})},
-            }));
+            });
+            tool_rules.push_back(builder.add_rule(name + "-call", "\"<|tool_call|>\" " + call_rule + " \"<|/tool_call|>\""));
         });
         auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
         std::vector<std::string> alt_tags {
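
The rewritten rule above wraps each tool's JSON schema in literal <|tool_call|> / <|/tool_call|> tags, so the grammar only accepts a call when the model also emits the delimiters. As a rough standalone sketch (the tool name get_weather is hypothetical and this snippet is not part of the commit), the string handed to builder.add_rule is just the schema rule name concatenated between the two quoted tags:

#include <iostream>
#include <string>

int main() {
    // Name returned by builder.add_schema(...) for a hypothetical "get_weather" tool.
    std::string call_rule = "get_weather-call";
    // Same concatenation as in the diff: literal tags around the schema rule reference.
    std::string rule_body = "\"<|tool_call|>\" " + call_rule + " \"<|/tool_call|>\"";
    std::cout << rule_body << "\n"; // prints: "<|tool_call|>" get_weather-call "<|/tool_call|>"
    return 0;
}
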
@@ -1391,6 +1389,9 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa
         data.preserved_tokens = {
             "<|tool_call|>",
             "</|tool_call|>",
+            "<|tool_response|>",
+            "<|tool|>",
+            "</|tool|>",
         };
     });
 
@@ -1449,89 +1450,9 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa
 }
 
 static common_chat_msg common_chat_parse_phi_4(const std::string & input) {
-    common_chat_msg result;
-    result.role = "assistant";
-
-    std::string final_content = "";
-
-    const std::string opening_tag = "<|tool_call|>";
-    const std::string closing_tag = "</|tool_call|>";
-
-    size_t start_pos = 0;
-    while (true) {
-        // Find next tool call
-        size_t tool_start = input.find(opening_tag, start_pos);
-        if (tool_start == std::string::npos) {
-            // No more tool calls.
-
-            // Is start_pos within string bounds?
-            if (start_pos < input.length()) {
-                // Add the rest of the string to final_content
-                final_content += input.substr(start_pos);
-            }
-            break;
-        }
-
-        // Add content before the tool call to final_content
-        final_content += input.substr(start_pos, tool_start - start_pos);
-
-        // Find closing tag
-        size_t content_start = tool_start + opening_tag.length();
-        size_t tool_end = input.find(closing_tag, content_start);
-
-        if (tool_end == std::string::npos) {
-            // No closing tag found, so just include the rest of the string as tool.
-            tool_end = input.length();
-        }
-
-        // Extract tool call content
-        std::string tool_content = input.substr(
-            content_start,
-            tool_end - content_start
-        );
-
-        // Try to parse the tool call
-        try {
-            auto tool_call = json::parse(tool_content);
-
-            // Verify the required fields exist
-            if (!tool_call.contains("name")) {
-                throw std::runtime_error("Missing 'name' field in tool call");
-            }
-
-            if (!tool_call.contains("arguments")) {
-                throw std::runtime_error("Missing 'arguments' field in tool call");
-            }
-
-            std::string name = tool_call["name"].get<std::string>();
-
-            std::string arguments;
-            try {
-                arguments = tool_call["arguments"].dump();
-            } catch (const std::exception & e) {
-                LOG_ERR("Failed to serialize arguments: %s\n", e.what());
-                arguments = "{}";
-            }
-
-            result.tool_calls.push_back({
-                name,
-                arguments,
-                /* id= */ "",
-            });
-        } catch (const std::exception & e) {
-            // If parsing fails, include the entire tool call in the content
-            final_content += input.substr(
-                tool_start,
-                tool_end + closing_tag.length() - tool_start
-            );
-        }
-
-        // Move past this tool call for next iteration
-        start_pos = tool_end + closing_tag.length();
-    }
-
-    result.content = final_content;
-    return result;
+    static std::regex function_regex("<\\|tool_call\\|>\\s*\\{\\s*\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"arguments\"\\s*:");
+    static std::regex close_regex(R"(\}\s*(</\|tool_call\|>)?)");
+    return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
 }
 
 
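The hand-rolled tag scanner is replaced by the shared parse_json_tool_calls helper, driven by the two regexes above. Below is a minimal standalone sketch of what those patterns pick out of a reply; the sample text is hypothetical, and the first-closing-brace trick only works here because the arguments object is flat, whereas the real helper parses the arguments as JSON:

#include <iostream>
#include <regex>
#include <string>

int main() {
    // Hypothetical Phi-4 style reply: free text followed by one tool call.
    const std::string input =
        "Let me check that.<|tool_call|>{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}<|/tool_call|>";

    // Same patterns as in the diff above.
    static const std::regex function_regex(
        "<\\|tool_call\\|>\\s*\\{\\s*\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"arguments\"\\s*:");
    static const std::regex close_regex(R"(\}\s*(</\|tool_call\|>)?)");

    std::smatch open;
    if (std::regex_search(input, open, function_regex)) {
        std::cout << "content: " << open.prefix() << "\n"; // "Let me check that."
        std::cout << "name:    " << open[1]       << "\n"; // "get_weather"

        // Text after the opening match: the arguments object plus the closing tag.
        const std::string rest = open.suffix();
        std::smatch close;
        if (std::regex_search(rest, close, close_regex)) {
            // Works here only because the arguments object has no nested braces.
            std::cout << "args:    " << rest.substr(0, close.position(0) + 1) << "\n";
        }
    }
    return 0;
}
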
docs/function-calling.md

Lines changed: 10 additions & 4 deletions
@@ -12,11 +12,12 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll
 - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
 - Functionary v3.1 / v3.2
 - Hermes 2/3, Qwen 2.5
-- Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
+- Qwen 2.5 Coder (#12034)
 - Mistral Nemo
 - Firefunction v2
-- Command R7B
-- DeepSeek R1 (WIP / seems reluctant to call any tools?)
+- Command R7B (#11585)
+- DeepSeek R1 (#11607)
+- Phi 4 (#12288)
 
 - Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs).
 - Use `--chat-template-file` to override the template when appropriate (see examples below)
@@ -297,9 +298,14 @@ llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \
 llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \
   --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
 
+# Native support for Phi 4 also needs a template override (official template is buggy)
+
+llama-server --jinja -fa -hf bartowski/microsoft_Phi-4-mini-instruct-GGUF \
+  --chat-template-file models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja
+
 # Native support requires the right template for these GGUFs:
 
-llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
+llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M \
   --chat-template-file models/templates/meetkai-functionary-medium-v3.2.jinja
 
 llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \

examples/server/server.cpp

Lines changed: 13 additions & 33 deletions
@@ -386,7 +386,7 @@ struct server_task {
                     trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
                     trigger.value = word;
                     trigger.token = token;
-                    params.sampling.grammar_triggers.push_back(std::move(trigger));
+                    params.sampling.grammar_triggers.push_back(trigger);
                 } else {
                     SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
                     params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
@@ -751,10 +751,7 @@ struct server_task_result_cmpl_final : server_task_result {
                         {"name", tc.name},
                         {"arguments", tc.arguments},
                     }},
-                    // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
-                    // We only generate a random id for the ones that don't generate one by themselves
-                    // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
-                    {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
+                    {"id", tc.id},
                 });
             }
             message["tool_calls"] = tool_calls;
@@ -2040,18 +2037,6 @@ struct server_context {
         return ret;
     }
 
-    bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
-        const llama_model * model = llama_get_model(ctx);
-        const llama_vocab * vocab = llama_model_get_vocab(model);
-        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
-        for (const auto & token : tokens) {
-            if (token < 0 || token >= n_vocab) {
-                return false;
-            }
-        }
-        return true;
-    }
-
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
         slot.reset();
         slot.id_task = task.id;
@@ -2066,11 +2051,6 @@ struct server_context {
             slot.lora = task.params.lora;
         }
 
-        bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
-        if (!can_detokenize) {
-            send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
-            return false;
-        }
         SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
 
         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@@ -2113,7 +2093,7 @@ struct server_context {
             SRV_DBG("%s", "clearing KV cache\n");
 
             // clear the entire KV cache
-            llama_kv_self_clear(ctx);
+            llama_kv_cache_clear(ctx);
             clean_kv_cache = false;
         }
 
@@ -2655,8 +2635,8 @@ struct server_context {
                 res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
                 res->t_start = metrics.t_start;
 
-                res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
-                res->kv_cache_used_cells = llama_kv_self_used_cells(ctx);
+                res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx);
+                res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx);
 
                 res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
                 res->t_prompt_processing_total = metrics.t_prompt_processing_total;
@@ -2772,7 +2752,7 @@ struct server_context {
 
                 // Erase token cache
                 const size_t n_erased = slot->cache_tokens.size();
-                llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
+                llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
                 slot->cache_tokens.clear();
 
                 auto res = std::make_unique<server_task_result_slot_erase>();
@@ -2840,8 +2820,8 @@ struct server_context {
 
                 SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
 
-                llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
-                llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
+                llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
+                llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
 
                 if (slot.params.cache_prompt) {
                     for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -3032,8 +3012,8 @@ struct server_context {
 
                             const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
 
-                            llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
-                            llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
+                            llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
+                            llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
 
                             for (size_t i = 0; i < n_match; i++) {
                                 slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -3071,9 +3051,9 @@ struct server_context {
                     }
 
                     // keep only the common part
-                    if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
+                    if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                         // could not partially delete (likely using a non-Transformer model)
-                        llama_kv_self_seq_rm(ctx, slot.id, -1, -1);
+                        llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
 
                         // there is no common part left
                         slot.n_past = 0;
@@ -3313,7 +3293,7 @@ struct server_context {
                 slot.cache_tokens.push_back(id);
                 slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
 
-                llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);
+                llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
 
                 for (size_t i = 0; i < ids.size(); ++i) {
                     completion_token_output result;
models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+{%- if messages[0]["role"] == "system" %}
+{%- set system_message = messages[0]["content"] %}
+{% elif tools is defined -%}
+{%- set system_message = "You are a helpful assistant with access to tools." -%}
+{% else %}
+{%- set system_message = "" -%}
+{%- endif %}
+{%- if tools is defined -%}
+{%- set system_message = system_message + '<|tool|>' + (tools | tojson) + '<|/tool|>' -%}
+{%- if '<|tool_call|>' not in system_message -%}
+{%- set system_message = system_message + "\nTo use a tool, respond in this format: <|tool_call|>{\"name\": \"foo\", \"arguments\": {\"a\": 1}}<|/tool_call|>" %}
+{%- endif %}
+{%- endif %}
+{%- if system_message is defined -%}
+{{- '<|system|>' + system_message + '<|end|>' -}}
+{%- endif -%}
+{%- for message in messages -%}
+{%- if message['role'] == 'tool' -%}
+{{- '<|tool_response|>' + (message['content'] | tojson) + '<|/tool_response|>' -}}
+{%- elif message['role'] != 'system' -%}
+{{- '<|' + message['role'] + '|>' -}}
+{%- if message.content -%}
+{{- message['content'] -}}
+{%- endif -%}
+{%- for tool_call in message.tool_calls -%}
+{{- '<|tool_call|>' + (tool_call | tojson) + '<|/tool_call|>' -}}
+{%- endfor -%}
+{{- '<|end|>' -}}
+{%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+{{- '<|assistant|>' -}}
+{%- else -%}
+{{- eos_token -}}
+{%- endif -%}
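
For orientation, an approximate rendering of this template for a hypothetical request (a system prompt, one user turn, tools supplied, add_generation_prompt enabled) would look roughly as follows; the tools JSON is elided and the line breaks before <|user|> and <|assistant|> are added here for readability only:

<|system|>You are a helpful assistant.<|tool|>[ ...tools JSON... ]<|/tool|>
To use a tool, respond in this format: <|tool_call|>{"name": "foo", "arguments": {"a": 1}}<|/tool_call|><|end|>
<|user|>What is the weather in Paris?<|end|>
<|assistant|>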
