Commit eae5d97

Merge pull request #1 from ochafik/Telosnex_phi4_tools_template
Fixes for phi-4 support
1 parent c3aac4e commit eae5d97

4 files changed: +68 additions, -126 deletions

common/chat.cpp

Lines changed: 10 additions & 89 deletions
@@ -584,10 +584,7 @@ static common_chat_msg parse_json_tool_calls(
     }
 
     if (!result.tool_calls.empty()) {
-        if (!string_strip(result.content).empty()) {
-            LOG_WRN("Content found with tool calls: %s\n", result.content.c_str());
-        }
-        result.content = "";
+        result.content = string_strip(result.content);
     }
     return result;
 }
@@ -1371,14 +1368,15 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa
             std::string name = function.at("name");
             auto parameters = function.at("parameters");
             builder.resolve_refs(parameters);
-            tool_rules.push_back(builder.add_schema(name + "-call", {
+            auto call_rule = builder.add_schema(name + "-call", {
                 {"type", "object"},
                 {"properties", {
                     {"name", {{"const", name}}},
                     {"arguments", parameters},
                 }},
                 {"required", json::array({"name", "arguments"})},
-            }));
+            });
+            tool_rules.push_back(builder.add_rule(name + "-call", "\"<|tool_call|>\" " + call_rule + " \"<|/tool_call|>\""));
         });
         auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
         std::vector<std::string> alt_tags {
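
The rewritten rule above wraps each tool's JSON schema in literal <|tool_call|> / <|/tool_call|> tags, so the grammar only accepts a call when the model also emits the delimiters. As a rough standalone sketch (the tool name get_weather is hypothetical and this snippet is not part of the commit), the string handed to builder.add_rule is just the schema rule name concatenated between the two quoted tags:

#include <iostream>
#include <string>

int main() {
    // Name returned by builder.add_schema(...) for a hypothetical "get_weather" tool.
    std::string call_rule = "get_weather-call";
    // Same concatenation as in the diff: literal tags around the schema rule reference.
    std::string rule_body = "\"<|tool_call|>\" " + call_rule + " \"<|/tool_call|>\"";
    std::cout << rule_body << "\n"; // prints: "<|tool_call|>" get_weather-call "<|/tool_call|>"
    return 0;
}
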
@@ -1391,6 +1389,9 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa
         data.preserved_tokens = {
             "<|tool_call|>",
             "</|tool_call|>",
+            "<|tool_response|>",
+            "<|tool|>",
+            "</|tool|>",
         };
     });
 
@@ -1449,89 +1450,9 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa
 }
 
 static common_chat_msg common_chat_parse_phi_4(const std::string & input) {
-    common_chat_msg result;
-    result.role = "assistant";
-
-    std::string final_content = "";
-
-    const std::string opening_tag = "<|tool_call|>";
-    const std::string closing_tag = "</|tool_call|>";
-
-    size_t start_pos = 0;
-    while (true) {
-        // Find next tool call
-        size_t tool_start = input.find(opening_tag, start_pos);
-        if (tool_start == std::string::npos) {
-            // No more tool calls.
-
-            // Is start_pos within string bounds?
-            if (start_pos < input.length()) {
-                // Add the rest of the string to final_content
-                final_content += input.substr(start_pos);
-            }
-            break;
-        }
-
-        // Add content before the tool call to final_content
-        final_content += input.substr(start_pos, tool_start - start_pos);
-
-        // Find closing tag
-        size_t content_start = tool_start + opening_tag.length();
-        size_t tool_end = input.find(closing_tag, content_start);
-
-        if (tool_end == std::string::npos) {
-            // No closing tag found, so just include the rest of the string as tool.
-            tool_end = input.length();
-        }
-
-        // Extract tool call content
-        std::string tool_content = input.substr(
-            content_start,
-            tool_end - content_start
-        );
-
-        // Try to parse the tool call
-        try {
-            auto tool_call = json::parse(tool_content);
-
-            // Verify the required fields exist
-            if (!tool_call.contains("name")) {
-                throw std::runtime_error("Missing 'name' field in tool call");
-            }
-
-            if (!tool_call.contains("arguments")) {
-                throw std::runtime_error("Missing 'arguments' field in tool call");
-            }
-
-            std::string name = tool_call["name"].get<std::string>();
-
-            std::string arguments;
-            try {
-                arguments = tool_call["arguments"].dump();
-            } catch (const std::exception & e) {
-                LOG_ERR("Failed to serialize arguments: %s\n", e.what());
-                arguments = "{}";
-            }
-
-            result.tool_calls.push_back({
-                name,
-                arguments,
-                /* id= */ "",
-            });
-        } catch (const std::exception & e) {
-            // If parsing fails, include the entire tool call in the content
-            final_content += input.substr(
-                tool_start,
-                tool_end + closing_tag.length() - tool_start
-            );
-        }
-
-        // Move past this tool call for next iteration
-        start_pos = tool_end + closing_tag.length();
-    }
-
-    result.content = final_content;
-    return result;
+    static std::regex function_regex("<\\|tool_call\\|>\\s*\\{\\s*\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"arguments\"\\s*:");
+    static std::regex close_regex(R"(\}\s*(</\|tool_call\|>)?)");
+    return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
 }
 
 
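The hand-rolled tag scanner is replaced by the shared parse_json_tool_calls helper, driven by the two regexes above. Below is a minimal standalone sketch of what those patterns pick out of a reply; the sample text is hypothetical, and the first-closing-brace trick only works here because the arguments object is flat, whereas the real helper parses the arguments as JSON:

#include <iostream>
#include <regex>
#include <string>

int main() {
    // Hypothetical Phi-4 style reply: free text followed by one tool call.
    const std::string input =
        "Let me check that.<|tool_call|>{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}<|/tool_call|>";

    // Same patterns as in the diff above.
    static const std::regex function_regex(
        "<\\|tool_call\\|>\\s*\\{\\s*\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"arguments\"\\s*:");
    static const std::regex close_regex(R"(\}\s*(</\|tool_call\|>)?)");

    std::smatch open;
    if (std::regex_search(input, open, function_regex)) {
        std::cout << "content: " << open.prefix() << "\n"; // "Let me check that."
        std::cout << "name:    " << open[1]       << "\n"; // "get_weather"

        // Text after the opening match: the arguments object plus the closing tag.
        const std::string rest = open.suffix();
        std::smatch close;
        if (std::regex_search(rest, close, close_regex)) {
            // Works here only because the arguments object has no nested braces.
            std::cout << "args:    " << rest.substr(0, close.position(0) + 1) << "\n";
        }
    }
    return 0;
}
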
docs/function-calling.md

Lines changed: 10 additions & 4 deletions
@@ -12,11 +12,12 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll
 - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
 - Functionary v3.1 / v3.2
 - Hermes 2/3, Qwen 2.5
-- Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
+- Qwen 2.5 Coder (#12034)
 - Mistral Nemo
 - Firefunction v2
-- Command R7B
-- DeepSeek R1 (WIP / seems reluctant to call any tools?)
+- Command R7B (#11585)
+- DeepSeek R1 (#11607)
+- Phi 4 (#12288)
 
 - Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs).
 - Use `--chat-template-file` to override the template when appropriate (see examples below)
@@ -297,9 +298,14 @@ llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \
 llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \
   --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
 
+# Native support for Phi 4 also needs a template override (official template is buggy)
+
+llama-server --jinja -fa -hf bartowski/microsoft_Phi-4-mini-instruct-GGUF \
+  --chat-template-file models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja
+
 # Native support requires the right template for these GGUFs:
 
-llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
+llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M \
   --chat-template-file models/templates/meetkai-functionary-medium-v3.2.jinja
 
 llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \

examples/server/server.cpp

Lines changed: 13 additions & 33 deletions
@@ -386,7 +386,7 @@ struct server_task {
                     trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
                     trigger.value = word;
                     trigger.token = token;
-                    params.sampling.grammar_triggers.push_back(std::move(trigger));
+                    params.sampling.grammar_triggers.push_back(trigger);
                 } else {
                     SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
                     params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
@@ -751,10 +751,7 @@ struct server_task_result_cmpl_final : server_task_result {
                         {"name", tc.name},
                         {"arguments", tc.arguments},
                     }},
-                    // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
-                    // We only generate a random id for the ones that don't generate one by themselves
-                    // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
-                    {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
+                    {"id", tc.id},
                 });
             }
             message["tool_calls"] = tool_calls;
@@ -2040,18 +2037,6 @@ struct server_context {
         return ret;
     }
 
-    bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
-        const llama_model * model = llama_get_model(ctx);
-        const llama_vocab * vocab = llama_model_get_vocab(model);
-        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
-        for (const auto & token : tokens) {
-            if (token < 0 || token >= n_vocab) {
-                return false;
-            }
-        }
-        return true;
-    }
-
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
         slot.reset();
         slot.id_task = task.id;
@@ -2066,11 +2051,6 @@ struct server_context {
             slot.lora = task.params.lora;
         }
 
-        bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
-        if (!can_detokenize) {
-            send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
-            return false;
-        }
         SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
 
         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@@ -2113,7 +2093,7 @@ struct server_context {
             SRV_DBG("%s", "clearing KV cache\n");
 
             // clear the entire KV cache
-            llama_kv_self_clear(ctx);
+            llama_kv_cache_clear(ctx);
             clean_kv_cache = false;
         }
 
@@ -2655,8 +2635,8 @@ struct server_context {
                 res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
                 res->t_start = metrics.t_start;
 
-                res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
-                res->kv_cache_used_cells = llama_kv_self_used_cells(ctx);
+                res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx);
+                res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx);
 
                 res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
                 res->t_prompt_processing_total = metrics.t_prompt_processing_total;
@@ -2772,7 +2752,7 @@ struct server_context {
 
                 // Erase token cache
                 const size_t n_erased = slot->cache_tokens.size();
-                llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
+                llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
                 slot->cache_tokens.clear();
 
                 auto res = std::make_unique<server_task_result_slot_erase>();
@@ -2840,8 +2820,8 @@ struct server_context {
 
                 SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
 
-                llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
-                llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
+                llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
+                llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
 
                 if (slot.params.cache_prompt) {
                     for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -3032,8 +3012,8 @@ struct server_context {
 
                             const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
 
-                            llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
-                            llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
+                            llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
+                            llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
 
                             for (size_t i = 0; i < n_match; i++) {
                                 slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -3071,9 +3051,9 @@ struct server_context {
                     }
 
                     // keep only the common part
-                    if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
+                    if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                         // could not partially delete (likely using a non-Transformer model)
-                        llama_kv_self_seq_rm(ctx, slot.id, -1, -1);
+                        llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
 
                         // there is no common part left
                         slot.n_past = 0;
@@ -3313,7 +3293,7 @@ struct server_context {
                 slot.cache_tokens.push_back(id);
                 slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
 
-                llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);
+                llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
 
                 for (size_t i = 0; i < ids.size(); ++i) {
                     completion_token_output result;
models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+{%- if messages[0]["role"] == "system" %}
+{%- set system_message = messages[0]["content"] %}
+{% elif tools is defined -%}
+{%- set system_message = "You are a helpful assistant with access to tools." -%}
+{% else %}
+{%- set system_message = "" -%}
+{%- endif %}
+{%- if tools is defined -%}
+{%- set system_message = system_message + '<|tool|>' + (tools | tojson) + '<|/tool|>' -%}
+{%- if '<|tool_call|>' not in system_message -%}
+{%- set system_message = system_message + "\nTo use a tool, respond in this format: <|tool_call|>{\"name\": \"foo\", \"arguments\": {\"a\": 1}}<|/tool_call|>" %}
+{%- endif %}
+{%- endif %}
+{%- if system_message is defined -%}
+{{- '<|system|>' + system_message + '<|end|>' -}}
+{%- endif -%}
+{%- for message in messages -%}
+{%- if message['role'] == 'tool' -%}
+{{- '<|tool_response|>' + (message['content'] | tojson) + '<|/tool_response|>' -}}
+{%- elif message['role'] != 'system' -%}
+{{- '<|' + message['role'] + '|>' -}}
+{%- if message.content -%}
+{{- message['content'] -}}
+{%- endif -%}
+{%- for tool_call in message.tool_calls -%}
+{{- '<|tool_call|>' + (tool_call | tojson) + '<|/tool_call|>' -}}
+{%- endfor -%}
+{{- '<|end|>' -}}
+{%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+{{- '<|assistant|>' -}}
+{%- else -%}
+{{- eos_token -}}
+{%- endif -%}
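
For orientation, an approximate rendering of this template for a hypothetical request (a system prompt, one user turn, tools supplied, add_generation_prompt enabled) would look roughly as follows; the tools JSON is elided and the line breaks before <|user|> and <|assistant|> are added here for readability only:

<|system|>You are a helpful assistant.<|tool|>[ ...tools JSON... ]<|/tool|>
To use a tool, respond in this format: <|tool_call|>{"name": "foo", "arguments": {"a": 1}}<|/tool_call|><|end|>
<|user|>What is the weather in Paris?<|end|>
<|assistant|>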
