
runner: Flush pending responses before returning
If there are any pending responses (such as text held back while checking
for potential stop tokens), then we should send them before ending the
sequence. Otherwise, we can be missing tokens at the end of a response.

Fixes ollama#6707
jessegross committed Sep 11, 2024
1 parent abed273 commit 93ac376
Showing 1 changed file with 33 additions and 27 deletions.
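
Background for the diff below: process_token() in server.cpp deliberately holds back the tail of slot.generated_text whenever it could still turn into a configured stop string (the STOP_PARTIAL scan), and only streams it once the ambiguity is resolved. The following is a minimal, hypothetical sketch of that buffering, not the actual server code (find_partial_stop, the token list, and the empty-string end-of-generation sentinel are all illustrative); it shows why the held-back text must be flushed when the sequence ends.

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Earliest position in `tail` from which some stop string could begin,
// including a trailing fragment that might still grow into a full stop
// string -- the role STOP_PARTIAL plays in server.cpp.
static size_t find_partial_stop(const std::string &tail,
                                const std::vector<std::string> &stops) {
    for (size_t pos = 0; pos < tail.size(); pos++) {
        for (const std::string &stop : stops) {
            size_t overlap = std::min(stop.size(), tail.size() - pos);
            if (tail.compare(pos, overlap, stop, 0, overlap) == 0) {
                return pos;
            }
        }
    }
    return std::string::npos;
}

int main() {
    const std::vector<std::string> stops = {"###"};

    std::string generated;  // everything decoded so far (slot.generated_text)
    size_t      sent = 0;   // bytes already streamed out (slot.n_sent_text)

    // Token pieces as they might arrive; "" stands in for end-of-generation.
    const std::vector<std::string> pieces = {"The answer", " is 42.", " #", ""};

    for (const std::string &piece : pieces) {
        const bool is_eog = piece.empty();
        if (!is_eog) {
            generated += piece;
        }

        const std::string pending = generated.substr(sent);

        if (!is_eog && find_partial_stop(pending, stops) != std::string::npos) {
            // " #" could be the start of "###": hold it back until we know.
            continue;
        }

        // No possible stop string, or the sequence is ending: flush whatever
        // is pending. Skipping this flush on end-of-generation is exactly how
        // held-back text gets lost.
        std::cout << pending;
        sent = generated.size();
    }
    std::cout << std::endl;  // client sees: "The answer is 42. #"
}

Before this commit, the end-of-generation token never reached that flush step, so any text still pending from a potential stop match was dropped, which matches the missing-tokens-at-the-end symptom described above.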
60 changes: 33 additions & 27 deletions llm/ext_server/server.cpp
@@ -913,7 +913,9 @@ struct llama_server_context
 slot.sampled = result.tok;

 // search stop word and delete it
-slot.generated_text += token_str;
+if (!llama_token_is_eog(model, result.tok))
+    slot.generated_text += token_str;

 slot.has_next_token = true;

 if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
@@ -954,30 +956,36 @@ struct llama_server_context
 if (!incomplete)
 {
     size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
-    const std::string str_test = slot.generated_text.substr(pos);
-    bool is_stop_full = false;
-    size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-    if (stop_pos != std::string::npos)
-    {
-        is_stop_full = true;
-        slot.generated_text.erase(
-            slot.generated_text.begin() + pos + stop_pos,
-            slot.generated_text.end());
-        pos = std::min(slot.n_sent_text, slot.generated_text.size());
-    }
-    else
-    {
-        is_stop_full = false;
-        stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
-    }

-    // check if there is any token to predict
-    if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
-    {
-        // no send the stop word in the response
-        result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-        slot.n_sent_text += result.text_to_send.size();
-        // add the token to slot queue and cache
-    }
+    if (!llama_token_is_eog(model, result.tok)) {
+        const std::string str_test = slot.generated_text.substr(pos);
+        bool is_stop_full = false;
+        size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
+        if (stop_pos != std::string::npos)
+        {
+            is_stop_full = true;
+            slot.generated_text.erase(
+                slot.generated_text.begin() + pos + stop_pos,
+                slot.generated_text.end());
+            pos = std::min(slot.n_sent_text, slot.generated_text.size());
+        }
+        else
+        {
+            is_stop_full = false;
+            stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+        }

+        // check if there is any token to predict
+        if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
+        {
+            // no send the stop word in the response
+            result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+            slot.n_sent_text += result.text_to_send.size();
+            // add the token to slot queue and cache
+        }
+    } else {
+        result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+        slot.n_sent_text += result.text_to_send.size();
+    }

     if (slot.params.stream)
@@ -1117,9 +1125,7 @@ struct llama_server_context
{"multimodal", multimodal}
};

if (!llama_token_is_eog(model, tkn.tok)) {
res.result_json["content"] = tkn.text_to_send;
}
res.result_json["content"] = tkn.text_to_send;

if (slot.sparams.n_probs > 0)
{
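The third hunk is the downstream half of the same fix: once the end-of-generation token can carry flushed text in result.text_to_send, the code that builds the streamed JSON must not skip the "content" field for that token, or the flushed tail would be dropped again one layer up. A rough sketch of the resulting behavior, using the nlohmann::json type server.cpp already uses (token_result, build_chunk, and the "stop" field here are illustrative stand-ins, not the real result_json construction):

#include <iostream>
#include <string>

#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Illustrative stand-in for the per-token result in server.cpp.
struct token_result {
    std::string text_to_send;  // may now be non-empty even for the final token
    bool        is_eog;
};

// After the change, "content" is always populated from text_to_send. Before,
// it was omitted whenever the token was end-of-generation, so a tail flushed
// together with that token never reached the client.
static json build_chunk(const token_result &tkn) {
    json chunk;
    chunk["content"] = tkn.text_to_send;
    chunk["stop"]    = tkn.is_eog;
    return chunk;
}

int main() {
    // Final token of a response where " #" had been held back for a possible
    // stop string and is flushed together with end-of-generation.
    token_result last{" #", true};
    std::cout << build_chunk(last).dump() << "\n";  // {"content":" #","stop":true}
}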
