
runner: Flush pending responses before returning
If there are any pending responses (such as text held back while checking
for potential stop tokens), then we should send them before ending the
sequence. Otherwise, we can be missing tokens at the end of a response.

Fixes ollama#6707
jessegross committed Sep 11, 2024
1 parent abed273 commit 93ac376
Showing 1 changed file with 33 additions and 27 deletions.
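
Background for the diff below: process_token() in server.cpp deliberately holds back the tail of slot.generated_text whenever it could still turn into a configured stop string (the STOP_PARTIAL scan), and only streams it once the ambiguity is resolved. The following is a minimal, hypothetical sketch of that buffering, not the actual server code (find_partial_stop, the token list, and the empty-string end-of-generation sentinel are all illustrative); it shows why the held-back text must be flushed when the sequence ends.

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Earliest position in `tail` from which some stop string could begin,
// including a trailing fragment that might still grow into a full stop
// string -- the role STOP_PARTIAL plays in server.cpp.
static size_t find_partial_stop(const std::string &tail,
                                const std::vector<std::string> &stops) {
    for (size_t pos = 0; pos < tail.size(); pos++) {
        for (const std::string &stop : stops) {
            size_t overlap = std::min(stop.size(), tail.size() - pos);
            if (tail.compare(pos, overlap, stop, 0, overlap) == 0) {
                return pos;
            }
        }
    }
    return std::string::npos;
}

int main() {
    const std::vector<std::string> stops = {"###"};

    std::string generated;  // everything decoded so far (slot.generated_text)
    size_t      sent = 0;   // bytes already streamed out (slot.n_sent_text)

    // Token pieces as they might arrive; "" stands in for end-of-generation.
    const std::vector<std::string> pieces = {"The answer", " is 42.", " #", ""};

    for (const std::string &piece : pieces) {
        const bool is_eog = piece.empty();
        if (!is_eog) {
            generated += piece;
        }

        const std::string pending = generated.substr(sent);

        if (!is_eog && find_partial_stop(pending, stops) != std::string::npos) {
            // " #" could be the start of "###": hold it back until we know.
            continue;
        }

        // No possible stop string, or the sequence is ending: flush whatever
        // is pending. Skipping this flush on end-of-generation is exactly how
        // held-back text gets lost.
        std::cout << pending;
        sent = generated.size();
    }
    std::cout << std::endl;  // client sees: "The answer is 42. #"
}

Before this commit, the end-of-generation token never reached that flush step, so any text still pending from a potential stop match was dropped, which matches the missing-tokens-at-the-end symptom described above.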
60 changes: 33 additions & 27 deletions llm/ext_server/server.cpp
@@ -913,7 +913,9 @@ struct llama_server_context
 slot.sampled = result.tok;

 // search stop word and delete it
-slot.generated_text += token_str;
+if (!llama_token_is_eog(model, result.tok))
+    slot.generated_text += token_str;

 slot.has_next_token = true;

 if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
@@ -954,30 +956,36 @@ struct llama_server_context
 if (!incomplete)
 {
     size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
-    const std::string str_test = slot.generated_text.substr(pos);
-    bool is_stop_full = false;
-    size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-    if (stop_pos != std::string::npos)
-    {
-        is_stop_full = true;
-        slot.generated_text.erase(
-            slot.generated_text.begin() + pos + stop_pos,
-            slot.generated_text.end());
-        pos = std::min(slot.n_sent_text, slot.generated_text.size());
-    }
-    else
-    {
-        is_stop_full = false;
-        stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
-    }

-    // check if there is any token to predict
-    if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
-    {
-        // no send the stop word in the response
-        result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-        slot.n_sent_text += result.text_to_send.size();
-        // add the token to slot queue and cache
-    }
+    if (!llama_token_is_eog(model, result.tok)) {
+        const std::string str_test = slot.generated_text.substr(pos);
+        bool is_stop_full = false;
+        size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
+        if (stop_pos != std::string::npos)
+        {
+            is_stop_full = true;
+            slot.generated_text.erase(
+                slot.generated_text.begin() + pos + stop_pos,
+                slot.generated_text.end());
+            pos = std::min(slot.n_sent_text, slot.generated_text.size());
+        }
+        else
+        {
+            is_stop_full = false;
+            stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+        }

+        // check if there is any token to predict
+        if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
+        {
+            // no send the stop word in the response
+            result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+            slot.n_sent_text += result.text_to_send.size();
+            // add the token to slot queue and cache
+        }
+    } else {
+        result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+        slot.n_sent_text += result.text_to_send.size();
+    }

     if (slot.params.stream)
@@ -1117,9 +1125,7 @@ struct llama_server_context
{"multimodal", multimodal}
};

if (!llama_token_is_eog(model, tkn.tok)) {
res.result_json["content"] = tkn.text_to_send;
}
res.result_json["content"] = tkn.text_to_send;

if (slot.sparams.n_probs > 0)
{
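The third hunk is the downstream half of the same fix: once the end-of-generation token can carry flushed text in result.text_to_send, the code that builds the streamed JSON must not skip the "content" field for that token, or the flushed tail would be dropped again one layer up. A rough sketch of the resulting behavior, using the nlohmann::json type server.cpp already uses (token_result, build_chunk, and the "stop" field here are illustrative stand-ins, not the real result_json construction):

#include <iostream>
#include <string>

#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Illustrative stand-in for the per-token result in server.cpp.
struct token_result {
    std::string text_to_send;  // may now be non-empty even for the final token
    bool        is_eog;
};

// After the change, "content" is always populated from text_to_send. Before,
// it was omitted whenever the token was end-of-generation, so a tail flushed
// together with that token never reached the client.
static json build_chunk(const token_result &tkn) {
    json chunk;
    chunk["content"] = tkn.text_to_send;
    chunk["stop"]    = tkn.is_eog;
    return chunk;
}

int main() {
    // Final token of a response where " #" had been held back for a possible
    // stop string and is flushed together with end-of-generation.
    token_result last{" #", true};
    std::cout << build_chunk(last).dump() << "\n";  // {"content":" #","stop":true}
}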
