Skip to content

Commit 17b23eb

Browse files
authored
server : fix multibyte handling in partial response (#3706)
1 parent 778c070 commit 17b23eb

File tree

1 file changed

+30
-26
lines changed

1 file changed

+30
-26
lines changed

examples/server/server.cpp

Lines changed: 30 additions & 26 deletions
Original file line number · Diff line number · Diff line change
@@ -1005,32 +1005,6 @@ struct llama_server_context
10051005
slot.generated_text += token_str;
10061006
slot.has_next_token = true;
10071007

1008-
size_t pos = std::min(slot.sent_count, slot.generated_text.size());
1009-
const std::string str_test = slot.generated_text.substr(pos);
1010-
bool is_stop_full = false;
1011-
size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
1012-
if (stop_pos != std::string::npos) {
1013-
is_stop_full = true;
1014-
slot.generated_text.erase(
1015-
slot.generated_text.begin() + pos + stop_pos,
1016-
slot.generated_text.end());
1017-
pos = std::min(slot.sent_count, slot.generated_text.size());
1018-
} else {
1019-
is_stop_full = false;
1020-
stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
1021-
}
1022-
1023-
// check if there is any token to predict
1024-
if(stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
1025-
// no send the stop word in the response
1026-
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
1027-
slot.sent_count += result.text_to_send.size();
1028-
// add the token to slot queue and cache
1029-
}
1030-
slot.add_token_string(result);
1031-
if(slot.params.stream) {
1032-
send_partial_response(slot, result);
1033-
}
10341008
if (slot.multibyte_pending > 0)
10351009
{
10361010
slot.multibyte_pending -= token_str.size();
@@ -1059,6 +1033,36 @@ struct llama_server_context
10591033
}
10601034
}
10611035

1036+
if (slot.multibyte_pending == 0)
1037+
{
1038+
size_t pos = std::min(slot.sent_count, slot.generated_text.size());
1039+
const std::string str_test = slot.generated_text.substr(pos);
1040+
bool is_stop_full = false;
1041+
size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
1042+
if (stop_pos != std::string::npos) {
1043+
is_stop_full = true;
1044+
slot.generated_text.erase(
1045+
slot.generated_text.begin() + pos + stop_pos,
1046+
slot.generated_text.end());
1047+
pos = std::min(slot.sent_count, slot.generated_text.size());
1048+
} else {
1049+
is_stop_full = false;
1050+
stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
1051+
}
1052+
1053+
// check if there is any token to predict
1054+
if(stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
1055+
// do not send the stop word in the response
1056+
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
1057+
slot.sent_count += result.text_to_send.size();
1058+
// add the token to slot queue and cache
1059+
}
1060+
slot.add_token_string(result);
1061+
if (slot.params.stream) {
1062+
send_partial_response(slot, result);
1063+
}
1064+
}
1065+
10621066
if (slot.multibyte_pending > 0 && !slot.has_next_token)
10631067
{
10641068
slot.has_next_token = true;

0 commit comments

Comments (0)