@@ -1005,32 +1005,6 @@ struct llama_server_context
         slot.generated_text += token_str;
         slot.has_next_token = true;
 
-        size_t pos = std::min(slot.sent_count, slot.generated_text.size());
-        const std::string str_test = slot.generated_text.substr(pos);
-        bool is_stop_full = false;
-        size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-        if (stop_pos != std::string::npos) {
-            is_stop_full = true;
-            slot.generated_text.erase(
-                slot.generated_text.begin() + pos + stop_pos,
-                slot.generated_text.end());
-            pos = std::min(slot.sent_count, slot.generated_text.size());
-        } else {
-            is_stop_full = false;
-            stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
-        }
-
-        // check if there is any token to predict
-        if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
-            // do not send the stop word in the response
-            result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-            slot.sent_count += result.text_to_send.size();
-            // add the token to slot queue and cache
-        }
-        slot.add_token_string(result);
-        if (slot.params.stream) {
-            send_partial_response(slot, result);
-        }
         if (slot.multibyte_pending > 0)
         {
             slot.multibyte_pending -= token_str.size();
@@ -1059,6 +1033,36 @@ struct llama_server_context
             }
         }
 
+        if (slot.multibyte_pending == 0)
+        {
+            size_t pos = std::min(slot.sent_count, slot.generated_text.size());
+            const std::string str_test = slot.generated_text.substr(pos);
+            bool is_stop_full = false;
+            size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
+            if (stop_pos != std::string::npos) {
+                is_stop_full = true;
+                slot.generated_text.erase(
+                    slot.generated_text.begin() + pos + stop_pos,
+                    slot.generated_text.end());
+                pos = std::min(slot.sent_count, slot.generated_text.size());
+            } else {
+                is_stop_full = false;
+                stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+            }
+
+            // check if there is any token to predict
+            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
+                // do not send the stop word in the response
+                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                slot.sent_count += result.text_to_send.size();
+                // add the token to slot queue and cache
+            }
+            slot.add_token_string(result);
+            if (slot.params.stream) {
+                send_partial_response(slot, result);
+            }
+        }
+
         if (slot.multibyte_pending > 0 && !slot.has_next_token)
         {
             slot.has_next_token = true;
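The net effect of the two hunks above is to move the stop-word scan and the streamed partial response behind a `multibyte_pending == 0` guard, so they only run once every UTF-8 sequence in `generated_text` is complete; with the old ordering, a token carrying only the first bytes of a multibyte character could be matched against stop strings and streamed out mid-character. Below is a minimal standalone sketch of that ordering, not the server's actual API: `StreamState`, `ends_mid_utf8`, `find_stop`, and `scan_and_emit` are hypothetical names, and the `STOP_PARTIAL` hold-back (which keeps a partially matched stop word unsent) is omitted for brevity.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

struct StreamState {
    std::string generated_text; // all text decoded so far
    size_t      sent_count = 0; // bytes already streamed to the client
};

// True if `text` ends in the middle of a UTF-8 multibyte sequence.
static bool ends_mid_utf8(const std::string &text) {
    size_t i = text.size();
    size_t cont = 0;
    // Walk back over up to 3 trailing continuation bytes (10xxxxxx).
    while (i > 0 && cont < 3 &&
           (static_cast<unsigned char>(text[i - 1]) & 0xC0) == 0x80) {
        --i;
        ++cont;
    }
    if (i == 0) {
        return cont > 0; // nothing but continuation bytes: still incomplete
    }
    const unsigned char lead = static_cast<unsigned char>(text[i - 1]);
    const size_t need = (lead & 0x80) == 0x00 ? 0   // ASCII byte
                      : (lead & 0xE0) == 0xC0 ? 1   // 2-byte sequence
                      : (lead & 0xF0) == 0xE0 ? 2   // 3-byte sequence
                      : (lead & 0xF8) == 0xF0 ? 3   // 4-byte sequence
                      : 0;                          // invalid lead: treat as complete
    return cont < need;
}

// Earliest full occurrence of any stop word in `tail`, or npos.
static size_t find_stop(const std::string &tail,
                        const std::vector<std::string> &stops) {
    size_t best = std::string::npos;
    for (const auto &s : stops) {
        const size_t pos = tail.find(s);
        if (pos != std::string::npos) {
            best = std::min(best, pos);
        }
    }
    return best;
}

// Append one token's bytes, then emit only once no multibyte character
// is left half-decoded -- the same ordering the hunks introduce.
static void scan_and_emit(StreamState &st, const std::string &token_str,
                          const std::vector<std::string> &stops) {
    st.generated_text += token_str;
    if (ends_mid_utf8(st.generated_text)) {
        return; // wait for the remaining continuation bytes
    }

    const size_t pos = std::min(st.sent_count, st.generated_text.size());
    const size_t stop_pos = find_stop(st.generated_text.substr(pos), stops);
    if (stop_pos != std::string::npos) {
        // Truncate at the stop word; nothing past it is ever sent.
        st.generated_text.erase(st.generated_text.begin() + pos + stop_pos,
                                st.generated_text.end());
    }

    const std::string to_send = st.generated_text.substr(pos);
    st.sent_count += to_send.size();
    std::cout << to_send; // stand-in for send_partial_response()
}

int main() {
    StreamState st;
    const std::vector<std::string> stops = { "</s>" };
    // "é" (0xC3 0xA9) split across two tokens: nothing is emitted until the
    // second byte completes the character, so the stop scan never sees a
    // half-decoded sequence.
    scan_and_emit(st, "\xC3", stops);
    scan_and_emit(st, "\xA9 done</s>ignored", stops);
    std::cout << '\n'; // prints "é done"
}
```

Deferring the scan costs at most a few bytes of extra latency per character, but in this sketch it guarantees the client never receives a malformed UTF-8 fragment.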