@@ -618,6 +618,11 @@ struct llama_server_context
 
     std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
     {
+        // TODO: currently, we tokenize using special tokens by default
+        // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
+        // but it's better compared to completely ignoring ChatML and other chat templates
+        const bool TMP_FORCE_SPECIAL = true;
+
         // If `add_bos` is true, we only add BOS, when json_prompt is a string,
         // or the first element of the json_prompt array is a string.
         std::vector<llama_token> prompt_tokens;
@@ -633,12 +638,12 @@ struct llama_server_context
                     std::vector<llama_token> p;
                     if (first)
                     {
-                        p = ::llama_tokenize(ctx, s, add_bos);
+                        p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                         first = false;
                     }
                     else
                     {
-                        p = ::llama_tokenize(ctx, s, false);
+                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                     }
                     prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
                 }
@@ -655,7 +660,7 @@ struct llama_server_context
         else
         {
             auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
         }
 
         return prompt_tokens;
@@ -2235,7 +2240,7 @@ std::string format_chatml(std::vector<json> messages)
 
     for (auto it = messages.begin(); it != messages.end(); ++it) {
         chatml_msgs << "<|im_start|>"
-            << json_value(*it, "role", std::string("user")) << '\n';
+                    << json_value(*it, "role", std::string("user")) << '\n';
         chatml_msgs << json_value(*it, "content", std::string(""))
                     << "<|im_end|>\n";
     }
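
For context, a minimal sketch of what the new fourth argument changes, built on the same common-library helper the call sites above use (::llama_tokenize(ctx, text, add_bos, special)). The probe string, the function name, and the assumption of an already-initialized llama_context are illustrative only, not part of this commit. With special-token parsing off, ChatML markers such as "<|im_start|>" are tokenized as ordinary text; with it on, they resolve to the single special-token ids that ChatML-aware vocabularies define.

// Minimal sketch: compare tokenization with and without special-token parsing.
// Assumes `ctx` is a llama_context already created, as in examples/server.
#include "common.h"

#include <cstdio>
#include <string>
#include <vector>

static void compare_special_tokenization(llama_context * ctx, const std::string & text)
{
    // special = false: "<|im_start|>" is split into plain-text pieces.
    // special = true:  "<|im_start|>" maps to its special token id (when the vocab defines one).
    const std::vector<llama_token> plain   = ::llama_tokenize(ctx, text, false, false);
    const std::vector<llama_token> special = ::llama_tokenize(ctx, text, false, true);

    printf("special=false -> %zu tokens, special=true -> %zu tokens\n",
           plain.size(), special.size());
}

// Hypothetical usage: compare_special_tokenization(ctx, "<|im_start|>user\nhello<|im_end|>\n");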