
Commit ee1b497

llama : support more diverse tokenizers? (#2420)
* supporting more diverse tokenizers

* Update llama.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent d73b8d4 commit ee1b497

File tree

1 file changed: 3 additions, 1 deletion

llama.cpp

Lines changed: 3 additions & 1 deletion
@@ -1924,7 +1924,9 @@ struct llama_tokenizer {
             if (token == vocab_.token_to_id.end()) {
                 // output any symbols that did not form tokens as bytes.
                 for (int j = 0; j < (int) symbol.n; ++j) {
-                    llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    // NOTE: old version, before #2420 - not sure what are the implications of this
+                    //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
                     output.push_back(token_id);
                 }
             } else {
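
The removed line assumed that byte-fallback tokens always sit at a fixed offset in the vocabulary (byte value + 3); the new line instead asks the vocabulary itself for the id of the single-byte string. Below is a minimal, self-contained sketch of that difference, using a plain std::unordered_map as a stand-in for the real vocab type; the names and toy id values are illustrative assumptions, not taken from llama.cpp.

// Sketch of the byte-fallback difference; the vocab layout here is hypothetical.
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>

using id = int32_t;

// Old behavior (pre-#2420): assume byte tokens occupy ids 3..258,
// so the id is just the byte value plus a fixed offset of 3.
static id byte_to_id_fixed_offset(char c) {
    return static_cast<id>(static_cast<uint8_t>(c)) + 3;
}

// New behavior: look the single-byte string up in the vocabulary.
// Throws std::out_of_range if the vocab has no token for that byte.
static id byte_to_id_lookup(const std::unordered_map<std::string, id> & token_to_id, char c) {
    return token_to_id.at(std::string(1, c));
}

int main() {
    // Toy vocab where the byte token for 'a' is NOT at id 'a' + 3,
    // which is exactly the case the fixed-offset assumption gets wrong.
    std::unordered_map<std::string, id> token_to_id = {
        {"a", 320},
        {"b", 321},
    };

    std::cout << "fixed offset: " << byte_to_id_fixed_offset('a') << "\n";        // 100
    std::cout << "vocab lookup: " << byte_to_id_lookup(token_to_id, 'a') << "\n"; // 320

    try {
        byte_to_id_lookup(token_to_id, 'z'); // byte missing from the toy vocab
    } catch (const std::out_of_range &) {
        std::cout << "lookup throws when the byte is not in the vocab\n";
    }
    return 0;
}

The trade-off visible in the sketch: the lookup handles vocabularies whose byte tokens are not laid out at the fixed offset (the "more diverse tokenizers" of the commit title), but std::unordered_map::at throws when a byte has no token at all, whereas the old code silently produced an id that may not correspond to that byte.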
