Detokenizer fixes #8039

Merged Jul 5, 2024 (30 commits)

Commits
eea8dfa
Add llama_detokenize()
Jun 20, 2024
d779bab
Using llama_tokenize() in tests
Jun 20, 2024
40a6660
Using llama_tokenize() in tests
Jun 20, 2024
16a7503
Fix tokenizer tests
Jun 20, 2024
03dbcc8
minor: confusing hexadecimal codepoint
Jun 20, 2024
071bf42
Clean old known problematic codepoints
Jun 20, 2024
064b35e
Update bruteforce random tests
Jun 20, 2024
503b753
Fix add_space_prefix, set false by default
Jun 20, 2024
0cc6593
Remove previous space
Jun 20, 2024
6d233bc
Remove previous space
Jun 20, 2024
b452e82
Add tokenizer flag: clean_up_tokenization_spaces
Jun 21, 2024
9af762c
tests: treat unexpected vocab type as test failure instead of error
Jun 23, 2024
0cf2989
tests: gracefully exit threads
Jun 23, 2024
38d54b3
tests: skip unicode surrogates and undefined
Jun 23, 2024
44c8648
Fix detokenizer():
Jun 23, 2024
9eb0fca
Do not remove space when decoding special tokens
Jun 24, 2024
12e2c31
style: remove trailing whitespace
Jun 24, 2024
95a0df5
Bugfix: custom regexes split undefined unicode codepoints
Jun 24, 2024
4a28063
Update brute force test:
Jun 24, 2024
9854a9c
Symmetric params for llama_tokenize() and llama_detokenize()
Jun 25, 2024
107923c
Better leading space removal
Jun 25, 2024
68220fe
Update bruteforce test
Jun 25, 2024
98fc182
style : remove spaces
Jul 4, 2024
8072089
Merge commit 'f8c4c073' into detokenizer
Jul 4, 2024
8f5e1e0
'viking' detokenizer clean spaces
Jul 4, 2024
2f15019
Better leading space removal
Jul 4, 2024
11ac641
Update bruteforce test: header files location
Jul 4, 2024
4db8c0d
Update bruteforce test: add more models
Jul 4, 2024
906476f
style: spaces
Jul 4, 2024
0137683
style: spaces
jaime-m-p Jul 5, 2024
common/common.cpp (58 changes: 21 additions & 37 deletions)

@@ -2559,51 +2559,35 @@ std::vector<llama_token> llama_tokenize(
 }
 
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-
-    return std::string(result.data(), result.size());
-}
-
-std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
-
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        // remove the leading space of the first non-BOS token
-        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
-            piece = piece.substr(1);
-        }
-
-        result += piece;
-    }
-
-    return result;
-}
-
-std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        result += piece;
-    }
-
-    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
-    return result;
+    std::string piece;
+    piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
+    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        GGML_ASSERT(check == -n_chars);
+    }
+    else {
+        piece.resize(n_chars);
+    }
+
+    return piece;
+}
+
+std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    std::string text;
+    text.resize(std::max(text.capacity(), tokens.size()));
+    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    if (n_chars < 0) {
+        text.resize(-n_chars);
+        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        GGML_ASSERT(n_chars <= (int32_t)text.size());  // whitespace trimming is performed after per-token detokenization
+    }
+
+    text.resize(n_chars);
+
+    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
+    return text;
+}
 
 bool llama_should_add_bos_token(const llama_model * model) {
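For orientation, here is a minimal sketch of how the reworked common-layer helpers fit together. It is not part of the PR: it assumes common.h is included, ctx is an initialized llama_context pointer, and the round_trip name is hypothetical; error handling is elided.

#include <string>
#include <vector>

// Sketch: tokenize a prompt, then reconstruct the text with the new single
// llama_detokenize() helper. Leading-space and cleanup rules are now applied
// inside the library, so the caller no longer strips the first piece by hand.
static std::string round_trip(llama_context * ctx, const std::string & prompt) {
    std::vector<llama_token> tokens = llama_tokenize(ctx, prompt, /*add_special=*/true, /*parse_special=*/true);
    return llama_detokenize(ctx, tokens, /*special=*/false);
}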
common/common.h (16 changes: 4 additions & 12 deletions)

@@ -349,21 +349,13 @@ std::string llama_token_to_piece(
         llama_token token,
         bool special = true);
 
-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-// that takes into account the tokenizer type and decides how to handle the leading space
-//
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
-        llama_context * ctx,
-        const std::vector<llama_token> & tokens);
-
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
-std::string llama_detokenize_bpe(
+// optionally renders special/control tokens
+std::string llama_detokenize(
         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
+        const std::vector<llama_token> & tokens,
+        bool special = true);
 
 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
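With the SPM/BPE-specific helpers gone, call sites that branched on the vocabulary type collapse to a single call. A hedged before/after sketch follows; the is_spm flag stands in for whatever vocab check a caller used and is not from this PR, and ctx/tokens are assumed to exist.

// Before: the caller picked the variant matching the model's tokenizer.
// std::string text = is_spm ? llama_detokenize_spm(ctx, tokens)
//                           : llama_detokenize_bpe(ctx, tokens);

// After: one entry point; tokenizer-specific space handling happens inside.
std::string text = llama_detokenize(ctx, tokens, /*special=*/true);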
examples/batched.swift/Sources/main.swift (3 changes: 2 additions & 1 deletion)

@@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
 
 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
     var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
+    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false)
     if nTokens < 0 {
         let actualTokensCount = -Int(nTokens)
         result = .init(repeating: 0, count: actualTokensCount)
@@ -238,6 +238,7 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
             token,
             &result,
             Int32(result.count),
+            0,
             false
         )
         assert(check == actualTokensCount)
examples/llama.swiftui/llama.cpp.swift/LibLlama.swift (4 changes: 2 additions & 2 deletions)

@@ -322,15 +322,15 @@ actor LlamaContext {
         defer {
             result.deallocate()
         }
-        let nTokens = llama_token_to_piece(model, token, result, 8, false)
+        let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)
 
         if nTokens < 0 {
             let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
             newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
             defer {
                 newResult.deallocate()
             }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
             let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
             return Array(bufferPointer)
         } else {
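Both Swift call sites pass 0 for the new lstrip argument, which preserves the previous behavior. A caller decoding token by token can instead let the library skip the artificial add_space_prefix space on the first piece. A minimal C++ sketch of that pattern, not from the PR: the decode_pieces name is hypothetical, model is assumed initialized, and error handling (a piece larger than the buffer) is elided.

#include <string>
#include <vector>

// Sketch: per-token decoding, skipping at most one leading space on the
// first piece via the new lstrip parameter.
static std::string decode_pieces(const llama_model * model, const std::vector<llama_token> & tokens) {
    std::string out;
    char buf[128];  // assumed large enough for any single piece here
    for (size_t i = 0; i < tokens.size(); ++i) {
        const int32_t lstrip = (i == 0) ? 1 : 0;
        const int32_t n = llama_token_to_piece(model, tokens[i], buf, (int32_t)sizeof(buf), lstrip, /*special=*/false);
        if (n >= 0) {
            out.append(buf, (size_t)n);
        }
    }
    return out;
}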
include/llama.h (19 changes: 18 additions & 1 deletion)

@@ -882,6 +882,7 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
     /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
     ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(
@@ -896,15 +897,31 @@ extern "C" {
     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
     // Does not write null terminator to the buffer.
-    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
+    // @param special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_token_to_piece(
         const struct llama_model * model,
                        llama_token   token,
                               char * buf,
                            int32_t   length,
+                           int32_t   lstrip,
                               bool   special);
 
+    /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
+    /// @param text The char pointer must be large enough to hold the resulting text.
+    /// @return Returns the number of chars/bytes on success, no more than text_len_max.
+    /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
+    /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
+    /// @param unparse_special If true, special tokens are rendered in the output.
+    LLAMA_API int32_t llama_detokenize(
+        const struct llama_model * model,
+               const llama_token * tokens,
+                           int32_t   n_tokens,
+                              char * text,
+                           int32_t   text_len_max,
+                              bool   remove_special,
+                              bool   unparse_special);
+
     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
     /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
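A hedged sketch of driving the new C-style entry point directly, using the documented negative-return convention to size the output buffer. Not from the PR: the detokenize_raw name and the initial size guess are hypothetical, model is assumed initialized, and error handling is elided.

#include <algorithm>
#include <string>
#include <vector>

// Sketch: detokenize via the raw C API. On overflow the call returns the
// negative of the required size, so resize the buffer and retry once.
static std::string detokenize_raw(const llama_model * model, const std::vector<llama_token> & tokens) {
    std::string text(std::max<size_t>(tokens.size(), 16), '\0');  // rough initial guess
    int32_t n = llama_detokenize(model, tokens.data(), (int32_t)tokens.size(),
                                 &text[0], (int32_t)text.size(),
                                 /*remove_special=*/false, /*unparse_special=*/true);
    if (n < 0) {
        text.resize((size_t)-n);
        n = llama_detokenize(model, tokens.data(), (int32_t)tokens.size(),
                             &text[0], (int32_t)text.size(),
                             /*remove_special=*/false, /*unparse_special=*/true);
    }
    text.resize((size_t)std::max<int32_t>(n, 0));
    return text;
}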