Detokenizer fixes ggerganov#8039 (28)
style: spaces

Update bruteforce test: add more models

Update bruteforce test: header files location

Better leading space removal

'viking' detokenizer clean spaces

style : remove spaces

Update bruteforce test

Better leading space removal

Symmetric params for llama_tokenize() and llama_detokenize()
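The symmetry here is between the two C-style entry points; as a sketch, the paired declarations as they are exercised by the common.cpp diff below (parameter names are assumptions; only the argument order and types are visible in this commit):

// Paired C-style declarations (sketch). On a too-small buffer both calls
// return the required size, negated, so callers can resize and retry.
int32_t llama_tokenize(
        const struct llama_model * model,
        const char               * text,
        int32_t                    text_len,
        llama_token              * tokens,
        int32_t                    n_tokens_max,
        bool                       add_special,
        bool                       parse_special);

int32_t llama_detokenize(
        const struct llama_model * model,
        const llama_token        * tokens,
        int32_t                    n_tokens,
        char                     * text,
        int32_t                    text_len_max,
        bool                       remove_special,
        bool                       unparse_special);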

Update brute force test:

Detokenize special tokens.
Replace errors with '\uFFFD' when detokenizing to 'utf-8'.
More edge cases.
Better detokenization results check.
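The '\uFFFD' replacement presumably corresponds to decoding the detokenizer output with error substitution on the test side (Python's errors="replace" behavior). As an illustration only, a simplified C++ equivalent (hypothetical helper, not part of this commit; overlong encodings and surrogates are not rejected):

#include <string>

// Substitute U+FFFD for malformed UTF-8 byte sequences (simplified scan).
static std::string replace_invalid_utf8(const std::string & in) {
    static const std::string repl = "\xEF\xBF\xBD"; // U+FFFD REPLACEMENT CHARACTER
    std::string out;
    for (size_t i = 0; i < in.size(); ) {
        const unsigned char c = static_cast<unsigned char>(in[i]);
        size_t len = 0;
        if      (c < 0x80)           len = 1;
        else if ((c & 0xE0) == 0xC0) len = 2;
        else if ((c & 0xF0) == 0xE0) len = 3;
        else if ((c & 0xF8) == 0xF0) len = 4;
        bool ok = (len > 0) && (i + len <= in.size());
        for (size_t k = 1; ok && k < len; ++k) {
            ok = (static_cast<unsigned char>(in[i + k]) & 0xC0) == 0x80;
        }
        if (ok) {
            out.append(in, i, len); // valid sequence, copy as-is
            i += len;
        } else {
            out += repl;            // invalid lead byte or truncated sequence
            i += 1;
        }
    }
    return out;
}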

Bugfix: custom regexes split undefined unicode codepoints

style: remove trailing whitespace

Do not remove space when decoding special tokens

Fix detokenizer():

UNKNOWN and CONTROL are 'special pieces'.
Remove space after UNKNOWN and CONTROL.
Refactor llama_token_to_piece().

tests: skip unicode surrogates and undefined

tests: gracefully exit threads

Using exit() was throwing random exceptions

tests: report unexpected vocab type as test failure instead of error

Useful when automating tests:
 - If you don't know the vocab type in advance.
 - Differentiate it from other loading errors.

Add tokenizer flag: clean_up_tokenization_spaces
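The flag name mirrors Hugging Face's clean_up_tokenization_spaces. A minimal sketch of the kind of cleanup it controls, assuming the Hugging Face replacement list (illustrative only, not the code added by this commit):

#include <string>
#include <utility>
#include <vector>

// Collapse the space left before punctuation and contractions after decoding.
static std::string clean_up_spaces(std::string text) {
    const std::vector<std::pair<std::string, std::string>> rules = {
        {" .", "."}, {" ?", "?"}, {" !", "!"}, {" ,", ","},
        {" n't", "n't"}, {" 'm", "'m"}, {" 's", "'s"}, {" 've", "'ve"}, {" 're", "'re"},
    };
    for (const auto & rule : rules) {
        size_t pos = 0;
        while ((pos = text.find(rule.first, pos)) != std::string::npos) {
            text.replace(pos, rule.first.size(), rule.second);
            pos += rule.second.size();
        }
    }
    return text;
}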

Remove previous space

Remove previous space

Fix add_space_prefix, set false by default

Update bruteforce random tests

Add detokenizer checks
New generator: ascii_lr_strip
New generator: apostrophe
Add more vocab files

Clean old known problematic codepoints

minor: confusing hexadecimal codepoint

Fix tokenizer tests

Using llama_tokenize() in tests

Using llama_tokenize() in tests

Add llama_detokenize()
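A minimal usage sketch of the new common helper (the llama_tokenize() overload and its add_special parameter are assumed from the existing common API; model and context setup are omitted):

#include "common.h"

#include <string>
#include <vector>

// Round-trip a prompt through the common helpers; special = true also
// renders special/control tokens in the detokenized text.
static std::string round_trip(llama_context * ctx, const std::string & prompt) {
    std::vector<llama_token> tokens = llama_tokenize(ctx, prompt, /*add_special=*/true);
    return llama_detokenize(ctx, tokens, /*special=*/true);
}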
jaime-m-p authored and Nexesenex committed Jul 6, 2024
1 parent 2badd4a commit 230a42d
Showing 11 changed files with 1,358 additions and 143 deletions.
58 changes: 21 additions & 37 deletions common/common.cpp
@@ -2606,51 +2606,35 @@ std::vector<llama_token> llama_tokenize(
 }
 
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-
-    return std::string(result.data(), result.size());
-}
-
-std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
-
     std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        // remove the leading space of the first non-BOS token
-        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
-            piece = piece.substr(1);
-        }
-
-        result += piece;
+    piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
+    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        GGML_ASSERT(check == -n_chars);
+    }
+    else {
+        piece.resize(n_chars);
     }
 
-    return result;
+    return piece;
 }
 
-std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        result += piece;
+std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    std::string text;
+    text.resize(std::max(text.capacity(), tokens.size()));
+    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    if (n_chars < 0) {
+        text.resize(-n_chars);
+        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        GGML_ASSERT(n_chars <= (int32_t)text.size());  // whitespace trimming is performed after per-token detokenization
    }
 
+    text.resize(n_chars);
+
     // NOTE: the original tokenizer decodes bytes after collecting the pieces.
-    return result;
+    return text;
 }
 
 bool llama_should_add_bos_token(const llama_model * model) {
16 changes: 4 additions & 12 deletions common/common.h
@@ -366,21 +366,13 @@ std::string llama_token_to_piece(
         llama_token token,
         bool special = true);
 
-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-// that takes into account the tokenizer type and decides how to handle the leading space
-//
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
-        llama_context * ctx,
-        const std::vector<llama_token> & tokens);
-
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-std::string llama_detokenize_bpe(
+// optionally renders special/control tokens
+std::string llama_detokenize(
         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
+        const std::vector<llama_token> & tokens,
+        bool special = true);
 
 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
3 changes: 2 additions & 1 deletion examples/batched.swift/Sources/main.swift
@@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
 
 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
     var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
+    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false)
     if nTokens < 0 {
         let actualTokensCount = -Int(nTokens)
         result = .init(repeating: 0, count: actualTokensCount)
@@ -238,6 +238,7 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
             token,
             &result,
             Int32(result.count),
+            0,
             false
         )
         assert(check == actualTokensCount)
4 changes: 2 additions & 2 deletions examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -322,15 +322,15 @@ actor LlamaContext {
         defer {
             result.deallocate()
         }
-        let nTokens = llama_token_to_piece(model, token, result, 8, false)
+        let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)
 
         if nTokens < 0 {
             let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
             newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
             defer {
                 newResult.deallocate()
             }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
             let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
             return Array(bufferPointer)
         } else {
