Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tokenizer test + revert to C++11 #355

Merged
merged 2 commits into from
Mar 21, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Revert back to C++11, avoid std::string_view in the tokenizer
  • Loading branch information
ggerganov committed Mar 21, 2023
commit a0d00bd8096d67f464ee79fab2d9636766ad5ede
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -221,14 +221,14 @@ add_library(utils OBJECT
utils.h)

target_include_directories(utils PUBLIC .)
target_compile_features(utils PUBLIC cxx_std_11)
target_compile_features(utils PUBLIC cxx_std_11) # don't bump

add_library(ggml OBJECT
ggml.c
ggml.h)

target_include_directories(ggml PUBLIC .)
target_compile_features(ggml PUBLIC c_std_17)
target_compile_features(ggml PUBLIC c_std_11) # don't bump

#
# Linking
Expand Down
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@ endif
# Compile flags
#

# keep standard at C11 and C++11
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++17 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =

# OS specific
Expand Down
45 changes: 24 additions & 21 deletions utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,8 @@ struct llama_sp_symbol {
using index = int;
index prev;
index next;
std::string_view text;
const char * text;
size_t n;
};

struct llama_sp_bigram {
Expand All @@ -267,19 +268,23 @@ struct llama_sp_bigram {
size_t size;
};

// original implementation:
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
struct llama_tokenizer {
llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}

void tokenize(std::string_view text, std::vector<llama_vocab::id> & output) {
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
// split string into utf8 chars
int index = 0;
while (!text.empty()) {
size_t offs = 0;
while (offs < text.size()) {
llama_sp_symbol sym;
size_t char_len = std::min(text.size(), utf8_len(text.data()[0]));
sym.text = std::string_view(text.data(), char_len);
size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
sym.text = text.c_str() + offs;
sym.n = char_len;
offs += char_len;
sym.prev = index - 1;
text.remove_prefix(char_len);
sym.next = text.empty() ? -1 : index + 1;
sym.next = offs == text.size() ? -1 : index + 1;
index++;
symbols_.emplace_back(std::move(sym));
}
Expand All @@ -298,14 +303,16 @@ struct llama_tokenizer {
auto & right_sym = symbols_[bigram.right];

// if one of the symbols already got merged, skip it.
if (left_sym.text.empty() || right_sym.text.empty() ||
left_sym.text.size() + right_sym.text.size() != bigram.size) {
if (left_sym.n == 0 || right_sym.n == 0 ||
left_sym.n + right_sym.n != bigram.size) {
continue;
}

// merge the right sym into the left one
left_sym.text = std::string_view(left_sym.text.data(), left_sym.text.size() + right_sym.text.size());
right_sym.text = std::string_view("");
left_sym.n += right_sym.n;
right_sym.n = 0;

//printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);

// remove the right sym from the chain
left_sym.next = right_sym.next;
Expand All @@ -319,12 +326,12 @@ struct llama_tokenizer {
}

for (int i = 0; i != -1; i = symbols_[i].next) {
auto& symbol = symbols_[i];
auto token = vocab_.token_to_id.find(std::string(symbol.text));
auto & symbol = symbols_[i];
auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n));

if (token == vocab_.token_to_id.end()) {
// output any symbols that did not form tokens as bytes.
for (int j = 0; j < symbol.text.size(); ++j) {
for (int j = 0; j < (int) symbol.n; ++j) {
llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
output.push_back(token_id);
}
Expand All @@ -340,8 +347,8 @@ struct llama_tokenizer {
return;
}

std::string_view text(symbols_[left].text.data(), symbols_[left].text.size() + symbols_[right].text.size());
auto token = vocab_.token_to_id.find(std::string(text));
const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n);
auto token = vocab_.token_to_id.find(text);

if (token == vocab_.token_to_id.end()) {
return;
Expand Down Expand Up @@ -399,16 +406,12 @@ bool llama_vocab_load(const std::string & fname, llama_vocab & vocab) {
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
vocab.score[i] = score;

//if (i < 30000) {
// fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
//}
}

return true;
}

std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, std::string_view text, bool bos) {
std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
llama_tokenizer tokenizer(vocab);
std::vector<llama_vocab::id> output;

Expand Down
2 changes: 1 addition & 1 deletion utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);

// TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
// ref: https://github.com/google/sentencepiece
std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, std::string_view text, bool bos);
std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos);

// sample next token given probabilities for each embedding
//
Expand Down