Fix spm whitespaces #2806

Merged (3 commits) on Aug 26, 2023
17 changes: 12 additions & 5 deletions examples/main/main.cpp
@@ -189,12 +189,19 @@ int main(int argc, char ** argv) {
}
}

const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
// Add BOS if SPM tokenizer
const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;

// tokenize the prompt
std::vector<llama_token> embd_inp;

if (llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM) {
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
Contributor:
Why do we insist on automatically adding whitespace to the user's prompt? Isn't it better to let the user decide? As a user, I can choose to prompt with -p "Blah blah" or -p " Blah blah". I know the LLM response will be different, and I can decide to do whatever I believe is better. As it stands, I have no way to prompt the model without a space at the beginning of my prompt, even if I know that would be better.

Collaborator (author):
You are right, but users in general have no clue that the SentencePiece tokenizer should have a space prepended to every word, including the first word in the prompt.

Collaborator:
> As it stands, I have no way to prompt the model without a space at the beginning of my prompt, even if I know that would be better.

Is there any model that llama.cpp supports where it would actually be better? (Worst comes to worst, a command-line argument could probably be added to disable that behavior, but there would have to be a practical use case.)

}

if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
} else {
embd_inp = session_tokens;
}
@@ -210,9 +217,9 @@ int main(int argc, char ** argv) {
int original_prompt_len = 0;
if (ctx_guidance) {
params.cfg_negative_prompt.insert(0, 1, ' ');
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm);
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);

std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
original_prompt_len = original_inp.size();
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
}
@@ -259,7 +266,7 @@ int main(int argc, char ** argv) {
}

// prefix & suffix for instruct mode
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", is_spm);
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);

// in instruct mode, we inject a prefix and a suffix to each input by the user
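The net effect of the main.cpp change: for SentencePiece (SPM) vocabularies, the example now both adds a BOS token and prepends a single space, so the first word of the prompt is tokenized the way the original LLaMA tokenizer expects. Below is a minimal standalone sketch of that logic; `prepare_prompt` and `is_spm_vocab` are illustrative names, while the real code checks `llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM` and edits `params.prompt` in place.

```cpp
#include <iostream>
#include <string>

// Sketch only: mirrors the decision made in examples/main, not the actual code.
static std::string prepare_prompt(std::string prompt, bool is_spm_vocab, bool & add_bos) {
    // BOS is only added for SentencePiece (SPM) vocabularies.
    add_bos = is_spm_vocab;
    if (is_spm_vocab) {
        // Prepend a space so the first word is tokenized the same way as by the
        // original LLaMA tokenizer, which marks word starts with "▁".
        prompt.insert(0, 1, ' ');
    }
    return prompt;
}

int main() {
    bool add_bos = false;
    const std::string prepared = prepare_prompt("Blah blah", /*is_spm_vocab=*/true, add_bos);
    std::cout << "add_bos=" << add_bos << ", prompt='" << prepared << "'\n";
    // prints: add_bos=1, prompt=' Blah blah'
    return 0;
}
```

As the review thread above notes, this also means the example currently offers no way to send an SPM prompt without the leading space.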
48 changes: 13 additions & 35 deletions llama.cpp
@@ -1635,7 +1635,7 @@ static void llm_load_hparams(
}

// TODO: This should probably be in llama.h
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape);
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos);

static void llm_load_vocab(
llama_model_loader & ml,
@@ -1737,7 +1737,7 @@ static void llm_load_vocab(
}

// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false, false)[0];
vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];

// special tokens
GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
@@ -3027,14 +3027,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
}

static std::string llama_escape_whitespace(const std::string& text) {
std::string result = "\xe2\x96\x81";
for (size_t offs = 0; offs < text.length(); ++offs) {
if (text[offs] == ' ') {
result += "\xe2\x96\x81";
} else {
result += text[offs];
}
}
std::string result = text;
replace_all(result, " ", "\xe2\x96\x81");
return result;
}
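For reference, here is a self-contained sketch of the simplified whitespace escaping above; `replace_all` is a local stand-in for the helper llama.cpp already provides.

```cpp
#include <iostream>
#include <string>

// Local stand-in for llama.cpp's replace_all helper.
static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

// Mirrors the simplified llama_escape_whitespace: every space becomes U+2581 "▁"
// (0xE2 0x96 0x81 in UTF-8), the marker SentencePiece uses for word boundaries.
// Unlike the old version, nothing is prepended unconditionally, which is why the
// callers now insert a literal leading space themselves.
static std::string escape_whitespace(std::string text) {
    replace_all(text, " ", "\xe2\x96\x81");
    return text;
}

int main() {
    std::cout << escape_whitespace(" Hello world") << "\n"; // "▁Hello▁world"
    std::cout << escape_whitespace("Hello world")  << "\n"; // "Hello▁world" -- no marker on the first word
    return 0;
}
```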

@@ -3219,7 +3213,7 @@ struct llm_bigram_bpe {
};

struct llm_tokenizer_bpe {
llm_tokenizer_bpe(const llama_vocab & vocab, bool g2ws): vocab(vocab) { flag_g2ws = g2ws; }
llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}

void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
int final_prev_index = -1;
@@ -3371,8 +3365,6 @@ struct llm_tokenizer_bpe {
return words;
}

bool flag_g2ws = false;

const llama_vocab & vocab;

std::vector<llm_symbol> symbols;
@@ -3381,39 +3373,26 @@
llm_bigram_bpe::queue work_queue;
};

static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos) {
std::vector<llama_vocab::id> output;

if (raw_text.empty()) {
return output;
}

if (bos && vocab.special_bos_id != -1) {
output.push_back(vocab.special_bos_id);
}

switch (vocab.type) {
case LLAMA_VOCAB_TYPE_SPM:
{
llm_tokenizer_spm tokenizer(vocab);

if (bos) {
output.push_back(vocab.special_bos_id);
}

std::string text;
if (escape) {
text = llama_escape_whitespace(raw_text);
} else {
text = raw_text;
}

tokenizer.tokenize(text, output);
tokenizer.tokenize(llama_escape_whitespace(raw_text), output);
} break;
case LLAMA_VOCAB_TYPE_BPE:
{
llm_tokenizer_bpe tokenizer(vocab, escape);

if (bos && vocab.special_bos_id != -1) {
output.push_back(vocab.special_bos_id);
}

llm_tokenizer_bpe tokenizer(vocab);
tokenizer.tokenize(raw_text, output);
} break;
};
@@ -6095,8 +6074,7 @@ int llama_tokenize_with_model(
llama_token * tokens,
int n_max_tokens,
bool add_bos) {
auto escape = llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM;
auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape);
auto res = llama_tokenize_internal(model->vocab, text, add_bos);

if (n_max_tokens < (int) res.size()) {
LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
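Taken together, the llama.cpp changes leave a simpler tokenizer entry point: the `escape` parameter is gone (SPM input is always whitespace-escaped), and the BOS token is pushed once, before the per-vocabulary switch, rather than separately in the SPM and BPE branches. Below is a hedged outline of that control flow with the actual tokenizers stubbed out and types simplified; it illustrates the structure, and is not the real llama.cpp code.

```cpp
#include <iostream>
#include <string>
#include <vector>

enum class vocab_type { spm, bpe };
using token = int;

// Same whitespace escaping as in the sketch above (space -> U+2581 "▁").
static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}
static std::string escape_whitespace(std::string text) {
    replace_all(text, " ", "\xe2\x96\x81");
    return text;
}

// The real SPM/BPE tokenizers are stubbed out; only the surrounding flow matters here.
static void spm_tokenize(const std::string &, std::vector<token> &) { /* stub */ }
static void bpe_tokenize(const std::string &, std::vector<token> &) { /* stub */ }

static std::vector<token> tokenize(vocab_type type, token bos_id, const std::string & raw_text, bool bos) {
    std::vector<token> output;
    if (raw_text.empty()) {
        return output;
    }
    // One shared BOS path for both vocabularies (bos_id == -1 means "no BOS token").
    if (bos && bos_id != -1) {
        output.push_back(bos_id);
    }
    switch (type) {
        case vocab_type::spm:
            // SPM input is always escaped now; the old per-call 'escape' flag is gone.
            spm_tokenize(escape_whitespace(raw_text), output);
            break;
        case vocab_type::bpe:
            bpe_tokenize(raw_text, output);
            break;
    }
    return output;
}

int main() {
    const auto toks = tokenize(vocab_type::spm, /*bos_id=*/1, " Hello world", /*bos=*/true);
    std::cout << "tokens so far: " << toks.size() << " (just BOS, since the tokenizers are stubs)\n";
    return 0;
}
```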
3 changes: 2 additions & 1 deletion tests/test-tokenizer-0.cpp
@@ -100,7 +100,8 @@ int main(int argc, char **argv) {
bool success = true;

for (const auto & test_kv : k_tests()) {
std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
// Add a space in front of the first character to match OG llama tokenizer behavior
std::vector<llama_token> res = llama_tokenize(ctx, " " + test_kv.first, true);
fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
__func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
