fix: crash on bge-m3 embedding model #8883

Open · wants to merge 3 commits into master

8 changes: 7 additions & 1 deletion src/llama-vocab.cpp
@@ -279,7 +279,13 @@ struct llm_tokenizer_spm {
         // output any symbols that did not form tokens as bytes.
         output.reserve(output.size() + symbol.n);
         for (int j = 0; j < (int)symbol.n; ++j) {
-            llama_vocab::id token_id = llama_byte_to_token_impl(vocab, symbol.text[j]);
+            llama_vocab::id token_id;
+            try {
+                token_id = llama_byte_to_token_impl(vocab, symbol.text[j]);
+            } catch(const std::exception & e) {
+                // not found, use UNK token instead.
+                token_id = vocab.special_unk_id;
+            }
Comment on lines +282 to +288

Member:

I'm unsure about this change - if this happened, wouldn't it imply a problem with the model / tokenizer? Seems better to find and fix the root of the problem instead of hiding it.

@thxCode (Contributor, Author) commented on Aug 6, 2024:

Got it. This fix is inspired by https://github.com/ggerganov/llama.cpp/blame/1e6f6554aa11fa10160a5fda689e736c3c34169f/src/llama.cpp#L5560-L5565; maybe my understanding is not correct.

@thxCode (Contributor, Author) commented:

@ggerganov Should I close this PR if the last commit is not a reasonable change? Thanks.

             output.push_back(token_id);
         }
         return;
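
For context on the change above: in the SPM tokenizer's byte-fallback path, any symbol that did not form a token is emitted as raw bytes, and `llama_byte_to_token_impl` throws when the vocabulary has no matching byte token, which appears to be one way a model like bge-m3 can fail during tokenization. The hunk catches that exception and substitutes the UNK token. The following is a minimal self-contained sketch of the same fallback pattern; `byte_to_token`, `bytes_to_tokens`, and the `unk_id` parameter are hypothetical stand-ins, not the actual llama.cpp symbols.

```cpp
// Sketch of the fallback pattern (hypothetical helpers, not llama.cpp code):
// a byte-to-token lookup that throws on a missing byte token, and a caller
// that catches the exception and substitutes a caller-provided UNK id.
#include <cstdint>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

using token_id = int32_t;

token_id byte_to_token(const std::unordered_map<uint8_t, token_id> & byte_tokens, uint8_t byte) {
    auto it = byte_tokens.find(byte);
    if (it == byte_tokens.end()) {
        throw std::runtime_error("byte token not found in vocabulary");
    }
    return it->second;
}

std::vector<token_id> bytes_to_tokens(const std::unordered_map<uint8_t, token_id> & byte_tokens,
                                      const std::string & text, token_id unk_id) {
    std::vector<token_id> out;
    out.reserve(text.size());
    for (unsigned char c : text) {
        token_id id;
        try {
            id = byte_to_token(byte_tokens, c);
        } catch (const std::exception &) {
            id = unk_id; // vocabulary has no entry for this byte; fall back to UNK
        }
        out.push_back(id);
    }
    return out;
}
```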
162 changes: 81 additions & 81 deletions src/llama.cpp
@@ -5517,6 +5517,86 @@ static void llm_load_vocab(
     }
     GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());

+    // special tokens
+    {
+        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
+            { LLM_KV_TOKENIZER_BOS_ID,    vocab.special_bos_id    },
+            { LLM_KV_TOKENIZER_EOS_ID,    vocab.special_eos_id    },
+            { LLM_KV_TOKENIZER_UNK_ID,    vocab.special_unk_id    },
+            { LLM_KV_TOKENIZER_SEP_ID,    vocab.special_sep_id    },
+            { LLM_KV_TOKENIZER_PAD_ID,    vocab.special_pad_id    },
+            { LLM_KV_TOKENIZER_CLS_ID,    vocab.special_cls_id    },
+            { LLM_KV_TOKENIZER_MASK_ID,   vocab.special_mask_id   },
+            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
+            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
+            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
+            { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id    },
+            { LLM_KV_TOKENIZER_EOM_ID,    vocab.special_eom_id    },
+        };
+
+        for (const auto & it : special_token_types) {
+            const std::string & key = kv(std::get<0>(it));
+            int32_t & id = std::get<1>(it);
+
+            uint32_t new_id;
+            if (!ml.get_key(std::get<0>(it), new_id, false)) {
+                continue;
+            }
+            if (new_id >= vocab.id_to_token.size()) {
+                LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
+                    __func__, key.c_str(), new_id, id);
+            } else {
+                id = new_id;
+            }
+        }
+
+        // Handle add_bos_token and add_eos_token
+        {
+            bool temp = true;
+
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
+                vocab.tokenizer_add_bos = temp;
+            }
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
+                vocab.tokenizer_add_eos = temp;
+            }
+        }
+
+        // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+        //
+        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
+        //       for now, we apply this workaround to find the EOT token based on its text
+        if (vocab.special_eot_id == -1) {
+            for (const auto & t : vocab.token_to_id) {
+                if (
+                    // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
+                    //       need to fix convert script
+                    //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
+                    (t.first == "<|eot_id|>" ||
+                     t.first == "<|im_end|>" ||
+                     t.first == "<|end|>" ||
+                     t.first == "<end_of_turn>" ||
+                     t.first == "<|endoftext|>"
+                    )
+                ) {
+                    vocab.special_eot_id = t.second;
+                    break;
+                }
+            }
+        }
+
+        // find EOM token: "<|eom_id|>"
+        //
+        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
+        //       for now, we apply this workaround to find the EOM token based on its text
+        if (vocab.special_eom_id == -1) {
+            const auto & t = vocab.token_to_id.find("<|eom_id|>");
+            if (t != vocab.token_to_id.end()) {
+                vocab.special_eom_id = t->second;
+            }
+        }
+    }
+
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
         // For Fill-In-the-Middle (FIM)/infill models which where converted
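
The block added above is not new logic; it is the existing special-token loading code moved earlier in `llm_load_vocab` (the next hunk removes it from its old position), presumably so that ids such as `vocab.special_unk_id` are already populated when the newline-token lookup tokenizes text and, with the llama-vocab.cpp change, can fall back to UNK. As a standalone illustration of the validation it applies per special token, here is a hedged sketch; the `apply_special_token_id` helper and its `std::optional` interface are hypothetical, not the llama.cpp model-loader API.

```cpp
// Sketch (hypothetical helper, not the llama.cpp loader API): accept a
// special-token id from model metadata only if it indexes into the
// vocabulary; otherwise warn and keep the existing default (often -1).
#include <cstdint>
#include <cstdio>
#include <optional>
#include <string>

void apply_special_token_id(const std::string & key,
                            std::optional<uint32_t> from_metadata,
                            size_t n_vocab,
                            int32_t & id /* in: default, out: possibly updated */) {
    if (!from_metadata) {
        return; // key absent in the metadata: keep the default
    }
    if (*from_metadata >= n_vocab) {
        std::fprintf(stderr, "bad special token: '%s' = %u, using default id %d\n",
                     key.c_str(), (unsigned) *from_metadata, id);
        return;
    }
    id = (int32_t) *from_metadata;
}
```

Incidentally, the warning in the real code uses the format specifier `%ud`, which prints the unsigned value followed by a literal `d`; the sketch uses plain `%u`.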
@@ -5571,86 +5651,6 @@ static void llm_load_vocab(
         vocab.linefeed_id = ids[0];
     }

-    // special tokens
-    {
-        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
-            { LLM_KV_TOKENIZER_BOS_ID,    vocab.special_bos_id    },
-            { LLM_KV_TOKENIZER_EOS_ID,    vocab.special_eos_id    },
-            { LLM_KV_TOKENIZER_UNK_ID,    vocab.special_unk_id    },
-            { LLM_KV_TOKENIZER_SEP_ID,    vocab.special_sep_id    },
-            { LLM_KV_TOKENIZER_PAD_ID,    vocab.special_pad_id    },
-            { LLM_KV_TOKENIZER_CLS_ID,    vocab.special_cls_id    },
-            { LLM_KV_TOKENIZER_MASK_ID,   vocab.special_mask_id   },
-            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
-            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
-            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
-            { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id    },
-            { LLM_KV_TOKENIZER_EOM_ID,    vocab.special_eom_id    },
-        };
-
-        for (const auto & it : special_token_types) {
-            const std::string & key = kv(std::get<0>(it));
-            int32_t & id = std::get<1>(it);
-
-            uint32_t new_id;
-            if (!ml.get_key(std::get<0>(it), new_id, false)) {
-                continue;
-            }
-            if (new_id >= vocab.id_to_token.size()) {
-                LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
-                    __func__, key.c_str(), new_id, id);
-            } else {
-                id = new_id;
-            }
-        }
-
-        // Handle add_bos_token and add_eos_token
-        {
-            bool temp = true;
-
-            if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
-                vocab.tokenizer_add_bos = temp;
-            }
-            if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
-                vocab.tokenizer_add_eos = temp;
-            }
-        }
-
-        // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
-        //
-        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
-        //       for now, we apply this workaround to find the EOT token based on its text
-        if (vocab.special_eot_id == -1) {
-            for (const auto & t : vocab.token_to_id) {
-                if (
-                    // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
-                    //       need to fix convert script
-                    //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
-                    (t.first == "<|eot_id|>" ||
-                     t.first == "<|im_end|>" ||
-                     t.first == "<|end|>" ||
-                     t.first == "<end_of_turn>" ||
-                     t.first == "<|endoftext|>"
-                    )
-                ) {
-                    vocab.special_eot_id = t.second;
-                    break;
-                }
-            }
-        }
-
-        // find EOM token: "<|eom_id|>"
-        //
-        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
-        //       for now, we apply this workaround to find the EOM token based on its text
-        if (vocab.special_eom_id == -1) {
-            const auto & t = vocab.token_to_id.find("<|eom_id|>");
-            if (t != vocab.token_to_id.end()) {
-                vocab.special_eom_id = t->second;
-            }
-        }
-    }
-
     // build special tokens cache
     {
         for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
@@ -16524,7 +16524,7 @@ struct llama_context * llama_new_context_with_model(
         cparams.n_batch = GGML_KQ_MASK_PAD;
     }

-    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
+    cparams.n_ubatch = hparams.causal_attn ? std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch) : cparams.n_batch;

     cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
                               hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
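
This one-line change appears to be the part that most directly addresses the reported crash: bge-m3 is an embedding model without causal attention, and for non-causal models the whole logical batch must be processed as a single micro-batch, so `n_ubatch` is pinned to `n_batch` instead of a possibly smaller user-supplied value. A hedged usage sketch follows, assuming the `llama.h` context API of this period; field names may differ in other versions.

```cpp
// Sketch: creating a context for a non-causal embedding model such as bge-m3.
// With the change above, n_ubatch is raised to n_batch internally for
// non-causal models; setting the two equal up front has the same effect.
#include "llama.h"

llama_context * make_embedding_context(llama_model * model, uint32_t n_batch) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_batch    = n_batch;  // logical batch size
    cparams.n_ubatch   = n_batch;  // non-causal attention needs the whole batch in one micro-batch
    cparams.embeddings = true;     // request embedding output
    return llama_new_context_with_model(model, cparams);
}
```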