From bec14943e05bcb9fe83bc20464b00d467ad43475 Mon Sep 17 00:00:00 2001
From: Oliver-Y <31872294+Oliver-Y@users.noreply.github.com>
Date: Sun, 21 Jul 2024 22:05:26 -0700
Subject: [PATCH] Testing (#1)

* handle a Chinese word formed of 3 Chinese characters where the first 2 are not a word
* tokenizer fix
* E5 pre-tokenizer bugfix
* whitespace fix
* remove the extra WPM code paths

---------

Co-authored-by: Mike Fan <60965742+mike-fzy@users.noreply.github.com>
Co-authored-by: Oliver Ye
---
 convert_hf_to_gguf.py        |  12 +-
 convert_hf_to_gguf_update.py |   3 +-
 include/llama.h              |   1 +
 src/llama.cpp                | 226 ++++++++++----------------------------
 4 files changed, 67 insertions(+), 175 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index c6d28de5adc19c..076ffd040f1870 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -458,18 +458,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
             # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
             res = "refact"
-        if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
-            # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
-            res = "command-r"
         if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
             # ref: https://huggingface.co/Qwen/Qwen1.5-7B
             res = "qwen2"
         if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
             # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
             res = "olmo"
-        if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
-            # ref: https://huggingface.co/databricks/dbrx-base
-            res = "dbrx"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
             res = "jina-v2-en"
@@ -488,6 +482,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
+        if chkhsh == "a81863d07e75497e2194eb1a1574d5e5cd4d5f85a87a0728b922bf2bed6fb327":
+            # ref: https://huggingface.co/intfloat/multilingual-e5-base
+            res = "multilingual-e5-base"
         if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
             # ref: https://huggingface.co/THUDM/glm-4-9b-chat
             res = "chatglm-bpe"
@@ -2354,7 +2351,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_pooling_type(pooling_type)

     def set_vocab(self):
-        tokens, toktypes, tokpre = self.get_vocab_base('default')
+        tokens, toktypes, tokpre = self.get_vocab_base()
         self.vocab_size = len(tokens)

         self.gguf_writer.add_token_type_count(int(self.hparams['type_vocab_size']))
@@ -2364,6 +2361,7 @@ def set_vocab(self):
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_eos_token(True)

         # handle special tokens
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
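The chkhsh dispatch above exists because an SPM/BPE tokenizer cannot be identified reliably from its config files alone: convert_hf_to_gguf_update.py encodes a fixed probe string with each registered tokenizer and records a SHA-256 digest of the resulting token ids, and get_vocab_base_pre() matches against those digests. A minimal sketch of the idea, assuming the transformers package is installed; the probe text here is a placeholder for the update script's own multi-script test string:

    from hashlib import sha256
    from transformers import AutoTokenizer

    def tokenizer_checksum(model_id: str, probe_text: str) -> str:
        # two tokenizers that split the probe identically produce the same digest
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        ids = tokenizer.encode(probe_text)
        return sha256(str(ids).encode()).hexdigest()

    # with the update script's real probe text this should reproduce the
    # a81863d0... hash registered above for intfloat/multilingual-e5-base
    print(tokenizer_checksum("intfloat/multilingual-e5-base", "Hello world 你好"))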
"https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, + {"name": "multilingual-e5-base", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/intfloat/multilingual-e5-base", }, {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B {"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", }, {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", }, @@ -141,7 +142,7 @@ def download_model(model): name = model["name"] tokt = model["tokt"] - if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM: + if (tokt == TOKENIZER_TYPE.SPM and name != "multilingual-e5-base") or tokt == TOKENIZER_TYPE.UGM: continue # Skip if the tokenizer folder does not exist or there are other download issues previously diff --git a/include/llama.h b/include/llama.h index bb4b05ba636711..828ba67404a0e9 100644 --- a/include/llama.h +++ b/include/llama.h @@ -92,6 +92,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17, LLAMA_VOCAB_PRE_TYPE_VIKING = 18, LLAMA_VOCAB_PRE_TYPE_JAIS = 19, + LLAMA_VOCAB_PRE_TYPE_E5 = 20, }; // note: these values should be synchronized with ggml_rope diff --git a/src/llama.cpp b/src/llama.cpp index c973b82ce7494f..4e5a591978609f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -278,7 +278,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, { LLM_ARCH_JAIS, "jais" }, - { LLM_ARCH_XLMROBERTA, "xlm-roberta" }, + { LLM_ARCH_XLMROBERTA, "xlm-roberta" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -5462,7 +5462,12 @@ static void llm_load_vocab( throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; + if (tokenizer_pre == "multilingual-e5-base") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_E5; + } + else{ + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; + } vocab.tokenizer_add_space_prefix = true; vocab.tokenizer_clean_spaces = false; vocab.tokenizer_add_bos = true; @@ -15194,7 +15199,10 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { } // Try to fall back to just the byte as a string const char buf2[2] = { (char)ch, 0 }; - return vocab.token_to_id.at(buf2); + // printf("%hhdu\n",buf2[0]); + // printf("%hhd\n",buf2[1]); + + return vocab.token_to_id.find(buf2) != vocab.token_to_id.end() ? 
@@ -15679,34 +15687,59 @@ struct llm_tokenizer_bpe {
 };

 struct llm_tokenizer_wpm {
-    llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {
-        is_xlm_vocab = vocab.token_to_id.size() > 100000 &&
-                       vocab.token_to_id.find("数据") != vocab.token_to_id.end();
-    }
+    llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}

-    void tokenize(const std::string & text, std::vector<llama_token> & output) {
-        if (is_xlm_vocab) {
-            tokenize_xlm(text, output);
-        } else {
-            tokenize_default(text, output);
-        }
-    }
+    void tokenize(const std::string & text, std::vector<llama_token> & output) const {
+        const auto & token_map = vocab.token_to_id;

-    void tokenize_default(const std::string & text, std::vector<llama_token> & output) {
         // normalize and split by whitespace
-        std::vector<std::string> words = preprocess_default(text);
+        std::vector<std::string> words = preprocess(text);

         // bos token prepended already

         // find the longest tokens that form the words
-        for (const std::string &word : words) {
-            if (word.size() > 0) {
-                tokenize_word_default(word, output);
+        for (const std::string & word : words) {
+            // skip empty words
+            if (word.size() == 0) {
+                continue;
+            }
+
+            // prepend phantom space
+            const std::string word1 = "\xe2\x96\x81" + word;
+            const int n = word1.size();
+
+            const size_t current_tokens = output.size();
+
+            // we're at the start of a new word
+            // move through character position in word
+            for (int i = 0; i < n; ++i) {
+                // loop through possible match length
+                bool match = false;
+                for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
+                    auto it = token_map.find(word1.substr(i, j - i));
+                    if (it != token_map.end()) {
+                        output.push_back(it->second);
+                        match = true;
+                        i = j - 1;
+                        break;
+                    }
+                }
+
+                if (!match) { // discard all
+                    output.resize(current_tokens);
+                    break;  // and discard next tokens
+                }
+            }
+
+            // we didn't find any matches for this word
+            if (current_tokens == output.size()) {
+                output.push_back(vocab.special_unk_id);
             }
         }
     }

-    std::vector<std::string> preprocess_default(const std::string & text) {
+    // TODO: reduce string copies by using cpts_offs array
+    std::vector<std::string> preprocess(const std::string & text) const {
         const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
         std::vector<std::string> words(1, "");
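The rewritten tokenize() above is plain greedy longest-match WordPiece: prepend the U+2581 phantom space, then repeatedly take the longest vocabulary entry starting at the current position, with candidate length bounded by vocab.max_token_len; if any position fails to match, the partial tokens are discarded and the whole word collapses to UNK. A hedged Python sketch of the same loop, with token_to_id, max_token_len, and unk_id as stand-ins for the vocab fields:

    def wpm_tokenize_word(word: str, token_to_id: dict[str, int],
                          max_token_len: int, unk_id: int) -> list[int]:
        word1 = "\u2581" + word          # phantom-space prefix
        n = len(word1)
        out: list[int] = []
        i = 0
        while i < n:
            # try the longest candidate first, capped as in the C++ inner loop
            for j in range(min(n, i + max_token_len + 1), i, -1):
                tok = token_to_id.get(word1[i:j])
                if tok is not None:
                    out.append(tok)
                    i = j
                    break
            else:
                return [unk_id]          # no match at position i: discard the word
        return out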
@@ -15744,151 +15777,6 @@ struct llm_tokenizer_wpm {
         return words;
     }

-    void tokenize_xlm(const std::string & text, std::vector<llama_token> & output) {
-        auto cpts_word_2_str = [](const std::vector<uint32_t> & cpts_word) {
-            std::string word;
-            for (auto c : cpts_word) {
-                word += unicode_cpt_to_utf8(c);
-            }
-            return word;
-        };
-
-        auto is_english_char = [](uint32_t cpt) {
-            const auto flags = unicode_cpt_flags(cpt);
-            return !(cpt == 0 || cpt == 0xFFFD || flags.is_control || flags.is_punctuation ||
-                     (cpt < 0x7F && flags.is_symbol) || is_chinese_char(cpt));
-        };
-
-        const auto & token_map = vocab.token_to_id;
-
-        // normalize and split by whitespace
-        auto all_cpts_words = preprocess_xlm(text);
-
-        // bos token prepended already
-
-        // find the longest tokens that form the words
-        for (int i = 0; i < (int) all_cpts_words.size(); ++i) {
-            const auto & cpts_word = all_cpts_words[i];
-            // skip empty words
-            if (cpts_word.size() == 0) {
-                continue;
-            }
-
-            std::string word = cpts_word_2_str(cpts_word);
-            if (cpts_word.size() != 1 || (cpts_word.size() == 1 && is_english_char(cpts_word[0]))) {
-                tokenize_word_default(word, output);
-                continue;
-            }
-
-            auto it = token_map.find(word);
-            auto token_id = it != token_map.end() ? it->second : vocab.special_unk_id;
-            if (token_id == vocab.special_unk_id) {
-                output.push_back(token_id);
-                continue;
-            }
-
-            auto j = i + 1;
-            for (; j < (int) all_cpts_words.size(); j++) {
-                const auto & next_cpts_word = all_cpts_words[j];
-                if (next_cpts_word.size() != 1 || (next_cpts_word.size() == 1 && is_english_char(next_cpts_word[0]))) {
-                    break;
-                }
-
-                auto next_word = cpts_word_2_str(next_cpts_word);
-                it = token_map.find(word + next_word);
-                auto token_id_2 = it != token_map.end() ? it->second : vocab.special_unk_id;
-                if (token_id_2 == vocab.special_unk_id) {
-                    break;
-                }
-
-                token_id = token_id_2;
-                word += next_word;
-            }
-
-            output.push_back(token_id);
-            i = j - 1;
-        }
-    }
-
-    std::vector<std::vector<uint32_t>> preprocess_xlm(const std::string & text) {
-        std::vector<uint32_t> cpts_word;
-        std::vector<std::vector<uint32_t>> all_cpts_words;
-        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
-        for (const uint32_t cpt : cpts_nfd) {
-            const auto flags = unicode_cpt_flags(cpt);
-
-            if (flags.is_whitespace) {
-                if (!cpts_word.empty()) {
-                    all_cpts_words.emplace_back(cpts_word);
-                    cpts_word.clear();
-                }
-                continue;
-            }
-
-            assert(!flags.is_separator);
-            if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
-                if (!cpts_word.empty()) {
-                    all_cpts_words.emplace_back(cpts_word);
-                    cpts_word.clear();
-                }
-                continue;
-            }
-
-            if (flags.is_punctuation || (cpt < 0x7F && flags.is_symbol) || is_chinese_char(cpt)) {
-                if (!cpts_word.empty()) {
-                    all_cpts_words.emplace_back(cpts_word);
-                    cpts_word.clear();
-                }
-                all_cpts_words.emplace_back(std::vector<uint32_t>{cpt});
-            } else {
-                cpts_word.emplace_back(cpt);
-            }
-        }
-
-        if (!cpts_word.empty()) {
-            all_cpts_words.emplace_back(cpts_word);
-        }
-
-        return all_cpts_words;
-    }
-
-    void tokenize_word_default(const std::string & word, std::vector<llama_token> & output) {
-        const auto & token_map = vocab.token_to_id;
-
-        // prepend phantom space
-        const std::string word1 = "\xe2\x96\x81" + word;
-        const int n = word1.size();
-
-        const size_t current_tokens = output.size();
-
-        // we're at the start of a new word
-        // move through character position in word
-        for (int i = 0; i < n; ++i) {
-            // loop through possible match length
-            bool match = false;
-            for (int j = n; j > i; j--) {
-                auto it = token_map.find(word1.substr(i, j - i));
-                if (it != token_map.end()) {
-                    output.push_back(it->second);
-                    match = true;
-                    i = j - 1;
-                    break;
-                }
-            }
-
-            if (!match) { // discard all
-                output.resize(current_tokens);
-                break; // and discard next tokens
-            }
-        }
-
-        // we didn't find any matches for this word
-        if (current_tokens == output.size()) {
-            output.push_back(vocab.special_unk_id);
-        }
-    }
-
     static bool is_chinese_char(uint32_t cpt) {
         return
             (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
@@ -15903,8 +15791,7 @@ struct llm_tokenizer_wpm {
             //(cpt >= 0xFF00  && cpt <= 0xFFEF);
     }

-    bool is_xlm_vocab;
-    const llama_vocab & vocab;
+    const llama_vocab & vocab;
 };

 struct naive_trie {
@@ -16473,6 +16360,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
                         llm_tokenizer_spm tokenizer(vocab);
+                        // temporary workaround for the SPM pre-tokenizer: collapse whitespace runs to a single space
+                        if (vocab.type_pre == LLAMA_VOCAB_PRE_TYPE_E5) {
+                            std::regex ws_re("\\s+");
+                            raw_text = std::regex_replace(raw_text, ws_re, " ");
+                        }
                         llama_escape_whitespace(raw_text);
                         tokenizer.tokenize(raw_text, output);
                         is_prev_special = false;
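The regex workaround in the last hunk collapses runs of whitespace into a single space before llama_escape_whitespace(), so that the E5 path matches the Hugging Face tokenizer's normalization instead of emitting one U+2581 marker per whitespace character. A quick way to sanity-check that equivalence from Python, assuming transformers and sentencepiece are installed; whether the ids match exactly depends on the HF normalizer doing the same collapsing:

    import re
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-base")
    text = "query:  how \t many\n\npeople live there?"
    collapsed = re.sub(r"\s+", " ", text)  # mirrors std::regex_replace(raw_text, ws_re, " ")
    # expected True if the workaround reproduces the HF normalization
    print(tok.encode(text) == tok.encode(collapsed))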