From bec14943e05bcb9fe83bc20464b00d467ad43475 Mon Sep 17 00:00:00 2001
From: Oliver-Y <31872294+Oliver-Y@users.noreply.github.com>
Date: Sun, 21 Jul 2024 22:05:26 -0700
Subject: [PATCH] Testing (#1)

* handle a Chinese word formed of 3 Chinese characters where the first 2 are not a word
* tokenizer fix
* E5 pre-tokenizer bugfix
* whitespace fix
* remove the extra WPM code paths

---------

Co-authored-by: Mike Fan <60965742+mike-fzy@users.noreply.github.com>
Co-authored-by: Oliver Ye
---
 convert_hf_to_gguf.py        |  12 +-
 convert_hf_to_gguf_update.py |   3 +-
 include/llama.h              |   1 +
 src/llama.cpp                | 226 ++++++++++----------------------------
 4 files changed, 67 insertions(+), 175 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index c6d28de5adc19c..076ffd040f1870 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -458,18 +458,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
             # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
             res = "refact"
-        if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
-            # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
-            res = "command-r"
         if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
             # ref: https://huggingface.co/Qwen/Qwen1.5-7B
             res = "qwen2"
         if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
             # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
             res = "olmo"
-        if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
-            # ref: https://huggingface.co/databricks/dbrx-base
-            res = "dbrx"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
             res = "jina-v2-en"
@@ -488,6 +482,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
+        if chkhsh == "a81863d07e75497e2194eb1a1574d5e5cd4d5f85a87a0728b922bf2bed6fb327":
+            # ref: https://huggingface.co/intfloat/multilingual-e5-base
+            res = "multilingual-e5-base"
         if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
             # ref: https://huggingface.co/THUDM/glm-4-9b-chat
             res = "chatglm-bpe"
@@ -2354,7 +2351,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_pooling_type(pooling_type)

     def set_vocab(self):
-        tokens, toktypes, tokpre = self.get_vocab_base('default')
+        tokens, toktypes, tokpre = self.get_vocab_base()
         self.vocab_size = len(tokens)

         self.gguf_writer.add_token_type_count(int(self.hparams['type_vocab_size']))
@@ -2364,6 +2361,7 @@ def set_vocab(self):
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_eos_token(True)

         # handle special tokens
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
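The chkhsh dispatch above exists because an SPM/BPE tokenizer cannot be identified reliably from its config files alone: convert_hf_to_gguf_update.py encodes a fixed probe string with each registered tokenizer and records a SHA-256 digest of the resulting token ids, and get_vocab_base_pre() matches against those digests. A minimal sketch of the idea, assuming the transformers package is installed; the probe text here is a placeholder for the update script's own multi-script test string:

    from hashlib import sha256
    from transformers import AutoTokenizer

    def tokenizer_checksum(model_id: str, probe_text: str) -> str:
        # two tokenizers that split the probe identically produce the same digest
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        ids = tokenizer.encode(probe_text)
        return sha256(str(ids).encode()).hexdigest()

    # with the update script's real probe text this should reproduce the
    # a81863d0... hash registered above for intfloat/multilingual-e5-base
    print(tokenizer_checksum("intfloat/multilingual-e5-base", "Hello world 你好"))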
"https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, + {"name": "multilingual-e5-base", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/intfloat/multilingual-e5-base", }, {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B {"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", }, {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", }, @@ -141,7 +142,7 @@ def download_model(model): name = model["name"] tokt = model["tokt"] - if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM: + if (tokt == TOKENIZER_TYPE.SPM and name != "multilingual-e5-base") or tokt == TOKENIZER_TYPE.UGM: continue # Skip if the tokenizer folder does not exist or there are other download issues previously diff --git a/include/llama.h b/include/llama.h index bb4b05ba636711..828ba67404a0e9 100644 --- a/include/llama.h +++ b/include/llama.h @@ -92,6 +92,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17, LLAMA_VOCAB_PRE_TYPE_VIKING = 18, LLAMA_VOCAB_PRE_TYPE_JAIS = 19, + LLAMA_VOCAB_PRE_TYPE_E5 = 20, }; // note: these values should be synchronized with ggml_rope diff --git a/src/llama.cpp b/src/llama.cpp index c973b82ce7494f..4e5a591978609f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -278,7 +278,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, { LLM_ARCH_JAIS, "jais" }, - { LLM_ARCH_XLMROBERTA, "xlm-roberta" }, + { LLM_ARCH_XLMROBERTA, "xlm-roberta" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -5462,7 +5462,12 @@ static void llm_load_vocab( throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; + if (tokenizer_pre == "multilingual-e5-base") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_E5; + } + else{ + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; + } vocab.tokenizer_add_space_prefix = true; vocab.tokenizer_clean_spaces = false; vocab.tokenizer_add_bos = true; @@ -15194,7 +15199,10 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { } // Try to fall back to just the byte as a string const char buf2[2] = { (char)ch, 0 }; - return vocab.token_to_id.at(buf2); + // printf("%hhdu\n",buf2[0]); + // printf("%hhd\n",buf2[1]); + + return vocab.token_to_id.find(buf2) != vocab.token_to_id.end() ? 
@@ -15679,34 +15687,59 @@ struct llm_tokenizer_bpe {
 };

 struct llm_tokenizer_wpm {
-    llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {
-        is_xlm_vocab = vocab.token_to_id.size() > 100000 &&
-                       vocab.token_to_id.find("数据") != vocab.token_to_id.end();
-    }
+    llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}

-    void tokenize(const std::string & text, std::vector<llama_token> & output) {
-        if (is_xlm_vocab) {
-            tokenize_xlm(text, output);
-        } else {
-            tokenize_default(text, output);
-        }
-    }
+    void tokenize(const std::string & text, std::vector<llama_token> & output) const {
+        const auto & token_map = vocab.token_to_id;

-    void tokenize_default(const std::string & text, std::vector<llama_token> & output) {
         // normalize and split by whitespace
-        std::vector<std::string> words = preprocess_default(text);
+        std::vector<std::string> words = preprocess(text);

         // bos token prepended already

         // find the longest tokens that form the words
-        for (const std::string &word : words) {
-            if (word.size() > 0) {
-                tokenize_word_default(word, output);
+        for (const std::string & word : words) {
+            // skip empty words
+            if (word.size() == 0) {
+                continue;
+            }
+
+            // prepend phantom space
+            const std::string word1 = "\xe2\x96\x81" + word;
+            const int n = word1.size();
+
+            const size_t current_tokens = output.size();
+
+            // we're at the start of a new word
+            // move through character position in word
+            for (int i = 0; i < n; ++i) {
+                // loop through possible match length
+                bool match = false;
+                for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
+                    auto it = token_map.find(word1.substr(i, j - i));
+                    if (it != token_map.end()) {
+                        output.push_back(it->second);
+                        match = true;
+                        i = j - 1;
+                        break;
+                    }
+                }
+
+                if (!match) { // discard all
+                    output.resize(current_tokens);
+                    break;  // and discard next tokens
+                }
+            }
+
+            // we didn't find any matches for this word
+            if (current_tokens == output.size()) {
+                output.push_back(vocab.special_unk_id);
             }
         }
     }

-    std::vector<std::string> preprocess_default(const std::string & text) {
+    // TODO: reduce string copies by using cpts_offs array
+    std::vector<std::string> preprocess(const std::string & text) const {
         const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
         std::vector<std::string> words(1, "");
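The rewritten tokenize() above is plain greedy longest-match WordPiece: prepend the U+2581 phantom space, then repeatedly take the longest vocabulary entry starting at the current position, with candidate length bounded by vocab.max_token_len; if any position fails to match, the partial tokens are discarded and the whole word collapses to UNK. A hedged Python sketch of the same loop, with token_to_id, max_token_len, and unk_id as stand-ins for the vocab fields:

    def wpm_tokenize_word(word: str, token_to_id: dict[str, int],
                          max_token_len: int, unk_id: int) -> list[int]:
        word1 = "\u2581" + word          # phantom-space prefix
        n = len(word1)
        out: list[int] = []
        i = 0
        while i < n:
            # try the longest candidate first, capped as in the C++ inner loop
            for j in range(min(n, i + max_token_len + 1), i, -1):
                tok = token_to_id.get(word1[i:j])
                if tok is not None:
                    out.append(tok)
                    i = j
                    break
            else:
                return [unk_id]          # no match at position i: discard the word
        return out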
@@ -15744,151 +15777,6 @@ struct llm_tokenizer_wpm {
         return words;
     }

-    void tokenize_xlm(const std::string & text, std::vector<llama_token> & output) {
-        auto cpts_word_2_str = [](const std::vector<uint32_t> & cpts_word) {
-            std::string word;
-            for (auto c : cpts_word) {
-                word += unicode_cpt_to_utf8(c);
-            }
-            return word;
-        };
-
-        auto is_english_char = [](uint32_t cpt) {
-            const auto flags = unicode_cpt_flags(cpt);
-            return !(cpt == 0 || cpt == 0xFFFD || flags.is_control || flags.is_punctuation ||
-                     (cpt < 0x7F && flags.is_symbol) || is_chinese_char(cpt));
-        };
-
-        const auto & token_map = vocab.token_to_id;
-
-        // normalize and split by whitespace
-        auto all_cpts_words = preprocess_xlm(text);
-
-        // bos token prepended already
-
-        // find the longest tokens that form the words
-        for (int i = 0; i < (int) all_cpts_words.size(); ++i) {
-            const auto & cpts_word = all_cpts_words[i];
-            // skip empty words
-            if (cpts_word.size() == 0) {
-                continue;
-            }
-
-            std::string word = cpts_word_2_str(cpts_word);
-            if (cpts_word.size() != 1 || (cpts_word.size() == 1 && is_english_char(cpts_word[0]))) {
-                tokenize_word_default(word, output);
-                continue;
-            }
-
-            auto it = token_map.find(word);
-            auto token_id = it != token_map.end() ? it->second : vocab.special_unk_id;
-            if (token_id == vocab.special_unk_id) {
-                output.push_back(token_id);
-                continue;
-            }
-
-            auto j = i + 1;
-            for (; j < (int) all_cpts_words.size(); j++) {
-                const auto & next_cpts_word = all_cpts_words[j];
-                if (next_cpts_word.size() != 1 || (next_cpts_word.size() == 1 && is_english_char(next_cpts_word[0]))) {
-                    break;
-                }
-
-                auto next_word = cpts_word_2_str(next_cpts_word);
-                it = token_map.find(word + next_word);
-                auto token_id_2 = it != token_map.end() ? it->second : vocab.special_unk_id;
-                if (token_id_2 == vocab.special_unk_id) {
-                    break;
-                }
-
-                token_id = token_id_2;
-                word += next_word;
-            }
-
-            output.push_back(token_id);
-            i = j - 1;
-        }
-    }
-
-    std::vector<std::vector<uint32_t>> preprocess_xlm(const std::string & text) {
-        std::vector<uint32_t> cpts_word;
-        std::vector<std::vector<uint32_t>> all_cpts_words;
-        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
-        for (const uint32_t cpt : cpts_nfd) {
-            const auto flags = unicode_cpt_flags(cpt);
-
-            if (flags.is_whitespace) {
-                if (!cpts_word.empty()) {
-                    all_cpts_words.emplace_back(cpts_word);
-                    cpts_word.clear();
-                }
-                continue;
-            }
-
-            assert(!flags.is_separator);
-            if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
-                if (!cpts_word.empty()) {
-                    all_cpts_words.emplace_back(cpts_word);
-                    cpts_word.clear();
-                }
-                continue;
-            }
-
-            if (flags.is_punctuation || (cpt < 0x7F && flags.is_symbol) || is_chinese_char(cpt)) {
-                if (!cpts_word.empty()) {
-                    all_cpts_words.emplace_back(cpts_word);
-                    cpts_word.clear();
-                }
-                all_cpts_words.emplace_back(std::vector<uint32_t>{cpt});
-            } else {
-                cpts_word.emplace_back(cpt);
-            }
-        }
-
-        if (!cpts_word.empty()) {
-            all_cpts_words.emplace_back(cpts_word);
-        }
-
-        return all_cpts_words;
-    }
-
-    void tokenize_word_default(const std::string & word, std::vector<llama_token> & output) {
-        const auto & token_map = vocab.token_to_id;
-
-        // prepend phantom space
-        const std::string word1 = "\xe2\x96\x81" + word;
-        const int n = word1.size();
-
-        const size_t current_tokens = output.size();
-
-        // we're at the start of a new word
-        // move through character position in word
-        for (int i = 0; i < n; ++i) {
-            // loop through possible match length
-            bool match = false;
-            for (int j = n; j > i; j--) {
-                auto it = token_map.find(word1.substr(i, j - i));
-                if (it != token_map.end()) {
-                    output.push_back(it->second);
-                    match = true;
-                    i = j - 1;
-                    break;
-                }
-            }
-
-            if (!match) { // discard all
-                output.resize(current_tokens);
-                break; // and discard next tokens
-            }
-        }
-
-        // we didn't find any matches for this word
-        if (current_tokens == output.size()) {
-            output.push_back(vocab.special_unk_id);
-        }
-    }
-
     static bool is_chinese_char(uint32_t cpt) {
         return
             (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
@@ -15903,8 +15791,7 @@ struct llm_tokenizer_wpm {
             //(cpt >= 0xFF00  && cpt <= 0xFFEF);
     }

-    bool is_xlm_vocab;
-    const llama_vocab & vocab;
+    const llama_vocab & vocab;
 };

 struct naive_trie {
@@ -16473,6 +16360,11 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
                         llm_tokenizer_spm tokenizer(vocab);
+                        // temporary workaround for the SPM pre-tokenizer: collapse whitespace runs to a single space
+                        if (vocab.type_pre == LLAMA_VOCAB_PRE_TYPE_E5) {
+                            std::regex ws_re("\\s+");
+                            raw_text = std::regex_replace(raw_text, ws_re, " ");
+                        }
                         llama_escape_whitespace(raw_text);
                         tokenizer.tokenize(raw_text, output);
                         is_prev_special = false;
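The regex workaround in the last hunk collapses runs of whitespace into a single space before llama_escape_whitespace(), so that the E5 path matches the Hugging Face tokenizer's normalization instead of emitting one U+2581 marker per whitespace character. A quick way to sanity-check that equivalence from Python, assuming transformers and sentencepiece are installed; whether the ids match exactly depends on the HF normalizer doing the same collapsing:

    import re
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-base")
    text = "query:  how \t many\n\npeople live there?"
    collapsed = re.sub(r"\s+", " ", text)  # mirrors std::regex_replace(raw_text, ws_re, " ")
    # expected True if the workaround reproduces the HF normalization
    print(tok.encode(text) == tok.encode(collapsed))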