Skip to content

Commit 889bdd7

Browse files
command-r : add BPE pre-tokenization (#7063)
* Add BPE pre-tokenization for Command-R/R+.
* Bump transformers convert requirement.
* command-r : add individual digits regex

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent 6fbd432 commit 889bdd7

9 files changed

+168
-1
lines changed

convert-hf-to-gguf-update.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ class TOKENIZER_TYPE(IntEnum):
6666
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
6767
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
6868
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
69+
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
6970
]
7071

7172
# make directory "models/tokenizers" if it doesn't exist
@@ -106,6 +107,14 @@ def download_file_with_auth(url, token, save_path):
106107
save_path = f"models/tokenizers/{name}/tokenizer.json"
107108
download_file_with_auth(url, token, save_path)
108109

110+
# if downloaded file is less than 1KB, we likely need to download an LFS instead
111+
if os.path.getsize(save_path) < 1024:
112+
# remove the file
113+
os.remove(save_path)
114+
url = f"{repo}/resolve/main/tokenizer.json"
115+
save_path = f"models/tokenizers/{name}/tokenizer.json"
116+
download_file_with_auth(url, token, save_path)
117+
109118
if tokt == TOKENIZER_TYPE.SPM:
110119
url = f"{repo}/resolve/main/tokenizer.model"
111120
save_path = f"models/tokenizers/{name}/tokenizer.model"

convert-hf-to-gguf.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
311311
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
312312
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
313313
res = "refact"
314+
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
315+
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
316+
res = "command-r"
314317

315318
if res is None:
316319
logger.warning("\n")

llama.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4386,6 +4386,9 @@ static void llm_load_vocab(
43864386
} else if (
43874387
tokenizer_pre == "refact") {
43884388
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
4389+
} else if (
4390+
tokenizer_pre == "command-r") {
4391+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
43894392
} else {
43904393
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
43914394
}
@@ -12238,6 +12241,7 @@ struct llm_tokenizer_bpe {
1223812241
break;
1223912242
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
1224012243
case LLAMA_VOCAB_PRE_TYPE_REFACT:
12244+
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
1224112245
word_collection = unicode_regex_split(text, {
1224212246
"\\p{N}",
1224312247
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",

llama.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ extern "C" {
8080
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
8181
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
8282
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
83+
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
8384
};
8485

8586
// note: these values should be synchronized with ggml_rope

models/ggml-vocab-command-r.gguf

10.4 MB
Binary file not shown.

models/ggml-vocab-command-r.gguf.inp

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
5+
6+
__ggml_vocab_test__
7+
8+
__ggml_vocab_test__
9+
10+
__ggml_vocab_test__
11+
12+
__ggml_vocab_test__
13+
14+
__ggml_vocab_test__
15+
16+
17+
__ggml_vocab_test__
18+
19+
20+
21+
__ggml_vocab_test__
22+
23+
24+
25+
26+
__ggml_vocab_test__
27+
28+
29+
__ggml_vocab_test__
30+
Hello world
31+
__ggml_vocab_test__
32+
Hello world
33+
__ggml_vocab_test__
34+
Hello World
35+
__ggml_vocab_test__
36+
Hello World
37+
__ggml_vocab_test__
38+
Hello World!
39+
__ggml_vocab_test__
40+
Hello, world!
41+
__ggml_vocab_test__
42+
Hello, world!
43+
__ggml_vocab_test__
44+
this is 🦙.cpp
45+
__ggml_vocab_test__
46+
w048 7tuijk dsdfhu
47+
__ggml_vocab_test__
48+
нещо на Български
49+
__ggml_vocab_test__
50+
កាន់តែពិសេសអាចខលចេញ
51+
__ggml_vocab_test__
52+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
53+
__ggml_vocab_test__
54+
Hello
55+
__ggml_vocab_test__
56+
Hello
57+
__ggml_vocab_test__
58+
Hello
59+
__ggml_vocab_test__
60+
Hello
61+
__ggml_vocab_test__
62+
Hello
63+
__ggml_vocab_test__
64+
Hello
65+
Hello
66+
__ggml_vocab_test__
67+
(
68+
__ggml_vocab_test__
69+
70+
=
71+
__ggml_vocab_test__
72+
' era
73+
__ggml_vocab_test__
74+
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
75+
__ggml_vocab_test__
76+
3
77+
__ggml_vocab_test__
78+
33
79+
__ggml_vocab_test__
80+
333
81+
__ggml_vocab_test__
82+
3333
83+
__ggml_vocab_test__
84+
33333
85+
__ggml_vocab_test__
86+
333333
87+
__ggml_vocab_test__
88+
3333333
89+
__ggml_vocab_test__
90+
33333333
91+
__ggml_vocab_test__
92+
333333333
93+
__ggml_vocab_test__
94+
95+
96+
97+
98+
99+
100+
101+
102+
103+
104+
105+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
106+
__ggml_vocab_test__

models/ggml-vocab-command-r.gguf.out

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
2536 228 27 228 22957 6983
2+
45 193433
3+
4+
228
5+
1667
6+
1742
7+
205
8+
206
9+
2126
10+
11516
11+
34777
12+
28339 3845
13+
46609 3845
14+
28339 3930
15+
46609 3930
16+
46609 3930 8
17+
28339 19 3845 8
18+
46609 19 3845 8
19+
2075 1801 11254 107 255 21 19317
20+
94 23 27 31 228 30 21213 20752 39267 6405 9980
21+
4929 40071 2196 3236 8750 1764 37097 41168
22+
38111 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 38111 231 38111 257 38111 235 165 24629 38111 239
23+
2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 1737 10203 109160 1875 2222 2517 3342 12523 16
24+
28339
25+
46609
26+
228 46609
27+
1667 46609
28+
1742 46609
29+
1742 46609 1856 46609
30+
1737
31+
206 1857
32+
14 4515
33+
28339 19 1770 14 1954 8 4070 1955 1933 80503 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372
34+
26
35+
26 26
36+
26 26 26
37+
26 26 26 26
38+
26 26 26 26 26
39+
26 26 26 26 26 26
40+
26 26 26 26 26 26 26
41+
26 26 26 26 26 26 26 26
42+
26 26 26 26 26 26 26 26 26
43+
127731 51628 205 57788 18494 97469 126134 206 2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 11254 107 255 2226 107 255 228 26 228 26 26 228 26 26 26 228 26 26 26 26 228 26 26 26 26 26 228 26 26 26 26 26 26 228 26 26 26 26 26 26 26 228 26 26 26 26 26 26 26 26 228 26 21 26 228 26 2271 26 228 26 3834 26 182018 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 188568 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372 8391 158343 3512 40071 2196 3236 8750 1764 37097 41168 29721 32797 25646 3802 4975 4975 116167 57178 10251 154048 27292 1767 5125 2632 2155 91 2378 1919 1914 2782 19 2155 3354 1933 5470 38 2155 52 2068 5470 1767 4961 3059 1894 19 2155 43 1933 3026 2725 23186 38 2930 14 20676 1671 14 83 51

requirements/requirements-convert.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
numpy~=1.24.4
22
sentencepiece~=0.1.98
3-
transformers>=4.35.2,<5.0.0
3+
transformers>=4.40.1,<5.0.0
44
gguf>=0.1.0
55
protobuf>=4.21.0,<5.0.0

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE
8383
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
8484
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
8585
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
86+
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
8687

8788
# build test-tokenizer-1-bpe target once and add many tests
8889
add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)

0 commit comments

Comments
 (0)