@@ -49,7 +49,7 @@ class TOKENIZER_TYPE(IntEnum):
49
49
50
50
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
51
51
# will be updated with time - contributions welcome
52
- chktxt = " \n \n \n \n \n \n \t \t \t \t \n \n \n \n \n 🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''' '```````\" \" \" \" ......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL"
52
+ chktxt = ' \n \n \n \n \n \n \t \t \t \t \n \n \n \n \n 🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \' \' \' \' \' \ ' ```````\" \" \" \" ......!!!!!!?????? I\ ' ve been \ ' told he\ ' s there, \ ' RE you sure? \ ' M not sure I\ ' ll make it, \ ' D you like some tea? We\ ' Ve a\ ' lL'
53
53
54
54
if len (sys .argv ) == 2 :
55
55
token = sys .argv [1 ]
@@ -63,121 +63,29 @@ class TOKENIZER_TYPE(IntEnum):
63
63
64
64
# TODO: add models here, base models preferred
65
65
models = [
66
- {
67
- "name" : "llama-spm" ,
68
- "tokt" : TOKENIZER_TYPE .SPM ,
69
- "repo" : "https://huggingface.co/meta-llama/Llama-2-7b-hf" ,
70
- },
71
- {
72
- "name" : "llama-bpe" ,
73
- "tokt" : TOKENIZER_TYPE .BPE ,
74
- "repo" : "https://huggingface.co/meta-llama/Meta-Llama-3-8B" ,
75
- },
76
- {
77
- "name" : "phi-3" ,
78
- "tokt" : TOKENIZER_TYPE .SPM ,
79
- "repo" : "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct" ,
80
- },
81
- {
82
- "name" : "deepseek-llm" ,
83
- "tokt" : TOKENIZER_TYPE .BPE ,
84
- "repo" : "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base" ,
85
- },
86
- {
87
- "name" : "deepseek-coder" ,
88
- "tokt" : TOKENIZER_TYPE .BPE ,
89
- "repo" : "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base" ,
90
- },
91
- {
92
- "name" : "falcon" ,
93
- "tokt" : TOKENIZER_TYPE .BPE ,
94
- "repo" : "https://huggingface.co/tiiuae/falcon-7b" ,
95
- },
96
- {
97
- "name" : "bert-bge" ,
98
- "tokt" : TOKENIZER_TYPE .WPM ,
99
- "repo" : "https://huggingface.co/BAAI/bge-small-en-v1.5" ,
100
- },
101
- {
102
- "name" : "mpt" ,
103
- "tokt" : TOKENIZER_TYPE .BPE ,
104
- "repo" : "https://huggingface.co/mosaicml/mpt-7b" ,
105
- },
106
- {
107
- "name" : "starcoder" ,
108
- "tokt" : TOKENIZER_TYPE .BPE ,
109
- "repo" : "https://huggingface.co/bigcode/starcoder2-3b" ,
110
- },
111
- {
112
- "name" : "gpt-2" ,
113
- "tokt" : TOKENIZER_TYPE .BPE ,
114
- "repo" : "https://huggingface.co/openai-community/gpt2" ,
115
- },
116
- {
117
- "name" : "stablelm2" ,
118
- "tokt" : TOKENIZER_TYPE .BPE ,
119
- "repo" : "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b" ,
120
- },
121
- {
122
- "name" : "refact" ,
123
- "tokt" : TOKENIZER_TYPE .BPE ,
124
- "repo" : "https://huggingface.co/smallcloudai/Refact-1_6-base" ,
125
- },
126
- {
127
- "name" : "command-r" ,
128
- "tokt" : TOKENIZER_TYPE .BPE ,
129
- "repo" : "https://huggingface.co/CohereForAI/c4ai-command-r-v01" ,
130
- },
131
- {
132
- "name" : "qwen2" ,
133
- "tokt" : TOKENIZER_TYPE .BPE ,
134
- "repo" : "https://huggingface.co/Qwen/Qwen1.5-7B" ,
135
- },
136
- {
137
- "name" : "olmo" ,
138
- "tokt" : TOKENIZER_TYPE .BPE ,
139
- "repo" : "https://huggingface.co/allenai/OLMo-1.7-7B-hf" ,
140
- },
141
- {
142
- "name" : "dbrx" ,
143
- "tokt" : TOKENIZER_TYPE .BPE ,
144
- "repo" : "https://huggingface.co/databricks/dbrx-base" ,
145
- },
146
- {
147
- "name" : "jina-v2-en" ,
148
- "tokt" : TOKENIZER_TYPE .WPM ,
149
- "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-en" ,
150
- }, # WPM!
151
- {
152
- "name" : "jina-v2-es" ,
153
- "tokt" : TOKENIZER_TYPE .BPE ,
154
- "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-es" ,
155
- },
156
- {
157
- "name" : "jina-v2-de" ,
158
- "tokt" : TOKENIZER_TYPE .BPE ,
159
- "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-de" ,
160
- },
161
- {
162
- "name" : "smaug-bpe" ,
163
- "tokt" : TOKENIZER_TYPE .BPE ,
164
- "repo" : "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct" ,
165
- },
166
- {
167
- "name" : "poro-chat" ,
168
- "tokt" : TOKENIZER_TYPE .BPE ,
169
- "repo" : "https://huggingface.co/LumiOpen/Poro-34B-chat" ,
170
- },
171
- {
172
- "name" : "jina-v2-code" ,
173
- "tokt" : TOKENIZER_TYPE .BPE ,
174
- "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-code" ,
175
- },
176
- {
177
- "name" : "viking" ,
178
- "tokt" : TOKENIZER_TYPE .BPE ,
179
- "repo" : "https://huggingface.co/LumiOpen/Viking-7B" ,
180
- }, # Also used for Viking 13B and 33B
66
+ {"name" : "llama-spm" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/meta-llama/Llama-2-7b-hf" , },
67
+ {"name" : "llama-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/meta-llama/Meta-Llama-3-8B" , },
68
+ {"name" : "phi-3" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct" , },
69
+ {"name" : "deepseek-llm" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base" , },
70
+ {"name" : "deepseek-coder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base" , },
71
+ {"name" : "falcon" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/falcon-7b" , },
72
+ {"name" : "bert-bge" , "tokt" : TOKENIZER_TYPE .WPM , "repo" : "https://huggingface.co/BAAI/bge-small-en-v1.5" , },
73
+ {"name" : "mpt" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/mosaicml/mpt-7b" , },
74
+ {"name" : "starcoder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/bigcode/starcoder2-3b" , },
75
+ {"name" : "gpt-2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/openai-community/gpt2" , },
76
+ {"name" : "stablelm2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b" , },
77
+ {"name" : "refact" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/smallcloudai/Refact-1_6-base" , },
78
+ {"name" : "command-r" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/CohereForAI/c4ai-command-r-v01" , },
79
+ {"name" : "qwen2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/Qwen/Qwen1.5-7B" , },
80
+ {"name" : "olmo" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/allenai/OLMo-1.7-7B-hf" , },
81
+ {"name" : "dbrx" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/databricks/dbrx-base" , },
82
+ {"name" : "jina-v2-en" , "tokt" : TOKENIZER_TYPE .WPM , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-en" , }, # WPM!
83
+ {"name" : "jina-v2-es" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-es" , },
84
+ {"name" : "jina-v2-de" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-de" , },
85
+ {"name" : "smaug-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct" , },
86
+ {"name" : "poro-chat" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LumiOpen/Poro-34B-chat" , },
87
+ {"name" : "jina-v2-code" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-code" , },
88
+ {"name" : "viking" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LumiOpen/Viking-7B" , }, # Also used for Viking 13B and 33B
181
89
]
182
90
183
91
@@ -186,7 +94,7 @@ def download_file_with_auth(url, token, save_path):
186
94
response = sess .get (url , headers = headers )
187
95
response .raise_for_status ()
188
96
os .makedirs (os .path .dirname (save_path ), exist_ok = True )
189
- with open (save_path , "wb" ) as f :
97
+ with open (save_path , 'wb' ) as f :
190
98
f .write (response .content )
191
99
logger .info (f"File { save_path } downloaded successfully" )
192
100
@@ -236,9 +144,7 @@ def download_model(model):
236
144
try :
237
145
tokenizer = AutoTokenizer .from_pretrained (f"models/tokenizers/{ name } " )
238
146
except OSError as e :
239
- logger .error (
240
- f"Error loading tokenizer for model { name } . The model may not exist or is not accessible with the provided token. Error: { e } "
241
- )
147
+ logger .error (f"Error loading tokenizer for model { name } . The model may not exist or is not accessible with the provided token. Error: { e } " )
242
148
continue # Skip to the next model if the tokenizer can't be loaded
243
149
244
150
chktok = tokenizer .encode (chktxt )
@@ -258,15 +164,13 @@ def download_model(model):
258
164
pre_tokenizer = cfg ["pre_tokenizer" ]
259
165
logger .info ("pre_tokenizer: " + json .dumps (pre_tokenizer , indent = 4 ))
260
166
if "ignore_merges" in cfg ["model" ]:
261
- logger .info (
262
- "ignore_merges: " + json .dumps (cfg ["model" ]["ignore_merges" ], indent = 4 )
263
- )
167
+ logger .info ("ignore_merges: " + json .dumps (cfg ["model" ]["ignore_merges" ], indent = 4 ))
264
168
265
169
logger .info ("" )
266
170
267
- src_ifs += f' if chkhsh == "{ chkhsh } ":\n '
171
+ src_ifs += f" if chkhsh == \ "{ chkhsh } \ " :\n "
268
172
src_ifs += f" # ref: { model ['repo' ]} \n "
269
- src_ifs += f' res = "{ name } "\n '
173
+ src_ifs += f" res = \ "{ name } \ "\n "
270
174
271
175
src_func = f"""
272
176
def get_vocab_base_pre(self, tokenizer) -> str:
@@ -422,8 +326,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
422
326
for model in models :
423
327
name = model ["name" ]
424
328
425
- print (
426
- f"python3 convert-hf-to-gguf.py models/tokenizers/{ name } / --outfile models/ggml-vocab-{ name } .gguf --vocab-only"
427
- ) # noqa: NP100
329
+ print (f"python3 convert-hf-to-gguf.py models/tokenizers/{ name } / --outfile models/ggml-vocab-{ name } .gguf --vocab-only" ) # noqa: NP100
428
330
429
331
logger .info ("\n " )
0 commit comments