Skip to content

Commit 9e836a1

Browse files
committed
chore: Remove rebase artifacts
1 parent adee1b8 commit 9e836a1

6 files changed

30 additions and 423 deletions

convert_hf_to_gguf_update.py

File mode changed: 100644 → 100755
Lines changed: 30 additions & 128 deletions
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ class TOKENIZER_TYPE(IntEnum):
4949

5050
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
5151
# will be updated with time - contributions welcome
52-
chktxt = "\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````\"\"\"\"......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL"
52+
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
5353

5454
if len(sys.argv) == 2:
5555
token = sys.argv[1]
@@ -63,121 +63,29 @@ class TOKENIZER_TYPE(IntEnum):
6363

6464
# TODO: add models here, base models preferred
6565
models = [
66-
{
67-
"name": "llama-spm",
68-
"tokt": TOKENIZER_TYPE.SPM,
69-
"repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf",
70-
},
71-
{
72-
"name": "llama-bpe",
73-
"tokt": TOKENIZER_TYPE.BPE,
74-
"repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B",
75-
},
76-
{
77-
"name": "phi-3",
78-
"tokt": TOKENIZER_TYPE.SPM,
79-
"repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct",
80-
},
81-
{
82-
"name": "deepseek-llm",
83-
"tokt": TOKENIZER_TYPE.BPE,
84-
"repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base",
85-
},
86-
{
87-
"name": "deepseek-coder",
88-
"tokt": TOKENIZER_TYPE.BPE,
89-
"repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base",
90-
},
91-
{
92-
"name": "falcon",
93-
"tokt": TOKENIZER_TYPE.BPE,
94-
"repo": "https://huggingface.co/tiiuae/falcon-7b",
95-
},
96-
{
97-
"name": "bert-bge",
98-
"tokt": TOKENIZER_TYPE.WPM,
99-
"repo": "https://huggingface.co/BAAI/bge-small-en-v1.5",
100-
},
101-
{
102-
"name": "mpt",
103-
"tokt": TOKENIZER_TYPE.BPE,
104-
"repo": "https://huggingface.co/mosaicml/mpt-7b",
105-
},
106-
{
107-
"name": "starcoder",
108-
"tokt": TOKENIZER_TYPE.BPE,
109-
"repo": "https://huggingface.co/bigcode/starcoder2-3b",
110-
},
111-
{
112-
"name": "gpt-2",
113-
"tokt": TOKENIZER_TYPE.BPE,
114-
"repo": "https://huggingface.co/openai-community/gpt2",
115-
},
116-
{
117-
"name": "stablelm2",
118-
"tokt": TOKENIZER_TYPE.BPE,
119-
"repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b",
120-
},
121-
{
122-
"name": "refact",
123-
"tokt": TOKENIZER_TYPE.BPE,
124-
"repo": "https://huggingface.co/smallcloudai/Refact-1_6-base",
125-
},
126-
{
127-
"name": "command-r",
128-
"tokt": TOKENIZER_TYPE.BPE,
129-
"repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01",
130-
},
131-
{
132-
"name": "qwen2",
133-
"tokt": TOKENIZER_TYPE.BPE,
134-
"repo": "https://huggingface.co/Qwen/Qwen1.5-7B",
135-
},
136-
{
137-
"name": "olmo",
138-
"tokt": TOKENIZER_TYPE.BPE,
139-
"repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf",
140-
},
141-
{
142-
"name": "dbrx",
143-
"tokt": TOKENIZER_TYPE.BPE,
144-
"repo": "https://huggingface.co/databricks/dbrx-base",
145-
},
146-
{
147-
"name": "jina-v2-en",
148-
"tokt": TOKENIZER_TYPE.WPM,
149-
"repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en",
150-
}, # WPM!
151-
{
152-
"name": "jina-v2-es",
153-
"tokt": TOKENIZER_TYPE.BPE,
154-
"repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es",
155-
},
156-
{
157-
"name": "jina-v2-de",
158-
"tokt": TOKENIZER_TYPE.BPE,
159-
"repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de",
160-
},
161-
{
162-
"name": "smaug-bpe",
163-
"tokt": TOKENIZER_TYPE.BPE,
164-
"repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct",
165-
},
166-
{
167-
"name": "poro-chat",
168-
"tokt": TOKENIZER_TYPE.BPE,
169-
"repo": "https://huggingface.co/LumiOpen/Poro-34B-chat",
170-
},
171-
{
172-
"name": "jina-v2-code",
173-
"tokt": TOKENIZER_TYPE.BPE,
174-
"repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code",
175-
},
176-
{
177-
"name": "viking",
178-
"tokt": TOKENIZER_TYPE.BPE,
179-
"repo": "https://huggingface.co/LumiOpen/Viking-7B",
180-
}, # Also used for Viking 13B and 33B
66+
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
67+
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
68+
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
69+
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
70+
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
71+
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
72+
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
73+
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
74+
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
75+
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
76+
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
77+
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
78+
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
79+
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
80+
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
81+
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
82+
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
83+
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
84+
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
85+
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
86+
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
87+
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
88+
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
18189
]
18290

18391

@@ -186,7 +94,7 @@ def download_file_with_auth(url, token, save_path):
18694
response = sess.get(url, headers=headers)
18795
response.raise_for_status()
18896
os.makedirs(os.path.dirname(save_path), exist_ok=True)
189-
with open(save_path, "wb") as f:
97+
with open(save_path, 'wb') as f:
19098
f.write(response.content)
19199
logger.info(f"File {save_path} downloaded successfully")
192100

@@ -236,9 +144,7 @@ def download_model(model):
236144
try:
237145
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
238146
except OSError as e:
239-
logger.error(
240-
f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}"
241-
)
147+
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
242148
continue # Skip to the next model if the tokenizer can't be loaded
243149

244150
chktok = tokenizer.encode(chktxt)
@@ -258,15 +164,13 @@ def download_model(model):
258164
pre_tokenizer = cfg["pre_tokenizer"]
259165
logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
260166
if "ignore_merges" in cfg["model"]:
261-
logger.info(
262-
"ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4)
263-
)
167+
logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
264168

265169
logger.info("")
266170

267-
src_ifs += f' if chkhsh == "{chkhsh}":\n'
171+
src_ifs += f" if chkhsh == \"{chkhsh}\":\n"
268172
src_ifs += f" # ref: {model['repo']}\n"
269-
src_ifs += f' res = "{name}"\n'
173+
src_ifs += f" res = \"{name}\"\n"
270174

271175
src_func = f"""
272176
def get_vocab_base_pre(self, tokenizer) -> str:
@@ -422,8 +326,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
422326
for model in models:
423327
name = model["name"]
424328

425-
print(
426-
f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only"
427-
) # noqa: NP100
329+
print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
428330

429331
logger.info("\n")

convert_lora_to_ggml.py

Lines changed: 0 additions & 149 deletions
This file was deleted.

0 commit comments

Comments
 (0)