Skip to content

Commit 74204cc

Browse files
committed
Clarify logic in conversion
1 parent 1dc1316 commit 74204cc

File tree

1 file changed

+8
-7
lines changed

1 file changed

+8
-7
lines changed

convert-mpt-hf-to-gguf.py

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -132,15 +132,16 @@ def parse_args() -> argparse.Namespace:
132 132
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
133 133

134 134
for i in range(vocab_size):
135-
if i in reverse_vocab:
136-
tokens.append(reverse_vocab[i])
137-
if reverse_vocab[i] not in added_vocab:
138-
toktypes.append(gguf.TokenType.NORMAL)
139-
else:
140-
toktypes.append(gguf.TokenType.USER_DEFINED)
141-
else:
135+
if i not in reverse_vocab:
142 136
tokens.append(f"[PAD{i}]")
143 137
toktypes.append(gguf.TokenType.USER_DEFINED)
138+
elif reverse_vocab[i] in added_vocab:
139+
# NOTE: wouldn't we like to distinguish CONTROL tokens here?
140+
tokens.append(reverse_vocab[i])
141+
toktypes.append(gguf.TokenType.USER_DEFINED)
142+
else:
143+
tokens.append(reverse_vocab[i])
144+
toktypes.append(gguf.TokenType.NORMAL)
144 145

145 146
gguf_writer.add_token_list(tokens)
146 147
gguf_writer.add_token_types(toktypes)

0 commit comments

Comments
 (0)