Commit c496fe0

convert : fix vocab padding code for bert models (ggml-org#13954)
1 parent e57bb87 · commit c496fe0

File tree: 1 file changed (+21 −28 lines)


convert_hf_to_gguf.py — 21 additions & 28 deletions
@@ -3814,7 +3814,7 @@ def _xlmroberta_set_vocab(self) -> None:
             remove_whitespaces = tokenizer.clean_up_tokenization_spaces
             precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
 
-            vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
         else:
             sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
             sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -3827,7 +3827,7 @@ def _xlmroberta_set_vocab(self) -> None:
             tokenizer = SentencePieceProcessor()
             tokenizer.LoadFromFile(str(tokenizer_path))
 
-            vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
@@ -3857,33 +3857,26 @@ def _xlmroberta_set_vocab(self) -> None:
             unk_token = tokenizer_config_json.get("unk_token")
             unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
 
-            for token_id in range(vocab_size):
+            for token_id in range(tokenizer.vocab_size):
                 piece = tokenizer._convert_id_to_token(token_id)
-                text = piece.encode("utf-8")
-                score = tokenizer_json["model"]["vocab"][token_id][1]
-
-                toktype = SentencePieceTokenTypes.NORMAL
-                if token_id == unk_token_id:
-                    toktype = SentencePieceTokenTypes.UNKNOWN
-                elif token_id in tokenizer.all_special_ids:
-                    toktype = SentencePieceTokenTypes.CONTROL
-                elif token_id in added_vocab.values():
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
-                # No reliable way to detect this, but jina doesn't have any
-                # elif tokenizer.IsByte(token_id):
-                #     toktype = SentencePieceTokenTypes.BYTE
-
-                tokens[token_id] = text
-                scores[token_id] = score
-                toktypes[token_id] = toktype
-
-            if vocab_size > len(tokens):
-                pad_count = vocab_size - len(tokens)
-                logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-                for i in range(1, pad_count + 1):
-                    tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                    scores.append(-1000.0)
-                    toktypes.append(SentencePieceTokenTypes.UNUSED)
+                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
+                    text = piece.encode("utf-8")
+                    score = tokenizer_json["model"]["vocab"][token_id][1]
+
+                    toktype = SentencePieceTokenTypes.NORMAL
+                    if token_id == unk_token_id:
+                        toktype = SentencePieceTokenTypes.UNKNOWN
+                    elif token_id in tokenizer.all_special_ids:
+                        toktype = SentencePieceTokenTypes.CONTROL
+                    elif token_id in added_vocab.values():
+                        toktype = SentencePieceTokenTypes.USER_DEFINED
+                    # No reliable way to detect this, but jina doesn't have any
+                    # elif tokenizer.IsByte(token_id):
+                    #     toktype = SentencePieceTokenTypes.BYTE
+
+                    tokens[token_id] = text
+                    scores[token_id] = score
+                    toktypes[token_id] = toktype
 
         if isinstance(tokenizer, SentencePieceProcessor):
             # realign tokens (see HF tokenizer code)
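
Why the diff works (a minimal standalone sketch, not part of the commit): the token, score, and type lists are already pre-allocated at the final vocab_size — taken as the larger of the hparams value and the tokenizer's own size — and pre-filled with [PADi] placeholders, so the loop only needs to overwrite the IDs the tokenizer actually knows about. Iterating over tokenizer.vocab_size and skipping None pieces avoids reading past the real vocabulary, which makes the old append-based padding block unnecessary. The sketch below uses a made-up toy vocabulary, a hypothetical convert_id_to_token helper, and an assumed hparams value in place of the real HF tokenizer and config.

    # Minimal sketch of the pre-fill padding pattern from this commit.
    # `hparams_vocab_size` and `toy_vocab` are made-up stand-ins, not values
    # from the commit or from any real model.
    hparams_vocab_size = 8                   # padded size reported by the model config (assumed)
    toy_vocab = {0: "<s>", 1: "<pad>", 2: "</s>", 3: "<unk>", 4: "hello"}

    def convert_id_to_token(token_id: int) -> str | None:
        # stands in for tokenizer._convert_id_to_token(); may return None
        return toy_vocab.get(token_id)

    # take the larger of the model's padded size and the tokenizer's real size
    vocab_size = max(hparams_vocab_size, len(toy_vocab))

    # pre-fill: every slot starts out as an unused [PADi] placeholder
    tokens = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
    scores = [-10000.0] * vocab_size

    # overwrite only the IDs the tokenizer actually knows about
    for token_id in range(len(toy_vocab)):
        if (piece := convert_id_to_token(token_id)) is not None:
            tokens[token_id] = piece.encode("utf-8")
            scores[token_id] = 0.0

    # IDs 5..7 stay [PAD5]..[PAD7]; no separate padding loop is needed afterwards
    print(tokens)

In the real converter, the max() also guards the case where config.json reports a padded vocab_size smaller than (or missing relative to) the tokenizer's actual size, so the pre-filled lists can never be shorter than the IDs the loop writes into.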
