Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 83 additions & 83 deletions convert_hf_to_gguf.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions convert_hf_to_gguf_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ def get_existing_models(convert_py):
except Exception as e:
raise OSError(f"Error loading tokenizer for model {name}.") from e

chktok = tokenizer.encode(CHK_TXT)
chktok = tokenizer.encode(CHK_TXT) # ty: ignore[unresolved-attribute]
chkhsh = sha256(str(chktok).encode()).hexdigest()

logger.info(f"model: {name}")
Expand Down Expand Up @@ -468,7 +468,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:

with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
for text in tests:
res = tokenizer.encode(text, add_special_tokens=False)
res = tokenizer.encode(text, add_special_tokens=False) # ty: ignore[unresolved-attribute]
for r in res:
f.write(f" {r}")
f.write("\n")
Expand Down
2 changes: 1 addition & 1 deletion convert_lora_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ def set_gguf_parameters(self):
# the invocation string includes the "<|start_of_turn|>"
# token, but the adapters themselves were trained to
# activate _after_ that first token, so we drop it here.
alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:]
alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:] # ty: ignore[call-non-callable]
if alora_invocation_tokens:
logger.debug("GGUF KV: %s = %s", gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS, alora_invocation_tokens)
self.gguf_writer.add_key_value(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,10 @@
print(f"Model name: {model_name}")

prompt = "Hello world today"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = tokenizer(prompt, return_tensors="pt").input_ids # ty: ignore[call-non-callable]
print(f"Input tokens: {input_ids}")
print(f"Input text: {repr(prompt)}")
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}") # ty: ignore[unresolved-attribute]

with torch.no_grad():
outputs = model(input_ids, output_hidden_states=True)
Expand Down Expand Up @@ -92,7 +92,7 @@

# Print embeddings per token in the requested format
print("\nToken embeddings:")
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
tokens = tokenizer.convert_ids_to_tokens(input_ids[0]) # ty: ignore[unresolved-attribute]
for i, embedding in enumerate(token_embeddings):
# Format: show first few values, ..., then last few values
if len(embedding) > 10:
Expand Down
4 changes: 2 additions & 2 deletions examples/model-conversion/scripts/utils/semantic_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,8 @@ def main():
else:
model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)

encoded = tokenizer(prompt, return_tensors="pt")
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
encoded = tokenizer(prompt, return_tensors="pt") # ty: ignore[call-non-callable]
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0]) # ty: ignore[unresolved-attribute]
n_tokens = len(tokens)
print(f"n_tokens: {n_tokens}");
print(f"hidden_size: {model.config.hidden_size}")
Expand Down
18 changes: 9 additions & 9 deletions gguf-py/gguf/vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,7 +543,7 @@ def __init__(self, base_path: Path):
cache_dir=base_path,
local_files_only=True,
)
assert self.tokenizer.is_fast # assume tokenizer.json is used
assert self.tokenizer.is_fast # assume tokenizer.json is used # ty: ignore[unresolved-attribute]

# Initialize lists and dictionaries for added tokens
self.added_tokens_list = []
Expand All @@ -552,30 +552,30 @@ def __init__(self, base_path: Path):

# Process added tokens
for tok, tokidx in sorted(
self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
self.tokenizer.get_added_vocab().items(), key=lambda x: x[1] # ty: ignore[unresolved-attribute]
):
# Only consider added tokens that are not in the base vocabulary
if tokidx >= self.tokenizer.vocab_size:
if tokidx >= self.tokenizer.vocab_size: # ty: ignore[unresolved-attribute]
self.added_tokens_list.append(tok)
self.added_tokens_dict[tok] = tokidx
self.added_tokens_ids.add(tokidx)

# Store special tokens and their IDs
self.specials = {
tok: self.tokenizer.get_vocab()[tok]
for tok in self.tokenizer.all_special_tokens
tok: self.tokenizer.get_vocab()[tok] # ty: ignore[unresolved-attribute]
for tok in self.tokenizer.all_special_tokens # ty: ignore[unresolved-attribute]
}
self.special_ids = set(self.tokenizer.all_special_ids)
self.special_ids = set(self.tokenizer.all_special_ids) # ty: ignore[unresolved-attribute]

# Set vocabulary sizes
self.vocab_size_base = self.tokenizer.vocab_size
self.vocab_size_base = self.tokenizer.vocab_size # ty: ignore[unresolved-attribute]
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)

self.fname_tokenizer = fname_tokenizer

def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items() # ty: ignore[unresolved-attribute]
}

for token_id in range(self.vocab_size_base):
Expand Down Expand Up @@ -616,7 +616,7 @@ def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield text.encode("utf-8"), score, toktype

def has_newline_token(self):
return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab # ty: ignore[unresolved-attribute]

def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.hf_tokens()
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ classifiers = [
python = ">=3.9"
numpy = "^1.25.0"
sentencepiece = ">=0.1.98,<0.3.0"
transformers = ">=4.35.2,<5.0.0"
transformers = "==5.5.1"
protobuf = ">=4.21.0,<5.0.0"
gguf = { path = "./gguf-py" }
torch = { version = "^2.2.0", source = "pytorch" }
Expand Down
2 changes: 1 addition & 1 deletion requirements/requirements-convert_legacy_llama.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
numpy~=1.26.4
sentencepiece>=0.1.98,<0.3.0

transformers>=4.57.1,<5.0.0
transformers==5.5.1

gguf>=0.1.0
protobuf>=4.21.0,<5.0.0
2 changes: 1 addition & 1 deletion requirements/requirements-tool_bench.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
aiohttp~=3.9.3
pytest~=8.3.3
huggingface_hub>=0.34.0,<1.0
huggingface_hub>=1.5.0,<2.0
matplotlib~=3.10.0
numpy~=1.26.4
openai~=2.14.0
Expand Down
2 changes: 1 addition & 1 deletion tests/test-tokenizer-0.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
lines = f.readlines()
s = ''.join(lines)
t_start = time.time()
res = tokenizer.encode(s, add_special_tokens=False)
res = tokenizer.encode(s, add_special_tokens=False) # ty: ignore[unresolved-attribute]
t_end = time.time()
print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)') # noqa: NP100
with open(fname_out, 'w', encoding='utf-8') as f:
Expand Down
6 changes: 3 additions & 3 deletions tests/test-tokenizer-random.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def decode(self, ids: list[int]) -> str:
class TokenizerGroundtruth (Tokenizer):

def __init__(self, dir_tokenizer: str):
self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) # ty: ignore[invalid-assignment]
# guess BOS and EOS
ids = self.encode("a")
assert 1 <= len(ids) <= 3
Expand All @@ -142,15 +142,15 @@ def __init__(self, dir_tokenizer: str):
self.vocab = list(sorted(self.vocab))
# tokens and lists
self.special_tokens = list(self.model.all_special_tokens)
self.added_tokens = self.model.batch_decode(self.model.added_tokens_encoder.values(), skip_special_tokens=False)
self.added_tokens = self.model.batch_decode(list(self.model.added_tokens_encoder.values()), skip_special_tokens=False)
self.bos_token = self.model.bos_token
self.eos_token = self.model.eos_token

def encode(self, text: str) -> list[int]:
return self.model.encode(text, add_special_tokens=True)

def decode(self, ids: list[int]) -> str:
return self.model.decode(ids, skip_special_tokens=False)
return self.model.decode(ids, skip_special_tokens=False) # ty: ignore[invalid-return-type]


class TokenizerLlamaCpp (Tokenizer):
Expand Down
2 changes: 1 addition & 1 deletion tools/server/tests/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
aiohttp~=3.9.3
pytest~=8.3.3
huggingface_hub>=0.34.0,<1.0
huggingface_hub>=1.5.0,<2.0
numpy~=1.26.4
openai~=2.14.0
prometheus-client~=0.20.0
Expand Down