Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gguf-py: Add support for loading merges.txt #3743

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 29 additions & 7 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -779,20 +779,29 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
break
yield result

def check_vocab_size(params: Params, vocab: Vocab) -> None:
def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
    """Reconcile the model's vocab size with the tokenizer's vocab size.

    Does nothing when ``params.n_vocab`` already equals ``vocab.vocab_size``.
    Otherwise, in order:

    * if the model matches the base vocab (without added tokens), the added
      tokens are dropped and ``vocab.vocab_size`` is reset to the base size;
    * if ``pad_vocab`` is set and the model expects more tokens than the
      vocab provides, ``<dummyNNNNN>`` placeholder tokens are appended to
      ``vocab.added_tokens_list`` until the sizes match;
    * otherwise an ``Exception`` describing the mismatch is raised.

    ``vocab`` is mutated in place in the first two cases.
    """
    if params.n_vocab == vocab.vocab_size:
        return

    assert isinstance(vocab, (BpeVocab, SentencePieceVocab))

    # Model was trained without the added tokens: discard them.
    if params.n_vocab == vocab.vocab_size_base:
        print("Ignoring added_tokens.json since model matches vocab size without it.")
        vocab.added_tokens_list = []
        vocab.vocab_size = vocab.vocab_size_base
        return

    # Positive when the model expects more tokens than the vocab supplies.
    missing = params.n_vocab - vocab.vocab_size

    if pad_vocab and missing > 0:
        print(f'Padding vocab with {missing} token(s) - <dummy00001> through <dummy{missing:05}>')
        vocab.added_tokens_list.extend(f'<dummy{i:05}>' for i in range(1, missing + 1))
        vocab.vocab_size = params.n_vocab
        return

    # No automatic fix applies: assemble a descriptive error message.
    added_tokens_file = vocab.fname_added_tokens
    pieces = [f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"]
    if added_tokens_file is not None:
        pieces.append(f" combined with {added_tokens_file}")
    pieces.append(f" has {vocab.vocab_size}).")
    if 0 < missing < 20 and added_tokens_file is None:
        pieces.append(f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent}).")
    if missing > 0:
        pieces.append(" Possibly try using the --padvocab option.")
    raise Exception("".join(pieces))


Expand Down Expand Up @@ -877,8 +886,12 @@ def close(self) -> None:
self.gguf.close()

@staticmethod
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
check_vocab_size(params, vocab)
def write_vocab_only(
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
) -> None:
check_vocab_size(params, vocab, pad_vocab = pad_vocab)

of = OutputFile(fname_out, endianess=endianess)

Expand All @@ -905,8 +918,14 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
return dt.quantize(arr)

@staticmethod
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
check_vocab_size(params, vocab)
def write_all(
fname_out : Path, ftype: GGMLFileType, params: Params,
model : LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY,
endianess : gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab : bool = False,
) -> None:
check_vocab_size(params, vocab, pad_vocab = pad_vocab)

of = OutputFile(fname_out, endianess=endianess)

Expand Down Expand Up @@ -1126,6 +1145,7 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")

args = parser.parse_args(args_in)
if args.dump_single:
Expand Down Expand Up @@ -1173,7 +1193,8 @@ def main(args_in: list[str] | None = None) -> None:
load_merges = args.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
outfile = args.outfile
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
endianess = endianess, pad_vocab = args.padvocab)
print(f"Wrote {outfile}")
return

Expand All @@ -1196,7 +1217,8 @@ def main(args_in: list[str] | None = None) -> None:
params.ftype = ftype
print(f"Writing {outfile}, format {ftype}")

OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab)
print(f"Wrote {outfile}")


Expand Down
32 changes: 32 additions & 0 deletions gguf-py/gguf/gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1023,6 +1023,35 @@ def __init__(
def _load(self, path: Path) -> None:
    """Populate this object from the metadata files found under *path*.

    ``tokenizer.json`` is tried first; ``config.json`` is only consulted when
    that attempt reports failure. If merges were requested (``load_merges``)
    and none have been loaded so far, ``merges.txt`` is tried as a fallback.
    """
    loaded_from_tokenizer_json = self._try_load_from_tokenizer_json(path)
    if not loaded_from_tokenizer_json:
        self._try_load_from_config_json(path)

    # Nothing more to do unless merges are wanted and still missing.
    if not self.load_merges or len(self.merges) > 0:
        return
    self._try_load_merges_txt(path)

def _try_load_merges_txt(self, path: Path) -> bool:
    """Load BPE merges from ``merges.txt`` inside the directory *path*.

    Each non-empty line must consist of exactly two whitespace-separated
    tokens; the pair is stored as ``"<first> <second>"``. A first line that
    starts with ``#`` is treated as a header and skipped. Malformed lines are
    reported on stderr with their 1-based line number and ignored.

    Returns ``False`` (leaving ``self.merges`` untouched) when the file does
    not exist, otherwise replaces ``self.merges`` with the parsed list and
    returns ``True`` -- even if that list ends up empty.
    """
    merges_file = path / 'merges.txt'
    if not merges_file.is_file():
        return False
    merges = []
    # Decode explicitly as UTF-8: merge entries contain non-ASCII token text,
    # and the locale default encoding (e.g. cp1252 on Windows) would garble
    # them or raise UnicodeDecodeError.
    with open(merges_file, 'r', encoding = 'utf-8') as fp:
        for line_num, line in enumerate(fp, start = 1):
            line = line.strip()
            # Only the very first line may be a '#...' header.
            if line_num == 1 and line.startswith('#'):
                continue
            if len(line) == 0:
                continue
            parts = line.split(None, 3)
            if len(parts) != 2:
                print(f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring',
                      file = sys.stderr)
                continue
            merges.append(f'{parts[0]} {parts[1]}')
    self.merges = merges
    return True


def _set_special_token(self, typ: str, tid: Any):
if not isinstance(tid, int) or tid < 0:
Expand Down Expand Up @@ -1083,6 +1112,9 @@ def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
if not quiet:
print(f'gguf: Adding {len(self.merges)} merge(s).')
gw.add_token_merges(self.merges)
elif self.load_merges:
print('gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.',
file = sys.stderr)
for typ, tokid in self.special_token_ids.items():
handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
if handler is None:
Expand Down
2 changes: 1 addition & 1 deletion gguf-py/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
version = "0.4.5"
version = "0.4.6"
description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
Expand Down