convert.py: Hot Fix with VocabFactory Integration #4818

Merged · 16 commits · Jan 9, 2024
Changes from 1 commit
feat: Introduce VocabFactory for flexible vocabulary management in model conversion

- The VocabFactory class is added to facilitate modular vocabulary handling.
- The constructor takes a directory path and detects vocabulary-related files (tokenizer.model, vocab.json, tokenizer.json) in that directory or its parent.
- The _select_file method resolves the file path for a given vocabulary type (e.g., BPE, SentencePiece).
- The _create_special_vocab method builds the matching gguf.SpecialVocab, accommodating the different types.
- The load_vocab method loads vocabularies, handling BPE, SentencePiece, and the Hugging Face Fast Tokenizer.
- Error handling and informative messages aid debugging and user feedback.
- The modular, flexible design simplifies vocabulary management and supports future extensions.

The VocabFactory class enhances code modularity and maintainability, allowing versatile vocabulary handling in the model conversion process.
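
For context, a minimal usage sketch of how a caller inside convert.py might drive VocabFactory. This is illustrative only, not code from this commit; it assumes it runs inside convert.py where VocabFactory and the vocab classes are already defined, and the model directory name is made up.

# Minimal usage sketch (not part of this commit). Assumes it runs inside
# convert.py, where VocabFactory, BpeVocab, SentencePieceVocab, HfVocab,
# and gguf are in scope.
from pathlib import Path

model_path = Path("models/my-model")      # hypothetical model directory
vocab_factory = VocabFactory(model_path)  # scans the directory and its parent

# vocabtype is one of "spm" (SentencePiece), "bpe", or "hfft" (HF Fast Tokenizer);
# the second argument is the model parent path handed to gguf.SpecialVocab.
vocab, special_vocab = vocab_factory.load_vocab("spm", model_path)
print(f"Loaded {type(vocab).__name__}; special vocab: {special_vocab}")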
teleprint-me committed Jan 8, 2024
commit 8aa5818a20af134173ffe5daad7e529bd22d46f9
77 changes: 77 additions & 0 deletions convert.py
@@ -1355,6 +1355,83 @@ def load_some_model(path: Path) -> ModelPlus:
    return model_plus


class VocabFactory:
    def __init__(self, path: Path):
        self.path = path
        self.files = {
            "tokenizer.model": None,
            "vocab.json": None,
            "tokenizer.json": None,
        }
        self._detect_files()

    def _detect_files(self):
        for file in self.files.keys():
            file_path = self.path / file
            parent_file_path = self.path.parent / file
            if file_path.exists():
                self.files[file] = file_path
            elif parent_file_path.exists():
                self.files[file] = parent_file_path

    def _select_file(self, vocabtype: Optional[str]) -> Path:
        if vocabtype in ["spm", "bpe"]:
            # For SentencePiece and BPE, return specific files as before
            file_key = "tokenizer.model" if vocabtype == "spm" else "vocab.json"
            if self.files[file_key]:
                return self.files[file_key]
            else:
                raise FileNotFoundError(f"{vocabtype} {file_key} not found.")
        elif vocabtype == "hfft":
            # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
            return self.path
        else:
            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
Comment on lines +1378 to +1389
Collaborator

Several examples of else-after-return here, which is an anti-pattern.
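
For illustration, one way _select_file could be flattened to address this; a sketch only, not code from this PR:

    def _select_file(self, vocabtype: Optional[str]) -> Path:
        # Handle the directory-based case first, then the file-based cases,
        # returning or raising early instead of chaining elif/else branches.
        if vocabtype == "hfft":
            # Hugging Face Fast Tokenizer loads from the directory itself
            return self.path
        if vocabtype not in ("spm", "bpe"):
            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
        file_key = "tokenizer.model" if vocabtype == "spm" else "vocab.json"
        path = self.files[file_key]
        if path is None:
            raise FileNotFoundError(f"{vocabtype} {file_key} not found.")
        return path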


    def _create_special_vocab(
        self,
        vocab: Vocab,
        vocabtype: str,
        model_parent_path: Path,
    ) -> gguf.SpecialVocab:
        load_merges = vocabtype == "bpe"
        n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
        return gguf.SpecialVocab(
            model_parent_path,
            load_merges=load_merges,
            special_token_types=None,  # Predetermined or passed as a parameter
            n_vocab=n_vocab,
        )

    def load_vocab(
        self, vocabtype: str, model_parent_path: Path
    ) -> Tuple[Vocab, gguf.SpecialVocab]:
        path = self._select_file(vocabtype)
        print(f"Loading vocab file '{path}', type '{vocabtype}'")

        added_tokens_path = path.parent / "added_tokens.json"
        if vocabtype == "bpe":
            vocab = BpeVocab(
                path, added_tokens_path if added_tokens_path.exists() else None
            )
        elif vocabtype == "spm":
            vocab = SentencePieceVocab(
                path, added_tokens_path if added_tokens_path.exists() else None
            )
        elif vocabtype == "hfft":
            vocab = HfVocab(
                path, added_tokens_path if added_tokens_path.exists() else None
            )
        else:
            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
        special_vocab = self._create_special_vocab(
            vocab,
            vocabtype,
            model_parent_path,
        )
        return vocab, special_vocab


def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
    namestr = {
        GGMLFileType.AllF32: "f32",