Skip to content

Commit

Permalink
chore: minor fixes
Browse files (browse the repository at this point in the history)
  • Loading branch information
philpax committed May 24, 2023
1 parent 7ddbdb2 commit b1945d7
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 15 deletions.
2 changes: 1 addition & 1 deletion binaries/llm-cli/src/cli_args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ impl ModelVocabulary {
}
(Some(path), None) => VocabularySource::HuggingFaceTokenizerFile(path.to_owned()),
(None, Some(repo)) => VocabularySource::HuggingFaceRemote(repo.to_owned()),
(None, None) => VocabularySource::ModelFile,
(None, None) => VocabularySource::Model,
})
}
}
Expand Down
30 changes: 19 additions & 11 deletions crates/llm-base/src/vocabulary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,9 @@ impl VocabularyLoadError {
pub enum VocabularySource {
/// Read the vocabulary from the model if available, and use a simplistic tokenizer.
///
/// This is easy to use, but may not be the best choice for your use case.
ModelFile,
/// This is easy to use, but may not be the best choice for your use case, and is not
/// guaranteed to be available for all models.
Model,

/// Read the vocabulary from a local HuggingFace-format tokenizer file, and use the
/// HuggingFace tokenizer.
Expand Down Expand Up @@ -96,7 +97,7 @@ impl VocabularySource {
.into()
}

Self::ModelFile => ModelVocabulary::default().into(),
Self::Model => ModelVocabulary::default().into(),
})
}
}
Expand Down Expand Up @@ -305,7 +306,7 @@ pub struct ExternalVocabulary {
}

impl ExternalVocabulary {
/// Create a new `TokenizerVocabulary`.
/// Create a new `ExternalVocabulary`.
pub fn new(tokenizer: Tokenizer) -> Self {
Self { tokenizer }
}
Expand Down Expand Up @@ -336,7 +337,6 @@ impl ExternalVocabulary {
self.tokenizer.get_vocab_size(false) == 0
}

// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
/// Tokenize a `text` with this vocabulary.
///
/// `bos` controls whether a beginning-of-string token should be inserted.
Expand All @@ -345,14 +345,22 @@ impl ExternalVocabulary {
text: &str,
bos: bool,
) -> Result<Vec<(Vec<u8>, TokenId)>, TokenizationError> {
Ok(self
let encoding = self
.tokenizer
.encode(text, false)
.map_err(|e| TokenizationError::TokenizationFailed { error: e })?;

let encoding = self
.tokenizer
.encode(text, bos)
.map_err(|e| TokenizationError::TokenizationFailed { error: e })?
.get_ids()
.post_process(encoding, None, bos)
.map_err(|e| TokenizationError::TokenizationFailed { error: e })?;

Ok(encoding
.get_tokens()
.iter()
.map(|id| (self.token(*id as usize), *id))
.collect::<Vec<(Vec<u8>, TokenId)>>())
.map(|t| t.as_bytes().to_vec())
.zip(encoding.get_ids().iter().copied())
.collect())
}
}

Expand Down
2 changes: 1 addition & 1 deletion crates/llm/examples/embeddings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ fn main() {
let model = llm::load_dynamic(
model_architecture,
model_path,
llm::VocabularySource::ModelFile,
llm::VocabularySource::Model,
model_params,
overrides,
llm::load_progress_callback_stdout,
Expand Down
2 changes: 1 addition & 1 deletion crates/llm/examples/inference.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ fn main() {
let model = llm::load_dynamic(
model_architecture,
model_path,
llm::VocabularySource::ModelFile,
llm::VocabularySource::Model,
Default::default(),
overrides,
llm::load_progress_callback_stdout,
Expand Down
2 changes: 1 addition & 1 deletion crates/llm/examples/vicuna-chat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ fn main() {
let model = llm::load_dynamic(
model_architecture,
model_path,
llm::VocabularySource::ModelFile,
llm::VocabularySource::Model,
Default::default(),
overrides,
llm::load_progress_callback_stdout,
Expand Down
2 changes: 2 additions & 0 deletions crates/llm/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
//! let llama = llm::load::<llm::models::Llama>(
//! // path to GGML file
//! std::path::Path::new("/path/to/model"),
//! // llm::VocabularySource
//! llm::VocabularySource::Model,
//! // llm::ModelParameters
//! Default::default(),
//! // llm::KnownModel::Overrides
Expand Down

0 comments on commit b1945d7

Please sign in to comment.