chore: minor fixes

Dognam · May 24, 2023 · b1945d7
1 parent 7ddbdb2
commit b1945d7
Show file tree

Hide file tree

Showing 6 changed files with 25 additions and 15 deletions.
diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs
@@ -353,7 +353,7 @@ impl ModelVocabulary {
             }
             (Some(path), None) => VocabularySource::HuggingFaceTokenizerFile(path.to_owned()),
             (None, Some(repo)) => VocabularySource::HuggingFaceRemote(repo.to_owned()),
-            (None, None) => VocabularySource::ModelFile,
+            (None, None) => VocabularySource::Model,
         })
     }
 }

diff --git a/crates/llm-base/src/vocabulary.rs b/crates/llm-base/src/vocabulary.rs
@@ -53,8 +53,9 @@ impl VocabularyLoadError {
 pub enum VocabularySource {
     /// Read the vocabulary from the model if available, and use a simplistic tokenizer.
     ///
-    /// This is easy to use, but may not be the best choice for your use case.
-    ModelFile,
+    /// This is easy to use, but may not be the best choice for your use case, and is not
+    /// guaranteed to be available for all models.
+    Model,
 
     /// Read the vocabulary from a local HuggingFace-format tokenizer file, and use the
     /// HuggingFace tokenizer.
@@ -96,7 +97,7 @@ impl VocabularySource {
                 .into()
             }
 
-            Self::ModelFile => ModelVocabulary::default().into(),
+            Self::Model => ModelVocabulary::default().into(),
         })
     }
 }
@@ -305,7 +306,7 @@ pub struct ExternalVocabulary {
 }
 
 impl ExternalVocabulary {
-    /// Create a new `TokenizerVocabulary`.
+    /// Create a new `ExternalVocabulary`.
     pub fn new(tokenizer: Tokenizer) -> Self {
         Self { tokenizer }
     }
@@ -336,7 +337,6 @@ impl ExternalVocabulary {
         self.tokenizer.get_vocab_size(false) == 0
     }
 
-    // SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
     /// Tokenize a `text` with this vocabulary.
     ///
     /// `bos` controls whether a beginning-of-string token should be inserted.
@@ -345,14 +345,22 @@ impl ExternalVocabulary {
         text: &str,
         bos: bool,
     ) -> Result<Vec<(Vec<u8>, TokenId)>, TokenizationError> {
-        Ok(self
+        let encoding = self
+            .tokenizer
+            .encode(text, false)
+            .map_err(|e| TokenizationError::TokenizationFailed { error: e })?;
+
+        let encoding = self
             .tokenizer
-            .encode(text, bos)
-            .map_err(|e| TokenizationError::TokenizationFailed { error: e })?
-            .get_ids()
+            .post_process(encoding, None, bos)
+            .map_err(|e| TokenizationError::TokenizationFailed { error: e })?;
+
+        Ok(encoding
+            .get_tokens()
             .iter()
-            .map(|id| (self.token(*id as usize), *id))
-            .collect::<Vec<(Vec<u8>, TokenId)>>())
+            .map(|t| t.as_bytes().to_vec())
+            .zip(encoding.get_ids().iter().copied())
+            .collect())
     }
 }
 

diff --git a/crates/llm/examples/embeddings.rs b/crates/llm/examples/embeddings.rs
@@ -35,7 +35,7 @@ fn main() {
     let model = llm::load_dynamic(
         model_architecture,
         model_path,
-        llm::VocabularySource::ModelFile,
+        llm::VocabularySource::Model,
         model_params,
         overrides,
         llm::load_progress_callback_stdout,

diff --git a/crates/llm/examples/inference.rs b/crates/llm/examples/inference.rs
@@ -20,7 +20,7 @@ fn main() {
     let model = llm::load_dynamic(
         model_architecture,
         model_path,
-        llm::VocabularySource::ModelFile,
+        llm::VocabularySource::Model,
         Default::default(),
         overrides,
         llm::load_progress_callback_stdout,

diff --git a/crates/llm/examples/vicuna-chat.rs b/crates/llm/examples/vicuna-chat.rs
@@ -15,7 +15,7 @@ fn main() {
     let model = llm::load_dynamic(
         model_architecture,
         model_path,
-        llm::VocabularySource::ModelFile,
+        llm::VocabularySource::Model,
         Default::default(),
         overrides,
         llm::load_progress_callback_stdout,

diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs
@@ -21,6 +21,8 @@
 //! let llama = llm::load::<llm::models::Llama>(
 //!     // path to GGML file
 //!     std::path::Path::new("/path/to/model"),
+//!     // llm::VocabularySource
+//!     llm::VocabularySource::Model,
 //!     // llm::ModelParameters
 //!     Default::default(),
 //!     // llm::KnownModel::Overrides