refactor: remove model-associated inference params
philpax committed May 23, 2023
1 parent c51ff5d commit 964b2cd
Showing 14 changed files with 40 additions and 113 deletions.
1 change: 0 additions & 1 deletion binaries/llm-cli/src/cli_args.rs
@@ -367,7 +367,6 @@ impl ModelLoad {
             prefer_mmap: !self.no_mmap,
             context_size: self.num_ctx_tokens,
             lora_adapters: self.lora_paths.clone(),
-            ..Default::default()
         };

         let mut sp = Some(spinoff::Spinner::new(
4 changes: 2 additions & 2 deletions binaries/llm-cli/src/main.rs
@@ -70,7 +70,7 @@ fn infer<M: llm::KnownModel + 'static>(
         &mut rng,
         &llm::InferenceRequest {
             prompt: prompt.as_str().into(),
-            parameters: Some(&inference_params),
+            parameters: &inference_params,
             play_back_previous_tokens: session_loaded,
             maximum_token_count: args.generate.num_predict,
         },
@@ -277,7 +277,7 @@ fn interactive<M: llm::KnownModel + 'static>(
         &mut rng,
         &llm::InferenceRequest {
             prompt: "".into(),
-            parameters: Some(&inference_params),
+            parameters: &inference_params,
             play_back_previous_tokens: session_loaded,
             maximum_token_count: args.generate.num_predict,
         },
8 changes: 3 additions & 5 deletions crates/llm-base/src/inference_session.rs
@@ -175,7 +175,7 @@ impl InferenceSession {
         let mut stats = InferenceStats::default();
         let start_at = std::time::SystemTime::now();

-        let parameters = request.parameters.unwrap_or(model.inference_parameters());
+        let parameters = request.parameters;

         // Feed the initial prompt through the transformer, to update its
         // context window with new data.
@@ -635,15 +635,13 @@ impl Default for InferenceSessionConfig {
     }
 }

-#[derive(Debug, PartialEq, Default, Clone, Copy)]
+#[derive(Debug, PartialEq, Clone, Copy)]
 /// Settings specific to [InferenceSession::infer].
 pub struct InferenceRequest<'a> {
     /// The prompt to feed to the model.
     pub prompt: Prompt<'a>,
     /// The parameters to use during this inference attempt.
-    /// If not specified, this will default to the parameters
-    /// specified in the model.
-    pub parameters: Option<&'a InferenceParameters>,
+    pub parameters: &'a InferenceParameters,
     /// Whether or not to call the callback with the previous tokens
     /// that were encountered in this session.
     ///
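
Since `InferenceRequest` no longer derives `Default` and `parameters` is now a plain reference rather than an `Option`, callers must spell out every field. A minimal sketch of the new call shape as of this commit; the prompt text is illustrative:

let params = llm::InferenceParameters::reasonable_default();
let request = llm::InferenceRequest {
    // A `&str` converts into `Prompt` via `Into`.
    prompt: "Tell me a story.".into(),
    // Mandatory now: there is no model-side fallback to fall through to.
    parameters: &params,
    // Do not replay tokens already evaluated in this session.
    play_back_previous_tokens: false,
    // `None` places no explicit cap on the number of generated tokens.
    maximum_token_count: None,
};
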
14 changes: 11 additions & 3 deletions crates/llm-base/src/lib.rs
@@ -64,16 +64,24 @@ pub struct InferenceParameters {
     /// The number of tokens to consider for the repetition penalty.
     pub repetition_penalty_last_n: usize,
 }
-impl Default for InferenceParameters {
-    fn default() -> Self {
+impl InferenceParameters {
+    /// Returns a reasonable default for the parameters.
+    ///
+    /// Note that these parameters are not necessarily optimal for all models, and that
+    /// you may want to tweak them for your use case.
+    ///
+    /// This is intentionally not a `Default` implementation. The values specified here may change
+    /// in the future, and we want to make sure that users are aware of this and do not accidentally
+    /// rely on the values.
+    pub const fn reasonable_default() -> Self {
         Self {
             n_threads: 8,
             n_batch: 8,
             top_k: 40,
             top_p: 0.95,
             repeat_penalty: 1.30,
             temperature: 0.80,
-            bias_tokens: TokenBias::default(),
+            bias_tokens: TokenBias::empty(),
             repetition_penalty_last_n: 512,
         }
     }
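
Because `reasonable_default` is an ordinary `const fn` rather than a `Default` impl, tweaking a single field goes through struct update syntax against an explicit call, which keeps the opt-in visible at every use site. A short sketch; the temperature override is an arbitrary illustration, not a recommendation:

let params = llm::InferenceParameters {
    // Override just the sampling temperature...
    temperature: 0.7,
    // ...and take every other field from the library's suggested values.
    ..llm::InferenceParameters::reasonable_default()
};
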
17 changes: 0 additions & 17 deletions crates/llm-base/src/model/mod.rs
@@ -163,11 +163,6 @@ pub trait KnownModel: Send + Sync {

     /// Get the end of text/end of string token ID. This value is defined by model implementers.
     fn eot_token_id(&self) -> TokenId;
-
-    /// Get the default [InferenceParameters] for this model (used by
-    /// [InferenceSession::infer]). This value is configured through
-    /// [ModelParameters::inference_parameters].
-    fn inference_parameters(&self) -> &InferenceParameters;
 }

 /// A type-erased model to allow for interacting with a model without knowing
@@ -200,11 +195,6 @@ pub trait Model: Send + Sync {

     /// Get the end of text/end of string token ID. This value is defined by model implementers.
     fn eot_token_id(&self) -> TokenId;
-
-    /// Get the default [InferenceParameters] for this model (used by
-    /// [InferenceSession::infer]). This value is configured through
-    /// [ModelParameters::inference_parameters].
-    fn inference_parameters(&self) -> &InferenceParameters;
 }
 impl<H: Hyperparameters, M: KnownModel<Hyperparameters = H>> Model for M {
     fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
@@ -236,10 +226,6 @@ impl<H: Hyperparameters, M: KnownModel<Hyperparameters = H>> Model for M {
     fn eot_token_id(&self) -> TokenId {
         KnownModel::eot_token_id(self)
     }
-
-    fn inference_parameters(&self) -> &InferenceParameters {
-        KnownModel::inference_parameters(self)
-    }
 }

 /// Implemented by model hyperparameters for interacting with hyperparameters
@@ -280,8 +266,6 @@ pub struct ModelParameters {
     /// The context size ("memory") the model should use when evaluating a prompt. A larger context
     /// consumes more resources, but produces more consistent and coherent responses.
     pub context_size: usize,
-    /// Default InferenceParameters to use when [evaluating](Model::evaluate) a prompt with this model.
-    pub inference_parameters: InferenceParameters,
     /// The [LoRA](https://arxiv.org/abs/2106.09685) adapters to use when loading the model. If `None`, no adapters will be used.
     pub lora_adapters: Option<Vec<PathBuf>>,
 }
@@ -291,7 +275,6 @@ impl Default for ModelParameters {
         Self {
             prefer_mmap: true,
             context_size: 2048,
-            inference_parameters: Default::default(),
             lora_adapters: None,
         }
     }
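
With the field removed, `ModelParameters` configures loading concerns only, and its `Default` impl survives. A sketch of a post-change construction; the values simply restate the defaults shown above:

let model_params = llm::ModelParameters {
    prefer_mmap: true,
    context_size: 2048,
    lora_adapters: None,
};
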
5 changes: 5 additions & 0 deletions crates/llm-base/src/vocabulary.rs
@@ -215,6 +215,11 @@ impl<'a> From<&'a Vec<TokenId>> for Prompt<'a> {
 pub struct TokenBias(Vec<(TokenId, f32)>);

 impl TokenBias {
+    /// Create an empty [TokenBias].
+    pub const fn empty() -> Self {
+        Self(Vec::new())
+    }
+
     /// Create a [TokenBias] from an existing `Vec`.
     pub fn new(mut v: Vec<(TokenId, f32)>) -> Self {
         v.sort_by_cached_key(|(tid, _)| *tid);
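
The new constructor is `const` because `Default::default()` cannot be called in a `const fn`, and `reasonable_default` above needs a compile-time way to build an empty bias list. A sketch contrasting the two constructors, assuming the type is re-exported as `llm::TokenBias`; the token ID and bias value are made up for illustration:

// No biasing at all; usable in const contexts.
let no_bias = llm::TokenBias::empty();

// Discourage a hypothetical token ID 50256 with a strong negative bias.
let bias = llm::TokenBias::new(vec![(50256, -1.0)]);
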
8 changes: 5 additions & 3 deletions crates/llm/examples/inference.rs
@@ -1,6 +1,6 @@
 use llm::{
-    load_progress_callback_stdout as load_callback, InferenceFeedback, InferenceRequest,
-    InferenceResponse, ModelArchitecture,
+    load_progress_callback_stdout as load_callback, InferenceFeedback, InferenceParameters,
+    InferenceRequest, InferenceResponse, ModelArchitecture,
 };
 use std::{convert::Infallible, io::Write, path::Path};

@@ -44,7 +44,9 @@ fn main() {
         &mut rand::thread_rng(),
         &InferenceRequest {
             prompt: prompt.into(),
-            ..Default::default()
+            parameters: &InferenceParameters::reasonable_default(),
+            play_back_previous_tokens: false,
+            maximum_token_count: None,
         },
         // OutputRequest
         &mut Default::default(),
12 changes: 8 additions & 4 deletions crates/llm/examples/vicuna-chat.rs
@@ -1,6 +1,6 @@
 use llm::{
-    InferenceFeedback, InferenceRequest, InferenceResponse, InferenceStats, LoadProgress,
-    ModelArchitecture,
+    InferenceFeedback, InferenceParameters, InferenceRequest, InferenceResponse, InferenceStats,
+    LoadProgress, ModelArchitecture,
 };
 use rustyline::error::ReadlineError;
 use spinoff::{spinners::Dots2, Spinner};
@@ -43,10 +43,12 @@ fn main() {
         {character_name}: Paris is the capital of France."
     );

+    let inference_parameters = InferenceParameters::reasonable_default();
+
     session
         .feed_prompt(
             model.as_ref(),
-            &Default::default(),
+            &inference_parameters,
             format!("{persona}\n{history}").as_str(),
             &mut Default::default(),
             llm::feed_prompt_callback(prompt_callback),
@@ -73,7 +75,9 @@ fn main() {
             prompt: format!("{user_name}: {line}\n{character_name}:")
                 .as_str()
                 .into(),
-            ..Default::default()
+            parameters: &inference_parameters,
+            play_back_previous_tokens: false,
+            maximum_token_count: None,
         },
         &mut Default::default(),
         inference_callback(String::from(user_name), &mut buf),
14 changes: 1 addition & 13 deletions crates/models/bloom/src/lib.rs
@@ -35,9 +35,6 @@ pub struct Bloom {
     // weights for the model
     layers: Vec<Layer>,

-    // default parameters used by [InferenceSession::infer]
-    inference_parameters: InferenceParameters,
-
     // must be kept alive for the model
     _context: ggml::Context,
     _mmap: Option<Mmap>,
@@ -95,11 +92,7 @@ impl KnownModel for Bloom {

         let (_context, _, _mmap) = tl.finish();

-        let ModelParameters {
-            context_size,
-            inference_parameters,
-            ..
-        } = params;
+        let ModelParameters { context_size, .. } = params;

         Ok(Bloom {
             hyperparameters,
@@ -112,7 +105,6 @@ impl KnownModel for Bloom {
             out_norm_bias,
             output,
             layers,
-            inference_parameters,
             _context,
             _mmap,
         })
@@ -393,10 +385,6 @@ impl KnownModel for Bloom {
             .copied()
             .unwrap()
     }
-
-    fn inference_parameters(&self) -> &InferenceParameters {
-        &self.inference_parameters
-    }
 }
14 changes: 1 addition & 13 deletions crates/models/gpt2/src/lib.rs
@@ -34,9 +34,6 @@ pub struct Gpt2 {
     // weights for the model
     layers: Vec<Layer>,

-    // default parameters used by [InferenceSession::infer]
-    inference_parameters: InferenceParameters,
-
     // must be kept alive for the model
     _context: ggml::Context,
     _mmap: Option<Mmap>,
@@ -87,11 +84,7 @@ impl KnownModel for Gpt2 {

         let (_context, _, _mmap) = tl.finish();

-        let ModelParameters {
-            context_size,
-            inference_parameters,
-            ..
-        } = params;
+        let ModelParameters { context_size, .. } = params;

         Ok(Gpt2 {
             hyperparameters,
@@ -103,7 +96,6 @@ impl KnownModel for Gpt2 {
             wte,
             wpe,
             lm_head,
-            inference_parameters,
             _context,
             _mmap,
         })
@@ -349,10 +341,6 @@ impl KnownModel for Gpt2 {
             .copied()
             .unwrap()
     }
-
-    fn inference_parameters(&self) -> &InferenceParameters {
-        &self.inference_parameters
-    }
 }
14 changes: 1 addition & 13 deletions crates/models/gptj/src/lib.rs
@@ -35,9 +35,6 @@ pub struct GptJ {
     // weights for the model
     layers: Vec<Layer>,

-    // default parameters used by [InferenceSession::infer]
-    inference_parameters: InferenceParameters,
-
     // must be kept alive for the model
     _context: ggml::Context,
     _mmap: Option<Mmap>,
@@ -89,11 +86,7 @@ impl KnownModel for GptJ {

         let (_context, _, _mmap) = tl.finish();

-        let ModelParameters {
-            context_size,
-            inference_parameters,
-            ..
-        } = params;
+        let ModelParameters { context_size, .. } = params;

         Ok(GptJ {
             hyperparameters,
@@ -105,7 +98,6 @@ impl KnownModel for GptJ {
             lmh_g,
             lmh_b,
             layers,
-            inference_parameters,
             _mmap,
             _context,
         })
@@ -319,10 +311,6 @@ impl KnownModel for GptJ {
             .copied()
             .unwrap()
     }
-
-    fn inference_parameters(&self) -> &InferenceParameters {
-        &self.inference_parameters
-    }
 }
14 changes: 1 addition & 13 deletions crates/models/gptneox/src/lib.rs
@@ -35,9 +35,6 @@ pub struct GptNeoX {
     // weights for the model
     layers: Vec<Layer>,

-    // default parameters used by [InferenceSession::infer]
-    inference_parameters: InferenceParameters,
-
     // must be kept alive for the model
     _context: ggml::Context,
     _mmap: Option<Mmap>,
@@ -103,11 +100,7 @@ impl KnownModel for GptNeoX {

         let (_context, _, _mmap) = tl.finish();

-        let ModelParameters {
-            context_size,
-            inference_parameters,
-            ..
-        } = params;
+        let ModelParameters { context_size, .. } = params;

         Ok(GptNeoX {
             hyperparameters,
@@ -118,7 +111,6 @@ impl KnownModel for GptNeoX {
             wte,
             lmh_g,
             layers,
-            inference_parameters,
             _context,
             _mmap,
         })
@@ -400,10 +392,6 @@ impl KnownModel for GptNeoX {
             .copied()
             .unwrap()
     }
-
-    fn inference_parameters(&self) -> &InferenceParameters {
-        &self.inference_parameters
-    }
 }
(The remaining 2 changed files are not shown.)
