From fd10a3587bcadc7124a66f4945921993dfbbc5cb Mon Sep 17 00:00:00 2001
From: Philpax
Date: Mon, 17 Jul 2023 01:11:48 +0200
Subject: [PATCH] fix(llm): pass use_gpu to InferenceSession

---
 binaries/llm-cli/src/cli_args.rs         |  1 -
 crates/llm-base/src/inference_session.rs | 11 +++++------
 crates/models/bloom/src/lib.rs           | 14 ++++++--------
 crates/models/falcon/src/lib.rs          | 14 ++++++--------
 crates/models/gpt2/src/lib.rs            | 14 ++++++--------
 crates/models/gptj/src/lib.rs            | 14 ++++++--------
 crates/models/gptneox/src/lib.rs         | 14 ++++++--------
 crates/models/llama/src/lib.rs           | 18 +++++++-----------
 crates/models/mpt/src/lib.rs             | 14 ++++++--------
 9 files changed, 48 insertions(+), 66 deletions(-)

diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs
index ebd0398c..44388dc7 100644
--- a/binaries/llm-cli/src/cli_args.rs
+++ b/binaries/llm-cli/src/cli_args.rs
@@ -334,7 +334,6 @@ impl Generate {
         InferenceSessionConfig {
             memory_k_type: mem_typ,
             memory_v_type: mem_typ,
-            use_gpu: self.use_gpu,
             n_batch: self.batch_size,
             n_threads: self.num_threads(),
         }
diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs
index 42314f41..4db7e80a 100644
--- a/crates/llm-base/src/inference_session.rs
+++ b/crates/llm-base/src/inference_session.rs
@@ -113,6 +113,7 @@ impl InferenceSession {
     /// Create a new InferenceSession
     pub fn new(
         config: InferenceSessionConfig,
+        use_gpu: bool,
         n_ctx: usize,
         n_layer: usize,
         n_embd: usize,
@@ -137,7 +138,7 @@ impl InferenceSession {
             ctx_size
         };
 
-        if config.use_gpu {
+        if use_gpu {
             ggml::accelerator::initialize(0);
             ggml::accelerator::set_scratch_size(config.n_batch * 1024 * 1024);
         }
@@ -147,7 +148,7 @@ impl InferenceSession {
         // Initialize key + value memory tensors
         let n_mem = n_layer * n_ctx;
         let n_elements = n_embd * n_mem;
-        let (memory_k, memory_v) = kv_memory(&session_ctx, &config, n_elements);
+        let (memory_k, memory_v) = kv_memory(&session_ctx, &config, use_gpu, n_elements);
 
         let scratch = scratch_buffers();
 
@@ -784,8 +785,6 @@ pub struct InferenceSessionConfig {
     /// The type of the memory V tensor.
     pub memory_v_type: ModelKVMemoryType,
 
-    /// Whether to use GPU acceleration
-    pub use_gpu: bool,
     /// Controls batch/chunk size for prompt ingestion in [InferenceSession::feed_prompt].
     ///
     /// This is the number of tokens that will be ingested at once. This is useful for
@@ -817,7 +816,6 @@ impl Default for InferenceSessionConfig {
         Self {
             memory_k_type: ModelKVMemoryType::Float16,
             memory_v_type: ModelKVMemoryType::Float16,
-            use_gpu: false,
             n_batch: 8,
             n_threads: 8,
         }
@@ -980,6 +978,7 @@ pub fn conversation_inference_callback<'a, E: std::error::Error + Send + Sync +
 fn kv_memory(
     context: &Context,
     config: &InferenceSessionConfig,
+    use_gpu: bool,
     n_elements: usize,
 ) -> (Tensor, Tensor) {
     let memory_k = context
@@ -989,7 +988,7 @@ fn kv_memory(
         .new_tensor_1d(config.memory_v_type.into(), n_elements)
         .set_name("memory_v");
 
-    if config.use_gpu {
+    if use_gpu {
         // CUDA requires the K/V-Memory to be on the GPU but excluded from the scratch buffer.
         // For OpenCL this is a no-op.
         //
diff --git a/crates/models/bloom/src/lib.rs b/crates/models/bloom/src/lib.rs
index 78c9cec6..da2562d1 100644
--- a/crates/models/bloom/src/lib.rs
+++ b/crates/models/bloom/src/lib.rs
@@ -16,8 +16,7 @@ use llm_base::{
 /// # Safety
 /// This implements [Send] and [Sync] as it is immutable after construction.
 pub struct Bloom {
-    // the context size ("memory") the model should use when evaluating a prompt
-    context_size: usize,
+    params: ModelParameters,
 
     hyperparameters: Hyperparameters,
     tokenizer: Tokenizer,
@@ -91,11 +90,9 @@ impl KnownModel for Bloom {
 
         let context = tl.finish();
 
-        let ModelParameters { context_size, .. } = params;
-
         Ok(Bloom {
             hyperparameters,
-            context_size,
+            params,
             tokenizer,
             wte,
             norm,
@@ -111,7 +108,8 @@ impl KnownModel for Bloom {
     fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
         InferenceSession::new(
             config,
-            self.context_size,
+            self.params.use_gpu,
+            self.params.context_size,
             self.hyperparameters.n_layer,
             self.hyperparameters.n_embd,
             self.hyperparameters.n_vocab,
@@ -126,7 +124,7 @@ impl KnownModel for Bloom {
     ) {
         let input_len = input_tokens.len();
         let session_len = session.n_past;
-        let ctx_size = self.context_size;
+        let ctx_size = self.params.context_size;
 
         let Hyperparameters {
             n_vocab,
@@ -376,7 +374,7 @@ impl KnownModel for Bloom {
     }
 
     fn context_size(&self) -> usize {
-        self.context_size
+        self.params.context_size
     }
 
     fn bot_token_id(&self) -> Option<TokenId> {
diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs
index 02fcd933..ee7603dc 100644
--- a/crates/models/falcon/src/lib.rs
+++ b/crates/models/falcon/src/lib.rs
@@ -22,8 +22,7 @@ use llm_base::{
 /// # Safety
 /// This implements [Send] and [Sync] as it is immutable after construction.
 pub struct Falcon {
-    // the context size ("memory") the model should use when evaluating a prompt
-    context_size: usize,
+    params: ModelParameters,
 
     hyperparameters: Hyperparameters,
 
@@ -83,11 +82,9 @@ impl KnownModel for Falcon {
 
         let context = tl.finish();
 
-        let ModelParameters { context_size, .. } = params;
-
         Ok(Falcon {
             hyperparameters,
-            context_size,
+            params,
             tokenizer,
             tok_embeddings,
             output_norm,
@@ -101,7 +98,8 @@ impl KnownModel for Falcon {
     fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
         InferenceSession::new(
             config,
-            self.context_size,
+            self.params.use_gpu,
+            self.params.context_size,
             self.hyperparameters.n_layer,
             self.hyperparameters.n_embd,
             self.hyperparameters.n_vocab,
@@ -116,7 +114,7 @@ impl KnownModel for Falcon {
     ) {
         let input_len = input_tokens.len();
         let session_len = session.n_past;
-        let ctx_size = self.context_size;
+        let ctx_size = self.params.context_size;
 
         let Hyperparameters {
             n_embd,
@@ -335,7 +333,7 @@ impl KnownModel for Falcon {
     }
 
     fn context_size(&self) -> usize {
-        self.context_size
+        self.params.context_size
     }
 
     fn bot_token_id(&self) -> Option<TokenId> {
diff --git a/crates/models/gpt2/src/lib.rs b/crates/models/gpt2/src/lib.rs
index 534f46e7..0d08db3e 100644
--- a/crates/models/gpt2/src/lib.rs
+++ b/crates/models/gpt2/src/lib.rs
@@ -16,8 +16,7 @@ use llm_base::{
 /// # Safety
 /// This implements [Send] and [Sync] as it is immutable after construction.
 pub struct Gpt2 {
-    // the context size ("memory") the model should use when evaluating a prompt
-    context_size: usize,
+    params: ModelParameters,
 
     hyperparameters: Hyperparameters,
     tokenizer: Tokenizer,
@@ -88,11 +87,9 @@ impl KnownModel for Gpt2 {
 
         let context = tl.finish();
 
-        let ModelParameters { context_size, .. } = params;
-
         Ok(Gpt2 {
             hyperparameters,
-            context_size,
+            params,
             tokenizer,
             layers,
             ln_f_g,
@@ -107,7 +104,8 @@ impl KnownModel for Gpt2 {
     fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
         InferenceSession::new(
             config,
-            self.context_size,
+            self.params.use_gpu,
+            self.params.context_size,
             self.hyperparameters.n_layer,
             self.hyperparameters.n_embd,
             self.hyperparameters.n_vocab,
@@ -122,7 +120,7 @@ impl KnownModel for Gpt2 {
     ) {
         let input_len = input_tokens.len();
         let session_len = session.n_past;
-        let ctx_size = self.context_size;
+        let ctx_size = self.params.context_size;
 
         let Hyperparameters {
             n_embd,
@@ -336,7 +334,7 @@ impl KnownModel for Gpt2 {
     }
 
     fn context_size(&self) -> usize {
-        self.context_size
+        self.params.context_size
     }
 
     fn bot_token_id(&self) -> Option<TokenId> {
diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs
index 487123e0..10ea84d3 100644
--- a/crates/models/gptj/src/lib.rs
+++ b/crates/models/gptj/src/lib.rs
@@ -16,8 +16,7 @@ use llm_base::{
 /// # Safety
 /// This implements [Send] and [Sync] as it is immutable after construction.
 pub struct GptJ {
-    // the context size ("memory") the model should use when evaluating a prompt
-    context_size: usize,
+    params: ModelParameters,
 
     hyperparameters: Hyperparameters,
     tokenizer: Tokenizer,
@@ -83,11 +82,9 @@ impl KnownModel for GptJ {
 
         let context = tl.finish();
 
-        let ModelParameters { context_size, .. } = params;
-
         Ok(GptJ {
             hyperparameters,
-            context_size,
+            params,
             tokenizer,
             ln_f_g,
             ln_f_b,
@@ -102,7 +99,8 @@ impl KnownModel for GptJ {
     fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
         InferenceSession::new(
             config,
-            self.context_size,
+            self.params.use_gpu,
+            self.params.context_size,
             self.hyperparameters.n_layer,
             self.hyperparameters.n_embd,
             self.hyperparameters.n_vocab,
@@ -117,7 +115,7 @@ impl KnownModel for GptJ {
     ) {
         let input_len = input_tokens.len();
         let session_len = session.n_past;
-        let ctx_size = self.context_size;
+        let ctx_size = self.params.context_size;
 
         let Hyperparameters {
             n_embd,
@@ -298,7 +296,7 @@ impl KnownModel for GptJ {
     }
 
     fn context_size(&self) -> usize {
-        self.context_size
+        self.params.context_size
     }
 
     fn bot_token_id(&self) -> Option<TokenId> {
diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs
index 83187273..9ab13d53 100644
--- a/crates/models/gptneox/src/lib.rs
+++ b/crates/models/gptneox/src/lib.rs
@@ -17,8 +17,7 @@ use llm_base::{
 /// # Safety
 /// This implements [Send] and [Sync] as it is immutable after construction.
 pub struct GptNeoX {
-    // the context size ("memory") the model should use when evaluating a prompt
-    context_size: usize,
+    params: ModelParameters,
 
     hyperparameters: Hyperparameters,
     tokenizer: Tokenizer,
@@ -97,11 +96,9 @@ impl KnownModel for GptNeoX {
 
         let context = tl.finish();
 
-        let ModelParameters { context_size, .. } = params;
-
         Ok(GptNeoX {
             hyperparameters,
-            context_size,
+            params,
             tokenizer,
             ln_f_g,
             ln_f_b,
@@ -115,7 +112,8 @@ impl KnownModel for GptNeoX {
     fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
         InferenceSession::new(
             config,
-            self.context_size,
+            self.params.use_gpu,
+            self.params.context_size,
             self.hyperparameters.n_layer,
             self.hyperparameters.n_embd,
             self.hyperparameters.n_vocab,
@@ -132,7 +130,7 @@ impl KnownModel for GptNeoX {
     ) {
         let n = input_tokens.len();
         let n_past = session.n_past;
-        let n_ctx = self.context_size;
+        let n_ctx = self.params.context_size;
 
         let Hyperparameters {
             n_embd,
@@ -344,7 +342,7 @@ impl KnownModel for GptNeoX {
     }
 
     fn context_size(&self) -> usize {
-        self.context_size
+        self.params.context_size
     }
 
     fn bot_token_id(&self) -> Option<TokenId> {
diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs
index 0b4d185b..d8ed0a94 100644
--- a/crates/models/llama/src/lib.rs
+++ b/crates/models/llama/src/lib.rs
@@ -15,9 +15,7 @@ use llm_base::{
 /// # Safety
 /// This implements [Send] and [Sync] as it is immutable after construction.
 pub struct Llama {
-    // the context size ("memory") the model should use when evaluating a prompt
-    context_size: usize,
-    model_params: ModelParameters,
+    params: ModelParameters,
 
     hyperparameters: Hyperparameters,
     tokenizer: Tokenizer,
@@ -96,12 +94,9 @@ impl KnownModel for Llama {
         }
         let context = tl.finish();
 
-        let ModelParameters { context_size, .. } = params;
-
         Ok(Self {
             hyperparameters,
-            model_params: params,
-            context_size,
+            params,
             tokenizer,
             wte,
             norm,
@@ -115,7 +110,8 @@ impl KnownModel for Llama {
     fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
         InferenceSession::new(
             config,
-            self.context_size,
+            self.params.use_gpu,
+            self.params.context_size,
             self.hyperparameters.n_layer,
             self.hyperparameters.n_embd,
             self.hyperparameters.n_vocab,
@@ -131,7 +127,7 @@ impl KnownModel for Llama {
     ) {
         let input_len = input_tokens.len();
         let session_len = session.n_past;
-        let ctx_size = self.context_size;
+        let ctx_size = self.params.context_size;
 
         let Hyperparameters {
             n_vocab,
@@ -152,7 +148,7 @@ impl KnownModel for Llama {
         let mut gf = ggml::ComputationGraph::new();
 
         for il in 0..n_layer {
-            ctx0.set_offloading(self.model_params.should_offload(il));
+            ctx0.set_offloading(self.params.should_offload(il));
 
             let input_self_attention = input_layer.share();
             let mut current: ggml::Tensor;
@@ -353,7 +349,7 @@ impl KnownModel for Llama {
     }
 
     fn context_size(&self) -> usize {
-        self.context_size
+        self.params.context_size
     }
 
     fn bot_token_id(&self) -> Option<TokenId> {
diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs
index 107c47aa..3e863184 100644
--- a/crates/models/mpt/src/lib.rs
+++ b/crates/models/mpt/src/lib.rs
@@ -16,8 +16,7 @@ use llm_base::{
 /// # Safety
 /// This implements [Send] and [Sync] as it is immutable after construction.
 pub struct Mpt {
-    // the context size ("memory") the model should use when evaluating a prompt
-    context_size: usize,
+    params: ModelParameters,
 
     hyperparameters: Hyperparameters,
     tokenizer: Tokenizer,
@@ -72,11 +71,9 @@ impl KnownModel for Mpt {
 
         let context = tl.finish();
 
-        let ModelParameters { context_size, .. } = params;
-
         Ok(Mpt {
             hyperparameters,
-            context_size,
+            params,
             tokenizer,
             wte,
             norm,
@@ -88,7 +85,8 @@ impl KnownModel for Mpt {
     fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
         InferenceSession::new(
             config,
-            self.context_size,
+            self.params.use_gpu,
+            self.params.context_size,
             self.hyperparameters.n_layer,
             self.hyperparameters.n_embd,
             self.hyperparameters.n_vocab,
@@ -103,7 +101,7 @@ impl KnownModel for Mpt {
     ) {
         let n = input_tokens.len();
         let session_len = session.n_past;
-        let ctx_size = self.context_size;
+        let ctx_size = self.params.context_size;
 
         let Hyperparameters {
             n_embd,
@@ -278,7 +276,7 @@ impl KnownModel for Mpt {
     }
 
     fn context_size(&self) -> usize {
-        self.context_size
+        self.params.context_size
    }
 
     fn bot_token_id(&self) -> Option<TokenId> {
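
A sketch of the resulting call-site shape, for reference (not part of the
patch; it assumes the `llm` crate re-exports `ModelParameters` and
`InferenceSessionConfig`, and that `model` was already loaded, e.g. via
`llm::load`; unrelated fields are elided with `..Default::default()`):

    use llm::{InferenceSessionConfig, ModelParameters};

    // GPU acceleration is now chosen once, at model load time...
    let params = ModelParameters {
        use_gpu: true,
        ..Default::default()
    };

    // ...so the session config no longer carries a `use_gpu` field.
    let config = InferenceSessionConfig::default();

    // start_session() is unchanged for callers; internally it now forwards
    // self.params.use_gpu to InferenceSession::new.
    let mut session = model.start_session(config);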