Commit fd10a35: fix(llm): pass use_gpu to InferenceSession

philpax committed Jul 16, 2023
1 parent 6bf657c commit fd10a35
Showing 9 changed files with 48 additions and 66 deletions.
1 change: 0 additions & 1 deletion binaries/llm-cli/src/cli_args.rs

@@ -334,7 +334,6 @@ impl Generate {
         InferenceSessionConfig {
             memory_k_type: mem_typ,
             memory_v_type: mem_typ,
-            use_gpu: self.use_gpu,
             n_batch: self.batch_size,
             n_threads: self.num_threads(),
         }
11 changes: 5 additions & 6 deletions crates/llm-base/src/inference_session.rs

@@ -113,6 +113,7 @@ impl InferenceSession {
     /// Create a new InferenceSession
     pub fn new(
         config: InferenceSessionConfig,
+        use_gpu: bool,
         n_ctx: usize,
         n_layer: usize,
         n_embd: usize,
@@ -137,7 +138,7 @@
             ctx_size
         };
 
-        if config.use_gpu {
+        if use_gpu {
             ggml::accelerator::initialize(0);
             ggml::accelerator::set_scratch_size(config.n_batch * 1024 * 1024);
         }
@@ -147,7 +148,7 @@
         // Initialize key + value memory tensors
         let n_mem = n_layer * n_ctx;
         let n_elements = n_embd * n_mem;
-        let (memory_k, memory_v) = kv_memory(&session_ctx, &config, n_elements);
+        let (memory_k, memory_v) = kv_memory(&session_ctx, &config, use_gpu, n_elements);
 
         let scratch = scratch_buffers();
@@ -784,8 +785,6 @@ pub struct InferenceSessionConfig {
     /// The type of the memory V tensor.
     pub memory_v_type: ModelKVMemoryType,
 
-    /// Whether to use GPU acceleration
-    pub use_gpu: bool,
     /// Controls batch/chunk size for prompt ingestion in [InferenceSession::feed_prompt].
     ///
     /// This is the number of tokens that will be ingested at once. This is useful for
@@ -817,7 +816,6 @@ impl Default for InferenceSessionConfig {
         Self {
             memory_k_type: ModelKVMemoryType::Float16,
             memory_v_type: ModelKVMemoryType::Float16,
-            use_gpu: false,
             n_batch: 8,
             n_threads: 8,
         }
@@ -980,6 +978,7 @@ pub fn conversation_inference_callback<'a, E: std::error::Error + Send + Sync +
 fn kv_memory(
     context: &Context,
     config: &InferenceSessionConfig,
+    use_gpu: bool,
     n_elements: usize,
 ) -> (Tensor, Tensor) {
     let memory_k = context
@@ -989,7 +988,7 @@ fn kv_memory(
         .new_tensor_1d(config.memory_v_type.into(), n_elements)
         .set_name("memory_v");
 
-    if config.use_gpu {
+    if use_gpu {
         // CUDA requires the K/V-Memory to be on the GPU but excluded from the scratch buffer.
         // For OpenCL this is a no-op.
         //
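Taken together, these hunks move GPU selection out of the session config and into an explicit argument. A minimal sketch of the new call shape, assuming the types are importable from llm_base as laid out above; the dimension values are placeholders, not from this commit:

    use llm_base::{InferenceSession, InferenceSessionConfig, ModelKVMemoryType};

    // The config no longer carries use_gpu; these are its remaining fields.
    let config = InferenceSessionConfig {
        memory_k_type: ModelKVMemoryType::Float16,
        memory_v_type: ModelKVMemoryType::Float16,
        n_batch: 8,
        n_threads: 8,
    };

    // use_gpu now travels next to the config. The model dimensions
    // (n_ctx, n_layer, n_embd, n_vocab) below are hypothetical.
    let session = InferenceSession::new(
        config, /* use_gpu */ true, /* n_ctx */ 2048,
        /* n_layer */ 32, /* n_embd */ 4096, /* n_vocab */ 32000,
    );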
14 changes: 6 additions & 8 deletions crates/models/bloom/src/lib.rs

@@ -16,8 +16,7 @@ use llm_base::{
 /// # Safety
 /// This implements [Send] and [Sync] as it is immutable after construction.
 pub struct Bloom {
-    // the context size ("memory") the model should use when evaluating a prompt
-    context_size: usize,
+    params: ModelParameters,
 
     hyperparameters: Hyperparameters,
     tokenizer: Tokenizer,
@@ -91,11 +90,9 @@
 
         let context = tl.finish();
 
-        let ModelParameters { context_size, .. } = params;
-
         Ok(Bloom {
             hyperparameters,
-            context_size,
+            params,
             tokenizer,
             wte,
             norm,
@@ -111,7 +108,8 @@
     fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
         InferenceSession::new(
             config,
-            self.context_size,
+            self.params.use_gpu,
+            self.params.context_size,
             self.hyperparameters.n_layer,
             self.hyperparameters.n_embd,
             self.hyperparameters.n_vocab,
@@ -126,7 +124,7 @@
     ) {
         let input_len = input_tokens.len();
         let session_len = session.n_past;
-        let ctx_size = self.context_size;
+        let ctx_size = self.params.context_size;
 
         let Hyperparameters {
             n_vocab,
@@ -376,7 +374,7 @@
     }
 
     fn context_size(&self) -> usize {
-        self.context_size
+        self.params.context_size
     }
 
     fn bot_token_id(&self) -> Option<TokenId> {
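Every model crate below repeats the change shown here for Bloom: drop the cached context_size field, keep the whole ModelParameters, and have start_session forward both params.use_gpu and params.context_size. The practical effect, sketched under the assumption that ModelParameters exposes use_gpu at load time and implements Default (the self.params.use_gpu reads imply the former; the latter is not shown in this diff), is that GPU use is chosen when the model is constructed rather than per session:

    // Hypothetical caller, illustrating where use_gpu now lives;
    // `model` stands in for any loaded KnownModel.
    let params = ModelParameters {
        use_gpu: true,          // decided once, at model load time...
        ..Default::default()    // assumption: ModelParameters implements Default
    };
    // ...then every session started from a model built with these
    // parameters inherits it; the session config stays GPU-agnostic.
    let session = model.start_session(InferenceSessionConfig::default());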
14 changes: 6 additions & 8 deletions crates/models/falcon/src/lib.rs

@@ -22,8 +22,7 @@ use llm_base::{
 /// # Safety
 /// This implements [Send] and [Sync] as it is immutable after construction.
 pub struct Falcon {
-    // the context size ("memory") the model should use when evaluating a prompt
-    context_size: usize,
+    params: ModelParameters,
 
     hyperparameters: Hyperparameters,
 
@@ -83,11 +82,9 @@
 
         let context = tl.finish();
 
-        let ModelParameters { context_size, .. } = params;
-
         Ok(Falcon {
             hyperparameters,
-            context_size,
+            params,
             tokenizer,
             tok_embeddings,
             output_norm,
@@ -101,7 +98,8 @@
     fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
         InferenceSession::new(
             config,
-            self.context_size,
+            self.params.use_gpu,
+            self.params.context_size,
             self.hyperparameters.n_layer,
             self.hyperparameters.n_embd,
             self.hyperparameters.n_vocab,
@@ -116,7 +114,7 @@
     ) {
         let input_len = input_tokens.len();
         let session_len = session.n_past;
-        let ctx_size = self.context_size;
+        let ctx_size = self.params.context_size;
 
         let Hyperparameters {
             n_embd,
@@ -335,7 +333,7 @@
     }
 
     fn context_size(&self) -> usize {
-        self.context_size
+        self.params.context_size
     }
 
     fn bot_token_id(&self) -> Option<TokenId> {
14 changes: 6 additions & 8 deletions crates/models/gpt2/src/lib.rs

@@ -16,8 +16,7 @@ use llm_base::{
 /// # Safety
 /// This implements [Send] and [Sync] as it is immutable after construction.
 pub struct Gpt2 {
-    // the context size ("memory") the model should use when evaluating a prompt
-    context_size: usize,
+    params: ModelParameters,
 
     hyperparameters: Hyperparameters,
     tokenizer: Tokenizer,
@@ -88,11 +87,9 @@
 
         let context = tl.finish();
 
-        let ModelParameters { context_size, .. } = params;
-
         Ok(Gpt2 {
             hyperparameters,
-            context_size,
+            params,
             tokenizer,
             layers,
             ln_f_g,
@@ -107,7 +104,8 @@
    fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
         InferenceSession::new(
             config,
-            self.context_size,
+            self.params.use_gpu,
+            self.params.context_size,
             self.hyperparameters.n_layer,
             self.hyperparameters.n_embd,
             self.hyperparameters.n_vocab,
@@ -122,7 +120,7 @@
     ) {
         let input_len = input_tokens.len();
         let session_len = session.n_past;
-        let ctx_size = self.context_size;
+        let ctx_size = self.params.context_size;
 
         let Hyperparameters {
             n_embd,
@@ -336,7 +334,7 @@
     }
 
     fn context_size(&self) -> usize {
-        self.context_size
+        self.params.context_size
     }
 
     fn bot_token_id(&self) -> Option<TokenId> {
14 changes: 6 additions & 8 deletions crates/models/gptj/src/lib.rs

@@ -16,8 +16,7 @@ use llm_base::{
 /// # Safety
 /// This implements [Send] and [Sync] as it is immutable after construction.
 pub struct GptJ {
-    // the context size ("memory") the model should use when evaluating a prompt
-    context_size: usize,
+    params: ModelParameters,
 
     hyperparameters: Hyperparameters,
     tokenizer: Tokenizer,
@@ -83,11 +82,9 @@
 
         let context = tl.finish();
 
-        let ModelParameters { context_size, .. } = params;
-
         Ok(GptJ {
             hyperparameters,
-            context_size,
+            params,
             tokenizer,
             ln_f_g,
             ln_f_b,
@@ -102,7 +99,8 @@
     fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
         InferenceSession::new(
             config,
-            self.context_size,
+            self.params.use_gpu,
+            self.params.context_size,
             self.hyperparameters.n_layer,
             self.hyperparameters.n_embd,
             self.hyperparameters.n_vocab,
@@ -117,7 +115,7 @@
     ) {
         let input_len = input_tokens.len();
         let session_len = session.n_past;
-        let ctx_size = self.context_size;
+        let ctx_size = self.params.context_size;
 
         let Hyperparameters {
             n_embd,
@@ -298,7 +296,7 @@
     }
 
     fn context_size(&self) -> usize {
-        self.context_size
+        self.params.context_size
     }
 
     fn bot_token_id(&self) -> Option<TokenId> {
14 changes: 6 additions & 8 deletions crates/models/gptneox/src/lib.rs

@@ -17,8 +17,7 @@ use llm_base::{
 /// # Safety
 /// This implements [Send] and [Sync] as it is immutable after construction.
 pub struct GptNeoX {
-    // the context size ("memory") the model should use when evaluating a prompt
-    context_size: usize,
+    params: ModelParameters,
 
     hyperparameters: Hyperparameters,
     tokenizer: Tokenizer,
@@ -97,11 +96,9 @@
 
         let context = tl.finish();
 
-        let ModelParameters { context_size, .. } = params;
-
         Ok(GptNeoX {
             hyperparameters,
-            context_size,
+            params,
             tokenizer,
             ln_f_g,
             ln_f_b,
@@ -115,7 +112,8 @@
     fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
         InferenceSession::new(
             config,
-            self.context_size,
+            self.params.use_gpu,
+            self.params.context_size,
             self.hyperparameters.n_layer,
             self.hyperparameters.n_embd,
             self.hyperparameters.n_vocab,
@@ -132,7 +130,7 @@
     ) {
         let n = input_tokens.len();
         let n_past = session.n_past;
-        let n_ctx = self.context_size;
+        let n_ctx = self.params.context_size;
 
         let Hyperparameters {
             n_embd,
@@ -344,7 +342,7 @@
     }
 
     fn context_size(&self) -> usize {
-        self.context_size
+        self.params.context_size
     }
 
     fn bot_token_id(&self) -> Option<TokenId> {
18 changes: 7 additions & 11 deletions crates/models/llama/src/lib.rs

@@ -15,9 +15,7 @@ use llm_base::{
 /// # Safety
 /// This implements [Send] and [Sync] as it is immutable after construction.
 pub struct Llama {
-    // the context size ("memory") the model should use when evaluating a prompt
-    context_size: usize,
-    model_params: ModelParameters,
+    params: ModelParameters,
     hyperparameters: Hyperparameters,
     tokenizer: Tokenizer,
 
@@ -96,12 +94,9 @@
         }
         let context = tl.finish();
 
-        let ModelParameters { context_size, .. } = params;
-
         Ok(Self {
             hyperparameters,
-            model_params: params,
-            context_size,
+            params,
             tokenizer,
             wte,
             norm,
@@ -115,7 +110,8 @@
     fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
         InferenceSession::new(
             config,
-            self.context_size,
+            self.params.use_gpu,
+            self.params.context_size,
             self.hyperparameters.n_layer,
             self.hyperparameters.n_embd,
             self.hyperparameters.n_vocab,
@@ -131,7 +127,7 @@
     ) {
         let input_len = input_tokens.len();
         let session_len = session.n_past;
-        let ctx_size = self.context_size;
+        let ctx_size = self.params.context_size;
 
         let Hyperparameters {
             n_vocab,
@@ -152,7 +148,7 @@
         let mut gf = ggml::ComputationGraph::new();
 
         for il in 0..n_layer {
-            ctx0.set_offloading(self.model_params.should_offload(il));
+            ctx0.set_offloading(self.params.should_offload(il));
 
             let input_self_attention = input_layer.share();
             let mut current: ggml::Tensor;
@@ -353,7 +349,7 @@
     }
 
     fn context_size(&self) -> usize {
-        self.context_size
+        self.params.context_size
     }
 
     fn bot_token_id(&self) -> Option<TokenId> {
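The llama crate additionally folds the old model_params field into params, so the per-layer offloading decision reads from the same struct as the new use_gpu flag. A hedged sketch of that loop; only the call site appears in this commit, and the comment about should_offload's policy is an assumption:

    for il in 0..n_layer {
        // Presumably consults params.use_gpu (and perhaps an offloaded-layer
        // budget); the body of should_offload is not shown in this diff.
        ctx0.set_offloading(self.params.should_offload(il));
        // ... build this layer's portion of the compute graph ...
    }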