Fixed stack-overflow in debug mode
LLukas22 committed Aug 5, 2023
1 parent bdd9c7d commit 605e0f5
Showing 11 changed files with 103 additions and 60 deletions.
18 changes: 17 additions & 1 deletion crates/ggml/src/context.rs
@@ -9,7 +9,8 @@ use std::{
use memmap2::Mmap;

use crate::{
accelerator::Backend, sys, usize_to_i32, usize_to_i64, Buffer, RoPEOverrides, Tensor, Type,
accelerator::Backend, sys, usize_to_i32, usize_to_i64, Buffer, ComputationGraph, RoPEOverrides,
Tensor, Type,
};

/// Acts as a RAII-guard over a `sys::ggml_context`, allocating via
@@ -171,6 +172,21 @@ impl Context {
*self = Self::new(self.storage.take().unwrap());
}

/// Create a new [ComputationGraph] in this context.
pub fn create_compute_graph(&self) -> ComputationGraph {
let context = self.inner.to_owned().ptr.as_ptr();
unsafe {
let graph = sys::ggml_new_graph(context);
ComputationGraph::from_raw(graph)
}
}

/// Prints all ggml objects in this context. Mainly used for debugging.
pub fn list_ggml_objects(&self) {
let context = self.inner.to_owned().ptr.as_ptr();
unsafe { sys::ggml_print_objects(context) }
}

/// If offloading is enabled, all tensors created by this context will be offloaded to the GPU
pub fn set_offloading(&mut self, can_offload: bool) {
self.can_offload = can_offload;
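Not part of the commit: a minimal usage sketch of the two methods added above, together with the existing graph-building call from `crates/ggml/src/lib.rs`. It assumes a `Context` and an output `Tensor` already built the usual way; the point is that the graph now lives inside the ggml context instead of being zeroed on the Rust stack.

use ggml::{ComputationGraph, Context, Tensor};

fn build_graph(ctx: &Context, output: &Tensor) -> ComputationGraph {
    // ggml_new_graph places the cgraph inside the context's memory pool;
    // the Rust side only keeps a raw pointer to it.
    let mut graph = ctx.create_compute_graph();
    graph.build_forward_expand(output);
    // Debug helper added in this commit: dump every object the context owns.
    ctx.list_ggml_objects();
    graph
}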
34 changes: 14 additions & 20 deletions crates/ggml/src/lib.rs
@@ -21,6 +21,7 @@ pub mod util;
pub mod accelerator;

pub use context::{Context, ContextStorage};

pub use tensor::Tensor;

pub use ggml_sys as sys;
@@ -319,44 +320,32 @@ impl Drop for Buffer {

/// A `ggml` computation graph. Keeps track of all state during computation.
pub struct ComputationGraph {
inner: sys::ggml_cgraph,
inner: *mut sys::ggml_cgraph,
}

impl ComputationGraph {
/// Create a new [ComputationGraph] with the specified `n_threads`.
pub fn new() -> Self {
Self {
inner: sys::ggml_cgraph {
// SAFETY: This should be safe to zero. The original C++ impl
// just leaves it uninitialized
..unsafe { std::mem::zeroed::<sys::ggml_cgraph>() }
},
}
/// Create a new [ComputationGraph] from a raw [sys::ggml_cgraph].
pub fn from_raw(raw_context: *mut sys::ggml_cgraph) -> Self {
Self { inner: raw_context }
}

/// Build this computational graph in the forward direction in preparation for computation.
pub fn build_forward_expand(&mut self, tensor: &Tensor) {
unsafe { sys::ggml_build_forward_expand(&mut self.inner, tensor.ptr.as_ptr()) }
}
}

impl Default for ComputationGraph {
fn default() -> Self {
Self::new()
unsafe { sys::ggml_build_forward_expand(self.inner, tensor.ptr.as_ptr()) }
}
}

/// A `ggml` execution plan. Contains the information needed to execute a computation graph.
pub struct GraphExecutionPlan {
inner: sys::ggml_cplan,
inner_graph: sys::ggml_cgraph,
inner_graph: *mut sys::ggml_cgraph,
}

impl GraphExecutionPlan {
/// Create a new [GraphExecutionPlan] from a [ComputationGraph] and the number of threads to use.
pub fn new(graph: &mut ComputationGraph, n_threads: usize) -> Self {
Self {
inner: unsafe { sys::ggml_graph_plan(&mut graph.inner, usize_to_i32(n_threads)) },
inner: unsafe { sys::ggml_graph_plan(graph.inner, usize_to_i32(n_threads)) },
inner_graph: graph.inner,
}
}
@@ -383,7 +372,7 @@ impl GraphExecutionPlan {
self.assign_work_buffer(&mut work_buffer);

unsafe {
sys::ggml_graph_compute(&mut self.inner_graph, &mut self.inner);
sys::ggml_graph_compute(self.inner_graph, &mut self.inner);
}
}
}
@@ -502,3 +491,8 @@ pub fn cpu_has_blas() -> bool {
pub fn cpu_has_gpublas() -> bool {
unsafe { sys::ggml_cpu_has_gpublas() != 0 }
}

/// Returns the graph overhead in bytes.
pub fn graph_overhead() -> usize {
unsafe { sys::ggml_graph_overhead() }
}
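Not part of the commit: a toy illustration of the failure mode this file's change addresses. The removed `ComputationGraph::new` zeroed a full `sys::ggml_cgraph` on the Rust stack; that struct embeds large fixed-size node arrays, debug builds do not optimise the temporary away, and a deep call chain can then exhaust the default stack. The new code stores only a `*mut sys::ggml_cgraph` pointing into the context. The struct below and its sizes are stand-ins, not ggml's real layout.

// stand-in for ggml's fixed-size nodes/grads/leafs pointer arrays
struct BigGraph {
    nodes: [u64; 16 * 1024],
    grads: [u64; 16 * 1024],
    leafs: [u64; 16 * 1024],
}

// Old shape: the whole zeroed value is a stack temporary in debug builds.
fn graph_by_value() -> BigGraph {
    unsafe { std::mem::zeroed() }
}

// New shape: the caller only handles a pointer; the storage lives elsewhere
// (in ggml's case, inside the context's own memory pool).
fn graph_behind_pointer(storage: &mut BigGraph) -> *mut BigGraph {
    storage as *mut BigGraph
}

fn main() {
    println!("by-value graph: {} bytes on the stack", std::mem::size_of::<BigGraph>());
    let mut storage: Box<BigGraph> = Box::new(unsafe { std::mem::zeroed() });
    let _ptr = graph_behind_pointer(&mut storage);
    let _g = graph_by_value();
}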
2 changes: 1 addition & 1 deletion crates/llm-base/src/inference_session.rs
@@ -170,7 +170,7 @@ impl InferenceSession {
} else {
1024
};
buf_size_mb * 1024 * 1024
buf_size_mb * 1024 * 1024 + ggml::graph_overhead()
};

let eval = Buffer::new(buf_size);
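Not part of the commit: the sizing above spelled out as a small function. The 1024 MiB figure mirrors the fallback branch visible in the hunk and is an assumption about the surrounding code; the extra term reserves room for the graph object, which is now allocated out of the same eval context.

fn eval_buffer_size() -> usize {
    // assumed fallback from the branch above; the real code derives the size
    // from the model when possible
    let buf_size_mb: usize = 1024;
    // activation payload plus room for the cgraph object itself
    buf_size_mb * 1024 * 1024 + ggml::graph_overhead()
}

The result then feeds into `Buffer::new(buf_size)`, exactly as the unchanged context line shows.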
2 changes: 1 addition & 1 deletion crates/llm-base/src/lora.rs
@@ -114,7 +114,7 @@ impl LoraAdapter {

//Build a ggml context and apply the patch

let mut gf = ggml::ComputationGraph::new();
let mut gf = patch_context.create_compute_graph();

// LoRA formula: w = w + ba*s
let mut ba = patch_context.op_mul_mat(&a, &b);
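Not part of the commit: a sketch of how the patch graph might be assembled around the changed line, with illustrative names for the adapter tensors. Only `create_compute_graph`, `op_mul_mat` and `build_forward_expand` are confirmed by this diff; the scaling by `s` and the final add of the LoRA formula `w = w + ba*s` are left as comments.

use ggml::{ComputationGraph, Context, Tensor};

fn build_patch_graph(patch_context: &Context, a: &Tensor, b: &Tensor) -> ComputationGraph {
    // the graph lives in the patch context, not on the stack
    let mut gf = patch_context.create_compute_graph();
    // LoRA delta: ba = b x a  (scaling by `s` and adding to `w` omitted here)
    let ba = patch_context.op_mul_mat(a, b);
    gf.build_forward_expand(&ba);
    gf
}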
2 changes: 1 addition & 1 deletion crates/models/bloom/src/lib.rs
@@ -148,7 +148,7 @@ impl KnownModel for Bloom {
input_layer = ctx0.op_mul(&input_layer, &self.norm);
input_layer = ctx0.op_add(&input_layer, &self.norm_bias);

let mut gf = ggml::ComputationGraph::new();
let mut gf = ctx0.create_compute_graph();
for il in 0..n_layer {
let input_self_attention = input_layer.share();
let mut current: ggml::Tensor;
2 changes: 1 addition & 1 deletion crates/models/falcon/src/lib.rs
@@ -159,7 +159,7 @@ impl KnownModel for Falcon {
let memory_v = builder.memory_v;
let memory_v_size = memory_v.element_size();

let mut gf = ggml::ComputationGraph::new();
let mut gf = ctx0.create_compute_graph();

let mut current: Tensor;
let mut layernorm_output: Tensor;
2 changes: 1 addition & 1 deletion crates/models/gpt2/src/lib.rs
@@ -147,7 +147,7 @@ impl KnownModel for Gpt2 {
&ctx0.op_get_rows(&self.wpe, &position),
);

let mut gf = ggml::ComputationGraph::new();
let mut gf = ctx0.create_compute_graph();
for il in 0..n_layer {
ctx0.use_scratch(builder.get_scratch(0));

2 changes: 1 addition & 1 deletion crates/models/gptj/src/lib.rs
@@ -160,7 +160,7 @@ impl KnownModel for GptJ {

let mut input_layer = ctx0.op_get_rows(&self.wte, embd);

let mut gf = ggml::ComputationGraph::new();
let mut gf = ctx0.create_compute_graph();
for il in 0..n_layer {
ctx0.set_offloading(self.params.should_offload(il));

95 changes: 64 additions & 31 deletions crates/models/gptneox/src/lib.rs
@@ -57,38 +57,70 @@ impl KnownModel for GptNeoX {

// model-global weights
let wte = tl.load("gpt_neox.embed_in.weight")?;
let ln_f_g = tl.load("gpt_neox.final_layer_norm.weight")?;
let ln_f_b = tl.load("gpt_neox.final_layer_norm.bias")?;
let lmh_g = tl.load("embed_out.weight")?;

let backend = params.backend(0);

let ln_f_g = tl
.load("gpt_neox.final_layer_norm.weight")?
.transfer_to(backend);
let ln_f_b = tl
.load("gpt_neox.final_layer_norm.bias")?
.transfer_to(backend);
let lmh_g = tl.load("embed_out.weight")?.transfer_to(backend);

let mut layers = Vec::new();
for i in 0..hyperparameters.n_layer {
let backend = params.backend(i);
let layer = Layer {
ln_1_g: tl.load(&format!("gpt_neox.layers.{i}.input_layernorm.weight"))?,
ln_1_b: tl.load(&format!("gpt_neox.layers.{i}.input_layernorm.bias"))?,

c_attn_attn_w: tl.load(&format!(
"gpt_neox.layers.{i}.attention.query_key_value.weight"
))?,
c_attn_attn_b: tl.load(&format!(
"gpt_neox.layers.{i}.attention.query_key_value.bias"
))?,

c_attn_proj_w: tl.load(&format!("gpt_neox.layers.{i}.attention.dense.weight"))?,
c_attn_proj_b: tl.load(&format!("gpt_neox.layers.{i}.attention.dense.bias"))?,

ln_2_g: tl.load(&format!(
"gpt_neox.layers.{i}.post_attention_layernorm.weight"
))?,
ln_2_b: tl.load(&format!(
"gpt_neox.layers.{i}.post_attention_layernorm.bias"
))?,

c_mlp_fc_w: tl.load(&format!("gpt_neox.layers.{i}.mlp.dense_h_to_4h.weight"))?,
c_mlp_fc_b: tl.load(&format!("gpt_neox.layers.{i}.mlp.dense_h_to_4h.bias"))?,

c_mlp_proj_w: tl.load(&format!("gpt_neox.layers.{i}.mlp.dense_4h_to_h.weight"))?,
c_mlp_proj_b: tl.load(&format!("gpt_neox.layers.{i}.mlp.dense_4h_to_h.bias"))?,
ln_1_g: tl
.load(&format!("gpt_neox.layers.{i}.input_layernorm.weight"))?
.transfer_to(backend),
ln_1_b: tl
.load(&format!("gpt_neox.layers.{i}.input_layernorm.bias"))?
.transfer_to(backend),

c_attn_attn_w: tl
.load(&format!(
"gpt_neox.layers.{i}.attention.query_key_value.weight"
))?
.transfer_to(backend),
c_attn_attn_b: tl
.load(&format!(
"gpt_neox.layers.{i}.attention.query_key_value.bias"
))?
.transfer_to(backend),

c_attn_proj_w: tl
.load(&format!("gpt_neox.layers.{i}.attention.dense.weight"))?
.transfer_to(backend),
c_attn_proj_b: tl
.load(&format!("gpt_neox.layers.{i}.attention.dense.bias"))?
.transfer_to(backend),

ln_2_g: tl
.load(&format!(
"gpt_neox.layers.{i}.post_attention_layernorm.weight"
))?
.transfer_to(backend),
ln_2_b: tl
.load(&format!(
"gpt_neox.layers.{i}.post_attention_layernorm.bias"
))?
.transfer_to(backend),

c_mlp_fc_w: tl
.load(&format!("gpt_neox.layers.{i}.mlp.dense_h_to_4h.weight"))?
.transfer_to(backend),
c_mlp_fc_b: tl
.load(&format!("gpt_neox.layers.{i}.mlp.dense_h_to_4h.bias"))?
.transfer_to(backend),

c_mlp_proj_w: tl
.load(&format!("gpt_neox.layers.{i}.mlp.dense_4h_to_h.weight"))?
.transfer_to(backend),
c_mlp_proj_b: tl
.load(&format!("gpt_neox.layers.{i}.mlp.dense_4h_to_h.bias"))?
.transfer_to(backend),
};

layers.push(layer);
@@ -142,17 +174,18 @@
} = self.hyperparameters;

let outputs = session.compute(self.context.clone(), input_tokens, |builder| {
let ctx0 = builder.ctx0.borrow();
let mut ctx0 = builder.ctx0.borrow_mut();
let embd = builder.embd;
let mut input_layer = ctx0.op_get_rows(&self.wte, embd);
let (memory_k_size, memory_v_size) = (
builder.memory_k.element_size(),
builder.memory_v.element_size(),
);

let mut gf = ggml::ComputationGraph::new();
let mut gf = ctx0.create_compute_graph();

for il in 0..n_layer {
ctx0.set_offloading(self.params.should_offload(il));
// attention uses first scratch buffer
ctx0.use_scratch(builder.get_scratch(0));

Expand Down Expand Up @@ -305,7 +338,7 @@ impl KnownModel for GptNeoX {

// Disable the scratchbuffer
ctx0.use_scratch(None);

ctx0.set_offloading(false);
// apply language model head
input_layer = ctx0.op_mul_mat(&self.lmh_g, &input_layer);

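Not part of the commit's main fix (the GPU-offload changes ride along in this file): a sketch of the load-time pattern above, where each layer's weights are moved to that layer's backend right after loading. The closure stands in for the model's tensor loader and error handling is omitted; only `transfer_to` and the per-layer `params.backend(i)` call appear in the diff.

use ggml::accelerator::Backend;
use ggml::Tensor;

// `load` stands in for `tl.load(...)` in the real code.
fn load_layer_weight(mut load: impl FnMut(&str) -> Tensor, backend: Backend, layer: usize) -> Tensor {
    load(&format!("gpt_neox.layers.{layer}.attention.dense.weight")).transfer_to(backend)
}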
2 changes: 1 addition & 1 deletion crates/models/llama/src/lib.rs
@@ -144,7 +144,7 @@ impl KnownModel for Llama {

let mut input_layer = ctx0.op_get_rows(&self.wte, embd);

let mut gf = ggml::ComputationGraph::new();
let mut gf = ctx0.create_compute_graph();

for il in 0..n_layer {
ctx0.set_offloading(self.params.should_offload(il));
2 changes: 1 addition & 1 deletion crates/models/mpt/src/lib.rs
@@ -123,7 +123,7 @@ impl KnownModel for Mpt {

let f32_size = std::mem::size_of::<f32>();

let mut gf = ggml::ComputationGraph::new();
let mut gf = ctx0.create_compute_graph();
for il in 0..n_layer {
// attention uses first scratch buffer
ctx0.use_scratch(builder.get_scratch(0));
