Fixed stack-overflow in debug mode
LLukas22 committed Aug 5, 2023
1 parent bdd9c7d commit 605e0f5
Showing 11 changed files with 103 additions and 60 deletions.
18 changes: 17 additions & 1 deletion crates/ggml/src/context.rs
@@ -9,7 +9,8 @@ use std::{
use memmap2::Mmap;

use crate::{
accelerator::Backend, sys, usize_to_i32, usize_to_i64, Buffer, RoPEOverrides, Tensor, Type,
accelerator::Backend, sys, usize_to_i32, usize_to_i64, Buffer, ComputationGraph, RoPEOverrides,
Tensor, Type,
};

/// Acts as a RAII-guard over a `sys::ggml_context`, allocating via
@@ -171,6 +172,21 @@ impl Context {
*self = Self::new(self.storage.take().unwrap());
}

/// Create a new [ComputationGraph] in this context.
pub fn create_compute_graph(&self) -> ComputationGraph {
let context = self.inner.to_owned().ptr.as_ptr();
unsafe {
let graph = sys::ggml_new_graph(context);
ComputationGraph::from_raw(graph)
}
}

/// Prints all ggml objects in this context. Mainly used for debugging.
pub fn list_ggml_objects(&self) {
let context = self.inner.to_owned().ptr.as_ptr();
unsafe { sys::ggml_print_objects(context) }
}

/// If offloading is enabled, all tensors created by this context will be offloaded to the GPU
pub fn set_offloading(&mut self, can_offload: bool) {
self.can_offload = can_offload;
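Not part of the commit: a minimal usage sketch of the two methods added above, together with the existing graph-building call from `crates/ggml/src/lib.rs`. It assumes a `Context` and an output `Tensor` already built the usual way; the point is that the graph now lives inside the ggml context instead of being zeroed on the Rust stack.

use ggml::{ComputationGraph, Context, Tensor};

fn build_graph(ctx: &Context, output: &Tensor) -> ComputationGraph {
    // ggml_new_graph places the cgraph inside the context's memory pool;
    // the Rust side only keeps a raw pointer to it.
    let mut graph = ctx.create_compute_graph();
    graph.build_forward_expand(output);
    // Debug helper added in this commit: dump every object the context owns.
    ctx.list_ggml_objects();
    graph
}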
34 changes: 14 additions & 20 deletions crates/ggml/src/lib.rs
@@ -21,6 +21,7 @@ pub mod util;
pub mod accelerator;

pub use context::{Context, ContextStorage};

pub use tensor::Tensor;

pub use ggml_sys as sys;
@@ -319,44 +320,32 @@ impl Drop for Buffer {

/// A `ggml` computation graph. Keeps track of all state during computation.
pub struct ComputationGraph {
inner: sys::ggml_cgraph,
inner: *mut sys::ggml_cgraph,
}

impl ComputationGraph {
/// Create a new [ComputationGraph] with the specified `n_threads`.
pub fn new() -> Self {
Self {
inner: sys::ggml_cgraph {
// SAFETY: This should be safe to zero. The original C++ impl
// just leaves it uninitialized
..unsafe { std::mem::zeroed::<sys::ggml_cgraph>() }
},
}
/// Create a new [ComputationGraph] from a raw [sys::ggml_cgraph].
pub fn from_raw(raw_context: *mut sys::ggml_cgraph) -> Self {
Self { inner: raw_context }
}

/// Build this computational graph in the forward direction in preparation for computation.
pub fn build_forward_expand(&mut self, tensor: &Tensor) {
unsafe { sys::ggml_build_forward_expand(&mut self.inner, tensor.ptr.as_ptr()) }
}
}

impl Default for ComputationGraph {
fn default() -> Self {
Self::new()
unsafe { sys::ggml_build_forward_expand(self.inner, tensor.ptr.as_ptr()) }
}
}

/// A `ggml` execution plan. Contains the information needed to execute a computation graph.
pub struct GraphExecutionPlan {
inner: sys::ggml_cplan,
inner_graph: sys::ggml_cgraph,
inner_graph: *mut sys::ggml_cgraph,
}

impl GraphExecutionPlan {
/// Create a new [GraphExecutionPlan] from a [ComputationGraph] and the number of threads to use.
pub fn new(graph: &mut ComputationGraph, n_threads: usize) -> Self {
Self {
inner: unsafe { sys::ggml_graph_plan(&mut graph.inner, usize_to_i32(n_threads)) },
inner: unsafe { sys::ggml_graph_plan(graph.inner, usize_to_i32(n_threads)) },
inner_graph: graph.inner,
}
}
@@ -383,7 +372,7 @@ impl GraphExecutionPlan {
self.assign_work_buffer(&mut work_buffer);

unsafe {
sys::ggml_graph_compute(&mut self.inner_graph, &mut self.inner);
sys::ggml_graph_compute(self.inner_graph, &mut self.inner);
}
}
}
@@ -502,3 +491,8 @@ pub fn cpu_has_blas() -> bool {
pub fn cpu_has_gpublas() -> bool {
unsafe { sys::ggml_cpu_has_gpublas() != 0 }
}

/// Returns the graph overhead in bytes.
pub fn graph_overhead() -> usize {
unsafe { sys::ggml_graph_overhead() }
}
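Not part of the commit: a toy illustration of the failure mode this file's change addresses. The removed `ComputationGraph::new` zeroed a full `sys::ggml_cgraph` on the Rust stack; that struct embeds large fixed-size node arrays, debug builds do not optimise the temporary away, and a deep call chain can then exhaust the default stack. The new code stores only a `*mut sys::ggml_cgraph` pointing into the context. The struct below and its sizes are stand-ins, not ggml's real layout.

// stand-in for ggml's fixed-size nodes/grads/leafs pointer arrays
struct BigGraph {
    nodes: [u64; 16 * 1024],
    grads: [u64; 16 * 1024],
    leafs: [u64; 16 * 1024],
}

// Old shape: the whole zeroed value is a stack temporary in debug builds.
fn graph_by_value() -> BigGraph {
    unsafe { std::mem::zeroed() }
}

// New shape: the caller only handles a pointer; the storage lives elsewhere
// (in ggml's case, inside the context's own memory pool).
fn graph_behind_pointer(storage: &mut BigGraph) -> *mut BigGraph {
    storage as *mut BigGraph
}

fn main() {
    println!("by-value graph: {} bytes on the stack", std::mem::size_of::<BigGraph>());
    let mut storage: Box<BigGraph> = Box::new(unsafe { std::mem::zeroed() });
    let _ptr = graph_behind_pointer(&mut storage);
    let _g = graph_by_value();
}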
2 changes: 1 addition & 1 deletion crates/llm-base/src/inference_session.rs
@@ -170,7 +170,7 @@ impl InferenceSession {
} else {
1024
};
buf_size_mb * 1024 * 1024
buf_size_mb * 1024 * 1024 + ggml::graph_overhead()
};

let eval = Buffer::new(buf_size);
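Not part of the commit: the sizing above spelled out as a small function. The 1024 MiB figure mirrors the fallback branch visible in the hunk and is an assumption about the surrounding code; the extra term reserves room for the graph object, which is now allocated out of the same eval context.

fn eval_buffer_size() -> usize {
    // assumed fallback from the branch above; the real code derives the size
    // from the model when possible
    let buf_size_mb: usize = 1024;
    // activation payload plus room for the cgraph object itself
    buf_size_mb * 1024 * 1024 + ggml::graph_overhead()
}

The result then feeds into `Buffer::new(buf_size)`, exactly as the unchanged context line shows.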
2 changes: 1 addition & 1 deletion crates/llm-base/src/lora.rs
@@ -114,7 +114,7 @@ impl LoraAdapter {

//Build a ggml context and apply the patch

let mut gf = ggml::ComputationGraph::new();
let mut gf = patch_context.create_compute_graph();

// LoRA formula: w = w + ba*s
let mut ba = patch_context.op_mul_mat(&a, &b);
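Not part of the commit: a sketch of how the patch graph might be assembled around the changed line, with illustrative names for the adapter tensors. Only `create_compute_graph`, `op_mul_mat` and `build_forward_expand` are confirmed by this diff; the scaling by `s` and the final add of the LoRA formula `w = w + ba*s` are left as comments.

use ggml::{ComputationGraph, Context, Tensor};

fn build_patch_graph(patch_context: &Context, a: &Tensor, b: &Tensor) -> ComputationGraph {
    // the graph lives in the patch context, not on the stack
    let mut gf = patch_context.create_compute_graph();
    // LoRA delta: ba = b x a  (scaling by `s` and adding to `w` omitted here)
    let ba = patch_context.op_mul_mat(a, b);
    gf.build_forward_expand(&ba);
    gf
}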
2 changes: 1 addition & 1 deletion crates/models/bloom/src/lib.rs
@@ -148,7 +148,7 @@ impl KnownModel for Bloom {
input_layer = ctx0.op_mul(&input_layer, &self.norm);
input_layer = ctx0.op_add(&input_layer, &self.norm_bias);

let mut gf = ggml::ComputationGraph::new();
let mut gf = ctx0.create_compute_graph();
for il in 0..n_layer {
let input_self_attention = input_layer.share();
let mut current: ggml::Tensor;
2 changes: 1 addition & 1 deletion crates/models/falcon/src/lib.rs
@@ -159,7 +159,7 @@ impl KnownModel for Falcon {
let memory_v = builder.memory_v;
let memory_v_size = memory_v.element_size();

let mut gf = ggml::ComputationGraph::new();
let mut gf = ctx0.create_compute_graph();

let mut current: Tensor;
let mut layernorm_output: Tensor;
2 changes: 1 addition & 1 deletion crates/models/gpt2/src/lib.rs
@@ -147,7 +147,7 @@ impl KnownModel for Gpt2 {
&ctx0.op_get_rows(&self.wpe, &position),
);

let mut gf = ggml::ComputationGraph::new();
let mut gf = ctx0.create_compute_graph();
for il in 0..n_layer {
ctx0.use_scratch(builder.get_scratch(0));

2 changes: 1 addition & 1 deletion crates/models/gptj/src/lib.rs
@@ -160,7 +160,7 @@ impl KnownModel for GptJ {

let mut input_layer = ctx0.op_get_rows(&self.wte, embd);

let mut gf = ggml::ComputationGraph::new();
let mut gf = ctx0.create_compute_graph();
for il in 0..n_layer {
ctx0.set_offloading(self.params.should_offload(il));

95 changes: 64 additions & 31 deletions crates/models/gptneox/src/lib.rs
@@ -57,38 +57,70 @@ impl KnownModel for GptNeoX {

// model-global weights
let wte = tl.load("gpt_neox.embed_in.weight")?;
let ln_f_g = tl.load("gpt_neox.final_layer_norm.weight")?;
let ln_f_b = tl.load("gpt_neox.final_layer_norm.bias")?;
let lmh_g = tl.load("embed_out.weight")?;

let backend = params.backend(0);

let ln_f_g = tl
.load("gpt_neox.final_layer_norm.weight")?
.transfer_to(backend);
let ln_f_b = tl
.load("gpt_neox.final_layer_norm.bias")?
.transfer_to(backend);
let lmh_g = tl.load("embed_out.weight")?.transfer_to(backend);

let mut layers = Vec::new();
for i in 0..hyperparameters.n_layer {
let backend = params.backend(i);
let layer = Layer {
ln_1_g: tl.load(&format!("gpt_neox.layers.{i}.input_layernorm.weight"))?,
ln_1_b: tl.load(&format!("gpt_neox.layers.{i}.input_layernorm.bias"))?,

c_attn_attn_w: tl.load(&format!(
"gpt_neox.layers.{i}.attention.query_key_value.weight"
))?,
c_attn_attn_b: tl.load(&format!(
"gpt_neox.layers.{i}.attention.query_key_value.bias"
))?,

c_attn_proj_w: tl.load(&format!("gpt_neox.layers.{i}.attention.dense.weight"))?,
c_attn_proj_b: tl.load(&format!("gpt_neox.layers.{i}.attention.dense.bias"))?,

ln_2_g: tl.load(&format!(
"gpt_neox.layers.{i}.post_attention_layernorm.weight"
))?,
ln_2_b: tl.load(&format!(
"gpt_neox.layers.{i}.post_attention_layernorm.bias"
))?,

c_mlp_fc_w: tl.load(&format!("gpt_neox.layers.{i}.mlp.dense_h_to_4h.weight"))?,
c_mlp_fc_b: tl.load(&format!("gpt_neox.layers.{i}.mlp.dense_h_to_4h.bias"))?,

c_mlp_proj_w: tl.load(&format!("gpt_neox.layers.{i}.mlp.dense_4h_to_h.weight"))?,
c_mlp_proj_b: tl.load(&format!("gpt_neox.layers.{i}.mlp.dense_4h_to_h.bias"))?,
ln_1_g: tl
.load(&format!("gpt_neox.layers.{i}.input_layernorm.weight"))?
.transfer_to(backend),
ln_1_b: tl
.load(&format!("gpt_neox.layers.{i}.input_layernorm.bias"))?
.transfer_to(backend),

c_attn_attn_w: tl
.load(&format!(
"gpt_neox.layers.{i}.attention.query_key_value.weight"
))?
.transfer_to(backend),
c_attn_attn_b: tl
.load(&format!(
"gpt_neox.layers.{i}.attention.query_key_value.bias"
))?
.transfer_to(backend),

c_attn_proj_w: tl
.load(&format!("gpt_neox.layers.{i}.attention.dense.weight"))?
.transfer_to(backend),
c_attn_proj_b: tl
.load(&format!("gpt_neox.layers.{i}.attention.dense.bias"))?
.transfer_to(backend),

ln_2_g: tl
.load(&format!(
"gpt_neox.layers.{i}.post_attention_layernorm.weight"
))?
.transfer_to(backend),
ln_2_b: tl
.load(&format!(
"gpt_neox.layers.{i}.post_attention_layernorm.bias"
))?
.transfer_to(backend),

c_mlp_fc_w: tl
.load(&format!("gpt_neox.layers.{i}.mlp.dense_h_to_4h.weight"))?
.transfer_to(backend),
c_mlp_fc_b: tl
.load(&format!("gpt_neox.layers.{i}.mlp.dense_h_to_4h.bias"))?
.transfer_to(backend),

c_mlp_proj_w: tl
.load(&format!("gpt_neox.layers.{i}.mlp.dense_4h_to_h.weight"))?
.transfer_to(backend),
c_mlp_proj_b: tl
.load(&format!("gpt_neox.layers.{i}.mlp.dense_4h_to_h.bias"))?
.transfer_to(backend),
};

layers.push(layer);
@@ -142,17 +174,18 @@
} = self.hyperparameters;

let outputs = session.compute(self.context.clone(), input_tokens, |builder| {
let ctx0 = builder.ctx0.borrow();
let mut ctx0 = builder.ctx0.borrow_mut();
let embd = builder.embd;
let mut input_layer = ctx0.op_get_rows(&self.wte, embd);
let (memory_k_size, memory_v_size) = (
builder.memory_k.element_size(),
builder.memory_v.element_size(),
);

let mut gf = ggml::ComputationGraph::new();
let mut gf = ctx0.create_compute_graph();

for il in 0..n_layer {
ctx0.set_offloading(self.params.should_offload(il));
// attention uses first scratch buffer
ctx0.use_scratch(builder.get_scratch(0));

Expand Down Expand Up @@ -305,7 +338,7 @@ impl KnownModel for GptNeoX {

// Disable the scratchbuffer
ctx0.use_scratch(None);

ctx0.set_offloading(false);
// apply language model head
input_layer = ctx0.op_mul_mat(&self.lmh_g, &input_layer);

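Not part of the commit's main fix (the GPU-offload changes ride along in this file): a sketch of the load-time pattern above, where each layer's weights are moved to that layer's backend right after loading. The closure stands in for the model's tensor loader and error handling is omitted; only `transfer_to` and the per-layer `params.backend(i)` call appear in the diff.

use ggml::accelerator::Backend;
use ggml::Tensor;

// `load` stands in for `tl.load(...)` in the real code.
fn load_layer_weight(mut load: impl FnMut(&str) -> Tensor, backend: Backend, layer: usize) -> Tensor {
    load(&format!("gpt_neox.layers.{layer}.attention.dense.weight")).transfer_to(backend)
}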
2 changes: 1 addition & 1 deletion crates/models/llama/src/lib.rs
@@ -144,7 +144,7 @@ impl KnownModel for Llama {

let mut input_layer = ctx0.op_get_rows(&self.wte, embd);

let mut gf = ggml::ComputationGraph::new();
let mut gf = ctx0.create_compute_graph();

for il in 0..n_layer {
ctx0.set_offloading(self.params.should_offload(il));
2 changes: 1 addition & 1 deletion crates/models/mpt/src/lib.rs
@@ -123,7 +123,7 @@ impl KnownModel for Mpt {

let f32_size = std::mem::size_of::<f32>();

let mut gf = ggml::ComputationGraph::new();
let mut gf = ctx0.create_compute_graph();
for il in 0..n_layer {
// attention uses first scratch buffer
ctx0.use_scratch(builder.get_scratch(0));
