Skip to content
This repository was archived by the owner on Jun 24, 2024. It is now read-only.
2 changes: 1 addition & 1 deletion crates/ggml/src/accelerator/metal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ impl MetalContext {
unsafe {
metal::ggml_metal_graph_compute(
self.ptr.as_ptr(),
&mut graph.inner as *mut ggml_sys::ggml_cgraph as *mut metal::ggml_cgraph,
graph.inner as *mut ggml_sys::ggml_cgraph as *mut metal::ggml_cgraph,
);
}
}
Expand Down
24 changes: 20 additions & 4 deletions crates/ggml/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ use std::{
use memmap2::Mmap;

use crate::{
accelerator::Backend, sys, usize_to_i32, usize_to_i64, Buffer, RoPEOverrides, Tensor, Type,
accelerator::Backend, sys, usize_to_i32, usize_to_i64, Buffer, ComputationGraph, RoPEOverrides,
Tensor, Type,
};

/// Acts as a RAII-guard over a `sys::ggml_context`, allocating via
Expand Down Expand Up @@ -171,6 +172,21 @@ impl Context {
*self = Self::new(self.storage.take().unwrap());
}

/// Create a new [ComputationGraph] in this context.
///
/// The graph is allocated by `sys::ggml_new_graph` inside the underlying
/// ggml context, so it is only valid for as long as this context lives —
/// TODO(review): confirm the returned [ComputationGraph] is never used
/// after the context is dropped.
pub fn create_compute_graph(&self) -> ComputationGraph {
// NOTE(review): `to_owned()` clones the inner handle just to read its
// pointer — presumably a cheap ref-count bump, but worth confirming.
let context = self.inner.to_owned().ptr.as_ptr();
unsafe {
let graph = sys::ggml_new_graph(context);
ComputationGraph::from_raw(graph)
}
}

/// Prints all ggml objects in this context. Mainly used for debugging.
pub fn list_ggml_objects(&self) {
// Ask ggml to dump every object tracked by the underlying raw context.
unsafe { sys::ggml_print_objects(self.inner.to_owned().ptr.as_ptr()) }
}

/// If offloading is enabled, all tensors created by this context will be offloaded to the GPU
pub fn set_offloading(&mut self, can_offload: bool) {
self.can_offload = can_offload;
Expand Down Expand Up @@ -274,13 +290,13 @@ impl Context {
self.new_tensor_raw(tensor)
}

/// Creates a new tensor with the element-wise multiplication of `a` and `b`.
///
/// Supports broadcasting if the dimensions are compatible, meaning the
/// first dimensions of `a` must be divisible by the first dimensions of
/// `b`.
pub fn op_mul(&self, a: &Tensor, b: &Tensor) -> Tensor {
let tensor = unsafe { sys::ggml_mul(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) };
self.new_tensor_raw(tensor)
}

/// Unknown.
/// Repeats the `a` tensor along the first dimension of the `b` tensor.
pub fn op_repeat(&self, a: &Tensor, b: &Tensor) -> Tensor {
let tensor = unsafe { sys::ggml_repeat(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) };
self.new_tensor_raw(tensor)
Expand All @@ -298,7 +314,7 @@ impl Context {
self.new_tensor_raw(tensor)
}

/// Creates a new tensor with the addition of `a` and `b`.
/// Creates a new tensor with the addition of `a` and `b`. Supports broadcasting if the dimensions are compatible, meaning the first dimensions of `a` must be divisible by the first dimensions of `b`.
pub fn op_add(&self, a: &Tensor, b: &Tensor) -> Tensor {
let tensor = unsafe { sys::ggml_add(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) };
self.new_tensor_raw(tensor)
Expand Down
34 changes: 14 additions & 20 deletions crates/ggml/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ pub mod util;
pub mod accelerator;

pub use context::{Context, ContextStorage};

pub use tensor::Tensor;

pub use ggml_sys as sys;
Expand Down Expand Up @@ -319,44 +320,32 @@ impl Drop for Buffer {

/// A `ggml` computation graph. Keeps track of all state during computation.
pub struct ComputationGraph {
inner: sys::ggml_cgraph,
inner: *mut sys::ggml_cgraph,
}

impl ComputationGraph {
/// Create a new [ComputationGraph] with the specified `n_threads`.
pub fn new() -> Self {
Self {
inner: sys::ggml_cgraph {
// SAFETY: This should be safe to zero. The original C++ impl
// just leaves it uninitialized
..unsafe { std::mem::zeroed::<sys::ggml_cgraph>() }
},
}
/// Create a new [ComputationGraph] from a raw [sys::ggml_cgraph].
pub fn from_raw(raw_context: *mut sys::ggml_cgraph) -> Self {
Self { inner: raw_context }
}

/// Build this computational graph in the forward direction in preparation for computation.
pub fn build_forward_expand(&mut self, tensor: &Tensor) {
unsafe { sys::ggml_build_forward_expand(&mut self.inner, tensor.ptr.as_ptr()) }
}
}

impl Default for ComputationGraph {
fn default() -> Self {
Self::new()
unsafe { sys::ggml_build_forward_expand(self.inner, tensor.ptr.as_ptr()) }
}
}

/// A `ggml` execution plan. Contains the information needed to execute a computation graph.
pub struct GraphExecutionPlan {
inner: sys::ggml_cplan,
inner_graph: sys::ggml_cgraph,
inner_graph: *mut sys::ggml_cgraph,
}

impl GraphExecutionPlan {
/// Create a new [GraphExecutionPlan] from a [ComputationGraph] and the number of threads to use.
pub fn new(graph: &mut ComputationGraph, n_threads: usize) -> Self {
Self {
inner: unsafe { sys::ggml_graph_plan(&mut graph.inner, usize_to_i32(n_threads)) },
inner: unsafe { sys::ggml_graph_plan(graph.inner, usize_to_i32(n_threads)) },
inner_graph: graph.inner,
}
}
Expand All @@ -383,7 +372,7 @@ impl GraphExecutionPlan {
self.assign_work_buffer(&mut work_buffer);

unsafe {
sys::ggml_graph_compute(&mut self.inner_graph, &mut self.inner);
sys::ggml_graph_compute(self.inner_graph, &mut self.inner);
}
}
}
Expand Down Expand Up @@ -502,3 +491,8 @@ pub fn cpu_has_blas() -> bool {
pub fn cpu_has_gpublas() -> bool {
unsafe { sys::ggml_cpu_has_gpublas() != 0 }
}

/// Returns the graph overhead in bytes.
///
/// Thin wrapper over `ggml_graph_overhead`; useful when sizing a scratch
/// buffer that must also hold a computation graph.
pub fn graph_overhead() -> usize {
// SAFETY: zero-argument FFI query; presumably has no preconditions —
// TODO(review): confirm it is safe to call before any context exists.
unsafe { sys::ggml_graph_overhead() }
}
2 changes: 1 addition & 1 deletion crates/ggml/sys/llama-cpp
3 changes: 3 additions & 0 deletions crates/ggml/sys/src/cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ extern "C" {
extern "C" {
/// Selects the main CUDA device by index — presumably mirrors the
/// upstream `ggml-cuda` API; confirm against the bundled llama.cpp.
pub fn ggml_cuda_set_main_device(main_device: ::std::os::raw::c_int);
}
extern "C" {
/// Toggles the quantized mat-mul kernels (`mul_mat_q`) in the CUDA
/// backend — NOTE(review): semantics taken from the symbol name; verify.
pub fn ggml_cuda_set_mul_mat_q(mul_mat_q: bool);
}
extern "C" {
/// Sets the CUDA scratch buffer size in bytes — TODO(review): confirm
/// units against the upstream header.
pub fn ggml_cuda_set_scratch_size(scratch_size: usize);
}
Expand Down
12 changes: 12 additions & 0 deletions crates/ggml/sys/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1567,6 +1567,18 @@ extern "C" {
n_ctx: ::std::os::raw::c_int,
) -> *mut ggml_tensor;
}
extern "C" {
/// RoPE with custom frequency parameters (`freq_base`, `freq_scale`) —
/// bindgen-style declaration; semantics presumed from the upstream
/// `ggml.h` signature, verify against the bundled llama.cpp revision.
pub fn ggml_rope_custom(
ctx: *mut ggml_context,
a: *mut ggml_tensor,
n_past: ::std::os::raw::c_int,
n_dims: ::std::os::raw::c_int,
mode: ::std::os::raw::c_int,
n_ctx: ::std::os::raw::c_int,
freq_base: f32,
freq_scale: f32,
) -> *mut ggml_tensor;
}
extern "C" {
pub fn ggml_rope_custom_inplace(
ctx: *mut ggml_context,
Expand Down
2 changes: 1 addition & 1 deletion crates/llm-base/src/inference_session.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ impl InferenceSession {
} else {
1024
};
buf_size_mb * 1024 * 1024
buf_size_mb * 1024 * 1024 + ggml::graph_overhead()
};

let eval = Buffer::new(buf_size);
Expand Down
2 changes: 1 addition & 1 deletion crates/llm-base/src/lora.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ impl LoraAdapter {

//Build a ggml context and apply the patch

let mut gf = ggml::ComputationGraph::new();
let mut gf = patch_context.create_compute_graph();

// LoRA formula: w = w + ba*s
let mut ba = patch_context.op_mul_mat(&a, &b);
Expand Down
47 changes: 13 additions & 34 deletions crates/models/bloom/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,10 @@ impl KnownModel for Bloom {

// normalize embeddings
input_layer = ctx0.op_norm(&input_layer);
input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer);
input_layer = ctx0.op_add(&ctx0.op_repeat(&self.norm_bias, &input_layer), &input_layer);
input_layer = ctx0.op_mul(&input_layer, &self.norm);
input_layer = ctx0.op_add(&input_layer, &self.norm_bias);

let mut gf = ggml::ComputationGraph::new();
let mut gf = ctx0.create_compute_graph();
for il in 0..n_layer {
let input_self_attention = input_layer.share();
let mut current: ggml::Tensor;
Expand All @@ -157,21 +157,12 @@ impl KnownModel for Bloom {
current = ctx0.op_norm(&input_layer);

// cur = attention_norm * cur
current = ctx0.op_mul(
&ctx0.op_repeat(&self.layers[il].attention_norm, &current),
&current,
);
current = ctx0.op_add(
&ctx0.op_repeat(&self.layers[il].attention_norm_b, &current),
&current,
);
current = ctx0.op_mul(&current, &self.layers[il].attention_norm);
current = ctx0.op_add(&current, &self.layers[il].attention_norm_b);

//attention
current = ctx0.op_mul_mat(&self.layers[il].query_key_value, &current);
current = ctx0.op_add(
&ctx0.op_repeat(&self.layers[il].query_key_value_b, &current),
&current,
);
current = ctx0.op_add(&current, &self.layers[il].query_key_value_b);

// self-attention
let nb = current.get_nb()[1];
Expand Down Expand Up @@ -293,7 +284,7 @@ impl KnownModel for Bloom {

// projection
current = ctx0.op_mul_mat(&self.layers[il].wo, &current);
current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].wo_b, &current), &current);
current = ctx0.op_add(&current, &self.layers[il].wo_b);

let input_feed_forward = ctx0.op_add(&current, &input_self_attention);

Expand All @@ -302,27 +293,21 @@ impl KnownModel for Bloom {
current = ctx0.op_norm(&input_feed_forward);

// cur = ffn_norm*cur + ffn_norm_b
current = ctx0.op_mul(
&ctx0.op_repeat(&self.layers[il].ffn_norm, &current),
&current,
);
current = ctx0.op_mul(&current, &self.layers[il].ffn_norm);

current = ctx0.op_add(
&ctx0.op_repeat(&self.layers[il].ffn_norm_b, &current),
&current,
);
current = ctx0.op_add(&current, &self.layers[il].ffn_norm_b);

current = ctx0.op_mul_mat(&self.layers[il].w1, &current);

current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].w1_b, &current), &current);
current = ctx0.op_add(&current, &self.layers[il].w1_b);

// SILU activation

current = ctx0.op_gelu(&current);

current = ctx0.op_mul_mat(&self.layers[il].w2, &current);

current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].w2_b, &current), &current);
current = ctx0.op_add(&current, &self.layers[il].w2_b);

current = ctx0.op_add(&current, &input_feed_forward);

Expand All @@ -334,15 +319,9 @@ impl KnownModel for Bloom {
input_layer = ctx0.op_norm(&input_layer);

// inpL = norm*inpL
input_layer = ctx0.op_mul(
&ctx0.op_repeat(&self.output_norm, &input_layer),
&input_layer,
);
input_layer = ctx0.op_mul(&input_layer, &self.output_norm);

input_layer = ctx0.op_add(
&ctx0.op_repeat(&self.output_norm_bias, &input_layer),
&input_layer,
);
input_layer = ctx0.op_add(&input_layer, &self.output_norm_bias);

let embeddings_tensor: ggml::Tensor = input_layer.share();

Expand Down
Loading