Skip to content
This repository was archived by the owner on Jun 24, 2024. It is now read-only.
2 changes: 1 addition & 1 deletion crates/ggml/src/accelerator/metal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ impl MetalContext {
unsafe {
metal::ggml_metal_graph_compute(
self.ptr.as_ptr(),
&mut graph.inner as *mut ggml_sys::ggml_cgraph as *mut metal::ggml_cgraph,
graph.inner as *mut ggml_sys::ggml_cgraph as *mut metal::ggml_cgraph,
);
}
}
Expand Down
24 changes: 20 additions & 4 deletions crates/ggml/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ use std::{
use memmap2::Mmap;

use crate::{
accelerator::Backend, sys, usize_to_i32, usize_to_i64, Buffer, RoPEOverrides, Tensor, Type,
accelerator::Backend, sys, usize_to_i32, usize_to_i64, Buffer, ComputationGraph, RoPEOverrides,
Tensor, Type,
};

/// Acts as a RAII-guard over a `sys::ggml_context`, allocating via
Expand Down Expand Up @@ -171,6 +172,21 @@ impl Context {
*self = Self::new(self.storage.take().unwrap());
}

/// Create a new [ComputationGraph] in this context.
///
/// The graph is allocated by `sys::ggml_new_graph` inside the underlying
/// ggml context, so it is only valid for as long as this context lives —
/// TODO(review): confirm the returned [ComputationGraph] is never used
/// after the context is dropped.
pub fn create_compute_graph(&self) -> ComputationGraph {
// NOTE(review): `to_owned()` clones the inner handle just to read its
// pointer — presumably a cheap ref-count bump, but worth confirming.
let context = self.inner.to_owned().ptr.as_ptr();
unsafe {
let graph = sys::ggml_new_graph(context);
ComputationGraph::from_raw(graph)
}
}

/// Prints all ggml objects in this context. Mainly used for debugging.
pub fn list_ggml_objects(&self) {
// Ask ggml to dump every object tracked by the underlying raw context.
unsafe { sys::ggml_print_objects(self.inner.to_owned().ptr.as_ptr()) }
}

/// If offloading is enabled, all tensors created by this context will be offloaded to the GPU
pub fn set_offloading(&mut self, can_offload: bool) {
self.can_offload = can_offload;
Expand Down Expand Up @@ -274,13 +290,13 @@ impl Context {
self.new_tensor_raw(tensor)
}

/// Creates a new tensor with the element-wise multiplication of `a` and `b`.
///
/// Supports broadcasting if the dimensions are compatible, meaning the
/// first dimensions of `a` must be divisible by the first dimensions of
/// `b`.
pub fn op_mul(&self, a: &Tensor, b: &Tensor) -> Tensor {
let tensor = unsafe { sys::ggml_mul(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) };
self.new_tensor_raw(tensor)
}

/// Unknown.
/// Repeats the `a` tensor along the first dimension of the `b` tensor.
pub fn op_repeat(&self, a: &Tensor, b: &Tensor) -> Tensor {
let tensor = unsafe { sys::ggml_repeat(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) };
self.new_tensor_raw(tensor)
Expand All @@ -298,7 +314,7 @@ impl Context {
self.new_tensor_raw(tensor)
}

/// Creates a new tensor with the addition of `a` and `b`.
/// Creates a new tensor with the addition of `a` and `b`. Supports broadcasting if the dimensions are compatible, meaning the first dimensions of `a` must be divisible by the first dimensions of `b`.
pub fn op_add(&self, a: &Tensor, b: &Tensor) -> Tensor {
let tensor = unsafe { sys::ggml_add(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) };
self.new_tensor_raw(tensor)
Expand Down
34 changes: 14 additions & 20 deletions crates/ggml/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ pub mod util;
pub mod accelerator;

pub use context::{Context, ContextStorage};

pub use tensor::Tensor;

pub use ggml_sys as sys;
Expand Down Expand Up @@ -319,44 +320,32 @@ impl Drop for Buffer {

/// A `ggml` computation graph. Keeps track of all state during computation.
pub struct ComputationGraph {
inner: sys::ggml_cgraph,
inner: *mut sys::ggml_cgraph,
}

impl ComputationGraph {
/// Create a new [ComputationGraph] with the specified `n_threads`.
pub fn new() -> Self {
Self {
inner: sys::ggml_cgraph {
// SAFETY: This should be safe to zero. The original C++ impl
// just leaves it uninitialized
..unsafe { std::mem::zeroed::<sys::ggml_cgraph>() }
},
}
/// Create a new [ComputationGraph] from a raw [sys::ggml_cgraph].
pub fn from_raw(raw_context: *mut sys::ggml_cgraph) -> Self {
Self { inner: raw_context }
}

/// Build this computational graph in the forward direction in preparation for computation.
pub fn build_forward_expand(&mut self, tensor: &Tensor) {
unsafe { sys::ggml_build_forward_expand(&mut self.inner, tensor.ptr.as_ptr()) }
}
}

impl Default for ComputationGraph {
fn default() -> Self {
Self::new()
unsafe { sys::ggml_build_forward_expand(self.inner, tensor.ptr.as_ptr()) }
}
}

/// A `ggml` execution plan. Contains the information needed to execute a computation graph.
pub struct GraphExecutionPlan {
inner: sys::ggml_cplan,
inner_graph: sys::ggml_cgraph,
inner_graph: *mut sys::ggml_cgraph,
}

impl GraphExecutionPlan {
/// Create a new [GraphExecutionPlan] from a [ComputationGraph] and the number of threads to use.
pub fn new(graph: &mut ComputationGraph, n_threads: usize) -> Self {
Self {
inner: unsafe { sys::ggml_graph_plan(&mut graph.inner, usize_to_i32(n_threads)) },
inner: unsafe { sys::ggml_graph_plan(graph.inner, usize_to_i32(n_threads)) },
inner_graph: graph.inner,
}
}
Expand All @@ -383,7 +372,7 @@ impl GraphExecutionPlan {
self.assign_work_buffer(&mut work_buffer);

unsafe {
sys::ggml_graph_compute(&mut self.inner_graph, &mut self.inner);
sys::ggml_graph_compute(self.inner_graph, &mut self.inner);
}
}
}
Expand Down Expand Up @@ -502,3 +491,8 @@ pub fn cpu_has_blas() -> bool {
pub fn cpu_has_gpublas() -> bool {
unsafe { sys::ggml_cpu_has_gpublas() != 0 }
}

/// Returns the graph overhead in bytes.
///
/// Thin wrapper over `ggml_graph_overhead`; useful when sizing a scratch
/// buffer that must also hold a computation graph.
pub fn graph_overhead() -> usize {
// SAFETY: zero-argument FFI query; presumably has no preconditions —
// TODO(review): confirm it is safe to call before any context exists.
unsafe { sys::ggml_graph_overhead() }
}
2 changes: 1 addition & 1 deletion crates/ggml/sys/llama-cpp
3 changes: 3 additions & 0 deletions crates/ggml/sys/src/cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ extern "C" {
extern "C" {
/// Selects the main CUDA device by index — presumably mirrors the
/// upstream `ggml-cuda` API; confirm against the bundled llama.cpp.
pub fn ggml_cuda_set_main_device(main_device: ::std::os::raw::c_int);
}
extern "C" {
/// Toggles the quantized mat-mul kernels (`mul_mat_q`) in the CUDA
/// backend — NOTE(review): semantics taken from the symbol name; verify.
pub fn ggml_cuda_set_mul_mat_q(mul_mat_q: bool);
}
extern "C" {
/// Sets the CUDA scratch buffer size in bytes — TODO(review): confirm
/// units against the upstream header.
pub fn ggml_cuda_set_scratch_size(scratch_size: usize);
}
Expand Down
12 changes: 12 additions & 0 deletions crates/ggml/sys/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1567,6 +1567,18 @@ extern "C" {
n_ctx: ::std::os::raw::c_int,
) -> *mut ggml_tensor;
}
extern "C" {
/// RoPE with custom frequency parameters (`freq_base`, `freq_scale`) —
/// bindgen-style declaration; semantics presumed from the upstream
/// `ggml.h` signature, verify against the bundled llama.cpp revision.
pub fn ggml_rope_custom(
ctx: *mut ggml_context,
a: *mut ggml_tensor,
n_past: ::std::os::raw::c_int,
n_dims: ::std::os::raw::c_int,
mode: ::std::os::raw::c_int,
n_ctx: ::std::os::raw::c_int,
freq_base: f32,
freq_scale: f32,
) -> *mut ggml_tensor;
}
extern "C" {
pub fn ggml_rope_custom_inplace(
ctx: *mut ggml_context,
Expand Down
2 changes: 1 addition & 1 deletion crates/llm-base/src/inference_session.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ impl InferenceSession {
} else {
1024
};
buf_size_mb * 1024 * 1024
buf_size_mb * 1024 * 1024 + ggml::graph_overhead()
};

let eval = Buffer::new(buf_size);
Expand Down
2 changes: 1 addition & 1 deletion crates/llm-base/src/lora.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ impl LoraAdapter {

//Build a ggml context and apply the patch

let mut gf = ggml::ComputationGraph::new();
let mut gf = patch_context.create_compute_graph();

// LoRA formula: w = w + ba*s
let mut ba = patch_context.op_mul_mat(&a, &b);
Expand Down
47 changes: 13 additions & 34 deletions crates/models/bloom/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,10 @@ impl KnownModel for Bloom {

// normalize embeddings
input_layer = ctx0.op_norm(&input_layer);
input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer);
input_layer = ctx0.op_add(&ctx0.op_repeat(&self.norm_bias, &input_layer), &input_layer);
input_layer = ctx0.op_mul(&input_layer, &self.norm);
input_layer = ctx0.op_add(&input_layer, &self.norm_bias);

let mut gf = ggml::ComputationGraph::new();
let mut gf = ctx0.create_compute_graph();
for il in 0..n_layer {
let input_self_attention = input_layer.share();
let mut current: ggml::Tensor;
Expand All @@ -157,21 +157,12 @@ impl KnownModel for Bloom {
current = ctx0.op_norm(&input_layer);

// cur = attention_norm * cur
current = ctx0.op_mul(
&ctx0.op_repeat(&self.layers[il].attention_norm, &current),
&current,
);
current = ctx0.op_add(
&ctx0.op_repeat(&self.layers[il].attention_norm_b, &current),
&current,
);
current = ctx0.op_mul(&current, &self.layers[il].attention_norm);
current = ctx0.op_add(&current, &self.layers[il].attention_norm_b);

//attention
current = ctx0.op_mul_mat(&self.layers[il].query_key_value, &current);
current = ctx0.op_add(
&ctx0.op_repeat(&self.layers[il].query_key_value_b, &current),
&current,
);
current = ctx0.op_add(&current, &self.layers[il].query_key_value_b);

// self-attention
let nb = current.get_nb()[1];
Expand Down Expand Up @@ -293,7 +284,7 @@ impl KnownModel for Bloom {

// projection
current = ctx0.op_mul_mat(&self.layers[il].wo, &current);
current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].wo_b, &current), &current);
current = ctx0.op_add(&current, &self.layers[il].wo_b);

let input_feed_forward = ctx0.op_add(&current, &input_self_attention);

Expand All @@ -302,27 +293,21 @@ impl KnownModel for Bloom {
current = ctx0.op_norm(&input_feed_forward);

// cur = ffn_norm*cur + ffn_norm_b
current = ctx0.op_mul(
&ctx0.op_repeat(&self.layers[il].ffn_norm, &current),
&current,
);
current = ctx0.op_mul(&current, &self.layers[il].ffn_norm);

current = ctx0.op_add(
&ctx0.op_repeat(&self.layers[il].ffn_norm_b, &current),
&current,
);
current = ctx0.op_add(&current, &self.layers[il].ffn_norm_b);

current = ctx0.op_mul_mat(&self.layers[il].w1, &current);

current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].w1_b, &current), &current);
current = ctx0.op_add(&current, &self.layers[il].w1_b);

// SILU activation

current = ctx0.op_gelu(&current);

current = ctx0.op_mul_mat(&self.layers[il].w2, &current);

current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].w2_b, &current), &current);
current = ctx0.op_add(&current, &self.layers[il].w2_b);

current = ctx0.op_add(&current, &input_feed_forward);

Expand All @@ -334,15 +319,9 @@ impl KnownModel for Bloom {
input_layer = ctx0.op_norm(&input_layer);

// inpL = norm*inpL
input_layer = ctx0.op_mul(
&ctx0.op_repeat(&self.output_norm, &input_layer),
&input_layer,
);
input_layer = ctx0.op_mul(&input_layer, &self.output_norm);

input_layer = ctx0.op_add(
&ctx0.op_repeat(&self.output_norm_bias, &input_layer),
&input_layer,
);
input_layer = ctx0.op_add(&input_layer, &self.output_norm_bias);

let embeddings_tensor: ggml::Tensor = input_layer.share();

Expand Down
Loading