Refactor ggml stuff into a single crate
danforbes committed Apr 29, 2023
1 parent 288df7f commit 0aea8f7
Showing 39 changed files with 1,027 additions and 16,074 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
+[submodule "ggml-rs/ggml"]
+path = ggml-rs/ggml
+url = git@github.com:ggerganov/ggml.git
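
Since ggml is now vendored as a git submodule, a fresh checkout must fetch it before the crate can build, for example:

    git clone --recurse-submodules <repository-url>

or, in an existing checkout:

    git submodule update --init --recursive
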
36 changes: 6 additions & 30 deletions Cargo.lock

Some generated files are not rendered by default.

7 changes: 1 addition & 6 deletions Cargo.toml
@@ -1,17 +1,12 @@
[workspace]
members = [
# Crates
"ggml-sys",
"ggml-format",
"ggml",
"ggml-rs",
"llm-base",
"llama",
"bloom",
"llm",
"llm-cli",

-# Tools
-"generate-ggml-bindings"
]
resolver = "2"

2 changes: 1 addition & 1 deletion bloom/Cargo.toml
@@ -6,7 +6,7 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
-ggml = { path = "../ggml" }
+ggml-rs = { path = "../ggml-rs" }
llm-base = { path = "../llm-base" }

bytemuck = { workspace = true }
52 changes: 26 additions & 26 deletions bloom/src/lib.rs
@@ -3,7 +3,7 @@ use std::path::Path;
// use ggml_loader::{LoadError, LoadProgress};
use llm_base::{
util, EvaluateOutputRequest, FileType, InferenceParameters, InferenceSession,
-InferenceSessionParameters, LoadError, LoadProgress, Mmap, KnownModel, TokenId, Vocabulary,
+InferenceSessionParameters, KnownModel, LoadError, LoadProgress, Mmap, TokenId, Vocabulary,
};

/// The weights for the BLOOM model. All the mutable state is split into a
@@ -13,16 +13,16 @@ pub struct Bloom {
n_context_tokens: usize,

vocabulary: Vocabulary,
-tok_embeddings: ggml::Tensor,
-norm: ggml::Tensor,
-norm_b: ggml::Tensor,
-output_norm: ggml::Tensor,
-output_norm_b: ggml::Tensor,
-output: ggml::Tensor,
+tok_embeddings: ggml_rs::Tensor,
+norm: ggml_rs::Tensor,
+norm_b: ggml_rs::Tensor,
+output_norm: ggml_rs::Tensor,
+output_norm_b: ggml_rs::Tensor,
+output: ggml_rs::Tensor,
layers: Vec<Layer>,

// Must be kept alive for the model
-_context: ggml::Context,
+_context: ggml_rs::context::Context,
_mmap: Option<Mmap>,
}

@@ -162,12 +162,12 @@ impl KnownModel for Bloom {
// add 10% to account for ggml object overhead
buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize;
};
-let ctx0 = ggml::Context::init(buf_size, true);
+let ctx0 = ggml_rs::context::Context::init(buf_size, true);

// TODO: REMAKE THIS AFTER CHECKING GGML GRAPH
-let mut gf = ggml::ComputationGraph::new(n_threads);
+let mut gf = ggml_rs::ComputationGraph::new(n_threads);

-let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, n);
+let mut embd = ctx0.new_tensor_1d(ggml_rs::Type::I32, n);
unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) };

let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd);
@@ -181,7 +181,7 @@ impl KnownModel for Bloom {

for il in 0..n_layer {
let input_self_attention = input_layer.share();
-let mut current: ggml::Tensor;
+let mut current: ggml_rs::Tensor;

// norm
{
@@ -252,7 +252,7 @@ impl KnownModel for Bloom {
let q = ctx0.op_permute(
&ctx0.op_cpy(
&q_current,
-&ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, n),
+&ctx0.new_tensor_3d(ggml_rs::Type::F32, n_embd / n_head, n_head, n),
),
0,
2,
@@ -336,7 +336,7 @@ impl KnownModel for Bloom {
// cur = KQV_merged.contiguous().view(n_embd, N)
current = ctx0.op_cpy(
&k_q_v_merged,
-&ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n),
+&ctx0.new_tensor_2d(ggml_rs::Type::F32, n_embd, n),
);

// projection
@@ -499,18 +499,18 @@ impl llm_base::Hyperparameters for Hyperparameters {
}

struct Layer {
-pub attention_norm: ggml::Tensor,
-pub attention_norm_b: ggml::Tensor,
-pub wo: ggml::Tensor,
-pub wo_b: ggml::Tensor,
-pub query_key_value: ggml::Tensor,
-pub query_key_value_b: ggml::Tensor,
+pub attention_norm: ggml_rs::Tensor,
+pub attention_norm_b: ggml_rs::Tensor,
+pub wo: ggml_rs::Tensor,
+pub wo_b: ggml_rs::Tensor,
+pub query_key_value: ggml_rs::Tensor,
+pub query_key_value_b: ggml_rs::Tensor,
// normalization
-pub ffn_norm: ggml::Tensor,
-pub ffn_norm_b: ggml::Tensor,
+pub ffn_norm: ggml_rs::Tensor,
+pub ffn_norm_b: ggml_rs::Tensor,
// ff
-pub w1: ggml::Tensor,
-pub w1_b: ggml::Tensor,
-pub w2: ggml::Tensor,
-pub w2_b: ggml::Tensor,
+pub w1: ggml_rs::Tensor,
+pub w1_b: ggml_rs::Tensor,
+pub w2: ggml_rs::Tensor,
+pub w2_b: ggml_rs::Tensor,
}
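
The _context field is the load-bearing detail in this struct: ggml tensors are views into memory owned by the context, so the model must keep the ggml_rs::context::Context alive for as long as its tensors are used (hence the "Must be kept alive for the model" comment). A minimal sketch of the ownership pattern, with the field and type names taken from this diff and the rest illustrative:

    // Sketch only: each Tensor points into memory owned by the Context,
    // so the Context must outlive every Tensor derived from it.
    struct Weights {
        embeddings: ggml_rs::Tensor,          // view into context-owned memory
        _context: ggml_rs::context::Context,  // held only to keep the tensors valid
    }
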
9 changes: 0 additions & 9 deletions generate-ggml-bindings/Cargo.toml

This file was deleted.

34 changes: 0 additions & 34 deletions generate-ggml-bindings/src/main.rs

This file was deleted.

13 changes: 0 additions & 13 deletions ggml-format/Cargo.toml

This file was deleted.

45 changes: 0 additions & 45 deletions ggml-format/src/lib.rs

This file was deleted.

14 changes: 14 additions & 0 deletions ggml-rs/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "ggml-rs"
+version = { workspace = true }
+edition = "2021"
+
+[build-dependencies]
+bindgen = "0.64.0"
+cc = "^1.0"
+
+[dependencies]
+thiserror = "1.0"
+
+[dev-dependencies]
+rand = "0.8"
25 changes: 18 additions & 7 deletions ggml-sys/build.rs → ggml-rs/build.rs
@@ -1,17 +1,18 @@
-use std::env;
+use std::{env, path::PathBuf};

+// By default, this crate will attempt to compile ggml with the features of your host system if
+// the host and target are the same. If they are not, it will turn off auto-feature-detection,
+// and you will need to manually specify target features through target-features.
fn main() {
-// By default, this crate will attempt to compile ggml with the features of your host system if
-// the host and target are the same. If they are not, it will turn off auto-feature-detection,
-// and you will need to manually specify target features through target-features.

println!("cargo:rerun-if-changed=ggml");

-let ggml_src = ["ggml/ggml.c"];
+let ggml_src = ["ggml/src/ggml.c"];

let mut builder = cc::Build::new();

-let build = builder.files(ggml_src.iter()).include("include");
+let build = builder
+    .files(ggml_src.iter())
+    .include("./ggml/include/ggml");

// This is a very basic heuristic for applying compile flags.
// Feel free to update this to fit your operating system.
@@ -88,6 +89,16 @@ fn main() {
}
build.warnings(false);
build.compile("ggml");

+let header_path = "./ggml/include/ggml/ggml.h";
+bindgen::Builder::default()
+    .header(String::from(header_path))
+    .allowlist_file(header_path)
+    .parse_callbacks(Box::new(bindgen::CargoCallbacks))
+    .generate()
+    .expect("Unable to generate bindings.")
+    .write_to_file(PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs"))
+    .expect("Unable to write generated bindings to file.");
}

fn get_supported_target_features() -> std::collections::HashSet<String> {
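
Two notes on the new build flow. First, the generated bindings.rs now lands in OUT_DIR rather than being checked in, which is what makes the generate-ggml-bindings tool and the pre-generated ggml-sys crate redundant. The conventional way to consume such bindings, and presumably what ggml-rs does, is an include! from the crate source; a sketch, with the module layout being an assumption:

    // ggml-rs/src/lib.rs (hypothetical layout)
    #[allow(non_upper_case_globals, non_camel_case_types, non_snake_case)]
    mod sys {
        include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
    }

Second, per the comment at the top of build.rs, cross-compilation disables feature auto-detection, so target features must be supplied by hand, for example:

    RUSTFLAGS="-C target-feature=+avx2,+fma" cargo build --release
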
1 change: 1 addition & 0 deletions ggml-rs/ggml
Submodule ggml added at 8cc067