Merge pull request rustformers#325 from LLukas22/feat/cuda-opencl-acc…

…eleration CUDA/OpenCL Acceleration
LahiRumesh · Jul 16, 2023 · 3062a08 · 3062a08
2 parents 0269796 + d815857
commit 3062a08
Show file tree

Hide file tree

Showing 31 changed files with 1,244 additions and 634 deletions.
diff --git a/binaries/generate-ggml-bindings/src/main.rs b/binaries/generate-ggml-bindings/src/main.rs
@@ -23,6 +23,8 @@ fn main() {
 
 fn generate_main(ggml_path: &Path, src_path: &Path) {
     let bindings = bindgen::Builder::default()
+        .header(ggml_path.join("ggml.h").to_str().unwrap().to_string())
+        .allowlist_file(r".*ggml.h")
         .header(ggml_path.join("k_quants.h").to_string_lossy())
         .allowlist_file(r".*k_quants.h")
         // Suppress some warnings

diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs
@@ -335,6 +335,8 @@ impl Generate {
             memory_k_type: mem_typ,
             memory_v_type: mem_typ,
             use_gpu: self.use_gpu,
+            n_batch: self.batch_size,
+            n_threads: self.num_threads(),
         }
     }
 
@@ -348,8 +350,6 @@ impl Generate {
 
     pub fn inference_parameters(&self, eot: llm::TokenId) -> InferenceParameters {
         InferenceParameters {
-            n_threads: self.num_threads(),
-            n_batch: self.batch_size,
             sampler: Arc::new(llm::samplers::TopPTopK {
                 top_k: self.top_k,
                 top_p: self.top_p,
@@ -457,6 +457,10 @@ pub struct ModelLoad {
     /// LoRA adapter to use for the model
     #[arg(long, num_args(0..))]
     pub lora_paths: Option<Vec<PathBuf>>,
+
+    /// Number of layers to run on the GPU. If not specified, all layers will be run on the GPU.
+    #[arg(long)]
+    pub gpu_layers: Option<usize>,
 }
 impl ModelLoad {
     pub fn load(&self, use_gpu: bool) -> eyre::Result<Box<dyn Model>> {
@@ -465,6 +469,7 @@ impl ModelLoad {
             context_size: self.num_ctx_tokens,
             lora_adapters: self.lora_paths.clone(),
             use_gpu,
+            gpu_layers: self.gpu_layers,
         };
 
         let mut sp = Some(spinoff::Spinner::new(

diff --git a/binaries/llm-cli/src/interactive.rs b/binaries/llm-cli/src/interactive.rs
@@ -34,7 +34,7 @@ pub fn repl(
             .as_deref()
             .map(|template| util::process_prompt(template, &line))
             .unwrap_or(line);
-        feed_prompt_with_spinner(model, &mut session, &parameters, prompt)?;
+        feed_prompt_with_spinner(model, &mut session, prompt)?;
 
         session.infer::<Infallible>(
             model,
@@ -79,7 +79,7 @@ pub fn chat(args: &Chat) -> eyre::Result<()> {
 
     let model = model.as_ref();
     let mut session = create_session(model, inference_session_config);
-    feed_prompt_with_spinner(model, &mut session, &parameters, prelude_prompt)?;
+    feed_prompt_with_spinner(model, &mut session, prelude_prompt)?;
 
     readline_loop(|raw_line| {
         let prompt = {
@@ -134,7 +134,6 @@ fn initialize_common_state(
 fn feed_prompt_with_spinner(
     model: &dyn llm::Model,
     session: &mut llm::InferenceSession,
-    parameters: &llm::InferenceParameters,
     mut prompt: String,
 ) -> eyre::Result<()> {
     // Add a newline to the beginning of the prompt if the last character in the session is not a newline
@@ -145,7 +144,6 @@ fn feed_prompt_with_spinner(
     let sp = spinoff::Spinner::new(spinoff::spinners::Dots2, "".to_string(), None);
     let result = session.feed_prompt(
         model,
-        parameters,
         &prompt,
         // OutputRequest
         &mut Default::default(),

diff --git a/binaries/llm-cli/src/main.rs b/binaries/llm-cli/src/main.rs
@@ -113,16 +113,10 @@ fn perplexity(args: &cli_args::Perplexity) -> eyre::Result<()> {
     let model = args.model_load.load(args.generate.use_gpu)?;
     let (mut session, _) =
         snapshot::read_or_create_session(model.as_ref(), None, None, inference_session_config);
-    let parameters = args.generate.inference_parameters(model.eot_token_id());
 
-    session.perplexity(
-        model.as_ref(),
-        &parameters,
-        prompt.as_str(),
-        |chunk, perplexity| {
-            println!("Perplexity[{chunk}]: {perplexity}");
-        },
-    )?;
+    session.perplexity(model.as_ref(), prompt.as_str(), |chunk, perplexity| {
+        println!("Perplexity[{chunk}]: {perplexity}");
+    })?;
 
     Ok(())
 }

diff --git a/binaries/llm-test/src/delete.rs b/binaries/llm-test/src/delete.rs
@@ -64,7 +64,7 @@ fn feed_prompt(
     model: &impl Model,
     output: &mut OutputRequest,
 ) -> Result<(), llm::InferenceError> {
-    session.feed_prompt(model, &Default::default(), prompt, output, always_continue)
+    session.feed_prompt(model, prompt, output, always_continue)
 }
 
 fn always_continue(_: &[u8]) -> Result<InferenceFeedback, Infallible> {

diff --git a/binaries/llm-test/src/inference.rs b/binaries/llm-test/src/inference.rs
@@ -4,7 +4,7 @@
 
 use std::{convert::Infallible, sync::Arc};
 
-use llm::InferenceStats;
+use llm::{InferenceSessionConfig, InferenceStats};
 
 use crate::{ModelConfig, TestCaseReport, TestCaseReportInner, TestCaseReportMeta};
 
@@ -15,14 +15,11 @@ pub(crate) fn can_infer(
     expected_output: Option<&str>,
     maximum_token_count: usize,
 ) -> anyhow::Result<TestCaseReport> {
-    let mut session = model.start_session(Default::default());
-    let (actual_output, res) = run_inference(
-        model,
-        model_config,
-        &mut session,
-        input,
-        maximum_token_count,
-    );
+    let mut session = model.start_session(InferenceSessionConfig {
+        n_threads: model_config.threads,
+        ..Default::default()
+    });
+    let (actual_output, res) = run_inference(model, &mut session, input, maximum_token_count);
 
     // Process the results
     Ok(TestCaseReport {
@@ -58,7 +55,6 @@ pub(crate) fn can_infer(
 
 fn run_inference(
     model: &dyn llm::Model,
-    model_config: &ModelConfig,
     session: &mut llm::InferenceSession,
     input: &str,
     maximum_token_count: usize,
@@ -70,8 +66,6 @@ fn run_inference(
         &llm::InferenceRequest {
             prompt: input.into(),
             parameters: &llm::InferenceParameters {
-                n_threads: model_config.threads,
-                n_batch: 1,
                 sampler: Arc::new(DeterministicSampler),
             },
             play_back_previous_tokens: false,

diff --git a/binaries/llm-test/src/tokens.rs b/binaries/llm-test/src/tokens.rs
@@ -65,9 +65,7 @@ fn feed_prompt(
     model: &impl Model,
     output: &mut OutputRequest,
 ) -> Result<(), llm::InferenceError> {
-    session.feed_prompt(model, &Default::default(), prompt, output, |x| {
-        always_continue(x)
-    })
+    session.feed_prompt(model, prompt, output, always_continue)
 }
 
 fn always_continue(_: &[u8]) -> Result<InferenceFeedback, Infallible> {

diff --git a/crates/ggml/src/metal.rs → crates/ggml/src/accelerator/metal.rs b/crates/ggml/src/metal.rs → crates/ggml/src/accelerator/metal.rs
@@ -1,6 +1,6 @@
 //! Metal support.
 use crate::{sys::metal, Buffer, ComputationGraph, Context, Tensor};
-use std::{ffi::c_void, ptr::NonNull, sync::Arc};
+use std::{ptr::NonNull, sync::Arc};
 
 /// Acts as a RAII-guard over a `sys::metal::ggml_metal_context`, allocating via
 /// `ggml_metal_init` and dropping via `ggml_metal_free`.
@@ -14,8 +14,8 @@ pub struct MetalContext {
 
 impl MetalContext {
     /// Create a new Metal context
-    pub fn new() -> Self {
-        let raw = unsafe { metal::ggml_metal_init() };
+    pub fn new(n_threads: usize) -> Self {
+        let raw = unsafe { metal::ggml_metal_init(n_threads.try_into().unwrap()) };
 
         MetalContext {
             contexts: vec![],
@@ -45,47 +45,32 @@ impl MetalContext {
 
     /// Add a context's memory as buffer to this Metal context
     pub fn add_context(&mut self, from_context: Arc<Context>) {
-        if self.ref_context(from_context.clone()) {
-            unsafe {
-                let raw_context = from_context.ptr.as_ptr();
-
-                let (data_ptr, data_size): (*mut c_void, usize) =
-                    if let Some(ref mmap) = from_context.mmap {
-                        // This is a bit naughty...
-                        (mmap.as_ptr().cast_mut().cast(), mmap.len())
-                    } else {
-                        (
-                            ggml_sys::ggml_get_mem_buffer(raw_context),
-                            ggml_sys::ggml_get_mem_size(raw_context),
-                        )
-                    };
-
-                let max_size = ggml_sys::ggml_get_max_tensor_size(raw_context);
-                assert!(
-                    metal::ggml_metal_add_buffer(
-                        self.ptr.as_ptr(),
-                        "wt\0".as_ptr().cast(), // FIXME provide an actual name
-                        data_ptr,
-                        data_size,
-                        max_size
-                    ),
-                    "Could not add weight buffer to metal context"
-                );
-            }
+        if !self.ref_context(from_context.clone()) {
+            return;
         }
-    }
-}
 
-impl Default for MetalContext {
-    fn default() -> Self {
-        Self::new()
+        unsafe {
+            let raw_context = from_context.as_ptr();
+            let (data_ptr, data_size) = from_context.storage().as_ptr_and_size(&from_context);
+            let max_size = ggml_sys::ggml_get_max_tensor_size(raw_context);
+            assert!(
+                metal::ggml_metal_add_buffer(
+                    self.ptr.as_ptr(),
+                    "wt\0".as_ptr().cast(), // FIXME provide an actual name
+                    data_ptr,
+                    data_size,
+                    max_size
+                ),
+                "Could not add weight buffer to metal context"
+            );
+        }
     }
 }
 
 impl MetalContext {
     /// Registers a context as a context that provides Metal buffers. Returns true if the context was not registered before.
     fn ref_context(&mut self, context: Arc<Context>) -> bool {
-        if self.contexts.iter().any(|c| c.ptr == context.ptr) {
+        if self.contexts.iter().any(|c| *c == context) {
             false
         } else {
             self.contexts.push(context);

diff --git a/crates/ggml/src/accelerator/mod.rs b/crates/ggml/src/accelerator/mod.rs
@@ -0,0 +1,94 @@
+//! Functionality related to hardware acceleration of GGML (GPU, etc.)
+use crate::sys;
+
+#[cfg(feature = "metal")]
+pub mod metal;
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+/// Accelerators supported by `ggml`.
+pub enum Accelerator {
+    /// CuBLAS accelerated
+    CuBLAS,
+    /// CLBlast accelerated
+    CLBlast,
+    /// Metal accelerated
+    Metal,
+    /// Cpu accelerated
+    None,
+}
+
+/// Returns the accelerator `ggml` was compiled with, as selected by the enabled Cargo feature (`cublas`, `clblast` or `metal`; `Accelerator::None` otherwise).
+pub fn get_accelerator() -> Accelerator {
+    #[cfg(feature = "cublas")]
+    return Accelerator::CuBLAS;
+    #[cfg(feature = "clblast")]
+    return Accelerator::CLBlast;
+    #[cfg(feature = "metal")]
+    return Accelerator::Metal;
+    #[cfg(not(any(feature = "cublas", feature = "clblast", feature = "metal")))]
+    return Accelerator::None;
+}
+
+#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
+/// Backend to use for a tensor.
+pub enum Backend {
+    /// CPU backend
+    #[default]
+    Cpu,
+    /// GPU backend
+    Gpu,
+    /// Multi-GPU backend
+    GpuSplit,
+}
+
+impl From<Backend> for sys::ggml_backend {
+    fn from(b: Backend) -> Self {
+        match b {
+            Backend::Cpu => sys::ggml_backend_GGML_BACKEND_CPU,
+            Backend::Gpu => sys::ggml_backend_GGML_BACKEND_GPU,
+            Backend::GpuSplit => sys::ggml_backend_GGML_BACKEND_GPU_SPLIT,
+        }
+    }
+}
+
+impl TryFrom<sys::ggml_backend> for Backend {
+    type Error = ();
+    fn try_from(b: sys::ggml_backend) -> Result<Self, Self::Error> {
+        match b {
+            sys::ggml_backend_GGML_BACKEND_CPU => Ok(Backend::Cpu),
+            sys::ggml_backend_GGML_BACKEND_GPU => Ok(Backend::Gpu),
+            sys::ggml_backend_GGML_BACKEND_GPU_SPLIT => Ok(Backend::GpuSplit),
+            _ => Err(()),
+        }
+    }
+}
+
+/// Initialize the accelerator. If ggml-sys is compiled with CUDA or CLBlast support, this function will initialize the accelerator. If not this is a no-op.
+#[allow(unused_variables)]
+pub fn initialize(device: i32) {
+    #[cfg(feature = "cublas")]
+    unsafe {
+        //TODO: Make this configurable
+        sys::cuda::ggml_init_cublas();
+        sys::cuda::ggml_cuda_set_main_device(device);
+        let split = 1.0f32;
+        sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32);
+    }
+}
+
+///  Sets the scratch size for the GPU. If ggml-sys is compiled with CUDA support, this function will set the scratch size. If not this is a no-op.
+#[allow(unused_variables)]
+pub fn set_scratch_size(size: usize) {
+    #[cfg(feature = "cublas")]
+    unsafe {
+        sys::cuda::ggml_cuda_set_scratch_size(size);
+    }
+}
+
+/// Frees the scratch memory. If ggml-sys is compiled with CUDA support, this function will free the scratch memory. If not this is a no-op.
+pub fn free_scratch() {
+    #[cfg(feature = "cublas")]
+    unsafe {
+        sys::cuda::ggml_cuda_free_scratch();
+    }
+}