From 733ef0720b7f0ac4e2b4242dfc7dfc3bf0802d58 Mon Sep 17 00:00:00 2001
From: jempabroni <105541061+jempabroni@users.noreply.github.com>
Date: Thu, 23 Mar 2023 06:17:44 -0700
Subject: [PATCH 1/4] fix magic bug

https://github.com/setzer22/llama-rs/issues/59
---
 llama-rs/src/lib.rs | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs
index 4c1473ff..500006ee 100644
--- a/llama-rs/src/lib.rs
+++ b/llama-rs/src/lib.rs
@@ -147,6 +147,7 @@ type Token = String;
 pub struct Vocabulary {
     /// Maps every integer (index) token id to its corresponding string
     mapping: Vec<Token>,
+    score: Vec<f32>,
 }

 #[derive(serde::Serialize)]
@@ -253,8 +254,12 @@ pub enum LoadError {
     #[error("invalid integer conversion")]
     InvalidIntegerConversion(#[from] std::num::TryFromIntError),
+    #[error("file is pre-versioned, generate another please! at {path:?}")]
+    PreVersioned { path: PathBuf },
     #[error("invalid magic number for {path:?}")]
     InvalidMagic { path: PathBuf },
+    #[error("invalid version number for {path:?}")]
+    InvalidVersion { path: PathBuf },
     #[error("invalid value {value} for `f16` in hyperparameters")]
     HyperparametersF16Invalid { value: i32 },
     #[error("unknown tensor `{tensor_name}` in {path:?}")]
@@ -344,13 +349,29 @@ impl Model {
         // Verify magic
         {
             let magic = read_i32(&mut reader)?;
-            if magic != 0x67676d6c {
+            if magic == 0x67676d6c {
+                return Err(LoadError::PreVersioned {
+                    path: main_path.to_owned(),
+                });
+            }
+
+            if magic != 0x67676d66 {
                 return Err(LoadError::InvalidMagic {
                     path: main_path.to_owned(),
                 });
             }
         }

+        // Verify the version
+        {
+            let format_version = read_i32(&mut reader)?;
+            if format_version != 1 {
+                return Err(LoadError::InvalidVersion {
+                    path: main_path.to_owned(),
+                });
+            }
+        }
+
         // =================
         // Load hyper params
         // =================
@@ -373,9 +394,7 @@ impl Model {

         load_progress_callback(LoadProgress::HyperparametersLoaded(&hparams));

-        // ===============
         // Load vocabulary
-        // ===============
         let mut vocab = Vocabulary::default();
         for i in 0..hparams.n_vocab {
             let len = read_i32(&mut reader)?;
@@ -387,8 +406,10 @@ impl Model {
                 });
                 vocab.mapping.push("�".to_string());
             }
-        }

+            let score: f32 = read_i32(&mut reader)? as f32;
+            vocab.score.push(score);
+        }
         // for the big tensors, we have the option to store the data in 16-bit
         // floats or quantized in order to save memory and also to speed up the
         // computation

From 1d8b2c0534b5bd682ebee70ae942d9d0f3a5b0f1 Mon Sep 17 00:00:00 2001
From: jempabroni <105541061+jempabroni@users.noreply.github.com>
Date: Thu, 23 Mar 2023 09:24:53 -0700
Subject: [PATCH 2/4] 512 -> 1024: make 65B work

---
 llama-rs/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs
index 500006ee..dd64faf7 100644
--- a/llama-rs/src/lib.rs
+++ b/llama-rs/src/lib.rs
@@ -932,7 +932,7 @@ impl Model {
             f16_: _,
         } = self.hparams;

-        let mut buf_size = 512 * 1024 * 1024;
+        let mut buf_size = 1024 * 1024 * 1024;
         if session.mem_per_token > 0 && session.mem_per_token * n > buf_size {
             // add 10% to account for ggml object overhead
             buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize;

From fd8f65c7ab0cfc4524d078410f8641b93f427433 Mon Sep 17 00:00:00 2001
From: jempabroni <105541061+jempabroni@users.noreply.github.com>
Date: Thu, 23 Mar 2023 14:20:01 -0700
Subject: [PATCH 3/4] Revert "fix magic bug "

This reverts commit 733ef0720b7f0ac4e2b4242dfc7dfc3bf0802d58.
---
 llama-rs/src/lib.rs | 29 ++++-------------------------
 1 file changed, 4 insertions(+), 25 deletions(-)

diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs
index dd64faf7..eb3c47c8 100644
--- a/llama-rs/src/lib.rs
+++ b/llama-rs/src/lib.rs
@@ -147,7 +147,6 @@ type Token = String;
 pub struct Vocabulary {
     /// Maps every integer (index) token id to its corresponding string
     mapping: Vec<Token>,
-    score: Vec<f32>,
 }

 #[derive(serde::Serialize)]
@@ -254,12 +253,8 @@ pub enum LoadError {
     #[error("invalid integer conversion")]
     InvalidIntegerConversion(#[from] std::num::TryFromIntError),
-    #[error("file is pre-versioned, generate another please! at {path:?}")]
-    PreVersioned { path: PathBuf },
     #[error("invalid magic number for {path:?}")]
     InvalidMagic { path: PathBuf },
-    #[error("invalid version number for {path:?}")]
-    InvalidVersion { path: PathBuf },
     #[error("invalid value {value} for `f16` in hyperparameters")]
     HyperparametersF16Invalid { value: i32 },
     #[error("unknown tensor `{tensor_name}` in {path:?}")]
@@ -349,29 +344,13 @@ impl Model {
         // Verify magic
         {
             let magic = read_i32(&mut reader)?;
-            if magic == 0x67676d6c {
-                return Err(LoadError::PreVersioned {
-                    path: main_path.to_owned(),
-                });
-            }
-
-            if magic != 0x67676d66 {
+            if magic != 0x67676d6c {
                 return Err(LoadError::InvalidMagic {
                     path: main_path.to_owned(),
                 });
             }
         }

-        // Verify the version
-        {
-            let format_version = read_i32(&mut reader)?;
-            if format_version != 1 {
-                return Err(LoadError::InvalidVersion {
-                    path: main_path.to_owned(),
-                });
-            }
-        }
-
         // =================
         // Load hyper params
         // =================
@@ -394,7 +373,9 @@ impl Model {

         load_progress_callback(LoadProgress::HyperparametersLoaded(&hparams));

+        // ===============
         // Load vocabulary
+        // ===============
         let mut vocab = Vocabulary::default();
         for i in 0..hparams.n_vocab {
             let len = read_i32(&mut reader)?;
@@ -406,10 +387,8 @@ impl Model {
                 });
                 vocab.mapping.push("�".to_string());
             }
-
-            let score: f32 = read_i32(&mut reader)? as f32;
-            vocab.score.push(score);
         }
+
         // for the big tensors, we have the option to store the data in 16-bit
         // floats or quantized in order to save memory and also to speed up the
         // computation

From 79a0359493085a13a3335bef33c44cba647ad932 Mon Sep 17 00:00:00 2001
From: setzer22
Date: Thu, 23 Mar 2023 22:24:58 +0100
Subject: [PATCH 4/4] Update llama-rs/src/lib.rs

---
 llama-rs/src/lib.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs
index 25a5b317..4a5616ff 100644
--- a/llama-rs/src/lib.rs
+++ b/llama-rs/src/lib.rs
@@ -1085,6 +1085,8 @@ impl Model {
             f16_: _,
         } = self.hparams;

+        // For the first run, we need to guess a maximum buffer size so we can measure
+        // the actual memory consumption of the temporary ggml context.
         let mut buf_size = 1024 * 1024 * 1024;
         if session.mem_per_token > 0 && session.mem_per_token * n > buf_size {
             // add 10% to account for ggml object overhead
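
The header check that PATCH 1/4 introduces (and PATCH 3/4 backs out) reads two little-endian i32 fields: a magic distinguishing the old unversioned "ggml" container (0x67676d6c) from the versioned "ggmf" one (0x67676d66), then, for versioned files only, a format version that must equal 1. Below is a minimal, self-contained sketch of that gate, not llama-rs's actual API: check_header is a hypothetical helper, the io::Error values stand in for the patch's LoadError variants, and read_i32 mirrors the loader's little-endian read helper.

use std::io::{self, Read};

// Little-endian i32 reader, mirroring the loader's helper.
fn read_i32(reader: &mut impl Read) -> io::Result<i32> {
    let mut bytes = [0u8; 4];
    reader.read_exact(&mut bytes)?;
    Ok(i32::from_le_bytes(bytes))
}

// Hypothetical helper: validates the magic, then the format version.
fn check_header(reader: &mut impl Read) -> io::Result<()> {
    let magic = read_i32(reader)?;
    if magic == 0x6767_6d6c {
        // Old unversioned "ggml" file: no format_version field follows,
        // so it must be regenerated rather than parsed.
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "file is pre-versioned, generate another please!",
        ));
    }
    if magic != 0x6767_6d66 {
        // Neither "ggml" nor "ggmf": not a model file at all.
        return Err(io::Error::new(io::ErrorKind::InvalidData, "invalid magic"));
    }
    // Versioned "ggmf" file: an i32 format version follows the magic.
    let format_version = read_i32(reader)?;
    if format_version != 1 {
        return Err(io::Error::new(io::ErrorKind::InvalidData, "invalid version"));
    }
    Ok(())
}

fn main() -> io::Result<()> {
    // A minimal in-memory "file": the ggmf magic followed by version 1.
    let header = [0x6767_6d66_i32.to_le_bytes(), 1_i32.to_le_bytes()].concat();
    check_header(&mut header.as_slice())
}

Reading the version eagerly is why the revert matters for old files: an unversioned "ggml" file carries hyperparameters where a versioned file carries its format_version, so the two layouts cannot be consumed by the same fixed sequence of reads.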
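
The change that survives the series (PATCH 2/4, documented by PATCH 4/4) is the evaluation-buffer heuristic: guess a fixed maximum on the first run, measure mem_per_token, then size later buffers from that measurement with 10% headroom for ggml object overhead. A standalone sketch, with eval_buf_size as a hypothetical helper in place of the session fields used in lib.rs:

// Hypothetical standalone helper; in lib.rs the inputs come from
// session.mem_per_token and the number of tokens being evaluated.
fn eval_buf_size(mem_per_token: usize, n_tokens: usize) -> usize {
    // First run: nothing measured yet (mem_per_token == 0), so fall back
    // to the fixed guess, raised from 512 MiB to 1 GiB so 65B models fit.
    let mut buf_size = 1024 * 1024 * 1024;
    if mem_per_token > 0 && mem_per_token * n_tokens > buf_size {
        // Add 10% to account for ggml object overhead.
        buf_size = (1.1f64 * mem_per_token as f64 * n_tokens as f64) as usize;
    }
    buf_size
}

fn main() {
    // Before any evaluation, the fixed 1 GiB guess is used.
    assert_eq!(eval_buf_size(0, 8), 1024 * 1024 * 1024);
    // Once ~200 MiB/token has been measured, 8 tokens exceed the guess,
    // so the buffer grows to 1.1 * 200 MiB * 8 ≈ 1.72 GiB.
    let per_token = 200 * 1024 * 1024;
    assert_eq!(
        eval_buf_size(per_token, 8),
        (1.1f64 * per_token as f64 * 8.0) as usize
    );
}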