From 733ef0720b7f0ac4e2b4242dfc7dfc3bf0802d58 Mon Sep 17 00:00:00 2001
From: jempabroni <105541061+jempabroni@users.noreply.github.com>
Date: Thu, 23 Mar 2023 06:17:44 -0700
Subject: [PATCH 1/4] fix magic bug

https://github.com/setzer22/llama-rs/issues/59
---
 llama-rs/src/lib.rs | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs
index 4c1473ff..500006ee 100644
--- a/llama-rs/src/lib.rs
+++ b/llama-rs/src/lib.rs
@@ -147,6 +147,7 @@ type Token = String;
 pub struct Vocabulary {
     /// Maps every integer (index) token id to its corresponding string
     mapping: Vec<Token>,
+    score: Vec<f32>,
 }

 #[derive(serde::Serialize)]
@@ -253,8 +254,12 @@ pub enum LoadError {
     #[error("invalid integer conversion")]
     InvalidIntegerConversion(#[from] std::num::TryFromIntError),
+    #[error("file is pre-versioned, generate another please! at {path:?}")]
+    PreVersioned { path: PathBuf },
     #[error("invalid magic number for {path:?}")]
     InvalidMagic { path: PathBuf },
+    #[error("invalid version number for {path:?}")]
+    InvalidVersion { path: PathBuf },
     #[error("invalid value {value} for `f16` in hyperparameters")]
     HyperparametersF16Invalid { value: i32 },
     #[error("unknown tensor `{tensor_name}` in {path:?}")]
@@ -344,13 +349,29 @@ impl Model {
         // Verify magic
         {
             let magic = read_i32(&mut reader)?;
-            if magic != 0x67676d6c {
+            if magic == 0x67676d6c {
+                return Err(LoadError::PreVersioned {
+                    path: main_path.to_owned(),
+                });
+            }
+
+            if magic != 0x67676d66 {
                 return Err(LoadError::InvalidMagic {
                     path: main_path.to_owned(),
                 });
             }
         }

+        // Verify the version
+        {
+            let format_version = read_i32(&mut reader)?;
+            if format_version != 1 {
+                return Err(LoadError::InvalidVersion {
+                    path: main_path.to_owned(),
+                });
+            }
+        }
+
         // =================
         // Load hyper params
         // =================
@@ -373,9 +394,7 @@ impl Model {

         load_progress_callback(LoadProgress::HyperparametersLoaded(&hparams));

-        // ===============
         // Load vocabulary
-        // ===============
         let mut vocab = Vocabulary::default();
         for i in 0..hparams.n_vocab {
             let len = read_i32(&mut reader)?;
@@ -387,8 +406,10 @@ impl Model {
                 });
                 vocab.mapping.push("�".to_string());
             }
-        }

+            let score: f32 = read_i32(&mut reader)? as f32;
+            vocab.score.push(score);
+        }
         // for the big tensors, we have the option to store the data in 16-bit
         // floats or quantized in order to save memory and also to speed up the
         // computation

From 1d8b2c0534b5bd682ebee70ae942d9d0f3a5b0f1 Mon Sep 17 00:00:00 2001
From: jempabroni <105541061+jempabroni@users.noreply.github.com>
Date: Thu, 23 Mar 2023 09:24:53 -0700
Subject: [PATCH 2/4] 512 -> 1024: make 65B work

---
 llama-rs/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs
index 500006ee..dd64faf7 100644
--- a/llama-rs/src/lib.rs
+++ b/llama-rs/src/lib.rs
@@ -932,7 +932,7 @@ impl Model {
             f16_: _,
         } = self.hparams;

-        let mut buf_size = 512 * 1024 * 1024;
+        let mut buf_size = 1024 * 1024 * 1024;
         if session.mem_per_token > 0 && session.mem_per_token * n > buf_size {
             // add 10% to account for ggml object overhead
             buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize;

From fd8f65c7ab0cfc4524d078410f8641b93f427433 Mon Sep 17 00:00:00 2001
From: jempabroni <105541061+jempabroni@users.noreply.github.com>
Date: Thu, 23 Mar 2023 14:20:01 -0700
Subject: [PATCH 3/4] Revert "fix magic bug "

This reverts commit 733ef0720b7f0ac4e2b4242dfc7dfc3bf0802d58.
---
 llama-rs/src/lib.rs | 29 ++++-------------------------
 1 file changed, 4 insertions(+), 25 deletions(-)

diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs
index dd64faf7..eb3c47c8 100644
--- a/llama-rs/src/lib.rs
+++ b/llama-rs/src/lib.rs
@@ -147,7 +147,6 @@ type Token = String;
 pub struct Vocabulary {
     /// Maps every integer (index) token id to its corresponding string
     mapping: Vec<Token>,
-    score: Vec<f32>,
 }

 #[derive(serde::Serialize)]
@@ -254,12 +253,8 @@ pub enum LoadError {
     #[error("invalid integer conversion")]
     InvalidIntegerConversion(#[from] std::num::TryFromIntError),
-    #[error("file is pre-versioned, generate another please! at {path:?}")]
-    PreVersioned { path: PathBuf },
     #[error("invalid magic number for {path:?}")]
     InvalidMagic { path: PathBuf },
-    #[error("invalid version number for {path:?}")]
-    InvalidVersion { path: PathBuf },
     #[error("invalid value {value} for `f16` in hyperparameters")]
     HyperparametersF16Invalid { value: i32 },
     #[error("unknown tensor `{tensor_name}` in {path:?}")]
@@ -349,29 +344,13 @@ impl Model {
         // Verify magic
         {
             let magic = read_i32(&mut reader)?;
-            if magic == 0x67676d6c {
-                return Err(LoadError::PreVersioned {
-                    path: main_path.to_owned(),
-                });
-            }
-
-            if magic != 0x67676d66 {
+            if magic != 0x67676d6c {
                 return Err(LoadError::InvalidMagic {
                     path: main_path.to_owned(),
                 });
             }
         }

-        // Verify the version
-        {
-            let format_version = read_i32(&mut reader)?;
-            if format_version != 1 {
-                return Err(LoadError::InvalidVersion {
-                    path: main_path.to_owned(),
-                });
-            }
-        }
-
         // =================
         // Load hyper params
         // =================
@@ -394,7 +373,9 @@ impl Model {

         load_progress_callback(LoadProgress::HyperparametersLoaded(&hparams));

+        // ===============
         // Load vocabulary
+        // ===============
         let mut vocab = Vocabulary::default();
         for i in 0..hparams.n_vocab {
             let len = read_i32(&mut reader)?;
@@ -406,10 +387,8 @@ impl Model {
                 });
                 vocab.mapping.push("�".to_string());
             }
-
-            let score: f32 = read_i32(&mut reader)? as f32;
-            vocab.score.push(score);
         }
+
         // for the big tensors, we have the option to store the data in 16-bit
         // floats or quantized in order to save memory and also to speed up the
         // computation

From 79a0359493085a13a3335bef33c44cba647ad932 Mon Sep 17 00:00:00 2001
From: setzer22
Date: Thu, 23 Mar 2023 22:24:58 +0100
Subject: [PATCH 4/4] Update llama-rs/src/lib.rs

---
 llama-rs/src/lib.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs
index 25a5b317..4a5616ff 100644
--- a/llama-rs/src/lib.rs
+++ b/llama-rs/src/lib.rs
@@ -1085,6 +1085,8 @@ impl Model {
             f16_: _,
         } = self.hparams;

+        // For the first run, we need to guess a maximum buffer size so we can measure
+        // the actual memory consumption of the temporary ggml context.
         let mut buf_size = 1024 * 1024 * 1024;
         if session.mem_per_token > 0 && session.mem_per_token * n > buf_size {
             // add 10% to account for ggml object overhead
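
The header check that PATCH 1/4 introduces (and PATCH 3/4 backs out) reads two little-endian i32 fields: a magic distinguishing the old unversioned "ggml" container (0x67676d6c) from the versioned "ggmf" one (0x67676d66), then, for versioned files only, a format version that must equal 1. Below is a minimal, self-contained sketch of that gate, not llama-rs's actual API: check_header is a hypothetical helper, the io::Error values stand in for the patch's LoadError variants, and read_i32 mirrors the loader's little-endian read helper.

use std::io::{self, Read};

// Little-endian i32 reader, mirroring the loader's helper.
fn read_i32(reader: &mut impl Read) -> io::Result<i32> {
    let mut bytes = [0u8; 4];
    reader.read_exact(&mut bytes)?;
    Ok(i32::from_le_bytes(bytes))
}

// Hypothetical helper: validates the magic, then the format version.
fn check_header(reader: &mut impl Read) -> io::Result<()> {
    let magic = read_i32(reader)?;
    if magic == 0x6767_6d6c {
        // Old unversioned "ggml" file: no format_version field follows,
        // so it must be regenerated rather than parsed.
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "file is pre-versioned, generate another please!",
        ));
    }
    if magic != 0x6767_6d66 {
        // Neither "ggml" nor "ggmf": not a model file at all.
        return Err(io::Error::new(io::ErrorKind::InvalidData, "invalid magic"));
    }
    // Versioned "ggmf" file: an i32 format version follows the magic.
    let format_version = read_i32(reader)?;
    if format_version != 1 {
        return Err(io::Error::new(io::ErrorKind::InvalidData, "invalid version"));
    }
    Ok(())
}

fn main() -> io::Result<()> {
    // A minimal in-memory "file": the ggmf magic followed by version 1.
    let header = [0x6767_6d66_i32.to_le_bytes(), 1_i32.to_le_bytes()].concat();
    check_header(&mut header.as_slice())
}

Reading the version eagerly is why the revert matters for old files: an unversioned "ggml" file carries hyperparameters where a versioned file carries its format_version, so the two layouts cannot be consumed by the same fixed sequence of reads.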
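
The change that survives the series (PATCH 2/4, documented by PATCH 4/4) is the evaluation-buffer heuristic: guess a fixed maximum on the first run, measure mem_per_token, then size later buffers from that measurement with 10% headroom for ggml object overhead. A standalone sketch, with eval_buf_size as a hypothetical helper in place of the session fields used in lib.rs:

// Hypothetical standalone helper; in lib.rs the inputs come from
// session.mem_per_token and the number of tokens being evaluated.
fn eval_buf_size(mem_per_token: usize, n_tokens: usize) -> usize {
    // First run: nothing measured yet (mem_per_token == 0), so fall back
    // to the fixed guess, raised from 512 MiB to 1 GiB so 65B models fit.
    let mut buf_size = 1024 * 1024 * 1024;
    if mem_per_token > 0 && mem_per_token * n_tokens > buf_size {
        // Add 10% to account for ggml object overhead.
        buf_size = (1.1f64 * mem_per_token as f64 * n_tokens as f64) as usize;
    }
    buf_size
}

fn main() {
    // Before any evaluation, the fixed 1 GiB guess is used.
    assert_eq!(eval_buf_size(0, 8), 1024 * 1024 * 1024);
    // Once ~200 MiB/token has been measured, 8 tokens exceed the guess,
    // so the buffer grows to 1.1 * 200 MiB * 8 ≈ 1.72 GiB.
    let per_token = 200 * 1024 * 1024;
    assert_eq!(
        eval_buf_size(per_token, 8),
        (1.1f64 * per_token as f64 * 8.0) as usize
    );
}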