From 2322ec223a21625dfe9bd73ee677444a98a24ac9 Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Tue, 20 Jun 2023 05:42:40 -0700 Subject: [PATCH 01/15] Fix typo (#1949) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2d05de333cb23..8136e706433ca 100644 --- a/README.md +++ b/README.md @@ -378,7 +378,7 @@ Building the program with BLAS support may lead to some performance improvements ```sh git clone https://github.com/CNugteren/CLBlast.git mkdir CLBlast/build - cd CLBLast/build + cd CLBlast/build cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF cmake --build . --config Release cmake --install . --prefix /some/path From 049aa16b8c5c6d086246e4e6b9feb18de4fbd663 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 20 Jun 2023 19:05:54 +0300 Subject: [PATCH 02/15] readme : add link to p1 --- README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/README.md b/README.md index 8136e706433ca..67012adabe0ea 100644 --- a/README.md +++ b/README.md @@ -9,12 +9,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** +- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1 - Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729 -- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642 -- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684 -- Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607 -- Training LLaMA models from scratch: https://github.com/ggerganov/llama.cpp/pull/1652 -- CPU threading improvements: https://github.com/ggerganov/llama.cpp/pull/1632
Table of Contents From fb98254f99d769fcbbf20966ef386abdb48ef601 Mon Sep 17 00:00:00 2001 From: Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com> Date: Thu, 22 Jun 2023 03:18:43 +0530 Subject: [PATCH 03/15] Fix typo in README.md (#1961) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 67012adabe0ea..ace588606ee8c 100644 --- a/README.md +++ b/README.md @@ -340,7 +340,7 @@ Building the program with BLAS support may lead to some performance improvements | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value 2 1 can improve performance for slow GPUs. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - #### CLBlast From bbca06e26949686d61a5126332680ba3cccf235c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 21 Jun 2023 23:49:25 +0200 Subject: [PATCH 04/15] cmake: revert CUDA arch default to 52, 61 if f16 (#1959) --- CMakeLists.txt | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2846d9b944499..cc7560a7ae54e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -250,6 +250,15 @@ if (LLAMA_CUBLAS) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() + if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + if (LLAMA_CUDA_DMMV_F16) + set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics + else() + set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard + endif() + endif() + message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") + else() message(WARNING "cuBLAS not found") endif() @@ -493,22 +502,6 @@ if (BUILD_SHARED_LIBS) endif() endif() -if (GGML_SOURCES_CUDA) - message(STATUS "GGML CUDA sources found, configuring CUDA architecture") - set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "native") - set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - - set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES "native") - set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - - if (BUILD_SHARED_LIBS) - set_property(TARGET ggml_shared PROPERTY CUDA_ARCHITECTURES "native") - set_property(TARGET ggml_shared PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - endif() - - set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES "native") -endif() - # # programs, examples and tests From 7487137227eb32ed9b12156338b865cb29b2dfd1 Mon Sep 17 00:00:00 2001 From: Erik Scholz Date: Thu, 22 Jun 2023 14:20:47 +0200 Subject: [PATCH 05/15] rework 
convert.py to read hyper-parameters from config.json (#1958) * Read hyper-parameters from HuggingFace-transformer config.json, if they exist, and fall back to guessing, like before otherwise. This allows converting open_llama 3B and other non-standard model designs. --- convert.py | 91 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 22 deletions(-) diff --git a/convert.py b/convert.py index 265c41fa04b18..de6c39c67672b 100644 --- a/convert.py +++ b/convert.py @@ -130,6 +130,14 @@ def make_tensors_list() -> List[str]: TENSORS_SET = set(TENSORS_LIST) +def find_n_mult(n_ff: int, n_embd: int) -> int: + # hardcoded magic range + for n_mult in range(256, 1, -1): + calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult + if calc_ff == n_ff: + return n_mult + return 1 + @dataclass class Params: n_vocab: int @@ -137,21 +145,61 @@ class Params: n_mult: int n_head: int n_layer: int - file_type: GGMLFileType @staticmethod - def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params': - n_vocab, n_embd = model["tok_embeddings.weight"].shape + def guessed(model: 'LazyModel') -> 'Params': + # try transformer naming first + n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape + + # try transformer naming first + if "model.layers.0.self_attn.q_proj.weight" in model: + n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) + else: + n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) + + n_head=n_embd // 128 # guessed return Params( n_vocab=n_vocab, n_embd=n_embd, n_mult=256, - n_head=n_embd // 128, - n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model), - file_type=file_type, + n_head=n_head, + n_layer=n_layer, ) + @staticmethod + def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params': + config = json.load(open(config_path)) + + n_vocab = config["vocab_size"]; + n_embd = config["hidden_size"]; + n_head = config["num_attention_heads"]; + n_layer = config["num_hidden_layers"]; + n_ff = config["intermediate_size"]; + + n_mult = find_n_mult(n_ff, n_embd); + + return Params( + n_vocab=n_vocab, + n_embd=n_embd, + n_mult=n_mult, + n_head=n_head, + n_layer=n_layer, + ) + + @staticmethod + def load(model_plus: 'ModelPlus') -> 'Params': + orig_config_path = model_plus.paths[0].parent / "params.json" + hf_transformer_config_path = model_plus.paths[0].parent / "config.json" + + if hf_transformer_config_path.exists(): + params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path) + else: + params = Params.guessed(model_plus.model) + + print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}') + return params + class SentencePieceVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None: @@ -595,18 +643,17 @@ def load() -> Tensor: return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description) -def convert_transformers_to_orig(model: LazyModel) -> LazyModel: +def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel: out: LazyModel = {} out["tok_embeddings.weight"] = model["model.embed_tokens.weight"] out["norm.weight"] = model["model.norm.weight"] out["output.weight"] = model["lm_head.weight"] - n_head = 
model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128 for i in itertools.count(): if f"model.layers.{i}.self_attn.q_proj.weight" not in model: break - out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head) - out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head) + out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head) + out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head) out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"] @@ -920,7 +967,7 @@ class OutputFile: def __init__(self, fname_out: Path) -> None: self.fout = open(fname_out, "wb") - def write_file_header(self, params: Params) -> None: + def write_file_header(self, params: Params, file_type: GGMLFileType) -> None: self.fout.write(b"ggjt"[::-1]) # magic values = [ 1, # file version @@ -930,7 +977,7 @@ def write_file_header(self, params: Params) -> None: params.n_head, params.n_layer, params.n_embd // params.n_head, # rot (obsolete) - params.file_type.value, + file_type.value, ] self.fout.write(struct.pack("i" * len(values), *values)) @@ -958,10 +1005,10 @@ def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: of.fout.close() @staticmethod - def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None: + def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None: check_vocab_size(params, vocab) of = OutputFile(fname_out) - of.write_file_header(params) + of.write_file_header(params, file_type) print("Writing vocab...") of.write_vocab(vocab) @@ -997,11 +1044,11 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi raise Exception(f"Unexpected combination of types: {name_to_type}") -def do_necessary_conversions(model: LazyModel) -> LazyModel: +def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel: model = handle_quantization(model) if "lm_head.weight" in model: - model = convert_transformers_to_orig(model) + model = convert_transformers_to_orig(model, params) model = filter_and_sort_tensors(model) return model @@ -1107,14 +1154,14 @@ def load_vocab(path: Path) -> SentencePieceVocab: return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) -def default_outfile(model_paths: List[Path], params: Params) -> Path: +def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path: namestr = { GGMLFileType.AllF32: "f32", GGMLFileType.MostlyF16: "f16", GGMLFileType.MostlyQ4_0: "q4_0", GGMLFileType.MostlyQ4_1: "q4_1", GGMLFileType.PerLayerIsQ4_1: "q4_1", - }[params.file_type] + }[file_type] ret = model_paths[0].parent / f"ggml-model-{namestr}.bin" if ret in model_paths: sys.stderr.write( @@ -1164,13 +1211,13 @@ def main(args_in: Optional[List[str]] = None) -> None: else: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent vocab = load_vocab(vocab_dir) + params = Params.load(model_plus) model = model_plus.model - model = do_necessary_conversions(model) + model = do_necessary_conversions(model, params) output_type = pick_output_type(model, args.outtype) model = convert_to_output_type(model, output_type) - params = Params.guessed(model, output_type) - outfile 
= args.outfile or default_outfile(model_plus.paths, params) - OutputFile.write_all(outfile, params, model, vocab) + outfile = args.outfile or default_outfile(model_plus.paths, output_type) + OutputFile.write_all(outfile, params, output_type, model, vocab) print(f"Wrote {outfile}") From d7b7484f74d486f77feb4c0b7af7e1718ed91651 Mon Sep 17 00:00:00 2001 From: eiery <19350831+eiery@users.noreply.github.com> Date: Fri, 23 Jun 2023 04:38:01 -0400 Subject: [PATCH 06/15] Add OpenLLaMA instructions to the README (#1954) * add openllama to readme --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index ace588606ee8c..b09498be64cd0 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
  • Quantization
  • Interactive mode
  • Instruction mode with Alpaca
+ • Using OpenLLaMA
  • Using GPT4All
  • Using Pygmalion 7B & Metharme 7B
  • Obtaining the Facebook LLaMA original model and Stanford Alpaca model data
  • @@ -543,6 +544,13 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. > ``` +### Using [OpenLLaMA](https://github.com/openlm-research/open_llama) + +OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights. + +- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face. +- Convert the model to ggml FP16 format using `python convert.py ` + ### Using [GPT4All](https://github.com/nomic-ai/gpt4all) - Obtain the `tokenizer.model` file from LLaMA model and put it to `models` From df9135e3a9a6708bb62e6484d239e2b4ea212ed7 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 23 Jun 2023 18:41:23 +0800 Subject: [PATCH 07/15] fixing memory bugs --- gpttype_adapter.cpp | 8 ++++++-- koboldcpp.py | 2 +- llama.cpp | 4 ++-- model_adapter.cpp | 2 +- otherarch/llama_v2.cpp | 4 ++-- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index d0ddaf99bf6c5..4e087bd65afd7 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -308,8 +308,12 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in params.memory_f16 = inputs.f16_kv; params.n_ctx = inputs.max_context_length; - neox_ctx_v2.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx - = neox_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = mpt_ctx_v3.hparams.n_ctx = params.n_ctx; + neox_ctx_v2.hparams.n_ctx = neox_ctx_v3.hparams.n_ctx + = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx + = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx + = mpt_ctx_v3.hparams.n_ctx = params.n_ctx; + + bool calc_mem_with_scratch = ggml_cpu_has_gpublas(); printf("System Info: %s\n", llama_print_system_info()); SetQuantsUnshuffled(false); diff --git a/koboldcpp.py b/koboldcpp.py index aa54262623c05..76e94b84ac63b 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -225,7 +225,7 @@ def utfprint(str): maxhordelen = 256 modelbusy = False defaultport = 5001 -KcppVersion = "1.32" +KcppVersion = "1.32.1" showdebug = True class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): diff --git a/llama.cpp b/llama.cpp index 27d3d4a0a9a8d..aa67038e02db8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -80,7 +80,7 @@ static const std::map & MEM_REQ_SCRATCH0() { MODEL_3B, 256ull * MB }, { MODEL_7B, 512ull * MB }, { MODEL_13B, 512ull * MB }, - { MODEL_30B, 512ull * MB }, + { MODEL_30B, 640ull * MB }, { MODEL_65B, 1024ull * MB }, }; return k_sizes; @@ -92,7 +92,7 @@ static const std::map & MEM_REQ_SCRATCH1() { MODEL_3B, 256ull * MB }, { MODEL_7B, 512ull * MB }, { MODEL_13B, 512ull * MB }, - { MODEL_30B, 512ull * MB }, + { MODEL_30B, 640ull * MB }, { MODEL_65B, 1024ull * MB }, }; return k_sizes; diff --git a/model_adapter.cpp b/model_adapter.cpp index 547a8a1ef83ab..da9fa193edc4b 100644 --- a/model_adapter.cpp +++ b/model_adapter.cpp @@ -98,7 +98,7 @@ void print_tok_vec(std::vector &embd) //we need to read more to determine int32_t vocabsiz = 0; fin.read((char *) &vocabsiz, sizeof(int32_t)); - if(vocabsiz==4096) //actually the d_model for mpt + if(vocabsiz==4096 || vocabsiz==7168) //actually the d_model for 
mpt { fileformat = FileFormat::MPT_1; } diff --git a/otherarch/llama_v2.cpp b/otherarch/llama_v2.cpp index 167f3e9c39291..2f8e168ca299b 100644 --- a/otherarch/llama_v2.cpp +++ b/otherarch/llama_v2.cpp @@ -59,7 +59,7 @@ static const std::map & MEM_REQ_SCRATCH0_2() { MODEL_UNKNOWN_2, 512ull * MB_2 }, { MODEL_7B_2, 512ull * MB_2 }, { MODEL_13B_2, 512ull * MB_2 }, - { MODEL_30B_2, 512ull * MB_2 }, + { MODEL_30B_2, 640ull * MB_2 }, { MODEL_65B_2, 1024ull * MB_2 }, }; return k_sizes; @@ -71,7 +71,7 @@ static const std::map & MEM_REQ_SCRATCH1_2() { MODEL_UNKNOWN_2, 512ull * MB_2 }, { MODEL_7B_2, 512ull * MB_2 }, { MODEL_13B_2, 512ull * MB_2 }, - { MODEL_30B_2, 512ull * MB_2 }, + { MODEL_30B_2, 640ull * MB_2 }, { MODEL_65B_2, 1024ull * MB_2 }, }; return k_sizes; From d5e4cf7ffea99e66d2cf6c38826c2fdbc1d68c8a Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 23 Jun 2023 19:01:15 +0800 Subject: [PATCH 08/15] handle ctx manip --- otherarch/gpt2_v3.cpp | 11 +++++++---- otherarch/gptj_v3.cpp | 11 +++++++---- otherarch/mpt_v3.cpp | 1 + otherarch/neox_v3.cpp | 11 +++++++---- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp index ba2222f9990f4..4be0a08b00e05 100644 --- a/otherarch/gpt2_v3.cpp +++ b/otherarch/gpt2_v3.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include "model_adapter.h" @@ -39,6 +40,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g } } + int32_t origmaxctx = model.hparams.n_ctx; + // load hparams { auto & hparams = model.hparams; @@ -53,7 +56,7 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx); printf("%s: n_embd = %d\n", __func__, hparams.n_embd); printf("%s: n_head = %d\n", __func__, hparams.n_head); printf("%s: n_layer = %d\n", __func__, hparams.n_layer); @@ -154,8 +157,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v + ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k + ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v ctx_size += (6 + 12*n_layer)*1024; // object overhead @@ -256,7 +259,7 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; - const int n_mem = n_layer*n_ctx; + const int n_mem = n_layer*std::max(origmaxctx,n_ctx); const int n_elements = n_embd*n_mem; model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp index 0f0f8210516b7..2931ece5fbb9c 100644 --- a/otherarch/gptj_v3.cpp +++ b/otherarch/gptj_v3.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include "model_adapter.h" @@ -39,6 +40,8 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g } } + int32_t origmaxctx = model.hparams.n_ctx; + // 
load hparams { auto & hparams = model.hparams; @@ -54,7 +57,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx); printf("%s: n_embd = %d\n", __func__, hparams.n_embd); printf("%s: n_head = %d\n", __func__, hparams.n_head); printf("%s: n_layer = %d\n", __func__, hparams.n_layer); @@ -138,8 +141,8 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v + ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k + ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v ctx_size += (5 + 10*n_layer)*512; // object overhead @@ -232,7 +235,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; - const int n_mem = n_layer*n_ctx; + const int n_mem = n_layer*std::max(origmaxctx,n_ctx); const int n_elements = n_embd*n_mem; model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements); diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index f7ab03ec0d958..46ac0bd8b381f 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include "model_adapter.h" diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp index 3084bbda7630c..4f79171bd4cd0 100644 --- a/otherarch/neox_v3.cpp +++ b/otherarch/neox_v3.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #if defined(GGML_USE_CLBLAST) #include "ggml-opencl.h" @@ -37,6 +38,8 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & } } + int32_t origmaxctx = model.hparams.n_ctx; + // load hparams { auto & hparams = model.hparams; @@ -53,7 +56,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx); printf("%s: n_embd = %d\n", __func__, hparams.n_embd); printf("%s: n_head = %d\n", __func__, hparams.n_head); printf("%s: n_layer = %d\n", __func__, hparams.n_layer); @@ -133,8 +136,8 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v + ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k + ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v ctx_size += (6 + 16*n_layer)*1024; // object overhead @@ -232,7 +235,7 @@ ModelLoadResult gpt_neox_model_load(const std::string 
& fname, gpt_neox_model & const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; - const int64_t n_mem = n_layer*n_ctx; + const int64_t n_mem = n_layer*std::max(origmaxctx,n_ctx); const int64_t n_elements = n_embd*n_mem; model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); From 43c2891afabea24b9a8c2de845d12463f844b949 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 23 Jun 2023 19:01:36 +0800 Subject: [PATCH 09/15] option to not use scratch --- gpttype_adapter.cpp | 12 +++++------ otherarch/gpt2_v3.cpp | 26 +++++++++++++++++++----- otherarch/gptj_v3.cpp | 24 +++++++++++++++++----- otherarch/mpt_v3.cpp | 46 ++++++++++++++++++++++++++++++++----------- otherarch/neox_v3.cpp | 26 +++++++++++++++++++----- 5 files changed, 102 insertions(+), 32 deletions(-) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 4e087bd65afd7..20093f2058836 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -549,7 +549,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in return res; } // determine the required inference memory per token: - gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format); + gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, calc_mem_with_scratch); return ModelLoadResult::SUCCESS; } else @@ -616,14 +616,14 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } // determine the required inference memory per token: - gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, calc_mem_with_scratch); //if the logits are NAN or duplicated, it means the model is incompatible std::vector oldlogits(logits); //this is another hack because they change the library - we run the eval through the model //twice and compare logits. 
if they give the same logits for different inputs, model is broken - gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token); + gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token, calc_mem_with_scratch); if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits))) { @@ -688,7 +688,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } // determine the required inference memory per token: - gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, calc_mem_with_scratch); return ModelLoadResult::SUCCESS; } @@ -745,7 +745,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } // determine the required inference memory per token: - mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token); + mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token, calc_mem_with_scratch); return ModelLoadResult::SUCCESS; } else @@ -1078,7 +1078,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } else if(file_format==FileFormat::GPT2_4) { - evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, file_format); + evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token); } else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5) { diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp index 4be0a08b00e05..f8b82fdd47851 100644 --- a/otherarch/gpt2_v3.cpp +++ b/otherarch/gpt2_v3.cpp @@ -389,7 +389,7 @@ bool gpt2_eval( const std::vector & embd_inp, std::vector & embd_w, size_t & mem_per_token, - FileFormat file_format) { + bool use_scratch=true) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -406,13 +406,21 @@ bool gpt2_eval( // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way static size_t scr0_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr0 = malloc(scr0_size); + static void * scr0; static size_t scr1_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr1 = malloc(scr1_size); + static void * scr1; - if (mem_per_token > 0 && mem_per_token*N*1.05 > buf_size) { - const size_t buf_size_new = 64u*1024*1024 + 1.15*(mem_per_token*N); // add 10% to account for ggml object overhead + if(use_scratch) + { + scr0 = malloc(scr0_size); + scr1 = malloc(scr1_size); + } + + size_t scratch_needed_mem = mem_per_token*N; + + if (mem_per_token > 0 && scratch_needed_mem*1.1 > buf_size) { + const size_t buf_size_new = 64u*1024*1024 + 1.2*(scratch_needed_mem); // add 10% to account for ggml object overhead //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); // reallocate @@ -455,7 +463,9 @@ bool gpt2_eval( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * cur; + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + } // norm { @@ -603,7 +613,9 @@ bool gpt2_eval( struct ggml_tensor * inpFF = cur; + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + } // feed-forward network { @@ -661,7 +673,9 @@ bool gpt2_eval( inpL = ggml_add(ctx0, cur, inpFF); } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + } // norm { 
@@ -677,7 +691,9 @@ bool gpt2_eval( ggml_repeat(ctx0, model.ln_f_b, inpL)); } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + } // inpL = WTE * inpL // [ 768, 50257] - model.lm_head diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp index 2931ece5fbb9c..8df2025f04466 100644 --- a/otherarch/gptj_v3.cpp +++ b/otherarch/gptj_v3.cpp @@ -382,7 +382,8 @@ bool gptj_eval( const int n_past, const std::vector & embd_inp, std::vector & embd_w, - size_t & mem_per_token) { + size_t & mem_per_token, + bool use_scratch=true) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -400,13 +401,18 @@ bool gptj_eval( // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way static size_t scr0_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr0 = malloc(scr0_size); + static void * scr0; static size_t scr1_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr1 = malloc(scr1_size); + static void * scr1; + if(use_scratch) + { + scr0 = malloc(scr0_size); + scr1 = malloc(scr1_size); + } - if (mem_per_token > 0 && mem_per_token*N*1.05 > buf_size) { - const size_t buf_size_new = 64u*1024*1024 + 1.15*(mem_per_token*N); // add 10% to account for ggml object overhead + if (mem_per_token > 0 && 32u*1024*1024 + mem_per_token*N*1.2 > buf_size) { + const size_t buf_size_new = 64u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); // reallocate @@ -441,7 +447,9 @@ bool gptj_eval( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * cur; + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + } // norm { @@ -530,7 +538,9 @@ bool gptj_eval( cur); } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + } struct ggml_tensor * inpFF = cur; @@ -567,7 +577,9 @@ bool gptj_eval( inpL = ggml_add(ctx0, cur, inpL); } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + } // norm { @@ -581,7 +593,9 @@ bool gptj_eval( ggml_repeat(ctx0, model.ln_f_b, inpL)); } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + } // lm_head { diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index 46ac0bd8b381f..ac4f321a2595b 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -316,7 +316,8 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo // - embd_w: the predicted logits for the next token // bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, - const std::vector & embd_inp, std::vector & embd_w, bool logits_all, size_t & mem_per_token) { + const std::vector & embd_inp, std::vector & embd_w, + bool logits_all, size_t & mem_per_token, bool use_scratch=true) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -332,22 +333,37 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = (n_ctx>2048?1024u:512u)*1024*1024; - static void * scr0 = malloc(scr0_size); + static size_t scr0_size = (n_ctx>2048?1024u:512u)*1024*1024; static size_t scr1_size = (n_ctx>2048?1024u:512u)*1024*1024; - static void * scr1 = malloc(scr1_size); - if (mem_per_token > 0 && mem_per_token * N > buf_size) { - const size_t buf_size_new = 1.1 * (mem_per_token * N); // add 10% to account for ggml object overhead + if(n_embd>=7168) //MPT 30B needs more scratch memory + 
{ + scr0_size *= 2; + scr1_size *= 2; + } + + static void * scr0; + static void * scr1; + if(use_scratch) + { + scr0 = malloc(scr0_size); + scr1 = malloc(scr1_size); + } + + if (mem_per_token > 0 && mem_per_token * N *1.1 > buf_size) { + const size_t buf_size_new = 64u*1024*1024 + 1.2 * (mem_per_token * N); // add 10% to account for ggml object overhead // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, // buf_size, buf_size_new); // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; + if (buf_size_new > buf_size) + { + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } } } @@ -369,7 +385,9 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, struct ggml_tensor * cur; + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + } // a = self.ln_1(x) { @@ -465,7 +483,9 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, inpL = ggml_add(ctx0, inpL, cur); + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + } // m = self.ln_2(x) { @@ -491,7 +511,9 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, inpL = ggml_add(ctx0, inpL, cur); } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + } // norm { @@ -500,7 +522,9 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL); } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + } // output embedding weight tied to input embedding inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL); diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp index 4f79171bd4cd0..40e1d1e18ac0b 100644 --- a/otherarch/neox_v3.cpp +++ b/otherarch/neox_v3.cpp @@ -400,7 +400,8 @@ bool gpt_neox_eval( const int n_past, const std::vector & embd_inp, std::vector & embd_w, - size_t & mem_per_token) { + size_t & mem_per_token, + bool use_scratch=true) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -418,13 +419,20 @@ bool gpt_neox_eval( // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way static size_t scr0_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr0 = malloc(scr0_size); + static void * scr0; static size_t scr1_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr1 = malloc(scr1_size); + static void * scr1; + if(use_scratch) + { + scr0 = malloc(scr0_size); + scr1 = malloc(scr1_size); + } - if (mem_per_token > 0 && mem_per_token*N*1.05 > buf_size) { - const size_t buf_size_new = 64u*1024*1024 + 1.15*(mem_per_token*N); // add 10% to account for ggml object overhead + size_t scratch_needed_mem = mem_per_token*N; + + if (mem_per_token > 0 && scratch_needed_mem*1.1 > buf_size) { + const size_t buf_size_new = 64u*1024*1024 + 1.2*(scratch_needed_mem); // add 10% to account for ggml object overhead //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); // reallocate @@ -459,7 +467,9 @@ bool gpt_neox_eval( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * cur; + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + } // self-attention { @@ -564,7 +574,9 @@ bool gpt_neox_eval( } } + if(use_scratch){ ggml_set_scratch(ctx0, 
{ 0, scr1_size, scr1, }); + } if (hparams.par_res == 0) { struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); @@ -588,7 +600,9 @@ bool gpt_neox_eval( } } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + } // norm { @@ -602,7 +616,9 @@ bool gpt_neox_eval( ggml_repeat(ctx0, model.ln_f_b, inpL)); } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + } // lm_head { From f39a7460890de883b0d68d45d75d1780984ca76e Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 23 Jun 2023 22:45:22 +0800 Subject: [PATCH 10/15] bug fixes for openblas --- gpttype_adapter.cpp | 21 +++++++++++---------- otherarch/gpt2_v3.cpp | 22 +++++++--------------- otherarch/gptj_v3.cpp | 19 +++++++------------ otherarch/mpt_v3.cpp | 27 ++++++++------------------- otherarch/neox_v3.cpp | 21 +++++++-------------- 5 files changed, 40 insertions(+), 70 deletions(-) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 20093f2058836..8c716c84a7de6 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -313,7 +313,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx = mpt_ctx_v3.hparams.n_ctx = params.n_ctx; - bool calc_mem_with_scratch = ggml_cpu_has_gpublas(); + bool use_scratch = ggml_cpu_has_gpublas(); printf("System Info: %s\n", llama_print_system_info()); SetQuantsUnshuffled(false); @@ -549,7 +549,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in return res; } // determine the required inference memory per token: - gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, calc_mem_with_scratch); + gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch); return ModelLoadResult::SUCCESS; } else @@ -616,14 +616,14 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } // determine the required inference memory per token: - gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, calc_mem_with_scratch); + gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch); //if the logits are NAN or duplicated, it means the model is incompatible std::vector oldlogits(logits); //this is another hack because they change the library - we run the eval through the model //twice and compare logits. 
if they give the same logits for different inputs, model is broken - gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token, calc_mem_with_scratch); + gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token, use_scratch); if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits))) { @@ -688,7 +688,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } // determine the required inference memory per token: - gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, calc_mem_with_scratch); + gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch); return ModelLoadResult::SUCCESS; } @@ -745,7 +745,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } // determine the required inference memory per token: - mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token, calc_mem_with_scratch); + mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token, use_scratch); return ModelLoadResult::SUCCESS; } else @@ -904,6 +904,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o concat_output = ""; bool startedsampling = false; + bool use_scratch = true; timer_start(); double time1 = 0, time2 = 0; @@ -1078,7 +1079,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } else if(file_format==FileFormat::GPT2_4) { - evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token); + evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch); } else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5) { @@ -1086,7 +1087,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } else if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7) { - evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token); + evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch); } else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2) { @@ -1098,11 +1099,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } else if(file_format==FileFormat::GPTJ_5) { - evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token); + evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch); } else if(file_format==FileFormat::MPT_1) { - evalres = mpt_eval(mpt_ctx_v3, params.n_threads, n_past, embd, logits, false, mem_per_token); + evalres = mpt_eval(mpt_ctx_v3, params.n_threads, n_past, embd, logits, false, mem_per_token, use_scratch); } else { diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp index f8b82fdd47851..b716fe212d67e 100644 --- a/otherarch/gpt2_v3.cpp +++ b/otherarch/gpt2_v3.cpp @@ -389,7 +389,7 @@ bool gpt2_eval( const std::vector & embd_inp, std::vector & embd_w, size_t & mem_per_token, - bool use_scratch=true) { + bool use_scratch) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -405,22 +405,14 @@ bool gpt2_eval( // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = 
(n_ctx>1024?512u:256u)*1024*1024; - static void * scr0; + static size_t scr0_size = (n_embd>2400?512u:256u)*1024*1024; + static size_t scr1_size = (n_embd>2400?512u:256u)*1024*1024; - static size_t scr1_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr1; + static void * scr0 = malloc(scr0_size); + static void * scr1 = malloc(scr1_size); - if(use_scratch) - { - scr0 = malloc(scr0_size); - scr1 = malloc(scr1_size); - } - - size_t scratch_needed_mem = mem_per_token*N; - - if (mem_per_token > 0 && scratch_needed_mem*1.1 > buf_size) { - const size_t buf_size_new = 64u*1024*1024 + 1.2*(scratch_needed_mem); // add 10% to account for ggml object overhead + if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) { + const size_t buf_size_new = 320u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); // reallocate diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp index 8df2025f04466..031a2c051c5f2 100644 --- a/otherarch/gptj_v3.cpp +++ b/otherarch/gptj_v3.cpp @@ -383,7 +383,7 @@ bool gptj_eval( const std::vector & embd_inp, std::vector & embd_w, size_t & mem_per_token, - bool use_scratch=true) { + bool use_scratch) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -400,19 +400,14 @@ bool gptj_eval( // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr0; + static size_t scr0_size = 512u*1024*1024; + static size_t scr1_size = 512u*1024*1024; - static size_t scr1_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr1; - if(use_scratch) - { - scr0 = malloc(scr0_size); - scr1 = malloc(scr1_size); - } + static void * scr0 = malloc(scr0_size); + static void * scr1 = malloc(scr1_size); - if (mem_per_token > 0 && 32u*1024*1024 + mem_per_token*N*1.2 > buf_size) { - const size_t buf_size_new = 64u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead + if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) { + const size_t buf_size_new = 320u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); // reallocate diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index ac4f321a2595b..5d66f91f5f4cd 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -317,7 +317,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo // bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, const std::vector & embd_inp, std::vector & embd_w, - bool logits_all, size_t & mem_per_token, bool use_scratch=true) { + bool logits_all, size_t & mem_per_token, bool use_scratch) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -333,26 +333,15 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way + //MPT 30B needs more scratch memory + static size_t scr0_size = (n_embd>=7168?2048u:1024u)*1024*1024; + static size_t scr1_size = (n_embd>=7168?2048u:1024u)*1024*1024; - static size_t scr0_size = (n_ctx>2048?1024u:512u)*1024*1024; - static size_t scr1_size = (n_ctx>2048?1024u:512u)*1024*1024; + static void * scr0 = malloc(scr0_size); + static void * scr1 = 
malloc(scr1_size); - if(n_embd>=7168) //MPT 30B needs more scratch memory - { - scr0_size *= 2; - scr1_size *= 2; - } - - static void * scr0; - static void * scr1; - if(use_scratch) - { - scr0 = malloc(scr0_size); - scr1 = malloc(scr1_size); - } - - if (mem_per_token > 0 && mem_per_token * N *1.1 > buf_size) { - const size_t buf_size_new = 64u*1024*1024 + 1.2 * (mem_per_token * N); // add 10% to account for ggml object overhead + if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) { + const size_t buf_size_new = 320u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, // buf_size, buf_size_new); // reallocate diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp index 40e1d1e18ac0b..37f5ad9aed276 100644 --- a/otherarch/neox_v3.cpp +++ b/otherarch/neox_v3.cpp @@ -401,7 +401,7 @@ bool gpt_neox_eval( const std::vector & embd_inp, std::vector & embd_w, size_t & mem_per_token, - bool use_scratch=true) { + bool use_scratch) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -418,21 +418,14 @@ bool gpt_neox_eval( // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr0; + static size_t scr0_size = (n_embd>2400?512u:256u)*1024*1024; + static size_t scr1_size = (n_embd>2400?512u:256u)*1024*1024; - static size_t scr1_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr1; - if(use_scratch) - { - scr0 = malloc(scr0_size); - scr1 = malloc(scr1_size); - } - - size_t scratch_needed_mem = mem_per_token*N; + static void * scr0 = malloc(scr0_size); + static void * scr1 = malloc(scr1_size); - if (mem_per_token > 0 && scratch_needed_mem*1.1 > buf_size) { - const size_t buf_size_new = 64u*1024*1024 + 1.2*(scratch_needed_mem); // add 10% to account for ggml object overhead + if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) { + const size_t buf_size_new = 360u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); // reallocate From 490cf395f82d7d0582016a51054457e2d6f89769 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 23 Jun 2023 22:51:51 +0800 Subject: [PATCH 11/15] better alloc error --- otherarch/gpt2_v3.cpp | 2 +- otherarch/gptj_v3.cpp | 2 +- otherarch/mpt_v3.cpp | 2 +- otherarch/neox_v3.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp index b716fe212d67e..fb15d662be015 100644 --- a/otherarch/gpt2_v3.cpp +++ b/otherarch/gpt2_v3.cpp @@ -422,7 +422,7 @@ bool gpt2_eval( buf = realloc(buf, buf_size); if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + fprintf(stderr, "%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size); return false; } } diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp index 031a2c051c5f2..b00bd6bd291d4 100644 --- a/otherarch/gptj_v3.cpp +++ b/otherarch/gptj_v3.cpp @@ -417,7 +417,7 @@ bool gptj_eval( buf = realloc(buf, buf_size); if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + fprintf(stderr, "%s: failed to allocate %zu bytes. 
Try reducing batch size.\n", __func__, buf_size); return false; } } diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index 5d66f91f5f4cd..a60172f51611d 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -350,7 +350,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, buf_size = buf_size_new; buf = realloc(buf, buf_size); if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + fprintf(stderr, "%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size); return false; } } diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp index 37f5ad9aed276..245d383d63793 100644 --- a/otherarch/neox_v3.cpp +++ b/otherarch/neox_v3.cpp @@ -435,7 +435,7 @@ bool gpt_neox_eval( buf = realloc(buf, buf_size); if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + fprintf(stderr, "%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size); return false; } } From f7b096374dad99164c610196c1926d53d3e87831 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 23 Jun 2023 23:56:22 +0800 Subject: [PATCH 12/15] fixed string too long CI issue --- ggml-opencl.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 7de91049d3766..fed4ffb0ccd05 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -183,7 +183,9 @@ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float *v0 = vload_half(0, &x[ib + 0]); *v1 = vload_half(0, &x[ib + 1]); } +); +static std::string k_quants_source = MULTILINE_QUOTE( inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m) { if (j < 4) @@ -853,6 +855,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string& std::string generate_kernels() { std::stringstream src; src << program_source << '\n'; + src << k_quants_source << '\n'; for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) { std::string dequant_kernel = dequant_template; std::string dmmv_kernel = dequant_mul_mat_vec_template; From 0485fa65a2fc3159ea9fb2ad7661a5837038b31d Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 24 Jun 2023 11:43:42 +0800 Subject: [PATCH 13/15] wstring convert for mpt --- gpttype_adapter.cpp | 3 ++- llama.cpp | 4 ++-- otherarch/mpt_v3.cpp | 14 ++++++++++++-- otherarch/utils.cpp | 10 ++++++++-- otherarch/utils.h | 6 ++++++ 5 files changed, 30 insertions(+), 7 deletions(-) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 8c716c84a7de6..b166e2aac1922 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -313,6 +313,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx = mpt_ctx_v3.hparams.n_ctx = params.n_ctx; + //this is used for the mem_per_token eval, openblas needs more RAM bool use_scratch = ggml_cpu_has_gpublas(); printf("System Info: %s\n", llama_print_system_info()); @@ -904,7 +905,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o concat_output = ""; bool startedsampling = false; - bool use_scratch = true; + bool use_scratch = true; //for normal inference always use scratch timer_start(); double time1 = 0, time2 = 0; diff --git a/llama.cpp b/llama.cpp index aa67038e02db8..5259fd52ef9b7 100644 --- a/llama.cpp +++ 
b/llama.cpp @@ -105,7 +105,7 @@ static const std::map & MEM_REQ_KV_SELF() { MODEL_3B, 682ull * MB }, { MODEL_7B, 1026ull * MB }, { MODEL_13B, 1608ull * MB }, - { MODEL_30B, 3124ull * MB }, + { MODEL_30B, 3224ull * MB }, { MODEL_65B, 5120ull * MB }, }; return k_sizes; @@ -119,7 +119,7 @@ static const std::map & MEM_REQ_EVAL() { MODEL_3B, 512ull * MB }, { MODEL_7B, 800ull * MB }, { MODEL_13B, 1024ull * MB }, - { MODEL_30B, 1280ull * MB }, + { MODEL_30B, 1380ull * MB }, { MODEL_65B, 1536ull * MB }, }; return k_sizes; diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index a60172f51611d..b611b0703b8ba 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -86,6 +86,16 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo fin.read((char *) buf.data(), len); word.assign(buf.data(), len); + // Convert token from utf-8 + std::wstring word_multibytes = convert_to_wstring(word); + if(word_multibytes!=L"") + { + word.resize(word_multibytes.size()); + for (int w = 0; w < word_multibytes.size(); w++) { + word[w] = uint8_t(word_multibytes[w]); + } + } + vocab.token_to_id[word] = i; vocab.id_to_token[i] = word; } @@ -123,8 +133,8 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight - ctx_size += (n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16)); // memory_k - ctx_size += (n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16)); // memory_v + ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k + ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v ctx_size += (6 + 6 * n_layer) * 512; // object overhead diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp index 57c362934c811..02637069a9b98 100644 --- a/otherarch/utils.cpp +++ b/otherarch/utils.cpp @@ -122,8 +122,14 @@ std::string convert_to_utf8(const std::wstring & input) { std::wstring convert_to_wstring(const std::string & input) { - std::wstring_convert> converter; - return converter.from_bytes(input); + try { + std::wstring_convert> converter; + return converter.from_bytes(input); + } catch (const std::range_error& e) { + return L""; + } catch (...) 
{ + return L""; + } } void gpt_split_words(std::string str, std::vector& words) { diff --git a/otherarch/utils.h b/otherarch/utils.h index bb57a8242f4e9..f9857823faf34 100644 --- a/otherarch/utils.h +++ b/otherarch/utils.h @@ -34,6 +34,12 @@ void utreplace(std::string & str, const std::string & needle, const std::string // poor-man's JSON parsing std::map json_parse(const std::string & fname); +std::string convert_to_utf8(const std::wstring & input); + +std::wstring convert_to_wstring(const std::string & input); + +void gpt_split_words(std::string str, std::vector& words); + // split text into tokens // // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53 From 6da38b0d40a6476ccdd56e48143b21d4254b5da1 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 24 Jun 2023 12:30:38 +0800 Subject: [PATCH 14/15] up ver --- koboldcpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/koboldcpp.py b/koboldcpp.py index 76e94b84ac63b..cc2ba50d4dc91 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -225,7 +225,7 @@ def utfprint(str): maxhordelen = 256 modelbusy = False defaultport = 5001 -KcppVersion = "1.32.1" +KcppVersion = "1.32.2" showdebug = True class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): From 8342fe81b1c2a00aa81d44c9e1ffb7057df3b323 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 24 Jun 2023 12:58:49 +0800 Subject: [PATCH 15/15] revert the wstring tokenization. coherency was affected --- koboldcpp.py | 2 +- otherarch/mpt_v3.cpp | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index cc2ba50d4dc91..99026ab857adb 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -225,7 +225,7 @@ def utfprint(str): maxhordelen = 256 modelbusy = False defaultport = 5001 -KcppVersion = "1.32.2" +KcppVersion = "1.32.3" showdebug = True class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index b611b0703b8ba..100e635ba8645 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -87,14 +87,14 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo word.assign(buf.data(), len); // Convert token from utf-8 - std::wstring word_multibytes = convert_to_wstring(word); - if(word_multibytes!=L"") - { - word.resize(word_multibytes.size()); - for (int w = 0; w < word_multibytes.size(); w++) { - word[w] = uint8_t(word_multibytes[w]); - } - } + // std::wstring word_multibytes = convert_to_wstring(word); + // if(word_multibytes!=L"") + // { + // word.resize(word_multibytes.size()); + // for (int w = 0; w < word_multibytes.size(); w++) { + // word[w] = uint8_t(word_multibytes[w]); + // } + // } vocab.token_to_id[word] = i; vocab.id_to_token[i] = word;
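
A quick standalone check of the n_mult recovery logic introduced in convert.py by PATCH 05/15. This is an illustrative snippet, not part of the patches themselves; the function is copied verbatim from the patch, and the worked numbers assume the standard LLaMA-7B dimensions (hidden_size=4096, intermediate_size=11008):

    # find_n_mult as added to convert.py in PATCH 05/15, copied verbatim
    def find_n_mult(n_ff: int, n_embd: int) -> int:
        # hardcoded magic range
        for n_mult in range(256, 1, -1):
            calc_ff = (((8 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult
            if calc_ff == n_ff:
                return n_mult
        return 1

    # LLaMA-7B config.json: hidden_size=4096, intermediate_size=11008
    # (((8*4096)//3 + 255)//256)*256 == 11008, so the classic n_mult=256 is recovered.
    print(find_n_mult(11008, 4096))  # -> 256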