From 2322ec223a21625dfe9bd73ee677444a98a24ac9 Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Tue, 20 Jun 2023 05:42:40 -0700 Subject: [PATCH 01/15] Fix typo (#1949) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2d05de333cb23..8136e706433ca 100644 --- a/README.md +++ b/README.md @@ -378,7 +378,7 @@ Building the program with BLAS support may lead to some performance improvements ```sh git clone https://github.com/CNugteren/CLBlast.git mkdir CLBlast/build - cd CLBLast/build + cd CLBlast/build cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF cmake --build . --config Release cmake --install . --prefix /some/path From 049aa16b8c5c6d086246e4e6b9feb18de4fbd663 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 20 Jun 2023 19:05:54 +0300 Subject: [PATCH 02/15] readme : add link to p1 --- README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/README.md b/README.md index 8136e706433ca..67012adabe0ea 100644 --- a/README.md +++ b/README.md @@ -9,12 +9,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** +- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1 - Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729 -- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642 -- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684 -- Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607 -- Training LLaMA models from scratch: https://github.com/ggerganov/llama.cpp/pull/1652 -- CPU threading improvements: https://github.com/ggerganov/llama.cpp/pull/1632
Table of Contents From fb98254f99d769fcbbf20966ef386abdb48ef601 Mon Sep 17 00:00:00 2001 From: Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com> Date: Thu, 22 Jun 2023 03:18:43 +0530 Subject: [PATCH 03/15] Fix typo in README.md (#1961) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 67012adabe0ea..ace588606ee8c 100644 --- a/README.md +++ b/README.md @@ -340,7 +340,7 @@ Building the program with BLAS support may lead to some performance improvements | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value 2 1 can improve performance for slow GPUs. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - #### CLBlast From bbca06e26949686d61a5126332680ba3cccf235c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 21 Jun 2023 23:49:25 +0200 Subject: [PATCH 04/15] cmake: revert CUDA arch default to 52, 61 if f16 (#1959) --- CMakeLists.txt | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2846d9b944499..cc7560a7ae54e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -250,6 +250,15 @@ if (LLAMA_CUBLAS) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() + if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + if (LLAMA_CUDA_DMMV_F16) + set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics + else() + set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard + endif() + endif() + message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") + else() message(WARNING "cuBLAS not found") endif() @@ -493,22 +502,6 @@ if (BUILD_SHARED_LIBS) endif() endif() -if (GGML_SOURCES_CUDA) - message(STATUS "GGML CUDA sources found, configuring CUDA architecture") - set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "native") - set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - - set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES "native") - set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - - if (BUILD_SHARED_LIBS) - set_property(TARGET ggml_shared PROPERTY CUDA_ARCHITECTURES "native") - set_property(TARGET ggml_shared PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - endif() - - set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES "native") -endif() - # # programs, examples and tests From 7487137227eb32ed9b12156338b865cb29b2dfd1 Mon Sep 17 00:00:00 2001 From: Erik Scholz Date: Thu, 22 Jun 2023 14:20:47 +0200 Subject: [PATCH 05/15] rework 
convert.py to read hyper-parameters from config.json (#1958) * Read hyper-parameters from HuggingFace-transformer config.json, if they exist, and fall back to guessing, like before otherwise. This allows converting open_llama 3B and other non-standard model designs. --- convert.py | 91 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 22 deletions(-) diff --git a/convert.py b/convert.py index 265c41fa04b18..de6c39c67672b 100644 --- a/convert.py +++ b/convert.py @@ -130,6 +130,14 @@ def make_tensors_list() -> List[str]: TENSORS_SET = set(TENSORS_LIST) +def find_n_mult(n_ff: int, n_embd: int) -> int: + # hardcoded magic range + for n_mult in range(256, 1, -1): + calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult + if calc_ff == n_ff: + return n_mult + return 1 + @dataclass class Params: n_vocab: int @@ -137,21 +145,61 @@ class Params: n_mult: int n_head: int n_layer: int - file_type: GGMLFileType @staticmethod - def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params': - n_vocab, n_embd = model["tok_embeddings.weight"].shape + def guessed(model: 'LazyModel') -> 'Params': + # try transformer naming first + n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape + + # try transformer naming first + if "model.layers.0.self_attn.q_proj.weight" in model: + n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) + else: + n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) + + n_head=n_embd // 128 # guessed return Params( n_vocab=n_vocab, n_embd=n_embd, n_mult=256, - n_head=n_embd // 128, - n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model), - file_type=file_type, + n_head=n_head, + n_layer=n_layer, ) + @staticmethod + def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params': + config = json.load(open(config_path)) + + n_vocab = config["vocab_size"]; + n_embd = config["hidden_size"]; + n_head = config["num_attention_heads"]; + n_layer = config["num_hidden_layers"]; + n_ff = config["intermediate_size"]; + + n_mult = find_n_mult(n_ff, n_embd); + + return Params( + n_vocab=n_vocab, + n_embd=n_embd, + n_mult=n_mult, + n_head=n_head, + n_layer=n_layer, + ) + + @staticmethod + def load(model_plus: 'ModelPlus') -> 'Params': + orig_config_path = model_plus.paths[0].parent / "params.json" + hf_transformer_config_path = model_plus.paths[0].parent / "config.json" + + if hf_transformer_config_path.exists(): + params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path) + else: + params = Params.guessed(model_plus.model) + + print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}') + return params + class SentencePieceVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None: @@ -595,18 +643,17 @@ def load() -> Tensor: return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description) -def convert_transformers_to_orig(model: LazyModel) -> LazyModel: +def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel: out: LazyModel = {} out["tok_embeddings.weight"] = model["model.embed_tokens.weight"] out["norm.weight"] = model["model.norm.weight"] out["output.weight"] = model["lm_head.weight"] - n_head = 
model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128 for i in itertools.count(): if f"model.layers.{i}.self_attn.q_proj.weight" not in model: break - out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head) - out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head) + out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head) + out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head) out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"] @@ -920,7 +967,7 @@ class OutputFile: def __init__(self, fname_out: Path) -> None: self.fout = open(fname_out, "wb") - def write_file_header(self, params: Params) -> None: + def write_file_header(self, params: Params, file_type: GGMLFileType) -> None: self.fout.write(b"ggjt"[::-1]) # magic values = [ 1, # file version @@ -930,7 +977,7 @@ def write_file_header(self, params: Params) -> None: params.n_head, params.n_layer, params.n_embd // params.n_head, # rot (obsolete) - params.file_type.value, + file_type.value, ] self.fout.write(struct.pack("i" * len(values), *values)) @@ -958,10 +1005,10 @@ def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: of.fout.close() @staticmethod - def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None: + def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None: check_vocab_size(params, vocab) of = OutputFile(fname_out) - of.write_file_header(params) + of.write_file_header(params, file_type) print("Writing vocab...") of.write_vocab(vocab) @@ -997,11 +1044,11 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi raise Exception(f"Unexpected combination of types: {name_to_type}") -def do_necessary_conversions(model: LazyModel) -> LazyModel: +def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel: model = handle_quantization(model) if "lm_head.weight" in model: - model = convert_transformers_to_orig(model) + model = convert_transformers_to_orig(model, params) model = filter_and_sort_tensors(model) return model @@ -1107,14 +1154,14 @@ def load_vocab(path: Path) -> SentencePieceVocab: return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) -def default_outfile(model_paths: List[Path], params: Params) -> Path: +def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path: namestr = { GGMLFileType.AllF32: "f32", GGMLFileType.MostlyF16: "f16", GGMLFileType.MostlyQ4_0: "q4_0", GGMLFileType.MostlyQ4_1: "q4_1", GGMLFileType.PerLayerIsQ4_1: "q4_1", - }[params.file_type] + }[file_type] ret = model_paths[0].parent / f"ggml-model-{namestr}.bin" if ret in model_paths: sys.stderr.write( @@ -1164,13 +1211,13 @@ def main(args_in: Optional[List[str]] = None) -> None: else: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent vocab = load_vocab(vocab_dir) + params = Params.load(model_plus) model = model_plus.model - model = do_necessary_conversions(model) + model = do_necessary_conversions(model, params) output_type = pick_output_type(model, args.outtype) model = convert_to_output_type(model, output_type) - params = Params.guessed(model, output_type) - outfile 
= args.outfile or default_outfile(model_plus.paths, params) - OutputFile.write_all(outfile, params, model, vocab) + outfile = args.outfile or default_outfile(model_plus.paths, output_type) + OutputFile.write_all(outfile, params, output_type, model, vocab) print(f"Wrote {outfile}") From d7b7484f74d486f77feb4c0b7af7e1718ed91651 Mon Sep 17 00:00:00 2001 From: eiery <19350831+eiery@users.noreply.github.com> Date: Fri, 23 Jun 2023 04:38:01 -0400 Subject: [PATCH 06/15] Add OpenLLaMA instructions to the README (#1954) * add openllama to readme --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index ace588606ee8c..b09498be64cd0 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
  • Quantization
  • Interactive mode
  • Instruction mode with Alpaca
+ • Using OpenLLaMA
  • Using GPT4All
  • Using Pygmalion 7B & Metharme 7B
  • Obtaining the Facebook LLaMA original model and Stanford Alpaca model data
  • @@ -543,6 +544,13 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. > ``` +### Using [OpenLLaMA](https://github.com/openlm-research/open_llama) + +OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights. + +- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face. +- Convert the model to ggml FP16 format using `python convert.py ` + ### Using [GPT4All](https://github.com/nomic-ai/gpt4all) - Obtain the `tokenizer.model` file from LLaMA model and put it to `models` From df9135e3a9a6708bb62e6484d239e2b4ea212ed7 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 23 Jun 2023 18:41:23 +0800 Subject: [PATCH 07/15] fixing memory bugs --- gpttype_adapter.cpp | 8 ++++++-- koboldcpp.py | 2 +- llama.cpp | 4 ++-- model_adapter.cpp | 2 +- otherarch/llama_v2.cpp | 4 ++-- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index d0ddaf99bf6c5..4e087bd65afd7 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -308,8 +308,12 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in params.memory_f16 = inputs.f16_kv; params.n_ctx = inputs.max_context_length; - neox_ctx_v2.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx - = neox_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = mpt_ctx_v3.hparams.n_ctx = params.n_ctx; + neox_ctx_v2.hparams.n_ctx = neox_ctx_v3.hparams.n_ctx + = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx + = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx + = mpt_ctx_v3.hparams.n_ctx = params.n_ctx; + + bool calc_mem_with_scratch = ggml_cpu_has_gpublas(); printf("System Info: %s\n", llama_print_system_info()); SetQuantsUnshuffled(false); diff --git a/koboldcpp.py b/koboldcpp.py index aa54262623c05..76e94b84ac63b 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -225,7 +225,7 @@ def utfprint(str): maxhordelen = 256 modelbusy = False defaultport = 5001 -KcppVersion = "1.32" +KcppVersion = "1.32.1" showdebug = True class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): diff --git a/llama.cpp b/llama.cpp index 27d3d4a0a9a8d..aa67038e02db8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -80,7 +80,7 @@ static const std::map & MEM_REQ_SCRATCH0() { MODEL_3B, 256ull * MB }, { MODEL_7B, 512ull * MB }, { MODEL_13B, 512ull * MB }, - { MODEL_30B, 512ull * MB }, + { MODEL_30B, 640ull * MB }, { MODEL_65B, 1024ull * MB }, }; return k_sizes; @@ -92,7 +92,7 @@ static const std::map & MEM_REQ_SCRATCH1() { MODEL_3B, 256ull * MB }, { MODEL_7B, 512ull * MB }, { MODEL_13B, 512ull * MB }, - { MODEL_30B, 512ull * MB }, + { MODEL_30B, 640ull * MB }, { MODEL_65B, 1024ull * MB }, }; return k_sizes; diff --git a/model_adapter.cpp b/model_adapter.cpp index 547a8a1ef83ab..da9fa193edc4b 100644 --- a/model_adapter.cpp +++ b/model_adapter.cpp @@ -98,7 +98,7 @@ void print_tok_vec(std::vector &embd) //we need to read more to determine int32_t vocabsiz = 0; fin.read((char *) &vocabsiz, sizeof(int32_t)); - if(vocabsiz==4096) //actually the d_model for mpt + if(vocabsiz==4096 || vocabsiz==7168) //actually the d_model for 
mpt { fileformat = FileFormat::MPT_1; } diff --git a/otherarch/llama_v2.cpp b/otherarch/llama_v2.cpp index 167f3e9c39291..2f8e168ca299b 100644 --- a/otherarch/llama_v2.cpp +++ b/otherarch/llama_v2.cpp @@ -59,7 +59,7 @@ static const std::map & MEM_REQ_SCRATCH0_2() { MODEL_UNKNOWN_2, 512ull * MB_2 }, { MODEL_7B_2, 512ull * MB_2 }, { MODEL_13B_2, 512ull * MB_2 }, - { MODEL_30B_2, 512ull * MB_2 }, + { MODEL_30B_2, 640ull * MB_2 }, { MODEL_65B_2, 1024ull * MB_2 }, }; return k_sizes; @@ -71,7 +71,7 @@ static const std::map & MEM_REQ_SCRATCH1_2() { MODEL_UNKNOWN_2, 512ull * MB_2 }, { MODEL_7B_2, 512ull * MB_2 }, { MODEL_13B_2, 512ull * MB_2 }, - { MODEL_30B_2, 512ull * MB_2 }, + { MODEL_30B_2, 640ull * MB_2 }, { MODEL_65B_2, 1024ull * MB_2 }, }; return k_sizes; From d5e4cf7ffea99e66d2cf6c38826c2fdbc1d68c8a Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 23 Jun 2023 19:01:15 +0800 Subject: [PATCH 08/15] handle ctx manip --- otherarch/gpt2_v3.cpp | 11 +++++++---- otherarch/gptj_v3.cpp | 11 +++++++---- otherarch/mpt_v3.cpp | 1 + otherarch/neox_v3.cpp | 11 +++++++---- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp index ba2222f9990f4..4be0a08b00e05 100644 --- a/otherarch/gpt2_v3.cpp +++ b/otherarch/gpt2_v3.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include "model_adapter.h" @@ -39,6 +40,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g } } + int32_t origmaxctx = model.hparams.n_ctx; + // load hparams { auto & hparams = model.hparams; @@ -53,7 +56,7 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx); printf("%s: n_embd = %d\n", __func__, hparams.n_embd); printf("%s: n_head = %d\n", __func__, hparams.n_head); printf("%s: n_layer = %d\n", __func__, hparams.n_layer); @@ -154,8 +157,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v + ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k + ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v ctx_size += (6 + 12*n_layer)*1024; // object overhead @@ -256,7 +259,7 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; - const int n_mem = n_layer*n_ctx; + const int n_mem = n_layer*std::max(origmaxctx,n_ctx); const int n_elements = n_embd*n_mem; model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp index 0f0f8210516b7..2931ece5fbb9c 100644 --- a/otherarch/gptj_v3.cpp +++ b/otherarch/gptj_v3.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include "model_adapter.h" @@ -39,6 +40,8 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g } } + int32_t origmaxctx = model.hparams.n_ctx; + // 
load hparams { auto & hparams = model.hparams; @@ -54,7 +57,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx); printf("%s: n_embd = %d\n", __func__, hparams.n_embd); printf("%s: n_head = %d\n", __func__, hparams.n_head); printf("%s: n_layer = %d\n", __func__, hparams.n_layer); @@ -138,8 +141,8 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v + ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k + ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v ctx_size += (5 + 10*n_layer)*512; // object overhead @@ -232,7 +235,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; - const int n_mem = n_layer*n_ctx; + const int n_mem = n_layer*std::max(origmaxctx,n_ctx); const int n_elements = n_embd*n_mem; model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements); diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index f7ab03ec0d958..46ac0bd8b381f 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include "model_adapter.h" diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp index 3084bbda7630c..4f79171bd4cd0 100644 --- a/otherarch/neox_v3.cpp +++ b/otherarch/neox_v3.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #if defined(GGML_USE_CLBLAST) #include "ggml-opencl.h" @@ -37,6 +38,8 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & } } + int32_t origmaxctx = model.hparams.n_ctx; + // load hparams { auto & hparams = model.hparams; @@ -53,7 +56,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx); printf("%s: n_embd = %d\n", __func__, hparams.n_embd); printf("%s: n_head = %d\n", __func__, hparams.n_head); printf("%s: n_layer = %d\n", __func__, hparams.n_layer); @@ -133,8 +136,8 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v + ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k + ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v ctx_size += (6 + 16*n_layer)*1024; // object overhead @@ -232,7 +235,7 @@ ModelLoadResult gpt_neox_model_load(const std::string 
& fname, gpt_neox_model & const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; - const int64_t n_mem = n_layer*n_ctx; + const int64_t n_mem = n_layer*std::max(origmaxctx,n_ctx); const int64_t n_elements = n_embd*n_mem; model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); From 43c2891afabea24b9a8c2de845d12463f844b949 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 23 Jun 2023 19:01:36 +0800 Subject: [PATCH 09/15] option to not use scratch --- gpttype_adapter.cpp | 12 +++++------ otherarch/gpt2_v3.cpp | 26 +++++++++++++++++++----- otherarch/gptj_v3.cpp | 24 +++++++++++++++++----- otherarch/mpt_v3.cpp | 46 ++++++++++++++++++++++++++++++++----------- otherarch/neox_v3.cpp | 26 +++++++++++++++++++----- 5 files changed, 102 insertions(+), 32 deletions(-) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 4e087bd65afd7..20093f2058836 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -549,7 +549,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in return res; } // determine the required inference memory per token: - gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format); + gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, calc_mem_with_scratch); return ModelLoadResult::SUCCESS; } else @@ -616,14 +616,14 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } // determine the required inference memory per token: - gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, calc_mem_with_scratch); //if the logits are NAN or duplicated, it means the model is incompatible std::vector oldlogits(logits); //this is another hack because they change the library - we run the eval through the model //twice and compare logits. 
if they give the same logits for different inputs, model is broken - gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token); + gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token, calc_mem_with_scratch); if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits))) { @@ -688,7 +688,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } // determine the required inference memory per token: - gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); + gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, calc_mem_with_scratch); return ModelLoadResult::SUCCESS; } @@ -745,7 +745,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } // determine the required inference memory per token: - mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token); + mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token, calc_mem_with_scratch); return ModelLoadResult::SUCCESS; } else @@ -1078,7 +1078,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } else if(file_format==FileFormat::GPT2_4) { - evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, file_format); + evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token); } else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5) { diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp index 4be0a08b00e05..f8b82fdd47851 100644 --- a/otherarch/gpt2_v3.cpp +++ b/otherarch/gpt2_v3.cpp @@ -389,7 +389,7 @@ bool gpt2_eval( const std::vector & embd_inp, std::vector & embd_w, size_t & mem_per_token, - FileFormat file_format) { + bool use_scratch=true) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -406,13 +406,21 @@ bool gpt2_eval( // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way static size_t scr0_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr0 = malloc(scr0_size); + static void * scr0; static size_t scr1_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr1 = malloc(scr1_size); + static void * scr1; - if (mem_per_token > 0 && mem_per_token*N*1.05 > buf_size) { - const size_t buf_size_new = 64u*1024*1024 + 1.15*(mem_per_token*N); // add 10% to account for ggml object overhead + if(use_scratch) + { + scr0 = malloc(scr0_size); + scr1 = malloc(scr1_size); + } + + size_t scratch_needed_mem = mem_per_token*N; + + if (mem_per_token > 0 && scratch_needed_mem*1.1 > buf_size) { + const size_t buf_size_new = 64u*1024*1024 + 1.2*(scratch_needed_mem); // add 10% to account for ggml object overhead //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); // reallocate @@ -455,7 +463,9 @@ bool gpt2_eval( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * cur; + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + } // norm { @@ -603,7 +613,9 @@ bool gpt2_eval( struct ggml_tensor * inpFF = cur; + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + } // feed-forward network { @@ -661,7 +673,9 @@ bool gpt2_eval( inpL = ggml_add(ctx0, cur, inpFF); } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + } // norm { 
@@ -677,7 +691,9 @@ bool gpt2_eval( ggml_repeat(ctx0, model.ln_f_b, inpL)); } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + } // inpL = WTE * inpL // [ 768, 50257] - model.lm_head diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp index 2931ece5fbb9c..8df2025f04466 100644 --- a/otherarch/gptj_v3.cpp +++ b/otherarch/gptj_v3.cpp @@ -382,7 +382,8 @@ bool gptj_eval( const int n_past, const std::vector & embd_inp, std::vector & embd_w, - size_t & mem_per_token) { + size_t & mem_per_token, + bool use_scratch=true) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -400,13 +401,18 @@ bool gptj_eval( // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way static size_t scr0_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr0 = malloc(scr0_size); + static void * scr0; static size_t scr1_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr1 = malloc(scr1_size); + static void * scr1; + if(use_scratch) + { + scr0 = malloc(scr0_size); + scr1 = malloc(scr1_size); + } - if (mem_per_token > 0 && mem_per_token*N*1.05 > buf_size) { - const size_t buf_size_new = 64u*1024*1024 + 1.15*(mem_per_token*N); // add 10% to account for ggml object overhead + if (mem_per_token > 0 && 32u*1024*1024 + mem_per_token*N*1.2 > buf_size) { + const size_t buf_size_new = 64u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); // reallocate @@ -441,7 +447,9 @@ bool gptj_eval( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * cur; + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + } // norm { @@ -530,7 +538,9 @@ bool gptj_eval( cur); } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + } struct ggml_tensor * inpFF = cur; @@ -567,7 +577,9 @@ bool gptj_eval( inpL = ggml_add(ctx0, cur, inpL); } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + } // norm { @@ -581,7 +593,9 @@ bool gptj_eval( ggml_repeat(ctx0, model.ln_f_b, inpL)); } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + } // lm_head { diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index 46ac0bd8b381f..ac4f321a2595b 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -316,7 +316,8 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo // - embd_w: the predicted logits for the next token // bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, - const std::vector & embd_inp, std::vector & embd_w, bool logits_all, size_t & mem_per_token) { + const std::vector & embd_inp, std::vector & embd_w, + bool logits_all, size_t & mem_per_token, bool use_scratch=true) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -332,22 +333,37 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = (n_ctx>2048?1024u:512u)*1024*1024; - static void * scr0 = malloc(scr0_size); + static size_t scr0_size = (n_ctx>2048?1024u:512u)*1024*1024; static size_t scr1_size = (n_ctx>2048?1024u:512u)*1024*1024; - static void * scr1 = malloc(scr1_size); - if (mem_per_token > 0 && mem_per_token * N > buf_size) { - const size_t buf_size_new = 1.1 * (mem_per_token * N); // add 10% to account for ggml object overhead + if(n_embd>=7168) //MPT 30B needs more scratch memory + 
{ + scr0_size *= 2; + scr1_size *= 2; + } + + static void * scr0; + static void * scr1; + if(use_scratch) + { + scr0 = malloc(scr0_size); + scr1 = malloc(scr1_size); + } + + if (mem_per_token > 0 && mem_per_token * N *1.1 > buf_size) { + const size_t buf_size_new = 64u*1024*1024 + 1.2 * (mem_per_token * N); // add 10% to account for ggml object overhead // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, // buf_size, buf_size_new); // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; + if (buf_size_new > buf_size) + { + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } } } @@ -369,7 +385,9 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, struct ggml_tensor * cur; + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + } // a = self.ln_1(x) { @@ -465,7 +483,9 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, inpL = ggml_add(ctx0, inpL, cur); + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr1_size, scr1, }); + } // m = self.ln_2(x) { @@ -491,7 +511,9 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, inpL = ggml_add(ctx0, inpL, cur); } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + } // norm { @@ -500,7 +522,9 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL); } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + } // output embedding weight tied to input embedding inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL); diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp index 4f79171bd4cd0..40e1d1e18ac0b 100644 --- a/otherarch/neox_v3.cpp +++ b/otherarch/neox_v3.cpp @@ -400,7 +400,8 @@ bool gpt_neox_eval( const int n_past, const std::vector & embd_inp, std::vector & embd_w, - size_t & mem_per_token) { + size_t & mem_per_token, + bool use_scratch=true) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -418,13 +419,20 @@ bool gpt_neox_eval( // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way static size_t scr0_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr0 = malloc(scr0_size); + static void * scr0; static size_t scr1_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr1 = malloc(scr1_size); + static void * scr1; + if(use_scratch) + { + scr0 = malloc(scr0_size); + scr1 = malloc(scr1_size); + } - if (mem_per_token > 0 && mem_per_token*N*1.05 > buf_size) { - const size_t buf_size_new = 64u*1024*1024 + 1.15*(mem_per_token*N); // add 10% to account for ggml object overhead + size_t scratch_needed_mem = mem_per_token*N; + + if (mem_per_token > 0 && scratch_needed_mem*1.1 > buf_size) { + const size_t buf_size_new = 64u*1024*1024 + 1.2*(scratch_needed_mem); // add 10% to account for ggml object overhead //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); // reallocate @@ -459,7 +467,9 @@ bool gpt_neox_eval( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * cur; + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + } // self-attention { @@ -564,7 +574,9 @@ bool gpt_neox_eval( } } + if(use_scratch){ ggml_set_scratch(ctx0, 
{ 0, scr1_size, scr1, }); + } if (hparams.par_res == 0) { struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); @@ -588,7 +600,9 @@ bool gpt_neox_eval( } } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, }); + } // norm { @@ -602,7 +616,9 @@ bool gpt_neox_eval( ggml_repeat(ctx0, model.ln_f_b, inpL)); } + if(use_scratch){ ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + } // lm_head { From f39a7460890de883b0d68d45d75d1780984ca76e Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 23 Jun 2023 22:45:22 +0800 Subject: [PATCH 10/15] bug fixes for openblas --- gpttype_adapter.cpp | 21 +++++++++++---------- otherarch/gpt2_v3.cpp | 22 +++++++--------------- otherarch/gptj_v3.cpp | 19 +++++++------------ otherarch/mpt_v3.cpp | 27 ++++++++------------------- otherarch/neox_v3.cpp | 21 +++++++-------------- 5 files changed, 40 insertions(+), 70 deletions(-) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 20093f2058836..8c716c84a7de6 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -313,7 +313,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx = mpt_ctx_v3.hparams.n_ctx = params.n_ctx; - bool calc_mem_with_scratch = ggml_cpu_has_gpublas(); + bool use_scratch = ggml_cpu_has_gpublas(); printf("System Info: %s\n", llama_print_system_info()); SetQuantsUnshuffled(false); @@ -549,7 +549,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in return res; } // determine the required inference memory per token: - gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, calc_mem_with_scratch); + gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch); return ModelLoadResult::SUCCESS; } else @@ -616,14 +616,14 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } // determine the required inference memory per token: - gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, calc_mem_with_scratch); + gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch); //if the logits are NAN or duplicated, it means the model is incompatible std::vector oldlogits(logits); //this is another hack because they change the library - we run the eval through the model //twice and compare logits. 
if they give the same logits for different inputs, model is broken - gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token, calc_mem_with_scratch); + gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token, use_scratch); if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits))) { @@ -688,7 +688,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } // determine the required inference memory per token: - gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, calc_mem_with_scratch); + gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch); return ModelLoadResult::SUCCESS; } @@ -745,7 +745,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } // determine the required inference memory per token: - mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token, calc_mem_with_scratch); + mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token, use_scratch); return ModelLoadResult::SUCCESS; } else @@ -904,6 +904,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o concat_output = ""; bool startedsampling = false; + bool use_scratch = true; timer_start(); double time1 = 0, time2 = 0; @@ -1078,7 +1079,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } else if(file_format==FileFormat::GPT2_4) { - evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token); + evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch); } else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5) { @@ -1086,7 +1087,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } else if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7) { - evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token); + evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch); } else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2) { @@ -1098,11 +1099,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } else if(file_format==FileFormat::GPTJ_5) { - evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token); + evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch); } else if(file_format==FileFormat::MPT_1) { - evalres = mpt_eval(mpt_ctx_v3, params.n_threads, n_past, embd, logits, false, mem_per_token); + evalres = mpt_eval(mpt_ctx_v3, params.n_threads, n_past, embd, logits, false, mem_per_token, use_scratch); } else { diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp index f8b82fdd47851..b716fe212d67e 100644 --- a/otherarch/gpt2_v3.cpp +++ b/otherarch/gpt2_v3.cpp @@ -389,7 +389,7 @@ bool gpt2_eval( const std::vector & embd_inp, std::vector & embd_w, size_t & mem_per_token, - bool use_scratch=true) { + bool use_scratch) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -405,22 +405,14 @@ bool gpt2_eval( // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = 
(n_ctx>1024?512u:256u)*1024*1024; - static void * scr0; + static size_t scr0_size = (n_embd>2400?512u:256u)*1024*1024; + static size_t scr1_size = (n_embd>2400?512u:256u)*1024*1024; - static size_t scr1_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr1; + static void * scr0 = malloc(scr0_size); + static void * scr1 = malloc(scr1_size); - if(use_scratch) - { - scr0 = malloc(scr0_size); - scr1 = malloc(scr1_size); - } - - size_t scratch_needed_mem = mem_per_token*N; - - if (mem_per_token > 0 && scratch_needed_mem*1.1 > buf_size) { - const size_t buf_size_new = 64u*1024*1024 + 1.2*(scratch_needed_mem); // add 10% to account for ggml object overhead + if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) { + const size_t buf_size_new = 320u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); // reallocate diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp index 8df2025f04466..031a2c051c5f2 100644 --- a/otherarch/gptj_v3.cpp +++ b/otherarch/gptj_v3.cpp @@ -383,7 +383,7 @@ bool gptj_eval( const std::vector & embd_inp, std::vector & embd_w, size_t & mem_per_token, - bool use_scratch=true) { + bool use_scratch) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -400,19 +400,14 @@ bool gptj_eval( // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr0; + static size_t scr0_size = 512u*1024*1024; + static size_t scr1_size = 512u*1024*1024; - static size_t scr1_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr1; - if(use_scratch) - { - scr0 = malloc(scr0_size); - scr1 = malloc(scr1_size); - } + static void * scr0 = malloc(scr0_size); + static void * scr1 = malloc(scr1_size); - if (mem_per_token > 0 && 32u*1024*1024 + mem_per_token*N*1.2 > buf_size) { - const size_t buf_size_new = 64u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead + if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) { + const size_t buf_size_new = 320u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); // reallocate diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index ac4f321a2595b..5d66f91f5f4cd 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -317,7 +317,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo // bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, const std::vector & embd_inp, std::vector & embd_w, - bool logits_all, size_t & mem_per_token, bool use_scratch=true) { + bool logits_all, size_t & mem_per_token, bool use_scratch) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -333,26 +333,15 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way + //MPT 30B needs more scratch memory + static size_t scr0_size = (n_embd>=7168?2048u:1024u)*1024*1024; + static size_t scr1_size = (n_embd>=7168?2048u:1024u)*1024*1024; - static size_t scr0_size = (n_ctx>2048?1024u:512u)*1024*1024; - static size_t scr1_size = (n_ctx>2048?1024u:512u)*1024*1024; + static void * scr0 = malloc(scr0_size); + static void * scr1 = 
malloc(scr1_size); - if(n_embd>=7168) //MPT 30B needs more scratch memory - { - scr0_size *= 2; - scr1_size *= 2; - } - - static void * scr0; - static void * scr1; - if(use_scratch) - { - scr0 = malloc(scr0_size); - scr1 = malloc(scr1_size); - } - - if (mem_per_token > 0 && mem_per_token * N *1.1 > buf_size) { - const size_t buf_size_new = 64u*1024*1024 + 1.2 * (mem_per_token * N); // add 10% to account for ggml object overhead + if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) { + const size_t buf_size_new = 320u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, // buf_size, buf_size_new); // reallocate diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp index 40e1d1e18ac0b..37f5ad9aed276 100644 --- a/otherarch/neox_v3.cpp +++ b/otherarch/neox_v3.cpp @@ -401,7 +401,7 @@ bool gpt_neox_eval( const std::vector & embd_inp, std::vector & embd_w, size_t & mem_per_token, - bool use_scratch=true) { + bool use_scratch) { const int N = embd_inp.size(); const auto & hparams = model.hparams; @@ -418,21 +418,14 @@ bool gpt_neox_eval( // use 2 scratch buffers // TODO: very hacky solution - reimplement in a more elegant way - static size_t scr0_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr0; + static size_t scr0_size = (n_embd>2400?512u:256u)*1024*1024; + static size_t scr1_size = (n_embd>2400?512u:256u)*1024*1024; - static size_t scr1_size = (n_ctx>1024?512u:256u)*1024*1024; - static void * scr1; - if(use_scratch) - { - scr0 = malloc(scr0_size); - scr1 = malloc(scr1_size); - } - - size_t scratch_needed_mem = mem_per_token*N; + static void * scr0 = malloc(scr0_size); + static void * scr1 = malloc(scr1_size); - if (mem_per_token > 0 && scratch_needed_mem*1.1 > buf_size) { - const size_t buf_size_new = 64u*1024*1024 + 1.2*(scratch_needed_mem); // add 10% to account for ggml object overhead + if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) { + const size_t buf_size_new = 360u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); // reallocate From 490cf395f82d7d0582016a51054457e2d6f89769 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 23 Jun 2023 22:51:51 +0800 Subject: [PATCH 11/15] better alloc error --- otherarch/gpt2_v3.cpp | 2 +- otherarch/gptj_v3.cpp | 2 +- otherarch/mpt_v3.cpp | 2 +- otherarch/neox_v3.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp index b716fe212d67e..fb15d662be015 100644 --- a/otherarch/gpt2_v3.cpp +++ b/otherarch/gpt2_v3.cpp @@ -422,7 +422,7 @@ bool gpt2_eval( buf = realloc(buf, buf_size); if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + fprintf(stderr, "%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size); return false; } } diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp index 031a2c051c5f2..b00bd6bd291d4 100644 --- a/otherarch/gptj_v3.cpp +++ b/otherarch/gptj_v3.cpp @@ -417,7 +417,7 @@ bool gptj_eval( buf = realloc(buf, buf_size); if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + fprintf(stderr, "%s: failed to allocate %zu bytes. 
Try reducing batch size.\n", __func__, buf_size); return false; } } diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index 5d66f91f5f4cd..a60172f51611d 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -350,7 +350,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, buf_size = buf_size_new; buf = realloc(buf, buf_size); if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + fprintf(stderr, "%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size); return false; } } diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp index 37f5ad9aed276..245d383d63793 100644 --- a/otherarch/neox_v3.cpp +++ b/otherarch/neox_v3.cpp @@ -435,7 +435,7 @@ bool gpt_neox_eval( buf = realloc(buf, buf_size); if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + fprintf(stderr, "%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size); return false; } } From f7b096374dad99164c610196c1926d53d3e87831 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 23 Jun 2023 23:56:22 +0800 Subject: [PATCH 12/15] fixed string too long CI issue --- ggml-opencl.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 7de91049d3766..fed4ffb0ccd05 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -183,7 +183,9 @@ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float *v0 = vload_half(0, &x[ib + 0]); *v1 = vload_half(0, &x[ib + 1]); } +); +static std::string k_quants_source = MULTILINE_QUOTE( inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m) { if (j < 4) @@ -853,6 +855,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string& std::string generate_kernels() { std::stringstream src; src << program_source << '\n'; + src << k_quants_source << '\n'; for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) { std::string dequant_kernel = dequant_template; std::string dmmv_kernel = dequant_mul_mat_vec_template; From 0485fa65a2fc3159ea9fb2ad7661a5837038b31d Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 24 Jun 2023 11:43:42 +0800 Subject: [PATCH 13/15] wstring convert for mpt --- gpttype_adapter.cpp | 3 ++- llama.cpp | 4 ++-- otherarch/mpt_v3.cpp | 14 ++++++++++++-- otherarch/utils.cpp | 10 ++++++++-- otherarch/utils.h | 6 ++++++ 5 files changed, 30 insertions(+), 7 deletions(-) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 8c716c84a7de6..b166e2aac1922 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -313,6 +313,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx = mpt_ctx_v3.hparams.n_ctx = params.n_ctx; + //this is used for the mem_per_token eval, openblas needs more RAM bool use_scratch = ggml_cpu_has_gpublas(); printf("System Info: %s\n", llama_print_system_info()); @@ -904,7 +905,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o concat_output = ""; bool startedsampling = false; - bool use_scratch = true; + bool use_scratch = true; //for normal inference always use scratch timer_start(); double time1 = 0, time2 = 0; diff --git a/llama.cpp b/llama.cpp index aa67038e02db8..5259fd52ef9b7 100644 --- a/llama.cpp +++ 
b/llama.cpp @@ -105,7 +105,7 @@ static const std::map & MEM_REQ_KV_SELF() { MODEL_3B, 682ull * MB }, { MODEL_7B, 1026ull * MB }, { MODEL_13B, 1608ull * MB }, - { MODEL_30B, 3124ull * MB }, + { MODEL_30B, 3224ull * MB }, { MODEL_65B, 5120ull * MB }, }; return k_sizes; @@ -119,7 +119,7 @@ static const std::map & MEM_REQ_EVAL() { MODEL_3B, 512ull * MB }, { MODEL_7B, 800ull * MB }, { MODEL_13B, 1024ull * MB }, - { MODEL_30B, 1280ull * MB }, + { MODEL_30B, 1380ull * MB }, { MODEL_65B, 1536ull * MB }, }; return k_sizes; diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index a60172f51611d..b611b0703b8ba 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -86,6 +86,16 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo fin.read((char *) buf.data(), len); word.assign(buf.data(), len); + // Convert token from utf-8 + std::wstring word_multibytes = convert_to_wstring(word); + if(word_multibytes!=L"") + { + word.resize(word_multibytes.size()); + for (int w = 0; w < word_multibytes.size(); w++) { + word[w] = uint8_t(word_multibytes[w]); + } + } + vocab.token_to_id[word] = i; vocab.id_to_token[i] = word; } @@ -123,8 +133,8 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight - ctx_size += (n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16)); // memory_k - ctx_size += (n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16)); // memory_v + ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k + ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v ctx_size += (6 + 6 * n_layer) * 512; // object overhead diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp index 57c362934c811..02637069a9b98 100644 --- a/otherarch/utils.cpp +++ b/otherarch/utils.cpp @@ -122,8 +122,14 @@ std::string convert_to_utf8(const std::wstring & input) { std::wstring convert_to_wstring(const std::string & input) { - std::wstring_convert> converter; - return converter.from_bytes(input); + try { + std::wstring_convert> converter; + return converter.from_bytes(input); + } catch (const std::range_error& e) { + return L""; + } catch (...) 
{ + return L""; + } } void gpt_split_words(std::string str, std::vector& words) { diff --git a/otherarch/utils.h b/otherarch/utils.h index bb57a8242f4e9..f9857823faf34 100644 --- a/otherarch/utils.h +++ b/otherarch/utils.h @@ -34,6 +34,12 @@ void utreplace(std::string & str, const std::string & needle, const std::string // poor-man's JSON parsing std::map json_parse(const std::string & fname); +std::string convert_to_utf8(const std::wstring & input); + +std::wstring convert_to_wstring(const std::string & input); + +void gpt_split_words(std::string str, std::vector& words); + // split text into tokens // // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53 From 6da38b0d40a6476ccdd56e48143b21d4254b5da1 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 24 Jun 2023 12:30:38 +0800 Subject: [PATCH 14/15] up ver --- koboldcpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/koboldcpp.py b/koboldcpp.py index 76e94b84ac63b..cc2ba50d4dc91 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -225,7 +225,7 @@ def utfprint(str): maxhordelen = 256 modelbusy = False defaultport = 5001 -KcppVersion = "1.32.1" +KcppVersion = "1.32.2" showdebug = True class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): From 8342fe81b1c2a00aa81d44c9e1ffb7057df3b323 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 24 Jun 2023 12:58:49 +0800 Subject: [PATCH 15/15] revert the wstring tokenization. coherency was affected --- koboldcpp.py | 2 +- otherarch/mpt_v3.cpp | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index cc2ba50d4dc91..99026ab857adb 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -225,7 +225,7 @@ def utfprint(str): maxhordelen = 256 modelbusy = False defaultport = 5001 -KcppVersion = "1.32.2" +KcppVersion = "1.32.3" showdebug = True class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index b611b0703b8ba..100e635ba8645 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -87,14 +87,14 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo word.assign(buf.data(), len); // Convert token from utf-8 - std::wstring word_multibytes = convert_to_wstring(word); - if(word_multibytes!=L"") - { - word.resize(word_multibytes.size()); - for (int w = 0; w < word_multibytes.size(); w++) { - word[w] = uint8_t(word_multibytes[w]); - } - } + // std::wstring word_multibytes = convert_to_wstring(word); + // if(word_multibytes!=L"") + // { + // word.resize(word_multibytes.size()); + // for (int w = 0; w < word_multibytes.size(); w++) { + // word[w] = uint8_t(word_multibytes[w]); + // } + // } vocab.token_to_id[word] = i; vocab.id_to_token[i] = word;
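
A quick standalone check of the n_mult recovery logic introduced in convert.py by PATCH 05/15. This is an illustrative snippet, not part of the patches themselves; the function is copied verbatim from the patch, and the worked numbers assume the standard LLaMA-7B dimensions (hidden_size=4096, intermediate_size=11008):

    # find_n_mult as added to convert.py in PATCH 05/15, copied verbatim
    def find_n_mult(n_ff: int, n_embd: int) -> int:
        # hardcoded magic range
        for n_mult in range(256, 1, -1):
            calc_ff = (((8 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult
            if calc_ff == n_ff:
                return n_mult
        return 1

    # LLaMA-7B config.json: hidden_size=4096, intermediate_size=11008
    # (((8*4096)//3 + 255)//256)*256 == 11008, so the classic n_mult=256 is recovered.
    print(find_n_mult(11008, 4096))  # -> 256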