From d9a7bd577a4b445bd7582b961e8372f2630c984e Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:40:05 +0800 Subject: [PATCH] gpu layer offloading disabled for phi models in clblast --- expose.cpp | 6 +----- gpttype_adapter.cpp | 35 ++++++++++++++++++----------------- model_adapter.cpp | 22 ++++++++++------------ model_adapter.h | 11 +++++++++-- 4 files changed, 38 insertions(+), 36 deletions(-) diff --git a/expose.cpp b/expose.cpp index ab1111ee245ae..eb7af70a7eedc 100644 --- a/expose.cpp +++ b/expose.cpp @@ -169,13 +169,9 @@ extern "C" { printf("\n---\nIdentified as RWKV model: (ver %d)\nAttempting to Load...\n---\n", file_format); } - else if(file_format==FileFormat::GGUF_FALCON) - { - printf("\n---\nIdentified as FALCON model: (ver %d)\nAttempting to Load...\n---\n", file_format); - } else { - printf("\n---\nIdentified as LLAMA model: (ver %d)\nAttempting to Load...\n---\n", file_format); + printf("\n---\nIdentified as GGUF model: (ver %d)\nAttempting to Load...\n---\n", file_format); } ModelLoadResult lr = gpttype_load_model(inputs, file_format, file_format_meta); if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 423029ea5ea69..013c5321d6ee4 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -141,7 +141,7 @@ static std::string FileFormatTokenizeID(int id, FileFormat file_format) { return std::string(llama_v3_token_to_str(llama_ctx_v3, id)); } - else if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) + else if(file_format == FileFormat::GGUF_GENERIC) { return std::string(llama_token_to_str(llama_ctx_v4, id)); } @@ -153,7 +153,7 @@ static std::string FileFormatTokenizeID(int id, FileFormat file_format) static void TokenizeString(const std::string & str_to_tokenize, std::vector & output_tokens, FileFormat file_format) { - if (file_format == FileFormat::GGML || file_format == 
FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) + if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_GENERIC) { if(file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 ) { @@ -182,9 +182,9 @@ static int GetEosID(FileFormat file_format, int32_t n_vocab) { unsigned int eosID = 0; - if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) + if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_GENERIC) { - if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) + if(file_format == FileFormat::GGUF_GENERIC) { eosID = llama_token_eos(&(llama_ctx_v4->model)); } @@ -696,7 +696,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in file_format = in_file_format; n_threads = kcpp_params->n_threads = inputs.threads; n_blasthreads = kcpp_params->n_threads_batch = inputs.blasthreads; - bool isGguf = (file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON); + bool isGguf = (file_format == FileFormat::GGUF_GENERIC); n_batch = kcpp_params->n_batch = (isGguf?normalbatchsize:smallbatchsize); modelname = kcpp_params->model = inputs.model_filename; @@ -712,7 +712,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in auto clamped_max_context_length = 
inputs.max_context_length; if(clamped_max_context_length>16384 && - file_format != FileFormat::GGUF_LLAMA && file_format!=FileFormat::GGUF_FALCON) + file_format != FileFormat::GGUF_GENERIC) { printf("Warning: Only GGUF models can use max context above 16k. Max context lowered to 16k.\n"); clamped_max_context_length = 16384; @@ -748,7 +748,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in { //approximate NTK aware ctx auto effectivenctx = kcpp_params->n_ctx; - if((file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) && file_format_meta.n_ctx_train > 2048) + if((file_format == FileFormat::GGUF_GENERIC) && file_format_meta.n_ctx_train > 2048) { float factor = file_format_meta.n_ctx_train/2048; effectivenctx = effectivenctx/factor; @@ -781,7 +781,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in printf("System Info: %s\n", llama_print_system_info()); #if defined(GGML_USE_CUBLAS) - if(file_format!=FileFormat::GGUF_LLAMA && file_format!=FileFormat::GGUF_FALCON) + if(file_format!=FileFormat::GGUF_GENERIC) { if(ggml_v3_cpu_has_gpublas() && cu_parseinfo_maindevice>0) { @@ -915,7 +915,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } return ModelLoadResult::SUCCESS; } - else if(file_format==FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) + else if(file_format==FileFormat::GGUF_GENERIC) { llama_model_params model_params = llama_model_default_params(); llama_context_params llama_ctx_params = llama_context_default_params(); @@ -932,10 +932,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in model_params.use_mmap = inputs.use_mmap; model_params.use_mlock = inputs.use_mlock; model_params.n_gpu_layers = inputs.gpulayers; + #if defined(GGML_USE_CLBLAST) - if(file_format==FileFormat::GGUF_FALCON && model_params.n_gpu_layers>0) + if(file_format==FileFormat::GGUF_GENERIC && 
(file_format_meta.model_architecture == GGUFArch::FALCON || file_format_meta.model_architecture == GGUFArch::PHI) && model_params.n_gpu_layers>0) { - printf("\nGPU layer offload for GGUF FALCON on OpenCL is known to have issues, it has been set to 0.\n"); + printf("\nOpenCL does not support GPU Layer offloading for this model architecture! GPU Offload has been disabled.\n"); model_params.n_gpu_layers = 0; } #endif @@ -1642,13 +1643,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o else { bool triggersc = useSmartContext; - if(useContextShift && (file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)) + if(useContextShift && (file_format == FileFormat::GGUF_GENERIC)) { PurgeMissingTokens(llama_ctx_v4, current_context_tokens, embd_inp, inputs.max_length, nctx); triggersc = false; } ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false); - if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) + if(file_format == FileFormat::GGUF_GENERIC) { llama_kv_cache_seq_rm(llama_ctx_v4, 0, n_past, -1); } @@ -1669,7 +1670,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o { //for non llama, limit to 256 int bbs = blasbatchsize; - if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_LLAMA && file_format!=FileFormat::GGUF_FALCON) + if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_GENERIC) { bbs = (blasbatchsize > 256 ? 
256 : blasbatchsize); } @@ -1821,7 +1822,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o { evalres = (llama_v3_eval(llama_ctx_v3, embd.data(), embdsize, n_past, kcpp_params->n_threads)==0); } - else if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) + else if(file_format == FileFormat::GGUF_GENERIC) { evalres = (llama_decode(llama_ctx_v4, llama_batch_get_one(embd.data(), embdsize, n_past, 0))==0); } @@ -1934,9 +1935,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o float * logitsPtr; float lowestLogit = 0; int btsize = banned_token_ids.size(); - if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) + if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_GENERIC) { - if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) + if(file_format == FileFormat::GGUF_GENERIC) { logitsPtr = llama_get_logits(llama_ctx_v4); } diff --git a/model_adapter.cpp b/model_adapter.cpp index 509e9b212ff16..5575c22a6608e 100644 --- a/model_adapter.cpp +++ b/model_adapter.cpp @@ -255,7 +255,7 @@ void print_tok_vec(std::vector &embd) else if(magic == 0x46554747) { fin.close(); - fileformat = FileFormat::GGUF_LLAMA; + fileformat = FileFormat::GGUF_GENERIC; struct gguf_init_params ggufparams; ggufparams.no_alloc = true; @@ -267,19 +267,8 @@ void print_tok_vec(std::vector &embd) std::string modelarch = ""; if (keyidx != -1) { modelarch = gguf_get_val_str(ctx, keyidx); } - if(modelarch=="llama") - { - fileformat = FileFormat::GGUF_LLAMA; - } - else if(modelarch=="falcon") - { - fileformat = 
FileFormat::GGUF_FALCON; //uses the same loader - } - - printf("\nThe reported GGUF Arch is: %s\n",(modelarch==""?"unknown":modelarch.c_str())); - if(modelarch!="" && fileformatmeta!=nullptr) { std::string fkey = modelarch+".context_length"; @@ -289,6 +278,15 @@ void print_tok_vec(std::vector &embd) } int filever = gguf_get_version(ctx); fileformatmeta->fileversion = filever; + fileformatmeta->model_architecture = GGUFArch::DEFAULT; + if(modelarch=="phi2") + { + fileformatmeta->model_architecture = GGUFArch::PHI; + } + else if(modelarch=="falcon") + { + fileformatmeta->model_architecture = GGUFArch::FALCON; + } } gguf_free(ctx); } diff --git a/model_adapter.h b/model_adapter.h index 7180d4f849444..0c7f92fafc5b8 100644 --- a/model_adapter.h +++ b/model_adapter.h @@ -21,7 +21,8 @@ enum FileFormat GGJT=3, // 3=(llama ggjt) GGJT_2=4, //newer llama format unshuffled GGJT_3=5, //using 16bit scalar - GGUF_LLAMA=6, //GGUF (llama newest ver) + + GGUF_GENERIC=6, //GGUF (any architecture; see GGUFArch) GPTJ_1=100, //the very first super old GPTJ format GPTJ_2=101, //pygmalion, uses old ggml lib @@ -47,14 +48,20 @@ enum FileFormat MPT_1=500, //first supported mpt version - GGUF_FALCON=600, //GGUF (falcon) +}; +enum GGUFArch +{ + DEFAULT = 0, //used for llama and other generic gguf + FALCON = 1, + PHI = 2, }; struct FileFormatExtraMeta { int n_ctx_train = 2048; int fileversion = 0; + GGUFArch model_architecture = GGUFArch::DEFAULT; }; enum ModelLoadResult