GPU layer offloading disabled for Phi models in CLBlast
LostRuins committed Jan 25, 2024
1 parent 0a70cc1 commit d9a7bd5
Showing 4 changed files with 38 additions and 36 deletions.
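In summary, this commit merges the GGUF_LLAMA and GGUF_FALCON file formats into a single GGUF_GENERIC format, records the concrete architecture in a new GGUFArch field on FileFormatExtraMeta, and makes the CLBlast (OpenCL) build refuse GPU layer offload for Falcon and Phi models. Below is a minimal, self-contained sketch of that offload gate, condensed from the diff that follows: the enum values, the field name and the printf message come from this commit, while the apply_clblast_offload_gate helper, the ModelParams stand-in for llama_model_params and the example values in main are illustrative only.

#include <cstdio>

// Mirrors the enum/struct shapes added in model_adapter.h (values from this commit).
enum class FileFormat { GGUF_GENERIC = 6 };
enum class GGUFArch { DEFAULT = 0, FALCON = 1, PHI = 2 };
struct FileFormatExtraMeta { GGUFArch model_architecture = GGUFArch::DEFAULT; };

// Illustrative stand-in for llama_model_params; only the field the gate touches.
struct ModelParams { int n_gpu_layers = 0; };

#define GGML_USE_CLBLAST 1  // pretend this is the CLBlast build

static void apply_clblast_offload_gate(FileFormat file_format,
                                       const FileFormatExtraMeta &meta,
                                       ModelParams &model_params)
{
#if defined(GGML_USE_CLBLAST)
    // Same condition as the new check in gpttype_load_model: only GGUF models,
    // only Falcon/Phi architectures, and only if offload was actually requested.
    if (file_format == FileFormat::GGUF_GENERIC &&
        (meta.model_architecture == GGUFArch::FALCON ||
         meta.model_architecture == GGUFArch::PHI) &&
        model_params.n_gpu_layers > 0)
    {
        std::printf("\nOpenCL does not support GPU Layer offloading for this model architecture! GPU Offload has been disabled.\n");
        model_params.n_gpu_layers = 0;  // fall back to CPU-only layers
    }
#endif
}

int main()
{
    FileFormatExtraMeta meta;
    meta.model_architecture = GGUFArch::PHI;  // e.g. a "phi2" GGUF was detected
    ModelParams params;
    params.n_gpu_layers = 20;                 // user requested 20 offloaded layers

    apply_clblast_offload_gate(FileFormat::GGUF_GENERIC, meta, params);
    std::printf("n_gpu_layers after gate: %d\n", params.n_gpu_layers);  // prints 0
    return 0;
}

The design choice visible in the diff is to keep file-format detection generic (every GGUF file maps to GGUF_GENERIC) and carry the architecture separately in FileFormatExtraMeta, so backend-specific restrictions like this one can branch on GGUFArch instead of multiplying FileFormat values.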
expose.cpp: 6 changes (1 addition, 5 deletions)
@@ -169,13 +169,9 @@ extern "C"
 {
 printf("\n---\nIdentified as RWKV model: (ver %d)\nAttempting to Load...\n---\n", file_format);
 }
-else if(file_format==FileFormat::GGUF_FALCON)
-{
-printf("\n---\nIdentified as FALCON model: (ver %d)\nAttempting to Load...\n---\n", file_format);
-}
 else
 {
-printf("\n---\nIdentified as LLAMA model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+printf("\n---\nIdentified as GGUF model: (ver %d)\nAttempting to Load...\n---\n", file_format);
 }
 ModelLoadResult lr = gpttype_load_model(inputs, file_format, file_format_meta);
 if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD)
gpttype_adapter.cpp: 35 changes (18 additions, 17 deletions)
@@ -141,7 +141,7 @@ static std::string FileFormatTokenizeID(int id, FileFormat file_format)
 {
 return std::string(llama_v3_token_to_str(llama_ctx_v3, id));
 }
-else if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+else if(file_format == FileFormat::GGUF_GENERIC)
 {
 return std::string(llama_token_to_str(llama_ctx_v4, id));
 }
@@ -153,7 +153,7 @@ static std::string FileFormatTokenizeID(int id, FileFormat file_format)
 
 static void TokenizeString(const std::string & str_to_tokenize, std::vector<int> & output_tokens, FileFormat file_format)
 {
-if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_GENERIC)
 {
 if(file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 )
 {
@@ -182,9 +182,9 @@ static int GetEosID(FileFormat file_format, int32_t n_vocab)
 {
 unsigned int eosID = 0;
 
-if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_GENERIC)
 {
-if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+if(file_format == FileFormat::GGUF_GENERIC)
 {
 eosID = llama_token_eos(&(llama_ctx_v4->model));
 }
@@ -696,7 +696,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 file_format = in_file_format;
 n_threads = kcpp_params->n_threads = inputs.threads;
 n_blasthreads = kcpp_params->n_threads_batch = inputs.blasthreads;
-bool isGguf = (file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON);
+bool isGguf = (file_format == FileFormat::GGUF_GENERIC);
 
 n_batch = kcpp_params->n_batch = (isGguf?normalbatchsize:smallbatchsize);
 modelname = kcpp_params->model = inputs.model_filename;
@@ -712,7 +712,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 auto clamped_max_context_length = inputs.max_context_length;
 
 if(clamped_max_context_length>16384 &&
-file_format != FileFormat::GGUF_LLAMA && file_format!=FileFormat::GGUF_FALCON)
+file_format != FileFormat::GGUF_GENERIC)
 {
 printf("Warning: Only GGUF models can use max context above 16k. Max context lowered to 16k.\n");
 clamped_max_context_length = 16384;
@@ -748,7 +748,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 {
 //approximate NTK aware ctx
 auto effectivenctx = kcpp_params->n_ctx;
-if((file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON) && file_format_meta.n_ctx_train > 2048)
+if((file_format == FileFormat::GGUF_GENERIC) && file_format_meta.n_ctx_train > 2048)
 {
 float factor = file_format_meta.n_ctx_train/2048;
 effectivenctx = effectivenctx/factor;
@@ -781,7 +781,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 
 printf("System Info: %s\n", llama_print_system_info());
 #if defined(GGML_USE_CUBLAS)
-if(file_format!=FileFormat::GGUF_LLAMA && file_format!=FileFormat::GGUF_FALCON)
+if(file_format!=FileFormat::GGUF_GENERIC)
 {
 if(ggml_v3_cpu_has_gpublas() && cu_parseinfo_maindevice>0)
 {
@@ -915,7 +915,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 }
 return ModelLoadResult::SUCCESS;
 }
-else if(file_format==FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+else if(file_format==FileFormat::GGUF_GENERIC)
 {
 llama_model_params model_params = llama_model_default_params();
 llama_context_params llama_ctx_params = llama_context_default_params();
@@ -932,10 +932,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 model_params.use_mmap = inputs.use_mmap;
 model_params.use_mlock = inputs.use_mlock;
 model_params.n_gpu_layers = inputs.gpulayers;
+
 #if defined(GGML_USE_CLBLAST)
-if(file_format==FileFormat::GGUF_FALCON && model_params.n_gpu_layers>0)
+if(file_format==FileFormat::GGUF_GENERIC && (file_format_meta.model_architecture == GGUFArch::FALCON || file_format_meta.model_architecture == GGUFArch::PHI) && model_params.n_gpu_layers>0)
 {
-printf("\nGPU layer offload for GGUF FALCON on OpenCL is known to have issues, it has been set to 0.\n");
+printf("\nOpenCL does not support GPU Layer offloading for this model architecture! GPU Offload has been disabled.\n");
 model_params.n_gpu_layers = 0;
 }
 #endif
@@ -1642,13 +1643,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 else
 {
 bool triggersc = useSmartContext;
-if(useContextShift && (file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON))
+if(useContextShift && (file_format == FileFormat::GGUF_GENERIC))
 {
 PurgeMissingTokens(llama_ctx_v4, current_context_tokens, embd_inp, inputs.max_length, nctx);
 triggersc = false;
 }
 ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false);
-if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+if(file_format == FileFormat::GGUF_GENERIC)
 {
 llama_kv_cache_seq_rm(llama_ctx_v4, 0, n_past, -1);
 }
@@ -1669,7 +1670,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 {
 //for non llama, limit to 256
 int bbs = blasbatchsize;
-if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_LLAMA && file_format!=FileFormat::GGUF_FALCON)
+if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_GENERIC)
 {
 bbs = (blasbatchsize > 256 ? 256 : blasbatchsize);
 }
@@ -1821,7 +1822,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 {
 evalres = (llama_v3_eval(llama_ctx_v3, embd.data(), embdsize, n_past, kcpp_params->n_threads)==0);
 }
-else if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+else if(file_format == FileFormat::GGUF_GENERIC)
 {
 evalres = (llama_decode(llama_ctx_v4, llama_batch_get_one(embd.data(), embdsize, n_past, 0))==0);
 }
@@ -1934,9 +1935,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 float * logitsPtr;
 float lowestLogit = 0;
 int btsize = banned_token_ids.size();
-if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_GENERIC)
 {
-if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
+if(file_format == FileFormat::GGUF_GENERIC)
 {
 logitsPtr = llama_get_logits(llama_ctx_v4);
 }
model_adapter.cpp: 22 changes (10 additions, 12 deletions)
@@ -255,7 +255,7 @@ void print_tok_vec(std::vector<float> &embd)
 else if(magic == 0x46554747)
 {
 fin.close();
-fileformat = FileFormat::GGUF_LLAMA;
+fileformat = FileFormat::GGUF_GENERIC;
 
 struct gguf_init_params ggufparams;
 ggufparams.no_alloc = true;
@@ -267,19 +267,8 @@ void print_tok_vec(std::vector<float> &embd)
 std::string modelarch = "";
 if (keyidx != -1) { modelarch = gguf_get_val_str(ctx, keyidx); }
 
-if(modelarch=="llama")
-{
-fileformat = FileFormat::GGUF_LLAMA;
-}
-else if(modelarch=="falcon")
-{
-fileformat = FileFormat::GGUF_FALCON; //uses the same loader
-}
-
-
 printf("\nThe reported GGUF Arch is: %s\n",(modelarch==""?"unknown":modelarch.c_str()));
-
 
 if(modelarch!="" && fileformatmeta!=nullptr)
 {
 std::string fkey = modelarch+".context_length";
@@ -289,6 +278,15 @@ void print_tok_vec(std::vector<float> &embd)
 }
 int filever = gguf_get_version(ctx);
 fileformatmeta->fileversion = filever;
+fileformatmeta->model_architecture = GGUFArch::DEFAULT;
+if(modelarch=="phi2")
+{
+fileformatmeta->model_architecture = GGUFArch::PHI;
+}
+else if(modelarch=="falcon")
+{
+fileformatmeta->model_architecture = GGUFArch::FALCON;
+}
 }
 gguf_free(ctx);
 }
model_adapter.h: 11 changes (9 additions, 2 deletions)
@@ -21,7 +21,8 @@ enum FileFormat
 GGJT=3, // 3=(llama ggjt)
 GGJT_2=4, //newer llama format unshuffled
 GGJT_3=5, //using 16bit scalar
-GGUF_LLAMA=6, //GGUF (llama newest ver)
+
+GGUF_GENERIC=6, //GGUF (llama newest ver)
 
 GPTJ_1=100, //the very first super old GPTJ format
 GPTJ_2=101, //pygmalion, uses old ggml lib
@@ -47,14 +48,20 @@
 
 MPT_1=500, //first supported mpt version
 
-GGUF_FALCON=600, //GGUF (falcon)
 };
 
+enum GGUFArch
+{
+DEFAULT = 0, //used for llama and other generic gguf
+FALCON = 1,
+PHI = 2,
+};
 
 struct FileFormatExtraMeta
 {
 int n_ctx_train = 2048;
 int fileversion = 0;
+GGUFArch model_architecture = GGUFArch::DEFAULT;
 };
 
 enum ModelLoadResult
