From af99c6fbfc815df7dad94d8c1f20d55927b2203a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 5 Dec 2023 18:18:16 +0200 Subject: [PATCH] llama : remove memory_f16 and kv_f16 flags --- common/common.cpp | 6 ------ common/common.h | 1 - examples/quantize-stats/quantize-stats.cpp | 1 - examples/server/server.cpp | 4 ---- llama.cpp | 3 --- llama.h | 1 - 6 files changed, 16 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 77332d5dbb77b..a5b5c468c802b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -278,8 +278,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.yarn_beta_slow = std::stof(argv[i]); - } else if (arg == "--memory-f32") { - params.memory_f16 = false; } else if (arg == "--top-p") { if (++i >= argc) { invalid_param = true; @@ -804,8 +802,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); - printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp); printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); @@ -948,7 +944,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; cparams.mul_mat_q = params.mul_mat_q; cparams.seed = params.seed; - cparams.f16_kv = params.memory_f16; cparams.logits_all = params.logits_all; cparams.embedding = params.embedding; cparams.rope_scaling_type = params.rope_scaling_type; @@ -1375,7 +1370,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l } fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); - fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false"); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); diff --git a/common/common.h b/common/common.h index 7f0d03e41dbd9..4cf471c7a8a16 100644 --- a/common/common.h +++ b/common/common.h @@ -98,7 +98,6 @@ struct gpt_params { size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS - bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 2712824774ae7..773024160f839 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -321,7 +321,6 @@ int main(int argc, char ** argv) { auto cparams = llama_context_default_params(); cparams.n_ctx = 256; cparams.seed = 1; - cparams.f16_kv = false; ctx = llama_new_context_with_model(model, cparams); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 911f7bbe1f85a..ef2a95004f453 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2108,10 +2108,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.yarn_beta_slow = std::stof(argv[i]); } - else if (arg == "--memory-f32" || arg == "--memory_f32") - { - params.memory_f16 = false; - } else if (arg == "--threads" || arg == "-t") { if (++i >= argc) diff --git a/llama.cpp b/llama.cpp index 3f951dbe31952..800951ab8de72 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8583,7 +8583,6 @@ struct llama_context_params llama_context_default_params() { /*.type_k =*/ GGML_TYPE_F16, /*.type_v =*/ GGML_TYPE_F16, /*.mul_mat_q =*/ true, - /*.f16_kv =*/ true, /*.logits_all =*/ false, /*.embedding =*/ false, /*.offload_kqv =*/ true, @@ -8737,8 +8736,6 @@ struct llama_context * llama_new_context_with_model( ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; - //const ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - const ggml_type type_k = params.type_k; const ggml_type type_v = params.type_v; diff --git a/llama.h b/llama.h index f6c9d17519e7e..ead37562e37f3 100644 --- a/llama.h +++ b/llama.h @@ -196,7 +196,6 @@ extern "C" { // Keep the booleans together to avoid misalignment during copy-by-value. bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) - bool f16_kv; // use fp16 for KV cache, fp32 otherwise bool logits_all; // the llama_eval() call computes all logits, not just the last one bool embedding; // embedding mode only bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU