diff --git a/Makefile b/Makefile index 4d30f2dad40fa..c847b180942c6 100644 --- a/Makefile +++ b/Makefile @@ -106,6 +106,7 @@ ifdef LLAMA_OPENBLAS endif ifdef LLAMA_CUBLAS CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include + CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib OBJS += ggml-cuda.o NVCC = nvcc @@ -177,10 +178,10 @@ $(info ) # Build library # -ggml.o: ggml.c ggml.h +ggml.o: ggml.c ggml.h ggml-cuda.h $(CC) $(CFLAGS) -c $< -o $@ -llama.o: llama.cpp ggml.h llama.h llama_util.h +llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama_util.h $(CXX) $(CXXFLAGS) -c $< -o $@ common.o: examples/common.cpp examples/common.h diff --git a/README.md b/README.md index 731f491ca22cd..f55c576ab92cb 100644 --- a/README.md +++ b/README.md @@ -281,30 +281,29 @@ When running the larger models, make sure you have enough disk space to store al As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. -| model | original size | quantized size (4-bit) | -|-------|---------------|------------------------| -| 7B | 13 GB | 3.9 GB | -| 13B | 24 GB | 7.8 GB | -| 30B | 60 GB | 19.5 GB | -| 65B | 120 GB | 38.5 GB | +| Model | Original size | Quantized size (4-bit) | +|------:|--------------:|-----------------------:| +| 7B | 13 GB | 3.9 GB | +| 13B | 24 GB | 7.8 GB | +| 30B | 60 GB | 19.5 GB | +| 65B | 120 GB | 38.5 GB | ### Quantization Several quantization methods are supported. They differ in the resulting model disk size and inference speed. -Model | F16 | Q4_0 | Q4_1 | Q4_2 | Q4_3 | Q5_0 | Q5_1 | Q8_0 --- | -- | -- | -- | -- | -- | -- | -- | -- -7B (ppl) | 5.9565 | 6.2103 | 6.1286 | 6.1698 | 6.0617 | 6.0139 | 5.9934 | 5.9571 -7B (size) | 13.0G | 4.0G | 4.8G | 4.0G | 4.8G | 4.4G | 4.8G | 7.1G -7B (ms/tok @ 4th) | 128 | 56 | 61 | 84 | 91 | 91 | 95 | 75 -7B (ms/tok @ 8th) | 128 | 47 | 55 | 48 | 53 | 53 | 59 | 75 -7B (bpw) | 16.0 | 5.0 | 6.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 --- | -- | -- | -- | -- | -- | -- | -- | -- -13B (ppl) | 5.2455 | 5.3748 | 5.3471 | 5.3433 | 5.3234 | 5.2768 | 5.2582 | 5.2458 -13B (size) | 25.0G | 7.6G | 9.1G | 7.6G | 9.1G | 8.4G | 9.1G | 14G -13B (ms/tok @ 4th) | 239 | 104 | 113 | 160 | 175 | 176 | 185 | 141 -13B (ms/tok @ 8th) | 240 | 85 | 99 | 97 | 114 | 108 | 117 | 147 -13B (bpw) | 16.0 | 5.0 | 6.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 +| Model | Measure | F16 | Q4_0 | Q4_1 | Q4_2 | Q5_0 | Q5_1 | Q8_0 | +|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|-------:| +| 7B | perplexity | 5.9565 | 6.2103 | 6.1286 | 6.1698 | 6.0139 | 5.9934 | 5.9571 | +| 7B | file size | 13.0G | 4.0G | 4.8G | 4.0G | 4.4G | 4.8G | 7.1G | +| 7B | ms/tok @ 4th | 128 | 56 | 61 | 84 | 91 | 95 | 75 | +| 7B | ms/tok @ 8th | 128 | 47 | 55 | 48 | 53 | 59 | 75 | +| 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.0 | 5.5 | 6.0 | 9.0 | +| 13B | perplexity | 5.2455 | 5.3748 | 5.3471 | 5.3433 | 5.2768 | 5.2582 | 5.2458 | +| 13B | file size | 25.0G | 7.6G | 9.1G | 7.6G | 8.4G | 9.1G | 14G | +| 13B | ms/tok @ 4th | 239 | 104 | 113 | 160 | 176 | 185 | 141 | +| 13B | ms/tok @ 8th | 240 | 85 | 99 | 97 | 108 | 117 | 147 | +| 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.0 | 5.5 | 6.0 | 9.0 | ### Interactive mode diff --git a/SHA256SUMS b/SHA256SUMS index 87faa7f1b8587..e487bdca6c9c2 100644 --- a/SHA256SUMS +++ b/SHA256SUMS @@ -3,7 +3,6 @@ 99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6 models/7B/ggml-model-q4_0.bin cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe models/7B/ggml-model-q4_1.bin 25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496 models/7B/ggml-model-q4_2.bin -3429bf198ec771886cf81a574df45245f3ebf04f0ce0956b73ef5d0ab01ff48b models/7B/ggml-model-q4_3.bin 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth @@ -11,7 +10,6 @@ d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/con eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab models/13B/ggml-model-q4_0.bin d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb models/13B/ggml-model-q4_1.bin 75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa models/13B/ggml-model-q4_2.bin -4208cdec9788ffa48dc1a17af2c36a0299f5bf3eb0e2b87889dda7fad591fca3 models/13B/ggml-model-q4_3.bin 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth @@ -21,7 +19,6 @@ e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/con 517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d models/30B/ggml-model-q4_0.bin 7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd models/30B/ggml-model-q4_1.bin aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204 models/30B/ggml-model-q4_2.bin -a6188660199dbcb8d5658abe7d89169869e50423494385830d9e6b330ea7fc33 models/30B/ggml-model-q4_3.bin 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth @@ -35,6 +32,5 @@ d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/con 01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2 models/65B/ggml-model-q4_0.bin 4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f models/65B/ggml-model-q4_1.bin 1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9 models/65B/ggml-model-q4_2.bin -305e91a4608b4f627b9b8ad5b4af75187d2684254bfd76dcb9db571618ef293c models/65B/ggml-model-q4_3.bin 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model diff --git a/examples/common.cpp b/examples/common.cpp index 9f10dc268558b..6c712c713db9b 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #if defined (_WIN32) #include @@ -114,6 +116,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.temp = std::stof(argv[i]); + } else if (arg == "--tfs") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.tfs_z = std::stof(argv[i]); + } else if (arg == "--typical") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.typical_p = std::stof(argv[i]); } else if (arg == "--repeat_last_n") { if (++i >= argc) { invalid_param = true; @@ -126,6 +140,36 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.repeat_penalty = std::stof(argv[i]); + } else if (arg == "--frequency_penalty") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.frequency_penalty = std::stof(argv[i]); + } else if (arg == "--presence_penalty") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.presence_penalty = std::stof(argv[i]); + } else if (arg == "--mirostat") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.mirostat = std::stoi(argv[i]); + } else if (arg == "--mirostat_lr") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.mirostat_eta = std::stof(argv[i]); + } else if (arg == "--mirostat_ent") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.mirostat_tau = std::stof(argv[i]); } else if (arg == "-b" || arg == "--batch_size") { if (++i >= argc) { invalid_param = true; @@ -185,7 +229,28 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } else if (arg == "--perplexity") { params.perplexity = true; } else if (arg == "--ignore-eos") { - params.ignore_eos = true; + params.logit_bias[llama_token_eos()] = -INFINITY; + } else if (arg == "--no-penalize-nl") { + params.penalize_nl = false; + } else if (arg == "-l" || arg == "--logit-bias") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::stringstream ss(argv[i]); + llama_token key; + char sign; + std::string value_str; + try { + if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { + params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + } else { + throw std::exception(); + } + } catch (const std::exception &e) { + invalid_param = true; + break; + } } else if (arg == "--n_parts") { if (++i >= argc) { invalid_param = true; @@ -240,12 +305,26 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " -f FNAME, --file FNAME\n"); fprintf(stderr, " prompt file to start generation.\n"); fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict); - fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k); - fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", (double)params.top_p); - fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n); - fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty); + fprintf(stderr, " --top_k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); + fprintf(stderr, " --top_p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); + fprintf(stderr, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z); + fprintf(stderr, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p); + fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); + fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty); + fprintf(stderr, " --presence_penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty); + fprintf(stderr, " --frequency_penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty); + fprintf(stderr, " --mirostat N use Mirostat sampling.\n"); + fprintf(stderr, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); + fprintf(stderr, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat); + fprintf(stderr, " --mirostat_lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta); + fprintf(stderr, " --mirostat_ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau); + fprintf(stderr, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n"); + fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n"); + fprintf(stderr, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); + fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx); - fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n"); + fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); + fprintf(stderr, " --no-penalize-nl do not penalize newline token\n"); fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n"); fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp); fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n"); diff --git a/examples/common.h b/examples/common.h index 9d3697d793eff..fce1d42a9da70 100644 --- a/examples/common.h +++ b/examples/common.h @@ -8,6 +8,7 @@ #include #include #include +#include // // CLI argument parsing @@ -16,18 +17,26 @@ struct gpt_params { int32_t seed = -1; // RNG seed int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - int32_t n_predict = 128; // new tokens to predict - int32_t repeat_last_n = 64; // last n tokens to penalize + int32_t n_predict = -1; // new tokens to predict int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt // sampling parameters - int32_t top_k = 40; - float top_p = 0.95f; - float temp = 0.80f; - float repeat_penalty = 1.10f; + std::unordered_map logit_bias; // logit bias for specific tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typical_p = 1.00f; // 1.0 = disabled + float temp = 0.80f; // 1.0 = disabled + float repeat_penalty = 1.10f; // 1.0 = disabled + int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float frequency_penalty = 0.00f; // 0.0 = disabled + float presence_penalty = 0.00f; // 0.0 = disabled + int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate std::string model = "models/lamma-7B/ggml-model.bin"; // model path std::string prompt = ""; @@ -47,7 +56,7 @@ struct gpt_params { bool interactive_first = false; // wait for user input immediately bool instruct = false; // instruction mode (used for Alpaca models) - bool ignore_eos = false; // do not stop generating after eos + bool penalize_nl = true; // consider newlines as a repeatable token bool perplexity = false; // compute perplexity over the prompt bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory diff --git a/examples/main/main.cpp b/examples/main/main.cpp index fda65574fad7a..990d0fa023c63 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -276,8 +276,8 @@ int main(int argc, char ** argv) { fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); } } - fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", - params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); + fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", + params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); fprintf(stderr, "\n\n"); @@ -387,10 +387,19 @@ int main(int argc, char ** argv) { if ((int) embd_inp.size() <= n_consumed && !is_interacting) { // out of user input, sample next token - const int32_t top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - const float repeat_penalty = params.repeat_penalty; + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; + const float repeat_penalty = params.repeat_penalty; + const float alpha_presence = params.presence_penalty; + const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + const bool penalize_nl = params.penalize_nl; // optionally save the session on first sample (for faster prompt loading next time) if (!path_session.empty() && need_to_save_session) { @@ -402,14 +411,58 @@ int main(int argc, char ** argv) { { auto logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(ctx); - if (params.ignore_eos) { - logits[llama_token_eos()] = 0; + // Apply params.logit_bias map + for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { + logits[it->first] += it->second; } - id = llama_sample_top_p_top_k(ctx, - last_n_tokens.data() + n_ctx - params.repeat_last_n, - params.repeat_last_n, top_k, top_p, temp, repeat_penalty); + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // Apply penalties + float nl_logit = logits[llama_token_nl()]; + auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); + llama_sample_repetition_penalty(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, repeat_penalty); + llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, alpha_frequency, alpha_presence); + if (!penalize_nl) { + logits[llama_token_nl()] = nl_logit; + } + + if (temp <= 0) { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &candidates_p); + } else { + if (mirostat == 1) { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + llama_sample_top_k(ctx, &candidates_p, top_k); + llama_sample_tail_free(ctx, &candidates_p, tfs_z); + llama_sample_typical(ctx, &candidates_p, typical_p); + llama_sample_top_p(ctx, &candidates_p, top_p); + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token(ctx, &candidates_p); + } + } + // printf("`%d`", candidates_p.size); last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(id); diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 60966595e9561..dd175c690232e 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -9,7 +9,6 @@ static const std::map LLAMA_FTYPE_MAP = { {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0}, {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1}, {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2}, - {"q4_3", LLAMA_FTYPE_MOSTLY_Q4_3}, {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0}, {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1}, {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0}, diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 39aa7f82cae5c..07dfa2c74ed07 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -64,14 +64,15 @@ int main(int argc, char ** argv) { // first run printf("\n%s", params.prompt.c_str()); for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sample_top_p_top_k( - ctx, - &last_n_tokens_data.back() - params.repeat_last_n, - params.repeat_last_n, - 40, - 1.0, - 1.0, - 1.1); + auto logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(ctx); + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + auto next_token = llama_sample_token(ctx, &candidates_p); auto next_token_str = llama_token_to_str(ctx, next_token); last_n_tokens_data.push_back(next_token); printf("%s", next_token_str); @@ -106,14 +107,15 @@ int main(int argc, char ** argv) { // second run for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sample_top_p_top_k( - ctx2, - &last_n_tokens_data.back() - params.repeat_last_n, - params.repeat_last_n, - 40, - 1.0, - 1.0, - 1.1); + auto logits = llama_get_logits(ctx2); + auto n_vocab = llama_n_vocab(ctx2); + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + auto next_token = llama_sample_token(ctx2, &candidates_p); auto next_token_str = llama_token_to_str(ctx2, next_token); last_n_tokens_data.push_back(next_token); printf("%s", next_token_str); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 9e3d561256fea..b9d60db010077 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -33,14 +33,6 @@ typedef struct { } block_q4_2; static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); -#define QK4_3 16 -typedef struct { - __half d; // delta - __half m; // min - uint8_t qs[QK4_3 / 2]; // nibbles / quants -} block_q4_3; -static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding"); - #define QK5_0 32 typedef struct { __half d; // delta @@ -135,30 +127,6 @@ static __global__ void dequantize_block_q4_2(const void * vx, float * y) { } } -static __global__ void dequantize_block_q4_3(const void * vx, float * y) { - const block_q4_3 * x = (const block_q4_3 *) vx; - - const int i = blockIdx.x; - - const float d = x[i].d; - const float m = x[i].m; - - const uint8_t * pp = x[i].qs; - - for (int l = 0; l < QK4_3; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vi0 = vi & 0xf; - const int8_t vi1 = vi >> 4; - - const float v0 = vi0*d + m; - const float v1 = vi1*d + m; - - y[i*QK4_3 + l + 0] = v0; - y[i*QK4_3 + l + 1] = v1; - } -} - static __global__ void dequantize_block_q5_0(const void * vx, float * y) { const block_q5_0 * x = (const block_q5_0 *) vx; @@ -248,11 +216,6 @@ void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t st dequantize_block_q4_2<<>>(vx, y); } -void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream) { - const int nb = k / QK4_3; - dequantize_block_q4_3<<>>(vx, y); -} - void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK5_0; dequantize_block_q5_0<<>>(vx, y); @@ -268,6 +231,25 @@ void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t st dequantize_block_q8_0<<>>(vx, y); } +dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_row_q4_0_cuda; + case GGML_TYPE_Q4_1: + return dequantize_row_q4_1_cuda; + case GGML_TYPE_Q4_2: + return dequantize_row_q4_2_cuda; + case GGML_TYPE_Q5_0: + return dequantize_row_q5_0_cuda; + case GGML_TYPE_Q5_1: + return dequantize_row_q5_1_cuda; + case GGML_TYPE_Q8_0: + return dequantize_row_q8_0_cuda; + default: + return nullptr; + } +} + // buffer pool for cuda #define MAX_CUDA_BUFFERS 16 @@ -327,19 +309,61 @@ void ggml_cuda_pool_free(void * ptr, size_t size) { CUDA_CHECK(cudaFree(ptr)); } -cublasHandle_t g_cublasH = NULL; -cudaStream_t g_cudaStream = NULL; +cublasHandle_t g_cublasH = nullptr; +cudaStream_t g_cudaStream = nullptr; +cudaStream_t g_cudaStream2 = nullptr; +cudaEvent_t g_cudaEvent = nullptr; -void ggml_init_cublas(void) { - if (g_cublasH == NULL) { +void ggml_init_cublas() { + if (g_cublasH == nullptr) { // create cublas handle, bind a stream CUBLAS_CHECK(cublasCreate(&g_cublasH)); - CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking)); - CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream)); + // create additional stream and event for synchronization + CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream2, cudaStreamNonBlocking)); + CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvent, cudaEventDisableTiming)); + // configure logging to stdout // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL)); } } + +cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) { + const uint64_t ne0 = src->ne[0]; + const uint64_t ne1 = src->ne[1]; + const uint64_t nb0 = src->nb[0]; + const uint64_t nb1 = src->nb[1]; + const uint64_t nb2 = src->nb[2]; + const uint64_t nb3 = src->nb[3]; + const enum ggml_type type = src->type; + const size_t ts = ggml_type_size(type); + const size_t bs = ggml_blck_size(type); + + const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3); + if (nb0 == ts && nb1 == ts*ne0/bs) { + return cudaMemcpyAsync(dst, x, ne1*nb1, cudaMemcpyHostToDevice, stream); + } else if (nb0 == ts) { + return cudaMemcpy2DAsync(dst, ts*ne0/bs, x, nb1, ts*ne0/bs, ne1, cudaMemcpyHostToDevice, stream); + } else { + for (uint64_t i1 = 0; i1 < ne1; i1++) { + const void * rx = (const void *) ((const char *) x + i1*nb1); + void * rd = (void *) ((char *) dst + i1*ts*ne0/bs); + // pretend the row is a matrix with cols=1 + cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyHostToDevice, stream); + if (r != cudaSuccess) return r; + } + return cudaSuccess; + } +} + +void * ggml_cuda_host_malloc(size_t size) { + void * ptr; + CUDA_CHECK(cudaMallocHost((void **) &ptr, size)); + return ptr; +} + +void ggml_cuda_host_free(void * ptr) { + CUDA_CHECK(cudaFreeHost(ptr)); +} diff --git a/ggml-cuda.h b/ggml-cuda.h index 0ef8761e1d7f5..778995d372fbe 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -7,6 +7,8 @@ #define CUBLAS_OP_T HIPBLAS_OP_T #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS #define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F #define cublasCreate hipblasCreate #define cublasGemmEx hipblasGemmEx #define cublasHandle_t hipblasHandle_t @@ -14,34 +16,34 @@ #define cublasSetStream hipblasSetStream #define cublasSgemm hipblasSgemm #define cublasStatus_t hipblasStatus_t -#define CUDA_R_16F HIPBLAS_R_16F -#define CUDA_R_32F HIPBLAS_R_32F #define cudaDeviceSynchronize hipDeviceSynchronize #define cudaError_t hipError_t -#define cudaEvent_t hipEvent_t #define cudaEventCreateWithFlags hipEventCreateWithFlags #define cudaEventDisableTiming hipEventDisableTiming #define cudaEventRecord hipEventRecord +#define cudaEvent_t hipEvent_t #define cudaFree hipFree #define cudaFreeHost hipFreeHost #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError #define cudaMalloc hipMalloc #define cudaMallocHost hipMallocHost +#define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaStream_t hipStream_t #define cudaStreamCreateWithFlags hipStreamCreateWithFlags #define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamSynchronize hipStreamSynchronize #define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaStream_t hipStream_t #define cudaSuccess hipSuccess #define GGML_USE_CUBLAS #else #include #include #endif +#include "ggml.h" #ifdef __cplusplus extern "C" { @@ -67,20 +69,29 @@ extern "C" { } while (0) extern cublasHandle_t g_cublasH; -extern cudaStream_t g_cudaStream; +extern cudaStream_t g_cudaStream; +extern cudaStream_t g_cudaStream2; +extern cudaEvent_t g_cudaEvent; void ggml_init_cublas(void); +void * ggml_cuda_host_malloc(size_t size); +void ggml_cuda_host_free(void * ptr); + void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size); void ggml_cuda_pool_free(void * ptr, size_t size); void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream); void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream); void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream); -void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream); void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream); void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream); void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream); +cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream); + +typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); +dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(enum ggml_type type); + #ifdef __cplusplus } #endif diff --git a/ggml-opencl-dequant.cl b/ggml-opencl-dequant.cl index 191b2e57500ad..a65a79f4d6b58 100644 --- a/ggml-opencl-dequant.cl +++ b/ggml-opencl-dequant.cl @@ -60,25 +60,4 @@ __kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global f result[index + 1] = ((vi >> 4) - 8)*d; } -struct block_q4_3 -{ - ushort d; - ushort m; - uchar qs[8]; -}; - -__kernel void dequantize_row_q4_3(__global struct block_q4_3* blocks, __global float* result) { - const uint i = get_global_id(0) / 16; - const uint l = get_local_id(0); - - const float d = vload_half(0, (__global half*) &(blocks[i].d)); - const float m = vload_half(0, (__global half*) &(blocks[i].m)); - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*16 + l*2; - result[index + 0] = (vi & 0xf) * d + m; - result[index + 1] = (vi >> 4) * d + m; -} - ); diff --git a/ggml-opencl.c b/ggml-opencl.c index 1d68f19ee1e78..b748f86b7a37e 100644 --- a/ggml-opencl.c +++ b/ggml-opencl.c @@ -24,7 +24,7 @@ static cl_device_id device; static cl_context context; static cl_command_queue queue; static cl_program program; -static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q4_3; +static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2; static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c; static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0; @@ -97,8 +97,6 @@ void ggml_cl_init(void) { CL_CHECK(err, "clCreateKernel"); kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err); CL_CHECK(err, "clCreateKernel"); - kernel_q4_3 = clCreateKernel(program, "dequantize_row_q4_3", &err); - CL_CHECK(err, "clCreateKernel"); } static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) { @@ -150,12 +148,6 @@ void ggml_cl_sgemm_wrapper( local = 8; size_qb = global * (sizeof(short) + local) / 16; break; - case GGML_TYPE_Q4_3: - dequant = true; - kernel = kernel_q4_3; - local = 8; - size_qb = global * (sizeof(short) * 2 + local) / 16; - break; default: fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype); abort(); diff --git a/ggml.c b/ggml.c index d6350082f40d2..748ea8cd5e688 100644 --- a/ggml.c +++ b/ggml.c @@ -694,14 +694,6 @@ typedef struct { } block_q4_2; static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); -#define QK4_3 16 -typedef struct { - ggml_fp16_t d; // delta - ggml_fp16_t m; // min - uint8_t qs[QK4_3 / 2]; // nibbles / quants -} block_q4_3; -static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding"); - #define QK5_0 32 typedef struct { ggml_fp16_t d; // delta @@ -1291,49 +1283,6 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int quantize_row_q4_2_reference(x, y, k); } -static void quantize_row_q4_3_reference(const float * restrict x, block_q4_3 * restrict y, int k) { - assert(k % QK4_3 == 0); - const int nb = k / QK4_3; - - for (int i = 0; i < nb; i++) { - float min = FLT_MAX; - float max = -FLT_MAX; - - for (int l = 0; l < QK4_3; l++) { - const float v = x[i*QK4_3 + l]; - if (v < min) min = v; - if (v > max) max = v; - } - - const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - y[i].m = GGML_FP32_TO_FP16(min); - - for (int l = 0; l < QK4_3; l += 2) { - const float v0 = (x[i*QK4_3 + l + 0] - min)*id; - const float v1 = (x[i*QK4_3 + l + 1] - min)*id; - - const uint8_t vi0 = (int) (v0 + 0.5f); - const uint8_t vi1 = (int) (v1 + 0.5f); - - assert(vi0 < 16); - assert(vi1 < 16); - - y[i].qs[l/2] = vi0 | (vi1 << 4); - } - } -} - -static void quantize_row_q4_3(const float * restrict x, void * restrict vy, int k) { - assert(k % QK4_3 == 0); - - block_q4_3 * restrict y = vy; - - quantize_row_q4_3_reference(x, y, k); -} - static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { assert(k % QK5_0 == 0); const int nb = k / QK5_0; @@ -1917,36 +1866,6 @@ static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, in } } -static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, int k) { - assert(k % QK4_3 == 0); - const int nb = k / QK4_3; - - const block_q4_3 * restrict x = vx; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - const float m = GGML_FP16_TO_FP32(x[i].m); - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_3; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vi0 = vi & 0x0F; - const int8_t vi1 = vi >> 4; - - const float v0 = vi0*d + m; - const float v1 = vi1*d + m; - - y[i*QK4_3 + l + 0] = v0; - y[i*QK4_3 + l + 1] = v1; - - assert(!isnan(y[i*QK4_3 + l + 0])); - assert(!isnan(y[i*QK4_3 + l + 1])); - } - } -} - static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, int k) { assert(k % QK5_0 == 0); const int nb = k / QK5_0; @@ -2040,7 +1959,6 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); @@ -2070,14 +1988,6 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_q = ggml_vec_dot_q4_2_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, }, - [GGML_TYPE_Q4_3] = { - .dequantize_row_q = dequantize_row_q4_3, - .quantize_row_q = quantize_row_q4_3, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference, - .quantize_row_q_dot = quantize_row_q8_1, - .vec_dot_q = ggml_vec_dot_q4_3_q8_1, - .vec_dot_type = GGML_TYPE_Q8_1, - }, [GGML_TYPE_Q5_0] = { .dequantize_row_q = dequantize_row_q5_0, .quantize_row_q = quantize_row_q5_0, @@ -3171,136 +3081,6 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * #endif } -static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_1; - - assert(n % QK8_1 == 0); - assert(nb % 2 == 0); - assert(QK8_1 == 2*QK4_3); - - const block_q4_3 * restrict x = vx; - const block_q8_1 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - float summs0 = 0.0f; - float summs1 = 0.0f; - - for (int i = 0; i < nb; ++i) { - const block_q4_3 * restrict x0_0 = &x[2*(i + 0) + 0]; - const block_q4_3 * restrict x0_1 = &x[2*(i + 0) + 1]; - - const block_q8_1 * restrict y0 = &y[i + 0]; - - summs0 += GGML_FP16_TO_FP32(x0_0->m) * y0->s0; - summs1 += GGML_FP16_TO_FP32(x0_1->m) * y0->s1; - - const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs)); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, vdupq_n_u8(0x0F))); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - - // interleave - const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h); - const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - - const float x0_0d = GGML_FP16_TO_FP32(x0_0->d); - const float x0_1d = GGML_FP16_TO_FP32(x0_1->d); - -#if defined(__ARM_FEATURE_DOTPROD) - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), x0_0d*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), x0_1d*y0->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(pl0), x0_0d*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(ph0), x0_1d*y0->d); -#endif - } - - *s = vaddvq_f32(vaddq_f32(sumv0, sumv1)) + summs0 + summs1; -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - float summs = 0.0f; - - // Main loop - for (int i = 0; i < nb; i++) { - const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d)); - const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d)); - const __m256 dx = _mm256_set_m128(d1, d0); - - summs += GGML_FP16_TO_FP32(x[2*i + 0].m) * y[i].s0 - + GGML_FP16_TO_FP32(x[2*i + 1].m) * y[i].s1; - - const __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs); - const __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs); - const __m256i bx = _mm256_set_m128i(bx1, bx0); - - const __m256 dy = _mm256_broadcast_ss(&y[i].d); - const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); - } - - *s = hsum_float_8(acc) + summs; -#else - // scalar - float sumf = 0.0; - for (int i = 0; i < nb; i++) { - const uint8_t * restrict x0 = x[2*i + 0].qs; - const uint8_t * restrict x1 = x[2*i + 1].qs; - const int8_t * restrict y0 = y[i].qs; - - const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d); - const float m0 = GGML_FP16_TO_FP32(x[2*i + 0].m); - const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d); - const float m1 = GGML_FP16_TO_FP32(x[2*i + 1].m); - - int sxy_0 = 0; - int sxy_1 = 0; - - for (int j = 0; j < QK8_1/4; j++) { - const uint8_t v0 = x0[j]; - const uint8_t v1 = x1[j]; - - const int x0_0 = v0 & 0x0F; - const int x1_0 = v0 >> 4; - - const int x0_1 = v1 & 0x0F; - const int x1_1 = v1 >> 4; - - const int y0_0 = y0[2*j + 0]; - const int y1_0 = y0[2*j + 1]; - - const int y0_1 = y0[2*(j + QK8_1/4) + 0]; - const int y1_1 = y0[2*(j + QK8_1/4) + 1]; - - sxy_0 += x0_0*y0_0 + x1_0*y1_0; - sxy_1 += x0_1*y0_1 + x1_1*y1_1; - } - - sumf += (d0*sxy_0 + d1*sxy_1)*y[i].d + m0*y[i].s0 + m1*y[i].s1; - } - *s = sumf; -#endif -} - static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int nb = n / QK8_0; @@ -3925,7 +3705,6 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = QK4_0, [GGML_TYPE_Q4_1] = QK4_1, [GGML_TYPE_Q4_2] = QK4_2, - [GGML_TYPE_Q4_3] = QK4_3, [GGML_TYPE_Q5_0] = QK5_0, [GGML_TYPE_Q5_1] = QK5_1, [GGML_TYPE_Q8_0] = QK8_0, @@ -3942,7 +3721,6 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = sizeof(block_q4_0), [GGML_TYPE_Q4_1] = sizeof(block_q4_1), [GGML_TYPE_Q4_2] = sizeof(block_q4_2), - [GGML_TYPE_Q4_3] = sizeof(block_q4_3), [GGML_TYPE_Q5_0] = sizeof(block_q5_0), [GGML_TYPE_Q5_1] = sizeof(block_q5_1), [GGML_TYPE_Q8_0] = sizeof(block_q8_0), @@ -3960,7 +3738,6 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = "q4_0", [GGML_TYPE_Q4_1] = "q4_1", [GGML_TYPE_Q4_2] = "q4_2", - [GGML_TYPE_Q4_3] = "q4_3", [GGML_TYPE_Q5_0] = "q5_0", [GGML_TYPE_Q5_1] = "q5_1", [GGML_TYPE_Q8_0] = "q8_0", @@ -3977,7 +3754,6 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = true, [GGML_TYPE_Q4_1] = true, [GGML_TYPE_Q4_2] = true, - [GGML_TYPE_Q4_3] = true, [GGML_TYPE_Q5_0] = true, [GGML_TYPE_Q5_1] = true, [GGML_TYPE_Q8_0] = true, @@ -7230,7 +7006,6 @@ static void ggml_compute_forward_add( case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: - case GGML_TYPE_Q4_3: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -8155,8 +7930,12 @@ static bool ggml_compute_forward_mul_mat_use_blas( const int64_t ne1 = dst->ne[1]; // TODO: find the optimal values for these - if (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) { + if ( +#if !defined(GGML_USE_CUBLAS) + ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && +#endif + ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) { /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ return true; @@ -8254,7 +8033,7 @@ static void ggml_compute_forward_mul_mat_f32( #if defined(GGML_USE_CUBLAS) const float alpha = 1.0f; const float beta = 0.0f; - const int x_ne = ne01 * ne10; + const int x_ne = ne01 * ne00; const int y_ne = ne11 * ne10; const int d_ne = ne11 * ne01; @@ -8266,15 +8045,16 @@ static void ggml_compute_forward_mul_mat_f32( for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { +#if !defined(GGML_USE_CUBLAS) const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - +#endif float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); #if defined(GGML_USE_CUBLAS) // copy data to device - CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, g_cudaStream)); - CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream)); + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream)); + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream)); // compute CUBLAS_CHECK( @@ -8455,25 +8235,27 @@ static void ggml_compute_forward_mul_mat_f16_f32( } #if defined(GGML_USE_CUBLAS) - ggml_fp16_t * const wdata = params->wdata; - const float alpha = 1.0f; const float beta = 0.0f; - const int x_ne = ne01 * ne10; + const int x_ne = ne01 * ne00; const int y_ne = ne11 * ne10; const int d_ne = ne11 * ne01; size_t x_size, y_size, d_size; - float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); - float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); - float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); + ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); + ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); + float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); #else float * const wdata = params->wdata; #endif for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { #if defined(GGML_USE_CUBLAS) + // copy src0 while converting src1 + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream)); + // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16 + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 * ne02 + i02); { size_t id = 0; for (int64_t i01 = 0; i01 < ne11; ++i01) { @@ -8494,13 +8276,10 @@ static void ggml_compute_forward_mul_mat_f16_f32( #endif #if defined(GGML_USE_CUBLAS) - const ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + i02*nb02 + i03*nb03); const ggml_fp16_t * y = (ggml_fp16_t *) wdata; - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); // copy data to device - CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(ggml_fp16_t) * x_ne, cudaMemcpyHostToDevice, g_cudaStream)); CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream)); // compute @@ -8719,42 +8498,19 @@ static void ggml_compute_forward_mul_mat_q_f32( #if defined(GGML_USE_CUBLAS) const float alpha = 1.0f; const float beta = 0.0f; - const int x_ne = ne01 * ne10; + const int x_ne = ne01 * ne00; const int y_ne = ne11 * ne10; const int d_ne = ne11 * ne01; size_t x_size, y_size, d_size, q_size; - float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); - float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); - float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); - float *d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size); + float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); + float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); + float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); + void * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size); - void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream) = NULL; - if (type == GGML_TYPE_Q4_0) { - dequantize_row_q_cuda = dequantize_row_q4_0_cuda; - } - else if (type == GGML_TYPE_Q4_1) { - dequantize_row_q_cuda = dequantize_row_q4_1_cuda; - } - else if (type == GGML_TYPE_Q4_2) { - dequantize_row_q_cuda = dequantize_row_q4_2_cuda; - } - else if (type == GGML_TYPE_Q4_3) { - dequantize_row_q_cuda = dequantize_row_q4_3_cuda; - } - else if (type == GGML_TYPE_Q5_0) { - dequantize_row_q_cuda = dequantize_row_q5_0_cuda; - } - else if (type == GGML_TYPE_Q5_1) { - dequantize_row_q_cuda = dequantize_row_q5_1_cuda; - } - else if (type == GGML_TYPE_Q8_0) { - dequantize_row_q_cuda = dequantize_row_q8_0_cuda; - } - else { - GGML_ASSERT(false); - } -#elif !defined(GGML_USE_CLBLAST) + const dequantize_row_q_cuda_t dequantize_row_q_cuda = ggml_get_dequantize_row_q_cuda(type); + GGML_ASSERT(dequantize_row_q_cuda != NULL); +#else float * const wdata = params->wdata; dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; #endif @@ -8767,12 +8523,11 @@ static void ggml_compute_forward_mul_mat_q_f32( #if defined(GGML_USE_CUBLAS) // copy and dequantize on device - CUDA_CHECK( - cudaMemcpyAsync(d_Q, (char *) src0->data + i03*nb03 + i02*nb02, - GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], cudaMemcpyHostToDevice, g_cudaStream)); + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream2)); - dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream); + dequantize_row_q_cuda(d_Q, d_X, x_ne, g_cudaStream2); CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaEventRecord(g_cudaEvent, g_cudaStream2)); #elif defined(GGML_USE_CLBLAST) const void* x = (char *) src0->data + i03*nb03 + i02*nb02; #else @@ -8788,7 +8543,10 @@ static void ggml_compute_forward_mul_mat_q_f32( #if defined(GGML_USE_CUBLAS) // copy data to device - CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream)); + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream)); + + // wait for dequantization + CUDA_CHECK(cudaStreamWaitEvent(g_cudaStream, g_cudaEvent, 0)); // compute CUBLAS_CHECK( @@ -8913,7 +8671,6 @@ static void ggml_compute_forward_mul_mat( case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: - case GGML_TYPE_Q4_3: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -9145,7 +8902,6 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: - case GGML_TYPE_Q4_3: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -9471,7 +9227,6 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: - case GGML_TYPE_Q4_3: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -11816,7 +11571,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*MAX(ggml_nelements(node->src1), ggml_nelements(node->src0)); //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]); //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]); //printf("cur = %zu\n", cur); @@ -11828,6 +11583,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) #endif } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { cur = 0; +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) + if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + node->n_tasks = 1; + } +#endif } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { @@ -13087,29 +12847,6 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * return (n/QK4_2*sizeof(block_q4_2)); } -size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK4_3 == 0); - const int nb = k / QK4_3; - - for (int j = 0; j < n; j += k) { - block_q4_3 * restrict y = (block_q4_3 *)dst + j/QK4_3; - - quantize_row_q4_3_reference(src + j, y, k); - - for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK4_3; l += 2) { - const uint8_t vi0 = y[i].qs[l/2] & 0x0F; - const uint8_t vi1 = y[i].qs[l/2] >> 4; - - hist[vi0]++; - hist[vi1]++; - } - } - } - - return (n/QK4_3*sizeof(block_q4_3)); -} - size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK5_0 == 0); const int nb = k / QK5_0; @@ -13212,12 +12949,6 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i block_q4_2 * block = (block_q4_2*)dst + start / QK4_2; result = ggml_quantize_q4_2(src + start, block, n, n, hist); } break; - case GGML_TYPE_Q4_3: - { - GGML_ASSERT(start % QK4_3 == 0); - block_q4_3 * block = (block_q4_3*)dst + start / QK4_3; - result = ggml_quantize_q4_3(src + start, block, n, n, hist); - } break; case GGML_TYPE_Q5_0: { GGML_ASSERT(start % QK5_0 == 0); diff --git a/ggml.h b/ggml.h index 540901f15a7f1..38ae9a6eeeb71 100644 --- a/ggml.h +++ b/ggml.h @@ -221,7 +221,7 @@ extern "C" { GGML_TYPE_Q4_0 = 2, GGML_TYPE_Q4_1 = 3, GGML_TYPE_Q4_2 = 4, - GGML_TYPE_Q4_3 = 5, + // GGML_TYPE_Q4_3 (5) support has been removed GGML_TYPE_Q5_0 = 6, GGML_TYPE_Q5_1 = 7, GGML_TYPE_Q8_0 = 8, @@ -843,7 +843,6 @@ extern "C" { GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); diff --git a/llama.cpp b/llama.cpp index dca017db62503..1032fb9fa9363 100644 --- a/llama.cpp +++ b/llama.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #define LLAMA_USE_SCRATCH #define LLAMA_MAX_SCRATCH_BUFFERS 16 @@ -136,7 +137,7 @@ struct llama_kv_cache { struct ggml_context * ctx = NULL; - llama_buffer buf; + llama_ctx_buffer buf; int n; // number of tokens currently in the cache @@ -167,7 +168,7 @@ struct llama_model { struct llama_kv_cache kv_self; // the model memory buffer - llama_buffer buf; + llama_ctx_buffer buf; // model memory mapped file std::unique_ptr mapping; @@ -228,8 +229,8 @@ struct llama_context { // memory buffers used to evaluate the model // TODO: move in llama_state - llama_buffer buf_compute; - llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + llama_ctx_buffer buf_compute; + llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; @@ -483,7 +484,6 @@ struct llama_file_loader { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: - case GGML_TYPE_Q4_3: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -560,7 +560,6 @@ struct llama_file_saver { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: - case GGML_TYPE_Q4_3: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -853,7 +852,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: return "mostly Q4_1, some F16"; case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2"; - case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3"; case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; @@ -1478,109 +1476,402 @@ static std::vector llama_tokenize(const llama_vocab & vocab, co // sampling // -static void sample_top_k(std::vector> & logits_id, int top_k) { - // find the top k tokens - std::partial_sort( - logits_id.begin(), - logits_id.begin() + top_k, logits_id.end(), - [](const std::pair & a, const std::pair & b) { - return a.first > b.first; - }); +void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) { + assert(candidates->size > 0); + + const int64_t t_start_sample_us = ggml_time_us(); - logits_id.resize(top_k); + // Sort the logits in descending order + if (!candidates->sorted) { + std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }); + candidates->sorted = true; + } + + float max_l = candidates->data[0].logit; + float cum_sum = 0.0f; + for (size_t i = 0; i < candidates->size; ++i) { + float p = expf(candidates->data[i].logit - max_l); + candidates->data[i].p = p; + cum_sum += p; + } + for (size_t i = 0; i < candidates->size; ++i) { + candidates->data[i].p /= cum_sum; + } + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } } -static llama_vocab::id llama_sample_top_p_top_k( - llama_context & lctx, - const std::vector & last_n_tokens, - int top_k, - float top_p, - float temp, - float repeat_penalty) { - auto & rng = lctx.rng; - - const int n_logits = lctx.model.hparams.n_vocab; - - const auto & logits = lctx.logits; - const auto * plogits = logits.data() + logits.size() - n_logits; - - if (temp <= 0) { - // select the token with the highest logit directly - float max_logit = plogits[0]; - llama_vocab::id max_id = 0; - - for (int i = 1; i < n_logits; ++i) { - if (plogits[i] > max_logit) { - max_logit = plogits[i]; - max_id = i; - } +void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) { + const int64_t t_start_sample_us = ggml_time_us(); + + k = std::max(k, (int) min_keep); + k = std::min(k, (int) candidates->size); + + // Sort scores in descending order + if (!candidates->sorted) { + auto comp = [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }; + if (k == (int) candidates->size) { + std::sort(candidates->data, candidates->data + candidates->size, comp); + } else { + std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp); } - return max_id; + candidates->sorted = true; } + candidates->size = k; - std::vector> logits_id; - logits_id.reserve(n_logits); + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} - { - const float scale = 1.0f/temp; - for (int i = 0; i < n_logits; ++i) { - // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) - // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main - if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) { - // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if (plogits[i] < 0.0f) { - logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i)); - } else { - logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i)); - } - } else { - logits_id.push_back(std::make_pair(plogits[i]*scale, i)); - } +void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { + if (p >= 1.0f) { + return; + } + + const int64_t t_start_sample_us = ggml_time_us(); + + llama_sample_softmax(ctx, candidates); + + // Compute the cumulative probabilities + float cum_sum = 0.0f; + size_t last_idx = candidates->size; + + for (size_t i = 0; i < candidates->size; ++i) { + cum_sum += candidates->data[i].p; + + // Check if the running sum is greater than p or if we have kept at least min_keep tokens + if (cum_sum > p && i >= min_keep) { + last_idx = i; + break; } } - sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits); + // Resize the output vector to keep only the top-p tokens + candidates->size = last_idx; - // compute probs for the top k tokens - std::vector probs; - probs.reserve(logits_id.size()); + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} - float maxl = logits_id[0].first; - double sum = 0.0; - for (const auto & kv : logits_id) { - const float p = expf(kv.first - maxl); - probs.push_back(p); - sum += p; +void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) { + if (z >= 1.0f || candidates->size <= 2) { + return; } - // normalize the probs - for (auto & p : probs) { - p /= sum; + const int64_t t_start_sample_us = ggml_time_us(); + + llama_sample_softmax(nullptr, candidates); + + // Compute the first and second derivatives + std::vector first_derivatives(candidates->size - 1); + std::vector second_derivatives(candidates->size - 2); + + for (size_t i = 0; i < first_derivatives.size(); ++i) { + first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p; + } + for (size_t i = 0; i < second_derivatives.size(); ++i) { + second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1]; } - if (top_p < 1.0) { - double cumsum = 0.0; - for (int i = 0; i < (int) probs.size(); i++) { - cumsum += probs[i]; - if (cumsum >= top_p) { - probs.resize(i + 1); - logits_id.resize(i + 1); - break; - } + // Calculate absolute value of second derivatives + for (size_t i = 0; i < second_derivatives.size(); ++i) { + second_derivatives[i] = abs(second_derivatives[i]); + } + + // Normalize the second derivatives + float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f); + for (float & value : second_derivatives) { + value /= second_derivatives_sum; + } + + float cum_sum = 0.0f; + size_t last_idx = candidates->size; + for (size_t i = 0; i < second_derivatives.size(); ++i) { + cum_sum += second_derivatives[i]; + + // Check if the running sum is greater than z or if we have kept at least min_keep tokens + if (cum_sum > z && i >= min_keep) { + last_idx = i; + break; } } - //printf("\n"); - //for (int i = 0; i < (int) 10; i++) { - // printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]); - //} - //printf("\n\n"); - //exit(0); + // Resize the output vector to keep only the tokens above the tail location + candidates->size = last_idx; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + + +void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { + // Reference implementation: + // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr + if (p >= 1.0f) { + return; + } + + const int64_t t_start_sample_us = ggml_time_us(); + + // Compute the softmax of logits and calculate entropy + llama_sample_softmax(nullptr, candidates); + + float entropy = 0.0f; + for (size_t i = 0; i < candidates->size; ++i) { + entropy += -candidates->data[i].p * logf(candidates->data[i].p); + } + + // Compute the absolute difference between negative log probability and entropy for each candidate + std::vector shifted_scores; + for (size_t i = 0; i < candidates->size; ++i) { + float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy); + shifted_scores.push_back(shifted_score); + } + + // Sort tokens based on the shifted_scores and their corresponding indices + std::vector indices(candidates->size); + std::iota(indices.begin(), indices.end(), 0); + + std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) { + return shifted_scores[a] < shifted_scores[b]; + }); + + // Compute the cumulative probabilities + float cum_sum = 0.0f; + size_t last_idx = indices.size(); + + for (size_t i = 0; i < indices.size(); ++i) { + size_t idx = indices[i]; + cum_sum += candidates->data[idx].p; + + // Check if the running sum is greater than typical or if we have kept at least min_keep tokens + if (cum_sum > p && i >= min_keep - 1) { + last_idx = i + 1; + break; + } + } + + // Resize the output vector to keep only the locally typical tokens + std::vector new_candidates; + for (size_t i = 0; i < last_idx; ++i) { + size_t idx = indices[i]; + new_candidates.push_back(candidates->data[idx]); + } + + // Replace the data in candidates with the new_candidates data + std::copy(new_candidates.begin(), new_candidates.end(), candidates->data); + candidates->size = new_candidates.size(); + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + +void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { + const int64_t t_start_sample_us = ggml_time_us(); + + for (size_t i = 0; i < candidates_p->size; ++i) { + candidates_p->data[i].logit /= temp; + } + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + +void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty) { + if (last_tokens_size == 0 || penalty == 1.0f) { + return; + } + + const int64_t t_start_sample_us = ggml_time_us(); + + for (size_t i = 0; i < candidates->size; ++i) { + auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id); + if (token_iter == last_tokens + last_tokens_size) { + continue; + } + + // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong. + // This is common fix for this problem, which is to multiply by the penalty instead of dividing. + if (candidates->data[i].logit <= 0) { + candidates->data[i].logit *= penalty; + } else { + candidates->data[i].logit /= penalty; + } + } + + candidates->sorted = false; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + +void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) { + if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) { + return; + } + + const int64_t t_start_sample_us = ggml_time_us(); + + // Create a frequency map to count occurrences of each token in last_tokens + std::unordered_map token_count; + for (size_t i = 0; i < last_tokens_size; ++i) { + token_count[last_tokens_p[i]]++; + } + + // Apply frequency and presence penalties to the candidates + for (size_t i = 0; i < candidates->size; ++i) { + auto token_iter = token_count.find(candidates->data[i].id); + if (token_iter == token_count.end()) { + continue; + } + + int count = token_iter->second; + candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence; + } + + candidates->sorted = false; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} + + +llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) { + assert(ctx); + auto N = float(llama_n_vocab(ctx)); + int64_t t_start_sample_us; + t_start_sample_us = ggml_time_us(); + + llama_sample_softmax(nullptr, candidates); + + // Estimate s_hat using the most probable m tokens + float s_hat = 0.0; + float sum_ti_bi = 0.0; + float sum_ti_sq = 0.0; + for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) { + float t_i = logf(float(i + 2) / float(i + 1)); + float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p); + sum_ti_bi += t_i * b_i; + sum_ti_sq += t_i * t_i; + } + s_hat = sum_ti_bi / sum_ti_sq; + + // Compute k from the estimated s_hat and target surprise value + float epsilon_hat = s_hat - 1; + float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat); + + // Sample the next word X using top-k sampling + llama_sample_top_k(nullptr, candidates, int(k)); + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } + llama_token X = llama_sample_token(ctx, candidates); + t_start_sample_us = ggml_time_us(); + + // Compute error as the difference between observed surprise and target surprise value + size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { + return candidate.id == X; + })); + float observed_surprise = -log2f(candidates->data[X_idx].p); + float e = observed_surprise - tau; + + // Update mu using the learning rate and error + *mu = *mu - eta * e; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->n_sample++; + } + return X; +} + +llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) { + assert(ctx); + int64_t t_start_sample_us; + t_start_sample_us = ggml_time_us(); + + llama_sample_softmax(ctx, candidates); + + // Truncate the words with surprise values greater than mu + candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { + return -log2f(candidate.p) > *mu; + })); + + // Normalize the probabilities of the remaining words + llama_sample_softmax(ctx, candidates); + + // Sample the next word X from the remaining words + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } + llama_token X = llama_sample_token(ctx, candidates); + t_start_sample_us = ggml_time_us(); + + // Compute error as the difference between observed surprise and target surprise value + size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { + return candidate.id == X; + })); + float observed_surprise = -log2f(candidates->data[X_idx].p); + float e = observed_surprise - tau; + + // Update mu using the learning rate and error + *mu = *mu - eta * e; + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } + return X; +} + +llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) { + const int64_t t_start_sample_us = ggml_time_us(); + + // Find max element + auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) { + return a.logit < b.logit; + }); + + llama_token result = max_iter->id; + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->n_sample++; + } + return result; +} + +llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) { + assert(ctx); + const int64_t t_start_sample_us = ggml_time_us(); + llama_sample_softmax(nullptr, candidates); + + std::vector probs; + probs.reserve(candidates->size); + for (size_t i = 0; i < candidates->size; ++i) { + probs.push_back(candidates->data[i].p); + } std::discrete_distribution<> dist(probs.begin(), probs.end()); + auto & rng = ctx->rng; int idx = dist(rng); - return logits_id[idx].second; + llama_token result = candidates->data[idx].id; + + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->n_sample++; + return result; } // @@ -1593,7 +1884,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break; - case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break; case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break; case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break; case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break; @@ -2352,33 +2642,8 @@ llama_token llama_token_eos() { return 2; } -llama_token llama_sample_top_p_top_k( - llama_context * ctx, - const llama_token * last_n_tokens_data, - int last_n_tokens_size, - int top_k, - float top_p, - float temp, - float repeat_penalty) { - const int64_t t_start_sample_us = ggml_time_us(); - - llama_token result = 0; - - // TODO: avoid this ... - const auto last_n_tokens = std::vector(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size); - - result = llama_sample_top_p_top_k( - *ctx, - last_n_tokens, - top_k, - top_p, - temp, - repeat_penalty); - - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - ctx->n_sample++; - - return result; +llama_token llama_token_nl() { + return 13; } diff --git a/llama.h b/llama.h index 86a7d279a9ef4..34a8f5b3ca52c 100644 --- a/llama.h +++ b/llama.h @@ -39,12 +39,16 @@ extern "C" { typedef struct llama_token_data { llama_token id; // token id - + float logit; // log-odds of the token float p; // probability of the token - float plog; // log probability of the token - } llama_token_data; + typedef struct llama_token_data_array { + llama_token_data * data; + size_t size; + bool sorted; + } llama_token_data_array; + typedef void (*llama_progress_callback)(float progress, void *ctx); struct llama_context_params { @@ -73,7 +77,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors + // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors @@ -181,16 +185,52 @@ extern "C" { // Special tokens LLAMA_API llama_token llama_token_bos(); LLAMA_API llama_token llama_token_eos(); + LLAMA_API llama_token llama_token_nl(); + + // Sampling functions + + /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. + LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty); + + /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. + LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); + + /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. + LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); + + /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 + LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1); + + /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 + LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1); + + /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. + LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1); + + /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. + LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1); + LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); + + /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. + /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. + /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. + /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. + /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. + /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. + LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); + + /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. + /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. + /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. + /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. + /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. + LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); + + /// @details Selects the token with the highest probability. + LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); - // TODO: improve the last_n_tokens interface ? - LLAMA_API llama_token llama_sample_top_p_top_k( - struct llama_context * ctx, - const llama_token * last_n_tokens_data, - int last_n_tokens_size, - int top_k, - float top_p, - float temp, - float repeat_penalty); + /// @details Randomly selects a token from the candidates based on their probabilities. + LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); // Performance information LLAMA_API void llama_print_timings(struct llama_context * ctx); diff --git a/llama_util.h b/llama_util.h index acb207e653c10..6e66d12a8041c 100755 --- a/llama_util.h +++ b/llama_util.h @@ -405,4 +405,30 @@ struct llama_buffer { delete[] addr; } }; + +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +struct llama_ctx_buffer { + uint8_t * addr = NULL; + size_t size = 0; + + void resize(size_t size) { + if (addr) { + ggml_cuda_host_free(addr); + } + addr = (uint8_t *) ggml_cuda_host_malloc(size); + this->size = size; + } + + ~llama_ctx_buffer() { + if (addr) { + ggml_cuda_host_free(addr); + } + } +}; +#else +typedef llama_buffer llama_ctx_buffer; +#endif + + #endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 81eadbc4db0a4..645648585ab3d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -8,4 +8,5 @@ endfunction() # llama_add_test(test-double-float.c) # SLOW llama_add_test(test-quantize-fns.cpp) llama_add_test(test-quantize-perf.cpp) +llama_add_test(test-sampling.cpp) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp new file mode 100644 index 0000000000000..7eee4f6d3a645 --- /dev/null +++ b/tests/test-sampling.cpp @@ -0,0 +1,199 @@ +#include "llama.h" +#include "ggml.h" +#include +#include +#include +#include +#include +#include +#include + + +void dump(const llama_token_data_array * candidates) { + for (size_t i = 0; i < candidates->size; i++) { + printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit); + } +} + +#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0) + + +void test_top_k(const std::vector & probs, + const std::vector & expected_probs, + int k) { + size_t n_vocab = probs.size(); + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { + float logit = log(probs[token_id]); + candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + llama_sample_softmax(nullptr, &candidates_p); + DUMP(&candidates_p); + llama_sample_top_k(nullptr, &candidates_p, k); + DUMP(&candidates_p); + + assert(candidates_p.size == expected_probs.size()); + for (size_t i = 0; i < candidates_p.size; i++) { + assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5); + } +} + + +void test_top_p(const std::vector & probs, + const std::vector & expected_probs, + float p) { + + size_t n_vocab = probs.size(); + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { + float logit = log(probs[token_id]); + candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + llama_sample_softmax(nullptr, &candidates_p); + DUMP(&candidates_p); + llama_sample_top_p(nullptr, &candidates_p, p); + DUMP(&candidates_p); + + assert(candidates_p.size == expected_probs.size()); + for (size_t i = 0; i < candidates_p.size; i++) { + assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); + } +} + + +void test_tfs(const std::vector & probs, + const std::vector & expected_probs, + float z) { + size_t n_vocab = probs.size(); + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { + float logit = log(probs[token_id]); + candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + DUMP(&candidates_p); + llama_sample_tail_free(nullptr, &candidates_p, z); + DUMP(&candidates_p); + + assert(candidates_p.size == expected_probs.size()); + for (size_t i = 0; i < candidates_p.size; i++) { + assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); + } +} + + +void test_typical(const std::vector & probs, + const std::vector & expected_probs, + float p) { + size_t n_vocab = probs.size(); + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { + float logit = log(probs[token_id]); + candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + DUMP(&candidates_p); + llama_sample_typical(nullptr, &candidates_p, p); + DUMP(&candidates_p); + + assert(candidates_p.size == expected_probs.size()); + for (size_t i = 0; i < candidates_p.size; i++) { + assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); + } +} + + +void test_repetition_penalty( + const std::vector & probs, + const std::vector & last_tokens, + const std::vector & expected_probs, + float penalty) { + assert(probs.size() == expected_probs.size()); + + size_t n_vocab = probs.size(); + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { + float logit = log(probs[token_id]); + candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + llama_sample_softmax(nullptr, &candidates_p); + DUMP(&candidates_p); + llama_sample_repetition_penalty(nullptr, &candidates_p, (llama_token *)last_tokens.data(), last_tokens.size(), penalty); + llama_sample_softmax(nullptr, &candidates_p); + DUMP(&candidates_p); + + assert(candidates_p.size == expected_probs.size()); + for (size_t i = 0; i < candidates_p.size; i++) { + assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-6); + } +} + + +void test_frequency_presence_penalty( + const std::vector & probs, + const std::vector & last_tokens, + const std::vector & expected_probs, + float alpha_frequency, float alpha_presence) { + assert(probs.size() == expected_probs.size()); + + size_t n_vocab = probs.size(); + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { + float logit = log(probs[token_id]); + candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + llama_sample_softmax(nullptr, &candidates_p); + // DUMP(&candidates_p); + llama_sample_frequency_and_presence_penalties(nullptr, &candidates_p, (llama_token *)last_tokens.data(), last_tokens.size(), alpha_frequency, alpha_presence); + llama_sample_softmax(nullptr, &candidates_p); + // DUMP(&candidates_p); + + assert(candidates_p.size == expected_probs.size()); + for (size_t i = 0; i < candidates_p.size; i++) { + assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); + } +} + +int main(void) { + ggml_time_init(); + + test_top_k({0.1, 0.2, 0.3, 0.4}, {0.4}, 1); + test_top_k({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2}, 3); + + test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4}, 0); + test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3}, 0.7); + test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2, 0.1}, 1); + + test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3}, 0.25); + test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3, 0.25}, 0.75); + test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3, 0.25}, 0.99); + + test_typical({0.97, 0.01, 0.01, 0.01}, {0.97}, 0.5); + test_typical({0.4, 0.2, 0.2, 0.2}, {0.2, 0.2, 0.2}, 0.5); + + test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0}, {0.25, 0.25, 0.25, 0.25, 0}, 50.0); + test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2}, {0.5, 0.5, 0, 0, 0}, 50.0); + test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2, 0, 0}, {0.5, 0.5, 0, 0, 0}, 50.0); + + test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0}, {0.249997, 0.249997, 0.249997, 0.249997, 0.000011}, 5.0, 5.0); + test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2}, {0.499966, 0.499966, 0.000023, 0.000023, 0.000023}, 5.0, 5.0); + test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2, 0, 0}, {0.499977, 0.499977, 0.000023, 0.000023, 0.000000}, 5.0, 5.0); + + printf("OK\n"); +}