From a8777ad84e00cda0399e827cdf971e2c3fab1da2 Mon Sep 17 00:00:00 2001 From: pudepiedj Date: Fri, 6 Oct 2023 14:16:38 +0100 Subject: [PATCH] parallel : add option to load external prompt file (#3416) * Enable external file and add datestamp * Add name of external file at end * Upload ToK2024 * Delete ToK2024.txt * Experiments with jeopardy * Move ParallelQuestions to /prompts and rename * Interim commit * Interim commit * Final revision * Remove trailing whitespace * remove cmake_all.sh * Remove cmake_all.sh * Changed .gitignore * Improved reporting and new question files. * Corrected typo * More LLM questions * Update LLM-questions.txt * Yet more LLM-questions * Remove jeopardy results file * Reinstate original jeopardy.sh * Update examples/parallel/parallel.cpp --------- Co-authored-by: Georgi Gerganov --- common/common.cpp | 2 ++ common/common.h | 1 + examples/jeopardy/README.md | 2 +- examples/parallel/parallel.cpp | 56 +++++++++++++++++++++++++++++++--- llama.cpp | 10 +++--- prompts/LLM-questions.txt | 49 +++++++++++++++++++++++++++++ prompts/parallel-questions.txt | 42 +++++++++++++++++++++++++ 7 files changed, 151 insertions(+), 11 deletions(-) create mode 100644 prompts/LLM-questions.txt create mode 100644 prompts/parallel-questions.txt diff --git a/common/common.cpp b/common/common.cpp index 186f5b26807d8..60b00b5fbb8f1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -167,6 +167,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } + // store the external file name in params + params.prompt_file = argv[i]; std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt)); if (params.prompt.back() == '\n') { params.prompt.pop_back(); diff --git a/common/common.h b/common/common.h index e095c56e309c2..c802152791797 100644 --- a/common/common.h +++ b/common/common.h @@ -79,6 +79,7 @@ struct gpt_params { std::string model_draft = ""; // draft model for speculative 
decoding std::string model_alias = "unknown"; // model alias std::string prompt = ""; + std::string prompt_file = ""; // store the external prompt file name std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state std::string input_prefix = ""; // string to prefix user inputs with std::string input_suffix = ""; // string to suffix user inputs with diff --git a/examples/jeopardy/README.md b/examples/jeopardy/README.md index 4c42e3cdbf526..ffa13cbf349b2 100644 --- a/examples/jeopardy/README.md +++ b/examples/jeopardy/README.md @@ -2,7 +2,7 @@ This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer. -The jeopardy test can be used to compare the fact knowledge of different models and compare them to eachother. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc. +The jeopardy test can be used to compare the fact knowledge of different models and compare them to each other. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc. Step 1: Open jeopardy.sh and modify the following: diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index ffd7b1db4abdd..721888da7de94 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -10,6 +10,7 @@ #include #include #include +#include <ctime> // trim whitespace from the beginning and end of a string static std::string trim(const std::string & str) { @@ -70,6 +71,26 @@ struct client { std::vector tokens_prev; }; +static void print_date_time() { + std::time_t current_time = std::time(nullptr); + std::tm* local_time = std::localtime(&current_time); + char buffer[80]; + strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time); + + printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer); +} + +// Define a split string function to ... 
+static std::vector<std::string> split_string(const std::string& input, char delimiter) { + std::vector<std::string> tokens; + std::istringstream stream(input); + std::string token; + while (std::getline(stream, token, delimiter)) { + tokens.push_back(token); + } + return tokens; +} + int main(int argc, char ** argv) { srand(1234); @@ -104,6 +125,23 @@ int main(int argc, char ** argv) { params.logits_all = true; std::tie(model, ctx) = llama_init_from_gpt_params(params); + // load the prompts from an external file if there are any + if (params.prompt.empty()) { + printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n"); + } else { + // Output each line of the input params.prompts vector and copy to k_prompts + int index = 0; + printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str()); + + std::vector<std::string> prompts = split_string(params.prompt, '\n'); + for (const auto& prompt : prompts) { + k_prompts.resize(index + 1); + k_prompts[index] = prompt; + index++; + printf("%3d prompt: %s\n", index, prompt.c_str()); + } + } + fprintf(stderr, "\n\n"); fflush(stderr); @@ -233,7 +271,7 @@ int main(int argc, char ** argv) { client.n_decoded = 0; client.i_batch = batch.n_tokens - 1; - LOG_TEE("\033[1mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id); + LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id); g_seq_id += 1; @@ -336,8 +374,8 @@ int main(int argc, char ** argv) { const auto t_main_end = ggml_time_us(); - LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\nResponse: %s\n\n", - client.id, client.seq_id, client.n_prompt, client.n_decoded, + LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n", + client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded, 
(t_main_end - client.t_start_prompt) / 1e6, (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6, n_cache_miss, @@ -357,13 +395,21 @@ int main(int argc, char ** argv) { const auto t_main_end = ggml_time_us(); - LOG_TEE("\n\n"); + print_date_time(); + + LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); + if (params.prompt_file.empty()) { + params.prompt_file = "used built-in defaults"; + } + LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str()); + LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str()); + LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6); LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6); LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6); LOG_TEE("Cache misses: %6d\n", n_cache_miss); - LOG_TEE("\n\n"); + LOG_TEE("\n"); llama_print_timings(ctx); diff --git a/llama.cpp b/llama.cpp index 56413f3a241eb..1a7d37b8dec47 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8219,14 +8219,14 @@ void llama_print_timings(struct llama_context * ctx) { const llama_timings timings = llama_get_timings(ctx); LLAMA_LOG_INFO("\n"); - LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms); - LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms); + LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample); - LLAMA_LOG_INFO("%s: 
prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval); - LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval); - LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms)); + LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms)); } void llama_reset_timings(struct llama_context * ctx) { diff --git a/prompts/LLM-questions.txt b/prompts/LLM-questions.txt new file mode 100644 index 0000000000000..fdf3d52f4416a --- /dev/null +++ b/prompts/LLM-questions.txt @@ -0,0 +1,49 @@ +In the context of LLMs, what is "Attention"? +In the context of LLMs, what is a completion? +In the context of LLMs, what is a prompt? +In the context of LLMs, what is GELU? +In the context of LLMs, what is RELU? +In the context of LLMs, what is softmax? +In the context of LLMs, what is decoding? +In the context of LLMs, what is encoding? +In the context of LLMs, what is tokenizing? +In the context of LLMs, what is an embedding? +In the context of LLMs, what is quantization? +In the context of LLMs, what is a tensor? +In the context of LLMs, what is a sparse tensor? +In the context of LLMs, what is a vector? +In the context of LLMs, how is attention implemented? +In the context of LLMs, why is attention all you need? +In the context of LLMs, what is "RoPe" and what is it used for? +In the context of LLMs, what is "LoRA" and what is it used for? 
+In the context of LLMs, what are weights? +In the context of LLMs, what are biases? +In the context of LLMs, what are checkpoints? +In the context of LLMs, what is "perplexity"? +In the context of LLMs, what are models? +In the context of machine-learning, what is "catastrophic forgetting"? +In the context of machine-learning, what is "elastic weight consolidation (EWC)"? +In the context of neural nets, what is a hidden layer? +In the context of neural nets, what is a convolution? +In the context of neural nets, what is dropout? +In the context of neural nets, what is cross-entropy? +In the context of neural nets, what is over-fitting? +In the context of neural nets, what is under-fitting? +What is the difference between an interpreted computer language and a compiled computer language? +In the context of software development, what is a debugger? +When processing using a GPU, what is off-loading? +When processing using a GPU, what is a batch? +When processing using a GPU, what is a block? +When processing using a GPU, what is the difference between a batch and a block? +When processing using a GPU, what is a scratch tensor? +When processing using a GPU, what is a layer? +When processing using a GPU, what is a cache? +When processing using a GPU, what is unified memory? +When processing using a GPU, what is VRAM? +When processing using a GPU, what is a kernel? +When processing using a GPU, what is "metal"? +In the context of LLMs, what are "Zero-Shot", "One-Shot" and "Few-Shot" learning models? +In the context of LLMs, what is the "Transformer-model" architecture? +In the context of LLMs, what is "Multi-Head Attention"? +In the context of LLMs, what is "Self-Attention"? +In the context of transformer-model architectures, how do attention mechanisms use masks? 
\ No newline at end of file diff --git a/prompts/parallel-questions.txt b/prompts/parallel-questions.txt new file mode 100644 index 0000000000000..0ef9d889330fd --- /dev/null +++ b/prompts/parallel-questions.txt @@ -0,0 +1,42 @@ +What do you know about Hobbits? +What is quantum field theory? +Why did the chicken cross the road? +Who is the president of the United States? +How do I run CMake on MacOS? +Do you agree that C++ is a really finicky language compared with Python3? +Is it a good idea to invest in technology? +Do you like Wagner's Ring? +Do you think this file input option is really neat? +What should we all do about climate change? +Is time-travel possible within the laws of current physics? +Is it like anything to be a bat? +Once the chicken has crossed the road, does it try to go back? +Who is the greatest of all musical composers? +What is art? +Is there life elsewhere in the universe? +What is intelligence? +What is the difference between knowledge and intelligence? +Will religion ever die? +Do we understand ourselves? +What is the best way to cook eggs? +If you cannot see things, on what basis do you evaluate them? +Explain the role of the np junction in photovoltaic cells? +Is professional sport a good or bad influence on human behaviour? +Is capital punishment immoral? +Should we care about other people? +Who are you? +Which sense would you surrender if you could? +Was Henry Ford a hero or a villain? +Do we need leaders? +What is nucleosynthesis? +Who is the greatest scientist of all time? +Who first observed what came to be known as the photovoltaic effect? +What is nuclear fusion and why does it release energy? +Can you know that you exist? +What is an exoplanet? +Do you like cream? +What is the difference? +Can I know that I exist while I'm dreaming that I'm Descartes? +Who said "I didn't know I thought that until I heard myself saying it"? +Does anything really matter? +Can you explain the unreasonable effectiveness of mathematics? 
\ No newline at end of file