Skip to content

Commit

Permalink
main : add option to save full output to session (ggerganov#1338)
Browse files Browse the repository at this point in the history
* main : add option to save full output to session

* split behavior into --session and --prompt-cache

* restore original implementation with new names

* PR comments

* move the check for incompatible parameters to gpt_params_parse

* Fix whitespace

Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>

---------

Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
  • Loading branch information
ejones and DannyDaemonic authored May 10, 2023
1 parent e6a46b0 commit cf348a6
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 18 deletions.
17 changes: 14 additions & 3 deletions examples/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.prompt = argv[i];
} else if (arg == "-e") {
escape_prompt = true;
} else if (arg == "--session") {
} else if (arg == "--prompt-cache") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.path_session = argv[i];
params.path_prompt_cache = argv[i];
} else if (arg == "--prompt-cache-all") {
params.prompt_cache_all = true;
} else if (arg == "-f" || arg == "--file") {
if (++i >= argc) {
invalid_param = true;
Expand Down Expand Up @@ -342,6 +344,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
gpt_print_usage(argc, argv, default_params);
exit(1);
}
if (params.prompt_cache_all &&
(params.interactive || params.interactive_first ||
params.instruct || params.antiprompt.size())) {
fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
gpt_print_usage(argc, argv, default_params);
exit(1);
}
if (escape_prompt) {
process_escapes(params.prompt);
}
Expand All @@ -367,7 +376,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: empty)\n");
fprintf(stderr, " -e process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
fprintf(stderr, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
fprintf(stderr, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
fprintf(stderr, " not supported with --interactive or other interactive options\n");
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
Expand Down
7 changes: 4 additions & 3 deletions examples/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ struct gpt_params {

std::string model = "models/lamma-7B/ggml-model.bin"; // model path
std::string prompt = "";
std::string path_session = ""; // path to file for saving/loading model eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

std::string lora_adapter = ""; // lora adapter path
Expand All @@ -58,6 +58,7 @@ struct gpt_params {
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool prompt_cache_all = false; // save user input and generations to prompt cache

bool embedding = false; // get only sentence embedding
bool interactive_first = false; // wait for user input immediately
Expand Down
4 changes: 2 additions & 2 deletions examples/main/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -270,9 +270,9 @@ These options help improve the performance and memory usage of the LLaMA models.

- `-b N, --batch_size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.

### Session Caching
### Prompt Caching

- `--session FNAME`: Specify a file to load/save the session, which caches the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The session file is created during the first run and is reused in subsequent runs. If you change your prompt such that 75% or less of the session is reusable, the existing session file will be overwritten with a new, updated version to maintain optimal performance.
- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs.

### Quantization

Expand Down
20 changes: 10 additions & 10 deletions examples/main/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ int main(int argc, char ** argv) {
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');

std::string path_session = params.path_session;
std::string path_session = params.path_prompt_cache;
std::vector<llama_token> session_tokens;

if (!path_session.empty()) {
Expand Down Expand Up @@ -292,14 +292,9 @@ int main(int argc, char ** argv) {
is_interacting = params.interactive_first;
}

bool is_antiprompt = false;
bool input_echo = true;

// HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
// if we loaded a session with at least 75% similarity. It's currently just used to speed up the
// initial prompt so it doesn't need to be an exact match.
bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);

bool is_antiprompt = false;
bool input_echo = true;
bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size();

int n_past = 0;
int n_remain = params.n_predict;
Expand Down Expand Up @@ -328,7 +323,7 @@ int main(int argc, char ** argv) {
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());

// stop saving session if we run out of context
path_session = "";
path_session.clear();

//printf("\n---\n");
//printf("resetting: '");
Expand Down Expand Up @@ -603,6 +598,11 @@ int main(int argc, char ** argv) {
}
}

if (!path_session.empty() && params.prompt_cache_all) {
fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
}

llama_print_timings(ctx);
llama_free(ctx);

Expand Down

0 comments on commit cf348a6

Please sign in to comment.