I may be doing something wrong or misunderstanding the purpose of the kv_cache API, but I believe the recent PR #685 by @chrfalch, which added the ability to get/set the kv_cache, is still insufficient to restore the state of the model, even when external model state such as last_n_tokens_data and n_past is also reset.
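Conceptually, the complete state I believe should be enough to rewind generation is the kv_cache plus that external sampling state. A rough sketch of what I'm bundling together (`SavedState` is my own helper name, not part of the API):

```cpp
// SavedState is my own helper, not part of llama.h: it just bundles the
// pieces of state the example below saves and restores.
#include "llama.h"   // llama_token
#include <cstdint>
#include <vector>

struct SavedState {
    std::vector<uint8_t>     kv_cache;             // bytes from llama_get_kv_cache
    int                      kv_cache_token_count; // from llama_get_kv_cache_token_count
    int                      n_past;               // position passed to llama_eval
    std::vector<llama_token> last_n_tokens;        // repeat-penalty window
};
```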
Here is a minimal, self-contained example:
```cpp
#include "llama.h"

#include <iostream>
#include <vector>

using namespace std;

int main() {
    // init
    auto params = llama_context_default_params();
    auto ctx = llama_init_from_file("../../models/ggml-model.bin", params);
    auto tokens = vector<llama_token>(params.n_ctx);
    auto prompt = "The quick brown fox";
    auto n_tokens = llama_tokenize(ctx, prompt, tokens.data(), tokens.size(), true);

    // evaluate prompt
    llama_eval(ctx, tokens.data(), n_tokens, 0, 12);

    auto last_n_tokens_size = 64;
    auto last_n_tokens_data = vector<llama_token>(last_n_tokens_size, 0);
    last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_tokens);
    auto n_past = n_tokens;

    // save state: kv_cache plus the external sampling state
    auto kv_cache_size = llama_get_kv_cache_size(ctx);
    auto kv_cache_token_count = llama_get_kv_cache_token_count(ctx);
    auto kv_cache = llama_get_kv_cache(ctx);
    auto kv_cache_copy = vector<uint8_t>(kv_cache, kv_cache + kv_cache_size);
    auto n_past_copy = n_past;
    auto last_n_tokens_data_copy = vector<llama_token>(last_n_tokens_data);

    // first run
    cout << prompt;
    for (auto i = 0; i < 6; i++) {
        // top_k = 1, top_p = 1.0, temp = 0.0, repeat_penalty = 1.1,
        // i.e. effectively greedy, so both runs should be deterministic
        auto next_token = llama_sample_top_p_top_k(
            ctx,
            last_n_tokens_data.data() + last_n_tokens_data.size() - n_past,
            last_n_tokens_size,
            1,
            1.0,
            0.0,
            1.1);
        auto next_token_str = llama_token_to_str(ctx, next_token);
        last_n_tokens_data.push_back(next_token);
        cout << next_token_str;
        llama_eval(ctx, &next_token, 1, n_past, 12);
        n_past += 1;
    }
    cout << endl;

    // restore state
    llama_set_kv_cache(ctx, kv_cache_copy.data(), kv_cache_size, kv_cache_token_count);
    last_n_tokens_data = last_n_tokens_data_copy;
    n_past = n_past_copy;

    // second run: identical to the first, so it should reproduce the same text
    cout << prompt;
    for (auto i = 0; i < 6; i++) {
        auto next_token = llama_sample_top_p_top_k(
            ctx,
            last_n_tokens_data.data() + last_n_tokens_data.size() - n_past,
            last_n_tokens_size,
            1,
            1.0,
            0.0,
            1.1);
        auto next_token_str = llama_token_to_str(ctx, next_token);
        last_n_tokens_data.push_back(next_token);
        cout << next_token_str;
        llama_eval(ctx, &next_token, 1, n_past, 12);
        n_past += 1;
    }
    cout << endl;

    return 0;
}
```
I'd expect the following output:

```
The quick brown fox jumps over the lazy dog
The quick brown fox jumps over the lazy dog
```

But instead I get:

```
The quick brown fox jumps over the lazy dog
The quick brown fox.
The quick brown fo
```
This implies the model is still generating from the end of the first run.
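My guess is that the logits produced by the last llama_eval of the first run live outside the kv_cache, so the first llama_sample_top_p_top_k after the restore still samples from them. If that's the case, one possible workaround (an untested sketch, reusing the variables from the example above) would be to re-evaluate the last prompt token at its original position right after restoring the cache, so the logits for the saved position get regenerated:

```cpp
// Untested sketch: after restoring the kv_cache, re-run the final prompt
// token at its original position. This overwrites the same kv_cache slot
// with the same values and, assuming logits are the piece of state the
// kv_cache API misses, leaves the context with logits matching the saved
// position again.
llama_set_kv_cache(ctx, kv_cache_copy.data(), kv_cache_size, kv_cache_token_count);
last_n_tokens_data = last_n_tokens_data_copy;
n_past = n_past_copy;
llama_eval(ctx, &tokens[n_past - 1], 1, n_past - 1, 12);
```

Even if that works, it seems like the get/set kv_cache API should capture whatever extra state this re-evaluation is recreating.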