@@ -1,7 +1,9 @@
+#include "arg.h"
+#include "log.h"
 #include "ggml.h"
 #include "llama.h"
 #include "common.h"
-#include "llama-vocab.h"
+#include "../src/llama-vocab.h"

 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
@@ -18,55 +20,49 @@
 #include <vector>

 static void print_usage(int, char ** argv) {
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s -m model.gguf -c 8192 -b 2048 -ub 512\n", argv[0]);
-    LOG_TEE("\n");
+    LOG_INF("\nexample usage:\n");
+    LOG_INF("\n    %s -m model.gguf -c 8192 -b 2048 -ub 512\n", argv[0]);
+    LOG_INF("\n");
 }

 int main(int argc, char ** argv) {
+    common_params params;

-    gpt_params params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }

-    // init LLM
+    common_init();

+    // init LLM
     llama_backend_init();
     llama_numa_init(params.numa);

     // initialize the model
+    common_init_result llama_init = common_init_from_params(params);

-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();

-    if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+    if (model == nullptr || ctx == nullptr) {
+        LOG_ERR("%s: failed to init\n", __func__);
         return 1;
     }

-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
-
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-
-    if (ctx == NULL) {
-        fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
-        return 1;
+    // print system information
+    {
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+        LOG_INF("\n");
     }

     const unsigned int n_kv_max = llama_n_ctx(ctx);

+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    llama_token bos = vocab->token_bos();
+    const unsigned int n_vocab = llama_vocab_n_tokens(vocab);

-    const llama_vocab * vocab = llama_get_vocab(ctx);
-    llama_token bos = llama_token_bos_impl(*vocab);
-    // llama_token eos = llama_token_eos_impl(*vocab);
-
-    const unsigned int n_vocab = llama_n_vocab(model);
-
-    // decode in batches of ctx_params.n_batch tokens
+    // decode in batches of n_batch tokens
     auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
         for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
@@ -83,7 +79,7 @@ int main(int argc, char ** argv) {

             const int ret = llama_decode(ctx, batch_view);
             if (ret != 0) {
-                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+                LOG_INF("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
                 return false;
             }

@@ -96,64 +92,66 @@ int main(int argc, char ** argv) {
     const unsigned int pp = params.n_ubatch;
     const unsigned int tg = params.n_ubatch / 4;

-    if (!params.sweep_bench_output_jsonl) {
-        LOG_TEE("\n");
-        LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
-        LOG_TEE("\n");
-        LOG_TEE("|%6s | %6s | %6s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s");
-        LOG_TEE("|%6s-|-%6s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "------", "--------", "--------", "--------", "--------");
-    }
+    const unsigned int n_threads = params.cpuparams.n_threads;
+    const unsigned int n_threads_batch = params.cpuparams_batch.n_threads;
+    const int32_t n_batch = llama_n_batch(ctx);
+
+    LOG_INF("\n");
+    LOG_INF("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, n_threads, n_threads_batch);
+    LOG_INF("\n");
+    LOG_INF("|%6s | %6s | %6s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s");
+    LOG_INF("|%6s-|-%6s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "------", "--------", "--------", "--------", "--------");

     llama_batch batch = llama_batch_init(n_kv_max, 0, 1);

     // warm up
     {
-        llama_batch_add(batch, bos, 0, { 0 }, false);
+        common_batch_add(batch, bos, 0, { 0 }, false);

-        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
+        if (!decode_helper(ctx, batch, n_batch)) {
+            LOG_INF("%s: llama_decode() failed\n", __func__);
             return 1;
         }
     }

-    llama_batch_clear(batch);
-    llama_kv_cache_clear(ctx);
+    common_batch_clear(batch);
+    llama_kv_self_clear(ctx);

     for (unsigned int n_kv = 0; n_kv < n_kv_max; n_kv += params.n_ubatch) {
         // clean up KV cache before generation
-        llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);
+        llama_kv_self_seq_rm(ctx, 0, n_kv, -1);

         // first measure token generation performance at this context size
         const auto t_tg_start = ggml_time_us();

         for (unsigned int i = 0; i < tg; ++i) {
-            llama_batch_clear(batch);
-            llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, true);
+            common_batch_clear(batch);
+            common_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, true);

-            if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                LOG_TEE("%s: llama_decode() failed\n", __func__);
+            if (!decode_helper(ctx, batch, n_batch)) {
+                LOG_INF("%s: llama_decode() failed\n", __func__);
                 return 1;
             }
         }

         const auto t_tg_end = ggml_time_us();

         // clean up KV cache after generation
-        llama_kv_cache_seq_rm(ctx, 0, n_kv, -1);
+        llama_kv_self_seq_rm(ctx, 0, n_kv, -1);

         // prepare batch of pp size for prompt processing performance measurement
-        llama_batch_clear(batch);
+        common_batch_clear(batch);

         for (unsigned int i = 0; i < pp; ++i) {
-            llama_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, false);
+            common_batch_add(batch, std::rand() % n_vocab, n_kv + i, { 0 }, false);
         }
         batch.logits[batch.n_tokens - 1] = true;

         // measure prompt processing performance
         const auto t_pp_start = ggml_time_us();

-        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
+        if (!decode_helper(ctx, batch, n_batch)) {
+            LOG_INF("%s: llama_decode() failed\n", __func__);
             return 1;
         }

@@ -166,23 +164,9 @@ int main(int argc, char ** argv) {
         const float speed_pp = pp / t_pp;
         const float speed_tg = tg / t_tg;

-        if (params.sweep_bench_output_jsonl) {
-            LOG_TEE(
-                "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
-                "\"pp\": %d, \"tg\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f }\n",
-                n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
-                pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg
-            );
-        } else {
-            LOG_TEE("|%6d | %6d | %6d | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg);
-        }
+        LOG_INF("|%6d | %6d | %6d | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, n_kv, t_pp, speed_pp, t_tg, speed_tg);
     }

-    llama_batch_free(batch);
-
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();

     return 0;