@@ -1428,7 +1428,7 @@ struct sql_printer : public printer {
     }
 };
 
-static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
+static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
     const llama_model * model = llama_get_model(ctx);
@@ -1444,14 +1444,14 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
         for (int i = 1; i < n_tokens; i++) {
             tokens[i] = std::rand() % n_vocab;
         }
-        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
+        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
         n_processed += n_tokens;
     }
 
     llama_synchronize(ctx);
 }
 
-static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
+static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
     const llama_model * model = llama_get_model(ctx);
@@ -1460,7 +1460,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads)
     llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
 
     for (int i = 0; i < n_gen; i++) {
-        llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
+        llama_decode(ctx, llama_batch_get_one(&token, 1));
         llama_synchronize(ctx);
         token = std::rand() % n_vocab;
     }
@@ -1596,13 +1596,13 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
             }
             // test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
-            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+            test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             if (params.progress) {
                 fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
             }
-            test_gen(ctx, 1, 0, t.n_threads);
+            test_gen(ctx, 1, t.n_threads);
         }
 
         for (int i = 0; i < params.reps; i++) {
@@ -1614,13 +1614,13 @@ int main(int argc, char ** argv) {
                 if (params.progress) {
                     fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
                 }
-                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+                test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
             }
             if (t.n_gen > 0) {
                 if (params.progress) {
                     fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
                 }
-                test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
+                test_gen(ctx, t.n_gen, t.n_threads);
             }
 
             uint64_t t_ns = get_time_ns() - t_start;
0 commit comments