@@ -224,6 +224,7 @@ bool llama_batch_allocr::init(
             /*.seq_idx      =*/ this->seq_idx.data(),
             /*.output       =*/ batch.logits,
             /*.data         =*/ {},
+            /*.kv_position_of_token =*/ {},
         };

         ubatch_print(ubatch, debug);
@@ -256,36 +257,38 @@ bool llama_batch_allocr::init(
             continue;
         }

-        const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
-
-        if (p0 >= 0) {
-            bool ok = true;
-
-            if (batch.token) {
-                if (seq_pos_min(s) != p0 + 1) {
-                    ok = false;
-                }
-            } else {
-                assert(batch.embd);
-
-                // for embeddings (typically used as vision input), we allow them to have repeating positions
-                // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
-                if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
-                    ok = false;
-                }
-            }
-
-            if (!ok) {
-                LLAMA_LOG_ERROR(
-                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
-                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
-                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
-                        " it is required that the sequence positions remain consecutive: Y = X + 1\n",
-                        __func__, s, s, p0, s, seq_pos_min(s));
-
-                return false;
-            }
-        }
+        // @fmayran: these checks don't make sense for models that use a position encoding such as Qwen VL's, because the positions stored in the KV cache can jump around (they are not even always increasing).
+        // allowing repeated positions is not enough: within an image embedding, arbitrary position jumps are expected.
+        // const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
+        //
+        // if (p0 >= 0) {
+        //     bool ok = true;
+        //
+        //     if (batch.token) {
+        //         if (seq_pos_min(s) != p0 + 1) {
+        //             ok = false;
+        //         }
+        //     } else {
+        //         assert(batch.embd);
+        //
+        //         // for embeddings (typically used as vision input), we allow them to have repeating positions
+        //         // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
+        //         if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
+        //             ok = false;
+        //         }
+        //     }
+        //
+        //     if (!ok) {
+        //         LLAMA_LOG_ERROR(
+        //             "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+        //             " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+        //             " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+        //             " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+        //             __func__, s, s, p0, s, seq_pos_min(s));
+        //
+        //         return false;
+        //     }
+        // }

         if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
             LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
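For reference, the guard that this hunk comments out can be restated as a small standalone predicate. The sketch below is illustration only: the helper name seq_positions_consecutive and the free-function form are not part of llama.cpp (the real logic lives inline in llama_batch_allocr::init), and the local llama_pos alias simply mirrors the int32_t typedef from llama.h.

    // sketch only -- restates the disabled check, not code from the PR
    #include <cstdint>

    using llama_pos = int32_t; // mirrors the llama_pos typedef in llama.h

    // last_kv_pos  : memory->seq_pos_max(s), the last position stored in the KV cache (X), or -1 if empty
    // batch_min_pos: seq_pos_min(s), the smallest position of sequence s in the incoming batch (Y)
    // is_embd      : true when the batch carries embeddings (e.g. vision input) instead of tokens
    static bool seq_positions_consecutive(llama_pos last_kv_pos, llama_pos batch_min_pos, bool is_embd) {
        if (last_kv_pos < 0) {
            return true; // nothing stored for this sequence yet, any starting position is accepted
        }
        if (is_embd) {
            // embeddings were allowed to repeat the last position (see issue #13694 referenced above)
            return batch_min_pos == last_kv_pos || batch_min_pos == last_kv_pos + 1;
        }
        // plain tokens had to continue exactly where the cache left off: Y = X + 1
        return batch_min_pos == last_kv_pos + 1;
    }

Under this restatement, a token batch must satisfy Y = X + 1 and an embedding batch may at most repeat X; the Qwen VL-style position jumps described in the new comment satisfy neither branch, which is why the check is disabled rather than relaxed further.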
@@ -369,36 +372,38 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t

    auto udata = std::make_shared<llama_ubatch::data_t>();

-    udata->token     .resize(n_tokens);
-    udata->embd      .clear();
-    udata->pos       .resize(n_tokens);
-    udata->n_seq_id  .resize(n_tokens);
-    udata->seq_id    .resize(n_tokens);
-    udata->seq_id_unq.resize(0);
-    udata->seq_idx   .resize(LLAMA_MAX_SEQ, -1);
-    udata->output    .resize(n_tokens);
+    udata->token               .resize(n_tokens);
+    udata->embd                .clear();
+    udata->pos                 .resize(n_tokens);
+    udata->n_seq_id            .resize(n_tokens);
+    udata->seq_id              .resize(n_tokens);
+    udata->seq_id_unq          .resize(0);
+    udata->seq_idx             .resize(LLAMA_MAX_SEQ, -1);
+    udata->output              .resize(n_tokens);
+    udata->kv_position_of_token.resize(n_tokens, -1);

     for (uint32_t s = 0; s < n_seqs; ++s) {
         udata->seq_idx[s] = s;
         udata->seq_id_unq.push_back(s);
     }

     llama_ubatch res {
-        /*.b_equal_seqs =*/ true,
-        /*.n_tokens     =*/ n_tokens,
-        /*.n_seq_tokens =*/ n_seq_tokens,
-        /*.n_seqs       =*/ n_seqs,
-        /*.n_seqs_unq   =*/ n_seqs,
-
-        /*.token        =*/ udata->token.data(),
-        /*.embd         =*/ nullptr,
-        /*.pos          =*/ udata->pos.data(),
-        /*.n_seq_id     =*/ udata->n_seq_id.data(),
-        /*.seq_id       =*/ udata->seq_id.data(),
-        /*.seq_id_unq   =*/ udata->seq_id_unq.data(),
-        /*.seq_idx      =*/ udata->seq_idx.data(),
-        /*.output       =*/ udata->output.data(),
-        /*.data         =*/ std::move(udata),
+        /*.b_equal_seqs         =*/ true,
+        /*.n_tokens             =*/ n_tokens,
+        /*.n_seq_tokens         =*/ n_seq_tokens,
+        /*.n_seqs               =*/ n_seqs,
+        /*.n_seqs_unq           =*/ n_seqs,
+
+        /*.token                =*/ udata->token.data(),
+        /*.embd                 =*/ nullptr,
+        /*.pos                  =*/ udata->pos.data(),
+        /*.n_seq_id             =*/ udata->n_seq_id.data(),
+        /*.seq_id               =*/ udata->seq_id.data(),
+        /*.seq_id_unq           =*/ udata->seq_id_unq.data(),
+        /*.seq_idx              =*/ udata->seq_idx.data(),
+        /*.output               =*/ udata->output.data(),
+        /*.kv_position_of_token =*/ udata->kv_position_of_token.data(),
+        /*.data                 =*/ std::move(udata),
     };

     return res;
@@ -660,14 +665,15 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
     const int64_t n_pos_all  = (int64_t) n_tokens*n_pos_cur;

-    udata->token     .resize(n_tokens);
-    udata->embd      .resize(n_embd_all);
-    udata->pos       .resize(n_pos_all);
-    udata->n_seq_id  .resize(n_tokens);
-    udata->seq_id    .resize(n_tokens);
-    udata->seq_id_unq.resize(0);
-    udata->seq_idx   .resize(LLAMA_MAX_SEQ, -1);
-    udata->output    .resize(n_tokens);
+    udata->token               .resize(n_tokens);
+    udata->embd                .resize(n_embd_all);
+    udata->pos                 .resize(n_pos_all);
+    udata->n_seq_id            .resize(n_tokens);
+    udata->seq_id              .resize(n_tokens);
+    udata->seq_id_unq          .resize(0);
+    udata->seq_idx             .resize(LLAMA_MAX_SEQ, -1);
+    udata->output              .resize(n_tokens);
+    udata->kv_position_of_token.resize(n_tokens, -1);

     seq_set_t seq_set_unq;

@@ -705,21 +711,23 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
     }

     llama_ubatch res {
-        /*.b_equal_seqs =*/ equal_seqs,
-        /*.n_tokens     =*/ n_tokens,
-        /*.n_seq_tokens =*/ n_tokens/n_seqs,
-        /*.n_seqs       =*/ n_seqs,
-        /*.n_seqs_unq   =*/ (uint32_t) udata->seq_id_unq.size(),
-
-        /*.token        =*/ batch.token ? udata->token.data() : nullptr,
-        /*.embd         =*/ batch.embd ? udata->embd.data() : nullptr,
-        /*.pos          =*/ udata->pos.data(),
-        /*.n_seq_id     =*/ udata->n_seq_id.data(),
-        /*.seq_id       =*/ udata->seq_id.data(),
-        /*.seq_id_unq   =*/ udata->seq_id_unq.data(),
-        /*.seq_idx      =*/ udata->seq_idx.data(),
-        /*.output       =*/ udata->output.data(),
-        /*.data         =*/ std::move(udata),
+        /*.b_equal_seqs         =*/ equal_seqs,
+        /*.n_tokens             =*/ n_tokens,
+        /*.n_seq_tokens         =*/ n_tokens/n_seqs,
+        /*.n_seqs               =*/ n_seqs,
+        /*.n_seqs_unq           =*/ (uint32_t) udata->seq_id_unq.size(),
+
+        /*.token                =*/ batch.token ? udata->token.data() : nullptr,
+        /*.embd                 =*/ batch.embd ? udata->embd.data() : nullptr,
+        /*.pos                  =*/ udata->pos.data(),
+        /*.n_seq_id             =*/ udata->n_seq_id.data(),
+        /*.seq_id               =*/ udata->seq_id.data(),
+        /*.seq_id_unq           =*/ udata->seq_id_unq.data(),
+        /*.seq_idx              =*/ udata->seq_idx.data(),
+        /*.output               =*/ udata->output.data(),
+        /*.kv_position_of_token =*/ udata->kv_position_of_token.data(),
+        /*.data                 =*/ std::move(udata),
+
     };

     if (debug > 0) {
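The recurring change in the hunks above is the new per-token kv_position_of_token array: it is sized like the other per-token arrays, defaulted to -1, and exposed through a raw pointer in llama_ubatch next to the owning vector in llama_ubatch::data_t. The sketch below mirrors that ownership pattern with simplified stand-in types (ubatch_data, ubatch_view, make_view are illustration names, not llama.cpp identifiers); reading -1 as "no KV cache position assigned to this token yet" is an assumption, not something stated in this excerpt.

    // self-contained illustration of the pattern ubatch_reserve()/ubatch_add() extend
    #include <cstdint>
    #include <cstdio>
    #include <memory>
    #include <vector>

    using llama_pos = int32_t; // mirrors the llama_pos typedef in llama.h

    struct ubatch_data {
        std::vector<int32_t>   token;
        std::vector<llama_pos> pos;
        std::vector<llama_pos> kv_position_of_token; // new per-token array added by the PR
    };

    struct ubatch_view {
        uint32_t    n_tokens;
        int32_t   * token;
        llama_pos * pos;
        llama_pos * kv_position_of_token;  // same length as token/pos, -1 = unassigned (assumed meaning)
        std::shared_ptr<ubatch_data> data; // keeps the vectors alive while the view is in use
    };

    static ubatch_view make_view(uint32_t n_tokens) {
        auto udata = std::make_shared<ubatch_data>();

        udata->token               .resize(n_tokens);
        udata->pos                 .resize(n_tokens);
        udata->kv_position_of_token.resize(n_tokens, -1); // same default the diff uses

        // the .data() pointers are taken before the shared_ptr is moved into the view,
        // mirroring the initializer order in the real code
        ubatch_view res {
            /*.n_tokens             =*/ n_tokens,
            /*.token                =*/ udata->token.data(),
            /*.pos                  =*/ udata->pos.data(),
            /*.kv_position_of_token =*/ udata->kv_position_of_token.data(),
            /*.data                 =*/ std::move(udata),
        };

        return res;
    }

    int main() {
        ubatch_view ub = make_view(4);

        // a memory module would overwrite the -1 entries once it decides where each token
        // lands in the KV cache; with Qwen VL-style encodings those positions may jump,
        // so they are recorded per token rather than inferred from consecutiveness
        ub.kv_position_of_token[0] = 128;

        for (uint32_t i = 0; i < ub.n_tokens; ++i) {
            std::printf("token %u: kv position %d\n", i, ub.kv_position_of_token[i]);
        }

        return 0;
    }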