Commit 5b189f5

Support RoPE position info in batch prefill/decode kernels
This PR adds q/k position information to the batch prefill/decode kernels. Specifically, the kernels now accept two additional arrays:

* `q_rope_position`, with shape `(total_q_len,)`, giving the in-sequence position of each entry in the input q.
* `k_rope_pos_offset`, with shape `(num_sequence,)`, giving the start position of each sequence in k.

These two arrays allow RoPE to be computed on the fly in multi-level cases. Tests `test_batch_prefill` and `test_batch_decode` pass. Performance has not been validated yet; per discussion with Zihao, this change is unlikely to incur a significant perf regression.
1 parent 08aee43 commit 5b189f5
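
To make the semantics of the two new arrays concrete, here is a minimal host-side sketch (our illustration, not code from this commit) that fills them for a batch where request `i` already has `prefix_len[i]` tokens cached in k and appends `append_len[i]` new query tokens; `prefix_len` and `append_len` are hypothetical names:

```cpp
// Illustrative sketch only: fill q_rope_position and k_rope_pos_offset on the
// host, assuming request i has prefix_len[i] keys already cached and appends
// append_len[i] new query tokens.
#include <cstdint>
#include <vector>

void fill_rope_position_arrays(const std::vector<int32_t>& prefix_len,
                               const std::vector<int32_t>& append_len,
                               std::vector<int32_t>* q_rope_position,    // (total_q_len,)
                               std::vector<int32_t>* k_rope_pos_offset)  // (num_sequence,)
{
  q_rope_position->clear();
  k_rope_pos_offset->clear();
  for (size_t i = 0; i < prefix_len.size(); ++i) {
    // The keys held at this level of the cache start at position prefix_len[i]
    // of the full sequence (0 if this level holds the whole sequence).
    k_rope_pos_offset->push_back(prefix_len[i]);
    // Each query token carries its absolute in-sequence position.
    for (int32_t j = 0; j < append_len[i]; ++j) {
      q_rope_position->push_back(prefix_len[i] + j);
    }
  }
}
```

With both arrays filled this way, q and k are rotated by absolute positions, which stays consistent even when a request's KV cache is split across levels (e.g. a shared prefix plus per-request suffixes).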

File tree

10 files changed: +286 −127 lines


include/flashinfer/decode.cuh

Lines changed: 23 additions & 26 deletions
```diff
@@ -497,7 +497,8 @@ template <bool partition_kv, RotaryMode rotary_mode, uint32_t num_stages_smem,
           PageStorage page_storage, QKVLayout kv_layout, typename DTypeIn, typename DTypeOut,
           typename IdType>
 __global__ void BatchDecodeWithPagedKVCacheKernel(
-    DTypeIn* __restrict__ q, paged_kv_t<page_storage, kv_layout, DTypeIn, IdType> paged_kv,
+    DTypeIn* __restrict__ q, IdType* __restrict__ q_rope_position,
+    paged_kv_t<page_storage, kv_layout, DTypeIn, IdType> paged_kv,
     kv_partition_info_t<IdType> kv_partition_info, DTypeOut* __restrict__ o,
     DTypeOut* __restrict__ tmp, float* __restrict__ lse, float sm_scale, float rope_rcp_scale,
     float rope_rcp_theta) {
@@ -520,6 +521,8 @@ __global__ void BatchDecodeWithPagedKVCacheKernel(
           : 0;
   const uint32_t seq_len =
       partition_kv ? kv_partition_info.seq_lens_before_partition[batch_idx] : kv_chunk_len;
+  const uint32_t mapped_batch_idx =
+      partition_kv ? kv_partition_info.batch_idx_map[batch_idx] : batch_idx;
 
   extern __shared__ uint8_t smem[];
   DTypeIn* k_smem = (DTypeIn*)smem;
@@ -541,23 +544,12 @@ __global__ void BatchDecodeWithPagedKVCacheKernel(
                        float(2 * ((tx * vec_size + i) % (head_dim / 2))) / float(head_dim));
     }
     // apply rotary embedding to q matrix
-    if constexpr (partition_kv) {
-      q_vec = vec_apply_llama_rope<vec_size, bdx>(
-          q + (kv_partition_info.batch_idx_map[batch_idx] * num_qo_heads + qo_head_idx) * head_dim,
-          freq, seq_len - 1);
-    } else {
-      q_vec = vec_apply_llama_rope<vec_size, bdx>(
-          q + (batch_idx * num_qo_heads + qo_head_idx) * head_dim, freq, seq_len - 1);
-    }
+    q_vec = vec_apply_llama_rope<vec_size, bdx>(
+        q + (mapped_batch_idx * num_qo_heads + qo_head_idx) * head_dim, freq,
+        q_rope_position == nullptr ? (seq_len - 1) : q_rope_position[mapped_batch_idx]);
   } else {
     // do not apply rotary embedding to q matrix
-    if constexpr (partition_kv) {
-      q_vec.cast_load(
-          q + (kv_partition_info.batch_idx_map[batch_idx] * num_qo_heads + qo_head_idx) * head_dim +
-          tx * vec_size);
-    } else {
-      q_vec.cast_load(q + (batch_idx * num_qo_heads + qo_head_idx) * head_dim + tx * vec_size);
-    }
+    q_vec.cast_load(q + (mapped_batch_idx * num_qo_heads + qo_head_idx) * head_dim + tx * vec_size);
   }
   block.sync();
 
@@ -627,7 +619,9 @@ __global__ void BatchDecodeWithPagedKVCacheKernel(
     block.sync();
     compute_qk<rotary_mode, vec_size, bdx, bdy * tile_size_per_bdx>(
         k_smem + (stage_idx * bdz + tz) * bdy * tile_size_per_bdx * head_dim, stage_idx, q_vec,
-        freq, cur_chunk_start + iter * tile_size_per_bdx * bdy * bdz,
+        freq,
+        (paged_kv.rope_pos_offset == nullptr ? 0 : paged_kv.rope_pos_offset[mapped_batch_idx]) +
+            cur_chunk_start + iter * tile_size_per_bdx * bdy * bdz,
         iter * tile_size_per_bdx * bdy * bdz, kv_chunk_len, sm_scale, s, st);
     block.sync();
 
@@ -1120,7 +1114,8 @@ cudaError_t BatchDecodeWithPagedKVCacheWorkEstimation(
 template <uint32_t GROUP_SIZE, uint32_t HEAD_DIM, PageStorage page_storage, QKVLayout kv_layout,
           RotaryMode ROTARY_MODE, typename DTypeIn, typename DTypeOut, typename IdType>
 cudaError_t BatchDecodeWithPagedKVCacheDispatched(
-    DTypeIn* q, paged_kv_t<page_storage, kv_layout, DTypeIn, IdType> paged_kv,
+    DTypeIn* q, IdType* q_rope_position,
+    paged_kv_t<page_storage, kv_layout, DTypeIn, IdType> paged_kv,
     kv_partition_info_t<IdType> kv_partition_info, DTypeOut* o, DTypeOut* tmp, float* lse,
     float rope_scale, float rope_theta, cudaStream_t stream) {
   const float sm_scale = 1.f / std::sqrt(float(HEAD_DIM));
@@ -1153,6 +1148,7 @@ cudaError_t BatchDecodeWithPagedKVCacheDispatched(
     FLASHINFER_CUDA_CALL(
         cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
     void* args[] = {(void*)&q,
+                    (void*)&q_rope_position,
                     (void*)&paged_kv,
                     (void*)&kv_partition_info,
                     (void*)&o,
@@ -1171,6 +1167,7 @@ cudaError_t BatchDecodeWithPagedKVCacheDispatched(
     FLASHINFER_CUDA_CALL(cudaFuncSetAttribute(
         partition_kv_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
     void* args[] = {(void*)&q,
+                    (void*)&q_rope_position,
                     (void*)&paged_kv,
                     (void*)&kv_partition_info,
                     (void*)&o,
@@ -1212,7 +1209,8 @@ cudaError_t BatchDecodeWithPagedKVCacheDispatched(
 template <PageStorage page_storage, QKVLayout kv_layout, typename DTypeIn, typename DTypeOut,
           typename IdType>
 cudaError_t BatchDecodeWithPagedKVCache(
-    DTypeIn* q, paged_kv_t<page_storage, kv_layout, DTypeIn, IdType> paged_kv,
+    DTypeIn* q, IdType* q_rope_position,
+    paged_kv_t<page_storage, kv_layout, DTypeIn, IdType> paged_kv,
     kv_partition_info_t<IdType> kv_partition_info, DTypeOut* o, DTypeOut* tmp, float* lse,
     uint32_t num_qo_heads, RotaryMode rotary_mode = RotaryMode::kNone, float rope_scale = 1.f,
     float rope_theta = 1e4, cudaStream_t stream = nullptr) {
@@ -1228,13 +1226,12 @@ cudaError_t BatchDecodeWithPagedKVCache(
 
   SWITCH_GQA_GROUP_SIZE(
       num_qo_heads / num_kv_heads, GROUP_SIZE,
-      {SWITCH_HEAD_DIM(
-          head_dim, HEAD_DIM, {SWITCH_ROTARY_MODE(rotary_mode, ROTARY_MODE, {
-            return BatchDecodeWithPagedKVCacheDispatched<GROUP_SIZE, HEAD_DIM, page_storage,
-                                                         kv_layout, ROTARY_MODE, DTypeIn, DTypeOut,
-                                                         IdType>(
-                q, paged_kv, kv_partition_info, o, tmp, lse, rope_scale, rope_theta, stream);
-          })})});
+      {SWITCH_HEAD_DIM(head_dim, HEAD_DIM, {SWITCH_ROTARY_MODE(rotary_mode, ROTARY_MODE, {
+        return BatchDecodeWithPagedKVCacheDispatched<
+            GROUP_SIZE, HEAD_DIM, page_storage, kv_layout, ROTARY_MODE, DTypeIn,
+            DTypeOut, IdType>(q, q_rope_position, paged_kv, kv_partition_info, o,
+                              tmp, lse, rope_scale, rope_theta, stream);
+      })})});
 
   return cudaSuccess;
 }
```
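
The net effect of the decode-side change is a pair of nullptr fallbacks: with `q_rope_position == nullptr` the query is rotated at `seq_len - 1` as before, and with `paged_kv.rope_pos_offset == nullptr` key positions start at 0, so existing callers keep the old behavior. A standalone distillation of that selection logic (our sketch, not kernel code):

```cpp
// Distilled from the diff above, for illustration only: passing nullptr for
// the new arrays reproduces the previous position numbering.
#include <cstdint>

// Position used to rotate the query vector in the decode kernel.
uint32_t q_rotary_position(const int32_t* q_rope_position, uint32_t mapped_batch_idx,
                           uint32_t seq_len) {
  // Old behavior: the single decode query sits at the last position, seq_len - 1.
  return q_rope_position == nullptr
             ? (seq_len - 1)
             : static_cast<uint32_t>(q_rope_position[mapped_batch_idx]);
}

// Base position added to each key's in-chunk index before applying RoPE.
uint32_t k_rotary_base(const int32_t* rope_pos_offset, uint32_t mapped_batch_idx) {
  // Old behavior: keys are numbered from 0 within the sequence.
  return rope_pos_offset == nullptr
             ? 0u
             : static_cast<uint32_t>(rope_pos_offset[mapped_batch_idx]);
}
```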

include/flashinfer/handler.cuh

Lines changed: 23 additions & 21 deletions
```diff
@@ -267,7 +267,7 @@ class BatchPrefillHandler {
 template <PageStorage page_storage, QKVLayout kv_layout, typename DTypeIn, typename DTypeOut,
           typename IdType>
 cudaError_t BatchDecodeWithPagedKVCacheWrapper(
-    BatchDecodeHandler* handler, DTypeIn* q,
+    BatchDecodeHandler* handler, DTypeIn* q, IdType* q_rope_position,
     paged_kv_t<page_storage, kv_layout, DTypeIn, IdType> paged_kv, DTypeOut* o, float* lse,
     uint32_t num_qo_heads, RotaryMode rotary_mode = RotaryMode::kNone, float rope_scale = 1.f,
     float rope_theta = 1e4, cudaStream_t stream = nullptr) {
@@ -293,15 +293,15 @@ cudaError_t BatchDecodeWithPagedKVCacheWrapper(
     throw std::runtime_error(err_msg.str());
   }
   return BatchDecodeWithPagedKVCache<page_storage, kv_layout, DTypeIn, DTypeOut, IdType>(
-      q, new_paged_kv, kv_partition_info, o, tmp, lse, num_qo_heads, rotary_mode, rope_scale,
-      rope_theta, stream);
+      q, q_rope_position, new_paged_kv, kv_partition_info, o, tmp, lse, num_qo_heads, rotary_mode,
+      rope_scale, rope_theta, stream);
 }
 
 template <PageStorage page_storage, QKVLayout kv_layout, uint32_t GROUP_SIZE, uint32_t HEAD_DIM,
           RotaryMode ROTARY_MODE, bool ALLOW_FP16_QK_REDUCTION, bool CAUSAL, typename DTypeIn,
           typename DTypeOut, typename IdType>
 cudaError_t BatchPrefillWithPagedKVCacheWrapperDispatched(
-    BatchPrefillHandler* handler, DTypeIn* q, IdType* qo_indptr,
+    BatchPrefillHandler* handler, DTypeIn* q, IdType* qo_indptr, IdType* q_rope_position,
     paged_kv_t<page_storage, kv_layout, DTypeIn, IdType> paged_kv, DTypeOut* o, float* lse,
     uint32_t num_qo_heads, float rope_scale = 1.f, float rope_theta = 1e4,
     cudaStream_t stream = nullptr) {
@@ -328,14 +328,14 @@ cudaError_t BatchPrefillWithPagedKVCacheWrapperDispatched(
       return BatchPrefillWithPagedKVCacheFallbackDispatched<
           page_storage, kv_layout, NUM_FRAGS_X, GROUP_SIZE, HEAD_DIM, ROTARY_MODE,
           ALLOW_FP16_QK_REDUCTION, CAUSAL, DTypeIn, DTypeOut, IdType>(
-          q, request_indices, tile_indices, qo_indptr, paged_kv, o, tmp, lse, num_qo_tiles,
-          rope_scale, rope_theta, stream);
+          q, request_indices, tile_indices, qo_indptr, q_rope_position, paged_kv, o, tmp, lse,
+          num_qo_tiles, rope_scale, rope_theta, stream);
     } else {
       return BatchPrefillWithPagedKVCacheDispatched<
           page_storage, kv_layout, NUM_FRAGS_X, PAGE_SIZE, GROUP_SIZE, HEAD_DIM, ROTARY_MODE,
           ALLOW_FP16_QK_REDUCTION, CAUSAL, DTypeIn, DTypeOut, IdType>(
-          q, request_indices, tile_indices, qo_indptr, paged_kv, o, tmp, lse, num_qo_tiles,
-          rope_scale, rope_theta, stream);
+          q, request_indices, tile_indices, qo_indptr, q_rope_position, paged_kv, o, tmp, lse,
+          num_qo_tiles, rope_scale, rope_theta, stream);
     }
   })});
   return cudaSuccess;
@@ -344,7 +344,7 @@ cudaError_t BatchPrefillWithPagedKVCacheWrapperDispatched(
 template <PageStorage page_storage, QKVLayout kv_layout, typename DTypeIn, typename DTypeOut,
           typename IdType>
 cudaError_t BatchPrefillWithPagedKVCacheWrapper(
-    BatchPrefillHandler* handler, DTypeIn* q, IdType* qo_indptr,
+    BatchPrefillHandler* handler, DTypeIn* q, IdType* qo_indptr, IdType* q_rope_position,
     paged_kv_t<page_storage, kv_layout, DTypeIn, IdType> paged_kv, DTypeOut* o, float* lse,
     uint32_t num_qo_heads, bool causal = true, RotaryMode rotary_mode = RotaryMode::kNone,
     bool allow_fp16_qk_reduction = false, float rope_scale = 1.f, float rope_theta = 1e4,
@@ -363,8 +363,8 @@ cudaError_t BatchPrefillWithPagedKVCacheWrapper(
         return BatchPrefillWithPagedKVCacheWrapperDispatched<
             page_storage, kv_layout, GROUP_SIZE, HEAD_DIM, ROTARY_MODE,
             ALLOW_FP16_QK_REDUCTION, CAUSAL, DTypeIn, DTypeOut, IdType>(
-            handler, q, qo_indptr, paged_kv, o, lse, num_qo_heads,
-            rope_scale, rope_theta, stream);
+            handler, q, qo_indptr, q_rope_position, paged_kv, o, lse,
+            num_qo_heads, rope_scale, rope_theta, stream);
       })})})})});
   return cudaSuccess;
 }
@@ -374,9 +374,9 @@ template <uint32_t GROUP_SIZE, uint32_t HEAD_DIM, QKVLayout KV_LAYOUT, RotaryMod
           typename IdType>
 cudaError_t BatchPrefillWithRaggedKVCacheWrapperDispatched(
     BatchPrefillHandler* handler, DTypeIn* q, IdType* qo_indptr, DTypeIn* k, DTypeIn* v,
-    IdType* kv_indptr, DTypeOut* o, float* lse, const uint32_t batch_size,
-    const uint32_t num_kv_heads, const float rope_scale = 1.f, const float rope_theta = 1e4,
-    cudaStream_t stream = nullptr) {
+    IdType* kv_indptr, IdType* q_rope_position, IdType* k_rope_pos_offset, DTypeOut* o, float* lse,
+    const uint32_t batch_size, const uint32_t num_kv_heads, const float rope_scale = 1.f,
+    const float rope_theta = 1e4, cudaStream_t stream = nullptr) {
   float* tmp = nullptr;
   IdType* request_indices = nullptr;
   IdType* tile_indices = nullptr;
@@ -398,18 +398,19 @@ cudaError_t BatchPrefillWithRaggedKVCacheWrapperDispatched(
     return BatchPrefillWithRaggedKVCacheDispatched<NUM_FRAGS_X, GROUP_SIZE, HEAD_DIM, KV_LAYOUT,
                                                    ROTARY_MODE, ALLOW_FP16_QK_REDUCTION, CAUSAL,
                                                    DTypeIn, DTypeOut, IdType>(
-        q, request_indices, tile_indices, qo_indptr, k, v, kv_indptr, o, tmp, lse, batch_size,
-        num_qo_tiles, num_kv_heads, rope_scale, rope_theta, stream);
+        q, request_indices, tile_indices, qo_indptr, k, v, kv_indptr, q_rope_position,
+        k_rope_pos_offset, o, tmp, lse, batch_size, num_qo_tiles, num_kv_heads, rope_scale,
+        rope_theta, stream);
   });
   return cudaSuccess;
 }
 
 template <typename DTypeIn, typename DTypeOut, typename IdType>
 cudaError_t BatchPrefillWithRaggedKVCacheWrapper(
     BatchPrefillHandler* handler, DTypeIn* q, IdType* qo_indptr, DTypeIn* k, DTypeIn* v,
-    IdType* kv_indptr, DTypeOut* o, float* lse, const uint32_t batch_size,
-    const uint32_t num_qo_heads, const uint32_t num_kv_heads, const uint32_t head_dim,
-    bool causal = true, RotaryMode rotary_mode = RotaryMode::kNone,
+    IdType* kv_indptr, IdType* q_rope_position, IdType* k_rope_pos_offset, DTypeOut* o, float* lse,
+    const uint32_t batch_size, const uint32_t num_qo_heads, const uint32_t num_kv_heads,
+    const uint32_t head_dim, bool causal = true, RotaryMode rotary_mode = RotaryMode::kNone,
     bool allow_fp16_qk_reduction = false, const float rope_scale = 1.f,
     const float rope_theta = 1e4, cudaStream_t stream = nullptr) {
   constexpr QKVLayout KV_LAYOUT = QKVLayout::kNHD;
@@ -425,8 +426,9 @@ cudaError_t BatchPrefillWithRaggedKVCacheWrapper(
         return BatchPrefillWithRaggedKVCacheWrapperDispatched<
             GROUP_SIZE, HEAD_DIM, KV_LAYOUT, ROTARY_MODE,
             ALLOW_FP16_QK_REDUCTION, CAUSAL, DTypeIn, DTypeOut, IdType>(
-            handler, q, qo_indptr, k, v, kv_indptr, o, lse, batch_size,
-            num_kv_heads, rope_scale, rope_theta, stream);
+            handler, q, qo_indptr, k, v, kv_indptr, q_rope_position,
+            k_rope_pos_offset, o, lse, batch_size, num_kv_heads,
+            rope_scale, rope_theta, stream);
       })})})})});
   return cudaSuccess;
 }
```
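
For callers, the change is two extra arguments threaded through each wrapper. A hypothetical call-site sketch for the ragged-KV prefill wrapper (the helper name, the `flashinfer::` qualification, and `RotaryMode::kLlama` are our assumptions; buffers are taken as already-allocated device pointers):

```cpp
// Hypothetical call site; not code from this commit.
#include <cuda_fp16.h>
#include <flashinfer/handler.cuh>

cudaError_t run_ragged_prefill(flashinfer::BatchPrefillHandler* handler, half* q,
                               int32_t* qo_indptr, half* k, half* v, int32_t* kv_indptr,
                               int32_t* q_rope_position, int32_t* k_rope_pos_offset, half* o,
                               uint32_t batch_size, uint32_t num_qo_heads, uint32_t num_kv_heads,
                               uint32_t head_dim, cudaStream_t stream) {
  // q_rope_position: (total_q_len,) absolute position of each query token.
  // k_rope_pos_offset: (num_sequence,) start position of each sequence's keys.
  return flashinfer::BatchPrefillWithRaggedKVCacheWrapper<half, half, int32_t>(
      handler, q, qo_indptr, k, v, kv_indptr, q_rope_position, k_rope_pos_offset, o,
      /*lse=*/nullptr, batch_size, num_qo_heads, num_kv_heads, head_dim,
      /*causal=*/true, flashinfer::RotaryMode::kLlama,
      /*allow_fp16_qk_reduction=*/false, /*rope_scale=*/1.f, /*rope_theta=*/1e4, stream);
}
```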

include/flashinfer/page.cuh

Lines changed: 15 additions & 5 deletions
```diff
@@ -88,6 +88,8 @@ struct paged_kv_t {
   IdType* indptr;
   // [batch_size] The offset of the last page for each request in the batch
   IdType* last_page_len;
+  // [batch_size] The start position of each request in the batch.
+  IdType* rope_pos_offset;
 
   /*!
    * \brief Construct an empty paged key-value cache
@@ -101,7 +103,8 @@ struct paged_kv_t {
         indices(nullptr),
         ptrs(nullptr),
         indptr(nullptr),
-        last_page_len(nullptr) {}
+        last_page_len(nullptr),
+        rope_pos_offset(nullptr) {}
 
   /*!
    * \brief Construct a paged key-value cache
@@ -113,20 +116,23 @@ struct paged_kv_t {
    * \param indices The page indices array
    * \param indptr The page indptr array
    * \param last_page_len The offset of the last page for each request in the batch
+   * \param rope_pos_offset The start position of each request in the batch.
    * \note This constructor should only be used when page_storage == kIndices
    */
   __host__ __device__ __forceinline__ paged_kv_t(uint32_t num_heads, uint32_t page_size,
                                                  uint32_t head_dim, uint32_t batch_size,
                                                  DType* data, IdType* indices, IdType* indptr,
-                                                 IdType* last_page_len)
+                                                 IdType* last_page_len,
+                                                 IdType* rope_pos_offset = nullptr)
       : num_heads(num_heads),
         page_size(page_size),
         head_dim(head_dim),
         batch_size(batch_size),
         data(data),
         indices(indices),
         indptr(indptr),
-        last_page_len(last_page_len) {}
+        last_page_len(last_page_len),
+        rope_pos_offset(rope_pos_offset) {}
 
   /*!
    * \brief Construct a paged key-value cache
@@ -137,18 +143,22 @@ struct paged_kv_t {
    * \param ptrs The array of pointers to each active page
    * \param indptr The page indptr array
    * \param last_page_len The offset of the last page for each request in the batch
+   * \param rope_pos_offset The start position of each request in the batch.
    * \note This constructor should only be used when page_storage == kIndices
    */
   __host__ __device__ __forceinline__ paged_kv_t(uint32_t num_heads, uint32_t page_size,
                                                  uint32_t head_dim, uint32_t batch_size,
                                                  DType** ptrs, IdType* indptr,
-                                                 IdType* last_page_len)
+                                                 IdType* last_page_len,
+                                                 IdType* rope_pos_offset = nullptr)
       : num_heads(num_heads),
         page_size(page_size),
         head_dim(head_dim),
         batch_size(batch_size),
         ptrs(ptrs),
-        indptr(indptr) {}
+        indptr(indptr),
+        last_page_len(last_page_len),
+        rope_pos_offset(rope_pos_offset) {}
 
   /*!
    * \brief Compute the offset of k element in the allocated buffer.
```
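
Since `rope_pos_offset` is a new trailing parameter defaulting to `nullptr`, existing `paged_kv_t` construction sites compile unchanged; note also that the pointer-based constructor previously left `last_page_len` out of its initializer list, which this diff fixes in passing. A hypothetical construction sketch (the helper name is ours; `PageStorage::kIndices` and `QKVLayout::kNHD` are taken from the surrounding code):

```cpp
// Hypothetical helper wrapping pre-allocated device buffers; not code from
// this commit. rope_pos_offset has shape (batch_size,) and may be nullptr to
// keep the old zero-offset key numbering.
#include <cuda_fp16.h>
#include <flashinfer/page.cuh>

using flashinfer::PageStorage;
using flashinfer::QKVLayout;

flashinfer::paged_kv_t<PageStorage::kIndices, QKVLayout::kNHD, half, int32_t> make_paged_kv(
    uint32_t num_kv_heads, uint32_t page_size, uint32_t head_dim, uint32_t batch_size,
    half* data, int32_t* indices, int32_t* indptr, int32_t* last_page_len,
    int32_t* rope_pos_offset) {
  return {num_kv_heads, page_size, head_dim,     batch_size,
          data,         indices,   indptr,       last_page_len, rope_pos_offset};
}
```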
