Fix query_start_loc_p in plamo2 after the Mamba2 metadata refactor

cyang49 · cyang49 · commit 6db94a4cba71 · 2025-09-04T10:21:09.000-04:00
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
@@ -30,7 +30,7 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.abstract import MambaBase
 from vllm.model_executor.layers.mamba.mamba2_metadata import (
-    Mamba2Metadata, prepare_mamba2_metadata, update_metadata)
+    Mamba2Metadata, prepare_mamba2_metadata)
 from vllm.model_executor.layers.mamba.mamba_utils import (
     MambaStateDtypeCalculator, MambaStateShapeCalculator)
 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
@@ -285,6 +285,7 @@ def forward_cuda(
                 seq_idx_p = attn_metadata.seq_idx_p
                 chunk_indices_p = attn_metadata.chunk_indices_p
                 chunk_offsets_p = attn_metadata.chunk_offsets_p
+                query_start_loc_p = attn_metadata.query_start_loc_p
         else:
             conv_state = mamba_cache_params.conv_state
             ssm_state = mamba_cache_params.ssm_state
@@ -295,6 +296,7 @@ def forward_cuda(
             seq_idx_p = mamba2_metadata.seq_idx
             chunk_indices_p = mamba2_metadata.chunk_indices
             chunk_offsets_p = mamba2_metadata.chunk_offsets
+            query_start_loc_p = mamba2_metadata.query_start_loc_p
 
         # 1. Gated MLP's linear projection
         projected_states = self.in_proj(hidden_states)
@@ -336,9 +338,6 @@ def forward_cuda(
                 [num_decodes, num_prefills],
                 dim=0,
             )
-            query_start_loc_p = (
-                attn_metadata.query_start_loc[-num_prefills - 1:] -
-                num_decodes if has_prefill else None)
         else:
             hidden_states_p, hidden_states_d = torch.split(
                 hidden_states,
@@ -354,9 +353,6 @@ def forward_cuda(
                 [num_prefills, num_decodes],
                 dim=0,
             )
-            query_start_loc_p = (attn_metadata.query_start_loc[:num_prefills +
-                                                               1]
-                                 if has_prefill else None)
 
         # Preallocate output tensor to avoid memcpy cost for merging prefill
         # and decode outputs
@@ -388,9 +384,6 @@ def forward_cuda(
             #   pointed to by "state_indices_tensor"
             x = hidden_states_p.transpose(
                 0, 1)  # this is the form that causal-conv see
-            if mamba2_metadata.cu_seqlen is None:
-                mamba2_metadata = update_metadata(x, query_start_loc_p,
-                                                  mamba2_metadata)
             hidden_states_p = causal_conv1d_fn(
                 x,
                 conv_weights,