Commit c28f9bc

Browse files
sleepcooFlamingoPgHongbosherlock
committed
Support FlashMLA backend CUDA graph capture
Co-authored-by: yinfan98 <1106310035@qq.com>
Co-authored-by: Hongbosherlock <hongbosherlock@gmail.com>
1 parent 4649c5e commit c28f9bc

File tree

1 file changed: +3, -8 lines


python/sglang/srt/layers/attention/flashmla_backend.py

Lines changed: 3 additions & 8 deletions
@@ -27,6 +27,7 @@
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+from sglang.srt.speculative.spec_info import SpecInfo


 # FlashMLA only supports pagesize=64
@@ -76,9 +77,7 @@ def __init__(
         self.num_local_heads = (
             model_runner.model_config.num_attention_heads // get_attention_tp_size()
         )
-        self.forward_metadata: Union[
-            PrefillMetadata, DecodeMetadata, FlashMLADecodeMetadata
-        ] = None
+        self.forward_metadata: Union[FlashMLADecodeMetadata] = None
         self.kv_lora_rank = model_runner.model_config.kv_lora_rank
         self.qk_nope_head_dim = model_runner.model_config.qk_nope_head_dim
         self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
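
An aside on the replacement annotation: Union with a single member is the same type as that member, and since the attribute starts out as None, Optional would state that explicitly. A minimal sketch of the equivalent spelling (purely illustrative; only the attribute name comes from the diff):

    from typing import Optional, Union

    # Union with one member collapses to that member.
    assert Union[int] is int

    # Equivalent spelling that also admits the None default:
    forward_metadata: Optional["FlashMLADecodeMetadata"] = None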
@@ -111,7 +110,6 @@ def init_forward_metadata(self, forward_batch: ForwardBatch):
             block_kv_indices,
             self.indices_updater_decode.req_to_token.size(1),
             max_seqlen_pad,
-            max_seqlen_pad,
         )
         mla_metadata, num_splits = get_mla_metadata(
             forward_batch.seq_lens.to(torch.int32),
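
For context, get_mla_metadata is the scheduling helper shipped with FlashMLA: it precomputes tile-scheduler metadata and KV-split counts from the per-request sequence lengths, which is why the diff casts seq_lens to int32. A hedged sketch of the call shape, following the FlashMLA README (the head-count arguments below are illustrative assumptions, not values taken from this file):

    import torch
    from flash_mla import get_mla_metadata

    cache_seqlens = torch.tensor([17, 130, 64], dtype=torch.int32, device="cuda")
    # Assumed example: 128 query heads per KV head, 1 KV head (MLA-style).
    tile_scheduler_metadata, num_splits = get_mla_metadata(cache_seqlens, 128, 1)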
@@ -136,7 +134,7 @@ def init_cuda_graph_state(
         if block_kv_indices is None:
             cuda_graph_kv_indices = torch.full(
                 (max_bs, (self.max_context_len + PAGE_SIZE) // PAGE_SIZE),
-                -1,
+                1,
                 dtype=torch.int32,
                 device="cuda",
             )
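
The buffer above is allocated once at the maximum batch size so graph capture sees a stable tensor; its second dimension is the padded number of KV pages per request. A small shape check, assuming PAGE_SIZE = 64 (the only page size FlashMLA supports, per the comment in the imports hunk) and toy sizes for the rest:

    import torch

    PAGE_SIZE = 64            # FlashMLA only supports pagesize=64
    max_bs, max_context_len = 4, 200
    num_pages = (max_context_len + PAGE_SIZE) // PAGE_SIZE  # padded page count
    kv_indices = torch.full((max_bs, num_pages), 1, dtype=torch.int32)
    assert kv_indices.shape == (4, 4)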
@@ -167,7 +165,6 @@ def init_forward_metadata_capture_cuda_graph(
     ):
         if forward_mode.is_decode_or_idle():
             if spec_info is None:
-
                 max_seqlen_pad = triton.cdiv(seq_lens.max().item(), PAGE_SIZE)

                 create_flashmla_kv_indices_triton[(bs,)](
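
triton.cdiv is plain ceiling division, so max_seqlen_pad is the page count needed for the longest sequence in the capture batch. For example:

    import triton

    # A 130-token sequence needs ceil(130 / 64) = 3 pages of size 64.
    assert triton.cdiv(130, 64) == 3
    assert triton.cdiv(128, 64) == 2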
@@ -178,7 +175,6 @@ def init_forward_metadata_capture_cuda_graph(
                     self.cuda_graph_kv_indices,
                     self.indices_updater_decode.req_to_token.size(1),
                     max_seqlen_pad,
-                    max_seqlen_pad,
                 )
                 mla_metadata, num_splits = get_mla_metadata(
                     seq_lens.to(torch.int32),
@@ -227,7 +223,6 @@ def init_forward_metadata_replay_cuda_graph(
             block_kv_indices,
             self.indices_updater_decode.req_to_token.size(1),
             max_seqlen_pad,
-            max_seqlen_pad,
         )
         mla_metadata, num_splits = get_mla_metadata(
             seq_lens.to(torch.int32),
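
Taken together, the three hooks touched here follow PyTorch's standard CUDA graph discipline: allocate static buffers up front (init_cuda_graph_state), fill them before capture (init_forward_metadata_capture_cuda_graph), and mutate them in place before each replay (init_forward_metadata_replay_cuda_graph), since captured kernels re-read fixed memory addresses. A minimal, self-contained sketch of that pattern with stock PyTorch APIs (the toy computation stands in for the attention kernels; nothing below is taken from the backend itself):

    import torch

    static_in = torch.zeros(8, 64, device="cuda")  # pre-allocated, like the kv-indices buffer

    # Warm up on a side stream before capture, as PyTorch recommends.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        static_out = static_in * 2
    torch.cuda.current_stream().wait_stream(s)

    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):        # capture: record the kernels once
        static_out = static_in * 2

    static_in.copy_(torch.randn(8, 64, device="cuda"))  # update input in place
    g.replay()                                           # re-run captured kernels
    print(static_out[0, 0])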
