
Commit 6be6496

Remove old cutlass MLA kernel
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
1 parent 5679399 commit 6be6496

File tree

6 files changed, +4 -338 lines changed

CMakeLists.txt

Lines changed: 0 additions & 2 deletions
@@ -298,7 +298,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
       "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
       "csrc/cutlass_extensions/common.cpp"
-      "csrc/attention/mla/cutlass_mla_entry.cu"
       "csrc/quantization/fp8/per_token_group_quant.cu")

     set_gencode_flags_for_srcs(
@@ -585,7 +584,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
     if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
       set(SRCS
-        "csrc/attention/mla/cutlass_mla_kernels.cu"
         "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
       set_gencode_flags_for_srcs(
         SRCS "${SRCS}"

csrc/attention/mla/cutlass_mla_entry.cu

Lines changed: 0 additions & 38 deletions
This file was deleted.

csrc/attention/mla/cutlass_mla_kernels.cu

Lines changed: 0 additions & 225 deletions
This file was deleted.

csrc/torch_bindings.cpp

Lines changed: 0 additions & 7 deletions
@@ -510,13 +510,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("cutlass_sparse_compress(Tensor a) -> Tensor[]");
   ops.impl("cutlass_sparse_compress", &cutlass_sparse_compress);

-  // CUTLASS MLA decode
-  ops.def(
-      "cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe,"
-      "                   Tensor kv_c_and_k_pe_cache, Tensor seq_lens,"
-      "                   Tensor page_table, float scale) -> ()");
-  ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);
-
   // SM100 CUTLASS MLA decode
   ops.def(
       "sm100_cutlass_mla_decode(Tensor! out, Tensor! lse, Tensor q_nope,"

vllm/_custom_ops.py

Lines changed: 2 additions & 11 deletions
@@ -1823,17 +1823,8 @@ def flash_mla_with_kvcache(
     return out, softmax_lse


-def cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor,
-                       q_pe: torch.Tensor, kv_c_and_k_pe_cache: torch.Tensor,
-                       seq_lens: torch.Tensor, page_table: torch.Tensor,
-                       scale: float) -> torch.Tensor:
-    torch.ops._C.cutlass_mla_decode(out, q_nope, q_pe, kv_c_and_k_pe_cache,
-                                    seq_lens, page_table, scale)
-    return out
-
-
-def sm100_cutlass_mla_decode(out: torch.Tensor, lse: torch.Tensor,
-                             q_nope: torch.Tensor, q_pe: torch.Tensor,
+def sm100_cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor,
+                             q_pe: torch.Tensor,
                              kv_c_and_k_pe_cache: torch.Tensor,
                              seq_lens: torch.Tensor, page_table: torch.Tensor,
                              workspace: torch.Tensor, scale: float,
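The deleted cutlass_mla_decode wrapper followed the usual _custom_ops.py convention: the caller pre-allocates out, the registered op writes into it in place, and the wrapper returns that same tensor. A self-contained sketch of that convention, using a stand-in function in place of the (now removed) registered op:

import torch


def _fake_decode_op(out: torch.Tensor, q: torch.Tensor, scale: float) -> None:
    # Stand-in for a torch.ops._C.* op: the result is written into out in place.
    out.copy_(q * scale)


def fake_decode(out: torch.Tensor, q: torch.Tensor, scale: float) -> torch.Tensor:
    # Same shape as the removed wrapper: forward to the op, then return out.
    _fake_decode_op(out, q, scale)
    return out


q = torch.randn(4, 16, 512)
out = torch.empty_like(q)
assert fake_decode(out, q, 0.5) is out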

vllm/v1/attention/backends/mla/cutlass_mla.py

Lines changed: 2 additions & 55 deletions
@@ -219,12 +219,13 @@ def _sm100_cutlass_mla_decode(

         return out, returned_lse

-    def _sm100_forward_decode(
+    def _forward_decode(
         self,
         q_nope: torch.Tensor,
         q_pe: torch.Tensor,
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: MLACommonMetadata,
+        layer: AttentionLayer,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
@@ -245,57 +246,3 @@ def _sm100_forward_decode(
         )

         return o, (lse if self.need_to_return_lse_for_decode else None)
-
-    # TODO: Currently we leave it here only for backup in case something is
-    # wrong with the new SM100 CUTLASS MLA kernel
-    def _old_forward_decode(
-        self,
-        q_nope: torch.Tensor,
-        q_pe: torch.Tensor,
-        kv_c_and_k_pe_cache: torch.Tensor,
-        attn_metadata: MLACommonMetadata,
-    ) -> torch.Tensor:
-        assert kv_c_and_k_pe_cache.numel() > 0
-        assert attn_metadata.decode is not None
-
-        if is_quantized_kv_cache(self.kv_cache_dtype):
-            raise NotImplementedError(
-                "FP8 Cutlass MLA not supported with FORCE_OLD_CUTLASS_MLA")
-
-        B = q_nope.shape[0]
-
-        o = torch.empty((B, self.num_heads, self.kv_lora_rank),
-                        dtype=q_nope.dtype,
-                        device=q_nope.device)
-
-        # Run MLA
-        # Clone q_nope and q_pe to make sure strides computation is correct.
-        q_nope = q_nope.clone()
-        q_pe = q_pe.clone()
-
-        ops.cutlass_mla_decode(o, q_nope, q_pe, kv_c_and_k_pe_cache,
-                               attn_metadata.decode.seq_lens,
-                               attn_metadata.decode.block_table, self.scale)
-
-        return o
-
-    def _forward_decode(
-        self,
-        q: torch.Tensor,
-        kv_c_and_k_pe_cache: torch.Tensor,
-        attn_metadata: MLACommonMetadata,
-        layer: AttentionLayer,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
-        if type(q) is tuple:
-            q_nope, q_pe = q
-        else:
-            q_nope, q_pe = torch.split(
-                q, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
-        if self._use_old_cutlass_mla:
-            # TODO: Remove the old cutlass MLA kernel after more extensive
-            # testing
-            return self._old_forward_decode(q_nope, q_pe, kv_c_and_k_pe_cache,
-                                            attn_metadata), None
-
-        return self._sm100_forward_decode(q_nope, q_pe, kv_c_and_k_pe_cache,
-                                          attn_metadata)
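On the backend side only the SM100 path is left: _sm100_forward_decode is renamed to _forward_decode, picks up the layer argument, and the old dispatcher is deleted together with _old_forward_decode. The one piece of work that dispatcher still did was splitting a fused decode query into its latent and RoPE parts before choosing a kernel; a standalone sketch of that split is below, with illustrative sizes standing in for the model's kv_lora_rank and qk_rope_head_dim:

import torch

# Illustrative sizes only; in vLLM these come from the model configuration.
batch, num_heads = 4, 16
kv_lora_rank, qk_rope_head_dim = 512, 64

# Fused decode query: latent (no-RoPE) part and RoPE part packed on the last dim.
q = torch.randn(batch, num_heads, kv_lora_rank + qk_rope_head_dim)

# The same split the deleted _forward_decode performed when handed a single
# tensor rather than a (q_nope, q_pe) tuple.
q_nope, q_pe = torch.split(q, [kv_lora_rank, qk_rope_head_dim], dim=-1)

assert q_nope.shape == (batch, num_heads, kv_lora_rank)
assert q_pe.shape == (batch, num_heads, qk_rope_head_dim)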
