Commit fa13a8b

Merge pull request vllm-project#13 from vllm-model-0920/lwilkinson/build-sparse-flash-mla
Build and bind sparse-FlashMLA kernels
2 parents 446c0de + 840f205 commit fa13a8b

File tree

4 files changed: +325, −2 lines

cmake/external_projects/flashmla.cmake

Lines changed: 88 additions & 2 deletions
@@ -18,8 +18,8 @@ if(FLASH_MLA_SRC_DIR)
 else()
   FetchContent_Declare(
     flashmla
-    GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
-    GIT_TAG a757314c04eedd166e329e846c820eb1bdd702de
+    GIT_REPOSITORY https://github.com/vllm-model-0920/FlashMLA
+    GIT_TAG a25b977fae6925c45c3d0404c98c6ce6f4563dac
     GIT_PROGRESS TRUE
     CONFIGURE_COMMAND ""
     BUILD_COMMAND ""
@@ -35,6 +35,10 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
 # sm90a
 cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
+  #######################################################################
+  # FlashMLA Dense -- _flashmla_C
+  #######################################################################
+
   set(FlashMLA_SOURCES
     ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
     ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu
@@ -60,8 +64,90 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
     INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
     USE_SABI 3
     WITH_SOABI)
+
+  #######################################################################
+  # FlashMLA Sparse -- _flashmla_sparse_C
+  #######################################################################
+
+  # We use separate object libraries to avoid cross-contaminating includes,
+  # namely kernels/utils.h
+
+  set(DECODE_FOLDER ${flashmla_SOURCE_DIR}/csrc/sparse/decode)
+  set(PREFILL_FOLDER ${flashmla_SOURCE_DIR}/csrc/sparse/prefill)
+
+  # ---- Decode object library ----
+  set(SPARSE_FLASHMLA_DECODE_SOURCES
+    ${DECODE_FOLDER}/flash_api.cpp
+    ${DECODE_FOLDER}/kernels/get_mla_metadata.cu
+    ${DECODE_FOLDER}/kernels/mla_combine.cu
+    ${DECODE_FOLDER}/kernels/fp8_sparse/splitkv_mla.cu
+  )
+
+  add_library(_flashmla_sparse_decode OBJECT ${SPARSE_FLASHMLA_DECODE_SOURCES})
+  set_property(TARGET _flashmla_sparse_decode PROPERTY POSITION_INDEPENDENT_CODE ON)
+
+  set_gencode_flags_for_srcs(
+    SRCS "${SPARSE_FLASHMLA_DECODE_SOURCES}"
+    CUDA_ARCHS "${FLASH_MLA_ARCHS}"
+  )
+
+  # Include paths for decode ONLY (do not leak DECODE_FOLDER to others)
+  target_include_directories(_flashmla_sparse_decode
+    PRIVATE
+      ${flashmla_SOURCE_DIR}/csrc/cutlass/include
+      ${TORCH_INCLUDE_DIRS}
+      ${Python_INCLUDE_DIRS}
+      ${DECODE_FOLDER}
+  )
+  target_compile_options(_flashmla_sparse_decode PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:${VLLM_GPU_FLAGS}>)
+
+  # ---- Prefill object library ----
+  set(SPARSE_FLASHMLA_PREFILL_SOURCES
+    ${PREFILL_FOLDER}/api.cpp
+    ${PREFILL_FOLDER}/kernels/sm90/fwd/fwd.cu
+  )
+
+  add_library(_flashmla_sparse_prefill OBJECT ${SPARSE_FLASHMLA_PREFILL_SOURCES})
+  set_property(TARGET _flashmla_sparse_prefill PROPERTY POSITION_INDEPENDENT_CODE ON)
+
+  set_gencode_flags_for_srcs(
+    SRCS "${SPARSE_FLASHMLA_PREFILL_SOURCES}"
+    CUDA_ARCHS "${FLASH_MLA_ARCHS}"
+  )
+
+  target_include_directories(_flashmla_sparse_prefill
+    PRIVATE
+      ${flashmla_SOURCE_DIR}/csrc/cutlass/include
+      ${TORCH_INCLUDE_DIRS}
+      ${Python_INCLUDE_DIRS}
+      ${PREFILL_FOLDER}
+  )
+  target_compile_options(_flashmla_sparse_prefill PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:${VLLM_GPU_FLAGS}>)
+
+  # ---- Final extension target with unified API ----
+  define_gpu_extension_target(
+    _flashmla_sparse_C
+    DESTINATION vllm
+    LANGUAGE ${VLLM_GPU_LANG}
+    SOURCES
+      ${flashmla_SOURCE_DIR}/csrc/sparse/api.cpp
+      $<TARGET_OBJECTS:_flashmla_sparse_decode>
+      $<TARGET_OBJECTS:_flashmla_sparse_prefill>
+    COMPILE_FLAGS ${VLLM_GPU_FLAGS}
+    ARCHITECTURES ${VLLM_GPU_ARCHES}
+    # Only the common/public includes here; do NOT add decode/prefill folders
+    INCLUDE_DIRECTORIES
+      csrc/
+      ${CUTLASS_INCLUDE_DIR}
+      ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
+    USE_SABI 3
+    WITH_SOABI
+  )
 else()
   # Create an empty target for setup.py when not targeting sm90a systems
   add_custom_target(_flashmla_C)
+  add_custom_target(_flashmla_sparse_C)
 endif()
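
A quick way to confirm that the _flashmla_sparse_C target above was built and bound is to import the extension and look up the ops it registers. This is a minimal Python sketch, not part of the commit; the op names are taken from the bindings in vllm/attention/ops/flashmla.py later in this diff.

# Minimal sketch (not in this PR): check that the sparse FlashMLA extension
# built by the CMake target above loads and registers its custom ops.
import torch

try:
    import vllm._flashmla_sparse_C  # noqa: F401  (built only for SM90 with CUDA > 12.3)
    loaded = True
except ImportError:
    loaded = False

if loaded:
    ns = torch.ops._flashmla_sparse_C
    for op_name in ("get_mla_metadata", "fwd_kvcache_mla", "sparse_topk_attn_fwd"):
        # hasattr resolves the op schema; False means the op was not registered.
        print(op_name, "registered:", hasattr(ns, op_name))
else:
    print("_flashmla_sparse_C not available (empty target on non-SM90 builds)")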

setup.py

Lines changed: 3 additions & 0 deletions
@@ -322,6 +322,7 @@ def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict:
         "vllm/_C.abi3.so",
         "vllm/_moe_C.abi3.so",
         "vllm/_flashmla_C.abi3.so",
+        "vllm/_flashmla_sparse_C.abi3.so",
         "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
         "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
         "vllm/cumem_allocator.abi3.so",
@@ -589,6 +590,8 @@ def _read_requirements(filename: str) -> list[str]:
         # not targeting a hopper system
         ext_modules.append(
             CMakeExtension(name="vllm._flashmla_C", optional=True))
+        ext_modules.append(
+            CMakeExtension(name="vllm._flashmla_sparse_C", optional=True))
     ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))

 if _build_custom_ops():
Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
+import pytest
+import torch
+
+
+def _cuda_sm90_available() -> bool:
+    if not torch.cuda.is_available():
+        return False
+    major, _ = torch.cuda.get_device_capability()
+    return major == 9
+
+
+@pytest.mark.cuda
+def test_sparse_flashmla_imports_and_flags():
+    import vllm.attention.ops.flashmla as fm
+    # Functions should exist
+    assert hasattr(fm, "get_sparse_mla_metadata")
+    assert hasattr(fm, "flash_mla_sparse_with_kvcache")
+    assert hasattr(fm, "flash_mla_sparse_prefill")
+    # Support check should return a (bool, reason)
+    ok, reason = fm.is_flashmla_supported()
+    assert isinstance(ok, bool)
+    assert (reason is None) or isinstance(reason, str)
+
+
+def test_sparse_flashmla_metadata_smoke():
+    import vllm.attention.ops.flashmla as fm
+    ok, reason = fm.is_flashmla_supported()
+    if not ok or not _cuda_sm90_available():
+        pytest.skip(reason or "SM90 not available")
+
+    device = torch.device("cuda")
+    batch_size = 1
+    seqlen_q = 1
+    num_heads_q = 128
+    num_heads_k = 1
+    q_seq_per_hk = seqlen_q * num_heads_q // num_heads_k
+    q_heads_per_hk = num_heads_q // num_heads_k
+    topk = 128
+
+    cache_seqlens = torch.zeros(batch_size, dtype=torch.int32, device=device)
+
+    tile_md, num_splits = fm.get_sparse_mla_metadata(cache_seqlens,
+                                                     q_seq_per_hk,
+                                                     num_heads_k,
+                                                     topk,
+                                                     q_heads_per_hk)
+    assert tile_md.dtype == torch.int32
+    assert num_splits.dtype == torch.int32
+
+
+def test_sparse_flashmla_decode_smoke():
+    import vllm.attention.ops.flashmla as fm
+    ok, reason = fm.is_flashmla_supported()
+    if not ok or not _cuda_sm90_available():
+        pytest.skip(reason or "SM90 not available")
+
+    device = torch.device("cuda")
+    batch_size = 1
+    seqlen_q = 1
+    num_heads_q = 1
+    head_dim_k = 576
+    head_dim_v = 512
+    num_heads_k = 1
+    page_block_size = 64
+    bytes_per_token = 656
+    topk = 128
+
+    # Metadata
+    q_seq_per_hk = seqlen_q * num_heads_q // num_heads_k
+    q_heads_per_hk = num_heads_q // num_heads_k
+    cache_seqlens = torch.zeros(batch_size, dtype=torch.int32, device=device)
+    tile_md, num_splits = fm.get_sparse_mla_metadata(cache_seqlens,
+                                                     q_seq_per_hk,
+                                                     num_heads_k,
+                                                     topk,
+                                                     q_heads_per_hk)
+
+    # Inputs
+    q = torch.zeros((batch_size, seqlen_q, num_heads_q, head_dim_k),
+                    dtype=torch.bfloat16,
+                    device=device)
+    k_cache = torch.zeros((1, page_block_size, num_heads_k, bytes_per_token),
+                          dtype=torch.uint8,
+                          device=device)
+    indices = torch.zeros((batch_size, seqlen_q, topk),
+                          dtype=torch.int32,
+                          device=device)
+
+    out, lse = fm.flash_mla_sparse_with_kvcache(q, k_cache, cache_seqlens,
+                                                head_dim_v, tile_md,
+                                                num_splits, indices)
+    assert out.shape[0] == batch_size
+    assert out.shape[-1] == head_dim_v
+    assert lse.shape[0] == batch_size
+
+
+def test_sparse_flashmla_prefill_smoke():
+    import vllm.attention.ops.flashmla as fm
+    ok, reason = fm.is_flashmla_supported()
+    if not ok or not _cuda_sm90_available():
+        pytest.skip(reason or "SM90 not available")
+
+    device = torch.device("cuda")
+    s_q = 1
+    s_kv = 1
+    h_q = 64  # kernel expects multiple of 64
+    h_kv = 1
+    d_qk = 576
+    d_v = 512
+    topk = 128
+
+    q = torch.zeros((s_q, h_q, d_qk), dtype=torch.bfloat16, device=device)
+    kv = torch.zeros((s_kv, h_kv, d_qk), dtype=torch.bfloat16, device=device)
+    indices = torch.zeros((s_q, h_kv, topk), dtype=torch.int32, device=device)
+
+    out, max_logits, lse = fm.flash_mla_sparse_prefill(q, kv, indices, 1.0, d_v)
+    assert out.shape == (s_q, h_q, d_v)
+    assert max_logits.shape == (s_q, h_q)
+    assert lse.shape == (s_q, h_q)

vllm/attention/ops/flashmla.py

Lines changed: 114 additions & 0 deletions
@@ -13,6 +13,7 @@
 if current_platform.is_cuda():
     try:
         import vllm._flashmla_C  # noqa: F401
+        import vllm._flashmla_sparse_C  # noqa: F401
         _flashmla_C_AVAILABLE = True
     except ImportError:
         _flashmla_C_AVAILABLE = False
@@ -110,6 +111,119 @@ def flash_mla_with_kvcache(
     return out.squeeze(1), softmax_lse.squeeze(-1)
 
 
+# ------------------------ Sparse FlashMLA bindings -------------------------
+
+
+def get_sparse_mla_metadata(
+        cache_seqlens: torch.Tensor,
+        q_seq_per_hk: int,
+        num_heads_k: int,
+        topk: int,
+        q_heads_per_hk: Optional[int] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Arguments:
+        cache_seqlens: (batch_size), dtype torch.int32.
+        q_seq_per_hk: Equal to seq_len_q * num_heads_q // num_heads_k.
+        num_heads_k: num_heads_k.
+        topk: topk.
+        q_heads_per_hk: Equal to num_heads_q // num_heads_k. Only needs to be
+            specified when topk is not None.
+
+    Return:
+        tile_scheduler_metadata: (num_sm_parts, TileSchedulerMetaDataSize),
+            dtype torch.int32.
+        num_splits: (batch_size + 1), dtype torch.int32.
+    """
+    return torch.ops._flashmla_sparse_C.get_mla_metadata(
+        cache_seqlens, q_seq_per_hk, num_heads_k, topk, q_heads_per_hk)
+
+
+def flash_mla_sparse_with_kvcache(
+        q: torch.Tensor,
+        k_cache: torch.Tensor,
+        cache_seqlens: torch.Tensor,
+        head_dim_v: int,
+        tile_scheduler_metadata: torch.Tensor,
+        num_splits: torch.Tensor,
+        indices_in_kvcache: torch.Tensor,
+        softmax_scale: Optional[float] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Arguments:
+        q: (batch_size, seq_len_q, num_heads_q, head_dim).
+        k_cache: (num_blocks, page_block_size, num_heads_k, head_dim).
+        cache_seqlens: (batch_size), torch.int32.
+        head_dim_v: Head dim of v.
+        tile_scheduler_metadata: (num_sm_parts, TileSchedulerMetaDataSize),
+            torch.int32, returned by get_sparse_mla_metadata.
+        num_splits: (batch_size + 1), torch.int32, returned by
+            get_sparse_mla_metadata.
+        indices_in_kvcache: (batch_size, seq_len_q, topk). KV indices when
+            sparse attention is enabled. Note that
+            indices_in_kvcache[i][j][k] =
+            (the index of the page block where token t resides) *
+            page_block_size + (the offset of token t within that page block),
+            where t is the k-th token of the j-th q-sequence in the i-th batch.
+        softmax_scale: float. Scaling of QK^T before softmax.
+            Defaults to 1 / sqrt(head_dim).
+
+    Explanation of the K/V cache layout:
+        The NoPE part of each token is quantized at 1x128 granularity,
+        yielding 512 float8_e4m3 values and 4 float32 scale factors; the
+        RoPE part is kept as 64 bfloat16 values. Each token therefore
+        occupies 656 bytes:
+        - First 512 bytes: quantized NoPE (512 x float8_e4m3)
+        - Next 16 bytes: scale factors (4 x float32)
+        - Last 128 bytes: RoPE (64 x bfloat16)
+
+    Return:
+        out: (batch_size, seq_len_q, num_heads_q, head_dim_v).
+        softmax_lse: (batch_size, num_heads_q, seq_len_q), torch.float32.
+    """
+    if softmax_scale is None:
+        softmax_scale = q.shape[-1]**(-0.5)
+    # Strict shape checks like the reference implementation
+    assert k_cache.shape[-1] == 656, (
+        "The last dim of k_cache must be 656 (= 512 + 4*4 + 2*64) when "
+        "is_fp8_kvcache is True")
+    assert k_cache.shape[-2] == 1, (
+        "The number of K heads must be 1 when is_fp8_kvcache is True")
+
+    out, softmax_lse = torch.ops._flashmla_sparse_C.fwd_kvcache_mla(
+        q, k_cache, head_dim_v, cache_seqlens, softmax_scale,
+        tile_scheduler_metadata, num_splits, indices_in_kvcache)
+    return out, softmax_lse
+
+
+def flash_mla_sparse_prefill(
+        q: torch.Tensor,
+        kv: torch.Tensor,
+        indices: torch.Tensor,
+        sm_scale: float,
+        d_v: int = 512,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Sparse attention forward operator, for prefill.
+
+    Args:
+        q: [s_q, h_q, d_qk], bfloat16
+        kv: [s_kv, h_kv, d_qk], bfloat16
+        indices: [s_q, h_kv, topk], int32. Invalid indices should be set to -1
+            or to a number >= s_kv.
+        sm_scale: float, scaling factor for the attention scores
+        d_v: dimension of the value; 512 is the default and the only
+            supported value
+
+    Returns:
+        (output, max_logits, lse)
+        - output: [s_q, h_q, d_v], bfloat16, the attention result
+        - max_logits: [s_q, h_q], float
+        - lse: [s_q, h_q], float, base-2 log-sum-exp
+    """
+    results = torch.ops._flashmla_sparse_C.sparse_topk_attn_fwd(
+        q, kv, indices, sm_scale, d_v)
+    return results
+
+
 #
 # TODO: Add fake functions
 #
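
The indices_in_kvcache formula in the flash_mla_sparse_with_kvcache docstring above can be made concrete with a small helper. This is an illustrative sketch, not part of the commit: block_table and topk_positions are assumed inputs (a per-request page table and the selected top-k token positions), and masking of padded or invalid selections is not handled here.

# Illustrative sketch (not in this PR): build indices_in_kvcache from
# top-k token positions using the rule from the docstring above:
#   index = (page block holding the token) * page_block_size
#           + (offset of the token within that page block)
import torch


def build_indices_in_kvcache(
        block_table: torch.Tensor,     # (batch_size, max_blocks), int32
        topk_positions: torch.Tensor,  # (batch_size, seq_len_q, topk), int32
        page_block_size: int = 64,
) -> torch.Tensor:
    logical_block = topk_positions // page_block_size  # slot in the block table
    offset = topk_positions % page_block_size          # position inside the block
    # Look up the physical page block for each selected token.
    physical_block = torch.gather(
        block_table.unsqueeze(1).expand(-1, topk_positions.shape[1], -1),
        dim=2,
        index=logical_block.long())
    return (physical_block * page_block_size + offset).to(torch.int32)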

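The 656-byte per-token cache layout described in that docstring can likewise be written down as plain offsets. The sketch below (also not part of the commit) slices one cached token for inspection; it assumes each group of 128 NoPE values is dequantized by multiplying with its float32 scale, which follows from the stated 1x128 quantization granularity.

# Illustrative sketch (not in this PR): split one cached token (656 uint8
# bytes) into its components, following the layout in the docstring above:
#   bytes [0, 512)   -> 512 x float8_e4m3 quantized NoPE
#   bytes [512, 528) ->   4 x float32 scale factors
#   bytes [528, 656) ->  64 x bfloat16 RoPE
import torch


def unpack_kv_token(token_bytes: torch.Tensor):
    assert token_bytes.dtype == torch.uint8 and token_bytes.numel() == 656
    nope_fp8 = token_bytes[:512].view(torch.float8_e4m3fn)  # (512,)
    scales = token_bytes[512:528].view(torch.float32)       # (4,)
    rope = token_bytes[528:656].view(torch.bfloat16)        # (64,)
    # Assumed dequantization: one scale per 1x128 group of NoPE values.
    nope = nope_fp8.to(torch.float32).reshape(4, 128) * scales[:, None]
    return nope.reshape(512), rope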