3. **Multi-Query Attention** (`sparse_mla_fwd.py`, `sparse_mla_fwd_pipelined.py`, and `sparse_mla_bwd.py`) - Core attention mechanism implementation with sparse MLA (Multi-head Latent Attention) forward and backward passes
### Lightning Indexer
```python
for i_i in T.serial(T.ceildiv(NI, 2)):
    ...
```
Consumer threads wait on barriers and process buffers as they become ready. This manual orchestration hides memory latency behind compute, which is why it outperforms the simpler auto-pipelined version. The output dimension is also split in half so that the two consumer groups can work in parallel on different parts of the matmul.
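As a rough mental model of this producer/consumer scheme, here is a plain-Python analogy of the double-buffered hand-off (CPU threads and queues stand in for warp groups and hardware barriers, and only one consumer is shown; none of this is TileLang code):

```python
import threading
import queue

# Ping-pong scheme: two buffers, a producer filling them, a consumer draining them.
NUM_TILES = 8
buffers = [None, None]
free_slots = [queue.Queue(maxsize=1) for _ in range(2)]  # "buffer i is free" signal
full_slots = [queue.Queue(maxsize=1) for _ in range(2)]  # "buffer i is ready" signal
for sig in free_slots:
    sig.put(True)  # both buffers start out free

def producer():
    for tile in range(NUM_TILES):
        slot = tile % 2
        free_slots[slot].get()          # wait until the consumer has released this buffer
        buffers[slot] = f"tile-{tile}"  # stands in for the async copy from global memory
        full_slots[slot].put(True)      # signal "data ready" to the consumer

def consumer():
    for tile in range(NUM_TILES):
        slot = tile % 2
        full_slots[slot].get()          # wait on the "ready" barrier
        _ = buffers[slot]               # stands in for the matmul/softmax work on this tile
        free_slots[slot].put(True)      # release the buffer so the next load can start

threads = [threading.Thread(target=producer), threading.Thread(target=consumer)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```

While the consumer works on one buffer, the producer is already filling the other, which is the latency-hiding effect described above.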
### Sparse MLA Backward
The Sparse MLA backward kernel (`sparse_mla_bwd.py`) computes gradients with respect to queries (dQ) and key-values (dKV) for the sparse attention mechanism. Like the forward pass, it processes only the selected top-k indices, maintaining O(seq_len * topk) complexity.
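To put the O(seq_len * topk) claim in perspective, here is a quick back-of-the-envelope count of query-key pairs (the sequence length and top-k below are made-up illustration values, not the kernel's defaults):

```python
seq_len, topk = 65536, 2048          # illustrative values only

dense_pairs = seq_len * seq_len      # full attention: every query scores every key
sparse_pairs = seq_len * topk        # sparse MLA: every query scores only its top-k keys

print(dense_pairs // sparse_pairs)   # 32, i.e. 32x fewer pairs at this length
```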
The backward pass consists of three main stages:
**1. Preprocessing**: Computes delta values (row-wise dot products of output and output gradient):
```python
# Accumulate the elementwise product O * dO over the head dimension D,
# one block_ND-wide tile at a time, with software pipelining across stages.
for k in T.Pipelined(T.ceildiv(D, block_ND), num_stages=num_stages):
    T.copy(O[bz, by * block_ND:(by + 1) * block_ND, bx, k * block_ND:(k + 1) * block_ND], o)
    T.copy(dO[bz, by * block_ND:(by + 1) * block_ND, bx, k * block_ND:(k + 1) * block_ND], do)
    for i, j in T.Parallel(block_ND, block_ND):
        acc[i, j] += o[i, j] * do[i, j]
# Row-wise reduction of the accumulated products yields delta for each query row.
T.reduce_sum(acc, delta, 1)
```
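This is the standard FlashAttention-style backward preprocessing step, and the whole block collapses to one line of plain PyTorch (the tensor names and the `[batch, seq, heads, dim]` layout below are assumed for illustration, not taken from the kernel):

```python
import torch

# Illustrative shapes: O and dO are [batch, seq_len, heads, dim].
O = torch.randn(2, 128, 8, 64)
dO = torch.randn_like(O)

# delta[b, s, h] = sum_d O[b, s, h, d] * dO[b, s, h, d]
delta = (O * dO).sum(dim=-1)
```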
**2. Main Backward Computation**: Computes gradients through sparse attention:
```python
# Sparse MLA backward: iterate over selected indices only
for i_i in T.Pipelined(NI, num_stages=num_stages):
    ...
```
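To make the gradient math concrete, here is a minimal, unfused PyTorch reference for one head of sparse MLA-style attention backward; the function name, shapes, and the split of the latent `kv` into a score dimension `D` and a value slice of width `dv` are illustrative assumptions rather than the kernel's actual interface:

```python
import torch

def sparse_mla_bwd_reference(q, kv, indices, dO, dv, scale):
    """Unfused reference: q [seq_q, D], kv [seq_kv, D], indices [seq_q, topk],
    dO [seq_q, dv]. Scores use all D dims of kv; values use its first dv dims."""
    dQ = torch.zeros_like(q)
    dKV = torch.zeros_like(kv)

    for t in range(q.shape[0]):                 # the kernel parallelizes over queries
        idx = indices[t]                        # [topk] selected key positions
        k = kv[idx]                             # [topk, D]
        v = kv[idx, :dv]                        # [topk, dv]

        # Recompute the forward probabilities for this query.
        s = (q[t] @ k.T) * scale                # [topk]
        p = torch.softmax(s, dim=-1)            # [topk]
        o = p @ v                               # [dv]

        # Softmax backward, using delta = <o, dO[t]> from the preprocessing stage.
        delta = (o * dO[t]).sum()
        dp = dO[t] @ v.T                        # [topk]
        ds = p * (dp - delta)                   # [topk]

        # Gradients w.r.t. the query and the selected kv rows.
        dQ[t] = scale * (ds @ k)                         # [D]
        dkv_rows = scale * ds[:, None] * q[t][None, :]   # dK part, [topk, D]
        dkv_rows[:, :dv] += p[:, None] * dO[t][None, :]  # dV part lands in the first dv dims

        # Scatter-add into dKV; in the fused kernel these become atomic updates.
        dKV.index_add_(0, idx, dkv_rows)

    return dQ, dKV
```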
**Performance**: The sparse MLA backward pass achieves:

- **H800 SXM**: ~100 TFlops
- **H200 SXM**: ~115 TFlops

The implementation efficiently handles the irregular memory access patterns inherent in sparse attention while maintaining high compute utilization through careful memory management and atomic update strategies. Note that this is a relatively naive implementation that requires further optimization.