
Commit 8fbe1b3

[Refactor] Add kernel selection option for GEMM v1 in environment settings (#1200)
* Add kernel selection option for GEMM v1 in environment settings
  - Introduced the `TILELANG_USE_GEMM_V1` environment variable to control the selection of the GEMM version.
  - Added a `use_gemm_v1` method to the `Environment` class to determine whether GEMM v1 should be used, based on the environment variable.
  - Updated the GEMM function assignment to default to v2, allowing v1 to be forced via the new environment variable.
* bug fix
* Add kernel selection option for GEMM in environment settings
  - Introduced the `TILELANG_USE_GEMM_V1` environment variable to let users select between the GEMM v1 and v2 implementations.
  - Updated the `gemm` function to default to v2 but switch to v1 if the environment variable is set to a truthy value.
  - Added a `use_gemm_v1` method to the `Environment` class to facilitate this selection.
* Refactor GEMM macro generator to use BufferRegion instead of Buffer
  - Updated the `wgmma` and `wgmma_rs` methods in `TensorCoreIntrinEmitter` to accept `BufferRegion` parameters instead of `Buffer`.
  - Adjusted related calls in `GemmWGMMA` to ensure compatibility with the new parameter types.
  - Simplified buffer access logic for better clarity and maintainability.
* Refactor GEMM functions to utilize BufferRegion for improved memory handling
  - Updated `run_gemm`, `run_gemm_rs`, `run_gemm_sr`, and `run_gemm_rr` to set `num_stages` based on block dimensions, improving performance for larger matrices.
  - Simplified calls to GEMM functions by removing redundant parameters and ensuring compatibility with BufferRegion.
  - Introduced utility functions for converting between Buffer, BufferLoad, and BufferRegion, improving code clarity and maintainability.
  - Enhanced error handling for full-region checks in GEMM operations to ensure correct memory access.
* Refactor GEMM code for improved readability and consistency
  - Cleaned up formatting and spacing in GEMM-related files.
  - Standardized comments and code structure across GEMM functions and macros.
  - Enhanced error messages for clarity in buffer-region checks.
  - Removed redundant lines and improved overall maintainability.
* Update GEMM correctness evaluation and macro generator for improved functionality
  - Modified `N_VALUES` in `correctness_evaluation_sm70.py` to include only the sizes relevant for tests.
  - Updated the test function call in `correctness_evaluation.py` to use `test_gemm_false_true` for better accuracy in testing.
  - Refactored buffer handling in `mma_sm70_macro_generator.py` to improve clarity and consistency in shared-buffer access.
  - Enhanced `gemm_mma_sm70.py` to enforce full-region checks for input and output buffers, improving correctness of GEMM operations.
* Refactor GEMM and intrinsic files for improved clarity and functionality
  - Removed the unused variable `A_stride_last` in `mma_sm70_macro_generator.py`.
  - Adjusted function signature formatting in `swizzle.py` for better readability.
  - Restored the return of `GemmWGMMA` in `__init__.py` for correct GEMM instantiation.
  - Removed the unused variable `B_buf` in `gemm_mma_sm70.py`.
  - Improved function signature formatting in `language.py` for consistency.
* Enhance GEMM and MMA functionality for FP64 support
  - Refactored `GemmNode` to streamline the decision-making process for GEMM instruction selection.
  - Added support for FP64 inputs in the MMA dispatcher, enabling new tensor operations.
  - Introduced a new layout function for FP64 in `mma_layout.py` to facilitate shared-memory storage.
  - Updated `TensorCoreIntrinEmitter` to handle FP64 data types, including adjustments to micro-tile dimensions and loading mechanisms.
  - Enhanced utility functions to accommodate FP64 index mapping for shared-memory operations.
* lint fix
* Refactor GEMM correctness evaluation and shared memory alignment handling
  - Reverted the GEMM function call in `correctness_evaluation.py` to the original implementation for consistency.
  - Added a helper function in `merge_shared_memory_allocations.cc` to streamline the marking of shared variables under alignment scope.
  - Enhanced the `VisitExpr_` methods to ensure proper handling of shared-memory alignment for `BufferLoadNode` and `VarNode` types.
  - Cleaned up commented-out test code in `correctness_evaluation.py` for better readability.
* Enhance GEMM and MMA implementations with region-based memory handling
  - Updated GEMM and MMA classes to use BufferRegion for input and output buffers, improving memory management and supporting strided GEMM operations.
  - Added checks to ensure full-region compliance for input buffers, enhancing correctness of matrix multiplication.
  - Implemented clear-accumulation functionality to reset output buffers before accumulation, ensuring accurate GEMM results.
* Refactor test_tilelang_example_deepseek_v32.py to improve import structure and function calls
  - Updated import statements to reference modules directly instead of individual test functions, enhancing clarity.
  - Modified function calls to use the new module structure for better organization and maintainability.
* Enhance OnArrayDeclaration method to handle repeated buffer declarations
  - Updated the `OnArrayDeclaration` method to merge metadata for buffers that may appear in multiple Allocate statements, improving robustness against upstream transformations.
  - Added logic to prefer concrete element data types and to record extents when previously unknown.
* Add abbreviation for bfloat16 data type in mfma_macro_generator.py
  - Introduced the abbreviation "bf16" for the bfloat16 data type in `mfma_macro_generator.py`, improving clarity and consistency of data-type representation.
* Refactor CodeGenTileLangHIP to enhance dtype handling and mfma call generation
  - Introduced a mapping function to normalize input data types to their corresponding scalar types, improving compatibility with MfmaTraits.
  - Updated the mfma call generation to use the new mapping, streamlining the code.
  - Removed the outdated dtype mapping in favor of a more flexible approach that supports additional data types such as FP8.
* lint fix
* Enhance backend configuration in CMakeLists.txt and improve dtype handling in CodeGenTileLangHIP
  - Introduced a macro to define backend options for CUDA, ROCm, and Metal, allowing user overrides and caching of settings.
  - Updated the logic to track user-selected backends and conditionally enable defaults based on environment variables.
  - Refactored dtype handling in CodeGenTileLangHIP to streamline mfma call generation.
  - Added bfloat16 support in `mfma_macro_generator.py` for consistent data-type representation.
* Update bfloat16 handling in CodeGenTileLangHIP and mfma_macro_generator.py
  - Changed the representation of bfloat16 in CodeGenTileLangHIP from "bfloat16x4" to "bfloat16x4_vec" for improved clarity.
  - Adjusted `mfma_suffix` generation in `mfma_macro_generator.py` to drop the underscore before "bf16", matching HIP intrinsic naming.
* Change logging level from WARNING to DLOG in LegalizeNegativeIndex for non-negative index checks, reducing log verbosity.
* Refactor attention sink examples to simplify index calculations
  - Updated index handling in `example_gqa_sink_bwd_bhsd.py` and `example_mha_sink_bwd_bhsd.py` to eliminate unnecessary local allocations and streamline the logic for determining start and end indices.
  - Improved readability by computing index bounds for pipelined loops directly instead of through local variables.
* Refactor attention sink examples to streamline index calculations
  - Simplified index handling in `example_gqa_sink_bwd_bhsd.py`, `example_gqa_sink_fwd_bhsd_wgmma_pipelined.py`, `example_mha_sink_bwd_bhsd.py`, `example_mha_sink_fwd_bhsd_wgmma_pipelined.py`, and `example_mha_sink_fwd_bhsd.py` by removing unnecessary local allocations for start and end indices.
  - Enhanced readability by directly calculating index bounds for pipelined loops.
* lint fix
* bugfix
* Refactor reduce operation handling in CUDA and Python
  - Removed outdated shared-memory reduction logic from `reduce.cc`.
  - Introduced fragment allocation and improved buffer handling in `reduce.py` to support shared and fragment scopes.
  - Updated the CUDA header to define a wider accumulator type for better numerical accuracy.
  - Enhanced error handling for buffer scope validation in the reduction process.
* Fix ReduceOpNode to correctly compute AbsMax by using absolute values of inputs
* Enhance unit loop handling by refining annotation checks
  - Updated the condition for identifying effectively empty annotations in unit loops to include cases where only the `pragma_unroll_explicit` hint is present.
  - Introduced a new method, `IsEffectivelyEmptyAnnotation`, to encapsulate this logic, improving clarity and maintainability.
* clean code
1 parent 2b1f599 commit 8fbe1b3
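
The first entry in the message above describes the selection mechanism only in prose. Below is a minimal sketch of how an environment-driven switch like this can be wired up, assuming only the names the commit message gives (`TILELANG_USE_GEMM_V1` and `Environment.use_gemm_v1`); the stub implementations, truthy-value handling, and module layout are illustrative, not tilelang's actual code:

```python
import os


def _gemm_v1(*args, **kwargs):
    ...  # stand-in for the legacy v1 implementation


def _gemm_v2(*args, **kwargs):
    ...  # stand-in for the default v2 implementation


class Environment:
    """Hypothetical sketch of the environment-variable plumbing."""

    @staticmethod
    def use_gemm_v1() -> bool:
        # Accept common truthy spellings; unset or falsy selects v2.
        value = os.environ.get("TILELANG_USE_GEMM_V1", "0").strip().lower()
        return value in ("1", "true", "on", "yes")


# Default to v2; v1 is forced by exporting TILELANG_USE_GEMM_V1=1.
gemm = _gemm_v1 if Environment.use_gemm_v1() else _gemm_v2
```

Binding `gemm` once at import time, as sketched here, means the variable must be set before the module is imported; re-checking inside each call would trade a small per-call overhead for runtime switchability.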


49 files changed: +2029 additions, -1319 deletions

.gitignore

Lines changed: 3 additions & 0 deletions

```diff
@@ -105,3 +105,6 @@ cmake-build-*/
 
 # Git version for sdist
 .git_commit.txt
+
+# pre-commit cache
+.pre-commit-cache/*
```

CMakeLists.txt

Lines changed: 81 additions & 20 deletions

```diff
@@ -65,9 +65,50 @@ else()
 endif()
 
 # Configs
-set(USE_CUDA OFF)
-set(USE_ROCM OFF)
-set(USE_METAL OFF)
+set(TILELANG_BACKENDS CUDA ROCM METAL)
+
+set(TILELANG_BACKEND_DOC_CUDA "Enable CUDA backend (ON/OFF/or CUDA SDK path)")
+set(TILELANG_BACKEND_DOC_ROCM "Enable ROCm backend (ON/OFF/or ROCm SDK path)")
+set(TILELANG_BACKEND_DOC_METAL "Enable Metal backend")
+
+# TVM's config.cmake redefines USE_* options later, so we cache the user's choice
+# (including explicit -DUSE_XXX arguments) before we include TVM and restore it
+# afterwards.
+
+macro(tilelang_define_backend_option BACKEND)
+  set(_backend_var "USE_${BACKEND}")
+  set(_doc "${TILELANG_BACKEND_DOC_${BACKEND}}")
+  set(_user_override_var "TILELANG_USER_OVERRIDE_${_backend_var}")
+
+  set(_user_override OFF)
+  if(DEFINED ${_user_override_var})
+    set(_user_override "${${_user_override_var}}")
+  endif()
+
+  if(DEFINED CACHE{${_backend_var}})
+    get_property(_cache_type CACHE ${_backend_var} PROPERTY TYPE)
+    if(_cache_type STREQUAL "UNINITIALIZED")
+      set(_user_override ON)
+    endif()
+  endif()
+
+  set(_default OFF)
+  if(DEFINED ${_backend_var})
+    set(_default "${${_backend_var}}")
+  endif()
+
+  option(${_backend_var} "${_doc}" "${_default}")
+  # Remember if the user explicitly set this option so that later logic
+  # won't auto-toggle backends they configured on the command line.
+  set(${_user_override_var} ${_user_override} CACHE INTERNAL
+      "User explicitly set ${_backend_var} during configuration" FORCE)
+  set(TILELANG_OPTION_${_backend_var} "${${_backend_var}}")
+endmacro()
+
+foreach(BACKEND IN LISTS TILELANG_BACKENDS)
+  tilelang_define_backend_option(${BACKEND})
+endforeach()
+
 set(PREBUILD_CYTHON ON)
 # Configs end
 
@@ -78,6 +119,14 @@ if(EXISTS ${TVM_SOURCE}/cmake/config.cmake)
 else()
   message(FATAL_ERROR "Nor tvm provided or submodule checkout-ed.")
 endif()
+# Re-apply TileLang's preferred backend settings after TVM's config may have
+# overridden the USE_* cache entries.
+foreach(BACKEND IN LISTS TILELANG_BACKENDS)
+  set(_backend_var "USE_${BACKEND}")
+  set(_doc "${TILELANG_BACKEND_DOC_${BACKEND}}")
+  set(${_backend_var} ${TILELANG_OPTION_${_backend_var}} CACHE STRING "${_doc}" FORCE)
+  set(${_backend_var} ${TILELANG_OPTION_${_backend_var}})
+endforeach()
 
 # Include directories for TileLang
 set(TILE_LANG_INCLUDES ${TVM_INCLUDES})
@@ -95,23 +144,35 @@ file(GLOB TILE_LANG_SRCS
   src/target/intrin_rule*.cc
 )
 
-# Backend-specific checks and configs
-if($ENV{USE_METAL})
-  set(USE_METAL ON)
-elseif(APPLE)
-  message(STATUS "Enable Metal support by default.")
-  set(USE_METAL ON)
-elseif($ENV{USE_ROCM})
-  set(USE_ROCM ON)
-else()
-  if($ENV{USE_CUDA})
-    set(USE_CUDA ON)
-  elseif(DEFINED ENV{USE_CUDA} AND NOT $ENV{USE_CUDA})
-    # Build CPU-only when we explicitly disable CUDA
-    set(USE_CUDA OFF)
+# Track if the user explicitly selected a backend via cache options.
+set(TILELANG_BACKEND_USER_SELECTED OFF)
+foreach(BACKEND IN LISTS TILELANG_BACKENDS)
+  set(_backend_var "USE_${BACKEND}")
+  set(_override_var "TILELANG_USER_OVERRIDE_${_backend_var}")
+  if(${_backend_var} OR ${_override_var})
+    set(TILELANG_BACKEND_USER_SELECTED ON)
+  endif()
+endforeach()
+
+# Only auto-select a backend when the user didn't specify one explicitly.
+if(NOT TILELANG_BACKEND_USER_SELECTED)
+  if($ENV{USE_METAL})
+    set(USE_METAL ON)
+  elseif(APPLE)
+    message(STATUS "Enable Metal support by default.")
+    set(USE_METAL ON)
+  elseif($ENV{USE_ROCM})
+    set(USE_ROCM ON)
   else()
-    message(STATUS "Enable CUDA support by default.")
-    set(USE_CUDA ON)
+    if($ENV{USE_CUDA})
+      set(USE_CUDA ON)
+    elseif(DEFINED ENV{USE_CUDA} AND NOT $ENV{USE_CUDA})
+      # Build CPU-only when we explicitly disable CUDA
+      set(USE_CUDA OFF)
+    else()
+      message(STATUS "Enable CUDA support by default.")
+      set(USE_CUDA ON)
+    endif()
   endif()
 endif()
 
@@ -125,7 +186,7 @@ if(USE_METAL)
 elseif(USE_ROCM)
   set(CMAKE_HIP_STANDARD 17)
   include(${TVM_SOURCE}/cmake/utils/FindROCM.cmake)
-  find_rocm($ENV{USE_ROCM})
+  find_rocm(${USE_ROCM})
   add_compile_definitions(__HIP_PLATFORM_AMD__ __HIP_PLATFORM_HCC__=1)
 
   file(GLOB TILE_LANG_HIP_SRCS
```

examples/attention_sink/example_gqa_sink_bwd_bhsd.py

Lines changed: 8 additions & 14 deletions

```diff
@@ -81,13 +81,10 @@ def flash_fwd(
                 sinks[i] = Sinks[by]
 
             end = T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N))
-            start = T.alloc_local([1], 'int32')
-            if window_size is not None:
-                start[0] = T.max(0, (bx * block_M - window_size) // block_N)
-            else:
-                start[0] = 0
+            start = T.max(0,
+                          (bx * block_M - window_size) // block_N) if window_size is not None else 0
 
-            for k in T.Pipelined(start[0], end, num_stages=num_stages):
+            for k in T.Pipelined(start, end, num_stages=num_stages):
                 T.copy(K[bz, by // groups, k * block_N:(k + 1) * block_N, :], K_shared)
                 for i, j in T.Parallel(block_M, block_N):
                     q_idx = bx * block_M + i
@@ -266,14 +263,11 @@ def flash_bwd(
             T.clear(dk)
 
             loop_st = T.floordiv(by * block_M, block_N)
-            loop_ed = T.alloc_local([1], 'int32')
-            if window_size is not None:
-                loop_ed[0] = T.min(
-                    T.ceildiv((by + 1) * block_M + window_size, block_N),
-                    T.ceildiv(seq_len, block_N))
-            else:
-                loop_ed[0] = T.ceildiv(seq_len, block_N)
-            for k in T.Pipelined(loop_st, loop_ed[0], num_stages=num_stages):
+            loop_ed = T.min(
+                T.ceildiv((by + 1) * block_M + window_size, block_N), T.ceildiv(
+                    seq_len, block_N)) if window_size is not None else T.ceildiv(seq_len, block_N)
+
+            for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages):
                 T.copy(Q[bz, bx, k * block_N:(k + 1) * block_N, :], q)
                 T.clear(qkT)
                 T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow)
```
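
This diff and the remaining attention-sink diffs below all apply the same pattern: a one-element `T.alloc_local` buffer plus an `if`/`else` collapses into a Python conditional expression, which is valid because `window_size` is a plain Python value (`int` or `None`) fixed when the kernel is traced, not a device-side quantity. A plain-Python stand-in for the start-bound computation, with hypothetical example values in place of tilelang expressions:

```python
def loop_start(bx: int, block_M: int, block_N: int, window_size: int | None = None) -> int:
    # One expression, evaluated at kernel-trace time, replaces the
    # local buffer and the branch from the old code.
    return max(0, (bx * block_M - window_size) // block_N) if window_size is not None else 0


# Unwindowed attention starts at block 0; a 128-wide window over
# 64-row M-blocks and 32-column N-blocks starts at block 2 for bx=3.
assert loop_start(bx=3, block_M=64, block_N=32) == 0
assert loop_start(bx=3, block_M=64, block_N=32, window_size=128) == 2
```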

examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py

Lines changed: 3 additions & 6 deletions

```diff
@@ -172,14 +172,11 @@ def main(
             end = T.min(
                 T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N))
 
-            start = T.alloc_local([1], 'int32')
-            if window_size is not None:
-                start[0] = T.max(0, (bx * block_M + past_len - window_size) // block_N)
-            else:
-                start[0] = 0
+            start = T.max(0, (bx * block_M + past_len - window_size) //
+                          block_N) if window_size is not None else 0
 
             for k in T.Pipelined(
-                    start[0],
+                    start,
                     end,
                     num_stages=num_stages,
                     order=[-1, 0, 3, 1, -1, 2],
```

examples/attention_sink/example_mha_sink_bwd_bhsd.py

Lines changed: 7 additions & 14 deletions

```diff
@@ -78,13 +78,10 @@ def flash_fwd(
                 sinks[i] = Sinks[by]
 
             end = T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N))
-            start = T.alloc_local([1], 'int32')
-            if window_size is not None:
-                start[0] = T.max(0, (bx * block_M - window_size) // block_N)
-            else:
-                start[0] = 0
+            start = T.max(0,
+                          (bx * block_M - window_size) // block_N) if window_size is not None else 0
 
-            for k in T.Pipelined(start[0], end, num_stages=num_stages):
+            for k in T.Pipelined(start, end, num_stages=num_stages):
                 T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared)
                 for i, j in T.Parallel(block_M, block_N):
                     q_idx = bx * block_M + i
@@ -267,14 +264,10 @@ def flash_bwd(
             T.clear(dk)
 
             loop_st = T.floordiv(by * block_M, block_N)
-            loop_ed = T.alloc_local([1], 'int32')
-            if window_size is not None:
-                loop_ed[0] = T.min(
-                    T.ceildiv((by + 1) * block_M + window_size, block_N),
-                    T.ceildiv(seq_len, block_N))
-            else:
-                loop_ed[0] = T.ceildiv(seq_len, block_N)
-            for k in T.Pipelined(loop_st, loop_ed[0], num_stages=num_stages):
+            loop_ed = T.min(
+                T.ceildiv((by + 1) * block_M + window_size, block_N), T.ceildiv(
+                    seq_len, block_N)) if window_size is not None else T.ceildiv(seq_len, block_N)
+            for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages):
                 T.copy(Q[bz, bx, k * block_N:(k + 1) * block_N, :], q)
                 T.clear(qkT)
                 T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow)
```

examples/attention_sink/example_mha_sink_fwd_bhsd.py

Lines changed: 3 additions & 6 deletions

```diff
@@ -162,13 +162,10 @@ def main(
             end = T.min(
                 T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N))
 
-            start = T.alloc_local([1], 'int32')
-            if window_size is not None:
-                start[0] = T.max(0, (bx * block_M + past_len - window_size) // block_N)
-            else:
-                start[0] = 0
+            start = T.max(0, (bx * block_M + past_len - window_size) //
+                          block_N) if window_size is not None else 0
 
-            for k in T.Pipelined(start[0], end, num_stages=num_stages):
+            for k in T.Pipelined(start, end, num_stages=num_stages):
                 MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz)
                 Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum,
                         logsum)
```

examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py

Lines changed: 3 additions & 6 deletions

```diff
@@ -165,14 +165,11 @@ def main(
             end = T.min(
                 T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N))
 
-            start = T.alloc_local([1], 'int32')
-            if window_size is not None:
-                start[0] = T.max(0, (bx * block_M + past_len - window_size) // block_N)
-            else:
-                start[0] = 0
+            start = T.max(0, (bx * block_M + past_len - window_size) //
+                          block_N) if window_size is not None else 0
 
             for k in T.Pipelined(
-                    start[0],
+                    start,
                     end,
                     num_stages=num_stages,
                     order=[-1, 0, 3, 1, -1, 2],
```

examples/deepseek_v32/test_tilelang_example_deepseek_v32.py

Lines changed: 10 additions & 10 deletions

```diff
@@ -1,41 +1,41 @@
 # ruff: noqa
 import tilelang.testing
 
-from topk_selector import test_topk_selector
-from fp8_lighting_indexer import test_fp8_lighting_indexer
-from sparse_mla_fwd import test_sparse_mla_fwd
-from sparse_mla_fwd_pipelined import test_sparse_mla_fwd_pipelined
-from sparse_mla_bwd import test_sparse_mla_bwd
+import topk_selector
+import fp8_lighting_indexer
+import sparse_mla_fwd
+import sparse_mla_fwd_pipelined
+import sparse_mla_bwd
 
 
 def test_example_topk_selector():
-    test_topk_selector()
+    topk_selector.test_topk_selector()
 
 
 def test_example_fp8_lighting_indexer():
-    test_fp8_lighting_indexer(S=512, SKV=1024, H=32, HKV=1, D=64, kv_stride=1)
+    fp8_lighting_indexer.test_fp8_lighting_indexer(S=512, SKV=1024, H=32, HKV=1, D=64, kv_stride=1)
 
 
 @tilelang.testing.requires_cuda
 @tilelang.testing.requires_cuda_compute_version_ge(9, 0)
 def test_example_sparse_mla_fwd():
     # small shapes for testing
-    test_sparse_mla_fwd(
+    sparse_mla_fwd.test_sparse_mla_fwd(
         S=256, SKV=1024, H=64, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False)
 
 
 @tilelang.testing.requires_cuda
 @tilelang.testing.requires_cuda_compute_version_ge(9, 0)
 def test_example_sparse_mla_fwd_pipelined():
     # small shapes for testing
-    test_sparse_mla_fwd_pipelined(
+    sparse_mla_fwd_pipelined.test_sparse_mla_fwd_pipelined(
         S=256, SKV=512, H=64, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False)
 
 
 @tilelang.testing.requires_cuda
 @tilelang.testing.requires_cuda_compute_version_ge(9, 0)
 def test_example_sparse_mla_bwd():
-    test_sparse_mla_bwd(
+    sparse_mla_bwd.test_sparse_mla_bwd(
         S=256, SKV=512, H=64, HKV=1, DQKV=576, DV=512, topk=256, check_correctness=False)
 
 
```
examples/linear_attention/example_linear_attn_fwd.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -80,7 +80,6 @@ def fused_chunk_linear_attn_fwd(
                 T.atomic_add(
                     O[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_v * BV:(i_v + 1) * BV],
                     o_shared)
-                #TODO: consider using vectorized atomic add or tma reduce for sm90
 
             # Output final state
             T.copy(h, final_state[i_b, i_h, i_k * BK:(i_k + 1) * BK, i_v * BV:(i_v + 1) * BV])
@@ -91,6 +90,7 @@
 def tl_fused_chunk_fwd(q, k, v):
     B, S, H, D = q.shape
     kernel = tl_fused_chunk_fwd_kernel(B, S, H, D, D)
+    print(kernel.get_kernel_source())
     o = torch.zeros((B, S, H, D), device='cuda', dtype=torch.float32)
     h = kernel(q, k, v, o)
     return o, h
```

examples/linear_attention/example_retention_fwd.py

Lines changed: 0 additions & 7 deletions

```diff
@@ -51,13 +51,6 @@ def chunk_retention_fwd(
             o = T.alloc_fragment([chunk_size, BV], accum_dtype)
             T.clear(h)
 
-            T.annotate_layout({
-                q: tl.layout.make_swizzled_layout(q),
-                k: tl.layout.make_swizzled_layout(k),
-                v: tl.layout.make_swizzled_layout(v),
-                h_shared: tl.layout.make_swizzled_layout(h_shared),
-                s_shared: tl.layout.make_swizzled_layout(s_shared),
-            })
             T.use_swizzle(10)
 
             for i in T.Pipelined(0, NT):
```
