Merged · 17 commits · Changes from all commits
2 changes: 1 addition & 1 deletion benchmark/matmul/benchmark_matmul_sp.py
@@ -192,7 +192,7 @@ def main(

# Clear out the accumulation buffer
T.clear(C_local)
T.no_set_max_nreg()
T.disable_warp_group_reg_alloc()

T.use_swizzle(panel_size=10, enable=enable_rasterization)
T.annotate_layout({
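
For readers tracking the API change across these files: T.no_set_max_nreg() is renamed to T.disable_warp_group_reg_alloc(), which, judging by the old and new names, opts a kernel out of the warp-group register reallocation (setmaxnreg) that warp-specialized kernels otherwise request. A minimal sketch of where the call sits in a TileLang kernel; the GEMM body, tile sizes, and buffer names are illustrative and assume the current tilelang.language API rather than anything added by this PR:

import tilelang
import tilelang.language as T

@tilelang.jit(out_idx=[-1])
def matmul(M, N, K, block_M=128, block_N=128, block_K=32,
           dtype="float16", accum_dtype="float"):

    @T.prim_func
    def main(A: T.Tensor((M, K), dtype), B: T.Tensor((K, N), dtype),
             C: T.Tensor((M, N), accum_dtype)):
        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M),
                      threads=256) as (bx, by):
            A_shared = T.alloc_shared((block_M, block_K), dtype)
            B_shared = T.alloc_shared((block_K, block_N), dtype)
            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)

            T.clear(C_local)
            # Renamed intrinsic (was T.no_set_max_nreg()): keep the default
            # register budget instead of letting warp groups re-partition it.
            T.disable_warp_group_reg_alloc()

            for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=2):
                T.copy(A[by * block_M, k * block_K], A_shared)
                T.copy(B[k * block_K, bx * block_N], B_shared)
                T.gemm(A_shared, B_shared, C_local)

            T.copy(C_local, C[by * block_M, bx * block_N])

    return main
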
@@ -52,7 +52,7 @@ def main_no_split(
T.fill(acc_o, 0)
T.fill(logsum, 0)
T.fill(scores_max, -T.infinity(accum_dtype))
T.no_set_max_nreg()
T.disable_warp_group_reg_alloc()
loop_range = T.ceildiv(seqlen_kv, block_N)
for k in T.Pipelined(loop_range, num_stages=2):
T.copy(KV[bx, k * block_N:(k + 1) * block_N, cur_kv_head, :], qKV_shared)
13 changes: 9 additions & 4 deletions examples/deepseek_nsa/example_tilelang_nsa_decode.py
@@ -8,7 +8,14 @@
tilelang.testing.set_random_seed(42)


@tilelang.jit(out_idx=[-1])
# TODO(lei): workaround, as threads is not divisible by warp group size,
# auto warp specialization may have some bugs.
@tilelang.jit(
out_idx=[-1],
pass_configs={
tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
})
def native_sparse_attention(
batch,
heads,
@@ -22,7 +29,7 @@ def native_sparse_attention(
if scale is None:
scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e)
head_kv = heads // groups
# Modified shapes for inference (q has seq_len=1)
# Modified shapes for inference (q has seq_len=1)a
q_shape = [batch, 1, heads, dim] # Changed seq_len to 1
kv_shape = [batch, seq_len, head_kv, dim]
block_indices_shape = [batch, 1, head_kv, selected_blocks] # Changed seq_len to 1
@@ -167,8 +174,6 @@ def main():
block_counts=block_counts,
block_size=block_size,
)
print("out", out)
print("ref", ref)
torch.testing.assert_close(ref, out, atol=1e-2, rtol=1e-2)


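
The TODO added to this example explains the decorator change: the kernel's thread count is not a multiple of the warp-group size, so automatic warp specialization (and the TMA lowering that rides on it) is disabled through pass_configs. A hedged sketch of how that condition could be checked explicitly; the helper name, the 128-thread warp-group constant (4 warps of 32), and the usage line are assumptions for illustration, not code from this PR:

import tilelang

WARP_GROUP_SIZE = 128  # assumed: one warp group = 4 warps x 32 threads

def nsa_decode_pass_configs(threads: int) -> dict:
    """Return the workaround pass configs only when the launch cannot be
    split into whole warp groups, mirroring the TODO above."""
    if threads % WARP_GROUP_SIZE != 0:
        return {
            tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
            tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
        }
    return {}

# usage (hypothetical):
# @tilelang.jit(out_idx=[-1], pass_configs=nsa_decode_pass_configs(threads))
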
@@ -338,7 +338,7 @@ def main(
C_shared: tilelang.layout.make_swizzled_layout(C_shared),
})
if threads == 512:
T.no_set_max_nreg()
T.disable_warp_group_reg_alloc()

T.clear(C_local)
for k in T.Pipelined(K // block_K, num_stages=num_stages):
8 changes: 3 additions & 5 deletions examples/flash_attention/example_gqa_bwd.py
@@ -1,7 +1,6 @@
import torch
import torch.nn.functional as F
import tilelang
from tilelang.autotuner import *
import tilelang.language as T
import argparse

@@ -340,11 +339,10 @@ def main(BATCH: int = 1,
dK_ref, K.grad = K.grad.clone(), None
dV_ref, V.grad = V.grad.clone(), None

assert torch.allclose(O, O_ref, rtol=1e-2, atol=1e-2)
torch.testing.assert_close(O, O_ref, rtol=1e-2, atol=1e-2)
torch.testing.assert_close(dV, dV_ref, rtol=1e-2, atol=1e-2)
assert torch.allclose(dV, dV_ref, rtol=1e-2, atol=1e-2)
assert torch.allclose(dK, dK_ref, rtol=1e-2, atol=1e-2)
assert torch.allclose(dQ, dQ_ref, rtol=1e-2, atol=1e-2)
torch.testing.assert_close(dK, dK_ref, rtol=1e-2, atol=1e-2)
torch.testing.assert_close(dQ, dQ_ref, rtol=1e-2, atol=1e-2)

def run():
O_ref.backward(dO, retain_graph=True)
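
On the assertion change in this file: torch.testing.assert_close keeps the same rtol/atol tolerances as the old assert torch.allclose(...) lines, but on failure it reports how many elements mismatch and the greatest absolute/relative difference, and it also checks dtype and device by default. A small, self-contained illustration (the tensors are made up):

import torch

ref = torch.tensor([1.0, 2.0, 3.0])
out = ref + 1e-3  # within the tolerances below

# Old style: a bare AssertionError with no detail about what differed.
assert torch.allclose(out, ref, rtol=1e-2, atol=1e-2)

# New style: same check, but a failing comparison raises an error that
# lists the mismatched-element count and the largest abs/rel difference.
torch.testing.assert_close(out, ref, rtol=1e-2, atol=1e-2)
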
2 changes: 1 addition & 1 deletion examples/gdn/example_chunk_o.py
@@ -122,7 +122,7 @@ def kernel(

T.clear(A_fragment)
T.clear(O_fragment)
T.no_set_max_nreg()
T.disable_warp_group_reg_alloc()
for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages):
T.copy(
Q[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK],
2 changes: 1 addition & 1 deletion examples/gdn/example_chunk_scaled_dot_kkt.py
@@ -101,7 +101,7 @@ def kernel(
})

T.fill(A_fragment, 0)
T.no_set_max_nreg()
T.disable_warp_group_reg_alloc()
for i_s in T.Parallel(block_S):
Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh]

2 changes: 1 addition & 1 deletion examples/gdn/example_wy_fast.py
@@ -107,7 +107,7 @@ def kernel(
U_Beta_shared: tilelang.layout.make_swizzled_layout(U_Beta_shared),
})

T.no_set_max_nreg()
T.disable_warp_group_reg_alloc()
for i_s in T.Parallel(block_S):
Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh]
G_shared[i_s] = T.exp(G[bb, bs * block_S + i_s, bh])
1 change: 0 additions & 1 deletion examples/seer_attention/block_sparse_attn_tilelang.py
@@ -178,7 +178,6 @@ def test_topk_sparse_attention():
# Run tilelang kernel
kernel = blocksparse_flashattn(
BATCH, N_HEADS, SEQ_LEN, SEQ_LEN, D_HEAD, downsample_len, is_causal=True)
print(kernel.get_kernel_source())
tilelang_output = kernel(q, k, v, block_mask.to(torch.int8))

# Compute reference
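
The unconditional print of the generated kernel source is dropped from this test. If the dump is still occasionally useful, it can be gated rather than restored; a small sketch, where the environment-variable name is an arbitrary choice and kernel is the compiled object returned by blocksparse_flashattn(...) above:

import os

# Dump the generated source only when explicitly requested, e.g.
#   TILELANG_DUMP_SOURCE=1 python block_sparse_attn_tilelang.py
if os.environ.get("TILELANG_DUMP_SOURCE") == "1":
    print(kernel.get_kernel_source())
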
30 changes: 14 additions & 16 deletions src/op/atomic_add.cc
@@ -182,27 +182,25 @@ For AtomicAdd::MakeSIMTLoop(arith::Analyzer *analyzer) const {

Stmt AtomicAdd::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {
Target target = T.target;
bool is_cpu_target = target->GetTargetDeviceType() == kDLCPU;
auto simt_loop = MakeSIMTLoop(analyzer);
auto fused_loop = Downcast<For>(ParallelLoopFuser::Fuse(simt_loop));
For vectorized_thread_loop;
auto par_op = std::make_unique<ParallelOp>(fused_loop);

if (!is_cpu_target) {
std::vector<InferLevel> levels = {InferLevel::kCommon, InferLevel::kStrict,
InferLevel::kFree};
for (auto level : levels) {
par_op->InferLayout(
{T.target, T.thread_bounds, T.layout_map, T.buffer_remap}, level);
}
auto loop_layout = par_op->GetLoopLayout();
Var thread_var = T.thread_var;
Range thread_bounds = T.thread_bounds;
auto thread_loop =
PartitionLoop(par_op->GetRoot(), T.thread_var, analyzer, loop_layout);
vectorized_thread_loop = VectorizeAtomicAdd(
thread_loop, thread_var, thread_bounds, GetArchInt(target));
std::vector<InferLevel> levels = {InferLevel::kCommon, InferLevel::kStrict,
InferLevel::kFree};
for (auto level : levels) {
par_op->InferLayout(
{T.target, T.thread_bounds, T.layout_map, T.buffer_remap}, level);
}
auto loop_layout = par_op->GetLoopLayout();
Var thread_var = T.thread_var;
Range thread_bounds = T.thread_bounds;
auto thread_loop =
PartitionLoop(par_op->GetRoot(), T.thread_var, analyzer, loop_layout);
// TODO(@dyq): buggy implementation, need to fix
// vectorized_thread_loop = VectorizeAtomicAdd(
// thread_loop, thread_var, thread_bounds, GetArchInt(target));
auto vectorized_thread_loop = VectorizeLoop(thread_loop);

Comment on lines +200 to 204

💡 Verification agent

🧩 Analysis chain

Replacing the specialized AtomicAdd vectorizer with the generic VectorizeLoop likely breaks vectorization semantics

The previous path used VectorizeAtomicAdd which:

  • plans a vector size based on arch and dtype,
  • and rewrites call_extern("AtomicAdd", ...) into AtomicAddx2/AtomicAddx4 with pointer-to-src conventions.

Switching to VectorizeLoop(thread_loop) bypasses that rewriter. Unless VectorizeLoop knows how to handle AtomicAdd and emit AtomicAddx{2,4}, vectorization will either be skipped or produce invalid IR for the extern call.

Reinstate the specialized vectorizer here until VectorizeLoop subsumes it:

-  // TODO(@dyq): buggy implementation, need to fix
-  // vectorized_thread_loop = VectorizeAtomicAdd(
-  //     thread_loop, thread_var, thread_bounds, GetArchInt(target));
-  auto vectorized_thread_loop = VectorizeLoop(thread_loop);
+  // Use the specialized AtomicAdd vectorizer pending a robust generic path.
+  auto vectorized_thread_loop =
+      VectorizeAtomicAdd(thread_loop, thread_var, thread_bounds,
+                         GetArchInt(target));
  • Confirm that AtomicAddx2/AtomicAddx4 are defined/linked in the runtime.

  • If you intend to keep VectorizeLoop, point me to the lowering that handles AtomicAdd so I can review that path.


🏁 Script executed:

#!/bin/bash
rg -nP '\bAtomicAddx(2|4)\b' -g 'src/**|include/**' -C2

Length of output: 206


🏁 Script executed:

#!/bin/bash
# Properly search for AtomicAddx2/4 definitions or references across the entire repository
rg -nP '\bAtomicAddx(2|4)\b' -C2 || echo "No matches found in repository"

# Additionally, constrain search to src and include directories
rg -nP '\bAtomicAddx(2|4)\b' -g 'src/**' -g 'include/**' -C2 || echo "No matches found in src/ or include/"

Length of output: 4981


🏁 Script executed:

#!/bin/bash
# Search for the definition and usage of VectorizeLoop in the source
rg -n -C3 'VectorizeLoop' -g 'src/**' || echo "VectorizeLoop not found in src/"

Length of output: 6630


Reinstate the specialized AtomicAdd vectorizer to preserve correct semantics

The change from VectorizeAtomicAdd to a plain VectorizeLoop bypasses the custom rewriting that emits AtomicAddx2/AtomicAddx4, leading to either skipped vectorization or invalid IR for the extern call.

• We verified that AtomicAddx2 and AtomicAddx4 are indeed defined and used in the codebase:

  • In the generic customization layer (tilelang), they’re generated by calls in tilelang/language/customize.py.
  • The specialized mutator in src/transform/atomicadd_vectorize.cc pushes "AtomicAddx2"/"AtomicAddx4" into the call arguments based on vector size.
  • Their device implementations live in src/tl_templates/cuda/common.h, defined as TL_DEVICE void AtomicAddx2(…) and …AtomicAddx4(…).

• The generic VectorizeLoop implementation (in src/transform/loop_vectorize.cc) contains no special handling for extern calls to "AtomicAdd"; it merely vectorizes based on arithmetic patterns, so it won’t rewrite or emit the correct AtomicAddxN calls.

Action: Revert to the specialized vectorizer call in src/op/atomic_add.cc until the generic path is extended to cover AtomicAdd.

--- a/src/op/atomic_add.cc
+++ b/src/op/atomic_add.cc
@@ -200,7 +200,10 @@ Stmt MakeAtomicAddOp(...){
-  // TODO(@dyq): buggy implementation, need to fix
-  // vectorized_thread_loop = VectorizeAtomicAdd(
-  //     thread_loop, thread_var, thread_bounds, GetArchInt(target));
-  auto vectorized_thread_loop = VectorizeLoop(thread_loop);
+  // Use the specialized AtomicAdd vectorizer to emit AtomicAddx2/4 calls
+  auto vectorized_thread_loop =
+      VectorizeAtomicAdd(thread_loop, thread_var, thread_bounds,
+                         GetArchInt(target));
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
// TODO(@dyq): buggy implementation, need to fix
// vectorized_thread_loop = VectorizeAtomicAdd(
// thread_loop, thread_var, thread_bounds, GetArchInt(target));
auto vectorized_thread_loop = VectorizeLoop(thread_loop);
// Use the specialized AtomicAdd vectorizer to emit AtomicAddx2/4 calls
auto vectorized_thread_loop =
VectorizeAtomicAdd(thread_loop, thread_var, thread_bounds,
GetArchInt(target));
🤖 Prompt for AI Agents
In src/op/atomic_add.cc around lines 200 to 204, the code currently uses
VectorizeLoop which skips the specialized AtomicAdd rewriting and produces
incorrect IR for extern AtomicAdd calls; restore the original specialized
vectorizer by calling VectorizeAtomicAdd(thread_loop, thread_var, thread_bounds,
GetArchInt(target)) (and remove or update the TODO/commented buggy note) so the
mutator emits AtomicAddx2/AtomicAddx4 as before until the generic vectorizer is
enhanced to handle AtomicAdd.

if (par_op->GetPredicate(T.thread_var).defined()) {
return IfThenElse(par_op->GetPredicate(T.thread_var).value(),
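
For context on what this lowering path serves: at the Python level, per-element atomic accumulation is typically written with T.atomic_add, and AtomicAdd::Lower is what turns the resulting loop into extern AtomicAdd calls that the specialized vectorizer can fuse into AtomicAddx2/AtomicAddx4. A rough sketch of such a producer kernel; the buffer names and shapes are illustrative, and the exact T.atomic_add signature is assumed from the tilelang/language/customize.py layer cited in the review:

import tilelang
import tilelang.language as T

@tilelang.jit
def tile_accumulate(M, N, block_M=128, block_N=128, dtype="float32"):
    # Assumes M and N are multiples of the block sizes.
    @T.prim_func
    def main(Src: T.Tensor((M, N), dtype), Dst: T.Tensor((M, N), dtype)):
        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M),
                      threads=128) as (bx, by):
            for i, j in T.Parallel(block_M, block_N):
                # Each update lowers to an extern AtomicAdd call; the
                # specialized vectorizer discussed above is what may fuse
                # adjacent lanes into AtomicAddx2 / AtomicAddx4.
                T.atomic_add(Dst[by * block_M + i, bx * block_N + j],
                             Src[by * block_M + i, bx * block_N + j])

    return main
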
4 changes: 3 additions & 1 deletion src/op/builtin.cc
@@ -29,6 +29,8 @@ TVM_REGISTER_PASS_CONFIG_OPTION(kPtxasRegisterUsageLevel, Integer);
TVM_REGISTER_PASS_CONFIG_OPTION(kEnablePTXASVerboseOutput, Bool);
TVM_REGISTER_PASS_CONFIG_OPTION(kDisableShuffleElect, Bool);

DataType cuTensorMapType() { return DataType::UInt(8, 128); }

#define TIR_DEFINE_TL_BUILTIN(OpName) \
const Op &OpName() { \
static const Op &op = Op::Get("tl." #OpName); \
@@ -78,7 +80,7 @@ TIR_DEFINE_TL_BUILTIN(mbarrier_expect_tx)
.set_attr<TCallEffectKind>("TCallEffectKind",
Integer(CallEffectKind::kOpaque));

TIR_DEFINE_TL_BUILTIN(ptx_ldmatirx)
TIR_DEFINE_TL_BUILTIN(ptx_ldmatrix)
.set_num_inputs(4)
.set_attr<TCallEffectKind>("TCallEffectKind",
Integer(CallEffectKind::kOpaque));
16 changes: 13 additions & 3 deletions src/op/builtin.h
@@ -15,6 +15,8 @@ namespace tl {

namespace attr {
static constexpr const char *kPaddingMap = "padding_map";
static constexpr const char *kWarpSpecializationScope =
"kWarpSpecializationScope";
} // namespace attr

static constexpr const char *kDebugMergeSharedMemoryAllocations =
@@ -54,6 +56,14 @@ static constexpr const char *kDisableDynamicTailSplit =
*/
static constexpr const char *kDynamicAlignment = "tl.dynamic_alignment";

/*!
* \brief Get the type of the CUDA tensor map
*
* DataType cuTensorMapType()
*
*/
DataType cuTensorMapType();

/*!
* \brief tvm intrinsics for TMADescriptor creation for tiled load
*
@@ -138,15 +148,15 @@ TVM_DLL const Op &mbarrier_expect_tx();
/*!
* \brief tvm intrinsics for ldmatrix
*
* ptx_ldmatirx(transposed, num, shared_addr, local_addr)
* ptx_ldmatrix(transposed, num, shared_addr, local_addr)
*
*/
TVM_DLL const Op &ptx_ldmatirx();
TVM_DLL const Op &ptx_ldmatrix();

/*!
* \brief tvm intrinsics for stmatrix
*
* ptx_ldmatirx(transposed, num, shared_addr, int32_values...)
* ptx_ldmatrix(transposed, num, shared_addr, int32_values...)
*
*/
TVM_DLL const Op &ptx_stmatrix();