tile-ai
diff --git a/‎examples/deepseek_nsa/example_tilelang_nsa_decode.py‎
Lines changed: 2 additions & 3 deletions b/‎examples/deepseek_nsa/example_tilelang_nsa_decode.py‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎src/op/copy.cc‎
Lines changed: 2 additions & 0 deletions b/‎src/op/copy.cc‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/transform/warp_specialized_rewriter.cc‎
Lines changed: 25 additions & 2 deletions b/‎src/transform/warp_specialized_rewriter.cc‎
Lines changed: 25 additions & 2 deletions
@@ -8,7 +8,8 @@
 tilelang.testing.set_random_seed(42)
 
 
-# TODO(@yu): checkout tma with nsa
+# TODO(lei): workaround. The thread count is not divisible by the warp group
+# size, so auto warp specialization may be buggy for this kernel.
 @tilelang.jit(
     out_idx=[-1],
     pass_configs={
@@ -173,8 +174,6 @@ def main():
         block_counts=block_counts,
         block_size=block_size,
     )
-    print("out", out)
-    print("ref", ref)
     torch.testing.assert_close(ref, out, atol=1e-2, rtol=1e-2)
 
 
 
@@ -881,6 +881,8 @@ Stmt Copy::LowerBulkCopy(const LowerArgs &T, arith::Analyzer *analyzer,
     auto stride = as_const_int(shared_layout->InputShape()[0]);
     auto continuous = as_const_int(shared_layout->InputShape()[1]);
     ICHECK(stride != nullptr && continuous != nullptr);
+    // We also need to check if the shape satisfies the following doc:
+    // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html#group__CUDA__TENSOR__MEMORY_1ga7c7d2aaac9e49294304e755e6f341d7
     if (StructuralEqual()(shared_layout, makeGemmABLayoutPadded(
                                              *stride, *continuous,
                                              shared_tensor->dtype.bits()))) {
 
@@ -1405,17 +1405,36 @@ class WarpSpecializedRewriter : public StmtExprMutator {
 
 class WarpSpecializedDetector : public IRVisitorWithAnalyzer {
 public:
+  // Returns true when auto warp specialization (aws) must be disabled.
   static bool Detect(Stmt stmt, bool skip_thread_partition = false) {
     WarpSpecializedDetector detector;
     detector.VisitStmt(stmt);
-    return detector.has_warp_specialization_ ||
-           (detector.has_tma_op_ && detector.has_mbarrier_op_);
+    if (!detector.num_threads_is_divisible_by_warp_group_) {
+      LOG(WARNING)
+          << "Auto warp specialization will be disabled because the number of "
+             "threads "
+          << detector.thread_var_->dom->extent
+          << " is not divisible by warp group size";
+      return true;
+    }
+    if (detector.has_warp_specialization_) {
+      LOG(WARNING) << "Auto warp specialization will be disabled because warp "
+                      "specialization is manually enabled";
+      return true;
+    }
+    if (detector.has_tma_op_ && detector.has_mbarrier_op_) {
+      LOG(WARNING) << "Auto warp specialization will be disabled because TMA "
+                      "and mbarrier are both present";
+      return true;
+    }
+    return false;
   }
 
   WarpSpecializedDetector() {
     has_tma_op_ = false;
     has_mbarrier_op_ = false;
     has_warp_specialization_ = false;
+    num_threads_is_divisible_by_warp_group_ = false;
   }
 
 private:
@@ -1449,6 +1468,8 @@ class WarpSpecializedDetector : public IRVisitorWithAnalyzer {
       if (iv->thread_tag == "threadIdx.x") {
         ICHECK(iv->dom->extent.as<IntImmNode>());
         thread_var_ = iv;
+        num_threads_is_divisible_by_warp_group_ =
+            iv->dom->extent.as<IntImmNode>()->value % warp_group_size_ == 0;
       }
     }
     IRVisitorWithAnalyzer::VisitStmt_(op);
@@ -1458,6 +1479,8 @@ class WarpSpecializedDetector : public IRVisitorWithAnalyzer {
   IterVar thread_var_;
   bool has_mbarrier_op_{false};
   bool has_warp_specialization_{false};
+  bool num_threads_is_divisible_by_warp_group_{false};
+  const int warp_group_size_ = 128;
 };
 
 using namespace tir::transform;