Skip to content

Commit ee01a93

Browse files
committed
Enhance bulk copy and store checks in Copy class
- Updated scope validation for source and destination tensors in `CheckBulkLoad` and `CheckBulkStore` methods to include both `shared.dyn` and `shared` as valid options.
- Modified `CheckLDSMCopy` and `CheckSTSMCopy` methods to accommodate the new scope validation, ensuring compatibility with shared memory configurations.
- Improved logging in `LowerBulkCopy` to provide clearer warnings regarding unsupported swizzle layouts, including source and destination names for better debugging.
1 parent 00b2fd8 commit ee01a93

File tree

10 files changed

+28
-19
lines changed

10 files changed

+28
-19
lines changed

benchmark/matmul/benchmark_matmul_sp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ def main(
192192

193193
# Clear out the accumulation buffer
194194
T.clear(C_local)
195-
T.no_set_max_nreg()
195+
T.disable_warp_group_reg_alloc()
196196

197197
T.use_swizzle(panel_size=10, enable=enable_rasterization)
198198
T.annotate_layout({

examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def main_no_split(
5252
T.fill(acc_o, 0)
5353
T.fill(logsum, 0)
5454
T.fill(scores_max, -T.infinity(accum_dtype))
55-
T.no_set_max_nreg()
55+
T.disable_warp_group_reg_alloc()
5656
loop_range = T.ceildiv(seqlen_kv, block_N)
5757
for k in T.Pipelined(loop_range, num_stages=2):
5858
T.copy(KV[bx, k * block_N:(k + 1) * block_N, cur_kv_head, :], qKV_shared)

examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,7 @@ def main(
338338
C_shared: tilelang.layout.make_swizzled_layout(C_shared),
339339
})
340340
if threads == 512:
341-
T.no_set_max_nreg()
341+
T.disable_warp_group_reg_alloc()
342342

343343
T.clear(C_local)
344344
for k in T.Pipelined(K // block_K, num_stages=num_stages):

examples/gdn/example_chunk_o.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ def kernel(
122122

123123
T.clear(A_fragment)
124124
T.clear(O_fragment)
125-
T.no_set_max_nreg()
125+
T.disable_warp_group_reg_alloc()
126126
for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages):
127127
T.copy(
128128
Q[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK],

examples/gdn/example_chunk_scaled_dot_kkt.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def kernel(
101101
})
102102

103103
T.fill(A_fragment, 0)
104-
T.no_set_max_nreg()
104+
T.disable_warp_group_reg_alloc()
105105
for i_s in T.Parallel(block_S):
106106
Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh]
107107

examples/gdn/example_wy_fast.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def kernel(
107107
U_Beta_shared: tilelang.layout.make_swizzled_layout(U_Beta_shared),
108108
})
109109

110-
T.no_set_max_nreg()
110+
T.disable_warp_group_reg_alloc()
111111
for i_s in T.Parallel(block_S):
112112
Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh]
113113
G_shared[i_s] = T.exp(G[bb, bs * block_S + i_s, bh])

src/op/copy.cc

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -363,8 +363,8 @@ bool Copy::CheckBulkLoad(Target target) const {
363363
// 1. arch must have bulk copy support
364364
if (!TargetHasBulkCopy(target))
365365
return false;
366-
// 2. src and dst must be shared.dyn and local.fragment
367-
if (src.scope() != "global" || dst.scope() != "shared.dyn")
366+
// 2. src and dst must be global and shared
367+
if (src.scope() != "global" || (dst.scope() != "shared.dyn" && dst.scope() != "shared"))
368368
return false;
369369
// 3. check shape.
370370
// TODO(lei): validate if we can utilize tma under this shape.
@@ -391,7 +391,7 @@ bool Copy::CheckBulkStore(Target target) const {
391391
if (!TargetHasBulkCopy(target))
392392
return false;
393393
// 2. src and dst must be shared.dyn and local.fragment
394-
if (src.scope() != "shared.dyn" || dst.scope() != "global")
394+
if ((src.scope() != "shared.dyn" && src.scope() != "shared") || dst.scope() != "global")
395395
return false;
396396
// 3. check shape.
397397
// TODO(lei): validate if we can utilize tma under this shape.
@@ -414,7 +414,7 @@ bool Copy::CheckBulkStore(Target target) const {
414414
* otherwise.
415415
*/
416416
bool Copy::CheckLDSMCopy(Target target) const {
417-
return TargetHasLdmatrix(target) && src.scope() == "shared.dyn" &&
417+
return TargetHasLdmatrix(target) && (src.scope() == "shared.dyn" || src.scope() == "shared") &&
418418
dst.scope() == "local.fragment";
419419
}
420420

@@ -428,7 +428,7 @@ bool Copy::CheckLDSMCopy(Target target) const {
428428
*/
429429
bool Copy::CheckSTSMCopy(Target target) const {
430430
return TargetHasStmatrix(target) && src.scope() == "local.fragment" &&
431-
dst.scope() == "shared.dyn";
431+
(dst.scope() == "shared.dyn" || dst.scope() == "shared");
432432
}
433433

434434
/*!
@@ -883,11 +883,7 @@ Stmt Copy::LowerBulkCopy(const LowerArgs &T, arith::Analyzer *analyzer,
883883
ICHECK(stride != nullptr && continuous != nullptr);
884884
// We also need to check if the shape satisfies the following doc:
885885
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html#group__CUDA__TENSOR__MEMORY_1ga7c7d2aaac9e49294304e755e6f341d7
886-
if (StructuralEqual()(shared_layout, makeGemmABLayoutPadded(
887-
*stride, *continuous,
888-
shared_tensor->dtype.bits()))) {
889-
desc.swizzle = static_cast<int>(CU_TENSOR_MAP_SWIZZLE_NONE);
890-
} else if (StructuralEqual()(
886+
if (StructuralEqual()(
891887
shared_layout,
892888
makeQuarterBankSwizzleLayout(*stride, *continuous,
893889
shared_tensor->dtype.bits()))) {
@@ -902,9 +898,18 @@ Stmt Copy::LowerBulkCopy(const LowerArgs &T, arith::Analyzer *analyzer,
902898
makeFullBankSwizzleLayout(*stride, *continuous,
903899
shared_tensor->dtype.bits()))) {
904900
desc.swizzle = static_cast<int>(CU_TENSOR_MAP_SWIZZLE_128B);
901+
} else if (StructuralEqual()(shared_layout, makeGemmABLayoutPadded(
902+
*stride, *continuous,
903+
shared_tensor->dtype.bits()))) {
904+
LOG(WARNING) << "Bulk copy cannot support a padded layout for src: "
905+
<< src->name << ", dst: " << dst->name
906+
<< ", fallback to normal copy";
907+
return LowerNormalCopy(T, analyzer);
905908
} else {
906909
LOG(WARNING)
907-
<< "Came across unsupported swizzle layout, fallback to normal copy";
910+
<< "Came across unsupported swizzle layout for src: "
911+
<< src->name << ", dst: " << dst->name
912+
<< ", fallback to normal copy";
908913
return LowerNormalCopy(T, analyzer);
909914
}
910915
}

testing/python/tilelibrary/test_tilelang_tilelibrary_gemm_sp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def main(
7070
backend="cutlass",
7171
block_k=block_K),
7272
})
73-
T.no_set_max_nreg()
73+
T.disable_warp_group_reg_alloc()
7474
T.clear(C_local)
7575
for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
7676
T.copy(E[by * block_M, k * block_K // E_factor], E_shared)

testing/python/transform/test_tilelang_transform_inject_set_max_nreg.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ def before_no_set_max_nreg(A: T.Tensor((512, 512), "float16")):
100100
T.writes()
101101

102102
# Add no_set_max_nreg to disable register hinting
103-
T.no_set_max_nreg()
103+
T.disable_warp_group_reg_alloc()
104104

105105
T.create_list_of_mbarrier(128, 128)
106106
T.attr([128, 128], "kWarpSpecializationScope", 0)

tilelang/language/builtin.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,10 @@ def no_set_max_nreg():
159159
"""
160160
return tir.call_intrin("handle", tir.op.Op.get("tl.no_set_max_nreg"))
161161

162+
def disable_warp_group_reg_alloc():
163+
"""Disable the warp group reg alloc.
164+
"""
165+
return no_set_max_nreg()
162166

163167
def mbarrier_wait_parity(mbarrier: Union[int, PrimExpr, tir.Call], parity: Union[int, Var]):
164168
"""Wait for memory barrier parity condition.

0 commit comments

Comments (0)