Skip to content

Commit 9bfbcd6

Browse files
author
Varun Sundar Rabindranath
committed
Add warp mask for __syncwarp
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
1 parent 9af0e0d commit 9bfbcd6

File tree

2 files changed

+15
-5
lines changed

csrc/kernels/intranode.cu

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,10 @@ combine(dtype_t* recv_x, float* recv_topk_weights,
600600

601601
// TMA stuffs
602602
#ifndef DISABLE_SM90_FEATURES
603+
auto set_n_bits = [](const int x) -> uint32_t {
604+
static constexpr uint64_t one = 1;
605+
return static_cast<uint32_t>((one << x) - one);
606+
};
603607
extern __shared__ __align__(1024) uint8_t smem_buffer[];
604608
auto tma_buffer = smem_buffer + (thread_id / 32) * kNumTMABytesPerWarp;
605609
#endif
@@ -839,24 +843,27 @@ combine(dtype_t* recv_x, float* recv_topk_weights,
839843
out_dtypes[j] = static_cast<dtype_t>(values[j]);
840844

841845
#ifndef DISABLE_SM90_FEATURES
846+
const int num_participating_threads = min(32, hidden_int4 - i);
847+
const unsigned int warp_mask = set_n_bits(num_participating_threads);
848+
842849
// Wait TMA arrival
843850
if (lane_id == 0)
844851
tma_store_wait<kNumStages - 1>();
845-
__syncwarp();
852+
__syncwarp(warp_mask);
846853

847854
// Write into TMA buffer
848855
auto tma_stage_idx = (i / 32) % kNumStages;
849856
reinterpret_cast<int4*>(tma_buffer)[tma_stage_idx * 32 + lane_id] = out_int4;
850857

851858
// Issue TMA
852859
tma_store_fence();
853-
__syncwarp();
860+
__syncwarp(warp_mask);
854861
if (lane_id == 0) {
855862
auto tma_bytes = min(32, hidden_int4 - i) * static_cast<int>(sizeof(int4));
856863
tma_store_1d(reinterpret_cast<int4*>(tma_buffer) + tma_stage_idx * 32,
857864
recv_int4 + token_idx * hidden_int4 + i, tma_bytes, false);
858865
}
859-
__syncwarp();
866+
__syncwarp(warp_mask);
860867
#else
861868
recv_int4[token_idx * hidden_int4 + i] = out_int4;
862869
#endif

tests/test_intranode.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,11 @@ def test_main(args: argparse.Namespace, num_sms: int, local_rank: int, num_ranks
2525
# Random data
2626
x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') * rank
2727
x_pure_rand = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
28-
x_e4m3 = per_token_cast_to_fp8(x) if deep_ep.Buffer.is_sm90_compiled() else None
29-
x_e4m3 = (x_e4m3[0], x_e4m3[1].T.contiguous().T) if x_e4m3 is not None else None
28+
29+
x_e4m3 = None
30+
if hidden % 128 == 0:
31+
x_e4m3 = per_token_cast_to_fp8(x) if deep_ep.Buffer.is_sm90_compiled() else None
32+
x_e4m3 = (x_e4m3[0], x_e4m3[1].T.contiguous().T) if x_e4m3 is not None else None
3033
scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device='cuda').abs() + 1
3134
topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=False)[1]
3235
topk_weights = torch.ones((num_tokens, num_topk), dtype=torch.float32, device='cuda') * rank

0 commit comments

Comments
 (0)