- 
        Couldn't load subscription status. 
- Fork 286
[Bugfix] Add missing FP8 header include #752
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0933f4d
              4f26bf6
              00c76f5
              701c291
              9fc010a
              374b0c7
              7a51333
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| #pragma once | ||
|  | ||
| #if __CUDA_ARCH_LIST__ >= 900 | ||
| #include "cute/arch/cluster_sm90.hpp" | ||
| #include "cutlass/cutlass.h" | ||
|  | ||
| namespace tl { | ||
| // Template parameter: | ||
| // thread_extent: the logical size (in number of threads) of each "group" | ||
| // within which we want to elect exactly ONE representative | ||
| // thread. | ||
| template <int thread_extent> TL_DEVICE bool tl_shuffle_elect() { | ||
|  | ||
| // Special case: thread_extent == 0 means "elect exactly one thread | ||
| // in the entire thread block", i.e., the leader of the first warp of the | ||
| // block. | ||
| if constexpr (thread_extent == 0) { | ||
| // cutlass::canonical_warp_idx_sync(): | ||
| // Returns the warp ID within the thread block in a "canonical" way | ||
| // (0 for the first warp, 1 for the second, ...). | ||
| // cute::elect_one_sync(): | ||
| // Elect exactly one lane in the warp to return true (typically lane 0), | ||
| // other lanes return false. | ||
| // The condition ensures that: | ||
| // (1) We are in warp 0 of the block. | ||
| // (2) We are the elected lane in this warp. | ||
| return cutlass::canonical_warp_idx_sync() == 0 && cute::elect_one_sync(); | ||
| } | ||
|  | ||
| // General case: thread_extent != 0 | ||
| // (threadIdx.x / 32) is the warp index in the block. | ||
| // (thread_extent / 32) is the number of warps in one group of size | ||
| // thread_extent. We take warp_id % num_warps_in_group to get the warp's index | ||
| // within the group. | ||
| // __shfl_sync(mask, value, srcLane): broadcast 'value' from srcLane to all | ||
| // lanes in the warp. Here it broadcasts the group-local warp index from lane | ||
| // 0. Comparing to 0 selects only the group's warp 0. | ||
| return __shfl_sync(0xffffffff, // full warp mask | ||
| (threadIdx.x / 32) % | ||
| (thread_extent / 32), // warp index within group | ||
| 0 // take the value from lane 0 | ||
| ) == 0 && | ||
| // Within that group leader warp, elect exactly one lane (typically | ||
| // lane 0) to be the single representative for the group. | ||
| cute::elect_one_sync(); | ||
| } | ||
|         
                  coderabbitai[bot] marked this conversation as resolved.
              Show resolved
            Hide resolved 
      Comment on lines
    
      +30
     to 
      +46
    
   There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fix correctness: modulo-by-zero risk and 3D block handling in group election. 
 The diff in my other comment rewrites this logic safely with a  🤖 Prompt for AI Agents | ||
|  | ||
| template <uint32_t RegCount> TL_DEVICE void warpgroup_reg_alloc() { | ||
| asm volatile("setmaxnreg.inc.sync.aligned.u32 %0;\n" : : "n"(RegCount)); | ||
| } | ||
|  | ||
| template <uint32_t RegCount> TL_DEVICE void warpgroup_reg_dealloc() { | ||
| asm volatile("setmaxnreg.dec.sync.aligned.u32 %0;\n" : : "n"(RegCount)); | ||
| } | ||
| } // namespace tl | ||
| #endif | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🛠️ Refactor suggestion
Make the header self-contained and guard architecture checks safely.
common.h,TL_DEVICEis undefined. Provide fallbacks here to avoid order-dependence.__CUDA_ARCH_LIST__may be undefined. Use a defensive guard that also supports the standard__CUDA_ARCH__.<stdint.h>to guaranteeuint32_tavailability.Apply this diff at the top of the file:
📝 Committable suggestion
🤖 Prompt for AI Agents