-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[Headers][NFC] Deduplicate gpu_match_ between targets via inlining #131141
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-backend-amdgpu Author: Jon Chesterfield (JonChesterfield) ChangesDeclare a few functions before including the target specific headers then define a fallback_match_any, used by amdgpu and by older nvptx. Full diff: https://github.com/llvm/llvm-project/pull/131141.diff 3 Files Affected:
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 56748f6c3e818..74054068c9714 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -30,10 +30,6 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
// Attribute to declare a function as a kernel.
#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
// Returns the number of workgroups in the 'x' dimension of the grid.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -146,20 +142,7 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
- uint32_t __match_mask = 0;
-
- bool __done = 0;
- while (__gpu_ballot(__lane_mask, !__done)) {
- if (!__done) {
- uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
- if (__first == __x) {
- __match_mask = __gpu_lane_mask();
- __done = 1;
- }
- }
- }
- __gpu_sync_lane(__lane_mask);
- return __match_mask;
+ return __gpu_fallback_match_any_u32(__lane_mask, __x);
}
// Returns a bitmask marking all lanes that have the same value of __x.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index ac79d685337c5..e4a9a49e10e1f 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -32,6 +32,52 @@ _Pragma("push_macro(\"bool\")");
#define bool _Bool
#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+// Returns the bit-mask of active threads in the current warp or wavefront.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+ bool __x);
+
+// Copies the value from the first active thread to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+
+
+// Copies the value from the first active thread to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
+
+
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+ uint32_t __match_mask = 0;
+
+ bool __done = 0;
+ while (__gpu_ballot(__lane_mask, !__done)) {
+ if (!__done) {
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+ if (__first == __x) {
+ __match_mask = __gpu_lane_mask();
+ __done = 1;
+ }
+ }
+ }
+ __gpu_sync_lane(__lane_mask);
+ return __match_mask;
+}
+
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+
#if defined(__NVPTX__)
#include <nvptxintrin.h>
#elif defined(__AMDGPU__)
@@ -115,7 +161,7 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
}
-// Copies the value from the first active thread in the wavefront to the rest.
+// Copies the value from the first active thread to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 10ad7a682d4cd..1da9402040b52 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -34,10 +34,6 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
// Attribute to declare a function as a kernel.
#define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
// Returns the number of CUDA blocks in the 'x' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
return __nvvm_read_ptx_sreg_nctaid_x();
@@ -156,20 +152,9 @@ __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
// Newer targets can use the dedicated CUDA support.
#if __CUDA_ARCH__ >= 700
return __nvvm_match_any_sync_i32(__lane_mask, __x);
+#else
+ return __gpu_fallback_match_any_u32(__lane_mask, __x);
#endif
-
- uint32_t __match_mask = 0;
- bool __done = 0;
- while (__gpu_ballot(__lane_mask, !__done)) {
- if (!__done) {
- uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
- if (__first == __x) {
- __match_mask = __gpu_lane_mask();
- __done = 1;
- }
- }
- }
- return __match_mask;
}
// Returns a bitmask marking all lanes that have the same value of __x.
|
@llvm/pr-subscribers-clang Author: Jon Chesterfield (JonChesterfield) ChangesDeclare a few functions before including the target specific headers then define a fallback_match_any, used by amdgpu and by older nvptx. Full diff: https://github.com/llvm/llvm-project/pull/131141.diff 3 Files Affected:
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 56748f6c3e818..74054068c9714 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -30,10 +30,6 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
// Attribute to declare a function as a kernel.
#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
// Returns the number of workgroups in the 'x' dimension of the grid.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -146,20 +142,7 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
- uint32_t __match_mask = 0;
-
- bool __done = 0;
- while (__gpu_ballot(__lane_mask, !__done)) {
- if (!__done) {
- uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
- if (__first == __x) {
- __match_mask = __gpu_lane_mask();
- __done = 1;
- }
- }
- }
- __gpu_sync_lane(__lane_mask);
- return __match_mask;
+ return __gpu_fallback_match_any_u32(__lane_mask, __x);
}
// Returns a bitmask marking all lanes that have the same value of __x.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index ac79d685337c5..e4a9a49e10e1f 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -32,6 +32,52 @@ _Pragma("push_macro(\"bool\")");
#define bool _Bool
#endif
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+// Returns the bit-mask of active threads in the current warp or wavefront.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+ bool __x);
+
+// Copies the value from the first active thread to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+
+
+// Copies the value from the first active thread to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
+
+
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+ uint32_t __match_mask = 0;
+
+ bool __done = 0;
+ while (__gpu_ballot(__lane_mask, !__done)) {
+ if (!__done) {
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+ if (__first == __x) {
+ __match_mask = __gpu_lane_mask();
+ __done = 1;
+ }
+ }
+ }
+ __gpu_sync_lane(__lane_mask);
+ return __match_mask;
+}
+
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+
#if defined(__NVPTX__)
#include <nvptxintrin.h>
#elif defined(__AMDGPU__)
@@ -115,7 +161,7 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
}
-// Copies the value from the first active thread in the wavefront to the rest.
+// Copies the value from the first active thread to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 10ad7a682d4cd..1da9402040b52 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -34,10 +34,6 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
// Attribute to declare a function as a kernel.
#define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
// Returns the number of CUDA blocks in the 'x' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
return __nvvm_read_ptx_sreg_nctaid_x();
@@ -156,20 +152,9 @@ __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
// Newer targets can use the dedicated CUDA support.
#if __CUDA_ARCH__ >= 700
return __nvvm_match_any_sync_i32(__lane_mask, __x);
+#else
+ return __gpu_fallback_match_any_u32(__lane_mask, __x);
#endif
-
- uint32_t __match_mask = 0;
- bool __done = 0;
- while (__gpu_ballot(__lane_mask, !__done)) {
- if (!__done) {
- uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
- if (__first == __x) {
- __match_mask = __gpu_lane_mask();
- __done = 1;
- }
- }
- }
- return __match_mask;
}
// Returns a bitmask marking all lanes that have the same value of __x.
|
4225329
to
e1456be
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
28cb801
to
cde4232
Compare
cde4232
to
5e55b82
Compare
clang/lib/Headers/gpuintrin.h
Outdated
|
||
// Returns a bitmask marking all lanes that have the same value of __x. | ||
_DEFAULT_FN_ATTRS static __inline__ uint64_t | ||
__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x); | |
__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, done
5e55b82
to
fbeb177
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/88/builds/9091 Here is the relevant piece of the build log for the reference
|
…lvm#131141) Declare a few functions before including the target specific headers then define a fallback_match_{any,all} used by amdgpu and by older nvptx. Fixes a minor bug on pre-volta where one of the four fallback paths was missing a sync_lane.
Declare a few functions before including the target specific headers then define a fallback_match_{any,all} used by amdgpu and by older nvptx.
Fixes a minor bug on pre-volta where one of the four fallback paths was missing a sync_lane.