Commit c9d7f70
[Headers][NFC] Deduplicate gpu_match_ between targets via inlining (#131141)

Declare a few functions before including the target-specific headers, then define fallback __gpu_match_{any,all}_u{32,64}_impl functions used by amdgpu and by older nvptx. Fixes a minor bug on pre-Volta targets where one of the four fallback paths was missing a __gpu_sync_lane.
1 parent 2044dd0 commit c9d7f70
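
The deduplication leans on a standard C idiom: a static inline function may be forward-declared, called, and only defined later in the same translation unit. Below is a minimal standalone sketch of that pattern (hypothetical names, not the real headers):

/* umbrella.h -- forward-declare the shared fallback before pulling in the
 * target header, mirroring what gpuintrin.h now does. */
#include <stdint.h>

static inline uint64_t __fallback_op(uint64_t __mask); /* declared, defined below */

/* -- what a target header may contain: it can call the fallback because the
 * declaration is already visible at this point. -- */
static inline uint64_t __target_op(uint64_t __mask) {
  return __fallback_op(__mask);
}

/* -- back in the umbrella header, after the target include: the single
 * shared definition. -- */
static inline uint64_t __fallback_op(uint64_t __mask) { return __mask ^ 1; }

Because everything is static inline, each translation unit sees exactly one definition, and targets that never call the fallback simply drop it.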

File tree

3 files changed: +93, -81 lines

clang/lib/Headers/amdgpuintrin.h

Lines changed: 4 additions & 40 deletions
@@ -30,10 +30,6 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -146,57 +142,25 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
 }
 
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint64_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
 }
 
 // Returns the current lane mask if every lane contains __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  __gpu_sync_lane(__lane_mask);
-  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
 }
 
 // Returns the current lane mask if every lane contains __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
-  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  __gpu_sync_lane(__lane_mask);
-  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
 }
 
 // Returns true if the flat pointer points to AMDGPU 'shared' memory.
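
The AMDGPU entry points keep their names and semantics and now merely delegate, so existing callers are unaffected. A hedged usage sketch, not from the commit — the kernel is hypothetical, while the __gpu_* calls are taken from these headers:

#include <gpuintrin.h>

/* Each thread asks which lanes in its wave hold the same input value,
 * then stores the size of that group of peers. */
__gpu_kernel void count_equal_peers(const uint32_t *__in, uint32_t *__out) {
  uint32_t __id =
      __gpu_block_id_x() * __gpu_num_threads_x() + __gpu_thread_id_x();
  uint64_t __peers = __gpu_match_any_u32(__gpu_lane_mask(), __in[__id]);
  __out[__id] = (uint32_t)__builtin_popcountll(__peers);
}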

clang/lib/Headers/gpuintrin.h

Lines changed: 81 additions & 1 deletion
@@ -32,6 +32,30 @@ _Pragma("push_macro(\"bool\")");
 #define bool _Bool
 #endif
 
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+// Forward declare a few functions for the implementation header.
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x);
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x);
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x);
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x);
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
 #if defined(__NVPTX__)
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
@@ -115,7 +139,7 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
   return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
 }
 
-// Copies the value from the first active thread in the wavefront to the rest.
+// Copies the value from the first active thread to the rest.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
@@ -234,6 +258,62 @@ __DO_LANE_SUM(float, f32); // float __gpu_lane_sum_f32(m, x)
 __DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
 #undef __DO_LANE_SUM
 
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x) {
+  uint32_t __match_mask = 0;
+
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, !__done)) {
+    if (!__done) {
+      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  __gpu_sync_lane(__lane_mask);
+  return __match_mask;
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x) {
+  uint64_t __match_mask = 0;
+
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, !__done)) {
+    if (!__done) {
+      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  __gpu_sync_lane(__lane_mask);
+  return __match_mask;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x) {
+  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+  __gpu_sync_lane(__lane_mask);
+  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x) {
+  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+  __gpu_sync_lane(__lane_mask);
+  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
 _Pragma("omp end declare variant");
 _Pragma("omp end declare target");
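
The shared fallback loop retires one distinct value per iteration: the first still-active lane broadcasts its value, every active lane holding that value records the current execution mask as its answer and drops out, and the ballot ends the loop once no lane remains. A hedged host-side simulation of that convergence (ordinary C over an assumed 8-lane wave; not part of the headers):

#include <stdint.h>
#include <stdio.h>

#define LANES 8

int main(void) {
  uint32_t x[LANES] = {7, 3, 7, 7, 3, 9, 9, 3}; /* per-lane inputs */
  uint64_t match[LANES] = {0};
  int done[LANES] = {0};

  for (;;) {
    /* __gpu_ballot(!__done): collect the still-active lanes. */
    uint64_t active = 0;
    for (int l = 0; l < LANES; ++l)
      if (!done[l]) active |= 1ull << l;
    if (!active) break;

    /* __gpu_read_first_lane_u32: broadcast the first active lane's value. */
    uint32_t first = x[__builtin_ctzll(active)];

    /* __gpu_lane_mask() inside the matching branch: exactly the active
     * lanes holding `first`, which all retire together with that mask. */
    uint64_t peers = 0;
    for (int l = 0; l < LANES; ++l)
      if (!done[l] && x[l] == first) peers |= 1ull << l;
    for (int l = 0; l < LANES; ++l)
      if (!done[l] && x[l] == first) { match[l] = peers; done[l] = 1; }
  }

  for (int l = 0; l < LANES; ++l)
    printf("lane %d: 0x%02llx\n", l, (unsigned long long)match[l]);
  return 0;
}

The loop therefore runs once per distinct value in the wave, and the trailing __gpu_sync_lane in the real code reconverges the lanes, which exit the loop in different iterations.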

clang/lib/Headers/nvptxintrin.h

Lines changed: 8 additions & 40 deletions
@@ -34,10 +34,6 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
 
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
 // Returns the number of CUDA blocks in the 'x' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __nvvm_read_ptx_sreg_nctaid_x();
@@ -156,20 +152,9 @@ __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
   // Newer targets can use the dedicated CUDA support.
 #if __CUDA_ARCH__ >= 700
   return __nvvm_match_any_sync_i32(__lane_mask, __x);
+#else
+  return __gpu_match_any_u32_impl(__lane_mask, __x);
 #endif
-
-  uint32_t __match_mask = 0;
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  return __match_mask;
 }
 
 // Returns a bitmask marking all lanes that have the same value of __x.
@@ -178,22 +163,9 @@ __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
   // Newer targets can use the dedicated CUDA support.
 #if __CUDA_ARCH__ >= 700
   return __nvvm_match_any_sync_i64(__lane_mask, __x);
+#else
+  return __gpu_match_any_u64_impl(__lane_mask, __x);
 #endif
-
-  uint64_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
 }
 
 // Returns the current lane mask if every lane contains __x.
@@ -203,11 +175,9 @@ __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
 #if __CUDA_ARCH__ >= 700
   int predicate;
   return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
+#else
+  return __gpu_match_all_u32_impl(__lane_mask, __x);
 #endif
-
-  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
 }
 
 // Returns the current lane mask if every lane contains __x.
@@ -217,11 +187,9 @@ __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
 #if __CUDA_ARCH__ >= 700
   int predicate;
   return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
+#else
+  return __gpu_match_all_u64_impl(__lane_mask, __x);
 #endif
-
-  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
-  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
 }
 
 // Returns true if the flat pointer points to CUDA 'shared' memory.
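
Structurally, each NVPTX entry point is now a single #if/#else/#endif, so no fallback body dangles after the #endif as dead code on sm_70+. The old duplicated u32 match_any fallback had drifted from its siblings and lacked the trailing __gpu_sync_lane on pre-Volta; sharing one implementation fixes that by construction. A minimal sketch of the new shape (the hardware builtin below is a hypothetical stand-in, and the branch is compiled away off-device):

#include <stdint.h>

/* Stand-in for the shared software path defined in the umbrella header. */
static inline uint64_t __sw_fallback(uint64_t __mask, uint32_t __x) {
  return __mask & __x;
}

static inline uint64_t __dispatch_op(uint64_t __mask, uint32_t __x) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  return __hw_builtin(__mask, __x); /* hypothetical sm_70+ instruction */
#else
  return __sw_fallback(__mask, __x); /* older targets share one fallback */
#endif
}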
