Skip to content
This repository was archived by the owner on Apr 23, 2020. It is now read-only.

Commit 537de38

Browse files
[libomptarget] Refactor activemask macro to inline function
Summary: [libomptarget] Refactor activemask macro to inline function See also abandoned D66846, split into this diff and others. Reviewers: jdoerfert, ABataev, grokos, ronlieb, gregrodgers Reviewed By: jdoerfert, ABataev Subscribers: openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D66851 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@370781 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 1fe4d62 commit 537de38

File tree

8 files changed

+29
-30
lines changed

8 files changed

+29
-30
lines changed

libomptarget/deviceRTLs/nvptx/src/data_sharing.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
2020

2121
// Return true if this is the first active thread in the warp.
2222
INLINE static bool IsWarpMasterActiveThread() {
23-
unsigned long long Mask = __ACTIVEMASK();
23+
unsigned long long Mask = __kmpc_impl_activemask();
2424
unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);
2525
unsigned long long Sh = Mask << ShNum;
2626
// Truncate Sh to the 32 lower bits
@@ -112,7 +112,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
112112
(unsigned long long)SharingDefaultDataSize);
113113

114114
unsigned WID = getWarpId();
115-
unsigned CurActiveThreads = __ACTIVEMASK();
115+
__kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask();
116116

117117
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
118118
void *&StackP = DataSharingState.StackPtr[WID];
@@ -252,7 +252,7 @@ EXTERN void __kmpc_data_sharing_environment_end(
252252
return;
253253
}
254254

255-
int32_t CurActive = __ACTIVEMASK();
255+
__kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
256256

257257
// Only the warp master can restore the stack and frame information, and only
258258
// if there are no other threads left behind in this environment (i.e. the
@@ -378,7 +378,7 @@ INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
378378
// Frame pointer must be visible to all workers in the same warp.
379379
const unsigned WID = getWarpId();
380380
void *FrameP = 0;
381-
int32_t CurActive = __ACTIVEMASK();
381+
__kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
382382

383383
if (IsWarpMaster) {
384384
// SlotP will point to either the shared memory slot or an existing

libomptarget/deviceRTLs/nvptx/src/loop.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,7 @@ public:
389389
}
390390

391391
INLINE static uint64_t NextIter() {
392-
__kmpc_impl_lanemask_t active = __ACTIVEMASK();
392+
__kmpc_impl_lanemask_t active = __kmpc_impl_activemask();
393393
uint32_t leader = __kmpc_impl_ffs(active) - 1;
394394
uint32_t change = __kmpc_impl_popc(active);
395395
__kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();

libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -45,17 +45,6 @@
4545
#define BARRIER_COUNTER 0
4646
#define ORDERED_COUNTER 1
4747

48-
// Macros for Cuda intrinsics
49-
// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
50-
// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask().
51-
#ifndef CUDA_VERSION
52-
#error CUDA_VERSION macro is undefined, something wrong with cuda.
53-
#elif CUDA_VERSION >= 9000
54-
#define __ACTIVEMASK() __activemask()
55-
#else
56-
#define __ACTIVEMASK() __ballot(1)
57-
#endif // CUDA_VERSION
58-
5948
// arguments needed for L0 parallelism only.
6049
class omptarget_nvptx_SharedArgs {
6150
public:

libomptarget/deviceRTLs/nvptx/src/parallel.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
5353
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
5454
*LaneSource += __kmpc_impl_ffs(WorkRemaining);
5555
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
56-
uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
56+
__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
5757
*LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
5858

5959
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
@@ -126,7 +126,7 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
126126
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
127127
*LaneSource += __kmpc_impl_ffs(WorkRemaining);
128128
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
129-
uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
129+
__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
130130
uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
131131

132132
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());

libomptarget/deviceRTLs/nvptx/src/reduction.cu

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,12 @@ INLINE static uint32_t
6161
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
6262
uint32_t size, remote_id, physical_lane_id;
6363
physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
64-
uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
65-
uint32_t Liveness = __ACTIVEMASK();
64+
__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
65+
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
6666
uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
67-
uint32_t lanemask_gt = __kmpc_impl_lanemask_gt();
67+
__kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt();
6868
do {
69-
Liveness = __ACTIVEMASK();
69+
Liveness = __kmpc_impl_activemask();
7070
remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
7171
size = __kmpc_impl_popc(Liveness);
7272
logical_lane_id /= 2;
@@ -81,7 +81,7 @@ int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
8181
size_t reduce_size, void *reduce_data,
8282
kmp_ShuffleReductFctPtr shflFct,
8383
kmp_InterWarpCopyFctPtr cpyFct) {
84-
uint32_t Liveness = __ACTIVEMASK();
84+
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
8585
if (Liveness == 0xffffffff) {
8686
gpu_regular_warp_reduce(reduce_data, shflFct);
8787
return GetThreadIdInBlock() % WARPSIZE ==
@@ -142,7 +142,7 @@ static int32_t nvptx_parallel_reduce_nowait(
142142
}
143143
return BlockThreadId == 0;
144144
#else
145-
uint32_t Liveness = __ACTIVEMASK();
145+
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
146146
if (Liveness == 0xffffffff) // Full warp
147147
gpu_regular_warp_reduce(reduce_data, shflFct);
148148
else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
@@ -317,7 +317,7 @@ static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
317317
ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
318318

319319
// Reduce across warps to the warp master.
320-
uint32_t Liveness = __ACTIVEMASK();
320+
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
321321
if (Liveness == 0xffffffff) // Full warp
322322
gpu_regular_warp_reduce(reduce_data, shflFct);
323323
else // Partial warp but contiguous lanes

libomptarget/deviceRTLs/nvptx/src/supporti.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -204,9 +204,9 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
204204
// Parallel level
205205

206206
INLINE void IncParallelLevel(bool ActiveParallel) {
207-
unsigned Active = __ACTIVEMASK();
207+
__kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
208208
__kmpc_impl_syncwarp(Active);
209-
unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();
209+
__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
210210
unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
211211
if (Rank == 0) {
212212
parallelLevel[GetWarpId()] +=
@@ -217,9 +217,9 @@ INLINE void IncParallelLevel(bool ActiveParallel) {
217217
}
218218

219219
INLINE void DecParallelLevel(bool ActiveParallel) {
220-
unsigned Active = __ACTIVEMASK();
220+
__kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
221221
__kmpc_impl_syncwarp(Active);
222-
unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();
222+
__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
223223
unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
224224
if (Rank == 0) {
225225
parallelLevel[GetWarpId()] -=

libomptarget/deviceRTLs/nvptx/src/sync.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ EXTERN void __kmpc_flush(kmp_Ident *loc) {
142142

143143
EXTERN int32_t __kmpc_warp_active_thread_mask() {
144144
PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
145-
return __ACTIVEMASK();
145+
return __kmpc_impl_activemask();
146146
}
147147

148148
////////////////////////////////////////////////////////////////////////////////

libomptarget/deviceRTLs/nvptx/src/target_impl.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,16 @@ INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); }
4848
#error CUDA_VERSION macro is undefined, something wrong with cuda.
4949
#endif
5050

51+
// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
52+
53+
INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
54+
#if CUDA_VERSION >= 9000
55+
return __activemask();
56+
#else
57+
return __ballot(1);
58+
#endif
59+
}
60+
5161
// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
5262

5363
INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,

0 commit comments

Comments
 (0)