
Commit cd9405f

[OPENMP][NVPTX]Use __syncwarp() to reconverge the threads.

Summary:
Since CUDA 9.0, threads within a warp are no longer guaranteed to be
convergent. We need to use the __syncwarp() function to reconverge the
threads and to guarantee memory ordering among the threads of a warp.
This is the first patch to fix the problem with the test
libomptarget/deviceRTLs/nvptx/src/sync.cu on CUDA 9+. It replaces the
calls to the __shfl_sync() function with calls to __syncwarp() where the
threads must be reconverged before modifying the parallel level counter.

Reviewers: grokos

Subscribers: guansong, jfb, jdoerfert, caomhin, kkwli0, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D65013

git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@369796 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 3f4c51a commit cd9405f
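For orientation, here is a minimal, self-contained sketch of the pattern this commit moves to: take the active-lane mask, reconverge with __syncwarp(), elect the lowest active lane as leader by its rank among the active lanes, let it perform the update, and synchronize again so every lane observes the result. The Counter variable and the warpLeaderAdd name are hypothetical illustrations, not identifiers from this commit:

// Hedged sketch of the reconvergence pattern; Counter and warpLeaderAdd
// are illustrative names, not code from the libomptarget sources.
__device__ unsigned Counter = 0;

__device__ void warpLeaderAdd(unsigned Val) {
  unsigned Active = __activemask();   // lanes that reached this point
  __syncwarp(Active);                 // reconverge them (CUDA 9.0+)
  unsigned LaneMaskLt;                // mask of lanes with a lower lane id
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
  unsigned Rank = __popc(Active & LaneMaskLt); // position among active lanes
  if (Rank == 0) {                    // lowest active lane is the leader
    atomicAdd(&Counter, Val);         // atomic here because Counter is shared
                                      // across warps, unlike parallelLevel
    __threadfence();                  // make the update visible device-wide
  }
  __syncwarp(Active);                 // hold all lanes until the update lands
}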

File tree: 2 files changed, 19 insertions(+), 10 deletions(-)

libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h (3 additions, 0 deletions)

@@ -55,11 +55,14 @@
 #define __SHFL_DOWN_SYNC(mask, var, delta, width)                             \
   __shfl_down_sync((mask), (var), (delta), (width))
 #define __ACTIVEMASK() __activemask()
+#define __SYNCWARP(Mask) __syncwarp(Mask)
 #else
 #define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
 #define __SHFL_DOWN_SYNC(mask, var, delta, width)                             \
   __shfl_down((var), (delta), (width))
 #define __ACTIVEMASK() __ballot(1)
+// In Cuda < 9.0 no need to sync threads in warps.
+#define __SYNCWARP(Mask)
 #endif // CUDA_VERSION

 #define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory");
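This hunk lets the rest of the runtime call __SYNCWARP() unconditionally: on CUDA 9.0+ it forwards to __syncwarp(), while on older toolkits it expands to nothing because pre-Volta warps execute in lockstep and are always convergent. A condensed sketch of the resulting dispatch; the exact guard expression (9000) is an assumption, but CUDA_VERSION is the standard toolkit-provided macro, with 9000 corresponding to CUDA 9.0:

// Condensed sketch of the version dispatch guarded by CUDA_VERSION.
#if CUDA_VERSION >= 9000
#define __SYNCWARP(Mask) __syncwarp(Mask)
#else
// Lockstep warps need no explicit reconvergence.
#define __SYNCWARP(Mask)
#endif

__device__ void demo(unsigned Active) {
  __SYNCWARP(Active); // __syncwarp(Active) on CUDA 9+, a no-op before
}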

libomptarget/deviceRTLs/nvptx/src/supporti.h (16 additions, 10 deletions)

@@ -202,25 +202,31 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
 // Parallel level

 INLINE void IncParallelLevel(bool ActiveParallel) {
-  unsigned tnum = __ACTIVEMASK();
-  int leader = __ffs(tnum) - 1;
-  __SHFL_SYNC(tnum, leader, leader);
-  if (GetLaneId() == leader) {
+  unsigned Active = __ACTIVEMASK();
+  __SYNCWARP(Active);
+  unsigned LaneMaskLt;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
+  unsigned Rank = __popc(Active & LaneMaskLt);
+  if (Rank == 0) {
     parallelLevel[GetWarpId()] +=
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+    __threadfence();
   }
-  __SHFL_SYNC(tnum, leader, leader);
+  __SYNCWARP(Active);
 }

 INLINE void DecParallelLevel(bool ActiveParallel) {
-  unsigned tnum = __ACTIVEMASK();
-  int leader = __ffs(tnum) - 1;
-  __SHFL_SYNC(tnum, leader, leader);
-  if (GetLaneId() == leader) {
+  unsigned Active = __ACTIVEMASK();
+  __SYNCWARP(Active);
+  unsigned LaneMaskLt;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
+  unsigned Rank = __popc(Active & LaneMaskLt);
+  if (Rank == 0) {
     parallelLevel[GetWarpId()] -=
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+    __threadfence();
   }
-  __SHFL_SYNC(tnum, leader, leader);
+  __SYNCWARP(Active);
 }

 ////////////////////////////////////////////////////////////////////////////////
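To see why Rank == 0 picks exactly one lane, note that %lanemask_lt is a PTX special register holding a mask of all lanes with a lower lane id than the current one, so __popc(Active & LaneMaskLt) counts the active lanes below the current lane. For example, with Active = 0b1100, lane 2 sees Active & LaneMaskLt = 0b1100 & 0b0011 = 0 and gets Rank = 0, while lane 3 computes 0b1100 & 0b0111 = 0b0100 and gets Rank = 1; only lane 2 performs the update. A hedged, standalone kernel (not part of the patch; all names are illustrative) that demonstrates the election under forced divergence:

// Standalone demo of the leader election used above.
// Compile with, e.g.: nvcc -arch=sm_70 rank_demo.cu
#include <cstdio>

__global__ void rankDemo() {
  if (threadIdx.x % 3 == 0) {          // force divergence inside each warp
    unsigned Active = __activemask();  // only every third lane is active here
    unsigned LaneMaskLt;
    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
    unsigned Rank = __popc(Active & LaneMaskLt);
    if (Rank == 0)                     // exactly the lowest active lane
      printf("warp %u: leader is lane %u\n", threadIdx.x / 32u,
             threadIdx.x % 32u);
  }
}

int main() {
  rankDemo<<<1, 64>>>();               // two warps; expect two leader lines
  cudaDeviceSynchronize();
  return 0;
}

Separately, the added __threadfence() ensures the leader's write to parallelLevel is visible to threads in other warps before the trailing __SYNCWARP(Active) releases the remaining lanes.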
