Skip to content
This repository was archived by the owner on Apr 23, 2020. It is now read-only.

Commit b93e8ad

Browse files
committed
[OPENMP][NVPTX]Fix parallel level counter in non-SPMD mode.
Summary: In non-SPMD mode we may end up with the divergent threads when trying to increment/decrement parallel level counter. It may lead to incorrect calculations of the parallel level and wrong results when threads are divergent. We need to reconverge the threads before trying to modify the parallel level counter. Reviewers: grokos, jdoerfert Subscribers: guansong, openmp-commits, caomhin, kkwli0 Tags: #openmp Differential Revision: https://reviews.llvm.org/D66802 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@370803 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 537de38 commit b93e8ad

File tree

4 files changed

+45
-16
lines changed

4 files changed

+45
-16
lines changed

libomptarget/deviceRTLs/nvptx/src/parallel.cu

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,16 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
311311
(int)newTaskDescr->ThreadId(), (int)nThreads);
312312

313313
isActive = true;
314-
IncParallelLevel(threadsInTeam != 1);
314+
// Reconverge the threads at the end of the parallel region to correctly
315+
// handle parallel levels.
316+
// In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole
317+
// warp. If only 1 thread is active, not need to reconverge the threads.
318+
// If we have the whole warp, reconverge all the threads in the warp before
319+
// actually trying to change the parallel level. Otherwise, parallel level
320+
// can be changed incorrectly because of threads divergence.
321+
bool IsActiveParallelRegion = threadsInTeam != 1;
322+
IncParallelLevel(IsActiveParallelRegion,
323+
IsActiveParallelRegion ? 0xFFFFFFFF : 1u);
315324
}
316325

317326
return isActive;
@@ -329,7 +338,16 @@ EXTERN void __kmpc_kernel_end_parallel() {
329338
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
330339
threadId, currTaskDescr->GetPrevTaskDescr());
331340

332-
DecParallelLevel(threadsInTeam != 1);
341+
// Reconverge the threads at the end of the parallel region to correctly
342+
// handle parallel levels.
343+
// In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole
344+
// warp. If only 1 thread is active, not need to reconverge the threads.
345+
// If we have the whole warp, reconverge all the threads in the warp before
346+
// actually trying to change the parallel level. Otherwise, parallel level can
347+
// be changed incorrectly because of threads divergence.
348+
bool IsActiveParallelRegion = threadsInTeam != 1;
349+
DecParallelLevel(IsActiveParallelRegion,
350+
IsActiveParallelRegion ? 0xFFFFFFFF : 1u);
333351
}
334352

335353
////////////////////////////////////////////////////////////////////////////////
@@ -339,7 +357,7 @@ EXTERN void __kmpc_kernel_end_parallel() {
339357
EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
340358
PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");
341359

342-
IncParallelLevel(/*ActiveParallel=*/false);
360+
IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());
343361

344362
if (checkRuntimeUninitialized(loc)) {
345363
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
@@ -378,7 +396,7 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
378396
uint32_t global_tid) {
379397
PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");
380398

381-
DecParallelLevel(/*ActiveParallel=*/false);
399+
DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());
382400

383401
if (checkRuntimeUninitialized(loc)) {
384402
ASSERT0(LT_FUSSY, checkSPMDMode(loc),

libomptarget/deviceRTLs/nvptx/src/support.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13+
#include "target_impl.h"
1314
////////////////////////////////////////////////////////////////////////////////
1415
// Execution Parameters
1516
////////////////////////////////////////////////////////////////////////////////
@@ -65,8 +66,8 @@ INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
6566
INLINE int IsTeamMaster(int ompThreadId);
6667

6768
// Parallel level
68-
INLINE void IncParallelLevel(bool ActiveParallel);
69-
INLINE void DecParallelLevel(bool ActiveParallel);
69+
INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
70+
INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
7071

7172
////////////////////////////////////////////////////////////////////////////////
7273
// Memory

libomptarget/deviceRTLs/nvptx/src/supporti.h

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -203,30 +203,28 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
203203
////////////////////////////////////////////////////////////////////////////////
204204
// Parallel level
205205

206-
INLINE void IncParallelLevel(bool ActiveParallel) {
207-
__kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
208-
__kmpc_impl_syncwarp(Active);
206+
INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
207+
__kmpc_impl_syncwarp(Mask);
209208
__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
210-
unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
209+
unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
211210
if (Rank == 0) {
212211
parallelLevel[GetWarpId()] +=
213212
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
214213
__threadfence();
215214
}
216-
__kmpc_impl_syncwarp(Active);
215+
__kmpc_impl_syncwarp(Mask);
217216
}
218217

219-
INLINE void DecParallelLevel(bool ActiveParallel) {
220-
__kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
221-
__kmpc_impl_syncwarp(Active);
218+
INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
219+
__kmpc_impl_syncwarp(Mask);
222220
__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
223-
unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
221+
unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
224222
if (Rank == 0) {
225223
parallelLevel[GetWarpId()] -=
226224
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
227225
__threadfence();
228226
}
229-
__kmpc_impl_syncwarp(Active);
227+
__kmpc_impl_syncwarp(Mask);
230228
}
231229

232230
////////////////////////////////////////////////////////////////////////////////

libomptarget/deviceRTLs/nvptx/test/parallel/level.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,5 +135,17 @@ int main(int argc, char *argv[]) {
135135
}
136136
}
137137

138+
// Check for paraller level in non-SPMD kernels.
139+
level = 0;
140+
#pragma omp target teams distribute num_teams(1) thread_limit(32) reduction(+:level)
141+
for (int i=0; i<5032; i+=32) {
142+
int ub = (i+32 > 5032) ? 5032 : i+32;
143+
#pragma omp parallel for schedule(dynamic)
144+
for (int j=i ; j < ub; j++) ;
145+
level += omp_get_level();
146+
}
147+
// CHECK: Integral level = 0.
148+
printf("Integral level = %d.\n", level);
149+
138150
return 0;
139151
}

0 commit comments

Comments
 (0)