Skip to content
This repository was archived by the owner on Apr 23, 2020. It is now read-only.

Commit bfca7f7

Browse files
[libomptarget] Refactor shfl_sync macro to inline function
Summary: [libomptarget] Refactor the shfl_sync macro into an inline function. See also the abandoned D66846, which was split into this diff and others.
Reviewers: jdoerfert, ABataev, grokos, ronlieb, gregrodgers
Subscribers: openmp-commits
Tags: #openmp
Differential Revision: https://reviews.llvm.org/D66852
git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@370144 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent ad72f5e commit bfca7f7

File tree

5 files changed, with 24 additions (+24) and 8 deletions (-8).

libomptarget/deviceRTLs/nvptx/src/data_sharing.cu

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212
#include "omptarget-nvptx.h"
13+
#include "target_impl.h"
1314
#include <stdio.h>
1415

1516
// Warp ID in the CUDA block
@@ -430,9 +431,10 @@ INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
430431
}
431432
}
432433
// Get address from lane 0.
433-
((int *)&FrameP)[0] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[0], 0);
434+
int *FP = (int *)&FrameP;
435+
FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0);
434436
if (sizeof(FrameP) == 8)
435-
((int *)&FrameP)[1] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[1], 0);
437+
FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0);
436438

437439
return FrameP;
438440
}

libomptarget/deviceRTLs/nvptx/src/loop.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -383,8 +383,8 @@ public:
383383
INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
384384
int lo, hi;
385385
__kmpc_impl_unpack(val, lo, hi);
386-
hi = __SHFL_SYNC(active, hi, leader);
387-
lo = __SHFL_SYNC(active, lo, leader);
386+
hi = __kmpc_impl_shfl_sync(active, hi, leader);
387+
lo = __kmpc_impl_shfl_sync(active, lo, leader);
388388
return __kmpc_impl_pack(lo, hi);
389389
}
390390

libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,11 @@
5151
#ifndef CUDA_VERSION
5252
#error CUDA_VERSION macro is undefined, something wrong with cuda.
5353
#elif CUDA_VERSION >= 9000
54-
#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
5554
#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
5655
__shfl_down_sync((mask), (var), (delta), (width))
5756
#define __ACTIVEMASK() __activemask()
5857
#define __SYNCWARP(Mask) __syncwarp(Mask)
5958
#else
60-
#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
6159
#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
6260
__shfl_down((var), (delta), (width))
6361
#define __ACTIVEMASK() __ballot(1)

libomptarget/deviceRTLs/nvptx/src/parallel.cu

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
//===----------------------------------------------------------------------===//
3434

3535
#include "omptarget-nvptx.h"
36+
#include "target_impl.h"
3637

3738
typedef struct ConvergentSimdJob {
3839
omptarget_nvptx_TaskDescr taskDescr;
@@ -64,7 +65,7 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
6465
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
6566
job->slimForNextSimd = SimdLimit;
6667

67-
int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource);
68+
int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource);
6869
// reset simdlimit to avoid propagating to successive #simd
6970
if (SimdLimitSource > 0 && threadId == sourceThreadId)
7071
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;
@@ -138,7 +139,8 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
138139
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
139140
job->tnumForNextPar = NumThreadsClause;
140141

141-
int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
142+
int32_t NumThreadsSource =
143+
__kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource);
142144
// reset numthreads to avoid propagating to successive #parallel
143145
if (NumThreadsSource > 0 && threadId == sourceThreadId)
144146
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =

libomptarget/deviceRTLs/nvptx/src/target_impl.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,20 @@ INLINE int __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
3838

3939
INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); }
4040

41+
#ifndef CUDA_VERSION
42+
#error CUDA_VERSION macro is undefined, something wrong with cuda.
43+
#endif
44+
45+
// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
46+
INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
47+
int32_t SrcLane) {
48+
#if CUDA_VERSION >= 9000
49+
return __shfl_sync(Mask, Var, SrcLane);
50+
#else
51+
return __shfl(Var, SrcLane);
52+
#endif // CUDA_VERSION
53+
}
54+
4155
INLINE void __kmpc_impl_syncwarp(int32_t Mask) { __SYNCWARP(Mask); }
4256

4357
#endif

0 commit comments

Comments
 (0)