[libomptarget] Refactor shfl_sync macro to inline function

JonChesterfield · JonChesterfield · commit bfca7f70c5f1 · 2019-08-28T01:31:04.000Z
Summary: [libomptarget] Refactor shfl_sync macro to inline function See also abandoned D66846, split into this diff and others. Reviewers: jdoerfert, ABataev, grokos, ronlieb, gregrodgers Subscribers: openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D66852 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@370144 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 #include "omptarget-nvptx.h"
+#include "target_impl.h"
 #include <stdio.h>
 
 // Warp ID in the CUDA block
@@ -430,9 +431,10 @@ INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
     }
   }
   // Get address from lane 0.
-  ((int *)&FrameP)[0] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[0], 0);
+  int *FP = (int *)&FrameP;
+  FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0);
   if (sizeof(FrameP) == 8)
-    ((int *)&FrameP)[1] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[1], 0);
+    FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0);
 
   return FrameP;
 }
diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -383,8 +383,8 @@ public:
   INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
     int lo, hi;
     __kmpc_impl_unpack(val, lo, hi);
-    hi = __SHFL_SYNC(active, hi, leader);
-    lo = __SHFL_SYNC(active, lo, leader);
+    hi = __kmpc_impl_shfl_sync(active, hi, leader);
+    lo = __kmpc_impl_shfl_sync(active, lo, leader);
     return __kmpc_impl_pack(lo, hi);
   }
 
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -51,13 +51,11 @@
 #ifndef CUDA_VERSION
 #error CUDA_VERSION macro is undefined, something wrong with cuda.
 #elif CUDA_VERSION >= 9000
-#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
 #define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
   __shfl_down_sync((mask), (var), (delta), (width))
 #define __ACTIVEMASK() __activemask()
 #define __SYNCWARP(Mask) __syncwarp(Mask)
 #else
-#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
 #define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
   __shfl_down((var), (delta), (width))
 #define __ACTIVEMASK() __ballot(1)
diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -33,6 +33,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "omptarget-nvptx.h"
+#include "target_impl.h"
 
 typedef struct ConvergentSimdJob {
   omptarget_nvptx_TaskDescr taskDescr;
@@ -64,7 +65,7 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
       omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
   job->slimForNextSimd = SimdLimit;
 
-  int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource);
+  int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource);
   // reset simdlimit to avoid propagating to successive #simd
   if (SimdLimitSource > 0 && threadId == sourceThreadId)
     omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;
@@ -138,7 +139,8 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
       omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
   job->tnumForNextPar = NumThreadsClause;
 
-  int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
+  int32_t NumThreadsSource =
+      __kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource);
   // reset numthreads to avoid propagating to successive #parallel
   if (NumThreadsSource > 0 && threadId == sourceThreadId)
     omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
diff --git a/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -38,6 +38,20 @@ INLINE int __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
 
 INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); }
 
+#ifndef CUDA_VERSION
+#error CUDA_VERSION macro is undefined, something wrong with cuda.
+#endif
+
+// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
+INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
+                                     int32_t SrcLane) {
+#if CUDA_VERSION >= 9000
+  return __shfl_sync(Mask, Var, SrcLane);
+#else
+  return __shfl(Var, SrcLane);
+#endif // CUDA_VERSION
+}
+
 INLINE void __kmpc_impl_syncwarp(int32_t Mask) { __SYNCWARP(Mask); }
 
 #endif

Original file line number	Diff line number	Diff line change
`@@ -10,6 +10,7 @@`
`10`	`10`	`//`
`11`	`11`	`//===----------------------------------------------------------------------===//`
`12`	`12`	`#include "omptarget-nvptx.h"`
	`13`	`+#include "target_impl.h"`
`13`	`14`	`#include <stdio.h>`
`14`	`15`
`15`	`16`	`// Warp ID in the CUDA block`
`@@ -430,9 +431,10 @@ INLINE static void* data_sharing_push_stack_common(size_t PushSize) {`
`430`	`431`	`}`
`431`	`432`	`}`
`432`	`433`	`// Get address from lane 0.`
`433`		`- ((int )&FrameP)[0] = __SHFL_SYNC(CurActive, ((int )&FrameP)[0], 0);`
	`434`	`+ int FP = (int )&FrameP;`
	`435`	`+ FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0);`
`434`	`436`	`if (sizeof(FrameP) == 8)`
`435`		`- ((int )&FrameP)[1] = __SHFL_SYNC(CurActive, ((int )&FrameP)[1], 0);`
	`437`	`+ FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0);`
`436`	`438`
`437`	`439`	`return FrameP;`
`438`	`440`	`}`