-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[OpenMP][OMPX] Add ballot_sync #91297
Conversation
@llvm/pr-subscribers-offload Author: Shilei Tian (shiltian) Changes
Full diff: https://github.com/llvm/llvm-project/pull/91297.diff 5 Files Affected:
diff --git a/offload/DeviceRTL/include/Utils.h b/offload/DeviceRTL/include/Utils.h
index 4ab0aea46eea122..5048345fdbc1131 100644
--- a/offload/DeviceRTL/include/Utils.h
+++ b/offload/DeviceRTL/include/Utils.h
@@ -25,6 +25,8 @@ int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width);
+uint32_t ballotSync(uint32_t Mask, int32_t Pred);
+
/// Return \p LowBits and \p HighBits packed into a single 64 bit value.
uint64_t pack(uint32_t LowBits, uint32_t HighBits);
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index b2028a8fb4f5069..2894a4885292ecd 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -364,4 +364,8 @@ _TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel)
_TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
_TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
+extern "C" unsigned ompx_ballot_sync(unsigned mask, int pred) {
+ return utils::ballotSync(mask, pred);
+}
+
#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Utils.cpp b/offload/DeviceRTL/src/Utils.cpp
index d07ac0fb499c941..70df7d52822e0b7 100644
--- a/offload/DeviceRTL/src/Utils.cpp
+++ b/offload/DeviceRTL/src/Utils.cpp
@@ -37,6 +37,8 @@ int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
int32_t Width);
+uint32_t ballotSync(uint32_t Mask, int32_t Pred);
+
/// AMDGCN Implementation
///
///{
@@ -57,6 +59,12 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}
+uint32_t ballotSync(uint32_t Mask, int32_t Pred) {
+ if (__AMDGCN_WAVEFRONT_SIZE == 32)
+ return __builtin_amdgcn_ballot_w32(Pred);
+ return __builtin_amdgcn_ballot_w64(Pred);
+}
+
bool isSharedMemPtr(const void *Ptr) {
return __builtin_amdgcn_is_shared(
(const __attribute__((address_space(0))) void *)Ptr);
@@ -80,6 +88,10 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
}
+uint32_t ballotSync(uint32_t Mask, int32_t Pred) {
+ return __nvvm_vote_ballot_sync(Mask, Pred);
+}
+
bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
#pragma omp end declare variant
diff --git a/offload/test/offloading/ompx_bare_ballot_sync.c b/offload/test/offloading/ompx_bare_ballot_sync.c
new file mode 100644
index 000000000000000..0bd6355d9ef6721
--- /dev/null
+++ b/offload/test/offloading/ompx_bare_ballot_sync.c
@@ -0,0 +1,38 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+//
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <assert.h>
+#include <ompx.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(int argc, char *argv[]) {
+ const int num_blocks = 1;
+ const int block_size = 64;
+ const int N = num_blocks * block_size;
+ unsigned *data = (int *)malloc(N * sizeof(unsigned));
+
+ for (int i = 0; i < N; ++i)
+ data[i] = i & 0x1;
+
+#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) map(tofrom: data[0:N])
+ {
+ int tid = ompx_thread_id_x();
+ unsigned mask = ompx_ballot_sync(0xffffffff, data[tid]);
+ data[tid] += mask;
+ }
+
+ for (int i = 0; i < N; ++i)
+ assert(data[i] == ((i & 0x1) + 0xaaaaaaaa));
+
+ // CHECK: PASS
+ printf("PASS\n");
+
+ return 0;
+}
diff --git a/openmp/runtime/src/include/ompx.h.var b/openmp/runtime/src/include/ompx.h.var
index 579d31aa98c54e6..827a28284c31366 100644
--- a/openmp/runtime/src/include/ompx.h.var
+++ b/openmp/runtime/src/include/ompx.h.var
@@ -9,6 +9,8 @@
#ifndef __OMPX_H
#define __OMPX_H
+#define INLINE [[clang::always_inline]]
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -57,7 +59,7 @@ enum {
/// ompx_{thread,block}_{id,dim}
///{
#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(NAME, VALUE) \
- static inline int ompx_##NAME(int Dim) { return VALUE; }
+ static INLINE int ompx_##NAME(int Dim) { return VALUE; }
_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(thread_id,
omp_get_ancestor_thread_num(Dim + 1))
@@ -70,7 +72,7 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(grid_dim, 1)
/// ompx_{sync_block}_{,divergent}
///{
#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(RETTY, NAME, ARGS, BODY) \
- static inline RETTY ompx_##NAME(ARGS) { BODY; }
+ static INLINE RETTY ompx_##NAME(ARGS) { BODY; }
_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block, int Ordering,
_Pragma("omp barrier"));
@@ -81,11 +83,15 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block_divergent, int Ordering,
#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C
///}
+static INLINE unsigned ompx_ballot_sync(unsigned mask, int pred) {
+ __builtin_trap();
+}
+
#pragma omp end declare variant
/// ompx_{sync_block}_{,divergent}
///{
-#define _TGT_KERNEL_LANGUAGE_DECL_SYNC_C(RETTY, NAME, ARGS) \
+#define _TGT_KERNEL_LANGUAGE_DECL_SYNC_C(RETTY, NAME, ARGS) \
RETTY ompx_##NAME(ARGS);
_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block, int Ordering);
@@ -98,9 +104,9 @@ _TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block_divergent, int Ordering);
///{
#define _TGT_KERNEL_LANGUAGE_DECL_GRID_C(NAME) \
int ompx_##NAME(int Dim); \
- static inline int ompx_##NAME##_x() { return ompx_##NAME(ompx_dim_x); } \
- static inline int ompx_##NAME##_y() { return ompx_##NAME(ompx_dim_y); } \
- static inline int ompx_##NAME##_z() { return ompx_##NAME(ompx_dim_z); }
+ static INLINE int ompx_##NAME##_x() { return ompx_##NAME(ompx_dim_x); } \
+ static INLINE int ompx_##NAME##_y() { return ompx_##NAME(ompx_dim_y); } \
+ static INLINE int ompx_##NAME##_z() { return ompx_##NAME(ompx_dim_z); }
_TGT_KERNEL_LANGUAGE_DECL_GRID_C(thread_id)
_TGT_KERNEL_LANGUAGE_DECL_GRID_C(block_dim)
@@ -109,6 +115,8 @@ _TGT_KERNEL_LANGUAGE_DECL_GRID_C(grid_dim)
#undef _TGT_KERNEL_LANGUAGE_DECL_GRID_C
///}
+unsigned ompx_ballot_sync(unsigned mask, int pred);
+
#ifdef __cplusplus
}
#endif
@@ -134,10 +142,10 @@ enum {
/// ompx::{thread,block}_{id,dim}_{,x,y,z}
///{
#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(NAME) \
- static inline int NAME(int Dim) noexcept { return ompx_##NAME(Dim); } \
- static inline int NAME##_x() noexcept { return NAME(ompx_dim_x); } \
- static inline int NAME##_y() noexcept { return NAME(ompx_dim_y); } \
- static inline int NAME##_z() noexcept { return NAME(ompx_dim_z); }
+ static INLINE int NAME(int Dim) noexcept { return ompx_##NAME(Dim); } \
+ static INLINE int NAME##_x() noexcept { return NAME(ompx_dim_x); } \
+ static INLINE int NAME##_y() noexcept { return NAME(ompx_dim_y); } \
+ static INLINE int NAME##_z() noexcept { return NAME(ompx_dim_z); }
_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(thread_id)
_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(block_dim)
@@ -149,7 +157,7 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(grid_dim)
/// ompx_{sync_block}_{,divergent}
///{
#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(RETTY, NAME, ARGS, CALL_ARGS) \
- static inline RETTY NAME(ARGS) { \
+ static INLINE RETTY NAME(ARGS) { \
return ompx_##NAME(CALL_ARGS); \
}
@@ -160,9 +168,15 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(void, sync_block_divergent,
#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX
///}
+static INLINE unsigned ballot_sync(unsigned mask, int pred) {
+ return ompx_ballot_sync(mask, pred);
+}
+
} // namespace ompx
#endif
///}
+#undef INLINE
+
#endif /* __OMPX_H */
|
You can test this locally with the following command:git-clang-format --diff 70a54bca6f7f9f45b4b17974ddaa01cd7a5d64be 2d4fc4bc69e4dc7b3c4ab18027446f2eff046742 -- offload/test/offloading/ompx_bare_ballot_sync.c offload/DeviceRTL/include/Utils.h offload/DeviceRTL/src/Mapping.cpp offload/DeviceRTL/src/Utils.cpp View the diff from clang-format here.diff --git a/offload/test/offloading/ompx_bare_ballot_sync.c b/offload/test/offloading/ompx_bare_ballot_sync.c
index d8e17691bf..02a47fea02 100644
--- a/offload/test/offloading/ompx_bare_ballot_sync.c
+++ b/offload/test/offloading/ompx_bare_ballot_sync.c
@@ -28,7 +28,8 @@ int main(int argc, char *argv[]) {
for (int i = 0; i < N; ++i)
data[i] = i & 0x1;
-#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) map(tofrom: data[0:N])
+#pragma omp target teams ompx_bare num_teams(num_blocks) \
+ thread_limit(block_size) map(tofrom : data[0 : N])
{
int tid = ompx_thread_id_x();
uint64_t mask = ompx_ballot_sync(~0U, data[tid]);
|
80a0636
to
ef8a29c
Compare
static INLINE uint64_t ompx_ballot_sync(uint64_t mask, int pred) { | ||
__builtin_trap(); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's this for? If it's a host fallback I would just return mask & pred
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't want application on host using these functions to run, especially soon I'll add shuffle.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why define it on the host at all then? I suppose it's non-trivial to detect if the consumer is a "GPU" since we probably don't want to use variants here and our macros define stuff all over the place.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is more like a temporary solution before we force the front end to stop emitting host fallback when kernel lang mode is turned on automatically; otherwise undefined symbol link error would appear.
typedef unsigned long uint64_t; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wish we could just include stdint.h
here, but we'd need to make sure it only uses the one in the clang
resource directory. That's doable if we don't set __STDC_HOSTED__
or use -ffreestanding
. Using -ffreestanding
would be an option if I could fix the bug that prevents me from landing the fixes that allow inlining.
This patch adds the support for
ballot_sync
in ompx.