Skip to content

Commit 44ee974

Browse files
[libomptarget][nfc] Refactor amdgpu partial barrier to simplify adding a second one
[libomptarget][nfc] Refactor amdgpu partial barrier to simplify adding a second one. D101976 would require a second barrier instance. This NFC change to amdgpu makes it simpler to add one (an extra global and one more line in init). It also renames the current barrier from L1 to L0. Reviewed By: jdoerfert. Differential Revision: https://reviews.llvm.org/D102016
1 parent 5dc1ed3 commit 44ee974

File tree

1 file changed

+17
-12
lines changed

1 file changed

+17
-12
lines changed

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -52,15 +52,8 @@ EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
5252
return __builtin_amdgcn_read_exec();
5353
}
5454

55-
uint32_t __kmpc_L1_Barrier [[clang::loader_uninitialized]];
56-
#pragma allocate(__kmpc_L1_Barrier) allocator(omp_pteam_mem_alloc)
57-
58-
EXTERN void __kmpc_impl_target_init() {
59-
// Don't have global ctors, and shared memory is not zero init
60-
__atomic_store_n(&__kmpc_L1_Barrier, 0u, __ATOMIC_RELEASE);
61-
}
62-
63-
EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
55+
static void pteam_mem_barrier(uint32_t num_threads, uint32_t * barrier_state)
56+
{
6457
__atomic_thread_fence(__ATOMIC_ACQUIRE);
6558

6659
uint32_t num_waves = num_threads / WARPSIZE;
@@ -81,7 +74,7 @@ EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
8174
bool isLowest = GetLaneId() == lowestActiveThread;
8275

8376
if (isLowest) {
84-
uint32_t load = __atomic_fetch_add(&__kmpc_L1_Barrier, 1,
77+
uint32_t load = __atomic_fetch_add(barrier_state, 1,
8578
__ATOMIC_RELAXED); // commutative
8679

8780
// Record the number of times the barrier has been passed
@@ -94,18 +87,30 @@ EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
9487
load &= 0xffff0000u; // because bits zeroed second
9588

9689
// Reset the wave counter and release the waiting waves
97-
__atomic_store_n(&__kmpc_L1_Barrier, load, __ATOMIC_RELAXED);
90+
__atomic_store_n(barrier_state, load, __ATOMIC_RELAXED);
9891
} else {
9992
// more waves still to go, spin until generation counter changes
10093
do {
10194
__builtin_amdgcn_s_sleep(0);
102-
load = __atomic_load_n(&__kmpc_L1_Barrier, __ATOMIC_RELAXED);
95+
load = __atomic_load_n(barrier_state, __ATOMIC_RELAXED);
10396
} while ((load & 0xffff0000u) == generation);
10497
}
10598
}
10699
__atomic_thread_fence(__ATOMIC_RELEASE);
107100
}
108101

102+
uint32_t __kmpc_L0_Barrier [[clang::loader_uninitialized]];
103+
#pragma allocate(__kmpc_L0_Barrier) allocator(omp_pteam_mem_alloc)
104+
105+
// Resets the __kmpc_L0_Barrier state word to zero with an atomic release store.
// This has to be done explicitly at init time for the reason noted below.
EXTERN void __kmpc_impl_target_init() {
106+
// Don't have global ctors, and shared memory is not zero init
107+
__atomic_store_n(&__kmpc_L0_Barrier, 0u, __ATOMIC_RELEASE);
108+
}
109+
110+
// Barrier entry point: forwards num_threads to pteam_mem_barrier, using
// __kmpc_L0_Barrier as the barrier state word.
EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
111+
pteam_mem_barrier(num_threads, &__kmpc_L0_Barrier);
112+
}
113+
109114
namespace {
110115
uint32_t get_grid_dim(uint32_t n, uint16_t d) {
111116
uint32_t q = n / d;

0 commit comments

Comments
 (0)