Skip to content
This repository was archived by the owner on Apr 23, 2020. It is now read-only.

Commit 1fe4d62

Browse files
Use target_impl functions to replace more inline asm
Summary: Use target_impl functions to replace more inline asm. Follow-on from D65836. Removes the remaining asm shuffles and lanemask accessors. Also changes the types of the target_impl bitwise functions to unsigned. Reviewers: jdoerfert, ABataev, grokos, Hahnfeld, gregrodgers, ronlieb, hfinkel Subscribers: openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D66809 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@370216 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent ee03160 commit 1fe4d62

File tree

5 files changed

+38
-39
lines changed

5 files changed

+38
-39
lines changed

libomptarget/deviceRTLs/nvptx/src/loop.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ public:
381381
// Support for dispatch next
382382

383383
INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
384-
int lo, hi;
384+
uint32_t lo, hi;
385385
__kmpc_impl_unpack(val, lo, hi);
386386
hi = __kmpc_impl_shfl_sync(active, hi, leader);
387387
lo = __kmpc_impl_shfl_sync(active, lo, leader);
@@ -390,8 +390,8 @@ public:
390390

391391
INLINE static uint64_t NextIter() {
392392
__kmpc_impl_lanemask_t active = __ACTIVEMASK();
393-
int leader = __kmpc_impl_ffs(active) - 1;
394-
int change = __kmpc_impl_popc(active);
393+
uint32_t leader = __kmpc_impl_ffs(active) - 1;
394+
uint32_t change = __kmpc_impl_popc(active);
395395
__kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();
396396
unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt);
397397
uint64_t warp_res;

libomptarget/deviceRTLs/nvptx/src/parallel.cu

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,12 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
4949
int32_t *LaneId, int32_t *NumLanes) {
5050
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
5151
uint32_t ConvergentMask = Mask;
52-
int32_t ConvergentSize = __popc(ConvergentMask);
52+
int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
5353
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
54-
*LaneSource += __ffs(WorkRemaining);
55-
*IsFinal = __popc(WorkRemaining) == 1;
56-
uint32_t lanemask_lt;
57-
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
58-
*LaneId = __popc(ConvergentMask & lanemask_lt);
54+
*LaneSource += __kmpc_impl_ffs(WorkRemaining);
55+
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
56+
uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
57+
*LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
5958

6059
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
6160
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
@@ -123,13 +122,12 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
123122
int32_t *LaneSource) {
124123
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
125124
uint32_t ConvergentMask = Mask;
126-
int32_t ConvergentSize = __popc(ConvergentMask);
125+
int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
127126
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
128-
*LaneSource += __ffs(WorkRemaining);
129-
*IsFinal = __popc(WorkRemaining) == 1;
130-
uint32_t lanemask_lt;
131-
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
132-
uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);
127+
*LaneSource += __kmpc_impl_ffs(WorkRemaining);
128+
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
129+
uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
130+
uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
133131

134132
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
135133
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;

libomptarget/deviceRTLs/nvptx/src/reduction.cu

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,11 @@ EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
2828
}
2929

3030
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
31-
int lo, hi;
32-
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
31+
uint32_t lo, hi;
32+
__kmpc_impl_unpack(val, lo, hi);
3333
hi = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, hi, delta, size);
3434
lo = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, lo, delta, size);
35-
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
36-
return val;
35+
return __kmpc_impl_pack(lo, hi);
3736
}
3837

3938
INLINE static void gpu_regular_warp_reduce(void *reduce_data,
@@ -60,18 +59,16 @@ INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
6059

6160
INLINE static uint32_t
6261
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
63-
uint32_t lanemask_lt;
64-
uint32_t lanemask_gt;
6562
uint32_t size, remote_id, physical_lane_id;
6663
physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
67-
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
64+
uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
6865
uint32_t Liveness = __ACTIVEMASK();
69-
uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2;
70-
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt));
66+
uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
67+
uint32_t lanemask_gt = __kmpc_impl_lanemask_gt();
7168
do {
7269
Liveness = __ACTIVEMASK();
73-
remote_id = __ffs(Liveness & lanemask_gt);
74-
size = __popc(Liveness);
70+
remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
71+
size = __kmpc_impl_popc(Liveness);
7572
logical_lane_id /= 2;
7673
shflFct(reduce_data, /*LaneId =*/logical_lane_id,
7774
/*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
@@ -150,7 +147,7 @@ static int32_t nvptx_parallel_reduce_nowait(
150147
gpu_regular_warp_reduce(reduce_data, shflFct);
151148
else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
152149
gpu_irregular_warp_reduce(reduce_data, shflFct,
153-
/*LaneCount=*/__popc(Liveness),
150+
/*LaneCount=*/__kmpc_impl_popc(Liveness),
154151
/*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
155152
else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
156153
// parallel region may enter here; return
@@ -325,7 +322,7 @@ static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
325322
gpu_regular_warp_reduce(reduce_data, shflFct);
326323
else // Partial warp but contiguous lanes
327324
gpu_irregular_warp_reduce(reduce_data, shflFct,
328-
/*LaneCount=*/__popc(Liveness),
325+
/*LaneCount=*/__kmpc_impl_popc(Liveness),
329326
/*LaneId=*/ThreadId % WARPSIZE);
330327

331328
// When we have more than [warpsize] number of threads

libomptarget/deviceRTLs/nvptx/src/supporti.h

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -206,9 +206,8 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
206206
INLINE void IncParallelLevel(bool ActiveParallel) {
207207
unsigned Active = __ACTIVEMASK();
208208
__kmpc_impl_syncwarp(Active);
209-
unsigned LaneMaskLt;
210-
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
211-
unsigned Rank = __popc(Active & LaneMaskLt);
209+
unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();
210+
unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
212211
if (Rank == 0) {
213212
parallelLevel[GetWarpId()] +=
214213
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
@@ -220,9 +219,8 @@ INLINE void IncParallelLevel(bool ActiveParallel) {
220219
INLINE void DecParallelLevel(bool ActiveParallel) {
221220
unsigned Active = __ACTIVEMASK();
222221
__kmpc_impl_syncwarp(Active);
223-
unsigned LaneMaskLt;
224-
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
225-
unsigned Rank = __popc(Active & LaneMaskLt);
222+
unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();
223+
unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
226224
if (Rank == 0) {
227225
parallelLevel[GetWarpId()] -=
228226
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));

libomptarget/deviceRTLs/nvptx/src/target_impl.h

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,12 @@
1616

1717
#include "option.h"
1818

19-
INLINE void __kmpc_impl_unpack(int64_t val, int32_t &lo, int32_t &hi) {
19+
INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
2020
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
2121
}
2222

23-
INLINE int64_t __kmpc_impl_pack(int32_t lo, int32_t hi) {
24-
int64_t val;
23+
INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
24+
uint64_t val;
2525
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
2626
return val;
2727
}
@@ -34,9 +34,15 @@ INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
3434
return res;
3535
}
3636

37-
INLINE int __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
37+
INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
38+
__kmpc_impl_lanemask_t res;
39+
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
40+
return res;
41+
}
42+
43+
INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
3844

39-
INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); }
45+
INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); }
4046

4147
#ifndef CUDA_VERSION
4248
#error CUDA_VERSION macro is undefined, something wrong with cuda.

0 commit comments

Comments
 (0)