
Commit 78a0b19

[SYCL][CUDA] Add initial support for FP atomics (#3276)
Generates native FP32 and FP64 atomics with the following flags:
-DSYCL_USE_NATIVE_FP_ATOMICS -Xsycl-target-backend --cuda-gpu-arch=sm_60

Several known issues:
- __spirv_AtomicFAddEXT is not inlined, so order and scope do not propagate
- Generated PTX does not respect order or scope (defaults to relaxed)
- Fatal error when compiling with --cuda-gpu-arch <= sm_50

A complete implementation of this feature requires libspirv to be made aware
of __nvvm_reflect, so that NVVMReflect can be used to branch on __CUDA_ARCH.

Signed-off-by: John Pennycook <john.pennycook@intel.com>
1 parent: e152b0d
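For context, the sketch below shows the kind of SYCL code this change targets; it is illustrative only and not part of the commit. The kernel, pointer name, and range are hypothetical, and the exact header and atomic_ref spelling depend on the DPC++ version (contemporaneous trees expose it as sycl::ext::oneapi::atomic_ref). Compiled with -fsycl for the CUDA backend plus the flags listed above, the fetch_add on a global float is expected to lower to the __spirv_AtomicFAddEXT entry points this commit adds to libclc.

// Hypothetical example: a floating-point atomic add in SYCL. Built for the
// CUDA backend with -DSYCL_USE_NATIVE_FP_ATOMICS and
// -Xsycl-target-backend --cuda-gpu-arch=sm_60, the fetch_add below should
// reach __spirv_AtomicFAddEXT.
#include <sycl/sycl.hpp> // <CL/sycl.hpp> in older DPC++ releases

int main() {
  sycl::queue q;
  float *sum = sycl::malloc_shared<float>(1, q);
  *sum = 0.0f;
  q.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1>) {
     // Relaxed, device-scope atomic add on a float in global memory.
     sycl::atomic_ref<float, sycl::memory_order::relaxed,
                      sycl::memory_scope::device,
                      sycl::access::address_space::global_space>
         ref(*sum);
     ref.fetch_add(1.0f);
   }).wait();
  sycl::free(sum, q);
  return 0;
}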

File tree

3 files changed: +263 −0 lines


libclc/ptx-nvidiacl/libspirv/SOURCES

Lines changed: 2 additions & 0 deletions
@@ -85,3 +85,5 @@ images/image_helpers.ll
 images/image.cl
 group/collectives_helpers.ll
 group/collectives.cl
+SPV_EXT_shader_atomic_float_add/atomicfaddext.cl
+SPV_EXT_shader_atomic_float_add/faddext_helpers.ll
libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/atomicfaddext.cl

Lines changed: 136 additions & 0 deletions
@@ -0,0 +1,136 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <spirv/spirv.h>
#include <spirv/spirv_types.h>

// TODO: Convert scope to LLVM IR syncscope if __CUDA_ARCH >= sm_60
// TODO: Error if scope is not relaxed and __CUDA_ARCH <= sm_60
#define __CLC_ATOMICFADDEXT(TYPE, AS)                                          \
  _CLC_OVERLOAD _CLC_DEF TYPE __spirv_AtomicFAddEXT(                           \
      __##AS TYPE *pointer, unsigned int scope, unsigned int semantics,        \
      TYPE value) {                                                            \
    /* Semantics mask may include memory order, storage class and other info  \
       Memory order is stored in the lowest 5 bits */                          \
    unsigned int order = semantics & 0x1F;                                     \
                                                                               \
    switch (order) {                                                           \
    case None:                                                                 \
      return __clc__atomic_fetch_add_##TYPE##_##AS##_relaxed(pointer, value);  \
    case Acquire:                                                              \
      return __clc__atomic_fetch_add_##TYPE##_##AS##_acquire(pointer, value);  \
    case Release:                                                              \
      return __clc__atomic_fetch_add_##TYPE##_##AS##_release(pointer, value);  \
    case AcquireRelease:                                                       \
      return __clc__atomic_fetch_add_##TYPE##_##AS##_acq_rel(pointer, value);  \
    default:                                                                   \
      /* Sequentially consistent atomics should never be incorrect */          \
    case SequentiallyConsistent:                                               \
      return __clc__atomic_fetch_add_##TYPE##_##AS##_seq_cst(pointer, value);  \
    }                                                                          \
  }

// FP32 atomics - must work without additional extensions
float __clc__atomic_fetch_add_float_global_relaxed(
    __global float *,
    float) __asm("__clc__atomic_fetch_add_float_global_relaxed");
float __clc__atomic_fetch_add_float_global_acquire(
    __global float *,
    float) __asm("__clc__atomic_fetch_add_float_global_acquire");
float __clc__atomic_fetch_add_float_global_release(
    __global float *,
    float) __asm("__clc__atomic_fetch_add_float_global_release");
float __clc__atomic_fetch_add_float_global_acq_rel(
    __global float *,
    float) __asm("__clc__atomic_fetch_add_float_global_acq_rel");
float __clc__atomic_fetch_add_float_global_seq_cst(
    __global float *,
    float) __asm("__clc__atomic_fetch_add_float_global_seq_cst");
float __clc__atomic_fetch_add_float_local_relaxed(__local float *, float) __asm(
    "__clc__atomic_fetch_add_float_local_relaxed");
float __clc__atomic_fetch_add_float_local_acquire(__local float *, float) __asm(
    "__clc__atomic_fetch_add_float_local_acquire");
float __clc__atomic_fetch_add_float_local_release(__local float *, float) __asm(
    "__clc__atomic_fetch_add_float_local_release");
float __clc__atomic_fetch_add_float_local_acq_rel(__local float *, float) __asm(
    "__clc__atomic_fetch_add_float_local_acq_rel");
float __clc__atomic_fetch_add_float_local_seq_cst(__local float *, float) __asm(
    "__clc__atomic_fetch_add_float_local_seq_cst");

__CLC_ATOMICFADDEXT(float, global)
__CLC_ATOMICFADDEXT(float, local)

_CLC_DECL float
_Z21__spirv_AtomicFAddEXTPU3AS1fN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEf(
    __global float *pointer, unsigned int scope, unsigned int semantics,
    float value) {
  return __spirv_AtomicFAddEXT(pointer, scope, semantics, value);
}

_CLC_DECL float
_Z21__spirv_AtomicFAddEXTPU3AS3fN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEf(
    __local float *pointer, unsigned int scope, unsigned int semantics,
    float value) {
  return __spirv_AtomicFAddEXT(pointer, scope, semantics, value);
}

// FP64 atomics - require cl_khr_fp64 extension
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
double __clc__atomic_fetch_add_double_global_relaxed(
    __global double *,
    double) __asm("__clc__atomic_fetch_add_double_global_relaxed");
double __clc__atomic_fetch_add_double_global_acquire(
    __global double *,
    double) __asm("__clc__atomic_fetch_add_double_global_acquire");
double __clc__atomic_fetch_add_double_global_release(
    __global double *,
    double) __asm("__clc__atomic_fetch_add_double_global_release");
double __clc__atomic_fetch_add_double_global_acq_rel(
    __global double *,
    double) __asm("__clc__atomic_fetch_add_double_global_acq_rel");
double __clc__atomic_fetch_add_double_global_seq_cst(
    __global double *,
    double) __asm("__clc__atomic_fetch_add_double_global_seq_cst");
double __clc__atomic_fetch_add_double_local_relaxed(
    __local double *,
    double) __asm("__clc__atomic_fetch_add_double_local_relaxed");
double __clc__atomic_fetch_add_double_local_acquire(
    __local double *,
    double) __asm("__clc__atomic_fetch_add_double_local_acquire");
double __clc__atomic_fetch_add_double_local_release(
    __local double *,
    double) __asm("__clc__atomic_fetch_add_double_local_release");
double __clc__atomic_fetch_add_double_local_acq_rel(
    __local double *,
    double) __asm("__clc__atomic_fetch_add_double_local_acq_rel");
double __clc__atomic_fetch_add_double_local_seq_cst(
    __local double *,
    double) __asm("__clc__atomic_fetch_add_double_local_seq_cst");

__CLC_ATOMICFADDEXT(double, global)
__CLC_ATOMICFADDEXT(double, local)

_CLC_DECL double
_Z21__spirv_AtomicFAddEXTPU3AS1dN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEd(
    __global double *pointer, unsigned int scope, unsigned int semantics,
    double value) {
  // FIXME: Double-precision atomics must be emulated for __CUDA_ARCH <= sm_50
  return __spirv_AtomicFAddEXT(pointer, scope, semantics, value);
}

_CLC_DECL double
_Z21__spirv_AtomicFAddEXTPU3AS3dN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEd(
    __local double *pointer, unsigned int scope, unsigned int semantics,
    double value) {
  // FIXME: Double-precision atomics must be emulated for __CUDA_ARCH <= sm_50
  return __spirv_AtomicFAddEXT(pointer, scope, semantics, value);
}
#endif // cl_khr_fp64

#undef __CLC_ATOMICFADDEXT
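For reference, the case labels used by __CLC_ATOMICFADDEXT correspond to the ordering bits of the SPIR-V MemorySemanticsMask, which is why the macro masks the argument with 0x1F before switching. The host-side sketch below mirrors that decoding; the enum is an illustrative stand-in for the values defined in the SPIR-V specification (made available to the file above via spirv_types.h), not code from the commit.

// Illustrative decoding of a SPIR-V MemorySemanticsMask value, mirroring the
// `semantics & 0x1F` switch in __CLC_ATOMICFADDEXT. The enumerators are a
// hypothetical stand-in for the SPIR-V specification values.
#include <cstdio>

enum MemorySemantics : unsigned {
  None = 0x0,
  Acquire = 0x2,
  Release = 0x4,
  AcquireRelease = 0x8,
  SequentiallyConsistent = 0x10,
  UniformMemory = 0x40 // storage-class bit; discarded by the & 0x1F mask
};

int main() {
  // e.g. an acquire-ordered atomic on uniform (global) memory
  unsigned semantics = Acquire | UniformMemory;
  unsigned order = semantics & 0x1F;             // keep only the ordering bits
  std::printf("order = 0x%x\n", order);          // prints order = 0x2 (Acquire)
  return 0;
}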
libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/faddext_helpers.ll

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
#if __clang_major__ >= 7
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
#else
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
#endif

define float @__clc__atomic_fetch_add_float_global_relaxed(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value monotonic
  ret float %0
}

define float @__clc__atomic_fetch_add_float_global_acquire(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value acquire
  ret float %0
}

define float @__clc__atomic_fetch_add_float_global_release(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value release
  ret float %0
}

define float @__clc__atomic_fetch_add_float_global_acq_rel(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value acq_rel
  ret float %0
}

define float @__clc__atomic_fetch_add_float_global_seq_cst(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value seq_cst
  ret float %0
}

define float @__clc__atomic_fetch_add_float_local_relaxed(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value monotonic
  ret float %0
}

define float @__clc__atomic_fetch_add_float_local_acquire(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value acquire
  ret float %0
}

define float @__clc__atomic_fetch_add_float_local_release(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value release
  ret float %0
}

define float @__clc__atomic_fetch_add_float_local_acq_rel(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value acq_rel
  ret float %0
}

define float @__clc__atomic_fetch_add_float_local_seq_cst(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value seq_cst
  ret float %0
}

define double @__clc__atomic_fetch_add_double_global_relaxed(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value monotonic
  ret double %0
}

define double @__clc__atomic_fetch_add_double_global_acquire(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value acquire
  ret double %0
}

define double @__clc__atomic_fetch_add_double_global_release(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value release
  ret double %0
}

define double @__clc__atomic_fetch_add_double_global_acq_rel(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value acq_rel
  ret double %0
}

define double @__clc__atomic_fetch_add_double_global_seq_cst(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value seq_cst
  ret double %0
}

define double @__clc__atomic_fetch_add_double_local_relaxed(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value monotonic
  ret double %0
}

define double @__clc__atomic_fetch_add_double_local_acquire(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value acquire
  ret double %0
}

define double @__clc__atomic_fetch_add_double_local_release(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value release
  ret double %0
}

define double @__clc__atomic_fetch_add_double_local_acq_rel(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value acq_rel
  ret double %0
}

define double @__clc__atomic_fetch_add_double_local_seq_cst(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
  %0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value seq_cst
  ret double %0
}
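As a rough host-side analogue (not part of the commit), the ordering suffixes defined per helper above follow the same spectrum that C++ exposes through std::memory_order, with LLVM IR's "monotonic" corresponding to the C++ "relaxed" ordering. The sketch below uses the C++20 std::atomic<float>::fetch_add overloads purely for illustration.

// Illustrative only (C++20): the _relaxed/_acquire/_release/_acq_rel/_seq_cst
// helper variants above map onto the C++/LLVM memory-ordering spectrum;
// "monotonic" in LLVM IR is the C++ "relaxed" ordering.
#include <atomic>

float add_relaxed(std::atomic<float> &a, float v) {
  return a.fetch_add(v, std::memory_order_relaxed); // "monotonic" in LLVM IR
}

float add_seq_cst(std::atomic<float> &a, float v) {
  return a.fetch_add(v, std::memory_order_seq_cst); // "seq_cst" in LLVM IR
}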
