[SYCL][CUDA] Add initial support for FP atomics #3276

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
libclc/ptx-nvidiacl/libspirv/SOURCES: 2 additions & 0 deletions

@@ -85,3 +85,5 @@ images/image_helpers.ll
 images/image.cl
 group/collectives_helpers.ll
 group/collectives.cl
+SPV_EXT_shader_atomic_float_add/atomicfaddext.cl
+SPV_EXT_shader_atomic_float_add/faddext_helpers.ll

libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/atomicfaddext.cl: 136 additions & 0 deletions

@@ -0,0 +1,136 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <spirv/spirv.h>
#include <spirv/spirv_types.h>

// TODO: Convert scope to LLVM IR syncscope if __CUDA_ARCH >= sm_60
// TODO: Error if scope is not relaxed and __CUDA_ARCH < sm_60
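// (Background: scoped atomics (.cta/.gpu/.sys) require PTX ISA 5.0 / sm_60,
// and explicit .acquire/.release orderings require PTX ISA 6.0 / sm_70;
// below those levels only relaxed device-scope atomics map directly onto
// hardware instructions.)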
#define __CLC_ATOMICFADDEXT(TYPE, AS) \
_CLC_OVERLOAD _CLC_DEF TYPE __spirv_AtomicFAddEXT( \
__##AS TYPE *pointer, unsigned int scope, unsigned int semantics, \
TYPE value) { \
/* Semantics mask may include memory order, storage class and other info; \
memory order is stored in the lowest 5 bits */ \
unsigned int order = semantics & 0x1F; \
\
switch (order) { \
case None: \
return __clc__atomic_fetch_add_##TYPE##_##AS##_relaxed(pointer, value); \
case Acquire: \
return __clc__atomic_fetch_add_##TYPE##_##AS##_acquire(pointer, value); \
case Release: \
return __clc__atomic_fetch_add_##TYPE##_##AS##_release(pointer, value); \
case AcquireRelease: \
return __clc__atomic_fetch_add_##TYPE##_##AS##_acq_rel(pointer, value); \
default: \
/* The default falls through to seq_cst, which is always strong enough */ \
case SequentiallyConsistent: \
return __clc__atomic_fetch_add_##TYPE##_##AS##_seq_cst(pointer, value); \
} \
}
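
// For reference: the SPIR-V MemorySemanticsMask ordering bits extracted by
// the 0x1F mask above are, per the SPIR-V specification (and assuming the
// enumerators in spirv_types.h follow it):
//   None (Relaxed)         = 0x0
//   Acquire                = 0x2
//   Release                = 0x4
//   AcquireRelease         = 0x8
//   SequentiallyConsistent = 0x10
// Higher bits (UniformMemory, WorkgroupMemory, ...) describe storage classes
// and are deliberately ignored here.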

// FP32 atomics - must work without additional extensions
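// Each declaration below uses __asm("<symbol>") to pin the function to an
// exact, unmangled symbol name matching the corresponding LLVM IR definition
// in faddext_helpers.ll, so the two files link together cleanly.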
float __clc__atomic_fetch_add_float_global_relaxed(
__global float *,
float) __asm("__clc__atomic_fetch_add_float_global_relaxed");
float __clc__atomic_fetch_add_float_global_acquire(
__global float *,
float) __asm("__clc__atomic_fetch_add_float_global_acquire");
float __clc__atomic_fetch_add_float_global_release(
__global float *,
float) __asm("__clc__atomic_fetch_add_float_global_release");
float __clc__atomic_fetch_add_float_global_acq_rel(
__global float *,
float) __asm("__clc__atomic_fetch_add_float_global_acq_rel");
float __clc__atomic_fetch_add_float_global_seq_cst(
__global float *,
float) __asm("__clc__atomic_fetch_add_float_global_seq_cst");
float __clc__atomic_fetch_add_float_local_relaxed(__local float *, float) __asm(
"__clc__atomic_fetch_add_float_local_relaxed");
float __clc__atomic_fetch_add_float_local_acquire(__local float *, float) __asm(
"__clc__atomic_fetch_add_float_local_acquire");
float __clc__atomic_fetch_add_float_local_release(__local float *, float) __asm(
"__clc__atomic_fetch_add_float_local_release");
float __clc__atomic_fetch_add_float_local_acq_rel(__local float *, float) __asm(
"__clc__atomic_fetch_add_float_local_acq_rel");
float __clc__atomic_fetch_add_float_local_seq_cst(__local float *, float) __asm(
"__clc__atomic_fetch_add_float_local_seq_cst");

__CLC_ATOMICFADDEXT(float, global)
__CLC_ATOMICFADDEXT(float, local)

_CLC_DECL float
_Z21__spirv_AtomicFAddEXTPU3AS1fN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEf(
__global float *pointer, unsigned int scope, unsigned int semantics,
float value) {
return __spirv_AtomicFAddEXT(pointer, scope, semantics, value);
}

_CLC_DECL float
_Z21__spirv_AtomicFAddEXTPU3AS3fN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEf(
__local float *pointer, unsigned int scope, unsigned int semantics,
float value) {
return __spirv_AtomicFAddEXT(pointer, scope, semantics, value);
}
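
// The _Z21__spirv_AtomicFAddEXT... definitions above spell out the
// Itanium-mangled form of __spirv_AtomicFAddEXT(__global/__local float *,
// __spv::Scope::Flag, __spv::MemorySemanticsMask::Flag, float), so that the
// library exports exactly the symbols SYCL device code calls.
// A hedged sketch of the SYCL-side call that ends up here (API spelling as
// of this PR; later releases use sycl::ext::oneapi):
//   sycl::ONEAPI::atomic_ref<float, sycl::ONEAPI::memory_order::relaxed,
//                            sycl::ONEAPI::memory_scope::device,
//                            sycl::access::address_space::global_space>
//       ref(*p);
//   ref.fetch_add(x); // lowers to a __spirv_AtomicFAddEXT call on CUDA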

// FP64 atomics - require cl_khr_fp64 extension
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
double __clc__atomic_fetch_add_double_global_relaxed(
__global double *,
double) __asm("__clc__atomic_fetch_add_double_global_relaxed");
double __clc__atomic_fetch_add_double_global_acquire(
__global double *,
double) __asm("__clc__atomic_fetch_add_double_global_acquire");
double __clc__atomic_fetch_add_double_global_release(
__global double *,
double) __asm("__clc__atomic_fetch_add_double_global_release");
double __clc__atomic_fetch_add_double_global_acq_rel(
__global double *,
double) __asm("__clc__atomic_fetch_add_double_global_acq_rel");
double __clc__atomic_fetch_add_double_global_seq_cst(
__global double *,
double) __asm("__clc__atomic_fetch_add_double_global_seq_cst");
double __clc__atomic_fetch_add_double_local_relaxed(
__local double *,
double) __asm("__clc__atomic_fetch_add_double_local_relaxed");
double __clc__atomic_fetch_add_double_local_acquire(
__local double *,
double) __asm("__clc__atomic_fetch_add_double_local_acquire");
double __clc__atomic_fetch_add_double_local_release(
__local double *,
double) __asm("__clc__atomic_fetch_add_double_local_release");
double __clc__atomic_fetch_add_double_local_acq_rel(
__local double *,
double) __asm("__clc__atomic_fetch_add_double_local_acq_rel");
double __clc__atomic_fetch_add_double_local_seq_cst(
__local double *,
double) __asm("__clc__atomic_fetch_add_double_local_seq_cst");

__CLC_ATOMICFADDEXT(double, global)
__CLC_ATOMICFADDEXT(double, local)

_CLC_DECL double
_Z21__spirv_AtomicFAddEXTPU3AS1dN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEd(
__global double *pointer, unsigned int scope, unsigned int semantics,
double value) {
// FIXME: Double-precision atomics must be emulated for __CUDA_ARCH < sm_60
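// (A typical emulation sketch, not implemented here, is a compare-and-swap
// loop over the 64-bit payload:
//   do { old = *pointer; } while (!CAS64(pointer, old, old + value));
// where CAS64 is a hypothetical 64-bit atomic compare-exchange.)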
return __spirv_AtomicFAddEXT(pointer, scope, semantics, value);
}

_CLC_DECL double
_Z21__spirv_AtomicFAddEXTPU3AS3dN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEd(
__local double *pointer, unsigned int scope, unsigned int semantics,
double value) {
// FIXME: Double-precision atomics must be emulated for __CUDA_ARCH < sm_60
return __spirv_AtomicFAddEXT(pointer, scope, semantics, value);
}
#endif // cl_khr_fp64

#undef __CLC_ATOMICFADDEXT
libclc/ptx-nvidiacl/libspirv/SPV_EXT_shader_atomic_float_add/faddext_helpers.ll: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
#if __clang_major__ >= 7
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
#else
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
#endif
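
; This datalayout must stay in sync with the layout used when compiling the
; OpenCL C parts of libclc for the given clang version; llvm-link complains
; when modules with mismatched datalayouts are combined.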

define float @__clc__atomic_fetch_add_float_global_relaxed(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value monotonic
ret float %0
}
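
; atomicrmw fadd has no OpenCL C spelling, which is why these helpers are
; written directly in LLVM IR; "monotonic" is LLVM's name for relaxed
; ordering. The definitions below vary only the ordering, the address space
; (addrspace(1) = global, addrspace(3) = local/shared) and the element type.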

define float @__clc__atomic_fetch_add_float_global_acquire(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value acquire
ret float %0
}

define float @__clc__atomic_fetch_add_float_global_release(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value release
ret float %0
}

define float @__clc__atomic_fetch_add_float_global_acq_rel(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value acq_rel
ret float %0
}

define float @__clc__atomic_fetch_add_float_global_seq_cst(float addrspace(1)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd float addrspace(1)* %ptr, float %value seq_cst
ret float %0
}

define float @__clc__atomic_fetch_add_float_local_relaxed(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value monotonic
ret float %0
}

define float @__clc__atomic_fetch_add_float_local_acquire(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value acquire
ret float %0
}

define float @__clc__atomic_fetch_add_float_local_release(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value release
ret float %0
}

define float @__clc__atomic_fetch_add_float_local_acq_rel(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value acq_rel
ret float %0
}

define float @__clc__atomic_fetch_add_float_local_seq_cst(float addrspace(3)* nocapture %ptr, float %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd float addrspace(3)* %ptr, float %value seq_cst
ret float %0
}

define double @__clc__atomic_fetch_add_double_global_relaxed(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value monotonic
ret double %0
}

define double @__clc__atomic_fetch_add_double_global_acquire(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value acquire
ret double %0
}

define double @__clc__atomic_fetch_add_double_global_release(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value release
ret double %0
}

define double @__clc__atomic_fetch_add_double_global_acq_rel(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value acq_rel
ret double %0
}

define double @__clc__atomic_fetch_add_double_global_seq_cst(double addrspace(1)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd double addrspace(1)* %ptr, double %value seq_cst
ret double %0
}

define double @__clc__atomic_fetch_add_double_local_relaxed(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value monotonic
ret double %0
}

define double @__clc__atomic_fetch_add_double_local_acquire(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value acquire
ret double %0
}

define double @__clc__atomic_fetch_add_double_local_release(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value release
ret double %0
}

define double @__clc__atomic_fetch_add_double_local_acq_rel(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value acq_rel
ret double %0
}

define double @__clc__atomic_fetch_add_double_local_seq_cst(double addrspace(3)* nocapture %ptr, double %value) nounwind alwaysinline {
entry:
%0 = atomicrmw volatile fadd double addrspace(3)* %ptr, double %value seq_cst
ret double %0
}