Skip to content

[libspirv] Implement SPIR-V vload builtins via CLC #19174

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: sycl
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 5 additions & 179 deletions libclc/libspirv/lib/generic/shared/vload.cl
Original file line number Diff line number Diff line change
Expand Up @@ -6,185 +6,11 @@
//
//===----------------------------------------------------------------------===//

#include <clc/shared/clc_vload.h>
#include <libspirv/spirv.h>

// VLOAD_VECTORIZE(RTYPE, PRIM_TYPE, ADDR_SPACE)
//
// Emits, for one element type and one address space, the whole family of
// SPIR-V vload overloads: a scalar `__spirv_ocl_vload_R<RTYPE>` and the
// vector forms `__spirv_ocl_vloadn_R<RTYPE><N>` for N = 2, 3, 4, 8, 16.
//
//   RTYPE       suffix pasted into the function name after `_R`.
//   PRIM_TYPE   element type; also the pointee type of the `x` argument.
//   ADDR_SPACE  address-space qualifier applied to the pointer argument.
//
// For every width a typedef `less_aligned_<ADDR_SPACE><PRIM_TYPE><N>` is
// declared whose alignment is only sizeof(PRIM_TYPE), and the load is done
// by casting &x[N * offset] to a pointer to that typedef, so the vector is
// read with element alignment rather than full vector alignment.
//
// The 3-wide overload is the one exception: it reads the first two elements
// through the 2-wide less-aligned type and fetches the third element
// (x[offset * 3 + 2]) with a separate scalar access.
#define VLOAD_VECTORIZE(RTYPE, PRIM_TYPE, ADDR_SPACE) \
typedef PRIM_TYPE less_aligned_##ADDR_SPACE##PRIM_TYPE \
__attribute__((aligned(sizeof(PRIM_TYPE)))); \
_CLC_OVERLOAD _CLC_DEF PRIM_TYPE __spirv_ocl_vload_R##RTYPE( \
size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
return *(( \
const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE *)(&x[offset])); \
} \
\
typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
__attribute__((aligned(sizeof(PRIM_TYPE)))); \
_CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 __spirv_ocl_vloadn_R##RTYPE##2( \
size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
*)(&x[2 * offset])); \
} \
\
typedef PRIM_TYPE##3 less_aligned_##ADDR_SPACE##PRIM_TYPE##3 \
__attribute__((aligned(sizeof(PRIM_TYPE)))); \
_CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 __spirv_ocl_vloadn_R##RTYPE##3( \
size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
PRIM_TYPE##2 vec = \
*((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
*)(&x[3 * offset])); \
return (PRIM_TYPE##3)(vec.s0, vec.s1, x[offset * 3 + 2]); \
} \
\
typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \
__attribute__((aligned(sizeof(PRIM_TYPE)))); \
_CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 __spirv_ocl_vloadn_R##RTYPE##4( \
size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \
*)(&x[4 * offset])); \
} \
\
typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \
__attribute__((aligned(sizeof(PRIM_TYPE)))); \
_CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 __spirv_ocl_vloadn_R##RTYPE##8( \
size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \
*)(&x[8 * offset])); \
} \
\
typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \
__attribute__((aligned(sizeof(PRIM_TYPE)))); \
_CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 __spirv_ocl_vloadn_R##RTYPE##16( \
size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \
return *((const ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \
*)(&x[16 * offset])); \
}
#define __CLC_BODY "vload.inc"
#include <clc/integer/gentype.inc>

// VLOAD_VECTORIZE_GENERIC is the variant used for the __generic address
// space. When _CLC_DISTINCT_GENERIC_AS_SUPPORTED is non-zero it is simply an
// alias for VLOAD_VECTORIZE; otherwise it swallows its three arguments.
#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
#define VLOAD_VECTORIZE_GENERIC VLOAD_VECTORIZE
#else
// The generic address space isn't available, so make the macro expand to
// nothing and no __generic overloads are emitted.
#define VLOAD_VECTORIZE_GENERIC(X,Y,Z)
#endif

// VLOAD_ADDR_SPACES_IMPL(__CLC_RET_GENTYPE, __CLC_SCALAR_GENTYPE)
//
// Expands VLOAD_VECTORIZE once per address space (__private, __local,
// __constant, __global) and VLOAD_VECTORIZE_GENERIC for __generic, passing
// the first argument as the name suffix and the second as the element type.
#define VLOAD_ADDR_SPACES_IMPL(__CLC_RET_GENTYPE, __CLC_SCALAR_GENTYPE) \
VLOAD_VECTORIZE(__CLC_RET_GENTYPE, __CLC_SCALAR_GENTYPE, __private) \
VLOAD_VECTORIZE(__CLC_RET_GENTYPE, __CLC_SCALAR_GENTYPE, __local) \
VLOAD_VECTORIZE(__CLC_RET_GENTYPE, __CLC_SCALAR_GENTYPE, __constant) \
VLOAD_VECTORIZE(__CLC_RET_GENTYPE, __CLC_SCALAR_GENTYPE, __global) \
VLOAD_VECTORIZE_GENERIC(__CLC_RET_GENTYPE, __CLC_SCALAR_GENTYPE, __generic)

// Convenience wrapper for the common case where the name suffix and the
// element type are the same token.
#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \
VLOAD_ADDR_SPACES_IMPL(__CLC_SCALAR_GENTYPE, __CLC_SCALAR_GENTYPE)

// char is instantiated through the two-argument form directly rather than
// through VLOAD_ADDR_SPACES; both arguments are still `char`.
VLOAD_ADDR_SPACES_IMPL(char, char)

// VLOAD_TYPES() instantiates the vload overloads for the remaining integer
// types and for float (char is instantiated separately, before this macro;
// double and half are handled separately under their extension guards).
#define VLOAD_TYPES() \
VLOAD_ADDR_SPACES(uchar) \
VLOAD_ADDR_SPACES(short) \
VLOAD_ADDR_SPACES(ushort) \
VLOAD_ADDR_SPACES(int) \
VLOAD_ADDR_SPACES(uint) \
VLOAD_ADDR_SPACES(long) \
VLOAD_ADDR_SPACES(ulong) \
VLOAD_ADDR_SPACES(float)

VLOAD_TYPES()

// double overloads are only emitted when the cl_khr_fp64 macro is defined;
// the extension is enabled before the type is used.
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
VLOAD_ADDR_SPACES(double)
#endif
// Likewise, half overloads are only emitted when cl_khr_fp16 is defined.
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
VLOAD_ADDR_SPACES(half)
#endif

/* vload_half is legal even without cl_khr_fp16. */
/* There is no vload_half for double. */
/*
 * VEC_LOAD1(val, AS) loads one half from mem[offset], converts it to float,
 * stores it in `val`, and post-increments `offset`. It relies on `mem` and
 * `offset` being in scope at the expansion site, and it modifies `offset`.
 *
 * With clang major version < 6 the conversion goes through per-address-space
 * helper functions that are only declared here (their definitions are not in
 * this file); otherwise __builtin_load_halff is used and AS is ignored.
 */
#if __clang_major__ < 6
float __clc_vload_half_float_helper__constant(const __constant half *);
float __clc_vload_half_float_helper__global(const __global half *);
float __clc_vload_half_float_helper__local(const __local half *);
float __clc_vload_half_float_helper__private(const __private half *);

#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
float __clc_vload_half_float_helper__generic(const __generic half *);
#endif

#define VEC_LOAD1(val, AS) \
val = __clc_vload_half_float_helper##AS(&mem[offset++]);
#else
#define VEC_LOAD1(val, AS) val = __builtin_load_halff(&mem[offset++]);
#endif

// VEC_LOAD<N>(val, AS) fills an N-wide float vector `val` one component at a
// time by expanding to N VEC_LOAD1 statements. Widths 2, 4, 8 and 16 recurse
// on the .lo/.hi halves (lo first, so components are read in memory order);
// width 3 names .s0/.s1/.s2 explicitly. Each expansion advances `offset` by
// N through the post-increment inside VEC_LOAD1.
#define VEC_LOAD2(val, AS) \
VEC_LOAD1(val.lo, AS) \
VEC_LOAD1(val.hi, AS)
#define VEC_LOAD3(val, AS) \
VEC_LOAD1(val.s0, AS) \
VEC_LOAD1(val.s1, AS) \
VEC_LOAD1(val.s2, AS)
#define VEC_LOAD4(val, AS) \
VEC_LOAD2(val.lo, AS) \
VEC_LOAD2(val.hi, AS)
#define VEC_LOAD8(val, AS) \
VEC_LOAD4(val.lo, AS) \
VEC_LOAD4(val.hi, AS)
#define VEC_LOAD16(val, AS) \
VEC_LOAD8(val.lo, AS) \
VEC_LOAD8(val.hi, AS)

// VLOAD_HALF_VEC_IMPL(VEC_SIZE, OFFSET_SIZE, AS)
//
// Defines two overloads that read VEC_SIZE consecutive halves from `mem` in
// address space AS and return them as float<VEC_SIZE>:
//
//   __spirv_ocl_vload_halfn_Rfloat<VEC_SIZE>   starts at element
//                                              offset * VEC_SIZE
//   __spirv_ocl_vloada_halfn_Rfloat<VEC_SIZE>  starts at element
//                                              offset * OFFSET_SIZE
//
// The two differ only in the stride applied to `offset`; the element-wise
// load itself is VEC_LOAD<VEC_SIZE> in both cases.
#define VLOAD_HALF_VEC_IMPL(VEC_SIZE, OFFSET_SIZE, AS) \
_CLC_OVERLOAD _CLC_DEF float##VEC_SIZE \
__spirv_ocl_vload_halfn_Rfloat##VEC_SIZE(size_t offset, \
const AS half *mem) { \
offset *= VEC_SIZE; \
float##VEC_SIZE __tmp; \
VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp; \
} \
_CLC_OVERLOAD _CLC_DEF float##VEC_SIZE \
__spirv_ocl_vloada_halfn_Rfloat##VEC_SIZE(size_t offset, \
const AS half *mem) { \
offset *= OFFSET_SIZE; \
float##VEC_SIZE __tmp; \
VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp; \
}

// VLOAD_HALF_IMPL(AS) defines the scalar __spirv_ocl_vload_half overload for
// address space AS: it loads the single half at mem[offset] via VEC_LOAD1
// and returns it as a float.
#define VLOAD_HALF_IMPL(AS) \
_CLC_OVERLOAD _CLC_DEF float __spirv_ocl_vload_half(size_t offset, \
const AS half *mem) { \
float __tmp; \
VEC_LOAD1(__tmp, AS) return __tmp; \
}

// GEN_VLOAD_HALF(AS) emits the scalar vload_half overload plus the
// vload_halfn / vloada_halfn overloads for widths 2, 3, 4, 8 and 16 in
// address space AS. The second argument to VLOAD_HALF_VEC_IMPL is the
// vloada stride; it equals the width everywhere except for width 3, where
// a stride of 4 elements is used.
#define GEN_VLOAD_HALF(AS) \
VLOAD_HALF_IMPL(AS) \
VLOAD_HALF_VEC_IMPL(2, 2, AS) \
VLOAD_HALF_VEC_IMPL(3, 4, AS) \
VLOAD_HALF_VEC_IMPL(4, 4, AS) \
VLOAD_HALF_VEC_IMPL(8, 8, AS) \
VLOAD_HALF_VEC_IMPL(16, 16, AS)

// Instantiate for every address space.
GEN_VLOAD_HALF(__private)
GEN_VLOAD_HALF(__global)
GEN_VLOAD_HALF(__local)
GEN_VLOAD_HALF(__constant)

// __generic overloads are only emitted when the generic address space is
// distinct from the others.
#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
GEN_VLOAD_HALF(__generic)
#endif

// Tear down every helper macro defined above so that none of them leaks
// past this point.
//
// Half-load helpers.
#undef VLOAD_HALF_IMPL
#undef VLOAD_HALF_VEC_IMPL
#undef GEN_VLOAD_HALF
#undef VEC_LOAD16
#undef VEC_LOAD8
#undef VEC_LOAD4
#undef VEC_LOAD3
#undef VEC_LOAD2
#undef VEC_LOAD1
// Typed vload helpers. VLOAD_ADDR_SPACES_IMPL is undefined here as well; the
// list previously undefined VLOAD_VECTORIZE twice and left
// VLOAD_ADDR_SPACES_IMPL defined.
#undef VLOAD_TYPES
#undef VLOAD_ADDR_SPACES
#undef VLOAD_ADDR_SPACES_IMPL
#undef VLOAD_VECTORIZE_GENERIC
#undef VLOAD_VECTORIZE
#define __CLC_BODY "vload.inc"
#include <clc/math/gentype.inc>
106 changes: 106 additions & 0 deletions libclc/libspirv/lib/generic/shared/vload.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// N_IF_VEC supplies the `n` in the SPIR-V builtin name: it is empty while
// __CLC_SCALAR is defined and `n` otherwise, so that SPIRV_VLOAD_NAME yields
// `__spirv_ocl_vload_R...` for scalars and `__spirv_ocl_vloadn_R...` for
// vectors.
#ifdef __CLC_SCALAR
#define N_IF_VEC
#else
#define N_IF_VEC n
#endif

// Name-building helpers. __CLC_XCONCAT is defined elsewhere; it is presumably
// a paste-after-expansion macro, in which case the names come out as shown
// below (TODO confirm against its definition).

// CLC_VLOAD_NAME()  ->  __clc_vload<__CLC_VECSIZE>
// The CLC implementation that the SPIR-V vector overloads forward to.
#define CLC_VLOAD_NAME() __CLC_XCONCAT(__clc_vload, __CLC_VECSIZE)
// SPIRV_VLOAD_NAME()  ->  __spirv_ocl_vload<N_IF_VEC>_R<__CLC_GENTYPE>
#define SPIRV_VLOAD_NAME() \
__CLC_XCONCAT(__CLC_XCONCAT(__CLC_XCONCAT(__spirv_ocl_vload, N_IF_VEC), _R), \
__CLC_GENTYPE)

// CLC_VLOAD_HALF_NAME(a)  ->  __clc_vload<a>_half<__CLC_VECSIZE>
// `a` is either empty (vload_half) or the token `a` (vloada_half).
#define CLC_VLOAD_HALF_NAME(a) \
__CLC_XCONCAT(__CLC_XCONCAT(__CLC_XCONCAT(__clc_vload, a), _half), \
__CLC_VECSIZE)

// CLC_VLOAD_TY  ->  less_aligned_<__CLC_GENTYPE>
// The typedef itself is not declared in this file; it must already be in
// scope when this file is included.
#define CLC_VLOAD_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE)

// VLOAD_DEF(ADDRSPACE) defines the SPIR-V vload overload for the current
// gentype in one address space, and SPIRV_VLOAD_HALF_NAME(a) builds the name
// of the matching half-load builtin. Both have a scalar and a vector form.
#ifdef __CLC_SCALAR

// Scalar form: load x[offset] directly through the less-aligned type.
#define VLOAD_DEF(ADDRSPACE) \
_CLC_OVERLOAD _CLC_DEF CLC_VLOAD_TY SPIRV_VLOAD_NAME()( \
size_t offset, const ADDRSPACE __CLC_SCALAR_GENTYPE *x) { \
return *((const ADDRSPACE CLC_VLOAD_TY *)(&x[offset])); \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OpenCL spec doesn't define scalar variant of vloadn, right? Can we delete it?

}

// Scalar half loads have no `n` and no `_R<type>` suffix:
// __spirv_ocl_vload<a>_half
#define SPIRV_VLOAD_HALF_NAME(a) \
__CLC_XCONCAT(__CLC_XCONCAT(__spirv_ocl_vload, a), _half)

#else

// Vector form: forward to the CLC implementation named by CLC_VLOAD_NAME().
#define VLOAD_DEF(ADDRSPACE) \
_CLC_OVERLOAD _CLC_DEF CLC_VLOAD_TY SPIRV_VLOAD_NAME()( \
size_t offset, const ADDRSPACE __CLC_SCALAR_GENTYPE *x) { \
return CLC_VLOAD_NAME()(offset, x); \
}

// Vector half loads: __spirv_ocl_vload<a>_halfn_R<__CLC_GENTYPE>
#define SPIRV_VLOAD_HALF_NAME(a) \
__CLC_XCONCAT( \
__CLC_XCONCAT( \
__CLC_XCONCAT(__CLC_XCONCAT(__spirv_ocl_vload, a), _halfn), _R), \
__CLC_GENTYPE)

#endif

// Emit the overload for each address space; __generic only when it is a
// distinct address space.
VLOAD_DEF(__private)
VLOAD_DEF(__local)
VLOAD_DEF(__constant)
VLOAD_DEF(__global)

#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
VLOAD_DEF(__generic)
#endif

// These two are not needed by the half-load section below.
#undef VLOAD_DEF
#undef CLC_VLOAD_TY

// vload_half and vloada_half are available even if cl_khr_fp16 is unavailable.
// Define these functions when working on float types, which we know are
// always available.
#ifdef __CLC_FPSIZE
#if __CLC_FPSIZE == 32

// VLOAD_HALF_DEF(ADDRSPACE, A) defines the SPIR-V half-load builtin named by
// SPIRV_VLOAD_HALF_NAME(A) as a thin wrapper that forwards (offset, mem) to
// the CLC function named by CLC_VLOAD_HALF_NAME(A). A is empty for
// vload_half and `a` for vloada_half.
#define VLOAD_HALF_DEF(ADDRSPACE, A) \
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE SPIRV_VLOAD_HALF_NAME(A)( \
size_t offset, const ADDRSPACE half *mem) { \
return CLC_VLOAD_HALF_NAME(A)(offset, mem); \
}

// vload_half / vload_halfn: emitted for scalar and vector gentypes.
VLOAD_HALF_DEF(__private, )
VLOAD_HALF_DEF(__local, )
VLOAD_HALF_DEF(__constant, )
VLOAD_HALF_DEF(__global, )

#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
VLOAD_HALF_DEF(__generic, )
#endif

// vloada_halfn: emitted for vector gentypes only.
#ifndef __CLC_SCALAR
VLOAD_HALF_DEF(__private, a)
VLOAD_HALF_DEF(__local, a)
VLOAD_HALF_DEF(__constant, a)
VLOAD_HALF_DEF(__global, a)

#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
VLOAD_HALF_DEF(__generic, a)
#endif
#endif

#undef VLOAD_HALF_DEF
#endif
#endif

// Undefine the remaining helper macros; this file is pulled in through
// __CLC_BODY and defines them afresh each time it is processed.
#undef CLC_VLOAD_NAME
#undef CLC_VLOAD_HALF_NAME
#undef SPIRV_VLOAD_NAME
#undef SPIRV_VLOAD_HALF_NAME

#undef N_IF_VEC