From 599b1b96e80d479fe669b4e3bfecb794fa8c850c Mon Sep 17 00:00:00 2001
From: jinge90
Date: Wed, 26 Oct 2022 19:58:03 -0400
Subject: [PATCH] [SYCL][libdevice] Add type cast functions between half and
 float/integral type (#6930)

Signed-off-by: jinge90
---
 libdevice/device.h                            |   9 +
 libdevice/device_imf.hpp                      |   1 +
 libdevice/imf_half.hpp                        | 488 ++++++++++++++++--
 libdevice/imf_utils/double_convert.cpp        |   9 +
 libdevice/imf_utils/half_convert.cpp          | 488 ++++++++++++++++++
 libdevice/imf_wrapper.cpp                     | 449 ++++++++++++++++
 libdevice/imf_wrapper_fp64.cpp                |   8 +
 .../sycl-post-link/SYCLDeviceLibReqMask.cpp   |  59 +++
 sycl/include/sycl/builtins.hpp                |  58 +++
 9 files changed, 1528 insertions(+), 41 deletions(-)

diff --git a/libdevice/device.h b/libdevice/device.h
index 0770d7d82d29a..1b96a5031a039 100644
--- a/libdevice/device.h
+++ b/libdevice/device.h
@@ -40,4 +40,13 @@
 #define DEVICE_EXTERN_C_INLINE DEVICE_EXTERN_C __attribute__((always_inline))
 #endif // __LIBDEVICE_HOST_IMPL__
 
+// Rounding modes are used internally by the type conversion functions in imf
+// libdevice, and we don't want to include the system's fenv.h, so we define
+// our own enumeration here.
+typedef enum {
+  __IML_RTE, // round to nearest-even
+  __IML_RTZ, // round to zero
+  __IML_RTP, // round to +inf
+  __IML_RTN, // round to -inf
+} __iml_rounding_mode;
+
 #endif // __LIBDEVICE_DEVICE_H__
diff --git a/libdevice/device_imf.hpp b/libdevice/device_imf.hpp
index f7ee874aaf617..afc914892c52d 100644
--- a/libdevice/device_imf.hpp
+++ b/libdevice/device_imf.hpp
@@ -90,6 +90,7 @@ static inline TyFP __integral2FP_host(TyINT x, int rdMode) {
   fesetround(roundingOriginal);
   return res;
 }
+#pragma STDC FENV_ACCESS OFF
 #endif // __LIBDEVICE_HOST_IMPL__
 
 template <typename Ty> static inline Ty __imax(Ty x, Ty y) {
diff --git a/libdevice/imf_half.hpp b/libdevice/imf_half.hpp
index af8a32a389ae9..221665b64194e 100644
--- a/libdevice/imf_half.hpp
+++ b/libdevice/imf_half.hpp
@@ -10,8 +10,10 @@
 #define __LIBDEVICE_HALF_EMUL_H__
 
 #include "device.h"
+#include <cstddef>
 #include <cstdint>
-
+#include <limits>
+#include <type_traits>
 #ifdef __LIBDEVICE_IMF_ENABLED__
 
 #if defined(__SPIR__)
 typedef _Float16 _iml_half_internal;
 #else
 typedef uint16_t _iml_half_internal;
 #endif
 
-// TODO: need to support float to half conversion with different
-// rounding mode.
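+// Helper traits for the emulation below: __iml_get_unsigned maps a signed
+// integral type to the unsigned type of the same width, and __iml_fp_config
+// describes the bit layout of an IEEE-754 float/double (same-size integral
+// types and the exponent mask).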
+template <typename Ty> struct __iml_get_unsigned {};
+template <> struct __iml_get_unsigned<short> {
+  using utype = uint16_t;
+};
+
+template <> struct __iml_get_unsigned<int> {
+  using utype = uint32_t;
+};
+
+template <> struct __iml_get_unsigned<long long> {
+  using utype = uint64_t;
+};
+
+static uint16_t __iml_half_exp_mask = 0x7C00;
+
+template <typename Ty> struct __iml_fp_config {};
+
+template <> struct __iml_fp_config<float> {
+  // signed/unsigned integral type with the same size
+  using utype = uint32_t;
+  using stype = int32_t;
+  const static uint32_t exp_mask = 0xFF;
+};
+
+template <> struct __iml_fp_config<double> {
+  using utype = uint64_t;
+  using stype = int64_t;
+  const static uint64_t exp_mask = 0x7FF;
+};
+
+static uint16_t __iml_half_overflow_handle(__iml_rounding_mode rounding_mode,
+                                           uint16_t sign) {
+  if (rounding_mode == __IML_RTZ) {
+    return (sign << 15) | 0x7BFF;
+  }
+
+  if (rounding_mode == __IML_RTP && sign) {
+    return 0xFBFF;
+  }
+  if (rounding_mode == __IML_RTN && !sign) {
+    return 0x7BFF;
+  }
+  return (sign << 15) | 0x7C00;
+}
+
+static uint16_t __iml_half_underflow_handle(__iml_rounding_mode rounding_mode,
+                                            uint16_t sign) {
+  if (rounding_mode == __IML_RTN && sign) {
+    return 0x8001;
+  }
+
+  if (rounding_mode == __IML_RTP && !sign) {
+    return 0x1;
+  }
+  return (sign << 15);
+}
+
+template <typename Ty>
+static uint16_t __iml_fp2half(Ty x, __iml_rounding_mode rounding_mode) {
+  typedef typename __iml_fp_config<Ty>::utype UTy;
+  typedef typename __iml_fp_config<Ty>::stype STy;
+  union {
+    Ty xf;
+    UTy xu;
+  } xs;
+
+  // Extract the sign bit.
+  UTy one_bit = 0x1;
+  xs.xf = x;
+  uint16_t h_sign = xs.xu >> (sizeof(Ty) * 8 - 1);
+  // Extract the exponent and mantissa.
+  UTy x_exp = (xs.xu >> (std::numeric_limits<Ty>::digits - 1)) &
+              (__iml_fp_config<Ty>::exp_mask);
+  UTy x_mant =
+      xs.xu & ((one_bit << (std::numeric_limits<Ty>::digits - 1)) - 1);
+  STy x_exp1 = x_exp - std::numeric_limits<Ty>::max_exponent + 1;
+  uint16_t h_exp = static_cast<uint16_t>(x_exp1 + 15);
+  uint16_t mant_shift = std::numeric_limits<Ty>::digits - 11;
+  if (x_exp == __iml_fp_config<Ty>::exp_mask) {
+    uint16_t res;
+    if (x_mant) {
+      // NaN.
+      uint16_t h_mant = static_cast<uint16_t>(
+          x_mant >> (std::numeric_limits<Ty>::digits - 11));
+      h_mant |= 0x200;
+      res = (h_sign << 15) | __iml_half_exp_mask | h_mant;
+    } else {
+      // Infinity, zero mantissa.
+      res = (h_sign << 15) | __iml_half_exp_mask;
+    }
+    return res;
+  }
+
+  if (!x_exp && !x_mant) {
+    return (h_sign << 15);
+  }
+
+  // Overflow happens.
+  if (x_exp1 > 15) {
+    return __iml_half_overflow_handle(rounding_mode, h_sign);
+  }
+
+  // Underflow happens if |x| is less than the minimum denormal half value.
+  if (x_exp1 < -25) {
+    return __iml_half_underflow_handle(rounding_mode, h_sign);
+  }
+
+  // Some numbers must be encoded as denormals when converting to half;
+  // the minimum positive normalized half value is 2^-14.
+  if (x_exp1 < -14) {
+    h_exp = 0;
+    x_mant |= (one_bit << (std::numeric_limits<Ty>::digits - 1));
+    mant_shift = -x_exp1 - 14 + std::numeric_limits<Ty>::digits - 11;
+  }
+
+  uint16_t h_mant = (uint16_t)(x_mant >> mant_shift);
+  // Used to get the discarded mantissa bits from the original fp value.
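+  // mant_discard_mask selects the bits shifted out above, and mid_val is
+  // exactly half a unit in the last place of the kept mantissa; comparing the
+  // discarded bits against mid_val drives the four rounding modes below.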
+  UTy mant_discard_mask = ((UTy)1 << mant_shift) - 1;
+  UTy mid_val = (UTy)1 << (mant_shift - 1);
+  switch (rounding_mode) {
+  case __IML_RTZ:
+    break;
+  case __IML_RTP:
+    if ((x_mant & mant_discard_mask) && !h_sign) {
+      ++h_mant;
+    }
+    break;
+  case __IML_RTN:
+    if ((x_mant & mant_discard_mask) && h_sign) {
+      ++h_mant;
+    }
+    break;
+  case __IML_RTE: {
+    UTy tmp = x_mant & mant_discard_mask;
+    if ((tmp > mid_val) || ((tmp == mid_val) && ((h_mant & 0x1) == 0x1))) {
+      ++h_mant;
+    }
+    break;
+  }
+  }
+
+  if (h_mant & 0x400) {
+    h_exp += 1;
+    h_mant = 0;
+  }
+  return (h_sign << 15) | (h_exp << 10) | h_mant;
+}
+
+template <typename Ty>
+static Ty __iml_half2integral_u(uint16_t h,
+                                __iml_rounding_mode rounding_mode) {
+  static_assert(std::is_unsigned<Ty>::value && std::is_integral<Ty>::value,
+                "__iml_half2integral_u only accepts unsigned integral type.");
+  uint16_t h_sign = h >> 15;
+  uint16_t h_exp = (h >> 10) & 0x1F;
+  uint16_t h_mant = h & 0x3FF;
+  int16_t h_exp1 = (int16_t)h_exp - 15;
+  if (h_sign)
+    return 0;
+
+  // For subnormal values, return 1 if rounding to +infinity.
+  if (!h_exp)
+    return (h_mant && (__IML_RTP == rounding_mode)) ? 1 : 0;
+
+  // NaN converts to 0, +infinity to the maximum integral value.
+  if (h_exp == 0x1F)
+    return h_mant ? 0 : std::numeric_limits<Ty>::max();
+
+  // A normalized value can be represented as 1.significand * 2^h_exp1,
+  // which is equivalent to 1.significand * 2^10 * 2^(h_exp1 - 10),
+  // with -24 <= h_exp1 - 10 <= 5.
+  Ty x_val = h_mant;
+  Ty x_discard;
+  x_val |= (0x1 << 10);
+  h_exp1 -= 10;
+
+  if (h_exp1 >= 0)
+    return x_val <<= h_exp1;
+
+  // h_exp1 < 0, so we shift right by -h_exp1 bits. If -h_exp1 > 11, the
+  // value is less than 0.5 and RTE needs no special care.
+  if (-h_exp1 > 11)
+    return (__IML_RTP == rounding_mode) ? 1 : 0;
+
+  x_discard = x_val & (((Ty)1 << -h_exp1) - 1);
+  Ty mid = 1 << (-h_exp1 - 1);
+  x_val >>= -h_exp1;
+  if (!x_discard)
+    return x_val;
+  switch (rounding_mode) {
+  case __IML_RTE:
+    if ((x_discard > mid) || ((x_discard == mid) && ((x_val & 0x1) == 0x1)))
+      x_val++;
+    break;
+  case __IML_RTN:
+    break;
+  case __IML_RTP:
+    x_val++;
+    break;
+  case __IML_RTZ:
+    break;
+  }
+
+  return x_val;
+}
+
+template <typename Ty>
+static Ty __iml_half2integral_s(uint16_t h,
+                                __iml_rounding_mode rounding_mode) {
+  static_assert(std::is_signed<Ty>::value && std::is_integral<Ty>::value,
+                "__iml_half2integral_s only accepts signed integral type.");
+  typedef typename __iml_get_unsigned<Ty>::utype UTy;
+  uint16_t h_sign = h >> 15;
+  uint16_t h_exp = (h >> 10) & 0x1F;
+  uint16_t h_mant = h & 0x3FF;
+  int h_exp1 = (int16_t)h_exp - 15;
+  if (!h_exp) {
+    if (!h_mant)
+      return 0;
+    else {
+      // For subnormal values.
+      if (h_sign && (__IML_RTN == rounding_mode))
+        return -1;
+      if (!h_sign && (__IML_RTP == rounding_mode))
+        return 1;
+      return 0;
+    }
+  }
+
+  if (h_exp == 0x1F) {
+    // For NaN, return 0.
+    if (h_mant) {
+      return 0;
+    } else {
+      // For +/-infinity, return the maximum/minimum integral value.
+      return h_sign ? std::numeric_limits<Ty>::min()
+                    : std::numeric_limits<Ty>::max();
+    }
+  }
+
+  // A normalized value can be represented as 1.significand * 2^h_exp1,
+  // which is equivalent to 1.significand * 2^10 * 2^(h_exp1 - 10),
+  // with -24 <= h_exp1 - 10 <= 5.
+  UTy x_val = h_mant;
+  UTy x_discard;
+  x_val |= (0x1 << 10);
+  h_exp1 -= 10;
+  // Overflow happens.
+  if (h_exp1 >= (int)((sizeof(Ty) * 8) - 11)) {
+    return h_sign ? std::numeric_limits<Ty>::min()
+                  : std::numeric_limits<Ty>::max();
+  }
+
+  if (h_exp1 >= 0) {
+    x_val <<= h_exp1;
+    return !h_sign ? x_val : (~x_val + 1);
+  }
+
+  // h_exp1 < 0, so we shift right by -h_exp1 bits. If -h_exp1 > 11, the
+  // value is less than 0.5 and RTE needs no special care.
+  if (-h_exp1 > 11) {
+    if (h_sign && (__IML_RTN == rounding_mode))
+      return -1;
+    if (!h_sign && (__IML_RTP == rounding_mode))
+      return 1;
+    return 0;
+  }
+
+  x_discard = x_val & (((UTy)1 << -h_exp1) - 1);
+  UTy mid = (UTy)1 << (-h_exp1 - 1);
+  x_val >>= -h_exp1;
+  if (!x_discard)
+    return x_val;
+  switch (rounding_mode) {
+  case __IML_RTE:
+    if ((x_discard > mid) || ((x_discard == mid) && ((x_val & 0x1) == 0x1)))
+      x_val++;
+    break;
+  case __IML_RTN:
+    if (h_sign)
+      x_val++;
+    break;
+  case __IML_RTP:
+    if (!h_sign)
+      x_val++;
+    break;
+  case __IML_RTZ:
+    break;
+  }
+
+  return !h_sign ? x_val : (~x_val + 1);
+}
+
+// Precondition: the input value is not 0.
+template <typename Ty> static size_t get_msb_pos(Ty x) {
+  size_t idx = 0;
+  Ty mask = ((Ty)1 << (sizeof(Ty) * 8 - 1));
+  for (idx = 0; idx < (sizeof(Ty) * 8); ++idx) {
+    if ((x & mask) == mask)
+      break;
+    mask >>= 1;
+  }
+
+  return (sizeof(Ty) * 8 - 1 - idx);
+}
+
+template <typename Ty>
+static uint16_t __iml_integral2half_u(Ty u,
+                                      __iml_rounding_mode rounding_mode) {
+  static_assert(std::is_unsigned<Ty>::value && std::is_integral<Ty>::value,
+                "__iml_integral2half_u only accepts unsigned integral type.");
+  if (!u)
+    return 0;
+  size_t msb_pos = get_msb_pos(u);
+  // Return the half representation of 1.
+  if (msb_pos == 0)
+    return 0x3C00;
+  Ty mant = u & (((Ty)1 << msb_pos) - 1);
+  // An unsigned integral value can be represented as 1.mant * 2^msb_pos;
+  // msb_pos is also the number of mantissa bits, 0 < msb_pos < sizeof(Ty) * 8,
+  // and the exponent range of normalized half values is [-14, 15].
+  bool is_overflow = false;
+  if (msb_pos > 15)
+    is_overflow = true;
+
+  uint16_t h_exp = msb_pos;
+  uint16_t h_mant;
+  if (!is_overflow) {
+    if (msb_pos <= 10) {
+      mant <<= (10 - msb_pos);
+      h_mant = (uint16_t)mant;
+    } else {
+      h_mant = (uint16_t)(mant >> (msb_pos - 10));
+      Ty mant_discard = mant & (((Ty)1 << (msb_pos - 10)) - 1);
+      Ty mid = (Ty)1 << (msb_pos - 11);
+      switch (rounding_mode) {
+      case __IML_RTE:
+        if ((mant_discard > mid) ||
+            ((mant_discard == mid) && ((h_mant & 0x1) == 0x1)))
+          h_mant++;
+        break;
+      case __IML_RTP:
+        if (mant_discard)
+          h_mant++;
+        break;
+      case __IML_RTN:
+      case __IML_RTZ:
+        break;
+      }
+    }
+  }
+
+  if (h_mant == 0x400) {
+    h_exp++;
+    h_mant = 0;
+    if (h_exp > 15)
+      is_overflow = true;
+  }
+
+  if (is_overflow) {
+    // According to the IEEE-754 standard (Ch 7.4), RTE carries all overflows
+    // to infinity with the sign, RTZ carries all overflows to the format's
+    // largest finite number with the sign, RTN carries positive overflows to
+    // the format's largest finite number and negative overflows to -infinity,
+    // and RTP carries negative overflows to the format's most negative finite
+    // number and positive overflows to +infinity.
+    if (__IML_RTZ == rounding_mode || __IML_RTN == rounding_mode)
+      return 0x7BFF;
+    else
+      return 0x7C00;
+  }
+  h_exp += 15;
+  return (h_exp << 10) | h_mant;
+}
+
+template <typename Ty>
+static uint16_t __iml_integral2half_s(Ty i,
+                                      __iml_rounding_mode rounding_mode) {
+  static_assert(std::is_signed<Ty>::value && std::is_integral<Ty>::value,
+                "__iml_integral2half_s only accepts signed integral type.");
+
+  typedef typename __iml_get_unsigned<Ty>::utype UTy;
+  if (!i)
+    return 0;
+  uint16_t h_sign = (i >= 0) ? 0 : 0x8000;
+  UTy ui = (i > 0) ? static_cast<UTy>(i) : static_cast<UTy>(-i);
+  size_t msb_pos = get_msb_pos(ui);
+  if (msb_pos == 0)
+    return h_sign ? 0xBC00 : 0x3C00;
+  UTy mant = ui & (((UTy)1 << msb_pos) - 1);
+  bool is_overflow = false;
+  if (msb_pos > 15)
+    is_overflow = true;
+
+  uint16_t h_exp = msb_pos;
+  uint16_t h_mant;
+  if (!is_overflow) {
+    if (msb_pos <= 10) {
+      mant <<= (10 - msb_pos);
+      h_mant = (uint16_t)mant;
+    } else {
+      h_mant = (uint16_t)(mant >> (msb_pos - 10));
+      UTy mant_discard = mant & (((UTy)1 << (msb_pos - 10)) - 1);
+      UTy mid = (UTy)1 << (msb_pos - 11);
+      switch (rounding_mode) {
+      case __IML_RTE:
+        if ((mant_discard > mid) ||
+            ((mant_discard == mid) && ((h_mant & 0x1) == 0x1)))
+          h_mant++;
+        break;
+      case __IML_RTP:
+        if (mant_discard && !h_sign)
+          h_mant++;
+        break;
+      case __IML_RTN:
+        if (mant_discard && h_sign)
+          h_mant++;
+        break;
+      case __IML_RTZ:
+        break;
+      }
+    }
+  }
+
+  if (h_mant == 0x400) {
+    h_exp++;
+    h_mant = 0;
+    if (h_exp > 15)
+      is_overflow = true;
+  }
+
+  if (is_overflow) {
+    // Overflow handling follows the same IEEE-754 (Ch 7.4) rules as above,
+    // applied with the sign of the input.
+    if (__IML_RTE == rounding_mode ||
+        ((__IML_RTP == rounding_mode) && !h_sign))
+      return h_sign ? 0xFC00 : 0x7C00;
+    if (__IML_RTZ == rounding_mode ||
+        ((__IML_RTN == rounding_mode) && !h_sign) ||
+        ((__IML_RTP == rounding_mode) && h_sign))
+      return h_sign ? 0xFBFF : 0x7BFF;
+    return 0xFC00;
+  }
+  h_exp += 15;
+  return h_sign | (h_exp << 10) | h_mant;
+}
+
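+// The host (__LIBDEVICE_HOST_IMPL__) paths below reuse the emulation routines
+// above; on SPIR-V targets each rounding suffix instead maps onto the
+// corresponding __spirv_FConvert/Convert* conversion built-in.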
 static inline _iml_half_internal __float2half(float x) {
 #if defined(__LIBDEVICE_HOST_IMPL__)
-  uint32_t fp32_bits = __builtin_bit_cast(uint32_t, x);
-
-  const uint16_t sign = (fp32_bits & 0x80000000) >> 16;
-  const uint32_t frac32 = fp32_bits & 0x7fffff;
-  const uint8_t exp32 = (fp32_bits & 0x7f800000) >> 23;
-  const int16_t exp32_diff = exp32 - 127;
-
-  // initialize to 0, covers the case for 0 and small numbers
-  uint16_t exp16 = 0, frac16 = 0;
-
-  if (__builtin_expect(exp32_diff > 15, 0)) {
-    // Infinity and big numbers convert to infinity
-    exp16 = 0x1f;
-  } else if (__builtin_expect(exp32_diff > -14, 0)) {
-    // normal range for half type
-    exp16 = exp32_diff + 15;
-    // convert 23-bit mantissa to 10-bit mantissa.
-    frac16 = frac32 >> 13;
-    if (frac32 >> 12 & 0x01)
-      frac16 += 1;
-  } else if (__builtin_expect(exp32_diff > -24, 0)) {
-    // subnormals
-    frac16 = (frac32 | (uint32_t(1) << 23)) >> (-exp32_diff - 1);
-  }
-
-  if (__builtin_expect(exp32 == 0xff && frac32 != 0, 0)) {
-    // corner case: FP32 is NaN
-    exp16 = 0x1F;
-    frac16 = 0x200;
-  }
-
-  // Compose the final FP16 binary
-  uint16_t res = 0;
-  res |= sign;
-  res |= exp16 << 10;
-  res += frac16; // Add the carry bit from operation Frac16 += 1;
-
-  return res;
+  return __iml_fp2half(x, __IML_RTE);
 #elif defined(__SPIR__)
   return __spirv_FConvert_Rhalf_rte(x);
 #endif
diff --git a/libdevice/imf_utils/double_convert.cpp b/libdevice/imf_utils/double_convert.cpp
index 6506f8a40e250..61a606dc37062 100644
--- a/libdevice/imf_utils/double_convert.cpp
+++ b/libdevice/imf_utils/double_convert.cpp
@@ -441,4 +441,13 @@ DEVICE_EXTERN_C_INLINE
 double __devicelib_imf_ull2double_rz(unsigned long long int x) {
   return __ull2double_rz(x);
 }
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_double2half(double x) {
+#if defined(__LIBDEVICE_HOST_IMPL__)
+  return __iml_fp2half(x, __IML_RTE);
+#elif defined(__SPIR__)
+  return __spirv_FConvert_Rhalf_rte(x);
+#endif
+}
 #endif // __LIBDEVICE_IMF_ENABLED__
diff --git a/libdevice/imf_utils/half_convert.cpp b/libdevice/imf_utils/half_convert.cpp
index 25f3d2a010914..9c20ad5e674f4 100644
--- a/libdevice/imf_utils/half_convert.cpp
+++ b/libdevice/imf_utils/half_convert.cpp
@@ -15,4 +15,492 @@ DEVICE_EXTERN_C_INLINE
 float __devicelib_imf_half2float(_iml_half_internal x) {
   return __half2float(x);
 }
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_float2half_rd(float x) {
+#if defined(__SPIR__)
+  return __spirv_FConvert_Rhalf_rtn(x);
+#else
+  return __iml_fp2half(x, __IML_RTN);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_float2half_rn(float x) {
+#if defined(__SPIR__)
+  return __spirv_FConvert_Rhalf_rte(x);
+#else
+  return __iml_fp2half(x, __IML_RTE);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_float2half_ru(float x) {
+#if defined(__SPIR__)
+  return __spirv_FConvert_Rhalf_rtp(x);
+#else
+  return __iml_fp2half(x, __IML_RTP);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_float2half_rz(float x) {
+#if defined(__SPIR__)
+  return __spirv_FConvert_Rhalf_rtz(x);
+#else
+  return __iml_fp2half(x, __IML_RTZ);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+int __devicelib_imf_half2int_rd(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToS_Rint_rtn(h);
+#else
+  return __iml_half2integral_s<int>(h, __IML_RTN);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+int __devicelib_imf_half2int_rn(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToS_Rint_rte(h);
+#else
+  return __iml_half2integral_s<int>(h, __IML_RTE);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+int __devicelib_imf_half2int_ru(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToS_Rint_rtp(h);
+#else
+  return __iml_half2integral_s<int>(h, __IML_RTP);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+int __devicelib_imf_half2int_rz(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToS_Rint_rtz(h);
+#else
+  return __iml_half2integral_s<int>(h, __IML_RTZ);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+long long __devicelib_imf_half2ll_rd(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToS_Rlong_rtn(h);
+#else
+  return __iml_half2integral_s<long long>(h, __IML_RTN);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+long long __devicelib_imf_half2ll_rn(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToS_Rlong_rte(h);
+#else
+  return __iml_half2integral_s<long long>(h, __IML_RTE);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+long long __devicelib_imf_half2ll_ru(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToS_Rlong_rtp(h);
+#else
+  return __iml_half2integral_s<long long>(h, __IML_RTP);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+long long __devicelib_imf_half2ll_rz(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToS_Rlong_rtz(h);
+#else
+  return __iml_half2integral_s<long long>(h, __IML_RTZ);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+short __devicelib_imf_half2short_rd(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToS_Rshort_rtn(h);
+#else
+  return __iml_half2integral_s<short>(h, __IML_RTN);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+short __devicelib_imf_half2short_rn(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToS_Rshort_rte(h);
+#else
+  return __iml_half2integral_s<short>(h, __IML_RTE);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+short __devicelib_imf_half2short_ru(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToS_Rshort_rtp(h);
+#else
+  return __iml_half2integral_s<short>(h, __IML_RTP);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+short __devicelib_imf_half2short_rz(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToS_Rshort_rtz(h);
+#else
+  return __iml_half2integral_s<short>(h, __IML_RTZ);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+unsigned int __devicelib_imf_half2uint_rd(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToU_Ruint_rtn(h);
+#else
+  return __iml_half2integral_u<unsigned int>(h, __IML_RTN);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+unsigned int __devicelib_imf_half2uint_rn(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToU_Ruint_rte(h);
+#else
+  return __iml_half2integral_u<unsigned int>(h, __IML_RTE);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+unsigned int __devicelib_imf_half2uint_ru(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToU_Ruint_rtp(h);
+#else
+  return __iml_half2integral_u<unsigned int>(h, __IML_RTP);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+unsigned int __devicelib_imf_half2uint_rz(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToU_Ruint_rtz(h);
+#else
+  return __iml_half2integral_u<unsigned int>(h, __IML_RTZ);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+unsigned long long __devicelib_imf_half2ull_rd(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToU_Rulong_rtn(h);
+#else
+  return __iml_half2integral_u<unsigned long long>(h, __IML_RTN);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+unsigned long long __devicelib_imf_half2ull_rn(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToU_Rulong_rte(h);
+#else
+  return __iml_half2integral_u<unsigned long long>(h, __IML_RTE);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+unsigned long long __devicelib_imf_half2ull_ru(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToU_Rulong_rtp(h);
+#else
+  return __iml_half2integral_u<unsigned long long>(h, __IML_RTP);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+unsigned long long __devicelib_imf_half2ull_rz(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToU_Rulong_rtz(h);
+#else
+  return __iml_half2integral_u<unsigned long long>(h, __IML_RTZ);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+unsigned short __devicelib_imf_half2ushort_rd(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToU_Rushort_rtn(h);
+#else
+  return __iml_half2integral_u<unsigned short>(h, __IML_RTN);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+unsigned short __devicelib_imf_half2ushort_rn(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToU_Rushort_rte(h);
+#else
+  return __iml_half2integral_u<unsigned short>(h, __IML_RTE);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+unsigned short __devicelib_imf_half2ushort_ru(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToU_Rushort_rtp(h);
+#else
+  return __iml_half2integral_u<unsigned short>(h, __IML_RTP);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+unsigned short __devicelib_imf_half2ushort_rz(_iml_half_internal h) {
+#if defined(__SPIR__)
+  return __spirv_ConvertFToU_Rushort_rtz(h);
+#else
+  return __iml_half2integral_u<unsigned short>(h, __IML_RTZ);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+short __devicelib_imf_half_as_short(_iml_half_internal h) {
+  return __builtin_bit_cast(short, h);
+}
+
+DEVICE_EXTERN_C_INLINE
+unsigned short __devicelib_imf_half_as_ushort(_iml_half_internal h) {
+  return __builtin_bit_cast(unsigned short, h);
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_int2half_rd(int x) {
+#if defined(__SPIR__)
+  return __spirv_ConvertSToF_Rhalf_rtn(x);
+#else
+  return __iml_integral2half_s(x, __IML_RTN);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_int2half_rn(int x) {
+#if defined(__SPIR__)
+  return __spirv_ConvertSToF_Rhalf_rte(x);
+#else
+  return __iml_integral2half_s(x, __IML_RTE);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_int2half_ru(int x) {
+#if defined(__SPIR__)
+  return __spirv_ConvertSToF_Rhalf_rtp(x);
+#else
+  return __iml_integral2half_s(x, __IML_RTP);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_int2half_rz(int x) {
+#if defined(__SPIR__)
+  return __spirv_ConvertSToF_Rhalf_rtz(x);
+#else
+  return __iml_integral2half_s(x, __IML_RTZ);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_ll2half_rd(long long x) {
+#if defined(__SPIR__)
+  return __spirv_ConvertSToF_Rhalf_rtn((int64_t)x);
+#else
+  return __iml_integral2half_s(x, __IML_RTN);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_ll2half_rn(long long x) {
+#if defined(__SPIR__)
+  return __spirv_ConvertSToF_Rhalf_rte((int64_t)x);
+#else
+  return __iml_integral2half_s(x, __IML_RTE);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_ll2half_ru(long long x) {
+#if defined(__SPIR__)
+  return __spirv_ConvertSToF_Rhalf_rtp((int64_t)x);
+#else
+  return __iml_integral2half_s(x, __IML_RTP);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_ll2half_rz(long long x) {
+#if defined(__SPIR__)
+  return __spirv_ConvertSToF_Rhalf_rtz((int64_t)x);
+#else
+  return __iml_integral2half_s(x, __IML_RTZ);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_short2half_rd(short x) {
+#if defined(__SPIR__)
+  return __spirv_ConvertSToF_Rhalf_rtn(x);
+#else
+  return __iml_integral2half_s(x, __IML_RTN);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_short2half_rn(short x) {
+#if defined(__SPIR__)
+  return __spirv_ConvertSToF_Rhalf_rte(x);
+#else
+  return __iml_integral2half_s(x, __IML_RTE);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_short2half_ru(short x) {
+#if defined(__SPIR__)
+  return __spirv_ConvertSToF_Rhalf_rtp(x);
+#else
+  return __iml_integral2half_s(x, __IML_RTP);
+#endif
+}
+
+DEVICE_EXTERN_C_INLINE
+_iml_half_internal __devicelib_imf_short2half_rz(short x) {
+#if defined(__SPIR__)
+  return __spirv_ConvertSToF_Rhalf_rtz(x);
+#else
+  return __iml_integral2half_s(x, __IML_RTZ);
+#endif +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_short_as_half(short x) { + return __builtin_bit_cast(_iml_half_internal, x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_uint2half_rd(unsigned int x) { +#if defined(__SPIR__) + return __spirv_ConvertUToF_Rhalf_rtn(x); +#else + return __iml_integral2half_u(x, __IML_RTN); +#endif +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_uint2half_rn(unsigned int x) { +#if defined(__SPIR__) + return __spirv_ConvertUToF_Rhalf_rte(x); +#else + return __iml_integral2half_u(x, __IML_RTE); +#endif +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_uint2half_ru(unsigned int x) { +#if defined(__SPIR__) + return __spirv_ConvertUToF_Rhalf_rtp(x); +#else + return __iml_integral2half_u(x, __IML_RTP); +#endif +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_uint2half_rz(unsigned int x) { +#if defined(__SPIR__) + return __spirv_ConvertUToF_Rhalf_rtz(x); +#else + return __iml_integral2half_u(x, __IML_RTZ); +#endif +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ull2half_rd(unsigned long long x) { +#if defined(__SPIR__) + return __spirv_ConvertUToF_Rhalf_rtn((uint64_t)x); +#else + return __iml_integral2half_u(x, __IML_RTN); +#endif +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ull2half_rn(unsigned long long x) { +#if defined(__SPIR__) + return __spirv_ConvertUToF_Rhalf_rte((uint64_t)x); +#else + return __iml_integral2half_u(x, __IML_RTE); +#endif +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ull2half_ru(unsigned long long x) { +#if defined(__SPIR__) + return __spirv_ConvertUToF_Rhalf_rtp((uint64_t)x); +#else + return __iml_integral2half_u(x, __IML_RTP); +#endif +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ull2half_rz(unsigned long long x) { +#if defined(__SPIR__) + return __spirv_ConvertUToF_Rhalf_rtz((uint64_t)x); +#else + return __iml_integral2half_u(x, __IML_RTZ); +#endif +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ushort2half_rd(unsigned short x) { +#if defined(__SPIR__) + return __spirv_ConvertUToF_Rhalf_rtn(x); +#else + return __iml_integral2half_u(x, __IML_RTN); +#endif +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ushort2half_rn(unsigned short x) { +#if defined(__SPIR__) + return __spirv_ConvertUToF_Rhalf_rte(x); +#else + return __iml_integral2half_u(x, __IML_RTE); +#endif +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ushort2half_ru(unsigned short x) { +#if defined(__SPIR__) + return __spirv_ConvertUToF_Rhalf_rtp(x); +#else + return __iml_integral2half_u(x, __IML_RTP); +#endif +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ushort2half_rz(unsigned short x) { +#if defined(__SPIR__) + return __spirv_ConvertUToF_Rhalf_rtz(x); +#else + return __iml_integral2half_u(x, __IML_RTZ); +#endif +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ushort_as_half(unsigned short x) { + return __builtin_bit_cast(_iml_half_internal, x); +} #endif // __LIBDEVICE_IMF_ENABLED__ diff --git a/libdevice/imf_wrapper.cpp b/libdevice/imf_wrapper.cpp index b3bc5e5058b22..8b10535569362 100644 --- a/libdevice/imf_wrapper.cpp +++ b/libdevice/imf_wrapper.cpp @@ -1327,4 +1327,453 @@ DEVICE_EXTERN_C_INLINE unsigned int __imf_vsadu4(unsigned int x, unsigned int y) { return __devicelib_imf_vsadu4(x, y); } + +// FP16 type cast functions +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_float2half_rn(float); + +DEVICE_EXTERN_C_INLINE 
+_iml_half_internal __devicelib_imf_float2half_rd(float); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_float2half_ru(float); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_float2half_rz(float); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_float2half_rn(float x) { + return __devicelib_imf_float2half_rn(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_float2half_rd(float x) { + return __devicelib_imf_float2half_rd(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_float2half_ru(float x) { + return __devicelib_imf_float2half_ru(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_float2half_rz(float x) { + return __devicelib_imf_float2half_rz(x); +} + +DEVICE_EXTERN_C_INLINE +int __devicelib_imf_half2int_rd(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +int __devicelib_imf_half2int_rn(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +int __devicelib_imf_half2int_ru(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +int __devicelib_imf_half2int_rz(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +int __imf_half2int_rd(_iml_half_internal h) { + return __devicelib_imf_half2int_rd(h); +} + +DEVICE_EXTERN_C_INLINE +int __imf_half2int_rn(_iml_half_internal h) { + return __devicelib_imf_half2int_rn(h); +} + +DEVICE_EXTERN_C_INLINE +int __imf_half2int_ru(_iml_half_internal h) { + return __devicelib_imf_half2int_ru(h); +} + +DEVICE_EXTERN_C_INLINE +int __imf_half2int_rz(_iml_half_internal h) { + return __devicelib_imf_half2int_rz(h); +} + +DEVICE_EXTERN_C_INLINE +long long __devicelib_imf_half2ll_rd(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +long long __devicelib_imf_half2ll_rn(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +long long __devicelib_imf_half2ll_ru(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +long long __devicelib_imf_half2ll_rz(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +long long __imf_half2ll_rd(_iml_half_internal h) { + return __devicelib_imf_half2ll_rd(h); +} + +DEVICE_EXTERN_C_INLINE +long long __imf_half2ll_rn(_iml_half_internal h) { + return __devicelib_imf_half2ll_rn(h); +} + +DEVICE_EXTERN_C_INLINE +long long __imf_half2ll_ru(_iml_half_internal h) { + return __devicelib_imf_half2ll_ru(h); +} + +DEVICE_EXTERN_C_INLINE +long long __imf_half2ll_rz(_iml_half_internal h) { + return __devicelib_imf_half2ll_rz(h); +} + +DEVICE_EXTERN_C_INLINE +short __devicelib_imf_half2short_rd(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +short __devicelib_imf_half2short_rn(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +short __devicelib_imf_half2short_ru(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +short __devicelib_imf_half2short_rz(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +short __imf_half2short_rd(_iml_half_internal h) { + return __devicelib_imf_half2short_rd(h); +} + +DEVICE_EXTERN_C_INLINE +short __imf_half2short_rn(_iml_half_internal h) { + return __devicelib_imf_half2short_rn(h); +} + +DEVICE_EXTERN_C_INLINE +short __imf_half2short_ru(_iml_half_internal h) { + return __devicelib_imf_half2short_ru(h); +} + +DEVICE_EXTERN_C_INLINE +short __imf_half2short_rz(_iml_half_internal h) { + return __devicelib_imf_half2short_rz(h); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_half2uint_rd(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_half2uint_rn(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_half2uint_ru(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +unsigned int __devicelib_imf_half2uint_rz(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +unsigned 
int __imf_half2uint_rd(_iml_half_internal h) { + return __devicelib_imf_half2uint_rd(h); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_half2uint_rn(_iml_half_internal h) { + return __devicelib_imf_half2uint_rn(h); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_half2uint_ru(_iml_half_internal h) { + return __devicelib_imf_half2uint_ru(h); +} + +DEVICE_EXTERN_C_INLINE +unsigned int __imf_half2uint_rz(_iml_half_internal h) { + return __devicelib_imf_half2uint_rz(h); +} + +DEVICE_EXTERN_C_INLINE +unsigned long long __devicelib_imf_half2ull_rd(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +unsigned long long __devicelib_imf_half2ull_rn(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +unsigned long long __devicelib_imf_half2ull_ru(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +unsigned long long __devicelib_imf_half2ull_rz(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +unsigned long long __imf_half2ull_rd(_iml_half_internal h) { + return __devicelib_imf_half2ull_rd(h); +} + +DEVICE_EXTERN_C_INLINE +unsigned long long __imf_half2ull_rn(_iml_half_internal h) { + return __devicelib_imf_half2ull_rn(h); +} + +DEVICE_EXTERN_C_INLINE +unsigned long long __imf_half2ull_ru(_iml_half_internal h) { + return __devicelib_imf_half2ull_ru(h); +} + +DEVICE_EXTERN_C_INLINE +unsigned long long __imf_half2ull_rz(_iml_half_internal h) { + return __devicelib_imf_half2ull_rz(h); +} + +DEVICE_EXTERN_C_INLINE +unsigned short __devicelib_imf_half2ushort_rd(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +unsigned short __devicelib_imf_half2ushort_rn(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +unsigned short __devicelib_imf_half2ushort_ru(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +unsigned short __devicelib_imf_half2ushort_rz(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +unsigned short __imf_half2ushort_rd(_iml_half_internal h) { + return __devicelib_imf_half2ushort_rd(h); +} + +DEVICE_EXTERN_C_INLINE +unsigned short __imf_half2ushort_rn(_iml_half_internal h) { + return __devicelib_imf_half2ushort_rn(h); +} + +DEVICE_EXTERN_C_INLINE +unsigned short __imf_half2ushort_ru(_iml_half_internal h) { + return __devicelib_imf_half2ushort_ru(h); +} + +DEVICE_EXTERN_C_INLINE +unsigned short __imf_half2ushort_rz(_iml_half_internal h) { + return __devicelib_imf_half2ushort_rz(h); +} + +DEVICE_EXTERN_C_INLINE +short __devicelib_imf_half_as_short(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +unsigned short __devicelib_imf_half_as_ushort(_iml_half_internal); + +DEVICE_EXTERN_C_INLINE +short __imf_half_as_short(_iml_half_internal h) { + return __devicelib_imf_half_as_short(h); +} + +DEVICE_EXTERN_C_INLINE +unsigned short __imf_half_as_ushort(_iml_half_internal h) { + return __devicelib_imf_half_as_ushort(h); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_int2half_rd(int); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_int2half_rn(int); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_int2half_ru(int); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_int2half_rz(int); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_int2half_rd(int x) { + return __devicelib_imf_int2half_rd(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_int2half_rn(int x) { + return __devicelib_imf_int2half_rn(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_int2half_ru(int x) { + return __devicelib_imf_int2half_ru(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_int2half_rz(int x) { + return __devicelib_imf_int2half_rz(x); +} + +DEVICE_EXTERN_C_INLINE 
+_iml_half_internal __devicelib_imf_ll2half_rd(long long); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ll2half_rn(long long); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ll2half_ru(long long); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ll2half_rz(long long); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_ll2half_rd(long long x) { + return __devicelib_imf_ll2half_rd(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_ll2half_rn(long long x) { + return __devicelib_imf_ll2half_rn(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_ll2half_ru(long long x) { + return __devicelib_imf_ll2half_ru(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_ll2half_rz(long long x) { + return __devicelib_imf_ll2half_rz(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_short2half_rd(short); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_short2half_rn(short); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_short2half_ru(short); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_short2half_rz(short); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_short2half_rd(short x) { + return __devicelib_imf_short2half_rd(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_short2half_rn(short x) { + return __devicelib_imf_short2half_rn(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_short2half_ru(short x) { + return __devicelib_imf_short2half_ru(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_short2half_rz(short x) { + return __devicelib_imf_short2half_rz(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_short_as_half(short); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_short_as_half(short x) { + return __devicelib_imf_short_as_half(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_uint2half_rd(unsigned int); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_uint2half_rn(unsigned int); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_uint2half_ru(unsigned int); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_uint2half_rz(unsigned int); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_uint2half_rd(unsigned int x) { + return __devicelib_imf_uint2half_rd(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_uint2half_rn(unsigned int x) { + return __devicelib_imf_uint2half_rn(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_uint2half_ru(unsigned int x) { + return __devicelib_imf_uint2half_ru(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_uint2half_rz(unsigned int x) { + return __devicelib_imf_uint2half_rz(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ull2half_rd(unsigned long long); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ull2half_rn(unsigned long long); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ull2half_ru(unsigned long long); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ull2half_rz(unsigned long long); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_ull2half_rd(unsigned long long x) { + return __devicelib_imf_ull2half_rd(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_ull2half_rn(unsigned long long x) { + return __devicelib_imf_ull2half_rn(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_ull2half_ru(unsigned long long x) { + return __devicelib_imf_ull2half_ru(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_ull2half_rz(unsigned 
long long x) { + return __devicelib_imf_ull2half_rz(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ushort2half_rd(unsigned short); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ushort2half_rn(unsigned short); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ushort2half_ru(unsigned short); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ushort2half_rz(unsigned short); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_ushort2half_rd(unsigned short x) { + return __devicelib_imf_ushort2half_rd(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_ushort2half_rn(unsigned short x) { + return __devicelib_imf_ushort2half_rn(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_ushort2half_ru(unsigned short x) { + return __devicelib_imf_ushort2half_ru(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_ushort2half_rz(unsigned short x) { + return __devicelib_imf_ushort2half_rz(x); +} + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_ushort_as_half(unsigned short); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_ushort_as_half(unsigned short x) { + return __devicelib_imf_ushort_as_half(x); +} #endif // __LIBDEVICE_IMF_ENABLED__ diff --git a/libdevice/imf_wrapper_fp64.cpp b/libdevice/imf_wrapper_fp64.cpp index eddba077a3f30..6cce2ac0e025e 100644 --- a/libdevice/imf_wrapper_fp64.cpp +++ b/libdevice/imf_wrapper_fp64.cpp @@ -361,4 +361,12 @@ DEVICE_EXTERN_C_INLINE double __imf_copysign(double x, double y) { return __devicelib_imf_copysign(x, y); } + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __devicelib_imf_double2half(double); + +DEVICE_EXTERN_C_INLINE +_iml_half_internal __imf_double2half(double x) { + return __devicelib_imf_double2half(x); +} #endif // __LIBDEVICE_IMF_ENABLED__ diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp index 445acfefb4145..f03f9d489dcc7 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp @@ -247,6 +247,64 @@ SYCLDeviceLibFuncMap SDLMap = { {"__devicelib_imf_ull2float_rn", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_ull2float_ru", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_ull2float_rz", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_float2half_rd", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_float2half_rn", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_float2half_ru", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_float2half_rz", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2float", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2int_rd", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2int_rn", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2int_ru", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2int_rz", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2ll_rd", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2ll_rn", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2ll_ru", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2ll_rz", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2short_rd", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2short_rn", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2short_ru", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2short_rz", 
DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2uint_rd", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2uint_rn", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2uint_ru", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2uint_rz", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2ull_rd", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2ull_rn", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2ull_ru", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2ull_rz", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2ushort_rd", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2ushort_rn", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2ushort_ru", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half2ushort_rz", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half_as_short", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_half_as_ushort", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_uint2half_rd", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_uint2half_rn", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_uint2half_ru", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_uint2half_rz", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_ull2half_rd", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_ull2half_rn", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_ull2half_ru", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_ull2half_rz", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_ushort2half_rd", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_ushort2half_rn", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_ushort2half_ru", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_ushort2half_rz", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_ushort_as_half", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_int2half_rd", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_int2half_rn", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_int2half_ru", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_int2half_rz", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_ll2half_rd", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_ll2half_rn", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_ll2half_ru", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_ll2half_rz", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_short2half_rd", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_short2half_rn", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_short2half_ru", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_short2half_rz", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_short_as_half", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_fmaf16", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_floorf16", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_ceilf16", DeviceLibExt::cl_intel_devicelib_imf}, @@ -342,6 +400,7 @@ SYCLDeviceLibFuncMap SDLMap = { {"__devicelib_imf_vsads4", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vsadu2", DeviceLibExt::cl_intel_devicelib_imf}, {"__devicelib_imf_vsadu4", DeviceLibExt::cl_intel_devicelib_imf}, + {"__devicelib_imf_double2half", DeviceLibExt::cl_intel_devicelib_imf_fp64}, 
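+    // Note: double2half requires FP64 support, so it is mapped to the
+    // imf_fp64 device library together with the entries below.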
{"__devicelib_imf_fma", DeviceLibExt::cl_intel_devicelib_imf_fp64}, {"__devicelib_imf_floor", DeviceLibExt::cl_intel_devicelib_imf_fp64}, {"__devicelib_imf_ceil", DeviceLibExt::cl_intel_devicelib_imf_fp64}, diff --git a/sycl/include/sycl/builtins.hpp b/sycl/include/sycl/builtins.hpp index 2e9fae173974a..4dbbec61409c8 100644 --- a/sycl/include/sycl/builtins.hpp +++ b/sycl/include/sycl/builtins.hpp @@ -1823,6 +1823,64 @@ extern SYCL_EXTERNAL float __imf_ull2float_rd(unsigned long long int x); extern SYCL_EXTERNAL float __imf_ull2float_rn(unsigned long long int x); extern SYCL_EXTERNAL float __imf_ull2float_ru(unsigned long long int x); extern SYCL_EXTERNAL float __imf_ull2float_rz(unsigned long long int x); +extern SYCL_EXTERNAL float __imf_half2float(_Float16 x); +extern SYCL_EXTERNAL _Float16 __imf_float2half_rd(float x); +extern SYCL_EXTERNAL _Float16 __imf_float2half_rn(float x); +extern SYCL_EXTERNAL _Float16 __imf_float2half_ru(float x); +extern SYCL_EXTERNAL _Float16 __imf_float2half_rz(float x); +extern SYCL_EXTERNAL int __imf_half2int_rd(_Float16 x); +extern SYCL_EXTERNAL int __imf_half2int_rn(_Float16 x); +extern SYCL_EXTERNAL int __imf_half2int_ru(_Float16 x); +extern SYCL_EXTERNAL int __imf_half2int_rz(_Float16 x); +extern SYCL_EXTERNAL long long __imf_half2ll_rd(_Float16 x); +extern SYCL_EXTERNAL long long __imf_half2ll_rn(_Float16 x); +extern SYCL_EXTERNAL long long __imf_half2ll_ru(_Float16 x); +extern SYCL_EXTERNAL long long __imf_half2ll_rz(_Float16 x); +extern SYCL_EXTERNAL short __imf_half2short_rd(_Float16 x); +extern SYCL_EXTERNAL short __imf_half2short_rn(_Float16 x); +extern SYCL_EXTERNAL short __imf_half2short_ru(_Float16 x); +extern SYCL_EXTERNAL short __imf_half2short_rz(_Float16 x); +extern SYCL_EXTERNAL unsigned int __imf_half2uint_rd(_Float16 x); +extern SYCL_EXTERNAL unsigned int __imf_half2uint_rn(_Float16 x); +extern SYCL_EXTERNAL unsigned int __imf_half2uint_ru(_Float16 x); +extern SYCL_EXTERNAL unsigned int __imf_half2uint_rz(_Float16 x); +extern SYCL_EXTERNAL unsigned long long __imf_half2ull_rd(_Float16 x); +extern SYCL_EXTERNAL unsigned long long __imf_half2ull_rn(_Float16 x); +extern SYCL_EXTERNAL unsigned long long __imf_half2ull_ru(_Float16 x); +extern SYCL_EXTERNAL unsigned long long __imf_half2ull_rz(_Float16 x); +extern SYCL_EXTERNAL unsigned short __imf_half2ushort_rd(_Float16 x); +extern SYCL_EXTERNAL unsigned short __imf_half2ushort_rn(_Float16 x); +extern SYCL_EXTERNAL unsigned short __imf_half2ushort_ru(_Float16 x); +extern SYCL_EXTERNAL unsigned short __imf_half2ushort_rz(_Float16 x); +extern SYCL_EXTERNAL short __imf_half_as_short(_Float16 x); +extern SYCL_EXTERNAL unsigned short __imf_half_as_ushort(_Float16 x); +extern SYCL_EXTERNAL _Float16 __imf_int2half_rd(int x); +extern SYCL_EXTERNAL _Float16 __imf_int2half_rn(int x); +extern SYCL_EXTERNAL _Float16 __imf_int2half_ru(int x); +extern SYCL_EXTERNAL _Float16 __imf_int2half_rz(int x); +extern SYCL_EXTERNAL _Float16 __imf_ll2half_rd(long long x); +extern SYCL_EXTERNAL _Float16 __imf_ll2half_rn(long long x); +extern SYCL_EXTERNAL _Float16 __imf_ll2half_ru(long long x); +extern SYCL_EXTERNAL _Float16 __imf_ll2half_rz(long long x); +extern SYCL_EXTERNAL _Float16 __imf_short2half_rd(short x); +extern SYCL_EXTERNAL _Float16 __imf_short2half_rn(short x); +extern SYCL_EXTERNAL _Float16 __imf_short2half_ru(short x); +extern SYCL_EXTERNAL _Float16 __imf_short2half_rz(short x); +extern SYCL_EXTERNAL _Float16 __imf_short_as_half(short x); +extern SYCL_EXTERNAL _Float16 __imf_uint2half_rd(unsigned int 
x); +extern SYCL_EXTERNAL _Float16 __imf_uint2half_rn(unsigned int x); +extern SYCL_EXTERNAL _Float16 __imf_uint2half_ru(unsigned int x); +extern SYCL_EXTERNAL _Float16 __imf_uint2half_rz(unsigned int x); +extern SYCL_EXTERNAL _Float16 __imf_ull2half_rd(unsigned long long x); +extern SYCL_EXTERNAL _Float16 __imf_ull2half_rn(unsigned long long x); +extern SYCL_EXTERNAL _Float16 __imf_ull2half_ru(unsigned long long x); +extern SYCL_EXTERNAL _Float16 __imf_ull2half_rz(unsigned long long x); +extern SYCL_EXTERNAL _Float16 __imf_ushort2half_rd(unsigned short x); +extern SYCL_EXTERNAL _Float16 __imf_ushort2half_rn(unsigned short x); +extern SYCL_EXTERNAL _Float16 __imf_ushort2half_ru(unsigned short x); +extern SYCL_EXTERNAL _Float16 __imf_ushort2half_rz(unsigned short x); +extern SYCL_EXTERNAL _Float16 __imf_ushort_as_half(unsigned short x); +extern SYCL_EXTERNAL _Float16 __imf_double2half(double x); extern SYCL_EXTERNAL _Float16 __imf_fmaf16(_Float16 x, _Float16 y, _Float16 z); extern SYCL_EXTERNAL _Float16 __imf_fabsf16(_Float16 x);
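
Illustrative usage: the four float->half entry points differ only in how the
13 discarded mantissa bits are rounded. A minimal host-side sketch, assuming a
non-SPIR-V build that compiles imf_half.hpp with __LIBDEVICE_IMF_ENABLED__
defined, so that _iml_half_internal is uint16_t and __iml_fp2half is visible:

    #include <cstdio>
    #include "imf_half.hpp" // assumed to be on the include path

    int main() {
      // 1 + 2^-11 lies exactly halfway between the halves 0x3C00 (1.0) and
      // 0x3C01 (1.0009765625), so the rounding modes disagree.
      float x = 1.0f + 0.00048828125f;
      printf("rn=0x%04x\n", (unsigned)__iml_fp2half(x, __IML_RTE)); // 0x3c00
      printf("ru=0x%04x\n", (unsigned)__iml_fp2half(x, __IML_RTP)); // 0x3c01
      printf("rd=0x%04x\n", (unsigned)__iml_fp2half(x, __IML_RTN)); // 0x3c00
      printf("rz=0x%04x\n", (unsigned)__iml_fp2half(x, __IML_RTZ)); // 0x3c00
      return 0;
    }

RTE breaks the tie toward the even mantissa (1.0), while RTP rounds the
positive value up; the same rd/rn/ru/rz suffix convention applies to every
__imf_*2half and __imf_half2* function declared above.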