diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
index 4af429916cd08..59725e7db8ecf 100644
--- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
@@ -474,8 +474,16 @@ lsc_gather(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
 /// Supported platforms: DG2, PVC
 /// VISA instruction: lsc_load.ugm
 ///
-/// Collects elements located at specified address and returns them
-/// as a single \ref simd object.
+/// Accesses a contiguous block of memory of `NElts * S` bytes starting from
+/// the given address, where S is the byte size of an "element" defined by the
+/// \c DS template parameter. The maximum size of the accessed block is
+/// 512 bytes for PVC and 256 bytes for ACM (DG2).
+/// When \c DS equals \c lsc_data_size::u64, the address must be 8-byte
+/// aligned; otherwise it must be 4-byte aligned. Allowed values for the data
+/// size are \c lsc_data_size::u32 and \c lsc_data_size::u64. Allowed \c NElts
+/// values are 1, 2, 3, 4, 8, 16, 32, 64.
+/// Note that to access 512 bytes, \c DS must be \c lsc_data_size::u64 and
+/// \c NElts must be 64.
 ///
 /// @tparam T is element type.
 /// @tparam NElts is the number of elements to load per address.
@@ -492,22 +500,34 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
           cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
 __ESIMD_API __ESIMD_NS::simd<T, NElts>
 lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1) {
-  detail::check_lsc_vector_size<NElts>();
   detail::check_lsc_data_size<T, DS>();
   detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
   constexpr uint16_t _AddressScale = 1;
   constexpr int _ImmOffset = 0;
   constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
-  static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
-                "Transposed load is supported only for data size u32 or u64");
-  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
   constexpr detail::lsc_data_order _Transposed =
       detail::lsc_data_order::transpose;
   constexpr int N = 1;
   __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
-  return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset,
-                                    _DS, _VS, _Transposed, N>(pred.data(),
-                                                              addrs.data());
+  constexpr int SmallIntFactor =
+      (_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
+  static_assert(NElts % SmallIntFactor == 0,
+                "Number of elements is not supported by Transposed load");
+
+  detail::check_lsc_vector_size<NElts / SmallIntFactor>();
+  constexpr detail::lsc_vector_size _VS =
+      detail::to_lsc_vector_size<NElts / SmallIntFactor>();
+  if constexpr (SmallIntFactor == 1) {
+    return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset,
+                                      _DS, _VS, _Transposed, N>(pred.data(),
+                                                                addrs.data());
+  } else {
+    __ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> result =
+        __esimd_lsc_load_stateless<uint32_t, L1H, L3H, _AddressScale,
+                                   _ImmOffset, lsc_data_size::u32, _VS,
+                                   _Transposed, N>(pred.data(), addrs.data());
+    return result.template bit_cast_view<T>();
+  }
 }
 
 /// Accessor-based transposed gather with 1 channel.
@@ -516,6 +536,8 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1) {
 ///
 /// Collects elements located at surface and returns them
 /// as a single \ref simd object.
+/// See comments in the \ref lsc_block_load API for description and parameter
+/// constraints.
 ///
 /// @tparam T is element type.
 /// @tparam NElts is the number of elements to load per address.
@@ -541,22 +563,36 @@ lsc_block_load(AccessorTy acc, uint32_t offset,
   return lsc_block_load<T, NElts, DS, L1H, L3H>(
       __ESIMD_DNS::accessorToPointer<T>(acc, offset), pred);
 #else
-  detail::check_lsc_vector_size<NElts>();
   detail::check_lsc_data_size<T, DS>();
   detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
   constexpr uint16_t _AddressScale = 1;
   constexpr int _ImmOffset = 0;
   constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
-  static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
-                "Transposed load is supported only for data size u32 or u64");
-  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
   constexpr detail::lsc_data_order _Transposed =
       detail::lsc_data_order::transpose;
   constexpr int N = 1;
   __ESIMD_NS::simd<uint32_t, N> offsets = offset;
   auto si = __ESIMD_NS::get_surface_index(acc);
-  return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
-                              _Transposed, N>(pred.data(), offsets.data(), si);
+  constexpr int SmallIntFactor =
+      (_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
+  static_assert(NElts % SmallIntFactor == 0,
+                "Number of elements is not supported by Transposed load");
+  detail::check_lsc_vector_size<NElts / SmallIntFactor>();
+  constexpr detail::lsc_vector_size _VS =
+      detail::to_lsc_vector_size<NElts / SmallIntFactor>();
+
+  if constexpr (SmallIntFactor == 1) {
+    return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
+                                _VS, _Transposed, N>(pred.data(),
+                                                     offsets.data(), si);
+  } else {
+
+    __ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> result =
+        __esimd_lsc_load_bti<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
+                             lsc_data_size::u32, _VS, _Transposed, N>(
+            pred.data(), offsets.data(), si);
+    return result.template bit_cast_view<T>();
+  }
 #endif
 }
 
@@ -622,6 +658,7 @@ __ESIMD_API void lsc_prefetch(const T *p) {
   constexpr uint16_t _AddressScale = 1;
   constexpr int _ImmOffset = 0;
   constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
+
   static_assert(
       _DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
       "Transposed prefetch is supported only for data size u32 or u64");
@@ -630,6 +667,7 @@ __ESIMD_API void lsc_prefetch(const T *p) {
       detail::lsc_data_order::transpose;
   constexpr int N = 1;
   __ESIMD_NS::simd_mask<N> pred = 1;
+
   __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
   __esimd_lsc_prefetch_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
                                  _VS, _Transposed, N>(pred.data(),
@@ -894,6 +932,8 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
 /// VISA instruction: lsc_store.ugm
 ///
 /// Scatters elements to specific address.
+/// See comments in the \ref lsc_block_load API for description and parameter
+/// constraints.
 ///
 /// @tparam T is element type.
 /// @tparam NElts is the number of elements to store per address.
@@ -910,22 +950,35 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
           cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
 __ESIMD_API void lsc_block_store(T *p, __ESIMD_NS::simd<T, NElts> vals,
                                  __ESIMD_NS::simd_mask<1> pred = 1) {
-  detail::check_lsc_vector_size<NElts>();
   detail::check_lsc_data_size<T, DS>();
   detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
   constexpr uint16_t _AddressScale = 1;
   constexpr int _ImmOffset = 0;
   constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
-  static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
-                "Transposed store is supported only for data size u32 or u64");
-  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
   constexpr detail::lsc_data_order _Transposed =
       detail::lsc_data_order::transpose;
   constexpr int N = 1;
   __ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
-  __esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
-                              _Transposed, N>(pred.data(), addrs.data(),
-                                              vals.data());
+  constexpr int SmallIntFactor =
+      (_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
+  static_assert(NElts % SmallIntFactor == 0,
+                "Number of elements is not supported by Transposed store");
+  detail::check_lsc_vector_size<NElts / SmallIntFactor>();
+  constexpr detail::lsc_vector_size _VS =
+      detail::to_lsc_vector_size<NElts / SmallIntFactor>();
+  if constexpr (SmallIntFactor == 1) {
+
+    __esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
+                                _VS, _Transposed, N>(pred.data(), addrs.data(),
+                                                     vals.data());
+  } else {
+    __ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> tmp =
+        vals.template bit_cast_view<uint32_t>();
+
+    __esimd_lsc_store_stateless<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
+                                lsc_data_size::u32, _VS, _Transposed, N>(
+        pred.data(), addrs.data(), tmp.data());
+  }
 }
 
 /// Accessor-based transposed scatter with 1 channel.
@@ -933,6 +986,8 @@ __ESIMD_API void lsc_block_store(T *p, __ESIMD_NS::simd<T, NElts> vals,
 /// VISA instruction: lsc_store.ugm
 ///
 /// Scatters elements to surface.
+/// See comments in the \ref lsc_block_load API for description and parameter
+/// constraints.
 ///
 /// @tparam T is element type.
 /// @tparam NElts is the number of elements to store per address.
@@ -958,23 +1013,36 @@ lsc_block_store(AccessorTy acc, uint32_t offset,
   lsc_block_store<T, NElts, DS, L1H, L3H>(
       __ESIMD_DNS::accessorToPointer<T>(acc, offset), vals, pred);
 #else
-  detail::check_lsc_vector_size<NElts>();
   detail::check_lsc_data_size<T, DS>();
   detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
   constexpr uint16_t _AddressScale = 1;
   constexpr int _ImmOffset = 0;
   constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
-  static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
-                "Transposed store is supported only for data size u32 or u64");
-  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
   constexpr detail::lsc_data_order _Transposed =
       detail::lsc_data_order::transpose;
   constexpr int N = 1;
+
   __ESIMD_NS::simd<uint32_t, N> offsets = offset;
   auto si = __ESIMD_NS::get_surface_index(acc);
-  __esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
-                        _Transposed, N>(pred.data(), offsets.data(),
-                                        vals.data(), si);
+  constexpr int SmallIntFactor =
+      (_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
+
+  detail::check_lsc_vector_size<NElts / SmallIntFactor>();
+  static_assert(NElts % SmallIntFactor == 0,
+                "Number of elements is not supported by Transposed store");
+  constexpr detail::lsc_vector_size _VS =
+      detail::to_lsc_vector_size<NElts / SmallIntFactor>();
+  if constexpr (SmallIntFactor > 1) {
+    __ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> Tmp =
+        vals.template bit_cast_view<uint32_t>();
+    __esimd_lsc_store_bti<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
+                          lsc_data_size::u32, _VS, _Transposed, N>(
+        pred.data(), offsets.data(), Tmp.data(), si);
+  } else {
+    __esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
+                          _Transposed, N>(pred.data(), offsets.data(),
+                                          vals.data(), si);
+  }
 #endif
 }
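
Usage note (not part of the patch): the sketch below shows what this change enables, namely calling lsc_block_load/lsc_block_store with 8- and 16-bit element types. Such types are folded into 32-bit words internally (the SmallIntFactor path), so NElts must be a multiple of 2 for 16-bit elements and of 4 for 8-bit elements; with the previous code the static_assert on u32/u64 data size rejected these types outright. The queue setup, buffer names, sizes, and umbrella include paths are illustrative assumptions, not taken from this diff.

// Minimal sketch, assuming a DG2/PVC device and a DPC++ build that provides
// the ESIMD headers below; names such as `q`, `in`, `out` are made up for
// this example.
#include <sycl/sycl.hpp>
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/ext/intel/experimental/esimd/memory.hpp>

namespace esimd = sycl::ext::intel::esimd;
namespace lsc = sycl::ext::intel::experimental::esimd;

int main() {
  sycl::queue q;
  constexpr unsigned NElts = 64; // 64 * sizeof(uint16_t) = 128-byte block
  uint16_t *in = sycl::malloc_shared<uint16_t>(NElts, q);
  uint16_t *out = sycl::malloc_shared<uint16_t>(NElts, q);
  for (unsigned i = 0; i < NElts; ++i)
    in[i] = static_cast<uint16_t>(i);

  q.single_task([=]() SYCL_ESIMD_KERNEL {
     // 16-bit elements: the implementation loads NElts / 2 dwords and
     // bit-casts them back to uint16_t, so NElts must be even here.
     esimd::simd<uint16_t, NElts> vals =
         lsc::lsc_block_load<uint16_t, NElts>(in);
     // Store the block back out through registers (a plain block copy).
     lsc::lsc_block_store<uint16_t, NElts>(out, vals);
   }).wait();

  sycl::free(in, q);
  sycl::free(out, q);
  return 0;
}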