[SYCL][ESIMD] Add 8/16-bit type support to lsc_block_load/store (#6757)
fineg74 authored Sep 22, 2022
1 parent b5023ea commit f9d8059
Showing 1 changed file with 97 additions and 29 deletions.
126 changes: 97 additions & 29 deletions sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
@@ -474,8 +474,16 @@ lsc_gather(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
/// Supported platforms: DG2, PVC
/// VISA instruction: lsc_load.ugm
///
/// Collects elements located at specified address and returns them
/// as a single \ref simd object.
/// Accesses a contiguous block of memory of `NElts * S` bytes starting at
/// the given address, where S is the byte size of an "element" defined by
/// the \c DS template parameter. The maximum size of the accessed block is
/// 512 bytes for PVC and 256 bytes for ACM (DG2).
/// When \c DS equals \c lsc_data_size::u64, the address must be 8-byte
/// aligned; otherwise it must be 4-byte aligned. Allowed values for the data
/// size are \c lsc_data_size::u32 and \c lsc_data_size::u64. Allowed \c NElts
/// values are 1, 2, 3, 4, 8, 16, 32, 64.
/// Note that to access 512 bytes, \c DS must be \c lsc_data_size::u64 and
/// \c NElts must be 64.
///
/// @tparam T is element type.
/// @tparam NElts is the number of elements to load per address.
@@ -492,22 +500,34 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
__ESIMD_API __ESIMD_NS::simd<T, NElts>
lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1) {
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed load is supported only for data size u32 or u64");
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(pred.data(),
addrs.data());
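  // Transposed LSC loads support only u32/u64 data sizes, so 8/16-bit
  // elements are packed into 32-bit chunks (2 x u16 or 4 x u8 per chunk)
  // and bit-cast back to T after the load.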
constexpr int SmallIntFactor =
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
static_assert(NElts % SmallIntFactor == 0,
"Number of elements is not supported by Transposed load");

detail::check_lsc_vector_size<NElts / SmallIntFactor>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / SmallIntFactor>();
if constexpr (SmallIntFactor == 1) {
return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset,
_DS, _VS, _Transposed, N>(pred.data(),
addrs.data());
} else {
__ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> result =
__esimd_lsc_load_stateless<uint32_t, L1H, L3H, _AddressScale,
_ImmOffset, lsc_data_size::u32, _VS,
_Transposed, N>(pred.data(), addrs.data());
return result.template bit_cast_view<T>();
}
}
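For illustration, a minimal usage sketch of the new 16-bit path (not part of this commit; the function name `copy_u16_block` and the namespace aliases are hypothetical):

```cpp
#include <cstdint>
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/ext/intel/experimental/esimd/memory.hpp>

namespace esimd = sycl::ext::intel::esimd;
namespace exp_esimd = sycl::ext::intel::experimental::esimd;

// Loads 32 x uint16_t (64 bytes) from USM. _DS finalizes to u16, so
// SmallIntFactor is 2 and the load is emitted as 16 transposed u32
// elements and bit-cast back; NElts must be a multiple of 2 for 16-bit
// types (a multiple of 4 for 8-bit types).
SYCL_EXTERNAL void copy_u16_block(const uint16_t *src, uint16_t *dst)
    SYCL_ESIMD_FUNCTION {
  esimd::simd<uint16_t, 32> v = exp_esimd::lsc_block_load<uint16_t, 32>(src);
  v.copy_to(dst);
}
```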

/// Accessor-based transposed gather with 1 channel.
@@ -516,6 +536,8 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1) {
///
/// Collects elements located at the given surface and returns them
/// as a single \ref simd object.
/// See comments in the \ref lsc_block_load API for description and parameter
/// constraints.
///
/// @tparam T is element type.
/// @tparam NElts is the number of elements to load per address.
@@ -541,22 +563,36 @@ lsc_block_load(AccessorTy acc, uint32_t offset,
return lsc_block_load<T, NElts, DS, L1H, L3H>(
__ESIMD_DNS::accessorToPointer<T>(acc, offset), pred);
#else
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed load is supported only for data size u32 or u64");
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd<uint32_t, N> offsets = offset;
auto si = __ESIMD_NS::get_surface_index(acc);
return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), offsets.data(), si);
constexpr int SmallIntFactor =
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
static_assert(NElts % SmallIntFactor == 0,
"Number of elements is not supported by Transposed load");
detail::check_lsc_vector_size<NElts / SmallIntFactor>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / SmallIntFactor>();

if constexpr (SmallIntFactor == 1) {
return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(pred.data(),
offsets.data(), si);
} else {

__ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> result =
__esimd_lsc_load_bti<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
lsc_data_size::u32, _VS, _Transposed, N>(
pred.data(), offsets.data(), si);
return result.template bit_cast_view<T>();
}
#endif
}
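A corresponding hedged sketch for the accessor overload (illustrative only; it reuses the includes and aliases from the previous sketch plus `<sycl/sycl.hpp>`, and the buffer size and kernel name `LoadU16` are assumptions):

```cpp
sycl::queue q;
sycl::buffer<uint16_t, 1> buf{sycl::range<1>{1024}};
q.submit([&](sycl::handler &h) {
  auto acc = buf.get_access<sycl::access_mode::read_write>(h);
  h.single_task<class LoadU16>([=]() SYCL_ESIMD_KERNEL {
    // Offsets are in bytes and must keep the 4-byte (8-byte for u64)
    // alignment rule from the lsc_block_load comments.
    esimd::simd<uint16_t, 16> v =
        exp_esimd::lsc_block_load<uint16_t, 16>(acc, 0);
    exp_esimd::lsc_block_store<uint16_t, 16>(acc, 64, v);
  });
});
```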

@@ -622,6 +658,7 @@ __ESIMD_API void lsc_prefetch(const T *p) {
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();

static_assert(
_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed prefetch is supported only for data size u32 or u64");
@@ -630,6 +667,7 @@ __ESIMD_API void lsc_prefetch(const T *p) {
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd_mask<N> pred = 1;

__ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
__esimd_lsc_prefetch_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(pred.data(),
@@ -894,6 +932,8 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
/// VISA instruction: lsc_store.ugm
///
/// Scatters elements to the specified address.
/// See comments in the \ref lsc_block_load API for description and parameter
/// constraints.
///
/// @tparam T is element type.
/// @tparam NElts is the number of elements to store per address.
@@ -910,29 +950,44 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
__ESIMD_API void lsc_block_store(T *p, __ESIMD_NS::simd<T, NElts> vals,
__ESIMD_NS::simd_mask<1> pred = 1) {
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed store is supported only for data size u32 or u64");
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
__esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), addrs.data(),
vals.data());
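  // Mirror of the load path: transposed stores support only u32/u64, so
  // 8/16-bit data is bit-cast to u32 chunks before the store.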
constexpr int SmallIntFactor =
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
static_assert(NElts % SmallIntFactor == 0,
"Number of elements is not supported by Transposed store");
detail::check_lsc_vector_size<NElts / SmallIntFactor>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / SmallIntFactor>();
if constexpr (SmallIntFactor == 1) {

__esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(pred.data(), addrs.data(),
vals.data());
} else {
__ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> tmp =
vals.template bit_cast_view<uint32_t>();

__esimd_lsc_store_stateless<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
lsc_data_size::u32, _VS, _Transposed, N>(
pred.data(), addrs.data(), tmp.data());
}
}
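A hedged sketch of the 8-bit store path (illustrative, reusing the aliases above; `fill_u8_block` is hypothetical):

```cpp
// Stores 16 x uint8_t. With u8 the SmallIntFactor is 4, so NElts must be
// a multiple of 4; the 16 bytes go out as a 4 x u32 transposed store.
SYCL_EXTERNAL void fill_u8_block(uint8_t *dst) SYCL_ESIMD_FUNCTION {
  esimd::simd<uint8_t, 16> v(0, 1); // 0, 1, 2, ..., 15
  exp_esimd::lsc_block_store<uint8_t, 16>(dst, v);
}
```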

/// Accessor-based transposed scatter with 1 channel.
/// Supported platforms: DG2, PVC
/// VISA instruction: lsc_store.ugm
///
/// Scatters elements to surface.
/// See comments in the \ref lsc_block_load API for description and parameter
/// constraints.
///
/// @tparam T is element type.
/// @tparam NElts is the number of elements to store per address.
@@ -958,23 +1013,36 @@ lsc_block_store(AccessorTy acc, uint32_t offset,
  lsc_block_store<T, NElts, DS, L1H, L3H>(
      __ESIMD_DNS::accessorToPointer<T>(acc, offset), vals, pred);
#else
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed store is supported only for data size u32 or u64");
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;

__ESIMD_NS::simd<uint32_t, N> offsets = offset;
auto si = __ESIMD_NS::get_surface_index(acc);
__esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), offsets.data(),
vals.data(), si);
constexpr int SmallIntFactor =
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);

  static_assert(NElts % SmallIntFactor == 0,
                "Number of elements is not supported by Transposed store");
  detail::check_lsc_vector_size<NElts / SmallIntFactor>();
  constexpr detail::lsc_vector_size _VS =
      detail::to_lsc_vector_size<NElts / SmallIntFactor>();
if constexpr (SmallIntFactor > 1) {
__ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> Tmp =
vals.template bit_cast_view<uint32_t>();
__esimd_lsc_store_bti<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
lsc_data_size::u32, _VS, _Transposed, N>(
pred.data(), offsets.data(), Tmp.data(), si);
} else {
__esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), offsets.data(),
vals.data(), si);
}
#endif
}
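Finally, a sketch of how the new NElts constraint surfaces for the accessor-based store (reusing `q` and `buf` from the earlier sketch; the kernel name and sizes are assumptions):

```cpp
q.submit([&](sycl::handler &h) {
  auto acc = buf.get_access<sycl::access_mode::write>(h);
  h.single_task<class StoreU16>([=]() SYCL_ESIMD_KERNEL {
    esimd::simd<uint16_t, 8> vals(0, 1);
    exp_esimd::lsc_block_store<uint16_t, 8>(acc, 0, vals); // OK: 8 % 2 == 0
    // lsc_block_store<uint16_t, 3>(acc, 0, ...) would hit the
    // "Number of elements is not supported by Transposed store" assert.
  });
});
```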
