-
Notifications
You must be signed in to change notification settings - Fork 789
[SYCL][ESIMD] Add support for align flags for simd::copy_from/to operations #4848
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2ecffef
c97a0bf
5de4050
03a2de3
0e507c4
04a38ec
8cdbcd2
48135ea
a2e9870
8d767f8
50eff7e
e5e9ee0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,55 @@ namespace ext { | |
namespace intel { | ||
namespace experimental { | ||
namespace esimd { | ||
|
||
/// Flags for use with simd load/store operations. | ||
/// \ingroup sycl_esimd | ||
/// @{ | ||
/// element_aligned_tag type. Flag of this type should be used in load and store | ||
/// operations when memory address is aligned by simd object's element type. | ||
struct element_aligned_tag { | ||
template <typename VT, typename ET = typename detail::element_type<VT>::type> | ||
static constexpr unsigned alignment = alignof(ET); | ||
}; | ||
|
||
/// vector_aligned_tag type. Flag of this type should be used in load and store | ||
/// operations when memory address is guaranteed to be aligned by simd object's | ||
/// vector type. | ||
struct vector_aligned_tag { | ||
template <typename VT> static constexpr unsigned alignment = alignof(VT); | ||
sndmitriev marked this conversation as resolved.
Show resolved
Hide resolved
|
||
}; | ||
|
||
/// overaligned_tag type. Flag of this type should be used in load and store | ||
/// operations when memory address is aligned by the user-provided alignment | ||
/// value N. | ||
/// \tparam N is the alignment value. N must be a power of two. | ||
template <unsigned N> struct overaligned_tag { | ||
static_assert( | ||
detail::isPowerOf2(N), | ||
"Alignment value N for overaligned_tag<N> must be a power of two"); | ||
template <typename> static constexpr unsigned alignment = N; | ||
}; | ||
|
||
inline constexpr element_aligned_tag element_aligned = {}; | ||
|
||
inline constexpr vector_aligned_tag vector_aligned = {}; | ||
|
||
template <unsigned N> inline constexpr overaligned_tag<N> overaligned = {}; | ||
/// @} | ||
|
||
/// Checks if type is a simd load/store flag. | ||
template <typename T> struct is_simd_flag_type : std::false_type {}; | ||
|
||
template <> struct is_simd_flag_type<element_aligned_tag> : std::true_type {}; | ||
|
||
template <> struct is_simd_flag_type<vector_aligned_tag> : std::true_type {}; | ||
|
||
template <unsigned N> | ||
struct is_simd_flag_type<overaligned_tag<N>> : std::true_type {}; | ||
|
||
template <typename T> | ||
static inline constexpr bool is_simd_flag_type_v = is_simd_flag_type<T>::value; | ||
|
||
namespace detail { | ||
|
||
/// The simd_obj_impl vector class. | ||
|
@@ -70,6 +119,9 @@ template <typename Ty, int N, class Derived, class SFINAE> class simd_obj_impl { | |
|
||
private: | ||
Derived &cast_this_to_derived() { return reinterpret_cast<Derived &>(*this); } | ||
const Derived &cast_this_to_derived() const { | ||
return reinterpret_cast<const Derived &>(*this); | ||
} | ||
|
||
public: | ||
/// @{ | ||
|
@@ -137,11 +189,31 @@ template <typename Ty, int N, class Derived, class SFINAE> class simd_obj_impl { | |
init_from_array(std::move(Arr)); | ||
} | ||
|
||
/// Load constructor. | ||
template <typename Flags = element_aligned_tag, | ||
typename = std::enable_if_t<is_simd_flag_type_v<Flags>>> | ||
simd_obj_impl(const Ty *ptr, Flags = {}) noexcept { | ||
__esimd_dbg_print(simd_obj_impl(const Ty *ptr, Flags)); | ||
copy_from(ptr, Flags{}); | ||
} | ||
|
||
/// Accessor-based load constructor. | ||
template <typename AccessorT, typename Flags = element_aligned_tag, | ||
typename = std::enable_if_t< | ||
detail::is_sycl_accessor_with< | ||
AccessorT, accessor_mode_cap::can_read, | ||
sycl::access::target::global_buffer>::value && | ||
is_simd_flag_type_v<Flags>>> | ||
simd_obj_impl(AccessorT acc, uint32_t offset, Flags = {}) noexcept { | ||
__esimd_dbg_print(simd_obj_impl(AccessorT acc, uint32_t offset, Flags)); | ||
copy_from(acc, offset, Flags{}); | ||
} | ||
|
||
/// @} | ||
|
||
// Load the object's value from array. | ||
template <int N1> std::enable_if_t<N1 == N> copy_from(const Ty (&Arr)[N1]) { | ||
__esimd_dbg_print(copy_from(const Ty(&Arr)[N1])); | ||
template <int N1> std::enable_if_t<N1 == N> copy_from(const Ty(&&Arr)[N1]) { | ||
__esimd_dbg_print(copy_from(const Ty(&&Arr)[N1])); | ||
vector_type Tmp; | ||
for (auto I = 0; I < N; ++I) { | ||
Tmp[I] = Arr[I]; | ||
|
@@ -150,8 +222,8 @@ template <typename Ty, int N, class Derived, class SFINAE> class simd_obj_impl { | |
} | ||
|
||
// Store the object's value to array. | ||
template <int N1> std::enable_if_t<N1 == N> copy_to(Ty (&Arr)[N1]) const { | ||
__esimd_dbg_print(copy_to(Ty(&Arr)[N1])); | ||
template <int N1> std::enable_if_t<N1 == N> copy_to(Ty(&&Arr)[N1]) const { | ||
__esimd_dbg_print(copy_to(Ty(&&Arr)[N1])); | ||
for (auto I = 0; I < N; ++I) { | ||
Arr[I] = data()[I]; | ||
} | ||
|
@@ -500,33 +572,59 @@ template <typename Ty, int N, class Derived, class SFINAE> class simd_obj_impl { | |
/// elements in this object. | ||
/// @param addr the memory address to copy from. Must be a pointer to the | ||
/// global address space, otherwise behavior is undefined. | ||
ESIMD_INLINE void copy_from(const Ty *addr) SYCL_ESIMD_FUNCTION; | ||
/// @param flags for the copy operation. If the template parameter Flags is | ||
/// is element_aligned_tag, \p addr must be aligned by alignof(T). If Flags is | ||
/// vector_aligned_tag, \p addr must be aligned by simd_obj_impl's vector_type | ||
/// alignment. If Flags is overaligned_tag<N>, \p addr must be aligned by N. | ||
/// Program not meeting alignment requirements results in undefined behavior. | ||
template <typename Flags = element_aligned_tag, | ||
sndmitriev marked this conversation as resolved.
Show resolved
Hide resolved
|
||
typename = std::enable_if_t<is_simd_flag_type_v<Flags>>> | ||
ESIMD_INLINE void copy_from(const Ty *addr, Flags = {}) SYCL_ESIMD_FUNCTION; | ||
|
||
/// Copy a contiguous block of data from memory into this simd_obj_impl | ||
/// object. The amount of memory copied equals the total size of vector | ||
/// elements in this object. Source memory location is represented via a | ||
/// global accessor and offset. | ||
/// @param acc accessor to copy from. | ||
/// @param offset offset to copy from (in bytes). | ||
template <typename AccessorT> | ||
/// @param flags for the copy operation. If the template parameter Flags is | ||
/// is element_aligned_tag, offset must be aligned by alignof(T). If Flags is | ||
/// vector_aligned_tag, offset must be aligned by simd_obj_impl's vector_type | ||
/// alignment. If Flags is overaligned_tag<N>, offset must be aligned by N. | ||
/// Program not meeting alignment requirements results in undefined behavior. | ||
template <typename AccessorT, typename Flags = element_aligned_tag, | ||
typename = std::enable_if_t<is_simd_flag_type_v<Flags>>> | ||
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_read, | ||
sycl::access::target::global_buffer, void> | ||
copy_from(AccessorT acc, uint32_t offset) SYCL_ESIMD_FUNCTION; | ||
copy_from(AccessorT acc, uint32_t offset, Flags = {}) SYCL_ESIMD_FUNCTION; | ||
|
||
/// Copy all vector elements of this object into a contiguous block in memory. | ||
/// @param addr the memory address to copy to. Must be a pointer to the | ||
/// global address space, otherwise behavior is undefined. | ||
ESIMD_INLINE void copy_to(Ty *addr) const SYCL_ESIMD_FUNCTION; | ||
/// @param flags for the copy operation. If the template parameter Flags is | ||
/// is element_aligned_tag, \p addr must be aligned by alignof(T). If Flags is | ||
/// vector_aligned_tag, \p addr must be aligned by simd_obj_impl's vector_type | ||
/// alignment. If Flags is overaligned_tag<N>, \p addr must be aligned by N. | ||
/// Program not meeting alignment requirements results in undefined behavior. | ||
template <typename Flags = element_aligned_tag, | ||
typename = std::enable_if_t<is_simd_flag_type_v<Flags>>> | ||
sndmitriev marked this conversation as resolved.
Show resolved
Hide resolved
|
||
ESIMD_INLINE void copy_to(Ty *addr, Flags = {}) const SYCL_ESIMD_FUNCTION; | ||
|
||
/// Copy all vector elements of this object into a contiguous block in memory. | ||
/// Destination memory location is represented via a global accessor and | ||
/// offset. | ||
/// @param acc accessor to copy from. | ||
/// @param offset offset to copy from. | ||
template <typename AccessorT> | ||
/// @param flags for the copy operation. If the template parameter Flags is | ||
/// is element_aligned_tag, offset must be aligned by alignof(T). If Flags is | ||
/// vector_aligned_tag, offset must be aligned by simd_obj_impl's vector_type | ||
/// alignment. If Flags is overaligned_tag<N>, offset must be aligned by N. | ||
kbobrovs marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// Program not meeting alignment requirements results in undefined behavior. | ||
template <typename AccessorT, typename Flags = element_aligned_tag, | ||
typename = std::enable_if_t<is_simd_flag_type_v<Flags>>> | ||
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_write, | ||
sycl::access::target::global_buffer, void> | ||
copy_to(AccessorT acc, uint32_t offset) const SYCL_ESIMD_FUNCTION; | ||
copy_to(AccessorT acc, uint32_t offset, Flags = {}) const SYCL_ESIMD_FUNCTION; | ||
|
||
/// @} // Memory operations | ||
|
||
|
@@ -635,86 +733,145 @@ template <typename Ty, int N, class Derived, class SFINAE> class simd_obj_impl { | |
// ----------- Outlined implementations of simd_obj_impl class APIs. | ||
|
||
template <typename T, int N, class T1, class SFINAE> | ||
void simd_obj_impl<T, N, T1, SFINAE>::copy_from(const T *Addr) | ||
SYCL_ESIMD_FUNCTION { | ||
constexpr unsigned Sz = sizeof(T) * N; | ||
static_assert(Sz >= OperandSize::OWORD, | ||
"block size must be at least 1 oword"); | ||
static_assert(Sz % OperandSize::OWORD == 0, | ||
"block size must be whole number of owords"); | ||
static_assert(isPowerOf2(Sz / OperandSize::OWORD), | ||
"block must be 1, 2, 4 or 8 owords long"); | ||
static_assert(Sz <= 8 * OperandSize::OWORD, | ||
"block size must be at most 8 owords"); | ||
|
||
uintptr_t AddrVal = reinterpret_cast<uintptr_t>(Addr); | ||
*this = __esimd_svm_block_ld_unaligned<T, N>(AddrVal); | ||
template <typename Flags, typename> | ||
void simd_obj_impl<T, N, T1, SFINAE>::copy_from(const T *Addr, | ||
Flags) SYCL_ESIMD_FUNCTION { | ||
constexpr unsigned Size = sizeof(T) * N; | ||
constexpr unsigned Align = Flags::template alignment<T1>; | ||
|
||
simd<T, N> Tmp; | ||
if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 && | ||
detail::isPowerOf2(Size / OperandSize::OWORD)) { | ||
Tmp = block_load<T, N, Flags>(Addr, Flags{}); | ||
} else if constexpr (sizeof(T) == 8) { | ||
constexpr unsigned AlignUH = | ||
(N * 4) % Align == 0 ? Align : std::min(Align, 4u); | ||
simd<int32_t, N> LH(reinterpret_cast<const int32_t *>(Addr), Flags{}); | ||
simd<int32_t, N> UH(reinterpret_cast<const int32_t *>(Addr) + N, | ||
overaligned<AlignUH>); | ||
Tmp.template bit_cast_view<int32_t>().template select<N, 1>(0) = LH; | ||
Tmp.template bit_cast_view<int32_t>().template select<N, 1>(N) = UH; | ||
} else if constexpr (N == 1) { | ||
Tmp = *Addr; | ||
} else if constexpr (N == 8 || N == 16 || N == 32) { | ||
simd<uint32_t, N> Offsets(0u, sizeof(T)); | ||
Tmp = gather<T, N>(Addr, Offsets); | ||
} else { | ||
constexpr int N1 = N < 8 ? 8 : N < 16 ? 16 : 32; | ||
simd_mask_type<N1> Pred(0); | ||
Pred.template select<N, 1>() = 1; | ||
simd<uint32_t, N1> Offsets(0u, sizeof(T)); | ||
simd<T, N1> Vals = gather<T, N1>(Addr, Offsets, Pred); | ||
Tmp = Vals.template select<N, 1>(); | ||
} | ||
*this = Tmp.data(); | ||
} | ||
|
||
template <typename T, int N, class T1, class SFINAE> | ||
template <typename AccessorT> | ||
template <typename AccessorT, typename Flags, typename> | ||
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_read, | ||
sycl::access::target::global_buffer, void> | ||
simd_obj_impl<T, N, T1, SFINAE>::copy_from(AccessorT acc, uint32_t offset) | ||
SYCL_ESIMD_FUNCTION { | ||
constexpr unsigned Sz = sizeof(T) * N; | ||
static_assert(Sz >= OperandSize::OWORD, | ||
"block size must be at least 1 oword"); | ||
static_assert(Sz % OperandSize::OWORD == 0, | ||
"block size must be whole number of owords"); | ||
static_assert(isPowerOf2(Sz / OperandSize::OWORD), | ||
"block must be 1, 2, 4 or 8 owords long"); | ||
static_assert(Sz <= 8 * OperandSize::OWORD, | ||
"block size must be at most 8 owords"); | ||
#if defined(__SYCL_DEVICE_ONLY__) | ||
auto surf_ind = | ||
__esimd_get_surface_index(AccessorPrivateProxy::getNativeImageObj(acc)); | ||
*this = __esimd_oword_ld_unaligned<T, N>(surf_ind, offset); | ||
#else | ||
*this = __esimd_oword_ld_unaligned<T, N>(acc, offset); | ||
#endif // __SYCL_DEVICE_ONLY__ | ||
simd_obj_impl<T, N, T1, SFINAE>::copy_from(AccessorT acc, uint32_t offset, | ||
Flags) SYCL_ESIMD_FUNCTION { | ||
constexpr unsigned Size = sizeof(T) * N; | ||
constexpr unsigned Align = Flags::template alignment<T1>; | ||
|
||
simd<T, N> Tmp; | ||
if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 && | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should it be Align >= OperandSize::DWORD or Align >= OperandSize::OWORD here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Block load requires offset to be at least dword aligned, so it should be OperandSize::DWORD. block_load() then will check what alignment is and will use either aligned load if alignment is >= OperandSize::OWORD or unaligned otherwise. |
||
detail::isPowerOf2(Size / OperandSize::OWORD)) { | ||
Tmp = block_load<T, N, AccessorT, Flags>(acc, offset, Flags{}); | ||
} else if constexpr (sizeof(T) == 8) { | ||
constexpr unsigned AlignUH = | ||
(N * 4) % Align == 0 ? Align : std::min(Align, 4u); | ||
simd<int32_t, N> LH(acc, offset, Flags{}); | ||
simd<int32_t, N> UH(acc, offset + N * 4, overaligned<AlignUH>); | ||
Tmp.template bit_cast_view<int32_t>().template select<N, 1>(0) = LH; | ||
Tmp.template bit_cast_view<int32_t>().template select<N, 1>(N) = UH; | ||
} else if constexpr (N == 1 || N == 8 || N == 16 || N == 32) { | ||
simd<uint32_t, N> Offsets(0u, sizeof(T)); | ||
Tmp = gather<T, N, AccessorT>(acc, Offsets, offset); | ||
} else { | ||
constexpr int N1 = N < 8 ? 8 : N < 16 ? 16 : 32; | ||
simd_mask_type<N1> Pred(0); | ||
Pred.template select<N, 1>() = 1; | ||
simd<uint32_t, N1> Offsets(0u, sizeof(T)); | ||
simd<T, N1> Vals = gather<T, N1>(acc, Offsets, offset, Pred); | ||
Tmp = Vals.template select<N, 1>(); | ||
} | ||
*this = Tmp.data(); | ||
} | ||
|
||
template <typename T, int N, class T1, class SFINAE> | ||
void simd_obj_impl<T, N, T1, SFINAE>::copy_to(T *addr) const | ||
SYCL_ESIMD_FUNCTION { | ||
constexpr unsigned Sz = sizeof(T) * N; | ||
static_assert(Sz >= OperandSize::OWORD, | ||
"block size must be at least 1 oword"); | ||
static_assert(Sz % OperandSize::OWORD == 0, | ||
"block size must be whole number of owords"); | ||
static_assert(isPowerOf2(Sz / OperandSize::OWORD), | ||
"block must be 1, 2, 4 or 8 owords long"); | ||
static_assert(Sz <= 8 * OperandSize::OWORD, | ||
"block size must be at most 8 owords"); | ||
|
||
uintptr_t AddrVal = reinterpret_cast<uintptr_t>(addr); | ||
__esimd_svm_block_st<T, N>(AddrVal, data()); | ||
template <typename Flags, typename> | ||
void simd_obj_impl<T, N, T1, SFINAE>::copy_to(T *addr, | ||
Flags) const SYCL_ESIMD_FUNCTION { | ||
constexpr unsigned Size = sizeof(T) * N; | ||
constexpr unsigned Align = Flags::template alignment<T1>; | ||
|
||
if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 && | ||
detail::isPowerOf2(Size / OperandSize::OWORD)) { | ||
block_store<T, N>(addr, cast_this_to_derived()); | ||
} else if constexpr (sizeof(T) == 8) { | ||
constexpr unsigned AlignUH = | ||
(N * 4) % Align == 0 ? Align : std::min(Align, 4u); | ||
simd<T, N> Tmp = data(); | ||
simd<int32_t, N> LH = | ||
Tmp.template bit_cast_view<int32_t>().template select<N, 1>(0); | ||
simd<int32_t, N> UH = | ||
Tmp.template bit_cast_view<int32_t>().template select<N, 1>(N); | ||
LH.copy_to(reinterpret_cast<int32_t *>(addr), Flags{}); | ||
UH.copy_to(reinterpret_cast<int32_t *>(addr) + N, overaligned<AlignUH>); | ||
} else if constexpr (N == 1) { | ||
*addr = data()[0]; | ||
} else if constexpr (N == 8 || N == 16 || N == 32) { | ||
simd<uint32_t, N> offsets(0u, sizeof(T)); | ||
scatter<T, N>(addr, offsets, cast_this_to_derived().data()); | ||
} else { | ||
constexpr int N1 = N < 8 ? 8 : N < 16 ? 16 : 32; | ||
simd_mask_type<N1> pred(0); | ||
pred.template select<N, 1>() = 1; | ||
simd<T, N1> vals(0); | ||
vals.template select<N, 1>() = cast_this_to_derived().data(); | ||
simd<uint32_t, N1> offsets(0u, sizeof(T)); | ||
scatter<T, N1>(addr, offsets, vals, pred); | ||
} | ||
} | ||
|
||
template <typename T, int N, class T1, class SFINAE> | ||
template <typename AccessorT> | ||
template <typename AccessorT, typename Flags, typename> | ||
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_write, | ||
sycl::access::target::global_buffer, void> | ||
simd_obj_impl<T, N, T1, SFINAE>::copy_to(AccessorT acc, uint32_t offset) const | ||
SYCL_ESIMD_FUNCTION { | ||
constexpr unsigned Sz = sizeof(T) * N; | ||
static_assert(Sz >= OperandSize::OWORD, | ||
"block size must be at least 1 oword"); | ||
static_assert(Sz % OperandSize::OWORD == 0, | ||
"block size must be whole number of owords"); | ||
static_assert(isPowerOf2(Sz / OperandSize::OWORD), | ||
"block must be 1, 2, 4 or 8 owords long"); | ||
static_assert(Sz <= 8 * OperandSize::OWORD, | ||
"block size must be at most 8 owords"); | ||
|
||
#if defined(__SYCL_DEVICE_ONLY__) | ||
auto surf_ind = | ||
__esimd_get_surface_index(AccessorPrivateProxy::getNativeImageObj(acc)); | ||
__esimd_oword_st<T, N>(surf_ind, offset >> 4, data()); | ||
#else | ||
__esimd_oword_st<T, N>(acc, offset >> 4, data()); | ||
#endif // __SYCL_DEVICE_ONLY__ | ||
simd_obj_impl<T, N, T1, SFINAE>::copy_to(AccessorT acc, uint32_t offset, | ||
Flags) const SYCL_ESIMD_FUNCTION { | ||
constexpr unsigned Size = sizeof(T) * N; | ||
constexpr unsigned Align = Flags::template alignment<T1>; | ||
|
||
if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 && | ||
detail::isPowerOf2(Size / OperandSize::OWORD)) { | ||
block_store<T, N, AccessorT>(acc, offset, cast_this_to_derived()); | ||
} else if constexpr (sizeof(T) == 8) { | ||
constexpr unsigned AlignUH = | ||
(N * 4) % Align == 0 ? Align : std::min(Align, 4u); | ||
simd<T, N> Tmp = data(); | ||
simd<int32_t, N> LH = | ||
Tmp.template bit_cast_view<int32_t>().template select<N, 1>(0); | ||
simd<int32_t, N> UH = | ||
Tmp.template bit_cast_view<int32_t>().template select<N, 1>(N); | ||
LH.copy_to(acc, offset, Flags{}); | ||
UH.copy_to(acc, offset + N * 4, overaligned<AlignUH>); | ||
} else if constexpr (N == 1 || N == 8 || N == 16 || N == 32) { | ||
simd<uint32_t, N> offsets(0u, sizeof(T)); | ||
scatter<T, N, AccessorT>(acc, offsets, cast_this_to_derived().data(), | ||
offset); | ||
} else { | ||
constexpr int N1 = N < 8 ? 8 : N < 16 ? 16 : 32; | ||
simd_mask_type<N1> pred(0); | ||
pred.template select<N, 1>() = 1; | ||
simd<T, N1> vals(0); | ||
vals.template select<N, 1>() = cast_this_to_derived().data(); | ||
simd<uint32_t, N1> offsets(0u, sizeof(T)); | ||
scatter<T, N1, AccessorT>(acc, offsets, vals, offset, pred); | ||
} | ||
} | ||
} // namespace detail | ||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.