Skip to content

[SYCL][ESIMD] Add support for align flags for simd::copy_from/to operations #4848

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Nov 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,55 @@ namespace ext {
namespace intel {
namespace experimental {
namespace esimd {

/// Flags for use with simd load/store operations.
/// \ingroup sycl_esimd
/// @{
/// element_aligned_tag type. Flag of this type should be used in load and store
/// operations when memory address is aligned by simd object's element type.
struct element_aligned_tag {
template <typename VT, typename ET = typename detail::element_type<VT>::type>
static constexpr unsigned alignment = alignof(ET);
};

/// vector_aligned_tag type. Flag of this type should be used in load and store
/// operations when memory address is guaranteed to be aligned by simd object's
/// vector type.
struct vector_aligned_tag {
template <typename VT> static constexpr unsigned alignment = alignof(VT);
};

/// overaligned_tag type. Flag of this type should be used in load and store
/// operations when memory address is aligned by the user-provided alignment
/// value N.
/// \tparam N is the alignment value. N must be a power of two.
template <unsigned N> struct overaligned_tag {
static_assert(
detail::isPowerOf2(N),
"Alignment value N for overaligned_tag<N> must be a power of two");
template <typename> static constexpr unsigned alignment = N;
};

inline constexpr element_aligned_tag element_aligned = {};

inline constexpr vector_aligned_tag vector_aligned = {};

template <unsigned N> inline constexpr overaligned_tag<N> overaligned = {};
/// @}

/// Checks if type is a simd load/store flag.
template <typename T> struct is_simd_flag_type : std::false_type {};

template <> struct is_simd_flag_type<element_aligned_tag> : std::true_type {};

template <> struct is_simd_flag_type<vector_aligned_tag> : std::true_type {};

template <unsigned N>
struct is_simd_flag_type<overaligned_tag<N>> : std::true_type {};

template <typename T>
static inline constexpr bool is_simd_flag_type_v = is_simd_flag_type<T>::value;

namespace detail {

/// The simd_obj_impl vector class.
Expand Down Expand Up @@ -70,6 +119,9 @@ template <typename Ty, int N, class Derived, class SFINAE> class simd_obj_impl {

private:
Derived &cast_this_to_derived() { return reinterpret_cast<Derived &>(*this); }
const Derived &cast_this_to_derived() const {
return reinterpret_cast<const Derived &>(*this);
}

public:
/// @{
Expand Down Expand Up @@ -137,11 +189,31 @@ template <typename Ty, int N, class Derived, class SFINAE> class simd_obj_impl {
init_from_array(std::move(Arr));
}

/// Load constructor.
template <typename Flags = element_aligned_tag,
typename = std::enable_if_t<is_simd_flag_type_v<Flags>>>
simd_obj_impl(const Ty *ptr, Flags = {}) noexcept {
__esimd_dbg_print(simd_obj_impl(const Ty *ptr, Flags));
copy_from(ptr, Flags{});
}

/// Accessor-based load constructor.
template <typename AccessorT, typename Flags = element_aligned_tag,
typename = std::enable_if_t<
detail::is_sycl_accessor_with<
AccessorT, accessor_mode_cap::can_read,
sycl::access::target::global_buffer>::value &&
is_simd_flag_type_v<Flags>>>
simd_obj_impl(AccessorT acc, uint32_t offset, Flags = {}) noexcept {
__esimd_dbg_print(simd_obj_impl(AccessorT acc, uint32_t offset, Flags));
copy_from(acc, offset, Flags{});
}

/// @}

// Load the object's value from array.
template <int N1> std::enable_if_t<N1 == N> copy_from(const Ty (&Arr)[N1]) {
__esimd_dbg_print(copy_from(const Ty(&Arr)[N1]));
template <int N1> std::enable_if_t<N1 == N> copy_from(const Ty(&&Arr)[N1]) {
__esimd_dbg_print(copy_from(const Ty(&&Arr)[N1]));
vector_type Tmp;
for (auto I = 0; I < N; ++I) {
Tmp[I] = Arr[I];
Expand All @@ -150,8 +222,8 @@ template <typename Ty, int N, class Derived, class SFINAE> class simd_obj_impl {
}

// Store the object's value to array.
template <int N1> std::enable_if_t<N1 == N> copy_to(Ty (&Arr)[N1]) const {
__esimd_dbg_print(copy_to(Ty(&Arr)[N1]));
template <int N1> std::enable_if_t<N1 == N> copy_to(Ty(&&Arr)[N1]) const {
__esimd_dbg_print(copy_to(Ty(&&Arr)[N1]));
for (auto I = 0; I < N; ++I) {
Arr[I] = data()[I];
}
Expand Down Expand Up @@ -500,33 +572,59 @@ template <typename Ty, int N, class Derived, class SFINAE> class simd_obj_impl {
/// elements in this object.
/// @param addr the memory address to copy from. Must be a pointer to the
/// global address space, otherwise behavior is undefined.
ESIMD_INLINE void copy_from(const Ty *addr) SYCL_ESIMD_FUNCTION;
/// @param flags for the copy operation. If the template parameter Flags is
/// is element_aligned_tag, \p addr must be aligned by alignof(T). If Flags is
/// vector_aligned_tag, \p addr must be aligned by simd_obj_impl's vector_type
/// alignment. If Flags is overaligned_tag<N>, \p addr must be aligned by N.
/// Program not meeting alignment requirements results in undefined behavior.
template <typename Flags = element_aligned_tag,
typename = std::enable_if_t<is_simd_flag_type_v<Flags>>>
ESIMD_INLINE void copy_from(const Ty *addr, Flags = {}) SYCL_ESIMD_FUNCTION;

/// Copy a contiguous block of data from memory into this simd_obj_impl
/// object. The amount of memory copied equals the total size of vector
/// elements in this object. Source memory location is represented via a
/// global accessor and offset.
/// @param acc accessor to copy from.
/// @param offset offset to copy from (in bytes).
template <typename AccessorT>
/// @param flags for the copy operation. If the template parameter Flags is
/// is element_aligned_tag, offset must be aligned by alignof(T). If Flags is
/// vector_aligned_tag, offset must be aligned by simd_obj_impl's vector_type
/// alignment. If Flags is overaligned_tag<N>, offset must be aligned by N.
/// Program not meeting alignment requirements results in undefined behavior.
template <typename AccessorT, typename Flags = element_aligned_tag,
typename = std::enable_if_t<is_simd_flag_type_v<Flags>>>
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_read,
sycl::access::target::global_buffer, void>
copy_from(AccessorT acc, uint32_t offset) SYCL_ESIMD_FUNCTION;
copy_from(AccessorT acc, uint32_t offset, Flags = {}) SYCL_ESIMD_FUNCTION;

/// Copy all vector elements of this object into a contiguous block in memory.
/// @param addr the memory address to copy to. Must be a pointer to the
/// global address space, otherwise behavior is undefined.
ESIMD_INLINE void copy_to(Ty *addr) const SYCL_ESIMD_FUNCTION;
/// @param flags for the copy operation. If the template parameter Flags is
/// is element_aligned_tag, \p addr must be aligned by alignof(T). If Flags is
/// vector_aligned_tag, \p addr must be aligned by simd_obj_impl's vector_type
/// alignment. If Flags is overaligned_tag<N>, \p addr must be aligned by N.
/// Program not meeting alignment requirements results in undefined behavior.
template <typename Flags = element_aligned_tag,
typename = std::enable_if_t<is_simd_flag_type_v<Flags>>>
ESIMD_INLINE void copy_to(Ty *addr, Flags = {}) const SYCL_ESIMD_FUNCTION;

/// Copy all vector elements of this object into a contiguous block in memory.
/// Destination memory location is represented via a global accessor and
/// offset.
/// @param acc accessor to copy from.
/// @param offset offset to copy from.
template <typename AccessorT>
/// @param flags for the copy operation. If the template parameter Flags is
/// is element_aligned_tag, offset must be aligned by alignof(T). If Flags is
/// vector_aligned_tag, offset must be aligned by simd_obj_impl's vector_type
/// alignment. If Flags is overaligned_tag<N>, offset must be aligned by N.
/// Program not meeting alignment requirements results in undefined behavior.
template <typename AccessorT, typename Flags = element_aligned_tag,
typename = std::enable_if_t<is_simd_flag_type_v<Flags>>>
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_write,
sycl::access::target::global_buffer, void>
copy_to(AccessorT acc, uint32_t offset) const SYCL_ESIMD_FUNCTION;
copy_to(AccessorT acc, uint32_t offset, Flags = {}) const SYCL_ESIMD_FUNCTION;

/// @} // Memory operations

Expand Down Expand Up @@ -635,86 +733,145 @@ template <typename Ty, int N, class Derived, class SFINAE> class simd_obj_impl {
// ----------- Outlined implementations of simd_obj_impl class APIs.

template <typename T, int N, class T1, class SFINAE>
void simd_obj_impl<T, N, T1, SFINAE>::copy_from(const T *Addr)
SYCL_ESIMD_FUNCTION {
constexpr unsigned Sz = sizeof(T) * N;
static_assert(Sz >= OperandSize::OWORD,
"block size must be at least 1 oword");
static_assert(Sz % OperandSize::OWORD == 0,
"block size must be whole number of owords");
static_assert(isPowerOf2(Sz / OperandSize::OWORD),
"block must be 1, 2, 4 or 8 owords long");
static_assert(Sz <= 8 * OperandSize::OWORD,
"block size must be at most 8 owords");

uintptr_t AddrVal = reinterpret_cast<uintptr_t>(Addr);
*this = __esimd_svm_block_ld_unaligned<T, N>(AddrVal);
template <typename Flags, typename>
void simd_obj_impl<T, N, T1, SFINAE>::copy_from(const T *Addr,
Flags) SYCL_ESIMD_FUNCTION {
constexpr unsigned Size = sizeof(T) * N;
constexpr unsigned Align = Flags::template alignment<T1>;

simd<T, N> Tmp;
if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
detail::isPowerOf2(Size / OperandSize::OWORD)) {
Tmp = block_load<T, N, Flags>(Addr, Flags{});
} else if constexpr (sizeof(T) == 8) {
constexpr unsigned AlignUH =
(N * 4) % Align == 0 ? Align : std::min(Align, 4u);
simd<int32_t, N> LH(reinterpret_cast<const int32_t *>(Addr), Flags{});
simd<int32_t, N> UH(reinterpret_cast<const int32_t *>(Addr) + N,
overaligned<AlignUH>);
Tmp.template bit_cast_view<int32_t>().template select<N, 1>(0) = LH;
Tmp.template bit_cast_view<int32_t>().template select<N, 1>(N) = UH;
} else if constexpr (N == 1) {
Tmp = *Addr;
} else if constexpr (N == 8 || N == 16 || N == 32) {
simd<uint32_t, N> Offsets(0u, sizeof(T));
Tmp = gather<T, N>(Addr, Offsets);
} else {
constexpr int N1 = N < 8 ? 8 : N < 16 ? 16 : 32;
simd_mask_type<N1> Pred(0);
Pred.template select<N, 1>() = 1;
simd<uint32_t, N1> Offsets(0u, sizeof(T));
simd<T, N1> Vals = gather<T, N1>(Addr, Offsets, Pred);
Tmp = Vals.template select<N, 1>();
}
*this = Tmp.data();
}

template <typename T, int N, class T1, class SFINAE>
template <typename AccessorT>
template <typename AccessorT, typename Flags, typename>
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_read,
sycl::access::target::global_buffer, void>
simd_obj_impl<T, N, T1, SFINAE>::copy_from(AccessorT acc, uint32_t offset)
SYCL_ESIMD_FUNCTION {
constexpr unsigned Sz = sizeof(T) * N;
static_assert(Sz >= OperandSize::OWORD,
"block size must be at least 1 oword");
static_assert(Sz % OperandSize::OWORD == 0,
"block size must be whole number of owords");
static_assert(isPowerOf2(Sz / OperandSize::OWORD),
"block must be 1, 2, 4 or 8 owords long");
static_assert(Sz <= 8 * OperandSize::OWORD,
"block size must be at most 8 owords");
#if defined(__SYCL_DEVICE_ONLY__)
auto surf_ind =
__esimd_get_surface_index(AccessorPrivateProxy::getNativeImageObj(acc));
*this = __esimd_oword_ld_unaligned<T, N>(surf_ind, offset);
#else
*this = __esimd_oword_ld_unaligned<T, N>(acc, offset);
#endif // __SYCL_DEVICE_ONLY__
simd_obj_impl<T, N, T1, SFINAE>::copy_from(AccessorT acc, uint32_t offset,
Flags) SYCL_ESIMD_FUNCTION {
constexpr unsigned Size = sizeof(T) * N;
constexpr unsigned Align = Flags::template alignment<T1>;

simd<T, N> Tmp;
if constexpr (Align >= OperandSize::DWORD && Size % OperandSize::OWORD == 0 &&
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should it be Align >= OperandSize::DWORD or Align >= OperandSize::OWORD here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Block load requires offset to be at least dword aligned, so it should be OperandSize::DWORD. block_load() then will check what alignment is and will use either aligned load if alignment is >= OperandSize::OWORD or unaligned otherwise.

detail::isPowerOf2(Size / OperandSize::OWORD)) {
Tmp = block_load<T, N, AccessorT, Flags>(acc, offset, Flags{});
} else if constexpr (sizeof(T) == 8) {
constexpr unsigned AlignUH =
(N * 4) % Align == 0 ? Align : std::min(Align, 4u);
simd<int32_t, N> LH(acc, offset, Flags{});
simd<int32_t, N> UH(acc, offset + N * 4, overaligned<AlignUH>);
Tmp.template bit_cast_view<int32_t>().template select<N, 1>(0) = LH;
Tmp.template bit_cast_view<int32_t>().template select<N, 1>(N) = UH;
} else if constexpr (N == 1 || N == 8 || N == 16 || N == 32) {
simd<uint32_t, N> Offsets(0u, sizeof(T));
Tmp = gather<T, N, AccessorT>(acc, Offsets, offset);
} else {
constexpr int N1 = N < 8 ? 8 : N < 16 ? 16 : 32;
simd_mask_type<N1> Pred(0);
Pred.template select<N, 1>() = 1;
simd<uint32_t, N1> Offsets(0u, sizeof(T));
simd<T, N1> Vals = gather<T, N1>(acc, Offsets, offset, Pred);
Tmp = Vals.template select<N, 1>();
}
*this = Tmp.data();
}

template <typename T, int N, class T1, class SFINAE>
void simd_obj_impl<T, N, T1, SFINAE>::copy_to(T *addr) const
SYCL_ESIMD_FUNCTION {
constexpr unsigned Sz = sizeof(T) * N;
static_assert(Sz >= OperandSize::OWORD,
"block size must be at least 1 oword");
static_assert(Sz % OperandSize::OWORD == 0,
"block size must be whole number of owords");
static_assert(isPowerOf2(Sz / OperandSize::OWORD),
"block must be 1, 2, 4 or 8 owords long");
static_assert(Sz <= 8 * OperandSize::OWORD,
"block size must be at most 8 owords");

uintptr_t AddrVal = reinterpret_cast<uintptr_t>(addr);
__esimd_svm_block_st<T, N>(AddrVal, data());
template <typename Flags, typename>
void simd_obj_impl<T, N, T1, SFINAE>::copy_to(T *addr,
Flags) const SYCL_ESIMD_FUNCTION {
constexpr unsigned Size = sizeof(T) * N;
constexpr unsigned Align = Flags::template alignment<T1>;

if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
detail::isPowerOf2(Size / OperandSize::OWORD)) {
block_store<T, N>(addr, cast_this_to_derived());
} else if constexpr (sizeof(T) == 8) {
constexpr unsigned AlignUH =
(N * 4) % Align == 0 ? Align : std::min(Align, 4u);
simd<T, N> Tmp = data();
simd<int32_t, N> LH =
Tmp.template bit_cast_view<int32_t>().template select<N, 1>(0);
simd<int32_t, N> UH =
Tmp.template bit_cast_view<int32_t>().template select<N, 1>(N);
LH.copy_to(reinterpret_cast<int32_t *>(addr), Flags{});
UH.copy_to(reinterpret_cast<int32_t *>(addr) + N, overaligned<AlignUH>);
} else if constexpr (N == 1) {
*addr = data()[0];
} else if constexpr (N == 8 || N == 16 || N == 32) {
simd<uint32_t, N> offsets(0u, sizeof(T));
scatter<T, N>(addr, offsets, cast_this_to_derived().data());
} else {
constexpr int N1 = N < 8 ? 8 : N < 16 ? 16 : 32;
simd_mask_type<N1> pred(0);
pred.template select<N, 1>() = 1;
simd<T, N1> vals(0);
vals.template select<N, 1>() = cast_this_to_derived().data();
simd<uint32_t, N1> offsets(0u, sizeof(T));
scatter<T, N1>(addr, offsets, vals, pred);
}
}

template <typename T, int N, class T1, class SFINAE>
template <typename AccessorT>
template <typename AccessorT, typename Flags, typename>
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_write,
sycl::access::target::global_buffer, void>
simd_obj_impl<T, N, T1, SFINAE>::copy_to(AccessorT acc, uint32_t offset) const
SYCL_ESIMD_FUNCTION {
constexpr unsigned Sz = sizeof(T) * N;
static_assert(Sz >= OperandSize::OWORD,
"block size must be at least 1 oword");
static_assert(Sz % OperandSize::OWORD == 0,
"block size must be whole number of owords");
static_assert(isPowerOf2(Sz / OperandSize::OWORD),
"block must be 1, 2, 4 or 8 owords long");
static_assert(Sz <= 8 * OperandSize::OWORD,
"block size must be at most 8 owords");

#if defined(__SYCL_DEVICE_ONLY__)
auto surf_ind =
__esimd_get_surface_index(AccessorPrivateProxy::getNativeImageObj(acc));
__esimd_oword_st<T, N>(surf_ind, offset >> 4, data());
#else
__esimd_oword_st<T, N>(acc, offset >> 4, data());
#endif // __SYCL_DEVICE_ONLY__
simd_obj_impl<T, N, T1, SFINAE>::copy_to(AccessorT acc, uint32_t offset,
Flags) const SYCL_ESIMD_FUNCTION {
constexpr unsigned Size = sizeof(T) * N;
constexpr unsigned Align = Flags::template alignment<T1>;

if constexpr (Align >= OperandSize::OWORD && Size % OperandSize::OWORD == 0 &&
detail::isPowerOf2(Size / OperandSize::OWORD)) {
block_store<T, N, AccessorT>(acc, offset, cast_this_to_derived());
} else if constexpr (sizeof(T) == 8) {
constexpr unsigned AlignUH =
(N * 4) % Align == 0 ? Align : std::min(Align, 4u);
simd<T, N> Tmp = data();
simd<int32_t, N> LH =
Tmp.template bit_cast_view<int32_t>().template select<N, 1>(0);
simd<int32_t, N> UH =
Tmp.template bit_cast_view<int32_t>().template select<N, 1>(N);
LH.copy_to(acc, offset, Flags{});
UH.copy_to(acc, offset + N * 4, overaligned<AlignUH>);
} else if constexpr (N == 1 || N == 8 || N == 16 || N == 32) {
simd<uint32_t, N> offsets(0u, sizeof(T));
scatter<T, N, AccessorT>(acc, offsets, cast_this_to_derived().data(),
offset);
} else {
constexpr int N1 = N < 8 ? 8 : N < 16 ? 16 : 32;
simd_mask_type<N1> pred(0);
pred.template select<N, 1>() = 1;
simd<T, N1> vals(0);
vals.template select<N, 1>() = cast_this_to_derived().data();
simd<uint32_t, N1> offsets(0u, sizeof(T));
scatter<T, N1, AccessorT>(acc, offsets, vals, offset, pred);
}
}
} // namespace detail

Expand Down
Loading