Skip to content

[SYCL][ESIMD][EMU] Handle intrinsic operations promoted to 4-byte element type #5727

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
4bd07fe
[SYCL][ESIMD][EMU] Handling intrinsic operations promoted to 4-byte e…
dongkyunahn-intel Mar 3, 2022
858c5c6
__SYCL_DEVICE_ONLY__ is removed from memory.hpp
dongkyunahn-intel Mar 3, 2022
35ee39f
ElemsPerAddrDecoding(), instead of using TySizeLog2 directly
dongkyunahn-intel Mar 4, 2022
1036f5d
Update sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_i…
dongkyunahn-intel Mar 4, 2022
c158b84
Update sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_i…
dongkyunahn-intel Mar 4, 2022
d1a8919
Missing changes
dongkyunahn-intel Mar 4, 2022
5391f0b
Debugging floating type errors
dongkyunahn-intel Mar 8, 2022
d72078d
Preparing pulling origin/sycl branch
dongkyunahn-intel Mar 9, 2022
33d8bba
Merge branch 'sycl' of https://github.com/intel/llvm into esimd_emu_p…
dongkyunahn-intel Mar 9, 2022
f37583f
Re-applying changes to relocated header files
dongkyunahn-intel Mar 9, 2022
421bd78
Bug fix in __esimd_pack/unpack_mask()
dongkyunahn-intel Mar 11, 2022
91aa18b
Merge branch 'sycl' of https://github.com/intel/llvm into esimd_emu_p…
dongkyunahn-intel Mar 16, 2022
a339d71
Removing 'conditional_t' in handling 4-byte promoted vector argument
dongkyunahn-intel Mar 16, 2022
e374a07
Merge branch 'sycl' of https://github.com/intel/llvm into esimd_emu_p…
dongkyunahn-intel Mar 17, 2022
5913d06
Recovering FP-working version for test using CI
dongkyunahn-intel Mar 18, 2022
e4b0c21
Revert "Recovering FP-working version for test using CI"
dongkyunahn-intel Mar 18, 2022
3ef244f
Working version - no regression
dongkyunahn-intel Mar 18, 2022
09e17a4
convert_vector<>() for Type-adjustved vectors
dongkyunahn-intel Mar 19, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,7 @@ __esimd_pack_mask(__ESIMD_DNS::vector_type_t<uint16_t, N> src0) {
// wrapper code (which does the checks already)
uint32_t retv = 0;
for (int i = 0; i < N; i++) {
if (src0[i] & 0x1) {
if (src0[i] != 0) {
retv |= 0x1 << i;
}
}
Expand All @@ -709,12 +709,10 @@ __esimd_pack_mask(__ESIMD_DNS::vector_type_t<uint16_t, N> src0) {
template <int N>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<uint16_t, N>
__esimd_unpack_mask(uint32_t src0) {
__ESIMD_DNS::vector_type_t<uint16_t, N> retv;
__ESIMD_DNS::vector_type_t<uint16_t, N> retv = 0;
for (int i = 0; i < N; i++) {
if ((src0 >> i) & 0x1) {
retv[i] = 1;
} else {
retv[i] = 0;
}
}
return retv;
Expand Down
50 changes: 40 additions & 10 deletions sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,15 @@ namespace __ESIMD_DNS {
// Provides access to sycl accessor class' private members.
class AccessorPrivateProxy {
public:
#ifdef __SYCL_DEVICE_ONLY__
template <typename AccessorTy>
static auto getNativeImageObj(const AccessorTy &Acc) {
#ifdef __SYCL_DEVICE_ONLY__
return Acc.getNativeImageObj();
}
#else // __SYCL_DEVICE_ONLY__
return Acc;
#endif // __SYCL_DEVICE_ONLY__
}
#ifndef __SYCL_DEVICE_ONLY__
static void *getPtr(const sycl::detail::AccessorBaseHost &Acc) {
return Acc.getPtr();
}
Expand Down Expand Up @@ -421,18 +424,32 @@ __esimd_scatter_scaled(__ESIMD_DNS::simd_mask_storage_t<N> pred,
static_assert(TySizeLog2 <= 2);
static_assert(std::is_integral<Ty>::value || TySizeLog2 == 2);

// determine the original element's type size (as __esimd_scatter_scaled
// requires vals to be a vector of 4-byte integers)
constexpr size_t OrigSize = __ESIMD_DNS::ElemsPerAddrDecoding(TySizeLog2);
using RestoredTy = __ESIMD_DNS::uint_type_t<OrigSize>;

sycl::detail::ESIMDDeviceInterface *I =
sycl::detail::getESIMDDeviceInterface();

__ESIMD_DNS::vector_type_t<RestoredTy, N> TypeAdjustedVals;
if constexpr (OrigSize == 4) {
TypeAdjustedVals = __ESIMD_DNS::bitcast<RestoredTy, Ty, N>(vals);
} else {
static_assert(OrigSize == 1 || OrigSize == 2);
TypeAdjustedVals = __ESIMD_DNS::convert_vector<RestoredTy, Ty, N>(vals);
}

if (surf_ind == __ESIMD_NS::detail::SLM_BTI) {
// Scattered-store for Shared Local Memory
// __ESIMD_NS::detail::SLM_BTI is special binding table index for SLM
assert(global_offset == 0);
char *SlmBase = I->__cm_emu_get_slm_ptr();
for (int i = 0; i < N; ++i) {
if (pred[i]) {
Ty *addr = reinterpret_cast<Ty *>(elem_offsets[i] + SlmBase);
*addr = vals[i];
RestoredTy *addr =
reinterpret_cast<RestoredTy *>(elem_offsets[i] + SlmBase);
*addr = TypeAdjustedVals[i];
}
}
} else {
Expand All @@ -449,8 +466,9 @@ __esimd_scatter_scaled(__ESIMD_DNS::simd_mask_storage_t<N> pred,

for (int idx = 0; idx < N; idx++) {
if (pred[idx]) {
Ty *addr = reinterpret_cast<Ty *>(elem_offsets[idx] + writeBase);
*addr = vals[idx];
RestoredTy *addr =
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For later: please check with CM library devs that acquiring a mutex is really needed when accessing through a cm buffer. That seems really strange requirement. It is also strange that no mutex is needed when accessing SLM.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will check with CM Lib devs.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Issue ticket : #5863

reinterpret_cast<RestoredTy *>(elem_offsets[idx] + writeBase);
*addr = TypeAdjustedVals[idx];
}
}

Expand Down Expand Up @@ -629,7 +647,12 @@ __esimd_gather_masked_scaled2(SurfIndAliasTy surf_ind, uint32_t global_offset,
{
static_assert(Scale == 0);

__ESIMD_DNS::vector_type_t<Ty, N> retv = 0;
// determine the original element's type size (as __esimd_scatter_scaled
// requires vals to be a vector of 4-byte integers)
constexpr size_t OrigSize = __ESIMD_DNS::ElemsPerAddrDecoding(TySizeLog2);
using RestoredTy = __ESIMD_DNS::uint_type_t<OrigSize>;

__ESIMD_DNS::vector_type_t<RestoredTy, N> retv = 0;
sycl::detail::ESIMDDeviceInterface *I =
sycl::detail::getESIMDDeviceInterface();

Expand All @@ -639,7 +662,8 @@ __esimd_gather_masked_scaled2(SurfIndAliasTy surf_ind, uint32_t global_offset,
char *SlmBase = I->__cm_emu_get_slm_ptr();
for (int idx = 0; idx < N; ++idx) {
if (pred[idx]) {
Ty *addr = reinterpret_cast<Ty *>(offsets[idx] + SlmBase);
RestoredTy *addr =
reinterpret_cast<RestoredTy *>(offsets[idx] + SlmBase);
retv[idx] = *addr;
}
}
Expand All @@ -655,15 +679,21 @@ __esimd_gather_masked_scaled2(SurfIndAliasTy surf_ind, uint32_t global_offset,
std::unique_lock<std::mutex> lock(*mutexLock);
for (int idx = 0; idx < N; idx++) {
if (pred[idx]) {
Ty *addr = reinterpret_cast<Ty *>(offsets[idx] + readBase);
RestoredTy *addr =
reinterpret_cast<RestoredTy *>(offsets[idx] + readBase);
retv[idx] = *addr;
}
}

// TODO : Optimize
I->cm_fence_ptr();
}
return retv;

if constexpr (OrigSize == 4) {
return __ESIMD_DNS::bitcast<Ty, RestoredTy, N>(retv);
} else {
return __ESIMD_DNS::convert_vector<Ty, RestoredTy, N>(retv);
}
}
#endif // __SYCL_DEVICE_ONLY__

Expand Down
16 changes: 2 additions & 14 deletions sycl/include/sycl/ext/intel/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,8 @@ __ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc) {
if constexpr (std::is_same_v<detail::LocalAccessorMarker, AccessorTy>) {
return detail::SLM_BTI;
} else {
#ifdef __SYCL_DEVICE_ONLY__
const auto mem_obj = detail::AccessorPrivateProxy::getNativeImageObj(acc);
return __esimd_get_surface_index(mem_obj);
#else // __SYCL_DEVICE_ONLY__
return __esimd_get_surface_index(acc);
#endif // __SYCL_DEVICE_ONLY__
return __esimd_get_surface_index(
detail::AccessorPrivateProxy::getNativeImageObj(acc));
}
}

Expand Down Expand Up @@ -253,12 +249,8 @@ __ESIMD_API simd<Tx, N> block_load(AccessorTy acc, uint32_t offset,
static_assert(Sz <= 8 * detail::OperandSize::OWORD,
"block size must be at most 8 owords");

#if defined(__SYCL_DEVICE_ONLY__)
auto surf_ind = __esimd_get_surface_index(
detail::AccessorPrivateProxy::getNativeImageObj(acc));
#else // __SYCL_DEVICE_ONLY__
auto surf_ind = __esimd_get_surface_index(acc);
#endif // __SYCL_DEVICE_ONLY__

if constexpr (Flags::template alignment<simd<T, N>> >=
detail::OperandSize::OWORD) {
Expand Down Expand Up @@ -317,12 +309,8 @@ __ESIMD_API void block_store(AccessorTy acc, uint32_t offset,
static_assert(Sz <= 8 * detail::OperandSize::OWORD,
"block size must be at most 8 owords");

#if defined(__SYCL_DEVICE_ONLY__)
auto surf_ind = __esimd_get_surface_index(
detail::AccessorPrivateProxy::getNativeImageObj(acc));
#else //
auto surf_ind = __esimd_get_surface_index(acc);
#endif
__esimd_oword_st<T, N>(surf_ind, offset >> 4, vals.data());
}

Expand Down