Skip to content

[ESIMD] Implement gather(acc) accepting compile-time properties #12414

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
486 changes: 451 additions & 35 deletions sycl/include/sycl/ext/intel/esimd/memory.hpp

Large diffs are not rendered by default.

43 changes: 7 additions & 36 deletions sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -803,23 +803,10 @@ __ESIMD_API
return lsc_gather<T, NElts, DS, L1H, L3H>(
reinterpret_cast<T *>(acc.get_pointer().get()), offsets, pred);
#else
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS =
detail::expand_data_size(detail::finalize_data_size<T, DS>());
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::nontranspose;
using MsgT = typename detail::lsc_expand_type<T>::type;
auto si = __ESIMD_NS::get_surface_index(acc);
__ESIMD_NS::simd<MsgT, N * NElts> Tmp =
__esimd_lsc_load_bti<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), offsets.data(), si);
return detail::lsc_format_ret<T>(Tmp);
#endif
__ESIMD_NS::simd<T, N * NElts> PassThru; // Intentionally unitialized.
return __ESIMD_DNS::gather_impl<T, N * NElts, NElts, L1H, L3H, DS>(
acc, offsets, pred, PassThru);
#endif // __ESIMD_FORCE_STATELESS_MEM
}

#ifdef __ESIMD_FORCE_STATELESS_MEM
Expand Down Expand Up @@ -891,25 +878,9 @@ __ESIMD_API
reinterpret_cast<T *>(acc.get_pointer().get()), offsets, pred, pass_thru);

#else
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS =
detail::expand_data_size(detail::finalize_data_size<T, DS>());
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr auto _Transposed = detail::lsc_data_order::nontranspose;
using MsgT = typename detail::lsc_expand_type<T>::type;
auto SI = __ESIMD_NS::get_surface_index(acc);
__ESIMD_NS::simd<MsgT, N * NElts> PassThruExpanded =
detail::lsc_format_input<MsgT>(pass_thru);
__ESIMD_NS::simd<MsgT, N * NElts> Result =
__esimd_lsc_load_merge_bti<MsgT, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(
pred.data(), offsets.data(), SI, PassThruExpanded.data());
return detail::lsc_format_ret<T>(Result);
#endif
return __ESIMD_DNS::gather_impl<T, N * NElts, NElts, L1H, L3H, DS>(
acc, offsets, pred, pass_thru);
#endif // __ESIMD_FORCE_STATELESS_MEM
}

#ifdef __ESIMD_FORCE_STATELESS_MEM
Expand Down
241 changes: 240 additions & 1 deletion sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,188 @@ bool testUSM(queue Q, uint32_t MaskStride, PropertiesT) {
}
} // end if (VS == 1)
Vals.copy_to(Out + GlobalID * N);
// scatter(Out, ByteOffsets.template select<NOffsets, 1>(), Vals);
}).wait();
} catch (sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
sycl::free(In, Q);
sycl::free(Out, Q);
return false;
}

bool Passed = verify(In, Out, N, Size, VS, MaskStride, UseMask, UsePassThru);
if (!Passed)
std::cout << "Case FAILED" << std::endl;

sycl::free(In, Q);
sycl::free(Out, Q);
return Passed;
}

template <typename T, uint16_t N, uint16_t VS, bool UseMask, bool UsePassThru,
bool UseProperties, typename PropertiesT>
bool testACC(queue Q, uint32_t MaskStride, PropertiesT) {

static_assert(VS > 0 && N % VS == 0,
"Incorrect VS parameter. N must be divisible by VS.");
constexpr int NOffsets = N / VS;
static_assert(!UsePassThru || UseMask,
"PassThru cannot be used without using mask");

uint32_t Groups = 8;
uint32_t Threads = 16;

std::cout << "Running case: T=" << esimd_test::type_name<T>() << ", N=" << N
<< ", VS=" << VS << ", MaskStride=" << MaskStride
<< ", Groups=" << Groups << ", Threads=" << Threads
<< ", use_mask=" << UseMask << ", use_pass_thru=" << UsePassThru
<< ", use_properties=" << UseProperties << std::endl;

uint16_t Size = Groups * Threads * N;
using Tuint = esimd_test::uint_type_t<sizeof(T)>;

sycl::range<1> GlobalRange{Groups};
sycl::range<1> LocalRange{Threads};
sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};

T *Out = sycl::malloc_shared<T>(Size, Q);
std::memset(Out, 0, Size * sizeof(T));

T *In = sycl::malloc_shared<T>(Size * 2, Q);
for (int I = 0; I < Size; I++)
In[I] = esimd_test::getRandomValue<T>();

try {
buffer<T, 1> InBuf(In, Size * 2);
Q.submit([&](handler &CGH) {
accessor InAcc{InBuf, CGH};
CGH.parallel_for(Range, [=](sycl::nd_item<1> NDI) SYCL_ESIMD_KERNEL {
int GlobalID = NDI.get_global_id(0);
PropertiesT Props{};

simd<OffsetT, NOffsets> ByteOffsets(GlobalID * N * sizeof(T),
VS * sizeof(T));
simd_view ByteOffsetsView = ByteOffsets.template select<NOffsets, 1>();

simd_mask<NOffsets> Pred;
for (int I = 0; I < NOffsets; I++)
Pred[I] = (I % MaskStride == 0) ? 1 : 0;

using Tuint = esimd_test::uint_type_t<sizeof(T)>;
simd<Tuint, N> PassThruInt(GlobalID * N, 1);
simd<T, N> PassThru = PassThruInt.template bit_cast_view<T>();
auto PassThruView = PassThru.template select<N, 1>(0);

simd<T, N> Vals;
if constexpr (VS > 1) { // VS > 1 requires specifying <T, N, VS>
if constexpr (UsePassThru) {
if constexpr (UseProperties) {
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Pred, PassThru,
Props);
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Pred, PassThruView,
Props);
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Pred, PassThru,
Props);
else // ByteOffset - view, PassThru - view
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Pred,
PassThruView, Props);
} else { // UseProperties is false
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Pred, PassThru);
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
Vals =
gather<T, N, VS>(InAcc, ByteOffsets, Pred, PassThruView);
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
Vals =
gather<T, N, VS>(InAcc, ByteOffsetsView, Pred, PassThru);
else // ByteOffset - view, PassThru - view
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Pred,
PassThruView);
}
} else if constexpr (UseMask) { // UsePassThru is false
if constexpr (UseProperties) {
if (GlobalID % 2 == 0) // ByteOffset - simd
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Pred, Props);
else // ByteOffset - simd_view
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Pred, Props);
} else { // UseProperties is false
if (GlobalID % 2 == 0) // ByteOffset - simd
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Pred);
else // ByteOffset - simd_view
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Pred);
}
} else { // UseMask is false, UsePassThru is false
if constexpr (UseProperties) {
if (GlobalID % 2 == 0) // ByteOffset - simd
Vals = gather<T, N, VS>(InAcc, ByteOffsets, Props);
else // ByteOffset - simd_view
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView, Props);
} else { // UseProperties is false
if (GlobalID % 2 == 0) // ByteOffset - simd
Vals = gather<T, N, VS>(InAcc, ByteOffsets);
else // ByteOffset - simd_view
Vals = gather<T, N, VS>(InAcc, ByteOffsetsView);
}
}
} else {
// if (VS == 1) then <T, N, VS> can often be omitted - test it here.
// The variants accepting simd_view for 'PassThru' operand though
// still require <T, N> to be specified explicitly to help
// C++ FE do simd to simd_view matching.
if constexpr (UsePassThru) {
if constexpr (UseProperties) {
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
Vals = gather<T>(InAcc, ByteOffsets, Pred, PassThru, Props);
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
Vals = gather<T, N>(InAcc, ByteOffsets, Pred, PassThruView,
Props);
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
Vals = gather(InAcc, ByteOffsetsView, Pred, PassThru, Props);
else // ByteOffset - view, PassThru - view
Vals = gather<T, N>(InAcc, ByteOffsetsView, Pred, PassThruView,
Props);
} else { // UseProperties is false
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
Vals = gather(InAcc, ByteOffsets, Pred, PassThru);
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
Vals = gather<T, N>(InAcc, ByteOffsets, Pred, PassThruView);
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
Vals = gather<T, N>(InAcc, ByteOffsetsView, Pred, PassThru);
else // ByteOffset - view, PassThru - view
Vals =
gather<T, N>(InAcc, ByteOffsetsView, Pred, PassThruView);
}
} else if constexpr (UseMask) { // UsePassThru is false
if constexpr (UseProperties) {
if (GlobalID % 2 == 0) // ByteOffset - simd
Vals = gather<T>(InAcc, ByteOffsets, Pred, Props);
else // ByteOffset - simd_view
Vals = gather<T, N>(InAcc, ByteOffsetsView, Pred, Props);
} else { // UseProperties is false
if (GlobalID % 2 == 0) // ByteOffset - simd
Vals = gather<T>(InAcc, ByteOffsets, Pred);
else // ByteOffset - simd_view
Vals = gather<T, N>(InAcc, ByteOffsetsView, Pred);
}
} else { // UsePassThru is false, UseMask is false
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This must have been very painful to write

if constexpr (UseProperties) {
if (GlobalID % 2 == 0) // ByteOffset - simd
Vals = gather<T>(InAcc, ByteOffsets, Props);
else // ByteOffset - simd_view
Vals = gather<T, N>(InAcc, ByteOffsetsView, Props);
} else {
if (GlobalID % 2 == 0) // ByteOffset - simd
Vals = gather<T>(InAcc, ByteOffsets);
else // ByteOffset - simd_view
Vals = gather<T, N>(InAcc, ByteOffsetsView);
}
}
} // end if (VS == 1)
Vals.copy_to(Out + GlobalID * N);
// scatter(Out, ByteOffsets.template select<NOffsets, 1>(), Vals);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: remove commented code

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, removed.

});
}).wait();
} catch (sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
Expand Down Expand Up @@ -286,3 +467,61 @@ template <typename T, TestFeatures Features> bool testUSM(queue Q) {
}
return Passed;
}

template <typename T, TestFeatures Features> bool testACC(queue Q) {
constexpr bool UseMask = true;
constexpr bool UsePassThru = true;
constexpr bool UseProperties = true;

properties AlignElemProps{alignment<sizeof(T)>};

bool Passed = true;
Passed &= testACC<T, 1, 1, !UseMask, !UsePassThru, !UseProperties>(
Q, 2, AlignElemProps);
#ifdef __ESIMD_FORCE_STATELESS_MEM
Passed &= testACC<T, 2, 1, UseMask, !UsePassThru, !UseProperties>(
Q, 2, AlignElemProps);
Passed &= testACC<T, 4, 1, UseMask, !UsePassThru, !UseProperties>(
Q, 2, AlignElemProps);
#endif // __ESIMD_FORCE_STATELESS_MEM
Passed &= testACC<T, 8, 1, UseMask, !UsePassThru, !UseProperties>(
Q, 3, AlignElemProps);
Passed &= testACC<T, 16, 1, UseMask, !UsePassThru, UseProperties>(
Q, 2, AlignElemProps);
Passed &= testACC<T, 32, 1, UseMask, !UsePassThru, !UseProperties>(
Q, 3, AlignElemProps);

if constexpr (Features == TestFeatures::PVC ||
Features == TestFeatures::DG2) {
properties LSCProps{cache_hint_L1<cache_hint::streaming>,
cache_hint_L2<cache_hint::cached>,
alignment<sizeof(T)>};
Passed &=
testACC<T, 1, 1, !UseMask, !UsePassThru, UseProperties>(Q, 2, LSCProps);
Passed &=
testACC<T, 2, 1, UseMask, !UsePassThru, UseProperties>(Q, 2, LSCProps);
Passed &=
testACC<T, 4, 1, UseMask, UsePassThru, UseProperties>(Q, 2, LSCProps);
Passed &=
testACC<T, 8, 1, UseMask, UsePassThru, UseProperties>(Q, 3, LSCProps);

Passed &=
testACC<T, 32, 1, UseMask, UsePassThru, UseProperties>(Q, 2, LSCProps);

// Check VS > 1. GPU supports only dwords and qwords in this mode.
if constexpr (sizeof(T) >= 4) {
// TODO: This test case causes flaky fail. Enable it after the issue
// in GPU driver is fixed.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have an internal tracker for this or a version we know works?

Copy link
Contributor Author

@v-klochkov v-klochkov Jan 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I created a tracker before uploading this PR.
GPU generated code where the mask is used but is not initialized.

// Passed &= testACC<T, 16, 2, UseMask, !UsePassThru, UseProperties>(
// Q, 3, AlignElemProps);

Passed &= testACC<T, 32, 2, !UseMask, !UsePassThru, UseProperties>(
Q, 3, AlignElemProps);
Passed &= testACC<T, 32, 2, UseMask, !UsePassThru, UseProperties>(
Q, 3, AlignElemProps);
Passed &= testACC<T, 32, 2, UseMask, UsePassThru, UseProperties>(
Q, 3, AlignElemProps);
}
}
return Passed;
}
40 changes: 40 additions & 0 deletions sycl/test-e2e/ESIMD/unified_memory_api/gather_acc.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
//==------- gather_acc.cpp - DPC++ ESIMD on-device test --------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Use per-kernel compilation to have more information about failing cases.
// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
// RUN: %{run} %t.out

// The test verifies esimd::gather() functions accepting ACCESSOR
// and optional compile-time esimd::properties.
// The gather() calls in this test do not use cache-hint properties
// or VS > 1 (number of loads per offset) to not impose using DG2/PVC features.

#include "Inputs/gather.hpp"

int main() {
auto Q = queue{gpu_selector_v};
esimd_test::printTestLabel(Q);

constexpr auto TestFeatures = TestFeatures::Generic;
bool Passed = true;

Passed &= testACC<int8_t, TestFeatures>(Q);
Passed &= testACC<int16_t, TestFeatures>(Q);
if (Q.get_device().has(sycl::aspect::fp16))
Passed &= testACC<sycl::half, TestFeatures>(Q);
Passed &= testACC<uint32_t, TestFeatures>(Q);
Passed &= testACC<float, TestFeatures>(Q);
Passed &= testACC<ext::intel::experimental::esimd::tfloat32, TestFeatures>(Q);
#ifdef __ESIMD_FORCE_STATELESS_MEM
Passed &= testACC<int64_t, TestFeatures>(Q);
if (Q.get_device().has(sycl::aspect::fp64))
Passed &= testACC<double, TestFeatures>(Q);
#endif // __ESIMD_FORCE_STATELESS_MEM
std::cout << (Passed ? "Passed\n" : "FAILED\n");
return Passed ? 0 : 1;
}
43 changes: 43 additions & 0 deletions sycl/test-e2e/ESIMD/unified_memory_api/gather_acc_dg2_pvc.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
//==------- gather_acc_dg2_pvc.cpp - DPC++ ESIMD on-device test ------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// REQUIRES: gpu-intel-dg2 || gpu-intel-pvc

// Use per-kernel compilation to have more information about failing cases.
// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
// RUN: %{run} %t.out

// The test verifies esimd::gather() functions accepting ACCESSOR
// and optional compile-time esimd::properties.
// The gather() calls in this test can use cache-hint properties
// or VS > 1 (number of loads per offset).

#include "Inputs/gather.hpp"

int main() {
auto Q = queue{gpu_selector_v};
esimd_test::printTestLabel(Q);

constexpr auto TestFeatures = TestFeatures::DG2;
bool Passed = true;

Passed &= testACC<int8_t, TestFeatures>(Q);
Passed &= testACC<int16_t, TestFeatures>(Q);
if (Q.get_device().has(sycl::aspect::fp16))
Passed &= testACC<sycl::half, TestFeatures>(Q);
Passed &= testACC<uint32_t, TestFeatures>(Q);
Passed &= testACC<float, TestFeatures>(Q);
Passed &= testACC<ext::intel::experimental::esimd::tfloat32, TestFeatures>(Q);
#ifdef __ESIMD_FORCE_STATELESS_MEM
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we be running this test and the previous in stateless mode as well? It seems we are not right now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, good catch. I'll add the tests here soon.

Passed &= testACC<int64_t, TestFeatures>(Q);
if (Q.get_device().has(sycl::aspect::fp64))
Passed &= testACC<double, TestFeatures>(Q);
#endif // __ESIMD_FORCE_STATELESS_MEM
std::cout << (Passed ? "Passed\n" : "FAILED\n");
return Passed ? 0 : 1;
}
Loading