@@ -1,14 +1,12 @@
 // UNSUPPORTED: cuda
 // Reductions use work-group builtins not yet supported by CUDA.
 
-// UNSUPPORTED: linux
-// TODO: Enable the test for Linux when CI uses GPU driver 20.06.15619 or newer.
-
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
+// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
 // TODO: Enable the test for HOST when it supports intel::reduce() and barrier()
 
 // This test performs basic checks of parallel_for(nd_range, reduction, func)
@@ -24,22 +22,29 @@ template <typename T, int Dim, class BinaryOperation>
 class SomeClass;
 
 template <typename T, int Dim, class BinaryOperation>
-void test(T Identity, size_t WGSize, size_t NWItems) {
+void test(T Identity, size_t WGSize, size_t NWItems, usm::alloc AllocType) {
   queue Q;
   auto Dev = Q.get_device();
-  if (!Dev.get_info<info::device::usm_shared_allocations>())
+
+  if (AllocType == usm::alloc::shared &&
+      !Dev.get_info<info::device::usm_shared_allocations>())
+    return;
+  if (AllocType == usm::alloc::host &&
+      !Dev.get_info<info::device::usm_host_allocations>())
     return;
 
+  T *ReduVarPtr = (T *)malloc(sizeof(T), Dev, Q.get_context(), AllocType);
+  if (ReduVarPtr == nullptr)
+    return;
+  *ReduVarPtr = Identity;
+
   // Initialize.
   T CorrectOut;
   BinaryOperation BOp;
 
   buffer<T, 1> InBuf(NWItems);
   initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);
 
-  T *ReduVarPtr = (T *)malloc_shared(sizeof(T), Dev, Q.get_context());
-  *ReduVarPtr = Identity;
-
   // Compute.
   Q.submit([&](handler &CGH) {
     auto In = InBuf.template get_access<access::mode::read>(CGH);
@@ -61,26 +66,36 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
               << ", Expected value: " << CorrectOut << "\n";
     assert(0 && "Wrong value.");
   }
+
   free(ReduVarPtr, Q.get_context());
 }
 
+template <typename T, int Dim, class BinaryOperation>
+void testUSM(T Identity, size_t WGSize, size_t NWItems) {
+  test<T, Dim, BinaryOperation>(Identity, WGSize, NWItems, usm::alloc::shared);
+  test<T, Dim, BinaryOperation>(Identity, WGSize, NWItems, usm::alloc::host);
+}
+
 int main() {
   // fast atomics and fast reduce
-  test<int, 1, intel::plus<int>>(0, 49, 49 * 5);
-  test<int, 0, intel::plus<int>>(0, 8, 128);
+  testUSM<int, 1, intel::plus<int>>(0, 49, 49 * 5);
+  testUSM<int, 0, intel::plus<int>>(0, 8, 128);
 
   // fast atomics
-  test<int, 0, intel::bit_or<int>>(0, 7, 7 * 3);
-  test<int, 1, intel::bit_or<int>>(0, 4, 128);
+  testUSM<int, 0, intel::bit_or<int>>(0, 7, 7 * 3);
+  testUSM<int, 1, intel::bit_or<int>>(0, 4, 128);
 
   // fast reduce
-  test<float, 1, intel::minimum<float>>(std::numeric_limits<float>::max(), 5, 5 * 7);
-  test<float, 0, intel::maximum<float>>(std::numeric_limits<float>::min(), 4, 128);
+  testUSM<float, 1, intel::minimum<float>>(
+      (std::numeric_limits<float>::max)(), 5, 5 * 7);
+  testUSM<float, 0, intel::maximum<float>>(
+      (std::numeric_limits<float>::min)(), 4, 128);
 
   // generic algorithm
-  test<int, 0, std::multiplies<int>>(1, 7, 7 * 5);
-  test<int, 1, std::multiplies<int>>(1, 8, 16);
-  test<CustomVec<short>, 0, CustomVecPlus<short>>(CustomVec<short>(0), 8, 8 * 3);
+  testUSM<int, 0, std::multiplies<int>>(1, 7, 7 * 5);
+  testUSM<int, 1, std::multiplies<int>>(1, 8, 16);
+  testUSM<CustomVec<short>, 0, CustomVecPlus<short>>(
+      CustomVec<short>(0), 8, 8 * 3);
 
   std::cout << "Test passed\n";
   return 0;
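
For context, a minimal standalone sketch (not taken from the commit) of the USM pattern the updated test() exercises: query whether the device supports the requested allocation kind, allocate with the four-argument malloc() overload, and release with free(ptr, context). The include and using-namespace lines, the loop, and the Ptr variable name are illustrative assumptions; the device-info queries and the malloc()/free() calls mirror the diff above.

// Sketch only; assumes a SYCL implementation providing the USM API used by the test.
#include <CL/sycl.hpp>
#include <iostream>
using namespace cl::sycl;

int main() {
  queue Q;
  device Dev = Q.get_device();

  for (usm::alloc AllocType : {usm::alloc::shared, usm::alloc::host}) {
    // Skip allocation kinds the device does not support, as the test does.
    if (AllocType == usm::alloc::shared &&
        !Dev.get_info<info::device::usm_shared_allocations>())
      continue;
    if (AllocType == usm::alloc::host &&
        !Dev.get_info<info::device::usm_host_allocations>())
      continue;

    // Allocate one element of the requested USM kind and initialize it on the host.
    int *Ptr = (int *)malloc(sizeof(int), Dev, Q.get_context(), AllocType);
    if (Ptr == nullptr)
      continue;
    *Ptr = 0;
    free(Ptr, Q.get_context());
  }
  std::cout << "Done\n";
  return 0;
}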