1
1
// UNSUPPORTED: cuda
2
2
// Reductions use work-group builtins not yet supported by CUDA.
3
3
4
- // UNSUPPORTED: linux
5
- // TODO: Enable the test for Linux when CI uses GPU driver 20.06.15619 or newer.
6
-
7
4
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
8
5
// RUN: %CPU_RUN_PLACEHOLDER %t.out
9
6
// RUN: %GPU_RUN_PLACEHOLDER %t.out
10
7
// RUN: %ACC_RUN_PLACEHOLDER %t.out
11
8
9
+ // RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
12
10
// TODO: Enable the test for HOST when it supports intel::reduce() and barrier()
13
11
14
12
// This test performs basic checks of parallel_for(nd_range, reduction, func)
@@ -24,22 +22,29 @@ template <typename T, int Dim, class BinaryOperation>
24
22
class SomeClass ;
25
23
26
24
template <typename T, int Dim, class BinaryOperation >
27
- void test (T Identity, size_t WGSize, size_t NWItems) {
25
+ void test (T Identity, size_t WGSize, size_t NWItems, usm::alloc AllocType ) {
28
26
queue Q;
29
27
auto Dev = Q.get_device ();
30
- if (!Dev.get_info <info::device::usm_shared_allocations>())
28
+
29
+ if (AllocType == usm::alloc::shared &&
30
+ !Dev.get_info <info::device::usm_shared_allocations>())
31
+ return ;
32
+ if (AllocType == usm::alloc::host &&
33
+ !Dev.get_info <info::device::usm_host_allocations>())
31
34
return ;
32
35
36
+ T *ReduVarPtr = (T *)malloc (sizeof (T), Dev, Q.get_context (), AllocType);
37
+ if (ReduVarPtr == nullptr )
38
+ return ;
39
+ *ReduVarPtr = Identity;
40
+
33
41
// Initialize.
34
42
T CorrectOut;
35
43
BinaryOperation BOp;
36
44
37
45
buffer<T, 1 > InBuf (NWItems);
38
46
initInputData (InBuf, CorrectOut, Identity, BOp, NWItems);
39
47
40
- T *ReduVarPtr = (T *)malloc_shared (sizeof (T), Dev, Q.get_context ());
41
- *ReduVarPtr = Identity;
42
-
43
48
// Compute.
44
49
Q.submit ([&](handler &CGH) {
45
50
auto In = InBuf.template get_access <access::mode::read>(CGH);
@@ -61,26 +66,36 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
61
66
<< " , Expected value: " << CorrectOut << " \n " ;
62
67
assert (0 && " Wrong value." );
63
68
}
69
+
64
70
free (ReduVarPtr, Q.get_context ());
65
71
}
66
72
73
// Runs the same reduction check once per USM allocation kind by calling
// test() twice with identical Identity/WGSize/NWItems: first with
// usm::alloc::shared, then with usm::alloc::host. test() itself returns
// early when the device does not report support for the requested kind.
+ template <typename T, int Dim, class BinaryOperation >
74
+ void testUSM (T Identity, size_t WGSize, size_t NWItems) {
75
+ test<T, Dim, BinaryOperation>(Identity, WGSize, NWItems, usm::alloc::shared);
76
+ test<T, Dim, BinaryOperation>(Identity, WGSize, NWItems, usm::alloc::host);
77
+ }
78
+
67
79
int main () {
68
80
// fast atomics and fast reduce
69
- test <int , 1 , intel::plus<int >>(0 , 49 , 49 * 5 );
70
- test <int , 0 , intel::plus<int >>(0 , 8 , 128 );
81
+ testUSM <int , 1 , intel::plus<int >>(0 , 49 , 49 * 5 );
82
+ testUSM <int , 0 , intel::plus<int >>(0 , 8 , 128 );
71
83
72
84
// fast atomics
73
- test <int , 0 , intel::bit_or<int >>(0 , 7 , 7 * 3 );
74
- test <int , 1 , intel::bit_or<int >>(0 , 4 , 128 );
85
+ testUSM <int , 0 , intel::bit_or<int >>(0 , 7 , 7 * 3 );
86
+ testUSM <int , 1 , intel::bit_or<int >>(0 , 4 , 128 );
75
87
76
88
// fast reduce
77
- test<float , 1 , intel::minimum<float >>(std::numeric_limits<float >::max (), 5 , 5 * 7 );
78
- test<float , 0 , intel::maximum<float >>(std::numeric_limits<float >::min (), 4 , 128 );
89
+ testUSM<float , 1 , intel::minimum<float >>(
90
+ (std::numeric_limits<float >::max)(), 5 , 5 * 7 );
91
+ testUSM<float , 0 , intel::maximum<float >>(
92
+ (std::numeric_limits<float >::min)(), 4 , 128 );
79
93
80
94
// generic algorithm
81
- test<int , 0 , std::multiplies<int >>(1 , 7 , 7 * 5 );
82
- test<int , 1 , std::multiplies<int >>(1 , 8 , 16 );
83
- test<CustomVec<short >, 0 , CustomVecPlus<short >>(CustomVec<short >(0 ), 8 , 8 * 3 );
95
+ testUSM<int , 0 , std::multiplies<int >>(1 , 7 , 7 * 5 );
96
+ testUSM<int , 1 , std::multiplies<int >>(1 , 8 , 16 );
97
+ testUSM<CustomVec<short >, 0 , CustomVecPlus<short >>(
98
+ CustomVec<short >(0 ), 8 , 8 * 3 );
84
99
85
100
std::cout << " Test passed\n " ;
86
101
return 0 ;
0 commit comments