This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit d944065

[SYCL][CUDA] Added tests for atomic loads and stores for various orders and scopes (#648)
1 parent 0eae0ff commit d944065

14 files changed: +351 -303 lines
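The rewritten tests are capability-driven: instead of hard-coding one order and scope, the helpers in load.h query the device and only instantiate the combinations it reports. A minimal sketch of that query pattern, built from the SYCL 2020 device-info descriptors used in the diff below (illustrative, not part of the commit):

#include <CL/sycl.hpp>
#include <algorithm>
#include <vector>
using namespace sycl;

// Mirrors the gating in load_test_orders_scopes: run the acquire-order
// variant only when the device advertises support for it.
bool supports_acquire(queue &q) {
  std::vector<memory_order> orders =
      q.get_device().get_info<info::device::atomic_memory_order_capabilities>();
  return std::find(orders.begin(), orders.end(), memory_order::acquire) !=
         orders.end();
}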

SYCL/AtomicRef/load.cpp

Lines changed: 3 additions & 25 deletions
@@ -1,31 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70
 // RUN: %HOST_RUN_PLACEHOLDER %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
 #include "load.h"
-#include <iostream>
-using namespace sycl;
-
-int main() {
-  queue q;
-
-  constexpr int N = 32;
-  load_test<int>(q, N);
-  load_test<unsigned int>(q, N);
-  load_test<float>(q, N);
-
-  // Include long tests if they are 32 bits wide
-  if constexpr (sizeof(long) == 4) {
-    load_test<long>(q, N);
-    load_test<unsigned long>(q, N);
-  }
-
-  // Include pointer tests if they are 32 bits wide
-  if constexpr (sizeof(char *) == 4) {
-    load_test<char *>(q, N);
-  }
 
-  std::cout << "Test passed." << std::endl;
-}
+int main() { load_test_all<access::address_space::global_space>(); }
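Note the two RUN-line changes that recur in every file below: -fsycl-unnamed-lambda is required now that the named load_kernel class is gone and the kernels are plain unnamed lambdas, and --cuda-gpu-arch=sm_70 targets Volta or newer, presumably because the scoped/ordered atomics these tests exercise need sm_70. Expanded by hand, as a sketch that assumes %sycl_triple resolves to nvptx64-nvidia-cuda and %clangxx to clang++:

clang++ -fsycl -fsycl-unnamed-lambda -fsycl-targets=nvptx64-nvidia-cuda \
    -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 \
    load.cpp -o load.out
./load.out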

SYCL/AtomicRef/load.h

Lines changed: 144 additions & 17 deletions
@@ -1,22 +1,61 @@
 #pragma once
 
+#ifndef TEST_GENERIC_IN_LOCAL
+#define TEST_GENERIC_IN_LOCAL 0
+#endif
+
 #include <CL/sycl.hpp>
 #include <algorithm>
 #include <cassert>
+#include <iostream>
 #include <numeric>
 #include <vector>
 
 using namespace sycl;
 
 template <template <typename, memory_order, memory_scope, access::address_space>
           class AtomicRef,
-          access::address_space address_space, typename T>
-class load_kernel;
+          access::address_space space, typename T,
+          memory_order order = memory_order::relaxed,
+          memory_scope scope = memory_scope::device>
+void load_local_test(queue q, size_t N) {
+  T initial = T(42);
+  T load = initial;
+  std::vector<T> output(N);
+  std::fill(output.begin(), output.end(), T(0));
+  {
+    buffer<T> load_buf(&load, 1);
+    buffer<T> output_buf(output.data(), output.size());
+
+    q.submit([&](handler &cgh) {
+       auto ld = load_buf.template get_access<access::mode::read_write>(cgh);
+       auto out =
+           output_buf.template get_access<access::mode::discard_write>(cgh);
+       accessor<T, 1, access::mode::read_write, access::target::local> loc(1,
+                                                                           cgh);
+       cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) {
+         int gid = it.get_global_id(0);
+         if (gid == 0)
+           loc[0] = initial;
+         it.barrier(access::fence_space::local_space);
+         auto atm = AtomicRef<T, memory_order::relaxed, scope, space>(loc[0]);
+         out[gid] = atm.load(order);
+       });
+     }).wait_and_throw();
+  }
+
+  // All work-items should read the same value
+  // Atomicity isn't tested here, but support for load() is
+  assert(std::all_of(output.begin(), output.end(),
+                     [&](T x) { return (x == initial); }));
+}
 
 template <template <typename, memory_order, memory_scope, access::address_space>
           class AtomicRef,
-          access::address_space address_space, typename T>
-void load_test(queue q, size_t N) {
+          access::address_space space, typename T,
+          memory_order order = memory_order::relaxed,
+          memory_scope scope = memory_scope::device>
+void load_global_test(queue q, size_t N) {
   T initial = T(42);
   T load = initial;
   std::vector<T> output(N);
@@ -29,13 +68,11 @@ void load_test(queue q, size_t N) {
       auto ld = load_buf.template get_access<access::mode::read_write>(cgh);
       auto out =
           output_buf.template get_access<access::mode::discard_write>(cgh);
-      cgh.parallel_for<load_kernel<AtomicRef, address_space, T>>(
-          range<1>(N), [=](item<1> it) {
-            size_t gid = it.get_id(0);
-            auto atm = AtomicRef<T, memory_order::relaxed, memory_scope::device,
-                                 address_space>(ld[0]);
-            out[gid] = atm.load();
-          });
+      cgh.parallel_for(range<1>(N), [=](item<1> it) {
+        size_t gid = it.get_id(0);
+        auto atm = AtomicRef<T, memory_order::relaxed, scope, space>(ld[0]);
+        out[gid] = atm.load(order);
+      });
     });
   }
 
@@ -45,15 +82,105 @@
                      [&](T x) { return (x == initial); }));
 }
 
-template <typename T> void load_test(queue q, size_t N) {
+template <access::address_space space, typename T,
+          memory_order order = memory_order::relaxed,
+          memory_scope scope = memory_scope::device>
+void load_test(queue q, size_t N) {
+  constexpr bool do_local_tests =
+      space == access::address_space::local_space ||
+      (space == access::address_space::generic_space && TEST_GENERIC_IN_LOCAL);
+  constexpr bool do_global_tests =
+      space == access::address_space::global_space ||
+      (space == access::address_space::generic_space && !TEST_GENERIC_IN_LOCAL);
+  constexpr bool do_ext_tests = space != access::address_space::generic_space;
+  if constexpr (do_local_tests) {
+#ifdef RUN_DEPRECATED
+    if constexpr (do_ext_tests) {
+      load_local_test<::sycl::ext::oneapi::atomic_ref, space, T, order, scope>(
+          q, N);
+    }
+#else
+    load_local_test<::sycl::atomic_ref, space, T, order, scope>(q, N);
+#endif
+  }
+  if constexpr (do_global_tests) {
 #ifdef RUN_DEPRECATED
-  load_test<::sycl::ext::oneapi::atomic_ref,
-            access::address_space::global_space, T>(q, N);
+    if constexpr (do_ext_tests) {
+      load_global_test<::sycl::ext::oneapi::atomic_ref, space, T, order, scope>(
+          q, N);
+    }
 #else
-  load_test<::sycl::atomic_ref, access::address_space::global_space, T>(q, N);
+    load_global_test<::sycl::atomic_ref, space, T, order, scope>(q, N);
 #endif
+  }
+}
+
+template <access::address_space space, typename T,
+          memory_order order = memory_order::relaxed>
+void load_test_scopes(queue q, size_t N) {
+  std::vector<memory_scope> scopes =
+      q.get_device().get_info<info::device::atomic_memory_scope_capabilities>();
+  if (std::find(scopes.begin(), scopes.end(), memory_scope::system) !=
+      scopes.end()) {
+    load_test<space, T, order, memory_scope::system>(q, N);
+  }
+  if (std::find(scopes.begin(), scopes.end(), memory_scope::work_group) !=
+      scopes.end()) {
+    load_test<space, T, order, memory_scope::work_group>(q, N);
+  }
+  if (std::find(scopes.begin(), scopes.end(), memory_scope::sub_group) !=
+      scopes.end()) {
+    load_test<space, T, order, memory_scope::sub_group>(q, N);
+  }
+  load_test<space, T, order, memory_scope::device>(q, N);
+}
+
+template <access::address_space space, typename T>
+void load_test_orders_scopes(queue q, size_t N) {
+  std::vector<memory_order> orders =
+      q.get_device().get_info<info::device::atomic_memory_order_capabilities>();
+  if (std::find(orders.begin(), orders.end(), memory_order::acquire) !=
+      orders.end()) {
+    load_test_scopes<space, T, memory_order::acquire>(q, N);
+  }
+  load_test_scopes<space, T, memory_order::relaxed>(q, N);
 }
 
-template <typename T> void load_generic_test(queue q, size_t N) {
-  load_test<::sycl::atomic_ref, access::address_space::generic_space, T>(q, N);
+template <access::address_space space> void load_test_all() {
+  queue q;
+
+  constexpr int N = 32;
+#ifdef FULL_ATOMIC64_COVERAGE
+  if (!q.get_device().has(aspect::atomic64)) {
+    std::cout << "Skipping atomic64 tests\n";
+    return;
+  }
+
+  load_test_orders_scopes<space, double>(q, N);
+  if constexpr (sizeof(long) == 8) {
+    load_test_orders_scopes<space, long>(q, N);
+    load_test_orders_scopes<space, unsigned long>(q, N);
+  }
+  if constexpr (sizeof(long long) == 8) {
+    load_test_orders_scopes<space, long long>(q, N);
+    load_test_orders_scopes<space, unsigned long long>(q, N);
+  }
+  if constexpr (sizeof(char *) == 8) {
+    load_test_orders_scopes<space, char *>(q, N);
+  }
+#endif
+  load_test_orders_scopes<space, float>(q, N);
+#ifdef FULL_ATOMIC32_COVERAGE
+  load_test_orders_scopes<space, int>(q, N);
+  load_test_orders_scopes<space, unsigned int>(q, N);
+  if constexpr (sizeof(long) == 4) {
+    load_test_orders_scopes<space, long>(q, N);
+    load_test_orders_scopes<space, unsigned long>(q, N);
+  }
+  if constexpr (sizeof(char *) == 4) {
+    load_test_orders_scopes<space, char *>(q, N);
+  }
+#endif
+
+  std::cout << "Test passed." << std::endl;
 }
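All of the helpers above funnel into the same core operation: bind an atomic_ref to an ordinary object and load through it, with the memory order passed per call so one kernel body covers every tested order. A self-contained sketch of that pattern (assumes SYCL 2020 sycl::atomic_ref; illustrative, not part of the commit):

#include <CL/sycl.hpp>
using namespace sycl;

int main() {
  queue q;
  int data = 42;
  int result = 0;
  {
    buffer<int> buf(&data, 1);
    buffer<int> res(&result, 1);
    q.submit([&](handler &cgh) {
      auto d = buf.get_access<access::mode::read_write>(cgh);
      auto r = res.get_access<access::mode::discard_write>(cgh);
      cgh.parallel_for(range<1>(1), [=](item<1>) {
        // The template arguments fix the defaults; load() may strengthen
        // the order per call, as load_global_test does with atm.load(order).
        atomic_ref<int, memory_order::relaxed, memory_scope::device,
                   access::address_space::global_space>
            atm(d[0]);
        r[0] = atm.load(); // e.g. atm.load(memory_order::acquire) if supported
      });
    });
  } // buffer destruction writes results back to host memory
  return result == 42 ? 0 : 1;
}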

SYCL/AtomicRef/load_atomic64.cpp

Lines changed: 0 additions & 40 deletions
This file was deleted.

SYCL/AtomicRef/load_atomic64_generic.cpp

Lines changed: 0 additions & 43 deletions
This file was deleted.

SYCL/AtomicRef/load_generic.cpp

Lines changed: 4 additions & 26 deletions
@@ -1,34 +1,12 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70
 // RUN: %HOST_RUN_PLACEHOLDER %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
-// CUDA backend has had no support for the generic address space yet
+// CUDA and HIP backends have had no support for the generic address space yet
 // XFAIL: cuda || hip
 
 #include "load.h"
-#include <iostream>
-using namespace sycl;
-
-int main() {
-  queue q;
-
-  constexpr int N = 32;
-  load_generic_test<int>(q, N);
-  load_generic_test<unsigned int>(q, N);
-  load_generic_test<float>(q, N);
-
-  // Include long tests if they are 32 bits wide
-  if constexpr (sizeof(long) == 4) {
-    load_generic_test<long>(q, N);
-    load_generic_test<unsigned long>(q, N);
-  }
-
-  // Include pointer tests if they are 32 bits wide
-  if constexpr (sizeof(char *) == 4) {
-    load_generic_test<char *>(q, N);
-  }
 
-  std::cout << "Test passed." << std::endl;
-}
+int main() { load_test_all<access::address_space::generic_space>(); }

SYCL/AtomicRef/load_generic_local.cpp

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// CUDA backend has had no support for the generic address space yet. Barrier is
+// not supported on host.
+// XFAIL: cuda, hip, host
+
+#define TEST_GENERIC_IN_LOCAL 1
+
+#include "load.h"
+
+int main() { load_test_all<access::address_space::generic_space>(); }

SYCL/AtomicRef/load_local.cpp

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// Barrier is not supported on host.
+// XFAIL: host
+
+#include "load.h"
+
+int main() { load_test_all<access::address_space::local_space>(); }
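The host XFAILs come from the local-space path in load.h: it launches an nd_range and synchronizes with a work-group barrier, which the host device does not support. Extracted into a standalone sketch for clarity (illustrative; it mirrors load_local_test above):

#include <CL/sycl.hpp>
#include <cassert>
#include <vector>
using namespace sycl;

int main() {
  queue q;
  constexpr size_t N = 32;
  std::vector<int> out(N, 0);
  {
    buffer<int> out_buf(out.data(), out.size());
    q.submit([&](handler &cgh) {
      auto o = out_buf.get_access<access::mode::discard_write>(cgh);
      accessor<int, 1, access::mode::read_write, access::target::local> loc(1, cgh);
      // A single work-group of N items, so every item shares one local slot.
      cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) {
        size_t gid = it.get_global_id(0);
        if (gid == 0)
          loc[0] = 42;                                // one writer initializes
        it.barrier(access::fence_space::local_space); // publish to the group
        atomic_ref<int, memory_order::relaxed, memory_scope::work_group,
                   access::address_space::local_space>
            atm(loc[0]);
        o[gid] = atm.load();
      });
    });
  }
  for (int v : out)
    assert(v == 42); // every work-item should observe the initial value
}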
