Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL][CUDA] Added tests for atomic loads and stores for various orders and scopes #648

Merged
merged 9 commits into from
Mar 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 3 additions & 25 deletions SYCL/AtomicRef/load.cpp
Original file line number Diff line number Diff line change
@@ -1,31 +1,9 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70
// RUN: %HOST_RUN_PLACEHOLDER %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

#include "load.h"
#include <iostream>
using namespace sycl;

int main() {
queue q;

constexpr int N = 32;
load_test<int>(q, N);
load_test<unsigned int>(q, N);
load_test<float>(q, N);

// Include long tests if they are 32 bits wide
if constexpr (sizeof(long) == 4) {
load_test<long>(q, N);
load_test<unsigned long>(q, N);
}

// Include pointer tests if they are 32 bits wide
if constexpr (sizeof(char *) == 4) {
load_test<char *>(q, N);
}

std::cout << "Test passed." << std::endl;
}
// Entry point: run the whole load() test matrix against the global
// address space.
int main() {
  load_test_all<access::address_space::global_space>();
  return 0;
}
161 changes: 144 additions & 17 deletions SYCL/AtomicRef/load.h
Original file line number Diff line number Diff line change
@@ -1,22 +1,61 @@
#pragma once

#ifndef TEST_GENERIC_IN_LOCAL
#define TEST_GENERIC_IN_LOCAL 0
#endif

#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
#include <iostream>
#include <numeric>
#include <vector>

using namespace sycl;

// Checks atomic_ref::load() from *local* memory: work-item 0 writes `initial`
// into a local accessor, every work-item in the single work-group then loads
// it atomically (with the given memory order/scope) and records the value it
// observed; the host finally asserts that all N work-items read `initial`.
// NOTE(review): the signature below interleaves old (removed) diff lines --
// the former `load_kernel` declaration -- with the new parameter list; this
// file is a capture of a diff view, lines kept verbatim.
template <template <typename, memory_order, memory_scope, access::address_space>
class AtomicRef,
access::address_space address_space, typename T>
class load_kernel;
access::address_space space, typename T,
memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void load_local_test(queue q, size_t N) {
T initial = T(42); // the value every work-item is expected to observe
T load = initial;
std::vector<T> output(N); // one result slot per work-item
std::fill(output.begin(), output.end(), T(0));
{
// Buffer scope ensures results are copied back to `output` before the
// host-side assert below.
buffer<T> load_buf(&load, 1);
buffer<T> output_buf(output.data(), output.size());

q.submit([&](handler &cgh) {
auto ld = load_buf.template get_access<access::mode::read_write>(cgh);
auto out =
output_buf.template get_access<access::mode::discard_write>(cgh);
// Single local-memory element shared by the whole work-group.
accessor<T, 1, access::mode::read_write, access::target::local> loc(1,
cgh);
// nd_range<1>(N, N): one work-group of N items, so every item sees the
// same local memory.
cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) {
int gid = it.get_global_id(0);
if (gid == 0)
loc[0] = initial; // single writer before the barrier
it.barrier(access::fence_space::local_space); // publish loc[0] to group
auto atm = AtomicRef<T, memory_order::relaxed, scope, space>(loc[0]);
out[gid] = atm.load(order); // memory order under test is applied here
});
}).wait_and_throw();
}

// All work-items should read the same value
// Atomicity isn't tested here, but support for load() is
assert(std::all_of(output.begin(), output.end(),
[&](T x) { return (x == initial); }));
}

template <template <typename, memory_order, memory_scope, access::address_space>
class AtomicRef,
access::address_space address_space, typename T>
void load_test(queue q, size_t N) {
access::address_space space, typename T,
memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void load_global_test(queue q, size_t N) {
T initial = T(42);
T load = initial;
std::vector<T> output(N);
Expand All @@ -29,13 +68,11 @@ void load_test(queue q, size_t N) {
auto ld = load_buf.template get_access<access::mode::read_write>(cgh);
auto out =
output_buf.template get_access<access::mode::discard_write>(cgh);
cgh.parallel_for<load_kernel<AtomicRef, address_space, T>>(
range<1>(N), [=](item<1> it) {
size_t gid = it.get_id(0);
auto atm = AtomicRef<T, memory_order::relaxed, memory_scope::device,
address_space>(ld[0]);
out[gid] = atm.load();
});
cgh.parallel_for(range<1>(N), [=](item<1> it) {
size_t gid = it.get_id(0);
auto atm = AtomicRef<T, memory_order::relaxed, scope, space>(ld[0]);
out[gid] = atm.load(order);
});
});
}

Expand All @@ -45,15 +82,105 @@ void load_test(queue q, size_t N) {
[&](T x) { return (x == initial); }));
}

// Dispatches the load() test to the local- and/or global-memory variants
// depending on the address space under test.
// NOTE(review): the first line below is the old (removed) signature retained
// by the diff view; the new templated signature follows it.
template <typename T> void load_test(queue q, size_t N) {
template <access::address_space space, typename T,
memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void load_test(queue q, size_t N) {
// Local variant runs for local_space, or for generic_space when the
// translation unit opts in via TEST_GENERIC_IN_LOCAL.
constexpr bool do_local_tests =
space == access::address_space::local_space ||
(space == access::address_space::generic_space && TEST_GENERIC_IN_LOCAL);
// Global variant runs for global_space, or for generic_space when
// TEST_GENERIC_IN_LOCAL is not set.
constexpr bool do_global_tests =
space == access::address_space::global_space ||
(space == access::address_space::generic_space && !TEST_GENERIC_IN_LOCAL);
// The ext::oneapi::atomic_ref path (selected by RUN_DEPRECATED) is not
// exercised for generic_space.
constexpr bool do_ext_tests = space != access::address_space::generic_space;
if constexpr (do_local_tests) {
#ifdef RUN_DEPRECATED
if constexpr (do_ext_tests) {
load_local_test<::sycl::ext::oneapi::atomic_ref, space, T, order, scope>(
q, N);
}
#else
load_local_test<::sycl::atomic_ref, space, T, order, scope>(q, N);
#endif
}
if constexpr (do_global_tests) {
#ifdef RUN_DEPRECATED
// NOTE(review): the next two lines are removed diff lines (the old
// recursive load_test call), kept verbatim from the diff view.
load_test<::sycl::ext::oneapi::atomic_ref,
access::address_space::global_space, T>(q, N);
if constexpr (do_ext_tests) {
load_global_test<::sycl::ext::oneapi::atomic_ref, space, T, order, scope>(
q, N);
}
#else
// NOTE(review): the first call below is a removed diff line; its
// replacement (load_global_test) follows it.
load_test<::sycl::atomic_ref, access::address_space::global_space, T>(q, N);
load_global_test<::sycl::atomic_ref, space, T, order, scope>(q, N);
#endif
}
}

// Runs load_test for every memory scope the device reports in
// atomic_memory_scope_capabilities; memory_scope::device is exercised
// unconditionally at the end.
template <access::address_space space, typename T,
          memory_order order = memory_order::relaxed>
void load_test_scopes(queue q, size_t N) {
  const std::vector<memory_scope> caps =
      q.get_device().get_info<info::device::atomic_memory_scope_capabilities>();
  // True when the device advertises support for scope `s`.
  auto supported = [&caps](memory_scope s) {
    return std::find(caps.begin(), caps.end(), s) != caps.end();
  };
  if (supported(memory_scope::system)) {
    load_test<space, T, order, memory_scope::system>(q, N);
  }
  if (supported(memory_scope::work_group)) {
    load_test<space, T, order, memory_scope::work_group>(q, N);
  }
  if (supported(memory_scope::sub_group)) {
    load_test<space, T, order, memory_scope::sub_group>(q, N);
  }
  load_test<space, T, order, memory_scope::device>(q, N);
}

// Runs load_test_scopes for each supported memory order: acquire first (only
// when the device advertises it), then relaxed unconditionally.
template <access::address_space space, typename T>
void load_test_orders_scopes(queue q, size_t N) {
  const auto orders =
      q.get_device().get_info<info::device::atomic_memory_order_capabilities>();
  const bool has_acquire =
      std::find(orders.begin(), orders.end(), memory_order::acquire) !=
      orders.end();
  if (has_acquire) {
    load_test_scopes<space, T, memory_order::acquire>(q, N);
  }
  load_test_scopes<space, T, memory_order::relaxed>(q, N);
}

template <typename T> void load_generic_test(queue q, size_t N) {
load_test<::sycl::atomic_ref, access::address_space::generic_space, T>(q, N);
// Runs the full matrix of load() tests (all supported orders x scopes) for a
// set of element types chosen by the FULL_ATOMIC64_COVERAGE /
// FULL_ATOMIC32_COVERAGE compile-time switches; float is always covered.
template <access::address_space space> void load_test_all() {
queue q;

constexpr int N = 32;
#ifdef FULL_ATOMIC64_COVERAGE
// NOTE(review): this early return also skips the float and 32-bit tests
// further below when the device lacks atomic64 -- confirm that is intended.
if (!q.get_device().has(aspect::atomic64)) {
std::cout << "Skipping atomic64 tests\n";
return;
}

load_test_orders_scopes<space, double>(q, N);
// 64-bit integral tests only when the type is actually 8 bytes wide.
if constexpr (sizeof(long) == 8) {
load_test_orders_scopes<space, long>(q, N);
load_test_orders_scopes<space, unsigned long>(q, N);
}
if constexpr (sizeof(long long) == 8) {
load_test_orders_scopes<space, long long>(q, N);
load_test_orders_scopes<space, unsigned long long>(q, N);
}
// Pointer tests only on 64-bit targets in this branch.
if constexpr (sizeof(char *) == 8) {
load_test_orders_scopes<space, char *>(q, N);
}
#endif
load_test_orders_scopes<space, float>(q, N);
#ifdef FULL_ATOMIC32_COVERAGE
load_test_orders_scopes<space, int>(q, N);
load_test_orders_scopes<space, unsigned int>(q, N);
// Include long/pointer tests only if they are 32 bits wide on this target.
if constexpr (sizeof(long) == 4) {
load_test_orders_scopes<space, long>(q, N);
load_test_orders_scopes<space, unsigned long>(q, N);
}
if constexpr (sizeof(char *) == 4) {
load_test_orders_scopes<space, char *>(q, N);
}
#endif

std::cout << "Test passed." << std::endl;
}
40 changes: 0 additions & 40 deletions SYCL/AtomicRef/load_atomic64.cpp

This file was deleted.

43 changes: 0 additions & 43 deletions SYCL/AtomicRef/load_atomic64_generic.cpp

This file was deleted.

30 changes: 4 additions & 26 deletions SYCL/AtomicRef/load_generic.cpp
Original file line number Diff line number Diff line change
@@ -1,34 +1,12 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70
// RUN: %HOST_RUN_PLACEHOLDER %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// CUDA backend has had no support for the generic address space yet
// The CUDA and HIP backends do not yet support the generic address space
// XFAIL: cuda || hip

#include "load.h"
#include <iostream>
using namespace sycl;

int main() {
queue q;

constexpr int N = 32;
load_generic_test<int>(q, N);
load_generic_test<unsigned int>(q, N);
load_generic_test<float>(q, N);

// Include long tests if they are 32 bits wide
if constexpr (sizeof(long) == 4) {
load_generic_test<long>(q, N);
load_generic_test<unsigned long>(q, N);
}

// Include pointer tests if they are 32 bits wide
if constexpr (sizeof(char *) == 4) {
load_generic_test<char *>(q, N);
}

std::cout << "Test passed." << std::endl;
}
// Entry point: run the whole load() test matrix against the generic
// address space.
int main() {
  load_test_all<access::address_space::generic_space>();
  return 0;
}
15 changes: 15 additions & 0 deletions SYCL/AtomicRef/load_generic_local.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70
// RUN: %HOST_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// The CUDA backend does not yet support the generic address space. Barriers
// are not supported on host.
// XFAIL: cuda, hip, host

#define TEST_GENERIC_IN_LOCAL 1

#include "load.h"

// Entry point: run the whole load() test matrix against the generic
// address space (with TEST_GENERIC_IN_LOCAL defined above).
int main() {
  load_test_all<access::address_space::generic_space>();
  return 0;
}
12 changes: 12 additions & 0 deletions SYCL/AtomicRef/load_local.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70
// RUN: %HOST_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// Barrier is not supported on host.
// XFAIL: host

#include "load.h"

// Entry point: run the whole load() test matrix against the local
// address space.
int main() {
  load_test_all<access::address_space::local_space>();
  return 0;
}
Loading