Closed
Description
I implemented a simple kernel that converts data between different data types. This kernel demonstrates reasonably good bandwidth utilization on an NVIDIA TITAN RTX GPU (250–550 GB/s, depending on the data types), which is similar to what clpeak reports.
However, when I run it on an MI200, bandwidth utilization is very poor (~45 GB/s), even though clpeak reports ~1100 GB/s.
I used the latest commit (599b1b9). OS: Linux.
Kernel (clickable)
#include <chrono>
#include <cstdint>
#include <iostream>
#include <CL/sycl.hpp>
#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
using sycl::ext::oneapi::experimental::bfloat16;
using namespace std::chrono;
using namespace std;
using namespace sycl;
// Tags for the data types exercised by the conversion benchmark.
enum class dt_t : int { bf16, f16, f32, s32, s8, u8 };

// Maps a dt_t tag to its element type, element size, and printable name.
template <dt_t dt>
struct dt_traits;

// One explicit specialization per supported data type.
// Note: `name` points at a string literal, so it must be `const char *`.
// The original `(decltype(name))n` cast stripped const from the literal,
// which invites UB on write and defeats const-correctness.
#define DECLARE_TRAIT(dt, t, n) \
    template <> \
    struct dt_traits<dt_t::dt> { \
        using type = t; \
        static constexpr size_t size = sizeof(t); \
        static constexpr const char *name = n; \
    };
DECLARE_TRAIT(bf16, bfloat16, "bf16")
DECLARE_TRAIT(f16, half, "f16")
DECLARE_TRAIT(f32, float, "f32")
DECLARE_TRAIT(s32, int32_t, "s32")
DECLARE_TRAIT(s8, int8_t, "s8")
DECLARE_TRAIT(u8, uint8_t, "u8")
#undef DECLARE_TRAIT
// Element-wise conversion kernel: dst[i] = static_cast<dst_type>(src[i]).
// src/dst are type-erased pointers to device-accessible memory.
template <dt_t sdt, dt_t ddt>
struct transform_kernel_t {
    using src_type = typename dt_traits<sdt>::type;
    using dst_type = typename dt_traits<ddt>::type;

    // nelems widened from int to size_t so that (a) the bounds check below
    // compares like-signed types and (b) buffers with more than 2^31
    // elements are supported. Callers passing int still work.
    transform_kernel_t(const void *src, void *dst, size_t nelems)
        : src(src), dst(dst), nelems(nelems) {}

    void operator()(nd_item<1> id) const {
        // Global linear index; the launch range may be padded past nelems
        // (see get_nd_range), hence the bounds check.
        const size_t gid = id.get_global_id(0);
        if (gid < nelems) {
            const auto *src_typed = reinterpret_cast<const src_type *>(src);
            auto *dst_typed = reinterpret_cast<dst_type *>(dst);
            dst_typed[gid] = static_cast<dst_type>(src_typed[gid]);
        }
    }

    const void *src;
    void *dst;
    size_t nelems;
};
// Allocates a USM shared buffer large enough for `nelems` elements of
// data type `dt`. Throws std::runtime_error for an unknown tag.
void *allocate_buffer(dt_t dt, int nelems, queue q) {
    size_t elem_size = 0;
    switch (dt) {
        case dt_t::f32: elem_size = dt_traits<dt_t::f32>::size; break;
        case dt_t::f16: elem_size = dt_traits<dt_t::f16>::size; break;
        case dt_t::bf16: elem_size = dt_traits<dt_t::bf16>::size; break;
        case dt_t::s32: elem_size = dt_traits<dt_t::s32>::size; break;
        case dt_t::s8: elem_size = dt_traits<dt_t::s8>::size; break;
        case dt_t::u8: elem_size = dt_traits<dt_t::u8>::size; break;
        default: throw std::runtime_error("unexpected dt");
    }
    return malloc_shared(nelems * elem_size, q);
}
// Fills `ptr`, viewed as `nelems` elements of type `dt`, with the
// alternating pattern 4, 9, 4, 9, ... (checked later by check_buffer).
void init_buffer(void *ptr, dt_t dt, int nelems) {
    // Matches the original's behavior: with an empty buffer nothing is
    // written and no dt validation happens.
    if (nelems <= 0) return;
    // Generic filler: the type dispatch happens once, the loop is shared.
    auto fill = [nelems](auto *typed) {
        for (int i = 0; i < nelems; i++)
            typed[i] = (i % 2 == 0) ? 4 : 9;
    };
    switch (dt) {
        case dt_t::f32: fill(reinterpret_cast<dt_traits<dt_t::f32>::type *>(ptr)); break;
        case dt_t::f16: fill(reinterpret_cast<dt_traits<dt_t::f16>::type *>(ptr)); break;
        case dt_t::bf16: fill(reinterpret_cast<dt_traits<dt_t::bf16>::type *>(ptr)); break;
        case dt_t::s32: fill(reinterpret_cast<dt_traits<dt_t::s32>::type *>(ptr)); break;
        case dt_t::s8: fill(reinterpret_cast<dt_traits<dt_t::s8>::type *>(ptr)); break;
        case dt_t::u8: fill(reinterpret_cast<dt_traits<dt_t::u8>::type *>(ptr)); break;
        default: throw std::runtime_error("unexpected dt");
    }
}
// Verifies that `ptr`, viewed as `nelems` elements of type `dt`, still
// holds the 4, 9, 4, 9, ... pattern written by init_buffer. Throws
// std::runtime_error on the first mismatch; prints on success.
void check_buffer(void *ptr, dt_t dt, int nelems) {
    // Generic checker: type dispatch once, shared comparison loop.
    auto verify = [nelems](auto *typed) {
        for (int i = 0; i < nelems; i++) {
            const auto expected = (i % 2 == 0) ? 4 : 9;
            if (typed[i] != expected)
                throw std::runtime_error("result mismatch");
        }
    };
    // With an empty buffer the original performed no checks (and no dt
    // validation) and still reported success; keep that behavior.
    if (nelems > 0) {
        switch (dt) {
            case dt_t::f32: verify(reinterpret_cast<dt_traits<dt_t::f32>::type *>(ptr)); break;
            case dt_t::f16: verify(reinterpret_cast<dt_traits<dt_t::f16>::type *>(ptr)); break;
            case dt_t::bf16: verify(reinterpret_cast<dt_traits<dt_t::bf16>::type *>(ptr)); break;
            case dt_t::s32: verify(reinterpret_cast<dt_traits<dt_t::s32>::type *>(ptr)); break;
            case dt_t::s8: verify(reinterpret_cast<dt_traits<dt_t::s8>::type *>(ptr)); break;
            case dt_t::u8: verify(reinterpret_cast<dt_traits<dt_t::u8>::type *>(ptr)); break;
            default: throw std::runtime_error("unexpected dt");
        }
    }
    printf("Test passed\n");
}
// Chooses a 1-D ND-range covering `nelems` work-items: the local size is
// the device's maximum work-group size (capped by nelems), and the global
// size is nelems rounded up to a whole number of work-groups. Out-of-range
// work-items are masked off by the kernel's bounds check.
sycl::nd_range<1> get_nd_range(const device &dev, int nelems) {
    const size_t max_wg_size
            = dev.get_info<sycl::info::device::max_work_group_size>();
    const size_t max_work_item
            = dev.get_info<sycl::info::device::max_work_item_sizes>()[0];
    const size_t optimal_ls = std::min(max_wg_size, max_work_item);
    const size_t ls = std::min((size_t)nelems, optimal_ls);
    // Round up to a multiple of ls.
    const size_t gs = (nelems + ls - 1) / ls * ls;
    // %zu is the portable printf conversion for size_t; the original %lu
    // is wrong on platforms where size_t is not unsigned long (e.g. LLP64).
    printf("ls:%zu, gs:%zu\n", ls, gs);
    return {{gs}, {ls}};
}
int main() {
    constexpr int64_t nelems = 1024 * 1024 * 256;
    constexpr int64_t niters = 10;
    auto dev = device(gpu_selector_v);
    auto q = queue(dev);
// Runs one src-type -> dst-type conversion case: allocate USM buffers,
// initialize the source, one warm-up launch, `niters` timed dependent
// launches, result verification, and a bandwidth report (bytes read +
// bytes written, divided by the average per-iteration time).
#define SUBMIT_CASE(sdt, ddt) \
    { \
        printf("%s -> %s\n", dt_traits<dt_t::sdt>::name, \
                dt_traits<dt_t::ddt>::name); \
        void *src = allocate_buffer(dt_t::sdt, nelems, q); \
        void *dst = allocate_buffer(dt_t::ddt, nelems, q); \
        init_buffer(src, dt_t::sdt, nelems); \
        const auto nd_range = get_nd_range(dev, nelems); \
        /* Warm-up run (not timed). */ \
        auto e = q.submit([&](handler &cgh) { \
            transform_kernel_t<dt_t::sdt, dt_t::ddt> tk(src, dst, nelems); \
            cgh.parallel_for(nd_range, tk); \
        }); \
        q.wait_and_throw(); \
        \
        auto start = high_resolution_clock::now(); \
        for (int i = 0; i < niters; i++) { \
            /* Chain on the previous event so iterations serialize. */ \
            e = q.submit([&](handler &cgh) { \
                cgh.depends_on({e}); \
                transform_kernel_t<dt_t::sdt, dt_t::ddt> tk(src, dst, nelems); \
                cgh.parallel_for(nd_range, tk); \
            }); \
        } \
        q.wait_and_throw(); \
        auto end = high_resolution_clock::now(); \
        \
        try { \
            check_buffer(dst, dt_t::ddt, nelems); \
        } catch (std::exception &ex) { \
            /* Renamed from `e`, which shadowed the SYCL event above. */ \
            std::cout << ex.what() << std::endl; \
            /* Release USM buffers on the failure path too (was leaked). */ \
            free(src, q); \
            free(dst, q); \
            return 1; \
        } \
        /* Average time per iteration, in seconds. Cast before dividing: */ \
        /* the original count() / niters was integer division, which */ \
        /* truncated the per-iteration time to whole microseconds. */ \
        double time = static_cast<double>( \
                              duration_cast<microseconds>(end - start).count()) \
                / niters / 1e6; \
        /* Bytes moved per iteration (src read + dst written), in GB. */ \
        double size = ((dt_traits<dt_t::sdt>::size * nelems) \
                              + (dt_traits<dt_t::ddt>::size * nelems)) \
                / 1e9; \
        printf("size(gb):%.2f, time(sec):%f, BW:%f\n", size, time, \
                size / time); \
        free(src, q); \
        free(dst, q); \
    }
    //-----------------------
    SUBMIT_CASE(f16, f16)
    SUBMIT_CASE(f16, f32)
    SUBMIT_CASE(f16, s32)
    SUBMIT_CASE(f16, s8)
    SUBMIT_CASE(f16, u8)
    //-----------------------
    SUBMIT_CASE(f32, f32)
    SUBMIT_CASE(f32, f16)
    SUBMIT_CASE(f32, s32)
    SUBMIT_CASE(f32, s8)
    SUBMIT_CASE(f32, u8)
    //-----------------------
    SUBMIT_CASE(s32, s32)
    SUBMIT_CASE(s32, f32)
    SUBMIT_CASE(s32, f16)
    SUBMIT_CASE(s32, s8)
    SUBMIT_CASE(s32, u8)
    //-----------------------
    SUBMIT_CASE(s8, s8)
    SUBMIT_CASE(s8, f32)
    SUBMIT_CASE(s8, f16)
    SUBMIT_CASE(s8, s32)
    SUBMIT_CASE(s8, u8)
    //-----------------------
    SUBMIT_CASE(u8, u8)
    SUBMIT_CASE(u8, f32)
    SUBMIT_CASE(u8, f16)
    SUBMIT_CASE(u8, s32)
    SUBMIT_CASE(u8, s8)
    //-----------------------
#undef SUBMIT_CASE
    return 0;
}