Closed
Description
I implemented a simple kernel that converts data between different data types. This kernel demonstrates reasonably good bandwidth utilization on an NVIDIA TITAN RTX GPU (250–550 GB/s, depending on the data types), which is similar to what clpeak reports.
However, when I run it on an MI200, bandwidth utilization is very poor (~45 GB/s), even though clpeak reports ~1100 GB/s.
I used the latest commit (599b1b9). OS: Linux.
Kernel (clickable)
#include <chrono>
#include <cstdint>
#include <iostream>
#include <CL/sycl.hpp>
#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
using sycl::ext::oneapi::experimental::bfloat16;
using namespace std::chrono;
using namespace std;
using namespace sycl;
// Tags for the data types exercised by the conversion benchmark.
enum class dt_t : int { bf16, f16, f32, s32, s8, u8 };

// Maps a dt_t tag to its element type, element size, and printable name.
template <dt_t dt>
struct dt_traits;

// One explicit specialization per supported data type.
// Note: `name` points at a string literal, so it must be `const char *`.
// The original `(decltype(name))n` cast stripped const from the literal,
// which invites UB on write and defeats const-correctness.
#define DECLARE_TRAIT(dt, t, n) \
    template <> \
    struct dt_traits<dt_t::dt> { \
        using type = t; \
        static constexpr size_t size = sizeof(t); \
        static constexpr const char *name = n; \
    };
DECLARE_TRAIT(bf16, bfloat16, "bf16")
DECLARE_TRAIT(f16, half, "f16")
DECLARE_TRAIT(f32, float, "f32")
DECLARE_TRAIT(s32, int32_t, "s32")
DECLARE_TRAIT(s8, int8_t, "s8")
DECLARE_TRAIT(u8, uint8_t, "u8")
#undef DECLARE_TRAIT
// Element-wise conversion kernel: dst[i] = static_cast<dst_type>(src[i]).
// src/dst are type-erased pointers to device-accessible memory.
template <dt_t sdt, dt_t ddt>
struct transform_kernel_t {
    using src_type = typename dt_traits<sdt>::type;
    using dst_type = typename dt_traits<ddt>::type;

    // nelems widened from int to size_t so that (a) the bounds check below
    // compares like-signed types and (b) buffers with more than 2^31
    // elements are supported. Callers passing int still work.
    transform_kernel_t(const void *src, void *dst, size_t nelems)
        : src(src), dst(dst), nelems(nelems) {}

    void operator()(nd_item<1> id) const {
        // Global linear index; the launch range may be padded past nelems
        // (see get_nd_range), hence the bounds check.
        const size_t gid = id.get_global_id(0);
        if (gid < nelems) {
            const auto *src_typed = reinterpret_cast<const src_type *>(src);
            auto *dst_typed = reinterpret_cast<dst_type *>(dst);
            dst_typed[gid] = static_cast<dst_type>(src_typed[gid]);
        }
    }

    const void *src;
    void *dst;
    size_t nelems;
};
// Allocates a USM shared buffer large enough for `nelems` elements of
// data type `dt`. Throws std::runtime_error for an unknown tag.
void *allocate_buffer(dt_t dt, int nelems, queue q) {
    size_t elem_size = 0;
    switch (dt) {
        case dt_t::f32: elem_size = dt_traits<dt_t::f32>::size; break;
        case dt_t::f16: elem_size = dt_traits<dt_t::f16>::size; break;
        case dt_t::bf16: elem_size = dt_traits<dt_t::bf16>::size; break;
        case dt_t::s32: elem_size = dt_traits<dt_t::s32>::size; break;
        case dt_t::s8: elem_size = dt_traits<dt_t::s8>::size; break;
        case dt_t::u8: elem_size = dt_traits<dt_t::u8>::size; break;
        default: throw std::runtime_error("unexpected dt");
    }
    return malloc_shared(nelems * elem_size, q);
}
// Fills `ptr`, viewed as `nelems` elements of type `dt`, with the
// alternating pattern 4, 9, 4, 9, ... (checked later by check_buffer).
void init_buffer(void *ptr, dt_t dt, int nelems) {
    // Matches the original's behavior: with an empty buffer nothing is
    // written and no dt validation happens.
    if (nelems <= 0) return;
    // Generic filler: the type dispatch happens once, the loop is shared.
    auto fill = [nelems](auto *typed) {
        for (int i = 0; i < nelems; i++)
            typed[i] = (i % 2 == 0) ? 4 : 9;
    };
    switch (dt) {
        case dt_t::f32: fill(reinterpret_cast<dt_traits<dt_t::f32>::type *>(ptr)); break;
        case dt_t::f16: fill(reinterpret_cast<dt_traits<dt_t::f16>::type *>(ptr)); break;
        case dt_t::bf16: fill(reinterpret_cast<dt_traits<dt_t::bf16>::type *>(ptr)); break;
        case dt_t::s32: fill(reinterpret_cast<dt_traits<dt_t::s32>::type *>(ptr)); break;
        case dt_t::s8: fill(reinterpret_cast<dt_traits<dt_t::s8>::type *>(ptr)); break;
        case dt_t::u8: fill(reinterpret_cast<dt_traits<dt_t::u8>::type *>(ptr)); break;
        default: throw std::runtime_error("unexpected dt");
    }
}
// Verifies that `ptr`, viewed as `nelems` elements of type `dt`, still
// holds the 4, 9, 4, 9, ... pattern written by init_buffer. Throws
// std::runtime_error on the first mismatch; prints on success.
void check_buffer(void *ptr, dt_t dt, int nelems) {
    // Generic checker: type dispatch once, shared comparison loop.
    auto verify = [nelems](auto *typed) {
        for (int i = 0; i < nelems; i++) {
            const auto expected = (i % 2 == 0) ? 4 : 9;
            if (typed[i] != expected)
                throw std::runtime_error("result mismatch");
        }
    };
    // With an empty buffer the original performed no checks (and no dt
    // validation) and still reported success; keep that behavior.
    if (nelems > 0) {
        switch (dt) {
            case dt_t::f32: verify(reinterpret_cast<dt_traits<dt_t::f32>::type *>(ptr)); break;
            case dt_t::f16: verify(reinterpret_cast<dt_traits<dt_t::f16>::type *>(ptr)); break;
            case dt_t::bf16: verify(reinterpret_cast<dt_traits<dt_t::bf16>::type *>(ptr)); break;
            case dt_t::s32: verify(reinterpret_cast<dt_traits<dt_t::s32>::type *>(ptr)); break;
            case dt_t::s8: verify(reinterpret_cast<dt_traits<dt_t::s8>::type *>(ptr)); break;
            case dt_t::u8: verify(reinterpret_cast<dt_traits<dt_t::u8>::type *>(ptr)); break;
            default: throw std::runtime_error("unexpected dt");
        }
    }
    printf("Test passed\n");
}
// Chooses a 1-D ND-range covering `nelems` work-items: the local size is
// the device's maximum work-group size (capped by nelems), and the global
// size is nelems rounded up to a whole number of work-groups. Out-of-range
// work-items are masked off by the kernel's bounds check.
sycl::nd_range<1> get_nd_range(const device &dev, int nelems) {
    const size_t max_wg_size
            = dev.get_info<sycl::info::device::max_work_group_size>();
    const size_t max_work_item
            = dev.get_info<sycl::info::device::max_work_item_sizes>()[0];
    const size_t optimal_ls = std::min(max_wg_size, max_work_item);
    const size_t ls = std::min((size_t)nelems, optimal_ls);
    // Round up to a multiple of ls.
    const size_t gs = (nelems + ls - 1) / ls * ls;
    // %zu is the portable printf conversion for size_t; the original %lu
    // is wrong on platforms where size_t is not unsigned long (e.g. LLP64).
    printf("ls:%zu, gs:%zu\n", ls, gs);
    return {{gs}, {ls}};
}
int main() {
    constexpr int64_t nelems = 1024 * 1024 * 256;
    constexpr int64_t niters = 10;
    auto dev = device(gpu_selector_v);
    auto q = queue(dev);
// Runs one src-type -> dst-type conversion case: allocate USM buffers,
// initialize the source, one warm-up launch, `niters` timed dependent
// launches, result verification, and a bandwidth report (bytes read +
// bytes written, divided by the average per-iteration time).
#define SUBMIT_CASE(sdt, ddt) \
    { \
        printf("%s -> %s\n", dt_traits<dt_t::sdt>::name, \
                dt_traits<dt_t::ddt>::name); \
        void *src = allocate_buffer(dt_t::sdt, nelems, q); \
        void *dst = allocate_buffer(dt_t::ddt, nelems, q); \
        init_buffer(src, dt_t::sdt, nelems); \
        const auto nd_range = get_nd_range(dev, nelems); \
        /* Warm-up run (not timed). */ \
        auto e = q.submit([&](handler &cgh) { \
            transform_kernel_t<dt_t::sdt, dt_t::ddt> tk(src, dst, nelems); \
            cgh.parallel_for(nd_range, tk); \
        }); \
        q.wait_and_throw(); \
        \
        auto start = high_resolution_clock::now(); \
        for (int i = 0; i < niters; i++) { \
            /* Chain on the previous event so iterations serialize. */ \
            e = q.submit([&](handler &cgh) { \
                cgh.depends_on({e}); \
                transform_kernel_t<dt_t::sdt, dt_t::ddt> tk(src, dst, nelems); \
                cgh.parallel_for(nd_range, tk); \
            }); \
        } \
        q.wait_and_throw(); \
        auto end = high_resolution_clock::now(); \
        \
        try { \
            check_buffer(dst, dt_t::ddt, nelems); \
        } catch (std::exception &ex) { \
            /* Renamed from `e`, which shadowed the SYCL event above. */ \
            std::cout << ex.what() << std::endl; \
            /* Release USM buffers on the failure path too (was leaked). */ \
            free(src, q); \
            free(dst, q); \
            return 1; \
        } \
        /* Average time per iteration, in seconds. Cast before dividing: */ \
        /* the original count() / niters was integer division, which */ \
        /* truncated the per-iteration time to whole microseconds. */ \
        double time = static_cast<double>( \
                              duration_cast<microseconds>(end - start).count()) \
                / niters / 1e6; \
        /* Bytes moved per iteration (src read + dst written), in GB. */ \
        double size = ((dt_traits<dt_t::sdt>::size * nelems) \
                              + (dt_traits<dt_t::ddt>::size * nelems)) \
                / 1e9; \
        printf("size(gb):%.2f, time(sec):%f, BW:%f\n", size, time, \
                size / time); \
        free(src, q); \
        free(dst, q); \
    }
    //-----------------------
    SUBMIT_CASE(f16, f16)
    SUBMIT_CASE(f16, f32)
    SUBMIT_CASE(f16, s32)
    SUBMIT_CASE(f16, s8)
    SUBMIT_CASE(f16, u8)
    //-----------------------
    SUBMIT_CASE(f32, f32)
    SUBMIT_CASE(f32, f16)
    SUBMIT_CASE(f32, s32)
    SUBMIT_CASE(f32, s8)
    SUBMIT_CASE(f32, u8)
    //-----------------------
    SUBMIT_CASE(s32, s32)
    SUBMIT_CASE(s32, f32)
    SUBMIT_CASE(s32, f16)
    SUBMIT_CASE(s32, s8)
    SUBMIT_CASE(s32, u8)
    //-----------------------
    SUBMIT_CASE(s8, s8)
    SUBMIT_CASE(s8, f32)
    SUBMIT_CASE(s8, f16)
    SUBMIT_CASE(s8, s32)
    SUBMIT_CASE(s8, u8)
    //-----------------------
    SUBMIT_CASE(u8, u8)
    SUBMIT_CASE(u8, f32)
    SUBMIT_CASE(u8, f16)
    SUBMIT_CASE(u8, s32)
    SUBMIT_CASE(u8, s8)
    //-----------------------
#undef SUBMIT_CASE
    return 0;
}