
[HIP][Perf] Simple data conversion kernel is significantly slower on AMD #7195

Closed
@densamoilov

Description


I implemented a simple kernel that converts data between different data types. On an NVIDIA RTX TITAN GPU this kernel shows reasonably good bandwidth utilization (250-550 GB/s depending on the data types), which is in line with what clpeak reports.
However, when I run it on MI200 the bandwidth utilization is very poor (~45 GB/s), even though clpeak reports ~1100 GB/s.
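To put those figures in perspective: the f32 -> f16 case, for example, moves (4 + 2) bytes per element, i.e. roughly 1.6 GB per launch for 256M elements, so ~45 GB/s corresponds to about 36 ms per kernel, whereas ~1100 GB/s would be about 1.5 ms.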

I used the latest commit: 599b1b9. OS: Linux.
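For reference, a typical intel/llvm HIP-backend build line for this reproducer would be roughly the following (a sketch: the gfx90a offload arch is an assumption for MI200, and the source file name is just a placeholder):

    clang++ -fsycl -fsycl-targets=amdgcn-amd-amdhsa \
        -Xsycl-target-backend --offload-arch=gfx90a \
        convert.cpp -o convert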

Kernel (reproducer):
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <CL/sycl.hpp>
#include <sycl/ext/oneapi/experimental/bfloat16.hpp>

using sycl::ext::oneapi::experimental::bfloat16;

using namespace std::chrono;

using namespace std;
using namespace sycl;

enum class dt_t : int { bf16, f16, f32, s32, s8, u8 };

template <dt_t dt>
struct dt_traits;

#define DECLARE_TRAIT(dt, t, n) \
    template <> \
    struct dt_traits<dt_t::dt> { \
        using type = t; \
        static constexpr size_t size = sizeof(t); \
        static constexpr const char *name = n; \
    };

DECLARE_TRAIT(bf16, bfloat16, "bf16")
DECLARE_TRAIT(f16, half, "f16")
DECLARE_TRAIT(f32, float, "f32")
DECLARE_TRAIT(s32, int32_t, "s32")
DECLARE_TRAIT(s8, int8_t, "s8")
DECLARE_TRAIT(u8, uint8_t, "u8")

#undef DECLARE_TRAIT

template <dt_t sdt, dt_t ddt>
struct transform_kernel_t {
    using src_type = typename dt_traits<sdt>::type;
    using dst_type = typename dt_traits<ddt>::type;

    transform_kernel_t(const void *src, void *dst, int nelems)
        : src(src), dst(dst), nelems(nelems) {}

    void operator()(nd_item<1> id) const {
        const auto global_id = id.get_global_id(0);
        if (global_id < static_cast<size_t>(nelems)) {
            const auto *src_typed = reinterpret_cast<const src_type *>(src);
            auto *dst_typed = reinterpret_cast<dst_type *>(dst);
            dst_typed[global_id] = static_cast<dst_type>(src_typed[global_id]);
        }
    }

    const void *src;
    void *dst;
    int nelems;
};

void *allocate_buffer(dt_t dt, int nelems, queue q) {
#define CASE(dt) \
    case dt_t::dt: \
        return malloc_shared(nelems * sizeof(dt_traits<dt_t::dt>::type), q);

    switch (dt) {
        CASE(f32)
        CASE(f16)
        CASE(bf16)
        CASE(s32)
        CASE(s8)
        CASE(u8)
        default: throw std::runtime_error("unexpected dt");
    }
#undef CASE
}

void init_buffer(void *ptr, dt_t dt, int nelems) {
#define CASE(dt) \
    case dt_t::dt: \
        reinterpret_cast<dt_traits<dt_t::dt>::type *>(ptr)[i] \
                = (i % 2 == 0) ? 4 : 9; \
        break;

    for (int i = 0; i < nelems; i++) {
        switch (dt) {
            CASE(f32)
            CASE(f16)
            CASE(bf16)
            CASE(s32)
            CASE(s8)
            CASE(u8)
            default: throw std::runtime_error("unexpected dt");
        }
    }
#undef CASE
}

void check_buffer(void *ptr, dt_t dt, int nelems) {
#define CASE(dt) \
    case dt_t::dt: { \
        auto *p = reinterpret_cast<dt_traits<dt_t::dt>::type *>(ptr); \
        const auto exp = (i % 2 == 0) ? 4 : 9; \
        if (p[i] != exp) throw std::runtime_error("result mismatch"); \
        break; \
    }

    for (int i = 0; i < nelems; i++) {
        switch (dt) {
            CASE(f32)
            CASE(f16)
            CASE(bf16)
            CASE(s32)
            CASE(s8)
            CASE(u8)
            default: throw std::runtime_error("unexpected dt");
        }
    }
    printf("Test passed\n");
#undef CASE
}

sycl::nd_range<1> get_nd_range(const device &dev, int nelems) {
    const size_t max_wg_size
            = dev.get_info<sycl::info::device::max_work_group_size>();
    const size_t max_work_item
            = dev.get_info<sycl::info::device::max_work_item_sizes>()[0];
    const size_t optimal_ls = std::min(max_wg_size, max_work_item);

    const size_t ls = std::min((size_t)nelems, optimal_ls);
    const size_t gs = nelems % ls ? (nelems / ls + 1) * ls : nelems;
printf("ls:%lu, gs:%lu\n", ls, gs);
    return {{gs}, {ls}};
}

int main() {
    constexpr int64_t nelems = 1024 * 1024 * 256;
    constexpr int64_t niters = 10;

    auto dev = device(gpu_selector_v);
    auto q = queue(dev);


#define SUBMIT_CASE(sdt, ddt) \
    { \
        printf("%s -> %s\n", dt_traits<dt_t::sdt>::name, \
                dt_traits<dt_t::ddt>::name); \
        void *src = allocate_buffer(dt_t::sdt, nelems, q); \
        void *dst = allocate_buffer(dt_t::ddt, nelems, q); \
        init_buffer(src, dt_t::sdt, nelems); \
        const auto nd_range = get_nd_range(dev, nelems); \
        /* Warm-up run */ \
        auto e = q.submit([&](handler &cgh) { \
            transform_kernel_t<dt_t::sdt, dt_t::ddt> tk(src, dst, nelems); \
            cgh.parallel_for(nd_range, tk); \
        }); \
        q.wait_and_throw(); \
\
        auto start = high_resolution_clock::now(); \
        for (int i = 0; i < niters; i++) { \
            e = q.submit([&](handler &cgh) { \
                cgh.depends_on({e}); \
                transform_kernel_t<dt_t::sdt, dt_t::ddt> tk(src, dst, nelems); \
                cgh.parallel_for(nd_range, tk); \
            }); \
        } \
        q.wait_and_throw(); \
        auto end = high_resolution_clock::now(); \
\
        try { \
            check_buffer(dst, dt_t::ddt, nelems); \
        } catch (std::exception & e) { \
            std::cout << e.what() << std::endl; \
            return 1; \
        } \
        /* Average time per iteration in seconds */ \
        double time = duration_cast<microseconds>(end - start).count() \
                / (niters * 1e6); \
        /* Bytes read plus bytes written, in GB */ \
        double size = ((dt_traits<dt_t::sdt>::size * nelems) \
                             + (dt_traits<dt_t::ddt>::size * nelems)) \
                / 1e9; \
        printf("size(GB):%.2f, time(sec):%f, BW(GB/s):%f\n", size, time, \
                size / time); \
        free(src, q); \
        free(dst, q); \
    }

    //-----------------------
    SUBMIT_CASE(f16, f16)
    SUBMIT_CASE(f16, f32)
    SUBMIT_CASE(f16, s32)
    SUBMIT_CASE(f16, s8)
    SUBMIT_CASE(f16, u8)
    //-----------------------
    SUBMIT_CASE(f32, f32)
    SUBMIT_CASE(f32, f16)
    SUBMIT_CASE(f32, s32)
    SUBMIT_CASE(f32, s8)
    SUBMIT_CASE(f32, u8)
    //-----------------------
    SUBMIT_CASE(s32, s32)
    SUBMIT_CASE(s32, f32)
    SUBMIT_CASE(s32, f16)
    SUBMIT_CASE(s32, s8)
    SUBMIT_CASE(s32, u8)
    //-----------------------
    SUBMIT_CASE(s8, s8)
    SUBMIT_CASE(s8, f32)
    SUBMIT_CASE(s8, f16)
    SUBMIT_CASE(s8, s32)
    SUBMIT_CASE(s8, u8)
    //-----------------------
    SUBMIT_CASE(u8, u8)
    SUBMIT_CASE(u8, f32)
    SUBMIT_CASE(u8, f16)
    SUBMIT_CASE(u8, s32)
    SUBMIT_CASE(u8, s8)
    //-----------------------

#undef SUBMIT_CASE
    return 0;
}
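It might also help to cross-check with SYCL event profiling, which reports pure device execution time and excludes host-side submission overhead. A minimal sketch on top of the reproducer above (the helper name and the fixed f32 -> f16 case are just for illustration; the queue must be constructed with property::queue::enable_profiling{}):

// Sketch: measure device execution time of one conversion kernel via event
// profiling, to separate kernel time from submission overhead.
// Reuses transform_kernel_t, dt_t and get_nd_range from the reproducer above.
double profile_f32_to_f16(queue &pq, const void *src, void *dst, int nelems) {
    // pq is assumed to be created with property::queue::enable_profiling{}.
    const auto nd_range = get_nd_range(pq.get_device(), nelems);
    auto ev = pq.submit([&](handler &cgh) {
        transform_kernel_t<dt_t::f32, dt_t::f16> tk(src, dst, nelems);
        cgh.parallel_for(nd_range, tk);
    });
    ev.wait();
    const auto t0 = ev.get_profiling_info<info::event_profiling::command_start>();
    const auto t1 = ev.get_profiling_info<info::event_profiling::command_end>();
    return (t1 - t0) / 1e6; // timestamps are in nanoseconds -> milliseconds
}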
