Description
Describe the bug
The performance of a joint_matrix multiplication is around 10 TOPS with int8 inputs on a device with a theoretical peak of 383 TOPS. The code snippet below targets the AMD GPU specified, but similar behaviour was observed on an NVIDIA RTX 3080.
Is this underperformance expected behaviour, or is it due to sub-optimal logic for computing the tiles? Are there examples in the documentation that extract higher performance?
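For context, the 10 TOPS figure is presumably total operations divided by kernel wall time (an assumption about how the number was measured); the arithmetic for the matrix size used in the reproducer is:

ops per GEMM = 2 * SQUARE^3 = 2 * 32768^3 ≈ 7.04e13
at ~10 TOPS  -> ~7.0 s of kernel time
at 383 TOPS  -> ~0.18 s of kernel time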
To reproduce
CODE:
#include <sycl/sycl.hpp>
#include <cstdint>
using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;

#define SQUARE 32768
constexpr size_t sizeSquare = SQUARE; // assumption: sizeSquare equals SQUARE in the full source
constexpr int N_THREADS_PER_MATRIX_OP = 64; // one sub-group of 64 work-items per matrix op on gfx90a

queue Q;

// USM device allocations for C = A * B with int8 inputs and int32 accumulation
int32_t *out = malloc_device<int32_t>(sizeSquare * sizeSquare, Q);
int8_t *A = malloc_device<int8_t>(sizeSquare * sizeSquare, Q);
int8_t *B = malloc_device<int8_t>(sizeSquare * sizeSquare, Q);

// Decorate the raw USM pointers as global-address-space multi_ptrs for joint_matrix_load/store
auto Acast = address_space_cast<sycl::access::address_space::global_space,
                                sycl::access::decorated::no>(A);
auto Bcast = address_space_cast<sycl::access::address_space::global_space,
                                sycl::access::decorated::no>(B);
auto outCAst = address_space_cast<sycl::access::address_space::global_space,
                                  sycl::access::decorated::no>(out);

// Fixed 16x16x16 joint_matrix tile shape; the big matrices are split into Sub_Tiles_* tiles
static constexpr size_t M = 16, N = 16, K = 16;
int Sub_Tiles_M = SQUARE / M;
int Sub_Tiles_N = SQUARE / N;
int Sub_Tiles_K = SQUARE / K;
size_t Big_M = M * Sub_Tiles_M, Big_N = N * Sub_Tiles_N, Big_K = K * Sub_Tiles_K;
{
  Q.submit([&](handler &cgh) {
    // One work-group per output tile: a single sub-group of 64 work-items
    // cooperatively computes one MxN tile of C.
    range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP};
    range<2> GlobalRange = {static_cast<size_t>(Sub_Tiles_M),
                            static_cast<size_t>(Sub_Tiles_N * N_THREADS_PER_MATRIX_OP)};

    cgh.parallel_for(
        nd_range<2>(GlobalRange, LocalRange), [=](nd_item<2> item) {
          sycl::sub_group sg = item.get_sub_group();
          // row id of current submatrix of BIG C matrix
          const auto m = item.get_group().get_group_id()[0];
          // column id of current submatrix of BIG C matrix
          const auto n = item.get_group().get_group_id()[1];

          joint_matrix<sycl::sub_group, int8_t, use::a, M, K, layout::col_major> sub_a;
          joint_matrix<sycl::sub_group, int8_t, use::b, K, N, layout::row_major> sub_b;
          joint_matrix<sycl::sub_group, int32_t, use::accumulator, M, N> sub_c;
          joint_matrix_fill(sg, sub_c, 0);

          // k = row/col id of current submatrix of BIG A/B matrices
          for (int k = 0; k < Sub_Tiles_K; k++) {
            joint_matrix_load(sg, sub_a,
                              Acast + (k * K) + (m * M * Big_K), Big_K);
            joint_matrix_load(sg, sub_b,
                              Bcast + (k * K * Big_N) + (n * N), Big_N);
            joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
          }

          joint_matrix_store(sg, sub_c,
                             outCAst + (m * M) * Big_N + n * N,
                             Big_N, layout::row_major);
        });
  });
  Q.wait();
}
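For completeness, the throughput number is obtained roughly as follows (a minimal sketch, not part of the original source: it assumes host wall-clock timing around the blocking Q.wait() and counts 2 * SQUARE^3 operations per GEMM):
#include <chrono>
#include <iostream>

auto t0 = std::chrono::steady_clock::now();
// ... the Q.submit(...) / Q.wait() block shown above ...
auto t1 = std::chrono::steady_clock::now();

double seconds = std::chrono::duration<double>(t1 - t0).count();
double tops = 2.0 * double(SQUARE) * double(SQUARE) * double(SQUARE) / seconds / 1e12;
std::cout << "elapsed " << seconds << " s, " << tops << " TOPS\n";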
COMPILE:
icpx -fsycl -O2 -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx90a -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -o ver_amd_test.out joint_matrix_hip_gfx90a.cpp
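For the NVIDIA 3080 comparison mentioned in the description, the equivalent build would presumably be the following (an assumption: it needs the oneAPI plugin for NVIDIA GPUs, sm_86 is the RTX 3080 architecture, and the source file name is only illustrative); the run command is the same except for ONEAPI_DEVICE_SELECTOR="cuda:*":
icpx -fsycl -O2 -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --offload-arch=sm_86 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -o ver_nvidia_test.out joint_matrix_cuda_sm86.cpp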
RUN:
ONEAPI_DEVICE_SELECTOR="hip:*" SYCL_PI_TRACE=1 ./ver_amd_test.out
The question is whether this significant underperformance is expected behaviour or user error.
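In case it helps rule out an unsupported tile shape, the combinations the backend reports can be listed with the runtime query from the matrix extension (a sketch; the matrix_combinations descriptor and the combination field names follow the sycl_ext_oneapi_matrix specification and may differ between compiler versions):
#include <sycl/sycl.hpp>
#include <iostream>

namespace syclex = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue Q;
  auto combinations =
      Q.get_device().get_info<syclex::info::device::matrix_combinations>();
  for (const auto &c : combinations)
    std::cout << "M=" << c.msize << " N=" << c.nsize << " K=" << c.ksize << "\n";
  return 0;
}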
Environment
- OS: SUSE Linux Enterprise Server 15 SP4
- GPU: AMD Instinct MI250X
- DPC++ version: Intel(R) oneAPI DPC++/C++ Compiler 2024.2.0 (2024.2.0.20240602)
Additional context
No response