
Significant underperformance in joint_matrix multiplication #14683

Open

Description

@Hitman4Reason

Describe the bug

joint_matrix multiplication achieves around 10 TOPS with int8 on a device with a theoretical peak of 383 TOPS. The code snippet below targets the AMD GPU specified, but similar behaviour was observed on an NVIDIA 3080.
Is this underperformance expected behaviour, or is it due to sub-optimal logic for computing the tiles? Are there examples available in the documentation that extract higher performance?
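
For scale, a back-of-the-envelope count (my arithmetic, not profiler output): a 32768 × 32768 int8 GEMM performs 2 · 32768³ ≈ 7.0 · 10¹³ operations, so ~10 TOPS corresponds to roughly 7 s of kernel time, whereas the 383 TOPS peak would imply under 0.2 s.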

To reproduce

CODE:

#include <sycl/sycl.hpp>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;

#define SQUARE 32768
constexpr size_t sizeSquare = SQUARE; // matrix dimension
constexpr int N_THREADS_PER_MATRIX_OP = 64;

queue Q;

// Device allocations: int8 inputs, int32 accumulator output.
int32_t *out = malloc_device<int32_t>(sizeSquare * sizeSquare, Q);
int8_t *A = malloc_device<int8_t>(sizeSquare * sizeSquare, Q);
int8_t *B = malloc_device<int8_t>(sizeSquare * sizeSquare, Q);

// Cast raw USM pointers to global-address-space pointers for joint_matrix_load/store.
auto Acast = address_space_cast<sycl::access::address_space::global_space,
                                sycl::access::decorated::no>(A);
auto Bcast = address_space_cast<sycl::access::address_space::global_space,
                                sycl::access::decorated::no>(B);
auto outCast = address_space_cast<sycl::access::address_space::global_space,
                                  sycl::access::decorated::no>(out);

// 16x16x16 tile shape for the int8 MMA operation.
static constexpr size_t M = 16, N = 16, K = 16;

int Sub_Tiles_M = SQUARE / M;
int Sub_Tiles_N = SQUARE / N;
int Sub_Tiles_K = SQUARE / K;

size_t Big_M = M * Sub_Tiles_M, Big_N = N * Sub_Tiles_N, Big_K = K * Sub_Tiles_K;
{
  Q.submit([&](handler &cgh) {
    range<2> LocalRange = {1, N_THREADS_PER_MATRIX_OP};
    range<2> GlobalRange = {static_cast<size_t>(Sub_Tiles_M),
                            static_cast<size_t>(Sub_Tiles_N * N_THREADS_PER_MATRIX_OP)};

    cgh.parallel_for(
        nd_range<2>(GlobalRange, LocalRange), [=](nd_item<2> item) {
          sycl::sub_group sg = item.get_sub_group();
          // row id of the current submatrix of the big C matrix
          const auto m = item.get_group().get_group_id()[0];
          // column id of the current submatrix of the big C matrix
          const auto n = item.get_group().get_group_id()[1];

          // Per-sub-group fragments of A, B, and the C accumulator.
          joint_matrix<sycl::sub_group, int8_t, use::a, M, K, layout::col_major> sub_a;
          joint_matrix<sycl::sub_group, int8_t, use::b, K, N, layout::row_major> sub_b;
          joint_matrix<sycl::sub_group, int32_t, use::accumulator, M, N> sub_c;

          joint_matrix_fill(sg, sub_c, 0);

          // k = row/col id of the current submatrix of the big A/B matrices
          for (int k = 0; k < Sub_Tiles_K; k++) {
            // Load the (m, k) tile of A and the (k, n) tile of B.
            joint_matrix_load(sg, sub_a, Acast + (k * K) + (m * M * Big_K), Big_K);
            joint_matrix_load(sg, sub_b, Bcast + (k * K * Big_N) + (n * N), Big_N);

            // sub_c += sub_a * sub_b
            joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
          }

          joint_matrix_store(sg, sub_c, outCast + (m * M) * Big_N + n * N,
                             Big_N, layout::row_major);
        });
  });
  Q.wait();
}
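
The ~10 TOPS figure comes from wall-clock timing around the submission, roughly like this (a minimal sketch of the harness, reconstructed with std::chrono; not the exact code used):

#include <chrono>
#include <iostream>

auto t0 = std::chrono::steady_clock::now();
// ... the Q.submit(...) block above, followed by Q.wait() ...
auto t1 = std::chrono::steady_clock::now();

double seconds = std::chrono::duration<double>(t1 - t0).count();
// 2 * N^3 operations for an N x N GEMM
double tops = 2.0 * double(SQUARE) * SQUARE * SQUARE / seconds / 1e12;
std::cout << tops << " TOPS" << std::endl;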

COMPILE:
icpx -fsycl -O2 -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx90a -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -o ver_amd_test.out joint_matrix_hip_gfx90a.cpp

RUN:
ONEAPI_DEVICE_SELECTOR="hip:*" SYCL_PI_TRACE=1 ./ver_amd_test.out

The question is whether this significant underperformance is expected behaviour or user error.
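
If it is the tiling logic, one common refinement would be to accumulate several output tiles per sub-group, reusing each A fragment across them to raise arithmetic intensity. A sketch of the kernel body under that scheme (hypothetical and unprofiled; TN is an assumed blocking factor that must divide Sub_Tiles_N, and GlobalRange would shrink by TN along dimension 1):

constexpr int TN = 4; // hypothetical blocking factor along N

joint_matrix<sycl::sub_group, int8_t, use::a, M, K, layout::col_major> sub_a;
joint_matrix<sycl::sub_group, int8_t, use::b, K, N, layout::row_major> sub_b;
joint_matrix<sycl::sub_group, int32_t, use::accumulator, M, N> sub_c[TN];

for (int t = 0; t < TN; t++)
  joint_matrix_fill(sg, sub_c[t], 0);

for (int k = 0; k < Sub_Tiles_K; k++) {
  // One A load is now shared by TN multiply-adds.
  joint_matrix_load(sg, sub_a, Acast + (k * K) + (m * M * Big_K), Big_K);
  for (int t = 0; t < TN; t++) {
    joint_matrix_load(sg, sub_b,
                      Bcast + (k * K * Big_N) + ((n * TN + t) * N), Big_N);
    joint_matrix_mad(sg, sub_c[t], sub_a, sub_b, sub_c[t]);
  }
}

for (int t = 0; t < TN; t++)
  joint_matrix_store(sg, sub_c[t],
                     outCast + (m * M) * Big_N + (n * TN + t) * N,
                     Big_N, layout::row_major);

Whether this closes the gap on the HIP backend I cannot say; it only increases reuse of the A fragment, so further blocking along M (and staging through local memory) may still be needed.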

Environment

  • OS: SUSE Linux Enterprise Server 15 SP4
  • AMD MI250x
  • DPC++ version: Intel(R) oneAPI DPC++/C++ Compiler 2024.2.0 (2024.2.0.20240602)

Additional context

No response

Labels

bug (Something isn't working), cuda (CUDA back-end), hip (Issues related to execution on HIP backend)
