Skip to content

Commit

Permalink
xe: ocl: gemm: improve reference parallelization
Browse files Browse the repository at this point in the history
  • Loading branch information
rjoursler committed Sep 23, 2024
1 parent bd43b55 commit d9f2f4c
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 23 deletions.
42 changes: 20 additions & 22 deletions src/gpu/intel/ocl/gemm/ref_gemm.cl
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ __kernel void ref_gemm(__global A_DATA_T *a, __global B_DATA_T *b,
int bias_mask, __global int *ao, __global int *bo, __global int *c0,
int c0_mask, __global float *scales, long scale_stride, float beta) {

int n = get_global_id(1);
int n = get_global_id(0);
int m = get_global_id(1);
int mb = get_global_id(2);

#if WITH_BIAS
Expand All @@ -61,39 +62,36 @@ __kernel void ref_gemm(__global A_DATA_T *a, __global B_DATA_T *b,
c0_mask, MB, M, N, &c0_strides[0], &c0_strides[1], &c0_strides[2]);
#endif

for (long m = 0; m < M; ++m) {
ACC_DATA_T acc = 0;
for (long k = 0; k < K; ++k) {
long off_a = mb * stride_a + (transa ? m * lda + k : k * lda + m);
long off_b = mb * stride_b + (transb ? k * ldb + n : n * ldb + k);
acc += TO_ACC(A_TO_REF(a[off_a]) - ATTR_A0)
* TO_ACC(B_TO_REF(b[off_b]) - ATTR_B0);
}
ACC_DATA_T acc = 0;
for (long k = 0; k < K; ++k) {
long off_a = mb * stride_a + (transa ? m * lda + k : k * lda + m);
long off_b = mb * stride_b + (transb ? k * ldb + n : n * ldb + k);
acc += TO_ACC(A_TO_REF(a[off_a]) - ATTR_A0)
* TO_ACC(B_TO_REF(b[off_b]) - ATTR_B0);
}

long off_c = mb * stride_c + n * ldc + m;
long off_c = mb * stride_c + n * ldc + m;
#if WITH_BIAS || NON_DEFAULT_ATTRS
POST_OP_DATA_T temp = (POST_OP_DATA_T)acc;
POST_OP_DATA_T temp = (POST_OP_DATA_T)acc;
#if WITH_BIAS
long off_bias = mb * b_strides[0] + m * b_strides[1] + n * b_strides[2];
temp += BIA_TO_REF(bias[off_bias]);
long off_bias = mb * b_strides[0] + m * b_strides[1] + n * b_strides[2];
temp += BIA_TO_REF(bias[off_bias]);
#endif
#if WITH_SCALES
temp *= scales[scale_stride * n];
temp *= scales[scale_stride * n];
#endif
#if WITH_SUM
temp += (POST_OP_DATA_T)(beta * C_TO_REF(c[off_c]));
temp += (POST_OP_DATA_T)(beta * C_TO_REF(c[off_c]));
#endif
#if WITH_ELTWISE
temp = fwd_eltwise(temp, eltwise_alpha, eltwise_beta, eltwise_scale);
temp = fwd_eltwise(temp, eltwise_alpha, eltwise_beta, eltwise_scale);
#endif
#if WITH_DST_ZPOINTS
long off_c0
= mb * c0_strides[0] + m * c0_strides[1] + n * c0_strides[2];
temp += c0[off_c0];
long off_c0 = mb * c0_strides[0] + m * c0_strides[1] + n * c0_strides[2];
temp += c0[off_c0];
#endif
c[off_c] = TO_C(temp);
c[off_c] = TO_C(temp);
#else
c[off_c] = TO_C(acc);
c[off_c] = TO_C(acc);
#endif
}
}
2 changes: 1 addition & 1 deletion src/gpu/intel/ocl/gemm/ref_gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ status_t ref_gemm_t::execute(const gemm_exec_ctx_t &ctx) const {
arg_list.set(29, scale_stride);
arg_list.set(30, beta);

const compute::range_t gws = {1, (size_t)N, (size_t)MB};
const compute::range_t gws = {(size_t)N, (size_t)M, (size_t)MB};
const auto nd_range = compute::nd_range_t(gws);

status_t status = parallel_for(ctx, nd_range, kernel_, arg_list);
Expand Down

0 comments on commit d9f2f4c

Please sign in to comment.