-
Notifications
You must be signed in to change notification settings - Fork 797
[SYCL][Matrix] Enable wi_slice for joint_matrix #4979
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
16c0bbe
4803165
9a867ad
d236a56
a894d04
93c3e22
3ffe959
1d2dab3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -86,6 +86,30 @@ __spirv_JointMatrixSUMadINTEL( | |||||||||
__spv::__spirv_JointMatrixINTEL<T3, M, N, LC, S> *C, | ||||||||||
__spv::Scope::Flag Sc = __spv::Scope::Flag::Subgroup); | ||||||||||
|
||||||||||
template <typename T> | ||||||||||
using __spirv_wi_slice_t = T __attribute__((ext_vector_type(0xffffff))); | ||||||||||
|
||||||||||
template <typename T, std::size_t R, std::size_t C, __spv::MatrixLayout U, | ||||||||||
__spv::Scope::Flag S = __spv::Scope::Flag::Subgroup> | ||||||||||
extern SYCL_EXTERNAL __spirv_wi_slice_t<T> &__spirv_JointMatrixGetSliceData( | ||||||||||
__spv::__spirv_JointMatrixINTEL<T, R, C, U, S> *); | ||||||||||
|
||||||||||
template <typename T, std::size_t R, std::size_t C, __spv::MatrixLayout U, | ||||||||||
__spv::Scope::Flag S = __spv::Scope::Flag::Subgroup> | ||||||||||
extern SYCL_EXTERNAL size_t __spirv_JointMatrixGetSliceLength( | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we add |
||||||||||
__spv::__spirv_JointMatrixINTEL<T, R, C, U, S> *); | ||||||||||
|
||||||||||
template <typename T, std::size_t R, std::size_t C, __spv::MatrixLayout U, | ||||||||||
__spv::Scope::Flag S = __spv::Scope::Flag::Subgroup> | ||||||||||
extern SYCL_EXTERNAL T __spirv_JointMatrixGetSliceElem( | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||
__spv::__spirv_JointMatrixINTEL<T, R, C, U, S> *, size_t i); | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The width of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 32 bits is enough. But size_t is widely used in other APIs. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we explicitly say that this parameter must be |
||||||||||
|
||||||||||
template <typename T, std::size_t R, std::size_t C, __spv::MatrixLayout U, | ||||||||||
__spv::Scope::Flag S = __spv::Scope::Flag::Subgroup> | ||||||||||
extern SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL<T, R, C, U, S> * | ||||||||||
__spirv_JointMatrixSetSliceElem( | ||||||||||
__spv::__spirv_JointMatrixINTEL<T, R, C, U, S> *, size_t i, T val); | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||
|
||||||||||
#ifndef __SPIRV_BUILTIN_DECLARATIONS__ | ||||||||||
#error \ | ||||||||||
"SPIR-V built-ins are not available. Please set -fdeclare-spirv-builtins flag." | ||||||||||
|
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -44,6 +44,11 @@ template <int D> struct spv_scope_traits<sycl::group<D>> { | |||||||||
constexpr static auto value = __spv::Scope::Workgroup; | ||||||||||
}; | ||||||||||
|
||||||||||
template <typename T, size_t NumRows, size_t NumCols, | ||||||||||
matrix_layout Layout = matrix_layout::row_major, | ||||||||||
typename Group = sycl::sub_group> | ||||||||||
class wi_slice; | ||||||||||
|
||||||||||
template <typename T, size_t NumRows, size_t NumCols, | ||||||||||
matrix_layout Layout = matrix_layout::row_major, | ||||||||||
typename Group = sycl::sub_group> | ||||||||||
|
@@ -58,6 +63,11 @@ struct joint_matrix { | |||||||||
PI_INVALID_DEVICE); | ||||||||||
#endif // __SYCL_DEVICE_ONLY__ | ||||||||||
} | ||||||||||
|
||||||||||
inline __SYCL_ALWAYS_INLINE wi_slice<T, NumRows, NumCols, Layout, Group> | ||||||||||
get_wi_data() { | ||||||||||
return wi_slice<T, NumRows, NumCols, Layout, Group>(*this); | ||||||||||
} | ||||||||||
}; | ||||||||||
|
||||||||||
template <typename Group, typename T, size_t NumRows, size_t NumCols, | ||||||||||
|
@@ -191,6 +201,69 @@ joint_matrix_mad(Group sg, joint_matrix<T1, M, K, LayoutA, Group> &mA, | |||||||||
PI_INVALID_DEVICE); | ||||||||||
#endif // __SYCL_DEVICE_ONLY__ | ||||||||||
} | ||||||||||
|
||||||||||
template <typename T, size_t NumRows, size_t NumCols, | ||||||||||
matrix_layout Layout = matrix_layout::row_major, | ||||||||||
typename Group = sycl::sub_group> | ||||||||||
class wi_elem { | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. change it to wi_element |
||||||||||
joint_matrix<T, NumRows, NumCols, Layout, Group> &M; | ||||||||||
std::size_t idx; | ||||||||||
|
||||||||||
public: | ||||||||||
wi_elem(joint_matrix<T, NumRows, NumCols, Layout, Group> &Mat, std::size_t i) | ||||||||||
: M(Mat), idx(i) {} | ||||||||||
operator T() { | ||||||||||
#ifdef __SYCL_DEVICE_ONLY__ | ||||||||||
return __spirv_JointMatrixGetSliceElem(M.spvm, idx); | ||||||||||
#else | ||||||||||
throw runtime_error("joint matrix is not supported on host device.", | ||||||||||
PI_INVALID_DEVICE); | ||||||||||
#endif // __SYCL_DEVICE_ONLY__ | ||||||||||
} | ||||||||||
wi_elem &operator=(const T &rhs) { | ||||||||||
#ifdef __SYCL_DEVICE_ONLY__ | ||||||||||
M.spvm = __spirv_JointMatrixSetSliceElem(M.spvm, idx, rhs); | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note that these will be converted to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should change it to
Suggested change
|
||||||||||
return *this; | ||||||||||
#else | ||||||||||
(void)rhs; | ||||||||||
throw runtime_error("joint matrix is not supported on host device.", | ||||||||||
PI_INVALID_DEVICE); | ||||||||||
#endif // __SYCL_DEVICE_ONLY__ | ||||||||||
} | ||||||||||
wi_elem &operator*=(const T &rhs) { | ||||||||||
#ifdef __SYCL_DEVICE_ONLY__ | ||||||||||
M.spvm = __spirv_JointMatrixSetSliceElem( | ||||||||||
M.spvm, idx, __spirv_JointMatrixGetSliceElem(M.spvm, idx) * rhs); | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||
return *this; | ||||||||||
#else | ||||||||||
(void)rhs; | ||||||||||
throw runtime_error("joint matrix is not supported on host device.", | ||||||||||
PI_INVALID_DEVICE); | ||||||||||
#endif // __SYCL_DEVICE_ONLY__ | ||||||||||
} | ||||||||||
// TODO: add other arithmetic operators | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @yubingex007-a11y please do not forget to add overloading for other operators |
||||||||||
}; | ||||||||||
|
||||||||||
template <typename T, size_t NumRows, size_t NumCols, matrix_layout Layout, | ||||||||||
typename Group> | ||||||||||
class wi_slice { | ||||||||||
joint_matrix<T, NumRows, NumCols, Layout, Group> &M; | ||||||||||
|
||||||||||
public: | ||||||||||
wi_slice(joint_matrix<T, NumRows, NumCols, Layout, Group> &Mat) : M(Mat) {} | ||||||||||
size_t length() { | ||||||||||
#ifdef __SYCL_DEVICE_ONLY__ | ||||||||||
return __spirv_JointMatrixGetSliceLength(M.spvm); | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||
#else | ||||||||||
throw runtime_error("joint matrix is not supported on host device.", | ||||||||||
PI_INVALID_DEVICE); | ||||||||||
#endif // __SYCL_DEVICE_ONLY__ | ||||||||||
} | ||||||||||
wi_elem<T, NumRows, NumCols, Layout, Group> operator[](size_t i) { | ||||||||||
return wi_elem<T, NumRows, NumCols, Layout, Group>(M, i); | ||||||||||
} | ||||||||||
}; | ||||||||||
|
||||||||||
} // namespace experimental::matrix | ||||||||||
} // namespace oneapi | ||||||||||
} // namespace ext | ||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
// RUN: %clangxx -fsycl -O2 %s -o %t.out | ||
#include <CL/sycl.hpp> | ||
#if (SYCL_EXT_ONEAPI_MATRIX == 2) | ||
#include <iostream> | ||
|
||
using namespace sycl; | ||
using namespace sycl::ext::oneapi::experimental::matrix; | ||
|
||
#define TILE_SZ 16 | ||
#define TM (TILE_SZ - 4) | ||
#define TN (TILE_SZ - 4) | ||
#define TK (4 * TILE_SZ - 16) | ||
|
||
#define SG_SZ 16 | ||
|
||
template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix { | ||
public: | ||
T *mat; | ||
|
||
public: | ||
T *get_data() { return mat; } | ||
void set_data(T *data) { mat = data; } | ||
big_matrix(T *data) : mat(data) {} | ||
}; | ||
|
||
template <typename T1, typename T2, size_t NUM_ROWS_A, size_t NUM_COLS_A, | ||
size_t NUM_ROWS_B, size_t NUM_COLS_B, size_t NUM_ROWS_C, | ||
size_t NUM_COLS_C> | ||
void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C, | ||
big_matrix<T2, NUM_ROWS_A, NUM_COLS_A> &A, | ||
big_matrix<T2, NUM_ROWS_B, NUM_COLS_B> &B) { | ||
size_t M = NUM_ROWS_C; | ||
size_t N = NUM_COLS_C; | ||
size_t K = NUM_COLS_A; | ||
// B => K/4 x N*4, A => M x K, C => M, N | ||
// stride should be X's cols, e.g., B's stirde = N*4 | ||
assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 4); | ||
size_t NDRangeM = M / TM; | ||
size_t NDRangeN = N / TN; | ||
buffer<int8_t, 2> bufA(A.get_data(), range<2>(M, K)); | ||
buffer<int8_t, 2> bufB(B.get_data(), range<2>(K, N)); | ||
buffer<int32_t, 2> bufC(C.get_data(), range<2>(M, N)); | ||
|
||
queue q; | ||
q.submit([&](handler &cgh) { | ||
auto accC = bufC.get_access<access::mode::read_write>(cgh); | ||
auto accA = bufA.get_access<access::mode::read_write>(cgh); | ||
auto accB = bufB.get_access<access::mode::read_write>(cgh); | ||
|
||
cgh.parallel_for<class imatrix>( | ||
nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), | ||
[accA, accB, accC, M, N, K](nd_item<2> spmd_item) | ||
|
||
{ | ||
// The submatrix API has to be accessed by all the workitems in a | ||
// subgroup these functions will be called once by the subgroup no | ||
// code divergence between the workitems | ||
const auto global_idx = spmd_item.get_global_id(0); | ||
const auto global_idy = spmd_item.get_global_id(1); | ||
const auto sg_startx = global_idx - spmd_item.get_local_id(0); | ||
const auto sg_starty = global_idy - spmd_item.get_local_id(1); | ||
|
||
ext::oneapi::sub_group sg = spmd_item.get_sub_group(); | ||
joint_matrix<int8_t, TM, TK> sub_a(sg); | ||
// For B, since current implementation does not support non-packed | ||
// layout, users need to specify the updated VNNI sizes along with | ||
// the packed_b layout. By default, the layout is row_major and size | ||
// is (TK, TN). | ||
joint_matrix<int8_t, TK, TN, matrix_layout::packed_b> sub_b(sg); | ||
joint_matrix<int32_t, TM, TN> sub_c(sg); | ||
|
||
// AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 | ||
// strideX = X's cols, so strideC = N, strideA = K, strideB = N*4 | ||
joint_matrix_load(sg, sub_c, | ||
accC.get_pointer() + (sg_startx * TM) * N + | ||
sg_starty / SG_SZ * TN, | ||
N, matrix_layout::row_major); | ||
for (int k = 0; k < K / TK; k += 1) { | ||
joint_matrix_load( | ||
sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, | ||
K, matrix_layout::row_major); | ||
// Assuming B data is already in VNNI format. | ||
joint_matrix_load(sg, sub_b, | ||
accB.get_pointer() + (k * TK / 4) * (N * 4) + | ||
sg_starty / SG_SZ * TN * 4, | ||
N * 4, matrix_layout::packed_b); | ||
sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); | ||
auto wi_slice_c = sub_c.get_wi_data(); | ||
for (int i = 0; i < wi_slice_c.length(); i++) { | ||
wi_slice_c[i] *= 1; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. choose a different number other than the neutral element :) |
||
} | ||
} | ||
joint_matrix_store(sg, sub_c, | ||
accC.get_pointer() + (sg_startx * TM) * N + | ||
sg_starty / SG_SZ * TN, | ||
N, matrix_layout::row_major); | ||
}); // parallel for | ||
}).wait(); | ||
} | ||
|
||
static constexpr size_t MATRIX_M = TM * 2; | ||
static constexpr size_t MATRIX_N = TN * 2; | ||
static constexpr size_t MATRIX_K = TK * 2; | ||
int8_t A[MATRIX_M][MATRIX_K]; | ||
int8_t B[MATRIX_K / 4][MATRIX_N * 4]; | ||
int32_t C[MATRIX_M][MATRIX_N]; | ||
int32_t D[MATRIX_M][MATRIX_N]; | ||
|
||
void matrix_multiply_ref(int32_t *A_mem, int32_t *B_mem, int32_t *C_mem, int M, | ||
int N, int K) { | ||
// tiling | ||
for (int m = 0; m < M; m++) | ||
for (int n = 0; n < N; n++) { | ||
for (int k = 0; k < K; k++) { | ||
char *va = (char *)(A_mem + m * K + k); | ||
char *vb = (char *)(B_mem + k * N + n); | ||
int acc = *(C_mem + m * N + n); | ||
for (int i = 0; i < 4; i++) { | ||
acc += (va[i] * vb[i]); | ||
} | ||
*(C_mem + m * N + n) = acc; | ||
} | ||
} | ||
} | ||
|
||
int main() { | ||
for (int i = 0; i < MATRIX_M; i++) { | ||
for (int j = 0; j < MATRIX_K; j++) { | ||
A[i][j] = i + 2 * j; | ||
} | ||
} | ||
for (int i = 0; i < MATRIX_K / 4; i++) { | ||
for (int j = 0; j < MATRIX_N * 4; j++) { | ||
B[i][j] = i + j; | ||
} | ||
} | ||
for (int i = 0; i < MATRIX_M; i++) { | ||
for (int j = 0; j < MATRIX_N; j++) { | ||
C[i][j] = 1; | ||
D[i][j] = 1; | ||
} | ||
} | ||
|
||
big_matrix<int32_t, MATRIX_M, MATRIX_N> MC((int32_t *)&C); | ||
big_matrix<int32_t, MATRIX_M, MATRIX_N> MD((int32_t *)&D); | ||
big_matrix<int8_t, MATRIX_M, MATRIX_K> MA((int8_t *)&A); | ||
big_matrix<int8_t, MATRIX_K / 4, MATRIX_N * 4> MB((int8_t *)&B); | ||
matrix_multiply(MC, MA, MB); | ||
matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, | ||
MATRIX_N, MATRIX_K / 4); | ||
|
||
bool res = true; | ||
for (int i = 0; i < MATRIX_M; i++) { | ||
for (int j = 0; j < MATRIX_N; j++) { | ||
if (C[i][j] != D[i][j]) | ||
res = false; | ||
} | ||
} | ||
if (res) | ||
std::cout << "passed\n"; | ||
else | ||
std::cout << "failed\n"; | ||
for (int i = 0; i < MATRIX_M; i++) { | ||
for (int j = 0; j < MATRIX_N; j++) | ||
std::cout << C[i][j] << ", "; | ||
std::cout << "\n"; | ||
} | ||
std::cout << std::endl; | ||
for (int i = 0; i < MATRIX_M; i++) { | ||
for (int j = 0; j < MATRIX_N; j++) | ||
std::cout << D[i][j] << ", "; | ||
std::cout << "\n"; | ||
} | ||
} | ||
#endif // (SYCL_EXT_ONEAPI_MATRIX == 2) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With the current SPIR-V design we don't need these.