Open
Description
I was compiling sample code that uses SVE intrinsics with clang-cl 19.1.1 on Windows. The compiler output was as follows:
error: Incorrect size for matrix_multiply_sve epilogue: 12 bytes of instructions in range, but .seh directives corresponding to 8 bytes
/*
 * Multiply matrices A and B and store the result in C, using SVE intrinsics.
 *
 * Indexing used by this routine (all three buffers are addressed
 * column by column):
 *   A element (i, p) is read from     A[i + n * p]   (n rows, k columns)
 *   B element (p, j) is read from     B[p + k * j]   (k rows, m columns)
 *   C element (i, j) is written to    C[i + n * j]   (n rows, m columns)
 *
 * It is the caller's responsibility to make sure the matrices are
 * compatible. The j and k loops advance in steps of 4 with no tail
 * handling, so m and k are expected to be multiples of 4; n may be any
 * value because row accesses to A and C are predicated.
 *
 * All offsets are computed in 64-bit unsigned arithmetic so that products
 * such as n * k_idx cannot wrap or turn negative for large matrices.
 */
void matrix_multiply_sve(const float32_t* A, const float32_t* B, float32_t* C, uint32_t n, uint32_t m, uint32_t k) {
    /* Widen the dimensions once; every index below derives from these. */
    const uint64_t rows = n;
    const uint64_t cols = m;
    const uint64_t depth = k;

    /* Loop-invariant: 32-bit lanes per vector and an all-true predicate. */
    const uint64_t lanes = svcntw();
    const svbool_t all_lanes = svptrue_b32();

    for (uint64_t i_idx = 0; i_idx < rows; i_idx += lanes) {
        /* Predicate selecting the rows of this strip that are below n.
         * i_idx < n <= UINT32_MAX here, so the narrowing cast is lossless. */
        const svbool_t pred = svwhilelt_b32_u32((uint32_t)i_idx, n);

        for (uint64_t j_idx = 0; j_idx < cols; j_idx += 4) {
            /* Accumulators for four consecutive columns of C. */
            svfloat32_t C0 = svdup_n_f32(0.0f);
            svfloat32_t C1 = svdup_n_f32(0.0f);
            svfloat32_t C2 = svdup_n_f32(0.0f);
            svfloat32_t C3 = svdup_n_f32(0.0f);

            for (uint64_t k_idx = 0; k_idx < depth; k_idx += 4) {
                /* Base addresses of the current sub-blocks of A and B. */
                const float32_t *a_blk = A + i_idx + rows * k_idx;
                const float32_t *b_blk = B + depth * j_idx + k_idx;

                /* Four consecutive columns of A for this row strip. */
                const svfloat32_t A0 = svld1_f32(pred, a_blk);
                const svfloat32_t A1 = svld1_f32(pred, a_blk + rows);
                const svfloat32_t A2 = svld1_f32(pred, a_blk + 2 * rows);
                const svfloat32_t A3 = svld1_f32(pred, a_blk + 3 * rows);

                /* For each of the four C columns: load four consecutive
                 * B values and accumulate one A column per B lane. */
                const svfloat32_t B0 = svld1rq_f32(all_lanes, b_blk);
                C0 = svmla_lane_f32(C0, A0, B0, 0);
                C0 = svmla_lane_f32(C0, A1, B0, 1);
                C0 = svmla_lane_f32(C0, A2, B0, 2);
                C0 = svmla_lane_f32(C0, A3, B0, 3);

                const svfloat32_t B1 = svld1rq_f32(all_lanes, b_blk + depth);
                C1 = svmla_lane_f32(C1, A0, B1, 0);
                C1 = svmla_lane_f32(C1, A1, B1, 1);
                C1 = svmla_lane_f32(C1, A2, B1, 2);
                C1 = svmla_lane_f32(C1, A3, B1, 3);

                const svfloat32_t B2 = svld1rq_f32(all_lanes, b_blk + 2 * depth);
                C2 = svmla_lane_f32(C2, A0, B2, 0);
                C2 = svmla_lane_f32(C2, A1, B2, 1);
                C2 = svmla_lane_f32(C2, A2, B2, 2);
                C2 = svmla_lane_f32(C2, A3, B2, 3);

                const svfloat32_t B3 = svld1rq_f32(all_lanes, b_blk + 3 * depth);
                C3 = svmla_lane_f32(C3, A0, B3, 0);
                C3 = svmla_lane_f32(C3, A1, B3, 1);
                C3 = svmla_lane_f32(C3, A2, B3, 2);
                C3 = svmla_lane_f32(C3, A3, B3, 3);
            }

            /* Store the four finished C columns for this row strip. */
            float32_t *c_blk = C + rows * j_idx + i_idx;
            svst1_f32(pred, c_blk, C0);
            svst1_f32(pred, c_blk + rows, C1);
            svst1_f32(pred, c_blk + 2 * rows, C2);
            svst1_f32(pred, c_blk + 3 * rows, C3);
        }
    }
}
The code is sample code from the Arm developer website that shows how to implement matrix multiplication with SVE intrinsics. I don't know how to resolve this error. Is this a compiler bug?