Skip to content

[SVE] Compile error with SVE intrinsic with windows on arm #142561

Open
@scoutzeng

Description

@scoutzeng

I was compiling sample code that uses SVE intrinsics with clang-cl 19.1.1 on Windows. The compiler reported the following error:
error: Incorrect size for matrix_multiply_sve epilogue: 12 bytes of instructions in range, but .seh directives corresponding to 8 bytes

/*
 * Multiply matrices A and B with SVE intrinsics and store the result in C.
 *
 * Layout, as implied by the indexing below (all three are column-major):
 *   A: element (row, col) is read from A[row + n * col]   (n rows, k columns)
 *   B: element (row, col) is read from B[row + k * col]   (k rows, m columns)
 *   C: element (row, col) is written to C[row + n * col]  (n rows, m columns)
 *
 * It is the user's responsibility to make sure the matrices are compatible.
 * In particular, m and k are walked in steps of 4 with no tail handling, so
 * both must be multiples of 4; n may be arbitrary because the row dimension
 * is predicated.
 *
 * All loop counters and element offsets are 64-bit so that neither the
 * counters nor products such as n * k_idx can wrap for large matrices.
 */
void matrix_multiply_sve(const float32_t* A, const float32_t* B, float32_t* C, uint32_t n, uint32_t m, uint32_t k) {
    // widen the dimensions once so every offset below is computed in 64 bits
    const uint64_t rows = n;
    const uint64_t cols = m;
    const uint64_t depth = k;

    // number of 32-bit lanes in one SVE vector
    const uint64_t lanes = svcntw();

    for (uint64_t i_idx = 0; i_idx < rows; i_idx += lanes) {
        // calculate predicate for this i_idx; i_idx < n here, so the
        // narrowing cast cannot lose information
        svbool_t pred = svwhilelt_b32_u32((uint32_t)i_idx, n);

        for (uint64_t j_idx = 0; j_idx < cols; j_idx += 4) {
            // accumulators: the columns of an nx4 sub matrix of C,
            // zeroed before the matrix op
            svfloat32_t C0 = svdup_n_f32(0);
            svfloat32_t C1 = svdup_n_f32(0);
            svfloat32_t C2 = svdup_n_f32(0);
            svfloat32_t C3 = svdup_n_f32(0);

            for (uint64_t k_idx = 0; k_idx < depth; k_idx += 4) {
                // base pointers to the current blocks of A and B
                const float32_t* a_ptr = A + i_idx + rows * k_idx;
                const float32_t* b_ptr = B + depth * j_idx + k_idx;

                // load four columns of the nx4 sub matrix of A
                svfloat32_t A0 = svld1_f32(pred, a_ptr);
                svfloat32_t A1 = svld1_f32(pred, a_ptr + rows);
                svfloat32_t A2 = svld1_f32(pred, a_ptr + 2 * rows);
                svfloat32_t A3 = svld1_f32(pred, a_ptr + 3 * rows);

                // for each column of the 4x4 sub matrix of B: load its four
                // values and multiply-accumulate them, lane by lane, against
                // the columns of A into the matching column of C
                svfloat32_t B0 = svld1rq_f32(svptrue_b32(), b_ptr);
                C0 = svmla_lane_f32(C0, A0, B0, 0);
                C0 = svmla_lane_f32(C0, A1, B0, 1);
                C0 = svmla_lane_f32(C0, A2, B0, 2);
                C0 = svmla_lane_f32(C0, A3, B0, 3);

                svfloat32_t B1 = svld1rq_f32(svptrue_b32(), b_ptr + depth);
                C1 = svmla_lane_f32(C1, A0, B1, 0);
                C1 = svmla_lane_f32(C1, A1, B1, 1);
                C1 = svmla_lane_f32(C1, A2, B1, 2);
                C1 = svmla_lane_f32(C1, A3, B1, 3);

                svfloat32_t B2 = svld1rq_f32(svptrue_b32(), b_ptr + 2 * depth);
                C2 = svmla_lane_f32(C2, A0, B2, 0);
                C2 = svmla_lane_f32(C2, A1, B2, 1);
                C2 = svmla_lane_f32(C2, A2, B2, 2);
                C2 = svmla_lane_f32(C2, A3, B2, 3);

                svfloat32_t B3 = svld1rq_f32(svptrue_b32(), b_ptr + 3 * depth);
                C3 = svmla_lane_f32(C3, A0, B3, 0);
                C3 = svmla_lane_f32(C3, A1, B3, 1);
                C3 = svmla_lane_f32(C3, A2, B3, 2);
                C3 = svmla_lane_f32(C3, A3, B3, 3);
            }

            // base pointer for stores: four consecutive columns of C
            float32_t* c_ptr = C + rows * j_idx + i_idx;
            svst1_f32(pred, c_ptr, C0);
            svst1_f32(pred, c_ptr + rows, C1);
            svst1_f32(pred, c_ptr + 2 * rows, C2);
            svst1_f32(pred, c_ptr + 3 * rows, C3);
        }
    }
}

The code is sample code from the Arm developer website that shows how to implement matrix multiplication with SVE intrinsics. I don't know how to resolve this error. Is this a compiler bug?

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions