Skip to content

[aarch64] clang/LLVM fails to vectorize simple loop #130872

Open
@haasn

Description

@haasn

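I believe this to be a bug (a missed optimization): clang fully unrolls the loop into scalar byte loads and stores instead of vectorizing it, and the resulting scalar version is significantly slower than the vectorized code GCC generates for the same function.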

Code

/*
 * De-interleave 64 bytes read from `in` into two 32-byte outputs:
 * bytes at even offsets go to `a`, bytes at odd offsets go to `b`.
 *
 * All three pointers are `restrict`-qualified, so the compiler is allowed
 * to assume the buffers do not overlap. The trip count is the compile-time
 * constant 32.
 */
void read32x2(char *restrict a, char *restrict b, const char *restrict in)
{
    for (int i = 0; i < 32; i++) {
        a[i] = in[2 * i + 0];   /* first byte of pair i  (offset 2*i)     */
        b[i] = in[2 * i + 1];   /* second byte of pair i (offset 2*i + 1) */
    }
}

clang trunk -O3:

// read32x2 as emitted by clang trunk at -O3 (AArch64).
//
// The 32-iteration loop is fully unrolled into scalar byte accesses:
// 64 ldrb + 64 strb, with no SIMD instructions.
//   x0 = a   (destination for bytes at even offsets of `in`)
//   x1 = b   (destination for bytes at odd offsets of `in`)
//   x2 = in  (source, read at offsets #0..#63)
//   w8 = scratch for the current even-offset byte
//   w9 = scratch for the current odd-offset byte
// Each scratch register is stored before it is reloaded, so the load of the
// next byte is placed between the stores of the previous pair.
read32x2:
        ldrb    w8, [x2]                // w8 = in[0]
        ldrb    w9, [x2, #1]            // w9 = in[1]
        strb    w8, [x0]                // a[0] = in[0]
        ldrb    w8, [x2, #2]            // w8 = in[2] (next even byte)
        strb    w9, [x1]                // b[0] = in[1]
        ldrb    w9, [x2, #3]            // w9 = in[3] (next odd byte)
        strb    w8, [x0, #1]            // a[1] = in[2]; same pattern repeats below
        ldrb    w8, [x2, #4]
        strb    w9, [x1, #1]
        ldrb    w9, [x2, #5]
        strb    w8, [x0, #2]
        ldrb    w8, [x2, #6]
        strb    w9, [x1, #2]
        ldrb    w9, [x2, #7]
        strb    w8, [x0, #3]
        ldrb    w8, [x2, #8]
        strb    w9, [x1, #3]
        ldrb    w9, [x2, #9]
        strb    w8, [x0, #4]
        ldrb    w8, [x2, #10]
        strb    w9, [x1, #4]
        ldrb    w9, [x2, #11]
        strb    w8, [x0, #5]
        ldrb    w8, [x2, #12]
        strb    w9, [x1, #5]
        ldrb    w9, [x2, #13]
        strb    w8, [x0, #6]
        ldrb    w8, [x2, #14]
        strb    w9, [x1, #6]
        ldrb    w9, [x2, #15]
        strb    w8, [x0, #7]
        ldrb    w8, [x2, #16]
        strb    w9, [x1, #7]
        ldrb    w9, [x2, #17]
        strb    w8, [x0, #8]
        ldrb    w8, [x2, #18]
        strb    w9, [x1, #8]
        ldrb    w9, [x2, #19]
        strb    w8, [x0, #9]
        ldrb    w8, [x2, #20]
        strb    w9, [x1, #9]
        ldrb    w9, [x2, #21]
        strb    w8, [x0, #10]
        ldrb    w8, [x2, #22]
        strb    w9, [x1, #10]
        ldrb    w9, [x2, #23]
        strb    w8, [x0, #11]
        ldrb    w8, [x2, #24]
        strb    w9, [x1, #11]
        ldrb    w9, [x2, #25]
        strb    w8, [x0, #12]
        ldrb    w8, [x2, #26]
        strb    w9, [x1, #12]
        ldrb    w9, [x2, #27]
        strb    w8, [x0, #13]
        ldrb    w8, [x2, #28]
        strb    w9, [x1, #13]
        ldrb    w9, [x2, #29]
        strb    w8, [x0, #14]
        ldrb    w8, [x2, #30]
        strb    w9, [x1, #14]
        ldrb    w9, [x2, #31]
        strb    w8, [x0, #15]
        ldrb    w8, [x2, #32]
        strb    w9, [x1, #15]
        ldrb    w9, [x2, #33]
        strb    w8, [x0, #16]
        ldrb    w8, [x2, #34]
        strb    w9, [x1, #16]
        ldrb    w9, [x2, #35]
        strb    w8, [x0, #17]
        ldrb    w8, [x2, #36]
        strb    w9, [x1, #17]
        ldrb    w9, [x2, #37]
        strb    w8, [x0, #18]
        ldrb    w8, [x2, #38]
        strb    w9, [x1, #18]
        ldrb    w9, [x2, #39]
        strb    w8, [x0, #19]
        ldrb    w8, [x2, #40]
        strb    w9, [x1, #19]
        ldrb    w9, [x2, #41]
        strb    w8, [x0, #20]
        ldrb    w8, [x2, #42]
        strb    w9, [x1, #20]
        ldrb    w9, [x2, #43]
        strb    w8, [x0, #21]
        ldrb    w8, [x2, #44]
        strb    w9, [x1, #21]
        ldrb    w9, [x2, #45]
        strb    w8, [x0, #22]
        ldrb    w8, [x2, #46]
        strb    w9, [x1, #22]
        ldrb    w9, [x2, #47]
        strb    w8, [x0, #23]
        ldrb    w8, [x2, #48]
        strb    w9, [x1, #23]
        ldrb    w9, [x2, #49]
        strb    w8, [x0, #24]
        ldrb    w8, [x2, #50]
        strb    w9, [x1, #24]
        ldrb    w9, [x2, #51]
        strb    w8, [x0, #25]
        ldrb    w8, [x2, #52]
        strb    w9, [x1, #25]
        ldrb    w9, [x2, #53]
        strb    w8, [x0, #26]
        ldrb    w8, [x2, #54]
        strb    w9, [x1, #26]
        ldrb    w9, [x2, #55]
        strb    w8, [x0, #27]
        ldrb    w8, [x2, #56]
        strb    w9, [x1, #27]
        ldrb    w9, [x2, #57]
        strb    w8, [x0, #28]
        ldrb    w8, [x2, #58]
        strb    w9, [x1, #28]
        ldrb    w9, [x2, #59]
        strb    w8, [x0, #29]
        ldrb    w8, [x2, #60]
        strb    w9, [x1, #29]
        ldrb    w9, [x2, #61]
        strb    w8, [x0, #30]
        ldrb    w8, [x2, #62]
        strb    w9, [x1, #30]
        ldrb    w9, [x2, #63]           // last source byte, in[63]
        strb    w8, [x0, #31]           // a[31] = in[62]
        strb    w9, [x1, #31]           // b[31] = in[63]
        ret

GCC trunk -O3:

// read32x2 as emitted by GCC trunk at -O3 (AArch64).
//
// The whole loop becomes two de-interleaving vector loads and two paired
// stores:
//   x0 = a, x1 = b, x2 = in
// LD2 with the .16b arrangement reads 32 consecutive bytes and splits them
// into two registers: the first receives the bytes at even offsets, the
// second the bytes at odd offsets.
read32x2:
        ld2     {v28.16b - v29.16b}, [x2], 32   // in[0..31]:  v28 = even bytes, v29 = odd bytes; then x2 += 32
        ld2     {v30.16b - v31.16b}, [x2]       // in[32..63]: v30 = even bytes, v31 = odd bytes
        stp     q28, q30, [x0]                  // a[0..31] = even bytes of both halves
        stp     q29, q31, [x1]                  // b[0..31] = odd bytes of both halves
        ret

See Also

https://godbolt.org/z/5aWdbjTEx

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions