Skip to content

for loop in closure is not unrolled and not vectorized correctly #120189

Open
@shuangsilab

Description

@shuangsilab

I tried this code on godbolt.org:

/// Test element made of eight independent `u8` fields.
///
/// `Copy` is what allows the array-repeat expression `[Data::default(); 5000]`
/// used by the reproducer, and `Default` supplies the initial value for every
/// field.
#[derive(Default, Clone, Copy)]
struct Data {
    a: u8,
    b: u8,
    c: u8,
    d: u8,
    e: u8,
    f: u8,
    g: u8,
    h: u8,
}

/// Reproducer: increments every field of every element of a 5000-element
/// array, with the loop placed *inside* a closure that is called once.
// `inline(never)` presumably keeps this function as a separate symbol so its
// assembly can be inspected on its own.
#[inline(never)]
fn for_in_closure() {
    // 5000 default-initialized elements in a local array.
    let mut v = [Data::default(); 5000];
    // The closure mutates `v` through `&mut v`, hence the binding is `mut`.
    let mut closure = || {
        for item in &mut v {
            // Eight separate byte-wide increments, one per field of the element.
            item.a += 1;
            item.b += 1;
            item.c += 1;
            item.d += 1;
            item.e += 1;
            item.f += 1;
            item.g += 1;
            item.h += 1;
        }
    };
    // Single call; the loop body exists only inside the closure.
    closure();
}

The generated assembly code is as follows. The additions in the for loop are compiled into four inc instructions and one psubb instruction. Is there any particular reason why these additions cannot be compiled into one SSE addition?

# Compiler output for the loop-inside-closure variant.
example::for_in_closure:
        push    rbx
        # Reserve stack in 4096-byte steps, writing to each step as it is
        # reserved (stack-probe pattern): 9 * 4096 = 36864 bytes.
        mov     r11, rsp
        sub     r11, 36864
.LBB0_1:
        sub     rsp, 4096
        mov     qword ptr [rsp], 0
        cmp     rsp, r11
        jne     .LBB0_1
        # Remaining 3136 bytes: 36864 + 3136 = 40000 bytes in total.
        sub     rsp, 3136
        # memset(rsp, 0, 40000): zero the whole 40000-byte region.
        mov     rdi, rsp
        # rbx = 0 is the element index used by the loop below.
        xor     ebx, ebx
        mov     edx, 40000
        xor     esi, esi
        call    qword ptr [rip + memset@GOTPCREL]
        # xmm0 = all bits set, i.e. every byte is 0xFF (-1).
        pcmpeqd xmm0, xmm0
.LBB0_3:
        # One 8-byte element per iteration, addressed as rsp + 8*rbx.
        # Byte 0 is incremented with a scalar inc.
        inc     byte ptr [rsp + 8*rbx]
        # Bytes 1..4 are loaded as one dword, incremented together
        # (subtracting -1 from each byte adds 1), and stored back.
        movd    xmm1, dword ptr [rsp + 8*rbx + 1]
        psubb   xmm1, xmm0
        movd    dword ptr [rsp + 8*rbx + 1], xmm1
        # Bytes 5, 6 and 7 are again incremented one at a time.
        inc     byte ptr [rsp + 8*rbx + 5]
        inc     byte ptr [rsp + 8*rbx + 6]
        inc     byte ptr [rsp + 8*rbx + 7]
        # Advance the index by one element; loop until it reaches 5000.
        lea     rax, [rbx + 1]
        mov     rbx, rax
        cmp     rax, 5000
        jne     .LBB0_3
        # Release the 40000 bytes reserved above.
        add     rsp, 40000
        pop     rbx
        ret

Instead, if you move the for loop outside the closure, the for loop will be unrolled into five psubb instructions.

# Compiler output for the variant with the loop outside the closure.
example::for_out_closure:
        # Reserve stack in 4096-byte steps, writing to each step as it is
        # reserved (stack-probe pattern): 9 * 4096 = 36864 bytes.
        mov     r11, rsp
        sub     r11, 36864
.LBB1_1:
        sub     rsp, 4096
        mov     qword ptr [rsp], 0
        cmp     rsp, r11
        jne     .LBB1_1
        # Remaining 3144 bytes: 36864 + 3144 = 40008 bytes in total.
        sub     rsp, 3144
        # memset(rsp + 8, 0, 40000): the zeroed region starts at rsp + 8.
        lea     rdi, [rsp + 8]
        mov     edx, 40000
        xor     esi, esi
        call    qword ptr [rip + memset@GOTPCREL]
        # rax starts at 4, so rsp + 8*rax - 24 equals rsp + 8, the start of
        # the zeroed region.
        mov     eax, 4
        # xmm0 = all bits set, i.e. every byte is 0xFF (-1).
        pcmpeqd xmm0, xmm0
.LBB1_3:
        # Five 8-byte elements are processed per iteration.
        # Elements 0 and 1: each is loaded as a qword and has 1 added to every
        # byte (psubb with -1); the two are then packed into one 16-byte
        # register and written back with a single unaligned store.
        movq    xmm1, qword ptr [rsp + 8*rax - 24]
        psubb   xmm1, xmm0
        movq    xmm2, qword ptr [rsp + 8*rax - 16]
        psubb   xmm2, xmm0
        punpcklqdq      xmm1, xmm2
        movdqu  xmmword ptr [rsp + 8*rax - 24], xmm1
        # Elements 2 and 3: same sequence.
        movq    xmm1, qword ptr [rsp + 8*rax - 8]
        psubb   xmm1, xmm0
        movq    xmm2, qword ptr [rsp + 8*rax]
        psubb   xmm2, xmm0
        punpcklqdq      xmm1, xmm2
        movdqu  xmmword ptr [rsp + 8*rax - 8], xmm1
        # Element 4: handled alone with an 8-byte load and store.
        movq    xmm1, qword ptr [rsp + 8*rax + 8]
        psubb   xmm1, xmm0
        movq    qword ptr [rsp + 8*rax + 8], xmm1
        # Advance by five elements; 4 + 5 * 1000 = 5004 ends the loop.
        add     rax, 5
        cmp     rax, 5004
        jne     .LBB1_3
        # Release the 40008 bytes reserved above.
        add     rsp, 40008
        ret

The complete test code is available here: https://godbolt.org/z/YoMaWWzW7

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-LLVM — Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.
    A-autovectorization — Area: Autovectorization, which can impact perf or code size
    C-bug — Category: This is a bug.
    E-needs-test — Call for participation: An issue has been fixed and does not reproduce, but no test has been added.
    I-slow — Issue: Problems and improvements with respect to performance of generated code.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions