for loop in closure is not unrolled and not vectorized correctly

I tried this code on godbolt.org:

```rust
/// Plain aggregate of eight independent `u8` fields (`a` through `h`), used
/// as the element type of the array in this reproduction.
///
/// `Default` is derived so that `Data::default()` can build the initial
/// element; `Clone` and `Copy` are derived so the value can be used in an
/// array-repeat expression (`[Data::default(); N]`).
#[derive(Default, Clone, Copy)]
struct Data {
    a: u8,
    b: u8,
    c: u8,
    d: u8,
    e: u8,
    f: u8,
    g: u8,
    h: u8,
}

/// Reproduction case: adds one to every field of every element of a local
/// array, with the loop placed *inside* a closure that is then called once.
///
/// The placement of the loop inside the closure is the point of this report,
/// so the body is intentionally kept in this exact shape. The function is
/// marked `#[inline(never)]` so that it is emitted as its own symbol
/// (`example::for_in_closure`) in the assembly output.
#[inline(never)]
fn for_in_closure() {
    // Local array of 5000 elements, each a copy of `Data::default()`.
    let mut v = [Data::default(); 5000];
    // The closure iterates over `&mut v`, so it borrows `v` mutably; it has
    // to be bound as `mut` to be callable.
    let mut closure = || {
        for item in &mut v {
            // One separate `+= 1` statement per field, eight in total.
            item.a += 1;
            item.b += 1;
            item.c += 1;
            item.d += 1;
            item.e += 1;
            item.f += 1;
            item.g += 1;
            item.h += 1;
        }
    };
    // Single invocation of the closure; nothing is returned or printed.
    closure();
}
```

The generated assembly code is as follows. The additions in the for loop are compiled into four `inc` instructions and one `psubb` instruction. Is there any particular reason why these additions cannot be compiled into one SSE addition?

```asm
example::for_in_closure:
        push    rbx
        mov     r11, rsp
        sub     r11, 36864
.LBB0_1:
        sub     rsp, 4096
        mov     qword ptr [rsp], 0
        cmp     rsp, r11
        jne     .LBB0_1
        sub     rsp, 3136
        mov     rdi, rsp
        xor     ebx, ebx
        mov     edx, 40000
        xor     esi, esi
        call    qword ptr [rip + memset@GOTPCREL]
        pcmpeqd xmm0, xmm0
.LBB0_3:
        inc     byte ptr [rsp + 8*rbx]
        movd    xmm1, dword ptr [rsp + 8*rbx + 1]
        psubb   xmm1, xmm0
        movd    dword ptr [rsp + 8*rbx + 1], xmm1
        inc     byte ptr [rsp + 8*rbx + 5]
        inc     byte ptr [rsp + 8*rbx + 6]
        inc     byte ptr [rsp + 8*rbx + 7]
        lea     rax, [rbx + 1]
        mov     rbx, rax
        cmp     rax, 5000
        jne     .LBB0_3
        add     rsp, 40000
        pop     rbx
        ret
```

By contrast, if the for loop is moved outside the closure, it is unrolled by a factor of five and compiled into five `psubb` instructions, one per element.

```asm
example::for_out_closure:
        mov     r11, rsp
        sub     r11, 36864
.LBB1_1:
        sub     rsp, 4096
        mov     qword ptr [rsp], 0
        cmp     rsp, r11
        jne     .LBB1_1
        sub     rsp, 3144
        lea     rdi, [rsp + 8]
        mov     edx, 40000
        xor     esi, esi
        call    qword ptr [rip + memset@GOTPCREL]
        mov     eax, 4
        pcmpeqd xmm0, xmm0
.LBB1_3:
        movq    xmm1, qword ptr [rsp + 8*rax - 24]
        psubb   xmm1, xmm0
        movq    xmm2, qword ptr [rsp + 8*rax - 16]
        psubb   xmm2, xmm0
        punpcklqdq      xmm1, xmm2
        movdqu  xmmword ptr [rsp + 8*rax - 24], xmm1
        movq    xmm1, qword ptr [rsp + 8*rax - 8]
        psubb   xmm1, xmm0
        movq    xmm2, qword ptr [rsp + 8*rax]
        psubb   xmm2, xmm0
        punpcklqdq      xmm1, xmm2
        movdqu  xmmword ptr [rsp + 8*rax - 8], xmm1
        movq    xmm1, qword ptr [rsp + 8*rax + 8]
        psubb   xmm1, xmm0
        movq    qword ptr [rsp + 8*rax + 8], xmm1
        add     rax, 5
        cmp     rax, 5004
        jne     .LBB1_3
        add     rsp, 40008
        ret
```

The complete test code is available here: https://godbolt.org/z/YoMaWWzW7

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

for loop in closure is not unrolled and not vectorized correctly #120189

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

for loop in closure is not unrolled and not vectorized correctly #120189

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions