Open
Description
I tried this code on godbolt.org :
#[derive(Default, Clone, Copy)]
struct Data {
a: u8,
b: u8,
c: u8,
d: u8,
e: u8,
f: u8,
g: u8,
h: u8,
}
#[inline(never)]
fn for_in_closure() {
let mut v = [Data::default(); 5000];
let mut closure = || {
for item in &mut v {
item.a += 1;
item.b += 1;
item.c += 1;
item.d += 1;
item.e += 1;
item.f += 1;
item.g += 1;
item.h += 1;
}
};
closure();
}
The generated assembly code is as follows. The additions in the for loop are compiled into four inc
instructions and one psubb
instruction. Is there any particular reason why these additions cannot be compiled into one SSE addition?
example::for_in_closure:
push rbx
mov r11, rsp
sub r11, 36864
.LBB0_1:
sub rsp, 4096
mov qword ptr [rsp], 0
cmp rsp, r11
jne .LBB0_1
sub rsp, 3136
mov rdi, rsp
xor ebx, ebx
mov edx, 40000
xor esi, esi
call qword ptr [rip + memset@GOTPCREL]
pcmpeqd xmm0, xmm0
.LBB0_3:
inc byte ptr [rsp + 8*rbx]
movd xmm1, dword ptr [rsp + 8*rbx + 1]
psubb xmm1, xmm0
movd dword ptr [rsp + 8*rbx + 1], xmm1
inc byte ptr [rsp + 8*rbx + 5]
inc byte ptr [rsp + 8*rbx + 6]
inc byte ptr [rsp + 8*rbx + 7]
lea rax, [rbx + 1]
mov rbx, rax
cmp rax, 5000
jne .LBB0_3
add rsp, 40000
pop rbx
ret
Instead, if you move the for loop outside the closure, the for loop will be unrolled into five psubb
instructions.
example::for_out_closure:
mov r11, rsp
sub r11, 36864
.LBB1_1:
sub rsp, 4096
mov qword ptr [rsp], 0
cmp rsp, r11
jne .LBB1_1
sub rsp, 3144
lea rdi, [rsp + 8]
mov edx, 40000
xor esi, esi
call qword ptr [rip + memset@GOTPCREL]
mov eax, 4
pcmpeqd xmm0, xmm0
.LBB1_3:
movq xmm1, qword ptr [rsp + 8*rax - 24]
psubb xmm1, xmm0
movq xmm2, qword ptr [rsp + 8*rax - 16]
psubb xmm2, xmm0
punpcklqdq xmm1, xmm2
movdqu xmmword ptr [rsp + 8*rax - 24], xmm1
movq xmm1, qword ptr [rsp + 8*rax - 8]
psubb xmm1, xmm0
movq xmm2, qword ptr [rsp + 8*rax]
psubb xmm2, xmm0
punpcklqdq xmm1, xmm2
movdqu xmmword ptr [rsp + 8*rax - 8], xmm1
movq xmm1, qword ptr [rsp + 8*rax + 8]
psubb xmm1, xmm0
movq qword ptr [rsp + 8*rax + 8], xmm1
add rax, 5
cmp rax, 5004
jne .LBB1_3
add rsp, 40008
ret
The complete test code is avaliable here: https://godbolt.org/z/YoMaWWzW7
Metadata
Metadata
Assignees
Labels
Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.Area: Autovectorization, which can impact perf or code sizeCategory: This is a bug.Call for participation: An issue has been fixed and does not reproduce, but no test has been added.Issue: Problems and improvements with respect to performance of generated code.