Description
I have experienced more than 300% longer execution time in specific functions that use loops along with indexing into slices. After several hours of work with a profiler, I was able to isolate the problem from a 60K lines codebase into the following short program
use std::cmp;
#[inline(never)]
pub fn cmp_gt_and(in1: &[i16], in2: &[i16], destination: &mut [bool]) {
let max = cmp::min(cmp::min(in1.len(), in2.len()), destination.len());
let src1 = &in1[0..max];
let src2 = &in2[0..max];
let dst = &mut destination[0..max];
for index in 0..max {
dst[index] &= src1[index] < src2[index];
}
}
fn main() {
let len = 100;
let a: Vec<i16> = (1..len).collect();
let b: Vec<i16> = (1..len).map(|x| len - x).collect();
let mut result = vec![false; len as usize];
for _ in 0..100*1000*1000 {
cmp_gt_and(&a, &b, &mut result);
}
let sum: i32 = b.into_iter().map(|x| x as i32).sum();
std::process::exit(sum);
}
Code is also available in the following repository
With rust 1.44.0, I observe excecution time around 1.7 sec
$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo
rustc 1.44.0 (49cae5576 2020-06-01)
binary: rustc
commit-hash: 49cae55760da0a43428eba73abcb659bb70cf2e4
commit-date: 2020-06-01
host: x86_64-unknown-linux-gnu
release: 1.44.0
LLVM version: 9.0
Finished release [optimized] target(s) in 0.04s
real 0m1.681s
user 0m1.676s
sys 0m0.004s
Rust versions 1.45.2 and current stable 1.46.0 produce binaries that run more than 6.0 seconds with the same source code
$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo
rustc 1.45.2 (d3fb005a3 2020-07-31)
binary: rustc
commit-hash: d3fb005a39e62501b8b0b356166e515ae24e2e54
commit-date: 2020-07-31
host: x86_64-unknown-linux-gnu
release: 1.45.2
LLVM version: 10.0
Finished release [optimized] target(s) in 0.05s
real 0m6.643s
user 0m6.643s
sys 0m0.000s
$ rustc --version --verbose; cargo build --release;time ./target/release/rust-perf-demo
rustc 1.46.0 (04488afe3 2020-08-24)
binary: rustc
commit-hash: 04488afe34512aa4c33566eb16d8c912a3ae04f9
commit-date: 2020-08-24
host: x86_64-unknown-linux-gnu
release: 1.46.0
LLVM version: 10.0
Finished release [optimized] target(s) in 0.00s
real 0m6.642s
user 0m6.606s
sys 0m0.012s
I use several more functions like cmp_gt_and
in a core of image processing software that also show similar performance drop.
Has anything significantly changed between rustc 1.44 and 1.45 that may have impacted the code so significantly? Maybe LLVM 10 has a different behavior? Any thoughts how to modify the code to gain the performance back with the current compiler or other things to try in order to clarify the problem? For some time, I can stick with 1.44 to keep the performance.
Function cmp_gt_and
also appears to have much shorter assembly code with rustc 1.44 than with its successors, not sure if that is the reason for the performnace drop, though:
Rustc 1.44.0
_ZN14rust_perf_demo10cmp_gt_and17h7a0e5899b697a5eaE:
.cfi_startproc
push rax
.cfi_def_cfa_offset 16
mov r10, rdi
cmp rsi, rcx
mov rdi, rsi
cmova rdi, rcx
cmp rdi, r9
cmova rdi, r9
cmp rdi, rsi
ja .LBB8_10
cmp rdi, rcx
ja .LBB8_11
test rdi, rdi
je .LBB8_9
cmp rdi, 15
ja .LBB8_5
xor ecx, ecx
jmp .LBB8_8
.LBB8_5:
mov rcx, rdi
and rcx, -16
xor esi, esi
pxor xmm0, xmm0
.p2align 4, 0x90
.LBB8_6:
movdqu xmm1, xmmword ptr [r10 + 2*rsi]
movdqu xmm2, xmmword ptr [r10 + 2*rsi + 16]
movdqu xmm3, xmmword ptr [rdx + 2*rsi]
pcmpgtw xmm3, xmm1
movdqu xmm1, xmmword ptr [rdx + 2*rsi + 16]
pcmpgtw xmm1, xmm2
movq xmm2, qword ptr [r8 + rsi]
punpcklbw xmm2, xmm0
movq xmm4, qword ptr [r8 + rsi + 8]
punpcklbw xmm4, xmm0
pcmpeqw xmm2, xmm0
pandn xmm2, xmm3
pcmpeqw xmm4, xmm0
pandn xmm4, xmm1
psrlw xmm2, 15
packuswb xmm2, xmm0
psrlw xmm4, 15
packuswb xmm4, xmm0
movq qword ptr [r8 + rsi], xmm2
movq qword ptr [r8 + rsi + 8], xmm4
add rsi, 16
cmp rcx, rsi
jne .LBB8_6
cmp rdi, rcx
je .LBB8_9
.p2align 4, 0x90
.LBB8_8:
movzx esi, word ptr [r10 + 2*rcx]
cmp si, word ptr [rdx + 2*rcx]
setl sil
cmp byte ptr [r8 + rcx], 0
setne al
and al, sil
mov byte ptr [r8 + rcx], al
add rcx, 1
cmp rcx, rdi
jb .LBB8_8
.LBB8_9:
pop rax
.cfi_def_cfa_offset 8
ret
.LBB8_10:
.cfi_def_cfa_offset 16
lea rdx, [rip + .L__unnamed_2]
call qword ptr [rip + _ZN4core5slice20slice_index_len_fail17he661f5dd1689ef3bE@GOTPCREL]
ud2
.LBB8_11:
lea rdx, [rip + .L__unnamed_3]
mov rsi, rcx
call qword ptr [rip + _ZN4core5slice20slice_index_len_fail17he661f5dd1689ef3bE@GOTPCREL]
ud2
.Lfunc_end8:
.size _ZN14rust_perf_demo10cmp_gt_and17h7a0e5899b697a5eaE, .Lfunc_end8-_ZN14rust_perf_demo10cmp_gt_and17h7a0e5899b697a5eaE
.cfi_endproc
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
Rustc 1.45.2
_ZN14rust_perf_demo10cmp_gt_and17h39391ca255a87f0fE:
.cfi_startproc
push rbx
.cfi_def_cfa_offset 16
sub rsp, 32
.cfi_def_cfa_offset 48
.cfi_offset rbx, -16
mov r10, rdi
cmp rsi, rcx
mov rdi, rsi
cmova rdi, rcx
cmp rdi, r9
cmova rdi, r9
cmp rdi, rsi
ja .LBB8_10
cmp rdi, rcx
ja .LBB8_11
test rdi, rdi
je .LBB8_9
cmp rdi, 15
ja .LBB8_5
xor esi, esi
jmp .LBB8_8
.LBB8_5:
mov rsi, rdi
and rsi, -16
xor r11d, r11d
pxor xmm0, xmm0
pcmpeqd xmm1, xmm1
.p2align 4, 0x90
.LBB8_6:
movdqu xmm2, xmmword ptr [r10 + 2*r11]
movdqu xmm3, xmmword ptr [r10 + 2*r11 + 16]
movdqu xmm4, xmmword ptr [rdx + 2*r11]
pcmpgtw xmm4, xmm2
movdqu xmm2, xmmword ptr [rdx + 2*r11 + 16]
pcmpgtw xmm2, xmm3
movq xmm5, qword ptr [r8 + r11]
movq xmm3, qword ptr [r8 + r11 + 8]
pcmpeqb xmm5, xmm0
pxor xmm5, xmm1
punpcklbw xmm5, xmm0
pand xmm5, xmm4
pcmpeqb xmm3, xmm0
pxor xmm3, xmm1
punpcklbw xmm3, xmm0
pand xmm3, xmm2
movdqa xmmword ptr [rsp], xmm5
movzx eax, byte ptr [rsp + 4]
and al, 1
movzx r9d, al
movzx eax, byte ptr [rsp + 6]
and al, 1
movzx eax, al
shl eax, 8
or eax, r9d
movzx ecx, byte ptr [rsp]
movzx r9d, byte ptr [rsp + 2]
and cl, 1
movzx ebx, cl
and r9b, 1
movzx ecx, r9b
shl ecx, 8
or ecx, ebx
movd xmm2, ecx
pinsrw xmm2, eax, 1
movzx eax, byte ptr [rsp + 8]
and al, 1
movzx eax, al
movzx ecx, byte ptr [rsp + 10]
and cl, 1
movzx ecx, cl
shl ecx, 8
or ecx, eax
pinsrw xmm2, ecx, 2
movzx eax, byte ptr [rsp + 12]
and al, 1
movzx eax, al
movzx ecx, byte ptr [rsp + 14]
and cl, 1
movzx ecx, cl
shl ecx, 8
or ecx, eax
pinsrw xmm2, ecx, 3
movdqa xmmword ptr [rsp + 16], xmm3
movzx eax, byte ptr [rsp + 20]
and al, 1
movzx eax, al
movzx ecx, byte ptr [rsp + 22]
and cl, 1
movzx ecx, cl
shl ecx, 8
or ecx, eax
movzx eax, byte ptr [rsp + 16]
movzx ebx, byte ptr [rsp + 18]
and al, 1
movzx eax, al
and bl, 1
movzx ebx, bl
shl ebx, 8
or ebx, eax
movd xmm3, ebx
pinsrw xmm3, ecx, 1
movzx eax, byte ptr [rsp + 24]
and al, 1
movzx eax, al
movzx ecx, byte ptr [rsp + 26]
and cl, 1
movzx ecx, cl
shl ecx, 8
or ecx, eax
pinsrw xmm3, ecx, 2
movzx eax, byte ptr [rsp + 28]
and al, 1
movzx eax, al
movzx ecx, byte ptr [rsp + 30]
and cl, 1
movzx ecx, cl
shl ecx, 8
or ecx, eax
pinsrw xmm3, ecx, 3
movq qword ptr [r8 + r11], xmm2
movq qword ptr [r8 + r11 + 8], xmm3
add r11, 16
cmp rsi, r11
jne .LBB8_6
cmp rdi, rsi
je .LBB8_9
.p2align 4, 0x90
.LBB8_8:
movzx eax, word ptr [r10 + 2*rsi]
cmp ax, word ptr [rdx + 2*rsi]
setl al
cmp byte ptr [r8 + rsi], 0
setne cl
and cl, al
mov byte ptr [r8 + rsi], cl
add rsi, 1
cmp rsi, rdi
jb .LBB8_8
.LBB8_9:
add rsp, 32
.cfi_def_cfa_offset 16
pop rbx
.cfi_def_cfa_offset 8
ret
.LBB8_10:
.cfi_def_cfa_offset 48
lea rdx, [rip + .L__unnamed_2]
call qword ptr [rip + _ZN4core5slice20slice_index_len_fail17h9254c9506d16ff21E@GOTPCREL]
ud2
.LBB8_11:
lea rdx, [rip + .L__unnamed_3]
mov rsi, rcx
call qword ptr [rip + _ZN4core5slice20slice_index_len_fail17h9254c9506d16ff21E@GOTPCREL]
ud2
.Lfunc_end8:
.size _ZN14rust_perf_demo10cmp_gt_and17h39391ca255a87f0fE, .Lfunc_end8-_ZN14rust_perf_demo10cmp_gt_and17h39391ca255a87f0fE
.cfi_endproc
.section .rodata.cst16,"aM",@progbits,16
.p2align 4