Description
I tried this code:
pub fn fold_val(s: &[i32]) -> Option<i32> {
    s.iter().fold(None, |_, i| Some(*i))
}

pub fn fold_ptr(s: &[i32]) -> Option<*const i32> {
    s.iter().fold(None, |_, i| Some(<*const i32>::from(i)))
}

pub fn fold_nonnull(s: &[i32]) -> Option<std::ptr::NonNull<i32>> {
    s.iter().fold(None, |_, i| Some(From::from(i)))
}

pub fn fold_ref(s: &[i32]) -> Option<&i32> {
    s.iter().fold(None, |_, i| Some(i))
}
(never mind the fact that these could obviously just use slice::last)
(godbolt link: https://rust.godbolt.org/z/6fjzo4faW )
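(For reference, here is roughly what each of those folds computes, written against the slice API; the last_* names below are made up for illustration only:)

// The fold just keeps overwriting the accumulator, so the result is the last element, if any.
pub fn last_val(s: &[i32]) -> Option<i32> {
    s.last().copied()
}

pub fn last_ref(s: &[i32]) -> Option<&i32> {
    s.last()
}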
I expected all of these functions to produce more or less similar assembly, since the optimizer only needs to peel the last loop iteration to be able to eliminate the whole loop body. Indeed, the first two functions optimize just fine:
example::fold_val:
        test    rsi, rsi
        je      .LBB0_1
        mov     edx, dword ptr [rdi + 4*rsi - 4]
        mov     eax, 1
        ret
.LBB0_1:
        xor     eax, eax
        ret

example::fold_ptr:
        xor     eax, eax
        test    rsi, rsi
        setne   al
        lea     rdx, [rdi + 4*rsi]
        add     rdx, -4
        ret
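(In other words, I'd expect each fold to collapse to the shape of this hand-peeled sketch; the fold_val_peeled name is only for illustration:)

pub fn fold_val_peeled(s: &[i32]) -> Option<i32> {
    // Only the last iteration of the fold can affect the result.
    if s.is_empty() {
        None
    } else {
        Some(s[s.len() - 1])
    }
}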
The fold_{nonnull,ref} functions, however, don't optimize away the loop:
example::fold_nonnull:
        movabs  r8, 4611686018427387903
        and     r8, rsi
        lea     ecx, [rsi + 1]
        and     rcx, 7
        je      .LBB2_1
        xor     r9d, r9d
        mov     rdx, rdi
.LBB2_3:
        mov     rax, r9
        mov     r9, rdx
        add     rdx, 4
        dec     rcx
        jne     .LBB2_3
        cmp     r8, 7
        jae     .LBB2_5
        jmp     .LBB2_7
.LBB2_1:
        mov     rdx, rdi
        cmp     r8, 7
        jb      .LBB2_7
.LBB2_5:
        lea     rcx, [rdi + 4*rsi]
        add     rdx, -8
.LBB2_6:
        lea     rax, [rdx + 32]
        add     rdx, 36
        cmp     rdx, rcx
        mov     rdx, rax
        jne     .LBB2_6
.LBB2_7:
        ret

example::fold_ref:
        movabs  r8, 4611686018427387903
        and     r8, rsi
        lea     ecx, [rsi + 1]
        and     rcx, 7
        je      .LBB3_1
        xor     r9d, r9d
        mov     rdx, rdi
.LBB3_3:
        mov     rax, r9
        mov     r9, rdx
        add     rdx, 4
        dec     rcx
        jne     .LBB3_3
        cmp     r8, 7
        jae     .LBB3_5
        jmp     .LBB3_7
.LBB3_1:
        mov     rdx, rdi
        cmp     r8, 7
        jb      .LBB3_7
.LBB3_5:
        lea     rcx, [rdi + 4*rsi]
        add     rdx, -8
.LBB3_6:
        lea     rax, [rdx + 32]
        add     rdx, 36
        cmp     rdx, rcx
        mov     rdx, rax
        jne     .LBB3_6
.LBB3_7:
        ret
I'm assuming this somehow has to do with NonNull and &T having the null niche value, as I don't see any other reason for the difference between *const T and NonNull<T>. It doesn't seem to happen with all niche-optimized types though, as functions like these do optimize away the loop:
use std::num::NonZeroUsize;

pub fn fold_bool(s: &[bool]) -> Option<bool> {
    s.iter().fold(None, |_, i| Some(*i))
}

pub fn fold_nz(s: &[NonZeroUsize]) -> Option<NonZeroUsize> {
    s.iter().fold(None, |_, i| Some(*i))
}
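As a side note on the niche hypothesis: on a 64-bit target, Option<&i32> and Option<NonNull<i32>> can reuse the null value as the None representation, while Option<*const i32> cannot, which a quick (illustrative) size check shows:

use std::mem::size_of;
use std::ptr::NonNull;

fn main() {
    // &i32 and NonNull<i32> are never null, so Option stores None as the null bit pattern.
    assert_eq!(size_of::<Option<&i32>>(), 8);
    assert_eq!(size_of::<Option<NonNull<i32>>>(), 8);
    // *const i32 may be null, so Option<*const i32> needs a separate discriminant.
    assert_eq!(size_of::<Option<*const i32>>(), 16);
}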
This is using nightly rustc on godbolt, which currently is:
rustc 1.68.0-nightly (ad8ae0504 2022-12-29)
binary: rustc
commit-hash: ad8ae0504c54bc2bd8306abfcfe8546c1bb16a49
commit-date: 2022-12-29
host: x86_64-unknown-linux-gnu
release: 1.68.0-nightly
LLVM version: 15.0.6