
slice::Iter::fold optimizes poorly for some niche-optimized types. #106288

Closed
@Sp00ph

Description


I tried this code:

pub fn fold_val(s: &[i32]) -> Option<i32> {
    s.iter().fold(None, |_, i| Some(*i))
}

pub fn fold_ptr(s: &[i32]) -> Option<*const i32> {
    s.iter().fold(None, |_, i| Some(<*const i32>::from(i)))
}

pub fn fold_nonnull(s: &[i32]) -> Option<std::ptr::NonNull<i32>> {
    s.iter().fold(None, |_, i| Some(From::from(i)))
}

pub fn fold_ref(s: &[i32]) -> Option<&i32> {
    s.iter().fold(None, |_, i| Some(i))
}

(never mind the fact that these could obviously just use slice::last)
(godbolt link: https://rust.godbolt.org/z/6fjzo4faW )
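
For reference, here is what the slice::last versions would look like; a minimal sketch for the val and ref variants (function names chosen here just for illustration):

pub fn last_val(s: &[i32]) -> Option<i32> {
    // Equivalent to fold_val: Some(last element), or None for an empty slice.
    s.last().copied()
}

pub fn last_ref(s: &[i32]) -> Option<&i32> {
    // Equivalent to fold_ref.
    s.last()
}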

I expected all of these functions to produce more or less similar assembly, since each of them only needs the last loop iteration peeled for the whole loop body to be optimized away. Indeed, the first two functions optimize just fine:

example::fold_val:
        test    rsi, rsi
        je      .LBB0_1
        mov     edx, dword ptr [rdi + 4*rsi - 4]
        mov     eax, 1
        ret
.LBB0_1:
        xor     eax, eax
        ret

example::fold_ptr:
        xor     eax, eax
        test    rsi, rsi
        setne   al
        lea     rdx, [rdi + 4*rsi]
        add     rdx, -4
        ret

The fold_{nonnull,ref} functions, however, don't optimize away the loop:

example::fold_nonnull:
        movabs  r8, 4611686018427387903
        and     r8, rsi
        lea     ecx, [rsi + 1]
        and     rcx, 7
        je      .LBB2_1
        xor     r9d, r9d
        mov     rdx, rdi
.LBB2_3:
        mov     rax, r9
        mov     r9, rdx
        add     rdx, 4
        dec     rcx
        jne     .LBB2_3
        cmp     r8, 7
        jae     .LBB2_5
        jmp     .LBB2_7
.LBB2_1:
        mov     rdx, rdi
        cmp     r8, 7
        jb      .LBB2_7
.LBB2_5:
        lea     rcx, [rdi + 4*rsi]
        add     rdx, -8
.LBB2_6:
        lea     rax, [rdx + 32]
        add     rdx, 36
        cmp     rdx, rcx
        mov     rdx, rax
        jne     .LBB2_6
.LBB2_7:
        ret

example::fold_ref:
        movabs  r8, 4611686018427387903
        and     r8, rsi
        lea     ecx, [rsi + 1]
        and     rcx, 7
        je      .LBB3_1
        xor     r9d, r9d
        mov     rdx, rdi
.LBB3_3:
        mov     rax, r9
        mov     r9, rdx
        add     rdx, 4
        dec     rcx
        jne     .LBB3_3
        cmp     r8, 7
        jae     .LBB3_5
        jmp     .LBB3_7
.LBB3_1:
        mov     rdx, rdi
        cmp     r8, 7
        jb      .LBB3_7
.LBB3_5:
        lea     rcx, [rdi + 4*rsi]
        add     rdx, -8
.LBB3_6:
        lea     rax, [rdx + 32]
        add     rdx, 36
        cmp     rdx, rcx
        mov     rdx, rax
        jne     .LBB3_6
.LBB3_7:
        ret
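
For context, the fold in these cases boils down to a loop of this shape (a rough sketch, not the actual library implementation):

fn fold_ref_sketch<'a>(mut it: std::slice::Iter<'a, i32>) -> Option<&'a i32> {
    // Each iteration overwrites the Option<&i32> accumulator; thanks to the
    // null niche, Some(i) is stored as just the pointer value of i.
    let mut acc: Option<&'a i32> = None;
    while let Some(i) = it.next() {
        acc = Some(i);
    }
    acc
}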

I'm assuming this somehow has to do with NonNull<T> and &T using null as their niche value, since I don't see any other reason for *const T and NonNull<T> to behave differently. It doesn't seem to happen with all niche-optimized types, though, as functions like these do optimize away the loop:

use std::num::NonZeroUsize;

pub fn fold_bool(s: &[bool]) -> Option<bool> {
    s.iter().fold(None, |_, i| Some(*i))
}

pub fn fold_nz(s: &[NonZeroUsize]) -> Option<NonZeroUsize> {
    s.iter().fold(None, |_, i| Some(*i))
}
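
The niche layouts involved can be confirmed with a quick size check; a small sketch using std::mem::size_of (all of these assertions hold on a 64-bit target):

use std::mem::size_of;
use std::num::NonZeroUsize;
use std::ptr::NonNull;

fn main() {
    // &T and NonNull<T> use null as a niche, so Option adds no discriminant.
    assert_eq!(size_of::<Option<&i32>>(), size_of::<&i32>());
    assert_eq!(size_of::<Option<NonNull<i32>>>(), size_of::<NonNull<i32>>());
    // NonZeroUsize and bool have niches too (zero, and the byte values 2..=255).
    assert_eq!(size_of::<Option<NonZeroUsize>>(), size_of::<NonZeroUsize>());
    assert_eq!(size_of::<Option<bool>>(), 1);
    // A raw pointer has no niche, so Option<*const i32> needs a separate tag.
    assert!(size_of::<Option<*const i32>>() > size_of::<*const i32>());
}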

This is using nightly rustc on godbolt, which is currently:

rustc 1.68.0-nightly (ad8ae0504 2022-12-29)
binary: rustc
commit-hash: ad8ae0504c54bc2bd8306abfcfe8546c1bb16a49
commit-date: 2022-12-29
host: x86_64-unknown-linux-gnu
release: 1.68.0-nightly
LLVM version: 15.0.6

Metadata

Labels

    A-LLVM: Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.
    I-slow: Issue: Problems and improvements with respect to performance of generated code.
    T-compiler: Relevant to the compiler team, which will review and decide on the PR/issue.
