Description
There's something I've noticed using cargo-show-asm in the past 2-3 months: there are occasional cases where building a lib crate with LTO doesn't produce the same results as building a bin/bench crate for a given function. (Note: turning off LTO makes the code match on lib crates.)
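For reference, the LTO setting in question is just the standard release-profile switch (a minimal `Cargo.toml` sketch, assuming fat LTO via `lto = true`):

```toml
[profile.release]
lto = true  # fat LTO; the lib/bin divergence below shows up with this enabled
```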
Below is a simple example based on an in-progress repo of mine:

```rust
/// The implementation of a generic histogram, storing the count for each byte value using type `T`.
/// `T` should be a type that can be incremented.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct Histogram<T> {
pub counter: [T; 256],
}
/// Implementation of a histogram using unsigned 32-bit integers as the counter.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Default)]
pub struct Histogram32 {
pub inner: Histogram<u32>,
}
impl Default for Histogram<u32> {
    // Defaults to a zeroed array.
fn default() -> Self {
Histogram { counter: [0; 256] }
}
}
const NUM_SLICES: usize = 4;
const SLICE_SIZE_U32S: usize = 256;
pub fn histogram_nonaliased_withruns_core(data: &[u8]) -> Histogram32 {
    // 4 KiB on stack (four 1 KiB histograms), should be fine.
let mut histogram = [Histogram32::default(); NUM_SLICES];
unsafe {
let mut ptr = data.as_ptr();
let end = ptr.add(data.len());
let current_ptr = histogram[0].inner.counter.as_mut_ptr();
if data.len() > 24 {
let aligned_end = end.sub(24);
let mut current = (ptr as *const u64).read_unaligned();
while ptr < aligned_end {
            // Read ahead one iteration (the next 8 bytes).
let next = (ptr.add(8) as *const u64).read_unaligned();
if current == next {
                // Check if all bytes within 'current' are the same.
                // With an XOR, we can compare every byte (except byte 0)
                // with its predecessor. If the result is < 256,
                // then all bytes are the same value.
let shifted = current << 8;
if (shifted ^ current) < 256 {
// All bytes same - increment single bucket by 16
// (current is all same byte and current equals next)
*current_ptr.add((current & 0xFF) as usize) += 16;
} else {
// Same 8 bytes twice - sum with INC2
sum8(current_ptr, current, 2);
}
} else {
// Process both 8-byte chunks with INC1
sum8(current_ptr, current, 1);
sum8(current_ptr, next, 1);
}
current = ((ptr.add(16)) as *const u64).read_unaligned();
ptr = ptr.add(16);
}
}
while ptr < end {
let byte = *ptr;
*current_ptr.add(byte as usize) += 1;
ptr = ptr.add(1);
}
// Sum up all bytes
// Vectorization-friendly summation
if NUM_SLICES <= 1 {
histogram[0]
} else {
let mut result = histogram[0];
for x in (0..256).step_by(4) {
let mut sum0 = 0_u32;
let mut sum1 = 0_u32;
let mut sum2 = 0_u32;
let mut sum3 = 0_u32;
            // Changing to clippy's suggested code breaks this optimization.
#[allow(clippy::needless_range_loop)]
for slice in 0..NUM_SLICES {
sum0 += histogram[slice].inner.counter[x];
sum1 += histogram[slice].inner.counter[x + 1];
sum2 += histogram[slice].inner.counter[x + 2];
sum3 += histogram[slice].inner.counter[x + 3];
}
result.inner.counter[x] = sum0;
result.inner.counter[x + 1] = sum1;
result.inner.counter[x + 2] = sum2;
result.inner.counter[x + 3] = sum3;
}
result
}
}
}
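/// Adds `increment` to the histogram bucket of each of the 8 bytes packed
/// into `value`. Consecutive bytes are rotated across the `NUM_SLICES`
/// interleaved histograms, so a run of identical bytes is spread over
/// several counters instead of bumping one counter 8 times in a row.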
#[inline(always)]
unsafe fn sum8(current_ptr: *mut u32, mut value: u64, increment: u32) {
for index in 0..8 {
let byte = (value & 0xFF) as usize;
let slice_offset = (index % NUM_SLICES) * SLICE_SIZE_U32S;
let write_ptr = current_ptr.add(slice_offset + byte);
let current = (write_ptr as *const u32).read_unaligned();
(write_ptr).write_unaligned(current + increment);
value >>= 8;
}
}
```
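As an aside, the all-bytes-equal trick in the inner loop is easy to sanity-check in isolation (a standalone sketch, not from the repo):

```rust
/// True if all 8 bytes of `v` hold the same value: `v << 8` lines each
/// byte up against its predecessor, so the XOR only keeps byte 0 (< 256)
/// when every adjacent pair matches.
fn all_bytes_equal(v: u64) -> bool {
    ((v << 8) ^ v) < 256
}

fn main() {
    assert!(all_bytes_equal(0x4141_4141_4141_4141));
    assert!(all_bytes_equal(0)); // all-zero bytes count too
    assert!(!all_bytes_equal(0x4141_4141_4141_4142)); // one byte differs
}
```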
Or if you'd prefer the repo and commit, here.
`cargo asm --release --lib histogram_nonaliased_withruns_core`

Will need to add `#[no_mangle]` as usual (see the sketch below).
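That is, something like the following on the function being inspected (shown on a hypothetical stand-in here):

```rust
// Illustration only; in the repo the attribute goes on
// `histogram_nonaliased_withruns_core` itself.
#[no_mangle] // emit an unmangled, easy-to-find symbol
pub fn example_under_inspection(data: &[u8]) -> usize {
    data.len()
}
```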
Building with LTO enabled for release in `Cargo.toml` gives:

```assembly
.section .text.lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core,"ax",@progbits
.globl lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core
.p2align 4, 0x90
.type lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core,@function
lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core:
.cfi_startproc
push r15
.cfi_def_cfa_offset 16
push r14
.cfi_def_cfa_offset 24
push r12
.cfi_def_cfa_offset 32
push rbx
.cfi_def_cfa_offset 40
sub rsp, 4096
.cfi_adjust_cfa_offset 4096
mov qword ptr [rsp], 0
sub rsp, 1032
.cfi_def_cfa_offset 5168
.cfi_offset rbx, -40
.cfi_offset r12, -32
.cfi_offset r14, -24
.cfi_offset r15, -16
mov r15, rdx
mov r14, rsi
mov rbx, rdi
lea rdi, [rsp + 3080]
mov r12, qword ptr [rip + memset@GOTPCREL]
mov edx, 1024
xor esi, esi
call r12
lea rdi, [rsp + 8]
mov edx, 3072
xor esi, esi
call r12
lea rax, [r14 + r15]
cmp r15, 24
jbe .LBB1_1
lea rcx, [rax - 24]
cmp rcx, r14
jbe .LBB1_1
jmp .LBB1_6
.p2align 4, 0x90
.LBB1_7:
movzx edi, dl
inc dword ptr [rsp + 4*rdi + 8]
mov edi, edx
shr edi, 6
and edi, 1020
inc dword ptr [rsp + rdi + 1032]
mov edi, edx
shr edi, 14
and edi, 1020
inc dword ptr [rsp + rdi + 2056]
mov edi, edx
shr edi, 22
and edi, -4
inc dword ptr [rsp + rdi + 3080]
mov rdi, rdx
shr rdi, 32
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 8]
mov rdi, rdx
shr rdi, 40
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 1032]
mov rdi, rdx
shr rdi, 48
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 2056]
shr rdx, 56
inc dword ptr [rsp + 4*rdx + 3080]
movzx edx, sil
inc dword ptr [rsp + 4*rdx + 8]
mov edx, esi
shr edx, 6
and edx, 1020
inc dword ptr [rsp + rdx + 1032]
mov edx, esi
shr edx, 14
and edx, 1020
inc dword ptr [rsp + rdx + 2056]
mov edx, esi
shr edx, 22
and edx, -4
inc dword ptr [rsp + rdx + 3080]
mov rdx, rsi
shr rdx, 32
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 8]
mov rdx, rsi
shr rdx, 40
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 1032]
mov rdx, rsi
shr rdx, 48
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 2056]
shr rsi, 56
inc dword ptr [rsp + 4*rsi + 3080]
.LBB1_11:
add r14, 16
cmp r14, rcx
jae .LBB1_1
.LBB1_6:
mov rdx, qword ptr [r14]
mov rsi, qword ptr [r14 + 8]
cmp rdx, rsi
jne .LBB1_7
mov rdi, rdx
shl rdi, 8
xor rdi, rdx
movzx esi, dl
cmp rdi, 256
jae .LBB1_10
add dword ptr [rsp + 4*rsi + 8], 16
jmp .LBB1_11
.p2align 4, 0x90
.LBB1_10:
add dword ptr [rsp + 4*rsi + 8], 2
mov esi, edx
shr esi, 6
and esi, 1020
add dword ptr [rsp + rsi + 1032], 2
mov esi, edx
shr esi, 14
and esi, 1020
add dword ptr [rsp + rsi + 2056], 2
mov esi, edx
shr esi, 22
and esi, -4
add dword ptr [rsp + rsi + 3080], 2
mov rsi, rdx
shr rsi, 32
movzx esi, sil
add dword ptr [rsp + 4*rsi + 8], 2
mov rsi, rdx
shr rsi, 40
movzx esi, sil
add dword ptr [rsp + 4*rsi + 1032], 2
mov rsi, rdx
shr rsi, 48
movzx esi, sil
add dword ptr [rsp + 4*rsi + 2056], 2
shr rdx, 56
add dword ptr [rsp + 4*rdx + 3080], 2
jmp .LBB1_11
.p2align 4, 0x90
.LBB1_8:
movzx ecx, byte ptr [r14]
inc dword ptr [rsp + 4*rcx + 8]
inc r14
.LBB1_1:
cmp r14, rax
jb .LBB1_8
lea rdi, [rsp + 4104]
lea rsi, [rsp + 8]
mov edx, 1024
call qword ptr [rip + memcpy@GOTPCREL]
xor eax, eax
.p2align 4, 0x90
.LBB1_3:
mov ecx, dword ptr [rsp + rax + 1036]
add ecx, dword ptr [rsp + rax + 12]
mov edx, dword ptr [rsp + rax + 1040]
add edx, dword ptr [rsp + rax + 16]
mov esi, dword ptr [rsp + rax + 1032]
mov edi, dword ptr [rsp + rax + 1044]
add edi, dword ptr [rsp + rax + 20]
add ecx, dword ptr [rsp + rax + 2060]
add edx, dword ptr [rsp + rax + 2064]
add edi, dword ptr [rsp + rax + 2068]
add ecx, dword ptr [rsp + rax + 3084]
add edx, dword ptr [rsp + rax + 3088]
add edi, dword ptr [rsp + rax + 3092]
add esi, dword ptr [rsp + rax + 8]
add esi, dword ptr [rsp + rax + 2056]
add esi, dword ptr [rsp + rax + 3080]
mov dword ptr [rsp + rax + 4104], esi
mov dword ptr [rsp + rax + 4108], ecx
mov dword ptr [rsp + rax + 4112], edx
mov dword ptr [rsp + rax + 4116], edi
add rax, 16
cmp rax, 1024
jne .LBB1_3
lea rsi, [rsp + 4104]
mov edx, 1024
mov rdi, rbx
call qword ptr [rip + memcpy@GOTPCREL]
mov rax, rbx
add rsp, 5128
.cfi_def_cfa_offset 40
pop rbx
.cfi_def_cfa_offset 32
pop r12
.cfi_def_cfa_offset 24
pop r14
.cfi_def_cfa_offset 16
pop r15
.cfi_def_cfa_offset 8
ret
```
If you instead build a `bench` or `bin`, or disable LTO for the lib:
```assembly
.section .text.lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core,"ax",@progbits
.globl lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core
.p2align 4, 0x90
.type lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core,@function
lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core:
.cfi_startproc
push r15
.cfi_def_cfa_offset 16
push r14
.cfi_def_cfa_offset 24
push r12
.cfi_def_cfa_offset 32
push rbx
.cfi_def_cfa_offset 40
sub rsp, 4096
.cfi_adjust_cfa_offset 4096
mov qword ptr [rsp], 0
sub rsp, 1032
.cfi_def_cfa_offset 5168
.cfi_offset rbx, -40
.cfi_offset r12, -32
.cfi_offset r14, -24
.cfi_offset r15, -16
mov r15, rdx
mov r14, rsi
mov rbx, rdi
lea rdi, [rsp + 3080]
mov r12, qword ptr [rip + memset@GOTPCREL]
mov edx, 1024
xor esi, esi
call r12
lea rdi, [rsp + 8]
mov edx, 3072
xor esi, esi
call r12
lea rax, [r14 + r15]
cmp r15, 24
jbe .LBB1_1
lea rcx, [rax - 24]
cmp rcx, r14
ja .LBB1_9
.LBB1_1:
mov rcx, r14
sub rcx, rax
jae .LBB1_5
mov edx, eax
sub edx, r14d
and edx, 3
je .LBB1_4
.p2align 4, 0x90
.LBB1_3:
movzx esi, byte ptr [r14]
inc dword ptr [rsp + 4*rsi + 8]
inc r14
dec rdx
jne .LBB1_3
.LBB1_4:
cmp rcx, -4
ja .LBB1_5
.p2align 4, 0x90
.LBB1_11:
movzx ecx, byte ptr [r14]
inc dword ptr [rsp + 4*rcx + 8]
movzx ecx, byte ptr [r14 + 1]
inc dword ptr [rsp + 4*rcx + 8]
movzx ecx, byte ptr [r14 + 2]
inc dword ptr [rsp + 4*rcx + 8]
movzx ecx, byte ptr [r14 + 3]
inc dword ptr [rsp + 4*rcx + 8]
add r14, 4
cmp r14, rax
jb .LBB1_11
.LBB1_5:
lea rdi, [rsp + 4104]
lea rsi, [rsp + 8]
mov edx, 1024
call qword ptr [rip + memcpy@GOTPCREL]
xor eax, eax
.p2align 4, 0x90
.LBB1_6:
movdqu xmm0, xmmword ptr [rsp + rax + 3080]
movdqu xmm1, xmmword ptr [rsp + rax + 2056]
paddd xmm1, xmm0
movdqu xmm0, xmmword ptr [rsp + rax + 1032]
movdqu xmm2, xmmword ptr [rsp + rax + 8]
paddd xmm2, xmm0
paddd xmm2, xmm1
movdqu xmm0, xmmword ptr [rsp + rax + 24]
movdqu xmmword ptr [rsp + rax + 4104], xmm2
movdqu xmm1, xmmword ptr [rsp + rax + 3096]
movdqu xmm2, xmmword ptr [rsp + rax + 2072]
paddd xmm2, xmm1
movdqu xmm1, xmmword ptr [rsp + rax + 1048]
paddd xmm1, xmm0
paddd xmm1, xmm2
movdqu xmmword ptr [rsp + rax + 4120], xmm1
add rax, 32
cmp rax, 1024
jne .LBB1_6
lea rsi, [rsp + 4104]
mov edx, 1024
mov rdi, rbx
call qword ptr [rip + memcpy@GOTPCREL]
mov rax, rbx
add rsp, 5128
.cfi_def_cfa_offset 40
pop rbx
.cfi_def_cfa_offset 32
pop r12
.cfi_def_cfa_offset 24
pop r14
.cfi_def_cfa_offset 16
pop r15
.cfi_def_cfa_offset 8
ret
.p2align 4, 0x90
.LBB1_10:
.cfi_def_cfa_offset 5168
movzx edi, dl
inc dword ptr [rsp + 4*rdi + 8]
mov edi, edx
shr edi, 6
and edi, 1020
inc dword ptr [rsp + rdi + 1032]
mov edi, edx
shr edi, 14
and edi, 1020
inc dword ptr [rsp + rdi + 2056]
mov edi, edx
shr edi, 22
and edi, -4
inc dword ptr [rsp + rdi + 3080]
mov rdi, rdx
shr rdi, 32
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 8]
mov rdi, rdx
shr rdi, 40
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 1032]
mov rdi, rdx
shr rdi, 48
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 2056]
shr rdx, 56
inc dword ptr [rsp + 4*rdx + 3080]
movzx edx, sil
inc dword ptr [rsp + 4*rdx + 8]
mov edx, esi
shr edx, 6
and edx, 1020
inc dword ptr [rsp + rdx + 1032]
mov edx, esi
shr edx, 14
and edx, 1020
inc dword ptr [rsp + rdx + 2056]
mov edx, esi
shr edx, 22
and edx, -4
inc dword ptr [rsp + rdx + 3080]
mov rdx, rsi
shr rdx, 32
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 8]
mov rdx, rsi
shr rdx, 40
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 1032]
mov rdx, rsi
shr rdx, 48
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 2056]
shr rsi, 56
inc dword ptr [rsp + 4*rsi + 3080]
.LBB1_14:
add r14, 16
cmp r14, rcx
jae .LBB1_1
.LBB1_9:
mov rdx, qword ptr [r14]
mov rsi, qword ptr [r14 + 8]
cmp rdx, rsi
jne .LBB1_10
mov r8, rdx
shl r8, 8
xor r8, rdx
movzx esi, dl
mov edi, esi
mov edi, dword ptr [rsp + 4*rdi + 8]
cmp r8, 256
jae .LBB1_13
add edi, 16
mov dword ptr [rsp + 4*rsi + 8], edi
jmp .LBB1_14
.p2align 4, 0x90
.LBB1_13:
add edi, 2
mov dword ptr [rsp + 4*rsi + 8], edi
mov esi, edx
shr esi, 6
and esi, 1020
add dword ptr [rsp + rsi + 1032], 2
mov esi, edx
shr esi, 14
and esi, 1020
add dword ptr [rsp + rsi + 2056], 2
mov esi, edx
shr esi, 22
and esi, -4
add dword ptr [rsp + rsi + 3080], 2
mov rsi, rdx
shr rsi, 32
movzx esi, sil
add dword ptr [rsp + 4*rsi + 8], 2
mov rsi, rdx
shr rsi, 40
movzx esi, sil
add dword ptr [rsp + 4*rsi + 1032], 2
mov rsi, rdx
shr rsi, 48
movzx esi, sil
add dword ptr [rsp + 4*rsi + 2056], 2
shr rdx, 56
add dword ptr [rsp + 4*rdx + 3080], 2
jmp .LBB1_14
```

Apologies for the long assembly; it's the best example I can think of off the top of my head. (The key difference is the final summation loop: the bin/non-LTO version vectorizes it with `paddd`, while the LTO lib version sums scalar dwords.)
The code respects `opt-level`, but certain optimisations are missed; typically auto-vectorization, going by the one or two times I've run into this issue. Host is Linux x86-64, but neither OS nor `target-cpu` seems to have any impact here.
This isn't a help request or anything of the sort; I was just wondering if this behaviour is worth documenting somewhere.
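For reference, the bin comparison doesn't need anything fancy; any bin target that actually calls the function works (a rough sketch, assuming the module path visible in the symbol names above):

```rust
// e.g. src/bin/asm_probe.rs — hypothetical probe target
use lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core;

fn main() {
    // black_box keeps the call from being const-folded away.
    let data = std::hint::black_box(vec![0u8; 4096]);
    let hist = histogram_nonaliased_withruns_core(&data);
    println!("{}", std::hint::black_box(hist.inner.counter[0]));
}
```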