Description
There's something I've noticed using cargo-show-asm in the past 2-3 months: there are occasional cases where building a lib crate with LTO doesn't produce the same results as building a bin/bench crate for a given function. (Note: turning off LTO makes the code match on lib crates.)
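For reference, the LTO setting in question is just the standard release-profile switch (a minimal `Cargo.toml` sketch, assuming fat LTO via `lto = true`):

```toml
[profile.release]
lto = true  # fat LTO; the lib/bin divergence below shows up with this enabled
```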
Below is a simple example based on an in-progress repo of mine:

```rust
/// The implementation of a generic histogram, storing the count for each byte value using type `T`.
/// `T` should be a type that can be incremented.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct Histogram<T> {
pub counter: [T; 256],
}
/// Implementation of a histogram using unsigned 32-bit integers as the counter.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Default)]
pub struct Histogram32 {
pub inner: Histogram<u32>,
}
impl Default for Histogram<u32> {
    // Defaults to a zeroed array.
fn default() -> Self {
Histogram { counter: [0; 256] }
}
}
const NUM_SLICES: usize = 4;
const SLICE_SIZE_U32S: usize = 256;
pub fn histogram_nonaliased_withruns_core(data: &[u8]) -> Histogram32 {
    // 4 KiB on stack (four 1 KiB histograms), should be fine.
let mut histogram = [Histogram32::default(); NUM_SLICES];
unsafe {
let mut ptr = data.as_ptr();
let end = ptr.add(data.len());
let current_ptr = histogram[0].inner.counter.as_mut_ptr();
if data.len() > 24 {
let aligned_end = end.sub(24);
let mut current = (ptr as *const u64).read_unaligned();
while ptr < aligned_end {
            // Read ahead one iteration (the next 8 bytes).
let next = (ptr.add(8) as *const u64).read_unaligned();
if current == next {
                // Check if all bytes within 'current' are the same.
                // With an XOR, we can compare every byte (except byte 0)
                // with its predecessor. If the result is < 256,
                // then all bytes are the same value.
let shifted = current << 8;
if (shifted ^ current) < 256 {
// All bytes same - increment single bucket by 16
// (current is all same byte and current equals next)
*current_ptr.add((current & 0xFF) as usize) += 16;
} else {
// Same 8 bytes twice - sum with INC2
sum8(current_ptr, current, 2);
}
} else {
// Process both 8-byte chunks with INC1
sum8(current_ptr, current, 1);
sum8(current_ptr, next, 1);
}
current = ((ptr.add(16)) as *const u64).read_unaligned();
ptr = ptr.add(16);
}
}
while ptr < end {
let byte = *ptr;
*current_ptr.add(byte as usize) += 1;
ptr = ptr.add(1);
}
// Sum up all bytes
// Vectorization-friendly summation
if NUM_SLICES <= 1 {
histogram[0]
} else {
let mut result = histogram[0];
for x in (0..256).step_by(4) {
let mut sum0 = 0_u32;
let mut sum1 = 0_u32;
let mut sum2 = 0_u32;
let mut sum3 = 0_u32;
            // Changing to clippy's suggested code breaks this optimization.
#[allow(clippy::needless_range_loop)]
for slice in 0..NUM_SLICES {
sum0 += histogram[slice].inner.counter[x];
sum1 += histogram[slice].inner.counter[x + 1];
sum2 += histogram[slice].inner.counter[x + 2];
sum3 += histogram[slice].inner.counter[x + 3];
}
result.inner.counter[x] = sum0;
result.inner.counter[x + 1] = sum1;
result.inner.counter[x + 2] = sum2;
result.inner.counter[x + 3] = sum3;
}
result
}
}
}
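/// Adds `increment` to the histogram bucket of each of the 8 bytes packed
/// into `value`. Consecutive bytes are rotated across the `NUM_SLICES`
/// interleaved histograms, so a run of identical bytes is spread over
/// several counters instead of bumping one counter 8 times in a row.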
#[inline(always)]
unsafe fn sum8(current_ptr: *mut u32, mut value: u64, increment: u32) {
for index in 0..8 {
let byte = (value & 0xFF) as usize;
let slice_offset = (index % NUM_SLICES) * SLICE_SIZE_U32S;
let write_ptr = current_ptr.add(slice_offset + byte);
let current = (write_ptr as *const u32).read_unaligned();
(write_ptr).write_unaligned(current + increment);
value >>= 8;
}
}
```
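As an aside, the all-bytes-equal trick in the inner loop is easy to sanity-check in isolation (a standalone sketch, not from the repo):

```rust
/// True if all 8 bytes of `v` hold the same value: `v << 8` lines each
/// byte up against its predecessor, so the XOR only keeps byte 0 (< 256)
/// when every adjacent pair matches.
fn all_bytes_equal(v: u64) -> bool {
    ((v << 8) ^ v) < 256
}

fn main() {
    assert!(all_bytes_equal(0x4141_4141_4141_4141));
    assert!(all_bytes_equal(0)); // all-zero bytes count too
    assert!(!all_bytes_equal(0x4141_4141_4141_4142)); // one byte differs
}
```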
Or if you'd prefer the repo and commit, here.
`cargo asm --release --lib histogram_nonaliased_withruns_core`

Will need to add `#[no_mangle]` as usual (see the sketch below).
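That is, something like the following on the function being inspected (shown on a hypothetical stand-in here):

```rust
// Illustration only; in the repo the attribute goes on
// `histogram_nonaliased_withruns_core` itself.
#[no_mangle] // emit an unmangled, easy-to-find symbol
pub fn example_under_inspection(data: &[u8]) -> usize {
    data.len()
}
```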
Building with LTO enabled for release in `Cargo.toml` gives:

```assembly
.section .text.lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core,"ax",@progbits
.globl lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core
.p2align 4, 0x90
.type lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core,@function
lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core:
.cfi_startproc
push r15
.cfi_def_cfa_offset 16
push r14
.cfi_def_cfa_offset 24
push r12
.cfi_def_cfa_offset 32
push rbx
.cfi_def_cfa_offset 40
sub rsp, 4096
.cfi_adjust_cfa_offset 4096
mov qword ptr [rsp], 0
sub rsp, 1032
.cfi_def_cfa_offset 5168
.cfi_offset rbx, -40
.cfi_offset r12, -32
.cfi_offset r14, -24
.cfi_offset r15, -16
mov r15, rdx
mov r14, rsi
mov rbx, rdi
lea rdi, [rsp + 3080]
mov r12, qword ptr [rip + memset@GOTPCREL]
mov edx, 1024
xor esi, esi
call r12
lea rdi, [rsp + 8]
mov edx, 3072
xor esi, esi
call r12
lea rax, [r14 + r15]
cmp r15, 24
jbe .LBB1_1
lea rcx, [rax - 24]
cmp rcx, r14
jbe .LBB1_1
jmp .LBB1_6
.p2align 4, 0x90
.LBB1_7:
movzx edi, dl
inc dword ptr [rsp + 4*rdi + 8]
mov edi, edx
shr edi, 6
and edi, 1020
inc dword ptr [rsp + rdi + 1032]
mov edi, edx
shr edi, 14
and edi, 1020
inc dword ptr [rsp + rdi + 2056]
mov edi, edx
shr edi, 22
and edi, -4
inc dword ptr [rsp + rdi + 3080]
mov rdi, rdx
shr rdi, 32
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 8]
mov rdi, rdx
shr rdi, 40
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 1032]
mov rdi, rdx
shr rdi, 48
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 2056]
shr rdx, 56
inc dword ptr [rsp + 4*rdx + 3080]
movzx edx, sil
inc dword ptr [rsp + 4*rdx + 8]
mov edx, esi
shr edx, 6
and edx, 1020
inc dword ptr [rsp + rdx + 1032]
mov edx, esi
shr edx, 14
and edx, 1020
inc dword ptr [rsp + rdx + 2056]
mov edx, esi
shr edx, 22
and edx, -4
inc dword ptr [rsp + rdx + 3080]
mov rdx, rsi
shr rdx, 32
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 8]
mov rdx, rsi
shr rdx, 40
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 1032]
mov rdx, rsi
shr rdx, 48
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 2056]
shr rsi, 56
inc dword ptr [rsp + 4*rsi + 3080]
.LBB1_11:
add r14, 16
cmp r14, rcx
jae .LBB1_1
.LBB1_6:
mov rdx, qword ptr [r14]
mov rsi, qword ptr [r14 + 8]
cmp rdx, rsi
jne .LBB1_7
mov rdi, rdx
shl rdi, 8
xor rdi, rdx
movzx esi, dl
cmp rdi, 256
jae .LBB1_10
add dword ptr [rsp + 4*rsi + 8], 16
jmp .LBB1_11
.p2align 4, 0x90
.LBB1_10:
add dword ptr [rsp + 4*rsi + 8], 2
mov esi, edx
shr esi, 6
and esi, 1020
add dword ptr [rsp + rsi + 1032], 2
mov esi, edx
shr esi, 14
and esi, 1020
add dword ptr [rsp + rsi + 2056], 2
mov esi, edx
shr esi, 22
and esi, -4
add dword ptr [rsp + rsi + 3080], 2
mov rsi, rdx
shr rsi, 32
movzx esi, sil
add dword ptr [rsp + 4*rsi + 8], 2
mov rsi, rdx
shr rsi, 40
movzx esi, sil
add dword ptr [rsp + 4*rsi + 1032], 2
mov rsi, rdx
shr rsi, 48
movzx esi, sil
add dword ptr [rsp + 4*rsi + 2056], 2
shr rdx, 56
add dword ptr [rsp + 4*rdx + 3080], 2
jmp .LBB1_11
.p2align 4, 0x90
.LBB1_8:
movzx ecx, byte ptr [r14]
inc dword ptr [rsp + 4*rcx + 8]
inc r14
.LBB1_1:
cmp r14, rax
jb .LBB1_8
lea rdi, [rsp + 4104]
lea rsi, [rsp + 8]
mov edx, 1024
call qword ptr [rip + memcpy@GOTPCREL]
xor eax, eax
.p2align 4, 0x90
.LBB1_3:
mov ecx, dword ptr [rsp + rax + 1036]
add ecx, dword ptr [rsp + rax + 12]
mov edx, dword ptr [rsp + rax + 1040]
add edx, dword ptr [rsp + rax + 16]
mov esi, dword ptr [rsp + rax + 1032]
mov edi, dword ptr [rsp + rax + 1044]
add edi, dword ptr [rsp + rax + 20]
add ecx, dword ptr [rsp + rax + 2060]
add edx, dword ptr [rsp + rax + 2064]
add edi, dword ptr [rsp + rax + 2068]
add ecx, dword ptr [rsp + rax + 3084]
add edx, dword ptr [rsp + rax + 3088]
add edi, dword ptr [rsp + rax + 3092]
add esi, dword ptr [rsp + rax + 8]
add esi, dword ptr [rsp + rax + 2056]
add esi, dword ptr [rsp + rax + 3080]
mov dword ptr [rsp + rax + 4104], esi
mov dword ptr [rsp + rax + 4108], ecx
mov dword ptr [rsp + rax + 4112], edx
mov dword ptr [rsp + rax + 4116], edi
add rax, 16
cmp rax, 1024
jne .LBB1_3
lea rsi, [rsp + 4104]
mov edx, 1024
mov rdi, rbx
call qword ptr [rip + memcpy@GOTPCREL]
mov rax, rbx
add rsp, 5128
.cfi_def_cfa_offset 40
pop rbx
.cfi_def_cfa_offset 32
pop r12
.cfi_def_cfa_offset 24
pop r14
.cfi_def_cfa_offset 16
pop r15
.cfi_def_cfa_offset 8
ret
```
If you instead build a `bench` or `bin`, or disable LTO for the lib:
```assembly
.section .text.lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core,"ax",@progbits
.globl lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core
.p2align 4, 0x90
.type lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core,@function
lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core:
.cfi_startproc
push r15
.cfi_def_cfa_offset 16
push r14
.cfi_def_cfa_offset 24
push r12
.cfi_def_cfa_offset 32
push rbx
.cfi_def_cfa_offset 40
sub rsp, 4096
.cfi_adjust_cfa_offset 4096
mov qword ptr [rsp], 0
sub rsp, 1032
.cfi_def_cfa_offset 5168
.cfi_offset rbx, -40
.cfi_offset r12, -32
.cfi_offset r14, -24
.cfi_offset r15, -16
mov r15, rdx
mov r14, rsi
mov rbx, rdi
lea rdi, [rsp + 3080]
mov r12, qword ptr [rip + memset@GOTPCREL]
mov edx, 1024
xor esi, esi
call r12
lea rdi, [rsp + 8]
mov edx, 3072
xor esi, esi
call r12
lea rax, [r14 + r15]
cmp r15, 24
jbe .LBB1_1
lea rcx, [rax - 24]
cmp rcx, r14
ja .LBB1_9
.LBB1_1:
mov rcx, r14
sub rcx, rax
jae .LBB1_5
mov edx, eax
sub edx, r14d
and edx, 3
je .LBB1_4
.p2align 4, 0x90
.LBB1_3:
movzx esi, byte ptr [r14]
inc dword ptr [rsp + 4*rsi + 8]
inc r14
dec rdx
jne .LBB1_3
.LBB1_4:
cmp rcx, -4
ja .LBB1_5
.p2align 4, 0x90
.LBB1_11:
movzx ecx, byte ptr [r14]
inc dword ptr [rsp + 4*rcx + 8]
movzx ecx, byte ptr [r14 + 1]
inc dword ptr [rsp + 4*rcx + 8]
movzx ecx, byte ptr [r14 + 2]
inc dword ptr [rsp + 4*rcx + 8]
movzx ecx, byte ptr [r14 + 3]
inc dword ptr [rsp + 4*rcx + 8]
add r14, 4
cmp r14, rax
jb .LBB1_11
.LBB1_5:
lea rdi, [rsp + 4104]
lea rsi, [rsp + 8]
mov edx, 1024
call qword ptr [rip + memcpy@GOTPCREL]
xor eax, eax
.p2align 4, 0x90
.LBB1_6:
movdqu xmm0, xmmword ptr [rsp + rax + 3080]
movdqu xmm1, xmmword ptr [rsp + rax + 2056]
paddd xmm1, xmm0
movdqu xmm0, xmmword ptr [rsp + rax + 1032]
movdqu xmm2, xmmword ptr [rsp + rax + 8]
paddd xmm2, xmm0
paddd xmm2, xmm1
movdqu xmm0, xmmword ptr [rsp + rax + 24]
movdqu xmmword ptr [rsp + rax + 4104], xmm2
movdqu xmm1, xmmword ptr [rsp + rax + 3096]
movdqu xmm2, xmmword ptr [rsp + rax + 2072]
paddd xmm2, xmm1
movdqu xmm1, xmmword ptr [rsp + rax + 1048]
paddd xmm1, xmm0
paddd xmm1, xmm2
movdqu xmmword ptr [rsp + rax + 4120], xmm1
add rax, 32
cmp rax, 1024
jne .LBB1_6
lea rsi, [rsp + 4104]
mov edx, 1024
mov rdi, rbx
call qword ptr [rip + memcpy@GOTPCREL]
mov rax, rbx
add rsp, 5128
.cfi_def_cfa_offset 40
pop rbx
.cfi_def_cfa_offset 32
pop r12
.cfi_def_cfa_offset 24
pop r14
.cfi_def_cfa_offset 16
pop r15
.cfi_def_cfa_offset 8
ret
.p2align 4, 0x90
.LBB1_10:
.cfi_def_cfa_offset 5168
movzx edi, dl
inc dword ptr [rsp + 4*rdi + 8]
mov edi, edx
shr edi, 6
and edi, 1020
inc dword ptr [rsp + rdi + 1032]
mov edi, edx
shr edi, 14
and edi, 1020
inc dword ptr [rsp + rdi + 2056]
mov edi, edx
shr edi, 22
and edi, -4
inc dword ptr [rsp + rdi + 3080]
mov rdi, rdx
shr rdi, 32
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 8]
mov rdi, rdx
shr rdi, 40
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 1032]
mov rdi, rdx
shr rdi, 48
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 2056]
shr rdx, 56
inc dword ptr [rsp + 4*rdx + 3080]
movzx edx, sil
inc dword ptr [rsp + 4*rdx + 8]
mov edx, esi
shr edx, 6
and edx, 1020
inc dword ptr [rsp + rdx + 1032]
mov edx, esi
shr edx, 14
and edx, 1020
inc dword ptr [rsp + rdx + 2056]
mov edx, esi
shr edx, 22
and edx, -4
inc dword ptr [rsp + rdx + 3080]
mov rdx, rsi
shr rdx, 32
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 8]
mov rdx, rsi
shr rdx, 40
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 1032]
mov rdx, rsi
shr rdx, 48
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 2056]
shr rsi, 56
inc dword ptr [rsp + 4*rsi + 3080]
.LBB1_14:
add r14, 16
cmp r14, rcx
jae .LBB1_1
.LBB1_9:
mov rdx, qword ptr [r14]
mov rsi, qword ptr [r14 + 8]
cmp rdx, rsi
jne .LBB1_10
mov r8, rdx
shl r8, 8
xor r8, rdx
movzx esi, dl
mov edi, esi
mov edi, dword ptr [rsp + 4*rdi + 8]
cmp r8, 256
jae .LBB1_13
add edi, 16
mov dword ptr [rsp + 4*rsi + 8], edi
jmp .LBB1_14
.p2align 4, 0x90
.LBB1_13:
add edi, 2
mov dword ptr [rsp + 4*rsi + 8], edi
mov esi, edx
shr esi, 6
and esi, 1020
add dword ptr [rsp + rsi + 1032], 2
mov esi, edx
shr esi, 14
and esi, 1020
add dword ptr [rsp + rsi + 2056], 2
mov esi, edx
shr esi, 22
and esi, -4
add dword ptr [rsp + rsi + 3080], 2
mov rsi, rdx
shr rsi, 32
movzx esi, sil
add dword ptr [rsp + 4*rsi + 8], 2
mov rsi, rdx
shr rsi, 40
movzx esi, sil
add dword ptr [rsp + 4*rsi + 1032], 2
mov rsi, rdx
shr rsi, 48
movzx esi, sil
add dword ptr [rsp + 4*rsi + 2056], 2
shr rdx, 56
add dword ptr [rsp + 4*rdx + 3080], 2
jmp .LBB1_14
```

Apologies for the long assembly; it's the best example I can think of off the top of my head. (The key difference is the final summation loop: the bin/non-LTO version vectorizes it with `paddd`, while the LTO lib version sums scalar dwords.)
The code respects `opt-level`, but certain optimisations are missed; typically auto-vectorization, going by the one or two times I've run into this issue. Host is Linux x86-64, but neither OS nor `target-cpu` seems to have any impact here.
This isn't a help request or anything of the sort; I was just wondering if this behaviour is worth documenting somewhere.
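For reference, the bin comparison doesn't need anything fancy; any bin target that actually calls the function works (a rough sketch, assuming the module path visible in the symbol names above):

```rust
// e.g. src/bin/asm_probe.rs — hypothetical probe target
use lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core;

fn main() {
    // black_box keeps the call from being const-folded away.
    let data = std::hint::black_box(vec![0u8; 4096]);
    let hist = histogram_nonaliased_withruns_core(&data);
    println!("{}", std::hint::black_box(hist.inner.counter[0]));
}
```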