Description
An example on Compiler Explorer uses nightly-only, unsafe intrinsics to assert the alignment of some arrays:
#![feature(core_intrinsics)]
// Requires nightly Rust
// Compile with -O
pub fn max_array(x: &mut [f64; 65536], y: &[f64; 65536]) {
    unsafe {
        std::intrinsics::assume(x.as_ptr() as usize % 64 == 0);
        std::intrinsics::assume(y.as_ptr() as usize % 64 == 0);
    }
    for i in 0..65536 {
        x[i] = if y[i] > x[i] { y[i] } else { x[i] };
    }
}
The code optimizes to more efficient vector operations when the compiler can assume that the input arrays have the given alignment.
With std::intrinsics::assume:
example::max_array:
xor eax, eax
.LBB0_1:
movapd xmm0, xmmword ptr [rsi + 8*rax]
movapd xmm1, xmmword ptr [rsi + 8*rax + 16]
maxpd xmm0, xmmword ptr [rdi + 8*rax]
maxpd xmm1, xmmword ptr [rdi + 8*rax + 16]
movapd xmmword ptr [rdi + 8*rax], xmm0
movapd xmmword ptr [rdi + 8*rax + 16], xmm1
movapd xmm0, xmmword ptr [rsi + 8*rax + 32]
movapd xmm1, xmmword ptr [rsi + 8*rax + 48]
maxpd xmm0, xmmword ptr [rdi + 8*rax + 32]
maxpd xmm1, xmmword ptr [rdi + 8*rax + 48]
movapd xmmword ptr [rdi + 8*rax + 32], xmm0
movapd xmmword ptr [rdi + 8*rax + 48], xmm1
add rax, 8
cmp rax, 65536
jne .LBB0_1
ret
Without std::intrinsics::assume, it uses instructions that don't assume alignment and are presumably slower, as well as requiring more loads into registers:
example::max_array:
xor eax, eax
.LBB0_1:
movupd xmm0, xmmword ptr [rsi + 8*rax]
movupd xmm1, xmmword ptr [rsi + 8*rax + 16]
movupd xmm2, xmmword ptr [rdi + 8*rax]
maxpd xmm0, xmm2
movupd xmm2, xmmword ptr [rdi + 8*rax + 16]
maxpd xmm1, xmm2
movupd xmm2, xmmword ptr [rdi + 8*rax + 32]
movupd xmm3, xmmword ptr [rdi + 8*rax + 48]
movupd xmmword ptr [rdi + 8*rax], xmm0
movupd xmmword ptr [rdi + 8*rax + 16], xmm1
movupd xmm0, xmmword ptr [rsi + 8*rax + 32]
maxpd xmm0, xmm2
movupd xmm1, xmmword ptr [rsi + 8*rax + 48]
maxpd xmm1, xmm3
movupd xmmword ptr [rdi + 8*rax + 32], xmm0
movupd xmmword ptr [rdi + 8*rax + 48], xmm1
add rax, 8
cmp rax, 65536
jne .LBB0_1
ret
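As an aside, std::intrinsics::assume is undefined behavior if the condition is false. A safe alternative is a plain assert!, which checks the condition at runtime and, in the non-panicking path, may give the optimizer the same alignment fact; this is just a sketch (the function name is mine) and I haven't compared its output on the same compiler:

// Hypothetical checked variant: panics if the arrays are not
// 64-byte aligned rather than invoking UB. Whether LLVM recovers
// the same alignment information from the check is untested.
pub fn max_array_checked(x: &mut [f64; 65536], y: &[f64; 65536]) {
    assert!(x.as_ptr() as usize % 64 == 0);
    assert!(y.as_ptr() as usize % 64 == 0);
    for i in 0..65536 {
        x[i] = if y[i] > x[i] { y[i] } else { x[i] };
    }
}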
Since there is support for #[repr(align(64))] now, I thought it would be possible to do this safely and without any feature flags by adding that attribute to a wrapper struct, and sure enough, we can get the same optimization with the following (result not shown since it's identical):
#[repr(align(64))]
pub struct AlignedArray([f64; 65536]);

pub fn max_array(x: &mut AlignedArray, y: &AlignedArray) {
    for i in 0..65536 {
        x.0[i] = if y.0[i] > x.0[i] { y.0[i] } else { x.0[i] };
    }
}
However, that has the slight downside of needing to use .0 instead of indexing the wrapper struct directly. We can fix that by implementing Index and IndexMut, which I would expect to be a zero-cost abstraction. However, even though the code is still inlined and vectorized with this approach, it loses the benefit of knowing the alignment and compiles to the same code as the version with no alignment information (shown above):
use std::ops::{Index, IndexMut};

#[repr(align(64))]
pub struct AlignedArray([f64; 65536]);

impl Index<usize> for AlignedArray {
    type Output = f64;
    #[inline]
    fn index(&self, i: usize) -> &f64 {
        &self.0[i]
    }
}

impl IndexMut<usize> for AlignedArray {
    #[inline]
    fn index_mut(&mut self, i: usize) -> &mut f64 {
        &mut self.0[i]
    }
}

pub fn max_array(x: &mut AlignedArray, y: &AlignedArray) {
    for i in 0..65536 {
        x[i] = if y[i] > x[i] { y[i] } else { x[i] };
    }
}
Try it yourself on Compiler Explorer.
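Incidentally, another way to get plain indexing on the wrapper would be to implement Deref/DerefMut to the inner array, since indexing expressions auto-deref until they find an Index impl. I haven't checked whether this path keeps the alignment information either, so this is just a sketch:

use std::ops::{Deref, DerefMut};

#[repr(align(64))]
pub struct AlignedArray([f64; 65536]);

impl Deref for AlignedArray {
    type Target = [f64; 65536];
    #[inline]
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl DerefMut for AlignedArray {
    #[inline]
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}

// With these impls, x[i] and x[i] = ... work directly on the wrapper
// because indexing auto-derefs to the inner array. Whether the
// alignment optimization survives this route is an open question.
pub fn max_array(x: &mut AlignedArray, y: &AlignedArray) {
    for i in 0..65536 {
        x[i] = if y[i] > x[i] { y[i] } else { x[i] };
    }
}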
According to Compiler Explorer, this is all using rustc 1.31.0-nightly (96cafc5 2018-10-09). Also worth noting: the ability to produce the alignment-optimized version using the wrapper struct with #[repr(align(64))] only shows up in the nightly compiler; the beta and stable compilers give the unaligned version even without the use of Index/IndexMut.