Description
I tried this code:
/// XORs every byte of `data` in place with a byte of `key`.
///
/// The key is applied cyclically: byte `i` of `data` is combined with
/// `key[i % 4096]`, so inputs longer than 4096 bytes reuse the key from
/// its start.
// `#[inline(never)]` asks the compiler to keep this function out of line so
// that its generated code can be inspected as a standalone symbol.
#[inline(never)]
fn decrypt(data: &mut [u8], key: [u8; 4096]) {
for i in 0..data.len() {
// The modulo keeps the key index within the 4096-element array.
data[i] ^= key[i % 4096];
}
}
/// Reproducer driver: builds a 128 MiB buffer of `1` bytes, XORs it with a
/// key of `2` bytes via `decrypt`, and spot-checks one byte of the result.
fn main() {
// Generate 128 MiB of data
let mut data = vec![1; 128 * 1024 * 1024];
// 4096-element key with every byte set to 2.
let key = [2; 4096];
decrypt(&mut data, key);
// 1 ^ 2 == 3, so any byte of the buffer should now be 3.
assert_eq!(data[1234567], 3);
}
I expected to see this happen: the loop gets vectorized, and spends all of its time in the vectorized path, working through 32 bytes per iteration instead of one.
Instead, this happened: the loop did get vectorized, but the initial check of the size seems to get miscompiled (tested on both amd64 and AArch64) into jumping right into the non-SIMD path.
Meta
`rustc --version --verbose`:
rustc 1.69.0-nightly (9a7cc6c32 2023-02-16)
binary: rustc
commit-hash: 9a7cc6c32f1a690f86827e4724bcda85e506ef35
commit-date: 2023-02-16
host: x86_64-unknown-linux-gnu
release: 1.69.0-nightly
LLVM version: 15.0.7
Also tested on:
rustc 1.67.1 (d5a82bbd2 2023-02-07) (Arch Linux rust 1:1.67.1-1)
binary: rustc
commit-hash: d5a82bbd26e1ad8b7401f6a718a9c57c96905483
commit-date: 2023-02-07
host: aarch64-unknown-linux-gnu
release: 1.67.1
LLVM version: 15.0.7
Generated AArch64 assembly
0000000000006634 <test::decrypt>:
6634: b4000621 cbz x1, 66f8 <test::decrypt+0xc4>
6638: 92820008 mov x8, #0xffffffffffffefff // #-4097
663c: 8b080028 add x8, x1, x8
6640: b13fe51f cmn x8, #0xff9
6644: 54000062 b.cs 6650 <test::decrypt+0x1c> // b.hs, b.nlast
6648: aa1f03e8 mov x8, xzr
664c: 14000022 b 66d4 <test::decrypt+0xa0>
6650: f100803f cmp x1, #0x20
6654: 54000062 b.cs 6660 <test::decrypt+0x2c> // b.hs, b.nlast
6658: aa1f03e8 mov x8, xzr
665c: 14000012 b 66a4 <test::decrypt+0x70>
6660: 927be828 and x8, x1, #0xffffffffffffffe0
6664: 91004049 add x9, x2, #0x10
6668: 9100400a add x10, x0, #0x10
666c: aa0803eb mov x11, x8
6670: ad7f8520 ldp q0, q1, [x9, #-16]
6674: 91008129 add x9, x9, #0x20
6678: f100816b subs x11, x11, #0x20
667c: ad7f8d42 ldp q2, q3, [x10, #-16]
6680: 6e201c40 eor v0.16b, v2.16b, v0.16b
6684: 6e211c61 eor v1.16b, v3.16b, v1.16b
6688: ad3f8540 stp q0, q1, [x10, #-16]
668c: 9100814a add x10, x10, #0x20
6690: 54ffff01 b.ne 6670 <test::decrypt+0x3c> // b.any
6694: eb01011f cmp x8, x1
6698: 54000300 b.eq 66f8 <test::decrypt+0xc4> // b.none
669c: f27d043f tst x1, #0x18
66a0: 540001a0 b.eq 66d4 <test::decrypt+0xa0> // b.none
66a4: aa0803e9 mov x9, x8
66a8: 927df028 and x8, x1, #0xfffffffffffffff8
66ac: 927d212a and x10, x9, #0xff8
66b0: fc696800 ldr d0, [x0, x9]
66b4: fc6a6841 ldr d1, [x2, x10]
66b8: 2e211c00 eor v0.8b, v0.8b, v1.8b
66bc: fc296800 str d0, [x0, x9]
66c0: 91002129 add x9, x9, #0x8
66c4: eb09011f cmp x8, x9
66c8: 54ffff21 b.ne 66ac <test::decrypt+0x78> // b.any
66cc: eb01011f cmp x8, x1
66d0: 54000140 b.eq 66f8 <test::decrypt+0xc4> // b.none
66d4: 92402d09 and x9, x8, #0xfff
66d8: 3868680a ldrb w10, [x0, x8]
66dc: 9100050b add x11, x8, #0x1
66e0: eb0b003f cmp x1, x11
66e4: 38696849 ldrb w9, [x2, x9]
66e8: 4a090149 eor w9, w10, w9
66ec: 38286809 strb w9, [x0, x8]
66f0: aa0b03e8 mov x8, x11
66f4: 54ffff01 b.ne 66d4 <test::decrypt+0xa0> // b.any
66f8: d65f03c0 ret
Edit: without the `#[inline(never)]`, the loop doesn’t get vectorized at all.
Edit 2: here is the result in godbolt: https://godbolt.org/z/9TcqvPhz3
I’ve tested both with `-C target-cpu=native` on a Kaby Lake and without it; in both cases some SIMD code is emitted, but it is always jumped over in favor of the path that should only be used for the tail of the data.