Skip to content

SIMD path never taken in simple vectorized loop #108187

Open
@linkmauve

Description

@linkmauve

I tried this code:

/// XORs every byte of `data` in place with the repeating 4096-byte `key`.
///
/// Byte `i` of `data` is combined with `key[i % 4096]`, so the key wraps
/// around every 4096 bytes when `data` is longer than the key.
// `inline(never)` is part of the reproducer: the report notes that without
// it the loop is not vectorized at all.
#[inline(never)]
fn decrypt(data: &mut [u8], key: [u8; 4096]) {
    for i in 0..data.len() {
        // The modulo keeps the key index inside 0..4096 for any `i`.
        data[i] ^= key[i % 4096];
    }
}

/// Reproducer driver: runs `decrypt` over a large buffer and spot-checks one byte.
fn main() {
    // Generate 128 MiB of data, every byte set to 1.
    let mut data = vec![1; 128 * 1024 * 1024];
    // Key with every byte set to 2.
    let key = [2; 4096];
    decrypt(&mut data, key);
    // 1 ^ 2 == 3, so a byte that went through `decrypt` must now be 3.
    assert_eq!(data[1234567], 3);
}

I expected to see this happen: the loop gets vectorized, and spends all of its time in the vectorized path, working through 32 bytes per iteration instead of one.

Instead, this happened: the loop did get vectorized, but the initial check of the size seems to get miscompiled (tested on both amd64 and AArch64) into jumping right into the non-SIMD path.

Meta

rustc --version --verbose:

rustc 1.69.0-nightly (9a7cc6c32 2023-02-16)
binary: rustc
commit-hash: 9a7cc6c32f1a690f86827e4724bcda85e506ef35
commit-date: 2023-02-16
host: x86_64-unknown-linux-gnu
release: 1.69.0-nightly
LLVM version: 15.0.7

Also tested on:

rustc 1.67.1 (d5a82bbd2 2023-02-07) (Arch Linux rust 1:1.67.1-1)
binary: rustc
commit-hash: d5a82bbd26e1ad8b7401f6a718a9c57c96905483
commit-date: 2023-02-07
host: aarch64-unknown-linux-gnu
release: 1.67.1
LLVM version: 15.0.7
Generated AArch64 assembly

0000000000006634 <test::decrypt>:
    6634:	b4000621 	cbz	x1, 66f8 <test::decrypt+0xc4>
    6638:	92820008 	mov	x8, #0xffffffffffffefff    	// #-4097
    663c:	8b080028 	add	x8, x1, x8
    6640:	b13fe51f 	cmn	x8, #0xff9
    6644:	54000062 	b.cs	6650 <test::decrypt+0x1c>  // b.hs, b.nlast
    6648:	aa1f03e8 	mov	x8, xzr
    664c:	14000022 	b	66d4 <test::decrypt+0xa0>
    6650:	f100803f 	cmp	x1, #0x20
    6654:	54000062 	b.cs	6660 <test::decrypt+0x2c>  // b.hs, b.nlast
    6658:	aa1f03e8 	mov	x8, xzr
    665c:	14000012 	b	66a4 <test::decrypt+0x70>
    6660:	927be828 	and	x8, x1, #0xffffffffffffffe0
    6664:	91004049 	add	x9, x2, #0x10
    6668:	9100400a 	add	x10, x0, #0x10
    666c:	aa0803eb 	mov	x11, x8
    6670:	ad7f8520 	ldp	q0, q1, [x9, #-16]
    6674:	91008129 	add	x9, x9, #0x20
    6678:	f100816b 	subs	x11, x11, #0x20
    667c:	ad7f8d42 	ldp	q2, q3, [x10, #-16]
    6680:	6e201c40 	eor	v0.16b, v2.16b, v0.16b
    6684:	6e211c61 	eor	v1.16b, v3.16b, v1.16b
    6688:	ad3f8540 	stp	q0, q1, [x10, #-16]
    668c:	9100814a 	add	x10, x10, #0x20
    6690:	54ffff01 	b.ne	6670 <test::decrypt+0x3c>  // b.any
    6694:	eb01011f 	cmp	x8, x1
    6698:	54000300 	b.eq	66f8 <test::decrypt+0xc4>  // b.none
    669c:	f27d043f 	tst	x1, #0x18
    66a0:	540001a0 	b.eq	66d4 <test::decrypt+0xa0>  // b.none
    66a4:	aa0803e9 	mov	x9, x8
    66a8:	927df028 	and	x8, x1, #0xfffffffffffffff8
    66ac:	927d212a 	and	x10, x9, #0xff8
    66b0:	fc696800 	ldr	d0, [x0, x9]
    66b4:	fc6a6841 	ldr	d1, [x2, x10]
    66b8:	2e211c00 	eor	v0.8b, v0.8b, v1.8b
    66bc:	fc296800 	str	d0, [x0, x9]
    66c0:	91002129 	add	x9, x9, #0x8
    66c4:	eb09011f 	cmp	x8, x9
    66c8:	54ffff21 	b.ne	66ac <test::decrypt+0x78>  // b.any
    66cc:	eb01011f 	cmp	x8, x1
    66d0:	54000140 	b.eq	66f8 <test::decrypt+0xc4>  // b.none
    66d4:	92402d09 	and	x9, x8, #0xfff
    66d8:	3868680a 	ldrb	w10, [x0, x8]
    66dc:	9100050b 	add	x11, x8, #0x1
    66e0:	eb0b003f 	cmp	x1, x11
    66e4:	38696849 	ldrb	w9, [x2, x9]
    66e8:	4a090149 	eor	w9, w10, w9
    66ec:	38286809 	strb	w9, [x0, x8]
    66f0:	aa0b03e8 	mov	x8, x11
    66f4:	54ffff01 	b.ne	66d4 <test::decrypt+0xa0>  // b.any
    66f8:	d65f03c0 	ret

Edit: without the #[inline(never)], the loop doesn’t even get vectorized at all.

Edit 2: here is the result in godbolt: https://godbolt.org/z/9TcqvPhz3

I’ve tested both with -C target-cpu=native on a Kaby Lake and without it; in both cases some SIMD code is emitted, but execution always jumps past it into the scalar loop that should only be used for the tail of the data.

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-LLVM — Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.
    A-SIMD — Area: SIMD (Single Instruction Multiple Data)
    A-autovectorization — Area: Autovectorization, which can impact perf or code size
    C-bug — Category: This is a bug.
    I-slow — Issue: Problems and improvements with respect to performance of generated code.
    T-compiler — Relevant to the compiler team, which will review and decide on the PR/issue.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions