Skip to content

Very bad SIMD code generation for simple Euler integration #145248

@SludgePhD

Description

@SludgePhD

I tried this code:

use std::ops::AddAssign;
use std::iter::zip;
use std::ops::Mul;

/// A fixed-size mathematical vector: `N` lanes of `T` stored as a plain
/// array. `Copy` is derived so values can be passed around by value.
#[derive(Clone, Copy)]
pub struct Vector<T, const N: usize>([T; N]);

impl<T: Mul<Output = T> + Copy, const N: usize> Mul<T> for Vector<T, N> {
    type Output = Self;

    /// Scales every lane by the scalar `other`, yielding a new vector.
    fn mul(self, other: T) -> Self {
        // `array::map` consumes the array and applies the closure lane-wise.
        let scaled = self.0.map(|lane| lane * other);
        Self(scaled)
    }
}
impl<T: AddAssign, const N: usize> AddAssign for Vector<T, N> {
    fn add_assign(&mut self, other: Self) {
        for (d, s) in zip(&mut self.0, other.0) {
            *d += s;
        }
    }
}

// Four `f32` lanes = 128 bits, i.e. exactly one SSE `xmm` register.
type Vec4f = Vector<f32, 4>;

// `no_mangle` keeps the symbol named `euler` so the generated assembly is
// easy to locate and inspect.
#[unsafe(no_mangle)]
pub fn euler(x: &mut [Vec4f], v: &mut [Vec4f], a: &mut [Vec4f], dt: f32) {
    // All three slices must agree in length; otherwise do nothing.
    if x.len() != v.len() || v.len() != a.len() {
        return;
    }

    // Semi-implicit Euler step: the velocity is updated first, and the
    // position update then reads the NEW velocity — this ordering is
    // load-bearing and must not be rearranged.
    for ((x, v), a) in zip(zip(x, v), a) {
        *v += *a * dt;
        *x += *v * dt;
    }
}

I expected to see this happen: When targeting x86-64, the generated code should load, process, and store one Vec4f at a time, using SSE (potentially unrolled).

The compiler does in fact generate such code, but it is only used to process the remainder of the slices, when there are fewer than 4 Vec4fs left:

.LBB0_7:                                # =>This Inner Loop Header: Depth=1
	movups	xmm1, xmmword ptr [r8 + rax]
	mulps	xmm1, xmm0
	movups	xmm2, xmmword ptr [rdx + rax]
	addps	xmm2, xmm1
	movups	xmmword ptr [rdx + rax], xmm2
	mulps	xmm2, xmm0
	movups	xmm1, xmmword ptr [rdi + rax]
	addps	xmm1, xmm2
	movups	xmmword ptr [rdi + rax], xmm1
	add	rax, 16
	dec	rsi
	jne	.LBB0_7

Instead, this happened: In a desperate attempt at unrolling the loop to process 4 Vec4fs at a time, the compiler generates code that loads each f32 individually and spends more time loading and shuffling lanes than actually doing useful math.

.LBB0_4:                                # =>This Inner Loop Header: Depth=1
	movss	xmm2, dword ptr [r8 + rcx + 48] # xmm2 = mem[0],zero,zero,zero
	movss	xmm3, dword ptr [r8 + rcx + 32] # xmm3 = mem[0],zero,zero,zero
	unpcklps	xmm3, xmm2                      # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
	movss	xmm4, dword ptr [r8 + rcx + 16] # xmm4 = mem[0],zero,zero,zero
	movss	xmm2, dword ptr [r8 + rcx]      # xmm2 = mem[0],zero,zero,zero
	unpcklps	xmm2, xmm4                      # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
	movlhps	xmm2, xmm3                      # xmm2 = xmm2[0],xmm3[0]
	movss	xmm7, dword ptr [r8 + rcx + 4]  # xmm7 = mem[0],zero,zero,zero
	movss	xmm6, dword ptr [r8 + rcx + 8]  # xmm6 = mem[0],zero,zero,zero
	movss	xmm5, dword ptr [r8 + rcx + 12] # xmm5 = mem[0],zero,zero,zero
	movss	xmm3, dword ptr [r8 + rcx + 52] # xmm3 = mem[0],zero,zero,zero
	movss	xmm4, dword ptr [r8 + rcx + 36] # xmm4 = mem[0],zero,zero,zero
	unpcklps	xmm4, xmm3                      # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
	movss	xmm3, dword ptr [r8 + rcx + 20] # xmm3 = mem[0],zero,zero,zero
	unpcklps	xmm7, xmm3                      # xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
	movlhps	xmm7, xmm4                      # xmm7 = xmm7[0],xmm4[0]
	movss	xmm3, dword ptr [r8 + rcx + 56] # xmm3 = mem[0],zero,zero,zero
	movss	xmm4, dword ptr [r8 + rcx + 40] # xmm4 = mem[0],zero,zero,zero
	unpcklps	xmm4, xmm3                      # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
	movss	xmm3, dword ptr [r8 + rcx + 24] # xmm3 = mem[0],zero,zero,zero
	unpcklps	xmm6, xmm3                      # xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
	movlhps	xmm6, xmm4                      # xmm6 = xmm6[0],xmm4[0]
	movss	xmm3, dword ptr [r8 + rcx + 60] # xmm3 = mem[0],zero,zero,zero
	movss	xmm4, dword ptr [r8 + rcx + 44] # xmm4 = mem[0],zero,zero,zero
	unpcklps	xmm4, xmm3                      # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
	movss	xmm3, dword ptr [r8 + rcx + 28] # xmm3 = mem[0],zero,zero,zero
	unpcklps	xmm5, xmm3                      # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
	movlhps	xmm5, xmm4                      # xmm5 = xmm5[0],xmm4[0]
    ; <MANY lines omitted...>

This also happens when adding #[repr(align(16))] to Vector, so it is not related to unaligned operations.

Meta

Reproduces on the playground, on both stable 1.89.0 and nightly 2025-08-10.

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-SIMD — Area: SIMD (Single Instruction Multiple Data)
    A-autovectorization — Area: Autovectorization, which can impact perf or code size
    C-bug — Category: This is a bug.
    I-slow — Issue: Problems and improvements with respect to performance of generated code.
    O-x86_64 — Target: x86-64 processors (like x86_64-*) (also known as amd64 and x64)
    T-compiler — Relevant to the compiler team, which will review and decide on the PR/issue.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions