-
Notifications
You must be signed in to change notification settings - Fork 14.1k
Open
Labels
Labels: A-SIMD (Area: SIMD — Single Instruction Multiple Data), A-autovectorization (Area: Autovectorization, which can impact perf or code size), C-bug (Category: This is a bug), I-slow (Issue: Problems and improvements with respect to performance of generated code), O-x86_64 (Target: x86-64 processors, like x86_64-*; also known as amd64 and x64), T-compiler (Relevant to the compiler team, which will review and decide on the PR/issue)
Description
I tried this code:
use std::ops::AddAssign;
use std::iter::zip;
use std::ops::Mul;
#[derive(Clone, Copy)]
pub struct Vector<T, const N: usize>([T; N]);
impl<T: Mul<Output = T> + Copy, const N: usize> Mul<T> for Vector<T, N> {
type Output = Self;
fn mul(self, other: T) -> Self {
Self(self.0.map(|s| s * other))
}
}
impl<T: AddAssign, const N: usize> AddAssign for Vector<T, N> {
fn add_assign(&mut self, other: Self) {
for (d, s) in zip(&mut self.0, other.0) {
*d += s;
}
}
}
type Vec4f = Vector<f32, 4>;
#[unsafe(no_mangle)]
pub fn euler(x: &mut [Vec4f], v: &mut [Vec4f], a: &mut [Vec4f], dt: f32) {
if x.len() != v.len() || v.len() != a.len() {
return;
}
for ((x, v), a) in zip(zip(x, v), a) {
*v += *a * dt;
*x += *v * dt;
}
}

I expected to see this happen: When targeting x86-64, the generated code should load, process, and store one Vec4f at a time, using SSE (potentially unrolled).
The compiler does in fact generate such code, but it is only used to process the remainder of the slices, when there are fewer than 4 Vec4fs left:
.LBB0_7: # =>This Inner Loop Header: Depth=1
movups xmm1, xmmword ptr [r8 + rax]
mulps xmm1, xmm0
movups xmm2, xmmword ptr [rdx + rax]
addps xmm2, xmm1
movups xmmword ptr [rdx + rax], xmm2
mulps xmm2, xmm0
movups xmm1, xmmword ptr [rdi + rax]
addps xmm1, xmm2
movups xmmword ptr [rdi + rax], xmm1
add rax, 16
dec rsi
jne .LBB0_7

Instead, this happened: In a desperate attempt at unrolling the loop to process 4 Vec4fs at a time, the compiler generates code that loads each f32 individually and spends more time loading and shuffling lanes than actually doing useful math.
.LBB0_4: # =>This Inner Loop Header: Depth=1
movss xmm2, dword ptr [r8 + rcx + 48] # xmm2 = mem[0],zero,zero,zero
movss xmm3, dword ptr [r8 + rcx + 32] # xmm3 = mem[0],zero,zero,zero
unpcklps xmm3, xmm2 # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
movss xmm4, dword ptr [r8 + rcx + 16] # xmm4 = mem[0],zero,zero,zero
movss xmm2, dword ptr [r8 + rcx] # xmm2 = mem[0],zero,zero,zero
unpcklps xmm2, xmm4 # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
movlhps xmm2, xmm3 # xmm2 = xmm2[0],xmm3[0]
movss xmm7, dword ptr [r8 + rcx + 4] # xmm7 = mem[0],zero,zero,zero
movss xmm6, dword ptr [r8 + rcx + 8] # xmm6 = mem[0],zero,zero,zero
movss xmm5, dword ptr [r8 + rcx + 12] # xmm5 = mem[0],zero,zero,zero
movss xmm3, dword ptr [r8 + rcx + 52] # xmm3 = mem[0],zero,zero,zero
movss xmm4, dword ptr [r8 + rcx + 36] # xmm4 = mem[0],zero,zero,zero
unpcklps xmm4, xmm3 # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
movss xmm3, dword ptr [r8 + rcx + 20] # xmm3 = mem[0],zero,zero,zero
unpcklps xmm7, xmm3 # xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
movlhps xmm7, xmm4 # xmm7 = xmm7[0],xmm4[0]
movss xmm3, dword ptr [r8 + rcx + 56] # xmm3 = mem[0],zero,zero,zero
movss xmm4, dword ptr [r8 + rcx + 40] # xmm4 = mem[0],zero,zero,zero
unpcklps xmm4, xmm3 # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
movss xmm3, dword ptr [r8 + rcx + 24] # xmm3 = mem[0],zero,zero,zero
unpcklps xmm6, xmm3 # xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
movlhps xmm6, xmm4 # xmm6 = xmm6[0],xmm4[0]
movss xmm3, dword ptr [r8 + rcx + 60] # xmm3 = mem[0],zero,zero,zero
movss xmm4, dword ptr [r8 + rcx + 44] # xmm4 = mem[0],zero,zero,zero
unpcklps xmm4, xmm3 # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
movss xmm3, dword ptr [r8 + rcx + 28] # xmm3 = mem[0],zero,zero,zero
unpcklps xmm5, xmm3 # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
movlhps xmm5, xmm4 # xmm5 = xmm5[0],xmm4[0]
; <MANY lines omitted...>

This also happens when adding #[repr(align(16))] to Vector, so it is not related to unaligned operations.
Meta
Reproduces on the playground, on both stable 1.89.0 and nightly 2025-08-10.
Metadata
Metadata
Assignees
Labels
Labels: A-SIMD (Area: SIMD — Single Instruction Multiple Data), A-autovectorization (Area: Autovectorization, which can impact perf or code size), C-bug (Category: This is a bug), I-slow (Issue: Problems and improvements with respect to performance of generated code), O-x86_64 (Target: x86-64 processors, like x86_64-*; also known as amd64 and x64), T-compiler (Relevant to the compiler team, which will review and decide on the PR/issue)