-
Notifications
You must be signed in to change notification settings - Fork 14.1k
Open
Labels
Labels: A-SIMD (Area: SIMD — Single Instruction Multiple Data), A-autovectorization (Area: Autovectorization, which can impact perf or code size), C-bug (Category: This is a bug), I-slow (Issue: Problems and improvements with respect to performance of generated code), O-x86_64 (Target: x86-64 processors, like x86_64-*; also known as amd64 and x64), T-compiler (Relevant to the compiler team, which will review and decide on the PR/issue)
Description
I tried this code:
use std::ops::AddAssign;
use std::iter::zip;
use std::ops::Mul;
#[derive(Clone, Copy)]
pub struct Vector<T, const N: usize>([T; N]);
impl<T: Mul<Output = T> + Copy, const N: usize> Mul<T> for Vector<T, N> {
type Output = Self;
fn mul(self, other: T) -> Self {
Self(self.0.map(|s| s * other))
}
}
impl<T: AddAssign, const N: usize> AddAssign for Vector<T, N> {
fn add_assign(&mut self, other: Self) {
for (d, s) in zip(&mut self.0, other.0) {
*d += s;
}
}
}
type Vec4f = Vector<f32, 4>;
#[unsafe(no_mangle)]
pub fn euler(x: &mut [Vec4f], v: &mut [Vec4f], a: &mut [Vec4f], dt: f32) {
if x.len() != v.len() || v.len() != a.len() {
return;
}
for ((x, v), a) in zip(zip(x, v), a) {
*v += *a * dt;
*x += *v * dt;
}
}

I expected to see this happen: When targeting x86-64, the generated code should load, process, and store one Vec4f at a time, using SSE (potentially unrolled).
The compiler does in fact generate such code, but it is only used to process the remainder of the slices, when there are fewer than 4 Vec4fs left:
.LBB0_7: # =>This Inner Loop Header: Depth=1
movups xmm1, xmmword ptr [r8 + rax]
mulps xmm1, xmm0
movups xmm2, xmmword ptr [rdx + rax]
addps xmm2, xmm1
movups xmmword ptr [rdx + rax], xmm2
mulps xmm2, xmm0
movups xmm1, xmmword ptr [rdi + rax]
addps xmm1, xmm2
movups xmmword ptr [rdi + rax], xmm1
add rax, 16
dec rsi
jne .LBB0_7

Instead, this happened: In a desperate attempt at unrolling the loop to process 4 Vec4fs at a time, the compiler generates code that loads each f32 individually and spends more time loading and shuffling lanes than actually doing useful math.
.LBB0_4: # =>This Inner Loop Header: Depth=1
movss xmm2, dword ptr [r8 + rcx + 48] # xmm2 = mem[0],zero,zero,zero
movss xmm3, dword ptr [r8 + rcx + 32] # xmm3 = mem[0],zero,zero,zero
unpcklps xmm3, xmm2 # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
movss xmm4, dword ptr [r8 + rcx + 16] # xmm4 = mem[0],zero,zero,zero
movss xmm2, dword ptr [r8 + rcx] # xmm2 = mem[0],zero,zero,zero
unpcklps xmm2, xmm4 # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
movlhps xmm2, xmm3 # xmm2 = xmm2[0],xmm3[0]
movss xmm7, dword ptr [r8 + rcx + 4] # xmm7 = mem[0],zero,zero,zero
movss xmm6, dword ptr [r8 + rcx + 8] # xmm6 = mem[0],zero,zero,zero
movss xmm5, dword ptr [r8 + rcx + 12] # xmm5 = mem[0],zero,zero,zero
movss xmm3, dword ptr [r8 + rcx + 52] # xmm3 = mem[0],zero,zero,zero
movss xmm4, dword ptr [r8 + rcx + 36] # xmm4 = mem[0],zero,zero,zero
unpcklps xmm4, xmm3 # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
movss xmm3, dword ptr [r8 + rcx + 20] # xmm3 = mem[0],zero,zero,zero
unpcklps xmm7, xmm3 # xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
movlhps xmm7, xmm4 # xmm7 = xmm7[0],xmm4[0]
movss xmm3, dword ptr [r8 + rcx + 56] # xmm3 = mem[0],zero,zero,zero
movss xmm4, dword ptr [r8 + rcx + 40] # xmm4 = mem[0],zero,zero,zero
unpcklps xmm4, xmm3 # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
movss xmm3, dword ptr [r8 + rcx + 24] # xmm3 = mem[0],zero,zero,zero
unpcklps xmm6, xmm3 # xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
movlhps xmm6, xmm4 # xmm6 = xmm6[0],xmm4[0]
movss xmm3, dword ptr [r8 + rcx + 60] # xmm3 = mem[0],zero,zero,zero
movss xmm4, dword ptr [r8 + rcx + 44] # xmm4 = mem[0],zero,zero,zero
unpcklps xmm4, xmm3 # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
movss xmm3, dword ptr [r8 + rcx + 28] # xmm3 = mem[0],zero,zero,zero
unpcklps xmm5, xmm3 # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
movlhps xmm5, xmm4 # xmm5 = xmm5[0],xmm4[0]
; <MANY lines omitted...>

This also happens when adding #[repr(align(16))] to Vector, so it is not related to unaligned operations.
Meta
Reproduces on the playground, on both stable 1.89.0 and nightly 2025-08-10.
Metadata
Metadata
Assignees
Labels
Labels: A-SIMD (Area: SIMD — Single Instruction Multiple Data), A-autovectorization (Area: Autovectorization, which can impact perf or code size), C-bug (Category: This is a bug), I-slow (Issue: Problems and improvements with respect to performance of generated code), O-x86_64 (Target: x86-64 processors, like x86_64-*; also known as amd64 and x64), T-compiler (Relevant to the compiler team, which will review and decide on the PR/issue)