Open
Description
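Calling a closure more than once inside a `#[target_feature]` function leaves poorly optimized assembly in its wake: the intrinsics used by the closure are emitted as non-inlined calls.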
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
/// Creates non-inlined calls to intrinsics (the problematic case of this report).
///
/// Adds up the eight `f32` lanes of `input[0]`, does the same for `input[1]`,
/// and returns the two totals added together. The lane reduction is written as
/// a closure that is invoked twice, which is the pattern the report is about.
///
/// # Safety
///
/// The caller must ensure that the running CPU supports AVX2.
///
/// # Panics
///
/// Panics if `input` holds fewer than two vectors.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe fn foo(input: &[__m256]) -> f32 {
    let accum = |val: __m256| {
        // Index vector that rotates the lanes by one: lane i receives lane i + 1.
        let roll = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
        let mut sum = val;
        let mut tmp = _mm256_permutevar8x32_ps(val, roll);
        // Adding the seven remaining rotations leaves every lane of `sum`
        // holding the total of all eight input lanes.
        for _ in 0..7 {
            sum = _mm256_add_ps(tmp, sum);
            tmp = _mm256_permutevar8x32_ps(tmp, roll);
        }
        sum
    };
    // Once we call a complex internal closure or fn multiple times, the
    // reporter observed that the compiler does not treat it as inlinable or
    // as allowed to use the AVX2 intrinsics directly.
    let sum1 = accum(input[0]);
    let sum2 = accum(input[1]);
    // Every lane holds the full total, so reading lane 0 is enough.
    _mm256_cvtss_f32(sum1) + _mm256_cvtss_f32(sum2)
}
/// Works as expected (the well-optimized variant of this report).
///
/// Adds up the eight `f32` lanes of `input[0]`, does the same for `input[1]`,
/// and returns the two totals added together. The lane reduction is a nested
/// function that carries its own `#[target_feature(enable = "avx2")]`
/// attribute instead of being a closure.
///
/// # Safety
///
/// The caller must ensure that the running CPU supports AVX2.
///
/// # Panics
///
/// Panics if `input` holds fewer than two vectors.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe fn bar(input: &[__m256]) -> f32 {
    // When we pull this tool out of the shed, everything works: the helper is
    // explicitly marked with the same target feature as its caller.
    #[target_feature(enable = "avx2")]
    unsafe fn accum(val: __m256) -> __m256 {
        // Index vector that rotates the lanes by one: lane i receives lane i + 1.
        let roll = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
        let mut sum = val;
        let mut tmp = _mm256_permutevar8x32_ps(val, roll);
        // Adding the seven remaining rotations leaves every lane of `sum`
        // holding the total of all eight input lanes.
        for _ in 0..7 {
            sum = _mm256_add_ps(tmp, sum);
            tmp = _mm256_permutevar8x32_ps(tmp, roll);
        }
        sum
    }
    let sum1 = accum(input[0]);
    let sum2 = accum(input[1]);
    // Every lane holds the full total, so reading lane 0 is enough.
    _mm256_cvtss_f32(sum1) + _mm256_cvtss_f32(sum2)
}
https://rust.godbolt.org/z/cIr7qS
I found this bug while triggering a different one (#50154, referenced below) with closures. I wasn't able to trigger that one on godbolt. I'm using the latest stable, so if I copied the code in, it would "work" (as in, not work).
#50154
Making a separate issue since this one is a performance bug.