Skip to content

target_feature doesn't trickle down to closures and internal fns #58279

Open
@KyleSiefring

Description

@KyleSiefring

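`#[target_feature]` is not propagated to closures and nested functions defined inside the annotated function, which leaves poorly optimized assembly in its wake.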

#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Reproduction case: the lane reduction is written as a closure.
///
/// Returns the sum of all eight lanes of `input[0]` plus the sum of all eight
/// lanes of `input[1]`. As reported in this issue, the closure does not pick
/// up the `avx2` target feature of the enclosing function, so the intrinsics
/// inside it end up as non-inlined calls.
///
/// # Panics
///
/// Panics if `input` has fewer than two elements.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe fn foo(input: &[__m256]) -> f32 {
    let accum = |val: __m256| {
        // Lane indices that rotate a vector left by one lane.
        let roll = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
        let mut sum = val;
        let mut tmp = _mm256_permutevar8x32_ps(val, roll);
        // Add the seven remaining rotations so that every lane of `sum`
        // ends up holding the total of all eight input lanes.
        for _ in 0..7 {
            sum = _mm256_add_ps(tmp, sum);
            tmp = _mm256_permutevar8x32_ps(tmp, roll);
        }
        sum
    };
    // Once a non-trivial closure (or inner fn) is called more than once, the
    // compiler neither inlines it nor lets it use the AVX2 intrinsics
    // directly, because the enclosing target feature was not passed down.
    let sum1 = accum(input[0]);
    let sum2 = accum(input[1]);
    // Every lane holds the full sum, so reading lane 0 is enough.
    _mm256_cvtss_f32(sum1) + _mm256_cvtss_f32(sum2)
}

/// Working counterpart of the closure-based reproduction: the lane reduction
/// is a nested `fn` that carries its own `#[target_feature]` attribute.
///
/// Returns the sum of all eight lanes of `input[0]` plus the sum of all eight
/// lanes of `input[1]`. As reported in this issue, this variant is compiled
/// as expected.
///
/// # Panics
///
/// Panics if `input` has fewer than two elements.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe fn bar(input: &[__m256]) -> f32 {
    // Repeating the attribute on the nested fn is what makes this variant
    // behave: the helper is explicitly told it may use AVX2.
    #[target_feature(enable = "avx2")]
    unsafe fn accum(val: __m256) -> __m256 {
        // Lane indices that rotate a vector left by one lane.
        let roll = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
        let mut sum = val;
        let mut tmp = _mm256_permutevar8x32_ps(val, roll);
        // Add the seven remaining rotations so that every lane of `sum`
        // ends up holding the total of all eight input lanes.
        for _ in 0..7 {
            sum = _mm256_add_ps(tmp, sum);
            tmp = _mm256_permutevar8x32_ps(tmp, roll);
        }
        sum
    }
    let sum1 = accum(input[0]);
    let sum2 = accum(input[1]);
    // Every lane holds the full sum, so reading lane 0 is enough.
    _mm256_cvtss_f32(sum1) + _mm256_cvtss_f32(sum2)
}

https://rust.godbolt.org/z/cIr7qS

I found this bug while triggering the issue linked below with closures. I wasn't able to trigger that issue on godbolt. I'm using the latest stable, so if I copied the code in locally it would "work" (that is, the bug would reproduce).
#50154

Making a separate issue since this one is a performance bug.

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-SIMD (Area: SIMD (Single Instruction Multiple Data)), A-codegen (Area: Code generation), A-target-feature (Area: Enabling/disabling target features like AVX, Neon, etc.), F-target_feature_11 (target feature 1.1 RFC), T-lang (Relevant to the language team)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions