Skip to content

extern "platform-intrinsic" float functions often call libm #76

Open
@workingjubilee

Description

@workingjubilee

I tried this code (Godbolt):

Rust Code

#![no_std]
#![allow(non_camel_case_types)]
#![feature(repr_simd, platform_intrinsics)]

/// Four-lane vector of `f32` values used to exercise the float intrinsics.
///
/// `#[repr(simd)]` (gated behind the `repr_simd` feature enabled at the top of
/// the file) marks this tuple struct as a SIMD vector type, so values of it
/// can be passed to the `simd_*` intrinsics declared in this file.
#[derive(Debug)]
#[repr(simd)]
pub struct f32x4(f32, f32, f32, f32);

// Compiler-provided float intrinsics, each generic over the vector type `T`.
// They are grouped here in the same order as the `f32x4` methods that wrap
// them; declaration order has no effect on code generation.
extern "platform-intrinsic" {
    // Rounding
    fn simd_ceil<T>(x: T) -> T;
    fn simd_floor<T>(x: T) -> T;
    fn simd_round<T>(x: T) -> T;
    fn simd_trunc<T>(x: T) -> T;

    // Arithmetic
    fn simd_fma<T>(x: T, y: T, z: T) -> T;
    fn simd_fabs<T>(x: T) -> T;
    fn simd_fsqrt<T>(x: T) -> T;
    // Note: the exponent of `simd_fpowi` is a single scalar `i32`, not a vector.
    fn simd_fpowi<T>(x: T, y: i32) -> T;
    fn simd_fpow<T>(x: T, y: T) -> T;

    // Logarithms and exponentials
    fn simd_flog2<T>(x: T) -> T;
    fn simd_flog10<T>(x: T) -> T;
    fn simd_flog<T>(x: T) -> T;
    fn simd_fexp<T>(x: T) -> T;
    fn simd_fexp2<T>(x: T) -> T;

    // Trigonometry
    fn simd_fcos<T>(x: T) -> T;
    fn simd_fsin<T>(x: T) -> T;
}

/// Thin wrappers that forward each operation to the matching `simd_*`
/// intrinsic, one public method per intrinsic.
///
/// Every call below instantiates the intrinsic with `T = Self`; `Self` is the
/// `#[repr(simd)]` vector of four `f32` lanes declared above, which is the
/// property each `SAFETY` note relies on.
///
/// In the x86-64 listing that accompanies this code, only `mul_add`, `abs`
/// and `sqrt` are lowered to inline vector instructions; every other method
/// is emitted as one library call per lane.
impl f32x4 {
    // Rounding

    /// Lane-wise `ceil` (the x86-64 listing calls `ceilf` once per lane).
    pub fn ceil(self) -> Self {
        // SAFETY: `Self` is a `#[repr(simd)]` vector of `f32` lanes.
        unsafe { simd_ceil::<Self>(self) }
    }

    /// Lane-wise `floor` (the x86-64 listing calls `floorf` once per lane).
    pub fn floor(self) -> Self {
        // SAFETY: `Self` is a `#[repr(simd)]` vector of `f32` lanes.
        unsafe { simd_floor::<Self>(self) }
    }

    /// Lane-wise `round` (the x86-64 listing calls `roundf` once per lane).
    pub fn round(self) -> Self {
        // SAFETY: `Self` is a `#[repr(simd)]` vector of `f32` lanes.
        unsafe { simd_round::<Self>(self) }
    }

    /// Lane-wise `trunc` (the x86-64 listing calls `truncf` once per lane).
    pub fn trunc(self) -> Self {
        // SAFETY: `Self` is a `#[repr(simd)]` vector of `f32` lanes.
        unsafe { simd_trunc::<Self>(self) }
    }

    // Arithmetic

    /// Computes `self * y + z` for each lane via `simd_fma`.
    pub fn mul_add(self, y: Self, z: Self) -> Self {
        // SAFETY: all three operands are the same `#[repr(simd)]` `f32` vector type.
        unsafe { simd_fma::<Self>(self, y, z) }
    }

    /// Lane-wise absolute value.
    pub fn abs(self) -> Self {
        // SAFETY: `Self` is a `#[repr(simd)]` vector of `f32` lanes.
        unsafe { simd_fabs::<Self>(self) }
    }

    /// Lane-wise square root.
    pub fn sqrt(self) -> Self {
        // SAFETY: `Self` is a `#[repr(simd)]` vector of `f32` lanes.
        unsafe { simd_fsqrt::<Self>(self) }
    }

    /// Raises every lane to the same integer power `exp`
    /// (the listings call `__powisf2` once per lane).
    pub fn powi(self, exp: i32) -> Self {
        // SAFETY: `Self` is a `#[repr(simd)]` vector of `f32` lanes; the
        // exponent is a plain `i32`, matching the intrinsic's declaration.
        unsafe { simd_fpowi::<Self>(self, exp) }
    }

    /// Raises each lane of `self` to the power held in the corresponding lane
    /// of `exp` (the listings call `powf` once per lane).
    pub fn powf(self, exp: Self) -> Self {
        // SAFETY: both operands are the same `#[repr(simd)]` `f32` vector type.
        unsafe { simd_fpow::<Self>(self, exp) }
    }

    // Logarithms and exponentials

    /// Lane-wise base-2 logarithm (the listings call `log2f` once per lane).
    pub fn flog2(self) -> Self {
        // SAFETY: `Self` is a `#[repr(simd)]` vector of `f32` lanes.
        unsafe { simd_flog2::<Self>(self) }
    }

    /// Lane-wise base-10 logarithm (the listings call `log10f` once per lane).
    pub fn flog10(self) -> Self {
        // SAFETY: `Self` is a `#[repr(simd)]` vector of `f32` lanes.
        unsafe { simd_flog10::<Self>(self) }
    }

    /// Lane-wise logarithm via `simd_flog` (the listings call `logf` once per lane).
    pub fn flog(self) -> Self {
        // SAFETY: `Self` is a `#[repr(simd)]` vector of `f32` lanes.
        unsafe { simd_flog::<Self>(self) }
    }

    /// Lane-wise exponential via `simd_fexp` (the listings call `expf` once per lane).
    pub fn fexp(self) -> Self {
        // SAFETY: `Self` is a `#[repr(simd)]` vector of `f32` lanes.
        unsafe { simd_fexp::<Self>(self) }
    }

    /// Lane-wise base-2 exponential (the listings call `exp2f` once per lane).
    pub fn fexp2(self) -> Self {
        // SAFETY: `Self` is a `#[repr(simd)]` vector of `f32` lanes.
        unsafe { simd_fexp2::<Self>(self) }
    }

    // Trigonometry

    /// Lane-wise cosine (the listings call `cosf` once per lane).
    pub fn cos(self) -> Self {
        // SAFETY: `Self` is a `#[repr(simd)]` vector of `f32` lanes.
        unsafe { simd_fcos::<Self>(self) }
    }

    /// Lane-wise sine (the listings call `sinf` once per lane).
    pub fn sin(self) -> Self {
        // SAFETY: `Self` is a `#[repr(simd)]` vector of `f32` lanes.
        unsafe { simd_fsin::<Self>(self) }
    }
}

I expected to see this happen: Compilation to "pure assembly", with no calls to external library functions.

Instead, this happened: Most of the functions compiled to calls to libm!

When sufficient vector features are enabled, these do compile to vectorized assembly instructions. The problem is that, without those features enabled, compilation produces code that depends on libm, and core is not allowed to depend on libm. We are going to have to either solve this or push most of our implementation of SimdF32 and SimdF64 into std rather than core.

Notable winners on x64: simd_fsqrt and simd_fabs become vector instructions just fine. I'm worried about how they fare on x86_32 or Arm architectures, though.

Meta

rustc --version --verbose:

rustc 1.52.0-nightly (d1206f950 2021-02-15)
binary: rustc
commit-hash: d1206f950ffb76c76e1b74a19ae33c2b7d949454
commit-date: 2021-02-15
host: x86_64-unknown-linux-gnu
release: 1.52.0-nightly
LLVM version: 11.0.1
x86 Assembly

<&T as core::fmt::Debug>::fmt:
        movq    (%rdi), %rdi
        jmpq    *_ZN4core3fmt5float50_$LT$impl$u20$core..fmt..Debug$u20$for$u20$f32$GT$3fmt17hf2084266ae57b528E@GOTPCREL(%rip)

core::ptr::drop_in_place<&f32>:
        retq

example::f32x4::ceil:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    ceilf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::floor:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    floorf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::round:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    roundf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::trunc:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    truncf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::mul_add:
        movaps  (%rsi), %xmm0
        mulps   (%rdx), %xmm0
        movq    %rdi, %rax
        addps   (%rcx), %xmm0
        movaps  %xmm0, (%rdi)
        retq

.LCPI7_0:
        .long   0x7fffffff
        .long   0x7fffffff
        .long   0x7fffffff
        .long   0x7fffffff
example::f32x4::abs:
        movq    %rdi, %rax
        movaps  (%rsi), %xmm0
        andps   .LCPI7_0(%rip), %xmm0
        movaps  %xmm0, (%rdi)
        retq

.LCPI8_0:
        .long   0xbf000000
        .long   0xbf000000
        .long   0xbf000000
        .long   0xbf000000
.LCPI8_1:
        .long   0xc0400000
        .long   0xc0400000
        .long   0xc0400000
        .long   0xc0400000
.LCPI8_2:
        .long   0x7fffffff
        .long   0x7fffffff
        .long   0x7fffffff
        .long   0x7fffffff
.LCPI8_3:
        .long   0x00800000
        .long   0x00800000
        .long   0x00800000
        .long   0x00800000
example::f32x4::sqrt:
        movaps  (%rsi), %xmm0
        rsqrtps %xmm0, %xmm1
        movaps  %xmm0, %xmm2
        mulps   %xmm1, %xmm2
        movaps  .LCPI8_0(%rip), %xmm3
        mulps   %xmm2, %xmm3
        mulps   %xmm1, %xmm2
        addps   .LCPI8_1(%rip), %xmm2
        movq    %rdi, %rax
        mulps   %xmm3, %xmm2
        andps   .LCPI8_2(%rip), %xmm0
        movaps  .LCPI8_3(%rip), %xmm1
        cmpleps %xmm0, %xmm1
        andps   %xmm2, %xmm1
        movaps  %xmm1, (%rdi)
        retq

example::f32x4::powi:
        pushq   %rbp
        pushq   %r14
        pushq   %rbx
        subq    $48, %rsp
        movl    %edx, %ebp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    __powisf2@GOTPCREL(%rip), %rbx
        movl    %edx, %edi
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        movl    %ebp, %edi
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movl    %ebp, %edi
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        movl    %ebp, %edi
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $48, %rsp
        popq    %rbx
        popq    %r14
        popq    %rbp
        retq

example::f32x4::powf:
        pushq   %r14
        pushq   %rbx
        subq    $72, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 32(%rsp)
        movaps  (%rdx), %xmm1
        movaps  %xmm1, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        shufps  $255, %xmm1, %xmm1
        movq    powf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  32(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        movaps  16(%rsp), %xmm1
        movhlps %xmm1, %xmm1
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  32(%rsp), %xmm0
        movaps  16(%rsp), %xmm1
        callq   *%rbx
        movaps  %xmm0, 48(%rsp)
        movaps  32(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        movaps  16(%rsp), %xmm1
        shufps  $85, %xmm1, %xmm1
        callq   *%rbx
        movaps  48(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $72, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::flog2:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    log2f@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::flog10:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    log10f@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::flog:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    logf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::fexp:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    expf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::fexp2:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    exp2f@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::cos:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    cosf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

example::f32x4::sin:
        pushq   %r14
        pushq   %rbx
        subq    $56, %rsp
        movq    %rdi, %r14
        movaps  (%rsi), %xmm0
        movaps  %xmm0, 16(%rsp)
        shufps  $255, %xmm0, %xmm0
        movq    sinf@GOTPCREL(%rip), %rbx
        callq   *%rbx
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        movhlps %xmm0, %xmm0
        callq   *%rbx
        unpcklps        (%rsp), %xmm0
        movaps  %xmm0, (%rsp)
        movaps  16(%rsp), %xmm0
        callq   *%rbx
        movaps  %xmm0, 32(%rsp)
        movaps  16(%rsp), %xmm0
        shufps  $85, %xmm0, %xmm0
        callq   *%rbx
        movaps  32(%rsp), %xmm1
        unpcklps        %xmm0, %xmm1
        unpcklpd        (%rsp), %xmm1
        movaps  %xmm1, (%r14)
        movq    %r14, %rax
        addq    $56, %rsp
        popq    %rbx
        popq    %r14
        retq

<example::f32x4 as core::fmt::Debug>::fmt:
        pushq   %rbp
        pushq   %r15
        pushq   %r14
        pushq   %r13
        pushq   %r12
        pushq   %rbx
        subq    $40, %rsp
        movq    %rdi, %rbx
        leaq    4(%rdi), %r12
        leaq    8(%rdi), %r13
        leaq    12(%rdi), %r15
        leaq    .L__unnamed_1(%rip), %rdx
        leaq    16(%rsp), %r14
        movl    $5, %ecx
        movq    %r14, %rdi
        callq   *core::fmt::Formatter::debug_tuple@GOTPCREL(%rip)
        movq    %rbx, 8(%rsp)
        leaq    .L__unnamed_2(%rip), %rbx
        movq    core::fmt::builders::DebugTuple::field@GOTPCREL(%rip), %rbp
        leaq    8(%rsp), %rsi
        movq    %r14, %rdi
        movq    %rbx, %rdx
        callq   *%rbp
        movq    %r12, 8(%rsp)
        leaq    8(%rsp), %rsi
        movq    %r14, %rdi
        movq    %rbx, %rdx
        callq   *%rbp
        movq    %r13, 8(%rsp)
        leaq    8(%rsp), %rsi
        movq    %r14, %rdi
        movq    %rbx, %rdx
        callq   *%rbp
        movq    %r15, 8(%rsp)
        leaq    8(%rsp), %rsi
        movq    %r14, %rdi
        movq    %rbx, %rdx
        callq   *%rbp
        movq    %r14, %rdi
        callq   *core::fmt::builders::DebugTuple::finish@GOTPCREL(%rip)
        addq    $40, %rsp
        popq    %rbx
        popq    %r12
        popq    %r13
        popq    %r14
        popq    %r15
        popq    %rbp
        retq

.L__unnamed_1:
        .ascii  "f32x4"

.L__unnamed_2:
        .quad   core::ptr::drop_in_place<&f32>
        .quad   8
        .quad   8
        .quad   <&T as core::fmt::Debug>::fmt

AArch64 Assembly

<&T as core::fmt::Debug>::fmt:
        ldr     x0, [x0]
        b       _ZN4core3fmt5float50_$LT$impl$u20$core..fmt..Debug$u20$for$u20$f32$GT$3fmt17h68f66863527610f0E

core::ptr::drop_in_place<&f32>:
        ret

example::f32x4::ceil:
        ldr     q0, [x0]
        frintp  v0.4s, v0.4s
        str     q0, [x8]
        ret

example::f32x4::floor:
        ldr     q0, [x0]
        frintm  v0.4s, v0.4s
        str     q0, [x8]
        ret

example::f32x4::round:
        ldr     q0, [x0]
        frinta  v0.4s, v0.4s
        str     q0, [x8]
        ret

example::f32x4::trunc:
        ldr     q0, [x0]
        frintz  v0.4s, v0.4s
        str     q0, [x8]
        ret

example::f32x4::mul_add:
        ldr     q0, [x0]
        ldr     q1, [x1]
        ldr     q2, [x2]
        fmla    v2.4s, v1.4s, v0.4s
        str     q2, [x8]
        ret

example::f32x4::abs:
        ldr     q0, [x0]
        fabs    v0.4s, v0.4s
        str     q0, [x8]
        ret

example::f32x4::sqrt:
        ldr     q0, [x0]
        fsqrt   v0.4s, v0.4s
        str     q0, [x8]
        ret

example::f32x4::powi:
        sub     sp, sp, #64
        str     x30, [sp, #32]
        stp     x20, x19, [sp, #48]
        ldr     q0, [x0]
        mov     w0, w1
        mov     w19, w1
        mov     x20, x8
        str     q0, [sp, #16]
        mov     s0, v0.s[1]
        bl      __powisf2
        str     d0, [sp]
        ldr     q0, [sp, #16]
        mov     w0, w19
        bl      __powisf2
        ldr     q1, [sp]
        mov     w0, w19
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldr     q0, [sp, #16]
        mov     s0, v0.s[2]
        bl      __powisf2
        ldr     q1, [sp]
        mov     w0, w19
        mov     v1.s[2], v0.s[0]
        ldr     q0, [sp, #16]
        str     q1, [sp]
        mov     s0, v0.s[3]
        bl      __powisf2
        ldr     q1, [sp]
        ldr     x30, [sp, #32]
        mov     v1.s[3], v0.s[0]
        str     q1, [x20]
        ldp     x20, x19, [sp, #48]
        add     sp, sp, #64
        ret

example::f32x4::powf:
        sub     sp, sp, #64
        stp     x30, x19, [sp, #48]
        ldr     q0, [x0]
        ldr     q1, [x1]
        mov     x19, x8
        stp     q1, q0, [sp, #16]
        mov     s0, v0.s[1]
        mov     s1, v1.s[1]
        bl      powf
        str     d0, [sp]
        ldp     q1, q0, [sp, #16]
        bl      powf
        ldr     q1, [sp]
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldp     q1, q0, [sp, #16]
        mov     s0, v0.s[2]
        mov     s1, v1.s[2]
        bl      powf
        ldr     q1, [sp]
        mov     v1.s[2], v0.s[0]
        str     q1, [sp]
        ldp     q1, q0, [sp, #16]
        mov     s0, v0.s[3]
        mov     s1, v1.s[3]
        bl      powf
        ldr     q1, [sp]
        mov     v1.s[3], v0.s[0]
        str     q1, [x19]
        ldp     x30, x19, [sp, #48]
        add     sp, sp, #64
        ret

example::f32x4::flog2:
        sub     sp, sp, #48
        stp     x30, x19, [sp, #32]
        ldr     q0, [x0]
        mov     x19, x8
        str     q0, [sp, #16]
        mov     s0, v0.s[1]
        bl      log2f
        str     d0, [sp]
        ldr     q0, [sp, #16]
        bl      log2f
        ldr     q1, [sp]
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldr     q0, [sp, #16]
        mov     s0, v0.s[2]
        bl      log2f
        ldr     q1, [sp]
        mov     v1.s[2], v0.s[0]
        ldr     q0, [sp, #16]
        str     q1, [sp]
        mov     s0, v0.s[3]
        bl      log2f
        ldr     q1, [sp]
        mov     v1.s[3], v0.s[0]
        str     q1, [x19]
        ldp     x30, x19, [sp, #32]
        add     sp, sp, #48
        ret

example::f32x4::flog10:
        sub     sp, sp, #48
        stp     x30, x19, [sp, #32]
        ldr     q0, [x0]
        mov     x19, x8
        str     q0, [sp, #16]
        mov     s0, v0.s[1]
        bl      log10f
        str     d0, [sp]
        ldr     q0, [sp, #16]
        bl      log10f
        ldr     q1, [sp]
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldr     q0, [sp, #16]
        mov     s0, v0.s[2]
        bl      log10f
        ldr     q1, [sp]
        mov     v1.s[2], v0.s[0]
        ldr     q0, [sp, #16]
        str     q1, [sp]
        mov     s0, v0.s[3]
        bl      log10f
        ldr     q1, [sp]
        mov     v1.s[3], v0.s[0]
        str     q1, [x19]
        ldp     x30, x19, [sp, #32]
        add     sp, sp, #48
        ret

example::f32x4::flog:
        sub     sp, sp, #48
        stp     x30, x19, [sp, #32]
        ldr     q0, [x0]
        mov     x19, x8
        str     q0, [sp, #16]
        mov     s0, v0.s[1]
        bl      logf
        str     d0, [sp]
        ldr     q0, [sp, #16]
        bl      logf
        ldr     q1, [sp]
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldr     q0, [sp, #16]
        mov     s0, v0.s[2]
        bl      logf
        ldr     q1, [sp]
        mov     v1.s[2], v0.s[0]
        ldr     q0, [sp, #16]
        str     q1, [sp]
        mov     s0, v0.s[3]
        bl      logf
        ldr     q1, [sp]
        mov     v1.s[3], v0.s[0]
        str     q1, [x19]
        ldp     x30, x19, [sp, #32]
        add     sp, sp, #48
        ret

example::f32x4::fexp:
        sub     sp, sp, #48
        stp     x30, x19, [sp, #32]
        ldr     q0, [x0]
        mov     x19, x8
        str     q0, [sp, #16]
        mov     s0, v0.s[1]
        bl      expf
        str     d0, [sp]
        ldr     q0, [sp, #16]
        bl      expf
        ldr     q1, [sp]
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldr     q0, [sp, #16]
        mov     s0, v0.s[2]
        bl      expf
        ldr     q1, [sp]
        mov     v1.s[2], v0.s[0]
        ldr     q0, [sp, #16]
        str     q1, [sp]
        mov     s0, v0.s[3]
        bl      expf
        ldr     q1, [sp]
        mov     v1.s[3], v0.s[0]
        str     q1, [x19]
        ldp     x30, x19, [sp, #32]
        add     sp, sp, #48
        ret

example::f32x4::fexp2:
        sub     sp, sp, #48
        stp     x30, x19, [sp, #32]
        ldr     q0, [x0]
        mov     x19, x8
        str     q0, [sp, #16]
        mov     s0, v0.s[1]
        bl      exp2f
        str     d0, [sp]
        ldr     q0, [sp, #16]
        bl      exp2f
        ldr     q1, [sp]
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldr     q0, [sp, #16]
        mov     s0, v0.s[2]
        bl      exp2f
        ldr     q1, [sp]
        mov     v1.s[2], v0.s[0]
        ldr     q0, [sp, #16]
        str     q1, [sp]
        mov     s0, v0.s[3]
        bl      exp2f
        ldr     q1, [sp]
        mov     v1.s[3], v0.s[0]
        str     q1, [x19]
        ldp     x30, x19, [sp, #32]
        add     sp, sp, #48
        ret

example::f32x4::cos:
        sub     sp, sp, #48
        stp     x30, x19, [sp, #32]
        ldr     q0, [x0]
        mov     x19, x8
        str     q0, [sp, #16]
        mov     s0, v0.s[1]
        bl      cosf
        str     d0, [sp]
        ldr     q0, [sp, #16]
        bl      cosf
        ldr     q1, [sp]
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldr     q0, [sp, #16]
        mov     s0, v0.s[2]
        bl      cosf
        ldr     q1, [sp]
        mov     v1.s[2], v0.s[0]
        ldr     q0, [sp, #16]
        str     q1, [sp]
        mov     s0, v0.s[3]
        bl      cosf
        ldr     q1, [sp]
        mov     v1.s[3], v0.s[0]
        str     q1, [x19]
        ldp     x30, x19, [sp, #32]
        add     sp, sp, #48
        ret

example::f32x4::sin:
        sub     sp, sp, #48
        stp     x30, x19, [sp, #32]
        ldr     q0, [x0]
        mov     x19, x8
        str     q0, [sp, #16]
        mov     s0, v0.s[1]
        bl      sinf
        str     d0, [sp]
        ldr     q0, [sp, #16]
        bl      sinf
        ldr     q1, [sp]
        mov     v0.s[1], v1.s[0]
        str     q0, [sp]
        ldr     q0, [sp, #16]
        mov     s0, v0.s[2]
        bl      sinf
        ldr     q1, [sp]
        mov     v1.s[2], v0.s[0]
        ldr     q0, [sp, #16]
        str     q1, [sp]
        mov     s0, v0.s[3]
        bl      sinf
        ldr     q1, [sp]
        mov     v1.s[3], v0.s[0]
        str     q1, [x19]
        ldp     x30, x19, [sp, #32]
        add     sp, sp, #48
        ret

<example::f32x4 as core::fmt::Debug>::fmt:
        sub     sp, sp, #80
        str     x30, [sp, #32]
        stp     x22, x21, [sp, #48]
        stp     x20, x19, [sp, #64]
        mov     x9, x1
        adrp    x1, .L__unnamed_1
        mov     x19, x0
        add     x20, x0, #4
        add     x21, x0, #8
        add     x22, x0, #12
        add     x1, x1, :lo12:.L__unnamed_1
        add     x8, sp, #8
        mov     w2, #5
        mov     x0, x9
        bl      core::fmt::Formatter::debug_tuple
        str     x19, [sp, #40]
        adrp    x19, .L__unnamed_2
        add     x19, x19, :lo12:.L__unnamed_2
        add     x0, sp, #8
        add     x1, sp, #40
        mov     x2, x19
        bl      core::fmt::builders::DebugTuple::field
        add     x0, sp, #8
        add     x1, sp, #40
        mov     x2, x19
        str     x20, [sp, #40]
        bl      core::fmt::builders::DebugTuple::field
        add     x0, sp, #8
        add     x1, sp, #40
        mov     x2, x19
        str     x21, [sp, #40]
        bl      core::fmt::builders::DebugTuple::field
        add     x0, sp, #8
        add     x1, sp, #40
        mov     x2, x19
        str     x22, [sp, #40]
        bl      core::fmt::builders::DebugTuple::field
        add     x0, sp, #8
        bl      core::fmt::builders::DebugTuple::finish
        ldp     x20, x19, [sp, #64]
        ldp     x22, x21, [sp, #48]
        ldr     x30, [sp, #32]
        add     sp, sp, #80
        ret

.L__unnamed_1:
        .ascii  "f32x4"

.L__unnamed_2:
        .xword  core::ptr::drop_in_place<&f32>
        .xword  8
        .xword  8
        .xword  <&T as core::fmt::Debug>::fmt

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-LLVM (Area: LLVM) · A-simd (Area: SIMD. Put this on tracking issues to help with cross-repo issue organization) · C-bug (Category: Bug) · E-needs-design (Call for participation: Needs design.)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions