Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[libc][math][c23] Optimize fabsf16 on x86 with Clang #104869

Open
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

overmighty
Copy link
Member

Works around optimizations introduced in LLVM 17 and 18 that slow down
fputil::abs<float16>() on x86.

Works around optimizations introduced in LLVM 17 and 18 that slow down
`fputil::abs<float16>()` on x86.
@overmighty overmighty requested a review from lntue August 19, 2024 21:56
@llvmbot llvmbot added the libc label Aug 19, 2024
@llvmbot
Copy link
Collaborator

llvmbot commented Aug 19, 2024

@llvm/pr-subscribers-libc

Author: OverMighty (overmighty)

Changes

Works around optimizations introduced in LLVM 17 and 18 that slow down
fputil::abs&lt;float16&gt;() on x86.


Full diff: https://github.com/llvm/llvm-project/pull/104869.diff

2 Files Affected:

  • (modified) libc/src/math/generic/CMakeLists.txt (+3-1)
  • (modified) libc/src/math/generic/fabsf16.cpp (+14-3)
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 350072f4b9649..7fa86f17269f2 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -563,10 +563,12 @@ add_entrypoint_object(
   HDRS
     ../fabsf16.h
   DEPENDS
-    libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.basic_operations
+    libc.src.__support.FPUtil.fp_bits
     libc.src.__support.macros.properties.architectures
     libc.src.__support.macros.properties.compiler
+    libc.src.__support.macros.properties.cpu_features
+    libc.src.__support.macros.properties.types
   COMPILE_OPTIONS
     -O3
   FLAGS
diff --git a/libc/src/math/generic/fabsf16.cpp b/libc/src/math/generic/fabsf16.cpp
index 02e11330db718..a86aa0cb00a73 100644
--- a/libc/src/math/generic/fabsf16.cpp
+++ b/libc/src/math/generic/fabsf16.cpp
@@ -8,19 +8,30 @@
 
 #include "src/math/fabsf16.h"
 #include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "src/__support/macros/properties/compiler.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, fabsf16, (float16 x)) {
-  // For x86, GCC generates better code from the generic implementation.
-  // https://godbolt.org/z/K9orM4hTa
 #if defined(__LIBC_MISC_MATH_BASIC_OPS_OPT) &&                                 \
-    !(defined(LIBC_TARGET_ARCH_IS_X86) && defined(LIBC_COMPILER_IS_GCC))
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return __builtin_fabsf16(x);
+#elif defined(LIBC_TARGET_ARCH_IS_X86) && defined(LIBC_COMPILER_IS_CLANG)
+  // Prevent Clang from generating calls to slow soft-float conversion
+  // functions on x86. See https://godbolt.org/z/hvo6jbnGz.
+
+  using FPBits = fputil::FPBits<float16>;
+  using StorageType = typename FPBits::StorageType;
+
+  static constexpr volatile StorageType ABS_MASK = FPBits::EXP_SIG_MASK;
+
+  return FPBits(static_cast<StorageType>(FPBits(x).uintval() & ABS_MASK))
+      .get_val();
 #else
   return fputil::abs(x);
 #endif

@overmighty
Copy link
Member Author

overmighty commented Aug 19, 2024

Minimum latencies:

Before:

  • Clang 18, libgcc 14, Intel Core i7-13700H:
    fabsf16 from 0x3c00 to 0x4000:	 8.01073 ns
    fabsf16 from 0 to 0x400:	 66.8712 ns
    
    Disassembly:
    0000000000002290 <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
        2290:       55                      push   rbp
        2291:       48 89 e5                mov    rbp,rsp
        2294:       e8 07 01 00 00          call   23a0 <__extendhfsf2@plt>
        2299:       c4 e2 79 18 0d 92 e7    vbroadcastss xmm1,DWORD PTR [rip+0xffffffffffffe792]        # a34 <_IO_stdin_used+0x4>
        22a0:       ff ff
        22a2:       c5 f8 54 c1             vandps xmm0,xmm0,xmm1
        22a6:       e8 05 01 00 00          call   23b0 <__truncsfhf2@plt>
        22ab:       5d                      pop    rbp
        22ac:       c3                      ret
  • Clang 18, libgcc 14, Intel Core i7-13700H, -march=native:
    fabsf16 from 0x3c00 to 0x4000:	 6.66634 ns
    fabsf16 from 0 to 0x400:	 6.70341 ns
    
    Disassembly:
    0000000000002210 <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
        2210:       55                      push   rbp
        2211:       48 89 e5                mov    rbp,rsp
        2214:       c5 f9 c5 c0 00          vpextrw eax,xmm0,0x0
        2219:       0f b7 c0                movzx  eax,ax
        221c:       c5 f9 6e c0             vmovd  xmm0,eax
        2220:       c4 e2 79 13 c0          vcvtph2ps xmm0,xmm0
        2225:       c4 e2 79 18 0d 76 e7    vbroadcastss xmm1,DWORD PTR [rip+0xffffffffffffe776]        # 9a4 <_IO_stdin_used+0x4>
        222c:       ff ff
        222e:       c5 f8 54 c1             vandps xmm0,xmm0,xmm1
        2232:       c4 e3 79 1d c0 04       vcvtps2ph xmm0,xmm0,0x4
        2238:       c5 f9 7e c0             vmovd  eax,xmm0
        223c:       c5 f9 c4 c0 00          vpinsrw xmm0,xmm0,eax,0x0
        2241:       5d                      pop    rbp
        2242:       c3                      ret

After:

  • Clang 18, libgcc 14, Intel Core i7-13700H:
    fabsf16 from 0x3c00 to 0x4000:	 2.24488 ns
    fabsf16 from 0 to 0x400:	 2.23415 ns
    
    Disassembly:
    0000000000002200 <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
        2200:       55                      push   rbp
        2201:       48 89 e5                mov    rbp,rsp
        2204:       c5 f9 c5 c0 00          vpextrw eax,xmm0,0x0
        2209:       66 23 05 ee e7 ff ff    and    ax,WORD PTR [rip+0xffffffffffffe7ee]        # 9fe <_ZZN22__llvm_libc_20_0_0_git6fp>
        2210:       c5 f9 c4 c0 00          vpinsrw xmm0,xmm0,eax,0x0
        2215:       5d                      pop    rbp
        2216:       c3                      ret
  • Clang 18, libgcc 14, Intel Core i7-13700H, -march=native:
    fabsf16 from 0x3c00 to 0x4000:	 2.2878 ns
    fabsf16 from 0 to 0x400:	 2.26927 ns
    
    Disassembly:
    0000000000002210 <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
        2210:       55                      push   rbp
        2211:       48 89 e5                mov    rbp,rsp
        2214:       c5 f9 c5 c0 00          vpextrw eax,xmm0,0x0
        2219:       66 23 05 de e7 ff ff    and    ax,WORD PTR [rip+0xffffffffffffe7de]        # 9fe <_ZZN22__llvm_libc_20_0_0_git6fp>
        2220:       c5 f9 c4 c0 00          vpinsrw xmm0,xmm0,eax,0x0
        2225:       5d                      pop    rbp
        2226:       c3                      ret

If we enabled the same code path for other compilers/targets:

Before:

  • GCC 14, Intel Core i7-13700H:
    fabsf16 from 0x3c00 to 0x4000:	 1.05366 ns
    fabsf16 from 0 to 0x400:	 1.05366 ns
    
    Disassembly:
    0000000000002200 <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
        2200:       c5 f9 7e c0             vmovd  eax,xmm0
        2204:       66 25 ff 7f             and    ax,0x7fff
        2208:       c5 f9 6e c0             vmovd  xmm0,eax
        220c:       c3                      ret
  • GCC 14, Intel Core i7-13700H, -march=native:
    fabsf16 from 0x3c00 to 0x4000:	 1.05268 ns
    fabsf16 from 0 to 0x400:	 1.05171 ns
    
    Disassembly:
    0000000000002210 <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
        2210:       c5 f9 7e c0             vmovd  eax,xmm0
        2214:       66 25 ff 7f             and    ax,0x7fff
        2218:       c5 f9 6e c0             vmovd  xmm0,eax
        221c:       c3                      ret
  • Clang 18, Google Tensor G3, -Xclang -target-feature -Xclang +fullfp16:
    fabsf16 from 0x3c00 to 0x4000:	 0.873171 ns
    fabsf16 from 0 to 0x400:	 0.675122 ns
    
    Disassembly:
    0000000000024cdc <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
       24cdc: 1ee0c000      fabs    h0, h0
       24ce0: d65f03c0      ret

After:

  • GCC 14, Intel Core i7-13700H:
    fabsf16 from 0x3c00 to 0x4000:	 1.05366 ns
    fabsf16 from 0 to 0x400:	 1.05366 ns
    
    Disassembly:
    0000000000002210 <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
        2210:       0f b7 05 77 e8 ff ff    movzx  eax,WORD PTR [rip+0xffffffffffffe877]        # a8e <__llvm_libc_20_0_0_git::fputil>
        2217:       c5 f9 7e c2             vmovd  edx,xmm0
        221b:       21 c2                   and    edx,eax
        221d:       c5 f9 6e c2             vmovd  xmm0,edx
        2221:       c3                      ret
  • GCC 14, Intel Core i7-13700H, -march=native:
    fabsf16 from 0x3c00 to 0x4000:	 1.05366 ns
    fabsf16 from 0 to 0x400:	 1.05366 ns
    
    Disassembly:
    0000000000002220 <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
        2220:       0f b7 05 67 e8 ff ff    movzx  eax,WORD PTR [rip+0xffffffffffffe867]        # a8e <__llvm_libc_20_0_0_git::fputil>
        2227:       c5 f9 7e c2             vmovd  edx,xmm0
        222b:       21 c2                   and    edx,eax
        222d:       c5 f9 6e c2             vmovd  xmm0,edx
        2231:       c3                      ret
  • Clang 18, Google Tensor G3, -Xclang -target-feature -Xclang +fullfp16:
    fabsf16 from 0x3c00 to 0x4000:	 0.873171 ns
    fabsf16 from 0 to 0x400:	 0.674146 ns
    
    Disassembly:
    0000000000024cdc <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
       24cdc: d0ffff68      adrp    x8, 0x12000 <wmemchr+0x12000>
       24ce0: 1e260009      fmov    w9, s0
       24ce4: 794ad508      ldrh    w8, [x8, #0x56a]
       24ce8: 0a090108      and     w8, w8, w9
       24cec: 1e270100      fmov    s0, w8
       24cf0: d65f03c0      ret

@lntue
Copy link
Contributor

lntue commented Aug 19, 2024

Minimum latencies:

Before:

  • Clang 18, libgcc 14, Intel Core i7-13700H:
    fabsf16 from 0x3c00 to 0x4000:	 8.01073 ns
    fabsf16 from 0 to 0x400:	 66.8712 ns
    
    Disassembly:
    0000000000002290 <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
        2290:       55                      push   rbp
        2291:       48 89 e5                mov    rbp,rsp
        2294:       e8 07 01 00 00          call   23a0 <__extendhfsf2@plt>
        2299:       c4 e2 79 18 0d 92 e7    vbroadcastss xmm1,DWORD PTR [rip+0xffffffffffffe792]        # a34 <_IO_stdin_used+0x4>
        22a0:       ff ff
        22a2:       c5 f8 54 c1             vandps xmm0,xmm0,xmm1
        22a6:       e8 05 01 00 00          call   23b0 <__truncsfhf2@plt>
        22ab:       5d                      pop    rbp
        22ac:       c3                      ret
  • Clang 18, libgcc 14, Intel Core i7-13700H, -march=native:
    fabsf16 from 0x3c00 to 0x4000:	 6.66634 ns
    fabsf16 from 0 to 0x400:	 6.70341 ns
    
    Disassembly:
    0000000000002210 <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
        2210:       55                      push   rbp
        2211:       48 89 e5                mov    rbp,rsp
        2214:       c5 f9 c5 c0 00          vpextrw eax,xmm0,0x0
        2219:       0f b7 c0                movzx  eax,ax
        221c:       c5 f9 6e c0             vmovd  xmm0,eax
        2220:       c4 e2 79 13 c0          vcvtph2ps xmm0,xmm0
        2225:       c4 e2 79 18 0d 76 e7    vbroadcastss xmm1,DWORD PTR [rip+0xffffffffffffe776]        # 9a4 <_IO_stdin_used+0x4>
        222c:       ff ff
        222e:       c5 f8 54 c1             vandps xmm0,xmm0,xmm1
        2232:       c4 e3 79 1d c0 04       vcvtps2ph xmm0,xmm0,0x4
        2238:       c5 f9 7e c0             vmovd  eax,xmm0
        223c:       c5 f9 c4 c0 00          vpinsrw xmm0,xmm0,eax,0x0
        2241:       5d                      pop    rbp
        2242:       c3                      ret

After:

  • Clang 18, libgcc 14, Intel Core i7-13700H:
    fabsf16 from 0x3c00 to 0x4000:	 2.24488 ns
    fabsf16 from 0 to 0x400:	 2.23415 ns
    
    Disassembly:
    0000000000002200 <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
        2200:       55                      push   rbp
        2201:       48 89 e5                mov    rbp,rsp
        2204:       c5 f9 c5 c0 00          vpextrw eax,xmm0,0x0
        2209:       66 23 05 ee e7 ff ff    and    ax,WORD PTR [rip+0xffffffffffffe7ee]        # 9fe <_ZZN22__llvm_libc_20_0_0_git6fp>
        2210:       c5 f9 c4 c0 00          vpinsrw xmm0,xmm0,eax,0x0
        2215:       5d                      pop    rbp
        2216:       c3                      ret
  • Clang 18, libgcc 14, Intel Core i7-13700H, -march=native:
    fabsf16 from 0x3c00 to 0x4000:	 2.2878 ns
    fabsf16 from 0 to 0x400:	 2.26927 ns
    
    Disassembly:
    0000000000002210 <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
        2210:       55                      push   rbp
        2211:       48 89 e5                mov    rbp,rsp
        2214:       c5 f9 c5 c0 00          vpextrw eax,xmm0,0x0
        2219:       66 23 05 de e7 ff ff    and    ax,WORD PTR [rip+0xffffffffffffe7de]        # 9fe <_ZZN22__llvm_libc_20_0_0_git6fp>
        2220:       c5 f9 c4 c0 00          vpinsrw xmm0,xmm0,eax,0x0
        2225:       5d                      pop    rbp
        2226:       c3                      ret

If we enabled the same code path for other compilers/targets:

Before:

  • GCC 14, Intel Core i7-13700H:
    fabsf16 from 0x3c00 to 0x4000:	 1.05366 ns
    fabsf16 from 0 to 0x400:	 1.05366 ns
    
    Disassembly:
    0000000000002200 <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
        2200:       c5 f9 7e c0             vmovd  eax,xmm0
        2204:       66 25 ff 7f             and    ax,0x7fff
        2208:       c5 f9 6e c0             vmovd  xmm0,eax
        220c:       c3                      ret
  • GCC 14, Intel Core i7-13700H, -march=native:
    fabsf16 from 0x3c00 to 0x4000:	 1.05268 ns
    fabsf16 from 0 to 0x400:	 1.05171 ns
    
    Disassembly:
    0000000000002210 <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
        2210:       c5 f9 7e c0             vmovd  eax,xmm0
        2214:       66 25 ff 7f             and    ax,0x7fff
        2218:       c5 f9 6e c0             vmovd  xmm0,eax
        221c:       c3                      ret
  • Clang 18, Google Tensor G3, -Xclang -target-feature -Xclang +fullfp16:
    fabsf16 from 0x3c00 to 0x4000:	 0.873171 ns
    fabsf16 from 0 to 0x400:	 0.675122 ns
    
    Disassembly:
    0000000000024cdc <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
       24cdc: 1ee0c000      fabs    h0, h0
       24ce0: d65f03c0      ret

After:

  • GCC 14, Intel Core i7-13700H:
    fabsf16 from 0x3c00 to 0x4000:	 1.05366 ns
    fabsf16 from 0 to 0x400:	 1.05366 ns
    
    Disassembly:
    0000000000002210 <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
        2210:       0f b7 05 77 e8 ff ff    movzx  eax,WORD PTR [rip+0xffffffffffffe877]        # a8e <__llvm_libc_20_0_0_git::fputil>
        2217:       c5 f9 7e c2             vmovd  edx,xmm0
        221b:       21 c2                   and    edx,eax
        221d:       c5 f9 6e c2             vmovd  xmm0,edx
        2221:       c3                      ret
  • GCC 14, Intel Core i7-13700H, -march=native:
    fabsf16 from 0x3c00 to 0x4000:	 1.05366 ns
    fabsf16 from 0 to 0x400:	 1.05366 ns
    
    Disassembly:
    0000000000002220 <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
        2220:       0f b7 05 67 e8 ff ff    movzx  eax,WORD PTR [rip+0xffffffffffffe867]        # a8e <__llvm_libc_20_0_0_git::fputil>
        2227:       c5 f9 7e c2             vmovd  edx,xmm0
        222b:       21 c2                   and    edx,eax
        222d:       c5 f9 6e c2             vmovd  xmm0,edx
        2231:       c3                      ret
  • Clang 18, Google Tensor G3, -Xclang -target-feature -Xclang +fullfp16:
    fabsf16 from 0x3c00 to 0x4000:	 0.873171 ns
    fabsf16 from 0 to 0x400:	 0.674146 ns
    
    Disassembly:
    0000000000024cdc <__llvm_libc_20_0_0_git::fabsf16(_Float16)>:
       24cdc: d0ffff68      adrp    x8, 0x12000 <wmemchr+0x12000>
       24ce0: 1e260009      fmov    w9, s0
       24ce4: 794ad508      ldrh    w8, [x8, #0x56a]
       24ce8: 0a090108      and     w8, w8, w9
       24cec: 1e270100      fmov    s0, w8
       24cf0: d65f03c0      ret

@arsenm

return __builtin_fabsf16(x);
#elif defined(LIBC_TARGET_ARCH_IS_X86) && defined(LIBC_COMPILER_IS_CLANG)
// Prevent Clang from generating calls to slow soft-float conversion
// functions on x86. See https://godbolt.org/z/hvo6jbnGz.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just a codegen bug. it's wrong to introduce a canonicalizing extend when legalizing fabs. Don't workaround it here?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants