Description
I tried this code:
/// Reproducer entry point: returns `1 + pext_wrapper(a, b)`.
///
/// `#[unsafe(no_mangle)]` keeps the symbol name as plain `test`, which is how
/// this function shows up in the assembly and LLVM IR quoted in this report.
/// Note that this function carries no `#[target_feature]` attribute of its own.
#[unsafe(no_mangle)]
pub fn test(a: u64, b: u64) -> u64 {
// The `+ 1` is the `incq`/`add` that follows the pext call in the quoted output.
1 + pext_wrapper(a, b)
}
/// Thin wrapper that forwards `a` and `b` to the `_pext_u64` intrinsic and
/// returns its result unchanged.
///
/// Marked `#[inline(always)]`. The import of `_pext_u64` is not shown in this
/// snippet; presumably it is `core::arch::x86_64::_pext_u64`, matching the
/// symbol named in the LLVM IR of this report.
#[inline(always)]
pub fn pext_wrapper(a: u64, b: u64) -> u64 {
// SAFETY: NOTE(review) - nothing in this reproducer checks that the running
// CPU supports BMI2, which the intrinsic is compiled for (`+bmi2` in the IR).
// Real callers should confirm the feature is available before calling.
unsafe {_pext_u64(a, b) }
}
I expected to see this happen: `test()` should compile into 3 asm instructions:
pext rax, a, b
inc rax
ret
Instead, this happened:
_ZN4core9core_arch6x86_644bmi29_pext_u6417he8cc84cfeae4b65aE:
.cfi_startproc
pextq %rsi, %rdi, %rax
retq
# ...
callq _ZN4core9core_arch6x86_644bmi29_pext_u6417he8cc84cfeae4b65aE
incq %rax
retq
Meta
`rustc --version --verbose`:
rustc 1.87.0-nightly (920d95eaf 2025-03-28)
binary: rustc
commit-hash: 920d95eaf23d7eb6b415d09868e4f793024fa604
commit-date: 2025-03-28
host: x86_64-unknown-linux-gnu
release: 1.87.0-nightly
LLVM version: 20.1.1
Bug description
This is caused by `rustc` not annotating the caller function (`test()`) with the `"target-features"="+bmi2"` attribute when generating LLVM IR. However, the callee — the `_pext_u64()` intrinsic that `pext_wrapper()` forwards to — is annotated with that attribute, and therefore `opt` will not inline it, since the callee requires a microarchitecture feature that `test()` does not explicitly support.
Either annotating `test()` with `"target-features"="+bmi2"`, OR removing the attribute from the callee in the LLVM IR, makes inlining possible.
I feel that this is a non-trivial optimization. In my use case, I increased the throughput of my program by 10% by switching to inline assembly. The main purpose of intrinsic functions is to speed up an otherwise slow operation, so paying function-call overhead to execute a single assembly instruction is counterintuitive and slow.
To fix this, I think `rustc` should either inline intrinsics before handing the code to LLVM, or somehow add the required feature attributes to functions that call inlinable intrinsic wrappers.
LLVM IR of the above code, generated with `cargo llvm-ir`:
; ModuleID = 'lib.699d057212ea87f8-cgu.0'
source_filename = "lib.699d057212ea87f8-cgu.0"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; core::core_arch::x86_64::bmi2::_pext_u64
; Function Attrs: inlinehint nonlazybind uwtable
define internal noundef i64 @_ZN4core9core_arch6x86_644bmi29_pext_u6417he8cc84cfeae4b65aE(i64 noundef %a, i64 noundef %mask) unnamed_addr #0 {
start:
%_0 = call noundef i64 @llvm.x86.bmi.pext.64(i64 noundef %a, i64 noundef %mask) #3
ret i64 %_0
}
; Function Attrs: nonlazybind uwtable
define noundef i64 @test(i64 noundef %a, i64 noundef %b) unnamed_addr #1 {
start:
; call core::core_arch::x86_64::bmi2::_pext_u64
%_3 = call noundef i64 @_ZN4core9core_arch6x86_644bmi29_pext_u6417he8cc84cfeae4b65aE(i64 noundef %a, i64 noundef %b)
%_0 = add i64 1, %_3
ret i64 %_0
}
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare i64 @llvm.x86.bmi.pext.64(i64, i64) unnamed_addr #2
attributes #0 = { inlinehint nonlazybind uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" "target-features"="+bmi2" }
attributes #1 = { nonlazybind uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" } ; adding "target-features"="+bmi2" here allows opt to inline
attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
attributes #3 = { nounwind }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 8, !"PIC Level", i32 2}
!1 = !{i32 2, !"RtLibUseGOT", i32 1}
!2 = !{!"rustc version 1.87.0-nightly (920d95eaf 2025-03-28)"}