Closed
Description
I tried this code:
/// Tuple struct wrapping a 1000-byte array.
pub struct Foo([u8; 1000]);
// Foreign function declared with the C ABI; its definition is not part of this snippet.
// NOTE(review): presumably it fully initializes the `Foo` behind `p` — `new_from_uninit` relies on that; confirm.
extern "C" {
fn init(p: *mut Foo);
}
impl Foo {
/// Builds a `Foo` by handing `init` a pointer to uninitialized storage and
/// then treating that storage as initialized.
///
/// NOTE(review): soundness depends on `init` writing a valid `Foo` through
/// the pointer before `assume_init` is called — confirm against the C side.
pub fn new_from_uninit() -> Self {
// The `MaybeUninit` element type is inferred as `Foo` from the `init` call below.
let mut x = std::mem::MaybeUninit::uninit();
unsafe {
init(x.as_mut_ptr());
// Takes the value out of the `MaybeUninit` and returns it by value.
x.assume_init()
}
}
/// Builds a `Foo` by zero-filling it first and then passing it to `init`.
pub fn new() -> Self {
let mut x = Self([0; 1000]);
// `&mut x` coerces to the `*mut Foo` that `init` expects.
unsafe { init(&mut x) }
x
}
}
The generated assembly at -Oz on x86-64 is:
example::Foo::new_from_uninit:
push r14
push rbx
// Reserve 1000 bytes of stack for a local buffer.
sub rsp, 1000
// Keep the incoming rdi in rbx across the calls.
// NOTE(review): presumably the caller-provided return slot (the sret pointer %0 in the IR) — confirm.
mov rbx, rdi
mov r14, rsp
// init is handed the stack buffer (rsp), not the pointer saved in rbx.
mov rdi, r14
call qword ptr [rip + init@GOTPCREL]
// memcpy(rbx, r14, 1000): copy the stack buffer to the address saved from rdi.
mov edx, 1000
mov rdi, rbx
mov rsi, r14
call qword ptr [rip + memcpy@GOTPCREL] // No RVO! >:(
// Hand the saved pointer back in rax.
mov rax, rbx
add rsp, 1000
pop rbx
pop r14
ret
example::Foo::new:
push rbx
// Keep the incoming rdi in rbx; it is reused for both calls below.
mov rbx, rdi
// memset(rdi, 0, 1000) directly on the pointer received in rdi.
mov edx, 1000
xor esi, esi
call qword ptr [rip + memset@GOTPCREL]
// init is called on that same pointer: no stack buffer and no memcpy here.
mov rdi, rbx
call qword ptr [rip + init@GOTPCREL]
mov rax, rbx
pop rbx
ret
Observe that Rust (or LLVM, as the case may be) fails to RVO `x` when using `MaybeUninit`, ironically having potentially worse performance than the one that calls `memset`.
Complete godbolt example: https://godbolt.org/z/c1ccf7WeK
Here is the pertinent optimized IR:
define void @_ZN7example3Foo15new_from_uninit17h20aebee91382058eE(%Foo* noalias nocapture sret(%Foo) dereferenceable(1000) %0) unnamed_addr #0 !dbg !6 {
start:
; %x is a separate stack allocation, distinct from the sret return slot %0.
%x = alloca %"std::mem::MaybeUninit<Foo>", align 1
%1 = getelementptr inbounds %"std::mem::MaybeUninit<Foo>", %"std::mem::MaybeUninit<Foo>"* %x, i64 0, i32 0, i64 0, !dbg !11
call void @llvm.lifetime.start.p0i8(i64 1000, i8* nonnull %1), !dbg !11
; init receives the alloca (through the bitcast %2), not %0.
%2 = bitcast %"std::mem::MaybeUninit<Foo>"* %x to %Foo*, !dbg !12
call void @init(%Foo* nonnull %2), !dbg !20
; The 1000 bytes are then copied from the alloca (%1) into the return slot (%3, derived from %0).
%3 = getelementptr inbounds %Foo, %Foo* %0, i64 0, i32 0, i64 0
call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 1 dereferenceable(1000) %3, i8* noundef nonnull align 1 dereferenceable(1000) %1, i64 1000, i1 false), !dbg !21
call void @llvm.lifetime.end.p0i8(i64 1000, i8* nonnull %1), !dbg !22
ret void, !dbg !23
}
define void @_ZN7example3Foo3new17h8ad79a0e3ddd97ffE(%Foo* noalias nocapture sret(%Foo) dereferenceable(1000) %x) unnamed_addr #0 !dbg !24 {
start:
; Both the memset and the init call operate directly on the sret argument %x;
; there is no alloca and no memcpy in this function.
%x89 = getelementptr inbounds %Foo, %Foo* %x, i64 0, i32 0, i64 0
call void @llvm.memset.p0i8.i64(i8* noundef nonnull align 1 dereferenceable(1000) %x89, i8 0, i64 1000, i1 false), !dbg !25
tail call void @init(%Foo* nonnull %x), !dbg !26
ret void, !dbg !27
}
Skimming the `-O0` IR doesn't help much, since Rust needs to generate a whole mess of code in the `MaybeUninit` version. The thing that's very mysterious to me is that, somehow, the `bitcast` in `@_ZN7example3Foo15new_from_uninit17h20aebee91382058eE` is acting as a barrier for merging `%x` with the return slot `%0`??
This seems like an LLVM bug but filing here first in case it's bad IR codegen on rustc's part.