Skip to content

MaybeUninit seems to prevent RVO in even the most trivial cases. #90595

Closed
@mcy

Description

@mcy

I tried this code:

/// Newtype wrapping a 1000-byte array.
///
/// In the IR shown for this report, functions returning `Foo` receive an
/// `sret` out-pointer for the return value.
pub struct Foo([u8; 1000]);

extern "C" {
  /// Externally defined C-ABI function that takes a raw pointer to a `Foo`.
  ///
  /// Its definition is not part of this snippet; presumably it writes the
  /// contents of `*p` — TODO confirm against the real implementation.
  fn init(p: *mut Foo);
}

impl Foo {
  /// Builds a `Foo` by passing uninitialized storage to `init` and then
  /// treating that storage as an initialized `Foo`.
  ///
  /// This is the variant under investigation: its optimized output still
  /// copies the local into the return slot with `memcpy`.
  pub fn new_from_uninit() -> Self {
    // The `MaybeUninit` type parameter is inferred as `Foo` from the
    // `as_mut_ptr()` argument passed to `init` below.
    let mut x = std::mem::MaybeUninit::uninit();
    // SAFETY: assumes `init` fully initializes `*p` before returning, which is
    // what `assume_init` requires — not verifiable from this snippet.
    unsafe {
      init(x.as_mut_ptr());
      x.assume_init()
    }
  }

  /// Builds a `Foo` by zero-filling it first and then passing it to `init`.
  ///
  /// This is the comparison variant: its optimized output writes directly
  /// into the return slot (`memset` followed by the call to `init`).
  pub fn new() -> Self {
    let mut x = Self([0; 1000]);
    // `&mut x` coerces to `*mut Foo` at the call site.
    // SAFETY: the pointer comes from a live, initialized local; assumes `init`
    // only accesses the pointee for the duration of the call — TODO confirm.
    unsafe { init(&mut x) }
    x
  }
}

The generated assembly at -Oz on x86-64 is:

example::Foo::new_from_uninit:
        push    r14
        push    rbx
        sub     rsp, 1000
        mov     rbx, rdi
        mov     r14, rsp
        mov     rdi, r14
        call    qword ptr [rip + init@GOTPCREL]
        mov     edx, 1000
        mov     rdi, rbx
        mov     rsi, r14
        call    qword ptr [rip + memcpy@GOTPCREL] // No RVO! >:(
        mov     rax, rbx
        add     rsp, 1000
        pop     rbx
        pop     r14
        ret

example::Foo::new:
        push    rbx
        mov     rbx, rdi
        mov     edx, 1000
        xor     esi, esi
        call    qword ptr [rip + memset@GOTPCREL]
        mov     rdi, rbx
        call    qword ptr [rip + init@GOTPCREL]
        mov     rax, rbx
        pop     rbx
        ret

Observe that Rust (or LLVM, as the case may be) fails to apply RVO to `x` when using MaybeUninit, so the MaybeUninit version ironically has potentially worse performance than the version that calls memset.

Complete godbolt example: https://godbolt.org/z/c1ccf7WeK

Here is the pertinent optimized IR:

define void @_ZN7example3Foo15new_from_uninit17h20aebee91382058eE(%Foo* noalias nocapture sret(%Foo) dereferenceable(1000) %0) unnamed_addr #0 !dbg !6 {
start:
  %x = alloca %"std::mem::MaybeUninit<Foo>", align 1
  %1 = getelementptr inbounds %"std::mem::MaybeUninit<Foo>", %"std::mem::MaybeUninit<Foo>"* %x, i64 0, i32 0, i64 0, !dbg !11
  call void @llvm.lifetime.start.p0i8(i64 1000, i8* nonnull %1), !dbg !11
  
  %2 = bitcast %"std::mem::MaybeUninit<Foo>"* %x to %Foo*, !dbg !12
  call void @init(%Foo* nonnull %2), !dbg !20
  %3 = getelementptr inbounds %Foo, %Foo* %0, i64 0, i32 0, i64 0
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 1 dereferenceable(1000) %3, i8* noundef nonnull align 1 dereferenceable(1000) %1, i64 1000, i1 false), !dbg !21
  call void @llvm.lifetime.end.p0i8(i64 1000, i8* nonnull %1), !dbg !22
  ret void, !dbg !23
}

define void @_ZN7example3Foo3new17h8ad79a0e3ddd97ffE(%Foo* noalias nocapture sret(%Foo) dereferenceable(1000) %x) unnamed_addr #0 !dbg !24 {
start:
  %x89 = getelementptr inbounds %Foo, %Foo* %x, i64 0, i32 0, i64 0
  call void @llvm.memset.p0i8.i64(i8* noundef nonnull align 1 dereferenceable(1000) %x89, i8 0, i64 1000, i1 false), !dbg !25
  tail call void @init(%Foo* nonnull %x), !dbg !26
  ret void, !dbg !27
}

Skimming the -O0 IR doesn't help much, since rustc needs to generate a whole mess of code in the MaybeUninit version. What is very mysterious to me is that the bitcast in @_ZN7example3Foo15new_from_uninit17h20aebee91382058eE somehow seems to act as a barrier that prevents merging %x with the return slot %0.

This seems like an LLVM bug but filing here first in case it's bad IR codegen on rustc's part.

Metadata

Metadata

Assignees

Labels

C-bug (Category: This is a bug.) · I-slow (Issue: Problems and improvements with respect to performance of generated code.)

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions