Skip to content

Do copy[_nonoverlapping]/swap[_nonoverlapping] do typed copies? #63159

Closed
@gnzlbg

Description

@gnzlbg

The following example (godbolt):

/// Test type for this report: a single `u8` payload with the alignment forced
/// to 128 via `repr(align(128))`. The LLVM type shown below for it is
/// `{ [0 x i8], i8, [127 x i8] }`, i.e. one data byte followed by 127 bytes of
/// trailing padding.
#[repr(align(128))] #[derive(Copy, Clone)] pub struct A(u8);

/// Baseline case: copies the `A` out of the reference with a plain dereference
/// and returns it by value.
pub unsafe fn foo(x: &A) -> A { 
    *x
}

pub unsafe fn bar(x: &A) -> A { 
    let mut y: A = std::mem::uninitialized();
    std::ptr::copy_nonoverlapping(
        x as *const A, &mut y as *mut A, 1
    );
    y
}

produces the following LLVM-IR:

%A = type { [0 x i8], i8, [127 x i8] }

define void @_ZN7example3foo17h89c067fa0b9a17bcE(%A* noalias nocapture sret dereferenceable(128), %A* noalias nocapture readonly align 128 dereferenceable(128) %x) unnamed_addr #0 {
  %1 = getelementptr inbounds %A, %A* %0, i64 0, i32 0, i64 0
  %2 = getelementptr inbounds %A, %A* %x, i64 0, i32 0, i64 0
  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 128 %1, i8* nonnull align 128 %2, i64 128, i1 false)
  ret void
}

define void @_ZN7example3bar17hd5d27715385ba486E(%A* noalias nocapture sret dereferenceable(128), %A* noalias nocapture readonly align 128 dereferenceable(128) %x) unnamed_addr #0 {
  %1 = getelementptr inbounds %A, %A* %x, i64 0, i32 0, i64 0
  %2 = getelementptr inbounds %A, %A* %0, i64 0, i32 0, i64 0
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 128 %2, i8* nonnull align 128 %1, i64 128, i1 false) #2
  ret void
}

and machine code:

example::foo:
        mov     rax, rdi
        movaps  xmm0, xmmword ptr [rsi + 112]
        movaps  xmmword ptr [rdi + 112], xmm0
        movaps  xmm0, xmmword ptr [rsi + 96]
        movaps  xmmword ptr [rdi + 96], xmm0
        movaps  xmm0, xmmword ptr [rsi + 80]
        movaps  xmmword ptr [rdi + 80], xmm0
        movaps  xmm0, xmmword ptr [rsi + 64]
        movaps  xmmword ptr [rdi + 64], xmm0
        movaps  xmm0, xmmword ptr [rsi]
        movaps  xmm1, xmmword ptr [rsi + 16]
        movaps  xmm2, xmmword ptr [rsi + 32]
        movaps  xmm3, xmmword ptr [rsi + 48]
        movaps  xmmword ptr [rdi + 48], xmm3
        movaps  xmmword ptr [rdi + 32], xmm2
        movaps  xmmword ptr [rdi + 16], xmm1
        movaps  xmmword ptr [rdi], xmm0
        ret

example::bar:
        mov     rax, rdi
        movaps  xmm0, xmmword ptr [rsi + 112]
        movaps  xmmword ptr [rdi + 112], xmm0
        movaps  xmm0, xmmword ptr [rsi + 96]
        movaps  xmmword ptr [rdi + 96], xmm0
        movaps  xmm0, xmmword ptr [rsi + 80]
        movaps  xmmword ptr [rdi + 80], xmm0
        movaps  xmm0, xmmword ptr [rsi + 64]
        movaps  xmmword ptr [rdi + 64], xmm0
        movaps  xmm0, xmmword ptr [rsi]
        movaps  xmm1, xmmword ptr [rsi + 16]
        movaps  xmm2, xmmword ptr [rsi + 32]
        movaps  xmm3, xmmword ptr [rsi + 48]
        movaps  xmmword ptr [rdi + 48], xmm3
        movaps  xmmword ptr [rdi + 32], xmm2
        movaps  xmmword ptr [rdi + 16], xmm1
        movaps  xmmword ptr [rdi], xmm0
        ret

where 128 bytes are copied every time a value of type A is moved/copied/read/...

However, one actually only has to copy a single byte, since all other bytes are trailing padding. The expected machine code is (godbolt):

example::foo:
        mov     al, byte ptr [rdi]
        ret

example::bar:
        mov     al, byte ptr [rdi]
        ret

cc @nikic @rkruppe

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-LLVM — Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.
    C-enhancement — Category: An issue proposing an enhancement or a PR with one.
    I-slow — Issue: Problems and improvements with respect to performance of generated code.
    T-compiler — Relevant to the compiler team, which will review and decide on the PR/issue.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions