Skip to content

Passing kernel argument objects by reference results in local copies #1168

Open
@maleadt

Description

@maleadt

The following trivial sequence results in local memory usage because LLVM/NVPTX decides it needs a local copy of the object that's being passed by-reference:

julia> using CUDA

julia> @noinline child(a) = @inbounds a[] = 1
child (generic function with 1 method)

julia> parent(a) = (child(a); nothing)
parent (generic function with 1 method)

julia> CUDA.code_ptx(parent, Tuple{CuDeviceArray{Float32,1,AS.Global}}; kernel=true)
//
// Generated by LLVM NVPTX Back-End
//

.version 6.3
.target sm_75
.address_size 64

        // .globl       _Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE // -- Begin function _Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE
.extern .func julia_child_2622
(
        .param .b64 julia_child_2622_param_0
)
;
                                        // @_Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE
.visible .entry _Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE(
        .param .align 8 .b8 _Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE_param_0[24]
)
{
        .local .align 8 .b8     __local_depot0[24];
        .reg .b64       %SP;
        .reg .b64       %SPL;
        .reg .b64       %rd<6>;

// %bb.0:                               // %entry
        mov.u64         %SPL, __local_depot0;
        cvta.local.u64  %SP, %SPL;
        ld.param.u64    %rd1, [_Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE_param_0+16];
        ld.param.u64    %rd2, [_Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE_param_0+8];
        ld.param.u64    %rd3, [_Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE_param_0];
        add.u64         %rd4, %SP, 0;
        add.u64         %rd5, %SPL, 0;
        st.local.u64    [%rd5], %rd3;
        st.local.u64    [%rd5+8], %rd2;
        st.local.u64    [%rd5+16], %rd1;
        { // callseq 8, 0
        .reg .b32 temp_param_reg;
        .param .b64 param0;
        st.param.b64    [param0+0], %rd4;
        call.uni 
        julia_child_2622, 
        (
        param0
        );
        } // callseq 8
        ret;
                                        // -- End function
}

This is also a problem with C++ code:

struct SomeStruct {
    int* something;
};

__device__ __attribute__((noinline)) void child(const SomeStruct state) {
    *state.something = 1;
    return;
}

__global__ void parent(const SomeStruct state) {
    child(state);
    return;
}
clang -O3 -S  --cuda-device-only --cuda-gpu-arch=sm_70 -emit-llvm -o - wip.cu
%struct.SomeStruct = type { i32* }

; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn
define dso_local void @_Z5child10SomeStruct(%struct.SomeStruct* nocapture readonly byval(%struct.SomeStruct) align 8 %state) local_unnamed_addr #0 {
entry:
  %something = getelementptr inbounds %struct.SomeStruct, %struct.SomeStruct* %state, i64 0, i32 0
  %0 = load i32*, i32** %something, align 8
  store i32 1, i32* %0, align 4
  ret void
}

; Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn
define dso_local void @_Z6parent10SomeStruct(%struct.SomeStruct* nocapture readonly byval(%struct.SomeStruct) align 8 %state) local_unnamed_addr #1 {
entry:
  %agg.tmp = alloca %struct.SomeStruct, align 8
  %0 = bitcast %struct.SomeStruct* %state to i64*
  %1 = bitcast %struct.SomeStruct* %agg.tmp to i64*
  %2 = load i64, i64* %0, align 8
  store i64 %2, i64* %1, align 8
  tail call void @_Z5child10SomeStruct(%struct.SomeStruct* nonnull byval(%struct.SomeStruct) align 8 %agg.tmp) #2
  ret void
}
.visible .func _Z5child10SomeStruct(
        .param .align 8 .b8 _Z5child10SomeStruct_param_0[8]
)
{
        .reg .b32       %r<2>;
        .reg .b64       %rd<2>;

        ld.param.u64    %rd1, [_Z5child10SomeStruct_param_0];
        mov.u32         %r1, 1;
        st.u32  [%rd1], %r1;
        ret;

}
        // .globl       _Z6parent10SomeStruct
.visible .entry _Z6parent10SomeStruct(
        .param .align 8 .b8 _Z6parent10SomeStruct_param_0[8]
)
{
        .local .align 8 .b8     __local_depot1[8];
        .reg .b64       %SP;
        .reg .b64       %SPL;
        .reg .b64       %rd<5>;

        mov.u64         %SPL, __local_depot1;
        cvta.local.u64  %SP, %SPL;
        add.u64         %rd2, %SPL, 0;
        ld.param.u64    %rd3, [_Z6parent10SomeStruct_param_0];
        st.local.u64    [%rd2], %rd3;
        ld.u64  %rd4, [%SP+0];
        { // callseq 0, 0
        .reg .b32 temp_param_reg;
        .param .align 8 .b8 param0[8];
        st.param.b64    [param0+0], %rd4;
        call.uni 
        _Z5child10SomeStruct, 
        (
        param0
        );
        } // callseq 0
        ret;

}

I would have expected the nocapture readonly byval to be enough for LLVM to know a copy isn't necessary, but alas. See #1167 (comment) for more details why this doesn't currently work. nvcc is better at this, avoiding the local copy for the same input:

nvcc -arch=sm_70 wip.cu -ptx -o -
.func _Z5child10SomeStruct(
        .param .align 8 .b8 _Z5child10SomeStruct_param_0[8]
)
{
        .reg .b32       %r<2>;
        .reg .b64       %rd<2>;


        ld.param.u64    %rd1, [_Z5child10SomeStruct_param_0];
        mov.u32         %r1, 1;
        st.u32  [%rd1], %r1;
        ret;

}
        // .globl       _Z6parent10SomeStruct
.visible .entry _Z6parent10SomeStruct(
        .param .align 8 .b8 _Z6parent10SomeStruct_param_0[8]
)
{
        .reg .b64       %rd<2>;


        ld.param.u64    %rd1, [_Z6parent10SomeStruct_param_0];
        { // callseq 0, 0
        .reg .b32 temp_param_reg;
        .param .align 8 .b8 param0[8];
        st.param.b64    [param0+0], %rd1;
        call.uni 
        _Z5child10SomeStruct, 
        (
        param0
        );
        } // callseq 0
        ret;

}

Of course, when you start mutating the object even nvcc emits a local depot, but with Julia our structs are immutable, so we should be able to emit better code since we're guaranteeing these objects can't modified. Changing the calling convention to pass objects by value seems pretty invasive though; maybe we should look into teaching LLVM about nocapture readonly byval arguments.

Metadata

Metadata

Assignees

No one assigned

    Labels

    cuda kernelsStuff about writing CUDA kernels.performanceHow fast can we go?

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions