Description
The following trivial sequence results in local memory usage because LLVM/NVPTX decides it needs a local copy of the object that is passed by reference:
julia> using CUDA
julia> @noinline child(a) = @inbounds a[] = 1
child (generic function with 1 method)
julia> parent(a) = (child(a); nothing)
parent (generic function with 1 method)
julia> CUDA.code_ptx(parent, Tuple{CuDeviceArray{Float32,1,AS.Global}}; kernel=true)
//
// Generated by LLVM NVPTX Back-End
//
.version 6.3
.target sm_75
.address_size 64
// .globl _Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE // -- Begin function _Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE
.extern .func julia_child_2622
(
.param .b64 julia_child_2622_param_0
)
;
// @_Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE
.visible .entry _Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE(
.param .align 8 .b8 _Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE_param_0[24]
)
{
.local .align 8 .b8 __local_depot0[24];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b64 %rd<6>;
// %bb.0: // %entry
mov.u64 %SPL, __local_depot0;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [_Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE_param_0+16];
ld.param.u64 %rd2, [_Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE_param_0+8];
ld.param.u64 %rd3, [_Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE_param_0];
add.u64 %rd4, %SP, 0;
add.u64 %rd5, %SPL, 0;
st.local.u64 [%rd5], %rd3;
st.local.u64 [%rd5+8], %rd2;
st.local.u64 [%rd5+16], %rd1;
{ // callseq 8, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd4;
call.uni
julia_child_2622,
(
param0
);
} // callseq 8
ret;
// -- End function
}
This is also a problem with C++ code:
struct SomeStruct {
int* something;
};
__device__ __attribute__((noinline)) void child(const SomeStruct state) {
*state.something = 1;
return;
}
__global__ void parent(const SomeStruct state) {
child(state);
return;
}
clang -O3 -S --cuda-device-only --cuda-gpu-arch=sm_70 -emit-llvm -o - wip.cu
%struct.SomeStruct = type { i32* }
; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn
define dso_local void @_Z5child10SomeStruct(%struct.SomeStruct* nocapture readonly byval(%struct.SomeStruct) align 8 %state) local_unnamed_addr #0 {
entry:
%something = getelementptr inbounds %struct.SomeStruct, %struct.SomeStruct* %state, i64 0, i32 0
%0 = load i32*, i32** %something, align 8
store i32 1, i32* %0, align 4
ret void
}
; Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn
define dso_local void @_Z6parent10SomeStruct(%struct.SomeStruct* nocapture readonly byval(%struct.SomeStruct) align 8 %state) local_unnamed_addr #1 {
entry:
%agg.tmp = alloca %struct.SomeStruct, align 8
%0 = bitcast %struct.SomeStruct* %state to i64*
%1 = bitcast %struct.SomeStruct* %agg.tmp to i64*
%2 = load i64, i64* %0, align 8
store i64 %2, i64* %1, align 8
tail call void @_Z5child10SomeStruct(%struct.SomeStruct* nonnull byval(%struct.SomeStruct) align 8 %agg.tmp) #2
ret void
}
The corresponding PTX:
.visible .func _Z5child10SomeStruct(
.param .align 8 .b8 _Z5child10SomeStruct_param_0[8]
)
{
.reg .b32 %r<2>;
.reg .b64 %rd<2>;
ld.param.u64 %rd1, [_Z5child10SomeStruct_param_0];
mov.u32 %r1, 1;
st.u32 [%rd1], %r1;
ret;
}
// .globl _Z6parent10SomeStruct
.visible .entry _Z6parent10SomeStruct(
.param .align 8 .b8 _Z6parent10SomeStruct_param_0[8]
)
{
.local .align 8 .b8 __local_depot1[8];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b64 %rd<5>;
mov.u64 %SPL, __local_depot1;
cvta.local.u64 %SP, %SPL;
add.u64 %rd2, %SPL, 0;
ld.param.u64 %rd3, [_Z6parent10SomeStruct_param_0];
st.local.u64 [%rd2], %rd3;
ld.u64 %rd4, [%SP+0];
{ // callseq 0, 0
.reg .b32 temp_param_reg;
.param .align 8 .b8 param0[8];
st.param.b64 [param0+0], %rd4;
call.uni
_Z5child10SomeStruct,
(
param0
);
} // callseq 0
ret;
}
I would have expected the nocapture readonly byval attributes to be enough for LLVM to know that a copy isn't necessary, but alas. See #1167 (comment) for more details on why this doesn't currently work. nvcc is better at this, avoiding the local copy for the same input:
nvcc -arch=sm_70 wip.cu -ptx -o -
.func _Z5child10SomeStruct(
.param .align 8 .b8 _Z5child10SomeStruct_param_0[8]
)
{
.reg .b32 %r<2>;
.reg .b64 %rd<2>;
ld.param.u64 %rd1, [_Z5child10SomeStruct_param_0];
mov.u32 %r1, 1;
st.u32 [%rd1], %r1;
ret;
}
// .globl _Z6parent10SomeStruct
.visible .entry _Z6parent10SomeStruct(
.param .align 8 .b8 _Z6parent10SomeStruct_param_0[8]
)
{
.reg .b64 %rd<2>;
ld.param.u64 %rd1, [_Z6parent10SomeStruct_param_0];
{ // callseq 0, 0
.reg .b32 temp_param_reg;
.param .align 8 .b8 param0[8];
st.param.b64 [param0+0], %rd1;
call.uni
_Z5child10SomeStruct,
(
param0
);
} // callseq 0
ret;
}
Of course, once you start mutating the object even nvcc emits a local depot, but in Julia our structs are immutable, so we should be able to emit better code: we guarantee that these objects can't be modified. Changing the calling convention to pass objects by value seems pretty invasive though; maybe we should instead look into teaching LLVM about nocapture readonly byval arguments.
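To illustrate the mutation caveat, here is a hypothetical variant of the C++ example above (child_mut is a made-up name, not code from this issue): once the callee writes to its by-value parameter, a writable private copy of the struct genuinely has to exist, so the copy we are trying to get rid of becomes unavoidable.
struct SomeStruct {
    int* something;
};
// Hypothetical: the callee now writes to its by-value parameter. C++ value
// semantics require that such writes never be visible to the caller, so a
// writable private copy of the struct must be materialized somewhere.
__device__ __attribute__((noinline)) void child_mut(SomeStruct state) {
    *state.something = 1;
    state.something = nullptr; // mutation of the argument itself
    return;
}
__global__ void parent(const SomeStruct state) {
    child_mut(state);
    return;
}
Julia's immutable structs rule this kind of write out by construction, which is why the copy should be elidable on our side even though it isn't in general C++.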