Description
The following trivial sequence results in local memory usage because LLVM/NVPTX decides it needs a local copy of the object that is passed by reference:
julia> using CUDA
julia> @noinline child(a) = @inbounds a[] = 1
child (generic function with 1 method)
julia> parent(a) = (child(a); nothing)
parent (generic function with 1 method)
julia> CUDA.code_ptx(parent, Tuple{CuDeviceArray{Float32,1,AS.Global}}; kernel=true)
//
// Generated by LLVM NVPTX Back-End
//
.version 6.3
.target sm_75
.address_size 64
// .globl _Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE // -- Begin function _Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE
.extern .func julia_child_2622
(
.param .b64 julia_child_2622_param_0
)
;
// @_Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE
.visible .entry _Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE(
.param .align 8 .b8 _Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE_param_0[24]
)
{
.local .align 8 .b8 __local_depot0[24];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b64 %rd<6>;
// %bb.0: // %entry
mov.u64 %SPL, __local_depot0;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [_Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE_param_0+16];
ld.param.u64 %rd2, [_Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE_param_0+8];
ld.param.u64 %rd3, [_Z17julia_parent_261913CuDeviceArrayI7Float32Li1ELi1EE_param_0];
add.u64 %rd4, %SP, 0;
add.u64 %rd5, %SPL, 0;
st.local.u64 [%rd5], %rd3;
st.local.u64 [%rd5+8], %rd2;
st.local.u64 [%rd5+16], %rd1;
{ // callseq 8, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd4;
call.uni
julia_child_2622,
(
param0
);
} // callseq 8
ret;
// -- End function
}
This is also a problem with C++ code:
struct SomeStruct {
int* something;
};
__device__ __attribute__((noinline)) void child(const SomeStruct state) {
*state.something = 1;
return;
}
__global__ void parent(const SomeStruct state) {
child(state);
return;
}
clang -O3 -S --cuda-device-only --cuda-gpu-arch=sm_70 -emit-llvm -o - wip.cu
%struct.SomeStruct = type { i32* }
; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn
define dso_local void @_Z5child10SomeStruct(%struct.SomeStruct* nocapture readonly byval(%struct.SomeStruct) align 8 %state) local_unnamed_addr #0 {
entry:
%something = getelementptr inbounds %struct.SomeStruct, %struct.SomeStruct* %state, i64 0, i32 0
%0 = load i32*, i32** %something, align 8
store i32 1, i32* %0, align 4
ret void
}
; Function Attrs: convergent mustprogress nofree norecurse nosync nounwind willreturn
define dso_local void @_Z6parent10SomeStruct(%struct.SomeStruct* nocapture readonly byval(%struct.SomeStruct) align 8 %state) local_unnamed_addr #1 {
entry:
%agg.tmp = alloca %struct.SomeStruct, align 8
%0 = bitcast %struct.SomeStruct* %state to i64*
%1 = bitcast %struct.SomeStruct* %agg.tmp to i64*
%2 = load i64, i64* %0, align 8
store i64 %2, i64* %1, align 8
tail call void @_Z5child10SomeStruct(%struct.SomeStruct* nonnull byval(%struct.SomeStruct) align 8 %agg.tmp) #2
ret void
}
The corresponding PTX:
.visible .func _Z5child10SomeStruct(
.param .align 8 .b8 _Z5child10SomeStruct_param_0[8]
)
{
.reg .b32 %r<2>;
.reg .b64 %rd<2>;
ld.param.u64 %rd1, [_Z5child10SomeStruct_param_0];
mov.u32 %r1, 1;
st.u32 [%rd1], %r1;
ret;
}
// .globl _Z6parent10SomeStruct
.visible .entry _Z6parent10SomeStruct(
.param .align 8 .b8 _Z6parent10SomeStruct_param_0[8]
)
{
.local .align 8 .b8 __local_depot1[8];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b64 %rd<5>;
mov.u64 %SPL, __local_depot1;
cvta.local.u64 %SP, %SPL;
add.u64 %rd2, %SPL, 0;
ld.param.u64 %rd3, [_Z6parent10SomeStruct_param_0];
st.local.u64 [%rd2], %rd3;
ld.u64 %rd4, [%SP+0];
{ // callseq 0, 0
.reg .b32 temp_param_reg;
.param .align 8 .b8 param0[8];
st.param.b64 [param0+0], %rd4;
call.uni
_Z5child10SomeStruct,
(
param0
);
} // callseq 0
ret;
}
I would have expected the nocapture readonly byval attributes to be enough for LLVM to know that a copy isn't necessary, but alas. See #1167 (comment) for more details on why this doesn't currently work. nvcc is better at this, avoiding the local copy for the same input:
nvcc -arch=sm_70 wip.cu -ptx -o -
.func _Z5child10SomeStruct(
.param .align 8 .b8 _Z5child10SomeStruct_param_0[8]
)
{
.reg .b32 %r<2>;
.reg .b64 %rd<2>;
ld.param.u64 %rd1, [_Z5child10SomeStruct_param_0];
mov.u32 %r1, 1;
st.u32 [%rd1], %r1;
ret;
}
// .globl _Z6parent10SomeStruct
.visible .entry _Z6parent10SomeStruct(
.param .align 8 .b8 _Z6parent10SomeStruct_param_0[8]
)
{
.reg .b64 %rd<2>;
ld.param.u64 %rd1, [_Z6parent10SomeStruct_param_0];
{ // callseq 0, 0
.reg .b32 temp_param_reg;
.param .align 8 .b8 param0[8];
st.param.b64 [param0+0], %rd1;
call.uni
_Z5child10SomeStruct,
(
param0
);
} // callseq 0
ret;
}
Of course, once you start mutating the object even nvcc emits a local depot, but in Julia our structs are immutable, so we should be able to emit better code: we guarantee that these objects can't be modified. Changing the calling convention to pass objects by value seems pretty invasive though; maybe we should instead look into teaching LLVM about nocapture readonly byval arguments.
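To illustrate the mutation caveat, here is a hypothetical variant of the C++ example above (child_mut is a made-up name, not code from this issue): once the callee writes to its by-value parameter, a writable private copy of the struct genuinely has to exist, so the copy we are trying to get rid of becomes unavoidable.
struct SomeStruct {
    int* something;
};
// Hypothetical: the callee now writes to its by-value parameter. C++ value
// semantics require that such writes never be visible to the caller, so a
// writable private copy of the struct must be materialized somewhere.
__device__ __attribute__((noinline)) void child_mut(SomeStruct state) {
    *state.something = 1;
    state.something = nullptr; // mutation of the argument itself
    return;
}
__global__ void parent(const SomeStruct state) {
    child_mut(state);
    return;
}
Julia's immutable structs rule this kind of write out by construction, which is why the copy should be elidable on our side even though it isn't in general C++.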