The following code demonstrates that on the CPU (i.e. with `Array`) we get allocation-free code, while on the GPU there are several boxes and allocations:
```julia
using CUDA
using CUDA: AS                     # address spaces (AS.Global)
using InteractiveUtils: code_llvm

@inbounds function test_kernel(xs, y)
    @inbounds x = xs[]
    @inbounds y[] = if x isa Int32
        1
    elseif x isa Int64
        2
    elseif x isa Float32
        3
    elseif x isa Float64
        4
    else
        -1
    end
    return
end

function test(Ts=Union{Int32,Int64,Float32,Float64}; kwargs...)
    code_llvm(test_kernel, Tuple{Vector{Ts}, Vector{Int}}; kwargs...)
    CUDA.code_llvm(test_kernel, Tuple{CuDeviceVector{Ts, AS.Global}, CuDeviceVector{Int, AS.Global}}; kwargs...)
end
```
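For reference, the comparison can then be run as follows (a sketch, assuming CUDA.jl is functional; the `debuginfo` keyword is only there to trim the output):

```julia
test()                   # defaults to Union{Int32,Int64,Float32,Float64}
test(; debuginfo=:none)  # same, without debug metadata in the IR
```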
CPU:
```llvm
define nonnull {}* @japi1_test_kernel_13215({}* %0, {}** %1, i32 %2) #0 {
top:
  %3 = alloca {}**, align 8
  store volatile {}** %1, {}*** %3, align 8
  %4 = load {}*, {}** %1, align 8
  %5 = getelementptr inbounds {}*, {}** %1, i64 1
  %6 = bitcast {}** %5 to i64***
  %7 = load i64**, i64*** %6, align 8
  %8 = bitcast {}* %4 to [1 x i64]**
  %9 = load [1 x i64]*, [1 x i64]** %8, align 8
  %10 = bitcast {}* %4 to { i8*, i64, i16, i16, i32 }*
  %11 = getelementptr inbounds { i8*, i64, i16, i16, i32 }, { i8*, i64, i16, i16, i32 }* %10, i64 0, i32 4
  %12 = load i32, i32* %11, align 4
  %13 = zext i32 %12 to i64
  %14 = bitcast {}* %4 to {}**
  %15 = getelementptr inbounds {}*, {}** %14, i64 4
  %16 = bitcast {}** %15 to i64*
  %17 = load i64, i64* %16, align 8
  %18 = sub nsw i64 %17, %13
  %19 = getelementptr inbounds [1 x i64], [1 x i64]* %9, i64 %18
  %20 = bitcast [1 x i64]* %19 to i8*
  %21 = sext i32 %12 to i64
  %22 = getelementptr inbounds i8, i8* %20, i64 %21
  %23 = load i8, i8* %22, align 1
  switch i8 %23, label %L33 [
    i8 2, label %L63
    i8 3, label %L63.fold.split
  ]

L33:                                              ; preds = %top
  %.not = icmp eq i8 %23, 0
  %. = select i1 %.not, i64 3, i64 4
  br label %L63

L63.fold.split:                                   ; preds = %top
  br label %L63

L63:                                              ; preds = %L63.fold.split, %L33, %top
  %value_phi1 = phi i64 [ 1, %top ], [ %., %L33 ], [ 2, %L63.fold.split ]
  %24 = load i64*, i64** %7, align 8
  store i64 %value_phi1, i64* %24, align 8
  ret {}* inttoptr (i64 140387449655304 to {}*)
}
```
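The CPU version switches directly on the selector byte that Julia stores alongside the isbits-union payload, so nothing is ever boxed. A minimal sketch to double-check this at run time, using the definitions above (the input values are arbitrary):

```julia
xs = Vector{Union{Int32,Int64,Float32,Float64}}([1.0f0])
y = [0]
test_kernel(xs, y)                            # compile first
@assert @allocated(test_kernel(xs, y)) == 0   # allocation-free on the CPU
@assert y[] == 3                              # Float32 branch taken
```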
GPU:
```llvm
define void @julia_test_kernel_13218({ i8 addrspace(1)*, i64, [1 x i64], i64 }* nocapture nonnull readonly align 8 dereferenceable(32) %0, { i8 addrspace(1)*, i64, [1 x i64], i64 }* nocapture nonnull readonly align 8 dereferenceable(32) %1) local_unnamed_addr #0 {
top:
  %2 = getelementptr inbounds { i8 addrspace(1)*, i64, [1 x i64], i64 }, { i8 addrspace(1)*, i64, [1 x i64], i64 }* %0, i64 0, i32 0
  %3 = getelementptr inbounds { i8 addrspace(1)*, i64, [1 x i64], i64 }, { i8 addrspace(1)*, i64, [1 x i64], i64 }* %0, i64 0, i32 1
  %4 = load i8 addrspace(1)*, i8 addrspace(1)** %2, align 8
  %5 = load i64, i64* %3, align 8
  %6 = getelementptr i8, i8 addrspace(1)* %4, i64 %5
  %7 = getelementptr i8, i8 addrspace(1)* %6, i64 1
  %8 = getelementptr i8, i8 addrspace(1)* %7, i64 -1
  %9 = load i8, i8 addrspace(1)* %8, align 1
  switch i8 %9, label %L71 [
    i8 0, label %L48
    i8 1, label %L57
    i8 2, label %L66
  ]

L48:                                              ; preds = %top
  %10 = bitcast i8 addrspace(1)* %4 to float addrspace(1)*
  %11 = load float, float addrspace(1)* %10, align 8
  %12 = call fastcc {}* @jl_box_float32(float %11)
  br label %L87

L57:                                              ; preds = %top
  %13 = bitcast i8 addrspace(1)* %4 to double addrspace(1)*
  %14 = load double, double addrspace(1)* %13, align 8
  %15 = call fastcc {}* @gpu_gc_pool_alloc(i64 8)
  %16 = bitcast {}* %15 to double*
  store double %14, double* %16, align 8
  br label %L87

L66:                                              ; preds = %top
  %17 = bitcast i8 addrspace(1)* %4 to i32 addrspace(1)*
  %18 = load i32, i32 addrspace(1)* %17, align 8
  %19 = call fastcc nonnull {}* @jl_box_int32(i32 signext %18)
  br label %L87

L71:                                              ; preds = %top
  %.not17 = icmp eq i8 %9, 3
  call void @llvm.assume(i1 %.not17)
  %20 = bitcast i8 addrspace(1)* %4 to i64 addrspace(1)*
  %21 = load i64, i64 addrspace(1)* %20, align 8
  %22 = call fastcc nonnull {}* @jl_box_int64(i64 signext %21)
  br label %L87

L87:                                              ; preds = %L71, %L66, %L57, %L48
  %value_phi = phi {}* [ %12, %L48 ], [ %15, %L57 ], [ %19, %L66 ], [ %22, %L71 ]
  %23 = bitcast {}* %value_phi to i64*
  %24 = getelementptr inbounds i64, i64* %23, i64 -1
  %25 = load atomic i64, i64* %24 unordered, align 8
  %26 = and i64 %25, -16
  %27 = inttoptr i64 %26 to {}*
  %magicptr2 = ptrtoint {}* %27 to i64
  switch i64 %magicptr2, label %L96 [
    i64 140387202558400, label %L119
    i64 140387200991040, label %L119.fold.split
    i64 140387202912752, label %L119.fold.split1
  ]

L96:                                              ; preds = %L87
  %.not10 = icmp eq {}* %27, inttoptr (i64 140387202852256 to {}*)
  %. = select i1 %.not10, i64 4, i64 -1
  br label %L119

L119.fold.split:                                  ; preds = %L87
  br label %L119

L119.fold.split1:                                 ; preds = %L87
  br label %L119

L119:                                             ; preds = %L119.fold.split1, %L119.fold.split, %L96, %L87
  %value_phi1 = phi i64 [ 1, %L87 ], [ %., %L96 ], [ 2, %L119.fold.split ], [ 3, %L119.fold.split1 ]
  %28 = bitcast { i8 addrspace(1)*, i64, [1 x i64], i64 }* %1 to i64 addrspace(1)**
  %29 = load i64 addrspace(1)*, i64 addrspace(1)** %28, align 8
  store i64 %value_phi1, i64 addrspace(1)* %29, align 8
  ret void
}
```
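Note the four branches (`jl_box_float32`, `gpu_gc_pool_alloc(8)`, `jl_box_int32`, `jl_box_int64`): the loaded value is boxed first, and the `isa` chain is then resolved by switching on the box's type tag instead of on the selector byte directly. The kernel should still compile and launch despite the boxes; a sketch of trying it (assuming a working device, and that isbits-union `CuArray`s behave as `Array` does on the CPU):

```julia
xs = CuArray{Union{Int32,Int64,Float32,Float64}}([1.0f0])
y = CuArray([0])
@cuda test_kernel(xs, y)   # boxes become GPU-side pool allocations at run time
@assert Array(y)[] == 3
```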
FWIW, manual union splitting remains finicky: changing the kernel to iterate over the input with `for x in xs` results in allocations on the CPU as well.
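For reference, a sketch of that loop variant (hypothetical name `test_kernel_loop`; same element types as above):

```julia
function test_kernel_loop(xs, y)
    acc = 0
    @inbounds for x in xs            # iterating the Union-eltype array
        acc += x isa Int32   ? 1 :
               x isa Int64   ? 2 :
               x isa Float32 ? 3 :
               x isa Float64 ? 4 : -1
    end
    @inbounds y[] = acc
    return
end
```

Per the remark above, this variant allocates even with `Array` on the CPU.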