Labels: cuda kernels (Stuff about writing CUDA kernels), performance (How fast can we go?)
Description
The following code demonstrates that on the CPU (i.e. with `Array`) we get allocation-free code, while on the GPU there are several boxes and allocations:

```julia
using CUDA, InteractiveUtils  # code_llvm comes from InteractiveUtils
using CUDA: AS

function test_kernel(xs, y)
    @inbounds x = xs[]               # union-typed load
    @inbounds y[] = if x isa Int32   # manual union split on the element type
        1
    elseif x isa Int64
        2
    elseif x isa Float32
        3
    elseif x isa Float64
        4
    else
        -1
    end
    return
end

function test(Ts=Union{Int32,Int64,Float32,Float64}; kwargs...)
    # CPU: compiles to a switch on the union tag byte, no allocations
    code_llvm(test_kernel, Tuple{Vector{Ts}, Vector{Int}}; kwargs...)
    # GPU: boxes the loaded value, then re-splits on the box's type tag
    CUDA.code_llvm(test_kernel, Tuple{CuDeviceVector{Ts, AS.Global}, CuDeviceVector{Int, AS.Global}}; kwargs...)
end
```
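For completeness, a minimal end-to-end launch might look as follows (a sketch, not part of the original report; assumes a CUDA-capable device and that `CuArray` accepts the isbits-union element type — on affected versions this compiles, but with the device-side boxing shown below):

```julia
xs = CuArray{Union{Int32,Int64,Float32,Float64}}([Int32(42)])  # isbits-union storage
y  = CuArray([0])
CUDA.@cuda test_kernel(xs, y)
Array(y)  # == [1], since the element is an Int32
```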
CPU:
```llvm
define nonnull {}* @japi1_test_kernel_13215({}* %0, {}** %1, i32 %2) #0 {
top:
%3 = alloca {}**, align 8
store volatile {}** %1, {}*** %3, align 8
%4 = load {}*, {}** %1, align 8
%5 = getelementptr inbounds {}*, {}** %1, i64 1
%6 = bitcast {}** %5 to i64***
%7 = load i64**, i64*** %6, align 8
%8 = bitcast {}* %4 to [1 x i64]**
%9 = load [1 x i64]*, [1 x i64]** %8, align 8
%10 = bitcast {}* %4 to { i8*, i64, i16, i16, i32 }*
%11 = getelementptr inbounds { i8*, i64, i16, i16, i32 }, { i8*, i64, i16, i16, i32 }* %10, i64 0, i32 4
%12 = load i32, i32* %11, align 4
%13 = zext i32 %12 to i64
%14 = bitcast {}* %4 to {}**
%15 = getelementptr inbounds {}*, {}** %14, i64 4
%16 = bitcast {}** %15 to i64*
%17 = load i64, i64* %16, align 8
%18 = sub nsw i64 %17, %13
%19 = getelementptr inbounds [1 x i64], [1 x i64]* %9, i64 %18
%20 = bitcast [1 x i64]* %19 to i8*
%21 = sext i32 %12 to i64
%22 = getelementptr inbounds i8, i8* %20, i64 %21
%23 = load i8, i8* %22, align 1
switch i8 %23, label %L33 [
i8 2, label %L63
i8 3, label %L63.fold.split
]
L33: ; preds = %top
%.not = icmp eq i8 %23, 0
%. = select i1 %.not, i64 3, i64 4
br label %L63
L63.fold.split: ; preds = %top
br label %L63
L63: ; preds = %L63.fold.split, %L33, %top
%value_phi1 = phi i64 [ 1, %top ], [ %., %L33 ], [ 2, %L63.fold.split ]
%24 = load i64*, i64** %7, align 8
store i64 %value_phi1, i64* %24, align 8
ret {}* inttoptr (i64 140387449655304 to {}*)
}
```
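As a quick runtime sanity check (a sketch, not from the original report), the CPU path can be confirmed allocation-free with `@allocated`:

```julia
xs = Union{Int32,Int64,Float32,Float64}[Int32(42)]
y  = [0]
test_kernel(xs, y)              # warm up to exclude compilation
@allocated test_kernel(xs, y)   # expected: 0
```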
GPU:
```llvm
define void @julia_test_kernel_13218({ i8 addrspace(1)*, i64, [1 x i64], i64 }* nocapture nonnull readonly align 8 dereferenceable(32) %0, { i8 addrspace(1)*, i64, [1 x i64], i64 }* nocapture nonnull readonly align 8 dereferenceable(32) %1) local_unnamed_addr #0 {
top:
%2 = getelementptr inbounds { i8 addrspace(1)*, i64, [1 x i64], i64 }, { i8 addrspace(1)*, i64, [1 x i64], i64 }* %0, i64 0, i32 0
%3 = getelementptr inbounds { i8 addrspace(1)*, i64, [1 x i64], i64 }, { i8 addrspace(1)*, i64, [1 x i64], i64 }* %0, i64 0, i32 1
%4 = load i8 addrspace(1)*, i8 addrspace(1)** %2, align 8
%5 = load i64, i64* %3, align 8
%6 = getelementptr i8, i8 addrspace(1)* %4, i64 %5
%7 = getelementptr i8, i8 addrspace(1)* %6, i64 1
%8 = getelementptr i8, i8 addrspace(1)* %7, i64 -1
%9 = load i8, i8 addrspace(1)* %8, align 1
switch i8 %9, label %L71 [
i8 0, label %L48
i8 1, label %L57
i8 2, label %L66
]
L48: ; preds = %top
%10 = bitcast i8 addrspace(1)* %4 to float addrspace(1)*
%11 = load float, float addrspace(1)* %10, align 8
%12 = call fastcc {}* @jl_box_float32(float %11)
br label %L87
L57: ; preds = %top
%13 = bitcast i8 addrspace(1)* %4 to double addrspace(1)*
%14 = load double, double addrspace(1)* %13, align 8
%15 = call fastcc {}* @gpu_gc_pool_alloc(i64 8)
%16 = bitcast {}* %15 to double*
store double %14, double* %16, align 8
br label %L87
L66: ; preds = %top
%17 = bitcast i8 addrspace(1)* %4 to i32 addrspace(1)*
%18 = load i32, i32 addrspace(1)* %17, align 8
%19 = call fastcc nonnull {}* @jl_box_int32(i32 signext %18)
br label %L87
L71: ; preds = %top
%.not17 = icmp eq i8 %9, 3
call void @llvm.assume(i1 %.not17)
%20 = bitcast i8 addrspace(1)* %4 to i64 addrspace(1)*
%21 = load i64, i64 addrspace(1)* %20, align 8
%22 = call fastcc nonnull {}* @jl_box_int64(i64 signext %21)
br label %L87
L87: ; preds = %L71, %L66, %L57, %L48
%value_phi = phi {}* [ %12, %L48 ], [ %15, %L57 ], [ %19, %L66 ], [ %22, %L71 ]
%23 = bitcast {}* %value_phi to i64*
%24 = getelementptr inbounds i64, i64* %23, i64 -1
%25 = load atomic i64, i64* %24 unordered, align 8
%26 = and i64 %25, -16
%27 = inttoptr i64 %26 to {}*
%magicptr2 = ptrtoint {}* %27 to i64
switch i64 %magicptr2, label %L96 [
i64 140387202558400, label %L119
i64 140387200991040, label %L119.fold.split
i64 140387202912752, label %L119.fold.split1
]
L96: ; preds = %L87
%.not10 = icmp eq {}* %27, inttoptr (i64 140387202852256 to {}*)
%. = select i1 %.not10, i64 4, i64 -1
br label %L119
L119.fold.split: ; preds = %L87
br label %L119
L119.fold.split1: ; preds = %L87
br label %L119
L119: ; preds = %L119.fold.split1, %L119.fold.split, %L96, %L87
%value_phi1 = phi i64 [ 1, %L87 ], [ %., %L96 ], [ 2, %L119.fold.split ], [ 3, %L119.fold.split1 ]
%28 = bitcast { i8 addrspace(1)*, i64, [1 x i64], i64 }* %1 to i64 addrspace(1)**
%29 = load i64 addrspace(1)*, i64 addrspace(1)** %28, align 8
store i64 %value_phi1, i64 addrspace(1)* %29, align 8
ret void
}
```
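One way to check for the offending runtime calls programmatically (a sketch; capturing the IR with `sprint` is not part of the original report):

```julia
ir = sprint(io -> CUDA.code_llvm(io, test_kernel,
        Tuple{CuDeviceVector{Union{Int32,Int64,Float32,Float64}, AS.Global},
              CuDeviceVector{Int, AS.Global}}))
# both expected to be true on affected versions
occursin("jl_box_", ir), occursin("gpu_gc_pool_alloc", ir)
```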
FWIW, manual union splitting remains finicky though: changing the kernel to iterate with `for x in xs` results in allocations on the CPU as well.
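For concreteness, the loop variant alluded to above might look like this (a sketch, not verbatim from the report; on affected Julia versions it allocates even on the CPU):

```julia
function test_kernel_loop(xs, y)
    acc = 0
    for x in xs                   # iterating defeats the manual union split
        acc += if x isa Int32
            1
        elseif x isa Int64
            2
        elseif x isa Float32
            3
        elseif x isa Float64
            4
        else
            -1
        end
    end
    @inbounds y[] = acc
    return
end

xs = Union{Int32,Int64,Float32,Float64}[Int32(1), 2.0f0]
y  = [0]
test_kernel_loop(xs, y)
@allocated test_kernel_loop(xs, y)  # > 0 on the CPU too
```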