Skip to content

Generated code issue with manual union splitting #1386

Open
@maleadt

Description

@maleadt

The following code demonstrates that on the CPU (with `Array`) we get allocation-free code, while on the GPU there are several boxes and allocations:

"""
    test_kernel(xs, y)

Read the single element `xs[]`, branch on its concrete type with `isa`
(manual union splitting), and store the matching code into `y[]`:
`Int32` -> 1, `Int64` -> 2, `Float32` -> 3, `Float64` -> 4, anything else -> -1.

Returns `nothing`.
"""
function test_kernel(xs, y)
    # `@inbounds` is placed on the individual indexing statements: annotating the
    # `function` definition itself wraps the definition, not the code in its body.
    @inbounds x = xs[]
    @inbounds y[] = if x isa Int32
        1
    elseif x isa Int64
        2
    elseif x isa Float32
        3
    elseif x isa Float64
        4
    else
        # Fallback for element types outside the four cases tested above.
        -1
    end
    return
end

"""
    test(Ts=Union{Int32,Int64,Float32,Float64}; kwargs...)

Request the LLVM IR of `test_kernel` twice: first through `code_llvm` with
`Vector` argument types, then through `CUDA.code_llvm` with
`CuDeviceVector` argument types in `AS.Global`. `Ts` is the element type of the
input container; all keyword arguments are forwarded to both calls.
"""
function test(Ts=Union{Int32,Int64,Float32, Float64}; kwargs...)
    # Host-side signature: plain vectors.
    host_sig = Tuple{Vector{Ts}, Vector{Int}}
    code_llvm(test_kernel, host_sig; kwargs...)

    # Device-side signature: same element types, device vectors.
    device_sig = Tuple{CuDeviceVector{Ts, AS.Global}, CuDeviceVector{Int,AS.Global}}
    return CUDA.code_llvm(test_kernel, device_sig; kwargs...)
end

CPU:

; CPU output for test_kernel. Note that the function body contains no `call`
; instructions: the stored result is chosen purely with switch/select/phi on a
; single loaded byte.
define nonnull {}* @japi1_test_kernel_13215({}* %0, {}** %1, i32 %2) #0 {
top:
  %3 = alloca {}**, align 8
  store volatile {}** %1, {}*** %3, align 8
  %4 = load {}*, {}** %1, align 8
  %5 = getelementptr inbounds {}*, {}** %1, i64 1
  %6 = bitcast {}** %5 to i64***
  %7 = load i64**, i64*** %6, align 8
  %8 = bitcast {}* %4 to [1 x i64]**
  %9 = load [1 x i64]*, [1 x i64]** %8, align 8
  %10 = bitcast {}* %4 to { i8*, i64, i16, i16, i32 }*
  %11 = getelementptr inbounds { i8*, i64, i16, i16, i32 }, { i8*, i64, i16, i16, i32 }* %10, i64 0, i32 4
  %12 = load i32, i32* %11, align 4
  %13 = zext i32 %12 to i64
  %14 = bitcast {}* %4 to {}**
  %15 = getelementptr inbounds {}*, {}** %14, i64 4
  %16 = bitcast {}** %15 to i64*
  %17 = load i64, i64* %16, align 8
  %18 = sub nsw i64 %17, %13
  %19 = getelementptr inbounds [1 x i64], [1 x i64]* %9, i64 %18
  %20 = bitcast [1 x i64]* %19 to i8*
  %21 = sext i32 %12 to i64
  %22 = getelementptr inbounds i8, i8* %20, i64 %21
  ; %23 is the one-byte value every branch below dispatches on (presumably the
  ; union type-tag selector of the element - TODO confirm).
  %23 = load i8, i8* %22, align 1
  switch i8 %23, label %L33 [
    i8 2, label %L63
    i8 3, label %L63.fold.split
  ]

L33:                                              ; preds = %top
  ; Remaining byte values: 0 selects 3, anything else selects 4.
  %.not = icmp eq i8 %23, 0
  %. = select i1 %.not, i64 3, i64 4
  br label %L63

L63.fold.split:                                   ; preds = %top
  br label %L63

L63:                                              ; preds = %L63.fold.split, %L33, %top
  ; Merge the integer result (1, 2, or the 3/4 chosen in %L33) and store it.
  %value_phi1 = phi i64 [ 1, %top ], [ %., %L33 ], [ 2, %L63.fold.split ]
  %24 = load i64*, i64** %7, align 8
  store i64 %value_phi1, i64* %24, align 8
  ret {}* inttoptr (i64 140387449655304 to {}*)
}

GPU:

; GPU output for test_kernel. Unlike the CPU version, each arm of the first
; switch loads the element payload and then calls a boxing/allocation routine
; (jl_box_float32, gpu_gc_pool_alloc, jl_box_int32, jl_box_int64); the result is
; only chosen afterwards by a second switch on a word read back from that object.
define void @julia_test_kernel_13218({ i8 addrspace(1)*, i64, [1 x i64], i64 }* nocapture nonnull readonly align 8 dereferenceable(32) %0, { i8 addrspace(1)*, i64, [1 x i64], i64 }* nocapture nonnull readonly align 8 dereferenceable(32) %1) local_unnamed_addr #0 {
top:
  %2 = getelementptr inbounds { i8 addrspace(1)*, i64, [1 x i64], i64 }, { i8 addrspace(1)*, i64, [1 x i64], i64 }* %0, i64 0, i32 0
  %3 = getelementptr inbounds { i8 addrspace(1)*, i64, [1 x i64], i64 }, { i8 addrspace(1)*, i64, [1 x i64], i64 }* %0, i64 0, i32 1
  %4 = load i8 addrspace(1)*, i8 addrspace(1)** %2, align 8
  %5 = load i64, i64* %3, align 8
  %6 = getelementptr i8, i8 addrspace(1)* %4, i64 %5
  %7 = getelementptr i8, i8 addrspace(1)* %6, i64 1
  %8 = getelementptr i8, i8 addrspace(1)* %7, i64 -1
  ; %9 is the one-byte value the first switch dispatches on (presumably the
  ; union type-tag selector of the element - TODO confirm).
  %9 = load i8, i8 addrspace(1)* %8, align 1
  switch i8 %9, label %L71 [
    i8 0, label %L48
    i8 1, label %L57
    i8 2, label %L66
  ]

L48:                                              ; preds = %top
  %10 = bitcast i8 addrspace(1)* %4 to float addrspace(1)*
  %11 = load float, float addrspace(1)* %10, align 8
  ; Box #1: the loaded float is passed to jl_box_float32.
  %12 = call fastcc {}* @jl_box_float32(float %11)
  br label %L87

L57:                                              ; preds = %top
  %13 = bitcast i8 addrspace(1)* %4 to double addrspace(1)*
  %14 = load double, double addrspace(1)* %13, align 8
  ; Allocation: 8 bytes from gpu_gc_pool_alloc, then the double is stored into it.
  %15 = call fastcc {}* @gpu_gc_pool_alloc(i64 8)
  %16 = bitcast {}* %15 to double*
  store double %14, double* %16, align 8
  br label %L87

L66:                                              ; preds = %top
  %17 = bitcast i8 addrspace(1)* %4 to i32 addrspace(1)*
  %18 = load i32, i32 addrspace(1)* %17, align 8
  ; Box #2: the loaded i32 is passed to jl_box_int32.
  %19 = call fastcc nonnull {}* @jl_box_int32(i32 signext %18)
  br label %L87

L71:                                              ; preds = %top
  ; Default arm: the byte is assumed to be 3 here.
  %.not17 = icmp eq i8 %9, 3
  call void @llvm.assume(i1 %.not17)
  %20 = bitcast i8 addrspace(1)* %4 to i64 addrspace(1)*
  %21 = load i64, i64 addrspace(1)* %20, align 8
  ; Box #3: the loaded i64 is passed to jl_box_int64.
  %22 = call fastcc nonnull {}* @jl_box_int64(i64 signext %21)
  br label %L87

L87:                                              ; preds = %L71, %L66, %L57, %L48
  ; All four arms merge into one {}* pointer. The i64 word one slot before that
  ; pointer is loaded, masked with -16, and used as the key of the second switch
  ; (presumably the type tag in the object header - TODO confirm).
  %value_phi = phi {}* [ %12, %L48 ], [ %15, %L57 ], [ %19, %L66 ], [ %22, %L71 ]
  %23 = bitcast {}* %value_phi to i64*
  %24 = getelementptr inbounds i64, i64* %23, i64 -1
  %25 = load atomic i64, i64* %24 unordered, align 8
  %26 = and i64 %25, -16
  %27 = inttoptr i64 %26 to {}*
  %magicptr2 = ptrtoint {}* %27 to i64
  switch i64 %magicptr2, label %L96 [
    i64 140387202558400, label %L119
    i64 140387200991040, label %L119.fold.split
    i64 140387202912752, label %L119.fold.split1
  ]

L96:                                              ; preds = %L87
  ; No switch case matched: one more pointer comparison selects 4, otherwise -1.
  %.not10 = icmp eq {}* %27, inttoptr (i64 140387202852256 to {}*)
  %. = select i1 %.not10, i64 4, i64 -1
  br label %L119

L119.fold.split:                                  ; preds = %L87
  br label %L119

L119.fold.split1:                                 ; preds = %L87
  br label %L119

L119:                                             ; preds = %L119.fold.split1, %L119.fold.split, %L96, %L87
  ; Merge the integer result (1, 2, 3, or the 4/-1 chosen in %L96) and store it
  ; through the pointer loaded from the second argument.
  %value_phi1 = phi i64 [ 1, %L87 ], [ %., %L96 ], [ 2, %L119.fold.split ], [ 3, %L119.fold.split1 ]
  %28 = bitcast { i8 addrspace(1)*, i64, [1 x i64], i64 }* %1 to i64 addrspace(1)**
  %29 = load i64 addrspace(1)*, i64 addrspace(1)** %28, align 8
  store i64 %value_phi1, i64 addrspace(1)* %29, align 8
  ret void
}

FWIW, manual union splitting remains finicky, though: changing the kernel to iterate with `for x in xs` results in allocations on the CPU as well.

Metadata

Metadata

Assignees

No one assigned

    Labels

    cuda kernels (Stuff about writing CUDA kernels.); performance (How fast can we go?)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions