Labels: cuda kernels (Stuff about writing CUDA kernels), performance (How fast can we go?)
Description
The following code demonstrates that on the CPU (i.e. with `Array`) we get allocation-free code, while on the GPU there are several boxes and allocations:

```julia
using CUDA, InteractiveUtils  # code_llvm comes from InteractiveUtils
using CUDA: AS

function test_kernel(xs, y)
    @inbounds x = xs[]               # union-typed load
    @inbounds y[] = if x isa Int32   # manual union split on the element type
        1
    elseif x isa Int64
        2
    elseif x isa Float32
        3
    elseif x isa Float64
        4
    else
        -1
    end
    return
end

function test(Ts=Union{Int32,Int64,Float32,Float64}; kwargs...)
    # CPU: compiles to a switch on the union tag byte, no allocations
    code_llvm(test_kernel, Tuple{Vector{Ts}, Vector{Int}}; kwargs...)
    # GPU: boxes the loaded value, then re-splits on the box's type tag
    CUDA.code_llvm(test_kernel, Tuple{CuDeviceVector{Ts, AS.Global}, CuDeviceVector{Int, AS.Global}}; kwargs...)
end
```
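For completeness, a minimal end-to-end launch might look as follows (a sketch, not part of the original report; assumes a CUDA-capable device and that `CuArray` accepts the isbits-union element type — on affected versions this compiles, but with the device-side boxing shown below):

```julia
xs = CuArray{Union{Int32,Int64,Float32,Float64}}([Int32(42)])  # isbits-union storage
y  = CuArray([0])
CUDA.@cuda test_kernel(xs, y)
Array(y)  # == [1], since the element is an Int32
```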
CPU:
```llvm
define nonnull {}* @japi1_test_kernel_13215({}* %0, {}** %1, i32 %2) #0 {
top:
%3 = alloca {}**, align 8
store volatile {}** %1, {}*** %3, align 8
%4 = load {}*, {}** %1, align 8
%5 = getelementptr inbounds {}*, {}** %1, i64 1
%6 = bitcast {}** %5 to i64***
%7 = load i64**, i64*** %6, align 8
%8 = bitcast {}* %4 to [1 x i64]**
%9 = load [1 x i64]*, [1 x i64]** %8, align 8
%10 = bitcast {}* %4 to { i8*, i64, i16, i16, i32 }*
%11 = getelementptr inbounds { i8*, i64, i16, i16, i32 }, { i8*, i64, i16, i16, i32 }* %10, i64 0, i32 4
%12 = load i32, i32* %11, align 4
%13 = zext i32 %12 to i64
%14 = bitcast {}* %4 to {}**
%15 = getelementptr inbounds {}*, {}** %14, i64 4
%16 = bitcast {}** %15 to i64*
%17 = load i64, i64* %16, align 8
%18 = sub nsw i64 %17, %13
%19 = getelementptr inbounds [1 x i64], [1 x i64]* %9, i64 %18
%20 = bitcast [1 x i64]* %19 to i8*
%21 = sext i32 %12 to i64
%22 = getelementptr inbounds i8, i8* %20, i64 %21
%23 = load i8, i8* %22, align 1
switch i8 %23, label %L33 [
i8 2, label %L63
i8 3, label %L63.fold.split
]
L33: ; preds = %top
%.not = icmp eq i8 %23, 0
%. = select i1 %.not, i64 3, i64 4
br label %L63
L63.fold.split: ; preds = %top
br label %L63
L63: ; preds = %L63.fold.split, %L33, %top
%value_phi1 = phi i64 [ 1, %top ], [ %., %L33 ], [ 2, %L63.fold.split ]
%24 = load i64*, i64** %7, align 8
store i64 %value_phi1, i64* %24, align 8
ret {}* inttoptr (i64 140387449655304 to {}*)
}
```
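As a quick runtime sanity check (a sketch, not from the original report), the CPU path can be confirmed allocation-free with `@allocated`:

```julia
xs = Union{Int32,Int64,Float32,Float64}[Int32(42)]
y  = [0]
test_kernel(xs, y)              # warm up to exclude compilation
@allocated test_kernel(xs, y)   # expected: 0
```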
GPU:
```llvm
define void @julia_test_kernel_13218({ i8 addrspace(1)*, i64, [1 x i64], i64 }* nocapture nonnull readonly align 8 dereferenceable(32) %0, { i8 addrspace(1)*, i64, [1 x i64], i64 }* nocapture nonnull readonly align 8 dereferenceable(32) %1) local_unnamed_addr #0 {
top:
%2 = getelementptr inbounds { i8 addrspace(1)*, i64, [1 x i64], i64 }, { i8 addrspace(1)*, i64, [1 x i64], i64 }* %0, i64 0, i32 0
%3 = getelementptr inbounds { i8 addrspace(1)*, i64, [1 x i64], i64 }, { i8 addrspace(1)*, i64, [1 x i64], i64 }* %0, i64 0, i32 1
%4 = load i8 addrspace(1)*, i8 addrspace(1)** %2, align 8
%5 = load i64, i64* %3, align 8
%6 = getelementptr i8, i8 addrspace(1)* %4, i64 %5
%7 = getelementptr i8, i8 addrspace(1)* %6, i64 1
%8 = getelementptr i8, i8 addrspace(1)* %7, i64 -1
%9 = load i8, i8 addrspace(1)* %8, align 1
switch i8 %9, label %L71 [
i8 0, label %L48
i8 1, label %L57
i8 2, label %L66
]
L48: ; preds = %top
%10 = bitcast i8 addrspace(1)* %4 to float addrspace(1)*
%11 = load float, float addrspace(1)* %10, align 8
%12 = call fastcc {}* @jl_box_float32(float %11)
br label %L87
L57: ; preds = %top
%13 = bitcast i8 addrspace(1)* %4 to double addrspace(1)*
%14 = load double, double addrspace(1)* %13, align 8
%15 = call fastcc {}* @gpu_gc_pool_alloc(i64 8)
%16 = bitcast {}* %15 to double*
store double %14, double* %16, align 8
br label %L87
L66: ; preds = %top
%17 = bitcast i8 addrspace(1)* %4 to i32 addrspace(1)*
%18 = load i32, i32 addrspace(1)* %17, align 8
%19 = call fastcc nonnull {}* @jl_box_int32(i32 signext %18)
br label %L87
L71: ; preds = %top
%.not17 = icmp eq i8 %9, 3
call void @llvm.assume(i1 %.not17)
%20 = bitcast i8 addrspace(1)* %4 to i64 addrspace(1)*
%21 = load i64, i64 addrspace(1)* %20, align 8
%22 = call fastcc nonnull {}* @jl_box_int64(i64 signext %21)
br label %L87
L87: ; preds = %L71, %L66, %L57, %L48
%value_phi = phi {}* [ %12, %L48 ], [ %15, %L57 ], [ %19, %L66 ], [ %22, %L71 ]
%23 = bitcast {}* %value_phi to i64*
%24 = getelementptr inbounds i64, i64* %23, i64 -1
%25 = load atomic i64, i64* %24 unordered, align 8
%26 = and i64 %25, -16
%27 = inttoptr i64 %26 to {}*
%magicptr2 = ptrtoint {}* %27 to i64
switch i64 %magicptr2, label %L96 [
i64 140387202558400, label %L119
i64 140387200991040, label %L119.fold.split
i64 140387202912752, label %L119.fold.split1
]
L96: ; preds = %L87
%.not10 = icmp eq {}* %27, inttoptr (i64 140387202852256 to {}*)
%. = select i1 %.not10, i64 4, i64 -1
br label %L119
L119.fold.split: ; preds = %L87
br label %L119
L119.fold.split1: ; preds = %L87
br label %L119
L119: ; preds = %L119.fold.split1, %L119.fold.split, %L96, %L87
%value_phi1 = phi i64 [ 1, %L87 ], [ %., %L96 ], [ 2, %L119.fold.split ], [ 3, %L119.fold.split1 ]
%28 = bitcast { i8 addrspace(1)*, i64, [1 x i64], i64 }* %1 to i64 addrspace(1)**
%29 = load i64 addrspace(1)*, i64 addrspace(1)** %28, align 8
store i64 %value_phi1, i64 addrspace(1)* %29, align 8
ret void
}
```
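One way to check for the offending runtime calls programmatically (a sketch; capturing the IR with `sprint` is not part of the original report):

```julia
ir = sprint(io -> CUDA.code_llvm(io, test_kernel,
        Tuple{CuDeviceVector{Union{Int32,Int64,Float32,Float64}, AS.Global},
              CuDeviceVector{Int, AS.Global}}))
# both expected to be true on affected versions
occursin("jl_box_", ir), occursin("gpu_gc_pool_alloc", ir)
```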
FWIW, manual union splitting remains finicky though: changing the kernel to iterate with `for x in xs` results in allocations on the CPU as well.
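For concreteness, the loop variant alluded to above might look like this (a sketch, not verbatim from the report; on affected Julia versions it allocates even on the CPU):

```julia
function test_kernel_loop(xs, y)
    acc = 0
    for x in xs                   # iterating defeats the manual union split
        acc += if x isa Int32
            1
        elseif x isa Int64
            2
        elseif x isa Float32
            3
        elseif x isa Float64
            4
        else
            -1
        end
    end
    @inbounds y[] = acc
    return
end

xs = Union{Int32,Int64,Float32,Float64}[Int32(1), 2.0f0]
y  = [0]
test_kernel_loop(xs, y)
@allocated test_kernel_loop(xs, y)  # > 0 on the CPU too
```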