Skip to content

incorrect first value for simple kernel on 1.12 #780

@simeonschaub

Description

@simeonschaub

Reduced from the example in #766 (comment):

julia> using AMDGPU

julia> function kern!(C, A, ta)
           i = workitemIdx().x
           @inbounds begin
               x = ta == 0 ? A[1] : (ta == 1 ? A[1] : A[1])
               C[i] = x
           end
           nothing
       end
kern! (generic function with 1 method)

julia> C = AMDGPU.zeros(ComplexF64, 2, 2)
2×2 ROCArray{ComplexF64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:
 0.0+0.0im  0.0+0.0im
 0.0+0.0im  0.0+0.0im

julia> @roc groupsize=2 gridsize=2 kern!(C, AMDGPU.ones(ComplexF64, 1), 1)
AMDGPU.Runtime.HIPKernel{typeof(kern!), Tuple{AMDGPU.Device.ROCDeviceMatrix{ComplexF64, 1}, AMDGPU.Device.ROCDeviceVector{ComplexF64, 1}, Int64}}(kern!, AMDGPU.HIP.HIPFunction(Ptr{Nothing}(0x000000000fcec3c0), AMDGPU.HIP.HIPModule(Ptr{Nothing}(0x000000000fe23de0)), Symbol[]))

julia> C
2×2 ROCArray{ComplexF64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:
 0.0-4.84973e17im  0.0+0.0im
 1.0+0.0im         0.0+0.0im

The first element of `C` should be `1.0+0.0im` as well, but it is not. Curiously, this only happens for `ComplexF64`, and everything works as expected if one either changes `ta` from 1 to 0 or makes `C` a vector instead of a matrix. Any ideas what could be happening here?

The kernel lowers to the following LLVM IR:

; GPUCompiler.CompilerJob{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.HIPCompilerParams}(MethodInstance for kern!(::AMDGPU.Device.ROCDeviceMatrix{ComplexF64, 1}, ::AMDGPU.Device.ROCDeviceVector{ComplexF64, 1}, ::Int64), CompilerConfig for GPUCompiler.GCNCompilerTarget, 0x00000000000096cb)
;  @ REPL[2]:1 within `kern!`
define amdgpu_kernel void @_Z5kern_14ROCDeviceArrayI7ComplexI7Float64ELi2ELi1EES_IS2_Li1ELi1EE5Int64({ ptr, ptr, ptr, ptr, ptr, ptr, i32, i32, ptr, ptr, ptr, ptr } %state, { [2 x i64], ptr addrspace(1), i64 } %"C::ROCDeviceArray", { [1 x i64], ptr addrspace(1), i64 } %"A::ROCDeviceArray", i64 signext %"ta::Int64") local_unnamed_addr #2 {
conversion:
  %"A::ROCDeviceArray.fca.1.extract" = extractvalue { [1 x i64], ptr addrspace(1), i64 } %"A::ROCDeviceArray", 1
  %0 = alloca [2 x double], align 8, addrspace(5)
  %1 = alloca [2 x i64], align 8, addrspace(5)
  %2 = alloca [2 x double], align 8, addrspace(5)
;  @ REPL[2]:4 within `kern!`
  switch i64 %"ta::Int64", label %L69 [
    i64 0, label %guard_pass20
    i64 1, label %L48
  ]

L48:                                              ; preds = %conversion
; ┌ @ /home/simeon/.julia/dev/AMDGPU/src/device/gcn/array.jl:81 within `#getindex`
; │┌ @ /home/simeon/.julia/packages/LLVM/2JPxT/src/interop/pointer.jl:85 within `unsafe_load`
; ││┌ @ none within `pointerref`
; │││┌ @ none within `macro expansion` @ /home/simeon/.julia/packages/LLVM/2JPxT/src/interop/base.jl:39
      %.unpack21 = load double, ptr addrspace(1) %"A::ROCDeviceArray.fca.1.extract", align 8
      %.elt22 = getelementptr inbounds [2 x double], ptr addrspace(1) %"A::ROCDeviceArray.fca.1.extract", i64 0, i64 1
      %.unpack23 = load double, ptr addrspace(1) %.elt22, align 8
      store double %.unpack21, ptr addrspace(5) %0, align 8
      %.fca.1.gep4 = getelementptr inbounds [2 x double], ptr addrspace(5) %0, i32 0, i32 1
      store double %.unpack23, ptr addrspace(5) %.fca.1.gep4, align 8
      %.not24 = icmp eq ptr addrspace(5) %0, null
      br i1 %.not24, label %L75, label %guard_pass30

L69:                                              ; preds = %conversion
      %.unpack25 = load double, ptr addrspace(1) %"A::ROCDeviceArray.fca.1.extract", align 8
      %.elt26 = getelementptr inbounds [2 x double], ptr addrspace(1) %"A::ROCDeviceArray.fca.1.extract", i64 0, i64 1
      %.unpack27 = load double, ptr addrspace(1) %.elt26, align 8
      store double %.unpack25, ptr addrspace(5) %2, align 8
      %.fca.1.gep = getelementptr inbounds [2 x double], ptr addrspace(5) %2, i32 0, i32 1
      store double %.unpack27, ptr addrspace(5) %.fca.1.gep, align 8
      %.not28 = icmp eq ptr addrspace(5) %2, null
      br i1 %.not28, label %L75, label %guard_pass35

L75:                                              ; preds = %guard_pass35, %guard_pass30, %L69, %L48
      %.not29 = icmp eq ptr addrspace(5) %1, null
      br i1 %.not29, label %L76, label %guard_pass25

L76:                                              ; preds = %guard_pass25, %guard_pass20, %L75
      %.sroa.05.0 = phi double [ %.sroa.05.0.copyload6, %guard_pass25 ], [ undef, %L75 ], [ %.unpack, %guard_pass20 ]
      %.sroa.6.0 = phi double [ %.sroa.6.0.copyload8, %guard_pass25 ], [ undef, %L75 ], [ %.unpack18, %guard_pass20 ]
; └└└└
;  @ REPL[2]:2 within `kern!`
; ┌ @ /home/simeon/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:122 within `workitemIdx`
; │┌ @ /home/simeon/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:87 within `workitemIdx_x`
; ││┌ @ /home/simeon/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:3 within `_index`
; │││┌ @ /home/simeon/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:3 within `macro expansion` @ /home/simeon/.julia/packages/LLVM/2JPxT/src/interop/base.jl:39
      %3 = call i32 @llvm.amdgcn.workitem.id.x()
      %"C::ROCDeviceArray.fca.1.extract" = extractvalue { [2 x i64], ptr addrspace(1), i64 } %"C::ROCDeviceArray", 1
; └└└└
;  @ REPL[2]:5 within `kern!`
; ┌ @ /home/simeon/.julia/dev/AMDGPU/src/device/gcn/array.jl:86 within `#setindex!`
; │┌ @ /home/simeon/.julia/packages/LLVM/2JPxT/src/interop/pointer.jl:88 within `unsafe_store!`
; ││┌ @ none within `pointerset`
; │││┌ @ none within `macro expansion` @ /home/simeon/.julia/packages/LLVM/2JPxT/src/interop/base.jl:39
      %4 = zext nneg i32 %3 to i64
      %5 = getelementptr inbounds [2 x double], ptr addrspace(1) %"C::ROCDeviceArray.fca.1.extract", i64 %4
      store double %.sroa.05.0, ptr addrspace(1) %5, align 8
      %.repack30 = getelementptr inbounds [2 x double], ptr addrspace(1) %"C::ROCDeviceArray.fca.1.extract", i64 %4, i64 1
      store double %.sroa.6.0, ptr addrspace(1) %.repack30, align 8
; │└└└
; │ @ /home/simeon/.julia/dev/AMDGPU/src/device/gcn/array.jl:87 within `#setindex!`
   ret void

guard_pass20:                                     ; preds = %conversion
; └
;  @ REPL[2]:4 within `kern!`
; ┌ @ /home/simeon/.julia/dev/AMDGPU/src/device/gcn/array.jl:81 within `#getindex`
; │┌ @ /home/simeon/.julia/packages/LLVM/2JPxT/src/interop/pointer.jl:85 within `unsafe_load`
; ││┌ @ none within `pointerref`
; │││┌ @ none within `macro expansion` @ /home/simeon/.julia/packages/LLVM/2JPxT/src/interop/base.jl:39
      %.elt17 = getelementptr inbounds [2 x double], ptr addrspace(1) %"A::ROCDeviceArray.fca.1.extract", i64 0, i64 1
      %.unpack18 = load double, ptr addrspace(1) %.elt17, align 8
      %.unpack = load double, ptr addrspace(1) %"A::ROCDeviceArray.fca.1.extract", align 8
      br label %L76

guard_pass25:                                     ; preds = %L75
; └└└└
;  @ REPL[2]:1 within `kern!`
  %.sroa.05.0.copyload6 = load double, ptr addrspace(5) %1, align 8
  %.sroa.6.0..sroa_idx7 = getelementptr inbounds i8, ptr addrspace(5) %1, i32 8
  %.sroa.6.0.copyload8 = load double, ptr addrspace(5) %.sroa.6.0..sroa_idx7, align 8
  br label %L76

guard_pass30:                                     ; preds = %L48
  call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef align 8 dereferenceable(16) %1, ptr addrspace(5) noundef nonnull align 8 dereferenceable(16) %0, i64 16, i1 false)
  br label %L75

guard_pass35:                                     ; preds = %L69
  call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef align 8 dereferenceable(16) %1, ptr addrspace(5) noundef nonnull align 8 dereferenceable(16) %2, i64 16, i1 false)
  br label %L75
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions