-
Couldn't load subscription status.
- Fork 60
Closed
JuliaLang/julia
#58837
Labels
bug: Something isn't working
Description
Reduced from the example in #766 (comment):
julia> using AMDGPU
julia> function kern!(C, A, ta)
i = workitemIdx().x
@inbounds begin
x = ta == 0 ? A[1] : (ta == 1 ? A[1] : A[1])
C[i] = x
end
nothing
end
kern! (generic function with 1 method)
julia> C = AMDGPU.zeros(ComplexF64, 2, 2)
2×2 ROCArray{ComplexF64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:
0.0+0.0im 0.0+0.0im
0.0+0.0im 0.0+0.0im
julia> @roc groupsize=2 gridsize=2 kern!(C, AMDGPU.ones(ComplexF64, 1), 1)
AMDGPU.Runtime.HIPKernel{typeof(kern!), Tuple{AMDGPU.Device.ROCDeviceMatrix{ComplexF64, 1}, AMDGPU.Device.ROCDeviceVector{ComplexF64, 1}, Int64}}(kern!, AMDGPU.HIP.HIPFunction(Ptr{Nothing}(0x000000000fcec3c0), AMDGPU.HIP.HIPModule(Ptr{Nothing}(0x000000000fe23de0)), Symbol[]))
julia> C
2×2 ROCArray{ComplexF64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:
0.0-4.84973e17im 0.0+0.0im
1.0+0.0im 0.0+0.0im

The first item in `C` should be 1 as well, but it is not. Curiously, this only happens for `ComplexF64`, and everything works as expected if one either changes `ta` from 1 to 0 or makes `C` a vector instead of a matrix. Any ideas what could be happening here?
The kernel lowers to the following LLVM IR:
; GPUCompiler.CompilerJob{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.HIPCompilerParams}(MethodInstance for kern!(::AMDGPU.Device.ROCDeviceMatrix{ComplexF64, 1}, ::AMDGPU.Device.ROCDeviceVector{ComplexF64, 1}, ::Int64), CompilerConfig for GPUCompiler.GCNCompilerTarget, 0x00000000000096cb)
; @ REPL[2]:1 within `kern!`
define amdgpu_kernel void @_Z5kern_14ROCDeviceArrayI7ComplexI7Float64ELi2ELi1EES_IS2_Li1ELi1EE5Int64({ ptr, ptr, ptr, ptr, ptr, ptr, i32, i32, ptr, ptr, ptr, ptr } %state, { [2 x i64], ptr addrspace(1), i64 } %"C::ROCDeviceArray", { [1 x i64], ptr addrspace(1), i64 } %"A::ROCDeviceArray", i64 signext %"ta::Int64") local_unnamed_addr #2 {
conversion:
%"A::ROCDeviceArray.fca.1.extract" = extractvalue { [1 x i64], ptr addrspace(1), i64 } %"A::ROCDeviceArray", 1
%0 = alloca [2 x double], align 8, addrspace(5)
%1 = alloca [2 x i64], align 8, addrspace(5)
%2 = alloca [2 x double], align 8, addrspace(5)
; @ REPL[2]:4 within `kern!`
switch i64 %"ta::Int64", label %L69 [
i64 0, label %guard_pass20
i64 1, label %L48
]
L48: ; preds = %conversion
; ┌ @ /home/simeon/.julia/dev/AMDGPU/src/device/gcn/array.jl:81 within `#getindex`
; │┌ @ /home/simeon/.julia/packages/LLVM/2JPxT/src/interop/pointer.jl:85 within `unsafe_load`
; ││┌ @ none within `pointerref`
; │││┌ @ none within `macro expansion` @ /home/simeon/.julia/packages/LLVM/2JPxT/src/interop/base.jl:39
%.unpack21 = load double, ptr addrspace(1) %"A::ROCDeviceArray.fca.1.extract", align 8
%.elt22 = getelementptr inbounds [2 x double], ptr addrspace(1) %"A::ROCDeviceArray.fca.1.extract", i64 0, i64 1
%.unpack23 = load double, ptr addrspace(1) %.elt22, align 8
store double %.unpack21, ptr addrspace(5) %0, align 8
%.fca.1.gep4 = getelementptr inbounds [2 x double], ptr addrspace(5) %0, i32 0, i32 1
store double %.unpack23, ptr addrspace(5) %.fca.1.gep4, align 8
%.not24 = icmp eq ptr addrspace(5) %0, null
br i1 %.not24, label %L75, label %guard_pass30
L69: ; preds = %conversion
%.unpack25 = load double, ptr addrspace(1) %"A::ROCDeviceArray.fca.1.extract", align 8
%.elt26 = getelementptr inbounds [2 x double], ptr addrspace(1) %"A::ROCDeviceArray.fca.1.extract", i64 0, i64 1
%.unpack27 = load double, ptr addrspace(1) %.elt26, align 8
store double %.unpack25, ptr addrspace(5) %2, align 8
%.fca.1.gep = getelementptr inbounds [2 x double], ptr addrspace(5) %2, i32 0, i32 1
store double %.unpack27, ptr addrspace(5) %.fca.1.gep, align 8
%.not28 = icmp eq ptr addrspace(5) %2, null
br i1 %.not28, label %L75, label %guard_pass35
L75: ; preds = %guard_pass35, %guard_pass30, %L69, %L48
%.not29 = icmp eq ptr addrspace(5) %1, null
br i1 %.not29, label %L76, label %guard_pass25
L76: ; preds = %guard_pass25, %guard_pass20, %L75
%.sroa.05.0 = phi double [ %.sroa.05.0.copyload6, %guard_pass25 ], [ undef, %L75 ], [ %.unpack, %guard_pass20 ]
%.sroa.6.0 = phi double [ %.sroa.6.0.copyload8, %guard_pass25 ], [ undef, %L75 ], [ %.unpack18, %guard_pass20 ]
; └└└└
; @ REPL[2]:2 within `kern!`
; ┌ @ /home/simeon/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:122 within `workitemIdx`
; │┌ @ /home/simeon/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:87 within `workitemIdx_x`
; ││┌ @ /home/simeon/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:3 within `_index`
; │││┌ @ /home/simeon/.julia/dev/AMDGPU/src/device/gcn/indexing.jl:3 within `macro expansion` @ /home/simeon/.julia/packages/LLVM/2JPxT/src/interop/base.jl:39
%3 = call i32 @llvm.amdgcn.workitem.id.x()
%"C::ROCDeviceArray.fca.1.extract" = extractvalue { [2 x i64], ptr addrspace(1), i64 } %"C::ROCDeviceArray", 1
; └└└└
; @ REPL[2]:5 within `kern!`
; ┌ @ /home/simeon/.julia/dev/AMDGPU/src/device/gcn/array.jl:86 within `#setindex!`
; │┌ @ /home/simeon/.julia/packages/LLVM/2JPxT/src/interop/pointer.jl:88 within `unsafe_store!`
; ││┌ @ none within `pointerset`
; │││┌ @ none within `macro expansion` @ /home/simeon/.julia/packages/LLVM/2JPxT/src/interop/base.jl:39
%4 = zext nneg i32 %3 to i64
%5 = getelementptr inbounds [2 x double], ptr addrspace(1) %"C::ROCDeviceArray.fca.1.extract", i64 %4
store double %.sroa.05.0, ptr addrspace(1) %5, align 8
%.repack30 = getelementptr inbounds [2 x double], ptr addrspace(1) %"C::ROCDeviceArray.fca.1.extract", i64 %4, i64 1
store double %.sroa.6.0, ptr addrspace(1) %.repack30, align 8
; │└└└
; │ @ /home/simeon/.julia/dev/AMDGPU/src/device/gcn/array.jl:87 within `#setindex!`
ret void
guard_pass20: ; preds = %conversion
; └
; @ REPL[2]:4 within `kern!`
; ┌ @ /home/simeon/.julia/dev/AMDGPU/src/device/gcn/array.jl:81 within `#getindex`
; │┌ @ /home/simeon/.julia/packages/LLVM/2JPxT/src/interop/pointer.jl:85 within `unsafe_load`
; ││┌ @ none within `pointerref`
; │││┌ @ none within `macro expansion` @ /home/simeon/.julia/packages/LLVM/2JPxT/src/interop/base.jl:39
%.elt17 = getelementptr inbounds [2 x double], ptr addrspace(1) %"A::ROCDeviceArray.fca.1.extract", i64 0, i64 1
%.unpack18 = load double, ptr addrspace(1) %.elt17, align 8
%.unpack = load double, ptr addrspace(1) %"A::ROCDeviceArray.fca.1.extract", align 8
br label %L76
guard_pass25: ; preds = %L75
; └└└└
; @ REPL[2]:1 within `kern!`
%.sroa.05.0.copyload6 = load double, ptr addrspace(5) %1, align 8
%.sroa.6.0..sroa_idx7 = getelementptr inbounds i8, ptr addrspace(5) %1, i32 8
%.sroa.6.0.copyload8 = load double, ptr addrspace(5) %.sroa.6.0..sroa_idx7, align 8
br label %L76
guard_pass30: ; preds = %L48
call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef align 8 dereferenceable(16) %1, ptr addrspace(5) noundef nonnull align 8 dereferenceable(16) %0, i64 16, i1 false)
br label %L75
guard_pass35: ; preds = %L69
call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef align 8 dereferenceable(16) %1, ptr addrspace(5) noundef nonnull align 8 dereferenceable(16) %2, i64 16, i1 false)
br label %L75
}
Metadata
Assignees
Labels
bug: Something isn't working