Skip to content

PrecompileTools.jl with CUDA.jl causes kernels to fail to run on 1.11 #2637

@gbaraldi

Description

@gbaraldi

This reproduces with just PrecompileTools.jl; Reactant.jl is not required.

Given a package like the following:

# Minimal reproducer package: a PrecompileTools workload that defines and
# launches a CUDA kernel. The code is intentionally kept exactly as reported.
module Foo


using CUDA
using PrecompileTools
# Everything inside this block is the workload handed to
# PrecompileTools.@compile_workload.
PrecompileTools.@compile_workload begin
# Kernel body: each thread squares, in place, the element of `x` selected by
# its x thread index.
function square_kernel!(x)
    i = CUDA.threadIdx().x
    x[i] *= x[i]
    return nothing
end

# Host-side wrapper: launches `square_kernel!` on `x` with a single block and
# one thread per element of `x`.
function square!(x)
    CUDA.@cuda blocks = 1 threads = length(x) square_kernel!(x)
    return nothing
end
# One-element device array used as the kernel argument.
y2 = CuArray([2.0])
# Launch the kernel (no explicit blocks/threads given) under CUDA.jl's
# @device_code_llvm reflection macro. Note that `square!` above is defined
# but never called in this workload.
CUDA.@device_code_llvm CUDA.@cuda square_kernel!(y2)
end

# Placeholder function; prints a fixed greeting and is unrelated to the
# workload above.
greet() = print("Hello World!")

end # module Foo

and loading it is enough for a subsequent kernel launch to fail with:

julia> CUDA.@cuda square_kernel!(y2)
ERROR: InvalidIRError: compiling MethodInstance for square_kernel!(::CuDeviceVector{Float64, 1}) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to throw_boundserror() @ CUDA ~/.julia/packages/CUDA/1kIOw/src/device/quirks.jl:51)
Stacktrace:
 [1] #throw_boundserror
   @ ~/.julia/packages/CUDA/1kIOw/src/device/quirks.jl:53
 [2] checkbounds
   @ ./abstractarray.jl:699
 [3] #arrayref
   @ ~/.julia/packages/CUDA/1kIOw/src/device/array.jl:81
 [4] getindex
   @ ~/.julia/packages/CUDA/1kIOw/src/device/array.jl:164
 [5] square_kernel!
   @ ./REPL[6]:3
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
  [1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/validation.jl:167
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/Nxf8r/src/driver.jl:382 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/TimerOutputs/6KVfH/src/TimerOutput.jl:253 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/GPUCompiler/Nxf8r/src/driver.jl:381 [inlined]
  [5] emit_llvm(job::GPUCompiler.CompilerJob; toplevel::Bool, libraries::Bool, optimize::Bool, cleanup::Bool, validate::Bool, only_entry::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/utils.jl:108
  [6] emit_llvm
    @ ~/.julia/packages/GPUCompiler/Nxf8r/src/utils.jl:106 [inlined]
  [7] 
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/driver.jl:100
  [8] codegen
    @ ~/.julia/packages/GPUCompiler/Nxf8r/src/driver.jl:82 [inlined]
  [9] compile(target::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/driver.jl:79
 [10] compile
    @ ~/.julia/packages/GPUCompiler/Nxf8r/src/driver.jl:74 [inlined]
 [11] #1147
    @ ~/.julia/packages/CUDA/1kIOw/src/compiler/compilation.jl:250 [inlined]
 [12] JuliaContext(f::CUDA.var"#1147#1150"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/driver.jl:34
 [13] JuliaContext(f::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/driver.jl:25
 [14] compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/1kIOw/src/compiler/compilation.jl:249
 [15] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/execution.jl:237
 [16] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/execution.jl:151
 [17] macro expansion
    @ ~/.julia/packages/CUDA/1kIOw/src/compiler/execution.jl:380 [inlined]
 [18] macro expansion
    @ ./lock.jl:273 [inlined]
 [19] cufunction(f::typeof(square_kernel!), tt::Type{Tuple{CuDeviceVector{Float64, 1}}}; kwargs::@Kwargs{})
    @ CUDA ~/.julia/packages/CUDA/1kIOw/src/compiler/execution.jl:375
 [20] cufunction(f::typeof(square_kernel!), tt::Type{Tuple{CuDeviceVector{Float64, 1}}})
    @ CUDA ~/.julia/packages/CUDA/1kIOw/src/compiler/execution.jl:372
 [21] top-level scope
    @ ~/.julia/packages/CUDA/1kIOw/src/compiler/execution.jl:112
Some type information was truncated. Use `show(err)` to see complete types.

Old version of this report, which needed Reactant.jl to reproduce:

julia> using CUDA

julia> using Reactant

julia> function square!(x)
           CUDA.@cuda blocks = 1 threads = length(x) square_kernel!(x)
           return nothing
       end
square! (generic function with 1 method)

julia> function square_kernel!(x)
                           i = CUDA.threadIdx().x
                           x[i] *= x[i]
                           return nothing
                       end
square_kernel! (generic function with 1 method)

julia> y2 = CuArray([2.0])
1-element CuArray{Float64, 1, CUDA.DeviceMemory}:
 2.0

julia> square!(y2)
ERROR: InvalidIRError: compiling MethodInstance for square_kernel!(::CuDeviceVector{Float64, 1}) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to throw_boundserror() @ CUDA ~/.julia/packages/CUDA/1kIOw/src/device/quirks.jl:51)
Stacktrace:
 [1] #throw_boundserror
   @ ~/.julia/packages/CUDA/1kIOw/src/device/quirks.jl:53
 [2] checkbounds
   @ ./abstractarray.jl:699
 [3] #arrayref
   @ ~/.julia/packages/CUDA/1kIOw/src/device/array.jl:81
 [4] getindex
   @ ~/.julia/packages/CUDA/1kIOw/src/device/array.jl:164
 [5] square_kernel!
   @ ./REPL[4]:3
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
  [1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/validation.jl:167
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/Nxf8r/src/driver.jl:382 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/TimerOutputs/6KVfH/src/TimerOutput.jl:253 [inlined]
  [4] macro expansion
    @ ~/.julia/packages/GPUCompiler/Nxf8r/src/driver.jl:381 [inlined]
  [5] emit_llvm(job::GPUCompiler.CompilerJob; toplevel::Bool, libraries::Bool, optimize::Bool, cleanup::Bool, validate::Bool, only_entry::Bool)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/utils.jl:108
  [6] 
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/driver.jl:100
  [7] codegen(output::Symbol, job::GPUCompiler.CompilerJob)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/driver.jl:82
  [8] compile(target::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/driver.jl:79
  [9] compile
    @ ~/.julia/packages/GPUCompiler/Nxf8r/src/driver.jl:74 [inlined]
 [10] #1147
    @ ~/.julia/packages/CUDA/1kIOw/src/compiler/compilation.jl:250 [inlined]
 [11] JuliaContext(f::CUDA.var"#1147#1150"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}}; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/driver.jl:34
 [12] JuliaContext(f::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/driver.jl:25
 [13] compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~/.julia/packages/CUDA/1kIOw/src/compiler/compilation.jl:249
 [14] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/execution.jl:237
 [15] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Nxf8r/src/execution.jl:151
 [16] macro expansion
    @ ~/.julia/packages/CUDA/1kIOw/src/compiler/execution.jl:380 [inlined]
 [17] macro expansion
    @ ./lock.jl:273 [inlined]
 [18] cufunction(f::typeof(square_kernel!), tt::Type{Tuple{CuDeviceVector{Float64, 1}}}; kwargs::@Kwargs{})
    @ CUDA ~/.julia/packages/CUDA/1kIOw/src/compiler/execution.jl:375
 [19] cufunction
    @ ~/.julia/packages/CUDA/1kIOw/src/compiler/execution.jl:372 [inlined]
 [20] macro expansion
    @ ~/.julia/packages/CUDA/1kIOw/src/compiler/execution.jl:112 [inlined]
 [21] square!(x::CuArray{Float64, 1, CUDA.DeviceMemory})
    @ Main ./REPL[3]:2
 [22] top-level scope
    @ REPL[6]:1
Some type information was truncated. Use `show(err)` to see complete types.

Reactant.jl has a PrecompileTools.jl workload that might cause issues (https://github.com/EnzymeAD/Reactant.jl/blob/23a57dfc8cd56a4f12b79a6fafc72446a53ec058/ext/ReactantCUDAExt.jl#L871). However, I suspect what's actually happening is that this relies on an abstract type not having too many subtypes defined, and adding a subtype in Reactant.jl causes things to blow up.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions