Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 21 additions & 18 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ steps:
version: 1.6
- JuliaCI/julia-test#v1:
julia_args: "-g2"
test_args: "--sanitize --quickfail --jobs=1"
test_args: "--sanitize --quickfail"
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
Expand All @@ -37,13 +37,14 @@ steps:
- examples
agents:
queue: "juliagpu"
cuda: "11.3" # compute-sanitizer uses a lot of memory, so we need device_reset!
cuda: "11.0"
cap: "recent" # test as much as possible
env:
JULIA_CUDA_VERSION: '11.2' # older versions of CUDA have issues
JULIA_CUDA_VERSION: '11.4'
JULIA_CUDA_DEBUG_INFO: 'false' # NVIDIA bug #3305774: ptxas segfaults with out debug info
JULIA_CUDA_USE_BINARYBUILDER: 'true'
if: build.message !~ /\[skip tests\]/
if: build.message !~ /\[skip tests\]/ &&
!build.pull_request.draft
timeout_in_minutes: 120

- label: "Julia 1.7"
Expand All @@ -60,7 +61,7 @@ steps:
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip tests\]/
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120

- label: "Julia 1.8"
Expand All @@ -77,7 +78,7 @@ steps:
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip tests\]/
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120
soft_fail:
- exit_status: 1
Expand Down Expand Up @@ -110,7 +111,7 @@ steps:
env:
JULIA_CUDA_VERSION: '11.4'
JULIA_CUDA_USE_BINARYBUILDER: 'true'
if: build.message !~ /\[skip tests\]/
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120

- label: "CUDA 11.3"
Expand All @@ -130,7 +131,7 @@ steps:
env:
JULIA_CUDA_VERSION: '11.3'
JULIA_CUDA_USE_BINARYBUILDER: 'true'
if: build.message !~ /\[skip tests\]/
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120

- label: "CUDA 11.2"
Expand All @@ -150,7 +151,7 @@ steps:
env:
JULIA_CUDA_VERSION: '11.2'
JULIA_CUDA_USE_BINARYBUILDER: 'true'
if: build.message !~ /\[skip tests\]/
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120

- label: "CUDA 11.1"
Expand All @@ -170,7 +171,7 @@ steps:
env:
JULIA_CUDA_VERSION: '11.1'
JULIA_CUDA_USE_BINARYBUILDER: 'true'
if: build.message !~ /\[skip tests\]/
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120

- label: "CUDA 11.0"
Expand All @@ -190,7 +191,7 @@ steps:
env:
JULIA_CUDA_VERSION: '11.0'
JULIA_CUDA_USE_BINARYBUILDER: 'true'
if: build.message !~ /\[skip tests\]/
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120

- label: "CUDA 10.2"
Expand All @@ -210,7 +211,7 @@ steps:
env:
JULIA_CUDA_VERSION: '10.2'
JULIA_CUDA_USE_BINARYBUILDER: 'true'
if: build.message !~ /\[skip tests\]/
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120

- label: "CUDA 10.1"
Expand All @@ -230,7 +231,7 @@ steps:
env:
JULIA_CUDA_VERSION: '10.1'
JULIA_CUDA_USE_BINARYBUILDER: 'true'
if: build.message !~ /\[skip tests\]/
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120


Expand All @@ -250,7 +251,7 @@ steps:
agents:
queue: "juliagpu-windows"
cuda: "*"
if: build.message !~ /\[skip tests\]/
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 120

- label: "NNlibCUDA.jl"
Expand All @@ -276,7 +277,7 @@ steps:
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip tests\]/
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
timeout_in_minutes: 60


Expand Down Expand Up @@ -315,7 +316,8 @@ steps:
queue: "benchmark"
cuda: "*"
if: build.message !~ /\[skip benchmarks\]/ &&
build.branch =~ /^master$$/
build.branch =~ /^master$$/ &&
!build.pull_request.draft
timeout_in_minutes: 30

- wait
Expand All @@ -341,7 +343,8 @@ steps:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip benchmarks\]/ &&
build.branch !~ /^master$$/
build.branch !~ /^master$$/ &&
!build.pull_request.draft
timeout_in_minutes: 30

- label: "Documentation"
Expand All @@ -362,7 +365,7 @@ steps:
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip docs\]/
if: build.message !~ /\[skip docs\]/ && !build.pull_request.draft
timeout_in_minutes: 30


Expand Down
14 changes: 6 additions & 8 deletions Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,17 +77,15 @@ version = "0.1.6"

[[GPUArrays]]
deps = ["Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"]
git-tree-sha1 = "59aa54826b2667e2a9161f6dbd9e37255fdb541b"
repo-rev = "e1a4b3d"
repo-url = "https://github.com/JuliaGPU/GPUArrays.jl.git"
git-tree-sha1 = "7c39d767a9c55fafd01f7bc8b3fd0adf175fbc97"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "8.0.2"
version = "8.1.0"

[[GPUCompiler]]
deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "4ed2616d5e656c8716736b64da86755467f26cf5"
git-tree-sha1 = "55ea723d032654a52671923fdce9d785e02ed577"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.12.9"
version = "0.13.0"

[[InteractiveUtils]]
deps = ["Markdown"]
Expand All @@ -106,9 +104,9 @@ version = "1.3.0"

[[LLVM]]
deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
git-tree-sha1 = "23a47d417a3cd9c2e73c854bac7dd4731c105ef7"
git-tree-sha1 = "36d95ecdfbc3240d728f68d73064d5b097fbf2ef"
uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "4.4.0"
version = "4.5.2"

[[LLVMExtra_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ BFloat16s = "0.1"
CEnum = "0.2, 0.3, 0.4"
ExprTools = "0.1"
GPUArrays = "8"
GPUCompiler = "0.12.6"
GPUCompiler = "0.13.0"
LLVM = "4.1.1"
Random123 = "1.2"
RandomNumbers = "1.5.3"
Expand Down
12 changes: 4 additions & 8 deletions lib/cudadrv/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,12 @@ export cudacall
## device

# pack arguments in a buffer that CUDA expects
@generated function pack_arguments(f::Function, args...)
@inline @generated function pack_arguments(f::Function, args...)
for arg in args
isbitstype(arg) || throw(ArgumentError("Arguments to kernel should be bitstype."))
end

ex = quote
Base.@_inline_meta
end
ex = quote end

# If f has N parameters, then kernelParams needs to be an array of N pointers.
# Each of kernelParams[0] through kernelParams[N-1] must point to a region of memory
Expand Down Expand Up @@ -78,12 +76,10 @@ end

# convert the argument values to match the kernel's signature (specified by the user)
# (this mimics `lower-ccall` in julia-syntax.scm)
@generated function convert_arguments(f::Function, ::Type{tt}, args...) where {tt}
@inline @generated function convert_arguments(f::Function, ::Type{tt}, args...) where {tt}
types = tt.parameters

ex = quote
Base.@_inline_meta
end
ex = quote end

converted_args = Vector{Symbol}(undef, length(args))
arg_ptrs = Vector{Symbol}(undef, length(args))
Expand Down
3 changes: 1 addition & 2 deletions lib/cudnn/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@ juliaDataType(a)=(a==CUDNN_DATA_HALF ? Float16 :
tuple_strides(A::Tuple) = _strides((1,), A)
_strides(out::Tuple{Int}, A::Tuple{}) = ()
_strides(out::NTuple{N,Int}, A::NTuple{N}) where {N} = out
function _strides(out::NTuple{M,Int}, A::Tuple) where M
Base.@_inline_meta
@inline function _strides(out::NTuple{M,Int}, A::Tuple) where M
_strides((out..., out[M]*A[M]), A)
end

Expand Down
6 changes: 2 additions & 4 deletions perf/volumerhs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@ for (jlf, f) in zip((:+, :*, :-), (:add, :mul, :sub))
"""
@eval begin
# the @pure is necessary so that we can constant propagate.
Base.@pure function $jlf(a::$T, b::$T)
Base.@_inline_meta
@inline Base.@pure function $jlf(a::$T, b::$T)
Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b)
end
end
Expand All @@ -46,8 +45,7 @@ let (jlf, f) = (:div_arcp, :div)
"""
@eval begin
# the @pure is necessary so that we can constant propagate.
Base.@pure function $jlf(a::$T, b::$T)
@Base._inline_meta
@inline Base.@pure function $jlf(a::$T, b::$T)
Base.llvmcall($ir, $T, Tuple{$T, $T}, a, b)
end
end
Expand Down
4 changes: 1 addition & 3 deletions src/compiler/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ The following keyword arguments are supported:
"""
AbstractKernel

@generated function call(kernel::AbstractKernel{F,TT}, args...; call_kwargs...) where {F,TT}
@inline @generated function call(kernel::AbstractKernel{F,TT}, args...; call_kwargs...) where {F,TT}
sig = Tuple{F, TT.parameters...} # Base.signature_type with a function type
args = (:(kernel.f), (:( args[$i] ) for i in 1:length(args))...)

Expand All @@ -197,8 +197,6 @@ AbstractKernel
call_tt = Base.to_tuple_type(call_t)

quote
Base.@_inline_meta

cudacall(kernel.fun, $call_tt, $(call_args...); call_kwargs...)
end
end
Expand Down
3 changes: 1 addition & 2 deletions src/device/intrinsics/dynamic_parallelism.jl
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,9 @@ function launch(f::CuDeviceFunction, args::Vararg{Any,N}; blocks::CuDim=1, threa
return
end

@generated function parameter_buffer(f::CuDeviceFunction, blocks, threads, shmem, args...)
@inline @generated function parameter_buffer(f::CuDeviceFunction, blocks, threads, shmem, args...)
# allocate a buffer
ex = quote
Base.@_inline_meta
buf = cudaGetParameterBufferV2(f, blocks, threads, shmem)
ptr = Base.unsafe_convert(Ptr{UInt32}, buf)
end
Expand Down
3 changes: 1 addition & 2 deletions src/device/intrinsics/output.jl
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ const cuprint_specifiers = Dict(
Cstring => "%s",
)

@generated function _cuprint(parts...)
@inline @generated function _cuprint(parts...)
fmt = ""
args = Expr[]

Expand Down Expand Up @@ -170,7 +170,6 @@ const cuprint_specifiers = Dict(
end

quote
Base.@_inline_meta
@cuprintf($fmt, $(args...))
end
end
Expand Down
10 changes: 10 additions & 0 deletions src/device/quirks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,13 @@ end
# trig.jl
@device_override @noinline Base.Math.sincos_domain_error(x) =
@print_and_throw "sincos(x) is only defined for finite x."

# multidimensional.jl
if VERSION >= v"1.7-"
# XXX: the boundscheck change in JuliaLang/julia#42119 has exposed additional issues
# with bad code generation by ptxas, so revert that changen for now.
@device_override Base.@propagate_inbounds function Base.getindex(iter::CartesianIndices{N,R},
I::Vararg{Int, N}) where {N,R}
CartesianIndex(getindex.(iter.indices, I))
end
end
12 changes: 8 additions & 4 deletions test/cublas.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1574,8 +1574,10 @@ end
dU += triu(h_A,k)
end
#compare
@test C.L ≈ dL rtol=1e-2
@test C.U ≈ dU rtol=1e-2
@test C.L ≈ dL rtol=1e-1
@test C.U ≈ dU rtol=1e-1
# XXX: implement these as direct comparisons (L*U≈...)
# instead if comparing against the CPU BLAS
end
for i in 1:length(A)
d_A[ i ] = CuArray(A[i])
Expand Down Expand Up @@ -1631,8 +1633,10 @@ end
dL += tril(h_B,-k-1)
end
#compare
@test C.L ≈ dL rtol=1e-2
@test C.U ≈ dU rtol=1e-2
@test C.L ≈ dL rtol=1e-1
@test C.U ≈ dU rtol=1e-1
# XXX: implement these as direct comparisons (L*U≈...)
# instead if comparing against the CPU BLAS
end
end

Expand Down
Loading