Skip to content

Commit aee14d7

Browse files
committed
Work around CUDA 11.4 compute-sanitizer issues by restarting after every test.
1 parent 0f8058d commit aee14d7

File tree

2 files changed

+38
-9
lines changed

2 files changed

+38
-9
lines changed

.buildkite/pipeline.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,10 @@ steps:
3737
- examples
3838
agents:
3939
queue: "juliagpu"
40-
cuda: "11.3" # compute-sanitizer uses a lot of memory, so we need device_reset!
40+
cuda: "11.0"
4141
cap: "recent" # test as much as possible
4242
env:
43-
JULIA_CUDA_VERSION: '11.2' # older versions of CUDA have issues
43+
JULIA_CUDA_VERSION: '11.4'
4444
JULIA_CUDA_DEBUG_INFO: 'false' # NVIDIA bug #3305774: ptxas segfaults without debug info
4545
JULIA_CUDA_USE_BINARYBUILDER: 'true'
4646
if: build.message !~ /\[skip tests\]/

test/runtests.jl

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -164,8 +164,9 @@ skip_tests = []
164164
has_cudnn() || push!(skip_tests, "cudnn")
165165
has_cusolvermg() || push!(skip_tests, "cusolvermg")
166166
has_nvml() || push!(skip_tests, "nvml")
167-
if !has_cutensor() || CUDA.version() < v"10.1" || first(picks).cap < v"7.0"
168-
push!(skip_tests, "cutensor")
167+
if !has_cutensor() || CUDA.version() < v"10.1" || first(picks).cap < v"7.0" || do_sanitize
168+
# XXX: some library tests fail under compute-sanitizer
169+
append!(skip_tests, ["cutensor", "cusparse"])
169170
end
170171
is_debug = ccall(:jl_is_debugbuild, Cint, ()) != 0
171172
if first(picks).cap < v"7.0"
@@ -199,6 +200,30 @@ else
199200
all_tests = copy(tests)
200201
end
201202

203+
# handle compute-sanitizer
204+
struct rlimit
205+
cur::Culong
206+
max::Culong
207+
end
208+
const RLIMIT_NOFILE = 7
209+
if do_sanitize
210+
sanitizer = CUDA.compute_sanitizer()
211+
@info "Running under $(readchomp(`$sanitizer --version`))"
212+
213+
# bump the per-process file descriptor limit to work around NVIDIA bug #3273266.
214+
# this value will be inherited by child processes.
215+
if Sys.islinux()
216+
local limit
217+
limit = Ref{rlimit}()
218+
ret = ccall(:getrlimit, Cint, (Cint, Ptr{rlimit}), RLIMIT_NOFILE, limit)
219+
systemerror(:getrlimit, ret != 0)
220+
@warn "Bumping file descriptor limit from $(Int(limit[].cur)) to $(Int(limit[].max))"
221+
limit[] = rlimit(limit[].max, limit[].max)
222+
ret = ccall(:setrlimit, Cint, (Cint, Ptr{rlimit}), RLIMIT_NOFILE, limit)
223+
# this check follows the setrlimit call, so report that name on failure
systemerror(:setrlimit, ret != 0)
224+
end
225+
end
226+
202227
# add workers
203228
const test_exeflags = Base.julia_cmd()
204229
filter!(test_exeflags.exec) do c
@@ -214,9 +239,7 @@ const test_exename = popfirst!(test_exeflags.exec)
214239
function addworker(X; kwargs...)
215240
exename = if do_sanitize
216241
sanitizer = CUDA.compute_sanitizer()
217-
@info "Running under $(readchomp(`$sanitizer --version`))"
218-
# NVIDIA bug 3263616: compute-sanitizer crashes when generating host backtraces
219-
`$sanitizer --tool $sanitize_tool --launch-timeout=0 --show-backtrace=no --target-processes=all --report-api-errors=no $test_exename`
242+
`$sanitizer --tool $sanitize_tool --launch-timeout=0 --target-processes=all --report-api-errors=no $test_exename`
220243
else
221244
test_exename
222245
end
@@ -353,8 +376,14 @@ try
353376
push!(all_tasks, current_task())
354377
while length(tests) > 0
355378
test = popfirst!(tests)
356-
local resp
379+
380+
# sometimes a worker failed, and we need to spawn a new one
381+
if p === nothing
382+
p = addworker(1)[1]
383+
end
357384
wrkr = p
385+
386+
local resp
358387
snoop = do_snoop ? mktemp() : (nothing, nothing)
359388

360389
# tests that muck with the context should not be timed with CUDA events,
@@ -380,7 +409,7 @@ try
380409
# the worker encountered some failure, recycle it
381410
# so future tests get a fresh environment
382411
rmprocs(wrkr, waitfor=30)
383-
p = addworker(1)[1]
412+
p = nothing
384413
else
385414
print_testworker_stats(test, wrkr, resp)
386415
end

0 commit comments

Comments
 (0)