@@ -164,8 +164,9 @@ skip_tests = []
164
164
# Skip the test suites of optional libraries that are not available here.
has_cudnn() || push!(skip_tests, "cudnn")
has_cusolvermg() || push!(skip_tests, "cusolvermg")
has_nvml() || push!(skip_tests, "nvml")
if do_sanitize
    # XXX: some library tests fail under compute-sanitizer
    append!(skip_tests, ["cutensor", "cusparse"])
elseif !has_cutensor() || CUDA.version() < v"10.1" || first(picks).cap < v"7.0"
    # cutensor tests only run when the library is present, the CUDA version is at
    # least 10.1 and the first selected device has compute capability 7.0 or higher.
    # Keep this separate from the sanitizer case so that a missing or unsupported
    # cutensor does not also disable the unrelated cusparse tests.
    push!(skip_tests, "cutensor")
end
170
171
is_debug = ccall (:jl_is_debugbuild , Cint, ()) != 0
171
172
if first (picks). cap < v " 7.0"
@@ -199,6 +200,30 @@ else
199
200
all_tests = copy (tests)
200
201
end
201
202
203
# handle compute-sanitizer

# Julia-side layout of the C `struct rlimit` passed to getrlimit/setrlimit:
# `cur` is the soft limit, `max` the hard limit.
struct rlimit
    cur::Culong
    max::Culong
end
# Resource identifier for the open file descriptor limit. The numeric value is
# platform-specific; it is only used behind the `Sys.islinux()` check below.
const RLIMIT_NOFILE = 7
if do_sanitize
    sanitizer = CUDA.compute_sanitizer()
    @info "Running under $(readchomp(`$sanitizer --version`))"

    # bump the per-process file descriptor limit to work around NVIDIA bug #3273266.
    # this value will be inherited by child processes.
    if Sys.islinux()
        local limit
        limit = Ref{rlimit}()
        ret = ccall(:getrlimit, Cint, (Cint, Ptr{rlimit}), RLIMIT_NOFILE, limit)
        systemerror(:getrlimit, ret != 0)
        # interpolate the unsigned values directly: converting to `Int` would throw
        # an InexactError when a limit is "unlimited" (all bits set).
        @warn "Bumping file descriptor limit from $(limit[].cur) to $(limit[].max)"
        # raise the soft limit up to the hard limit
        limit[] = rlimit(limit[].max, limit[].max)
        ret = ccall(:setrlimit, Cint, (Cint, Ptr{rlimit}), RLIMIT_NOFILE, limit)
        # report failures under the name of the call that actually failed
        systemerror(:setrlimit, ret != 0)
    end
end
226
+
202
227
# add workers
203
228
const test_exeflags = Base. julia_cmd ()
204
229
filter! (test_exeflags. exec) do c
@@ -214,9 +239,7 @@ const test_exename = popfirst!(test_exeflags.exec)
214
239
function addworker (X; kwargs... )
215
240
exename = if do_sanitize
216
241
sanitizer = CUDA. compute_sanitizer ()
217
- @info " Running under $(readchomp (` $sanitizer --version` )) "
218
- # NVIDIA bug 3263616: compute-sanitizer crashes when generating host backtraces
219
- ` $sanitizer --tool $sanitize_tool --launch-timeout=0 --show-backtrace=no --target-processes=all --report-api-errors=no $test_exename `
242
+ ` $sanitizer --tool $sanitize_tool --launch-timeout=0 --target-processes=all --report-api-errors=no $test_exename `
220
243
else
221
244
test_exename
222
245
end
353
376
push! (all_tasks, current_task ())
354
377
while length (tests) > 0
355
378
test = popfirst! (tests)
356
- local resp
379
+
380
+ # sometimes a worker failed, and we need to spawn a new one
381
+ if p === nothing
382
+ p = addworker (1 )[1 ]
383
+ end
357
384
wrkr = p
385
+
386
+ local resp
358
387
snoop = do_snoop ? mktemp () : (nothing , nothing )
359
388
360
389
# tests that muck with the context should not be timed with CUDA events,
380
409
# the worker encountered some failure, recycle it
381
410
# so future tests get a fresh environment
382
411
rmprocs (wrkr, waitfor= 30 )
383
- p = addworker ( 1 )[ 1 ]
412
+ p = nothing
384
413
else
385
414
print_testworker_stats (test, wrkr, resp)
386
415
end
0 commit comments