From b0bce46c9e6f145b5a5e4423a63e49191f56d744 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim@juliacomputing.com>
Date: Mon, 22 Feb 2021 17:22:30 +0100
Subject: [PATCH 1/5] Switch tests over to compute-sanitizer.

---
 .buildkite/pipeline.yml       |  4 +--
 Artifacts.toml                | 57 ++++++++++++++++++-----------------
 deps/bindeps.jl               | 17 +++++------
 test/codegen.jl               |  8 ++---
 test/cudadrv/module.jl        |  6 ++--
 test/cudadrv/pool.jl          |  2 +-
 test/cutensor/contractions.jl |  5 +++
 test/examples.jl              |  6 ++++
 test/exceptions.jl            |  6 ++++
 test/execution.jl             |  8 ++---
 test/initialization.jl        |  2 +-
 test/pool.jl                  |  2 +-
 test/runtests.jl              | 27 ++++++++---------
 test/setup.jl                 |  8 ++---
 14 files changed, 87 insertions(+), 71 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 39a277a44d..6c57c79913 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -27,7 +27,7 @@ steps:
           version: 1.6-nightly
       - JuliaCI/julia-test#v1:
           julia_args: "-g2"
-          test_args: "--memcheck"
+          test_args: "--sanitize"
       - JuliaCI/julia-coverage#v1:
           codecov: true
           dirs:
@@ -38,8 +38,6 @@ steps:
       queue: "juliagpu"
       cuda: "11.2"                    # older versions of CUDA have issues
       cap: "recent"                   # test as much as possible
-    env:
-      JULIA_CUDA_MEMORY_POOL: 'none'  # CUDA's memory pool requires compute-sanitizer
     if: build.message !~ /\[skip tests\]/
     timeout_in_minutes: 120
 
diff --git a/Artifacts.toml b/Artifacts.toml
index d518be731a..dce3ddc503 100644
--- a/Artifacts.toml
+++ b/Artifacts.toml
@@ -85,99 +85,102 @@ lazy = true
 
 [[CUDA110]]
 arch = "powerpc64le"
-git-tree-sha1 = "b22672705ca4f00c784a3f9d58619408d4af9de0"
+git-tree-sha1 = "25a70e995c5457a9b3c7dd7ff8a62d14acc2abc5"
 libc = "glibc"
 os = "linux"
 lazy = true
 
     [[CUDA110.download]]
-    sha256 = "e86a67aa8b1b2cd73d78572401efa75f9bb26f6a259f12d0471c64b74fbe204f"
-    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.0.3+3/CUDA.v11.0.3.powerpc64le-linux-gnu.tar.gz"
+    sha256 = "120ee6f20fc3c3c59611cf3c5b1584ed14658bb5d1bf9fd1b25a14182247d262"
+    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.0.3+4/CUDA.v11.0.3.powerpc64le-linux-gnu.tar.gz"
 
 [[CUDA110]]
 arch = "x86_64"
-git-tree-sha1 = "6b1a60793e5e98abdcfc3724cfa22b2a5348dc09"
+git-tree-sha1 = "74e3e04bdbf56ccf276cd8dd896ad07033846fae"
 libc = "glibc"
 os = "linux"
 lazy = true
 
     [[CUDA110.download]]
-    sha256 = "520e690529f67afe6aabdd8d18dc34d18acf5020cb3dc1fd4e904998d9e17aba"
-    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.0.3+3/CUDA.v11.0.3.x86_64-linux-gnu.tar.gz"
+    sha256 = "291e84f0d598ecbcbe438b1d42022583d061ad5f4eece2b1c06d600332b0367e"
+    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.0.3+4/CUDA.v11.0.3.x86_64-linux-gnu.tar.gz"
 
 [[CUDA110]]
 arch = "x86_64"
-git-tree-sha1 = "2d09da4d71a0762750dee0861e28029d38b08d1e"
+git-tree-sha1 = "1ab27f582deafbc99077d540a01141e620620177"
 os = "windows"
 lazy = true
 
     [[CUDA110.download]]
-    sha256 = "d11ca219e9b91725c6677f36b339459d149ffdcfa3f5e51928fb133158caa15a"
-    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.0.3+3/CUDA.v11.0.3.x86_64-w64-mingw32.tar.gz"
+    sha256 = "0ea0100ee7fa6d67c8d63ea44e719d76f6f70ce1ab5f657d7c97f30fae173af5"
+    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.0.3+4/CUDA.v11.0.3.x86_64-w64-mingw32.tar.gz"
+
 
 [[CUDA111]]
 arch = "powerpc64le"
-git-tree-sha1 = "44dba03dc848a148c9d2430354bf7e52e216364c"
+git-tree-sha1 = "8837163c5563af77039b4a04a49b6e2c3f123ab4"
 libc = "glibc"
 os = "linux"
 lazy = true
 
     [[CUDA111.download]]
-    sha256 = "ac85a364080ea8b97e77fb83967046c54099f7c63769577fa39a1311b68add81"
-    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.1.1+2/CUDA.v11.1.1.powerpc64le-linux-gnu.tar.gz"
+    sha256 = "847f43a4f68c2b08c6275c988ff7c7e5414ad477a625ac78f6e4970969fccc48"
+    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.1.1+3/CUDA.v11.1.1.powerpc64le-linux-gnu.tar.gz"
 
 [[CUDA111]]
 arch = "x86_64"
-git-tree-sha1 = "48c41dccb8db0c9aa9483267cb33719207abe4c1"
+git-tree-sha1 = "4670dd02df5210bd53199f14ec9f8cc027d889e0"
 libc = "glibc"
 os = "linux"
 lazy = true
 
     [[CUDA111.download]]
-    sha256 = "b7242ce10b3fb06d886725209d5b19d565c15c7e244eb84b50262f281a04291c"
-    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.1.1+2/CUDA.v11.1.1.x86_64-linux-gnu.tar.gz"
+    sha256 = "84a9574db7bfb0a59dd03ef1a85874d3f33a7686507d89312700f5c519307cba"
+    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.1.1+3/CUDA.v11.1.1.x86_64-linux-gnu.tar.gz"
 
 [[CUDA111]]
 arch = "x86_64"
-git-tree-sha1 = "ad4cf0816c2c327477c512f476649bfde7ada206"
+git-tree-sha1 = "86505c4367204e1769e6341380841f7f589a2f4d"
 os = "windows"
 lazy = true
 
     [[CUDA111.download]]
-    sha256 = "026a92bcb8d7a5ff6f2e6e262ed8d8387164314941f0dc1b3228e383e04a60a0"
-    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.1.1+2/CUDA.v11.1.1.x86_64-w64-mingw32.tar.gz"
+    sha256 = "a56db28c70e9736f9ea024f3afa7fdedf899b7c998808db7d8a368e0a1208ed9"
+    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.1.1+3/CUDA.v11.1.1.x86_64-w64-mingw32.tar.gz"
+
 
 [[CUDA112]]
 arch = "powerpc64le"
-git-tree-sha1 = "3141108f3144f5170dacc12749a61c14101b42b5"
+git-tree-sha1 = "ef3928da3f9b68a5213a93f91da0d27e32c01e50"
 libc = "glibc"
 os = "linux"
 lazy = true
 
     [[CUDA112.download]]
-    sha256 = "d7d6c399c77cabc75f1387869ca8bbef93cb6a745004993b34306e0b23d5bd18"
-    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.2.1+0/CUDA.v11.2.1.powerpc64le-linux-gnu.tar.gz"
+    sha256 = "770235b69868b88e6db4efc30a8659e9708f3b432028e2032ba589cf2c3efaf8"
+    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.2.1+1/CUDA.v11.2.1.powerpc64le-linux-gnu.tar.gz"
 
 [[CUDA112]]
 arch = "x86_64"
-git-tree-sha1 = "43b02b66f55952515d3cc933404d027fb904cd8b"
+git-tree-sha1 = "18f4e83091aec02d8229c2b009a45a5c22b47664"
 libc = "glibc"
 os = "linux"
 lazy = true
 
     [[CUDA112.download]]
-    sha256 = "70089c452bf923c4951048d336ac32ed28ee3672f8667bc7595fdc6190bf1990"
-    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.2.1+0/CUDA.v11.2.1.x86_64-linux-gnu.tar.gz"
+    sha256 = "6da495c82fae19e0aae8691addc72829376547543324358f39e16835cb208e6e"
+    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.2.1+1/CUDA.v11.2.1.x86_64-linux-gnu.tar.gz"
 
 [[CUDA112]]
 arch = "x86_64"
-git-tree-sha1 = "8b7275b36a973e6345a76b2931ddf397228e34ca"
+git-tree-sha1 = "4765905e93e1e93ca8d2eb52a1e8cec5de4627b1"
 os = "windows"
 lazy = true
 
     [[CUDA112.download]]
-    sha256 = "ed69a6b9630fc83e75856486fd157903c6e93e1d70e0fc7e6c67ca0dacea2b15"
-    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.2.1+0/CUDA.v11.2.1.x86_64-w64-mingw32.tar.gz"
+    sha256 = "6dc0ae6aab8b878864bf926fd9446c71f92f689e6115d6dcedc54ac492d30ea3"
+    url = "https://github.com/JuliaBinaryWrappers/CUDA_jll.jl/releases/download/CUDA-v11.2.1+1/CUDA.v11.2.1.x86_64-w64-mingw32.tar.gz"
+
 
 
 # CUDNN
diff --git a/deps/bindeps.jl b/deps/bindeps.jl
index d1e5218aa6..3e4ffcc5dc 100644
--- a/deps/bindeps.jl
+++ b/deps/bindeps.jl
@@ -33,7 +33,7 @@ Returns the CUDA release part of the version as returned by [`version`](@ref).
 toolkit_release() = @after_init(VersionNumber(__toolkit_version[].major, __toolkit_version[].minor))
 
 const __nvdisasm = Ref{String}()
-const __memcheck = Ref{Union{Nothing,String}}()
+const __compute_sanitizer = Ref{Union{Nothing,String}}()
 const __libdevice = Ref{String}()
 const __libcudadevrt = Ref{String}()
 const __libcupti = Ref{Union{Nothing,String}}()
@@ -47,10 +47,10 @@ const __libcudnn = Ref{Union{Nothing,String}}(nothing)
 const __libcutensor = Ref{Union{Nothing,String}}(nothing)
 
 nvdisasm() = @after_init(__nvdisasm[])
-function memcheck()
+function compute_sanitizer()
     @after_init begin
-        @assert has_memcheck() "This functionality is unavailabe as CUDA-MEMCHECK is missing."
-        __memcheck[]
+        @assert has_compute_sanitizer() "This functionality is unavailable as compute-sanitizer is missing."
+        __compute_sanitizer[]
     end
 end
 libdevice() = @after_init(__libdevice[])
@@ -68,8 +68,8 @@ function libnvtx()
     end
 end
 
-export has_memcheck, has_cupti, has_nvtx
-has_memcheck() = @after_init(__memcheck[]) !== nothing
+export has_compute_sanitizer, has_cupti, has_nvtx
+has_compute_sanitizer() = @after_init(__compute_sanitizer[]) !== nothing
 has_cupti() = @after_init(__libcupti[]) !== nothing
 has_nvtx() = @after_init(__libnvtx[]) !== nothing
 
@@ -173,8 +173,7 @@ function use_artifact_cuda()
 
     __nvdisasm[] = artifact_binary(artifact.dir, "nvdisasm")
     @assert isfile(__nvdisasm[])
-    __memcheck[] = artifact_binary(artifact.dir, "cuda-memcheck")
-    @assert isfile(__memcheck[])
+    __compute_sanitizer[] = artifact_binary(artifact.dir, "compute-sanitizer")
 
     __libcupti[] = artifact_cuda_library(artifact.dir, "cupti", artifact.version)
     @assert isfile(__libcupti[])
@@ -221,7 +220,7 @@ function use_local_cuda()
         __nvdisasm[] = path
     end
 
-    __memcheck[] = find_cuda_binary("cuda-memcheck", cuda_dirs)
+    __compute_sanitizer[] = find_cuda_binary("compute-sanitizer", cuda_dirs)
 
     cuda_version = parse_toolkit_version("nvdisasm", __nvdisasm[])
     if cuda_version === nothing
diff --git a/test/codegen.jl b/test/codegen.jl
index 5f7d42daed..e0acbf142d 100644
--- a/test/codegen.jl
+++ b/test/codegen.jl
@@ -140,8 +140,8 @@ end
     valid_kernel() = return
     invalid_kernel() = 1
 
-    @not_if_memcheck @test CUDA.code_sass(devnull, valid_kernel, Tuple{}) == nothing
-    @not_if_memcheck @test_throws CUDA.KernelError CUDA.code_sass(devnull, invalid_kernel, Tuple{})
+    @not_if_sanitize @test CUDA.code_sass(devnull, valid_kernel, Tuple{}) == nothing
+    @not_if_sanitize @test_throws CUDA.KernelError CUDA.code_sass(devnull, invalid_kernel, Tuple{})
 end
 
 @testset "function name mangling" begin
@@ -149,13 +149,13 @@ end
 
     @eval kernel_341(ptr) = (@inbounds unsafe_store!(ptr, $(Symbol("dummy_^"))(unsafe_load(ptr))); nothing)
 
-    @not_if_memcheck CUDA.code_sass(devnull, kernel_341, Tuple{Ptr{Int}})
+    @not_if_sanitize CUDA.code_sass(devnull, kernel_341, Tuple{Ptr{Int}})
 end
 
 @testset "device runtime" begin
     kernel() = (CUDA.cudaGetLastError(); return)
 
-    @not_if_memcheck CUDA.code_sass(devnull, kernel, Tuple{})
+    @not_if_sanitize CUDA.code_sass(devnull, kernel, Tuple{})
 end
 
 end
diff --git a/test/cudadrv/module.jl b/test/cudadrv/module.jl
index 351555e6f8..def2d81fc9 100644
--- a/test/cudadrv/module.jl
+++ b/test/cudadrv/module.jl
@@ -24,7 +24,7 @@ let
     @test md != md2
 end
 
-@not_if_memcheck @test_throws_cuerror CUDA.ERROR_INVALID_IMAGE CuModule("foobar")
+@not_if_sanitize @test_throws_cuerror CUDA.ERROR_INVALID_IMAGE CuModule("foobar")
 
 @testset "globals" begin
     md = CuModuleFile(joinpath(@__DIR__, "ptx/global.ptx"))
@@ -54,11 +54,11 @@ end
     # TODO: test with valid object code
     # NOTE: apparently, on Windows cuLinkAddData! _does_ accept object data containing \0
     if !Sys.iswindows()
-        @not_if_memcheck @test_throws_cuerror CUDA.ERROR_UNKNOWN add_data!(link, "vadd_parent", UInt8[0])
+        @not_if_sanitize @test_throws_cuerror CUDA.ERROR_UNKNOWN add_data!(link, "vadd_parent", UInt8[0])
     end
 end
 
-@not_if_memcheck @testset "error log" begin
+@not_if_sanitize @testset "error log" begin
     @test_throws_message contains("ptxas fatal") CuError CuModule(".version 3.1")
 
     link = CuLink()
diff --git a/test/cudadrv/pool.jl b/test/cudadrv/pool.jl
index c4594f9f61..d3c9c744d0 100644
--- a/test/cudadrv/pool.jl
+++ b/test/cudadrv/pool.jl
@@ -1,4 +1,4 @@
-@not_if_memcheck let
+@not_if_sanitize let
     dev = device()
 
     pool = memory_pool(dev)
diff --git a/test/cutensor/contractions.jl b/test/cutensor/contractions.jl
index 84a7a200bb..6928b61ce3 100644
--- a/test/cutensor/contractions.jl
+++ b/test/cutensor/contractions.jl
@@ -2,6 +2,9 @@ using CUDA.CUTENSOR
 using CUDA
 using LinearAlgebra
 
+# these tests perform a lot of harmless-but-invalid API calls, polluting sanitizer logs
+@not_if_sanitize begin
+
 eltypes = ( (Float32, Float32, Float32, Float32),
             (Float32, Float32, Float32, Float16),
             (ComplexF32, ComplexF32, ComplexF32, ComplexF32),
@@ -196,3 +199,5 @@ can_pin = !Sys.iswindows()
         end
     end
 end
+
+end
diff --git a/test/examples.jl b/test/examples.jl
index d6902071e6..fa03f93a8a 100644
--- a/test/examples.jl
+++ b/test/examples.jl
@@ -1,3 +1,7 @@
+# NVIDIA bug 3263616: compute-sanitizer crashes when generating host backtraces,
+#                     but --show-backtrace=no does not survive execve.
+@not_if_sanitize begin
+
 # these tests spawn subprocesses, so reset the current context to conserve memory
 CUDA.release() == v"11.2" || CUDA.device_reset!()
 
@@ -28,3 +32,5 @@ cd(examples_dir) do
         @test success(pipeline(`$cmd $example`, stderr=stderr))
     end
 end
+
+end
diff --git a/test/exceptions.jl b/test/exceptions.jl
index 1141879117..5f7aaad6ce 100644
--- a/test/exceptions.jl
+++ b/test/exceptions.jl
@@ -1,3 +1,7 @@
+# NVIDIA bug 3263616: compute-sanitizer crashes when generating host backtraces,
+#                     but --show-backtrace=no does not survive execve.
+@not_if_sanitize begin
+
 # these tests spawn subprocesses, so reset the current context to conserve memory
 CUDA.release() == v"11.2" || CUDA.device_reset!()
 
@@ -83,3 +87,5 @@ let (code, out, err) = julia_script(script, `-g2`)
 end
 
 end
+
+end
diff --git a/test/execution.jl b/test/execution.jl
index 96afc68509..26ed2e4e8f 100644
--- a/test/execution.jl
+++ b/test/execution.jl
@@ -36,7 +36,7 @@ end
 @testset "compilation params" begin
     @cuda dummy()
 
-    @not_if_memcheck @test_throws CuError @cuda threads=2 maxthreads=1 dummy()
+    @not_if_sanitize @test_throws CuError @cuda threads=2 maxthreads=1 dummy()
     @cuda threads=2 dummy()
 end
 
@@ -58,14 +58,14 @@ end
     CUDA.code_warntype(devnull, dummy, Tuple{})
     CUDA.code_llvm(devnull, dummy, Tuple{})
     CUDA.code_ptx(devnull, dummy, Tuple{})
-    @not_if_memcheck CUDA.code_sass(devnull, dummy, Tuple{})
+    @not_if_sanitize CUDA.code_sass(devnull, dummy, Tuple{})
 
     @device_code_lowered @cuda dummy()
     @device_code_typed @cuda dummy()
     @device_code_warntype io=devnull @cuda dummy()
     @device_code_llvm io=devnull @cuda dummy()
     @device_code_ptx io=devnull @cuda dummy()
-    @not_if_memcheck @device_code_sass io=devnull @cuda dummy()
+    @not_if_sanitize @device_code_sass io=devnull @cuda dummy()
 
     mktempdir() do dir
         @device_code dir=dir @cuda dummy()
@@ -77,7 +77,7 @@ end
     @test occursin("julia_dummy", sprint(io->(@device_code_llvm io=io optimize=false @cuda dummy())))
     @test occursin("julia_dummy", sprint(io->(@device_code_llvm io=io @cuda dummy())))
     @test occursin("julia_dummy", sprint(io->(@device_code_ptx io=io @cuda dummy())))
-    @not_if_memcheck @test occursin("julia_dummy", sprint(io->(@device_code_sass io=io @cuda dummy())))
+    @not_if_sanitize @test occursin("julia_dummy", sprint(io->(@device_code_sass io=io @cuda dummy())))
 
     # make sure invalid kernels can be partially reflected upon
     let
diff --git a/test/initialization.jl b/test/initialization.jl
index 34eb7952dc..c1629b6d4d 100644
--- a/test/initialization.jl
+++ b/test/initialization.jl
@@ -3,7 +3,7 @@
 
 # the API shouldn't have been initialized
 @test CuCurrentContext() == nothing
-@not_if_memcheck @test CuCurrentDevice() == nothing
+@not_if_sanitize @test CuCurrentDevice() == nothing
 
 task_cb = Any[nothing for tid in 1:Threads.nthreads()]
 CUDA.attaskswitch() do
diff --git a/test/pool.jl b/test/pool.jl
index 6a69679663..3652109b01 100644
--- a/test/pool.jl
+++ b/test/pool.jl
@@ -1,6 +1,6 @@
 CUDA.alloc(0)
 
-@not_if_memcheck @test_throws OutOfGPUMemoryError CuArray{Int}(undef, 10^20)
+@not_if_sanitize @test_throws OutOfGPUMemoryError CuArray{Int}(undef, 10^20)
 
 @testset "@allocated" begin
     @test (CUDA.@allocated CuArray{Int32}(undef,1)) == 4
diff --git a/test/runtests.jl b/test/runtests.jl
index da4ab05783..0bb05debd7 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -35,7 +35,7 @@ if do_help
                --quickfail        Fail the entire run as soon as a single test errored.
                --jobs=N           Launch `N` processes to perform tests (default: Threads.nthreads()).
                --gpus=N           Expose `N` GPUs to test processes (default: 1).
-               --memcheck[=tool]  Run the tests under `cuda-memcheck`.
+               --sanitize[=tool]  Run the tests under `compute-sanitizer`.
                --snoop=FILE       Snoop on compiled methods and save to `FILE`.
 
                Remaining arguments filter the tests that will be executed.""")
@@ -43,7 +43,7 @@ if do_help
 end
 _, jobs = extract_flag!(ARGS, "--jobs", Threads.nthreads())
 _, gpus = extract_flag!(ARGS, "--gpus", 1)
-do_memcheck, memcheck_tool = extract_flag!(ARGS, "--memcheck", "memcheck")
+do_sanitize, sanitize_tool = extract_flag!(ARGS, "--sanitize", "memcheck")
 do_snoop, snoop_path = extract_flag!(ARGS, "--snoop")
 do_thorough, _ = extract_flag!(ARGS, "--thorough")
 do_quickfail, _ = extract_flag!(ARGS, "--quickfail")
@@ -96,6 +96,11 @@ if do_list
     end
     exit(0)
 end
+## no options should remain
+optlike_args = filter(startswith("-"), ARGS)
+if !isempty(optlike_args)
+    error("Unknown test options `$(join(optlike_args, " "))` (try `--help` for usage instructions)")
+end
 ## the remaining args filter tests
 if !isempty(ARGS)
   filter!(tests) do test
@@ -161,13 +166,6 @@ is_debug = ccall(:jl_is_debugbuild, Cint, ()) != 0
 if VERSION < v"1.5-" || first(picks).cap < v"7.0"
     push!(skip_tests, "device/wmma")
 end
-if do_memcheck
-    # CUFFT causes internal failures in cuda-memcheck
-    push!(skip_tests, "cufft")
-    # CUTENSOR tests result in illegal memory accesses unregistering memory
-    push!(skip_tests, "cutensor")
-    # there's also a bunch of `memcheck || ...` expressions in the tests themselves
-end
 if Sys.ARCH == :aarch64
     # CUFFT segfaults on ARM
     push!(skip_tests, "cufft")
@@ -206,10 +204,11 @@ if Base.JLOptions().project != C_NULL
 end
 const test_exename = popfirst!(test_exeflags.exec)
 function addworker(X; kwargs...)
-    exename = if do_memcheck
-        memcheck = CUDA.memcheck()
-        @info "Running under $(readchomp(`$memcheck --version`))"
-        `$memcheck --tool $memcheck_tool $test_exename`
+    exename = if do_sanitize
+        sanitizer = CUDA.compute_sanitizer()
+        @info "Running under $(readchomp(`$sanitizer --version`))"
+        # NVIDIA bug 3263616: compute-sanitizer crashes when generating host backtraces
+        `$sanitizer --tool $sanitize_tool --launch-timeout=0 --show-backtrace=no --target-processes=all $test_exename`
     else
         test_exename
     end
@@ -283,7 +282,7 @@ function print_testworker_stats(test, wrkr, resp)
     end
 end
 global print_testworker_started = (name, wrkr)->begin
-    if do_memcheck
+    if do_sanitize
         lock(print_lock)
         try
             printstyled(name, color=:white)
diff --git a/test/setup.jl b/test/setup.jl
index b32ca68be4..5d75d14a21 100644
--- a/test/setup.jl
+++ b/test/setup.jl
@@ -10,10 +10,10 @@ testf(f, xs...; kwargs...) = TestSuite.compare(f, CuArray, xs...; kwargs...)
 
 using Random
 
-# detect cuda-memcheck, to disable testts that are known to fail under cuda-memcheck
-# (e.g. those using CUPTI) or result in verbose output (deliberate API errors)
-macro not_if_memcheck(ex)
-    haskey(ENV, "CUDA_MEMCHECK") || return esc(ex)
+# detect compute-sanitizer, to disable incompatible tests (e.g. using CUPTI),
+# and to skip tests that are known to generate innocuous API errors
+macro not_if_sanitize(ex)
+    any(contains("NV_SANITIZER"), keys(ENV)) || return esc(ex)
     quote
         @test_skip $ex
     end

From dc810b53dfd22f56674f886163bc26a47a1a4b40 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim@juliacomputing.com>
Date: Tue, 9 Mar 2021 09:00:52 +0100
Subject: [PATCH 2/5] Pin the CUDA artifact so that we really use CUDA 11.2 for
 debug tests.

---
 .buildkite/pipeline.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 6c57c79913..dce4ec858a 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -38,6 +38,9 @@ steps:
       queue: "juliagpu"
       cuda: "11.2"                    # older versions of CUDA have issues
       cap: "recent"                   # test as much as possible
+    env:
+      JULIA_CUDA_VERSION: '11.2'
+      JULIA_CUDA_USE_BINARYBUILDER: 'true'
     if: build.message !~ /\[skip tests\]/
     timeout_in_minutes: 120
 

From 6b20aa469056df59c613e12849d4ca6967e1eb68 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim@juliacomputing.com>
Date: Tue, 9 Mar 2021 11:21:04 +0100
Subject: [PATCH 3/5] Don't use the stream-ordered allocator with
 compute-sanitizer.

Running under the sanitizer uses a lot of device memory,
so we need to be able to reset the device after every testset.
---
 .buildkite/pipeline.yml | 3 ++-
 src/pool.jl             | 3 +++
 src/state.jl            | 3 +--
 test/examples.jl        | 2 +-
 test/exceptions.jl      | 2 +-
 test/initialization.jl  | 2 +-
 test/setup.jl           | 4 +---
 7 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index dce4ec858a..1330d6fc33 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -27,7 +27,7 @@ steps:
           version: 1.6-nightly
       - JuliaCI/julia-test#v1:
           julia_args: "-g2"
-          test_args: "--sanitize"
+          test_args: "--sanitize --quickfail"
       - JuliaCI/julia-coverage#v1:
           codecov: true
           dirs:
@@ -39,6 +39,7 @@ steps:
       cuda: "11.2"                    # older versions of CUDA have issues
       cap: "recent"                   # test as much as possible
     env:
+      JULIA_CUDA_MEMORY_POOL: 'none'  # compute-sanitizer uses a lot of memory, so we need device_reset!
       JULIA_CUDA_VERSION: '11.2'
       JULIA_CUDA_USE_BINARYBUILDER: 'true'
     if: build.message !~ /\[skip tests\]/
diff --git a/src/pool.jl b/src/pool.jl
index 4ace097716..1810c60ae4 100644
--- a/src/pool.jl
+++ b/src/pool.jl
@@ -223,6 +223,9 @@ const pools = PerDevice{AbstractPool}(dev->begin
   pool
 end)
 
+# NVIDIA bug #3240770
+@memoize any_stream_ordered() = any(dev->pools[dev].stream_ordered, devices())
+
 
 ## interface
 
diff --git a/src/state.jl b/src/state.jl
index 46ef1f535f..16f1dc6f23 100644
--- a/src/state.jl
+++ b/src/state.jl
@@ -344,8 +344,7 @@ so it is generally not needed to subscribe to the reset hook specifically.
     this package.
 """
 function device_reset!(dev::CuDevice=device())
-    stream_ordered = any(dev->pools[dev].stream_ordered, devices())
-    if stream_ordered # NVIDIA bug #3240770
+    if any_stream_ordered()
         @error """Due to a bug in CUDA, resetting the device is not possible on CUDA 11.2 when using the stream-ordered memory allocator.
 
                   If you are calling this function to free memory, that may not be required anymore
diff --git a/test/examples.jl b/test/examples.jl
index fa03f93a8a..e9859322fe 100644
--- a/test/examples.jl
+++ b/test/examples.jl
@@ -3,7 +3,7 @@
 @not_if_sanitize begin
 
 # these tests spawn subprocesses, so reset the current context to conserve memory
-CUDA.release() == v"11.2" || CUDA.device_reset!()
+CUDA.any_stream_ordered() || CUDA.device_reset!()
 
 function find_sources(path::String, sources=String[])
     if isdir(path)
diff --git a/test/exceptions.jl b/test/exceptions.jl
index 5f7aaad6ce..5198c93478 100644
--- a/test/exceptions.jl
+++ b/test/exceptions.jl
@@ -3,7 +3,7 @@
 @not_if_sanitize begin
 
 # these tests spawn subprocesses, so reset the current context to conserve memory
-CUDA.release() == v"11.2" || CUDA.device_reset!()
+CUDA.any_stream_ordered() || CUDA.device_reset!()
 
 @testset "stack traces at different debug levels" begin
 
diff --git a/test/initialization.jl b/test/initialization.jl
index c1629b6d4d..f9bf3f5bcb 100644
--- a/test/initialization.jl
+++ b/test/initialization.jl
@@ -68,7 +68,7 @@ end
 
 reset_cb()
 
-if CUDA.release() != v"11.2"
+if !CUDA.any_stream_ordered()
     # NVIDIA bug #3240770
     device_reset!()
 
diff --git a/test/setup.jl b/test/setup.jl
index 5d75d14a21..a1d2257c7a 100644
--- a/test/setup.jl
+++ b/test/setup.jl
@@ -102,9 +102,7 @@ function runtests(f, name, time_source=:cuda, snoop=nothing)
         end
         res = vcat(collect(data), cpu_rss, gpu_rss)
 
-        if CUDA.release() != v"11.2" # NVIDIA bug #3240770
-            device_reset!()
-        end
+        CUDA.any_stream_ordered() || device_reset!()
         res
     finally
         if snoop !== nothing

From 392ac92d254117b27e24f3c66b88be5b1cbc611a Mon Sep 17 00:00:00 2001
From: Tim Besard <tim@juliacomputing.com>
Date: Tue, 9 Mar 2021 15:58:45 +0100
Subject: [PATCH 4/5] Don't use multiple jobs for the test suite under
 compute-sanitizer.

---
 .buildkite/pipeline.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 1330d6fc33..002730e63e 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -27,7 +27,7 @@ steps:
           version: 1.6-nightly
       - JuliaCI/julia-test#v1:
           julia_args: "-g2"
-          test_args: "--sanitize --quickfail"
+          test_args: "--sanitize --quickfail --jobs=1"
       - JuliaCI/julia-coverage#v1:
           codecov: true
           dirs:

From 86a6dc09a8dbb94a1b2bea409962d6b7e089ba57 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim@juliacomputing.com>
Date: Wed, 10 Mar 2021 08:46:09 +0100
Subject: [PATCH 5/5] Disable a hanging test.

---
 test/sorting.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/sorting.jl b/test/sorting.jl
index 68eaa7679a..b7155183f6 100644
--- a/test/sorting.jl
+++ b/test/sorting.jl
@@ -214,7 +214,8 @@ function test_sort(T, N, f=identity; kwargs...)
 end
 
 
-@testset "interface" begin
+# FIXME: these tests hang when running under compute-sanitizer on CUDA 11.2 with -g2
+@not_if_sanitize @testset "interface" begin
     # pre-sorted
     test_sort!(Int, 1000000)
     test_sort!(Int32, 1000000)