forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_cuda_memcheck.py
executable file
·162 lines (143 loc) · 6.64 KB
/
run_cuda_memcheck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/env python3
"""This script runs cuda-memcheck on the specified unit test. Each test case
is run in its isolated process with a timeout so that:
1) different test cases won't influence each other, and
2) in case of hang, the script would still finish in a finite amount of time.
The output will be written to a log file result.log
Example usage:
python run_cuda_memcheck.py ../test_torch.py 600
Note that running cuda-memcheck could be very slow.
"""
import asyncio
import torch
import multiprocessing
import argparse
import subprocess
import tqdm
import os
import sys
import cuda_memcheck_common as cmc
# Names of the test cases to run; filled in by the test-discovery step below.
ALL_TESTS = []
# Number of CUDA devices reported by torch; used to spread workers over GPUs.
GPUS = torch.cuda.device_count()

# parse arguments
# NOTE: adjacent string literals are concatenated verbatim, so every help
# fragment that continues on the next line must end with an explicit space.
parser = argparse.ArgumentParser(description="Run isolated cuda-memcheck on unit tests")
parser.add_argument('filename', help="the python file for a test, such as test_torch.py")
parser.add_argument('timeout', type=int, help='kill the test if it does not terminate in a certain amount of seconds')
parser.add_argument('--strict', action='store_true',
                    help='Whether to show cublas/cudnn errors. These errors are ignored by default because '
                         'cublas/cudnn does not run error-free under cuda-memcheck.')
parser.add_argument('--nproc', type=int, default=multiprocessing.cpu_count(),
                    help='Number of processes running tests, default to number of cores in the system')
parser.add_argument('--gpus', default='all',
                    help='GPU assignments for each process, it could be "all", or : separated list like "1,2:3,4:5,6"')
parser.add_argument('--ci', action='store_true',
                    help='Whether this script is executed in CI. When executed inside a CI, this script fails when '
                         'an error is detected. Also, it will not show tqdm progress bar, but directly print the error '
                         'to stdout instead.')
parser.add_argument('--nohang', action='store_true', help='Treat timeout as success')
parser.add_argument('--split', type=int, default=1, help='Split the job into pieces')
parser.add_argument('--rank', type=int, default=0, help='Which piece this process should pick')
args = parser.parse_args()

# Validate the chunking options up front: --split is used as a divisor when the
# test list is chunked, and a --rank outside [0, split) would select an empty
# (or wrong) slice and make the run look like a success without testing anything.
if args.split < 1:
    parser.error('--split must be a positive integer')
if not 0 <= args.rank < args.split:
    parser.error('--rank must satisfy 0 <= rank < split')
# Filter that ignores cublas/cudnn/cufft errors
# TODO (@zasdfgbnm): When can we remove this? Will cublas/cudnn run error-free under cuda-memcheck?
def is_ignored_only(output):
    """Tell whether every error in a cuda-memcheck report may be ignored.

    ``output`` is handed to ``cmc.parse``. An error counts as ignorable when
    the concatenation of its stack entries mentions ``libcublas``,
    ``libcudnn`` or ``libcufft``.

    Returns:
        True when the number of ignorable errors equals ``report.num_errors``;
        False otherwise, and also False whenever the output cannot be parsed
        (an unparseable report is never ignored).
    """
    try:
        report = cmc.parse(output)
    except cmc.ParseError:
        # The simple parser could not understand the cuda-memcheck output,
        # so play safe and treat the failure as a real one.
        return False

    ignorable_libraries = ('libcublas', 'libcudnn', 'libcufft')
    ignored_total = 0
    for error in report.errors:
        joined_stack = ''.join(error.stack)
        if any(library in joined_stack for library in ignorable_libraries):
            ignored_total += 1
    return ignored_total == report.num_errors
# Export PYTORCH_CUDA_MEMCHECK=1 so that tests are allowed to skip themselves
os.environ['PYTORCH_CUDA_MEMCHECK'] = '1'

# Discover tests:
# The list of tests is obtained by running
#     pytest --setup-only test/test_torch.py
# and scanning its standard output. Only lines containing '(fixtures used:'
# are treated as test entries.
proc = subprocess.Popen(
    ['pytest', '--setup-only', args.filename],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
stdout, stderr = proc.communicate()
for report_line in stdout.decode().strip().splitlines():
    if '(fixtures used:' not in report_line:
        continue
    # First whitespace-separated token of the line
    # (presumably the pytest node id, e.g. "file.py::Class::test" -- TODO confirm).
    entry = report_line.strip().split()[0]
    # Drop everything up to and including the first '::', then turn the
    # remaining '::' separators into dots.
    entry = entry[entry.find('::') + 2:].replace('::', '.')
    ALL_TESTS.append(entry)
# Do a simple filtering:
# if 'cpu' or 'CPU' is in the name and 'cuda' or 'CUDA' is not in the name, then skip it
def is_cpu_only(name):
    """Return True when *name* mentions 'cpu' but not 'cuda', ignoring case."""
    lowered = name.lower()
    if 'cuda' in lowered:
        return False
    return 'cpu' in lowered
# Drop the CPU-only tests and sort what is left, so that the chunk boundaries
# computed below depend only on the set of test names.
ALL_TESTS = sorted(test_name for test_name in ALL_TESTS if not is_cpu_only(test_name))

# Split all tests into args.split chunks (sizes rounded up) and keep only the
# chunk selected by args.rank.
chunk_size = (len(ALL_TESTS) + args.split - 1) // args.split
start = args.rank * chunk_size
end = start + chunk_size
ALL_TESTS = ALL_TESTS[start:end]
# Run tests:
# Since running cuda-memcheck on PyTorch unit tests is very slow, these tests must be run in parallel.
# This is done by using the coroutine feature in new Python versions. A number of coroutines are created;
# they create subprocesses and await their completion. The number of running subprocesses can be
# specified by the user and by default is the same as the number of CPUs in the machine.
# These subprocesses are balanced across the GPUs on the system by assigning one device per process,
# or as specified by the user.
# Index into ALL_TESTS of the next test to hand out to a worker coroutine.
progress = 0
if args.ci:
    # In CI everything is reported on stdout and no progress bar is drawn.
    logfile = sys.stdout

    class ProgressbarStub:
        """Stand-in for a tqdm progress bar whose update() does nothing."""

        def update(self, *args):
            return None

    progressbar = ProgressbarStub()
else:
    # Interactive use: results go to result.log, progress is shown with tqdm.
    logfile = open('result.log', 'w')
    progressbar = tqdm.tqdm(total=len(ALL_TESTS))
async def run1(coroutine_id):
    """Worker coroutine: run pending tests one at a time under cuda-memcheck.

    The worker repeatedly claims the next unclaimed entry of ``ALL_TESTS``
    (tracked by the module-level ``progress`` counter), runs it in a fresh
    subprocess with ``CUDA_VISIBLE_DEVICES`` set to this worker's GPU
    assignment, and writes a ``Success:``, ``Fail:``, ``Ignored:`` or
    ``Timeout:`` line to ``logfile``.

    Args:
        coroutine_id: zero-based index of this worker; selects its GPU
            assignment.

    Raises:
        ValueError: if ``--gpus`` is an explicit list whose number of
            ``:``-separated entries differs from ``--nproc``.
        SystemExit: in ``--ci`` mode, on the first displayed failure, or on
            the first timeout unless ``--nohang`` was given.
    """
    global progress

    if args.gpus == 'all':
        gpuid = coroutine_id % GPUS
    else:
        gpu_assignments = args.gpus.split(':')
        # Explicit check instead of ``assert`` so the validation is not
        # stripped when Python runs with -O.
        if args.nproc != len(gpu_assignments):
            raise ValueError('Please specify GPU assignmnent for each process, separated by :')
        gpuid = gpu_assignments[coroutine_id]

    # The child inherits the current environment plus its GPU assignment.
    child_env = dict(os.environ, CUDA_VISIBLE_DEVICES=str(gpuid))

    while progress < len(ALL_TESTS):
        # There is no await between reading and bumping the counter, so two
        # workers can never claim the same test.
        test = ALL_TESTS[progress]
        progress += 1

        # Launch cuda-memcheck directly with an argument list instead of a
        # shell command line: the file name and test name need no quoting,
        # and proc.kill() below targets cuda-memcheck itself rather than an
        # intermediate shell.
        proc = await asyncio.create_subprocess_exec(
            'cuda-memcheck', '--error-exitcode', '1', 'python', args.filename, test,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=child_env,
        )
        try:
            raw_out, raw_err = await asyncio.wait_for(proc.communicate(), args.timeout)
        except asyncio.TimeoutError:
            print('Timeout:', test, file=logfile)
            try:
                proc.kill()
            except ProcessLookupError:
                # The process ended on its own right after the timeout fired.
                pass
            if args.ci and not args.nohang:
                sys.exit("Hang detected on cuda-memcheck")
        else:
            if proc.returncode == 0:
                print('Success:', test, file=logfile)
            else:
                # Tool output is not guaranteed to be valid UTF-8; never let a
                # stray byte abort the whole run.
                out_text = raw_out.decode(errors='replace')
                err_text = raw_err.decode(errors='replace')
                should_display = args.strict or not is_ignored_only(out_text)
                if should_display:
                    print('Fail:', test, file=logfile)
                    print(out_text, file=logfile)
                    print(err_text, file=logfile)
                    if args.ci:
                        sys.exit("Failure detected on cuda-memcheck")
                else:
                    print('Ignored:', test, file=logfile)
        progressbar.update(1)
async def main():
    """Start ``args.nproc`` ``run1`` workers and wait for each of them in turn."""
    workers = [
        asyncio.ensure_future(run1(worker_id))
        for worker_id in range(args.nproc)
    ]
    # All workers are already scheduled at this point; awaiting them one by
    # one only collects their completion (and re-raises their exceptions).
    for worker in workers:
        await worker
if __name__ == '__main__':
    # Create and install the event loop explicitly. Calling
    # asyncio.get_event_loop() when no loop exists is deprecated and is an
    # error on recent Python versions.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(main())