Skip to content

Commit c3c7c4c

Browse files
authored
[Profiler] Fix graph_executor_debug hang (#12382)
For some operations such as `__nop` or `__copy` the measured inference time is equal to 0. In this case we are in infinite loop and we won't exit from it. Added new parameter `limit_zero_time_iterations ` which specify the maximum number of repeats then the inference time is equal to 0. When we exceed this value then we will exit from a loop.
1 parent 369e8b2 commit c3c7c4c

File tree

11 files changed

+159
-61
lines changed

11 files changed

+159
-61
lines changed

include/tvm/runtime/profiling.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,8 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
573573
* minimum duration requirement of one `repeat`.
574574
* i.e., When the run time of one `repeat` falls below this time,
575575
* the `number` parameter will be automatically increased.
576+
* \param limit_zero_time_iterations The maximum number of repeats when
577+
* measured time is equal to 0. It helps to avoid hanging during measurements.
576578
* \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats
577579
* defined by `repeats_to_cooldown`.
578580
* \param repeats_to_cooldown The number of repeats before the
@@ -582,8 +584,8 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
582584
* \return f_timer A timer function.
583585
*/
584586
PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
585-
int cooldown_interval_ms, int repeats_to_cooldown,
586-
PackedFunc f_preproc = nullptr);
587+
int limit_zero_time_iterations, int cooldown_interval_ms,
588+
int repeats_to_cooldown, PackedFunc f_preproc = nullptr);
587589

588590
} // namespace profiling
589591
} // namespace runtime

python/tvm/contrib/debugger/debug_executor.py

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,15 @@ def _run_per_layer(self):
223223
output_tensors.append(self._get_node_output(i, j))
224224
self.debug_datum.update_output_tensors(output_tensors)
225225

226-
def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown):
226+
def _run_debug(
227+
self,
228+
number,
229+
repeat,
230+
min_repeat_ms,
231+
limit_zero_time_iterations,
232+
cooldown_interval_ms,
233+
repeats_to_cooldown,
234+
):
227235
"""Execute the node specified with index will be executed.
228236
Each debug output will be copied to the buffer
229237
Time consumed for each execution will be set as debug output.
@@ -233,6 +241,7 @@ def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeat
233241
number=number,
234242
repeat=repeat,
235243
min_repeat_ms=min_repeat_ms,
244+
limit_zero_time_iterations=limit_zero_time_iterations,
236245
cooldown_interval_ms=cooldown_interval_ms,
237246
repeats_to_cooldown=repeats_to_cooldown,
238247
)
@@ -272,6 +281,7 @@ def run(
272281
number=10,
273282
repeat=1,
274283
min_repeat_ms=1,
284+
limit_zero_time_iterations=100,
275285
cooldown_interval_ms=0,
276286
repeats_to_cooldown=1,
277287
**input_dict,
@@ -299,6 +309,10 @@ def run(
299309
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
300310
will be automatically increased.
301311
312+
limit_zero_time_iterations: int, optional
313+
The maximum number of repeats when measured time is equal to 0.
314+
It helps to avoid hanging during measurements.
315+
302316
cooldown_interval_ms: int, optional
303317
The cooldown interval in milliseconds between the number of repeats defined by
304318
`repeats_to_cooldown`.
@@ -317,6 +331,7 @@ def run(
317331
number=number,
318332
repeat=repeat,
319333
min_repeat_ms=min_repeat_ms,
334+
limit_zero_time_iterations=limit_zero_time_iterations,
320335
cooldown_interval_ms=cooldown_interval_ms,
321336
repeats_to_cooldown=repeats_to_cooldown,
322337
)
@@ -328,7 +343,13 @@ def run(
328343
self.debug_datum.display_debug_result()
329344

330345
def run_individual(
331-
self, number, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0, repeats_to_cooldown=1
346+
self,
347+
number,
348+
repeat=1,
349+
min_repeat_ms=0,
350+
limit_zero_time_iterations=100,
351+
cooldown_interval_ms=0,
352+
repeats_to_cooldown=1,
332353
):
333354
"""Run each operation in the graph and get the time per op for all ops.
334355
@@ -351,6 +372,10 @@ def run_individual(
351372
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
352373
will be automatically increased.
353374
375+
limit_zero_time_iterations: int, optional
376+
The maximum number of repeats when measured time is equal to 0.
377+
It helps to avoid hanging during measurements.
378+
354379
cooldown_interval_ms: int, optional
355380
The cooldown interval in milliseconds between the number of repeats defined by
356381
`repeats_to_cooldown`.
@@ -364,7 +389,12 @@ def run_individual(
364389
the repeat of the measurement.
365390
"""
366391
res = self._run_individual(
367-
number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
392+
number,
393+
repeat,
394+
min_repeat_ms,
395+
limit_zero_time_iterations,
396+
cooldown_interval_ms,
397+
repeats_to_cooldown,
368398
)
369399
results = []
370400
offset = 0
@@ -384,6 +414,7 @@ def run_individual_node(
384414
number=10,
385415
repeat=1,
386416
min_repeat_ms=0,
417+
limit_zero_time_iterations=100,
387418
cooldown_interval_ms=0,
388419
repeats_to_cooldown=1,
389420
):
@@ -415,6 +446,10 @@ def run_individual_node(
415446
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
416447
will be automatically increased.
417448
449+
limit_zero_time_iterations: int, optional
450+
The maximum number of repeats when measured time is equal to 0.
451+
It helps to avoid hanging during measurements.
452+
418453
cooldown_interval_ms: int, optional
419454
The cooldown interval in milliseconds between the number of repeats defined by
420455
`repeats_to_cooldown`.
@@ -428,7 +463,13 @@ def run_individual_node(
428463
"""
429464
# Results are returned as serialized strings which we deserialize
430465
res = self._run_individual_node(
431-
index, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
466+
index,
467+
number,
468+
repeat,
469+
min_repeat_ms,
470+
limit_zero_time_iterations,
471+
cooldown_interval_ms,
472+
repeats_to_cooldown,
432473
)
433474
fmt = "@" + ("d" * repeat)
434475
results = struct.unpack(fmt, res)

python/tvm/contrib/graph_executor.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,7 @@ def benchmark(
355355
repeat=5,
356356
number=5,
357357
min_repeat_ms=None,
358+
limit_zero_time_iterations=100,
358359
end_to_end=False,
359360
cooldown_interval_ms=0,
360361
repeats_to_cooldown=1,
@@ -402,6 +403,10 @@ def benchmark(
402403
milliseconds. This can be used to ensure that the function is run enough to get an
403404
accurate measurement.
404405
406+
limit_zero_time_iterations : Optional[int]
407+
The maximum number of repeats when measured time is equal to 0.
408+
It helps to avoid hanging during measurements.
409+
405410
end_to_end : bool
406411
If set, include time to transfer input tensors to the device and time to transfer
407412
returned tensors in the total runtime. This will give accurate timings for end to end
@@ -437,6 +442,7 @@ def benchmark(
437442
repeat=repeat,
438443
number=number,
439444
min_repeat_ms=min_repeat_ms,
445+
limit_zero_time_iterations=limit_zero_time_iterations,
440446
)(device.device_type % rpc_base.RPC_SESS_MASK, device.device_id, *args)
441447
if kwargs:
442448
self.set_input(**kwargs)
@@ -446,6 +452,7 @@ def benchmark(
446452
repeat=repeat,
447453
number=number,
448454
min_repeat_ms=min_repeat_ms,
455+
limit_zero_time_iterations=limit_zero_time_iterations,
449456
cooldown_interval_ms=cooldown_interval_ms,
450457
repeats_to_cooldown=repeats_to_cooldown,
451458
)()

python/tvm/runtime/module.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,7 @@ def time_evaluator(
277277
number=10,
278278
repeat=1,
279279
min_repeat_ms=0,
280+
limit_zero_time_iterations=100,
280281
cooldown_interval_ms=0,
281282
repeats_to_cooldown=1,
282283
f_preproc="",
@@ -310,6 +311,10 @@ def time_evaluator(
310311
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
311312
will be automatically increased.
312313
314+
limit_zero_time_iterations: int, optional
315+
The maximum number of repeats when measured time is equal to 0.
316+
It helps to avoid hanging during measurements.
317+
313318
cooldown_interval_ms: int, optional
314319
The cooldown interval in milliseconds between the number of repeats defined by
315320
`repeats_to_cooldown`.
@@ -340,6 +345,7 @@ def time_evaluator(
340345
number,
341346
repeat,
342347
min_repeat_ms,
348+
limit_zero_time_iterations,
343349
cooldown_interval_ms,
344350
repeats_to_cooldown,
345351
f_preproc,

python/tvm/runtime/vm.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,7 @@ def benchmark(
583583
repeat=5,
584584
number=5,
585585
min_repeat_ms=None,
586+
limit_zero_time_iterations=100,
586587
end_to_end=False,
587588
cooldown_interval_ms=0,
588589
repeats_to_cooldown=1,
@@ -630,6 +631,10 @@ def benchmark(
630631
milliseconds. This can be used to ensure that the function is run enough to get an
631632
accurate measurement.
632633
634+
limit_zero_time_iterations : Optional[int]
635+
The maximum number of repeats when measured time is equal to 0.
636+
It helps to avoid hanging during measurements.
637+
633638
end_to_end : bool
634639
If set, include time to transfer input tensors to the device and time to transfer
635640
returned tensors in the total runtime. This will give accurate timings for end to end
@@ -672,6 +677,7 @@ def benchmark(
672677
repeat=repeat,
673678
number=number,
674679
min_repeat_ms=min_repeat_ms,
680+
limit_zero_time_iterations=limit_zero_time_iterations,
675681
)(func_name, device.device_type % RPC_SESS_MASK, device.device_id, *packed_args)
676682
if args or kwargs:
677683
self.set_input(func_name, *args, **kwargs)
@@ -681,6 +687,7 @@ def benchmark(
681687
repeat=repeat,
682688
number=number,
683689
min_repeat_ms=min_repeat_ms,
690+
limit_zero_time_iterations=limit_zero_time_iterations,
684691
cooldown_interval_ms=cooldown_interval_ms,
685692
repeats_to_cooldown=repeats_to_cooldown,
686693
)(func_name)

src/runtime/crt/common/crt_runtime_api.c

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
#include <assert.h>
2323
#include <inttypes.h>
24+
#include <math.h>
2425
#include <stdarg.h>
2526
#include <stdbool.h>
2627
#include <stdio.h>
@@ -477,6 +478,7 @@ typedef struct {
477478
int number;
478479
int repeat;
479480
int min_repeat_ms;
481+
int limit_zero_time_iterations;
480482
int cooldown_interval_ms;
481483
int repeats_to_cooldown;
482484
} time_evaluator_state_t;
@@ -487,14 +489,14 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re
487489
int* ret_type_code) {
488490
ret_val[0].v_handle = NULL;
489491
ret_type_code[0] = kTVMNullptr;
490-
if (num_args < 10) {
492+
if (num_args < 11) {
491493
TVMAPIErrorf("not enough args");
492494
return kTvmErrorFunctionCallNumArguments;
493495
}
494496
if (type_codes[0] != kTVMModuleHandle || type_codes[1] != kTVMStr ||
495497
type_codes[2] != kTVMArgInt || type_codes[3] != kTVMArgInt || type_codes[4] != kTVMArgInt ||
496498
type_codes[5] != kTVMArgInt || type_codes[6] != kTVMArgInt || type_codes[7] != kTVMArgInt ||
497-
type_codes[8] != kTVMArgInt || type_codes[9] != kTVMStr) {
499+
type_codes[8] != kTVMArgInt || type_codes[9] != kTVMArgInt || type_codes[10] != kTVMStr) {
498500
TVMAPIErrorf("one or more invalid arg types");
499501
return kTvmErrorFunctionCallWrongArgType;
500502
}
@@ -506,8 +508,9 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re
506508
g_time_evaluator_state.number = args[4].v_int64;
507509
g_time_evaluator_state.repeat = args[5].v_int64;
508510
g_time_evaluator_state.min_repeat_ms = args[6].v_int64;
509-
g_time_evaluator_state.cooldown_interval_ms = args[7].v_int64;
510-
g_time_evaluator_state.repeats_to_cooldown = args[8].v_int64;
511+
g_time_evaluator_state.limit_zero_time_iterations = args[7].v_int64;
512+
g_time_evaluator_state.cooldown_interval_ms = args[8].v_int64;
513+
g_time_evaluator_state.repeats_to_cooldown = args[9].v_int64;
511514

512515
int ret_code =
513516
TVMModGetFunction(mod, name, /* query_imports */ 0, &g_time_evaluator_state.func_to_time);
@@ -556,6 +559,7 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue*
556559
double* iter = (double*)result_byte_arr->data;
557560
for (int i = 0; i < g_time_evaluator_state.repeat; i++) {
558561
double curr_res_seconds = 0.0;
562+
int absolute_zero_times = 0;
559563
// do-while structure ensures we run even when `min_repeat_ms` isn't set (i.e., is 0).
560564
do {
561565
if (curr_res_seconds > 0.0) {
@@ -588,7 +592,9 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue*
588592
if (err != kTvmErrorNoError) {
589593
goto release_and_return;
590594
}
591-
} while (curr_res_seconds < min_repeat_seconds);
595+
if (fpclassify(curr_res_seconds) == FP_ZERO) absolute_zero_times++;
596+
} while (curr_res_seconds < min_repeat_seconds &&
597+
absolute_zero_times < g_time_evaluator_state.limit_zero_time_iterations);
592598
double mean_exec_seconds = curr_res_seconds / g_time_evaluator_state.number;
593599
*iter = mean_exec_seconds;
594600
iter++;

0 commit comments

Comments
 (0)