Skip to content

Commit

Permalink
[Profiler] Fix graph_executor_debug hang (apache#12382)
Browse files Browse the repository at this point in the history
For some operations, such as `__nop` or `__copy`, the measured inference
time is equal to 0. In this case we enter an infinite loop and never
exit from it. Added a new parameter, `limit_zero_time_iterations`, which specifies the
maximum number of repeats when the inference time is equal to 0. When
this value is exceeded, we exit the loop.
  • Loading branch information
echuraev authored Aug 12, 2022
1 parent 369e8b2 commit c3c7c4c
Show file tree
Hide file tree
Showing 11 changed files with 159 additions and 61 deletions.
6 changes: 4 additions & 2 deletions include/tvm/runtime/profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,8 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
* minimum duration requirement of one `repeat`.
* i.e., When the run time of one `repeat` falls below this time,
* the `number` parameter will be automatically increased.
* \param limit_zero_time_iterations The maximum number of repeats when
* measured time is equal to 0. It helps to avoid hanging during measurements.
* \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats
* defined by `repeats_to_cooldown`.
* \param repeats_to_cooldown The number of repeats before the
Expand All @@ -582,8 +584,8 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
* \return f_timer A timer function.
*/
PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
int cooldown_interval_ms, int repeats_to_cooldown,
PackedFunc f_preproc = nullptr);
int limit_zero_time_iterations, int cooldown_interval_ms,
int repeats_to_cooldown, PackedFunc f_preproc = nullptr);

} // namespace profiling
} // namespace runtime
Expand Down
49 changes: 45 additions & 4 deletions python/tvm/contrib/debugger/debug_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,15 @@ def _run_per_layer(self):
output_tensors.append(self._get_node_output(i, j))
self.debug_datum.update_output_tensors(output_tensors)

def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown):
def _run_debug(
self,
number,
repeat,
min_repeat_ms,
limit_zero_time_iterations,
cooldown_interval_ms,
repeats_to_cooldown,
):
"""Execute the node specified with index will be executed.
Each debug output will be copied to the buffer
Time consumed for each execution will be set as debug output.
Expand All @@ -233,6 +241,7 @@ def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeat
number=number,
repeat=repeat,
min_repeat_ms=min_repeat_ms,
limit_zero_time_iterations=limit_zero_time_iterations,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)
Expand Down Expand Up @@ -272,6 +281,7 @@ def run(
number=10,
repeat=1,
min_repeat_ms=1,
limit_zero_time_iterations=100,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
**input_dict,
Expand Down Expand Up @@ -299,6 +309,10 @@ def run(
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
limit_zero_time_iterations: int, optional
The maximum number of repeats when measured time is equal to 0.
It helps to avoid hanging during measurements.
cooldown_interval_ms: int, optional
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
Expand All @@ -317,6 +331,7 @@ def run(
number=number,
repeat=repeat,
min_repeat_ms=min_repeat_ms,
limit_zero_time_iterations=limit_zero_time_iterations,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)
Expand All @@ -328,7 +343,13 @@ def run(
self.debug_datum.display_debug_result()

def run_individual(
self, number, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0, repeats_to_cooldown=1
self,
number,
repeat=1,
min_repeat_ms=0,
limit_zero_time_iterations=100,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
):
"""Run each operation in the graph and get the time per op for all ops.
Expand All @@ -351,6 +372,10 @@ def run_individual(
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
limit_zero_time_iterations: int, optional
The maximum number of repeats when measured time is equal to 0.
It helps to avoid hanging during measurements.
cooldown_interval_ms: int, optional
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
Expand All @@ -364,7 +389,12 @@ def run_individual(
the repeat of the measurement.
"""
res = self._run_individual(
number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
number,
repeat,
min_repeat_ms,
limit_zero_time_iterations,
cooldown_interval_ms,
repeats_to_cooldown,
)
results = []
offset = 0
Expand All @@ -384,6 +414,7 @@ def run_individual_node(
number=10,
repeat=1,
min_repeat_ms=0,
limit_zero_time_iterations=100,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
):
Expand Down Expand Up @@ -415,6 +446,10 @@ def run_individual_node(
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
limit_zero_time_iterations: int, optional
The maximum number of repeats when measured time is equal to 0.
It helps to avoid hanging during measurements.
cooldown_interval_ms: int, optional
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
Expand All @@ -428,7 +463,13 @@ def run_individual_node(
"""
# Results are returned as serialized strings which we deserialize
res = self._run_individual_node(
index, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
index,
number,
repeat,
min_repeat_ms,
limit_zero_time_iterations,
cooldown_interval_ms,
repeats_to_cooldown,
)
fmt = "@" + ("d" * repeat)
results = struct.unpack(fmt, res)
Expand Down
7 changes: 7 additions & 0 deletions python/tvm/contrib/graph_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,7 @@ def benchmark(
repeat=5,
number=5,
min_repeat_ms=None,
limit_zero_time_iterations=100,
end_to_end=False,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
Expand Down Expand Up @@ -402,6 +403,10 @@ def benchmark(
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.
limit_zero_time_iterations : Optional[int]
The maximum number of repeats when measured time is equal to 0.
It helps to avoid hanging during measurements.
end_to_end : bool
If set, include time to transfer input tensors to the device and time to transfer
returned tensors in the total runtime. This will give accurate timings for end to end
Expand Down Expand Up @@ -437,6 +442,7 @@ def benchmark(
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
limit_zero_time_iterations=limit_zero_time_iterations,
)(device.device_type % rpc_base.RPC_SESS_MASK, device.device_id, *args)
if kwargs:
self.set_input(**kwargs)
Expand All @@ -446,6 +452,7 @@ def benchmark(
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
limit_zero_time_iterations=limit_zero_time_iterations,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)()
6 changes: 6 additions & 0 deletions python/tvm/runtime/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ def time_evaluator(
number=10,
repeat=1,
min_repeat_ms=0,
limit_zero_time_iterations=100,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
f_preproc="",
Expand Down Expand Up @@ -310,6 +311,10 @@ def time_evaluator(
i.e., When the run time of one `repeat` falls below this time, the `number` parameter
will be automatically increased.
limit_zero_time_iterations: int, optional
The maximum number of repeats when measured time is equal to 0.
It helps to avoid hanging during measurements.
cooldown_interval_ms: int, optional
The cooldown interval in milliseconds between the number of repeats defined by
`repeats_to_cooldown`.
Expand Down Expand Up @@ -340,6 +345,7 @@ def time_evaluator(
number,
repeat,
min_repeat_ms,
limit_zero_time_iterations,
cooldown_interval_ms,
repeats_to_cooldown,
f_preproc,
Expand Down
7 changes: 7 additions & 0 deletions python/tvm/runtime/vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,7 @@ def benchmark(
repeat=5,
number=5,
min_repeat_ms=None,
limit_zero_time_iterations=100,
end_to_end=False,
cooldown_interval_ms=0,
repeats_to_cooldown=1,
Expand Down Expand Up @@ -630,6 +631,10 @@ def benchmark(
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.
limit_zero_time_iterations : Optional[int]
The maximum number of repeats when measured time is equal to 0.
It helps to avoid hanging during measurements.
end_to_end : bool
If set, include time to transfer input tensors to the device and time to transfer
returned tensors in the total runtime. This will give accurate timings for end to end
Expand Down Expand Up @@ -672,6 +677,7 @@ def benchmark(
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
limit_zero_time_iterations=limit_zero_time_iterations,
)(func_name, device.device_type % RPC_SESS_MASK, device.device_id, *packed_args)
if args or kwargs:
self.set_input(func_name, *args, **kwargs)
Expand All @@ -681,6 +687,7 @@ def benchmark(
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
limit_zero_time_iterations=limit_zero_time_iterations,
cooldown_interval_ms=cooldown_interval_ms,
repeats_to_cooldown=repeats_to_cooldown,
)(func_name)
16 changes: 11 additions & 5 deletions src/runtime/crt/common/crt_runtime_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
Expand Down Expand Up @@ -477,6 +478,7 @@ typedef struct {
int number;
int repeat;
int min_repeat_ms;
int limit_zero_time_iterations;
int cooldown_interval_ms;
int repeats_to_cooldown;
} time_evaluator_state_t;
Expand All @@ -487,14 +489,14 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re
int* ret_type_code) {
ret_val[0].v_handle = NULL;
ret_type_code[0] = kTVMNullptr;
if (num_args < 10) {
if (num_args < 11) {
TVMAPIErrorf("not enough args");
return kTvmErrorFunctionCallNumArguments;
}
if (type_codes[0] != kTVMModuleHandle || type_codes[1] != kTVMStr ||
type_codes[2] != kTVMArgInt || type_codes[3] != kTVMArgInt || type_codes[4] != kTVMArgInt ||
type_codes[5] != kTVMArgInt || type_codes[6] != kTVMArgInt || type_codes[7] != kTVMArgInt ||
type_codes[8] != kTVMArgInt || type_codes[9] != kTVMStr) {
type_codes[8] != kTVMArgInt || type_codes[9] != kTVMArgInt || type_codes[10] != kTVMStr) {
TVMAPIErrorf("one or more invalid arg types");
return kTvmErrorFunctionCallWrongArgType;
}
Expand All @@ -506,8 +508,9 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re
g_time_evaluator_state.number = args[4].v_int64;
g_time_evaluator_state.repeat = args[5].v_int64;
g_time_evaluator_state.min_repeat_ms = args[6].v_int64;
g_time_evaluator_state.cooldown_interval_ms = args[7].v_int64;
g_time_evaluator_state.repeats_to_cooldown = args[8].v_int64;
g_time_evaluator_state.limit_zero_time_iterations = args[7].v_int64;
g_time_evaluator_state.cooldown_interval_ms = args[8].v_int64;
g_time_evaluator_state.repeats_to_cooldown = args[9].v_int64;

int ret_code =
TVMModGetFunction(mod, name, /* query_imports */ 0, &g_time_evaluator_state.func_to_time);
Expand Down Expand Up @@ -556,6 +559,7 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue*
double* iter = (double*)result_byte_arr->data;
for (int i = 0; i < g_time_evaluator_state.repeat; i++) {
double curr_res_seconds = 0.0;
int absolute_zero_times = 0;
// do-while structure ensures we run even when `min_repeat_ms` isn't set (i.e., is 0).
do {
if (curr_res_seconds > 0.0) {
Expand Down Expand Up @@ -588,7 +592,9 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue*
if (err != kTvmErrorNoError) {
goto release_and_return;
}
} while (curr_res_seconds < min_repeat_seconds);
if (fpclassify(curr_res_seconds) == FP_ZERO) absolute_zero_times++;
} while (curr_res_seconds < min_repeat_seconds &&
absolute_zero_times < g_time_evaluator_state.limit_zero_time_iterations);
double mean_exec_seconds = curr_res_seconds / g_time_evaluator_state.number;
*iter = mean_exec_seconds;
iter++;
Expand Down
Loading

0 comments on commit c3c7c4c

Please sign in to comment.