[Profiler] Fix graph_executor_debug hang (apache#12382)

For some operations, such as `__nop` or `__copy`, the measured inference time is equal to 0. In this case the measurement loop never terminates and the executor hangs. This change adds a new parameter, `limit_zero_time_iterations`, which specifies the maximum number of repeats allowed when the measured inference time is equal to 0. Once this limit is reached, the loop exits.
cjia4 · Aug 12, 2022 · c3c7c4c · c3c7c4c
1 parent 369e8b2
commit c3c7c4c
Show file tree

Hide file tree

Showing 11 changed files with 159 additions and 61 deletions.
diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
@@ -573,6 +573,8 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
  *        minimum duration requirement of one `repeat`.
  *        i.e., When the run time of one `repeat` falls below this time,
  *        the `number` parameter will be automatically increased.
+ * \param limit_zero_time_iterations The maximum number of repeats when
+ *        measured time is equal to 0.  It helps to avoid hanging during measurements.
  * \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats
  *        defined by `repeats_to_cooldown`.
  * \param repeats_to_cooldown The number of repeats before the
@@ -582,8 +584,8 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
  * \return f_timer A timer function.
  */
 PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
-                             int cooldown_interval_ms, int repeats_to_cooldown,
-                             PackedFunc f_preproc = nullptr);
+                             int limit_zero_time_iterations, int cooldown_interval_ms,
+                             int repeats_to_cooldown, PackedFunc f_preproc = nullptr);
 
 }  // namespace profiling
 }  // namespace runtime

diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py
@@ -223,7 +223,15 @@ def _run_per_layer(self):
                 output_tensors.append(self._get_node_output(i, j))
         self.debug_datum.update_output_tensors(output_tensors)
 
-    def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown):
+    def _run_debug(
+        self,
+        number,
+        repeat,
+        min_repeat_ms,
+        limit_zero_time_iterations,
+        cooldown_interval_ms,
+        repeats_to_cooldown,
+    ):
         """Execute the node specified with index will be executed.
         Each debug output will be copied to the buffer
         Time consumed for each execution will be set as debug output.
@@ -233,6 +241,7 @@ def _run_debug(self, number, repeat, min_repeat_ms, cooldown_interval_ms, repeat
             number=number,
             repeat=repeat,
             min_repeat_ms=min_repeat_ms,
+            limit_zero_time_iterations=limit_zero_time_iterations,
             cooldown_interval_ms=cooldown_interval_ms,
             repeats_to_cooldown=repeats_to_cooldown,
         )
@@ -272,6 +281,7 @@ def run(
         number=10,
         repeat=1,
         min_repeat_ms=1,
+        limit_zero_time_iterations=100,
         cooldown_interval_ms=0,
         repeats_to_cooldown=1,
         **input_dict,
@@ -299,6 +309,10 @@ def run(
             i.e., When the run time of one `repeat` falls below this time, the `number` parameter
             will be automatically increased.
 
+        limit_zero_time_iterations: int, optional
+            The maximum number of repeats when measured time is equal to 0.
+            It helps to avoid hanging during measurements.
+
         cooldown_interval_ms: int, optional
             The cooldown interval in milliseconds between the number of repeats defined by
             `repeats_to_cooldown`.
@@ -317,6 +331,7 @@ def run(
             number=number,
             repeat=repeat,
             min_repeat_ms=min_repeat_ms,
+            limit_zero_time_iterations=limit_zero_time_iterations,
             cooldown_interval_ms=cooldown_interval_ms,
             repeats_to_cooldown=repeats_to_cooldown,
         )
@@ -328,7 +343,13 @@ def run(
         self.debug_datum.display_debug_result()
 
     def run_individual(
-        self, number, repeat=1, min_repeat_ms=0, cooldown_interval_ms=0, repeats_to_cooldown=1
+        self,
+        number,
+        repeat=1,
+        min_repeat_ms=0,
+        limit_zero_time_iterations=100,
+        cooldown_interval_ms=0,
+        repeats_to_cooldown=1,
     ):
         """Run each operation in the graph and get the time per op for all ops.
 
@@ -351,6 +372,10 @@ def run_individual(
             i.e., When the run time of one `repeat` falls below this time, the `number` parameter
             will be automatically increased.
 
+        limit_zero_time_iterations: int, optional
+            The maximum number of repeats when measured time is equal to 0.
+            It helps to avoid hanging during measurements.
+
         cooldown_interval_ms: int, optional
             The cooldown interval in milliseconds between the number of repeats defined by
             `repeats_to_cooldown`.
@@ -364,7 +389,12 @@ def run_individual(
         the repeat of the measurement.
         """
         res = self._run_individual(
-            number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
+            number,
+            repeat,
+            min_repeat_ms,
+            limit_zero_time_iterations,
+            cooldown_interval_ms,
+            repeats_to_cooldown,
         )
         results = []
         offset = 0
@@ -384,6 +414,7 @@ def run_individual_node(
         number=10,
         repeat=1,
         min_repeat_ms=0,
+        limit_zero_time_iterations=100,
         cooldown_interval_ms=0,
         repeats_to_cooldown=1,
     ):
@@ -415,6 +446,10 @@ def run_individual_node(
             i.e., When the run time of one `repeat` falls below this time, the `number` parameter
             will be automatically increased.
 
+        limit_zero_time_iterations: int, optional
+            The maximum number of repeats when measured time is equal to 0.
+            It helps to avoid hanging during measurements.
+
         cooldown_interval_ms: int, optional
             The cooldown interval in milliseconds between the number of repeats defined by
             `repeats_to_cooldown`.
@@ -428,7 +463,13 @@ def run_individual_node(
         """
         # Results are returned as serialized strings which we deserialize
         res = self._run_individual_node(
-            index, number, repeat, min_repeat_ms, cooldown_interval_ms, repeats_to_cooldown
+            index,
+            number,
+            repeat,
+            min_repeat_ms,
+            limit_zero_time_iterations,
+            cooldown_interval_ms,
+            repeats_to_cooldown,
         )
         fmt = "@" + ("d" * repeat)
         results = struct.unpack(fmt, res)

diff --git a/python/tvm/contrib/graph_executor.py b/python/tvm/contrib/graph_executor.py
@@ -355,6 +355,7 @@ def benchmark(
         repeat=5,
         number=5,
         min_repeat_ms=None,
+        limit_zero_time_iterations=100,
         end_to_end=False,
         cooldown_interval_ms=0,
         repeats_to_cooldown=1,
@@ -402,6 +403,10 @@ def benchmark(
             milliseconds. This can be used to ensure that the function is run enough to get an
             accurate measurement.
 
+        limit_zero_time_iterations : Optional[int]
+            The maximum number of repeats when measured time is equal to 0.
+            It helps to avoid hanging during measurements.
+
         end_to_end : bool
             If set, include time to transfer input tensors to the device and time to transfer
             returned tensors in the total runtime. This will give accurate timings for end to end
@@ -437,6 +442,7 @@ def benchmark(
                 repeat=repeat,
                 number=number,
                 min_repeat_ms=min_repeat_ms,
+                limit_zero_time_iterations=limit_zero_time_iterations,
             )(device.device_type % rpc_base.RPC_SESS_MASK, device.device_id, *args)
         if kwargs:
             self.set_input(**kwargs)
@@ -446,6 +452,7 @@ def benchmark(
             repeat=repeat,
             number=number,
             min_repeat_ms=min_repeat_ms,
+            limit_zero_time_iterations=limit_zero_time_iterations,
             cooldown_interval_ms=cooldown_interval_ms,
             repeats_to_cooldown=repeats_to_cooldown,
         )()
diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py
@@ -277,6 +277,7 @@ def time_evaluator(
         number=10,
         repeat=1,
         min_repeat_ms=0,
+        limit_zero_time_iterations=100,
         cooldown_interval_ms=0,
         repeats_to_cooldown=1,
         f_preproc="",
@@ -310,6 +311,10 @@ def time_evaluator(
             i.e., When the run time of one `repeat` falls below this time, the `number` parameter
             will be automatically increased.
 
+        limit_zero_time_iterations: int, optional
+            The maximum number of repeats when measured time is equal to 0.
+            It helps to avoid hanging during measurements.
+
         cooldown_interval_ms: int, optional
             The cooldown interval in milliseconds between the number of repeats defined by
             `repeats_to_cooldown`.
@@ -340,6 +345,7 @@ def time_evaluator(
                 number,
                 repeat,
                 min_repeat_ms,
+                limit_zero_time_iterations,
                 cooldown_interval_ms,
                 repeats_to_cooldown,
                 f_preproc,

diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
@@ -583,6 +583,7 @@ def benchmark(
         repeat=5,
         number=5,
         min_repeat_ms=None,
+        limit_zero_time_iterations=100,
         end_to_end=False,
         cooldown_interval_ms=0,
         repeats_to_cooldown=1,
@@ -630,6 +631,10 @@ def benchmark(
             milliseconds. This can be used to ensure that the function is run enough to get an
             accurate measurement.
 
+        limit_zero_time_iterations : Optional[int]
+            The maximum number of repeats when measured time is equal to 0.
+            It helps to avoid hanging during measurements.
+
         end_to_end : bool
             If set, include time to transfer input tensors to the device and time to transfer
             returned tensors in the total runtime. This will give accurate timings for end to end
@@ -672,6 +677,7 @@ def benchmark(
                 repeat=repeat,
                 number=number,
                 min_repeat_ms=min_repeat_ms,
+                limit_zero_time_iterations=limit_zero_time_iterations,
             )(func_name, device.device_type % RPC_SESS_MASK, device.device_id, *packed_args)
         if args or kwargs:
             self.set_input(func_name, *args, **kwargs)
@@ -681,6 +687,7 @@ def benchmark(
             repeat=repeat,
             number=number,
             min_repeat_ms=min_repeat_ms,
+            limit_zero_time_iterations=limit_zero_time_iterations,
             cooldown_interval_ms=cooldown_interval_ms,
             repeats_to_cooldown=repeats_to_cooldown,
         )(func_name)
diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c
@@ -21,6 +21,7 @@
 
 #include <assert.h>
 #include <inttypes.h>
+#include <math.h>
 #include <stdarg.h>
 #include <stdbool.h>
 #include <stdio.h>
@@ -477,6 +478,7 @@ typedef struct {
   int number;
   int repeat;
   int min_repeat_ms;
+  int limit_zero_time_iterations;
   int cooldown_interval_ms;
   int repeats_to_cooldown;
 } time_evaluator_state_t;
@@ -487,14 +489,14 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re
                      int* ret_type_code) {
   ret_val[0].v_handle = NULL;
   ret_type_code[0] = kTVMNullptr;
-  if (num_args < 10) {
+  if (num_args < 11) {
     TVMAPIErrorf("not enough args");
     return kTvmErrorFunctionCallNumArguments;
   }
   if (type_codes[0] != kTVMModuleHandle || type_codes[1] != kTVMStr ||
       type_codes[2] != kTVMArgInt || type_codes[3] != kTVMArgInt || type_codes[4] != kTVMArgInt ||
       type_codes[5] != kTVMArgInt || type_codes[6] != kTVMArgInt || type_codes[7] != kTVMArgInt ||
-      type_codes[8] != kTVMArgInt || type_codes[9] != kTVMStr) {
+      type_codes[8] != kTVMArgInt || type_codes[9] != kTVMArgInt || type_codes[10] != kTVMStr) {
     TVMAPIErrorf("one or more invalid arg types");
     return kTvmErrorFunctionCallWrongArgType;
   }
@@ -506,8 +508,9 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re
   g_time_evaluator_state.number = args[4].v_int64;
   g_time_evaluator_state.repeat = args[5].v_int64;
   g_time_evaluator_state.min_repeat_ms = args[6].v_int64;
-  g_time_evaluator_state.cooldown_interval_ms = args[7].v_int64;
-  g_time_evaluator_state.repeats_to_cooldown = args[8].v_int64;
+  g_time_evaluator_state.limit_zero_time_iterations = args[7].v_int64;
+  g_time_evaluator_state.cooldown_interval_ms = args[8].v_int64;
+  g_time_evaluator_state.repeats_to_cooldown = args[9].v_int64;
 
   int ret_code =
       TVMModGetFunction(mod, name, /* query_imports */ 0, &g_time_evaluator_state.func_to_time);
@@ -556,6 +559,7 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue*
   double* iter = (double*)result_byte_arr->data;
   for (int i = 0; i < g_time_evaluator_state.repeat; i++) {
     double curr_res_seconds = 0.0;
+    int absolute_zero_times = 0;
     // do-while structure ensures we run even when `min_repeat_ms` isn't set (i.e., is 0).
     do {
       if (curr_res_seconds > 0.0) {
@@ -588,7 +592,9 @@ tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue*
       if (err != kTvmErrorNoError) {
         goto release_and_return;
       }
-    } while (curr_res_seconds < min_repeat_seconds);
+      if (fpclassify(curr_res_seconds) == FP_ZERO) absolute_zero_times++;
+    } while (curr_res_seconds < min_repeat_seconds &&
+             absolute_zero_times < g_time_evaluator_state.limit_zero_time_iterations);
     double mean_exec_seconds = curr_res_seconds / g_time_evaluator_state.number;
     *iter = mean_exec_seconds;
     iter++;