Skip to content

Commit f332512

Browse files
author
Matthew Brookhart
committed
ONNX NMS working on GPU; had to remove threading from some kernels.
Follow-ups: fix lint; fix lambda lift tests; fix unit tests; respond to review comments; fix lint.
1 parent 4943e00 commit f332512

File tree

11 files changed

+304
-273
lines changed

11 files changed

+304
-273
lines changed

python/tvm/relay/backend/_backend.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -88,13 +88,6 @@ def _tensor_value_repr(tvalue):
8888
return str(tvalue.data.asnumpy())
8989

9090

91-
@tvm._ffi.register_func("relay._ndarray_repr")
92-
def _tensor_constant_repr(tvalue):
93-
tmp = tvalue.asnumpy()
94-
return "NDArray of shape " + str(tmp.shape) + " and dtype " + str(tmp.dtype) +"\n\t" + str(tmp)
95-
96-
97-
9891
@tvm._ffi.register_func("relay._constant_repr")
9992
def _tensor_constant_repr(tvalue):
10093
dtype = tvm.runtime.DataType(tvalue.data.dtype)

python/tvm/relay/frontend/onnx.py

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2241,6 +2241,17 @@ class NonMaxSuppression(OnnxOpConverter):
22412241

22422242
@classmethod
22432243
def _impl_v10(cls, inputs, attr, params):
2244+
"""
2245+
High level note: ONNX implements what TF calls combined_non_max_suppression
2246+
It passes in scores for each box for every class in the output and expects boxes to be
2247+
analyzed for each class independently
2248+
2249+
It also asks for the data to be returned in a particular format.
2250+
2251+
To support these, we implement a series of loops:
2252+
The first loop splits over class number, performs NMS, and collects the outputs.
2253+
The second (nested) loop takes the outputs and transforms them into the format ONNX wants
2254+
"""
22442255
# Get parameter values
22452256
boxes = inputs[0]
22462257
scores = inputs[1]
@@ -2270,17 +2281,17 @@ def conditionally_squeeze_scalar(x):
22702281
max_output_boxes_per_class = conditionally_squeeze_scalar(max_output_boxes_per_class)
22712282
iou_threshold = conditionally_squeeze_scalar(iou_threshold)
22722283
score_threshold = conditionally_squeeze_scalar(score_threshold)
2284+
2285+
## prepare utility constants
22732286
zero = _op.const(np.array([0]), dtype="int64")
22742287
one = _op.const(np.array([1]), dtype="int64")
2288+
two = _op.const(np.array([2]), dtype="int64")
22752289
three = _op.const(np.array([3]), dtype="int64")
2276-
two_ones = _op.const(np.array([1, 1]), dtype="int64")
22772290
three_ones = _op.const(np.array([1, 1, 1]), dtype="int64")
22782291
four_ones = _op.const(np.array([1, 1, 1, 1]), dtype="int64")
22792292

2280-
def pad_last_dim(x):
2281-
return _op.expand_dims(x, -1, 1)
2282-
2283-
# First Loop Vars
2293+
## First loop: split by class and perform NMS
2294+
# Create Loop Vars
22842295
i = _expr.var("i", shape=(1,), dtype="int64")
22852296
scores_var = _expr.var("scores_var", shape=(_ty.Any(), _ty.Any(), _ty.Any()), dtype=dtype)
22862297
boxes_var = _expr.var("boxes_var", shape=(_ty.Any(), _ty.Any(), 4), dtype=dtype)
@@ -2292,7 +2303,7 @@ def pad_last_dim(x):
22922303
B = _expr.var("B", shape=(1,), dtype="int64")
22932304
C = _expr.var("C", shape=(1,), dtype="int64")
22942305
S = _expr.var("S", shape=(1,), dtype="int64")
2295-
# Outputs of first loop should be padded nms values shape (B, C, 3)
2306+
# Outputs of first loop should be padded nms values shape (B, C, S, 3)
22962307
onnx_out = _expr.var("onnx_out", shape=(_ty.Any(), _ty.Any(), _ty.Any(), 3), dtype="int64")
22972308
# and sizes of valid outputs, shape (B, C, 1)
22982309
nms_size_out = _expr.var("nms_size_out", shape=(_ty.Any(), _ty.Any(), 1), dtype="int64")
@@ -2310,6 +2321,7 @@ def _first_cond(
23102321
onnx_out,
23112322
nms_size_out,
23122323
):
2324+
# Loop over classes, end when i == C
23132325
return _op.min(_op.less(i, C))
23142326

23152327
def _first_body(
@@ -2325,12 +2337,15 @@ def _first_body(
23252337
onnx_out,
23262338
nms_size_out,
23272339
):
2340+
# slice to get current class
23282341
begin = _op.concatenate([zero, i, zero], axis=0)
23292342
end = _op.concatenate([B, i + one, S], axis=0)
23302343
class_scores = _op.strided_slice(scores, begin, end, three_ones)
23312344
class_scores = _op.expand_dims(_op.squeeze(class_scores, [1]), -1, 1)
2345+
# combine scores and boxes
23322346
data = _op.concatenate([class_scores, boxes], axis=-1)
23332347

2348+
# get valid counts
23342349
ct, data, indices = _op.vision.get_valid_counts(
23352350
data, score_threshold=score_threshold, id_index=-1, score_index=0
23362351
)
@@ -2339,6 +2354,7 @@ def _first_body(
23392354
top_k = -1
23402355
# ONNX doesn't have class id for nms input
23412356
score_index = 0
2357+
# perform nms on current class
23422358
nms_ret = _op.vision.non_max_suppression(
23432359
data=data,
23442360
valid_count=ct,
@@ -2353,6 +2369,7 @@ def _first_body(
23532369
return_indices=True,
23542370
invalid_to_bottom=False,
23552371
)
2372+
# partially prepare ONNX output format by labeling batch_num, class_id
23562373
nms_padded_out = _op.expand_dims(nms_ret[0], -1, 1)
23572374
batch_num = _op.expand_dims(_op.arange(_op.squeeze(B, [0]), dtype="int64"), -1, 1)
23582375
batch_num = _op.broadcast_to(batch_num, _op.shape_of(nms_ret[0], dtype="int64"))
@@ -2362,6 +2379,7 @@ def _first_body(
23622379
[batch_num, class_num, _op.cast(nms_padded_out, "int64")], -1
23632380
)
23642381
new_onnx_out = _op.expand_dims(new_onnx_out, 1, 1)
2382+
# store valid nms outputs for this class
23652383
nms_size = _op.cast(nms_ret[1], "int64")
23662384
nms_size = _op.expand_dims(nms_size, 1, 1)
23672385
return [
@@ -2378,6 +2396,7 @@ def _first_body(
23782396
_op.concatenate([nms_size_out, nms_size], axis=1),
23792397
]
23802398

2399+
# create the first loop
23812400
first_loop = _loops.while_loop(
23822401
_first_cond,
23832402
[
@@ -2396,6 +2415,8 @@ def _first_body(
23962415
_first_body,
23972416
)
23982417

2418+
## Second loop slices outputs of the first loop for valid boxes and
2419+
## concats in the order ONNX wants
23992420
# Second inner Loop Vars
24002421
i = _expr.var("i", shape=(1,), dtype="int64")
24012422
j = _expr.var("j", shape=(1,), dtype="int64")
@@ -2408,14 +2429,17 @@ def _first_body(
24082429
out = _expr.var("out", shape=(_ty.Any(), 3), dtype="int64")
24092430

24102431
def _inner_cond(i, j, C, onnx_out, nms_size, out):
2432+
# inner loop over number of classes
24112433
return _op.min(_op.less(j, C))
24122434

24132435
def _inner_body(i, j, C, onnx_out, nms_size, out):
2414-
start = _op.concatenate([i, j, zero], axis=0)
2415-
end = _op.concatenate([i + one, j + one, one], axis=0)
2436+
# slice to get current batch and class for valid box indicator
2437+
start = _op.concatenate([i, j + one, zero], axis=0)
2438+
end = _op.concatenate([i + one, j + two, one], axis=0)
24162439
num_valid_boxes = _op.reshape(_op.strided_slice(nms_size, start, end, three_ones), [1])
2417-
start = _op.concatenate([i, j, zero, zero], axis=0)
2418-
end = _op.concatenate([i + one, j + one, num_valid_boxes, three], axis=0)
2440+
# slice to get current batch, class, and valid outputs
2441+
start = _op.concatenate([i, j + one, zero, zero], axis=0)
2442+
end = _op.concatenate([i + one, j + two, num_valid_boxes, three], axis=0)
24192443
new_out = _op.squeeze(_op.strided_slice(onnx_out, start, end, four_ones), [0, 1])
24202444
return i, j + one, C, onnx_out, nms_size, _op.concatenate([out, new_out], axis=0)
24212445

@@ -2435,23 +2459,27 @@ def _inner_body(i, j, C, onnx_out, nms_size, out):
24352459
out = _expr.var("out", shape=(_ty.Any(), 3), dtype="int64")
24362460

24372461
def _outer_cond(i, B, C, onnx_out, nms_size_out, out):
2462+
# Outer loop is over batch size
24382463
return _op.min(_op.less(i, B))
24392464

24402465
def _outer_body(i, B, C, onnx_out, nms_size_out, out):
2466+
# Outer loop just calls inner loop
24412467
init_count = _op.const(np.array([0]), dtype="int64")
24422468
inner_loop_vals = inner_loop(i, init_count, C, onnx_out, nms_size_out, out)
24432469
return i + one, B, C, onnx_out, nms_size_out, _expr.TupleGetItem(inner_loop_vals, 5)
24442470

2471+
# Create the second loop
24452472
outer_loop = _loops.while_loop(
24462473
_outer_cond, [i, B, C, onnx_out, nms_size_out, out], _outer_body
24472474
)
24482475

2476+
# Call the first loop, perform NMS
24492477
B, C, S = _op.split(_op.shape_of(scores, dtype="int64"), 3)
24502478
init_count = _op.const(np.array([0]), dtype="int64")
2451-
init_onnx_out = _op.const([], dtype="int64")
2452-
init_onnx_out = _op.broadcast_to(init_onnx_out, _op.concatenate([B, zero, S, three], 0))
2453-
init_nms_size_out = _op.const([], dtype="int64")
2454-
init_nms_size_out = _op.broadcast_to(init_nms_size_out, _op.concatenate([B, zero, one], 0))
2479+
init_onnx_out = _op.const([1], dtype="int64")
2480+
init_onnx_out = _op.broadcast_to(init_onnx_out, _op.concatenate([B, one, S, three], 0))
2481+
init_nms_size_out = _op.const([1], dtype="int64")
2482+
init_nms_size_out = _op.broadcast_to(init_nms_size_out, _op.concatenate([B, one, one], 0))
24552483
loop_vals = first_loop(
24562484
init_count,
24572485
scores,
@@ -2468,9 +2496,11 @@ def _outer_body(i, B, C, onnx_out, nms_size_out, out):
24682496
onnx_output = _expr.TupleGetItem(loop_vals, 9)
24692497
nms_size_output = _expr.TupleGetItem(loop_vals, 10)
24702498

2499+
# Call the second loop, rework outputs into correct form
24712500
init_count = _op.const(np.array([0]).astype("int64"), dtype="int64")
24722501
init_out = _op.const(np.array([]).reshape([0, 3]).astype("int64"), dtype="int64")
24732502
loop_vals = outer_loop(init_count, B, C, onnx_output, nms_size_output, init_out)
2503+
24742504
return _expr.TupleGetItem(loop_vals, 5)
24752505

24762506

python/tvm/relay/op/vision/nms.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def get_valid_counts(data, score_threshold, id_index=0, score_index=1):
4848
out_indices: relay.Expr
4949
Indices in input data
5050
"""
51-
if isinstance(score_threshold, float):
51+
if not isinstance(score_threshold, expr.Expr):
5252
score_threshold = expr.const(score_threshold, "float32")
5353
return expr.TupleWrapper(
5454
_make.get_valid_counts(data, score_threshold, id_index, score_index), 3
@@ -128,9 +128,9 @@ def non_max_suppression(
128128
If return_indices is True, return relay.Tuple of two 2-D tensors, with
129129
shape [batch_size, num_anchors] and [batch_size, num_valid_anchors] respectively.
130130
"""
131-
if isinstance(max_output_size, int):
131+
if not isinstance(max_output_size, expr.Expr):
132132
max_output_size = expr.const(max_output_size, "int32")
133-
if isinstance(iou_threshold, float):
133+
if not isinstance(iou_threshold, expr.Expr):
134134
iou_threshold = expr.const(iou_threshold, "float32")
135135
out = _make.non_max_suppression(
136136
data,

0 commit comments

Comments
 (0)