Skip to content

Commit 90951cc

Browse files
Josh Fromm authored and Matthew committed
Fix legalization for non spatial operators. (#6)
* Fix legalization for non spatial operators. * Fix axis checks for end2end functionality.
1 parent 7cf9729 commit 90951cc

File tree

4 files changed

+37
-9
lines changed

4 files changed

+37
-9
lines changed

python/tvm/relay/qnn/op/legalizations.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import tvm
2222
from tvm import relay
23+
from tvm._ffi.base import TVMError
2324
from .. import op as reg
2425

2526
#################################################
@@ -148,8 +149,21 @@ def helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay_op):
148149
)
149150
# Otherwise it needs to be broadcast.
150151
else:
151-
# Determine output axis of kernel.
152-
output_axis = tvm.tir.layout(attrs["kernel_layout"]).index_of("O")
152+
# Determine output axis of kernel for spatial operations.
153+
if hasattr(attrs, "kernel_layout"):
154+
output_axis = tvm.tir.layout(attrs["kernel_layout"]).index_of("O")
155+
# For dense operations, broadcast to [N, K] layout.
156+
elif isinstance(attrs, relay.op.op_attrs.DenseAttrs):
157+
output_axis = 0
158+
# For matrix multiplication instead expand to [K, N] layout.
159+
elif isinstance(attrs, relay.op.op_attrs.MatmulAttrs):
160+
output_axis = 1
161+
else:
162+
raise TVMError(
163+
"Legalization of %s is not yet supported with per channel parameters"
164+
% str(type(attrs))
165+
)
166+
153167
shift_kernel = relay.nn.bias_add(
154168
relay.cast(kernel, dtype="int16"),
155169
relay.cast(kernel_zero_point, dtype="int16"),

src/relay/qnn/op/dequantize.cc

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,16 @@ bool DequantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
5454
const auto* dequantize_attrs = attrs.as<DequantizeAttrs>();
5555
int axis = dequantize_attrs->axis;
5656
auto rank = static_cast<int>(data->shape.size());
57-
axis = (axis < 0) ? ((rank > 0) ? data->shape.size() + axis : 0) : axis;
58-
ICHECK_LT(axis, rank > 0 ? rank : 1) << "axis " << dequantize_attrs->axis << " is out of range";
59-
ICHECK_GE(axis, 0) << "axis " << dequantize_attrs->axis << " is out of range";
57+
58+
// If zero point and scale are scalar then axis doesnt matter.
59+
bool scale_is_scalar = (types[1].as<TensorTypeNode>())->shape.size() == 0;
60+
bool zp_is_scalar = (types[2].as<TensorTypeNode>())->shape.size() == 0;
61+
62+
if (!(scale_is_scalar && zp_is_scalar)) {
63+
axis = (axis < 0) ? ((rank > 0) ? data->shape.size() + axis : 0) : axis;
64+
ICHECK_LT(axis, rank > 0 ? rank : 1) << "axis " << dequantize_attrs->axis << " is out of range";
65+
ICHECK_GE(axis, 0) << "axis " << dequantize_attrs->axis << " is out of range";
66+
}
6067

6168
PrimExpr axis_shape;
6269
if (rank > 0) {

src/relay/qnn/op/quantize.cc

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,16 @@ bool QuantizeRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
5252
const auto* quantize_attrs = attrs.as<QuantizeAttrs>();
5353
int axis = quantize_attrs->axis;
5454
auto rank = static_cast<int>(data->shape.size());
55-
axis = (axis < 0) ? ((rank > 0) ? data->shape.size() + axis : 0) : axis;
56-
ICHECK_LT(axis, rank > 0 ? rank : 1) << "axis " << quantize_attrs->axis << " is out of range";
57-
ICHECK_GE(axis, 0) << "axis " << quantize_attrs->axis << " is out of range";
55+
56+
// If zero point and scale are scalar then axis doesnt matter.
57+
bool scale_is_scalar = (types[1].as<TensorTypeNode>())->shape.size() == 0;
58+
bool zp_is_scalar = (types[2].as<TensorTypeNode>())->shape.size() == 0;
59+
60+
if (!(scale_is_scalar && zp_is_scalar)) {
61+
axis = (axis < 0) ? ((rank > 0) ? data->shape.size() + axis : 0) : axis;
62+
ICHECK_LT(axis, rank > 0 ? rank : 1) << "axis " << quantize_attrs->axis << " is out of range";
63+
ICHECK_GE(axis, 0) << "axis " << quantize_attrs->axis << " is out of range";
64+
}
5865

5966
PrimExpr axis_shape;
6067
if (rank > 0) {

tests/python/relay/test_pass_fake_quantization_to_integer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def test_fake_quantize_dense_per_channel():
130130
x_np = np.random.randint(-128, 127, size=[128, 64], dtype="int8")
131131
w_np = np.random.randint(-128, 127, size=[256, 64], dtype="int8")
132132

133-
compare_fq_to_int(op, [x_np, w_np])
133+
compare_fq_to_int(op, [x_np, w_np], allow_rounding_error=True)
134134

135135

136136
def test_fake_quantize_batch_matmul():

0 commit comments

Comments (0)