Skip to content

Commit e5d80e5

Browse files
author
Matthew
committed
respond to review comments
respond to review comments
1 parent 298b81d commit e5d80e5

File tree

5 files changed

+17
-7
lines changed

5 files changed

+17
-7
lines changed

include/tvm/ir/affine_type.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ class TensorAffineTypeNode : public AffineTypeNode {
7171
RelayExpr zero_point;
7272
/*! \brief The data type of this type */
7373
DataType dtype;
74-
/*! \brief The data type of this type */
74+
/*! \brief The axis for per-channel quantization */
7575
int axis;
7676

7777
void VisitAttrs(tvm::AttrVisitor* v) {

python/tvm/ir/affine_type.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ class TensorAffineType(AffineType):
4848
4949
dtype : str
5050
The content data type.
51+
52+
axis : int
53+
The axis for per-channel quantization.
5154
"""
5255

5356
def __init__(self, scale, zero_point, dtype, axis=-1):

python/tvm/relay/qnn/op/qnn.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -276,8 +276,10 @@ def conv2d(
276276
):
277277
r"""Quantized 2D convolution.
278278
279-
This operator convolves quantized data with quantized kernel. The scale of
280-
the output quantized tensor is the product of the kernel_scale and
279+
This operator convolves quantized data with quantized kernel.
280+
If doing per-channel quantization, qnn expects the kernel_scale
281+
and optionally the kernel_zero_point to be 1-D vectors instead of scalars.
282+
The scale of the output quantized tensor is the product of the kernel_scale and
281283
input_scale of the input quantized tensors. The zero point of the output
282284
quantized tensor is 0. By default, the dtype of output is int32. Please also
283285
refer to Requantize operator to understand how to scale back the int32
@@ -544,6 +546,9 @@ def dense(
544546
545547
`Y = X * W`
546548
549+
If doing per-channel quantization, qnn expects the kernel_scale
550+
and optionally the kernel_zero_point to be 1-D vectors instead of scalars.
551+
547552
Parameters
548553
----------
549554
data : tvm.relay.Expr

python/tvm/relay/transform/fake_quantization_to_integer.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -255,12 +255,13 @@ def relu(expr, type_map):
255255
arg = expr.args[0]
256256
t = type_map[arg]
257257
scale_shape = infer_shape(t.scale)
258-
zero = relay.const(0, dtype="float32")
259-
if len(scale_shape) > 0 and scale_shape[0] > 1:
258+
z_p = t.zero_point
259+
assert len(scale_shape) <= 1
260+
if len(scale_shape) == 1 and scale_shape[0] > 1:
260261
b_shape = [1] * len(infer_shape(arg))
261262
b_shape[t.axis] = -1
262-
zero = relay.op.reshape(relay.op.broadcast_to(zero, scale_shape), b_shape)
263-
zero = relay.qnn.op.quantize(zero, t.scale, t.zero_point, t.axis, t.dtype)
263+
z_p = relay.op.reshape(relay.op.broadcast_to(z_p, scale_shape), b_shape)
264+
zero = relay.op.cast(z_p, t.dtype)
264265
return [relay.op.maximum(arg, fold_constant(zero)), t]
265266

266267

src/relay/qnn/op/dense.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ bool QnnDenseRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
6262
}
6363
ICHECK(IsScalarType(types[2], DataType::Int(32))); // input_zero_point
6464
ICHECK(IsScalarType(types[4], DataType::Float(32))); // input_scale
65+
// weight_zero_point can be a scalar or a vector of the same shape as the weight_scale
6566
AssignType(types[5], DataType::Float(32), param->units, reporter); // weight_scale
6667

6768
ICHECK(param->out_dtype.bits() > 0) << "Output dtype bits should be greater than 0.";

0 commit comments

Comments
 (0)