[Relay][Quantization] Per-Channel FQ2I #8883
First changed file (the ONNX frontend's Conv converter):

```diff
@@ -479,7 +479,7 @@ def _impl_v1(cls, inputs, attr, params):
             attr["dilations"] = [1] + list(attr["dilations"])
             if "pads" in attr:
                 attr["pads"] = [0, attr["pads"][0], 0, attr["pads"][1]]
+        attr["channels"] = kernel_shapes[0][0]
         out = AttrCvt(
             op_name=dimension_picker("conv"),
             transforms={
```

Contributor: Does this change relate to the subject of the PR, or is it a side fix? I don't see where the "channels" attribute is used in the other changes of this PR.

Contributor (Author): This is a side fix. I utilized it for an intermediate solution for conv, but I guess I don't technically need it. It doesn't seem to be required in many of the relay passes.
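For reference, a small illustration (not from the PR; the weight shape is made up) of what that attribute holds: ONNX convolution weights are laid out as (out_channels, in_channels / groups, kH, kW), so `kernel_shapes[0][0]` is the output channel count.

```python
import numpy as np

# Hypothetical ONNX-style conv weight: 8 output channels, 3 input channels, 3x3 kernel.
weight = np.zeros((8, 3, 3, 3), dtype="float32")
kernel_shapes = [weight.shape]

channels = kernel_shapes[0][0]  # number of output channels
print(channels)  # 8
```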
Second changed file (the fake-quantization-to-integer pass in the Relay transforms):

```diff
@@ -18,13 +18,22 @@
 import tvm
 from tvm import relay
 from tvm.ir import TensorAffineType, TupleAffineType
+from tvm.tir import bijective_layout
 from ..op import register_fake_quantization_to_integer


 def fold_constant(expr):
     return relay.transform.FoldConstantExpr(expr, tvm.IRModule())
```

Contributor: Hmm, can you use the functions in [...]?

Contributor (Author): Unfortunately, since this is in transforms and the frontends call the transforms namespace extensively, importing that will cause an import loop.
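As a side note, here is a minimal sketch (not part of the PR; the scale values are made up) of what this local helper is used for: `FoldConstantExpr` collapses a constant-only Relay expression, such as the product of an activation scale and a per-channel weight scale, into a single `relay.Constant`.

```python
import numpy as np
import tvm
from tvm import relay


def fold_constant(expr):
    # Same one-liner as the helper above: fold a single expression without
    # running a full pass pipeline over a module.
    return relay.transform.FoldConstantExpr(expr, tvm.IRModule())


x_scale = relay.const(np.array([0.5], dtype="float32"))            # activation scale
w_scale = relay.const(np.array([0.1, 0.2, 0.4], dtype="float32"))  # per-channel weight scale
conv_scale = fold_constant(x_scale * w_scale)
print(conv_scale.data.numpy())  # approximately [0.05 0.1  0.2 ]
```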
```diff
+
+
+def get_zeros(scale):
+    return fold_constant(relay.op.cast(relay.op.zeros_like(scale), "int32"))
```

Contributor: Should this be a cast_like, or do we always want things to be int32?

Contributor (Author): zero_points in qnn are always int32.
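A hedged sketch of that convention (not from the PR; shapes and values are arbitrary): a per-channel `qnn.quantize` takes a vector scale together with an int32 zero-point vector of the same length, which is exactly the kind of tensor `get_zeros` builds.

```python
import numpy as np
import tvm
from tvm import relay

# A float32 weight with 4 output channels (OIHW), quantized per output channel.
weight = relay.var("w", shape=(4, 3, 3, 3), dtype="float32")
scale = relay.const(np.array([0.1, 0.2, 0.3, 0.4], dtype="float32"))
zero_point = relay.const(np.zeros(4, dtype="int32"))  # always int32, one per channel

wq = relay.qnn.op.quantize(weight, scale, zero_point, axis=0, out_dtype="int8")
mod = relay.transform.InferType()(tvm.IRModule.from_expr(wq))
print(mod["main"].body.checked_type)  # Tensor[(4, 3, 3, 3), int8]
```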
```diff
+
+
+def infer_shape(expr):
+    return relay.transform.InferType()(tvm.IRModule.from_expr(expr))["main"].body.checked_type.shape
+
+
 @register_fake_quantization_to_integer("qnn.dequantize")
 def dequantize(expr, type_map):
     """Remove dequantize op"""
@@ -52,8 +61,13 @@ def quantize(expr, type_map):
             expr.args[1],
             expr.args[2],
             out_dtype=expr.attrs.out_dtype,
+            axis=t.axis,
         )
-    return [out, TensorAffineType(expr.args[1], expr.args[2], expr.attrs.out_dtype)]
+
+    return [
+        out,
+        TensorAffineType(expr.args[1], expr.args[2], expr.attrs.out_dtype, expr.attrs.axis),
+    ]


 def register_unary_identity(op_name):
@@ -94,14 +108,19 @@ def bias_add(expr, type_map):
     b_t = type_map[b]
     in_scale = fold_constant(x_t.scale)
     in_zero_point = fold_constant(x_t.zero_point)
-    if not tvm.ir.structural_equal(x_t, b_t):
+    if not (
+        tvm.ir.structural_equal(x_t.scale, b_t.scale)
+        and tvm.ir.structural_equal(x_t.zero_point, b_t.zero_point)
+        and tvm.ir.structural_equal(x_t.dtype, b_t.dtype)
+    ):
         b = relay.qnn.op.requantize(
             b,
             b_t.scale,
             b_t.zero_point,
             in_scale,
             in_zero_point,
             out_dtype=x_t.dtype,
+            axis=0,
         )
     out = relay.op.nn.bias_add(x, b, **expr.attrs)
     return [out, x_t]
@@ -116,11 +135,13 @@ def conv2d(expr, type_map):
     x_t = type_map[x]
     w_t = type_map[weight]
     conv_scale = fold_constant(x_t.scale * w_t.scale)
-    conv_zp = relay.const(0)
+    conv_zp = get_zeros(conv_scale)
     out = relay.qnn.op.conv2d(
         x, weight, x_t.zero_point, w_t.zero_point, x_t.scale, w_t.scale, **attrs
     )
-    return [out, TensorAffineType(conv_scale, conv_zp, out.attrs.out_dtype)]
+    out_layout = attrs["out_layout"] if attrs["out_layout"] != "" else attrs["data_layout"]
+    out_axis = bijective_layout(out_layout, "NCHW").backward_index(list(range(4)))[1]
+    return [out, TensorAffineType(conv_scale, conv_zp, out.attrs.out_dtype, out_axis.value)]


 @register_fake_quantization_to_integer("nn.dense")
@@ -132,11 +153,11 @@ def dense(expr, type_map):
     x_t = type_map[x]
     w_t = type_map[weight]
     dense_scale = fold_constant(x_t.scale * w_t.scale)
-    dense_zp = relay.const(0)
+    dense_zp = get_zeros(dense_scale)
     out = relay.qnn.op.dense(
         x, weight, x_t.zero_point, w_t.zero_point, x_t.scale, w_t.scale, **attrs
     )
-    return [out, TensorAffineType(dense_scale, dense_zp, out.attrs.out_dtype)]
+    return [out, TensorAffineType(dense_scale, dense_zp, out.attrs.out_dtype, 1)]


 @register_fake_quantization_to_integer("nn.batch_matmul")
@@ -148,7 +169,7 @@ def batch_matmul(expr, type_map):
     matmul_scale = fold_constant(x_t.scale * y_t.scale)
     matmul_zp = relay.const(0)
     out = relay.qnn.op.batch_matmul(x, y, x_t.zero_point, y_t.zero_point, x_t.scale, y_t.scale)
-    return [out, TensorAffineType(matmul_scale, matmul_zp, out.attrs.out_dtype)]
+    return [out, TensorAffineType(matmul_scale, matmul_zp, out.attrs.out_dtype, x_t.axis)]


 @register_fake_quantization_to_integer("concatenate")
@@ -198,19 +219,52 @@ def clip(expr, type_map):
     amax = expr.attrs.a_max
     scale = fold_constant(t.scale)
     z_p = fold_constant(t.zero_point)
-    if isinstance(scale, relay.expr.Constant) and isinstance(z_p, relay.expr.Constant):
+    if (
+        isinstance(scale, relay.expr.Constant)
+        and scale.data.numpy().size == 1
+        and isinstance(z_p, relay.expr.Constant)
+        and z_p.data.numpy().size == 1
+    ):
         scale = scale.data.numpy().item()
         z_p = z_p.data.numpy().item()
         new_min = int(amin / scale + z_p)
         new_max = int(amax / scale + z_p)
         out = relay.op.clip(arg, new_min, new_max)
     else:
-        amin = relay.op.round(relay.op.const(amin) / scale + z_p)
-        amax = relay.op.round(relay.op.const(amax) / scale + z_p)
-        out = relay.op.minimum(relay.op.maximum(arg, amin), amax)
+        if not isinstance(amin, relay.expr.Constant):
+            amin = relay.op.const(amin)
+        if not isinstance(amax, relay.expr.Constant):
+            amax = relay.op.const(amax)
+
+        scale_shape = infer_shape(scale)
+        if len(scale_shape) > 0 and scale_shape[0] > 1:
+            b_shape = [1] * len(infer_shape(arg))
+            b_shape[t.axis] = -1
+            amin = relay.op.reshape(relay.op.broadcast_to(amin, scale_shape), b_shape)
+            amax = relay.op.reshape(relay.op.broadcast_to(amax, scale_shape), b_shape)
+        amin = relay.qnn.op.quantize(amin, scale, z_p, t.axis, t.dtype)
+        amax = relay.qnn.op.quantize(amax, scale, z_p, t.axis, t.dtype)
+        out = relay.op.minimum(relay.op.maximum(arg, fold_constant(amin)), fold_constant(amax))
+
     return [out, t]


 @register_fake_quantization_to_integer("nn.relu")
 def relu(expr, type_map):
     """Rewrite a relu op"""
     arg = expr.args[0]
     t = type_map[arg]
-    zero = relay.op.cast(t.zero_point, t.dtype)
+    scale_shape = infer_shape(t.scale)
+    z_p = t.zero_point
+
+    assert len(scale_shape) <= 1
+    if len(scale_shape) == 1 and scale_shape[0] > 1:
+        b_shape = [1] * len(infer_shape(arg))
+        b_shape[t.axis] = -1
+        z_p = relay.op.reshape(relay.op.broadcast_to(z_p, scale_shape), b_shape)
+    zero = relay.op.cast(z_p, t.dtype)
     return [relay.op.maximum(arg, fold_constant(zero)), t]


 @register_fake_quantization_to_integer("nn.pad")
 def pad(expr, type_map):
     """Rewite an nn.pad op"""
```

Contributor: We might have four situations here (scale and zero point each either scalar or per-channel). Cases 3 and 4 (per-channel scale) are covered by the next `if`: we broadcast the zp to the scale's shape using the axis from the AffineType structure. Case 1 (both scalar) is OK. Case 2 (scalar scale, per-channel zp): don't we need to handle this explicitly and broadcast the scale?

Contributor (Author): On the first Q: no, the broadcast z_p will not be updated in place; it's only used in the computation. On the second Q: that's an interesting point. I guess I haven't ever seen it, but it's feasible. QNN currently supports scalar scale and scalar zp, OR vector scale and scalar zp, OR vector scale and vector zp, which matches all of the combinations I've ever seen in the wild. What do you think, should we try to support that in QNN?
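A small sketch of the broadcast being discussed (not from the PR; the channel count and axis are made up): a scalar zero point is expanded to the per-channel scale's shape and then reshaped so it broadcasts along the activation's quantization axis.

```python
import tvm
from tvm import relay

# Per-channel scale over 3 channels, scalar zero point, NCHW activation (channel axis 1).
scale_shape = (3,)
z_p = relay.const(0)  # scalar zero point (int32)
axis = 1
data_rank = 4

b_shape = [1] * data_rank
b_shape[axis] = -1  # -> [1, -1, 1, 1]
z_p = relay.op.reshape(relay.op.broadcast_to(z_p, scale_shape), b_shape)

# Fold to a constant to see the resulting shape.
folded = relay.transform.FoldConstantExpr(z_p, tvm.IRModule())
print(folded.data.shape)  # (1, 3, 1, 1), which lines up with the channel axis of an NCHW tensor
```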
```diff
@@ -231,6 +285,7 @@ def pad(expr, type_map):
             t.scale,
             t.zero_point,
             out_dtype=t.dtype,
+            axis=pad_t.axis,
         )
     else:
         ## If the pad-value is a constant, we need to quantize it
@@ -319,6 +374,7 @@ def binary(expr, type_map):
             out_t.scale,
             out_t.zero_point,
             out_dtype=out_t.dtype,
+            axis=left_t.axis,
         )

     if right_t != out_t:
@@ -329,6 +385,7 @@ def binary(expr, type_map):
             out_t.scale,
             out_t.zero_point,
             out_dtype=out_t.dtype,
+            axis=right_t.axis,
         )
     out = op(left, right)
     return [out, out_t]
```