Skip to content

Commit

Permalink
[QNN][TFLite] Parsing TFLite quantized models.
Browse files Browse the repository at this point in the history
  • Loading branch information
anijain2305 committed Sep 27, 2019
1 parent 2ded2d8 commit 1e47074
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 18 deletions.
157 changes: 141 additions & 16 deletions python/tvm/relay/frontend/tflite.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from .. import expr as _expr
from .. import module as _module
from .. import op as _op
from .. import qnn as _qnn
from ... import nd as _nd
from .common import ExprTable
from .common import infer_shape as _infer_shape
Expand All @@ -32,10 +33,11 @@

class TensorWrapper(object):
    """Tensor wrapper for TFLite Tensor.

    Groups a TFLite tensor with its index, its backing buffer and its
    optional quantization parameters.

    Parameters
    ----------
    tensor_idx : int
        Index of the tensor in the subgraph.
    tensor : object
        The TFLite tensor object.
    buffer : object
        The TFLite buffer associated with the tensor.
    qnn_params : dict or None, optional
        Quantization parameters stored under the keys 'scale' and
        'zero_point'. None (the default) marks a tensor without
        quantization information, so existing three-argument callers keep
        working unchanged.
    """
    def __init__(self, tensor_idx, tensor, buffer, qnn_params=None):
        self.tensor_idx = tensor_idx
        self.tensor = tensor
        self.buffer = buffer
        self.qnn_params = qnn_params

class OperatorConverter(object):
"""Operator Converted for converting TFLite ops to Relay ops"""
Expand Down Expand Up @@ -152,13 +154,24 @@ def get_tensors(self, tensors_idx_list):
return_list = list()
for tensor_idx in tensors_idx_list:
if tensor_idx < 0:
return_list.append(TensorWrapper(tensor_idx, 0, 0))
return_list.append(TensorWrapper(tensor_idx, 0, 0, None))
continue

tensor = self.subgraph.Tensors(tensor_idx)
buffer_idx = tensor.Buffer()
buffer = self.model.Buffers(buffer_idx)
return_list.append(TensorWrapper(tensor_idx, tensor, buffer))

# Check if the tensors are quantized. Parse if yes.
qnn_params = None
tflite_qnn_params = tensor.Quantization()
if tflite_qnn_params is not None:
qnn_params = dict()
qnn_params['scale'] = float(tflite_qnn_params.ScaleAsNumpy())
qnn_params['zero_point'] = int(tflite_qnn_params.ZeroPointAsNumpy())
# Check that the scale and zero points are valid.
if qnn_params['scale'] == 0 and qnn_params['zero_point'] == 0:
qnn_params = None
return_list.append(TensorWrapper(tensor_idx, tensor, buffer, qnn_params))
return return_list

def get_tensor_value(self, tensor_wrapper):
Expand Down Expand Up @@ -198,6 +211,10 @@ def get_tensor_type_str(self, tensor_type):
raise NotImplementedError("Tensor type {} is currently not supported"
.format(str(tensor_type)))

def has_same_qnn_params(self, tensor_a, tensor_b):
    """Check whether two tensor wrappers carry the same qnn params.

    Parameters
    ----------
    tensor_a, tensor_b : TensorWrapper
        Wrappers whose ``qnn_params`` attribute is either None or a dict
        with the keys 'scale' and 'zero_point'.

    Returns
    -------
    bool
        True when both tensors have equal scale and zero point, or when
        neither tensor has qnn params. False otherwise, including the case
        where only one of the two tensors is quantized.
    """
    params_a = tensor_a.qnn_params
    params_b = tensor_b.qnn_params
    # qnn_params is None for unquantized tensors. Answer without subscripting
    # so a quantized/unquantized pair yields False instead of a TypeError that
    # would hide the caller's assertion message.
    if params_a is None or params_b is None:
        return params_a is None and params_b is None
    return params_a['scale'] == params_b['scale'] and \
        params_a['zero_point'] == params_b['zero_point']

def convert_conv2d(self, op):
"""Convert TFLite conv2d"""
return self.convert_conv(op, "conv2d")
Expand Down Expand Up @@ -236,8 +253,15 @@ def convert_reshape(self, op):
target_shape = reshape_options.NewShapeAsNumpy()

in_expr = self.get_expr(input_tensor_idx)
out = _op.reshape(in_expr, newshape=tuple(target_shape))

# If the tensors are quantized, ensure that input/output qnn params are same.
if input_tensor.qnn_params is not None:
output_tensors = self.get_output_tensors(op)
assert len(output_tensors) == 1, "output tensors should be 1"
output_tensor = output_tensors[0]
assert self.has_same_qnn_params(input_tensor, output_tensor), \
"qnn.op.reshape requires input and output qnn params to be same"
out = _op.reshape(in_expr, newshape=tuple(target_shape))
return out

def _convert_resize(self, method, op):
Expand Down Expand Up @@ -324,8 +348,14 @@ def convert_softmax(self, op):
input_tensor_idx = input_tensor.tensor_idx
params = {'axis': 1} # 1 is channel
in_expr = self.get_expr(input_tensor_idx)
out = _op.nn.softmax(in_expr, **params)

# Softmax does not go well with Int8. So Dequantize back to FP32.
if input_tensor.qnn_params is not None:
in_expr = _qnn.op.dequantize(data=in_expr,
input_scale=input_tensor.qnn_params['scale'],
input_zero_point=input_tensor.qnn_params['zero_point'])

out = _op.nn.softmax(in_expr, **params)
return out

def convert_tanh(self, op):
Expand Down Expand Up @@ -362,6 +392,7 @@ def convert_concatenation(self, op):

output_tensors = self.get_output_tensors(op)
assert len(output_tensors) == 1, "output tensors should be 1"
output_tensor = output_tensors[0]

assert op.BuiltinOptionsType() == BuiltinOptions.ConcatenationOptions
op_options = op.BuiltinOptions()
Expand All @@ -370,12 +401,27 @@ def convert_concatenation(self, op):
concatenation_axis = concatenation_options.Axis()
fused_activation_fn = concatenation_options.FusedActivationFunction()

# with axis in N H W C
out = _op.concatenate(in_exprs, axis=concatenation_axis)
if input_tensors[0].qnn_params is None:
out = _op.concatenate(in_exprs, axis=concatenation_axis)
else:
input_scales = [input_tensor.qnn_params['scale'] for input_tensor in input_tensors]
input_zero_points = \
[input_tensor.qnn_params['zero_point'] for input_tensor in input_tensors]
out = _qnn.op.concatenate(in_exprs,
input_scales=input_scales,
input_zero_points=input_zero_points,
output_scale=output_tensor.qnn_params['scale'],
output_zero_point=output_tensor.qnn_params['zero_point'],
axis=concatenation_axis)

# if we have activation fn
if fused_activation_fn != ActivationFunctionType.NONE:
out = self.convert_fused_activation_function(out, fused_activation_fn)
if output_tensor.qnn_params is None:
out = self.convert_fused_activation_function(out, fused_activation_fn)
else:
raise tvm.error.OpNotImplemented(
'Operator {} with fused activation is not supported yet.'
.format('qnn.op.concatenate'))
return out

def _convert_elemwise(self, relay_op, op):
Expand Down Expand Up @@ -521,6 +567,12 @@ def convert_fully_connected(self, op):
input_tensor_idx = input_tensor.tensor_idx
weight_tensor = input_tensors[1]

output_tensors = self.get_output_tensors(op)
assert len(output_tensors) == 1, "output tensors length should be 1"
output_tensor = output_tensors[0]
output_tensor_type = output_tensor.tensor.Type()
output_tensor_type_str = self.get_tensor_type_str(output_tensor_type)

input_tensor_shape = input_tensor.tensor.ShapeAsNumpy()
weight_tensor_shape = weight_tensor.tensor.ShapeAsNumpy()

Expand Down Expand Up @@ -548,7 +600,14 @@ def convert_fully_connected(self, op):
weight_value = self.get_tensor_value(weight_tensor)
weight_expr = self.exp_tab.new_const(weight_value, dtype=weight_tensor_type_str)

out = _op.nn.dense(in_expr, weight_expr)
# Check if the inputs are quantized. If yes, call qnn dense.
if input_tensor.qnn_params is None:
out = _op.nn.dense(in_expr, weight_expr)
else:
out = _qnn.op.dense(in_expr, weight_expr,
input_zero_point=input_tensor.qnn_params['zero_point'],
kernel_zero_point=weight_tensor.qnn_params['zero_point'],
out_dtype='int32')

# if we have bias
if len(input_tensors) == 3:
Expand All @@ -563,7 +622,22 @@ def convert_fully_connected(self, op):

# If we have fused activations
if fused_activation_fn != ActivationFunctionType.NONE:
out = self.convert_fused_activation_function(out, fused_activation_fn)
if output_tensor.qnn_params is None:
out = self.convert_fused_activation_function(out, fused_activation_fn)
else:
raise tvm.error.OpNotImplemented(
'Operator {} with fused activation is not supported yet.'
.format('qnn.op.dense'))

# Finally, if the dense is quantized, add a requantize at the end.
if output_tensor.qnn_params is not None:
    # The value being requantized is the int32 dense output, whose scale is
    # the product of the input and kernel scales.
    input_scale = input_tensor.qnn_params['scale'] * weight_tensor.qnn_params['scale']
    # Must be assigned before the call below, otherwise the name is undefined.
    # 0 is the same value convert_conv passes when requantizing its int32
    # conv2d output.
    input_zero_point = 0
    out = _qnn.op.requantize(out,
                             input_scale=input_scale,
                             input_zero_point=input_zero_point,
                             output_scale=output_tensor.qnn_params['scale'],
                             output_zero_point=output_tensor.qnn_params['zero_point'],
                             out_dtype=output_tensor_type_str)

return out

Expand Down Expand Up @@ -635,6 +709,12 @@ def convert_conv(self, op, conv_type):
input_tensor_idx = input_tensor.tensor_idx
weight_tensor = input_tensors[1]

output_tensors = self.get_output_tensors(op)
assert len(output_tensors) == 1, "output tensors should be 1"
output_tensor = output_tensors[0]
output_tensor_type = output_tensor.tensor.Type()
output_tensor_type_str = self.get_tensor_type_str(output_tensor_type)

is_depthwise_conv = False
if conv_type == 'conv2d':
assert op.BuiltinOptionsType() == BuiltinOptions.Conv2DOptions
Expand Down Expand Up @@ -720,7 +800,15 @@ def convert_conv(self, op, conv_type):
raise tvm.error.OpAttributeUnImplemented(
'Padding format {} is not supported for operator Conv.'.format(padding))

out = _op.nn.conv2d(data=in_expr, weight=weight_expr, **params)
# Check if the inputs are quantized. If yes, call qnn conv2d.
if input_tensor.qnn_params is None:
out = _op.nn.conv2d(in_expr, weight_expr, **params)
else:
qnn_conv2d_params = dict(params)
qnn_conv2d_params['input_zero_point'] = input_tensor.qnn_params['zero_point']
qnn_conv2d_params['kernel_zero_point'] = weight_tensor.qnn_params['zero_point']
qnn_conv2d_params['out_dtype'] = 'int32'
out = _qnn.op.conv2d(in_expr, weight_expr, **qnn_conv2d_params)

# if we have bias
if len(input_tensors) == 3:
Expand All @@ -736,7 +824,23 @@ def convert_conv(self, op, conv_type):

# If we have fused activations
if fused_activation_fn != ActivationFunctionType.NONE:
out = self.convert_fused_activation_function(out, fused_activation_fn)
if output_tensor.qnn_params is None:
out = self.convert_fused_activation_function(out, fused_activation_fn)
else:
raise tvm.error.OpNotImplemented(
'Operator {} with fused activation is not supported yet.'
.format('qnn.op.conv2d'))

# Finally if the conv is quantized. Add a requantize at the end.
if output_tensor.qnn_params is not None:
input_scale = input_tensor.qnn_params['scale'] * weight_tensor.qnn_params['scale']
input_zero_point = 0
out = _qnn.op.requantize(out,
input_scale=input_scale,
input_zero_point=input_zero_point,
output_scale=output_tensor.qnn_params['scale'],
output_zero_point=output_tensor.qnn_params['zero_point'],
out_dtype=output_tensor_type_str)

return out

Expand Down Expand Up @@ -841,6 +945,12 @@ def convert_pool2d(self, op, pool_type):
input_tensor = input_tensors[0]
input_tensor_idx = input_tensor.tensor_idx

output_tensors = self.get_output_tensors(op)
assert len(output_tensors) == 1, "output tensors should be 1"
output_tensor = output_tensors[0]
output_tensor_type = output_tensor.tensor.Type()
output_tensor_type_str = self.get_tensor_type_str(output_tensor_type)

assert op.BuiltinOptionsType() == BuiltinOptions.Pool2DOptions
op_options = op.BuiltinOptions()
pool2d_options = Pool2DOptions()
Expand Down Expand Up @@ -871,17 +981,31 @@ def convert_pool2d(self, op, pool_type):
'Padding format {} for operator Pool2D is not supported.'.format(padding))

if pool_type == "average":
out = _op.nn.avg_pool2d(in_expr, **params)
if input_tensor.qnn_params is None:
out = _op.nn.avg_pool2d(in_expr, **params)
else:
assert self.has_same_qnn_params(input_tensor, output_tensor), \
"qnn.op.avg_pool2d requires input and output qnn params to be same"
out = _op.cast(in_expr, dtype="int32")
out = _op.nn.avg_pool2d(out, **params)
out = _op.cast(out, dtype=output_tensor_type_str)
elif pool_type == "max":
if input_tensor.qnn_params is not None:
assert self.has_same_qnn_params(input_tensor, output_tensor), \
"qnn.op.max_pool2d requires input and output qnn params to be same"
out = _op.nn.max_pool2d(in_expr, **params)
else:
raise tvm.error.OpNotImplemented(
'Operator {} is not supported for frontend TFLite.'.format(pool_type + ' pool'))

# If we have fused activations
if fused_activation_fn != ActivationFunctionType.NONE:
out = self.convert_fused_activation_function(out, fused_activation_fn)

if input_tensor.qnn_params is None:
out = self.convert_fused_activation_function(out, fused_activation_fn)
else:
raise tvm.error.OpNotImplemented(
'Operator {} with fused activation is not supported yet.'
.format('qnn.op.pool2d'))
return out

def convert_pad(self, op):
Expand Down Expand Up @@ -1172,4 +1296,5 @@ def from_tflite(model, shape_dict, dtype_dict):
outputs = [exp_tab.get_expr(get_tensor_name(subgraph, i)) for i in model_outputs]
outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs)
func = _expr.Function(analysis.free_vars(outputs), outputs)
return _module.Module.from_expr(func), params
mod = _module.Module.from_expr(func)
return mod, params
2 changes: 1 addition & 1 deletion python/tvm/relay/qnn/op/qnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def requantize(data,
input_zero_point,
output_scale,
output_zero_point,
rounding="TONEAREST",
rounding="UPWARD",
out_dtype="int8"):
r"""Requantized operator.
Expand Down
2 changes: 1 addition & 1 deletion src/relay/qnn/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ Expr RequantizeLower(const Expr& input_tensor, const RequantizeAttrs* param,
static inline Expr Requantize(const Expr& data, const Array<IndexExpr>& input_shape,
double input_scale, int32_t input_zero_point, double output_scale,
int32_t output_zero_point, const DataType& out_dtype,
const std::string& rounding = "TONEAREST") {
const std::string& rounding = "UPWARD") {
auto attrs = make_node<RequantizeAttrs>();
attrs->input_scale = std::move(input_scale);
attrs->input_zero_point = std::move(input_zero_point);
Expand Down
29 changes: 29 additions & 0 deletions tests/python/frontend/tflite/test_forward.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,13 @@ def run_tvm_graph(tflite_model_buf, input_data, input_node, num_output=1, target
mod, params = relay.frontend.from_tflite(tflite_model,
shape_dict=shape_dict,
dtype_dict=dtype_dict)

# FIXME - Will be removed once we resolve https://github.com/dmlc/tvm/pull/3971
# Currently, QNN transforms are not a part of relay build. So, we have to call it outside. Once
# it is a part of relay.build, we can remove this. QNN transforms do not modify the graphs that
# do not have any QNN ops.
mod = tvm.relay.qnn.transform.CanonicalizeOps()(mod)

with relay.build_config(opt_level=3):
graph, lib, params = relay.build(mod, target, params=params)

Expand Down Expand Up @@ -948,6 +955,25 @@ def test_forward_inception_v4_net():
tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]),
rtol=1e-5, atol=1e-5)

def test_forward_qnn_inception_v1_net():
    """Test the Quantized TFLite Inception model.

    Runs the quantized InceptionV1 model through both TFLite and TVM on the
    same random uint8 image and checks that the top-3 predicted labels agree.
    """
    # InceptionV1
    tflite_model_file = tf_testing.get_workload_official(
        "https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_224_quant_20181026.tgz",
        "inception_v1_224_quant.tflite")
    with open(tflite_model_file, "rb") as f:
        tflite_model_buf = f.read()
    # Checking the labels because the requantize implementation is different between TFLite and
    # Relay. This causes the final output numbers to mismatch. So, testing accuracy via labels.
    #
    # Draw integer pixel values over the whole uint8 range [0, 255]. Sampling
    # floats from [0, 1) and casting them to uint8 truncates every value to 0,
    # which would only ever exercise the model on an all-zero image.
    data = np.random.randint(0, 256, size=(1, 224, 224, 3)).astype('uint8')
    tflite_output = run_tflite_graph(tflite_model_buf, data)
    tflite_predictions = np.squeeze(tflite_output)
    # Indices of the three highest scores, best first.
    tflite_sorted_labels = tflite_predictions.argsort()[-3:][::-1]
    tvm_output = run_tvm_graph(tflite_model_buf, data, 'input')
    tvm_predictions = np.squeeze(tvm_output)
    tvm_sorted_labels = tvm_predictions.argsort()[-3:][::-1]
    tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels)

#######################################################################
# SSD Mobilenet
# -------------
Expand Down Expand Up @@ -1013,3 +1039,6 @@ def test_forward_ssd_mobilenet_v1():
test_forward_inception_v3_net()
test_forward_inception_v4_net()
test_forward_ssd_mobilenet_v1()

# End to End quantized
test_forward_qnn_inception_v1_net()

0 comments on commit 1e47074

Please sign in to comment.