Skip to content

Commit

Permalink
[Relay] [Quantization] WIP - Prototyping the quantized convolution op
Browse files Browse the repository at this point in the history
Goal - Act as medium of discussion for pull request apache#2351

Features
- New quantized conv2D op in Relay
- Python API interface to instantiate the Relay op
- Infer Type implemented
- Lowering of quantized_conv op to low-level Relay ops

Discussion points
- Does the namespace look correct?
    - Relay op is called 'relay.op.nn._quantize.quantized_conv2d'
    - Idea is that any op under '_quantize' namespace will go through rewrite.
- Should we reuse Conv2DRel and Conv2DAttrs
    - Tried prototyping. Found it hard to derive from the Conv2DAttrs struct
    - Infer Type has a param field. This need to come from the right datatype.

Missing implementation
    - Lowering of quantized conv into conv+cast is incomplete.
    - Will work on it async. This is orthogonal to the discussion.
  • Loading branch information
anijain2305 committed Jul 8, 2019
1 parent 9596535 commit 788b20c
Show file tree
Hide file tree
Showing 10 changed files with 661 additions and 0 deletions.
116 changes: 116 additions & 0 deletions include/tvm/relay/attrs/nn_quantize.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
 * \file tvm/relay/attrs/nn_quantize.h
 * \brief Auxiliary attributes for quantized nn operators.
 */
#ifndef TVM_RELAY_ATTRS_NN_QUANTIZE_H_
#define TVM_RELAY_ATTRS_NN_QUANTIZE_H_

#include <tvm/attrs.h>
#include <string>

namespace tvm {
namespace relay {

// TODO(anijain2305) - This duplicates Conv2DAttrs. Should we inherit instead?
/*! \brief Attributes used by the quantized conv2d operator. */
struct QuantizedConv2DAttrs : public tvm::AttrsNode<QuantizedConv2DAttrs> {
  // Traditional conv2d attributes.
  Array<IndexExpr> strides;
  Array<IndexExpr> padding;
  Array<IndexExpr> dilation;
  int groups;
  IndexExpr channels;
  Array<IndexExpr> kernel_size;
  std::string data_layout;
  std::string kernel_layout;
  std::string out_layout;
  DataType out_dtype;

  // Quantization related attributes.
  int32_t input_zero_point;   // Zero point of the quantized input tensor.
  int32_t kernel_zero_point;  // Zero point of the quantized kernel tensor.
  int32_t output_zero_point;  // Zero point of the quantized output tensor.
  double input_scale;         // Scale of the quantized input tensor.
  double kernel_scale;        // Scale of the quantized kernel tensor.
  double output_scale;        // Scale of the quantized output tensor.

  TVM_DECLARE_ATTRS(QuantizedConv2DAttrs, "relay.attrs.QuantizedConv2DAttrs") {
    TVM_ATTR_FIELD(strides).set_default(Array<IndexExpr>({1, 1}))
        .describe("Specifies the strides of the convolution.");
    TVM_ATTR_FIELD(padding).set_default(Array<IndexExpr>({0, 0}))
        .describe("If padding is non-zero, then the input is implicitly zero-padded"
                  " on both sides for padding number of points");
    TVM_ATTR_FIELD(dilation).set_default(Array<IndexExpr>({1, 1}))
        .describe("Specifies the dilation rate to use for dilated convolution.");
    TVM_ATTR_FIELD(groups).set_default(1)
        .describe("Controls the connections between inputs and outputs."
                  " At groups=1, all inputs are convolved to all outputs."
                  " At groups=2, the operation becomes equivalent to having two convolution"
                  " layers side by side, each seeing half the input channels, and producing"
                  " half the output channels, and both subsequently concatenated.");
    TVM_ATTR_FIELD(channels)
        .describe("The number of output channels in the convolution."
                  " If it is not set, inferred by shape of the weight.")
        .set_default(NullValue<IndexExpr>());
    TVM_ATTR_FIELD(kernel_size)
        .describe("Specifies the dimensions of the convolution window.")
        .set_default(NullValue<Array<IndexExpr> >());
    TVM_ATTR_FIELD(data_layout).set_default("NCHW")
        .describe("Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc."
                  " 'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
                  " dimensions respectively. Convolution is applied on the 'H' and"
                  " 'W' dimensions.");
    TVM_ATTR_FIELD(kernel_layout).set_default("OIHW")
        .describe("Dimension ordering of weight. Can be 'OIHW', 'OIHW16o16i', etc."
                  " 'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
                  " dimensions respectively.");
    TVM_ATTR_FIELD(out_layout).set_default("")
        .describe("Dimension ordering of output. Can be 'NCHW', 'NHWC', etc."
                  " 'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
                  " dimensions respectively. Default to be same as input layout.");

    // use 0 bits to indicate none.
    TVM_ATTR_FIELD(out_dtype)
        .set_default(NullValue<DataType>())
        .describe("Output data type, set to explicit type under mixed precision setting");

    TVM_ATTR_FIELD(input_zero_point)
        .describe("The zero point of the input tensor.");
    TVM_ATTR_FIELD(kernel_zero_point)
        .describe("The zero point of the kernel tensor.");
    TVM_ATTR_FIELD(output_zero_point)
        .describe("The zero point of the output tensor.");
    TVM_ATTR_FIELD(input_scale)
        .describe("The scale of the input tensor.");
    TVM_ATTR_FIELD(kernel_scale)
        .describe("The scale of the kernel tensor.");
    TVM_ATTR_FIELD(output_scale)
        .describe("The scale of the output tensor.");
  }
};

} // namespace relay
} // namespace tvm
#endif // TVM_RELAY_ATTRS_NN_QUANTIZE_H_
1 change: 1 addition & 0 deletions python/tvm/relay/op/nn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,5 @@
"""Neural network related operators."""
from __future__ import absolute_import as _abs
from .nn import *
from . import _quantize
from . import _nn
20 changes: 20 additions & 0 deletions python/tvm/relay/op/nn/_make_quantize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Constructor APIs"""
from ...._ffi.function import _init_api

_init_api("relay.op.nn._quantize._make", __name__)
133 changes: 133 additions & 0 deletions python/tvm/relay/op/nn/_quantize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#pylint: disable=invalid-name, too-many-lines
"""Neural network operations."""
from __future__ import absolute_import as _abs
from . import _make_quantize


def quantized_conv2d(quantized_data,
                     quantized_weight,
                     input_zero_point,
                     kernel_zero_point,
                     output_zero_point,
                     input_scale,
                     kernel_scale,
                     output_scale,
                     strides=(1, 1),
                     padding=(0, 0),
                     dilation=(1, 1),
                     groups=1,
                     channels=None,
                     kernel_size=None,
                     data_layout="NCHW",
                     kernel_layout="OIHW",
                     out_layout="",
                     out_dtype=""):
    r"""Quantized 2D convolution.

    This operator takes the quantized_weight as the convolution kernel
    and convolves it with quantized_data to produce an output.
    In the default case, where the data_layout is `NCHW`
    and kernel_layout is `OIHW`, conv2d takes in
    a quantized_data Tensor with shape `(batch_size, in_channels, height, width)`,
    and a quantized_weight Tensor with shape
    `(channels, in_channels, kernel_size[0], kernel_size[1])`
    to produce an output Tensor with the following rule:

    .. math::

        \mbox{out}[b, c, y, x] = \sum_{dy, dx, k}
           \mbox{quantized_data}[b, k, \mbox{strides}[0] * y + dy,
           \mbox{strides}[1] * x + dx] *
           \mbox{quantized_weight}[c, k, dy, dx]

    Padding and dilation are applied to quantized_data and quantized_weight
    respectively before the computation.
    This operator accepts quantized_data layout specification.
    Semantically, the operator will convert the layout to the canonical layout
    (`NCHW` for quantized_data and `OIHW` for quantized_weight), perform the
    computation, then convert to the out_layout.

    Parameters
    ----------
    quantized_data : tvm.relay.Expr
        The input quantized_data to the operator.

    quantized_weight : tvm.relay.Expr
        The quantized_weight expressions.

    input_zero_point: int
        The zero point of the quantized_data distribution.

    kernel_zero_point: int
        The zero point of the quantized_kernel distribution.

    output_zero_point: int
        The zero point of the quantized_output distribution.

    input_scale: float
        The float scalar to scale the quantized_data int8 values back to FP32.

    kernel_scale: float
        The float scalar to scale the quantized_kernel int8 values back to FP32.

    output_scale: float
        The float scalar to scale the quantized_output int8 values back to FP32.

    strides : tuple of int, optional
        The strides of convolution.

    padding : tuple of int, optional
        The padding of convolution on both sides of inputs before convolution.

    dilation : tuple of int, optional
        Specifies the dilation rate to be used for dilated convolution.

    groups : int, optional
        Number of groups for grouped convolution.

    channels : int, optional
        Number of output channels of this convolution.

    kernel_size : tuple of int, optional
        The spatial of the convolution kernel.

    data_layout : str, optional
        Layout of the input.

    kernel_layout : str, optional
        Layout of the quantized_weight.

    out_layout : str, optional
        Layout of the output, by default, out_layout is the same as data_layout

    out_dtype : str, optional
        Specifies the output quantized_data type for mixed precision conv2d.

    Returns
    -------
    result : tvm.relay.Expr
        The computed result.
    """
    # Thin wrapper: all validation and type inference happen on the C++ side.
    return _make_quantize.quantized_conv2d(quantized_data, quantized_weight,
                                           input_zero_point, kernel_zero_point,
                                           output_zero_point,
                                           input_scale, kernel_scale,
                                           output_scale,
                                           strides, padding, dilation,
                                           groups, channels, kernel_size,
                                           data_layout, kernel_layout,
                                           out_layout, out_dtype)
1 change: 1 addition & 0 deletions python/tvm/relay/quantize/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@
from __future__ import absolute_import as _abs

from .quantize import *
from .quantize_rewrite import *
from ._annotate import register_annotate_function
38 changes: 38 additions & 0 deletions python/tvm/relay/quantize/quantize_rewrite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#pylint: disable=unused-argument
"""Automatic quantization toolkit."""
from __future__ import absolute_import

from . import _quantize
from .. import expr as _expr

def quantize_rewrite(expr):
    """Rewrites the high-level quantized ops into low-level existing Relay ops.

    Parameters
    ----------
    expr : tvm.relay.Expr
        The input expression.

    Returns
    -------
    expr : tvm.relay.Expr
        The output expression.
    """
    # Delegate to the C++ pass registered via FFI in the _quantize namespace.
    return _quantize.quantize_rewrite(expr)
Loading

0 comments on commit 788b20c

Please sign in to comment.