simplify conv dnnlowp ops by not allowing fp32 in/out (pytorch#15758)
Summary:
Pull Request resolved: pytorch#15758

DNNLOWP Conv operators became very complex due to the many options they supported. This diff simplifies them by no longer allowing fp32 inputs/outputs. This is acceptable for Conv operators because they are usually used deep inside networks, where quantizing and dequantizing through separate operators adds little overhead.
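To illustrate the resulting usage pattern: a DNNLOWP Conv now consumes and produces quantized tensors only, and any fp32 boundary is handled by explicit Quantize/Dequantize operators. The sketch below is not taken from this diff; the blob names (`X`, `W`, `b`, `Y`), shapes, and attributes are illustrative, and it assumes a Caffe2 build with the DNNLOWP/FBGEMM operators registered.

```python
# Minimal sketch: explicit Quantize/Dequantize around a quantized-only Conv.
# Blob names and shapes are assumptions, not code from this diff.
import numpy as np

from caffe2.python import core, workspace

net = core.Net("dnnlowp_conv_example")
net.Proto().op.extend([
    # fp32 -> uint8 at the input boundary (a separate operator, not a Conv option)
    core.CreateOperator("Quantize", ["X"], ["X_q"], engine="DNNLOWP"),
    # the Conv itself only sees quantized activations
    core.CreateOperator(
        "Conv", ["X_q", "W", "b"], ["Y_q"],
        kernel=3, pad=1, stride=1, order="NHWC", engine="DNNLOWP_ACC16",
    ),
    # uint8 -> fp32 at the output boundary
    core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine="DNNLOWP"),
])

workspace.FeedBlob("X", np.random.rand(1, 8, 8, 4).astype(np.float32))  # NHWC
workspace.FeedBlob("W", np.random.rand(4, 3, 3, 4).astype(np.float32))
workspace.FeedBlob("b", np.random.rand(4).astype(np.float32))
workspace.RunNetOnce(net)
print(workspace.FetchBlob("Y").shape)  # (1, 8, 8, 4)
```

In a chain of quantized Convs, the Quantize/Dequantize pair appears only once at each fp32 boundary, so the conversion cost is paid at the ends of the chain rather than per operator.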

Reviewed By: csummersea

Differential Revision: D13587341

fbshipit-source-id: e88c919dae79d1c5b7d787ea539edf5bcb064afc
jspark1105 authored and facebook-github-bot committed Jan 7, 2019
1 parent 49ba2cb commit bc328d0
Showing 10 changed files with 338 additions and 729 deletions.
266 changes: 61 additions & 205 deletions caffe2/quantization/server/conv_dnnlowp_acc16_op.cc

Large diffs are not rendered by default.

12 changes: 3 additions & 9 deletions caffe2/quantization/server/conv_dnnlowp_acc16_op.h
```diff
@@ -18,7 +18,6 @@ class ConvDNNLowPAcc16Op final : public ConvDNNLowPOp<std::uint8_t, ReluFused> {
   using BaseType = ConvDNNLowPOp<std::uint8_t, ReluFused>;
   using BaseType::BIAS;
   using BaseType::col_buffer_;
-  using BaseType::dequantize_output_;
   using BaseType::FILTER;
   using BaseType::in_qparams_;
   using BaseType::INPUT;
@@ -36,15 +35,10 @@ class ConvDNNLowPAcc16Op final : public ConvDNNLowPOp<std::uint8_t, ReluFused> {

   bool GetQuantizationParameters_();

-  template <typename InType>
-  bool RunOnDeviceWithOrderNCHWAndType_();
-  template <typename InType>
-  bool RunOnDeviceWithOrderNHWCAndType_();
-
-  template <typename PackAMatrix, fbgemm::QuantizationGranularity Q_GRAN>
+  template <fbgemm::QuantizationGranularity Q_GRAN>
   void DispatchFBGEMM_(
-      PackAMatrix& packA,
-      const std::uint8_t* col_buffer_quantized_data,
+      fbgemm::PackAWithRowOffset<std::uint8_t, std::int16_t>& packA,
+      const std::uint8_t* col_buffer_data,
       vector<std::int32_t>* Y_int32,
       uint8_t* Y_uint8_data);
```
18 changes: 4 additions & 14 deletions caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py
```diff
@@ -31,8 +31,6 @@ class DNNLowPOpConvAcc16OpTest(hu.HypothesisTestCase):
         output_channels_per_group=st.integers(2, 16),
         batch_size=st.integers(1, 3),
         order=st.sampled_from(["NCHW", "NHWC"]),
-        in_quantized=st.booleans(),
-        out_quantized=st.booleans(),
         weight_quantized=st.booleans(),
         share_col_buffer=st.booleans(),
         preserve_activation_sparsity=st.booleans(),
@@ -51,8 +49,6 @@ def test_dnnlowp_conv_acc16_int(
         output_channels_per_group,
         batch_size,
         order,
-        in_quantized,
-        out_quantized,
         weight_quantized,
         share_col_buffer,
         preserve_activation_sparsity,
@@ -121,8 +117,8 @@ def test_dnnlowp_conv_acc16_int(
         for op_type, engine in op_engine_list:
             net = core.Net("test_net")

-            do_quantize = "DNNLOWP" in engine and in_quantized
-            do_dequantize = "DNNLOWP" in engine and out_quantized
+            do_quantize = "DNNLOWP" in engine
+            do_dequantize = "DNNLOWP" in engine
             do_quantize_weight = (
                 "DNNLOWP" in engine and weight_quantized and len(outputs) > 0
             )
```
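These flags are consumed further down in the test body (largely outside the hunks shown here) to gate the Quantize/Dequantize operators around the conv. A rough, self-contained sketch of that pattern, with assumed blob names:

```python
from caffe2.python import core

engine = "DNNLOWP_ACC16"  # assumed; the test iterates over several engines
net = core.Net("test_net")

# After this change both flags reduce to the same engine check, so
# DNNLOWP engines are always bracketed by Quantize/Dequantize ops.
do_quantize = "DNNLOWP" in engine
do_dequantize = "DNNLOWP" in engine

if do_quantize:
    net.Proto().op.extend(
        [core.CreateOperator("Quantize", ["X"], ["X_q"], engine=engine)]
    )

# ... the Conv operator is created here with quantized in/out blobs ...

if do_dequantize:
    net.Proto().op.extend(
        [core.CreateOperator("Dequantize", ["Y_q"], ["Y"], engine=engine)]
    )
```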
```diff
@@ -166,7 +162,6 @@ def test_dnnlowp_conv_acc16_int(
                 dilation=dilation,
                 pad=pad,
                 order=order,
-                dequantize_output=not do_dequantize,
                 shared_buffer=(1 if share_col_buffer else 0),
                 preserve_activation_sparsity=preserve_activation_sparsity,
                 preserve_weight_sparsity=preserve_weight_sparsity,
@@ -210,8 +205,6 @@ def test_dnnlowp_conv_acc16_int(
         output_channels_per_group=st.integers(2, 16),
         batch_size=st.integers(1, 3),
         order=st.sampled_from(["NHWC"]),
-        in_quantized=st.booleans(),
-        out_quantized=st.booleans(),
         weight_quantized=st.booleans(),
         prepack_weight=st.booleans(),
         nbits_in_non_outlier=st.sampled_from((0, 1, 6, 8)),
@@ -232,8 +225,6 @@ def test_dnnlowp_conv_acc16_outlier(
         output_channels_per_group,
         batch_size,
         order,
-        in_quantized,
-        out_quantized,
         weight_quantized,
         prepack_weight,
         nbits_in_non_outlier,
@@ -295,8 +286,8 @@ def test_dnnlowp_conv_acc16_outlier(
             init_net = core.Net("test_init_net")
             net = core.Net("test_net")

-            do_quantize = "DNNLOWP" in engine and in_quantized
-            do_dequantize = "DNNLOWP" in engine and out_quantized
+            do_quantize = "DNNLOWP" in engine
+            do_dequantize = "DNNLOWP" in engine
             do_quantize_weight = "DNNLOWP" in engine and weight_quantized
             do_prepack_weight = "DNNLOWP" in engine and prepack_weight
@@ -357,7 +348,6 @@ def test_dnnlowp_conv_acc16_outlier(
                 dilation=dilation,
                 pad=pad,
                 order=order,
-                dequantize_output=not do_dequantize,
                 nbits_in_non_outlier=nbits_in_non_outlier,
                 shared_buffer=(1 if share_col_buffer else 0),
                 preserve_activation_sparsity=preserve_activation_sparsity,
```