Skip to content

Commit

Permalink
[pt][quant] qmul and qadd should preserve input memory format (pytorch#34834)
Browse files Browse the repository at this point in the history

Summary:
Pull Request resolved: pytorch#34834

They should keep the activations in the channels-last memory format, i.e., the same memory format as the input tensors to these operations.

### Before
```
 -------------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
Name                       Self CPU total %  Self CPU total   CPU total %      CPU total        CPU time avg     Number of Calls
-------------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
quantize_per_tensor        0.06%            129.181us        0.06%            129.181us        129.181us        1
quantized::conv2d          21.74%           47.744ms         21.74%           47.744ms         408.067us        117
quantized::add_scalar      16.36%           35.930ms         16.36%           35.930ms         520.726us        69
quantized::relu6           0.69%            1.515ms          0.69%            1.515ms          21.959us         69
quantized::mul_scalar      6.08%            13.364ms         6.08%            13.364ms         193.676us        69
quantized::mul             53.17%           116.781ms        53.17%           116.781ms        1.269ms          92
adaptive_avg_pool2d        0.02%            42.700us         1.61%            3.527ms          146.948us        24
_adaptive_avg_pool2d       1.59%            3.484ms          1.59%            3.484ms          145.169us        24
sigmoid                    0.08%            173.702us        0.08%            173.702us        7.552us          23
quantized::add             0.20%            445.648us        0.20%            445.648us        27.853us         16
dropout                    0.00%            2.598us          0.00%            2.598us          2.598us          1
view                       0.00%            10.311us         0.00%            10.311us         10.311us         1
dequantize                 0.00%            4.645us          0.00%            4.645us          4.645us          1
-------------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
Self CPU time total: 219.627ms
```

### After
```
  -------------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
Name                       Self CPU total %  Self CPU total   CPU total %      CPU total        CPU time avg     Number of Calls
-------------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
quantize_per_tensor        0.12%            155.807us        0.12%            155.807us        155.807us        1
quantized::conv2d          25.50%           31.981ms         25.50%           31.981ms         273.343us        117
quantized::add_scalar      44.53%           55.840ms         44.53%           55.840ms         809.281us        69
quantized::relu6           1.25%            1.570ms          1.25%            1.570ms          22.749us         69
quantized::mul_scalar      10.73%           13.449ms         10.73%           13.449ms         194.914us        69
quantized::mul             16.67%           20.904ms         16.67%           20.904ms         227.220us        92
adaptive_avg_pool2d        0.03%            41.713us         0.69%            862.922us        35.955us         24
_adaptive_avg_pool2d       0.65%            821.209us        0.65%            821.209us        34.217us         24
sigmoid                    0.15%            182.344us        0.15%            182.344us        7.928us          23
quantized::add             0.34%            431.939us        0.34%            431.939us        26.996us         16
dropout                    0.00%            1.936us          0.00%            1.936us          1.936us          1
view                       0.01%            10.281us         0.01%            10.281us         10.281us         1
dequantize                 0.00%            4.562us          0.00%            4.562us          4.562us          1
-------------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
Self CPU time total: 125.394ms

```
ghstack-source-id: 100305788

Test Plan: buck test //caffe2/test:quantized

Differential Revision: D20473713

fbshipit-source-id: c878fbb8f5a1a33f0cdac2657cc61e97ceb6c183
  • Loading branch information
dskhudia authored and facebook-github-bot committed Mar 19, 2020
1 parent 6d48871 commit 7335f07
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 9 deletions.
8 changes: 4 additions & 4 deletions aten/src/ATen/native/quantized/cpu/qadd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ Tensor _add_scalar_out(Tensor& out, const Tensor& self, Scalar other) {
// Let s' = the calculated scale or the output
// z' = the calculated zero-point for the output
//
// If q_min > c_q
// If q_min > z - c_q
// s' = [(q_max - (z - c_q)]/[q_max - q_min] * s
// z' = q_min
// Xq' = torch.quantize_linear(Xq.dequantize() + c_q.dequantize() , s', z')
Expand Down Expand Up @@ -89,15 +89,15 @@ Tensor _add_scalar_out(Tensor& out, const Tensor& self, Scalar other) {
if (ReLUFused) {
dequantized_add.relu_();
}
out = at::quantize_per_tensor(dequantized_add, s_prime, z_prime, self.scalar_type());
out.copy_(at::quantize_per_tensor(dequantized_add, s_prime, z_prime, self.scalar_type()));
} else if (q_max < z - c_q) {
s_prime = ((double)(z - c_q) - q_min) / ((double)q_max - q_min) * s;
z_prime = q_max;
auto dequantized_add = self.dequantize() + c_q * s;
if (ReLUFused) {
dequantized_add.relu_();
}
out = at::quantize_per_tensor(dequantized_add, s_prime, z_prime, self.scalar_type());
out.copy_(at::quantize_per_tensor(dequantized_add, s_prime, z_prime, self.scalar_type()));
} else {
s_prime = s;
z_prime = z - c_q;
Expand Down Expand Up @@ -226,7 +226,7 @@ class QAddScalar final : public c10::OperatorKernel {
TORCH_CHECK(qa.qscheme() == kPerTensorAffine ||
qa.qscheme() == kPerTensorSymmetric,
"Only per tensor quantization is suuported in Add.");
auto qc = at::empty_like(qa, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto qc = at::empty_like(qa, qa.suggest_memory_format());
return _add_scalar_out<ReLUFused>(qc, qa, b);
}
};
Expand Down
13 changes: 8 additions & 5 deletions aten/src/ATen/native/quantized/cpu/qmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,14 @@ Tensor _mul_scalar_out(Tensor& out, const Tensor& self, Scalar other) {
template <bool ReLUFused = false>
class QMul final : public c10::OperatorKernel {
public:
Tensor operator()(Tensor qa, Tensor qb,
double scale, int64_t zero_point) {
Tensor operator()(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
check_inputs(qa, qb);
auto qc = at::_empty_affine_quantized(qa.sizes(),
at::device(kCPU).dtype(qa.scalar_type()), scale, zero_point);
auto qc = at::_empty_affine_quantized(
qa.sizes(),
at::device(kCPU).dtype(qa.scalar_type()),
scale,
zero_point,
qa.suggest_memory_format());
return _mul_out<ReLUFused>(qc, qa, qb);
}
};
Expand All @@ -128,7 +131,7 @@ class QMulScalar final : public c10::OperatorKernel {
TORCH_CHECK(qa.qscheme() == kPerTensorAffine ||
qa.qscheme() == kPerTensorSymmetric,
"Only per tensor quantization is suuported in Mul.");
auto qc = at::empty_like(qa, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto qc = at::empty_like(qa, qa.suggest_memory_format());
return _mul_scalar_out<ReLUFused>(qc, qa, b);
}
};
Expand Down

0 comments on commit 7335f07

Please sign in to comment.