Skip to content

Commit

Permalink
[pt][quant] qmul and qadd should preserve input memory format (pytorch#34834)
Browse files Browse the repository at this point in the history

Summary:
Pull Request resolved: pytorch#34834

They should keep the activations in the channels-last memory format, i.e., the same memory format as the input tensors to these operations.

### Before
```
 -------------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
Name                       Self CPU total %  Self CPU total   CPU total %      CPU total        CPU time avg     Number of Calls
-------------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
quantize_per_tensor        0.06%            129.181us        0.06%            129.181us        129.181us        1
quantized::conv2d          21.74%           47.744ms         21.74%           47.744ms         408.067us        117
quantized::add_scalar      16.36%           35.930ms         16.36%           35.930ms         520.726us        69
quantized::relu6           0.69%            1.515ms          0.69%            1.515ms          21.959us         69
quantized::mul_scalar      6.08%            13.364ms         6.08%            13.364ms         193.676us        69
quantized::mul             53.17%           116.781ms        53.17%           116.781ms        1.269ms          92
adaptive_avg_pool2d        0.02%            42.700us         1.61%            3.527ms          146.948us        24
_adaptive_avg_pool2d       1.59%            3.484ms          1.59%            3.484ms          145.169us        24
sigmoid                    0.08%            173.702us        0.08%            173.702us        7.552us          23
quantized::add             0.20%            445.648us        0.20%            445.648us        27.853us         16
dropout                    0.00%            2.598us          0.00%            2.598us          2.598us          1
view                       0.00%            10.311us         0.00%            10.311us         10.311us         1
dequantize                 0.00%            4.645us          0.00%            4.645us          4.645us          1
-------------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
Self CPU time total: 219.627ms
```

### After
```
  -------------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
Name                       Self CPU total %  Self CPU total   CPU total %      CPU total        CPU time avg     Number of Calls
-------------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
quantize_per_tensor        0.12%            155.807us        0.12%            155.807us        155.807us        1
quantized::conv2d          25.50%           31.981ms         25.50%           31.981ms         273.343us        117
quantized::add_scalar      44.53%           55.840ms         44.53%           55.840ms         809.281us        69
quantized::relu6           1.25%            1.570ms          1.25%            1.570ms          22.749us         69
quantized::mul_scalar      10.73%           13.449ms         10.73%           13.449ms         194.914us        69
quantized::mul             16.67%           20.904ms         16.67%           20.904ms         227.220us        92
adaptive_avg_pool2d        0.03%            41.713us         0.69%            862.922us        35.955us         24
_adaptive_avg_pool2d       0.65%            821.209us        0.65%            821.209us        34.217us         24
sigmoid                    0.15%            182.344us        0.15%            182.344us        7.928us          23
quantized::add             0.34%            431.939us        0.34%            431.939us        26.996us         16
dropout                    0.00%            1.936us          0.00%            1.936us          1.936us          1
view                       0.01%            10.281us         0.01%            10.281us         10.281us         1
dequantize                 0.00%            4.562us          0.00%            4.562us          4.562us          1
-------------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
Self CPU time total: 125.394ms

```
ghstack-source-id: 100305788

Test Plan: buck test //caffe2/test:quantized

Differential Revision: D20473713

fbshipit-source-id: c878fbb8f5a1a33f0cdac2657cc61e97ceb6c183
  • Loading branch information
dskhudia authored and facebook-github-bot committed Mar 19, 2020
1 parent 6d48871 commit 7335f07
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 9 deletions.
8 changes: 4 additions & 4 deletions aten/src/ATen/native/quantized/cpu/qadd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ Tensor _add_scalar_out(Tensor& out, const Tensor& self, Scalar other) {
// Let s' = the calculated scale or the output
// z' = the calculated zero-point for the output
//
// If q_min > c_q
// If q_min > z - c_q
// s' = [(q_max - (z - c_q)]/[q_max - q_min] * s
// z' = q_min
// Xq' = torch.quantize_linear(Xq.dequantize() + c_q.dequantize() , s', z')
Expand Down Expand Up @@ -89,15 +89,15 @@ Tensor _add_scalar_out(Tensor& out, const Tensor& self, Scalar other) {
if (ReLUFused) {
dequantized_add.relu_();
}
out = at::quantize_per_tensor(dequantized_add, s_prime, z_prime, self.scalar_type());
out.copy_(at::quantize_per_tensor(dequantized_add, s_prime, z_prime, self.scalar_type()));
} else if (q_max < z - c_q) {
s_prime = ((double)(z - c_q) - q_min) / ((double)q_max - q_min) * s;
z_prime = q_max;
auto dequantized_add = self.dequantize() + c_q * s;
if (ReLUFused) {
dequantized_add.relu_();
}
out = at::quantize_per_tensor(dequantized_add, s_prime, z_prime, self.scalar_type());
out.copy_(at::quantize_per_tensor(dequantized_add, s_prime, z_prime, self.scalar_type()));
} else {
s_prime = s;
z_prime = z - c_q;
Expand Down Expand Up @@ -226,7 +226,7 @@ class QAddScalar final : public c10::OperatorKernel {
TORCH_CHECK(qa.qscheme() == kPerTensorAffine ||
qa.qscheme() == kPerTensorSymmetric,
"Only per tensor quantization is suuported in Add.");
auto qc = at::empty_like(qa, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto qc = at::empty_like(qa, qa.suggest_memory_format());
return _add_scalar_out<ReLUFused>(qc, qa, b);
}
};
Expand Down
13 changes: 8 additions & 5 deletions aten/src/ATen/native/quantized/cpu/qmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,14 @@ Tensor _mul_scalar_out(Tensor& out, const Tensor& self, Scalar other) {
template <bool ReLUFused = false>
class QMul final : public c10::OperatorKernel {
public:
Tensor operator()(Tensor qa, Tensor qb,
double scale, int64_t zero_point) {
Tensor operator()(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
check_inputs(qa, qb);
auto qc = at::_empty_affine_quantized(qa.sizes(),
at::device(kCPU).dtype(qa.scalar_type()), scale, zero_point);
auto qc = at::_empty_affine_quantized(
qa.sizes(),
at::device(kCPU).dtype(qa.scalar_type()),
scale,
zero_point,
qa.suggest_memory_format());
return _mul_out<ReLUFused>(qc, qa, qb);
}
};
Expand All @@ -128,7 +131,7 @@ class QMulScalar final : public c10::OperatorKernel {
TORCH_CHECK(qa.qscheme() == kPerTensorAffine ||
qa.qscheme() == kPerTensorSymmetric,
"Only per tensor quantization is suuported in Mul.");
auto qc = at::empty_like(qa, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
auto qc = at::empty_like(qa, qa.suggest_memory_format());
return _mul_scalar_out<ReLUFused>(qc, qa, b);
}
};
Expand Down

0 comments on commit 7335f07

Please sign in to comment.