[pt][quant] Parallelize quantize and dequantize (pytorch#33765)
Summary:
Pull Request resolved: pytorch#33765

The quantize and dequantize methods now use multiple threads, leveraging shz0116's recent parallelization of the quantize/dequantize routines in FBGEMM.

Fixes:
pytorch#32006
pytorch/FBGEMM#142

Alternative to pytorch#30153

```
#!/usr/bin/env python

import time
import torch
import torch.nn as nn
torch.set_num_threads(4)
# print(torch.__config__.parallel_info())

W = torch.rand(1, 54, 54, 256)

NITER = 1000
s = time.time()
for i in range(NITER):
    W_q = torch.quantize_per_tensor(W, scale=1.0, zero_point=0, dtype=torch.quint8)
time_per_iter = (time.time() - s) / NITER

print('quantize time per iter ms', time_per_iter * 1000)

s = time.time()
for i in range(NITER):
    W_deq = W_q.dequantize()
time_per_iter = (time.time() - s) / NITER

print('dequantize time per iter ms', time_per_iter * 1000)
```

### With 1 thread
quantize time per iter ms 0.22633790969848633
dequantize time per iter ms 0.6573665142059326

### With 4 threads
quantize time per iter ms 0.0905618667602539
dequantize time per iter ms 0.19511842727661133
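
At 4 threads this is roughly a 2.5x speedup for quantize and 3.4x for dequantize, consistent with `at::parallel_for` splitting the work into `at::get_num_threads()` tasks as shown in the diff below.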
ghstack-source-id: 98935895

Test Plan: python test/test_quantized.py

Reviewed By: jspark1105

Differential Revision: D20098521

fbshipit-source-id: bd8c45761b4651fcd5b20b95759e3868a136c048
dskhudia authored and facebook-github-bot committed on Feb 26, 2020 · commit a8e7ed4, parent 2eb95d8

aten/src/ATen/quantized/Quantizer.cpp — 25 additions, 8 deletions
```diff
@@ -1,4 +1,5 @@
 #include <ATen/ATen.h>
+#include <ATen/Parallel.h>
 #include <ATen/quantized/Quantizer.h>
 #include <c10/core/Allocator.h>
 #include <c10/core/CPUAllocator.h>
@@ -126,10 +127,18 @@ Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, double scale, int64_t zero_point)
   qparams.scale = scale;
   qparams.zero_point = zero_point;
   qparams.precision = CHAR_BIT * sizeof(typename T::underlying);
-  fbgemm::Quantize<typename T::underlying>(/*src=*/rd,
-                                           /*dst=*/qd,
-                                           /*len=*/rtensor.numel(),
-                                           /*qparams=*/qparams);
+  int num_tasks = at::get_num_threads();
+  at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) {
+    for (int task_id = begin; task_id < end; ++task_id) {
+      fbgemm::Quantize<typename T::underlying>(
+          /*src=*/rd,
+          /*dst=*/qd,
+          /*len=*/rtensor.numel(),
+          /*qparams=*/qparams,
+          /*thread_id=*/task_id,
+          /*num_threads=*/num_tasks);
+    }
+  });
   return qtensor;
 }
@@ -153,10 +162,18 @@ Tensor dequantize_tensor(Tensor qtensor, Tensor rtensor, double scale, int64_t zero_point)
   qparams.zero_point = zero_point;
   qparams.precision = CHAR_BIT * sizeof(typename T::underlying);
   float* rd = rtensor.data_ptr<float>();
-  fbgemm::Dequantize<typename T::underlying>(/*src=*/qd,
-                                             /*dst=*/rd,
-                                             /*len=*/qtensor.numel(),
-                                             /*qparams=*/qparams);
+  int num_tasks = at::get_num_threads();
+  at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) {
+    for (int task_id = begin; task_id < end; ++task_id) {
+      fbgemm::Dequantize<typename T::underlying>(
+          /*src=*/qd,
+          /*dst=*/rd,
+          /*len=*/qtensor.numel(),
+          /*qparams=*/qparams,
+          /*thread_id=*/task_id,
+          /*num_threads=*/num_tasks);
+    }
+  });
   return rtensor;
 }
 #else // USE_FBGEMM
```
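
For readers outside the PyTorch tree, here is a minimal standalone sketch of the task-partitioning pattern the diff applies. `at::parallel_for` and `at::get_num_threads` are real ATen APIs; `quantize_chunk` is a hypothetical stand-in for `fbgemm::Quantize`, which selects its shard internally from `(thread_id, num_threads)`.

```
// Minimal sketch of the parallelization pattern above (assumes an ATen build).
// quantize_chunk is hypothetical; it stands in for fbgemm::Quantize, which
// derives its shard from (thread_id, num_threads).
#include <ATen/Parallel.h>
#include <algorithm>
#include <cstdint>

// Hypothetical per-task kernel: process this task's shard of `len` elements.
void quantize_chunk(const float* src, uint8_t* dst, int64_t len,
                    int thread_id, int num_threads) {
  const int64_t chunk = (len + num_threads - 1) / num_threads;
  const int64_t begin = std::min(thread_id * chunk, len);
  const int64_t end = std::min(begin + chunk, len);
  for (int64_t i = begin; i < end; ++i) {
    // Stand-in for the real scale/zero_point math in fbgemm::Quantize.
    dst[i] = static_cast<uint8_t>(src[i]);
  }
}

void parallel_quantize(const float* src, uint8_t* dst, int64_t len) {
  const int num_tasks = at::get_num_threads();
  // Grain size 1 over [0, num_tasks): each task index maps to one shard.
  at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) {
    for (int task_id = begin; task_id < end; ++task_id) {
      quantize_chunk(src, dst, len, static_cast<int>(task_id), num_tasks);
    }
  });
}
```

Iterating over task indices rather than element ranges means the kernel, not `parallel_for`, controls the element split — matching how the diff hands `thread_id`/`num_threads` straight to FBGEMM.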
