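Deduplicate _choose_qparams_per_token_asymmetric: remove the local copy in qat.py and call torch.ops.quantized_decomposed._choose_qparams_per_token_asymmetric_impl instead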

andrewor14 · andrewor14 · commit bf5c81681103 · 2024-05-09T08:47:09.000-07:00
diff --git a/test/quantization/test_qat.py b/test/quantization/test_qat.py
@@ -13,7 +13,6 @@
 import torch
 from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401
 from torchao.quantization.prototype.qat import (
-    _choose_qparams_per_token_asymmetric,
     fake_quantize_per_channel_group,
     fake_quantize_per_token,
 )
@@ -91,8 +90,7 @@ def test_fake_quantize_per_token(self):
         torch.manual_seed(self.SEED)
         x = torch.randn(100, 256).requires_grad_()
         x2 = copy.deepcopy(x)
-        # TODO: use torch.ops.aten.quantized_decomposed version instead
-        (s, zp) = _choose_qparams_per_token_asymmetric(
+        (s, zp) = torch.ops.quantized_decomposed._choose_qparams_per_token_asymmetric_impl(
             x,
             torch.int8,  # not used
         )
diff --git a/torchao/quantization/prototype/qat.py b/torchao/quantization/prototype/qat.py
@@ -142,7 +142,10 @@ def disable_fake_quant(self):
         def forward(self, x: torch.Tensor) -> torch.Tensor:
             # activations: int8 dynamic asymmetric quant
             if self._fake_quant_enabled:
-                (act_scales, act_zp) =_choose_qparams_per_token_asymmetric(
+                (
+                    act_scales,
+                    act_zp
+                ) = torch.ops.quantized_decomposed._choose_qparams_per_token_asymmetric_impl(
                     x, torch.int8,  # dtype not used
                 )
                 (act_qmin, act_qmax) = self._get_qmin_qmax(8)
@@ -269,49 +272,3 @@ def fake_quantize_per_token(
     return _GenericFakeQuantize.apply(
         input, scales, zero_points, quant_min, quant_max,
     )
-
-# TODO: This is copied from torch/ao/quantization/fx/_decomposed.py.
-# The version in pytorch does not have backward support yet so we add
-# it here for now until https://github.com/pytorch/pytorch/pull/123452
-# is landed.
-def _choose_qparams_per_token_asymmetric(
-    input: torch.Tensor,
-    dtype: torch.dtype,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    """Choose quantization parameters for per token quantization. This means for a N dimension Tensor
-    (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize
-    every N elements with the same quantization parameter. The dimension for scales/zero_points
-    will be (M1 * M2 ... * Mn)
-
-    Args:
-       input (torch.Tensor): original float32/float16 Tensor
-       dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor
-
-    Returns:
-        scales and zero_points, both float32 Tensors
-    """
-    # Based on https://github.com/google/XNNPACK/blob/df156f0cf3db5a4576cc711123eeb54915f82ffc/src/xnnpack/quantization.h#L18
-    qmin, qmax = -128, 127
-    min_val = torch.amin(input, dim=-1, keepdim=True)
-    max_val = torch.amax(input, dim=-1, keepdim=True)
-    min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
-    max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
-    eps = torch.finfo(torch.float32).eps  # use xnnpack eps?
-
-    # scale
-    scale = (max_val_pos - min_val_neg) / float(qmax - qmin)
-    scale = scale.clamp(min=eps)
-
-    # zero point
-    descaled_min = min_val_neg / scale
-    descaled_max = max_val_pos / scale
-    zero_point_from_min_error = qmin + descaled_min
-    zero_point_from_max_error = qmax + descaled_max
-    zero_point = torch.where(
-        zero_point_from_min_error + zero_point_from_max_error > 0,
-        qmin - descaled_min,
-        qmax - descaled_max,
-    )
-    zero_point = torch.clamp(zero_point, qmin, qmax).round()
-
-    return scale.to(torch.float32), zero_point.to(torch.float32)