Skip to content

Commit d75f450

Browse files
authored
Check dequantize_affine is idempotent (#309)
1 parent 88daa1a commit d75f450

File tree

2 files changed

+34
-16
lines changed

2 files changed

+34
-16
lines changed

test/quantization/test_quant_primitives.py

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,24 @@
2525
_SEED = 1234
2626
torch.manual_seed(_SEED)
2727

28+
# Helper to verify a function is idempotent: calling it a second time
# with the same arguments must produce the same result (i.e. the call
# has no side effects that alter its own output).
# NOTE:
# - Does not verify the args and kwargs are unchanged.
# - Assumes the output is a single Tensor
def check_idempotent(self, fn, *args, **kwargs):
    """Run ``fn`` twice with identical arguments and assert both calls
    return equal Tensors.

    Args:
        self: a ``unittest.TestCase`` instance (used for its assert helpers).
        fn: the callable under test.
        *args, **kwargs: forwarded verbatim to ``fn`` on both calls.

    Returns:
        The Tensor produced by the second invocation of ``fn``.
    """
    output0 = fn(*args, **kwargs)
    assert torch.is_tensor(output0)
    output1 = fn(*args, **kwargs)
    # Guard the second output as well: without this, a non-Tensor result
    # from the repeated call would surface as an opaque TypeError inside
    # torch.equal rather than a clear assertion failure.
    assert torch.is_tensor(output1)
    self.assertTrue(torch.equal(output0, output1), f"Expected given function {fn} to be idempotent.")
    return output1
40+
41+
2842
class TestQuantPrimitives(unittest.TestCase):
2943
SEED = 123
3044

31-
@unittest.skipIf(not TORCH_VERSION_AFTER_2_3, "skipping when torch verion is 2.3 or lower")
45+
@unittest.skipIf(not TORCH_VERSION_AFTER_2_3, "skipping when torch version is 2.3 or lower")
3246
def test_get_group_qparams_symmetric(self):
3347
"""
3448
Test that `get_group_qparams_symmetric` produces the exact same scales as
@@ -77,7 +91,7 @@ def test_choose_qparams_group_sym(self):
7791
self.assertTrue(torch.equal(scale, scale_ref))
7892
self.assertTrue(torch.equal(zero_point, zp_ref))
7993

80-
@unittest.skipIf(not TORCH_VERSION_AFTER_2_3, "skipping when torch verion is 2.3 or lower")
94+
@unittest.skipIf(not TORCH_VERSION_AFTER_2_3, "skipping when torch version is 2.3 or lower")
8195
def test_choose_qparams_token_asym(self):
8296
input = torch.randn(10, 10)
8397
mapping_type = MappingType.ASYMMETRIC
@@ -127,7 +141,7 @@ def test_choose_qparams_tensor_sym(self):
127141
self.assertTrue(torch.equal(scale, scale_ref))
128142
self.assertTrue(torch.equal(zero_point, zp_ref))
129143

130-
@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch verion is 2.4 or lower")
144+
@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch version is 2.4 or lower")
131145
def test_quantize_activation_per_token_abs_max(self):
132146
from torchao.quantization.quant_primitives import quantize_activation_per_token_absmax
133147
input = torch.randn(10, 10)
@@ -148,15 +162,15 @@ def test_quantize_activation_per_token_abs_max(self):
148162
self.assertTrue(torch.equal(quantized, quantized_ref))
149163
self.assertTrue(torch.equal(scale, scale_ref))
150164

151-
@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch verion is 2.4 or lower")
165+
@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch version is 2.4 or lower")
152166
def test_quantize_activation_per_token_abs_max_zero_input(self):
153167
from torchao.quantization.quant_primitives import quantize_activation_per_token_absmax
154168
input = torch.zeros(10, 10)
155169
# make sure it still works
156170
quantized_ref, scale_ref = quantize_activation_per_token_absmax(input)
157171

158172

159-
@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch verion is 2.4 or lower")
173+
@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch version is 2.4 or lower")
160174
def test_quantize_activation_per_token_abs_max_dtype(self):
161175
from torchao.quantization.quant_primitives import quantize_activation_per_token_absmax
162176
input = torch.zeros(10, 10, dtype=torch.bfloat16)
@@ -172,7 +186,7 @@ def test_quantize_activation_per_token_abs_max_dtype(self):
172186
self.assertTrue(scale_ref.dtype, torch.float32)
173187

174188

175-
@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch verion is 2.4 or lower")
189+
@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch version is 2.4 or lower")
176190
def test_quantize_dequantize_group_sym(self):
177191
input = torch.randn(10, 10)
178192
mapping_type = MappingType.SYMMETRIC
@@ -181,7 +195,7 @@ def test_quantize_dequantize_group_sym(self):
181195
scale, zero_point = choose_qparams_affine(input, mapping_type, block_size, dtype, eps=torch.finfo(torch.float32).eps)
182196

183197
quantized = quantize_affine(input, block_size, scale, zero_point, dtype)
184-
dequantized = dequantize_affine(quantized, block_size, scale, zero_point, dtype, output_dtype=torch.float32)
198+
dequantized = check_idempotent(self, dequantize_affine, quantized, block_size, scale, zero_point, dtype, output_dtype=torch.float32)
185199

186200
group_size = 2
187201
quant_min = -128
@@ -196,7 +210,7 @@ def test_quantize_dequantize_group_sym(self):
196210
self.assertTrue(torch.equal(quantized, quantized_ref))
197211
self.assertTrue(torch.equal(dequantized, dequantized_ref))
198212

199-
@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch verion is 2.4 or lower")
213+
@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch version is 2.4 or lower")
200214
def test_quantize_dequantize_channel_asym(self):
201215
input = torch.randn(10, 10)
202216
mapping_type = MappingType.ASYMMETRIC
@@ -205,7 +219,7 @@ def test_quantize_dequantize_channel_asym(self):
205219
scale, zero_point = choose_qparams_affine(input, mapping_type, block_size, dtype, eps=torch.finfo(torch.float32).eps)
206220
output_dtype = torch.float32
207221
quantized = quantize_affine(input, block_size, scale, zero_point, dtype)
208-
dequantized = dequantize_affine(quantized, block_size, scale, zero_point, dtype, output_dtype=output_dtype)
222+
dequantized = check_idempotent(self, dequantize_affine, quantized, block_size, scale, zero_point, dtype, output_dtype=output_dtype)
209223

210224
axis = 1
211225
quant_min = -128
@@ -219,7 +233,7 @@ def test_quantize_dequantize_channel_asym(self):
219233
self.assertTrue(torch.equal(quantized, quantized_ref))
220234
self.assertTrue(torch.equal(dequantized, dequantized_ref))
221235

222-
@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch verion is 2.4 or lower")
236+
@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch version is 2.4 or lower")
223237
def test_quantize_dequantize_tensor_asym(self):
224238
input = torch.randn(10, 10)
225239
mapping_type = MappingType.ASYMMETRIC
@@ -228,7 +242,7 @@ def test_quantize_dequantize_tensor_asym(self):
228242
output_dtype = torch.float32
229243
scale, zero_point = choose_qparams_affine(input, mapping_type, block_size, dtype, eps=torch.finfo(torch.float32).eps)
230244
quantized = quantize_affine(input, block_size, scale, zero_point, dtype)
231-
dequantized = dequantize_affine(quantized, block_size, scale, zero_point, dtype, output_dtype=output_dtype)
245+
dequantized = check_idempotent(self, dequantize_affine, quantized, block_size, scale, zero_point, dtype, output_dtype=output_dtype)
232246

233247
axis = 1
234248
quant_min = -128
@@ -242,15 +256,15 @@ def test_quantize_dequantize_tensor_asym(self):
242256
self.assertTrue(torch.equal(quantized, quantized_ref))
243257
self.assertTrue(torch.equal(dequantized, dequantized_ref))
244258

245-
@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch verion is 2.4 or lower")
259+
@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch version is 2.4 or lower")
246260
def test_quantize_dequantize_channel_asym_4d(self):
247261
input = torch.randn(3, 3, 10, 10)
248262
mapping_type = MappingType.ASYMMETRIC
249263
dtype = torch.int8
250264
block_size = (3, 3, 1, 10)
251265
scale, zero_point = choose_qparams_affine(input, mapping_type, block_size, dtype, eps=torch.finfo(torch.float32).eps)
252266
quantized = quantize_affine(input, block_size, scale, zero_point, dtype)
253-
dequantized = dequantize_affine(quantized, block_size, scale, zero_point, dtype, output_dtype=torch.float32)
267+
dequantized = check_idempotent(self, dequantize_affine, quantized, block_size, scale, zero_point, dtype, output_dtype=torch.float32)
254268

255269
axis = 2
256270
quant_min = -128
@@ -264,15 +278,15 @@ def test_quantize_dequantize_channel_asym_4d(self):
264278
self.assertTrue(torch.equal(quantized, quantized_ref))
265279
self.assertTrue(torch.equal(dequantized, dequantized_ref))
266280

267-
@unittest.skipIf(not TORCH_VERSION_AFTER_2_3, "skipping when torch verion is 2.3 or lower")
281+
@unittest.skipIf(not TORCH_VERSION_AFTER_2_3, "skipping when torch version is 2.3 or lower")
268282
def test_quantize_dequantize_channel_asym_4d_multi_dim_reduction(self):
269283
input = torch.randn(3, 3, 10, 10)
270284
mapping_type = MappingType.ASYMMETRIC
271285
dtype = torch.int8
272286
block_size = (3, 3, 2, 2)
273287
scale, zero_point = choose_qparams_affine(input, mapping_type, block_size, dtype, eps=torch.finfo(torch.float32).eps)
274288
quantized = quantize_affine(input, block_size, scale, zero_point, dtype)
275-
dequantized = dequantize_affine(quantized, block_size, scale, zero_point, dtype, output_dtype=torch.float32)
289+
dequantized = check_idempotent(self, dequantize_affine, quantized, block_size, scale, zero_point, dtype, output_dtype=torch.float32)
276290
# we don't have corresponding ops in existing primitives, so just make sure it runs and it's close to float
277291
torch.testing.assert_close(dequantized, input, rtol=2, atol=0.02)
278292

torchao/quantization/quant_primitives.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,7 @@ def dequantize_affine(
249249
Output:
250250
dequantized Tensor, with requested dtype or fp32
251251
"""
252+
252253
# TODO: validations
253254
# TODO: validate scale/zero_point dimensions are compatible with block_size
254255
assert input.dtype == input_dtype
@@ -266,14 +267,17 @@ def dequantize_affine(
266267
zero_point = zero_point.view(shape_after_reduction)
267268

268269
if zero_point_domain == ZeroPointDomain.INT:
269-
dequant = input.to(torch.int32)
270+
# Force a copy to avoid input modification due
271+
# to upcoming in-place operations.
272+
dequant = input.to(torch.int32, copy=True)
270273
if zero_point is not None:
271274
dequant -= zero_point.to(torch.int32)
272275
dequant = dequant.to(output_dtype)
273276
dequant *= scale
274277
else:
275278
assert zero_point_domain == ZeroPointDomain.FLOAT, f"Unexpected zero point domain: {zero_point_domain}"
276279
mid_point = (quant_max + quant_min + 1) / 2
280+
# This should allocate new memory and avoid input modification
277281
dequant = input - mid_point
278282
dequant = dequant.to(output_dtype)
279283
dequant *= scale

0 commit comments

Comments
 (0)