Skip to content

Commit bed4cb4

Browse files
authored
Add support for dynamic quant in GPTQ (#80)
Summary: These changes were missed in the beginning; add them back. Test Plan: There is some problem with testing the GPTQ quantizer locally, so this will be tested in ExecuTorch instead. Reviewers: Subscribers: Tasks: Tags:
1 parent f44089a commit bed4cb4

File tree

2 files changed

+14
-1
lines changed

2 files changed

+14
-1
lines changed

torchao/quantization/GPTQ.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,7 @@ def configure_quantization_mode(
348348
combine_qparams_list_func,
349349
make_names_and_values_dict_func,
350350
skip_layer_func,
351+
dyn_quant_func = None,
351352
):
352353
# these functions need to already be curried with all inputs other than weight, qparams
353354

@@ -371,6 +372,8 @@ def configure_quantization_mode(
371372
# `make_names_and_values_dict_func`.
372373
self.make_names_and_values_dict_func = make_names_and_values_dict_func # accepts [2d quantized tensor], [qparams], returns a dict of names, values to put in state_dict
373374
# note any final packing for storage should happen here
375+
376+
self.dyn_quant_func = dyn_quant_func
374377
return self
375378

376379
def run(self):
@@ -451,6 +454,8 @@ def tensors_to_cuda(args):
451454
quantize_linear
452455
): # calculate H instead of output (will run the linear eventually with updated weight)
453456
x = cur_args[0].float()
457+
if self.dyn_quant_func is not None:
458+
x = self.dyn_quant_func(x)
454459
shape = x.shape
455460
n = 1 if len(shape) == 2 else shape[0]
456461
H *= total_batches / (total_batches + n)

torchao/quantization/quant_api.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,13 @@ class GPTQQuantizer(Quantizer):
283283
Returns:
284284
weight: A 2d weight tensor with non-integer dtype.
285285
286+
dyn_quant_func (optional):
287+
A function that dynamically quantizes inputs
288+
Args:
289+
input: input Tensor in f32/bf16/f16
290+
Returns:
291+
output: dynamically quantized and dequantized Tensor (with the same dtype as input)
292+
286293
combine_qparams_list_func:
287294
A function that combines several qparams into one qparam.
288295
Args:
@@ -397,6 +404,7 @@ def _create_quantized_state_dict(
397404
self.combine_qparams_list_func, # pyre-ignore[16]
398405
self.make_names_and_values_dict_func, # pyre-ignore[16]
399406
self.skip_layer_func, # pyre-ignore[16]
407+
self.dyn_quant_func if hasattr(self, "dyn_quant_func") else None, # pyre-ignore[16]
400408
)
401409
print("Applying GPTQ to weights")
402410
GPTQ_runner.run()
@@ -747,7 +755,7 @@ def __init__(
747755

748756
self.precision = precision
749757

750-
self.dyn_quant_func = lambda x: per_token_dynamic_quant(x)
758+
self.dyn_quant_func = per_token_dynamic_quant
751759
n_bit = 4
752760

753761
self.get_qparams_func = lambda w: get_group_qparams_symmetric(

0 commit comments

Comments
 (0)