Commit 7f16377

remove the set_inductor_config argument of quantize_.
Summary:

Test Plan:

```
pytest test/quantization/test_quant_api.py -s -x -k test_workflow_e2e_numerics
```

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: e6694cc
ghstack-comment-id: 2712016215
Pull Request resolved: #1865
1 parent 53be2a4 commit 7f16377
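
This commit drops the `set_inductor_config` keyword from `quantize_` and moves it onto each workflow config, so callers update as below. A minimal before/after sketch, assuming a torchao build that includes this change; the toy model is illustrative only:

```python
import torch.nn as nn

from torchao.quantization import int8_weight_only, quantize_

# Toy model, illustrative only.
model = nn.Sequential(nn.Linear(32, 32))

# Before this commit, the flag was a keyword argument of quantize_ itself:
# quantize_(model, int8_weight_only(), set_inductor_config=False)

# After this commit, the flag lives on each workflow config:
quantize_(model, int8_weight_only(set_inductor_config=False))
```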

File tree: 6 files changed, +70 -31 lines changed

test/integration/test_integration.py

Lines changed: 8 additions & 7 deletions

```diff
@@ -113,7 +113,7 @@

 def _int8wo_api(mod):
     if TORCH_VERSION_AT_LEAST_2_4:
-        quantize_(mod, int8_weight_only(), set_inductor_config=False)
+        quantize_(mod, int8_weight_only(set_inductor_config=False))
         if not TORCH_VERSION_AT_LEAST_2_5 or (
             not TORCH_VERSION_AT_LEAST_2_6 and torch._inductor.config.freezing
         ):
@@ -124,7 +124,7 @@ def _int8wo_api(mod):

 def _int8wo_groupwise_api(mod):
     group_size = 32
-    quantize_(mod, int8_weight_only(group_size=group_size), set_inductor_config=False)
+    quantize_(mod, int8_weight_only(group_size=group_size, set_inductor_config=False))


 def _int8da_int8w_api(
@@ -136,8 +136,8 @@ def _int8da_int8w_api(
             mod,
             int8_dynamic_activation_int8_weight(
                 act_mapping_type=act_mapping_type,
+                set_inductor_config=False,
             ),
-            set_inductor_config=False,
         )
         if not TORCH_VERSION_AT_LEAST_2_5:
             unwrap_tensor_subclass(mod)
@@ -152,20 +152,21 @@ def _int4wo_api(mod, use_hqq=False):
     ):
         quantize_(
             mod,
-            int4_weight_only(layout=Int4CPULayout(), use_hqq=use_hqq),
-            set_inductor_config=False,
+            int4_weight_only(
+                layout=Int4CPULayout(), use_hqq=use_hqq, set_inductor_config=False
+            ),
         )
         unwrap_tensor_subclass(mod)
     elif TORCH_VERSION_AT_LEAST_2_4:
-        quantize_(mod, int4_weight_only(), set_inductor_config=False)
+        quantize_(mod, int4_weight_only(set_inductor_config=False))
         if not TORCH_VERSION_AT_LEAST_2_5:
             unwrap_tensor_subclass(mod)
     else:
         change_linear_weights_to_int4_woqtensors(mod)


 def _int8da_int4w_api(mod):
-    quantize_(mod, int8_dynamic_activation_int4_weight(), set_inductor_config=False)
+    quantize_(mod, int8_dynamic_activation_int4_weight(set_inductor_config=False))
     if not TORCH_VERSION_AT_LEAST_2_5:
         unwrap_tensor_subclass(mod)

```

test/prototype/test_quantized_training.py

Lines changed: 5 additions & 10 deletions

```diff
@@ -46,7 +46,6 @@ def _reset():
     torch._dynamo.reset()


-# we always use `quantize_(set_inductor_config=False)` to reduce compile time in CI.
 class TestQuantizedTraining(TestCase):
     @parametrize("device", _DEVICES)
     def test_int8_stochastic_rounding(self, device):
@@ -81,7 +80,6 @@ def test_int8_weight_only_correctness(self, leading_dims, bias, device):
         quantize_(
             linear_int8,
             int8_weight_only_quantized_training(),
-            set_inductor_config=False,
         )
         linear_fp32.weight.data = linear_int8.weight.data.dequantize()

@@ -108,7 +106,6 @@ def test_int8_weight_only_compile(self, leading_dims, bias, device):
         quantize_(
             linear_eager,
             int8_weight_only_quantized_training(),
-            set_inductor_config=False,
         )
         linear_compiled = copy.deepcopy(linear_eager)
         linear_compiled.compile()
@@ -145,9 +142,7 @@ def test_int8_weight_only_training(self, compile, device):
             nn.Linear(embed_dim * 2, n_classes),
         ).to(device)
         model_int8 = copy.deepcopy(model_fp32)
-        quantize_(
-            model_int8, int8_weight_only_quantized_training(), set_inductor_config=False
-        )
+        quantize_(model_int8, int8_weight_only_quantized_training())

         if compile:
             model_fp32.compile()
@@ -195,7 +190,7 @@ def test_int8_mixed_precision_training(self, compile, config, module_swap):
         linear_int8mp = copy.deepcopy(linear)
         config.module_swap = module_swap
         apply_func = int8_mixed_precision_training(config)
-        quantize_(linear_int8mp, apply_func, set_inductor_config=False)
+        quantize_(linear_int8mp, apply_func)

         if compile:
             linear.compile()
@@ -255,7 +250,7 @@ def forward(self, x):
             nn.Linear(embed_dim, embed_dim),
         ).to(device)
         model = copy.deepcopy(model_ref)
-        quantize_(model, bitnet_training(), set_inductor_config=False)
+        quantize_(model, bitnet_training())

         # change model_ref to use BitLinear
         model_ref[0].__class__ = BitLinear
@@ -346,8 +341,8 @@ def _run_subtest(self, args):
         base_model = Transformer(model_args).cuda()
         fsdp_model = copy.deepcopy(base_model)

-        quantize_(base_model.layers, quantize_fn, set_inductor_config=False)
-        quantize_(fsdp_model.layers, quantize_fn, set_inductor_config=False)
+        quantize_(base_model.layers, quantize_fn)
+        quantize_(fsdp_model.layers, quantize_fn)

         for layer in fsdp_model.layers:
             fully_shard(layer, mp_policy=mp_policy)
```

torchao/prototype/awq/api.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -3,6 +3,7 @@

 import torch

+import torchao
 from torchao.core.config import AOBaseConfig
 from torchao.dtypes import (
     TensorCoreTiledLayout,
@@ -101,11 +102,13 @@ class AWQUIntXConfig(AOBaseConfig):
         quant_dtype: The data type of the quantized weights. Currently only torch.uint4 is intended to be used but can be used with torch.uint1 -> torch.uint8
         group_size: Quantization granularity. Use -1 for channel wise quantization
         weight_quant_fn: The quantization function to be used, which takes in the weight and returns the quantized weight. If None, then affine uint4 quantization is used
+        set_inductor_config: if True, adjusts `torchinductor` settings to recommended values.
     """

     quant_dtype: torch.dtype = torch.uint4
     group_size: int = 64
     use_hqq: bool = False
+    set_inductor_config: bool = True


 # for bc
@@ -120,6 +123,8 @@ def _awq_uintx_transform(
     quant_dtype = config.quant_dtype
     group_size = config.group_size
     use_hqq = config.use_hqq
+    if config.set_inductor_config:
+        torchao.quantization.utils.recommended_inductor_config_setter()
     observed_linear = module

     assert (
```
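
This file and the two prototype files below follow the same three-part pattern as this hunk: a new docstring entry, a new dataclass field defaulting to True, and a check at the top of the transform handler. A condensed sketch of that pattern; `MyConfig` and `_my_transform` are hypothetical stand-ins, not names from the diff:

```python
from dataclasses import dataclass

import torch
import torchao
from torchao.core.config import AOBaseConfig


@dataclass
class MyConfig(AOBaseConfig):
    """
    Args:
        set_inductor_config: if True, adjusts `torchinductor` settings to recommended values.
    """

    set_inductor_config: bool = True


def _my_transform(module: torch.nn.Module, config: MyConfig) -> torch.nn.Module:
    # After this commit the flag is consumed inside the per-workflow
    # handler rather than inside quantize_ itself.
    if config.set_inductor_config:
        torchao.quantization.utils.recommended_inductor_config_setter()
    # ... quantize `module` according to `config` ...
    return module
```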

torchao/prototype/quantization/mixed_precision/scripts/naive_intNwo.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -2,6 +2,7 @@

 import torch

+import torchao
 from torchao.core.config import AOBaseConfig
 from torchao.quantization.quant_primitives import (
     MappingType,
@@ -18,6 +19,7 @@ class IntNWeightOnlyConfig(AOBaseConfig):
     Args:
         `group_size`: parameter for quantization, controls the granularity of quantization, smaller size is more fine grained, choices are [512, 256, 128, 64, 32]
         `n`: number of bits to quantize to, choices are [8, 6, 5, 4, 3, 2]
+        `set_inductor_config`: if True, adjusts `torchinductor` settings to recommended values.
     Usage:
         from torchao.quantization import quantize_
         quantize_(model, intN_weight_only(n=your_bit_choice, group_size=group_size), optional_filter_func_for_desired_layers_to_quantize)
@@ -26,6 +28,7 @@ class IntNWeightOnlyConfig(AOBaseConfig):
     group_size: int = 32
     n: int = 8
     symmetric: bool = False
+    set_inductor_config: bool = True


 # for bc
@@ -41,6 +44,8 @@ def _intN_weight_only_transform(
     n = config.n
     symmetric = config.symmetric
     weight = module.weight
+    if config.set_inductor_config:
+        torchao.quantization.utils.recommended_inductor_config_setter()

     # for asymmetric quantization
     def apply_intN_weight_only_quant_asym(weight):
```

torchao/prototype/smoothquant/api.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -4,6 +4,7 @@

 import torch

+import torchao
 from torchao.core.config import AOBaseConfig
 from torchao.dtypes import to_affine_quantized_intx, to_affine_quantized_intx_static
 from torchao.prototype.smoothquant.core import (
@@ -158,11 +159,13 @@ class SmoothQuantConfig(AOBaseConfig):
         smoothing_factor: The smoothing factor for the layer. Acquired from the layer's observer if None.
         act_scales: The activation scales for the layer. Acquired from the layer's observer if None.
         wei_scales: The weight scales for the layer. Acquired from the layer's observer if None.
+        set_inductor_config: if True, adjusts `torchinductor` settings to recommended values.
     """

     smoothing_factor: Optional[torch.Tensor] = None
     act_scales: Optional[torch.Tensor] = None
     wei_scales: Optional[torch.Tensor] = None
+    set_inductor_config: bool = True


 @register_quantize_module_handler(SmoothQuantConfig)
@@ -173,6 +176,8 @@ def _smooth_quant_transform(
     smoothing_factor = config.smoothing_factor
     act_scales = config.act_scales
     wei_scales = config.wei_scales
+    if config.set_inductor_config:
+        torchao.quantization.utils.recommended_inductor_config_setter()
     observed_linear = module

     linear = torch.nn.Linear(
```
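
Callers that previously opted out at the quantize_ call site now do so on the config instance. A hedged usage sketch, assuming `SmoothQuantConfig` is imported from the module shown in the diff above:

```python
from torchao.prototype.smoothquant.api import SmoothQuantConfig

# The observer-derived arguments keep their None defaults from the diff;
# only the new flag is set here.
config = SmoothQuantConfig(set_inductor_config=False)
assert config.set_inductor_config is False

# Passing this config to quantize_ now skips
# recommended_inductor_config_setter() inside _smooth_quant_transform.
```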
