8 changes: 4 additions & 4 deletions README.md
@@ -74,7 +74,7 @@ pip install torchao
Quantize your model weights to int4!
```
from torchao.quantization import Int4WeightOnlyConfig, quantize_
quantize_(model, Int4WeightOnlyConfig(group_size=32))
quantize_(model, Int4WeightOnlyConfig(group_size=32, version=1))
```
Compared to a `torch.compiled` bf16 baseline, your quantized model should be significantly smaller and faster on a single A100 GPU:
```
@@ -102,7 +102,7 @@ pip install torchao
```
# Nightly
pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126

# Different CUDA versions
pip install torchao --index-url https://download.pytorch.org/whl/cu126 # CUDA 12.6
pip install torchao --index-url https://download.pytorch.org/whl/cpu # CPU only
@@ -144,7 +144,7 @@ Quantize any model with `nn.Linear` layers in just one line (Option 1), or load

```python
from torchao.quantization.quant_api import quantize_, Int4WeightOnlyConfig
quantize_(model, Int4WeightOnlyConfig(group_size=128, use_hqq=True))
quantize_(model, Int4WeightOnlyConfig(group_size=128, use_hqq=True, version=1))
```
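
For context, here is a minimal end-to-end sketch of Option 1 on a toy module; the layer sizes, batch shape, and the optional `torch.compile` step are illustrative assumptions rather than part of the official example:

```python
# Illustrative sketch only: toy module, shapes, and compile step are assumptions.
import torch
import torch.nn as nn

from torchao.quantization.quant_api import Int4WeightOnlyConfig, quantize_

model = nn.Sequential(
    nn.Linear(1024, 1024, bias=False),
    nn.ReLU(),
    nn.Linear(1024, 1024, bias=False),
).to(torch.bfloat16).cuda()

# Quantize the nn.Linear weights in place to int4 (weight-only).
quantize_(model, Int4WeightOnlyConfig(group_size=128, use_hqq=True, version=1))

# The quantized module is used exactly like the original one.
model = torch.compile(model)  # optional
x = torch.randn(16, 1024, dtype=torch.bfloat16, device="cuda")
with torch.no_grad():
    y = model(x)
```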

#### Option 2: HuggingFace Integration
@@ -154,7 +154,7 @@ from transformers import TorchAoConfig, AutoModelForCausalLM
from torchao.quantization.quant_api import Int4WeightOnlyConfig

# Create quantization configuration
quantization_config = TorchAoConfig(quant_type=Int4WeightOnlyConfig(group_size=128, use_hqq=True))
quantization_config = TorchAoConfig(quant_type=Int4WeightOnlyConfig(group_size=128, use_hqq=True, version=1))

# Load and automatically quantize
quantized_model = AutoModelForCausalLM.from_pretrained(
2 changes: 1 addition & 1 deletion benchmarks/benchmark_aq.py
@@ -197,7 +197,7 @@ def _bench_quantized_tensor_subclass_perf(api, ref_api, M, N, K, kwargs=None):
)

print("_int4wo_api")
kwargs = {"groupsize": 32}
kwargs = {"groupsize": 32, "version": 1}

for M, N, K in all_shapes:
_bench_quantized_tensor_subclass_perf(
3 changes: 2 additions & 1 deletion benchmarks/microbenchmarks/test/test_benchmark_inference.py
@@ -58,7 +58,8 @@ def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config):

# Test with semi-sparse config
mock_string_to_config.return_value = Int4WeightOnlyConfig(
layout=MarlinSparseLayout()
layout=MarlinSparseLayout(),
version=1,
)
config = BenchmarkConfig(
quantization="marlin",
4 changes: 2 additions & 2 deletions benchmarks/microbenchmarks/utils.py
@@ -206,7 +206,7 @@ def string_to_config(
128,
256,
], f"int4wo group_size needs to be one of [32,64,128,256] but got {group_size}"
return Int4WeightOnlyConfig(group_size=group_size, use_hqq=use_hqq)
return Int4WeightOnlyConfig(group_size=group_size, use_hqq=use_hqq, version=1)
elif "int8adq-int4w-symm" in quantization:
from torchao.dtypes import CutlassInt4PackedLayout

@@ -229,7 +229,7 @@
elif sparsity is not None and ("semi" in sparsity or "2:4" in sparsity):
from torchao.dtypes import MarlinSparseLayout

return Int4WeightOnlyConfig(layout=MarlinSparseLayout())
return Int4WeightOnlyConfig(layout=MarlinSparseLayout(), version=1)
if "fp6" in quantization:
return FPXWeightOnlyConfig(3, 2)
elif "uintx" in quantization:
2 changes: 1 addition & 1 deletion docs/source/quick_start.rst
@@ -57,7 +57,7 @@ for efficient mixed dtype matrix multiplication:

# torch 2.4+ only
from torchao.quantization import Int4WeightOnlyConfig, quantize_
quantize_(model, Int4WeightOnlyConfig(group_size=32))
quantize_(model, Int4WeightOnlyConfig(group_size=32, version=1))

The quantized model is now ready to use! Note that the quantization
logic is inserted through tensor subclasses, so there is no change
10 changes: 4 additions & 6 deletions docs/source/serialization.rst
@@ -7,7 +7,7 @@ Serialization and deserialization flow
======================================

Here is the serialization and deserialization flow::

import copy
import tempfile
import torch
@@ -36,7 +36,7 @@ Here is the serialization and deserialization flow::
print(f"original model size: {get_model_size_in_bytes(m) / 1024 / 1024} MB")

example_inputs = m.example_inputs(dtype=dtype, device="cuda")
quantize_(m, Int4WeightOnlyConfig())
quantize_(m, Int4WeightOnlyConfig(version=1))
print(f"quantized model size: {get_model_size_in_bytes(m) / 1024 / 1024} MB")

ref = m(*example_inputs)
@@ -62,7 +62,7 @@ What happens when serializing an optimized model?
To serialize an optimized model, we just need to call ``torch.save(m.state_dict(), f)``, because in torchao we use tensor subclasses to represent different dtypes and to support different optimization techniques like quantization and sparsity. So after optimization, the only thing that changes is that the weight Tensor is replaced by an optimized weight Tensor; the model structure is not changed at all. For example:

original floating point model ``state_dict``::

{"linear1.weight": float_weight1, "linear2.weight": float_weight2}

quantized model ``state_dict``::
@@ -75,7 +75,7 @@ The size of the quantized model is typically going to be smaller to the original
original model size: 4.0 MB
quantized model size: 1.0625 MB
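
As a minimal sketch (the toy two-layer model and the temporary file are assumptions, chosen so the sizes roughly line up with the numbers above), the size check and the save step look like this:

```python
# Sketch: quantize a toy model, compare sizes, and save the state_dict.
# The model definition and temp file are illustrative assumptions.
import tempfile

import torch
import torch.nn as nn

from torchao.quantization import Int4WeightOnlyConfig, quantize_
from torchao.utils import get_model_size_in_bytes

m = nn.Sequential(
    nn.Linear(1024, 1024, bias=False),
    nn.Linear(1024, 1024, bias=False),
).to(torch.bfloat16).cuda()
print(f"original model size: {get_model_size_in_bytes(m) / 1024 / 1024} MB")

quantize_(m, Int4WeightOnlyConfig(version=1))
print(f"quantized model size: {get_model_size_in_bytes(m) / 1024 / 1024} MB")

# Serialization is just torch.save on the state_dict; only the weight
# tensors changed type, the module structure is untouched.
with tempfile.NamedTemporaryFile() as f:
    torch.save(m.state_dict(), f)
    f.seek(0)
    state_dict = torch.load(f, weights_only=False)  # trusted local file
```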


What happens when deserializing an optimized model?
===================================================
To deserialize an optimized model, we can initialize the floating point model in `meta <https://pytorch.org/docs/stable/meta.html>`__ device and then load the optimized ``state_dict`` with ``assign=True`` using `model.load_state_dict <https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.load_state_dict>`__::
@@ -97,5 +97,3 @@ We can also verify that the weight is properly loaded by checking the type of we

type of weight before loading: (<class 'torch.Tensor'>, <class 'torch.Tensor'>)
type of weight after loading: (<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>, <class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>)
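
And a condensed sketch of that meta-device load, continuing from the ``state_dict`` saved above (the toy module mirrors the one used for saving):

```python
# Sketch: rebuild the same architecture on the meta device (no memory is
# allocated for weights), then load the quantized state_dict with assign=True
# so the tensor-subclass weights are assigned directly to the parameters.
import torch
import torch.nn as nn

with torch.device("meta"):
    m_loaded = nn.Sequential(
        nn.Linear(1024, 1024, bias=False),
        nn.Linear(1024, 1024, bias=False),
    ).to(torch.bfloat16)

print("before:", type(m_loaded[0].weight), type(m_loaded[1].weight))
m_loaded.load_state_dict(state_dict, assign=True)
print("after: ", type(m_loaded[0].weight), type(m_loaded[1].weight))
```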


5 changes: 3 additions & 2 deletions docs/source/torchao_vllm_integration.md
@@ -45,6 +45,7 @@ from torchao.quantization import Int4WeightOnlyConfig
config = Int4WeightOnlyConfig(
group_size=128,
use_hqq=True,
version=1,
)
assert isinstance(config, AOBaseConfig)
```
@@ -65,7 +66,7 @@ config = ModuleFqnToConfig({
"model.layers.0.self_attn.q_proj": Int4WeightOnlyConfig(group_size=64),
"model.layers.0.self_attn.k_proj": Int4WeightOnlyConfig(group_size=64),
"model.layers.0.mlp.gate_proj": Int8WeightOnlyConfig(),
"_default": Int4WeightOnlyConfig(group_size=128) # Default for other modules
"_default": Int4WeightOnlyConfig(group_size=128, version=1) # Default for other modules
})
```
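
As a usage sketch (hedged: the toy module and its FQNs are made up, and this assumes ``ModuleFqnToConfig`` is applied through ``quantize_`` like any other ``AOBaseConfig``):

```python
# Hypothetical sketch: per-module-FQN quantization on a toy block.
import torch
import torch.nn as nn

from torchao.quantization import (
    Int4WeightOnlyConfig,
    Int8WeightOnlyConfig,
    ModuleFqnToConfig,
    quantize_,
)

class TinyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.q_proj = nn.Linear(1024, 1024, bias=False)
        self.gate_proj = nn.Linear(1024, 4096, bias=False)
        self.down_proj = nn.Linear(4096, 1024, bias=False)

    def forward(self, x):
        return self.down_proj(torch.relu(self.gate_proj(self.q_proj(x))))

model = TinyBlock().to(torch.bfloat16).cuda()
config = ModuleFqnToConfig({
    "q_proj": Int4WeightOnlyConfig(group_size=64, version=1),
    "gate_proj": Int8WeightOnlyConfig(),
    "_default": Int4WeightOnlyConfig(group_size=128, version=1),  # everything else
})
quantize_(model, config)
```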

@@ -81,7 +82,7 @@ from torchao.quantization import Int4WeightOnlyConfig

# Create quantization configuration
quantization_config = TorchAoConfig(
quant_type=Int4WeightOnlyConfig(group_size=128, use_hqq=True)
quant_type=Int4WeightOnlyConfig(group_size=128, use_hqq=True, version=1)
)

# Load and automatically quantize the model
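
For completeness, a sketch of the load step (the checkpoint id, dtype, and device_map are illustrative assumptions, not prescribed by torchao):

```python
# Illustrative only: checkpoint id and loading kwargs are assumptions.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

from torchao.quantization import Int4WeightOnlyConfig

quantization_config = TorchAoConfig(
    quant_type=Int4WeightOnlyConfig(group_size=128, use_hqq=True, version=1)
)

model_id = "facebook/opt-125m"  # hypothetical checkpoint, for illustration
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
```
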
2 changes: 1 addition & 1 deletion scripts/quick_start.py
@@ -39,7 +39,7 @@ def forward(self, x):
# ========================

# torch 2.4+ only
quantize_(model, Int4WeightOnlyConfig(group_size=32))
quantize_(model, Int4WeightOnlyConfig(group_size=32, version=1))


# =============
17 changes: 9 additions & 8 deletions test/dtypes/test_affine_quantized.py
@@ -66,22 +66,23 @@ def get_quantization_functions(
if do_int4:
if check_cpu_version(device):
base_functions.append(
int4_weight_only(group_size=32, layout=Int4CPULayout())
int4_weight_only(group_size=32, layout=Int4CPULayout(), version=1)
)
elif check_xpu_version(device):
base_functions.append(
int4_weight_only(group_size=32, layout=Int4XPULayout())
int4_weight_only(group_size=32, layout=Int4XPULayout(), version=1)
)
if int4_zp_int:
base_functions.append(
int4_weight_only(
group_size=32,
layout=Int4XPULayout(),
zero_point_domain=ZeroPointDomain.INT,
version=1,
)
)
else:
base_functions.append(int4_weight_only(group_size=32))
base_functions.append(int4_weight_only(group_size=32, version=1))
if device == "cuda" and not is_ROCM():
base_functions.append(
int8_dynamic_activation_int4_weight(
@@ -118,7 +119,7 @@ def test_tensor_core_layout_transpose(self):
linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
t = linear.weight
shape = t.shape
apply_int4_weight_only_quant = int4_weight_only(group_size=32)
apply_int4_weight_only_quant = int4_weight_only(group_size=32, version=1)
quantize_(linear, apply_int4_weight_only_quant)
ql = linear
aqt = ql.weight
@@ -353,7 +354,7 @@ def test_slice_int4wo(self, device, dtype):
# out_feature not divisible by 8
# to test slice + padding for int4 weight only quantization
dummy = nn.Linear(256, 321, dtype=dtype, device=device)
quantize_(dummy, Int4WeightOnlyConfig())
quantize_(dummy, Int4WeightOnlyConfig(version=1))
# make sure these run without error
_ = dummy.weight.narrow(0, 0, 64)
_ = dummy.weight.narrow(1, 0, 128)
@@ -467,7 +468,7 @@ def test_slice_and_copy_int4wo(self, device, dtype):
l.weight = torch.nn.Parameter(
torch.zeros(1024, 1024, dtype=torch.bfloat16, device="cuda")
)
quantize_(l, Int4WeightOnlyConfig())
quantize_(l, Int4WeightOnlyConfig(version=1))
param = l.weight
param_data = param.data
param_data = param_data.narrow(0, 0, 512)
@@ -483,7 +484,7 @@ def test_slice_and_copy_int4wo(self, device, dtype):

# dummy_l has random input (shouldn't be 0)
dummy_l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
quantize_(dummy_l, Int4WeightOnlyConfig())
quantize_(dummy_l, Int4WeightOnlyConfig(version=1))
quantized = dummy_l.weight
quantized = quantized.narrow(0, 0, 512)

@@ -502,7 +503,7 @@ def test_mm_int4wo(self, device, dtype):

l = torch.nn.Linear(512, 1024).to(device).to(dtype)
l.weight = torch.nn.Parameter(weight)
quantize_(l, Int4WeightOnlyConfig())
quantize_(l, Int4WeightOnlyConfig(version=1))
# weight shape: 1024 x 512
weight = l.weight

2 changes: 1 addition & 1 deletion test/hqq/test_hqq_affine.py
@@ -55,7 +55,7 @@ def _eval_hqq(dtype):
)
dummy_linear.weight.data = W
if dtype == torch.uint4:
config = int4_weight_only(group_size=max(block_size), use_hqq=True)
config = int4_weight_only(group_size=max(block_size), use_hqq=True, version=1)
else:
config = uintx_weight_only(dtype, group_size=max(block_size), use_hqq=True)
quantize_(dummy_linear, config)
14 changes: 10 additions & 4 deletions test/integration/test_integration.py
@@ -135,17 +135,23 @@ def _int4wo_api(mod, use_hqq=False):
quantize_(
mod,
int4_weight_only(
layout=Int4CPULayout(), use_hqq=use_hqq, set_inductor_config=False
layout=Int4CPULayout(),
use_hqq=use_hqq,
set_inductor_config=False,
version=1,
),
)
unwrap_tensor_subclass(mod)
elif check_xpu_version(next(mod.parameters()).device):
quantize_(
mod, int4_weight_only(layout=Int4XPULayout()), set_inductor_config=False
mod,
int4_weight_only(
layout=Int4XPULayout(), set_inductor_config=False, version=1
),
)
unwrap_tensor_subclass(mod)
else:
quantize_(mod, int4_weight_only(set_inductor_config=False))
quantize_(mod, int4_weight_only(set_inductor_config=False, version=1))


def _int8da_int4w_api(mod):
@@ -1077,7 +1083,7 @@ def test_int4_weight_only_quant_subclass_api_grouped(self, device, dtype):
):
for groupsize in [64, 32]:
for layout in layout_list:
kwargs = {"groupsize": groupsize, "layout": layout}
kwargs = {"groupsize": groupsize, "layout": layout, "version": 1}

def api(mod):
kwargs_copy = kwargs.copy()
4 changes: 2 additions & 2 deletions test/prototype/test_parq.py
@@ -211,7 +211,7 @@ def test_int4_weight_only(self, group_size: int = 32):
model.reset_parameters()

m_ref = copy.deepcopy(model).eval().to(_DEVICE)
config = int4_weight_only(group_size=group_size)
config = int4_weight_only(group_size=group_size, version=1)
if check_cpu_version(_DEVICE):
config.layout = Int4CPULayout()
quantize_(m_ref, config)
@@ -244,7 +244,7 @@ def test_int4_weight_only_e2e(self, group_size: int = 32):
model.reset_parameters()

m_ref = copy.deepcopy(model).eval().to(_DEVICE)
config = int4_weight_only(group_size=group_size)
config = int4_weight_only(group_size=group_size, version=1)
if check_cpu_version(_DEVICE):
config.layout = Int4CPULayout()
quantize_(m_ref, config)
8 changes: 7 additions & 1 deletion test/quantization/test_gptq.py
@@ -1,3 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

import unittest
from pathlib import Path

@@ -173,7 +179,7 @@ def test_gptq_with_input_recorder(self):

model2 = copy.deepcopy(model)
out = model(*test_input)
quantize_(model2, Int4WeightOnlyConfig())
quantize_(model2, Int4WeightOnlyConfig(version=1))

outq = model2(*test_input)
del model2
11 changes: 9 additions & 2 deletions test/quantization/test_moe_quant.py
@@ -1,3 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

import unittest

import pytest
@@ -114,7 +120,8 @@ def test_int4wo_fake_dim(self, name, num_tokens, fullgraph):
self.skipTest("Need CUDA available")

config = MoEQuantConfig(
Int4WeightOnlyConfig(), use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE
Int4WeightOnlyConfig(version=1),
use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE,
)
tensor_impl_class = TensorCoreTiledAQTTensorImpl

@@ -137,7 +144,7 @@ def test_int4wo_base(self, name, num_tokens, fullgraph):
if not is_sm_at_least_90():
self.skipTest("Requires CUDA capability >= 9.0")

config = MoEQuantConfig(Int4WeightOnlyConfig())
config = MoEQuantConfig(Int4WeightOnlyConfig(version=1))
tensor_impl_class = TensorCoreTiledAQTTensorImpl

self._test_impl_moe_quant(