8 changes: 4 additions & 4 deletions README.md
@@ -74,7 +74,7 @@ pip install torchao
Quantize your model weights to int4!
```
from torchao.quantization import Int4WeightOnlyConfig, quantize_
quantize_(model, Int4WeightOnlyConfig(group_size=32))
quantize_(model, Int4WeightOnlyConfig(group_size=32, version=1))
```
Compared to a `torch.compiled` bf16 baseline, your quantized model should be significantly smaller and faster on a single A100 GPU:
```
@@ -102,7 +102,7 @@ pip install torchao
```
# Nightly
pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126

# Different CUDA versions
pip install torchao --index-url https://download.pytorch.org/whl/cu126 # CUDA 12.6
pip install torchao --index-url https://download.pytorch.org/whl/cpu # CPU only
@@ -144,7 +144,7 @@ Quantize any model with `nn.Linear` layers in just one line (Option 1), or load

```python
from torchao.quantization.quant_api import quantize_, Int4WeightOnlyConfig
quantize_(model, Int4WeightOnlyConfig(group_size=128, use_hqq=True))
quantize_(model, Int4WeightOnlyConfig(group_size=128, use_hqq=True, version=1))
```
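
For context, here is a minimal end-to-end sketch of Option 1 on a toy module; the layer sizes, batch shape, and the optional `torch.compile` step are illustrative assumptions rather than part of the official example:

```python
# Illustrative sketch only: toy module, shapes, and compile step are assumptions.
import torch
import torch.nn as nn

from torchao.quantization.quant_api import Int4WeightOnlyConfig, quantize_

model = nn.Sequential(
    nn.Linear(1024, 1024, bias=False),
    nn.ReLU(),
    nn.Linear(1024, 1024, bias=False),
).to(torch.bfloat16).cuda()

# Quantize the nn.Linear weights in place to int4 (weight-only).
quantize_(model, Int4WeightOnlyConfig(group_size=128, use_hqq=True, version=1))

# The quantized module is used exactly like the original one.
model = torch.compile(model)  # optional
x = torch.randn(16, 1024, dtype=torch.bfloat16, device="cuda")
with torch.no_grad():
    y = model(x)
```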

#### Option 2: HuggingFace Integration
@@ -154,7 +154,7 @@ from transformers import TorchAoConfig, AutoModelForCausalLM
from torchao.quantization.quant_api import Int4WeightOnlyConfig

# Create quantization configuration
quantization_config = TorchAoConfig(quant_type=Int4WeightOnlyConfig(group_size=128, use_hqq=True))
quantization_config = TorchAoConfig(quant_type=Int4WeightOnlyConfig(group_size=128, use_hqq=True, version=1))

# Load and automatically quantize
quantized_model = AutoModelForCausalLM.from_pretrained(
2 changes: 1 addition & 1 deletion benchmarks/benchmark_aq.py
@@ -197,7 +197,7 @@ def _bench_quantized_tensor_subclass_perf(api, ref_api, M, N, K, kwargs=None):
)

print("_int4wo_api")
kwargs = {"groupsize": 32}
kwargs = {"groupsize": 32, "version": 1}

for M, N, K in all_shapes:
_bench_quantized_tensor_subclass_perf(
3 changes: 2 additions & 1 deletion benchmarks/microbenchmarks/test/test_benchmark_inference.py
@@ -58,7 +58,8 @@ def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config):

# Test with semi-sparse config
mock_string_to_config.return_value = Int4WeightOnlyConfig(
layout=MarlinSparseLayout()
layout=MarlinSparseLayout(),
version=1,
)
config = BenchmarkConfig(
quantization="marlin",
4 changes: 2 additions & 2 deletions benchmarks/microbenchmarks/utils.py
@@ -206,7 +206,7 @@ def string_to_config(
128,
256,
], f"int4wo group_size needs to be one of [32,64,128,256] but got {group_size}"
return Int4WeightOnlyConfig(group_size=group_size, use_hqq=use_hqq)
return Int4WeightOnlyConfig(group_size=group_size, use_hqq=use_hqq, version=1)
elif "int8adq-int4w-symm" in quantization:
from torchao.dtypes import CutlassInt4PackedLayout

@@ -229,7 +229,7 @@
elif sparsity is not None and ("semi" in sparsity or "2:4" in sparsity):
from torchao.dtypes import MarlinSparseLayout

return Int4WeightOnlyConfig(layout=MarlinSparseLayout())
return Int4WeightOnlyConfig(layout=MarlinSparseLayout(), version=1)
if "fp6" in quantization:
return FPXWeightOnlyConfig(3, 2)
elif "uintx" in quantization:
2 changes: 1 addition & 1 deletion docs/source/quick_start.rst
@@ -57,7 +57,7 @@ for efficient mixed dtype matrix multiplication:

# torch 2.4+ only
from torchao.quantization import Int4WeightOnlyConfig, quantize_
quantize_(model, Int4WeightOnlyConfig(group_size=32))
quantize_(model, Int4WeightOnlyConfig(group_size=32, version=1))

The quantized model is now ready to use! Note that the quantization
logic is inserted through tensor subclasses, so there is no change
10 changes: 4 additions & 6 deletions docs/source/serialization.rst
@@ -7,7 +7,7 @@ Serialization and deserialization flow
======================================

Here is the serialization and deserialization flow::

import copy
import tempfile
import torch
@@ -36,7 +36,7 @@ Here is the serialization and deserialization flow::
print(f"original model size: {get_model_size_in_bytes(m) / 1024 / 1024} MB")

example_inputs = m.example_inputs(dtype=dtype, device="cuda")
quantize_(m, Int4WeightOnlyConfig())
quantize_(m, Int4WeightOnlyConfig(version=1))
print(f"quantized model size: {get_model_size_in_bytes(m) / 1024 / 1024} MB")

ref = m(*example_inputs)
@@ -62,7 +62,7 @@ What happens when serializing an optimized model?
To serialize an optimized model, we just need to call ``torch.save(m.state_dict(), f)``, because in torchao we use tensor subclasses to represent different dtypes and to support different optimization techniques like quantization and sparsity. So after optimization, the only thing that changes is that the weight Tensor is replaced by an optimized weight Tensor; the model structure is not changed at all. For example:

original floating point model ``state_dict``::

{"linear1.weight": float_weight1, "linear2.weight": float_weight2}

quantized model ``state_dict``::
@@ -75,7 +75,7 @@ The size of the quantized model is typically going to be smaller to the original
original model size: 4.0 MB
quantized model size: 1.0625 MB
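
As a minimal sketch (the toy two-layer model and the temporary file are assumptions, chosen so the sizes roughly line up with the numbers above), the size check and the save step look like this:

```python
# Sketch: quantize a toy model, compare sizes, and save the state_dict.
# The model definition and temp file are illustrative assumptions.
import tempfile

import torch
import torch.nn as nn

from torchao.quantization import Int4WeightOnlyConfig, quantize_
from torchao.utils import get_model_size_in_bytes

m = nn.Sequential(
    nn.Linear(1024, 1024, bias=False),
    nn.Linear(1024, 1024, bias=False),
).to(torch.bfloat16).cuda()
print(f"original model size: {get_model_size_in_bytes(m) / 1024 / 1024} MB")

quantize_(m, Int4WeightOnlyConfig(version=1))
print(f"quantized model size: {get_model_size_in_bytes(m) / 1024 / 1024} MB")

# Serialization is just torch.save on the state_dict; only the weight
# tensors changed type, the module structure is untouched.
with tempfile.NamedTemporaryFile() as f:
    torch.save(m.state_dict(), f)
    f.seek(0)
    state_dict = torch.load(f, weights_only=False)  # trusted local file
```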


What happens when deserializing an optimized model?
===================================================
To deserialize an optimized model, we can initialize the floating point model in `meta <https://pytorch.org/docs/stable/meta.html>`__ device and then load the optimized ``state_dict`` with ``assign=True`` using `model.load_state_dict <https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.load_state_dict>`__::
@@ -97,5 +97,3 @@ We can also verify that the weight is properly loaded by checking the type of we

type of weight before loading: (<class 'torch.Tensor'>, <class 'torch.Tensor'>)
type of weight after loading: (<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>, <class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>)
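
And a condensed sketch of that meta-device load, continuing from the ``state_dict`` saved above (the toy module mirrors the one used for saving):

```python
# Sketch: rebuild the same architecture on the meta device (no memory is
# allocated for weights), then load the quantized state_dict with assign=True
# so the tensor-subclass weights are assigned directly to the parameters.
import torch
import torch.nn as nn

with torch.device("meta"):
    m_loaded = nn.Sequential(
        nn.Linear(1024, 1024, bias=False),
        nn.Linear(1024, 1024, bias=False),
    ).to(torch.bfloat16)

print("before:", type(m_loaded[0].weight), type(m_loaded[1].weight))
m_loaded.load_state_dict(state_dict, assign=True)
print("after: ", type(m_loaded[0].weight), type(m_loaded[1].weight))
```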


5 changes: 3 additions & 2 deletions docs/source/torchao_vllm_integration.md
@@ -45,6 +45,7 @@ from torchao.quantization import Int4WeightOnlyConfig
config = Int4WeightOnlyConfig(
group_size=128,
use_hqq=True,
version=1,
)
assert isinstance(config, AOBaseConfig)
```
@@ -65,7 +66,7 @@ config = ModuleFqnToConfig({
"model.layers.0.self_attn.q_proj": Int4WeightOnlyConfig(group_size=64),
"model.layers.0.self_attn.k_proj": Int4WeightOnlyConfig(group_size=64),
"model.layers.0.mlp.gate_proj": Int8WeightOnlyConfig(),
"_default": Int4WeightOnlyConfig(group_size=128) # Default for other modules
"_default": Int4WeightOnlyConfig(group_size=128, version=1) # Default for other modules
})
```
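
As a usage sketch (hedged: the toy module and its FQNs are made up, and this assumes ``ModuleFqnToConfig`` is applied through ``quantize_`` like any other ``AOBaseConfig``):

```python
# Hypothetical sketch: per-module-FQN quantization on a toy block.
import torch
import torch.nn as nn

from torchao.quantization import (
    Int4WeightOnlyConfig,
    Int8WeightOnlyConfig,
    ModuleFqnToConfig,
    quantize_,
)

class TinyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.q_proj = nn.Linear(1024, 1024, bias=False)
        self.gate_proj = nn.Linear(1024, 4096, bias=False)
        self.down_proj = nn.Linear(4096, 1024, bias=False)

    def forward(self, x):
        return self.down_proj(torch.relu(self.gate_proj(self.q_proj(x))))

model = TinyBlock().to(torch.bfloat16).cuda()
config = ModuleFqnToConfig({
    "q_proj": Int4WeightOnlyConfig(group_size=64, version=1),
    "gate_proj": Int8WeightOnlyConfig(),
    "_default": Int4WeightOnlyConfig(group_size=128, version=1),  # everything else
})
quantize_(model, config)
```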

@@ -81,7 +82,7 @@ from torchao.quantization import Int4WeightOnlyConfig

# Create quantization configuration
quantization_config = TorchAoConfig(
quant_type=Int4WeightOnlyConfig(group_size=128, use_hqq=True)
quant_type=Int4WeightOnlyConfig(group_size=128, use_hqq=True, version=1)
)

# Load and automatically quantize the model
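
For completeness, a sketch of the load step (the checkpoint id, dtype, and device_map are illustrative assumptions, not prescribed by torchao):

```python
# Illustrative only: checkpoint id and loading kwargs are assumptions.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

from torchao.quantization import Int4WeightOnlyConfig

quantization_config = TorchAoConfig(
    quant_type=Int4WeightOnlyConfig(group_size=128, use_hqq=True, version=1)
)

model_id = "facebook/opt-125m"  # hypothetical checkpoint, for illustration
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
```
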
2 changes: 1 addition & 1 deletion scripts/quick_start.py
@@ -39,7 +39,7 @@ def forward(self, x):
# ========================

# torch 2.4+ only
quantize_(model, Int4WeightOnlyConfig(group_size=32))
quantize_(model, Int4WeightOnlyConfig(group_size=32, version=1))


# =============
17 changes: 9 additions & 8 deletions test/dtypes/test_affine_quantized.py
@@ -66,22 +66,23 @@ def get_quantization_functions(
if do_int4:
if check_cpu_version(device):
base_functions.append(
int4_weight_only(group_size=32, layout=Int4CPULayout())
int4_weight_only(group_size=32, layout=Int4CPULayout(), version=1)
)
elif check_xpu_version(device):
base_functions.append(
int4_weight_only(group_size=32, layout=Int4XPULayout())
int4_weight_only(group_size=32, layout=Int4XPULayout(), version=1)
)
if int4_zp_int:
base_functions.append(
int4_weight_only(
group_size=32,
layout=Int4XPULayout(),
zero_point_domain=ZeroPointDomain.INT,
version=1,
)
)
else:
base_functions.append(int4_weight_only(group_size=32))
base_functions.append(int4_weight_only(group_size=32, version=1))
if device == "cuda" and not is_ROCM():
base_functions.append(
int8_dynamic_activation_int4_weight(
@@ -118,7 +119,7 @@ def test_tensor_core_layout_transpose(self):
linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
t = linear.weight
shape = t.shape
apply_int4_weight_only_quant = int4_weight_only(group_size=32)
apply_int4_weight_only_quant = int4_weight_only(group_size=32, version=1)
quantize_(linear, apply_int4_weight_only_quant)
ql = linear
aqt = ql.weight
@@ -353,7 +354,7 @@ def test_slice_int4wo(self, device, dtype):
# out_feature not divisible by 8
# to test slice + padding for int4 weight only quantization
dummy = nn.Linear(256, 321, dtype=dtype, device=device)
quantize_(dummy, Int4WeightOnlyConfig())
quantize_(dummy, Int4WeightOnlyConfig(version=1))
# make sure these run without error
_ = dummy.weight.narrow(0, 0, 64)
_ = dummy.weight.narrow(1, 0, 128)
@@ -467,7 +468,7 @@ def test_slice_and_copy_int4wo(self, device, dtype):
l.weight = torch.nn.Parameter(
torch.zeros(1024, 1024, dtype=torch.bfloat16, device="cuda")
)
quantize_(l, Int4WeightOnlyConfig())
quantize_(l, Int4WeightOnlyConfig(version=1))
param = l.weight
param_data = param.data
param_data = param_data.narrow(0, 0, 512)
@@ -483,7 +484,7 @@ def test_slice_and_copy_int4wo(self, device, dtype):

# dummy_l has random input (shouldn't be 0)
dummy_l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
quantize_(dummy_l, Int4WeightOnlyConfig())
quantize_(dummy_l, Int4WeightOnlyConfig(version=1))
quantized = dummy_l.weight
quantized = quantized.narrow(0, 0, 512)

@@ -502,7 +503,7 @@ def test_mm_int4wo(self, device, dtype):

l = torch.nn.Linear(512, 1024).to(device).to(dtype)
l.weight = torch.nn.Parameter(weight)
quantize_(l, Int4WeightOnlyConfig())
quantize_(l, Int4WeightOnlyConfig(version=1))
# weight shape: 1024 x 512
weight = l.weight

2 changes: 1 addition & 1 deletion test/hqq/test_hqq_affine.py
@@ -55,7 +55,7 @@ def _eval_hqq(dtype):
)
dummy_linear.weight.data = W
if dtype == torch.uint4:
config = int4_weight_only(group_size=max(block_size), use_hqq=True)
config = int4_weight_only(group_size=max(block_size), use_hqq=True, version=1)
else:
config = uintx_weight_only(dtype, group_size=max(block_size), use_hqq=True)
quantize_(dummy_linear, config)
14 changes: 10 additions & 4 deletions test/integration/test_integration.py
@@ -135,17 +135,23 @@ def _int4wo_api(mod, use_hqq=False):
quantize_(
mod,
int4_weight_only(
layout=Int4CPULayout(), use_hqq=use_hqq, set_inductor_config=False
layout=Int4CPULayout(),
use_hqq=use_hqq,
set_inductor_config=False,
version=1,
),
)
unwrap_tensor_subclass(mod)
elif check_xpu_version(next(mod.parameters()).device):
quantize_(
mod, int4_weight_only(layout=Int4XPULayout()), set_inductor_config=False
mod,
int4_weight_only(
layout=Int4XPULayout(), set_inductor_config=False, version=1
),
)
unwrap_tensor_subclass(mod)
else:
quantize_(mod, int4_weight_only(set_inductor_config=False))
quantize_(mod, int4_weight_only(set_inductor_config=False, version=1))


def _int8da_int4w_api(mod):
@@ -1077,7 +1083,7 @@ def test_int4_weight_only_quant_subclass_api_grouped(self, device, dtype):
):
for groupsize in [64, 32]:
for layout in layout_list:
kwargs = {"groupsize": groupsize, "layout": layout}
kwargs = {"groupsize": groupsize, "layout": layout, "version": 1}

def api(mod):
kwargs_copy = kwargs.copy()
4 changes: 2 additions & 2 deletions test/prototype/test_parq.py
@@ -211,7 +211,7 @@ def test_int4_weight_only(self, group_size: int = 32):
model.reset_parameters()

m_ref = copy.deepcopy(model).eval().to(_DEVICE)
config = int4_weight_only(group_size=group_size)
config = int4_weight_only(group_size=group_size, version=1)
if check_cpu_version(_DEVICE):
config.layout = Int4CPULayout()
quantize_(m_ref, config)
@@ -244,7 +244,7 @@ def test_int4_weight_only_e2e(self, group_size: int = 32):
model.reset_parameters()

m_ref = copy.deepcopy(model).eval().to(_DEVICE)
config = int4_weight_only(group_size=group_size)
config = int4_weight_only(group_size=group_size, version=1)
if check_cpu_version(_DEVICE):
config.layout = Int4CPULayout()
quantize_(m_ref, config)
8 changes: 7 additions & 1 deletion test/quantization/test_gptq.py
@@ -1,3 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

import unittest
from pathlib import Path

@@ -173,7 +179,7 @@ def test_gptq_with_input_recorder(self):

model2 = copy.deepcopy(model)
out = model(*test_input)
quantize_(model2, Int4WeightOnlyConfig())
quantize_(model2, Int4WeightOnlyConfig(version=1))

outq = model2(*test_input)
del model2
11 changes: 9 additions & 2 deletions test/quantization/test_moe_quant.py
@@ -1,3 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

import unittest

import pytest
@@ -114,7 +120,8 @@ def test_int4wo_fake_dim(self, name, num_tokens, fullgraph):
self.skipTest("Need CUDA available")

config = MoEQuantConfig(
Int4WeightOnlyConfig(), use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE
Int4WeightOnlyConfig(version=1),
use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE,
)
tensor_impl_class = TensorCoreTiledAQTTensorImpl

@@ -137,7 +144,7 @@ def test_int4wo_base(self, name, num_tokens, fullgraph):
if not is_sm_at_least_90():
self.skipTest("Requires CUDA capability >= 9.0")

config = MoEQuantConfig(Int4WeightOnlyConfig())
config = MoEQuantConfig(Int4WeightOnlyConfig(version=1))
tensor_impl_class = TensorCoreTiledAQTTensorImpl

self._test_impl_moe_quant(