
Add symmetric quantization with no clipping error in the tensor subclass based API #845

Merged: 1 commit merged on Sep 9, 2024
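Context for the change, since it isn't spelled out in the diff: with `MappingType.SYMMETRIC`, a single scale is derived from the largest weight magnitude over half the quantized range, so for the asymmetric int4 range [-8, 7] the largest positive value can round past `quant_max` and get clamped, i.e. clipped. `MappingType.SYMMETRIC_NO_CLIPPING_ERR` instead picks the scale so that both endpoints stay representable. A minimal sketch of the two choices, assuming they mirror torchao's qparam selection (the standalone helper names here are hypothetical; the real logic lives in torchao's quantization primitives):

```python
def scale_symmetric(min_val: float, max_val: float,
                    quant_min: int = -8, quant_max: int = 7) -> float:
    # One scale from the largest magnitude over half the full quantized range.
    # For int4 this is max_abs / 7.5: the endpoint that set the scale maps to
    # +/-7.5, and +7.5 can round to 8 and be clamped to quant_max = 7
    # (clipping error on the positive side).
    max_abs = max(abs(min_val), abs(max_val))
    return max_abs / ((quant_max - quant_min) / 2)


def scale_symmetric_no_clipping_err(min_val: float, max_val: float,
                                    quant_min: int = -8, quant_max: int = 7) -> float:
    # Pick the smallest scale such that min_val / scale >= quant_min and
    # max_val / scale <= quant_max both hold, so rounding cannot leave
    # the representable range.
    smin = min_val / quant_min  # quant_min < 0, so smin >= 0 when min_val <= 0
    smax = max_val / quant_max
    return max(smin, smax)
```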
10 changes: 7 additions & 3 deletions test/quantization/test_quant_api.py
```diff
@@ -56,6 +56,7 @@
 import tempfile
 import gc
 from torch.testing._internal.common_utils import TestCase
+from torch.testing._internal import common_utils
 
 
 def dynamic_quant(model, example_inputs):
@@ -500,12 +501,13 @@ def test_eval_wrapper_llama3(self):
     # TODO: move to a separate test file
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
-    def test_quantized_tensor_subclass_8da4w(self):
+    @common_utils.parametrize("mapping_type", [MappingType.SYMMETRIC, MappingType.SYMMETRIC_NO_CLIPPING_ERR])
+    def test_quantized_tensor_subclass_8da4w(self, mapping_type):
         group_size = 32
         m = ToyLinearModel().eval()
         m_copy = copy.deepcopy(m)
         example_inputs = m.example_inputs()
-        quantize_(m, int8_dynamic_activation_int4_weight(group_size=group_size))
+        quantize_(m, int8_dynamic_activation_int4_weight(group_size=group_size, mapping_type=mapping_type))
 
         assert isinstance(m.linear1.weight, LinearActivationQuantizedTensor)
         assert isinstance(m.linear2.weight, LinearActivationQuantizedTensor)
@@ -516,7 +518,7 @@ def test_quantized_tensor_subclass_8da4w(self):
         from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer
         from torchao.quantization.GPTQ import Int8DynActInt4WeightLinear
 
-        quantizer = Int8DynActInt4WeightQuantizer(groupsize=group_size)
+        quantizer = Int8DynActInt4WeightQuantizer(groupsize=group_size, mapping_type=mapping_type)
         m_copy = quantizer.quantize(m_copy)
         assert isinstance(m_copy.linear1, Int8DynActInt4WeightLinear)
         assert isinstance(m_copy.linear2, Int8DynActInt4WeightLinear)
@@ -704,6 +706,8 @@ def reset_memory():
         assert param.is_cuda
         self.assertLess(memory_streaming, memory_baseline)
 
+common_utils.instantiate_parametrized_tests(TestQuantFlow)
+
 
 if __name__ == "__main__":
     unittest.main()
```
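With the test changes above, both quantization paths accept the new mapping type. A usage sketch against the updated API (the toy model and shapes are made up for illustration, and the import paths are assumed from the files touched here):

```python
import torch
from torchao.quantization.quant_api import (
    Int8DynActInt4WeightQuantizer,
    int8_dynamic_activation_int4_weight,
    quantize_,
)
from torchao.quantization.quant_primitives import MappingType

# A stand-in model; any module with Linear layers whose last weight dim is
# divisible by group_size works the same way.
model = torch.nn.Sequential(torch.nn.Linear(64, 128), torch.nn.Linear(128, 64)).eval()

# Tensor-subclass flow: thread the mapping type through quantize_.
quantize_(model, int8_dynamic_activation_int4_weight(
    group_size=32, mapping_type=MappingType.SYMMETRIC_NO_CLIPPING_ERR))

# Module-swap flow, mirroring the updated test above.
model2 = torch.nn.Sequential(torch.nn.Linear(64, 128), torch.nn.Linear(128, 64)).eval()
quantizer = Int8DynActInt4WeightQuantizer(
    groupsize=32, mapping_type=MappingType.SYMMETRIC_NO_CLIPPING_ERR)
model2 = quantizer.quantize(model2)
```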
7 changes: 3 additions & 4 deletions torchao/quantization/quant_api.py
```diff
@@ -473,14 +473,13 @@ def _int8_asymm_per_token_quant(x: torch.Tensor) -> torch.Tensor:
     target_dtype = torch.int8
     return to_affine_quantized_intx(x, mapping_type, _get_per_token_block_size(x), target_dtype)
 
-def apply_int8_dynamic_activation_int4_weight_quant(weight, group_size=32):
+def apply_int8_dynamic_activation_int4_weight_quant(weight, group_size=32, mapping_type=MappingType.SYMMETRIC):
```
Contributor (inline review comment on the new signature): might be better to clarify this is weight_mapping_type I feel
"""This is defined here instead of local function to support serialization
"""
if weight.shape[-1] % group_size != 0:
return weight

# weight settings
mapping_type = MappingType.SYMMETRIC
block_size = (1, group_size)
target_dtype = torch.int8
eps = torch.finfo(torch.float32).eps
```diff
@@ -494,7 +493,7 @@ def apply_int8_dynamic_activation_int4_weight_quant(weight, group_size=32):
     weight = to_linear_activation_quantized(weight, input_quant_func)
     return weight
 
-def int8_dynamic_activation_int4_weight(group_size=32):
+def int8_dynamic_activation_int4_weight(group_size=32, mapping_type=MappingType.SYMMETRIC):
     """Applies int8 dynamic per token asymmetric activation quantization and int4 per group weight symmetric quantization to linear
     This is used to produce a model for executorch backend, but currently executorch did not
     support lowering for the quantized model from this flow yet
@@ -503,7 +502,7 @@ def int8_dynamic_activation_int4_weight(group_size=32):
         `group_size`: parameter for quantization, controls the granularity of quantization, smaller
         size is more fine grained
     """
-    return _get_linear_subclass_inserter(apply_int8_dynamic_activation_int4_weight_quant, group_size=group_size)
+    return _get_linear_subclass_inserter(apply_int8_dynamic_activation_int4_weight_quant, group_size=group_size, mapping_type=mapping_type)
 
 
 def int4_weight_only(group_size=128, layout_type=TensorCoreTiledLayoutType(inner_k_tiles=8), use_hqq=False):
```
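For intuition about what the new mapping type buys, a worked example under an assumed weight range of [-3.0, 5.0] quantized to the int4 range [-8, 7]; the scale formulas follow the sketch near the top of this page:

```python
min_val, max_val = -3.0, 5.0   # assumed weight range, for illustration only
qmin, qmax = -8, 7             # int4

# SYMMETRIC: scale from the largest magnitude over half the full range.
scale = max(abs(min_val), abs(max_val)) / ((qmax - qmin) / 2)  # 5.0 / 7.5 ~ 0.667
q = max(qmin, min(qmax, round(max_val / scale)))  # 5.0/scale = 7.5 -> rounds to 8 -> clamped to 7
print(q * scale)  # ~4.67: the largest weight is clipped by ~0.33

# SYMMETRIC_NO_CLIPPING_ERR: scale keeps both endpoints in range.
scale = max(min_val / qmin, max_val / qmax)       # max(0.375, 5/7) ~ 0.714
q = max(qmin, min(qmax, round(max_val / scale)))  # exactly 7, no clamping needed
print(q * scale)  # 5.0: no clipping error; -3.0 maps to -4, also in range
```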