-
Notifications
You must be signed in to change notification settings - Fork 283
[ONNX] Compress quantize weights transformation #3662
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9a4fa4f
cc09695
44a55be
a6240fa
9779e3b
5796b59
7e3551d
c884c23
0f2bce8
61a3fb4
0adf45c
1ffcd90
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -10,6 +10,7 @@ | |||
| # limitations under the License. | ||||
|
|
||||
| import sys | ||||
| from copy import deepcopy | ||||
| from pathlib import Path | ||||
| from typing import Any, Callable, Iterable, Optional, TypeVar, Union | ||||
|
|
||||
|
|
@@ -30,7 +31,10 @@ | |||
| from nncf.onnx.graph.model_metadata import set_metadata | ||||
| from nncf.onnx.graph.nncf_graph_builder import GraphConverter | ||||
| from nncf.onnx.graph.passes import apply_preprocess_passes | ||||
| from nncf.onnx.graph.passes import compress_quantize_weights_transformation | ||||
| from nncf.onnx.quantization.backend_parameters import BackendParameters | ||||
| from nncf.onnx.quantization.backend_parameters import get_external_data_dir | ||||
| from nncf.onnx.quantization.backend_parameters import is_weight_compression_needed | ||||
| from nncf.parameters import BackupMode | ||||
| from nncf.parameters import CompressionFormat | ||||
| from nncf.parameters import CompressWeightsMode | ||||
|
|
@@ -177,6 +181,9 @@ def quantize_impl( | |||
| remove_metadata(quantized_model, MetadataKey.EXTERNAL_DATA_DIR) | ||||
| load_external_data_for_model(quantized_model, external_data_dir) | ||||
|
|
||||
| if is_weight_compression_needed(advanced_parameters): | ||||
| compress_quantize_weights_transformation(quantized_model) | ||||
|
|
||||
| return quantized_model | ||||
|
|
||||
|
|
||||
|
|
@@ -202,8 +209,13 @@ def quantize_with_accuracy_control_impl( | |||
| if advanced_accuracy_restorer_parameters is None: | ||||
| advanced_accuracy_restorer_parameters = AdvancedAccuracyRestorerParameters() | ||||
|
|
||||
| compress_weights = is_weight_compression_needed(advanced_quantization_parameters) | ||||
|
|
||||
| if advanced_quantization_parameters is None: | ||||
| advanced_quantization_parameters = AdvancedQuantizationParameters() | ||||
| copied_parameters = AdvancedQuantizationParameters() | ||||
| else: | ||||
| copied_parameters = deepcopy(advanced_quantization_parameters) | ||||
| copied_parameters.backend_params[BackendParameters.COMPRESS_WEIGHTS] = False | ||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why should we update this parameter here?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We need to disable
|
||||
|
|
||||
| quantized_model = quantize_impl( | ||||
| model=model, | ||||
|
|
@@ -214,7 +226,7 @@ def quantize_with_accuracy_control_impl( | |||
| fast_bias_correction=fast_bias_correction, | ||||
| model_type=model_type, | ||||
| ignored_scope=ignored_scope, | ||||
| advanced_parameters=advanced_quantization_parameters, | ||||
| advanced_parameters=copied_parameters, | ||||
| ) | ||||
|
|
||||
| if advanced_accuracy_restorer_parameters.intermediate_model_dir: | ||||
|
|
@@ -254,7 +266,7 @@ def quantize_with_accuracy_control_impl( | |||
| fast_bias_correction, | ||||
| model_type, | ||||
| ignored_scope, | ||||
| advanced_quantization_parameters, | ||||
| copied_parameters, | ||||
| ) | ||||
| tuned_quantized_metric_results = evaluator.collect_metric_results( | ||||
| tuned_quantized_model, validation_dataset, model_name="tuned" | ||||
|
|
@@ -292,6 +304,9 @@ def quantize_with_accuracy_control_impl( | |||
| evaluator, | ||||
| ) | ||||
|
|
||||
| if compress_weights: | ||||
| compress_quantize_weights_transformation(quantized_model) | ||||
|
|
||||
| return quantized_model | ||||
|
|
||||
|
|
||||
|
|
||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,8 +8,14 @@ | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
| import numpy as np | ||
| import onnx | ||
|
|
||
| import nncf | ||
| from nncf.onnx.graph.passes import apply_preprocess_passes | ||
| from nncf.onnx.graph.passes import compress_quantize_weights_transformation | ||
| from nncf.onnx.quantization.backend_parameters import BackendParameters | ||
| from tests.onnx.common import ModelBuilder | ||
| from tests.onnx.models import build_matmul_model_with_nop_cast | ||
|
|
||
|
|
||
|
|
@@ -21,3 +27,43 @@ def test_apply_preprocess_passes(): | |
|
|
||
| assert set(after_nodes) - set(before_nodes) == set() | ||
| assert set(before_nodes) - set(after_nodes) == set(["cast"]) | ||
|
|
||
|
|
||
| def _build_model(): | ||
| w = np.array([[0.1, 0.3, 0.2, -0.1], [-0.9, 0.1, 0.5, -0.3], [0.0, -0.1, -0.4, -0.9]], dtype=np.float32) | ||
|
|
||
| b = np.array([0.1, 0.1, 0.1, 0.1], dtype=np.float32) | ||
|
|
||
| mb = ModelBuilder() | ||
| x = mb.add_input("X", (2, 3)) | ||
| x = mb.add_gemm(x, w.shape, weight_data=w, bias_data=b) | ||
| mb.add_output(x, (2, 4)) | ||
| return mb.build(opset_version=19, ir_version=9) | ||
|
|
||
|
|
||
| def check_operation_count(model: onnx.ModelProto, op_type_to_count: dict[str, int]): | ||
| count = {} | ||
| for node in model.graph.node: | ||
| if node.op_type in op_type_to_count: | ||
| count[node.op_type] = count.get(node.op_type, 0) + 1 | ||
| assert count == op_type_to_count | ||
|
|
||
|
|
||
| def test_compress_quantize_weights_transformation(): | ||
| model = _build_model() | ||
|
|
||
| x = np.array([[0.2, -0.1, 0.9], [-0.1, -0.9, 0.5]], dtype=np.float32) | ||
|
|
||
| # Prepare quantized model without weight compression | ||
| calibration_dataset = nncf.Dataset([{"X": x}]) | ||
| quantized_model = nncf.quantize( | ||
| model, | ||
| calibration_dataset, | ||
| advanced_parameters=nncf.AdvancedQuantizationParameters( | ||
| backend_params={BackendParameters.COMPRESS_WEIGHTS: False} | ||
| ), | ||
| ) | ||
|
|
||
| check_operation_count(quantized_model, {"QuantizeLinear": 2, "DequantizeLinear": 2}) | ||
| compress_quantize_weights_transformation(quantized_model) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This test is covering the transformation, but I would suggest to additionally test the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think it is necessary here. We already have an end-to-end test ( |
||
| check_operation_count(quantized_model, {"QuantizeLinear": 1, "DequantizeLinear": 2}) | ||
Uh oh!
There was an error while loading. Please reload this page.