
Commit 931c24c

Pull request pytorch#64: Feature/EIEX-90 quantization and conversion of aten native batch norm legit no training

Merge in AITEC/executorch from feature/EIEX-90-quantization-and-conversion-of-aten-_native_batch_norm_legit_no_training to main-nxp

* commit '246b61f9e3b8fa50210460cf9f419c7eb670fa8b':
  Add pre-processing pass to fuse BatchNorm into preceding Linear nodes.
  Add pre-processing pass to fuse BatchNorm into preceding Conv nodes.
  Add infrastructure for pre-processing passes of aten programs.

2 parents fc0e620 + 246b61f commit 931c24c

File tree

7 files changed: +520 -4 lines changed
backends/nxp/pytorch_passes/fuse_batch_norm_with_conv_pass.py

Lines changed: 125 additions & 0 deletions

@@ -0,0 +1,125 @@
# Copyright 2025 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch
from torch.export.unflatten import _AttrKind, _assign_attr
from torch.fx import Node
from torch.nn.utils import fuse_conv_bn_weights

from executorch.backends.nxp.pytorch_passes.nxp_pytorch_pass import NXPPyTorchPass


class FuseBatchNormWithConvPass(NXPPyTorchPass):
    """ The executorch batch normalization carries out the following computation [1].

            (x - mean) / sqrt(var + eps) * W + B

        Which can be expressed as

            x * (W / sqrt(var + eps)) + (B - mean * (W / sqrt(var + eps)))

        So the batch norm can be done as 1 multiplication and 1 addition, provided that the parameters are static,
        and the terms can be precomputed. If there is a `Conv` operator before the batch normalization, this scale and
        bias can be statically integrated into the weights and bias of the `Conv`, which allows the batch norm to be
        completely removed.


                              │
                ┌─────────────▼─────────────┐
                │ aten.conv1d | aten.conv2d │
                └─────────────┬─────────────┘
                              │                                                        │
        ┌─────────────────────▼─────────────────────┐   replace with   ┌─────────────▼─────────────┐
        │ aten._native_batch_norm_legit_no_training │ ──────────────►  │ aten.conv1d | aten.conv2d │
        └─────────────────────┬─────────────────────┘                  └─────────────┬─────────────┘
                              │                                                      ▼
                        ┌─────▼──────┐
                        │ getitem(0) │
                        └─────┬──────┘
                              │

        [1] https://github.com/pytorch/executorch/blob/v0.5.0-rc2/kernels/portable/cpu/op_native_batch_norm.cpp#L118-L128
    """

    def run(self) -> bool:
        def _is_batch_norm(node_: Node) -> bool:
            return node_.op == "call_function" and node_.target == torch.ops.aten._native_batch_norm_legit_no_training.default

        def _is_conv(node_: Node) -> bool:
            return node_.op == "call_function" and node_.target in (
                torch.ops.aten.conv1d.default,
                torch.ops.aten.conv2d.default
            )

        def _is_getitem(node_: Node) -> bool:
            return node_.op == "call_function" and node_.target.__name__ == "getitem"

        made_changes = False

        if not any(map(_is_batch_norm, self.module.graph.nodes)):
            return made_changes  # No batch norm nodes in the model.

        for node in self.module.graph.nodes:
            if not _is_batch_norm(node):
                continue  # Not BatchNorm.

            bn_node = node
            if not all(_is_getitem(user) and user.args[1] == 0 for user in bn_node.users):
                # Nodes other than just `getitem(0)` follow after the BatchNorm. Probably `getitem` nodes accessing
                # other outputs of the BN. After the fusion with a Conv op, only the first output can be accessed.
                continue

            if not _is_conv(bn_node.args[0]):
                continue  # Something other than a Conv node comes before the BatchNorm.

            # conv args: input, weight, bias, stride, padding, dilation, ...
            conv_node = bn_node.args[0]
            conv_weight_node = conv_node.args[1]
            conv_bias_node = conv_node.args[2] if len(conv_node.args) > 2 else None

            conv_w = self.get_tensor_constant_from_node(conv_weight_node)
            conv_b = self.get_tensor_constant_from_node(conv_bias_node)

            # batch norm legit no training args: input, weight, bias, running mean, running var, momentum, eps
            bn_w = self.get_tensor_constant_from_node(bn_node.args[1])
            bn_b = self.get_tensor_constant_from_node(bn_node.args[2])
            bn_rm = self.get_tensor_constant_from_node(bn_node.args[3])
            bn_rv = self.get_tensor_constant_from_node(bn_node.args[4])
            bn_eps = bn_node.args[6]

            if any(t is None for t in (conv_w, bn_rm, bn_rv)):  # The other inputs can be None.
                continue  # The data is not static. Leave this BatchNorm as is (probably a rare case).

            fused_weight, fused_bias = fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b)

            # Update the weight and bias for Conv.
            conv_args = list(conv_node.args)
            if len(conv_args) == 2:
                # Fill in the default bias argument.
                conv_args.append(None)

            weight_attr_name = conv_weight_node.target
            _assign_attr(fused_weight, self.module, weight_attr_name, _AttrKind.PARAMETER)

            if conv_bias_node is not None:
                bias_attr_name = conv_bias_node.target
                _assign_attr(fused_bias, self.module, str(bias_attr_name), _AttrKind.PARAMETER)
            else:
                # The Conv doesn't have a bias. Create a new one.
                bias_attr_name = weight_attr_name + "_bias"
                _assign_attr(fused_bias, self.module, bias_attr_name, _AttrKind.PARAMETER)
                with self.module.graph.inserting_before(conv_node):
                    get_bias_node = self.module.graph.get_attr(bias_attr_name)

                conv_args[2] = get_bias_node

            conv_node.args = tuple(conv_args)

            # Replace the uses of the BatchNorm with the Conv.
            for user in bn_node.users:
                user.replace_all_uses_with(conv_node)

            made_changes = True

        return made_changes
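
Editor's note: a quick numeric sanity check of the fusion identity above. This sketch is not part of the commit; it uses only the same public `fuse_conv_bn_weights` helper the pass imports, and all tensor shapes are made up for illustration.

import torch
import torch.nn.functional as F
from torch.nn.utils import fuse_conv_bn_weights

torch.manual_seed(0)
x = torch.randn(1, 3, 8, 8)
conv_w, conv_b = torch.randn(4, 3, 3, 3), torch.randn(4)
bn_w, bn_b = torch.randn(4), torch.randn(4)
bn_rm, bn_rv = torch.randn(4), torch.rand(4) + 0.1  # Running variance must be positive.
eps = 1e-5

# Reference: Conv2d followed by inference-mode batch norm.
reference = F.batch_norm(F.conv2d(x, conv_w, conv_b), bn_rm, bn_rv, bn_w, bn_b, training=False, eps=eps)

# Fused: a single Conv2d with the scale and bias statically folded into its weights.
fused_w, fused_b = fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, eps, bn_w, bn_b)
fused = F.conv2d(x, fused_w, fused_b)

assert torch.allclose(reference, fused, atol=1e-5)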
backends/nxp/pytorch_passes/fuse_batch_norm_with_linear_pass.py

Lines changed: 121 additions & 0 deletions

@@ -0,0 +1,121 @@
# Copyright 2025 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch
from torch.export.unflatten import _AttrKind, _assign_attr
from torch.fx import Node
from torch.nn.utils import fuse_linear_bn_weights

from executorch.backends.nxp.pytorch_passes.nxp_pytorch_pass import NXPPyTorchPass


class FuseBatchNormWithLinearPass(NXPPyTorchPass):
    """ The executorch batch normalization carries out the following computation [1].

            (x - mean) / sqrt(var + eps) * W + B

        Which can be expressed as

            x * (W / sqrt(var + eps)) + (B - mean * (W / sqrt(var + eps)))

        So the batch norm can be done as 1 multiplication and 1 addition, provided that the parameters are static,
        and the terms can be precomputed. If there is a `Linear` operator before the batch normalization, this scale
        and bias can be statically integrated into the weights and bias of the `Linear`, which allows the batch norm
        to be completely removed.


                              │
                       ┌──────▼──────┐
                       │ aten.linear │
                       └──────┬──────┘
                              │                                             │
        ┌─────────────────────▼─────────────────────┐   replace with   ┌──────▼──────┐
        │ aten._native_batch_norm_legit_no_training │ ──────────────►  │ aten.linear │
        └─────────────────────┬─────────────────────┘                  └──────┬──────┘
                              │                                               ▼
                        ┌─────▼──────┐
                        │ getitem(0) │
                        └─────┬──────┘
                              │

        [1] https://github.com/pytorch/executorch/blob/v0.5.0-rc2/kernels/portable/cpu/op_native_batch_norm.cpp#L118-L128
    """

    def run(self) -> bool:
        def _is_batch_norm(node_: Node) -> bool:
            return node_.op == "call_function" and node_.target == torch.ops.aten._native_batch_norm_legit_no_training.default

        def _is_linear(node_: Node) -> bool:
            return node_.op == "call_function" and node_.target == torch.ops.aten.linear.default

        def _is_getitem(node_: Node) -> bool:
            return node_.op == "call_function" and node_.target.__name__ == "getitem"

        made_changes = False

        if not any(map(_is_batch_norm, self.module.graph.nodes)):
            return made_changes  # No batch norm nodes in the model.

        for node in self.module.graph.nodes:
            if not _is_batch_norm(node):
                continue  # Not BatchNorm.

            bn_node = node
            if not all(_is_getitem(user) and user.args[1] == 0 for user in bn_node.users):
                # Nodes other than just `getitem(0)` follow after the BatchNorm. Probably `getitem` nodes accessing
                # other outputs of the BN. After the fusion with a Linear op, only the first output can be accessed.
                continue

            if not _is_linear(bn_node.args[0]):
                continue  # Something other than a Linear node comes before the BatchNorm.

            linear_node = bn_node.args[0]
            linear_weight_node = linear_node.args[1]
            linear_bias_node = linear_node.args[2] if len(linear_node.args) > 2 else None

            linear_w = self.get_tensor_constant_from_node(linear_weight_node)
            linear_b = self.get_tensor_constant_from_node(linear_bias_node)

            # batch norm legit no training args: input, weight, bias, running mean, running var, momentum, eps
            bn_w = self.get_tensor_constant_from_node(bn_node.args[1])
            bn_b = self.get_tensor_constant_from_node(bn_node.args[2])
            bn_rm = self.get_tensor_constant_from_node(bn_node.args[3])
            bn_rv = self.get_tensor_constant_from_node(bn_node.args[4])
            bn_eps = bn_node.args[6]

            if any(t is None for t in (linear_w, bn_w, bn_b, bn_rm, bn_rv)):  # The Linear bias can be None.
                continue  # The data is not static. Leave this BatchNorm as is (probably a rare case).

            fused_weight, fused_bias = fuse_linear_bn_weights(linear_w, linear_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b)

            # Update the weight and bias for Linear.
            linear_args = list(linear_node.args)
            if len(linear_args) == 2:
                # Fill in the default bias argument.
                linear_args.append(None)

            weight_attr_name = linear_weight_node.target
            _assign_attr(fused_weight, self.module, weight_attr_name, _AttrKind.PARAMETER)

            if linear_bias_node is not None:
                bias_attr_name = linear_bias_node.target
                _assign_attr(fused_bias, self.module, str(bias_attr_name), _AttrKind.PARAMETER)
            else:
                # The Linear doesn't have a bias. Create a new one.
                bias_attr_name = weight_attr_name + "_bias"
                _assign_attr(fused_bias, self.module, bias_attr_name, _AttrKind.PARAMETER)
                with self.module.graph.inserting_before(linear_node):
                    get_bias_node = self.module.graph.get_attr(bias_attr_name)

                linear_args[2] = get_bias_node

            linear_node.args = tuple(linear_args)

            # Replace the uses of the BatchNorm with the Linear.
            for user in bn_node.users:
                user.replace_all_uses_with(linear_node)

            made_changes = True

        return made_changes
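
Editor's note: the same identity holds for `Linear`. A minimal check of `fuse_linear_bn_weights` (again a sketch, not part of the commit, with illustrative shapes):

import torch
import torch.nn.functional as F
from torch.nn.utils import fuse_linear_bn_weights

torch.manual_seed(0)
x = torch.randn(2, 6)
lin_w, lin_b = torch.randn(4, 6), torch.randn(4)
bn_w, bn_b = torch.randn(4), torch.randn(4)
bn_rm, bn_rv = torch.randn(4), torch.rand(4) + 0.1  # Running variance must be positive.
eps = 1e-5

# Reference: Linear followed by inference-mode batch norm over the feature dimension.
reference = F.batch_norm(F.linear(x, lin_w, lin_b), bn_rm, bn_rv, bn_w, bn_b, training=False, eps=eps)

# Fused: a single Linear with the folded weights and bias.
fused_w, fused_b = fuse_linear_bn_weights(lin_w, lin_b, bn_rm, bn_rv, eps, bn_w, bn_b)
assert torch.allclose(reference, F.linear(x, fused_w, fused_b), atol=1e-5)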
backends/nxp/pytorch_passes/nxp_pytorch_pass.py

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
# Copyright 2025 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from abc import abstractmethod, ABC

from torch.fx import GraphModule
from torch.nn.parameter import Parameter


class NXPPyTorchPass(ABC):
    """ Abstract parent class for pre-processing passes on the aten dialect level. """

    def __init__(self, module: GraphModule) -> None:
        super().__init__()
        self.module = module

    @abstractmethod
    def run(self) -> bool:
        """ Execute the pass and return a bool indicating if any changes have been made. """
        pass

    def get_tensor_constant_from_node(self, node) -> Parameter | None:
        """ Get the static data from a given node. If it doesn't have any data, return `None`. """
        if node is None or node.op != 'get_attr':
            return None

        target_atoms = node.target.split('.')
        attr_itr = self.module
        for atom in target_atoms:
            if not hasattr(attr_itr, atom):
                return None
            attr_itr = getattr(attr_itr, atom)
        return attr_itr
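
Editor's note: `get_tensor_constant_from_node` resolves the dotted path stored in a `get_attr` node's `target` by walking the module hierarchy one attribute at a time. A small illustration (the `Block` module and its attribute names are made up for this sketch):

import torch
import torch.fx

class Block(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.inner = torch.nn.Module()
        self.inner.weight = torch.nn.Parameter(torch.randn(4, 3))

    def forward(self, x):
        return x @ self.inner.weight.t()

module = torch.fx.symbolic_trace(Block())

# Accessing `self.inner.weight` in `forward` produces a `get_attr` node whose
# `target` is the dotted path "inner.weight"; the pass walks it atom by atom.
for node in module.graph.nodes:
    if node.op == "get_attr":
        attr = module
        for atom in node.target.split("."):
            attr = getattr(attr, atom)
        print(node.target, tuple(attr.shape))  # inner.weight (4, 3)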
backends/nxp/pytorch_passes/nxp_pytorch_pass_manager.py

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
# Copyright 2025 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import itertools
import logging
from typing import Iterable

from torch.fx import GraphModule

from executorch.backends.nxp.pytorch_passes.fuse_batch_norm_with_conv_pass import FuseBatchNormWithConvPass
from executorch.backends.nxp.pytorch_passes.fuse_batch_norm_with_linear_pass import FuseBatchNormWithLinearPass
from executorch.backends.nxp.pytorch_passes.nxp_pytorch_pass import NXPPyTorchPass


class NXPPyTorchPassManager:
    """ Class that iteratively calls the provided passes, which inherit from the `NXPPyTorchPass` class. """

    def __init__(self, module: GraphModule, passes: Iterable[type[NXPPyTorchPass]] | None = None):
        self.module = module
        self.passes = passes or [  # New passes should be added here.
            FuseBatchNormWithConvPass,
            FuseBatchNormWithLinearPass
        ]

    def _clean_up_graph_module(self):
        self.module.graph.eliminate_dead_code()
        self.module.recompile()

    def run(self) -> GraphModule:
        """ Iteratively apply all available passes for as long as they are changing the graph. """
        graph_module = self.module
        num_passes = len(self.passes)
        hard_limit = 10 * num_passes  # Empirical value.
        num_passes_since_last_change = 0

        self._clean_up_graph_module()

        # Cycle through all passes for as long as they are making changes.
        for i, pass_class in enumerate(itertools.cycle(self.passes)):
            try:
                pass_ = pass_class(graph_module)
                made_changes = pass_.run()
                self._clean_up_graph_module()

                if made_changes:
                    num_passes_since_last_change = 0
                else:
                    num_passes_since_last_change += 1

            except Exception as e:
                logging.warning(f'An exception occurred during the pre-processing pass `{pass_class}`. '
                                'Please report this issue.\n' + str(e))
                # Treat a failed pass as "no change", so a persistently failing pass cannot loop forever.
                num_passes_since_last_change += 1

            if num_passes_since_last_change >= num_passes or i >= hard_limit:
                break

        return graph_module
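
Editor's note: a hedged end-to-end sketch of driving the manager, mirroring the test pipeline change below. The toy `ConvBN` model is invented for illustration; the sketch assumes, as the commit's target torch version does, that eval-mode BatchNorm captures to `aten._native_batch_norm_legit_no_training`.

import torch
from executorch.backends.nxp.pytorch_passes.nxp_pytorch_pass_manager import NXPPyTorchPassManager

class ConvBN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, 3)
        self.bn = torch.nn.BatchNorm2d(8)

    def forward(self, x):
        return self.bn(self.conv(x))

model = ConvBN().eval()
example_input = (torch.ones(1, 3, 32, 32),)
graph_module = torch._export.capture_pre_autograd_graph(model, example_input)

# Run all default passes; the BatchNorm should be folded into the Conv.
NXPPyTorchPassManager(graph_module).run()
assert not any(
    node.op == "call_function"
    and node.target == torch.ops.aten._native_batch_norm_legit_no_training.default
    for node in graph_module.graph.nodes
)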

backends/nxp/tests/executorch_pipeline.py

Lines changed: 11 additions & 0 deletions

@@ -1,10 +1,16 @@
+# Copyright 2024-2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 import torch
 from torch import nn
 from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e

 from executorch import exir
 from executorch.backends.nxp.neutron_partitioner import NeutronPartitioner
 from executorch.backends.nxp.nxp_backend import generate_neutron_compile_spec
+from executorch.backends.nxp.pytorch_passes.nxp_pytorch_pass_manager import NXPPyTorchPassManager
 from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer
 from executorch.extension.export_util.utils import export_to_edge
 from executorch.exir import EdgeProgramManager, ExecutorchBackendConfig, ExecutorchProgramManager
@@ -26,6 +32,11 @@ def to_quantized_edge_program(model: torch.nn.Module, input_shape: tuple, target
     example_input = (torch.ones(*input_shape),)

     exir_program_aten = torch._export.capture_pre_autograd_graph(model, example_input)
+
+    # Run pre-processing passes of the float32 aten dialect program.
+    pass_manager = NXPPyTorchPassManager(exir_program_aten)
+    pass_manager.run()  # All passes by default.
+
     exir_program_aten_quant = _quantize_model(exir_program_aten, calibration_inputs)
     edge_program_manager = export_to_edge(exir_program_aten_quant, example_input)
