pytorch
diff --git a/‎CMakeLists.txt
Lines changed: 4 additions & 29 deletions b/‎CMakeLists.txt
Lines changed: 4 additions & 29 deletions
diff --git a/‎backends/apple/coreml/scripts/install_requirements.sh
Lines changed: 1 addition & 1 deletion b/‎backends/apple/coreml/scripts/install_requirements.sh
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/__init__.py
Lines changed: 5 additions & 0 deletions b/‎backends/arm/_passes/__init__.py
Lines changed: 5 additions & 0 deletions
diff --git a/‎backends/arm/_passes/arm_pass.py
Lines changed: 33 additions & 0 deletions b/‎backends/arm/_passes/arm_pass.py
Lines changed: 33 additions & 0 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py
Lines changed: 6 additions & 7 deletions b/‎backends/arm/_passes/arm_pass_manager.py
Lines changed: 6 additions & 7 deletions
diff --git a/‎backends/arm/_passes/arm_pass_utils.py
Lines changed: 16 additions & 2 deletions b/‎backends/arm/_passes/arm_pass_utils.py
Lines changed: 16 additions & 2 deletions
diff --git a/‎backends/arm/_passes/decompose_layernorm_pass.py
Lines changed: 30 additions & 8 deletions b/‎backends/arm/_passes/decompose_layernorm_pass.py
Lines changed: 30 additions & 8 deletions
diff --git a/‎backends/arm/_passes/decompose_meandim_pass.py
Lines changed: 6 additions & 6 deletions b/‎backends/arm/_passes/decompose_meandim_pass.py
Lines changed: 6 additions & 6 deletions
diff --git a/‎backends/arm/_passes/decompose_softmax_unstable_pass.py
Lines changed: 7 additions & 7 deletions b/‎backends/arm/_passes/decompose_softmax_unstable_pass.py
Lines changed: 7 additions & 7 deletions
@@ -764,10 +764,6 @@ if(EXECUTORCH_BUILD_EXTENSION_MODULE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
 endif()
 
-if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training)
-endif()
-
 if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util)
 endif()
@@ -872,34 +868,13 @@ if(EXECUTORCH_BUILD_PYBIND)
 
   if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
 
-    set(_pybind_training_dep_libs
-        ${TORCH_PYTHON_LIBRARY}
-        etdump
-        executorch
-        util
-        torch
-        extension_training
-    )
-
-    if(EXECUTORCH_BUILD_XNNPACK)
-      # need to explicitly specify XNNPACK and microkernels-prod
-      # here otherwise uses XNNPACK and microkernel-prod symbols from libtorch_cpu
-      list(APPEND _pybind_training_dep_libs xnnpack_backend XNNPACK microkernels-prod)
-    endif()
-
-    # pybind training
-    pybind11_add_module(_training_lib SHARED extension/training/pybindings/_training_lib.cpp)
-
-    target_include_directories(_training_lib PRIVATE ${TORCH_INCLUDE_DIRS})
-    target_compile_options(_training_lib PUBLIC ${_pybind_compile_options})
-    target_link_libraries(_training_lib PRIVATE ${_pybind_training_dep_libs})
-
-    install(TARGETS _training_lib
-            LIBRARY DESTINATION executorch/extension/training/pybindings
-    )
   endif()
 endif()
 
+if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training)
+endif()
+
 if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
   # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops)
 
@@ -12,7 +12,7 @@ SCRIPT_DIR_PATH="$(
 
 # TODO(jathu): remove the need to fetch coremltools to build deps for coreml_executor_runner.
 # Keep this version in sync with: pyproject.toml
-COREMLTOOLS_VERSION="8.1"
+COREMLTOOLS_VERSION="8.2"
 
 red=`tput setaf 1`
 green=`tput setaf 2`
 
@@ -7,6 +7,7 @@
 from . import arm_pass_utils  # noqa
 from .annotate_channels_last_dim_order_pass import AnnotateChannelsLastDimOrder  # noqa
 from .annotate_decomposed_matmul import AnnotateDecomposedMatmulPass  # noqa
+from .arm_pass import ArmPass  # noqa
 from .cast_int64_pass import CastInt64BuffersToInt32Pass  # noqa
 from .cast_to_int32_pass import CastToInt32Pass  # noqa
 from .conv1d_unsqueeze_pass import Conv1dUnsqueezePass  # noqa
@@ -41,6 +42,10 @@
 from .meandim_to_averagepool_pass import ConvertMeanDimToAveragePoolPass  # noqa
 from .mm_to_bmm_pass import ConvertMmToBmmPass  # noqa
 from .remove_clone_pass import RemoveClonePass  # noqa
+from .replace_scalar_with_tensor_pass import (  # noqa
+    ReplaceScalarWithTensorArgPassTOSABI,
+    ReplaceScalarWithTensorArgPassTOSAMI,
+)
 from .scalars_to_attribute_pass import ScalarsToAttributePass  # noqa
 from .size_adjust_conv2d_pass import SizeAdjustConv2DPass  # noqa
 from .unsqueeze_before_repeat_pass import UnsqueezeBeforeRepeatPass  # noqa
 
@@ -0,0 +1,33 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+import traceback
+from typing import Optional
+
+import torch
+from executorch.exir.pass_base import ExportPass, NodeMetadata
+
+
+class ArmPass(ExportPass):
+    """Common base class for the Arm backend's export passes."""
+
+    def __init__(self, exported_program: Optional[torch.export.ExportedProgram] = None):
+        super().__init__()
+        self.exported_program = exported_program
+
+    def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False):
+        """Emit `op`; when `updated`, append the caller's frame to meta's stack_trace."""
+        if updated:
+            # Copy into a fresh dict so the caller's `meta` is left untouched.
+            patched = {key: meta[key] for key in meta.data.keys()}
+            # [-1] is this frame, [-2] is the frame that called call_operator.
+            caller_frame = traceback.format_stack()[-2]
+            previous_trace = patched.get("stack_trace", "")
+            patched["stack_trace"] = f"{previous_trace}\n{caller_frame}"
+            meta = NodeMetadata(patched)
+
+        return super().call_operator(op, args, kwargs, meta)
@@ -42,18 +42,17 @@
     MatchArgRanksPass,
     QuantizeOperatorArguments,
     RemoveClonePass,
+    ReplaceScalarWithTensorArgPassTOSABI,
+    ReplaceScalarWithTensorArgPassTOSAMI,
     RetraceFoldedDtypesPass,
     ScalarsToAttributePass,
     SizeAdjustConv2DPass,
     UnsqueezeBeforeRepeatPass,
     UnsqueezeScalarPlaceholdersPass,
 )
+
 from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
 from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform
-
-from executorch.backends.transforms.replace_scalar_with_tensor import (
-    ReplaceScalarWithTensorArgPass,
-)
 from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass
 from executorch.exir import ExportedProgram
 from executorch.exir.pass_manager import PassManager
@@ -84,7 +83,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         if isinstance(self.tosa_spec, Tosa_0_80) and self.tosa_spec.is_U55_subset:
             self.add_pass(CastToInt32Pass())
 
-        self.add_pass(ReplaceScalarWithTensorArgPass())
+        self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
         self.add_pass(AnnotateDecomposedMatmulPass())
         self.add_pass(QuantizeOperatorArguments())
         self.add_pass(FoldAndAnnotateQParamsPass())  # type: ignore[call-arg]
@@ -113,7 +112,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         return self._transform(exported_program.graph_module)
 
     def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
-        self.add_pass(ReplaceScalarWithTensorArgPass())
+        self.add_pass(ReplaceScalarWithTensorArgPassTOSAMI())
         self.add_pass(FuseQuantizedActivationPass())
         self.add_pass(RemoveGetItemPass())
         self.add_pass(ConvertSplitToSlicePass())
@@ -170,7 +169,7 @@ def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
             )
 
     def transform_for_annotation_pipeline(self, graph_module: GraphModule):
-        self.add_pass(ReplaceScalarWithTensorArgPass())
+        self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
         self.add_pass(ScalarsToAttributePass())
         self.add_pass(DecomposeLayerNormPass())
         self.add_pass(DecomposeVarPass())
 
@@ -7,12 +7,12 @@
 
 # pyre-unsafe
 
+import traceback
 from inspect import isclass
 from typing import Optional, Sequence
 
 import torch
 import torch.fx
-
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
 
@@ -96,6 +96,7 @@ def create_node(
     kwargs: Optional[dict] = None,
     quantize: bool = False,
     q_params: Optional[tuple] = None,
+    from_node: Optional[torch.fx.Node] = None,
 ):
     """
     Adds a node to 'graph'. graph.inserting_before/after() should be used before the call to decide where to insert the node.
@@ -108,15 +109,26 @@ def create_node(
         args=args,
         kwargs=kwargs or {},
     )
+
+    new_meta = {}
+    if from_node:
+        keys = from_node.meta.keys()
+        for key in keys:
+            new_meta[key] = from_node.meta[key]
+    old_stack_trace = new_meta.get("stack_trace", "")
+    new_meta["stack_trace"] = f"{old_stack_trace}\n{traceback.format_stack()[-2]}"
+    node.meta = new_meta
+
     if quantize and q_params:
-        return insert_q_dq_pair(graph, node, q_params)
+        return insert_q_dq_pair(graph, node, q_params, from_node)
     return node
 
 
 def insert_q_dq_pair(
     graph: torch.fx.Graph,
     anchor: torch.fx.Node,
     q_params: tuple,
+    from_node: Optional[torch.fx.Node] = None,
 ):
     """
     Inserts a q dq node pair after the node 'anchor'.
@@ -127,13 +139,15 @@ def insert_q_dq_pair(
             graph=graph,
             op_target=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
             args=(),  # We add the argument last
+            from_node=from_node if from_node else anchor,
         )
         q.meta = anchor.meta
     with graph.inserting_after(q):
         dq = create_node(
             graph=graph,
             op_target=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
             args=(q,) + q_params,
+            from_node=from_node if from_node else anchor,
         )
         dq.meta = q.meta
     anchor.replace_all_uses_with(dq)
 
@@ -9,9 +9,10 @@
 import operator
 
 import torch
+from executorch.backends.arm._passes import ArmPass
 from executorch.backends.arm._passes.arm_pass_utils import create_node
 from executorch.exir.dialects._ops import ops as exir_ops
-from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.pass_base import PassResult
 
 
 def get_layer_norm_decomposition(op) -> tuple:
@@ -40,7 +41,7 @@ def get_layer_norm_decomposition(op) -> tuple:
     raise RuntimeError(f"Can't get layer_norm composition for op {op}")
 
 
-class DecomposeLayerNormPass(ExportPass):
+class DecomposeLayerNormPass(ArmPass):
     """
     layernorm is defined as: ((x - E[x]) / sqrt(Var[x] + eps)) * weights + bias
     Decompose layernorm(x, normalized_shape, weights, bias, eps) to a sequence of:
@@ -111,35 +112,56 @@ def call(self, graph_module: torch.fx.GraphModule):
                     var_op,
                     args=(x, dims),
                     kwargs={"correction": 0, "keepdim": keepdim},
+                    from_node=node,
                 )
                 full = create_node(
                     graph_module.graph,
                     full_op,
                     args=(epsilon_reshaped_shape, epsilon),
                     kwargs={"dtype": dtype},
+                    from_node=node,
+                )
+                add0 = create_node(
+                    graph_module.graph, add_op, args=(var, full), from_node=node
+                )
+                rsqrt = create_node(
+                    graph_module.graph, rsqrt_op, args=(add0,), from_node=node
+                )
+                mul0 = create_node(
+                    graph_module.graph, mul_op, args=(sub, rsqrt), from_node=node
                 )
-                add0 = create_node(graph_module.graph, add_op, args=(var, full))
-                rsqrt = create_node(graph_module.graph, rsqrt_op, args=(add0,))
-                mul0 = create_node(graph_module.graph, mul_op, args=(sub, rsqrt))
                 if weights is not None:
                     weights_reshaped = create_node(
                         graph_module.graph,
                         view_op,
                         args=(weights, weights_reshaped_shape),
+                        from_node=node,
                     )
                     mul1 = create_node(
-                        graph_module.graph, mul_op, args=(mul0, weights_reshaped)
+                        graph_module.graph,
+                        mul_op,
+                        args=(
+                            mul0,
+                            weights_reshaped,
+                        ),
+                        from_node=node,
                     )
                 else:
                     mul1 = mul0
                 output = mul1
                 if bias is not None:
                     bias_reshaped_shape = weights_reshaped_shape
                     bias_reshaped = create_node(
-                        graph_module.graph, view_op, args=(bias, bias_reshaped_shape)
+                        graph_module.graph,
+                        view_op,
+                        args=(bias, bias_reshaped_shape),
+                        from_node=node,
                     )
                     output = create_node(
-                        graph_module.graph, add_op, args=(mul1, bias_reshaped)
+                        graph_module.graph,
+                        add_op,
+                        args=(mul1, bias_reshaped),
+                        from_node=node,
                     )
 
                 users = [user for user in node.users if node != user]
 
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -7,9 +7,9 @@
 # pyre-unsafe
 
 import torch
+from executorch.backends.arm._passes import ArmPass
 from executorch.backends.arm._passes.arm_pass_utils import get_node_arg
 from executorch.exir.dialects._ops import ops as exir_ops
-from executorch.exir.pass_base import ExportPass
 
 
 def get_meandim_decomposition(op) -> tuple:
@@ -28,7 +28,7 @@ def get_meandim_decomposition(op) -> tuple:
     raise RuntimeError(f"Can't get meandim decomposition for op {op}")
 
 
-class DecomposeMeanDimPass(ExportPass):
+class DecomposeMeanDimPass(ArmPass):
     """
     This pass decomposes meandim into a sum and mul node.
 
@@ -62,8 +62,8 @@ def call_operator(self, op, args, kwargs, meta):
 
         sum_op, full_op, mul_op = get_meandim_decomposition(op)
 
-        sum = super().call_operator(sum_op, (x, dim, keepdim), {}, meta)
+        sum = super().call_operator(sum_op, (x, dim, keepdim), {}, meta, True)
         full = super().call_operator(
-            full_op, ([1] * len(shape), 1 / N), {"dtype": dtype}, meta
+            full_op, ([1] * len(shape), 1 / N), {"dtype": dtype}, meta, True
         )
-        return super().call_operator(mul_op, (sum, full), {}, meta)
+        return super().call_operator(mul_op, (sum, full), {}, meta, True)
@@ -6,8 +6,8 @@
 # pyre-unsafe
 
 import torch
+from executorch.backends.arm._passes import ArmPass
 from executorch.exir.dialects._ops import ops as exir_ops
-from executorch.exir.pass_base import ExportPass
 
 # For BI case
 torch_softmax = (torch.ops.aten.softmax.int, torch.ops.aten.log_softmax.int)
@@ -45,7 +45,7 @@ def get_logsoftmax_ops(op) -> tuple:
     raise RuntimeError(f"Can't get softmax decomposition ops for op {op}")
 
 
-class DecomposeSoftmaxUnstablePass(ExportPass):
+class DecomposeSoftmaxUnstablePass(ArmPass):
     """
     This pass decomposes log softmax or softmax into more primitive ops.
 
@@ -66,10 +66,10 @@ def call_operator(self, op, args, kwargs, meta):
         _input = args[0]
         dim = [args[1]]
 
-        op1 = super().call_operator(exp_op, (_input,), {}, meta)
-        op2 = super().call_operator(sum_op, (op1, dim, True), {}, meta)
-        op3 = super().call_operator(reciprocal_op, (op2,), {}, meta)
-        op4 = super().call_operator(mul_op, (op1, op3), {}, meta)
+        op1 = super().call_operator(exp_op, (_input,), {}, meta, True)
+        op2 = super().call_operator(sum_op, (op1, dim, True), {}, meta, True)
+        op3 = super().call_operator(reciprocal_op, (op2,), {}, meta, True)
+        op4 = super().call_operator(mul_op, (op1, op3), {}, meta, True)
         if op in log_softmax:
-            op4 = super().call_operator(log_op, (op4,), {}, meta)
+            op4 = super().call_operator(log_op, (op4,), {}, meta, True)
         return op4