pytorch
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
Lines changed: 1 addition & 0 deletions
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
Lines changed: 3 additions & 0 deletions
diff --git a/backends/arm/_passes/decompose_glu_pass.py b/backends/arm/_passes/decompose_glu_pass.py
Lines changed: 75 additions & 0 deletions
diff --git a/backends/arm/_passes/fuse_equal_placeholders_pass.py b/backends/arm/_passes/fuse_equal_placeholders_pass.py
Lines changed: 43 additions & 44 deletions
diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py
Lines changed: 2 additions & 0 deletions
diff --git a/backends/arm/scripts/build_executor_runner.sh b/backends/arm/scripts/build_executor_runner.sh
Lines changed: 7 additions & 1 deletion
diff --git a/backends/arm/scripts/build_portable_kernels.sh b/backends/arm/scripts/build_portable_kernels.sh
Lines changed: 1 addition & 89 deletions
@@ -315,7 +315,7 @@ jobs:
         bash examples/models/moshi/mimi/install_requirements.sh
 
         # reinstall executorch
-        bash ./install_executorch.sh
+        bash ./install_executorch.sh --minimal
 
         # run python unittest
         python -m unittest examples.models.moshi.mimi.test_mimi
 
@@ -288,6 +288,7 @@ jobs:
           - test_arm_baremetal: test_models_tosa
           - test_arm_baremetal: test_models_ethos-u55
           - test_arm_baremetal: test_models_ethos-u85
+          - test_arm_baremetal: test_smaller_stories_llama
       fail-fast: false
     with:
       runner: linux.2xlarge.memory
 
@@ -36,6 +36,7 @@
 from .decompose_div_pass import DecomposeDivPass  # noqa
 from .decompose_embedding_pass import DecomposeEmbeddingPass  # noqa  # noqa
 from .decompose_gelu_pass import DecomposeGeluPass  # noqa
+from .decompose_glu_pass import DecomposeGluPass  # noqa
 from .decompose_grouped_conv import DecomposeGroupedConv  # noqa
 from .decompose_groupnorm_pass import DecomposeGroupNormPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
 
@@ -41,6 +41,7 @@
     DecomposeDivPass,
     DecomposeEmbeddingPass,
     DecomposeGeluPass,
+    DecomposeGluPass,
     DecomposeGroupedConv,
     DecomposeGroupNormPass,
     DecomposeLayerNormPass,
@@ -184,6 +185,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(ConvertSplitToSlicePass())
         self.add_pass(FuseBatchnorm2DPass(exported_program))
         self.add_pass(ConvertMmToBmmPass())
+        self.add_pass(DecomposeGluPass())
         self.add_pass(DecomposeLinearPass())
         self.add_pass(DecomposeLeakyReLUPass())
         self.add_pass(DecomposeGroupNormPass())
@@ -264,6 +266,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeMeanDimPass(graph_module, self.tosa_spec))
         self.add_pass(DecomposeNotEqualPass())
         self.add_pass(DecomposeCosineSimilarityPass())
+        self.add_pass(DecomposeGluPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeLeakyReLUPass())
         self.add_pass(DecomposeLinearVectorNormPass())
 
@@ -0,0 +1,75 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.backends.arm._passes import ArmPass
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+# For FP case
+edge_glu = exir_ops.edge.aten.glu.default
+
+# For INT case
+aten_glu = torch.ops.aten.glu.default
+
+
+def get_ops(op):
+    """Returns the (mul, sigmoid, slice_copy) operators matching a GLU op.
+
+    Raises ValueError when `op` is neither the edge nor the aten GLU overload.
+    """
+    if op == edge_glu:
+        namespace = exir_ops.edge.aten
+    elif op == aten_glu:
+        namespace = torch.ops.aten
+    else:
+        raise ValueError(f"Unsupported operator: {op}")
+    return (
+        namespace.mul.Tensor,
+        namespace.sigmoid.default,
+        namespace.slice_copy.Tensor,
+    )
+
+
+class DecomposeGluPass(ArmPass):
+    """Decomposes the GLU operator into hadamard product and sigmoid.
+
+    Computes first_half * sigmoid(second_half) for the input halved along `dim`.
+    """
+
+    def call_operator(self, op, args, kwargs, meta):
+        """Rewrite edge/aten GLU calls; forward every other operator unchanged.
+
+        Raises:
+            RuntimeError: if the input node carries no "val" metadata, or if
+                its size along `dim` is odd.
+        """
+        if op not in (edge_glu, aten_glu):
+            return super().call_operator(op, args, kwargs, meta)
+
+        hadamard_prod, sigmoid, slice_op = get_ops(op)
+        X = args[0]
+        # `dim` may arrive positionally or by keyword; -1 is the fallback.
+        dim = args[1] if len(args) > 1 else kwargs.get("dim", -1)
+
+        if "val" not in X.node.meta:
+            # RuntimeError (not bare Exception) to match the size check below.
+            raise RuntimeError("Could not get dimension metadata in input.")
+        val = X.node.meta["val"]
+
+        if dim < 0:
+            dim += val.dim()  # normalize a negative axis before slicing
+        n = val.size(dim)
+        if n % 2:
+            raise RuntimeError(
+                f"glu expects an even split along dim={dim}, got size {n}"
+            )
+        middle = n // 2
+
+        emit = super().call_operator
+        T1 = emit(slice_op, (X, dim, 0, middle), {}, meta, updated=True)
+        T2 = emit(slice_op, (X, dim, middle, n), {}, meta, updated=True)
+        T2_sigmoid = emit(sigmoid, (T2,), {}, meta, updated=True)
+        return emit(hadamard_prod, (T1, T2_sigmoid), {}, meta, updated=True)
@@ -3,6 +3,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import hashlib
+from collections import defaultdict
+
 import torch
 from executorch.backends.arm._passes.arm_pass_utils import (
     get_constant_placeholder_kind,
@@ -21,7 +24,7 @@ class FuseEqualPlaceholdersPass(ExportPass):
     """
     This pass optimizes memory usage by finding constant placeholders
     pointing to identical tensors and fusing them to one single placeholder
-    with multiple users.
+    with multiple users, using a cache for faster comparison.
     """
 
     def __init__(self, exported_program: ExportedProgram):
@@ -30,58 +33,54 @@ def __init__(self, exported_program: ExportedProgram):
 
     def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
         modified = False
-        const_placeholder_nodes = []
-        for node in graph_module.graph.nodes:
-            if is_param_node(self.exported_program, node):
-                const_placeholder_nodes.append(node)
-
-        while const_placeholder_nodes:
 
-            # Find equal tensors
-            node1 = const_placeholder_nodes.pop()
-            eq_nodes = [node1]
-            tensor1 = get_param_tensor(self.exported_program, node1)
-            if tensor1 is None:
+        # Group param nodes by fingerprint: hash_key -> list of (node, tensor)
+        hash_buckets = defaultdict(list)
+        for node in graph_module.graph.nodes:
+            if not is_param_node(self.exported_program, node):
                 continue
+            tensor = get_param_tensor(self.exported_program, node)
+            if tensor is None:
+                continue
+            # Fingerprint: dtype + shape + SHA1 of the raw bytes (contiguous CPU copy).
+            # NOTE(review): byte equality is stricter than the removed allclose check.
+            t_cpu = tensor.detach().cpu().contiguous()
+            data_bytes = t_cpu.numpy().tobytes()
+            key = (
+                str(t_cpu.dtype),
+                tuple(t_cpu.shape),
+                hashlib.sha1(data_bytes).hexdigest(),
+            )
+            hash_buckets[key].append((node, t_cpu))
 
-            for node2 in const_placeholder_nodes:
-                tensor2 = get_param_tensor(self.exported_program, node2)
-                if tensor2 is None:
-                    continue
-
-                if (
-                    tensor1.dtype == tensor2.dtype
-                    and tensor1.shape == tensor2.shape
-                    and torch.allclose(tensor1, tensor2, atol=1e-08)
-                ):
-                    eq_nodes.append(node2)
+        # For each bucket with more than one entry, fuse:
+        for nodes_tensors in hash_buckets.values():
+            if len(nodes_tensors) < 2:
+                continue
 
-            if len(eq_nodes) > 1:
-                common_name = node1.name + "_common"
-                common_kind = get_constant_placeholder_kind(
-                    self.exported_program, node1
+            # Create a new placeholder from first in list of equal placeholders.
+            rep_node, rep_tensor = nodes_tensors[0]
+            common_name = rep_node.name + "_common"
+            common_kind = get_constant_placeholder_kind(self.exported_program, rep_node)
+            common_persistent = True
+            with graph_module.graph.inserting_before(rep_node):
+                common_node = create_constant_placeholder(
+                    self.exported_program,
+                    graph_module.graph,
+                    common_name,
+                    common_kind,
+                    rep_tensor,
+                    common_persistent,
                 )
-                common_persisten_buffer = True
-
-                with graph_module.graph.inserting_before(node1):
-                    common_node = create_constant_placeholder(
-                        self.exported_program,
-                        graph_module.graph,
-                        common_name,
-                        common_kind,
-                        tensor1,
-                        common_persisten_buffer,
-                    )
-
-                for eq_node in eq_nodes:
-                    eq_node.replace_all_uses_with(common_node)
-                    delete_constant_placeholder(self.exported_program, eq_node)
-                    if eq_node != node1:
-                        const_placeholder_nodes.remove(eq_node)
 
+            # Redirect users to the fused node, then delete every original (rep included)
+            for node, _ in nodes_tensors:
+                node.replace_all_uses_with(common_node)
+                delete_constant_placeholder(self.exported_program, node)
                 modified = True
 
         if modified:
             graph_module.recompile()
             graph_module = super().call(graph_module).graph_module
+
         return PassResult(graph_module=graph_module, modified=modified)
@@ -258,6 +258,7 @@ def is_node_supported(
             exir_ops.edge.aten.masked_fill.Scalar,
             exir_ops.edge.aten.asinh.default,
             exir_ops.edge.aten.cosh.default,
+            exir_ops.edge.aten.glu.default,
         ]
 
         return supported
@@ -299,6 +300,7 @@ def is_node_supported(
             exir_ops.edge.aten.leaky_relu.default: None,
             exir_ops.edge.aten.round.default: None,
             exir_ops.edge.aten.addmm.default: None,
+            exir_ops.edge.aten.glu.default: None,
         }
 
         if node.target in needs_decomp_dict:
 
@@ -25,6 +25,7 @@ output_folder_set=false
 output_folder="."
 et_build_root="${et_root_dir}/arm_test"
 ethosu_tools_dir=${et_root_dir}/examples/arm/ethos-u-scratch
+select_ops_list=""
 
 build_bundleio_flags=" -DET_BUNDLE_IO=OFF "
 build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF "
@@ -47,7 +48,10 @@ help() {
     echo "  --et_build_root=<FOLDER>        Build output root folder to use, defaults to ${et_build_root}"
     echo "  --ethosu_tools_dir=<FOLDER>     Path to your Ethos-U tools dir if you not using default: ${ethosu_tools_dir}"
     echo "  --toolchain=<TOOLCHAIN>         Toolchain can be specified (e.g. bare metal as arm-none-eabi-gcc or zephyr as arm-zephyr-eabi-gcc"
-    exit 0
+    echo "  --select_ops_list=<OPS>         Comma separated list of portable (non delagated) kernels to include Default: ${select_ops_list}"
+    echo "                                     NOTE: This is used when select_ops_model is not possible to use, e.g. for semihosting or bundleio."
+    echo "                                     See https://docs.pytorch.org/executorch/stable/kernel-library-selective-build.html for more information."
+   exit 0
 }
 
 for arg in "$@"; do
@@ -65,6 +69,7 @@ for arg in "$@"; do
       --et_build_root=*) et_build_root="${arg#*=}";;
       --ethosu_tools_dir=*) ethosu_tools_dir="${arg#*=}";;
       --toolchain=*) toolchain="${arg#*=}";;
+      --select_ops_list=*) select_ops_list="${arg#*=}";;
       *)
       ;;
     esac
@@ -157,6 +162,7 @@ cmake \
     -DPYTHON_EXECUTABLE=$(which python3)        \
     -DSYSTEM_CONFIG=${system_config}            \
     -DMEMORY_MODE=${memory_mode}                \
+    -DEXECUTORCH_SELECT_OPS_LIST="${select_ops_list}" \
     ${extra_build_flags}                        \
     -B ${output_folder}/cmake-out
 
 
@@ -4,92 +4,4 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# Optional parameter:
-# --build_type= "Release" | "Debug" | "RelWithDebInfo"
-# --etdump      build with devtools-etdump support
-
-set -eu
-
-script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
-et_root_dir=$(cd ${script_dir}/../../.. && pwd)
-et_root_dir=$(realpath ${et_root_dir})
-toolchain=arm-none-eabi-gcc
-setup_path_script=${et_root_dir}/examples/arm/ethos-u-scratch/setup_path.sh
-_setup_msg="please refer to ${et_root_dir}/examples/arm/setup.sh to properly install necessary tools."
-
-
-et_build_root="${et_root_dir}/arm_test"
-build_type="Release"
-portable_kernels="aten::_softmax.out"
-
-help() {
-    echo "Usage: $(basename $0) [options]"
-    echo "Options:"
-    echo "  --et_build_root=<FOLDER>   Build output root folder to use, defaults to ${et_build_root}"
-    echo "  --build_type=<TYPE>        Build with Release, Debug or RelWithDebInfo, default is ${build_type}"
-    echo "  --portable_kernels=<OPS>   Comma separated list of portable (non delagated) kernels to include Default: ${portable_kernels}"
-    echo "  --toolchain=<TOOLCHAIN>    Toolchain can be specified (e.g. bare metal as arm-none-eabi-gcc or zephyr as arm-zephyr-eabi-gcc"
-    exit 0
-}
-
-for arg in "$@"; do
-    case $arg in
-      -h|--help) help ;;
-      --et_build_root=*) et_build_root="${arg#*=}";;
-      --build_type=*) build_type="${arg#*=}";;
-      --portable_kernels=*) portable_kernels="${arg#*=}";;
-      --toolchain=*) toolchain="${arg#*=}";;
-      *)
-      ;;
-    esac
-done
-
-if [[ ${toolchain} == "arm-none-eabi-gcc" ]]; then
-    toolchain_cmake=${et_root_dir}/examples/arm/ethos-u-setup/${toolchain}.cmake
-elif [[ ${toolchain} == "arm-zephyr-eabi-gcc" ]]; then 
-    toolchain_cmake=${et_root_dir}/examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake
-else
-    echo "Error: Invalid toolchain selection, provided: ${tolchain}"
-    echo "    Valid options are {arm-none-eabi-gcc, arm-zephyr-eabi-gcc}"
-    exit 1;
-fi
-toolchain_cmake=$(realpath ${toolchain_cmake})
-
-# Source the tools
-# This should be prepared by the setup.sh
-[[ -f ${setup_path_script} ]] \
-    || { echo "Missing ${setup_path_script}. ${_setup_msg}"; exit 1; }
-
-source ${setup_path_script}
-
-et_build_dir=${et_build_root}/cmake-out
-
-cd "${et_root_dir}"
-
-echo "--------------------------------------------------------------------------------" ;
-echo "Build ExecuTorch Libraries ${build_type} portable kernels: ${portable_kernels} into '${et_build_dir}'" ;
-echo "--------------------------------------------------------------------------------"
-
-if ! [[ $portable_kernels =~ ^((^|,)aten::[a-zA-Z0-9_]+\.[a-zA-Z0-9_]*out)*$ ]]; then
-    echo " ERROR: specified argument --portable_kernels=${portable_kernels}"
-    echo "        is in the wrong format please use \"aten::<OP1>.out,aten::<OP2>.out,...\""
-    echo "        e.g. \"aten::_softmax.out,aten::add.out\""
-    exit 1
-fi
-
-set -x
-
-cmake                                                 \
-    -DCMAKE_INSTALL_PREFIX=${et_build_dir}            \
-    -DCMAKE_BUILD_TYPE=${build_type}                  \
-    -DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}"       \
-    -DEXECUTORCH_SELECT_OPS_LIST=${portable_kernels}  \
-    -B"${et_build_dir}/examples/arm"                  \
-    "${et_root_dir}/examples/arm"
-
-cmake --build "${et_build_dir}/examples/arm" -j$(nproc) --config ${build_type} --
-
-set +x
-
-echo "[$(basename $0)] Generated static libraries for ExecuTorch:"
-find "${et_build_dir}/examples/arm" -name "*.a" -exec ls -al {} \;
+echo "DEPRECATED: build_portable_kernels.sh is deprecated and will be removed. The kernel registration library is now built directly with the arm_executor_runner."
Original file line number	Diff line number	Diff line change
`@@ -258,6 +258,7 @@ def is_node_supported(`
`258`	`258`	`exir_ops.edge.aten.masked_fill.Scalar,`
`259`	`259`	`exir_ops.edge.aten.asinh.default,`
`260`	`260`	`exir_ops.edge.aten.cosh.default,`
	`261`	`+ exir_ops.edge.aten.glu.default,`
`261`	`262`	`]`
`262`	`263`
`263`	`264`	`return supported`
`@@ -299,6 +300,7 @@ def is_node_supported(`
`299`	`300`	`exir_ops.edge.aten.leaky_relu.default: None,`
`300`	`301`	`exir_ops.edge.aten.round.default: None,`
`301`	`302`	`exir_ops.edge.aten.addmm.default: None,`
	`303`	`+ exir_ops.edge.aten.glu.default: None,`
`302`	`304`	`}`
`303`	`305`
`304`	`306`	`if node.target in needs_decomp_dict:`