Skip to content

Commit e28b9b9

Browse files
authored
Merge branch 'main' into add-dim-order-clone-kernel
2 parents b7bc064 + 2d4533a commit e28b9b9

File tree

77 files changed

+2441
-855
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

77 files changed

+2441
-855
lines changed

.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ jobs:
315315
bash examples/models/moshi/mimi/install_requirements.sh
316316
317317
# reinstall executorch
318-
bash ./install_executorch.sh
318+
bash ./install_executorch.sh --minimal
319319
320320
# run python unittest
321321
python -m unittest examples.models.moshi.mimi.test_mimi

.github/workflows/trunk.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,7 @@ jobs:
288288
- test_arm_baremetal: test_models_tosa
289289
- test_arm_baremetal: test_models_ethos-u55
290290
- test_arm_baremetal: test_models_ethos-u85
291+
- test_arm_baremetal: test_smaller_stories_llama
291292
fail-fast: false
292293
with:
293294
runner: linux.2xlarge.memory

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from .decompose_div_pass import DecomposeDivPass # noqa
3737
from .decompose_embedding_pass import DecomposeEmbeddingPass # noqa # noqa
3838
from .decompose_gelu_pass import DecomposeGeluPass # noqa
39+
from .decompose_glu_pass import DecomposeGluPass # noqa
3940
from .decompose_grouped_conv import DecomposeGroupedConv # noqa
4041
from .decompose_groupnorm_pass import DecomposeGroupNormPass # noqa
4142
from .decompose_layernorm_pass import DecomposeLayerNormPass # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
DecomposeDivPass,
4242
DecomposeEmbeddingPass,
4343
DecomposeGeluPass,
44+
DecomposeGluPass,
4445
DecomposeGroupedConv,
4546
DecomposeGroupNormPass,
4647
DecomposeLayerNormPass,
@@ -184,6 +185,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
184185
self.add_pass(ConvertSplitToSlicePass())
185186
self.add_pass(FuseBatchnorm2DPass(exported_program))
186187
self.add_pass(ConvertMmToBmmPass())
188+
self.add_pass(DecomposeGluPass())
187189
self.add_pass(DecomposeLinearPass())
188190
self.add_pass(DecomposeLeakyReLUPass())
189191
self.add_pass(DecomposeGroupNormPass())
@@ -264,6 +266,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
264266
self.add_pass(DecomposeMeanDimPass(graph_module, self.tosa_spec))
265267
self.add_pass(DecomposeNotEqualPass())
266268
self.add_pass(DecomposeCosineSimilarityPass())
269+
self.add_pass(DecomposeGluPass())
267270
self.add_pass(DecomposeDivPass())
268271
self.add_pass(DecomposeLeakyReLUPass())
269272
self.add_pass(DecomposeLinearVectorNormPass())
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# Copyright 2025 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
import torch
7+
from executorch.backends.arm._passes import ArmPass
8+
from executorch.exir.dialects._ops import ops as exir_ops
9+
10+
11+
# For FP case
12+
edge_glu = exir_ops.edge.aten.glu.default
13+
14+
# For INT case
15+
aten_glu = torch.ops.aten.glu.default
16+
17+
18+
def get_ops(op):
    """Pick the mul / sigmoid / slice_copy overloads that match ``op``'s dialect.

    Args:
        op: The GLU operator being decomposed; compared against the
            module-level ``edge_glu`` and ``aten_glu`` targets.

    Returns:
        A 3-tuple ``(mul.Tensor, sigmoid.default, slice_copy.Tensor)`` taken
        from ``exir_ops.edge.aten`` when ``op`` is the edge GLU, or from
        ``torch.ops.aten`` when ``op`` is the aten GLU.

    Raises:
        ValueError: If ``op`` is neither of the two GLU targets.
    """
    # Both branches need the same three overloads; only the namespace differs.
    if op == edge_glu:
        namespace = exir_ops.edge.aten
    elif op == aten_glu:
        namespace = torch.ops.aten
    else:
        raise ValueError(f"Unsupported operator: {op}")

    return (
        namespace.mul.Tensor,
        namespace.sigmoid.default,
        namespace.slice_copy.Tensor,
    )
34+
35+
36+
class DecomposeGluPass(ArmPass):
    """Rewrite GLU as ``first_half * sigmoid(second_half)``.

    The input is sliced into two equally sized halves along the GLU
    dimension; the second half goes through a sigmoid and is multiplied
    elementwise with the first half. Both the edge-dialect and the
    aten-dialect ``glu.default`` targets are handled; any other operator is
    forwarded to the parent pass untouched.
    """

    def call_operator(self, op, args, kwargs, meta):
        """Emit the slice / sigmoid / mul sequence in place of a GLU call.

        Args:
            op: Operator target of the node being visited.
            args: Positional arguments; ``args[0]`` is the input and the
                optional ``args[1]`` is the split dimension.
            kwargs: Keyword arguments; ``dim`` is read from here when it is
                not passed positionally, defaulting to ``-1``.
            meta: Node metadata, passed through to every emitted operator.

        Returns:
            The result of the parent ``call_operator`` for the final
            multiplication, or for ``op`` itself when it is not a GLU.

        Raises:
            Exception: If the input node carries no ``"val"`` metadata.
            RuntimeError: If the size along the split dimension is odd.
        """
        if op not in (edge_glu, aten_glu):
            return super().call_operator(op, args, kwargs, meta)

        mul_op, sigmoid_op, slice_op = get_ops(op)
        source = args[0]
        dim = args[1] if len(args) > 1 else kwargs.get("dim", -1)

        node_meta = source.node.meta
        if "val" not in node_meta:
            raise Exception("Could not get dimension metadata in input.")
        # NOTE(review): "val" is presumably the fake/example tensor recorded
        # at export time; only its .dim() and .size() are used here.
        val = node_meta["val"]

        # Convert a negative dim into its non-negative equivalent before it
        # is handed to the slice ops.
        if dim < 0:
            dim += val.dim()

        n = val.size(dim)
        if n % 2:
            raise RuntimeError(
                f"glu expects an even split along dim={dim}, got size {n}"
            )
        half = n // 2

        # Every replacement op goes through the parent implementation and is
        # flagged as updated.
        emit = super().call_operator

        first_half = emit(slice_op, (source, dim, 0, half), {}, meta, updated=True)
        second_half = emit(slice_op, (source, dim, half, n), {}, meta, updated=True)
        gate = emit(sigmoid_op, (second_half,), {}, meta, updated=True)

        return emit(mul_op, (first_half, gate), {}, meta, updated=True)

backends/arm/_passes/fuse_equal_placeholders_pass.py

Lines changed: 43 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
# This source code is licensed under the BSD-style license found in the
44
# LICENSE file in the root directory of this source tree.
55

6+
import hashlib
7+
from collections import defaultdict
8+
69
import torch
710
from executorch.backends.arm._passes.arm_pass_utils import (
811
get_constant_placeholder_kind,
@@ -21,7 +24,7 @@ class FuseEqualPlaceholdersPass(ExportPass):
2124
"""
2225
This pass optimizes memory usage by finding constant placeholders
2326
pointing to identical tensors and fusing them to one single placeholder
24-
with multiple users.
27+
with multiple users, using a cache for faster comparison.
2528
"""
2629

2730
def __init__(self, exported_program: ExportedProgram):
@@ -30,58 +33,54 @@ def __init__(self, exported_program: ExportedProgram):
3033

3134
def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
3235
modified = False
33-
const_placeholder_nodes = []
34-
for node in graph_module.graph.nodes:
35-
if is_param_node(self.exported_program, node):
36-
const_placeholder_nodes.append(node)
37-
38-
while const_placeholder_nodes:
3936

40-
# Find equal tensors
41-
node1 = const_placeholder_nodes.pop()
42-
eq_nodes = [node1]
43-
tensor1 = get_param_tensor(self.exported_program, node1)
44-
if tensor1 is None:
37+
# Build a cache of params: mapping hash_key -> list of (node, tensor)
38+
hash_buckets = defaultdict(list)
39+
for node in graph_module.graph.nodes:
40+
if not is_param_node(self.exported_program, node):
4541
continue
42+
tensor = get_param_tensor(self.exported_program, node)
43+
if tensor is None:
44+
continue
45+
# Create a lightweight fingerprint: dtype + shape + SHA1 of raw bytes
46+
# Ensure tensor is on CPU and contiguous
47+
t_cpu = tensor.detach().cpu().contiguous()
48+
data_bytes = t_cpu.numpy().tobytes()
49+
key = (
50+
str(t_cpu.dtype),
51+
tuple(t_cpu.shape),
52+
hashlib.sha1(data_bytes).hexdigest(),
53+
)
54+
hash_buckets[key].append((node, t_cpu))
4655

47-
for node2 in const_placeholder_nodes:
48-
tensor2 = get_param_tensor(self.exported_program, node2)
49-
if tensor2 is None:
50-
continue
51-
52-
if (
53-
tensor1.dtype == tensor2.dtype
54-
and tensor1.shape == tensor2.shape
55-
and torch.allclose(tensor1, tensor2, atol=1e-08)
56-
):
57-
eq_nodes.append(node2)
56+
# For each bucket with more than one entry, fuse:
57+
for nodes_tensors in hash_buckets.values():
58+
if len(nodes_tensors) < 2:
59+
continue
5860

59-
if len(eq_nodes) > 1:
60-
common_name = node1.name + "_common"
61-
common_kind = get_constant_placeholder_kind(
62-
self.exported_program, node1
61+
# Create a new placeholder from first in list of equal placeholders.
62+
rep_node, rep_tensor = nodes_tensors[0]
63+
common_name = rep_node.name + "_common"
64+
common_kind = get_constant_placeholder_kind(self.exported_program, rep_node)
65+
common_persistent = True
66+
with graph_module.graph.inserting_before(rep_node):
67+
common_node = create_constant_placeholder(
68+
self.exported_program,
69+
graph_module.graph,
70+
common_name,
71+
common_kind,
72+
rep_tensor,
73+
common_persistent,
6374
)
64-
common_persisten_buffer = True
65-
66-
with graph_module.graph.inserting_before(node1):
67-
common_node = create_constant_placeholder(
68-
self.exported_program,
69-
graph_module.graph,
70-
common_name,
71-
common_kind,
72-
tensor1,
73-
common_persisten_buffer,
74-
)
75-
76-
for eq_node in eq_nodes:
77-
eq_node.replace_all_uses_with(common_node)
78-
delete_constant_placeholder(self.exported_program, eq_node)
79-
if eq_node != node1:
80-
const_placeholder_nodes.remove(eq_node)
8175

76+
# Replace uses and delete duplicates
77+
for node, _ in nodes_tensors:
78+
node.replace_all_uses_with(common_node)
79+
delete_constant_placeholder(self.exported_program, node)
8280
modified = True
8381

8482
if modified:
8583
graph_module.recompile()
8684
graph_module = super().call(graph_module).graph_module
85+
8786
return PassResult(graph_module=graph_module, modified=modified)

backends/arm/operator_support/tosa_supported_operators.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,7 @@ def is_node_supported(
258258
exir_ops.edge.aten.masked_fill.Scalar,
259259
exir_ops.edge.aten.asinh.default,
260260
exir_ops.edge.aten.cosh.default,
261+
exir_ops.edge.aten.glu.default,
261262
]
262263

263264
return supported
@@ -299,6 +300,7 @@ def is_node_supported(
299300
exir_ops.edge.aten.leaky_relu.default: None,
300301
exir_ops.edge.aten.round.default: None,
301302
exir_ops.edge.aten.addmm.default: None,
303+
exir_ops.edge.aten.glu.default: None,
302304
}
303305

304306
if node.target in needs_decomp_dict:

backends/arm/scripts/build_executor_runner.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ output_folder_set=false
2525
output_folder="."
2626
et_build_root="${et_root_dir}/arm_test"
2727
ethosu_tools_dir=${et_root_dir}/examples/arm/ethos-u-scratch
28+
select_ops_list=""
2829

2930
build_bundleio_flags=" -DET_BUNDLE_IO=OFF "
3031
build_with_etdump_flags=" -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF "
@@ -47,7 +48,10 @@ help() {
4748
echo " --et_build_root=<FOLDER> Build output root folder to use, defaults to ${et_build_root}"
4849
echo " --ethosu_tools_dir=<FOLDER> Path to your Ethos-U tools dir if you not using default: ${ethosu_tools_dir}"
4950
echo " --toolchain=<TOOLCHAIN> Toolchain can be specified (e.g. bare metal as arm-none-eabi-gcc or zephyr as arm-zephyr-eabi-gcc"
50-
exit 0
51+
echo " --select_ops_list=<OPS> Comma separated list of portable (non delagated) kernels to include Default: ${select_ops_list}"
52+
echo " NOTE: This is used when select_ops_model is not possible to use, e.g. for semihosting or bundleio."
53+
echo " See https://docs.pytorch.org/executorch/stable/kernel-library-selective-build.html for more information."
54+
exit 0
5155
}
5256

5357
for arg in "$@"; do
@@ -65,6 +69,7 @@ for arg in "$@"; do
6569
--et_build_root=*) et_build_root="${arg#*=}";;
6670
--ethosu_tools_dir=*) ethosu_tools_dir="${arg#*=}";;
6771
--toolchain=*) toolchain="${arg#*=}";;
72+
--select_ops_list=*) select_ops_list="${arg#*=}";;
6873
*)
6974
;;
7075
esac
@@ -157,6 +162,7 @@ cmake \
157162
-DPYTHON_EXECUTABLE=$(which python3) \
158163
-DSYSTEM_CONFIG=${system_config} \
159164
-DMEMORY_MODE=${memory_mode} \
165+
-DEXECUTORCH_SELECT_OPS_LIST="${select_ops_list}" \
160166
${extra_build_flags} \
161167
-B ${output_folder}/cmake-out
162168

backends/arm/scripts/build_portable_kernels.sh

Lines changed: 1 addition & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -4,92 +4,4 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7-
# Optional parameter:
8-
# --build_type= "Release" | "Debug" | "RelWithDebInfo"
9-
# --etdump build with devtools-etdump support
10-
11-
set -eu
12-
13-
script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
14-
et_root_dir=$(cd ${script_dir}/../../.. && pwd)
15-
et_root_dir=$(realpath ${et_root_dir})
16-
toolchain=arm-none-eabi-gcc
17-
setup_path_script=${et_root_dir}/examples/arm/ethos-u-scratch/setup_path.sh
18-
_setup_msg="please refer to ${et_root_dir}/examples/arm/setup.sh to properly install necessary tools."
19-
20-
21-
et_build_root="${et_root_dir}/arm_test"
22-
build_type="Release"
23-
portable_kernels="aten::_softmax.out"
24-
25-
help() {
26-
echo "Usage: $(basename $0) [options]"
27-
echo "Options:"
28-
echo " --et_build_root=<FOLDER> Build output root folder to use, defaults to ${et_build_root}"
29-
echo " --build_type=<TYPE> Build with Release, Debug or RelWithDebInfo, default is ${build_type}"
30-
echo " --portable_kernels=<OPS> Comma separated list of portable (non delagated) kernels to include Default: ${portable_kernels}"
31-
echo " --toolchain=<TOOLCHAIN> Toolchain can be specified (e.g. bare metal as arm-none-eabi-gcc or zephyr as arm-zephyr-eabi-gcc"
32-
exit 0
33-
}
34-
35-
for arg in "$@"; do
36-
case $arg in
37-
-h|--help) help ;;
38-
--et_build_root=*) et_build_root="${arg#*=}";;
39-
--build_type=*) build_type="${arg#*=}";;
40-
--portable_kernels=*) portable_kernels="${arg#*=}";;
41-
--toolchain=*) toolchain="${arg#*=}";;
42-
*)
43-
;;
44-
esac
45-
done
46-
47-
if [[ ${toolchain} == "arm-none-eabi-gcc" ]]; then
48-
toolchain_cmake=${et_root_dir}/examples/arm/ethos-u-setup/${toolchain}.cmake
49-
elif [[ ${toolchain} == "arm-zephyr-eabi-gcc" ]]; then
50-
toolchain_cmake=${et_root_dir}/examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake
51-
else
52-
echo "Error: Invalid toolchain selection, provided: ${tolchain}"
53-
echo " Valid options are {arm-none-eabi-gcc, arm-zephyr-eabi-gcc}"
54-
exit 1;
55-
fi
56-
toolchain_cmake=$(realpath ${toolchain_cmake})
57-
58-
# Source the tools
59-
# This should be prepared by the setup.sh
60-
[[ -f ${setup_path_script} ]] \
61-
|| { echo "Missing ${setup_path_script}. ${_setup_msg}"; exit 1; }
62-
63-
source ${setup_path_script}
64-
65-
et_build_dir=${et_build_root}/cmake-out
66-
67-
cd "${et_root_dir}"
68-
69-
echo "--------------------------------------------------------------------------------" ;
70-
echo "Build ExecuTorch Libraries ${build_type} portable kernels: ${portable_kernels} into '${et_build_dir}'" ;
71-
echo "--------------------------------------------------------------------------------"
72-
73-
if ! [[ $portable_kernels =~ ^((^|,)aten::[a-zA-Z0-9_]+\.[a-zA-Z0-9_]*out)*$ ]]; then
74-
echo " ERROR: specified argument --portable_kernels=${portable_kernels}"
75-
echo " is in the wrong format please use \"aten::<OP1>.out,aten::<OP2>.out,...\""
76-
echo " e.g. \"aten::_softmax.out,aten::add.out\""
77-
exit 1
78-
fi
79-
80-
set -x
81-
82-
cmake \
83-
-DCMAKE_INSTALL_PREFIX=${et_build_dir} \
84-
-DCMAKE_BUILD_TYPE=${build_type} \
85-
-DCMAKE_TOOLCHAIN_FILE="${toolchain_cmake}" \
86-
-DEXECUTORCH_SELECT_OPS_LIST=${portable_kernels} \
87-
-B"${et_build_dir}/examples/arm" \
88-
"${et_root_dir}/examples/arm"
89-
90-
cmake --build "${et_build_dir}/examples/arm" -j$(nproc) --config ${build_type} --
91-
92-
set +x
93-
94-
echo "[$(basename $0)] Generated static libraries for ExecuTorch:"
95-
find "${et_build_dir}/examples/arm" -name "*.a" -exec ls -al {} \;
7+
echo "DEPRECATED: build_portable_kernels.sh is deprecated and will be removed. The kernel registration library is now built directly with the arm_executor_runner."

0 commit comments

Comments
 (0)