Skip to content

Commit 726bf5e

Browse files
committed
Update on "[ET-VK] De vectorise all vectors in conv2d pw shader to improve perf."
This diff improves the performance of the conv2d pw shader by de-vectorizing all vectors. Differential Revision: [D75423245](https://our.internmc.facebook.com/intern/diff/D75423245/) [ghstack-poisoned]
2 parents 2ee154f + a624f01 commit 726bf5e

File tree

72 files changed

+1587
-1617
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

72 files changed

+1587
-1617
lines changed

.ci/docker/ci_commit_pins/pytorch.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
59d5cf083b4f860dea76fe8936076177f9367f10
1+
01f1cc44cbbfdf6307aa01b803a4ee22f9ade946

.github/workflows/build-presets.yml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,3 +66,30 @@ jobs:
6666
./install_requirements.sh > /dev/null
6767
cmake --preset ${{ matrix.preset }}
6868
cmake --build cmake-out -j$(( $(nproc) - 1 ))
69+
70+
windows:
71+
uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
72+
strategy:
73+
fail-fast: false
74+
matrix:
75+
preset: [pybind]
76+
with:
77+
job-name: build
78+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
79+
submodules: recursive
80+
timeout: 90
81+
script: |
82+
set -eux
83+
conda init powershell
84+
powershell -Command "& {
85+
\$ErrorActionPreference = 'Stop'
86+
Set-PSDebug -Trace 1
87+
88+
conda create --yes --quiet -n et python=3.12
89+
conda activate et
90+
91+
python install_requirements.py
92+
cmake --preset ${{ matrix.preset }}
93+
\$numCores = [System.Environment]::GetEnvironmentVariable('NUMBER_OF_PROCESSORS') - 1
94+
cmake --build cmake-out -j \$numCores
95+
}"

.github/workflows/trunk.yml

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -692,29 +692,3 @@ jobs:
692692
build-mode: Release
693693
build-tool: cmake
694694
docker-image: executorch-ubuntu-22.04-clang12
695-
696-
unittest-nxp-neutron:
697-
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
698-
permissions:
699-
id-token: write
700-
contents: read
701-
with:
702-
runner: linux.2xlarge
703-
docker-image: executorch-ubuntu-22.04-clang12
704-
submodules: 'true'
705-
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
706-
timeout: 90
707-
script: |
708-
set -eux
709-
710-
# The generic Linux job chooses to use base env, not the one setup by the image
711-
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
712-
conda activate "${CONDA_ENV}"
713-
714-
# Build and install Executorch
715-
PYTHON_EXECUTABLE=python \
716-
CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
717-
.ci/scripts/setup-linux.sh --build-tool "cmake"
718-
719-
# Run pytest
720-
PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,6 @@
6767
[submodule "shim"]
6868
path = shim
6969
url = https://github.com/facebook/buck2-shims-meta
70+
[submodule "third-party/json"]
71+
path = third-party/json
72+
url = https://github.com/nlohmann/json.git

backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -925,7 +925,7 @@
925925
"$(SRCROOT)/../include/executorch/runtime/core/portable_type/c10",
926926
"$(SRCROOT)/../sdk",
927927
"$(SRCROOT)/../util",
928-
"$(SRCROOT)/../../third-party/nlohmann_json/single_include",
928+
"$(SRCROOT)/../../../../../third-party/json/single_include",
929929
"$(SRCROOT)/../../third-party/coremltools/deps/protobuf/src",
930930
);
931931
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
@@ -957,7 +957,7 @@
957957
"$(SRCROOT)/../include/executorch/runtime/core/portable_type/c10",
958958
"$(SRCROOT)/../sdk",
959959
"$(SRCROOT)/../util",
960-
"$(SRCROOT)/../../third-party/nlohmann_json/single_include",
960+
"$(SRCROOT)/../../../../../third-party/json/single_include",
961961
"$(SRCROOT)/../../third-party/coremltools/deps/protobuf/src",
962962
);
963963
IPHONEOS_DEPLOYMENT_TARGET = 16.0;

backends/apple/coreml/scripts/install_requirements.sh

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,6 @@ mkdir "$COREMLTOOLS_DIR_PATH/build"
4949
cmake -S "$COREMLTOOLS_DIR_PATH" -B "$COREMLTOOLS_DIR_PATH/build"
5050
cmake --build "$COREMLTOOLS_DIR_PATH/build" --parallel --target mlmodel
5151

52-
echo "${green}ExecuTorch: Cloning nlohmann."
53-
git clone https://github.com/nlohmann/json.git "$COREML_DIR_PATH/third-party/nlohmann_json"
54-
STATUS=$?
55-
if [ $STATUS -ne 0 ]; then
56-
echo "${red}ExecuTorch: Failed to clone nlohmann."
57-
exit 1
58-
fi
59-
6052
echo "${green}ExecuTorch: Copying protobuf files."
6153
mkdir -p "$COREML_DIR_PATH/runtime/sdk/format/"
6254
cp -rf "$PROTOBUF_FILES_DIR_PATH" "$COREML_DIR_PATH/runtime/sdk/format/"

backends/arm/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ backends/arm/test/setup_testing.sh
101101
Then you can run the tests with
102102

103103
```
104-
pytest -c /dev/null -v -n auto backends/arm/test --arm_run_corstoneFVP
104+
pytest -c /dev/null -v -n auto backends/arm/test
105105
```
106106

107107
## Passes

backends/arm/_passes/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
from .convert_split_to_slice import ConvertSplitToSlicePass # noqa
2121
from .convert_squeezes_to_view import ConvertSqueezesToViewPass # noqa
2222
from .convert_to_clamp import ConvertToClampPass # noqa
23-
from .decompose_batchnorm_pass import DecomposeBatchNormPass # noqa
2423
from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass # noqa
2524
from .decompose_div_pass import DecomposeDivPass # noqa
2625
from .decompose_gelu_pass import DecomposeGeluPass # noqa

backends/arm/_passes/annotate_channels_last_dim_order_pass.py

Lines changed: 58 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# Copyright 2024-2025 Arm Limited and/or its affiliates.
2-
# All rights reserved.
32
#
43
# This source code is licensed under the BSD-style license found in the
54
# LICENSE file in the root directory of this source tree.
@@ -36,7 +35,7 @@
3635
def _transpose_impl(*args, **kwargs):
3736
# Validate length of dim_order array
3837
dim = args[1]
39-
assert len(dim) <= 4
38+
assert len(dim) in (4, 5)
4039
# Pass-through in edge-IR
4140
return args[0]
4241

@@ -45,13 +44,15 @@ class AnnotateChannelsLastDimOrder(ExportPass):
4544
"""
4645
Annotates each node with a tosa_dim_order. tosa_dim_order can be seen as a channels-last dim-order
4746
that in most cases will be (0, 2, 3, 1) for nodes with 4D-shapes. The pass also inserts passthrough_to_tosa._transpose
48-
when a transition between 3D and 4D tensors happen.
47+
when a transition between 3D and 4D/5D tensors happen.
4948
The annotated tosa_dim_order is used to permute the node's shape such that it gives a TOSA-compliant shape.
5049
"""
5150

5251
NHWC_order = (0, 2, 3, 1)
5352
NHWC_inverse_order = (0, 3, 1, 2)
5453
HWCM_order = (2, 3, 0, 1)
54+
NNHWC_order = (0, 1, 3, 4, 2)
55+
NNHWC_inverse_order = (0, 1, 4, 2, 3)
5556

5657
def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node):
5758
"""
@@ -81,8 +82,12 @@ def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node):
8182

8283
@staticmethod
8384
def memory_format_differs(shape):
84-
"""Returns true if the shape will have a different memory layout in NCHW and NHWC format"""
85-
if len(shape) >= 4:
85+
"""Returns true if the shape will have a different memory layout in (N)NCHW and (N)NHWC format"""
86+
if len(shape) >= 5:
87+
C = shape[2]
88+
H = shape[3]
89+
W = shape[4]
90+
elif len(shape) == 4:
8691
C = shape[1]
8792
H = shape[2]
8893
W = shape[3]
@@ -98,14 +103,24 @@ def memory_format_differs(shape):
98103
@staticmethod
99104
def is_channel_reshape(input_shape, output_shape):
100105
"""Returns true if the reshape changes the channel dimension"""
101-
if not len(input_shape) == len(output_shape) == 4:
106+
if not (
107+
(len(input_shape) == len(output_shape) and (len(output_shape) in (4, 5)))
108+
or (len(input_shape) == 4 and len(output_shape) == 5)
109+
or (len(input_shape) == 5 and len(output_shape) == 4)
110+
):
102111
return False
103112

104-
C_old = input_shape[1]
105-
C_new = output_shape[1]
113+
C_old = input_shape[-3]
114+
C_new = output_shape[-3]
106115

107-
N_new = output_shape[0]
108-
N_old = input_shape[0]
116+
N_new = (
117+
output_shape[0]
118+
if len(output_shape) == 4
119+
else output_shape[0] * output_shape[1]
120+
)
121+
N_old = (
122+
input_shape[0] if len(input_shape) == 4 else input_shape[0] * input_shape[1]
123+
)
109124

110125
return (N_old != N_new) or (C_old != C_new)
111126

@@ -119,7 +134,11 @@ def insert_input_transpose(node, input_node, graph_module):
119134
torch.ops.passthrough_to_tosa._transpose.default,
120135
args=(
121136
input_node,
122-
list(AnnotateChannelsLastDimOrder.NHWC_inverse_order),
137+
list(
138+
AnnotateChannelsLastDimOrder.NNHWC_inverse_order
139+
if len(get_first_fake_tensor(input_node).size()) == 5
140+
else AnnotateChannelsLastDimOrder.NHWC_inverse_order
141+
),
123142
),
124143
quantize=quantize,
125144
q_params=q_params,
@@ -137,15 +156,28 @@ def insert_output_transpose(node, graph_module):
137156
permute_node = create_node(
138157
graph_module.graph,
139158
torch.ops.passthrough_to_tosa._transpose.default,
140-
args=(node, list(AnnotateChannelsLastDimOrder.NHWC_order)),
159+
args=(
160+
node,
161+
list(
162+
AnnotateChannelsLastDimOrder.NNHWC_order
163+
if len(get_first_fake_tensor(node).size()) == 5
164+
else AnnotateChannelsLastDimOrder.NHWC_order
165+
),
166+
),
141167
)
142168
permute_node.meta["tosa_dim_order"] = (
143-
AnnotateChannelsLastDimOrder.NHWC_order
169+
AnnotateChannelsLastDimOrder.NNHWC_order
170+
if len(get_first_fake_tensor(node).size()) == 5
171+
else AnnotateChannelsLastDimOrder.NHWC_order
172+
)
173+
permute_node.meta["val"] = get_first_fake_tensor(node).permute(
174+
AnnotateChannelsLastDimOrder.NNHWC_order
175+
if len(get_first_fake_tensor(node).size()) == 5
176+
else AnnotateChannelsLastDimOrder.NHWC_order
144177
)
145-
permute_node.meta["val"] = node.meta["val"].permute(
146-
AnnotateChannelsLastDimOrder.NHWC_order
178+
node.meta["tosa_dim_order"] = tuple(
179+
range(len(get_first_fake_tensor(node).size()))
147180
)
148-
node.meta["tosa_dim_order"] = (0, 1, 2, 3)
149181
users = [user for user in node.users if user != permute_node]
150182
for user in users:
151183
user.replace_input_with(node, permute_node)
@@ -159,8 +191,8 @@ def insert_output_transpose(node, graph_module):
159191
def _insert_view_transpose(
160192
input_shape, output_shape, node, input_node, graph_module
161193
):
162-
nchw_to_nhwc = len(input_shape) < 4 and len(output_shape) == 4
163-
nhwc_to_nchw = len(input_shape) == 4 and len(output_shape) < 4
194+
nchw_to_nhwc = len(input_shape) < 4 and len(output_shape) >= 4
195+
nhwc_to_nchw = len(input_shape) >= 4 and len(output_shape) < 4
164196
channel_reshape = AnnotateChannelsLastDimOrder.is_channel_reshape(
165197
output_shape, input_shape
166198
)
@@ -178,11 +210,11 @@ def _insert_view_transpose(
178210

179211
def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule):
180212
"""
181-
Transposes are needed for operators transforming the input to a different rank, as 4D-tensors are assumed to be in NHWC-format, whereas all other are in NCHW format.
213+
Transposes are needed for operators transforming the input to a different rank, as 4D and 5D-tensors are assumed to be in (N)NHWC-format, whereas all other are in (N)NCHW format.
182214
This is relevant for the following cases:
183-
- view: <4D -> 4D
184-
- view: 4D -> <4D
185-
Additionally, a 4D->4D view operation acting on the channel dimension currently needs to be performed in NCHW format, leadning to one extra input and output transpose for this case.
215+
- view: <4D -> >=4D
216+
- view: >=4D -> <4D
217+
Additionally, a 4D/5D->4D/5D view operation acting on the channel dimension currently needs to be performed in (N)NCHW format, leading to one extra input and output transpose for this case.
186218
187219
Transposes can be avoided for shapes where there is no difference in actual memory, e.g for
188220
- H == W == 1
@@ -212,12 +244,13 @@ def call(self, graph_module: torch.fx.GraphModule):
212244
# The weights of TOSA DEPTHWISE_CONV2D have shape (H, W, C, M) which corresponds to
213245
# dim_order = (2, 3, 0, 1) (https://www.mlplatform.org/tosa/tosa_spec.html#_depthwise_conv2d).
214246
dim_order = self.HWCM_order
247+
elif node_data.dim() == 5:
248+
dim_order = self.NNHWC_order # type: ignore[assignment]
215249
else:
216250
dim_order = tuple(range(node_data.dim())) # type: ignore[assignment]
217251
node.meta["tosa_dim_order"] = dim_order
218-
# Take care of cases when:
219-
# 4D (NHWC) -> >4D (NCH)
220-
# 3D (NCH) -> 4D (NHWC)
252+
# Insert TOSA transposes to convert between (N)NCHW and (N)NHWC format.
253+
# See insert_tosa_transposes for insertion conditions.
221254
self.insert_tosa_transposes(graph_module)
222255
graph_module.recompile()
223256
graph_module = super().call(graph_module).graph_module

backends/arm/_passes/arm_pass_manager.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
ConvertSplitToSlicePass,
2525
ConvertSqueezesToViewPass,
2626
ConvertToClampPass,
27-
DecomposeBatchNormPass,
2827
DecomposeCosineSimilarityPass,
2928
DecomposeDivPass,
3029
DecomposeGeluPass,
@@ -85,12 +84,13 @@ def _transform(self, graph_module: GraphModule):
8584
def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
8685
self.add_pass(FuseQuantizedActivationPass())
8786
self.add_pass(RemoveGetItemPass())
88-
self.add_pass(DecomposeBatchNormPass())
8987
self.add_pass(ConvertSplitToSlicePass())
9088
self.add_pass(ConvertMmToBmmPass())
9189
self.add_pass(DecomposeLinearPass())
9290
self.add_pass(DecomposeLinearVectorNormPass())
93-
self.add_pass(DecomposeMeanDimPass())
91+
self.add_pass(
92+
DecomposeMeanDimPass(exported_program.graph_module, self.tosa_spec)
93+
)
9494
self.add_pass(ConvertFullLikeToFullPass())
9595
self.add_pass(ConvertToClampPass())
9696
self.add_pass(ConvertMinMaxPass())
@@ -116,7 +116,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
116116
self.add_pass(UnsqueezeBeforeRepeatPass())
117117
self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
118118
self.add_pass(DecomposeSumPass())
119-
self.add_pass(Conv1dUnsqueezePass(exported_program))
119+
self.add_pass(Conv1dUnsqueezePass())
120120
self.add_pass(DecomposeSelectPass())
121121
self.add_pass(ConvertSqueezesToViewPass())
122122

@@ -141,10 +141,11 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
141141
self.add_pass(ConvertMmToBmmPass())
142142
self.add_pass(DecomposeLinearPass())
143143
self.add_pass(DecomposeLeakyReLUPass())
144-
self.add_pass(DecomposeBatchNormPass())
145144
self.add_pass(DecomposeLayerNormPass())
146145
self.add_pass(DecomposeVarPass())
147-
self.add_pass(DecomposeMeanDimPass())
146+
self.add_pass(
147+
DecomposeMeanDimPass(exported_program.graph_module, self.tosa_spec)
148+
)
148149
self.add_pass(DecomposeNotEqualPass())
149150
self.add_pass(DecomposeDivPass())
150151
self.add_pass(DecomposeSoftmaxPass())
@@ -169,7 +170,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
169170
self.add_pass(UnsqueezeBeforeRepeatPass())
170171
self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
171172
self.add_pass(DecomposeSumPass())
172-
self.add_pass(Conv1dUnsqueezePass(exported_program))
173+
self.add_pass(Conv1dUnsqueezePass())
173174
self.add_pass(DecomposeSelectPass())
174175
self.add_pass(ConvertSqueezesToViewPass())
175176

@@ -209,7 +210,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
209210
self.add_pass(ScalarsToAttributePass())
210211
self.add_pass(DecomposeLayerNormPass())
211212
self.add_pass(DecomposeVarPass())
212-
self.add_pass(DecomposeMeanDimPass())
213+
self.add_pass(DecomposeMeanDimPass(graph_module, self.tosa_spec))
213214
self.add_pass(DecomposeNotEqualPass())
214215
self.add_pass(DecomposeCosineSimilarityPass())
215216
self.add_pass(DecomposeDivPass())

0 commit comments

Comments
 (0)