
Commit 6eff57b

Rushi-cad and dijopaul authored

Adding permute_copy operator kernel optimization (#21)

* Adding permute_copy operator kernel optimization
* Adding permute_copy operator kernel optimization
* Code cleanup

---------

Co-authored-by: dijopaul <87994875+dijopaul@users.noreply.github.com>

1 parent fe91c10 · commit 6eff57b

File tree

3 files changed (+191, -2 lines)


backends/cadence/aot/functions_hifi.yaml

Lines changed: 1 addition & 1 deletion

@@ -160,7 +160,7 @@
 - op: permute_copy.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::permute_copy_out
+      kernel_name: impl::HiFi::permute_copy_out

 - op: sigmoid.out
   kernels:
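Aside (not part of the diff): the new kernel_name omits a trailing native component. ExecuTorch's kernel codegen conventionally binds a kernel_name such as impl::HiFi::permute_copy_out to a function defined in the native sub-namespace, which is what the new file below provides. A sketch of the declaration the entry should resolve to, paraphrased from that file rather than taken from this commit:

#include <executorch/runtime/kernel/kernel_includes.h>

// Sketch only: a forward declaration mirroring the definition in
// op_permute_copy.cpp below.
namespace impl {
namespace HiFi {
namespace native {

::exec_aten::Tensor& permute_copy_out(
    ::executorch::runtime::KernelRuntimeContext& ctx,
    const ::exec_aten::Tensor& in,
    ::executorch::runtime::IntArrayRef dims,
    ::exec_aten::Tensor& out);

} // namespace native
} // namespace HiFi
} // namespace impl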

backends/cadence/hifi/operators/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -36,6 +36,7 @@ set(_aten_ops__srcs
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_minimum.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mm.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
+    "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_permute_copy.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_ne.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
     "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"

@@ -45,7 +46,6 @@ set(_aten_ops__srcs
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp"
-    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
backends/cadence/hifi/operators/op_permute_copy.cpp

Lines changed: 189 additions & 0 deletions (new file)
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/cadence/hifi/kernels/kernels.h>
#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>

using exec_aten::ScalarType;
using exec_aten::SizesType;
using exec_aten::Tensor;
using executorch::runtime::IntArrayRef;
using executorch::runtime::KernelRuntimeContext;
using executorch::runtime::kTensorDimensionLimit;
using torch::executor::Error;

namespace impl {
namespace HiFi {
namespace native {

namespace {
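
// Advances `coordinate` (an index into `tensor`) odometer-style in the
// traversal order given by `dims`, with the last entry of `dims` varying
// fastest; each call steps to the input element that feeds the next
// output element of the permuted copy.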
void increment_coordinate_permuted(
    const Tensor& tensor,
    size_t* const coordinate,
    IntArrayRef dims) {
  for (int i = dims.size() - 1; i >= 0; i--) {
    size_t d = dims[i] >= 0 ? dims[i] : dims[i] + tensor.dim();
    coordinate[d]++;
    if (coordinate[d] == tensor.size(d)) {
      coordinate[d] = 0;
    } else {
      return;
    }
  }
}

} // namespace

Tensor& permute_copy_out(
    KernelRuntimeContext& ctx,
    const Tensor& in,
    IntArrayRef dims,
    Tensor& out) {
  (void)ctx;

  ET_KERNEL_CHECK(
      ctx, check_permute_copy_args(in, dims, out), InvalidArgument, out);

  ET_KERNEL_CHECK(
      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);

  Tensor::SizesType expected_out_size[kTensorDimensionLimit];
  size_t expected_out_dim = 0;
  get_permute_copy_out_target_size(
      in, dims, expected_out_size, &expected_out_dim);
  ET_KERNEL_CHECK(
      ctx,
      resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok,
      InvalidArgument,
      out);

  const auto in_type = out.scalar_type();

  constexpr auto name = "permute_copy.out";
  constexpr int kNnlibMaxDim = 16;

  bool optimized = false;

  if (out.scalar_type() == ScalarType::Float)
    optimized = true;
  else if (out.scalar_type() == ScalarType::Char)
    optimized = true;
  else if (out.scalar_type() == ScalarType::Byte)
    optimized = true;

  if (in.dim() > kNnlibMaxDim)
    optimized = false;

  if (optimized) {
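    // Fast path: delegate the permutation to NNLib's transpose kernels,
    // using 32-bit lanes for float and 8-bit lanes for char/byte input.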
    if (in_type == ScalarType::Float) {
      WORD32* p_inp = (WORD32*)in.const_data_ptr<float>();
      WORD32* p_out = (WORD32*)out.mutable_data_ptr<float>();

      WORD32 num_inp_dims = in.dim();
      WORD32 num_out_dims = num_inp_dims;

      WORD32 p_inp_shape[kNnlibMaxDim];
      WORD32 p_out_shape[kNnlibMaxDim];
      WORD32 p_permute_vec[kNnlibMaxDim];

      for (int i = 0; i < num_inp_dims; i++) {
        p_inp_shape[i] = in.size(i);
        p_out_shape[i] = in.size(dims[i]);
        p_permute_vec[i] = dims[i];
      }

      xa_nn_transpose_32_32(
          p_out,
          p_out_shape,
          p_inp,
          p_inp_shape,
          p_permute_vec,
          num_out_dims,
          num_inp_dims);

      return out;
    } else if (in_type == ScalarType::Char) {
      WORD8* p_inp = (WORD8*)in.const_data_ptr<char>();
      WORD8* p_out = (WORD8*)out.mutable_data_ptr<char>();

      WORD32 num_inp_dims = in.dim();
      WORD32 num_out_dims = num_inp_dims;

      WORD32 p_inp_shape[kNnlibMaxDim];
      WORD32 p_out_shape[kNnlibMaxDim];
      WORD32 p_permute_vec[kNnlibMaxDim];

      for (int i = 0; i < num_inp_dims; i++) {
        p_inp_shape[i] = in.size(i);
        p_out_shape[i] = in.size(dims[i]);
        p_permute_vec[i] = dims[i];
      }

      xa_nn_transpose_8_8(
          p_out,
          p_out_shape,
          p_inp,
          p_inp_shape,
          p_permute_vec,
          num_out_dims,
          num_inp_dims);

    } else if (in_type == ScalarType::Byte) {
      WORD8* p_inp = (WORD8*)in.const_data_ptr<uint8_t>();
      WORD8* p_out = (WORD8*)out.mutable_data_ptr<uint8_t>();

      WORD32 num_inp_dims = in.dim();
      WORD32 num_out_dims = num_inp_dims;

      WORD32 p_inp_shape[kNnlibMaxDim];
      WORD32 p_out_shape[kNnlibMaxDim];
      WORD32 p_permute_vec[kNnlibMaxDim];

      for (int i = 0; i < num_inp_dims; i++) {
        p_inp_shape[i] = in.size(i);
        p_out_shape[i] = in.size(dims[i]);
        p_permute_vec[i] = dims[i];
      }

      xa_nn_transpose_8_8(
          p_out,
          p_out_shape,
          p_inp,
          p_inp_shape,
          p_permute_vec,
          num_out_dims,
          num_inp_dims);
    }
    return out;
  }
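
  // Portable fallback for all other dtypes (or rank > kNnlibMaxDim): walk
  // the output linearly and gather each element from the input coordinate
  // tracked by increment_coordinate_permuted.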
  size_t in_coord[kTensorDimensionLimit] = {0};
  size_t trailing_dims_memo[kTensorDimensionLimit];
  executorch::runtime::memoizeTrailingDims(in, trailing_dims_memo);

  // in and out must be the same dtype
  ET_SWITCH_ALL_TYPES(in_type, ctx, name, CTYPE, [&] {
    const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
    CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();

    for (size_t i = 0; i < out.numel(); ++i) {
      out_data[i] =
          in_data[executorch::runtime::coordinateToIndexWithTrailingDimsMemo(
              in, in_coord, trailing_dims_memo)];
      increment_coordinate_permuted(in, in_coord, dims);
    }
  });

  return out;
}

} // namespace native
} // namespace HiFi
} // namespace impl
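
To make the fallback concrete, here is a self-contained sketch (not part of the commit; the shapes, values, and the inlined 2-D increment are invented for illustration) of how the odometer walk realizes a 2x3 to 3x2 transpose:

#include <cstddef>
#include <cstdio>

// Standalone illustration of the fallback's coordinate walk: permute a
// row-major 2x3 matrix with dims = {1, 0}, i.e. out[i][j] = in[j][i], by
// advancing the input coordinate odometer-style in the output's element
// order, as increment_coordinate_permuted does above.
int main() {
  const size_t in_sizes[2] = {2, 3};
  const size_t dims[2] = {1, 0};
  const float in_data[6] = {1, 2, 3, 4, 5, 6}; // [[1,2,3],[4,5,6]]

  size_t coord[2] = {0, 0}; // current coordinate into the input
  float out_data[6];        // row-major 3x2 output

  for (size_t i = 0; i < 6; ++i) {
    out_data[i] = in_data[coord[0] * in_sizes[1] + coord[1]];
    // Inlined 2-D increment_coordinate_permuted: bump the dimension named
    // by dims[k], fastest-varying last, carrying on wrap-around.
    for (int k = 1; k >= 0; --k) {
      const size_t d = dims[k];
      if (++coord[d] < in_sizes[d])
        break;
      coord[d] = 0;
    }
  }

  for (size_t r = 0; r < 3; ++r)
    std::printf("%g %g\n", out_data[r * 2], out_data[r * 2 + 1]);
  // Prints:
  // 1 4
  // 2 5
  // 3 6
  return 0;
}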

0 commit comments
