Optimized atan2, _softmax, cat, clamp, full, relu, remainder, permute_copy_out ops and updates to use memory_allocator #7567

Merged: 27 commits, Jan 24, 2025
Changes from 1 commit
Commits (27)
216389c  Adding mean and where ops optimized on HiFi (dijopaul, Oct 23, 2024)
3d849bb  Merge pull request #14 from dijopaul/main (cad-audio, Oct 24, 2024)
9b71aed  Adding quantized linear optimized versions for int8 and uint8 (dijopaul, Nov 6, 2024)
07743ab  adding pow, remainder, minimum, maximum operators (#33) (nishpoonia, Nov 7, 2024)
edc1b3d  Fix for build issue faced in div_mod on old tools (dijopaul, Nov 13, 2024)
222beee  Merge pull request #15 from dijopaul/main (cad-audio, Nov 14, 2024)
6e074ec  Merge branch 'main' into main (cad-audio, Nov 14, 2024)
afca3db  Fix build failure due to merge issue (dijopaul, Nov 19, 2024)
10a0ee0  Merge branch 'main' into main (mcremon-meta, Nov 21, 2024)
f1f0bb3  Fixing review comments on PR 6867 (dijopaul, Nov 22, 2024)
f8cf408  Malloc fix (#39) (dijopaul, Nov 28, 2024)
911021f  Cleaning cmakelist to avoid duplications (dijopaul, Dec 2, 2024)
18cf518  Fixing lint issues and removing free statements (dijopaul, Dec 3, 2024)
5e471f2  adding ET_KERNEL_CHECK for allocate_temp_memory (#41) (nishpoonia, Dec 23, 2024)
6928f95  Merge branch 'main' into main_PR18 (dijopaul, Jan 9, 2025)
991961b  Fixing lint error due to merge (dijopaul, Jan 9, 2025)
7585ee0  Merge pull request #18 from dijopaul/main_PR18 (cad-audio, Jan 9, 2025)
540243a  Update functions_hifi.yaml (dijopaul, Jan 9, 2025)
85e7c59  Merge pull request #19 from dijopaul/patch-1 (cad-audio, Jan 9, 2025)
1f681c7  Incorporating review comments: removing nesting to check data type an… (nishpoonia, Jan 10, 2025)
3539f52  clean up (nishpoonia, Jan 13, 2025)
fe5e7d7  Merge pull request #20 from dijopaul/main_PR18 (cad-audio, Jan 13, 2025)
4923b83  Fixing review comment on PR 7567 (dijopaul, Jan 21, 2025)
224aaf4  Fixing review comments in PR 7567 (dijopaul, Jan 23, 2025)
7f9a78f  Merge branch 'main' into main (zonglinpeng, Jan 24, 2025)
6409958  Fixing lint error in PR7567 (dijopaul, Jan 24, 2025)
d62648a  Updating cat to support Int variant (dijopaul, Jan 24, 2025)
Commit: Adding mean and where ops optimized on HiFi
dijopaul committed Oct 23, 2024
commit 216389c8e32010b15895b4def1a76c3eae209c04
7 changes: 6 additions & 1 deletion backends/cadence/aot/functions_hifi.yaml
@@ -62,6 +62,11 @@
- arg_meta: null
kernel_name: torch::executor::full_out

- op: mean.out
kernels:
- arg_meta: null
kernel_name: cadence::impl::HiFi::mean_dim_out

- op: mul.out
kernels:
- arg_meta: null
@@ -105,7 +110,7 @@
- op: where.self_out
kernels:
- arg_meta: null
kernel_name: torch::executor::where_out
kernel_name: cadence::impl::HiFi::where_out

# custom ops
- func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
2 changes: 2 additions & 0 deletions backends/cadence/hifi/kernels/CMakeLists.txt
@@ -13,6 +13,8 @@ add_library(
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
)
# Let files say "include <executorch/path/to/header.h>".
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
28 changes: 28 additions & 0 deletions backends/cadence/hifi/kernels/kernels.h
@@ -55,6 +55,34 @@ extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape);

extern "C" WORD32 xa_nn_elm_where_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const FLOAT32* __restrict__ p_inp1,
const FLOAT32* __restrict__ p_inp2,
const unsigned char* __restrict__ p_condition,
WORD32 num_elm);

extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
const FLOAT32* __restrict__ p_inp1,
const WORD32* const p_inp1_shape,
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape,
const unsigned char* __restrict__ p_condition,
const WORD32* const p_condition_shape);

extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
const FLOAT32* __restrict__ p_inp,
const WORD32* const p_inp_shape,
const WORD32* __restrict__ p_axis,
WORD32 num_out_dims,
WORD32 num_inp_dims,
WORD32 num_axis_dims,
void* __restrict__ p_scratch_in);

namespace cadence {
namespace impl {
namespace HiFi {
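The declarations added above expose nnlib's elementwise and broadcast where kernels and the 4-D reduce-mean kernel to the HiFi operators (op_where.cpp and op_mean.cpp in this commit). As a rough usage sketch of the non-broadcast where variant only: the buffers below are hypothetical, and the condition convention (nonzero byte selects p_inp1, mirroring torch.where) is an assumption, not code from this PR.

#include <executorch/backends/cadence/hifi/kernels/kernels.h>

// Sketch: select element-wise between two float buffers using a byte mask.
// The call matches the declaration added to kernels.h above; everything else
// here (names, sizes, success convention) is illustrative only.
void where_example_sketch() {
  const WORD32 num_elm = 4;
  FLOAT32 a[4] = {1.f, 2.f, 3.f, 4.f};
  FLOAT32 b[4] = {-1.f, -2.f, -3.f, -4.f};
  unsigned char cond[4] = {1, 0, 1, 0}; // assumed: nonzero picks from a, zero from b
  FLOAT32 result[4];

  // nnlib kernels return a WORD32 status code (assumption: 0 means success).
  WORD32 status = xa_nn_elm_where_f32xf32_f32(result, a, b, cond, num_elm);
  (void)status;
}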
12 changes: 3 additions & 9 deletions backends/cadence/hifi/operators/CMakeLists.txt
@@ -22,19 +22,12 @@ endif()
set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/index_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/kernel_ops_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_where.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
@@ -57,6 +50,7 @@ set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp"
)
add_library(aten_ops_cadence ${_aten_ops__srcs})
target_link_libraries(aten_ops_cadence PUBLIC executorch)
170 changes: 170 additions & 0 deletions backends/cadence/hifi/operators/op_mean.cpp
@@ -0,0 +1,170 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
#include <executorch/kernels/portable/cpu/util/reduce_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

#include <executorch/backends/cadence/hifi/kernels/kernels.h>

using exec_aten::ScalarType;
using exec_aten::Tensor;
using executorch::aten::RuntimeContext;
using executorch::runtime::ArrayRef;
using torch::executor::Error;
using torch::executor::optional;

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {

int prepare_data(
const Tensor& in,
Tensor& out,
optional<ArrayRef<int64_t>> dim_list,
int* inp_shape,
int* out_shape,
int* p_axis,
int num_inp_dims,
int num_out_dims) {
for (int i = 0; i < num_inp_dims; i++) {
inp_shape[i] = in.size(i);
}

for (int i = 0; i < num_out_dims; i++) {
out_shape[i] = out.size(i);
}

int num_axis_dims = 0;
for (const auto& d : dim_list.value()) {
if (d < 0) {
p_axis[num_axis_dims] = num_inp_dims + d;
num_axis_dims++;
} else {
p_axis[num_axis_dims] = d;
num_axis_dims++;
}
}

return num_axis_dims;
}

Tensor& mean_dim_out(
RuntimeContext& ctx,
const Tensor& in,
optional<ArrayRef<int64_t>> dim_list,
bool keepdim,
optional<ScalarType> dtype,
Tensor& out) {
ET_KERNEL_CHECK(
ctx,
torch::executor::check_mean_dim_args(in, dim_list, keepdim, dtype, out),
InvalidArgument,
out);

ET_KERNEL_CHECK(
ctx,
torch::executor::resize_reduction_out(in, dim_list, keepdim, out) ==
Error::Ok,
InvalidArgument,
out);

constexpr auto name = "mean.out";
constexpr int kNnlibMaxDim = 4;

bool optimized = 1;

if (out.scalar_type() != ScalarType::Float)
optimized = 0;

if (in.dim() > kNnlibMaxDim)
optimized = 0;

if (optimized) {
float* __restrict__ p_out = out.mutable_data_ptr<float>();
const float* __restrict__ p_inp =
(const float* __restrict__)in.const_data_ptr<float>();

int num_elm = in.numel();

int num_inp_dims = in.dim();
int num_out_dims = out.dim();

int inp_shape[kNnlibMaxDim];
int out_shape[kNnlibMaxDim];
int p_axis[kNnlibMaxDim];

for (int i = 0; i < kNnlibMaxDim; i++) {
out_shape[i] = 1;
inp_shape[i] = 1;
p_axis[i] = 1;
}

int num_axis_dims = prepare_data(
in,
out,
dim_list,
inp_shape,
out_shape,
p_axis,
num_inp_dims,
num_out_dims);

if (num_axis_dims == num_inp_dims) {
num_out_dims = 1;
out_shape[0] = 1;
}

int scratch_size = xa_nn_reduce_getsize_nhwc(
-3, inp_shape, num_inp_dims, p_axis, num_axis_dims, 1);

void* __restrict__ p_scratch_in = (void* __restrict__)malloc(scratch_size);

xa_nn_reduce_mean_4D_f32_f32(
p_out,
out_shape,
p_inp,
inp_shape,
p_axis,
num_out_dims,
num_inp_dims,
num_axis_dims,
p_scratch_in);

return out;
}

ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] {
ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] {
CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
const size_t num = torch::executor::get_reduced_dim_product(in, dim_list);

for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) {
CTYPE_OUT sum = 0;
if (in.numel() > 0) {
sum = torch::executor::map_reduce_over_dim_list<CTYPE_IN, CTYPE_OUT>(
[](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
[](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
in,
dim_list,
out_ix);
}
out_data[out_ix] = sum / static_cast<float>(num);
}
});
});

return out;
}

} // namespace native
} // namespace HiFi
} // namespace impl
} // namespace cadence
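In this first commit the nnlib scratch buffer comes from a raw malloc and is not released before the early return out. Later commits in this PR ("Malloc fix (#39)", "Fixing lint issues and removing free statements", "adding ET_KERNEL_CHECK for allocate_temp_memory (#41)") move scratch allocation onto the runtime memory allocator, as the PR title notes. A minimal sketch of that direction follows; the allocate_temp_memory helper's namespace, signature, and return convention are assumptions based only on those commit messages, not the merged code.

// Sketch only (not the merged implementation): route scratch allocation
// through the runtime's memory allocator instead of malloc, so the buffer's
// lifetime is owned by the runtime and the early `return out;` cannot leak it.
// `allocate_temp_memory` is taken from the commit message above; its exact
// form here is an assumption.
int scratch_size = xa_nn_reduce_getsize_nhwc(
    -3, inp_shape, num_inp_dims, p_axis, num_axis_dims, 1);

void* p_scratch_in = allocate_temp_memory(ctx, scratch_size);
ET_KERNEL_CHECK(ctx, p_scratch_in != nullptr, MemoryAllocationFailed, out);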