Optimized atan2, _softmax, cat, clamp, full, relu, remainder, permute_copy_out ops and updates to use memory_allocator #7567

Merged: 27 commits, Jan 24, 2025
Changes from 1 commit
Commits (27)
216389c  Adding mean and where ops optimized on HiFi (dijopaul, Oct 23, 2024)
3d849bb  Merge pull request #14 from dijopaul/main (cad-audio, Oct 24, 2024)
9b71aed  Adding quantized linear optimized versions for int8 and uint8 (dijopaul, Nov 6, 2024)
07743ab  adding pow, remainder, minimum, maximum operators (#33) (nishpoonia, Nov 7, 2024)
edc1b3d  Fix for build issue faced in div_mod on old tools (dijopaul, Nov 13, 2024)
222beee  Merge pull request #15 from dijopaul/main (cad-audio, Nov 14, 2024)
6e074ec  Merge branch 'main' into main (cad-audio, Nov 14, 2024)
afca3db  Fix build failure due to merge issue (dijopaul, Nov 19, 2024)
10a0ee0  Merge branch 'main' into main (mcremon-meta, Nov 21, 2024)
f1f0bb3  Fixing review comments on PR 6867 (dijopaul, Nov 22, 2024)
f8cf408  Malloc fix (#39) (dijopaul, Nov 28, 2024)
911021f  Cleaning cmakelist to avoid duplications (dijopaul, Dec 2, 2024)
18cf518  Fixing lint issues and removing free statements (dijopaul, Dec 3, 2024)
5e471f2  adding ET_KERNEL_CHECK for allocate_temp_memory (#41) (nishpoonia, Dec 23, 2024)
6928f95  Merge branch 'main' into main_PR18 (dijopaul, Jan 9, 2025)
991961b  Fixing lint error due to merge (dijopaul, Jan 9, 2025)
7585ee0  Merge pull request #18 from dijopaul/main_PR18 (cad-audio, Jan 9, 2025)
540243a  Update functions_hifi.yaml (dijopaul, Jan 9, 2025)
85e7c59  Merge pull request #19 from dijopaul/patch-1 (cad-audio, Jan 9, 2025)
1f681c7  Incorporating review comments: removing nesting to check data type an… (nishpoonia, Jan 10, 2025)
3539f52  clean up (nishpoonia, Jan 13, 2025)
fe5e7d7  Merge pull request #20 from dijopaul/main_PR18 (cad-audio, Jan 13, 2025)
4923b83  Fixing review comment on PR 7567 (dijopaul, Jan 21, 2025)
224aaf4  Fixing review comments in PR 7567 (dijopaul, Jan 23, 2025)
7f9a78f  Merge branch 'main' into main (zonglinpeng, Jan 24, 2025)
6409958  Fixing lint error in PR7567 (dijopaul, Jan 24, 2025)
d62648a  Updating cat to support Int variant (dijopaul, Jan 24, 2025)
Commit: Adding mean and where ops optimized on HiFi
dijopaul committed Oct 23, 2024
commit 216389c8e32010b15895b4def1a76c3eae209c04
7 changes: 6 additions & 1 deletion backends/cadence/aot/functions_hifi.yaml
@@ -62,6 +62,11 @@
- arg_meta: null
kernel_name: torch::executor::full_out

- op: mean.out
kernels:
- arg_meta: null
kernel_name: cadence::impl::HiFi::mean_dim_out

- op: mul.out
kernels:
- arg_meta: null
@@ -105,7 +110,7 @@
- op: where.self_out
kernels:
- arg_meta: null
kernel_name: torch::executor::where_out
kernel_name: cadence::impl::HiFi::where_out

# custom ops
- func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
2 changes: 2 additions & 0 deletions backends/cadence/hifi/kernels/CMakeLists.txt
@@ -13,6 +13,8 @@ add_library(
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
)
# Let files say "include <executorch/path/to/header.h>".
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
28 changes: 28 additions & 0 deletions backends/cadence/hifi/kernels/kernels.h
@@ -55,6 +55,34 @@ extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape);

extern "C" WORD32 xa_nn_elm_where_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const FLOAT32* __restrict__ p_inp1,
const FLOAT32* __restrict__ p_inp2,
const unsigned char* __restrict__ p_condition,
WORD32 num_elm);

extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
const FLOAT32* __restrict__ p_inp1,
const WORD32* const p_inp1_shape,
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape,
const unsigned char* __restrict__ p_condition,
const WORD32* const p_condition_shape);

extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
const FLOAT32* __restrict__ p_inp,
const WORD32* const p_inp_shape,
const WORD32* __restrict__ p_axis,
WORD32 num_out_dims,
WORD32 num_inp_dims,
WORD32 num_axis_dims,
void* __restrict__ p_scratch_in);

namespace cadence {
namespace impl {
namespace HiFi {
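The declarations added above expose nnlib's elementwise and broadcast where kernels and the 4-D reduce-mean kernel to the HiFi operators (op_where.cpp and op_mean.cpp in this commit). As a rough usage sketch of the non-broadcast where variant only: the buffers below are hypothetical, and the condition convention (nonzero byte selects p_inp1, mirroring torch.where) is an assumption, not code from this PR.

#include <executorch/backends/cadence/hifi/kernels/kernels.h>

// Sketch: select element-wise between two float buffers using a byte mask.
// The call matches the declaration added to kernels.h above; everything else
// here (names, sizes, success convention) is illustrative only.
void where_example_sketch() {
  const WORD32 num_elm = 4;
  FLOAT32 a[4] = {1.f, 2.f, 3.f, 4.f};
  FLOAT32 b[4] = {-1.f, -2.f, -3.f, -4.f};
  unsigned char cond[4] = {1, 0, 1, 0}; // assumed: nonzero picks from a, zero from b
  FLOAT32 result[4];

  // nnlib kernels return a WORD32 status code (assumption: 0 means success).
  WORD32 status = xa_nn_elm_where_f32xf32_f32(result, a, b, cond, num_elm);
  (void)status;
}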
12 changes: 3 additions & 9 deletions backends/cadence/hifi/operators/CMakeLists.txt
@@ -22,19 +22,12 @@ endif()
set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/index_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/kernel_ops_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp"
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_where.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
@@ -57,6 +50,7 @@ set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp"
)
add_library(aten_ops_cadence ${_aten_ops__srcs})
target_link_libraries(aten_ops_cadence PUBLIC executorch)
170 changes: 170 additions & 0 deletions backends/cadence/hifi/operators/op_mean.cpp
@@ -0,0 +1,170 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
#include <executorch/kernels/portable/cpu/util/reduce_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

#include <executorch/backends/cadence/hifi/kernels/kernels.h>

using exec_aten::ScalarType;
using exec_aten::Tensor;
using executorch::aten::RuntimeContext;
using executorch::runtime::ArrayRef;
using torch::executor::Error;
using torch::executor::optional;

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {

int prepare_data(
const Tensor& in,
Tensor& out,
optional<ArrayRef<int64_t>> dim_list,
int* inp_shape,
int* out_shape,
int* p_axis,
int num_inp_dims,
int num_out_dims) {
for (int i = 0; i < num_inp_dims; i++) {
inp_shape[i] = in.size(i);
}

for (int i = 0; i < num_out_dims; i++) {
out_shape[i] = out.size(i);
}

int num_axis_dims = 0;
for (const auto& d : dim_list.value()) {
if (d < 0) {
p_axis[num_axis_dims] = num_inp_dims + d;
num_axis_dims++;
} else {
p_axis[num_axis_dims] = d;
num_axis_dims++;
}
}

return num_axis_dims;
}

Tensor& mean_dim_out(
RuntimeContext& ctx,
const Tensor& in,
optional<ArrayRef<int64_t>> dim_list,
bool keepdim,
optional<ScalarType> dtype,
Tensor& out) {
ET_KERNEL_CHECK(
ctx,
torch::executor::check_mean_dim_args(in, dim_list, keepdim, dtype, out),
InvalidArgument,
out);

ET_KERNEL_CHECK(
ctx,
torch::executor::resize_reduction_out(in, dim_list, keepdim, out) ==
Error::Ok,
InvalidArgument,
out);

constexpr auto name = "mean.out";
constexpr int kNnlibMaxDim = 4;

bool optimized = 1;

if (out.scalar_type() != ScalarType::Float)
optimized = 0;

if (in.dim() > kNnlibMaxDim)
optimized = 0;

if (optimized) {
float* __restrict__ p_out = out.mutable_data_ptr<float>();
const float* __restrict__ p_inp =
(const float* __restrict__)in.const_data_ptr<float>();

int num_elm = in.numel();

int num_inp_dims = in.dim();
int num_out_dims = out.dim();

int inp_shape[kNnlibMaxDim];
int out_shape[kNnlibMaxDim];
int p_axis[kNnlibMaxDim];

for (int i = 0; i < kNnlibMaxDim; i++) {
out_shape[i] = 1;
inp_shape[i] = 1;
p_axis[i] = 1;
}

int num_axis_dims = prepare_data(
in,
out,
dim_list,
inp_shape,
out_shape,
p_axis,
num_inp_dims,
num_out_dims);

if (num_axis_dims == num_inp_dims) {
num_out_dims = 1;
out_shape[0] = 1;
}

int scratch_size = xa_nn_reduce_getsize_nhwc(
-3, inp_shape, num_inp_dims, p_axis, num_axis_dims, 1);

void* __restrict__ p_scratch_in = (void* __restrict__)malloc(scratch_size);

xa_nn_reduce_mean_4D_f32_f32(
p_out,
out_shape,
p_inp,
inp_shape,
p_axis,
num_out_dims,
num_inp_dims,
num_axis_dims,
p_scratch_in);

return out;
}

ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] {
ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] {
CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
const size_t num = torch::executor::get_reduced_dim_product(in, dim_list);

for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) {
CTYPE_OUT sum = 0;
if (in.numel() > 0) {
sum = torch::executor::map_reduce_over_dim_list<CTYPE_IN, CTYPE_OUT>(
[](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
[](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
in,
dim_list,
out_ix);
}
out_data[out_ix] = sum / static_cast<float>(num);
}
});
});

return out;
}

} // namespace native
} // namespace HiFi
} // namespace impl
} // namespace cadence
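In this first commit the nnlib scratch buffer comes from a raw malloc and is not released before the early return out. Later commits in this PR ("Malloc fix (#39)", "Fixing lint issues and removing free statements", "adding ET_KERNEL_CHECK for allocate_temp_memory (#41)") move scratch allocation onto the runtime memory allocator, as the PR title notes. A minimal sketch of that direction follows; the allocate_temp_memory helper's namespace, signature, and return convention are assumptions based only on those commit messages, not the merged code.

// Sketch only (not the merged implementation): route scratch allocation
// through the runtime's memory allocator instead of malloc, so the buffer's
// lifetime is owned by the runtime and the early `return out;` cannot leak it.
// `allocate_temp_memory` is taken from the commit message above; its exact
// form here is an assumption.
int scratch_size = xa_nn_reduce_getsize_nhwc(
    -3, inp_shape, num_inp_dims, p_axis, num_axis_dims, 1);

void* p_scratch_in = allocate_temp_memory(ctx, scratch_size);
ET_KERNEL_CHECK(ctx, p_scratch_in != nullptr, MemoryAllocationFailed, out);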