Skip to content

Add optimized ELU implementation #9521

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Apr 4, 2025
Merged
2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/pytorch.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
7ae0ce6360b6e4f944906502d20da24c04debee5
59d5cf083b4f860dea76fe8936076177f9367f10
2 changes: 1 addition & 1 deletion backends/arm/test/models/test_conformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class TestConformer(unittest.TestCase):
# .to_executorch step, i.e. after Arm partitioner.
ops_after_partitioner = {
"executorch_exir_dialects_edge__ops_aten_max_default": 1,
"torch.ops.aten._assert_scalar.default": 10,
"torch.ops.aten._assert_scalar.default": 7,
"torch.ops.aten._local_scalar_dense.default": 1,
}

Expand Down
4 changes: 2 additions & 2 deletions install_requirements.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def python_is_compatible():
#
# NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt
# by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/
NIGHTLY_VERSION = "dev20250310"
NIGHTLY_VERSION = "dev20250325"


def install_requirements(use_pytorch_nightly):
Expand All @@ -80,7 +80,7 @@ def install_requirements(use_pytorch_nightly):
# Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note
# that we don't need to set any version number there because they have already
# been installed on CI before this step, so pip won't reinstall them
f"torch==2.7.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch",
f"torch==2.8.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch",
(
f"torchvision==0.22.0.{NIGHTLY_VERSION}"
if use_pytorch_nightly
Expand Down
96 changes: 96 additions & 0 deletions kernels/optimized/cpu/op_elu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/native/cpu/Elu.h>

#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/kernel/thread_parallel_interface.h>
#include <executorch/runtime/platform/assert.h>

namespace torch::executor::native {

namespace {
/**
 * Applies ATen's element-wise ELU functors to every element of `input` and
 * writes the results to the matching positions of `out`.
 *
 * The work is split into chunks by parallel_for; each chunk is processed as a
 * scalar prologue, a vectorized main loop, and a scalar epilogue.
 *
 * @tparam CTYPE Element type of both `input` and `out`.
 * @param context Kernel runtime context (not referenced directly in this
 *     function body).
 * @param input Source tensor, read through const_data_ptr<CTYPE>().
 * @param alpha Forwarded (after conversion to MathT) to the ATen ELU functor
 *     factories.
 * @param scale Forwarded (after conversion to MathT) to the ATen ELU functor
 *     factories.
 * @param input_scale Forwarded (after conversion to MathT) to the ATen ELU
 *     functor factories.
 * @param out Destination tensor, written through mutable_data_ptr<CTYPE>();
 *     out.numel() elements are produced.
 *
 * NOTE(review): the caller is presumed to have already made `out` the same
 * shape and dtype as `input` -- confirm at the call site.
 */
template <typename CTYPE>
void elu(
    KernelRuntimeContext& context,
    const Tensor& input,
    const Scalar& alpha,
    const Scalar& scale,
    const Scalar& input_scale,
    Tensor& out) {
  const CTYPE* in_data = input.const_data_ptr<CTYPE>();
  CTYPE* out_data = out.mutable_data_ptr<CTYPE>();

  // Reduced-precision floating point types do their math in float; every
  // other type does its math in CTYPE itself.
  using MathT =
      std::conditional_t<c10::is_reduced_floating_point_v<CTYPE>, float, CTYPE>;
  MathT math_alpha = 0;
  MathT math_scale = 0;
  MathT math_input_scale = 0;
  // NOTE(review): ET_EXTRACT_SCALAR comes from scalar_utils.h; presumably it
  // fails hard when the Scalar cannot be represented as MathT -- confirm.
  ET_EXTRACT_SCALAR(alpha, math_alpha);
  ET_EXTRACT_SCALAR(scale, math_scale);
  ET_EXTRACT_SCALAR(input_scale, math_input_scale);

  const auto scalar_func =
      at::native::get_scalar_elu_elementwise_func<CTYPE, MathT>(
          math_alpha, math_scale, math_input_scale);
  const auto vec_func = at::native::get_vectorized_elu_elementwise_func<CTYPE>(
      math_alpha, math_scale, math_input_scale);

  ::executorch::extension::parallel_for(
      0,
      out.numel(),
      ::executorch::extension::internal::GRAIN_SIZE,
      [&](const auto begin, const auto end) {
        using Vec = at::vec::Vectorized<CTYPE>;

        // First index >= begin that is a multiple of Vec::size().
        const auto aligned_begin =
            begin + (Vec::size() - begin % Vec::size()) % Vec::size();
        // Clamp to `end`: a short chunk may finish before the next multiple,
        // and the prologue must never touch elements outside [begin, end).
        const auto vectorized_begin = aligned_begin < end ? aligned_begin : end;

        // Last index <= end that is a multiple of Vec::size().
        const auto aligned_end = end - (end % Vec::size());
        // Clamp to vectorized_begin: without this, a chunk containing no full
        // aligned vector would make the epilogue start before `begin`.
        const auto vectorized_end =
            aligned_end > vectorized_begin ? aligned_end : vectorized_begin;

        // Scalar prologue: [begin, vectorized_begin).
        for (const auto idx : c10::irange(begin, vectorized_begin)) {
          out_data[idx] = scalar_func(in_data[idx]);
        }

        // Main vectorized loop: [vectorized_begin, vectorized_end), one full
        // vector per iteration.
        for (auto idx = vectorized_begin; idx < vectorized_end;
             idx += Vec::size()) {
          auto result_vec = vec_func(Vec::loadu(&in_data[idx]));
          result_vec.store(&out_data[idx]);
        }

        // Scalar epilogue: [vectorized_end, end).
        for (const auto idx : c10::irange(vectorized_end, end)) {
          out_data[idx] = scalar_func(in_data[idx]);
        }
      });
}
} // namespace

/**
 * Optimized kernel entry point for `elu.out`.
 *
 * Validates the arguments, resizes `out` to the sizes of `in`, then dispatches
 * on the dtype of `in` to the typed elu<CTYPE> implementation.
 *
 * @param ctx Kernel runtime context handed to the ET_KERNEL_CHECK checks and
 *     to the dtype switch.
 * @param in Input tensor; must satisfy tensor_is_floating_type().
 * @param alpha Forwarded unchanged to elu<CTYPE>.
 * @param scale Forwarded unchanged to elu<CTYPE>.
 * @param input_scale Forwarded unchanged to elu<CTYPE>.
 * @param out Output tensor; must have the same dtype and dim order as `in`.
 * @return `out`.
 *
 * NOTE(review): each ET_KERNEL_CHECK is given InvalidArgument and `out`;
 * presumably a failed check records that error on `ctx` and returns `out`
 * early -- confirm against the macro definition.
 */
Tensor& opt_elu_out(
    KernelRuntimeContext& ctx,
    const Tensor& in,
    const Scalar& alpha,
    const Scalar& scale,
    const Scalar& input_scale,
    Tensor& out) {
  // Checked once, up front: nothing below changes either tensor's dtype, so
  // repeating this check after the resize would be redundant.
  ET_KERNEL_CHECK(ctx, tensors_have_same_dtype(in, out), InvalidArgument, out);

  ET_KERNEL_CHECK(
      ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out);

  ET_KERNEL_CHECK(
      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);

  ET_KERNEL_CHECK(ctx, tensor_is_floating_type(in), InvalidArgument, out);

  ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, "elu.out", CTYPE, [&]() {
    elu<CTYPE>(ctx, in, alpha, scale, input_scale, out);
  });
  return out;
}

} // namespace torch::executor::native
8 changes: 8 additions & 0 deletions kernels/optimized/cpu/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@ _OPTIMIZED_ATEN_OPS = (
"//executorch/kernels/portable/cpu/util:broadcast_util",
],
),
op_target(
name = "op_elu",
deps = [
"//executorch/extension/threadpool:threadpool",
"//executorch/kernels/portable/cpu:scalar_utils",
"//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch",
],
),
op_target(name = "op_exp"),
op_target(
name = "op_fft_r2c",
Expand Down
5 changes: 5 additions & 0 deletions kernels/optimized/optimized.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@
- arg_meta: null
kernel_name: torch::executor::opt_div_scalar_out

- op: elu.out
kernels:
- arg_meta: null
kernel_name: torch::executor::opt_elu_out

- op: exp.out
kernels:
- arg_meta: null
Expand Down
1 change: 1 addition & 0 deletions kernels/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ set(_optimized_kernels_test_sources
"op_add_test.cpp"
"op_bmm_test.cpp"
"op_div_test.cpp"
"op_elu_test.cpp"
"op_exp_test.cpp"
"op_fft_r2c_test.cpp"
"op_gelu_test.cpp"
Expand Down
2 changes: 1 addition & 1 deletion kernels/test/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def define_common_targets():
_common_op_test("op_detach_copy_test", ["aten", "portable"])
_common_op_test("op_diagonal_copy_test", ["aten", "portable"])
_common_op_test("op_div_test", ["aten", "portable", "optimized"])
_common_op_test("op_elu_test", ["aten", "portable"])
_common_op_test("op_elu_test", ["aten", "portable", "optimized"])
_common_op_test("op_embedding_test", ["aten", "portable"])
_common_op_test("op_empty_test", ["aten", "portable"])
_common_op_test("op_eq_test", ["aten", "portable"])
Expand Down
Loading