Skip to content

Commit c73a526

Browse files
swolchok authored and pytorchmergebot committed
Extract reusable portions of elu_kernel into header (#149673)
Similar to #140425, we are making the implementation usable via header-only code sharing. Review note: #62546 by @yanbing-j removed expm1 usage from this path. I don't know why and expm1 should be more efficient, so I've put it back. Please let me know if there is a good reason I shouldn't. Testing: existing correctness tests should cover. Pull Request resolved: #149673 Approved by: https://github.com/cyyever, https://github.com/Skylion007
1 parent b238e36 commit c73a526

File tree

4 files changed

+84
-53
lines changed

4 files changed

+84
-53
lines changed

aten/src/ATen/native/cpu/Activation.cpp

+5-43
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <ATen/cpu/vec/functional.h>
1616
#include <ATen/cpu/vec/vec.h>
1717
#include <ATen/native/TensorIterator.h>
18+
#include <ATen/native/cpu/Elu.h>
1819
#include <ATen/native/cpu/Gelu.h>
1920
#include <ATen/native/cpu/Loops.h>
2021
#include <ATen/Parallel.h>
@@ -190,56 +191,17 @@ static void threshold_kernel(
190191
void elu_kernel(TensorIteratorBase& it, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale) {
191192
if (at::isReducedFloatingType(it.common_dtype())) {
192193
AT_DISPATCH_REDUCED_FLOATING_TYPES(it.common_dtype(), "elu_cpu", [&]() {
193-
auto negcoef = alpha.to<float>() * scale.to<float>();
194-
auto poscoef = scale.to<float>();
195-
auto negiptcoef = input_scale.to<float>();
196-
const Vectorized<float> negcoef_vec(negcoef);
197-
const Vectorized<float> negiptcoef_vec(negiptcoef);
198-
const Vectorized<float> poscoef_vec(poscoef);
199-
const Vectorized<float> one_vec(static_cast<float>(1));
200-
const Vectorized<float> zero_vec(static_cast<float>(0));
201194
cpu_kernel_vec(
202195
it,
203-
[negcoef, negiptcoef, poscoef](scalar_t a) -> scalar_t {
204-
return float(a) <= float(0) ? (std::exp(float(a) * negiptcoef) - float(1)) * negcoef : float(a) * poscoef;
205-
},
206-
[&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &one_vec, &zero_vec](Vectorized<scalar_t> a) -> Vectorized<scalar_t> {
207-
auto [a0, a1] = convert_to_float<scalar_t>(a);
208-
auto cmp0 = (a0 > zero_vec);
209-
auto cmp1 = (a1 > zero_vec);
210-
auto get_res_masked = [&](Vectorized<float>& cmp, Vectorized<float>& a) {
211-
return !cmp.zero_mask() ? a * poscoef_vec :
212-
Vectorized<float>::blendv(((a * negiptcoef_vec).exp() - one_vec) * negcoef_vec, a * poscoef_vec, cmp);
213-
};
214-
auto res0 = get_res_masked(cmp0, a0);
215-
auto res1 = get_res_masked(cmp1, a1);
216-
return convert_from_float<scalar_t>(res0, res1);
217-
});
196+
get_scalar_elu_elementwise_func<scalar_t, float>(alpha.to<float>(), scale.to<float>(), input_scale.to<float>()),
197+
get_vectorized_elu_elementwise_func<scalar_t>(alpha.to<float>(), scale.to<float>(), input_scale.to<float>()));
218198
});
219199
} else {
220200
AT_DISPATCH_FLOATING_TYPES(it.common_dtype(), "elu_cpu", [&]() {
221-
using Vec = Vectorized<scalar_t>;
222-
auto negcoef = alpha.to<scalar_t>() * scale.to<scalar_t>();
223-
auto poscoef = scale.to<scalar_t>();
224-
auto negiptcoef = input_scale.to<scalar_t>();
225-
const Vec negcoef_vec(negcoef);
226-
const Vec negiptcoef_vec(negiptcoef);
227-
const Vec poscoef_vec(poscoef);
228-
const Vec one_vec(static_cast<scalar_t>(1));
229-
const Vec zero_vec(static_cast<scalar_t>(0));
230201
cpu_kernel_vec(
231202
it,
232-
[negcoef, negiptcoef, poscoef](scalar_t a) -> scalar_t {
233-
return a <= scalar_t(0) ? (std::exp(a * negiptcoef) - scalar_t(1)) * negcoef : a * poscoef;
234-
},
235-
[&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &one_vec, &zero_vec](Vec a) -> Vec {
236-
auto cmp = (a > zero_vec);
237-
if (!cmp.zero_mask()) { // only a * poscoef (which is very quick) needs to be computed
238-
return a * poscoef_vec;
239-
} else {
240-
return Vec::blendv(((a * negiptcoef_vec).exp() - one_vec) * negcoef_vec, a * poscoef_vec, cmp);
241-
}
242-
});
203+
get_scalar_elu_elementwise_func<scalar_t>(alpha.to<scalar_t>(), scale.to<scalar_t>(), input_scale.to<scalar_t>()),
204+
get_vectorized_elu_elementwise_func<scalar_t>(alpha.to<scalar_t>(), scale.to<scalar_t>(), input_scale.to<scalar_t>()));
243205
});
244206
}
245207
}

aten/src/ATen/native/cpu/Elu.h

+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#pragma once
2+
3+
// On Windows, math.h needs to be included with _USE_MATH_DEFINES defined to
4+
// access constants such as M_SQRT2 and M_2_SQRTPI.
5+
#ifdef _WIN32
6+
#define _USE_MATH_DEFINES
7+
#include <cmath>
8+
#endif // _WIN32
9+
10+
#include <ATen/cpu/vec/vec.h>
11+
#include <c10/util/BFloat16.h> // For c10::is_reduced_floating_point_v.
12+
13+
namespace at::native {
14+
/**
15+
* Return a function object that calculates ELU with the given
16+
* parameters on its input element. ParamT is the type of the input
17+
* and output to the ELU, and MathT is the type (possibly
18+
* higher-precision, e.g. float if ParamT is reduced-precision float)
19+
* in which to do intermediate calculations.
20+
*/
21+
template <typename ParamT, typename MathT=ParamT>
22+
auto get_scalar_elu_elementwise_func(MathT alpha, MathT scale, MathT input_scale) {
23+
const auto negcoef = alpha * scale;
24+
const auto poscoef = scale;
25+
const auto negiptcoef = input_scale;
26+
return [negcoef, negiptcoef, poscoef](ParamT a) -> ParamT {
27+
return MathT(a) <= MathT(0)
28+
? std::expm1(MathT(a) * negiptcoef) * negcoef
29+
: MathT(a) * poscoef;
30+
};
31+
}
32+
33+
/**
34+
* Return a function object that calculates ELU with the given
35+
* parameters on its input element. The function object takes and
36+
* returns Vectorized<T>.
37+
*/
38+
template <typename T, std::enable_if_t<!c10::is_reduced_floating_point_v<T>, bool> = true>
39+
auto get_vectorized_elu_elementwise_func(T alpha, T scale, T input_scale) {
40+
const vec::Vectorized<T> negcoef_vec(alpha * scale);
41+
const vec::Vectorized<T> poscoef_vec(scale);
42+
const vec::Vectorized<T> negiptcoef_vec(input_scale);
43+
const vec::Vectorized<T> zero_vec(static_cast<T>(0));
44+
return [negcoef_vec, poscoef_vec, negiptcoef_vec, zero_vec](vec::Vectorized<T> a) -> vec::Vectorized<T> {
45+
const auto cmp = a > zero_vec;
46+
if (!cmp.zero_mask()) {
47+
return a * poscoef_vec;
48+
} else {
49+
return vec::Vectorized<T>::blendv((a * negiptcoef_vec).expm1() * negcoef_vec, a * poscoef_vec, cmp);
50+
}
51+
};
52+
}
53+
54+
/**
55+
* Return a function object that calculates ELU with the given
56+
* parameters on its input element. The function object takes and
57+
* returns Vectorized<ParamT>, and Vectorized<MathT> is the type
58+
* (possibly higher-precision) in which to do intermediate
59+
* calculations.
60+
*/
61+
template <typename T, std::enable_if_t<c10::is_reduced_floating_point_v<T>, bool> = true>
62+
auto get_vectorized_elu_elementwise_func(float alpha, float scale, float input_scale) {
63+
// Takes float->float.
64+
const auto float_func = get_vectorized_elu_elementwise_func<float>(alpha, scale, input_scale);
65+
return [float_func](vec::Vectorized<T> a) -> vec::Vectorized<T> {
66+
auto [a0, a1] = vec::convert_to_float<T>(a);
67+
auto res0 = float_func(a0);
68+
auto res1 = float_func(a1);
69+
return vec::convert_from_float<T>(res0, res1);
70+
};
71+
}
72+
} // namespace at::native

test/cpp/api/functional.cpp

+4-6
Original file line numberDiff line numberDiff line change
@@ -1063,7 +1063,7 @@ TEST_F(FunctionalTest, ELU) {
10631063
x_bf16.resize_({size, size, size});
10641064

10651065
auto y_exp = torch::max(torch::zeros_like(x), x) +
1066-
torch::min(torch::zeros_like(x), alpha * (torch::exp(x) - 1.0));
1066+
torch::min(torch::zeros_like(x), alpha * (torch::expm1(x)));
10671067
auto y = F::elu(x, F::ELUFuncOptions().alpha(alpha).inplace(inplace));
10681068
auto y_bf16 =
10691069
F::elu(x_bf16, F::ELUFuncOptions().alpha(alpha).inplace(inplace));
@@ -1090,8 +1090,7 @@ TEST_F(FunctionalTest, SELU) {
10901090
auto input_bf16 = input.clone().to(torch::kBFloat16);
10911091
auto expected = scale *
10921092
(torch::max(torch::zeros_like(input), input) +
1093-
torch::min(
1094-
torch::zeros_like(input), alpha * (torch::exp(input) - 1)));
1093+
torch::min(torch::zeros_like(input), alpha * (torch::expm1(input))));
10951094
auto output = F::selu(input, inplace);
10961095
auto output_bf16 = F::selu(input_bf16, inplace);
10971096

@@ -1711,8 +1710,7 @@ TEST_F(FunctionalTest, CELU) {
17111710
x.resize_({size, size, size});
17121711
auto x_bf16 = x.clone().to(torch::kBFloat16);
17131712
auto y_exp = torch::max(torch::zeros_like(x), x) +
1714-
torch::min(torch::zeros_like(x),
1715-
alpha * (torch::exp(x / alpha) - 1.0));
1713+
torch::min(torch::zeros_like(x), alpha * (torch::expm1(x / alpha)));
17161714
auto y = F::celu(x, F::CELUFuncOptions().alpha(alpha).inplace(inplace));
17171715
auto y_bf16 =
17181716
F::celu(x_bf16, F::CELUFuncOptions().alpha(alpha).inplace(inplace));
@@ -1737,7 +1735,7 @@ TEST_F(FunctionalTest, CELUDefaultOptions) {
17371735
x.resize_({size, size, size});
17381736
auto x_bf16 = x.clone().to(torch::kBFloat16);
17391737
auto y_exp = torch::max(torch::zeros_like(x), x) +
1740-
torch::min(torch::zeros_like(x), alpha * (torch::exp(x / alpha) - 1.0));
1738+
torch::min(torch::zeros_like(x), alpha * (torch::expm1(x / alpha)));
17411739
auto y = F::celu(x);
17421740
auto y_bf16 = F::celu(x_bf16);
17431741

test/cpp/api/modules.cpp

+3-4
Original file line numberDiff line numberDiff line change
@@ -2432,8 +2432,7 @@ TEST_F(ModulesTest, ELU) {
24322432
ASSERT_EQ(y.ndimension(), 3);
24332433
ASSERT_EQ(y.sizes(), std::vector<int64_t>({size, size, size}));
24342434
auto y_exp = torch::max(torch::zeros_like(x_orig), x_orig) +
2435-
torch::min(torch::zeros_like(x_orig),
2436-
alpha * (torch::exp(x_orig) - 1.0));
2435+
torch::min(torch::zeros_like(x_orig), alpha * (torch::expm1(x_orig)));
24372436
ASSERT_TRUE(torch::allclose(y, y_exp));
24382437
if (inplace) {
24392438
ASSERT_TRUE(torch::allclose(x, y_exp));
@@ -2458,7 +2457,7 @@ TEST_F(ModulesTest, SELU) {
24582457
auto zero = torch::zeros_like(input);
24592458
auto expected = scale *
24602459
(torch::max(zero, input_orig) +
2461-
torch::min(zero, alpha * (torch::exp(input_orig) - 1)));
2460+
torch::min(zero, alpha * (torch::expm1(input_orig))));
24622461
auto s = output.sum();
24632462

24642463
ASSERT_EQ(s.ndimension(), 0);
@@ -2848,7 +2847,7 @@ TEST_F(ModulesTest, CELU) {
28482847
ASSERT_EQ(y.sizes(), std::vector<int64_t>({size, size, size}));
28492848
auto y_exp = torch::max(torch::zeros_like(x_orig), x_orig) +
28502849
torch::min(torch::zeros_like(x_orig),
2851-
alpha * (torch::exp(x_orig / alpha) - 1.0));
2850+
alpha * (torch::expm1(x_orig / alpha)));
28522851
ASSERT_TRUE(torch::allclose(y, y_exp));
28532852
if (inplace) {
28542853
ASSERT_TRUE(torch::allclose(x, y_exp));

0 commit comments

Comments (0)