[vec128] Fix fmsub NEON defintion (pytorch#152075)

malfet · pytorchmergebot · commit 2ea865339185 · 2025-04-24T05:10:45.000Z
As reported in pytorch#149292, according to manual, `vfmsq_f32` implements `c - a * b` rather than `a * b - c`, so it's call must be prefixed with `vnegq_f32` Also, adjust the tests to use OpMath for FMA computation to avoid accuracy error accumulation due to non-fused multiply-and-add over lower precision dtypes Note that `Vectorized::fmsub` is not currently instantiated anywhere, so it could safely remain broken TODO: - Enable C++ testing on MacOS and/or aarch64 platforms (right now Mac tests are build without C++ tests) Fixes pytorch#149292 Pull Request resolved: pytorch#152075 Approved by: https://github.com/swolchok ghstack dependencies: pytorch#151955
diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h
@@ -540,7 +540,7 @@ Vectorized<float> inline fmadd(const Vectorized<float>& a, const Vectorized<floa
 
 template <>
 Vectorized<float> inline fmsub(const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& c) {
-  return Vectorized<float>(vfmsq_f32(c, a, b));
+  return Vectorized<float>(vnegq_f32(vfmsq_f32(c, a, b)));
 }
 
 inline Vectorized<float> Vectorized<float>::erf() const{
diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h
@@ -582,7 +582,7 @@ Vectorized<c10::Half> inline fmsub(
     const Vectorized<c10::Half>& b,
     const Vectorized<c10::Half>& c) {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-  return Vectorized<c10::Half>(vfmsq_f16(c, a, b));
+  return Vectorized<c10::Half>(vnegq_f16(vfmsq_f16(c, a, b)));
 #else
   return a * b - c;
 #endif
diff --git a/aten/src/ATen/test/vec_test_all_types.h b/aten/src/ATen/test/vec_test_all_types.h
@@ -64,6 +64,16 @@ CACHE_ALIGN #define
 #undef CHECK_WITH_FMA
 #endif
 
+template <typename scalar_t>
+struct OpMathType {
+  using type = scalar_t;
+};
+template <>
+struct OpMathType<c10::Half> {
+  using type = float;
+};
+
+
 template<typename T>
 using Complex = typename c10::complex<T>;
 
@@ -1317,15 +1327,17 @@ std::enable_if_t<is_complex<Complex<T>>::value, Complex<T>> local_division(Compl
 template <typename T>
 std::enable_if_t<!is_complex<T>::value, T> local_fmadd(T a, T b, T c) {
     PreventFma noFma;
-    T ab = a * b;
-    return noFma.add(ab, c);
+    using op_math_t = typename OpMathType<T>::type;
+    auto ab = static_cast<op_math_t>(a) * static_cast<op_math_t>(b);
+    return static_cast<T>(noFma.add(ab, op_math_t(c)));
 }
 
 template <typename T>
 std::enable_if_t<!is_complex<T>::value, T> local_fmsub(T a, T b, T c) {
     PreventFma noFma;
-    T ab = a * b;
-    return noFma.sub(ab, c);
+    using op_math_t = typename OpMathType<T>::type;
+    auto ab = static_cast<op_math_t>(a) * static_cast<op_math_t>(b);
+    return static_cast<T>(noFma.sub(ab, op_math_t(c)));
 }
 
 template <typename T>

Original file line number	Diff line number	Diff line change
`@@ -540,7 +540,7 @@ Vectorized<float> inline fmadd(const Vectorized<float>& a, const Vectorized<floa`
`540`	`540`
`541`	`541`	`template <>`
`542`	`542`	`Vectorized<float> inline fmsub(const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& c) {`
`543`		`- return Vectorized<float>(vfmsq_f32(c, a, b));`
	`543`	`+ return Vectorized<float>(vnegq_f32(vfmsq_f32(c, a, b)));`
`544`	`544`	`}`
`545`	`545`
`546`	`546`	`inline Vectorized<float> Vectorized<float>::erf() const{`