diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index 7a2436d5a..3292aaaaf 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -1046,31 +1046,13 @@ namespace xsimd
         }
 
         // reduce_add
-        template <class A>
-        XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx>) noexcept
-        {
-            // Warning about _mm256_hadd_ps:
-            // _mm256_hadd_ps(a,b) gives
-            // (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7). Hence we can't
-            // rely on a naive use of this method
-            // rhs = (x0, x1, x2, x3, x4, x5, x6, x7)
-            // tmp = (x4, x5, x6, x7, x0, x1, x2, x3)
-            __m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1);
-            // tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7)
-            tmp = _mm256_add_ps(rhs, tmp);
-            // tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -)
-            tmp = _mm256_hadd_ps(tmp, tmp);
-            // tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -)
-            tmp = _mm256_hadd_ps(tmp, tmp);
-            return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0));
-        }
-        template <class A, class T, class = typename std::enable_if<std::is_same<T, int32_t>::value || std::is_same<T, uint32_t>::value, void>::type>
+        template <class A, class T, class = typename std::enable_if<std::is_same<T, int32_t>::value || std::is_same<T, uint32_t>::value || std::is_same<T, float>::value, void>::type>
         XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
         {
             typename batch<T, sse4_2>::register_type low, high;
             detail::split_avx(self, low, high);
             batch<T, sse4_2> blow(low), bhigh(high);
-            return reduce_add(blow) + reduce_add(bhigh);
+            return reduce_add(blow + bhigh);
         }
 
         // reduce_max