crypto: implement ARM NEON backend for Echo512's ShiftAndMix()

kwvg · kwvg · commit fa68c704de7a · 2025-09-22T09:52:06.000Z
diff --git a/configure.ac b/configure.ac
@@ -537,6 +537,7 @@ AX_CHECK_COMPILE_FLAG([-fstack-reuse=none], [CORE_CXXFLAGS="$CORE_CXXFLAGS -fsta
 
 enable_arm_aes=no
 enable_arm_crc=no
+enable_arm_neon=no
 enable_arm_shani=no
 enable_ssse3=no
 enable_sse42=no
@@ -714,6 +715,27 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
 )
 CXXFLAGS="$TEMP_CXXFLAGS"
 
+ARM_NEON_CXXFLAGS="" # Flags for NEON-only objects; stays empty when no candidate below compiles
+TEMP_CXXFLAGS="$CXXFLAGS"
+for flag in "-march=armv8-a" "-march=armv7-a -mfpu=neon"; do # Candidates are tried in order and the first that compiles arm_neon.h wins
+  AX_CHECK_COMPILE_FLAG([$flag], [
+    CXXFLAGS="$CXXFLAGS $flag"
+    AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+      #include <arm_neon.h>
+    ]], [[
+      float32x4_t f = vdupq_n_f32(0.0);
+      return 0;
+    ]])], [
+      ARM_NEON_CXXFLAGS="$flag"
+      enable_arm_neon=yes
+      AC_DEFINE([ENABLE_ARM_NEON], [1], [Define this symbol to build code that uses ARM NEON intrinsics])
+      break
+    ])
+    CXXFLAGS="$TEMP_CXXFLAGS"
+  ])
+done
+CXXFLAGS="$TEMP_CXXFLAGS" # Restore the saved flags; needed because the break path skips the restore inside the loop
+
 TEMP_CXXFLAGS="$CXXFLAGS"
 CXXFLAGS="$ARM_SHANI_CXXFLAGS $CXXFLAGS"
 AC_MSG_CHECKING([for ARMv8 SHA-NI intrinsics])
@@ -1879,6 +1901,7 @@ AM_CONDITIONAL([ENABLE_X86_AESNI], [test "$enable_x86_aesni" = "yes"])
 AM_CONDITIONAL([ENABLE_X86_SHANI], [test "$enable_x86_shani" = "yes"])
 AM_CONDITIONAL([ENABLE_ARM_AES], [test "$enable_arm_aes" = "yes"])
 AM_CONDITIONAL([ENABLE_ARM_CRC], [test "$enable_arm_crc" = "yes"])
+AM_CONDITIONAL([ENABLE_ARM_NEON], [test "$enable_arm_neon" = "yes"])
 AM_CONDITIONAL([ENABLE_ARM_SHANI], [test "$enable_arm_shani" = "yes"])
 AM_CONDITIONAL([WORDS_BIGENDIAN], [test "$ac_cv_c_bigendian" = "yes"])
 AM_CONDITIONAL([USE_NATPMP], [test "$use_natpmp" = "yes"])
@@ -1941,6 +1964,7 @@ AC_SUBST(X86_AESNI_CXXFLAGS)
 AC_SUBST(X86_SHANI_CXXFLAGS)
 AC_SUBST(ARM_AES_CXXFLAGS)
 AC_SUBST(ARM_CRC_CXXFLAGS)
+AC_SUBST(ARM_NEON_CXXFLAGS)
 AC_SUBST(ARM_SHANI_CXXFLAGS)
 AC_SUBST(LIBTOOL_APP_LDFLAGS)
 AC_SUBST(USE_SQLITE)
diff --git a/src/Makefile.am b/src/Makefile.am
@@ -99,6 +99,10 @@ if ENABLE_ARM_AES
 LIBBITCOIN_CRYPTO_ARM_AES = crypto/libbitcoin_crypto_arm_aes.la
 LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_ARM_AES)
 endif
+if ENABLE_ARM_NEON
+LIBBITCOIN_CRYPTO_ARM_NEON = crypto/libbitcoin_crypto_arm_neon.la
+LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_ARM_NEON)
+endif
 if ENABLE_ARM_SHANI
 LIBBITCOIN_CRYPTO_ARM_SHANI = crypto/libbitcoin_crypto_arm_shani.la
 LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_ARM_SHANI)
@@ -789,6 +793,16 @@ crypto_libbitcoin_crypto_arm_aes_la_SOURCES = \
   crypto/x11/arm_crypto/echo.cpp \
   crypto/x11/arm_crypto/shavite.cpp
 
+# See explanation for -static in crypto_libbitcoin_crypto_base_la's LDFLAGS and
+# CXXFLAGS above
+crypto_libbitcoin_crypto_arm_neon_la_LDFLAGS = $(AM_LDFLAGS) -static
+crypto_libbitcoin_crypto_arm_neon_la_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS) -static
+crypto_libbitcoin_crypto_arm_neon_la_CPPFLAGS = $(AM_CPPFLAGS)
+crypto_libbitcoin_crypto_arm_neon_la_CXXFLAGS += $(ARM_NEON_CXXFLAGS)
+crypto_libbitcoin_crypto_arm_neon_la_CPPFLAGS += -DENABLE_ARM_NEON
+crypto_libbitcoin_crypto_arm_neon_la_SOURCES = \
+  crypto/x11/arm_neon/echo.cpp
+
 # See explanation for -static in crypto_libbitcoin_crypto_base_la's LDFLAGS and
 # CXXFLAGS above
 crypto_libbitcoin_crypto_ssse3_la_LDFLAGS = $(AM_LDFLAGS) -static
diff --git a/src/crypto/x11/arm_neon/echo.cpp b/src/crypto/x11/arm_neon/echo.cpp
@@ -0,0 +1,132 @@
+// Copyright (c) 2025 The Dash Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#if defined(ENABLE_ARM_NEON)
+#include <attributes.h>
+#include <crypto/x11/util/util.hpp>
+
+#include <cstdint>
+
+#include <arm_neon.h>
+
+namespace sapphire {
+namespace {
+uint8x16_t ALWAYS_INLINE gf8_mul2(const uint8x16_t& x) // Doubles each of the 16 bytes: shift left by one, then XOR 0x1b where the top bit was set
+{
+    // lhs = (x << 1), computed independently in every byte lane
+    const uint8x16_t lhs = vshlq_n_u8(x, 1);
+    // mask = (x & 0x80) ? 0xff : 0x00, i.e. all-ones in lanes whose top bit was set
+    const uint8x16_t msb_set = vandq_u8(x, vmovq_n_u8(0x80));
+    const uint8x16_t mask = vceqq_u8(msb_set, vmovq_n_u8(0x80));
+    // rhs = mask & 0x1b, turning every 0xff lane into 0x1b and leaving 0x00 lanes untouched
+    const uint8x16_t rhs = vandq_u8(mask, vmovq_n_u8(0x1b));
+    // Result per lane: (x << 1) ^ ((x & 0x80) ? 0x1b : 0x00)
+    return util::Xor(lhs, rhs);
+}
+
+void ALWAYS_INLINE MixColumn(uint8x16_t& Wa, uint8x16_t& Wb, uint8x16_t& Wc, uint8x16_t& Wd) // Mixes four vectors in place, byte lane by byte lane
+{
+    const uint8x16_t a = Wa; // Snapshot all inputs first: the outputs below overwrite Wa..Wd
+    const uint8x16_t b = Wb;
+    const uint8x16_t c = Wc;
+    const uint8x16_t d = Wd;
+
+    const uint8x16_t ab = util::Xor(a, b); // Pairwise XORs shared by several outputs
+    const uint8x16_t bc = util::Xor(b, c);
+    const uint8x16_t cd = util::Xor(c, d);
+
+    const uint8x16_t abx = gf8_mul2(ab); // gf8_mul2 distributes over XOR, so abx = 2a ^ 2b (2v meaning gf8_mul2(v))
+    const uint8x16_t bcx = gf8_mul2(bc);
+    const uint8x16_t cdx = gf8_mul2(cd);
+
+    // Wa = abx ^ bc ^ d = 2a ^ 3b ^ c ^ d (3v meaning 2v ^ v)
+    Wa = util::Xor(util::Xor(abx, bc), d);
+    // Wb = bcx ^ a ^ cd = a ^ 2b ^ 3c ^ d
+    Wb = util::Xor(util::Xor(bcx, a), cd);
+    // Wc = cdx ^ ab ^ d = a ^ b ^ 2c ^ 3d
+    Wc = util::Xor(util::Xor(cdx, ab), d);
+    // Wd = abx ^ bcx ^ cdx ^ ab ^ c = 3a ^ b ^ c ^ 2d (the 2b and 2c terms cancel)
+    Wd = util::Xor(util::Xor(util::Xor(util::Xor(abx, bcx), cdx), ab), c);
+}
+
+void ALWAYS_INLINE ShiftRow1(uint8x16_t& Wa, uint8x16_t& Wb, uint8x16_t& Wc, uint8x16_t& Wd) // Rotates by one: (Wa, Wb, Wc, Wd) becomes (Wb, Wc, Wd, Wa)
+{
+    uint8x16_t tmp = Wa; // Keep the first value; it is overwritten next and stored into Wd last
+    Wa = Wb;
+    Wb = Wc;
+    Wc = Wd;
+    Wd = tmp;
+}
+
+void ALWAYS_INLINE ShiftRow2(uint8x16_t& Wa, uint8x16_t& Wb, uint8x16_t& Wc, uint8x16_t& Wd) // Rotates by two: (Wa, Wb, Wc, Wd) becomes (Wc, Wd, Wa, Wb)
+{
+    uint8x16_t tmp1 = Wa; // Both leading values are saved because Wa and Wb are overwritten first
+    uint8x16_t tmp2 = Wb;
+    Wa = Wc;
+    Wb = Wd;
+    Wc = tmp1;
+    Wd = tmp2;
+}
+
+void ALWAYS_INLINE ShiftRow3(uint8x16_t& Wa, uint8x16_t& Wb, uint8x16_t& Wc, uint8x16_t& Wd) // Rotates by three: (Wa, Wb, Wc, Wd) becomes (Wd, Wa, Wb, Wc)
+{
+    uint8x16_t tmp = Wd; // Keep the last value; it is overwritten next and stored into Wa last
+    Wd = Wc;
+    Wc = Wb;
+    Wb = Wa;
+    Wa = tmp;
+}
+} // anonymous namespace
+
+namespace arm_neon_echo {
+void ShiftAndMix(uint64_t W[16][2]) // Applies the ShiftRow rotations and then MixColumn to the 16 words of W, in place
+{
+    alignas(16) uint8x16_t w[16]; // Local working copy; W itself is only written by the stores at the end
+    w[0] = vreinterpretq_u8_u64(vld1q_u64(&W[0][0])); // Each W[i] (two uint64_t) is loaded as one vector and viewed as 16 bytes
+    w[1] = vreinterpretq_u8_u64(vld1q_u64(&W[1][0]));
+    w[2] = vreinterpretq_u8_u64(vld1q_u64(&W[2][0]));
+    w[3] = vreinterpretq_u8_u64(vld1q_u64(&W[3][0]));
+    w[4] = vreinterpretq_u8_u64(vld1q_u64(&W[4][0]));
+    w[5] = vreinterpretq_u8_u64(vld1q_u64(&W[5][0]));
+    w[6] = vreinterpretq_u8_u64(vld1q_u64(&W[6][0]));
+    w[7] = vreinterpretq_u8_u64(vld1q_u64(&W[7][0]));
+    w[8] = vreinterpretq_u8_u64(vld1q_u64(&W[8][0]));
+    w[9] = vreinterpretq_u8_u64(vld1q_u64(&W[9][0]));
+    w[10] = vreinterpretq_u8_u64(vld1q_u64(&W[10][0]));
+    w[11] = vreinterpretq_u8_u64(vld1q_u64(&W[11][0]));
+    w[12] = vreinterpretq_u8_u64(vld1q_u64(&W[12][0]));
+    w[13] = vreinterpretq_u8_u64(vld1q_u64(&W[13][0]));
+    w[14] = vreinterpretq_u8_u64(vld1q_u64(&W[14][0]));
+    w[15] = vreinterpretq_u8_u64(vld1q_u64(&W[15][0]));
+
+    ShiftRow1(w[1], w[5], w[9], w[13]); // Words 1, 5, 9, 13 rotate by one position; words 0, 4, 8, 12 are not rotated
+    ShiftRow2(w[2], w[6], w[10], w[14]); // Words 2, 6, 10, 14 rotate by two positions
+    ShiftRow3(w[3], w[7], w[11], w[15]); // Words 3, 7, 11, 15 rotate by three positions
+
+    MixColumn(w[0], w[1], w[2], w[3]); // Each group of four consecutive words is mixed independently, after the rotations
+    MixColumn(w[4], w[5], w[6], w[7]);
+    MixColumn(w[8], w[9], w[10], w[11]);
+    MixColumn(w[12], w[13], w[14], w[15]);
+
+    vst1q_u64(&W[0][0], vreinterpretq_u64_u8(w[0])); // Write every word back to the same slot it was loaded from
+    vst1q_u64(&W[1][0], vreinterpretq_u64_u8(w[1]));
+    vst1q_u64(&W[2][0], vreinterpretq_u64_u8(w[2]));
+    vst1q_u64(&W[3][0], vreinterpretq_u64_u8(w[3]));
+    vst1q_u64(&W[4][0], vreinterpretq_u64_u8(w[4]));
+    vst1q_u64(&W[5][0], vreinterpretq_u64_u8(w[5]));
+    vst1q_u64(&W[6][0], vreinterpretq_u64_u8(w[6]));
+    vst1q_u64(&W[7][0], vreinterpretq_u64_u8(w[7]));
+    vst1q_u64(&W[8][0], vreinterpretq_u64_u8(w[8]));
+    vst1q_u64(&W[9][0], vreinterpretq_u64_u8(w[9]));
+    vst1q_u64(&W[10][0], vreinterpretq_u64_u8(w[10]));
+    vst1q_u64(&W[11][0], vreinterpretq_u64_u8(w[11]));
+    vst1q_u64(&W[12][0], vreinterpretq_u64_u8(w[12]));
+    vst1q_u64(&W[13][0], vreinterpretq_u64_u8(w[13]));
+    vst1q_u64(&W[14][0], vreinterpretq_u64_u8(w[14]));
+    vst1q_u64(&W[15][0], vreinterpretq_u64_u8(w[15]));
+}
+} // namespace arm_neon_echo
+} // namespace sapphire
+
+#endif // ENABLE_ARM_NEON
diff --git a/src/crypto/x11/dispatch.cpp b/src/crypto/x11/dispatch.cpp
@@ -11,7 +11,7 @@
 #if !defined(DISABLE_OPTIMIZED_SHA256)
 #include <compat/cpuid.h>
 
-#if defined(ENABLE_ARM_AES)
+#if defined(ENABLE_ARM_AES) || defined(ENABLE_ARM_NEON)
 #if defined(__APPLE__)
 #include <sys/sysctl.h>
 #include <sys/types.h>
@@ -31,7 +31,7 @@
 #include <processthreadsapi.h>
 #include <winnt.h>
 #endif // _WIN32
-#endif // ENABLE_ARM_AES
+#endif // ENABLE_ARM_AES || ENABLE_ARM_NEON
 #endif // !DISABLE_OPTIMIZED_SHA256
 
 #include <cstddef>
@@ -55,6 +55,12 @@ void CompressElement(uint32_t& l0, uint32_t& l1, uint32_t& l2, uint32_t& l3,
 } // namespace arm_crypto_shavite
 #endif // ENABLE_ARM_AES
 
+#if defined(ENABLE_ARM_NEON)
+namespace arm_neon_echo {
+void ShiftAndMix(uint64_t W[16][2]);
+} // namespace arm_neon_echo
+#endif // ENABLE_ARM_NEON
+
 #if defined(ENABLE_SSSE3)
 namespace ssse3_echo {
 void ShiftAndMix(uint64_t W[16][2]);
@@ -96,6 +102,21 @@ void CompressElement(uint32_t& l0, uint32_t& l1, uint32_t& l2, uint32_t& l3,
 } // namespace soft_shavite
 } // namespace sapphire
 
+namespace {
+#if !defined(DISABLE_OPTIMIZED_SHA256)
+#if defined(ENABLE_ARM_AES) || defined(ENABLE_ARM_NEON)
+#if defined(__APPLE__)
+bool IsSysCtlNonZero(const char* name) // True only when the sysctl named `name` can be read and its int value is non-zero
+{
+    int val = 0; // Stays 0 unless sysctlbyname writes a value into it
+    size_t len = sizeof(val);
+    return ::sysctlbyname(name, &val, &len, nullptr, 0) == 0 && val != 0; // A failed lookup yields false rather than an error
+}
+#endif // __APPLE__
+#endif // ENABLE_ARM_AES || ENABLE_ARM_NEON
+#endif // !DISABLE_OPTIMIZED_SHA256
+} // anonymous namespace
+
 extern sapphire::dispatch::AESRoundFn aes_round;
 extern sapphire::dispatch::AESRoundFnNk aes_round_nk;
 extern sapphire::dispatch::EchoShiftMix echo_shift_mix;
@@ -132,45 +153,60 @@ void SapphireAutoDetect()
 #endif // ENABLE_SSSE3
 #endif // HAVE_GETCPUID
 
-#if defined(ENABLE_ARM_AES)
-    bool have_arm_aes = false;
+#if defined(ENABLE_ARM_AES) || defined(ENABLE_ARM_NEON)
+    [[maybe_unused]] bool have_arm_aes = false;
+    [[maybe_unused]] bool have_arm_neon = false;
+
 #if defined(__APPLE__)
-    int val = 0;
-    size_t len = sizeof(val);
-    if (::sysctlbyname("hw.optional.arm.FEAT_AES", &val, &len, nullptr, 0) == 0) {
-        have_arm_aes = val != 0;
-    }
+    have_arm_aes = IsSysCtlNonZero("hw.optional.arm.FEAT_AES");
+    have_arm_neon = IsSysCtlNonZero("hw.optional.neon") || IsSysCtlNonZero("hw.optional.AdvSIMD") ||
+                    IsSysCtlNonZero("hw.optional.arm.AdvSIMD"); // See https://github.com/google/cpu_features/issues/390
 #endif // __APPLE__
 
 #if defined(__linux__)
 #if defined(__arm__)
     have_arm_aes = (::getauxval(AT_HWCAP2) & HWCAP2_AES);
+    have_arm_neon = (::getauxval(AT_HWCAP) & HWCAP_NEON);
 #endif // __arm__
 #if defined(__aarch64__)
     have_arm_aes = (::getauxval(AT_HWCAP) & HWCAP_AES);
+    have_arm_neon = (::getauxval(AT_HWCAP) & HWCAP_ASIMD);
 #endif // __aarch64__
 #endif // __linux__
 
 #if defined(__FreeBSD__)
-    [[maybe_unused]] unsigned long hwcap{0};
+    [[maybe_unused]] unsigned long hwcap{0}, hwcap2{0};
 #if defined(__arm__)
-    have_arm_aes = ((::elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap)) == 0) && ((hwcap & HWCAP2_AES) != 0));
+    have_arm_aes = ((::elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2)) == 0) && ((hwcap2 & HWCAP2_AES) != 0));
+    have_arm_neon = ((::elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)) == 0) && ((hwcap & HWCAP_NEON) != 0));
 #endif // __arm__
 #if defined(__aarch64__)
-    have_arm_aes = ((::elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)) == 0) && ((hwcap & HWCAP_AES) != 0));
+    if (::elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)) == 0) {
+        have_arm_aes = ((hwcap & HWCAP_AES) != 0);
+        have_arm_neon = ((hwcap & HWCAP_ASIMD) != 0);
+    }
 #endif // __aarch64__
 #endif // __FreeBSD__
 
 #if defined(_WIN32)
     have_arm_aes = ::IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
+    have_arm_neon = ::IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE);
 #endif // _WIN32
 
+#if defined(ENABLE_ARM_AES)
     if (have_arm_aes) {
         aes_round = sapphire::arm_crypto_aes::Round;
         aes_round_nk = sapphire::arm_crypto_aes::RoundKeyless;
         echo_round = sapphire::arm_crypto_echo::FullStateRound;
         shavite_c512e = sapphire::arm_crypto_shavite::CompressElement;
     }
 #endif // ENABLE_ARM_AES
+
+#if defined (ENABLE_ARM_NEON)
+    if (have_arm_neon) {
+        echo_shift_mix = sapphire::arm_neon_echo::ShiftAndMix;
+    }
+#endif // ENABLE_ARM_NEON
+#endif // ENABLE_ARM_AES || ENABLE_ARM_NEON
 #endif // !DISABLE_OPTIMIZED_SHA256
 }
diff --git a/src/crypto/x11/util/util.hpp b/src/crypto/x11/util/util.hpp
@@ -10,9 +10,9 @@
 #if !defined(DISABLE_OPTIMIZED_SHA256)
 #include <attributes.h>
 
-#if defined(ENABLE_ARM_AES)
+#if defined(ENABLE_ARM_AES) || defined(ENABLE_ARM_NEON)
 #include <arm_neon.h>
-#endif // ENABLE_ARM_AES
+#endif // ENABLE_ARM_AES || ENABLE_ARM_NEON
 
 #if defined(ENABLE_SSSE3) || (defined(ENABLE_SSE41) && defined(ENABLE_X86_AESNI))
 #include <immintrin.h>
@@ -30,7 +30,7 @@ constexpr inline uint32_t pack_le(uint8_t b3, uint8_t b2, uint8_t b1, uint8_t b0
 }
 
 #if !defined(DISABLE_OPTIMIZED_SHA256)
-#if defined(ENABLE_ARM_AES)
+#if defined(ENABLE_ARM_AES) || defined(ENABLE_ARM_NEON)
 uint8x16_t ALWAYS_INLINE Xor(const uint8x16_t& x, const uint8x16_t& y) { return veorq_u8(x, y); }
 
 uint8x16_t ALWAYS_INLINE pack_le(const uint32_t& w0, const uint32_t& w1, const uint32_t& w2, const uint32_t& w3)
@@ -47,6 +47,7 @@ void ALWAYS_INLINE unpack_le(const uint8x16_t& i, uint32_t& w0, uint32_t& w1, ui
     w3 = vgetq_lane_u32(r, 3);
 }
 
+#if defined(ENABLE_ARM_AES)
 uint8x16_t ALWAYS_INLINE aes_round(const uint8x16_t& input, const uint8x16_t& key)
 {
     // See "Emulating x86 AES Intrinsics on ARMv8-A" by Michael Brase for _mm_aesenc_si128
@@ -60,6 +61,7 @@ uint8x16_t ALWAYS_INLINE aes_round_nk(const uint8x16_t& input)
     return vaesmcq_u8(vaeseq_u8(input, vmovq_n_u8(0)));
 }
 #endif // ENABLE_ARM_AES
+#endif // ENABLE_ARM_AES || ENABLE_ARM_NEON
 
 #if defined(ENABLE_SSSE3) || (defined(ENABLE_SSE41) && defined(ENABLE_X86_AESNI))
 __m128i ALWAYS_INLINE Xor(const __m128i& x, const __m128i& y) { return _mm_xor_si128(x, y); }