Skip to content

Commit fa68c70

Browse files
committed
crypto: implement ARM NEON backend for Echo512's ShiftAndMix()
1 parent 963215b commit fa68c70

File tree

5 files changed

+223
-15
lines changed

5 files changed

+223
-15
lines changed

configure.ac

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,7 @@ AX_CHECK_COMPILE_FLAG([-fstack-reuse=none], [CORE_CXXFLAGS="$CORE_CXXFLAGS -fsta
537537

538538
enable_arm_aes=no
539539
enable_arm_crc=no
540+
enable_arm_neon=no
540541
enable_arm_shani=no
541542
enable_ssse3=no
542543
enable_sse42=no
@@ -714,6 +715,27 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
714715
)
715716
CXXFLAGS="$TEMP_CXXFLAGS"
716717

718+
ARM_NEON_CXXFLAGS=""
719+
TEMP_CXXFLAGS="$CXXFLAGS"
720+
for flag in "-march=armv8-a" "-march=armv7-a -mfpu=neon"; do
721+
AX_CHECK_COMPILE_FLAG([$flag], [
722+
CXXFLAGS="$CXXFLAGS $flag"
723+
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
724+
#include <arm_neon.h>
725+
]], [[
726+
float32x4_t f = vdupq_n_f32(0.0);
727+
return 0;
728+
]])], [
729+
ARM_NEON_CXXFLAGS="$flag"
730+
enable_arm_neon=yes
731+
AC_DEFINE([ENABLE_ARM_NEON], [1], [Define this symbol to build code that uses ARM NEON intrinsics])
732+
break
733+
])
734+
CXXFLAGS="$TEMP_CXXFLAGS"
735+
])
736+
done
737+
CXXFLAGS="$TEMP_CXXFLAGS"
738+
717739
TEMP_CXXFLAGS="$CXXFLAGS"
718740
CXXFLAGS="$ARM_SHANI_CXXFLAGS $CXXFLAGS"
719741
AC_MSG_CHECKING([for ARMv8 SHA-NI intrinsics])
@@ -1879,6 +1901,7 @@ AM_CONDITIONAL([ENABLE_X86_AESNI], [test "$enable_x86_aesni" = "yes"])
18791901
AM_CONDITIONAL([ENABLE_X86_SHANI], [test "$enable_x86_shani" = "yes"])
18801902
AM_CONDITIONAL([ENABLE_ARM_AES], [test "$enable_arm_aes" = "yes"])
18811903
AM_CONDITIONAL([ENABLE_ARM_CRC], [test "$enable_arm_crc" = "yes"])
1904+
AM_CONDITIONAL([ENABLE_ARM_NEON], [test "$enable_arm_neon" = "yes"])
18821905
AM_CONDITIONAL([ENABLE_ARM_SHANI], [test "$enable_arm_shani" = "yes"])
18831906
AM_CONDITIONAL([WORDS_BIGENDIAN], [test "$ac_cv_c_bigendian" = "yes"])
18841907
AM_CONDITIONAL([USE_NATPMP], [test "$use_natpmp" = "yes"])
@@ -1941,6 +1964,7 @@ AC_SUBST(X86_AESNI_CXXFLAGS)
19411964
AC_SUBST(X86_SHANI_CXXFLAGS)
19421965
AC_SUBST(ARM_AES_CXXFLAGS)
19431966
AC_SUBST(ARM_CRC_CXXFLAGS)
1967+
AC_SUBST(ARM_NEON_CXXFLAGS)
19441968
AC_SUBST(ARM_SHANI_CXXFLAGS)
19451969
AC_SUBST(LIBTOOL_APP_LDFLAGS)
19461970
AC_SUBST(USE_SQLITE)

src/Makefile.am

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,10 @@ if ENABLE_ARM_AES
9999
LIBBITCOIN_CRYPTO_ARM_AES = crypto/libbitcoin_crypto_arm_aes.la
100100
LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_ARM_AES)
101101
endif
102+
if ENABLE_ARM_NEON
103+
LIBBITCOIN_CRYPTO_ARM_NEON = crypto/libbitcoin_crypto_arm_neon.la
104+
LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_ARM_NEON)
105+
endif
102106
if ENABLE_ARM_SHANI
103107
LIBBITCOIN_CRYPTO_ARM_SHANI = crypto/libbitcoin_crypto_arm_shani.la
104108
LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_ARM_SHANI)
@@ -789,6 +793,16 @@ crypto_libbitcoin_crypto_arm_aes_la_SOURCES = \
789793
crypto/x11/arm_crypto/echo.cpp \
790794
crypto/x11/arm_crypto/shavite.cpp
791795

796+
# See explanation for -static in crypto_libbitcoin_crypto_base_la's LDFLAGS and
797+
# CXXFLAGS above
798+
crypto_libbitcoin_crypto_arm_neon_la_LDFLAGS = $(AM_LDFLAGS) -static
799+
crypto_libbitcoin_crypto_arm_neon_la_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS) -static
800+
crypto_libbitcoin_crypto_arm_neon_la_CPPFLAGS = $(AM_CPPFLAGS)
801+
crypto_libbitcoin_crypto_arm_neon_la_CXXFLAGS += $(ARM_NEON_CXXFLAGS)
802+
crypto_libbitcoin_crypto_arm_neon_la_CPPFLAGS += -DENABLE_ARM_NEON
803+
crypto_libbitcoin_crypto_arm_neon_la_SOURCES = \
804+
crypto/x11/arm_neon/echo.cpp
805+
792806
# See explanation for -static in crypto_libbitcoin_crypto_base_la's LDFLAGS and
793807
# CXXFLAGS above
794808
crypto_libbitcoin_crypto_ssse3_la_LDFLAGS = $(AM_LDFLAGS) -static

src/crypto/x11/arm_neon/echo.cpp

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
// Copyright (c) 2025 The Dash Core developers
2+
// Distributed under the MIT software license, see the accompanying
3+
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4+
5+
#if defined(ENABLE_ARM_NEON)
6+
#include <attributes.h>
7+
#include <crypto/x11/util/util.hpp>
8+
9+
#include <cstdint>
10+
11+
#include <arm_neon.h>
12+
13+
namespace sapphire {
14+
namespace {
15+
uint8x16_t ALWAYS_INLINE gf8_mul2(const uint8x16_t& x)
16+
{
17+
// (x << 1)
18+
const uint8x16_t lhs = vshlq_n_u8(x, 1);
19+
// (x & 0x80) ? 0xff : 0x00
20+
const uint8x16_t msb_set = vandq_u8(x, vmovq_n_u8(0x80));
21+
const uint8x16_t mask = vceqq_u8(msb_set, vmovq_n_u8(0x80));
22+
// Replace 0xff with 0x1b
23+
const uint8x16_t rhs = vandq_u8(mask, vmovq_n_u8(0x1b));
24+
// (x << 1) ^ ((x & 0x80) ? 0x1b : 0x00))
25+
return util::Xor(lhs, rhs);
26+
}
27+
28+
void ALWAYS_INLINE MixColumn(uint8x16_t& Wa, uint8x16_t& Wb, uint8x16_t& Wc, uint8x16_t& Wd)
29+
{
30+
const uint8x16_t a = Wa;
31+
const uint8x16_t b = Wb;
32+
const uint8x16_t c = Wc;
33+
const uint8x16_t d = Wd;
34+
35+
const uint8x16_t ab = util::Xor(a, b);
36+
const uint8x16_t bc = util::Xor(b, c);
37+
const uint8x16_t cd = util::Xor(c, d);
38+
39+
const uint8x16_t abx = gf8_mul2(ab);
40+
const uint8x16_t bcx = gf8_mul2(bc);
41+
const uint8x16_t cdx = gf8_mul2(cd);
42+
43+
// Wa = abx ^ bc ^ d
44+
Wa = util::Xor(util::Xor(abx, bc), d);
45+
// Wb = bcx ^ a ^ cd
46+
Wb = util::Xor(util::Xor(bcx, a), cd);
47+
// Wc = cdx ^ ab ^ d
48+
Wc = util::Xor(util::Xor(cdx, ab), d);
49+
// Wd = abx ^ bcx ^ cdx ^ ab ^ c
50+
Wd = util::Xor(util::Xor(util::Xor(util::Xor(abx, bcx), cdx), ab), c);
51+
}
52+
53+
void ALWAYS_INLINE ShiftRow1(uint8x16_t& Wa, uint8x16_t& Wb, uint8x16_t& Wc, uint8x16_t& Wd)
54+
{
55+
uint8x16_t tmp = Wa;
56+
Wa = Wb;
57+
Wb = Wc;
58+
Wc = Wd;
59+
Wd = tmp;
60+
}
61+
62+
void ALWAYS_INLINE ShiftRow2(uint8x16_t& Wa, uint8x16_t& Wb, uint8x16_t& Wc, uint8x16_t& Wd)
63+
{
64+
uint8x16_t tmp1 = Wa;
65+
uint8x16_t tmp2 = Wb;
66+
Wa = Wc;
67+
Wb = Wd;
68+
Wc = tmp1;
69+
Wd = tmp2;
70+
}
71+
72+
void ALWAYS_INLINE ShiftRow3(uint8x16_t& Wa, uint8x16_t& Wb, uint8x16_t& Wc, uint8x16_t& Wd)
73+
{
74+
uint8x16_t tmp = Wd;
75+
Wd = Wc;
76+
Wc = Wb;
77+
Wb = Wa;
78+
Wa = tmp;
79+
}
80+
} // anonymous namespace
81+
82+
namespace arm_neon_echo {
83+
void ShiftAndMix(uint64_t W[16][2])
84+
{
85+
alignas(16) uint8x16_t w[16];
86+
w[0] = vreinterpretq_u8_u64(vld1q_u64(&W[0][0]));
87+
w[1] = vreinterpretq_u8_u64(vld1q_u64(&W[1][0]));
88+
w[2] = vreinterpretq_u8_u64(vld1q_u64(&W[2][0]));
89+
w[3] = vreinterpretq_u8_u64(vld1q_u64(&W[3][0]));
90+
w[4] = vreinterpretq_u8_u64(vld1q_u64(&W[4][0]));
91+
w[5] = vreinterpretq_u8_u64(vld1q_u64(&W[5][0]));
92+
w[6] = vreinterpretq_u8_u64(vld1q_u64(&W[6][0]));
93+
w[7] = vreinterpretq_u8_u64(vld1q_u64(&W[7][0]));
94+
w[8] = vreinterpretq_u8_u64(vld1q_u64(&W[8][0]));
95+
w[9] = vreinterpretq_u8_u64(vld1q_u64(&W[9][0]));
96+
w[10] = vreinterpretq_u8_u64(vld1q_u64(&W[10][0]));
97+
w[11] = vreinterpretq_u8_u64(vld1q_u64(&W[11][0]));
98+
w[12] = vreinterpretq_u8_u64(vld1q_u64(&W[12][0]));
99+
w[13] = vreinterpretq_u8_u64(vld1q_u64(&W[13][0]));
100+
w[14] = vreinterpretq_u8_u64(vld1q_u64(&W[14][0]));
101+
w[15] = vreinterpretq_u8_u64(vld1q_u64(&W[15][0]));
102+
103+
ShiftRow1(w[1], w[5], w[9], w[13]);
104+
ShiftRow2(w[2], w[6], w[10], w[14]);
105+
ShiftRow3(w[3], w[7], w[11], w[15]);
106+
107+
MixColumn(w[0], w[1], w[2], w[3]);
108+
MixColumn(w[4], w[5], w[6], w[7]);
109+
MixColumn(w[8], w[9], w[10], w[11]);
110+
MixColumn(w[12], w[13], w[14], w[15]);
111+
112+
vst1q_u64(&W[0][0], vreinterpretq_u64_u8(w[0]));
113+
vst1q_u64(&W[1][0], vreinterpretq_u64_u8(w[1]));
114+
vst1q_u64(&W[2][0], vreinterpretq_u64_u8(w[2]));
115+
vst1q_u64(&W[3][0], vreinterpretq_u64_u8(w[3]));
116+
vst1q_u64(&W[4][0], vreinterpretq_u64_u8(w[4]));
117+
vst1q_u64(&W[5][0], vreinterpretq_u64_u8(w[5]));
118+
vst1q_u64(&W[6][0], vreinterpretq_u64_u8(w[6]));
119+
vst1q_u64(&W[7][0], vreinterpretq_u64_u8(w[7]));
120+
vst1q_u64(&W[8][0], vreinterpretq_u64_u8(w[8]));
121+
vst1q_u64(&W[9][0], vreinterpretq_u64_u8(w[9]));
122+
vst1q_u64(&W[10][0], vreinterpretq_u64_u8(w[10]));
123+
vst1q_u64(&W[11][0], vreinterpretq_u64_u8(w[11]));
124+
vst1q_u64(&W[12][0], vreinterpretq_u64_u8(w[12]));
125+
vst1q_u64(&W[13][0], vreinterpretq_u64_u8(w[13]));
126+
vst1q_u64(&W[14][0], vreinterpretq_u64_u8(w[14]));
127+
vst1q_u64(&W[15][0], vreinterpretq_u64_u8(w[15]));
128+
}
129+
} // namespace arm_neon_echo
130+
} // namespace sapphire
131+
132+
#endif // ENABLE_ARM_NEON

src/crypto/x11/dispatch.cpp

Lines changed: 48 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
#if !defined(DISABLE_OPTIMIZED_SHA256)
1212
#include <compat/cpuid.h>
1313

14-
#if defined(ENABLE_ARM_AES)
14+
#if defined(ENABLE_ARM_AES) || defined(ENABLE_ARM_NEON)
1515
#if defined(__APPLE__)
1616
#include <sys/sysctl.h>
1717
#include <sys/types.h>
@@ -31,7 +31,7 @@
3131
#include <processthreadsapi.h>
3232
#include <winnt.h>
3333
#endif // _WIN32
34-
#endif // ENABLE_ARM_AES
34+
#endif // ENABLE_ARM_AES || ENABLE_ARM_NEON
3535
#endif // !DISABLE_OPTIMIZED_SHA256
3636

3737
#include <cstddef>
@@ -55,6 +55,12 @@ void CompressElement(uint32_t& l0, uint32_t& l1, uint32_t& l2, uint32_t& l3,
5555
} // namespace arm_crypto_shavite
5656
#endif // ENABLE_ARM_AES
5757

58+
#if defined(ENABLE_ARM_NEON)
59+
namespace arm_neon_echo {
60+
void ShiftAndMix(uint64_t W[16][2]);
61+
} // namespace arm_neon_echo
62+
#endif // ENABLE_ARM_NEON
63+
5864
#if defined(ENABLE_SSSE3)
5965
namespace ssse3_echo {
6066
void ShiftAndMix(uint64_t W[16][2]);
@@ -96,6 +102,21 @@ void CompressElement(uint32_t& l0, uint32_t& l1, uint32_t& l2, uint32_t& l3,
96102
} // namespace soft_shavite
97103
} // namespace sapphire
98104

105+
namespace {
106+
#if !defined(DISABLE_OPTIMIZED_SHA256)
107+
#if defined(ENABLE_ARM_AES) || defined(ENABLE_ARM_NEON)
108+
#if defined(__APPLE__)
109+
bool IsSysCtlNonZero(const char* name)
110+
{
111+
int val = 0;
112+
size_t len = sizeof(val);
113+
return ::sysctlbyname(name, &val, &len, nullptr, 0) == 0 && val != 0;
114+
}
115+
#endif // __APPLE__
116+
#endif // ENABLE_ARM_AES || ENABLE_ARM_NEON
117+
#endif // !DISABLE_OPTIMIZED_SHA256
118+
} // anonymous namespace
119+
99120
extern sapphire::dispatch::AESRoundFn aes_round;
100121
extern sapphire::dispatch::AESRoundFnNk aes_round_nk;
101122
extern sapphire::dispatch::EchoShiftMix echo_shift_mix;
@@ -132,45 +153,60 @@ void SapphireAutoDetect()
132153
#endif // ENABLE_SSSE3
133154
#endif // HAVE_GETCPUID
134155

135-
#if defined(ENABLE_ARM_AES)
136-
bool have_arm_aes = false;
156+
#if defined(ENABLE_ARM_AES) || defined(ENABLE_ARM_NEON)
157+
[[maybe_unused]] bool have_arm_aes = false;
158+
[[maybe_unused]] bool have_arm_neon = false;
159+
137160
#if defined(__APPLE__)
138-
int val = 0;
139-
size_t len = sizeof(val);
140-
if (::sysctlbyname("hw.optional.arm.FEAT_AES", &val, &len, nullptr, 0) == 0) {
141-
have_arm_aes = val != 0;
142-
}
161+
have_arm_aes = IsSysCtlNonZero("hw.optional.arm.FEAT_AES");
162+
have_arm_neon = IsSysCtlNonZero("hw.optional.neon") || IsSysCtlNonZero("hw.optional.AdvSIMD") ||
163+
IsSysCtlNonZero("hw.optional.arm.AdvSIMD"); // See https://github.com/google/cpu_features/issues/390
143164
#endif // __APPLE__
144165

145166
#if defined(__linux__)
146167
#if defined(__arm__)
147168
have_arm_aes = (::getauxval(AT_HWCAP2) & HWCAP2_AES);
169+
have_arm_neon = (::getauxval(AT_HWCAP) & HWCAP_NEON);
148170
#endif // __arm__
149171
#if defined(__aarch64__)
150172
have_arm_aes = (::getauxval(AT_HWCAP) & HWCAP_AES);
173+
have_arm_neon = (::getauxval(AT_HWCAP) & HWCAP_ASIMD);
151174
#endif // __aarch64__
152175
#endif // __linux__
153176

154177
#if defined(__FreeBSD__)
155-
[[maybe_unused]] unsigned long hwcap{0};
178+
[[maybe_unused]] unsigned long hwcap{0}, hwcap2{0};
156179
#if defined(__arm__)
157-
have_arm_aes = ((::elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap)) == 0) && ((hwcap & HWCAP2_AES) != 0));
180+
have_arm_aes = ((::elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2)) == 0) && ((hwcap2 & HWCAP2_AES) != 0));
181+
have_arm_neon = ((::elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)) == 0) && ((hwcap & HWCAP_NEON) != 0));
158182
#endif // __arm__
159183
#if defined(__aarch64__)
160-
have_arm_aes = ((::elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)) == 0) && ((hwcap & HWCAP_AES) != 0));
184+
if (::elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)) == 0) {
185+
have_arm_aes = ((hwcap & HWCAP_AES) != 0);
186+
have_arm_neon = ((hwcap & HWCAP_ASIMD) != 0);
187+
}
161188
#endif // __aarch64__
162189
#endif // __FreeBSD__
163190

164191
#if defined(_WIN32)
165192
have_arm_aes = ::IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
193+
have_arm_neon = ::IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE);
166194
#endif // _WIN32
167195

196+
#if defined(ENABLE_ARM_AES)
168197
if (have_arm_aes) {
169198
aes_round = sapphire::arm_crypto_aes::Round;
170199
aes_round_nk = sapphire::arm_crypto_aes::RoundKeyless;
171200
echo_round = sapphire::arm_crypto_echo::FullStateRound;
172201
shavite_c512e = sapphire::arm_crypto_shavite::CompressElement;
173202
}
174203
#endif // ENABLE_ARM_AES
204+
205+
#if defined (ENABLE_ARM_NEON)
206+
if (have_arm_neon) {
207+
echo_shift_mix = sapphire::arm_neon_echo::ShiftAndMix;
208+
}
209+
#endif // ENABLE_ARM_NEON
210+
#endif // ENABLE_ARM_AES || ENABLE_ARM_NEON
175211
#endif // !DISABLE_OPTIMIZED_SHA256
176212
}

src/crypto/x11/util/util.hpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@
1010
#if !defined(DISABLE_OPTIMIZED_SHA256)
1111
#include <attributes.h>
1212

13-
#if defined(ENABLE_ARM_AES)
13+
#if defined(ENABLE_ARM_AES) || defined(ENABLE_ARM_NEON)
1414
#include <arm_neon.h>
15-
#endif // ENABLE_ARM_AES
15+
#endif // ENABLE_ARM_AES || ENABLE_ARM_NEON
1616

1717
#if defined(ENABLE_SSSE3) || (defined(ENABLE_SSE41) && defined(ENABLE_X86_AESNI))
1818
#include <immintrin.h>
@@ -30,7 +30,7 @@ constexpr inline uint32_t pack_le(uint8_t b3, uint8_t b2, uint8_t b1, uint8_t b0
3030
}
3131

3232
#if !defined(DISABLE_OPTIMIZED_SHA256)
33-
#if defined(ENABLE_ARM_AES)
33+
#if defined(ENABLE_ARM_AES) || defined(ENABLE_ARM_NEON)
3434
uint8x16_t ALWAYS_INLINE Xor(const uint8x16_t& x, const uint8x16_t& y) { return veorq_u8(x, y); }
3535

3636
uint8x16_t ALWAYS_INLINE pack_le(const uint32_t& w0, const uint32_t& w1, const uint32_t& w2, const uint32_t& w3)
@@ -47,6 +47,7 @@ void ALWAYS_INLINE unpack_le(const uint8x16_t& i, uint32_t& w0, uint32_t& w1, ui
4747
w3 = vgetq_lane_u32(r, 3);
4848
}
4949

50+
#if defined(ENABLE_ARM_AES)
5051
uint8x16_t ALWAYS_INLINE aes_round(const uint8x16_t& input, const uint8x16_t& key)
5152
{
5253
// See "Emulating x86 AES Intrinsics on ARMv8-A" by Michael Brase for _mm_aesenc_si128
@@ -60,6 +61,7 @@ uint8x16_t ALWAYS_INLINE aes_round_nk(const uint8x16_t& input)
6061
return vaesmcq_u8(vaeseq_u8(input, vmovq_n_u8(0)));
6162
}
6263
#endif // ENABLE_ARM_AES
64+
#endif // ENABLE_ARM_AES || ENABLE_ARM_NEON
6365

6466
#if defined(ENABLE_SSSE3) || (defined(ENABLE_SSE41) && defined(ENABLE_X86_AESNI))
6567
__m128i ALWAYS_INLINE Xor(const __m128i& x, const __m128i& y) { return _mm_xor_si128(x, y); }

0 commit comments

Comments
 (0)