// Copyright (c) 2025 The Dash Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.

#if defined(ENABLE_ARM_NEON)
#include <attributes.h>
#include <crypto/x11/util/util.hpp>

#include <cstdint>

#include <arm_neon.h>

namespace sapphire {
namespace {
| 15 | +uint8x16_t ALWAYS_INLINE gf8_mul2(const uint8x16_t& x) |
| 16 | +{ |
| 17 | + // (x << 1) |
| 18 | + const uint8x16_t lhs = vshlq_n_u8(x, 1); |
| 19 | + // (x & 0x80) ? 0xff : 0x00 |
| 20 | + const uint8x16_t msb_set = vandq_u8(x, vmovq_n_u8(0x80)); |
| 21 | + const uint8x16_t mask = vceqq_u8(msb_set, vmovq_n_u8(0x80)); |
| 22 | + // Replace 0xff with 0x1b |
| 23 | + const uint8x16_t rhs = vandq_u8(mask, vmovq_n_u8(0x1b)); |
| 24 | + // (x << 1) ^ ((x & 0x80) ? 0x1b : 0x00)) |
| 25 | + return util::Xor(lhs, rhs); |
| 26 | +} |
| 27 | + |
| 28 | +void ALWAYS_INLINE MixColumn(uint8x16_t& Wa, uint8x16_t& Wb, uint8x16_t& Wc, uint8x16_t& Wd) |
| 29 | +{ |
| 30 | + const uint8x16_t a = Wa; |
| 31 | + const uint8x16_t b = Wb; |
| 32 | + const uint8x16_t c = Wc; |
| 33 | + const uint8x16_t d = Wd; |
| 34 | + |
| 35 | + const uint8x16_t ab = util::Xor(a, b); |
| 36 | + const uint8x16_t bc = util::Xor(b, c); |
| 37 | + const uint8x16_t cd = util::Xor(c, d); |
| 38 | + |
| 39 | + const uint8x16_t abx = gf8_mul2(ab); |
| 40 | + const uint8x16_t bcx = gf8_mul2(bc); |
| 41 | + const uint8x16_t cdx = gf8_mul2(cd); |
| 42 | + |
| 43 | + // Wa = abx ^ bc ^ d |
| 44 | + Wa = util::Xor(util::Xor(abx, bc), d); |
| 45 | + // Wb = bcx ^ a ^ cd |
| 46 | + Wb = util::Xor(util::Xor(bcx, a), cd); |
| 47 | + // Wc = cdx ^ ab ^ d |
| 48 | + Wc = util::Xor(util::Xor(cdx, ab), d); |
| 49 | + // Wd = abx ^ bcx ^ cdx ^ ab ^ c |
| 50 | + Wd = util::Xor(util::Xor(util::Xor(util::Xor(abx, bcx), cdx), ab), c); |
| 51 | +} |
| 52 | + |
| 53 | +void ALWAYS_INLINE ShiftRow1(uint8x16_t& Wa, uint8x16_t& Wb, uint8x16_t& Wc, uint8x16_t& Wd) |
| 54 | +{ |
| 55 | + uint8x16_t tmp = Wa; |
| 56 | + Wa = Wb; |
| 57 | + Wb = Wc; |
| 58 | + Wc = Wd; |
| 59 | + Wd = tmp; |
| 60 | +} |
| 61 | + |
| 62 | +void ALWAYS_INLINE ShiftRow2(uint8x16_t& Wa, uint8x16_t& Wb, uint8x16_t& Wc, uint8x16_t& Wd) |
| 63 | +{ |
| 64 | + uint8x16_t tmp1 = Wa; |
| 65 | + uint8x16_t tmp2 = Wb; |
| 66 | + Wa = Wc; |
| 67 | + Wb = Wd; |
| 68 | + Wc = tmp1; |
| 69 | + Wd = tmp2; |
| 70 | +} |
| 71 | + |
| 72 | +void ALWAYS_INLINE ShiftRow3(uint8x16_t& Wa, uint8x16_t& Wb, uint8x16_t& Wc, uint8x16_t& Wd) |
| 73 | +{ |
| 74 | + uint8x16_t tmp = Wd; |
| 75 | + Wd = Wc; |
| 76 | + Wc = Wb; |
| 77 | + Wb = Wa; |
| 78 | + Wa = tmp; |
| 79 | +} |
} // anonymous namespace

namespace arm_neon_echo {
| 83 | +void ShiftAndMix(uint64_t W[16][2]) |
| 84 | +{ |
| 85 | + alignas(16) uint8x16_t w[16]; |
| 86 | + w[0] = vreinterpretq_u8_u64(vld1q_u64(&W[0][0])); |
| 87 | + w[1] = vreinterpretq_u8_u64(vld1q_u64(&W[1][0])); |
| 88 | + w[2] = vreinterpretq_u8_u64(vld1q_u64(&W[2][0])); |
| 89 | + w[3] = vreinterpretq_u8_u64(vld1q_u64(&W[3][0])); |
| 90 | + w[4] = vreinterpretq_u8_u64(vld1q_u64(&W[4][0])); |
| 91 | + w[5] = vreinterpretq_u8_u64(vld1q_u64(&W[5][0])); |
| 92 | + w[6] = vreinterpretq_u8_u64(vld1q_u64(&W[6][0])); |
| 93 | + w[7] = vreinterpretq_u8_u64(vld1q_u64(&W[7][0])); |
| 94 | + w[8] = vreinterpretq_u8_u64(vld1q_u64(&W[8][0])); |
| 95 | + w[9] = vreinterpretq_u8_u64(vld1q_u64(&W[9][0])); |
| 96 | + w[10] = vreinterpretq_u8_u64(vld1q_u64(&W[10][0])); |
| 97 | + w[11] = vreinterpretq_u8_u64(vld1q_u64(&W[11][0])); |
| 98 | + w[12] = vreinterpretq_u8_u64(vld1q_u64(&W[12][0])); |
| 99 | + w[13] = vreinterpretq_u8_u64(vld1q_u64(&W[13][0])); |
| 100 | + w[14] = vreinterpretq_u8_u64(vld1q_u64(&W[14][0])); |
| 101 | + w[15] = vreinterpretq_u8_u64(vld1q_u64(&W[15][0])); |
| 102 | + |
| 103 | + ShiftRow1(w[1], w[5], w[9], w[13]); |
| 104 | + ShiftRow2(w[2], w[6], w[10], w[14]); |
| 105 | + ShiftRow3(w[3], w[7], w[11], w[15]); |
| 106 | + |
| 107 | + MixColumn(w[0], w[1], w[2], w[3]); |
| 108 | + MixColumn(w[4], w[5], w[6], w[7]); |
| 109 | + MixColumn(w[8], w[9], w[10], w[11]); |
| 110 | + MixColumn(w[12], w[13], w[14], w[15]); |
| 111 | + |
| 112 | + vst1q_u64(&W[0][0], vreinterpretq_u64_u8(w[0])); |
| 113 | + vst1q_u64(&W[1][0], vreinterpretq_u64_u8(w[1])); |
| 114 | + vst1q_u64(&W[2][0], vreinterpretq_u64_u8(w[2])); |
| 115 | + vst1q_u64(&W[3][0], vreinterpretq_u64_u8(w[3])); |
| 116 | + vst1q_u64(&W[4][0], vreinterpretq_u64_u8(w[4])); |
| 117 | + vst1q_u64(&W[5][0], vreinterpretq_u64_u8(w[5])); |
| 118 | + vst1q_u64(&W[6][0], vreinterpretq_u64_u8(w[6])); |
| 119 | + vst1q_u64(&W[7][0], vreinterpretq_u64_u8(w[7])); |
| 120 | + vst1q_u64(&W[8][0], vreinterpretq_u64_u8(w[8])); |
| 121 | + vst1q_u64(&W[9][0], vreinterpretq_u64_u8(w[9])); |
| 122 | + vst1q_u64(&W[10][0], vreinterpretq_u64_u8(w[10])); |
| 123 | + vst1q_u64(&W[11][0], vreinterpretq_u64_u8(w[11])); |
| 124 | + vst1q_u64(&W[12][0], vreinterpretq_u64_u8(w[12])); |
| 125 | + vst1q_u64(&W[13][0], vreinterpretq_u64_u8(w[13])); |
| 126 | + vst1q_u64(&W[14][0], vreinterpretq_u64_u8(w[14])); |
| 127 | + vst1q_u64(&W[15][0], vreinterpretq_u64_u8(w[15])); |
| 128 | +} |
} // namespace arm_neon_echo
} // namespace sapphire

#endif // ENABLE_ARM_NEON