Skip to content

Commit 31f93ad

Browse files
committed
crypto: unroll Echo512's FullStateRound()
Also lets us get rid of an extra unnecessary pack/unpack
1 parent fa68c70 commit 31f93ad

File tree

2 files changed

+100
-34
lines changed

2 files changed

+100
-34
lines changed

src/crypto/x11/arm_crypto/echo.cpp

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,32 +5,65 @@
55
#if defined(ENABLE_ARM_AES)
66
#include <crypto/x11/util/util.hpp>
77

8-
#include <cstdint>
8+
#include <cstddef>
99

1010
#include <arm_neon.h>
1111

1212
namespace sapphire {
13-
namespace arm_crypto_echo {
14-
void FullStateRound(uint64_t W[16][2], uint32_t& k0, uint32_t& k1, uint32_t& k2, uint32_t& k3)
13+
namespace {
14+
void ALWAYS_INLINE StateRound(uint64_t W[16][2], size_t idx, uint8x16_t& key, uint32_t& k0, uint32_t& k1, uint32_t& k2, uint32_t& k3)
1515
{
16-
uint8x16_t key = util::pack_le(k0, k1, k2, k3);
17-
for (int n = 0; n < 16; n++) {
18-
uint8x16_t block = vreinterpretq_u8_u64(vld1q_u64(&W[n][0]));
19-
block = util::aes_round(block, key);
20-
block = util::aes_round_nk(block);
21-
vst1q_u64(&W[n][0], vreinterpretq_u64_u8(block));
16+
uint8x16_t block = vreinterpretq_u8_u64(vld1q_u64(&W[idx][0]));
17+
block = util::aes_round(block, key);
18+
block = util::aes_round_nk(block);
19+
vst1q_u64(&W[idx][0], vreinterpretq_u64_u8(block));
2220

23-
util::unpack_le(key, k0, k1, k2, k3);
24-
if ((k0 = (k0 + 1)) == 0) {
25-
if ((k1 = (k1 + 1)) == 0) {
26-
if ((k2 = (k2 + 1)) == 0) {
27-
k3 = (k3 + 1);
28-
}
21+
util::unpack_le(key, k0, k1, k2, k3);
22+
if ((k0 = (k0 + 1)) == 0) {
23+
if ((k1 = (k1 + 1)) == 0) {
24+
if ((k2 = (k2 + 1)) == 0) {
25+
k3 = (k3 + 1);
2926
}
3027
}
31-
key = util::pack_le(k0, k1, k2, k3);
3228
}
33-
util::unpack_le(key, k0, k1, k2, k3);
29+
}
30+
} // anonymous namespace
31+
32+
namespace arm_crypto_echo {
33+
void FullStateRound(uint64_t W[16][2], uint32_t& k0, uint32_t& k1, uint32_t& k2, uint32_t& k3)
34+
{
35+
uint8x16_t key = util::pack_le(k0, k1, k2, k3);
36+
StateRound(W, 0, key, k0, k1, k2, k3);
37+
key = util::pack_le(k0, k1, k2, k3);
38+
StateRound(W, 1, key, k0, k1, k2, k3);
39+
key = util::pack_le(k0, k1, k2, k3);
40+
StateRound(W, 2, key, k0, k1, k2, k3);
41+
key = util::pack_le(k0, k1, k2, k3);
42+
StateRound(W, 3, key, k0, k1, k2, k3);
43+
key = util::pack_le(k0, k1, k2, k3);
44+
StateRound(W, 4, key, k0, k1, k2, k3);
45+
key = util::pack_le(k0, k1, k2, k3);
46+
StateRound(W, 5, key, k0, k1, k2, k3);
47+
key = util::pack_le(k0, k1, k2, k3);
48+
StateRound(W, 6, key, k0, k1, k2, k3);
49+
key = util::pack_le(k0, k1, k2, k3);
50+
StateRound(W, 7, key, k0, k1, k2, k3);
51+
key = util::pack_le(k0, k1, k2, k3);
52+
StateRound(W, 8, key, k0, k1, k2, k3);
53+
key = util::pack_le(k0, k1, k2, k3);
54+
StateRound(W, 9, key, k0, k1, k2, k3);
55+
key = util::pack_le(k0, k1, k2, k3);
56+
StateRound(W, 10, key, k0, k1, k2, k3);
57+
key = util::pack_le(k0, k1, k2, k3);
58+
StateRound(W, 11, key, k0, k1, k2, k3);
59+
key = util::pack_le(k0, k1, k2, k3);
60+
StateRound(W, 12, key, k0, k1, k2, k3);
61+
key = util::pack_le(k0, k1, k2, k3);
62+
StateRound(W, 13, key, k0, k1, k2, k3);
63+
key = util::pack_le(k0, k1, k2, k3);
64+
StateRound(W, 14, key, k0, k1, k2, k3);
65+
key = util::pack_le(k0, k1, k2, k3);
66+
StateRound(W, 15, key, k0, k1, k2, k3);
3467
}
3568
} // namespace arm_crypto_echo
3669
} // namespace sapphire

src/crypto/x11/x86_aesni/echo.cpp

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,33 +5,66 @@
55
#if defined(ENABLE_SSE41) && defined(ENABLE_X86_AESNI)
66
#include <crypto/x11/util/util.hpp>
77

8-
#include <cstdint>
8+
#include <cstddef>
99

1010
#include <immintrin.h>
1111
#include <wmmintrin.h>
1212

1313
namespace sapphire {
14-
namespace x86_aesni_echo {
15-
void FullStateRound(uint64_t W[16][2], uint32_t& k0, uint32_t& k1, uint32_t& k2, uint32_t& k3)
14+
namespace {
15+
void ALWAYS_INLINE StateRound(uint64_t W[16][2], size_t idx, __m128i& key, uint32_t& k0, uint32_t& k1, uint32_t& k2, uint32_t& k3)
1616
{
17-
__m128i key = util::pack_le(k0, k1, k2, k3);
18-
for (int n = 0; n < 16; n++) {
19-
__m128i block = _mm_load_si128((const __m128i*)&W[n][0]);
20-
block = util::aes_round(block, key);
21-
block = util::aes_round(block, _mm_setzero_si128());
22-
_mm_store_si128((__m128i*)&W[n][0], block);
17+
__m128i block = _mm_load_si128((const __m128i*)&W[idx][0]);
18+
block = util::aes_round(block, key);
19+
block = util::aes_round(block, _mm_setzero_si128());
20+
_mm_store_si128((__m128i*)&W[idx][0], block);
2321

24-
util::unpack_le(key, k0, k1, k2, k3);
25-
if ((k0 = (k0 + 1)) == 0) {
26-
if ((k1 = (k1 + 1)) == 0) {
27-
if ((k2 = (k2 + 1)) == 0) {
28-
k3 = (k3 + 1);
29-
}
22+
util::unpack_le(key, k0, k1, k2, k3);
23+
if ((k0 = (k0 + 1)) == 0) {
24+
if ((k1 = (k1 + 1)) == 0) {
25+
if ((k2 = (k2 + 1)) == 0) {
26+
k3 = (k3 + 1);
3027
}
3128
}
32-
key = util::pack_le(k0, k1, k2, k3);
3329
}
34-
util::unpack_le(key, k0, k1, k2, k3);
30+
}
31+
} // anonymous namespace
32+
33+
namespace x86_aesni_echo {
34+
void FullStateRound(uint64_t W[16][2], uint32_t& k0, uint32_t& k1, uint32_t& k2, uint32_t& k3)
35+
{
36+
__m128i key = util::pack_le(k0, k1, k2, k3);
37+
StateRound(W, 0, key, k0, k1, k2, k3);
38+
key = util::pack_le(k0, k1, k2, k3);
39+
StateRound(W, 1, key, k0, k1, k2, k3);
40+
key = util::pack_le(k0, k1, k2, k3);
41+
StateRound(W, 2, key, k0, k1, k2, k3);
42+
key = util::pack_le(k0, k1, k2, k3);
43+
StateRound(W, 3, key, k0, k1, k2, k3);
44+
key = util::pack_le(k0, k1, k2, k3);
45+
StateRound(W, 4, key, k0, k1, k2, k3);
46+
key = util::pack_le(k0, k1, k2, k3);
47+
StateRound(W, 5, key, k0, k1, k2, k3);
48+
key = util::pack_le(k0, k1, k2, k3);
49+
StateRound(W, 6, key, k0, k1, k2, k3);
50+
key = util::pack_le(k0, k1, k2, k3);
51+
StateRound(W, 7, key, k0, k1, k2, k3);
52+
key = util::pack_le(k0, k1, k2, k3);
53+
StateRound(W, 8, key, k0, k1, k2, k3);
54+
key = util::pack_le(k0, k1, k2, k3);
55+
StateRound(W, 9, key, k0, k1, k2, k3);
56+
key = util::pack_le(k0, k1, k2, k3);
57+
StateRound(W, 10, key, k0, k1, k2, k3);
58+
key = util::pack_le(k0, k1, k2, k3);
59+
StateRound(W, 11, key, k0, k1, k2, k3);
60+
key = util::pack_le(k0, k1, k2, k3);
61+
StateRound(W, 12, key, k0, k1, k2, k3);
62+
key = util::pack_le(k0, k1, k2, k3);
63+
StateRound(W, 13, key, k0, k1, k2, k3);
64+
key = util::pack_le(k0, k1, k2, k3);
65+
StateRound(W, 14, key, k0, k1, k2, k3);
66+
key = util::pack_le(k0, k1, k2, k3);
67+
StateRound(W, 15, key, k0, k1, k2, k3);
3568
}
3669
} // namespace x86_aesni_echo
3770
} // namespace sapphire

0 commit comments

Comments
 (0)