Skip to content

Commit 0f96754

Browse files
committed
use rotl/rotr in 8x Montgomery mul
1 parent dd4320f commit 0f96754

File tree

1 file changed

+9
-9
lines changed

1 file changed

+9
-9
lines changed

cp-algo/util/simd.hpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,12 @@ namespace cp_algo {
4141
// Bitwise-ANDs x with uint32_t(-1), i.e. a value with all 32 bits set.
// NOTE(review): presumably u64x4 is a vector of four 64-bit lanes, so this
// keeps the low 32 bits of every lane and clears the rest - confirm against
// the type's declaration, which is outside this view.
[[gnu::always_inline]] inline u64x4 low32(u64x4 x) {
4242
return x & uint32_t(-1);
4343
}
44+
// Permutes x viewed as u32x8 and converts the result back to the type of x.
// Output element i takes input element (i + 1) % 4 from the same group of
// four (shuffle indices 1,2,3,0 for the first group and 5,6,7,4 for the
// second), so elements never move between the two groups.
// NOTE(review): presumably u32x8 is an 8 x 32-bit vector type and both
// conversions reinterpret bits rather than convert values - confirm.
[[gnu::always_inline]] inline auto rotr(auto x) {
45+
return decltype(x)(__builtin_shufflevector(u32x8(x), u32x8(x), 1, 2, 3, 0, 5, 6, 7, 4));
46+
}
47+
// Permutes x viewed as u32x8 and converts the result back to the type of x.
// Output element i takes input element (i + 3) % 4 from the same group of
// four (shuffle indices 3,0,1,2 for the first group and 7,4,5,6 for the
// second). This is the inverse of the permutation applied by rotr.
// NOTE(review): presumably u32x8 is an 8 x 32-bit vector type and both
// conversions reinterpret bits rather than convert values - confirm.
[[gnu::always_inline]] inline auto rotl(auto x) {
48+
return decltype(x)(__builtin_shufflevector(u32x8(x), u32x8(x), 3, 0, 1, 2, 7, 4, 5, 6));
49+
}
4450

4551
[[gnu::always_inline]] inline u64x4 montgomery_reduce(u64x4 x, uint32_t mod, uint32_t imod) {
4652
#ifdef __AVX2__
@@ -50,7 +56,7 @@ namespace cp_algo {
5056
auto x_ninv = x * imod;
5157
x += low32(x_ninv) * mod;
5258
#endif
53-
return x >> 32;
59+
return rotr(x);
5460
}
5561

5662
[[gnu::always_inline]] inline u64x4 montgomery_mul(u64x4 x, u64x4 y, uint32_t mod, uint32_t imod) {
@@ -60,16 +66,10 @@ namespace cp_algo {
6066
return montgomery_reduce(low32(x) * low32(y), mod, imod);
6167
#endif
6268
}
63-
6469
// u32x8 overload of montgomery_mul, shown here as a diff: lines preceded by
// a "-" gutter marker are the previous form, lines preceded by "+" are the
// new form.
// New form: ORs together two calls to the u64x4 overload - one on x and y
// converted directly to u64x4, and one on rotr(x) and rotr(y) whose result
// is passed through rotl before being combined.
// Previous form: did the same with `>> 32` on the inputs and `<< 32` on the
// second result.
// NOTE(review): the OR is only correct if the u64x4 overload reads just the
// low 32 bits of each 64-bit lane and returns lanes whose upper 32 bits are
// zero - confirm for both the AVX2 and the fallback paths.
[[gnu::always_inline]] inline u32x8 montgomery_mul(u32x8 x, u32x8 y, uint32_t mod, uint32_t imod) {
65-
auto x0246 = u64x4(x);
66-
auto y0246 = u64x4(y);
67-
auto x1357 = u64x4(x) >> 32;
68-
auto y1357 = u64x4(y) >> 32;
69-
return u32x8(montgomery_mul(x0246, y0246, mod, imod)) |
70-
u32x8(montgomery_mul(x1357, y1357, mod, imod) << 32);
70+
return u32x8(montgomery_mul(u64x4(x), u64x4(y), mod, imod)) |
71+
u32x8(rotl(montgomery_mul(u64x4(rotr(x)), u64x4(rotr(y)), mod, imod)));
7172
}
72-
7373
[[gnu::always_inline]] inline dx4 rotate_right(dx4 x) {
7474
static constexpr u64x4 shuffler = {3, 0, 1, 2};
7575
return __builtin_shuffle(x, shuffler);

0 commit comments

Comments
 (0)