@@ -41,6 +41,12 @@ namespace cp_algo {
    [[gnu::always_inline]] inline u64x4 low32(u64x4 x) {
        return x & uint32_t(-1);
    }
+   [[gnu::always_inline]] inline auto rotr(auto x) {
+       return decltype(x)(__builtin_shufflevector(u32x8(x), u32x8(x), 1, 2, 3, 0, 5, 6, 7, 4));
+   }
+   [[gnu::always_inline]] inline auto rotl(auto x) {
+       return decltype(x)(__builtin_shufflevector(u32x8(x), u32x8(x), 3, 0, 1, 2, 7, 4, 5, 6));
+   }

    [[gnu::always_inline]] inline u64x4 montgomery_reduce(u64x4 x, uint32_t mod, uint32_t imod) {
#ifdef __AVX2__
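Note: rotr and rotl rotate the 32-bit lanes of a vector by one position within each 128-bit half (typically a single vpshufd), and are inverses of each other. As a lane-level illustration, not part of the commit: for u32x8 x = {x0, x1, x2, x3, x4, x5, x6, x7}, rotr(x) yields {x1, x2, x3, x0, x5, x6, x7, x4} and rotl(x) yields {x3, x0, x1, x2, x7, x4, x5, x6}. Keeping the two halves independent matches how reinterpreting between u64x4 and u32x8 pairs adjacent 32-bit lanes into 64-bit ones.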
@@ -50,7 +56,7 @@ namespace cp_algo {
        auto x_ninv = x * imod;
        x += low32(x_ninv) * mod;
#endif
-       return x >> 32;
+       return rotr(x);
    }

    [[gnu::always_inline]] inline u64x4 montgomery_mul(u64x4 x, u64x4 y, uint32_t mod, uint32_t imod) {
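Note: after x += low32(x_ninv) * mod, the low 32 bits of every 64-bit lane cancel to exactly zero, assuming imod ≡ -mod^{-1} (mod 2^32) as the Montgomery identity requires, and the reduced value sits in the high 32 bits. Shifting each lane right by 32 and rotating the 32-bit lanes right by one therefore produce the same vector: reduced values in the even lanes, zeros in the odd lanes. The rotation form is what lets the widened montgomery_mul below recombine even- and odd-lane results with a plain OR, no masking needed.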
@@ -60,16 +66,10 @@ namespace cp_algo {
        return montgomery_reduce(low32(x) * low32(y), mod, imod);
#endif
    }
-
    [[gnu::always_inline]] inline u32x8 montgomery_mul(u32x8 x, u32x8 y, uint32_t mod, uint32_t imod) {
-       auto x0246 = u64x4(x);
-       auto y0246 = u64x4(y);
-       auto x1357 = u64x4(x) >> 32;
-       auto y1357 = u64x4(y) >> 32;
-       return u32x8(montgomery_mul(x0246, y0246, mod, imod)) |
-              u32x8(montgomery_mul(x1357, y1357, mod, imod) << 32);
+       return u32x8(montgomery_mul(u64x4(x), u64x4(y), mod, imod)) |
+              u32x8(rotl(montgomery_mul(u64x4(rotr(x)), u64x4(rotr(y)), mod, imod)));
    }
-
    [[gnu::always_inline]] inline dx4 rotate_right(dx4 x) {
        static constexpr u64x4 shuffler = {3, 0, 1, 2};
        return __builtin_shuffle(x, shuffler);
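To see why swapping the shift for a lane rotation is safe, here is a minimal scalar sketch of the identity the vector code relies on; it is not part of the commit, and the modulus, the Newton-iteration inverse, and the test values are illustrative assumptions:

#include <cassert>
#include <cstdint>

int main() {
    // Scalar model of montgomery_reduce: checks that x + low32(x * imod) * mod
    // always has zero low 32 bits, so moving the result into place by a lane
    // rotation leaves zeros, not garbage, in the vacated positions.
    const uint32_t mod = 998244353;      // an odd modulus (here an NTT prime)
    uint32_t inv = 1;
    for (int i = 0; i < 5; i++)          // Newton iteration: inv = mod^{-1} mod 2^32
        inv *= 2 - mod * inv;
    const uint32_t imod = -inv;          // imod = -mod^{-1} mod 2^32
    for (uint64_t a : {1u, 12345u, mod - 1})
        for (uint64_t b : {2u, 67890u, mod - 1}) {
            uint64_t x = a * b;                  // low32(x) * low32(y) in the vector code
            uint64_t q = uint32_t(x * imod);     // low32(x_ninv)
            uint64_t t = x + q * mod;
            assert(uint32_t(t) == 0);            // the low half cancels exactly
            // the high half is a * b * 2^{-32} modulo mod:
            assert(((unsigned __int128)(t >> 32) << 32) % mod == a * b % mod);
        }
}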