@@ -25,12 +25,12 @@ __m128i ALWAYS_INLINE gf8_mul2(const __m128i& x)
2525 return util::Xor (lhs, rhs);
2626}
2727
28- void ALWAYS_INLINE MixColumn (uint64_t W[ 16 ][ 2 ], int ia, int ib, int ic, int id )
28+ void ALWAYS_INLINE MixColumn (__m128i& Wa, __m128i& Wb, __m128i& Wc, __m128i& Wd )
2929{
30- const __m128i a = _mm_load_si128 (( const __m128i*)&W[ia][ 0 ]) ;
31- const __m128i b = _mm_load_si128 (( const __m128i*)&W[ib][ 0 ]) ;
32- const __m128i c = _mm_load_si128 (( const __m128i*)&W[ic][ 0 ]) ;
33- const __m128i d = _mm_load_si128 (( const __m128i*)&W[id][ 0 ]) ;
30+ const __m128i a = Wa ;
31+ const __m128i b = Wb ;
32+ const __m128i c = Wc ;
33+ const __m128i d = Wd ;
3434
3535 const __m128i ab = util::Xor (a, b);
3636 const __m128i bc = util::Xor (b, c);
@@ -40,14 +40,14 @@ void ALWAYS_INLINE MixColumn(uint64_t W[16][2], int ia, int ib, int ic, int id)
4040 const __m128i bcx = gf8_mul2 (bc);
4141 const __m128i cdx = gf8_mul2 (cd);
4242
43- // W[ia] = abx ^ bc ^ d
44- _mm_store_si128 ((__m128i*)&W[ia][ 0 ], util::Xor (util::Xor (abx, bc), d) );
45- // W[ib] = bcx ^ a ^ cd
46- _mm_store_si128 ((__m128i*)&W[ib][ 0 ], util::Xor (util::Xor (bcx, a), cd) );
47- // W[ic] = cdx ^ ab ^ d
48- _mm_store_si128 ((__m128i*)&W[ic][ 0 ], util::Xor (util::Xor (cdx, ab), d) );
49- // W[id] = abx ^ bcx ^ cdx ^ ab ^ c
50- _mm_store_si128 ((__m128i*)&W[id][ 0 ], util::Xor (util::Xor (util::Xor (util::Xor (abx, bcx), cdx), ab), c) );
43+ // Wa = abx ^ bc ^ d
44+ Wa = util::Xor (util::Xor (abx, bc), d);
45+ // Wb = bcx ^ a ^ cd
46+ Wb = util::Xor (util::Xor (bcx, a), cd);
47+ // Wc = cdx ^ ab ^ d
48+ Wc = util::Xor (util::Xor (cdx, ab), d);
49+ // Wd = abx ^ bcx ^ cdx ^ ab ^ c
50+ Wd = util::Xor (util::Xor (util::Xor (util::Xor (abx, bcx), cdx), ab), c);
5151}
5252
5353void ALWAYS_INLINE ShiftRow1 (__m128i& Wa, __m128i& Wb, __m128i& Wc, __m128i& Wd)
@@ -104,6 +104,11 @@ void ShiftAndMix(uint64_t W[16][2])
104104 ShiftRow2 (w[2 ], w[6 ], w[10 ], w[14 ]);
105105 ShiftRow3 (w[3 ], w[7 ], w[11 ], w[15 ]);
106106
107+ MixColumn (w[0 ], w[1 ], w[2 ], w[3 ]);
108+ MixColumn (w[4 ], w[5 ], w[6 ], w[7 ]);
109+ MixColumn (w[8 ], w[9 ], w[10 ], w[11 ]);
110+ MixColumn (w[12 ], w[13 ], w[14 ], w[15 ]);
111+
107112 _mm_store_si128 ((__m128i*)&W[0 ][0 ], w[0 ]);
108113 _mm_store_si128 ((__m128i*)&W[1 ][0 ], w[1 ]);
109114 _mm_store_si128 ((__m128i*)&W[2 ][0 ], w[2 ]);
@@ -120,11 +125,6 @@ void ShiftAndMix(uint64_t W[16][2])
120125 _mm_store_si128 ((__m128i*)&W[13 ][0 ], w[13 ]);
121126 _mm_store_si128 ((__m128i*)&W[14 ][0 ], w[14 ]);
122127 _mm_store_si128 ((__m128i*)&W[15 ][0 ], w[15 ]);
123-
124- MixColumn (W, 0 , 1 , 2 , 3 );
125- MixColumn (W, 4 , 5 , 6 , 7 );
126- MixColumn (W, 8 , 9 , 10 , 11 );
127- MixColumn (W, 12 , 13 , 14 , 15 );
128128}
129129} // namespace ssse3_echo
130130} // namespace sapphire
0 commit comments