Skip to content

Commit 7254377

Browse files
committed
Make AESFastEngine actually faster than AESEngine by condensing all table lookups into a single array, eliminating some range checking.
1 parent a2705e1 commit 7254377

File tree

1 file changed

+62
-54
lines changed

1 file changed

+62
-54
lines changed

core/src/main/java/org/bouncycastle/crypto/engines/AESFastEngine.java

Lines changed: 62 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,9 @@ public class AESFastEngine
110110
0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91 };
111111

112112
// precomputation tables of calculations for rounds
113-
private static final int[] T0 =
113+
private static final int[] T =
114114
{
115+
// T0
115116
0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6, 0x0df2f2ff,
116117
0xbd6b6bd6, 0xb16f6fde, 0x54c5c591, 0x50303060, 0x03010102,
117118
0xa96767ce, 0x7d2b2b56, 0x19fefee7, 0x62d7d7b5, 0xe6abab4d,
@@ -163,10 +164,9 @@ public class AESFastEngine
163164
0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a, 0xdabfbf65,
164165
0x31e6e6d7, 0xc6424284, 0xb86868d0, 0xc3414182, 0xb0999929,
165166
0x772d2d5a, 0x110f0f1e, 0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d,
166-
0x3a16162c};
167+
0x3a16162c,
167168

168-
private static final int[] T1 =
169-
{
169+
// T1
170170
0x6363c6a5, 0x7c7cf884, 0x7777ee99, 0x7b7bf68d, 0xf2f2ff0d,
171171
0x6b6bd6bd, 0x6f6fdeb1, 0xc5c59154, 0x30306050, 0x01010203,
172172
0x6767cea9, 0x2b2b567d, 0xfefee719, 0xd7d7b562, 0xabab4de6,
@@ -218,10 +218,9 @@ public class AESFastEngine
218218
0x8c8c038f, 0xa1a159f8, 0x89890980, 0x0d0d1a17, 0xbfbf65da,
219219
0xe6e6d731, 0x424284c6, 0x6868d0b8, 0x414182c3, 0x999929b0,
220220
0x2d2d5a77, 0x0f0f1e11, 0xb0b07bcb, 0x5454a8fc, 0xbbbb6dd6,
221-
0x16162c3a};
221+
0x16162c3a,
222222

223-
private static final int[] T2 =
224-
{
223+
// T2
225224
0x63c6a563, 0x7cf8847c, 0x77ee9977, 0x7bf68d7b, 0xf2ff0df2,
226225
0x6bd6bd6b, 0x6fdeb16f, 0xc59154c5, 0x30605030, 0x01020301,
227226
0x67cea967, 0x2b567d2b, 0xfee719fe, 0xd7b562d7, 0xab4de6ab,
@@ -273,10 +272,9 @@ public class AESFastEngine
273272
0x8c038f8c, 0xa159f8a1, 0x89098089, 0x0d1a170d, 0xbf65dabf,
274273
0xe6d731e6, 0x4284c642, 0x68d0b868, 0x4182c341, 0x9929b099,
275274
0x2d5a772d, 0x0f1e110f, 0xb07bcbb0, 0x54a8fc54, 0xbb6dd6bb,
276-
0x162c3a16};
275+
0x162c3a16,
277276

278-
private static final int[] T3 =
279-
{
277+
// T3
280278
0xc6a56363, 0xf8847c7c, 0xee997777, 0xf68d7b7b, 0xff0df2f2,
281279
0xd6bd6b6b, 0xdeb16f6f, 0x9154c5c5, 0x60503030, 0x02030101,
282280
0xcea96767, 0x567d2b2b, 0xe719fefe, 0xb562d7d7, 0x4de6abab,
@@ -330,8 +328,9 @@ public class AESFastEngine
330328
0x5a772d2d, 0x1e110f0f, 0x7bcbb0b0, 0xa8fc5454, 0x6dd6bbbb,
331329
0x2c3a1616};
332330

333-
private static final int[] Tinv0 =
331+
private static final int[] Tinv =
334332
{
333+
// Tinv0
335334
0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a, 0xcb6bab3b,
336335
0xf1459d1f, 0xab58faac, 0x9303e34b, 0x55fa3020, 0xf66d76ad,
337336
0x9176cc88, 0x254c02f5, 0xfcd7e54f, 0xd7cb2ac5, 0x80443526,
@@ -383,10 +382,9 @@ public class AESFastEngine
383382
0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2, 0x72c31d16,
384383
0x0c25e2bc, 0x8b493c28, 0x41950dff, 0x7101a839, 0xdeb30c08,
385384
0x9ce4b4d8, 0x90c15664, 0x6184cb7b, 0x70b632d5, 0x745c6c48,
386-
0x4257b8d0};
385+
0x4257b8d0,
387386

388-
private static final int[] Tinv1 =
389-
{
387+
// Tinv1
390388
0xa7f45150, 0x65417e53, 0xa4171ac3, 0x5e273a96, 0x6bab3bcb,
391389
0x459d1ff1, 0x58faacab, 0x03e34b93, 0xfa302055, 0x6d76adf6,
392390
0x76cc8891, 0x4c02f525, 0xd7e54ffc, 0xcb2ac5d7, 0x44352680,
@@ -438,10 +436,9 @@ public class AESFastEngine
438436
0xf3afca81, 0xc468b93e, 0x3424382c, 0x40a3c25f, 0xc31d1672,
439437
0x25e2bc0c, 0x493c288b, 0x950dff41, 0x01a83971, 0xb30c08de,
440438
0xe4b4d89c, 0xc1566490, 0x84cb7b61, 0xb632d570, 0x5c6c4874,
441-
0x57b8d042};
439+
0x57b8d042,
442440

443-
private static final int[] Tinv2 =
444-
{
441+
// Tinv2
445442
0xf45150a7, 0x417e5365, 0x171ac3a4, 0x273a965e, 0xab3bcb6b,
446443
0x9d1ff145, 0xfaacab58, 0xe34b9303, 0x302055fa, 0x76adf66d,
447444
0xcc889176, 0x02f5254c, 0xe54ffcd7, 0x2ac5d7cb, 0x35268044,
@@ -493,10 +490,9 @@ public class AESFastEngine
493490
0xafca81f3, 0x68b93ec4, 0x24382c34, 0xa3c25f40, 0x1d1672c3,
494491
0xe2bc0c25, 0x3c288b49, 0x0dff4195, 0xa8397101, 0x0c08deb3,
495492
0xb4d89ce4, 0x566490c1, 0xcb7b6184, 0x32d570b6, 0x6c48745c,
496-
0xb8d04257};
493+
0xb8d04257,
497494

498-
private static final int[] Tinv3 =
499-
{
495+
// Tinv3
500496
0x5150a7f4, 0x7e536541, 0x1ac3a417, 0x3a965e27, 0x3bcb6bab,
501497
0x1ff1459d, 0xacab58fa, 0x4b9303e3, 0x2055fa30, 0xadf66d76,
502498
0x889176cc, 0xf5254c02, 0x4ffcd7e5, 0xc5d7cb2a, 0x26804435,
@@ -804,37 +800,49 @@ private void packBlock(
804800
private void encryptBlock(int[][] KW)
805801
{
806802
int r, r0, r1, r2, r3;
807-
808-
C0 ^= KW[0][0];
809-
C1 ^= KW[0][1];
810-
C2 ^= KW[0][2];
811-
C3 ^= KW[0][3];
812803

804+
// Registerising (working on local copies of the C0..C3 state fields) helps fast engine encrypt on x64, but hurts decrypt and AESEngine
805+
int C0 = this.C0 ^= KW[0][0];
806+
int C1 = this.C1 ^= KW[0][1];
807+
int C2 = this.C2 ^= KW[0][2];
808+
int C3 = this.C3 ^= KW[0][3];
809+
810+
/*
811+
* Fast engine has precomputed rotr(T0, 24/16/8) tables T1/T2/T3 (i.e. T0 rotated left by 8/16/24 bits).
812+
*
813+
* Placing all precomputed tables in one array requires offset additions for the rotated-table lookups but
814+
* avoids additional array range checks on 3 more arrays (which on HotSpot are more
815+
* expensive than the offset additions).
816+
*/
813817
r = 1;
814818
while (r < ROUNDS - 1)
815819
{
816-
r0 = T0[C0&255] ^ T1[(C1>>8)&255] ^ T2[(C2>>16)&255] ^ T3[(C3>>24)&255] ^ KW[r][0];
817-
r1 = T0[C1&255] ^ T1[(C2>>8)&255] ^ T2[(C3>>16)&255] ^ T3[(C0>>24)&255] ^ KW[r][1];
818-
r2 = T0[C2&255] ^ T1[(C3>>8)&255] ^ T2[(C0>>16)&255] ^ T3[(C1>>24)&255] ^ KW[r][2];
819-
r3 = T0[C3&255] ^ T1[(C0>>8)&255] ^ T2[(C1>>16)&255] ^ T3[(C2>>24)&255] ^ KW[r++][3];
820-
C0 = T0[r0&255] ^ T1[(r1>>8)&255] ^ T2[(r2>>16)&255] ^ T3[(r3>>24)&255] ^ KW[r][0];
821-
C1 = T0[r1&255] ^ T1[(r2>>8)&255] ^ T2[(r3>>16)&255] ^ T3[(r0>>24)&255] ^ KW[r][1];
822-
C2 = T0[r2&255] ^ T1[(r3>>8)&255] ^ T2[(r0>>16)&255] ^ T3[(r1>>24)&255] ^ KW[r][2];
823-
C3 = T0[r3&255] ^ T1[(r0>>8)&255] ^ T2[(r1>>16)&255] ^ T3[(r2>>24)&255] ^ KW[r++][3];
820+
r0 = T[C0&255] ^ T[((C1>>8)&255)|256] ^ T[((C2>>16)&255)|512] ^ T[((C3>>24)&255)|768] ^ KW[r][0];
821+
r1 = T[C1&255] ^ T[((C2>>8)&255)|256] ^ T[((C3>>16)&255)|512] ^ T[((C0>>24)&255)|768] ^ KW[r][1];
822+
r2 = T[C2&255] ^ T[((C3>>8)&255)|256] ^ T[((C0>>16)&255)|512] ^ T[((C1>>24)&255)|768] ^ KW[r][2];
823+
r3 = T[C3&255] ^ T[((C0>>8)&255)|256] ^ T[((C1>>16)&255)|512] ^ T[((C2>>24)&255)|768] ^ KW[r++][3];
824+
C0 = T[r0&255] ^ T[((r1>>8)&255)|256] ^ T[((r2>>16)&255)|512] ^ T[((r3>>24)&255)|768] ^ KW[r][0];
825+
C1 = T[r1&255] ^ T[((r2>>8)&255)|256] ^ T[((r3>>16)&255)|512] ^ T[((r0>>24)&255)|768] ^ KW[r][1];
826+
C2 = T[r2&255] ^ T[((r3>>8)&255)|256] ^ T[((r0>>16)&255)|512] ^ T[((r1>>24)&255)|768] ^ KW[r][2];
827+
C3 = T[r3&255] ^ T[((r0>>8)&255)|256] ^ T[((r1>>16)&255)|512] ^ T[((r2>>24)&255)|768] ^ KW[r++][3];
824828
}
825829

826-
r0 = T0[C0&255] ^ T1[(C1>>8)&255] ^ T2[(C2>>16)&255] ^ T3[(C3>>24)&255] ^ KW[r][0];
827-
r1 = T0[C1&255] ^ T1[(C2>>8)&255] ^ T2[(C3>>16)&255] ^ T3[(C0>>24)&255] ^ KW[r][1];
828-
r2 = T0[C2&255] ^ T1[(C3>>8)&255] ^ T2[(C0>>16)&255] ^ T3[(C1>>24)&255] ^ KW[r][2];
829-
r3 = T0[C3&255] ^ T1[(C0>>8)&255] ^ T2[(C1>>16)&255] ^ T3[(C2>>24)&255] ^ KW[r++][3];
830-
830+
r0 = T[C0&255] ^ T[((C1>>8)&255)|256] ^ T[((C2>>16)&255)|512] ^ T[((C3>>24)&255)|768] ^ KW[r][0];
831+
r1 = T[C1&255] ^ T[((C2>>8)&255)|256] ^ T[((C3>>16)&255)|512] ^ T[((C0>>24)&255)|768] ^ KW[r][1];
832+
r2 = T[C2&255] ^ T[((C3>>8)&255)|256] ^ T[((C0>>16)&255)|512] ^ T[((C1>>24)&255)|768] ^ KW[r][2];
833+
r3 = T[C3&255] ^ T[((C0>>8)&255)|256] ^ T[((C1>>16)&255)|512] ^ T[((C2>>24)&255)|768] ^ KW[r++][3];
834+
831835
// the final round's table is a simple function of S so we don't use a whole other four tables for it
832836

833837
C0 = (S[r0&255]&255) ^ ((S[(r1>>8)&255]&255)<<8) ^ ((S[(r2>>16)&255]&255)<<16) ^ (S[(r3>>24)&255]<<24) ^ KW[r][0];
834838
C1 = (S[r1&255]&255) ^ ((S[(r2>>8)&255]&255)<<8) ^ ((S[(r3>>16)&255]&255)<<16) ^ (S[(r0>>24)&255]<<24) ^ KW[r][1];
835839
C2 = (S[r2&255]&255) ^ ((S[(r3>>8)&255]&255)<<8) ^ ((S[(r0>>16)&255]&255)<<16) ^ (S[(r1>>24)&255]<<24) ^ KW[r][2];
836840
C3 = (S[r3&255]&255) ^ ((S[(r0>>8)&255]&255)<<8) ^ ((S[(r1>>16)&255]&255)<<16) ^ (S[(r2>>24)&255]<<24) ^ KW[r][3];
837841

842+
this.C0 = C0;
843+
this.C1 = C1;
844+
this.C2 = C2;
845+
this.C3 = C3;
838846
}
839847

840848
private void decryptBlock(int[][] KW)
@@ -846,25 +854,25 @@ private void decryptBlock(int[][] KW)
846854
C2 ^= KW[ROUNDS][2];
847855
C3 ^= KW[ROUNDS][3];
848856

849-
int r = ROUNDS-1;
850-
851-
while (r>1)
857+
int r = ROUNDS-1;
858+
859+
while (r>1)
852860
{
853-
r0 = Tinv0[C0&255] ^ Tinv1[(C3>>8)&255] ^ Tinv2[(C2>>16)&255] ^ Tinv3[(C1>>24)&255] ^ KW[r][0];
854-
r1 = Tinv0[C1&255] ^ Tinv1[(C0>>8)&255] ^ Tinv2[(C3>>16)&255] ^ Tinv3[(C2>>24)&255] ^ KW[r][1];
855-
r2 = Tinv0[C2&255] ^ Tinv1[(C1>>8)&255] ^ Tinv2[(C0>>16)&255] ^ Tinv3[(C3>>24)&255] ^ KW[r][2];
856-
r3 = Tinv0[C3&255] ^ Tinv1[(C2>>8)&255] ^ Tinv2[(C1>>16)&255] ^ Tinv3[(C0>>24)&255] ^ KW[r--][3];
857-
C0 = Tinv0[r0&255] ^ Tinv1[(r3>>8)&255] ^ Tinv2[(r2>>16)&255] ^ Tinv3[(r1>>24)&255] ^ KW[r][0];
858-
C1 = Tinv0[r1&255] ^ Tinv1[(r0>>8)&255] ^ Tinv2[(r3>>16)&255] ^ Tinv3[(r2>>24)&255] ^ KW[r][1];
859-
C2 = Tinv0[r2&255] ^ Tinv1[(r1>>8)&255] ^ Tinv2[(r0>>16)&255] ^ Tinv3[(r3>>24)&255] ^ KW[r][2];
860-
C3 = Tinv0[r3&255] ^ Tinv1[(r2>>8)&255] ^ Tinv2[(r1>>16)&255] ^ Tinv3[(r0>>24)&255] ^ KW[r--][3];
861+
r0 = Tinv[C0&255] ^ Tinv[((C3>>8)&255)|256] ^ Tinv[((C2>>16)&255)|512] ^ Tinv[((C1>>24)&255)|768] ^ KW[r][0];
862+
r1 = Tinv[C1&255] ^ Tinv[((C0>>8)&255)|256] ^ Tinv[((C3>>16)&255)|512] ^ Tinv[((C2>>24)&255)|768] ^ KW[r][1];
863+
r2 = Tinv[C2&255] ^ Tinv[((C1>>8)&255)|256] ^ Tinv[((C0>>16)&255)|512] ^ Tinv[((C3>>24)&255)|768] ^ KW[r][2];
864+
r3 = Tinv[C3&255] ^ Tinv[((C2>>8)&255)|256] ^ Tinv[((C1>>16)&255)|512] ^ Tinv[((C0>>24)&255)|768] ^ KW[r--][3];
865+
C0 = Tinv[r0&255] ^ Tinv[((r3>>8)&255)|256] ^ Tinv[((r2>>16)&255)|512] ^ Tinv[((r1>>24)&255)|768] ^ KW[r][0];
866+
C1 = Tinv[r1&255] ^ Tinv[((r0>>8)&255)|256] ^ Tinv[((r3>>16)&255)|512] ^ Tinv[((r2>>24)&255)|768] ^ KW[r][1];
867+
C2 = Tinv[r2&255] ^ Tinv[((r1>>8)&255)|256] ^ Tinv[((r0>>16)&255)|512] ^ Tinv[((r3>>24)&255)|768] ^ KW[r][2];
868+
C3 = Tinv[r3&255] ^ Tinv[((r2>>8)&255)|256] ^ Tinv[((r1>>16)&255)|512] ^ Tinv[((r0>>24)&255)|768] ^ KW[r--][3];
861869
}
862870

863-
r0 = Tinv0[C0&255] ^ Tinv1[(C3>>8)&255] ^ Tinv2[(C2>>16)&255] ^ Tinv3[(C1>>24)&255] ^ KW[r][0];
864-
r1 = Tinv0[C1&255] ^ Tinv1[(C0>>8)&255] ^ Tinv2[(C3>>16)&255] ^ Tinv3[(C2>>24)&255] ^ KW[r][1];
865-
r2 = Tinv0[C2&255] ^ Tinv1[(C1>>8)&255] ^ Tinv2[(C0>>16)&255] ^ Tinv3[(C3>>24)&255] ^ KW[r][2];
866-
r3 = Tinv0[C3&255] ^ Tinv1[(C2>>8)&255] ^ Tinv2[(C1>>16)&255] ^ Tinv3[(C0>>24)&255] ^ KW[r][3];
867-
871+
r0 = Tinv[C0&255] ^ Tinv[((C3>>8)&255)|256] ^ Tinv[((C2>>16)&255)|512] ^ Tinv[((C1>>24)&255)|768] ^ KW[r][0];
872+
r1 = Tinv[C1&255] ^ Tinv[((C0>>8)&255)|256] ^ Tinv[((C3>>16)&255)|512] ^ Tinv[((C2>>24)&255)|768] ^ KW[r][1];
873+
r2 = Tinv[C2&255] ^ Tinv[((C1>>8)&255)|256] ^ Tinv[((C0>>16)&255)|512] ^ Tinv[((C3>>24)&255)|768] ^ KW[r][2];
874+
r3 = Tinv[C3&255] ^ Tinv[((C2>>8)&255)|256] ^ Tinv[((C1>>16)&255)|512] ^ Tinv[((C0>>24)&255)|768] ^ KW[r][3];
875+
868876
// the final round's table is a simple function of Si so we don't use a whole other four tables for it
869877

870878
C0 = (Si[r0&255]&255) ^ ((Si[(r3>>8)&255]&255)<<8) ^ ((Si[(r2>>16)&255]&255)<<16) ^ (Si[(r1>>24)&255]<<24) ^ KW[0][0];

0 commit comments

Comments
 (0)