-
Notifications
You must be signed in to change notification settings - Fork 6.1k
8371450: AES performance improvements for key schedule generation #28188
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -941,8 +941,9 @@ void init(boolean decrypting, String algorithm, byte[] key) | |
| * Generate the cipher's round keys as outlined in section 5.2 of the spec. | ||
| * | ||
| * @param key [in] the symmetric key byte array. | ||
| * @param rounds [in] the number rounds for generating the round keys. | ||
| * | ||
| * @return w the cipher round keys. | ||
| * @return the cipher round keys. | ||
| */ | ||
| private static int[] genRoundKeys(byte[] key, int rounds) { | ||
| int wLen = WB * (rounds + 1); | ||
|
|
@@ -970,53 +971,58 @@ private static int[] genRoundKeys(byte[] key, int rounds) { | |
| /** | ||
| * Generate the inverse cipher round keys. | ||
| * | ||
| * @return w1 the inverse cipher round keys. | ||
| * @param w [in] the targeted word for substituion. | ||
| * @param rounds [in] the number rounds for generating the round keys. | ||
| * | ||
| * @return the inverse cipher round keys. | ||
| */ | ||
| private static int[] genInvRoundKeys(int[] w, int rounds) { | ||
| int kLen = w.length;; | ||
| int[] temp = new int[WB]; | ||
| int[] dw = new int[kLen]; | ||
| int[] dw = new int[w.length]; | ||
|
|
||
| // Intrinsics requires the inverse key expansion to be reverse order | ||
| // except for the first and last round key as the first two round keys | ||
| // are without a mix column transform. | ||
| for (int i = 1; i < rounds; i++) { | ||
| System.arraycopy(w, i * WB, temp, 0, WB); | ||
| temp[0] = TMI0[temp[0] >>> 24] ^ TMI1[(temp[0] >> 16) & 0xFF] | ||
| ^ TMI2[(temp[0] >> 8) & 0xFF] ^ TMI3[temp[0] & 0xFF]; | ||
| temp[1] = TMI0[temp[1] >>> 24] ^ TMI1[(temp[1] >> 16) & 0xFF] | ||
| ^ TMI2[(temp[1] >> 8) & 0xFF] ^ TMI3[temp[1] & 0xFF]; | ||
| temp[2] = TMI0[temp[2] >>> 24] ^ TMI1[(temp[2] >> 16) & 0xFF] | ||
| ^ TMI2[(temp[2] >> 8) & 0xFF] ^ TMI3[temp[2] & 0xFF]; | ||
| temp[3] = TMI0[temp[3] >>> 24] ^ TMI1[(temp[3] >> 16) & 0xFF] | ||
| ^ TMI2[(temp[3] >> 8) & 0xFF] ^ TMI3[temp[3] & 0xFF]; | ||
| System.arraycopy(temp, 0, dw, kLen - (i * WB), WB); | ||
| int widx = i * WB; | ||
| int idx = w.length - widx; | ||
|
|
||
| dw[idx] = TMI0[w[widx] >>> 24] ^ TMI1[(w[widx] >> 16) & 0xFF] | ||
| ^ TMI2[(w[widx] >> 8) & 0xFF] ^ TMI3[w[widx] & 0xFF]; | ||
| dw[idx + 1] = TMI0[w[widx + 1] >>> 24] | ||
| ^ TMI1[(w[widx + 1] >> 16) & 0xFF] | ||
| ^ TMI2[(w[widx + 1] >> 8) & 0xFF] | ||
| ^ TMI3[w[widx + 1] & 0xFF]; | ||
| dw[idx + 2] = TMI0[w[widx + 2] >>> 24] | ||
| ^ TMI1[(w[widx + 2] >> 16) & 0xFF] | ||
| ^ TMI2[(w[widx + 2] >> 8) & 0xFF] | ||
| ^ TMI3[w[widx + 2] & 0xFF]; | ||
| dw[idx + 3] = TMI0[w[widx + 3] >>> 24] | ||
| ^ TMI1[(w[widx + 3] >> 16) & 0xFF] | ||
| ^ TMI2[(w[widx + 3] >> 8) & 0xFF] | ||
| ^ TMI3[w[widx + 3] & 0xFF]; | ||
| } | ||
| System.arraycopy(w, kLen - WB, dw, WB, WB); | ||
| System.arraycopy(w, w.length - WB, dw, WB, WB); | ||
| System.arraycopy(w, 0, dw, 0, WB); | ||
| Arrays.fill(temp, 0); | ||
|
|
||
| return dw; | ||
| } | ||
|
|
||
| /** | ||
| * Subtitute the word as a step of key expansion. | ||
| * | ||
| * @param state [in] the targeted word for substituion. | ||
| * @param sub [in] the substitute table for cipher and inverse cipher. | ||
| * @param word [in] the targeted word for substituion. | ||
| * | ||
| * @return the substituted word. | ||
| */ | ||
| private static int subWord(int word) { | ||
| byte b0 = (byte) (word >>> 24); | ||
smemery marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| byte b1 = (byte) ((word >> 16) & 0xFF); | ||
| byte b2 = (byte) ((word >> 8) & 0xFF); | ||
| byte b3 = (byte) (word & 0xFF); | ||
| byte b0 = (byte) (word >> 24); | ||
| byte b1 = (byte) (word >> 16); | ||
| byte b2 = (byte) (word >> 8); | ||
|
|
||
| return ((SBOX[(b0 & 0xF0) >> 4][b0 & 0x0F] & 0xFF) << 24) | ||
| | ((SBOX[(b1 & 0xF0) >> 4][b1 & 0x0F] & 0xFF) << 16) | ||
| | ((SBOX[(b2 & 0xF0) >> 4][b2 & 0x0F] & 0xFF) << 8) | ||
| | (SBOX[(b3 & 0xF0) >> 4][b3 & 0x0F] & 0xFF); | ||
| | (SBOX[(word & 0xF0) >> 4][word & 0x0F] & 0xFF); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think there is no need for any of these bytes. Every index can be computed as "(word >> offset) & 0x0F". Actually, if you define SBOX as a 1-dim array, you can index into it with "(word >> offset) & 0xFF".
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you for your review. The byte assignments were to avoid three redundant shift operations. |
||
| } | ||
|
|
||
| /** | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you think there would be any benefit to putting w[widx] through w[widx+3] on local int variables? In some cases I've seen where that increases register pressure and can lead to some perf benefits. I'm not sure if this is one of those cases but it seems like you'd only need to reference memory once instead of 4 times per assignment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe my original changes here utilize a "MergeStore" technique that the compiler optimizes. I've asked @minborg to see if I got this right. To verify the optimization here, I used the separate local int variable technique and saw a 0.7% decrease in benchmark performance.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, your experiment answers my question. Unless @minborg says otherwise I'm fine with what you have. You're already seeing improvements so that's great.